@@ -2,6 +2,7 @@
#include <array>
#include <cstdio>
#include <future>
+#include <map>

#include "kernels.h"
#include "unittest_llmc/unittest_kernels.h"
@@ -229,6 +230,38 @@ void matmul_forward_dummy(float* out,
                          const float* inp, const float* weight, const float* bias,
                          int B, int T, int C, int OC);

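+// Create the WebGPU context once at static initialization so every call to
+// MATMUL_FORWARD_GPU reuses the same device instead of rebuilding it per call.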
+static WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
+static Context ctx = createContext({},{},{
+    .requiredLimits = &requiredLimits
+  });
+
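+// Tile sizes for the 2D-tiled matmul shader: each workgroup computes a
+// BT x BOC output tile and each thread a TT x TOC sub-tile, so a workgroup
+// needs BT * BOC / (TT * TOC) threads.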
+static constexpr size_t BT = 64;
+static constexpr size_t BC = 16;
+static constexpr size_t BOC = 64;
+static constexpr size_t TT = BT / BC;
+static constexpr size_t TOC = BOC / BC;
+static constexpr size_t num_threads = BT * BOC / (TT * TOC);
+static Shape wgSize = {num_threads, 1, 1};
+
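+// Specialize the WGSL template once: substitute the tile parameters into the
+// shader source, then unroll its inner loops ahead of time.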
+static std::string codeString(kShaderMatmul2DTiling);
+static std::string unrolledCode = loopUnrolling(replaceAll(codeString, {{"{{precision}}", toString(kf32)},
+                                                                        {"{{BT}}", toString(BT)},
+                                                                        {"{{BC}}", toString(BC)},
+                                                                        {"{{BOC}}", toString(BOC)},
+                                                                        {"{{TT}}", toString(TT)},
+                                                                        {"{{TOC}}", toString(TOC)},
+                                                                        {"{{NUM_TILEI}}", toString(BT * BC / num_threads)},
+                                                                        {"{{NUM_TILEW}}", toString(BOC * BC / num_threads)}
+                                                                       }));
+
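+// A compiled kernel together with the GPU tensors bound to it; cached per
+// shape so repeated calls skip shader compilation and buffer allocation.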
+struct CodeCache {
+  std::shared_ptr<Kernel> op;
+  std::vector<Tensor> tensors;
+};
+
+static std::map<std::string,CodeCache> cacheMap;
+
void MATMUL_FORWARD_GPU(float* out,
                        const float* inp, const float* weight, const float* bias,
                        int B, int T, int C, int OC){
@@ -255,79 +288,73 @@ void MATMUL_FORWARD_GPU(float* out,
  unsigned long c = static_cast<unsigned long>(C);
  unsigned long oc = static_cast<unsigned long>(OC);
  setLogLevel(kError);
-  WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
-  Context ctx = createContext({},{},{
-    .requiredLimits = &requiredLimits
-  });

  {
    DurationTime duration("matmul_forward_gpu: before creating tensors", verbose);
-    Tensor inp_i = createTensor(ctx, Shape{b * t * c}, kf32, inp);
-    Tensor weight_i = createTensor(ctx, Shape{oc * c}, kf32, weight);
-    Tensor bias_i = bias == NULL ? createTensor(ctx, Shape{1}, kf32) : createTensor(ctx, Shape{oc}, kf32, bias);
-    Tensor out_o = createTensor(ctx, Shape{b * t * oc}, kf32);
+    // Build the cache key from the shape arguments.
+    std::string key = std::to_string(B) + "_" + std::to_string(T) + "_" + std::to_string(C) + "_" + std::to_string(OC);
+    CodeCache cache;
+    if (cacheMap.find(key) == cacheMap.end()) {
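+      // Cache miss: allocate the I/O tensors and compile the tiled kernel.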
+      Shape nWorkgroups = {b, cdiv(T, BT), cdiv(OC, BOC)};
+      Tensor inp_i = createTensor(ctx, Shape{b * t * c}, kf32, inp);
+      Tensor weight_i = createTensor(ctx, Shape{oc * c}, kf32, weight);
+      Tensor bias_i = bias == NULL ? createTensor(ctx, Shape{1}, kf32) : createTensor(ctx, Shape{oc}, kf32, bias);
+      Tensor out_o = createTensor(ctx, Shape{b * t * oc}, kf32);
+      cache.op = std::shared_ptr<Kernel>(new Kernel(createKernel(ctx, {unrolledCode, wgSize, kf32},
+                                                                 Bindings{inp_i, weight_i, bias_i, out_o},
+                                                                 nWorkgroups,
+                                                                 /* params */
+                                                                 MatmulParams{
+                                                                   static_cast<uint32_t>(b),
+                                                                   static_cast<uint32_t>(t),
+                                                                   static_cast<uint32_t>(c),
+                                                                   static_cast<uint32_t>(oc)
+                                                                 })));
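+      // Keep the tensors alive with the kernel; indices 0-3 are inp, weight,
+      // bias, and out, in that order.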
+      cache.tensors.push_back(inp_i);
+      cache.tensors.push_back(weight_i);
+      cache.tensors.push_back(bias_i);
+      cache.tensors.push_back(out_o);
+      cacheMap[key] = cache;
+    } else {
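+      // Cache hit: reuse the existing buffers and upload only the new data.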
+      toGPU(ctx, inp, cacheMap[key].tensors[0]);
+      toGPU(ctx, weight, cacheMap[key].tensors[1]);
+      if (bias != NULL) {
+        toGPU(ctx, bias, cacheMap[key].tensors[2]);
+      }
+      cache = cacheMap[key];
+    }
+
    std::promise<void> promise;
    std::future<void> future = promise.get_future();

    if (version == 2) {
      DurationTime duration("matmul_forward_gpu: after creating tensors", verbose);
-      static constexpr size_t BT = 64;
-      static constexpr size_t BC = 16;
-      static constexpr size_t BOC = 64;
-      static constexpr size_t TT = BT / BC;
-      static constexpr size_t TOC = BOC / BC;
-      static constexpr size_t num_threads = BT * BOC / (TT * TOC);
-      Shape wgSize = {num_threads, 1, 1};
-      Shape nWorkgroups = {b, cdiv(T, BT), cdiv(OC, BOC)};
-
-      std::string codeString(kShaderMatmul2DTiling);
-      replaceAll(codeString, {{"{{precision}}", toString(kf32)},
-                              {"{{BT}}", toString(BT)},
-                              {"{{BC}}", toString(BC)},
-                              {"{{BOC}}", toString(BOC)},
-                              {"{{TT}}", toString(TT)},
-                              {"{{TOC}}", toString(TOC)},
-                              {"{{NUM_TILEI}}", toString(BT * BC / num_threads)},
-                              {"{{NUM_TILEW}}", toString(BOC * BC / num_threads)}
-                             });
-      std::string unrolledCode = loopUnrolling(codeString);
      {
        DurationTime duration("matmul_forward_gpu: before creating kernels", verbose);
-
-        Kernel op = createKernel(ctx, {unrolledCode, wgSize, kf32},
-                                 Bindings{inp_i, weight_i, bias_i, out_o},
-                                 nWorkgroups,
-                                 /* params */
-                                 MatmulParams{
-                                   static_cast<uint32_t>(b),
-                                   static_cast<uint32_t>(t),
-                                   static_cast<uint32_t>(c),
-                                   static_cast<uint32_t>(oc)
-                                 });
        {
          DurationTime duration("matmul_forward_gpu without creating context", verbose);
-          dispatchKernel(ctx, op, promise);
+          dispatchKernel(ctx, *cache.op, promise);
          wait(ctx, future);
-          toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
+          toCPU(ctx, cacheMap[key].tensors[3], out, b * t * oc * sizeof(float));
        }
      }
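+      // The version == 1 path below stays commented out: the per-call tensors
+      // it bound (inp_i, weight_i, bias_i, out_o) are now scoped inside the
+      // cache-miss branch above.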
-    } else if (version == 1) {
-      Kernel op = createKernel(ctx, {kShaderMatmul, 256, kf32},
-                               Bindings{inp_i, weight_i, bias_i, out_o},
-                               /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
-                               /* params */
-                               MatmulParams{
-                                 static_cast<uint32_t>(b),
-                                 static_cast<uint32_t>(t),
-                                 static_cast<uint32_t>(c),
-                                 static_cast<uint32_t>(oc)
-                               });
-      {
-        DurationTime duration("matmul_forward_gpu without creating context", verbose);
-        dispatchKernel(ctx, op, promise);
-        wait(ctx, future);
-        toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
-      }
+    // } else if (version == 1) {
+    //   Kernel op = createKernel(ctx, {kShaderMatmul, 256, kf32},
+    //                            Bindings{inp_i, weight_i, bias_i, out_o},
+    //                            /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+    //                            /* params */
+    //                            MatmulParams{
+    //                              static_cast<uint32_t>(b),
+    //                              static_cast<uint32_t>(t),
+    //                              static_cast<uint32_t>(c),
+    //                              static_cast<uint32_t>(oc)
+    //                            });
+    //   {
+    //     DurationTime duration("matmul_forward_gpu without creating context", verbose);
+    //     dispatchKernel(ctx, op, promise);
+    //     wait(ctx, future);
+    //     toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
+    //   }
    } else {
      DurationTime duration("matmul_forward_cpu", verbose);
      matmul_forward_dummy(out, inp, weight, bias, B, T, C, OC);