
Commit 3ee5944

Implement a cached kernel
1 parent da1f32d commit 3ee5944

File tree

1 file changed (+85, -58)

experimental/kernels/unittest_llmc/unittest_kernels.cpp

Lines changed: 85 additions & 58 deletions
@@ -2,6 +2,7 @@
 #include <array>
 #include <cstdio>
 #include <future>
+#include <map>
 
 #include "kernels.h"
 #include "unittest_llmc/unittest_kernels.h"
@@ -229,6 +230,38 @@ void matmul_forward_dummy(float* out,
                           const float* inp, const float* weight, const float* bias,
                           int B, int T, int C, int OC);
 
+static WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
+static Context ctx = createContext({},{},{
+    .requiredLimits = &requiredLimits
+  });
+
+static constexpr size_t BT = 64;
+static constexpr size_t BC = 16;
+static constexpr size_t BOC = 64;
+static constexpr size_t TT = BT / BC;
+static constexpr size_t TOC = BOC / BC;
+static constexpr size_t num_threads = BT * BOC / (TT * TOC);
+static Shape wgSize = {num_threads, 1, 1};
+
+static std::string codeString(kShaderMatmul2DTiling);
+static std::string unrolledCode = loopUnrolling(replaceAll(codeString, {{"{{precision}}", toString(kf32)},
+                                                                        {"{{BT}}", toString(BT)},
+                                                                        {"{{BC}}", toString(BC)},
+                                                                        {"{{BOC}}", toString(BOC)},
+                                                                        {"{{TT}}", toString(TT)},
+                                                                        {"{{TOC}}", toString(TOC)},
+                                                                        {"{{NUM_TILEI}}", toString(BT * BC / num_threads)},
+                                                                        {"{{NUM_TILEW}}", toString(BOC * BC / num_threads)}
+                                                                        }));
+
+
+struct CodeCache {
+  std::shared_ptr<Kernel> op;
+  std::vector<Tensor> tensors;
+};
+
+static std::map<std::string,CodeCache> cacheMap;
+
 void MATMUL_FORWARD_GPU(float* out,
                         const float* inp, const float* weight, const float* bias,
                         int B, int T, int C, int OC){
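The statics added in the hunk above specialize the kShaderMatmul2DTiling template once, at load time, by substituting the tile-size placeholders before loop unrolling. As a rough, self-contained sketch of what that substitution step does (the substitute() helper below is a hypothetical stand-in, not gpu.cpp's replaceAll):

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for the substitution step: replace every occurrence
// of each {{key}} with its value and return the specialized shader source.
std::string substitute(std::string code,
                       const std::vector<std::pair<std::string, std::string>>& reps) {
  for (const auto& rep : reps) {
    size_t pos = code.find(rep.first);
    while (pos != std::string::npos) {
      code.replace(pos, rep.first.size(), rep.second);
      pos = code.find(rep.first, pos + rep.second.size());
    }
  }
  return code;
}

int main() {
  // Toy template using the same placeholder style as kShaderMatmul2DTiling.
  std::string tmpl = "const BT: u32 = {{BT}}; const BC: u32 = {{BC}};";
  std::string specialized = substitute(tmpl, {{"{{BT}}", "64"}, {"{{BC}}", "16"}});
  std::printf("%s\n", specialized.c_str());
  // Prints: const BT: u32 = 64; const BC: u32 = 16;
  return 0;
}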
@@ -255,79 +288,73 @@ void MATMUL_FORWARD_GPU(float* out,
   unsigned long c = static_cast<unsigned long>(C);
   unsigned long oc = static_cast<unsigned long>(OC);
   setLogLevel(kError);
-  WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
-  Context ctx = createContext({},{},{
-    .requiredLimits = &requiredLimits
-  });
 
   {
     DurationTime duration("matmul_forward_gpu: before creating tensors", verbose);
-    Tensor inp_i = createTensor(ctx, Shape{b * t * c}, kf32, inp);
-    Tensor weight_i = createTensor(ctx, Shape{oc * c}, kf32, weight);
-    Tensor bias_i = bias == NULL ? createTensor(ctx, Shape{1}, kf32) : createTensor(ctx, Shape{oc}, kf32, bias);
-    Tensor out_o = createTensor(ctx, Shape{b * t * oc}, kf32);
+    // Generate the key of the cache by arguments.
+    std::string key = std::to_string(B) + "_" + std::to_string(T) + "_" + std::to_string(C) + "_" + std::to_string(OC);
+    CodeCache cache;
+    if (cacheMap.find(key) == cacheMap.end()) {
+      Shape nWorkgroups = {b, cdiv(T, BT), cdiv(OC, BOC)};
+      Tensor inp_i = createTensor(ctx, Shape{b * t * c}, kf32, inp);
+      Tensor weight_i = createTensor(ctx, Shape{oc * c}, kf32, weight);
+      Tensor bias_i = bias == NULL ? createTensor(ctx, Shape{1}, kf32) : createTensor(ctx, Shape{oc}, kf32, bias);
+      Tensor out_o = createTensor(ctx, Shape{b * t * oc}, kf32);
+      cache.op = std::shared_ptr<Kernel>(new Kernel(createKernel(ctx, {unrolledCode, wgSize, kf32},
+                                                                 Bindings{inp_i, weight_i, bias_i, out_o},
+                                                                 nWorkgroups,
+                                                                 /* params */
+                                                                 MatmulParams{
+                                                                   static_cast<uint32_t>(b),
+                                                                   static_cast<uint32_t>(t),
+                                                                   static_cast<uint32_t>(c),
+                                                                   static_cast<uint32_t>(oc)
+                                                                 })));
+      cache.tensors.push_back(inp_i);
+      cache.tensors.push_back(weight_i);
+      cache.tensors.push_back(bias_i);
+      cache.tensors.push_back(out_o);
+      cacheMap[key] = cache;
+    } else {
+      toGPU(ctx, inp, cacheMap[key].tensors[0]);
+      toGPU(ctx, weight, cacheMap[key].tensors[1]);
+      if (bias != NULL) {
+        toGPU(ctx, bias, cacheMap[key].tensors[2]);
+      }
+      cache = cacheMap[key];
+    }
+
     std::promise<void> promise;
     std::future<void> future = promise.get_future();
 
     if (version == 2) {
       DurationTime duration("matmul_forward_gpu: after creating tensors", verbose);
-      static constexpr size_t BT = 64;
-      static constexpr size_t BC = 16;
-      static constexpr size_t BOC = 64;
-      static constexpr size_t TT = BT / BC;
-      static constexpr size_t TOC = BOC / BC;
-      static constexpr size_t num_threads = BT * BOC / (TT * TOC);
-      Shape wgSize = {num_threads, 1, 1};
-      Shape nWorkgroups = {b, cdiv(T, BT), cdiv(OC, BOC)};
-
-      std::string codeString(kShaderMatmul2DTiling);
-      replaceAll(codeString, {{"{{precision}}", toString(kf32)},
-                              {"{{BT}}", toString(BT)},
-                              {"{{BC}}", toString(BC)},
-                              {"{{BOC}}", toString(BOC)},
-                              {"{{TT}}", toString(TT)},
-                              {"{{TOC}}", toString(TOC)},
-                              {"{{NUM_TILEI}}", toString(BT * BC / num_threads)},
-                              {"{{NUM_TILEW}}", toString(BOC * BC / num_threads)}
-                              });
-      std::string unrolledCode = loopUnrolling(codeString);
       {
         DurationTime duration("matmul_forward_gpu: before creating kernels", verbose);
-
-        Kernel op = createKernel(ctx, {unrolledCode, wgSize, kf32},
-                                 Bindings{inp_i, weight_i, bias_i, out_o},
-                                 nWorkgroups,
-                                 /* params */
-                                 MatmulParams{
-                                   static_cast<uint32_t>(b),
-                                   static_cast<uint32_t>(t),
-                                   static_cast<uint32_t>(c),
-                                   static_cast<uint32_t>(oc)
-                                 });
         {
           DurationTime duration("matmul_forward_gpu without creating context", verbose);
-          dispatchKernel(ctx, op, promise);
+          dispatchKernel(ctx, *cache.op, promise);
           wait(ctx, future);
-          toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
+          toCPU(ctx, cacheMap[key].tensors[3], out, b * t * oc * sizeof(float));
         }
       }
-    } else if (version == 1) {
-      Kernel op = createKernel(ctx, {kShaderMatmul, 256, kf32},
-                               Bindings{inp_i, weight_i, bias_i, out_o},
-                               /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
-                               /* params */
-                               MatmulParams{
-                                 static_cast<uint32_t>(b),
-                                 static_cast<uint32_t>(t),
-                                 static_cast<uint32_t>(c),
-                                 static_cast<uint32_t>(oc)
-                               });
-      {
-        DurationTime duration("matmul_forward_gpu without creating context", verbose);
-        dispatchKernel(ctx, op, promise);
-        wait(ctx, future);
-        toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
-      }
+    // } else if (version == 1) {
+    //   Kernel op = createKernel(ctx, {kShaderMatmul, 256, kf32},
+    //                            Bindings{inp_i, weight_i, bias_i, out_o},
+    //                            /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+    //                            /* params */
+    //                            MatmulParams{
+    //                              static_cast<uint32_t>(b),
+    //                              static_cast<uint32_t>(t),
+    //                              static_cast<uint32_t>(c),
+    //                              static_cast<uint32_t>(oc)
+    //                            });
+    //   {
+    //     DurationTime duration("matmul_forward_gpu without creating context", verbose);
+    //     dispatchKernel(ctx, op, promise);
+    //     wait(ctx, future);
+    //     toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
+    //   }
     } else {
      DurationTime duration("matmul_forward_cpu", verbose);
      matmul_forward_dummy(out, inp, weight, bias, B, T, C, OC);
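For reference, a minimal sketch of the caching pattern this commit introduces: the compiled kernel and its tensor bindings are keyed by the matmul dimensions, so only the first call with a given (B, T, C, OC) pays for kernel and tensor creation, while later calls just re-upload data into the cached tensors. The Kernel/Tensor structs and the getOrCreate helper below are hypothetical placeholders, not gpu.cpp's API:

#include <map>
#include <memory>
#include <string>
#include <vector>

// Placeholder types standing in for gpu.cpp's Kernel and Tensor.
struct Kernel {};
struct Tensor {};

// Mirrors the CodeCache struct in the diff: a compiled kernel plus its bindings.
struct CodeCache {
  std::shared_ptr<Kernel> op;
  std::vector<Tensor> tensors;  // inp, weight, bias, out
};

static std::map<std::string, CodeCache> cacheMap;

// Hypothetical helper: build the "B_T_C_OC" key and create the cache entry
// only on the first call with these dimensions.
CodeCache& getOrCreate(int B, int T, int C, int OC) {
  std::string key = std::to_string(B) + "_" + std::to_string(T) + "_" +
                    std::to_string(C) + "_" + std::to_string(OC);
  auto it = cacheMap.find(key);
  if (it == cacheMap.end()) {
    CodeCache cache;
    cache.op = std::make_shared<Kernel>();  // createKernel(...) in the real code
    cache.tensors.resize(4);                // createTensor(...) calls in the real code
    it = cacheMap.emplace(key, std::move(cache)).first;
  }
  return it->second;
}

int main() {
  CodeCache& first = getOrCreate(1, 64, 768, 768);   // builds the entry
  CodeCache& second = getOrCreate(1, 64, 768, 768);  // cache hit, same entry
  return (&first == &second) ? 0 : 1;                // exits 0 on a hit
}

The diff follows the same shape: on a cache hit, inp/weight/bias are copied into the existing tensors with toGPU, and the result is read back from cacheMap[key].tensors[3].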
