Commit 503722c

ggml : sync all changes from llama.cpp and whisper.cpp
1 parent 3adf02e commit 503722c

File tree

12 files changed, +2534 -2024 lines


examples/gpt-2/main.cpp

Lines changed: 22 additions & 20 deletions
@@ -40,7 +40,7 @@ struct gpt2_layer {
     struct ggml_tensor * c_mlp_fc_w;
     struct ggml_tensor * c_mlp_fc_b;
 
-    struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
+    struct ggml_tensor * c_mlp_proj_w;
     struct ggml_tensor * c_mlp_proj_b;
 };
 
@@ -231,23 +231,23 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         for (int i = 0; i < n_layer; ++i) {
             auto & layer = model.layers[i];
 
-            layer.ln_1_g             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-            layer.ln_1_b             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            layer.ln_2_g             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-            layer.ln_2_b             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            layer.c_attn_attn_w      = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
-            layer.c_attn_attn_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
+            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
+            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
 
-            layer.c_attn_proj_w      = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.c_attn_proj_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            layer.c_mlp_fc_w         = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
-            layer.c_mlp_fc_b         = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
+            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
+            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
 
-            layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
-            layer.c_mlp_proj_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
             // map by name
             model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
@@ -265,7 +265,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
             model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
 
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
+            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
             model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
         }
     }
@@ -537,11 +537,13 @@ bool gpt2_eval(
         // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
         // [n_past + N, 64, 12]
         struct ggml_tensor * V_trans =
-            ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                        n_embd/n_head, n_head, n_past + N),
-                    1, 2, 0, 3);
+            ggml_cpy(ctx0,
+                    ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
 
         // KQV = transpose(V) * KQ_soft_max
         // [64, N, 12]
@@ -625,7 +627,7 @@ bool gpt2_eval(
         // cur = proj_w*cur + proj_b
         // [768, N]
         cur = ggml_mul_mat(ctx0,
-                model.layers[il].c_mlp_proj_w_trans,
+                model.layers[il].c_mlp_proj_w,
                 cur);
 
         cur = ggml_add(ctx0,
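
Note on the V_trans change: ggml_permute only adjusts the strides of a view, so the permuted slice of the cached V values is not contiguous in memory. The commit therefore copies that view into a freshly allocated tensor with ggml_cpy before it reaches ggml_mul_mat, and renames c_mlp_proj_w_trans to c_mlp_proj_w (the tensor shape is unchanged). The same pattern is applied to examples/gpt-j/main.cpp below. A minimal sketch of the idiom, with an illustrative helper name that is not part of the commit:

    #include "ggml.h"

    // Materialize a permuted view of the cached V values into a contiguous
    // tensor so that a following ggml_mul_mat can consume it directly.
    // `v` is assumed to hold n_tokens rows of n_embd floats.
    static struct ggml_tensor * make_v_trans(
            struct ggml_context * ctx, struct ggml_tensor * v,
            int n_embd, int n_head, int n_tokens) {
        struct ggml_tensor * view =
            ggml_permute(ctx,
                ggml_reshape_3d(ctx, v, n_embd/n_head, n_head, n_tokens),
                1, 2, 0, 3); // still a strided view at this point

        // ggml_cpy writes the view into a new contiguous buffer of shape
        // [n_tokens, n_embd/n_head, n_head], matching the old V_trans layout.
        return ggml_cpy(ctx, view,
            ggml_new_tensor_3d(ctx, v->type, n_tokens, n_embd/n_head, n_head));
    }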

examples/gpt-2/quantize.cpp

Lines changed: 2 additions & 5 deletions
@@ -12,9 +12,6 @@
 #include <vector>
 #include <regex>
 
-// TODO: move somewhere else
-#define QK 32
-
 // default hparams (GPT-2 117M)
 struct gpt2_hparams {
     int32_t n_vocab = 50257;
@@ -223,11 +220,11 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     } break;
                 default:
                     {
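
The quantization entry points now live in ggml itself and no longer take a block-size argument, so the local #define QK 32 goes away; the same change is applied to examples/gpt-j/quantize.cpp below. A hedged usage sketch of the post-sync signature, with an illustrative wrapper and a deliberately generous work-buffer size (neither is part of the commit):

    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    // Quantize one F32 tensor to Q4_0 with the updated API: the caller passes
    // the total element count and the row length, but no block size.
    size_t quantize_tensor_q4_0(std::vector<float> & data_f32,
                                std::vector<char> & work,
                                int nelements, int ne0) {
        std::vector<int64_t> hist_cur(16, 0);   // 16 buckets, one per 4-bit value
        work.resize(nelements * sizeof(float)); // generous upper bound on output size
        return ggml_quantize_q4_0(data_f32.data(), work.data(),
                                  nelements, ne0, hist_cur.data());
    }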

examples/gpt-j/main.cpp

Lines changed: 21 additions & 19 deletions
@@ -38,7 +38,7 @@ struct gptj_layer {
     struct ggml_tensor * c_mlp_fc_w;
     struct ggml_tensor * c_mlp_fc_b;
 
-    struct ggml_tensor * c_mlp_proj_w_trans;
+    struct ggml_tensor * c_mlp_proj_w;
     struct ggml_tensor * c_mlp_proj_b;
 };
 
@@ -180,7 +180,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
         ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w_trans
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
         ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
 
         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
@@ -236,20 +236,20 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];
 
-            layer.ln_1_g             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-            layer.ln_1_b             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.ln_1_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.ln_1_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            layer.c_attn_q_proj_w    = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.c_attn_k_proj_w    = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.c_attn_v_proj_w    = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
 
-            layer.c_attn_proj_w      = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.c_attn_proj_w   = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
 
-            layer.c_mlp_fc_w         = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
-            layer.c_mlp_fc_b         = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
+            layer.c_mlp_fc_w      = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
+            layer.c_mlp_fc_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
 
-            layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
-            layer.c_mlp_proj_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.c_mlp_proj_w    = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+            layer.c_mlp_proj_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
             // map by name
             model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g;
@@ -264,7 +264,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
             model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w;
             model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b;
 
-            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w_trans;
+            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w;
             model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b;
         }
     }
@@ -510,11 +510,13 @@ bool gptj_eval(
 
         // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
         struct ggml_tensor * V_trans =
-            ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                        n_embd/n_head, n_head, n_past + N),
-                    1, 2, 0, 3);
+            ggml_cpy(ctx0,
+                    ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
 
         // KQV = transpose(V) * KQ_soft_max
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
@@ -553,7 +555,7 @@ bool gptj_eval(
         // projection
        // cur = proj_w*cur + proj_b
        cur = ggml_mul_mat(ctx0,
-                model.layers[il].c_mlp_proj_w_trans,
+                model.layers[il].c_mlp_proj_w,
                cur);
 
        cur = ggml_add(ctx0,

examples/gpt-j/quantize.cpp

Lines changed: 2 additions & 5 deletions
@@ -12,9 +12,6 @@
 #include <vector>
 #include <regex>
 
-// TODO: move somewhere else
-#define QK 32
-
 // default hparams (GPT-J 6B)
 struct gptj_hparams {
     int32_t n_vocab = 50400;
@@ -225,11 +222,11 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
             switch (type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     } break;
                 case GGML_TYPE_Q4_1:
                    {
-                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     } break;
                 default:
                     {

examples/utils.cpp

Lines changed: 0 additions & 110 deletions
@@ -328,113 +328,3 @@ gpt_vocab::id gpt_sample_top_k_top_p(
 
     return logits_id[idx].second;
 }
-
-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
-    const int nb = k / qk;
-    const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*qk/2);
-
-    assert(k % qk == 0);
-
-    uint8_t pp[qk/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float * pd = (float *) (pdst + (j/k)*row_size);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f; // absolute max
-
-            {
-                for (int l = 0; l < qk; l++) {
-                    const float v = src[j + i*qk + l];
-                    amax = std::max(amax, fabsf(v));
-                }
-
-                const float d = amax / ((1 << 3) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pd[i] = d;
-
-                for (int l = 0; l < qk; l += 2) {
-                    const float v0 = (src[j + i*qk + l + 0])*id;
-                    const float v1 = (src[j + i*qk + l + 1])*id;
-
-                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    hist[vi0]++;
-                    hist[vi1]++;
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*qk/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
-
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
-    const int nb = k / qk;
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
-
-    assert(k % qk == 0);
-
-    uint8_t pp[qk/2];
-
-    char * pdst = (char *) dst;
-
-    for (int j = 0; j < n; j += k) {
-        float * pm = (float *) (pdst + (j/k)*row_size);
-        float * pd = (float *) (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
-
-        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
-
-        for (int i = 0; i < nb; i++) {
-            float min = std::numeric_limits<float>::max();
-            float max = std::numeric_limits<float>::min();
-
-            {
-                for (int l = 0; l < qk; l++) {
-                    const float v = src[j + i*qk + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                const float d = (max - min) / ((1 << 4) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                pm[i] = min;
-                pd[i] = d;
-
-                for (int l = 0; l < qk; l += 2) {
-                    const float v0 = (src[j + i*qk + l + 0] - min)*id;
-                    const float v1 = (src[j + i*qk + l + 1] - min)*id;
-
-                    const uint8_t vi0 = round(v0);
-                    const uint8_t vi1 = round(v1);
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    hist[vi0]++;
-                    hist[vi1]++;
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb + i*qk/2, pp, sizeof(pp));
-            }
-        }
-    }
-
-    return (n/k)*row_size;
-}
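
These helpers are deleted because ggml now provides the quantization routines itself (with the block size fixed inside the library); the removed bodies still document the two formats. Q4_0 stores one scale per block, d = amax/7, and encodes each value as round(v/d) + 8 in four bits; Q4_1 additionally stores the block minimum and uses d = (max - min)/15. A small self-contained example of the Q4_0 mapping, using toy values that are not from the commit:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Quantize one toy block of 4 floats the Q4_0 way and print the round trip.
    int main() {
        const float block[4] = { 0.70f, -0.35f, 0.10f, -0.70f };

        float amax = 0.0f;
        for (float v : block) amax = std::max(amax, std::fabs(v));

        const float d  = amax / ((1 << 3) - 1); // scale: amax / 7
        const float id = d ? 1.0f/d : 0.0f;

        for (float v : block) {
            const int   vi = (int) std::round(v*id) + 8; // 4-bit code in [0, 15]
            const float r  = (vi - 8)*d;                 // dequantized value
            std::printf("%+.2f -> %2d -> %+.2f\n", v, vi, r);
        }
        return 0;
    }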

examples/utils.h

Lines changed: 1 addition & 8 deletions
@@ -20,7 +20,7 @@ struct gpt_params {
     // sampling parameters
     int32_t top_k = 40;
     float top_p = 0.9f;
-    float temp = 1.0f;
+    float temp = 0.9f;
 
     int32_t n_batch = 8; // batch size for prompt processing
 
@@ -81,10 +81,3 @@
         double top_p,
         double temp,
         std::mt19937 & rng);
-
-//
-// Quantization
-//
-
-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
