@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 
@@ -32,11 +33,11 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }
 
 static float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type==GGML_TYPE_F32) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
             }
         }
     }
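
Note: the accumulator change (float to double) matters because this helper sums every element of a large matrix, and a float accumulator stops absorbing small addends once the running total is large. A standalone sketch of the failure mode, not part of the patch:

    #include <cstdio>

    // Classic demonstration: a float accumulator stops growing at 2^24 = 16777216,
    // because adding 1.0f no longer changes the value; a double accumulator is fine.
    int main() {
        float  sum_f = 0;
        double sum_d = 0;
        for (int i = 0; i < 100000000; i++) { // 1e8 additions of 1
            sum_f += 1.0f;
            sum_d += 1.0;
        }
        printf("float : %.1f\n", sum_f); // prints 16777216.0
        printf("double: %.1f\n", sum_d); // prints 100000000.0
    }
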
@@ -125,12 +126,15 @@ int main(int argc, char ** argv) {
 
     // printf("Memsize required = %i\n", sizex*sizex);
 
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;
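
Note: ggml_type_sizef() returns a fractional byte count per element because quantized types are stored in blocks. A back-of-the-envelope check, assuming Q4_1's usual layout of 32-element blocks (an assumption about ggml internals, not stated in this patch):

    // Assuming QK4_1 == 32 and a block of { fp16 d; fp16 m; uint8_t qs[16]; }:
    // 16 bytes of packed 4-bit quants + 4 bytes of header = 20 bytes per 32 elements.
    const double q4_1_bytes_per_element = (16 + 2 + 2) / 32.0; // 0.625
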
@@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);
 
-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
 
@@ -181,17 +185,16 @@ int main(int argc, char ** argv) {
 
     TENSOR_DUMP(gf.nodes[0]);
 
-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 
     int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };
 
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -202,8 +205,8 @@ int main(int argc, char ** argv) {
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
@@ -250,7 +253,7 @@ int main(int argc, char ** argv) {
     // Check that the matrix multiplication result is in the right ballpark
     // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
     float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
-    float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
+    float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
     float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
 
     if (delta > allowed_delta) {
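
Note: dividing the reference by 1000/1000 means the check accepts a relative error of one part per million between the quantized and F32 sums; std::abs (from <cmath>) replaces the integer abs(), which would truncate the float difference. The check in isolation, a sketch mirroring the patch's logic:

    #include <cmath>

    // accept the quantized result if it is within 1e-6 (relative) of the F32 reference
    static bool result_in_ballpark(float reference, float result) {
        const float allowed_delta = reference / 1000 / 1000;
        return std::abs(result - reference) <= allowed_delta;
    }
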