@@ -154,66 +154,161 @@ int main() {
154
154
155
155
for (unsigned int i = 0 ; i < combinations.size (); i++) {
156
156
if (combinations[i].nsize == 0 ) { // Intel AMX
157
- test_get_coord_op<bfloat16, float , /* TM*/ 16 , /* TK*/ 32 , use::a,
158
- layout::row_major, 1 >();
159
- test_get_coord_op<int8_t , int , /* TM*/ 16 , /* TK*/ 64 , use::a,
160
- layout::row_major, 1 >();
161
- test_get_coord_op<bfloat16, float , /* TK*/ 16 , /* TN*/ 16 , use::b,
162
- layout::row_major, 1 >();
163
- test_get_coord_op<int8_t , int32_t , /* TK*/ 64 , /* TN*/ 16 , use::b,
164
- layout::row_major, 1 >();
165
- test_get_coord_op<bfloat16, float , /* TK*/ 16 , /* TN*/ 16 , use::b,
166
- layout::ext_intel_packed, 2 >();
167
- test_get_coord_op<int8_t , int32_t , /* TK*/ 64 , /* TN*/ 16 , use::b,
168
- layout::ext_intel_packed, 4 >();
169
- test_get_coord_op<float , float , /* TM*/ 16 , /* TN*/ 16 , use::accumulator,
170
- layout::row_major, 1 >();
171
- test_get_coord_op<int32_t , int32_t , /* TM*/ 16 , /* TN*/ 16 ,
172
- use::accumulator, layout::row_major, 1 >();
157
+ // test_get_coord_op<bfloat16, float, /*TM*/ 16, /*TK*/ 32, use::a,
158
+ // layout::row_major, 1>();
159
+ // test_get_coord_op<int8_t, int, /*TM*/ 16, /*TK*/ 64, use::a,
160
+ // layout::row_major, 1>();
161
+ // test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 16, use::b,
162
+ // layout::row_major, 1>();
163
+ // test_get_coord_op<int8_t, int32_t, /*TK*/ 64, /*TN*/ 16, use::b,
164
+ // layout::row_major, 1>();
165
+ // test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 16, use::b,
166
+ // layout::ext_intel_packed, 2>();
167
+ // test_get_coord_op<int8_t, int32_t, /*TK*/ 64, /*TN*/ 16, use::b,
168
+ // layout::ext_intel_packed, 4>();
169
+ // test_get_coord_op<float, float, /*TM*/ 16, /*TN*/ 16, use::accumulator,
170
+ // layout::row_major, 1>();
171
+ // test_get_coord_op<int32_t, int32_t, /*TM*/ 16, /*TN*/ 16,
172
+ // use::accumulator, layout::row_major, 1>();
173
173
break ;
174
174
}
175
175
176
176
if (combinations[i].nsize == 16 ) { // architecture::intel_gpu_pvc
177
- test_get_coord_op<bfloat16, float , /* TM*/ 8 , /* TK*/ 16 , use::a,
178
- layout::row_major, 1 >();
179
177
test_get_coord_op<int8_t , int , /* TM*/ 8 , /* TK*/ 32 , use::a,
180
178
layout::row_major, 1 >();
181
179
test_get_coord_op<bfloat16, float , /* TK*/ 16 , /* TN*/ 16 , use::b,
182
180
layout::ext_intel_packed, 2 >();
183
181
test_get_coord_op<int8_t , int32_t , /* TK*/ 32 , /* TN*/ 16 , use::b,
184
182
layout::ext_intel_packed, 4 >();
185
- test_get_coord_op<float , float , /* TM*/ 8 , /* TN*/ 16 , use::accumulator,
186
- layout::row_major, 1 >();
183
+
184
+
185
+
186
+
187
187
test_get_coord_op<int32_t , int32_t , /* TM*/ 8 , /* TN*/ 16 , use::accumulator,
188
188
layout::row_major, 1 >();
189
189
// This combination is not currently supported for sub group size = 32 in
190
190
// IGC
191
191
#if (!defined(SG_SZ) || SG_SZ != 32)
192
+ // 8x16x16 float/bfloat16
193
+ std::cout << " 8x16x16 float/bfloat16" << std::endl;
194
+ // A
195
+ test_get_coord_op<bfloat16, float , /* TM*/ 8 , /* TK*/ 16 , use::a,
196
+ layout::row_major, 1 >();
197
+ // B
198
+ test_get_coord_op<bfloat16, float , /* TK*/ 16 , /* TN*/ 16 , use::b,
199
+ layout::ext_intel_packed, 2 >();
192
200
test_get_coord_op<bfloat16, float , /* TK*/ 16 , /* TN*/ 16 , use::b,
193
201
layout::row_major, 1 >();
194
- test_get_coord_op<int8_t , int32_t , /* TK*/ 32 , /* TN*/ 16 , use::b,
202
+ // Accumulator
203
+ test_get_coord_op<bfloat16, float , /* TM*/ 8 , /* TN*/ 16 , use::accumulator,
204
+ layout::row_major, 1 >();
205
+ test_get_coord_op<float , float , /* TM*/ 8 , /* TN*/ 16 , use::accumulator,
195
206
layout::row_major, 1 >();
196
- #endif
197
- break ;
198
- }
199
207
200
- if (combinations[i].nsize == 8 ) { // architecture::intel_gpu_dg2*
201
- test_get_coord_op<bfloat16, float , /* TM*/ 8 , /* TK*/ 16 , use::a,
208
+
209
+ // 16x16x16 float/bfloat16
210
+ std::cout << " 16x16x16 float/bfloat16" << std::endl;
211
+ // A
212
+ test_get_coord_op<bfloat16, float , /* TM*/ 16 , /* TK*/ 16 , use::a,
202
213
layout::row_major, 1 >();
203
- test_get_coord_op<int8_t , int , /* TM*/ 8 , /* TK*/ 32 , use::a,
214
+ // B
215
+ // Same B tests (TK 16, TN 16) as in the 8x16x16 section above; not repeated
216
+ // test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 16, use::b,
217
+ // layout::ext_intel_packed, 2>();
218
+ // test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 16, use::b,
219
+ // layout::row_major, 1>();
220
+ // Accumulator
221
+ test_get_coord_op<bfloat16, float , /* TM*/ 16 , /* TN*/ 16 , use::accumulator,
204
222
layout::row_major, 1 >();
205
- test_get_coord_op<bfloat16 , float , /* TK */ 16 , /* TN*/ 8 , use::b ,
223
+ test_get_coord_op<float , float , /* TM */ 16 , /* TN*/ 16 , use::accumulator ,
206
224
layout::row_major, 1 >();
207
- test_get_coord_op<int8_t , int32_t , /* TK*/ 32 , /* TN*/ 8 , use::b,
225
+
226
+ // 1x64x16 float/bfloat16
227
+ std::cout << " 1x64x16 float/bfloat16" << std::endl;
228
+ // A
229
+ test_get_coord_op<bfloat16, float , /* TM*/ 1 , /* TK*/ 16 , use::a,
208
230
layout::row_major, 1 >();
209
- test_get_coord_op<bfloat16, float , /* TK*/ 16 , /* TN*/ 8 , use::b,
210
- layout::ext_intel_packed, 2 >();
211
- test_get_coord_op<int8_t , int32_t , /* TK*/ 32 , /* TN*/ 8 , use::b,
212
- layout::ext_intel_packed, 4 >();
213
- test_get_coord_op<float , float , /* TM*/ 8 , /* TN*/ 8 , use::accumulator,
231
+ // B
232
+ test_get_coord_op<bfloat16, float , /* TK*/ 16 , /* TN*/ 64 , use::b,
233
+ layout::ext_intel_packed, 2 >();
234
+ test_get_coord_op<bfloat16, float , /* TK*/ 16 , /* TN*/ 64 , use::b,
235
+ layout::row_major, 1 >();
236
+ // Accumulator
237
+ test_get_coord_op<bfloat16, float , /* TM*/ 1 , /* TN*/ 64 , use::accumulator,
238
+ layout::row_major, 1 >();
239
+ test_get_coord_op<float , float , /* TM*/ 1 , /* TN*/ 64 , use::accumulator,
240
+ layout::row_major, 1 >();
241
+
242
+ // 1x64x32 float/bfloat16
243
+ std::cout << " 1x64x32 float/bfloat16" << std::endl;
244
+ // A
245
+ test_get_coord_op<bfloat16, float , /* TM*/ 1 , /* TK*/ 32 , use::a,
246
+ layout::row_major, 1 >();
247
+ // B
248
+ test_get_coord_op<bfloat16, float , /* TK*/ 32 , /* TN*/ 64 , use::b,
249
+ layout::ext_intel_packed, 2 >();
250
+ test_get_coord_op<bfloat16, float , /* TK*/ 32 , /* TN*/ 64 , use::b,
251
+ layout::row_major, 1 >();
252
+ // Accumulator
253
+ test_get_coord_op<bfloat16, float , /* TM*/ 1 , /* TN*/ 64 , use::accumulator,
214
254
layout::row_major, 1 >();
215
- test_get_coord_op<int32_t , int32_t , /* TM*/ 8 , /* TN*/ 8 , use::accumulator,
255
+ test_get_coord_op<float , float , /* TM*/ 1 , /* TN*/ 64 , use::accumulator,
216
256
layout::row_major, 1 >();
257
+
258
+ // 32x64x16 float/bfloat16
259
+ std::cout << " 32x64x16 float/bfloat16" << std::endl;
260
+ // A
261
+ test_get_coord_op<bfloat16, float , /* TM*/ 32 , /* TK*/ 16 , use::a,
262
+ layout::row_major, 1 >();
263
+ // B
264
+ // Same B tests (TK 16, TN 64) as in the 1x64x16 section above; not repeated
265
+ // test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 64, use::b,
266
+ // layout::ext_intel_packed, 2>();
267
+ // test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 64, use::b,
268
+ // layout::row_major, 1>();
269
+ // Accumulator
270
+ test_get_coord_op<bfloat16, float , /* TM*/ 32 , /* TN*/ 64 , use::accumulator,
271
+ layout::row_major, 1 >();
272
+ test_get_coord_op<float , float , /* TM*/ 32 , /* TN*/ 64 , use::accumulator,
273
+ layout::row_major, 1 >();
274
+
275
+ // 32x64x32 float/bfloat16
276
+ std::cout << " 32x64x32 float/bfloat16" << std::endl;
277
+ // A
278
+ test_get_coord_op<bfloat16, float , /* TM*/ 32 , /* TK*/ 32 , use::a,
279
+ layout::row_major, 1 >();
280
+ // B
281
+ // Same B tests (TK 32, TN 64) as in the 1x64x32 section above; not repeated
282
+ // test_get_coord_op<bfloat16, float, /*TK*/ 32, /*TN*/ 64, use::b,
283
+ // layout::ext_intel_packed, 2>();
284
+ // test_get_coord_op<bfloat16, float, /*TK*/ 32, /*TN*/ 64, use::b,
285
+ // layout::row_major, 1>();
286
+ // Accumulator
287
+ test_get_coord_op<bfloat16, float , /* TM*/ 32 , /* TN*/ 64 , use::accumulator,
288
+ layout::row_major, 1 >();
289
+ test_get_coord_op<float , float , /* TM*/ 32 , /* TN*/ 64 , use::accumulator,
290
+ layout::row_major, 1 >();
291
+ #endif
292
+ break ;
293
+ }
294
+
295
+ if (combinations[i].nsize == 8 ) { // architecture::intel_gpu_dg2*
296
+ // test_get_coord_op<bfloat16, float, /*TM*/ 8, /*TK*/ 16, use::a,
297
+ // layout::row_major, 1>();
298
+ // test_get_coord_op<int8_t, int, /*TM*/ 8, /*TK*/ 32, use::a,
299
+ // layout::row_major, 1>();
300
+ // test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 8, use::b,
301
+ // layout::row_major, 1>();
302
+ // test_get_coord_op<int8_t, int32_t, /*TK*/ 32, /*TN*/ 8, use::b,
303
+ // layout::row_major, 1>();
304
+ // test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 8, use::b,
305
+ // layout::ext_intel_packed, 2>();
306
+ // test_get_coord_op<int8_t, int32_t, /*TK*/ 32, /*TN*/ 8, use::b,
307
+ // layout::ext_intel_packed, 4>();
308
+ // test_get_coord_op<float, float, /*TM*/ 8, /*TN*/ 8, use::accumulator,
309
+ // layout::row_major, 1>();
310
+ // test_get_coord_op<int32_t, int32_t, /*TM*/ 8, /*TN*/ 8, use::accumulator,
311
+ // layout::row_major, 1>();
217
312
break ;
218
313
}
219
314
}
0 commit comments