diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index b05c317231ad9..0a37ae70b5d99 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s -split-input-file -verify-diagnostics // ----- -func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) { +func.func @create_nd_tdesc_vc_1(%src: memref<24xf32>) { // expected-error@+1 {{Expecting the TensorDesc rank is up to 2 and not greater than the ranks of shape, strides, offsets or the memref source}} %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32> return @@ -9,49 +9,49 @@ func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) { // ----- -func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { +func.func @create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { // expected-error@+1 {{TensorDesc should have the same element type with the source if it is a memref}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16> return } // ----- -func.func @test_create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) { +func.func @create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{SLM is not supported for 2D block tensor}} %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> return } // ----- -func.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) { +func.func @create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{Memory space mismatch}} %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32> return } // ----- -func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { +func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> @@ -59,7 +59,7 @@ func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { } // ----- -func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) { +func.func @prefetch_nd_vc_2(%src: memref<24xf16>) { %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex> %1 = xegpu.create_tdesc %src, %0 : memref<24xf16>, vector<8xindex> -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>> @@ -70,7 +70,7 @@ func.func @test_prefetch_nd_vc_2(%src: 
memref<24xf16>) { } // ----- -func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) { +func.func @load_nd_vc_1(%src: memref<8x16xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}> @@ -79,7 +79,7 @@ func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) { } // ----- -func.func @test_load_nd_vc_2(%src: memref<16xf16>) { +func.func @load_nd_vc_2(%src: memref<16xf16>) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> %1 = xegpu.create_tdesc %src, %0 : memref<16xf16>, vector<8xindex> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> @@ -90,7 +90,7 @@ func.func @test_load_nd_vc_2(%src: memref<16xf16>) { } // ----- -func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) { +func.func @load_nd_vc_3(%src: memref<8x16xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> // expected-warning@+1 {{Invalid Packed Attr.}} %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> @@ -99,7 +99,7 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) { } // ----- -func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) { +func.func @load_nd_vc_4(%src: memref<24x32xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // expected-error@+1 {{Result shape [8, 1] is not consistent with tensor descriptor}} @@ -110,7 +110,7 @@ func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) { } // ----- -func.func @test_load_nd_layout(%src: memref<24x32xf32>) { +func.func @load_nd_layout(%src: memref<24x32xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> // expected-error@+1 {{Result shape [3] is not a valid distribution for tensor descriptor}} %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, @@ -119,7 +119,7 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) { } // ----- -func.func @test_load_nd_simt(%src: memref<24x32xf32>) { +func.func @load_nd_simt(%src: memref<24x32xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}} %2 = xegpu.load_nd %1 : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8xf32> @@ -127,7 +127,7 @@ func.func @test_load_nd_simt(%src: memref<24x32xf32>) { } // ----- -func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) { +func.func @store_nd_vc_1(%dst: memref<24x32xf16>) { %1 = arith.constant dense<1.0>: vector<24x32xf16> %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} @@ -136,7 +136,7 @@ func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) { } // ----- -func.func @test_store_nd_vc_2(%dst: memref<16xf16>) { +func.func @store_nd_vc_2(%dst: memref<16xf16>) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> %1 = arith.constant dense<1.0>: vector<8x2xf16> %2 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex> @@ -148,7 +148,7 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) { } // ----- -func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) { +func.func @store_nd_vc_3(%dst: memref<24x32xf16>) { %1 = arith.constant dense<1.0>: vector<2x24x32xf16> %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr> // 
expected-error@+1 {{array length is not supported by store_nd}} @@ -157,7 +157,7 @@ func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) { } // ----- -func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) { +func.func @store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) { %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> // expected-error@+1 {{Value shape [3] is not a valid distribution for tensor descriptor}} xegpu.store_nd %data, %1 : vector<3xf32>, !xegpu.tensor_desc<16xf32> @@ -165,7 +165,7 @@ func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) { } // ----- -func.func @test_store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) { +func.func @store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}} xegpu.store_nd %data, %1 : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -173,7 +173,7 @@ func.func @test_store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) { } // ----- -func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) { +func.func @store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) { %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // expected-error@+1 {{Value shape [8, 1] is not consistent with tensor descriptor}} @@ -182,7 +182,7 @@ func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) { } // ----- -func.func @test_update_nd_offset_1(%dst: memref<16xf16>) { +func.func @update_nd_offset_1(%dst: memref<16xf16>) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> %1 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> @@ -192,7 +192,7 @@ func.func @test_update_nd_offset_1(%dst: memref<16xf16>) { } // ----- -func.func @test_create_tdesc_vc_1(%src: ui64) { +func.func @create_tdesc_vc_1(%src: ui64) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> // expected-error@+1 {{Expects a scattered TensorDesc}} %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> -> !xegpu.tensor_desc<8xf16> @@ -200,7 +200,7 @@ func.func @test_create_tdesc_vc_1(%src: ui64) { } // ----- -func.func @test_create_tdesc_vc_2(%src: ui64) { +func.func @create_tdesc_vc_2(%src: ui64) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> // expected-error@+1 {{expected chunk blocks for 2D tensor}} @@ -209,7 +209,7 @@ func.func @test_create_tdesc_vc_2(%src: ui64) { } // ----- -func.func @test_create_tdesc_vc_3(%src: memref) { +func.func @create_tdesc_vc_3(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> // expected-error@+1 {{Memory space mismatch}} %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> @@ -218,7 +218,7 @@ func.func @test_create_tdesc_vc_3(%src: memref) { } // ----- -func.func @test_create_tdesc_vc_4(%src: memref) { +func.func @create_tdesc_vc_4(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> // expected-error@+1 {{invalid chunk size}} @@ -227,7 +227,7 @@ func.func @test_create_tdesc_vc_4(%src: memref) { } // ----- -func.func @test_create_tdesc_vc_5(%src: memref) { +func.func 
@create_tdesc_vc_5(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> // expected-error@+1 {{expected tensor shape[1] to match chunk size}} @@ -236,7 +236,7 @@ func.func @test_create_tdesc_vc_5(%src: memref) { } // ----- -func.func @test_create_tdesc_vc_6(%src: memref) { +func.func @create_tdesc_vc_6(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> // expected-error@+1 {{tensor shape[1] to be a multiple of packing factor 2}} @@ -246,7 +246,7 @@ func.func @test_create_tdesc_vc_6(%src: memref) { // ----- -func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) { +func.func @prefetch_vc_1(%src: memref<24x32xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> // expected-error@+1 {{Expects a scattered TensorDesc}} xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<24x32xf16> @@ -254,7 +254,7 @@ func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) { } // ----- -func.func @test_prefetch_vc_2(%src: ui64) { +func.func @prefetch_vc_2(%src: ui64) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> @@ -264,7 +264,7 @@ func.func @test_prefetch_vc_2(%src: ui64) { } // ----- -func.func @test_create_tdesc_layout_1(%src: ui64) { +func.func @create_tdesc_layout_1(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> // expected-error@+1 {{expected layout rank to match tensor rank}} %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> @@ -272,7 +272,7 @@ func.func @test_create_tdesc_layout_1(%src: ui64) { } // ----- -func.func @test_create_tdesc_layout_2(%src: ui64) { +func.func @create_tdesc_layout_2(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> // expected-error@+1 {{cannot map over non-contiguous scattered row elements}} %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> @@ -280,7 +280,7 @@ func.func @test_create_tdesc_layout_2(%src: ui64) { } // ----- -func.func @test_create_tdesc_layout_3(%src: ui64) { +func.func @create_tdesc_layout_3(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> // expected-error@+1 {{work item data mapping must match the number of contiguous elements}} %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> @@ -288,7 +288,7 @@ func.func @test_create_tdesc_layout_3(%src: ui64) { } // ----- -func.func @test_load_gather_simt_1(%src: ui64) { +func.func @load_gather_simt_1(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> @@ -298,7 +298,7 @@ func.func @test_load_gather_simt_1(%src: ui64) { } // ----- -func.func @test_store_scatter_simt_1(%src: ui64) { +func.func @store_scatter_simt_1(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %val = arith.constant dense<2.9>: vector<6xf32> @@ -309,7 +309,7 @@ func.func @test_store_scatter_simt_1(%src: ui64) { } // ----- -func.func 
@test_load_gather_vc_1(%src: memref<24x32xf16>) { +func.func @load_gather_vc_1(%src: memref<24x32xf16>) { %0 = arith.constant dense<1>: vector<4xi1> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16> // expected-error@+1 {{Expects a scattered TensorDesc}} @@ -319,7 +319,7 @@ func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) { } // ----- -func.func @test_load_gather_vc_2(%src: ui64) { +func.func @load_gather_vc_2(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<1>: vector<4xi1> %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> @@ -332,7 +332,7 @@ func.func @test_load_gather_vc_2(%src: ui64) { } // ----- -func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) { +func.func @store_scatter_vc_1(%src: memref<24x32xf32>) { %0 = arith.constant dense<1>: vector<4xi1> %1 = arith.constant dense<2.9>: vector<4x2xf32> %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32> @@ -343,7 +343,7 @@ func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) { } // ----- -func.func @test_store_scatter_vc_2(%src: ui64) { +func.func @store_scatter_vc_2(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex> %0 = arith.constant dense<1>: vector<4xi1> %1 = arith.constant dense<2.9>: vector<4x2xf32> @@ -356,49 +356,49 @@ func.func @test_store_scatter_vc_2(%src: ui64) { } // ----- -func.func @test_dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) { +func.func @dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) { // expected-error@+1 {{K-dimension mismatch}} %1 = xegpu.dpas %a, %b : vector<8x8xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) { +func.func @dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) { // expected-error@+1 {{expecting lhs and result to be a 2D vector, and rhs to be either 2D or 3D (packed) vector}} %1 = xegpu.dpas %a, %b : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_3(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) { +func.func @dpas_3(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) { // expected-error@+1 {{K-dimension mismatch}} %1 = xegpu.dpas %a, %b : vector<8x8xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) { +func.func @dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) { // expected-error@+1 {{M-dimension mismatch}} %1 = xegpu.dpas %a, %b : vector<16x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_5(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) { +func.func @dpas_5(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) { // expected-error@+1 {{N-dimension mismatch}} %1 = xegpu.dpas %a, %b : vector<8x16xf16>, vector<8x8x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) { +func.func @dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) { // expected-error@+1 {{Expecting B operand to be a multiple of 32 bits}} %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<15xf16> -> vector<8xf32> return } // ----- -func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) { +func.func @atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) { %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> %1 = 
xegpu.create_tdesc %src, %0 : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same shape}} @@ -512,7 +512,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto } // ----- -func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) { +func.func @convert_layout_same_map(%a: vector<32x64xf16>) { // expected-error@+1 {{expected different srcMap and resMap}} %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, resMap = #xegpu.layout} : vector<32x64xf16> @@ -520,7 +520,7 @@ func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) { } // ----- -func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) { +func.func @convert_layout_unmatch(%a: vector<32x64xf16>) { // expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}} %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, resMap = #xegpu.layout} : vector<32x64xf16> diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir new file mode 100644 index 0000000000000..7f3ebec225cdf --- /dev/null +++ b/mlir/test/Dialect/XeGPU/layout.mlir @@ -0,0 +1,49 @@ +// RUN: mlir-opt %s | FileCheck %s +// Verify the printed output can be parsed. +// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// Verify the generic form can be parsed. +// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s + +// CHECK-LABEL: gpu.module @test { +gpu.module @test { +// CHECK: gpu.func @create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) { +gpu.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + gpu.return +} + +// CHECK: gpu.func @create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) { +gpu.func @create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + gpu.return +} + +// CHECK: gpu.func @create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) { +gpu.func @create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + gpu.return +} + +// CHECK: gpu.func @create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + gpu.return +} + +gpu.func @convert_layout(%a: vector<32x64xf16>) { + %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, + resMap = #xegpu.layout} : vector<32x64xf16> + gpu.return +} + +gpu.func @convert_layout_wg(%a: vector<32x64xf16>) { + %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, + resMap = #xegpu.layout} : vector<32x64xf16> + gpu.return +} + +} diff --git 
a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 76af59d6aedc7..054c4d12fdb28 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -6,23 +6,15 @@ // CHECK-LABEL: gpu.module @test { gpu.module @test { -// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) { +// CHECK: gpu.func @create_nd_tdesc_1(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> - !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) { -gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { +// CHECK: gpu.func @create_nd_tdesc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) { +gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { //CHECK: %[[C:.*]] = arith.constant 1 : index %c1 = arith.constant 1 : index // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32> @@ -30,94 +22,41 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) { -gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { - //CHECK: %[[C:.*]] = arith.constant 1 : index - %c1 = arith.constant 1 : index - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) { +// CHECK: gpu.func @create_nd_tdesc_3(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : 
memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_nd_tdesc_vc_4(%[[arg0:.*]]: memref<2x24x32xf32>) { -gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) { +// CHECK: gpu.func @create_nd_tdesc_4(%[[arg0:.*]]: memref<2x24x32xf32>) { +gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) { -gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_nd_tdesc_vc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { -gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) { +// CHECK: gpu.func @create_nd_tdesc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { +gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { -gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_nd_tdesc_vc_6(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) { +// CHECK: gpu.func @create_nd_tdesc_6(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @create_nd_tdesc_6(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) { -gpu.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func 
@test_create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) { -gpu.func @test_create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) { -gpu.func @test_create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) { +// CHECK: gpu.func @prefetch_nd(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @prefetch_nd(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> @@ -125,17 +64,9 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) { gpu.return } -// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - gpu.return -} -// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) { -gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) { +// CHECK: func @subgroup_load_nd(%[[arg0:.*]]: memref<8x16xf16>) { +gpu.func @subgroup_load_nd(%src: memref<8x16xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> @@ -144,8 +75,8 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) { -gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) { +// CHECK: func @simt_load_nd(%[[arg0:.*]]: memref<8x16xf16>) { +gpu.func @simt_load_nd(%src: memref<8x16xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> @@ -154,8 +85,8 @@ gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) { gpu.return } -// CHECK: func 
@test_load_nd_vc_2(%[[arg0:.*]]: memref<8x16xf16>) { -gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) { +// CHECK: func @subgroup_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) { +gpu.func @subgroup_load_nd_2(%src: memref<8x16xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16> @@ -163,8 +94,8 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) { -gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) { +// CHECK: func @simt_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) { +gpu.func @simt_load_nd_2(%src: memref<8x16xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16> @@ -172,8 +103,8 @@ gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) { gpu.return } -// CHECK: func @test_load_nd_vc_3(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) { +// CHECK: func @subgroup_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @subgroup_load_nd_3(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> @@ -181,8 +112,8 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) { +// CHECK: func @simt_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @simt_load_nd_3(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> @@ -190,8 +121,8 @@ gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_vc_4(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) { +// CHECK: func @subgroup_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_load_nd_4(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> @@ -199,8 +130,8 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func 
@test_load_nd_simt_4(%src: memref<24x32xf16>) { +// CHECK: func @simt_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_load_nd_4(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> @@ -208,8 +139,8 @@ gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_vc_5(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) { +// CHECK: func @subgroup_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @subgroup_load_nd_5(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32> @@ -217,8 +148,8 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) { +// CHECK: func @simt_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @simt_load_nd_5(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32> @@ -226,8 +157,8 @@ gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_vc_6(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) { +// CHECK: func @subgroup_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_load_nd_6(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> @@ -235,8 +166,8 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) { +// CHECK: func @simt_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_load_nd_6(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> @@ -245,8 +176,8 @@ gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) { gpu.return } -// CHECK: 
func @test_load_nd_vc_7(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) { +// CHECK: func @subgroup_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_load_nd_7(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16x2xf16> @@ -254,8 +185,8 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) { +// CHECK: func @simt_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_load_nd_7(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> @@ -264,8 +195,8 @@ gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_vc_8(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) { +// CHECK: func @subgroup_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @subgroup_load_nd_8(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> @@ -273,8 +204,8 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) { +// CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32> @@ -282,8 +213,8 @@ gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { +// CHECK: func @subgroup_store_nd(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_store_nd(%dst: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16> %1 = arith.constant dense<1.0>: vector<24x32xf16> // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> @@ -293,8 +224,8 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { gpu.return 
} -// CHECK: func @test_store_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) { +// CHECK: func @simt_store_nd(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_store_nd(%src: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48xf16> %1 = arith.constant dense<1.0>: vector<48xf16> // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> @@ -306,8 +237,8 @@ gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) { -// CHECK: func @test_store_nd_vc_2(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) { +// CHECK: func @subgroup_store_nd_2(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_store_nd_2(%dst: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16> %1 = arith.constant dense<1.0>: vector<32xf16> // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> @@ -318,8 +249,8 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) { } -// CHECK: func @test_store_nd_simt_2(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) { +// CHECK: func @simt_store_nd_2(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_store_nd_2(%src: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16> %1 = arith.constant dense<1.0>: vector<2xf16> // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> @@ -329,8 +260,8 @@ gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) { gpu.return } -// CHECK: gpu.func @test_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) { +// CHECK: gpu.func @update_nd_tdesc(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32> @@ -338,17 +269,9 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) { gpu.return } -// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) { -gpu.func @test_create_tdesc_vc(%src: ui64) { +// CHECK: gpu.func @create_tdesc(%[[arg0:.*]]: ui64) { +gpu.func @create_tdesc(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> @@ -356,18 +279,9 @@ gpu.func @test_create_tdesc_vc(%src: ui64) { gpu.return } -// CHECK: gpu.func 
@test_create_tdesc_simt(%[[arg0:.*]]: ui64) { -gpu.func @test_create_tdesc_simt(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref) { -gpu.func @test_create_tdesc_vc_1(%src: memref) { +// CHECK: gpu.func @create_tdesc_1(%[[arg0:.*]]: memref) { +gpu.func @create_tdesc_1(%src: memref) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> @@ -375,18 +289,9 @@ gpu.func @test_create_tdesc_vc_1(%src: memref) { gpu.return } -// CHECK: gpu.func @test_create_tdesc_simt_1(%[[arg0:.*]]: memref) { -gpu.func @test_create_tdesc_simt_1(%src: memref) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_tdesc_vc_2(%[[arg0:.*]]: memref) { -gpu.func @test_create_tdesc_vc_2(%src: memref) { +// CHECK: gpu.func @create_tdesc_2(%[[arg0:.*]]: memref) { +gpu.func @create_tdesc_2(%src: memref) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<> @@ -394,17 +299,9 @@ gpu.func @test_create_tdesc_vc_2(%src: memref) { gpu.return } -// CHECK: gpu.func @test_create_tdesc_simt_2(%[[arg0:.*]]: memref) { -gpu.func @test_create_tdesc_simt_2(%src: memref) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<> - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_tdesc_vc_3(%[[arg0:.*]]: ui64) { -gpu.func @test_create_tdesc_vc_3(%src: ui64) { +// CHECK: gpu.func @create_tdesc_3(%[[arg0:.*]]: ui64) { +gpu.func @create_tdesc_3(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> @@ -413,17 +310,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) { } -// CHECK: gpu.func @test_create_tdesc_simt_3(%arg0: ui64) { -gpu.func @test_create_tdesc_simt_3(%src: ui64) { - //CHECK: %[[cst:.*]] = 
arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_load_vc(%[[arg0:.*]]: ui64) { -gpu.func @test_load_vc(%src: ui64) { +// CHECK: gpu.func @subgroup_load(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_load(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -435,8 +323,8 @@ gpu.func @test_load_vc(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_simt(%[[arg0:.*]]: ui64) { -gpu.func @test_load_simt(%src: ui64) { +// CHECK: gpu.func @simt_load(%[[arg0:.*]]: ui64) { +gpu.func @simt_load(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -448,8 +336,8 @@ gpu.func @test_load_simt(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_vc_2(%[[arg0:.*]]: ui64) { -gpu.func @test_load_vc_2(%src: ui64) { +// CHECK: gpu.func @subgroup_load_2(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_load_2(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -461,8 +349,8 @@ gpu.func @test_load_vc_2(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_simt_2(%[[arg0:.*]]: ui64) { -gpu.func @test_load_simt_2(%src: ui64) { +// CHECK: gpu.func @simt_load_2(%[[arg0:.*]]: ui64) { +gpu.func @simt_load_2(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -474,8 +362,8 @@ gpu.func @test_load_simt_2(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_vc_3(%[[arg0:.*]]: ui64) { -gpu.func @test_load_vc_3(%src: ui64) { +// CHECK: gpu.func @subgroup_load_3(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_load_3(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -487,8 +375,8 @@ gpu.func @test_load_vc_3(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_simt_3(%[[arg0:.*]]: ui64) { -gpu.func @test_load_simt_3(%src: ui64) { +// CHECK: gpu.func @simt_load_3(%[[arg0:.*]]: ui64) { +gpu.func @simt_load_3(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -500,8 +388,8 @@ gpu.func @test_load_simt_3(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_store_vc(%[[arg0:.*]]: ui64) { -gpu.func @test_store_vc(%src: ui64) { +// CHECK: gpu.func @subgroup_store(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_store(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: 
%[[cst1:.*]] = arith.constant dense : vector<4xi1>
@@ -517,8 +405,8 @@ gpu.func @test_store_vc(%src: ui64) {
-// CHECK: gpu.func @test_store_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_simt(%src: ui64) {
+// CHECK: gpu.func @simt_store(%[[arg0:.*]]: ui64) {
+gpu.func @simt_store(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1>
@@ -532,8 +420,8 @@ gpu.func @test_store_simt(%src: ui64) {
gpu.return
}
-// CHECK: gpu.func @test_store_vc_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_vc_2(%src: ui64) {
+// CHECK: gpu.func @subgroup_store_2(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_store_2(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1>
@@ -549,8 +437,8 @@ gpu.func @test_store_vc_2(%src: ui64) {
-// CHECK: gpu.func @test_store_simt_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_simt_2(%src: ui64) {
+// CHECK: gpu.func @simt_store_2(%[[arg0:.*]]: ui64) {
+gpu.func @simt_store_2(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1>
@@ -564,8 +452,8 @@ gpu.func @test_store_simt_2(%src: ui64) {
gpu.return
}
-// CHECK: gpu.func @test_store_vc_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_vc_3(%src: ui64) {
+// CHECK: gpu.func @subgroup_store_3(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_store_3(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1>
@@ -580,8 +468,8 @@ gpu.func @test_store_vc_3(%src: ui64) {
}
-// CHECK: gpu.func @test_store_simt_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_simt_3(%src: ui64) {
+// CHECK: gpu.func @simt_store_3(%[[arg0:.*]]: ui64) {
+gpu.func @simt_store_3(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1>
@@ -595,20 +483,8 @@ gpu.func @test_store_simt_3(%src: ui64) {
gpu.return
}
-// CHECK: gpu.func @test_prefetch_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_prefetch_simt(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>
- // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>
- gpu.return
-}
-
-
-// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_prefetch_vc(%src: ui64) {
+// CHECK: gpu.func @prefetch(%[[arg0:.*]]: ui64) {
+gpu.func @prefetch(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
@@ -618,21 +494,9 @@ gpu.func @test_prefetch_vc(%src: ui64) {
gpu.return
}
-// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_update_tdesc_simt(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>
- //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
- //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>
- %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xindex>
- gpu.return
-}
-// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_update_tdesc_vc(%src: ui64) {
+// CHECK: gpu.func @create_update_tdesc(%[[arg0:.*]]: ui64) {
+gpu.func @create_update_tdesc(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>
//CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
@@ -644,29 +508,29 @@ gpu.func @test_create_update_tdesc_vc(%src: ui64) {
gpu.return
}
-// CHECK: gpu.func @test_dpas_vc(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
-gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
+// CHECK: gpu.func @subgroup_dpas(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
+gpu.func @subgroup_dpas(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
// CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
%1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
gpu.return
}
-// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>)
-gpu.func @test_dpas_simt(%a : vector<8xf16>, %b: vector<16xf16>) {
+// CHECK: gpu.func @simt_dpas(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>)
+gpu.func @simt_dpas(%a : vector<8xf16>, %b: vector<16xf16>) {
// CHECK: xegpu.dpas %[[arg0]], %[[arg1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
%1 = xegpu.dpas %a, %b : vector<8xf16>, vector<16xf16> -> vector<8xf32>
gpu.return
}
-// CHECK: gpu.func @test_dpas_vc_with_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
-gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
+// CHECK: gpu.func @subgroup_dpas_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
+gpu.func @subgroup_dpas_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
// CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
%1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
gpu.return
}
-// CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
-gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
+// CHECK: gpu.func @subgroup_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
+gpu.func @subgroup_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
//CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
%c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
@@ -715,23 +579,4 @@ gpu.func @fence() {
gpu.return
}
-// CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout>
- gpu.return
-}
-
-gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
- resMap = #xegpu.layout} : vector<32x64xf16>
- gpu.return
-}
-
-gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
- resMap = #xegpu.layout} : vector<32x64xf16>
- gpu.return
-}
-
}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
index c7c82fc8dbb3c..35ac39d074c70 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
@@ -1,6 +1,6 @@
// RUN: mlir-opt -xegpu-subgroup-distribute='print-analysis-only=true' -split-input-file %s | FileCheck %s
-// CHECK: function: test_dpas_f16:
+// CHECK: function: dpas_f16:
// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0
// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1
@@ -23,7 +23,7 @@
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -38,7 +38,7 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg
// -----
-// CHECK: function: test_dpas_i8:
+// CHECK: function: dpas_i8:
// CHECK-NEXT: argument: of type 'vector<8x32xi8>' at index: 0
// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: argument: of type 'vector<32x16xi8>' at index: 1
@@ -51,7 +51,7 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
+func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
%1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
@@ -60,7 +60,7 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2:
}
// -----
-// CHECK: function: test_load_with_transpose_effect:
+// CHECK: function: load_with_transpose_effect:
// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0
// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1
@@ -83,7 +83,7 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2:
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -97,7 +97,7 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
}
// -----
-// CHECK: function: test_vector_transpose:
+// CHECK: function: vector_transpose:
// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0
// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1
@@ -122,7 +122,7 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -137,7 +137,7 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1
}
// -----
-// CHECK: function: test_extf_truncf:
+// CHECK: function: extf_truncf:
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -152,7 +152,7 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
// CHECK-NEXT: layout for result #0: Not assigned.
-func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
+func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32>
@@ -162,7 +162,7 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
}
// -----
-// CHECK: function: test_load_gather_with_transpose_effect:
+// CHECK: function: load_gather_with_transpose_effect:
// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0
// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type 'memref<256xf16>' at index: 1
@@ -187,7 +187,7 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
+func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
%1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
@@ -202,7 +202,7 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1
}
// -----
-// CHECK: function: test_load_gather_1d:
+// CHECK: function: load_gather_1d:
// CHECK: argument: of type 'memref<256xf32>' at index: 0
// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1
@@ -215,7 +215,7 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1
// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T1]] = xegpu.load %[[T0]], %[[CST0]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
%cst_0 = arith.constant dense : vector<16xi1>
%0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
@@ -225,7 +225,7 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc
}
// -----
-// CHECK: function: test_store_scatter_with_transpose_effect:
+// CHECK: function: store_scatter_with_transpose_effect:
// CHECK-NEXT: argument: of type 'memref<128xf32>' at index: 0
// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32>
@@ -236,7 +236,7 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc
// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1]
-func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
+func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
%cst = arith.constant dense<1.000000e+00> : vector<8x16xf32>
%cst_0 = arith.constant dense : vector<16xi1>
%cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
@@ -246,7 +246,7 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
}
// -----
-// CHECK: function: test_store_scatter_1d:
+// CHECK: function: store_scatter_1d:
// CHECK-NEXT: argument: of type 'vector<16xf32>' at index: 0
// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: argument: of type 'memref<256xf32>' at index: 1
@@ -257,7 +257,7 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
+func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
%cst_0 = arith.constant dense : vector<16xi1>
%0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
@@ -266,7 +266,7 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>)
}
// -----
-// CHECK: function: test_vector_bitcast_i16_to_i8:
+// CHECK: function: vector_bitcast_i16_to_i8:
// CHECK-NEXT: argument: of type 'memref<8x16xi16>' at index: 0
// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type 'memref<32x16xi8>' at index: 1
@@ -289,7 +289,7 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>)
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
+func.func @vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
%1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8>
@@ -303,7 +303,7 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<
}
// -----
-// CHECK: function: test_vector_bitcast_i8_to_f16:
+// CHECK: function: vector_bitcast_i8_to_f16:
// CHECK-NEXT: argument: of type 'memref<8x32xi8>' at index: 0
// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type 'memref<16x32xi8>' at index: 1
@@ -328,7 +328,7 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
+func.func @vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
%1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8>
@@ -343,7 +343,7 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
}
// -----
-// CHECK: function: test_binary_op_one_use:
+// CHECK: function: binary_op_one_use:
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -360,7 +360,7 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
+func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%2 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -371,7 +371,7 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
}
// -----
-// CHECK: function: test_binary_op_multiple_uses:
+// CHECK: function: binary_op_multiple_uses:
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -390,7 +390,7 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
+func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%cst = arith.constant dense<1.000000e+00> : vector<16x16xf16>
@@ -402,7 +402,7 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
}
// -----
-// CHECK: function: test_for_op:
+// CHECK: function: for_op:
// CHECK-NEXT: argument: of type 'memref<8x128xf16>' at index: 0
// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type 'memref<128x16xf16>' at index: 1
@@ -437,7 +437,7 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
// CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
@@ -458,7 +458,7 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
}
// -----
-// CHECK: function: test_if_single_use:
+// CHECK: function: if_single_use:
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -477,7 +477,7 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
+func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = scf.if %arg2 -> (vector<16x16xf16>) {
%3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -492,7 +492,7 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
}
// -----
-// CHECK: function: test_if_multiple_uses:
+// CHECK: function: if_multiple_uses:
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -513,7 +513,7 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
+func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = scf.if %arg2 -> (vector<16x16xf16>) {
%3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -529,7 +529,7 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
}
// -----
-// CHECK: function: test_vector_outer_reduction:
+// CHECK: function: vector_outer_reduction:
// CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0
// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1
@@ -538,7 +538,7 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction , %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction , %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
@@ -546,7 +546,7 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
}
// -----
-// CHECK: function: test_vector_inner_reduction:
+// CHECK: function: vector_inner_reduction:
// CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0
// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1
@@ -555,7 +555,7 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction , %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32>
// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction , %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>
xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 8e3673d04eacb..67d3bd9b393c0 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -4,7 +4,7 @@
#b = #xegpu.layout
#c = #xegpu.layout
gpu.module @test_kernel {
- gpu.func @test_gemm_with_one_to_n_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+ gpu.func @gemm_with_one_to_n_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c32 = arith.constant 32 : index
@@ -45,7 +45,7 @@ gpu.module @test_kernel {
#l1 = #xegpu.layout
#l2 = #xegpu.layout
gpu.module @test_kernel {
- gpu.func @test_gemm_with_inst_data_only_attribute(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+ gpu.func @gemm_with_inst_data_only_attribute(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c32 = arith.constant 32 : index
@@ -86,7 +86,7 @@ gpu.module @test_kernel {
#l1 = #xegpu.layout
#l2 = #xegpu.layout
gpu.module @test_kernel {
- gpu.func @test_gemm_with_one_to_one_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+ gpu.func @gemm_with_one_to_one_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
@@ -130,7 +130,7 @@ gpu.module @test_kernel {
#b = #xegpu.layout
#c = #xegpu.layout
gpu.module @test_kernel {
- gpu.func @test_gemm_with_elemwise_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+ gpu.func @gemm_with_elemwise_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c32 = arith.constant 32 : index
@@ -172,7 +172,7 @@ gpu.module @test_kernel {
// -----
#l = #xegpu.layout
gpu.module @test_kernel {
- gpu.func @test_elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+ gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1024 = arith.constant 1024 : index
@@ -211,7 +211,7 @@ gpu.module @test_kernel {
// -----
#l = #xegpu.layout
gpu.module @test_kernel {
- gpu.func @test_elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+ gpu.func @elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1024 = arith.constant 1024 : index
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 35ad16d8cd9a9..c6124f90e0f48 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -1,9 +1,9 @@
// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
gpu.module @test_round_robin_assignment {
- // CHECK-LABEL: test_create_nd_tdesc
+ // CHECK-LABEL: create_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
- gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
+ gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
// CHECK-COUNT-12: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<24x32xf32>
// CHECK-SAME: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout>
// CHECK-NOT: xegpu.create_nd_tdesc
@@ -12,9 +12,9 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
- // CHECK-LABEL: test_load_nd_tdesc
+ // CHECK-LABEL: load_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
- gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) {
+ gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) {
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout>
// CHECK-COUNT-12: xegpu.load_nd %{{.*}}
@@ -27,9 +27,9 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
- // CHECK-LABEL: test_store_nd
+ // CHECK-LABEL: store_nd
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
- gpu.func @test_store_nd(%src: memref<24x32xf32>) {
+ gpu.func @store_nd(%src: memref<24x32xf32>) {
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout>
// CHECK-COUNT-12: xegpu.store_nd %{{.*}}, %{{.*}}
@@ -43,9 +43,9 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
- // CHECK-LABEL: test_update_nd
+ // CHECK-LABEL: update_nd
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
- gpu.func @test_update_nd(%src: memref<24x32xf32>){
+ gpu.func @update_nd(%src: memref<24x32xf32>){
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout>
// CHECK-COUNT-12: xegpu.update_nd_offset %{{.*}}, [0, 16]
@@ -56,9 +56,9 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
- // CHECK-LABEL: test_dpas
+ // CHECK-LABEL: dpas
// CHECK-SAME: (%[[ARG_0:.*]]: memref<8x8xf32>, %[[ARG_1:.*]]: memref<8x8xf32>, %[[ARG_2:.*]]: memref<8x8xf32>)
- gpu.func @test_dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) {
+ gpu.func @dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) {
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<8x8xf32>
// CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout>
// CHECK-NOT: xegpu.create_nd_tdesc
@@ -90,9 +90,9 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
- // CHECK-LABEL: test_prefetch_nd_tdesc
+ // CHECK-LABEL: prefetch_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
- gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
+ gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
// CHECK-COUNT-12: xegpu.prefetch_nd %{{.*}}
// CHECK-SAME-COUNT-12 : !xegpu.tensor_desc<2x2xf32, #xegpu.layout>
// CHECK-NOT: xegpu.prefetch_nd
@@ -103,7 +103,7 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
- gpu.func @test_scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+ gpu.func @scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
%c1 = arith.constant 1 : index
%c10 = arith.constant 10 : index
%c0 = arith.constant 0 : index
@@ -126,7 +126,7 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
- gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+ gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
%c1_i32 = arith.constant 1 : i32
%c10_i32 = arith.constant 10 : i32
%c0_i32 = arith.constant 0 : i32
@@ -150,7 +150,7 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
- gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+ gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
%c10 = arith.constant 10 : index
%0 = gpu.subgroup_id : index
%1 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout>
@@ -173,7 +173,7 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
- gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+ gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
%c10 = arith.constant 10 : index
%id = gpu.subgroup_id : index
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 466842c968448..44b11c304cc80 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -3,9 +3,9 @@
//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)>
//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)>
gpu.module @test_1_1_assignment {
- // CHECK-LABEL: test_create_nd_tdesc
+ // CHECK-LABEL: create_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
- gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
+ gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
// CHECK: %[[SGID:.*]] = gpu.subgroup_id
// CHECK: %[[C12:.*]] = arith.constant 12 : index
// CHECK: %[[C4:.*]] = arith.constant 4 : index
@@ -30,9 +30,9 @@ gpu.module @test_1_1_assignment {
gpu.return
}
- // CHECK-LABEL: test_load_nd_tdesc
+ // CHECK-LABEL: load_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
- gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) {
+ gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) {
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout>
// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
@@ -46,9 +46,9 @@ gpu.module @test_1_1_assignment {
gpu.return
}
- // CHECK-LABEL: test_store_nd
+ // CHECK-LABEL: store_nd
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
- gpu.func @test_store_nd(%src: memref<24x32xf32>) {
+ gpu.func @store_nd(%src: memref<24x32xf32>) {
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout>
// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
@@ -66,9 +66,9 @@ gpu.module @test_1_1_assignment {
gpu.return
}
-// CHECK-LABEL: test_update_nd
+// CHECK-LABEL: update_nd
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-gpu.func @test_update_nd(%src: memref<24x32xf32>){
+gpu.func @update_nd(%src: memref<24x32xf32>){
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout>
// CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16]
@@ -80,10 +80,10 @@ gpu.func @test_update_nd(%src: memref<24x32xf32>){
gpu.return
}
-// CHECK-LABEL: test_dpas
+// CHECK-LABEL: dpas
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
// CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
-gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
+gpu.func @dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
// CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
// CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout>
// CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
@@ -114,10 +114,10 @@ gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
}
-// CHECK-LABEL: test_dpas_no_sg_data
+// CHECK-LABEL: dpas_no_sg_data
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
// CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
-gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
+gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
// CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
// CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout>
// CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
@@ -147,9 +147,9 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
gpu.return
}
- // CHECK-LABEL: test_prefetch_nd_tdesc
+ // CHECK-LABEL: prefetch_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
- gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
+ gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout>
// CHECK: xegpu.prefetch_nd %[[TDESC]]
@@ -161,8 +161,8 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
gpu.return
}
- // CHECK-LABEL: test_dpas_with_no_create_nd_desc
- gpu.func @test_dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) {
+ // CHECK-LABEL: dpas_with_no_create_nd_desc
+ gpu.func @dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) {
// CHECK-NOT: vector<12x12xf32>
%dpas = xegpu.dpas %a, %b {layout = #xegpu.layout}
@@ -170,7 +170,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
gpu.return
}
- gpu.func @test_scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
+ gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
//CHECK: [[c0:%.+]] = arith.constant 0 : index
//CHECK: [[c128:%.+]] = arith.constant 128 : index
//CHECK: [[c1024:%.+]] = arith.constant 1024 : index
@@ -213,7 +213,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
gpu.return
}
- gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+ gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
%c1_i32 = arith.constant 1 : i32
%c10_i32 = arith.constant 10 : i32
%c0_i32 = arith.constant 0 : i32
@@ -238,7 +238,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
gpu.return
}
- gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+ gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
%c10 = arith.constant 10 : index
%id = gpu.subgroup_id : index
@@ -267,7 +267,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
gpu.return
}
- gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+ gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
%c10 = arith.constant 10 : index
%id = gpu.subgroup_id : index