@@ -19,7 +19,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
19
19
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
20
20
; GCN-NEXT: s_nop 1
21
21
; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
22
- ; GCN-NEXT: s_nop 6
22
+ ; GCN-NEXT: s_nop 7
23
23
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
24
24
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
25
25
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -39,7 +39,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
39
39
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
40
40
; GCN-NEXT: s_nop 1
41
41
; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
42
- ; GCN-NEXT: s_nop 6
42
+ ; GCN-NEXT: s_nop 7
43
43
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
44
44
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
45
45
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -67,7 +67,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
67
67
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
68
68
; SDAG-NEXT: s_nop 1
69
69
; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
70
- ; SDAG-NEXT: s_nop 6
70
+ ; SDAG-NEXT: s_nop 7
71
71
; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
72
72
; SDAG-NEXT: s_endpgm
73
73
;
@@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
88
88
; GISEL-NEXT: s_nop 1
89
89
; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
90
90
; GISEL-NEXT: v_mov_b32_e32 v0, 0
91
- ; GISEL-NEXT: s_nop 5
91
+ ; GISEL-NEXT: s_nop 6
92
92
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
93
93
; GISEL-NEXT: s_endpgm
94
94
%result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.f16 (<8 x half > %arg0 , <8 x half > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 0 )
@@ -114,7 +114,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
114
114
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
115
115
; SDAG-NEXT: s_nop 1
116
116
; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
117
- ; SDAG-NEXT: s_nop 6
117
+ ; SDAG-NEXT: s_nop 7
118
118
; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
119
119
; SDAG-NEXT: s_endpgm
120
120
;
@@ -135,7 +135,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
135
135
; GISEL-NEXT: s_nop 1
136
136
; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
137
137
; GISEL-NEXT: v_mov_b32_e32 v0, 0
138
- ; GISEL-NEXT: s_nop 5
138
+ ; GISEL-NEXT: s_nop 6
139
139
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
140
140
; GISEL-NEXT: s_endpgm
141
141
%result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.f16 (<8 x half > %arg0 , <8 x half > %arg1 , <4 x float > %arg2 , i32 3 , i32 2 , i32 1 )
@@ -186,7 +186,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
186
186
; SDAG-NEXT: v_mov_b32_e32 v9, s17
187
187
; SDAG-NEXT: v_mov_b32_e32 v10, s18
188
188
; SDAG-NEXT: v_mov_b32_e32 v11, s19
189
- ; SDAG-NEXT: s_nop 3
189
+ ; SDAG-NEXT: s_nop 4
190
190
; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
191
191
; SDAG-NEXT: s_waitcnt vmcnt(0)
192
192
; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -253,7 +253,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
253
253
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
254
254
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
255
255
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
256
- ; GISEL-NEXT: s_nop 3
256
+ ; GISEL-NEXT: s_nop 4
257
257
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
258
258
; GISEL-NEXT: s_waitcnt vmcnt(0)
259
259
; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -316,7 +316,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
316
316
; SDAG-NEXT: v_mov_b32_e32 v9, s17
317
317
; SDAG-NEXT: v_mov_b32_e32 v10, s18
318
318
; SDAG-NEXT: v_mov_b32_e32 v11, s19
319
- ; SDAG-NEXT: s_nop 3
319
+ ; SDAG-NEXT: s_nop 4
320
320
; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
321
321
; SDAG-NEXT: s_waitcnt vmcnt(0)
322
322
; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -383,7 +383,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
383
383
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
384
384
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
385
385
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
386
- ; GISEL-NEXT: s_nop 3
386
+ ; GISEL-NEXT: s_nop 4
387
387
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
388
388
; GISEL-NEXT: s_waitcnt vmcnt(0)
389
389
; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -430,7 +430,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
430
430
; GCN-NEXT: s_nop 1
431
431
; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
432
432
; GCN-NEXT: s_nop 7
433
- ; GCN-NEXT: s_nop 2
433
+ ; GCN-NEXT: s_nop 3
434
434
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
435
435
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
436
436
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -475,7 +475,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
475
475
; GCN-NEXT: s_nop 1
476
476
; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
477
477
; GCN-NEXT: s_nop 7
478
- ; GCN-NEXT: s_nop 2
478
+ ; GCN-NEXT: s_nop 3
479
479
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
480
480
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
481
481
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -776,7 +776,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
776
776
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
777
777
; SDAG-NEXT: v_mov_b32_e32 v0, 0
778
778
; SDAG-NEXT: s_nop 7
779
- ; SDAG-NEXT: s_nop 1
779
+ ; SDAG-NEXT: s_nop 2
780
780
; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
781
781
; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
782
782
; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -813,7 +813,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
813
813
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
814
814
; GISEL-NEXT: v_mov_b32_e32 v0, 0
815
815
; GISEL-NEXT: s_nop 7
816
- ; GISEL-NEXT: s_nop 1
816
+ ; GISEL-NEXT: s_nop 2
817
817
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
818
818
; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
819
819
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -855,7 +855,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
855
855
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
856
856
; SDAG-NEXT: v_mov_b32_e32 v0, 0
857
857
; SDAG-NEXT: s_nop 7
858
- ; SDAG-NEXT: s_nop 1
858
+ ; SDAG-NEXT: s_nop 2
859
859
; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
860
860
; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
861
861
; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -892,7 +892,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
892
892
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
893
893
; GISEL-NEXT: v_mov_b32_e32 v0, 0
894
894
; GISEL-NEXT: s_nop 7
895
- ; GISEL-NEXT: s_nop 1
895
+ ; GISEL-NEXT: s_nop 2
896
896
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
897
897
; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
898
898
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -919,7 +919,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
919
919
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
920
920
; GCN-NEXT: s_nop 1
921
921
; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
922
- ; GCN-NEXT: s_nop 6
922
+ ; GCN-NEXT: s_nop 7
923
923
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
924
924
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
925
925
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -939,7 +939,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a
939
939
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
940
940
; GCN-NEXT: s_nop 1
941
941
; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
942
- ; GCN-NEXT: s_nop 6
942
+ ; GCN-NEXT: s_nop 7
943
943
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
944
944
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
945
945
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -971,7 +971,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
971
971
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
972
972
; SDAG-NEXT: s_nop 1
973
973
; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
974
- ; SDAG-NEXT: s_nop 6
974
+ ; SDAG-NEXT: s_nop 7
975
975
; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
976
976
; SDAG-NEXT: s_endpgm
977
977
;
@@ -992,7 +992,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
992
992
; GISEL-NEXT: s_nop 1
993
993
; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
994
994
; GISEL-NEXT: v_mov_b32_e32 v0, 0
995
- ; GISEL-NEXT: s_nop 5
995
+ ; GISEL-NEXT: s_nop 6
996
996
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
997
997
; GISEL-NEXT: s_endpgm
998
998
%result = call <4 x i32 > @llvm.amdgcn.mfma.i32.16x16x64.i8 (<4 x i32 > %arg0 , <4 x i32 > %arg1 , <4 x i32 > %arg2 , i32 0 , i32 0 , i32 0 )
@@ -1022,7 +1022,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
1022
1022
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
1023
1023
; SDAG-NEXT: s_nop 1
1024
1024
; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
1025
- ; SDAG-NEXT: s_nop 6
1025
+ ; SDAG-NEXT: s_nop 7
1026
1026
; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
1027
1027
; SDAG-NEXT: s_endpgm
1028
1028
;
@@ -1043,7 +1043,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
1043
1043
; GISEL-NEXT: s_nop 1
1044
1044
; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
1045
1045
; GISEL-NEXT: v_mov_b32_e32 v0, 0
1046
- ; GISEL-NEXT: s_nop 5
1046
+ ; GISEL-NEXT: s_nop 6
1047
1047
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
1048
1048
; GISEL-NEXT: s_endpgm
1049
1049
%result = call <4 x i32 > @llvm.amdgcn.mfma.i32.16x16x64.i8 (<4 x i32 > %arg0 , <4 x i32 > %arg1 , <4 x i32 > %arg2 , i32 3 , i32 2 , i32 1 )
@@ -1097,7 +1097,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
1097
1097
; SDAG-NEXT: v_mov_b32_e32 v1, s17
1098
1098
; SDAG-NEXT: v_mov_b32_e32 v2, s18
1099
1099
; SDAG-NEXT: v_mov_b32_e32 v3, s19
1100
- ; SDAG-NEXT: s_nop 6
1100
+ ; SDAG-NEXT: s_nop 7
1101
1101
; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
1102
1102
; SDAG-NEXT: s_waitcnt vmcnt(0)
1103
1103
; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
@@ -1169,7 +1169,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
1169
1169
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
1170
1170
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
1171
1171
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
1172
- ; GISEL-NEXT: s_nop 3
1172
+ ; GISEL-NEXT: s_nop 4
1173
1173
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
1174
1174
; GISEL-NEXT: s_waitcnt vmcnt(0)
1175
1175
; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -1233,7 +1233,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
1233
1233
; SDAG-NEXT: v_mov_b32_e32 v1, s17
1234
1234
; SDAG-NEXT: v_mov_b32_e32 v2, s18
1235
1235
; SDAG-NEXT: v_mov_b32_e32 v3, s19
1236
- ; SDAG-NEXT: s_nop 6
1236
+ ; SDAG-NEXT: s_nop 7
1237
1237
; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
1238
1238
; SDAG-NEXT: s_waitcnt vmcnt(0)
1239
1239
; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
@@ -1305,7 +1305,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
1305
1305
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
1306
1306
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
1307
1307
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
1308
- ; GISEL-NEXT: s_nop 3
1308
+ ; GISEL-NEXT: s_nop 4
1309
1309
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
1310
1310
; GISEL-NEXT: s_waitcnt vmcnt(0)
1311
1311
; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -1352,7 +1352,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
1352
1352
; GCN-NEXT: s_nop 1
1353
1353
; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
1354
1354
; GCN-NEXT: s_nop 7
1355
- ; GCN-NEXT: s_nop 2
1355
+ ; GCN-NEXT: s_nop 3
1356
1356
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1357
1357
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1358
1358
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -1397,7 +1397,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
1397
1397
; GCN-NEXT: s_nop 1
1398
1398
; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
1399
1399
; GCN-NEXT: s_nop 7
1400
- ; GCN-NEXT: s_nop 2
1400
+ ; GCN-NEXT: s_nop 3
1401
1401
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1402
1402
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1403
1403
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -1717,7 +1717,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
1717
1717
; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
1718
1718
; SDAG-NEXT: v_mov_b32_e32 v0, 0
1719
1719
; SDAG-NEXT: s_nop 7
1720
- ; SDAG-NEXT: s_nop 1
1720
+ ; SDAG-NEXT: s_nop 2
1721
1721
; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
1722
1722
; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
1723
1723
; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -1754,7 +1754,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
1754
1754
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
1755
1755
; GISEL-NEXT: v_mov_b32_e32 v0, 0
1756
1756
; GISEL-NEXT: s_nop 7
1757
- ; GISEL-NEXT: s_nop 1
1757
+ ; GISEL-NEXT: s_nop 2
1758
1758
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
1759
1759
; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
1760
1760
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -1801,7 +1801,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
1801
1801
; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
1802
1802
; SDAG-NEXT: v_mov_b32_e32 v0, 0
1803
1803
; SDAG-NEXT: s_nop 7
1804
- ; SDAG-NEXT: s_nop 1
1804
+ ; SDAG-NEXT: s_nop 2
1805
1805
; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
1806
1806
; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
1807
1807
; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -1838,7 +1838,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
1838
1838
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
1839
1839
; GISEL-NEXT: v_mov_b32_e32 v0, 0
1840
1840
; GISEL-NEXT: s_nop 7
1841
- ; GISEL-NEXT: s_nop 1
1841
+ ; GISEL-NEXT: s_nop 2
1842
1842
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
1843
1843
; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
1844
1844
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -1865,7 +1865,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
1865
1865
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
1866
1866
; GCN-NEXT: s_nop 1
1867
1867
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
1868
- ; GCN-NEXT: s_nop 6
1868
+ ; GCN-NEXT: s_nop 7
1869
1869
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1870
1870
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1871
1871
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -1885,7 +1885,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
1885
1885
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
1886
1886
; GCN-NEXT: s_nop 1
1887
1887
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
1888
- ; GCN-NEXT: s_nop 6
1888
+ ; GCN-NEXT: s_nop 7
1889
1889
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1890
1890
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1891
1891
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -1913,7 +1913,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
1913
1913
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
1914
1914
; GCN-NEXT: s_nop 1
1915
1915
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
1916
- ; GCN-NEXT: s_nop 6
1916
+ ; GCN-NEXT: s_nop 7
1917
1917
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
1918
1918
; GCN-NEXT: s_endpgm
1919
1919
%result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 0 )
@@ -1939,7 +1939,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
1939
1939
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
1940
1940
; GCN-NEXT: s_nop 1
1941
1941
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
1942
- ; GCN-NEXT: s_nop 6
1942
+ ; GCN-NEXT: s_nop 7
1943
1943
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
1944
1944
; GCN-NEXT: s_endpgm
1945
1945
%result = call <4 x float > @llvm.amdgcn.mfma.f32.16x16x32.bf16 (<8 x bfloat> %arg0 , <8 x bfloat> %arg1 , <4 x float > %arg2 , i32 3 , i32 2 , i32 1 )
0 commit comments