Skip to content

Commit db096ad

Browse files
authored
[AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization (#94133)
This optimization does not work correctly in divergent control flow. It can be replaced with a later exec mask manipulation optimizer. This reverts commit a3646ec.
1 parent 2fe7238 commit db096ad

File tree

5 files changed

+28
-82
lines changed

5 files changed

+28
-82
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
25192519
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
25202520
break;
25212521
}
2522-
case AMDGPU::ENTER_PSEUDO_WM:
2523-
case AMDGPU::EXIT_PSEUDO_WM: {
2524-
// These do nothing.
2525-
MI.eraseFromParent();
2526-
break;
2527-
}
25282522
case AMDGPU::SI_RETURN: {
25292523
const MachineFunction *MF = MBB.getParent();
25302524
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 0 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$m
217217
def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
218218
} // End usesCustomInserter = 1
219219

220-
// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
221-
def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
222-
let Uses = [EXEC];
223-
let Defs = [EXEC];
224-
let hasSideEffects = 0;
225-
let mayLoad = 0;
226-
let mayStore = 0;
227-
}
228-
229-
def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
230-
let hasSideEffects = 0;
231-
let mayLoad = 0;
232-
let mayStore = 0;
233-
}
234-
235220
// Pseudo instructions used for @llvm.fptrunc.round upward
236221
// and @llvm.fptrunc.round downward.
237222
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD

llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp

Lines changed: 4 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
165165

166166
unsigned Opc = MI.getOpcode();
167167

168-
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
169-
Opc == AMDGPU::ENTER_PSEUDO_WM) {
168+
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
170169
dbgs() << "Entering ";
171170
} else {
172-
assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
173-
Opc == AMDGPU::EXIT_PSEUDO_WM);
171+
assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
174172
dbgs() << "Exiting ";
175173
}
176174

177175
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
178176
dbgs() << "Strict WWM ";
179-
} else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
180-
dbgs() << "Pseudo WWM/WQM ";
181177
} else {
182178
assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
183179
dbgs() << "Strict WQM ";
@@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
230226
}
231227

232228
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
233-
MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
234-
MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
229+
MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
235230
LLVM_DEBUG(printWWMInfo(MI));
236231
InWWM = true;
237232
continue;
238233
}
239234

240235
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
241-
MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
242-
MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
236+
MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
243237
LLVM_DEBUG(printWWMInfo(MI));
244238
InWWM = false;
245239
}

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 0 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -215,8 +215,6 @@ class SIWholeQuadMode : public MachineFunctionPass {
215215
MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
216216
bool IsWQM);
217217
MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
218-
void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
219-
MachineInstr *Exit);
220218

221219
void lowerBlock(MachineBasicBlock &MBB);
222220
void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -1025,31 +1023,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
10251023
return NewTerm;
10261024
}
10271025

1028-
// Convert a strict mode transition to a pseudo transition.
1029-
// This still pre-allocates registers to prevent clobbering,
1030-
// but avoids any EXEC mask changes.
1031-
void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
1032-
MachineInstr *Entry,
1033-
MachineInstr *Exit) {
1034-
assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
1035-
assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
1036-
1037-
Register SaveOrig = Entry->getOperand(0).getReg();
1038-
1039-
MachineInstr *NewEntry =
1040-
BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
1041-
MachineInstr *NewExit =
1042-
BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
1043-
1044-
LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
1045-
Exit->eraseFromParent();
1046-
1047-
LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
1048-
Entry->eraseFromParent();
1049-
1050-
LIS->removeInterval(SaveOrig);
1051-
}
1052-
10531026
// Replace (or supplement) instructions accessing live mask.
10541027
// This can only happen once all the live mask registers have been created
10551028
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1066,12 +1039,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
10661039

10671040
SmallVector<MachineInstr *, 4> SplitPoints;
10681041
char State = BI.InitialState;
1069-
MachineInstr *StrictEntry = nullptr;
10701042

10711043
for (MachineInstr &MI : llvm::make_early_inc_range(
10721044
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1073-
char PreviousState = State;
1074-
10751045
if (StateTransition.count(&MI))
10761046
State = StateTransition[&MI];
10771047

@@ -1084,20 +1054,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
10841054
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10851055
SplitPoint = lowerKillF32(MBB, MI);
10861056
break;
1087-
case AMDGPU::ENTER_STRICT_WQM:
1088-
StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
1089-
break;
1090-
case AMDGPU::EXIT_STRICT_WQM:
1091-
if (State == StateWQM && StrictEntry) {
1092-
// Transition WQM -> StrictWQM -> WQM detected.
1093-
lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1094-
}
1095-
StrictEntry = nullptr;
1096-
break;
1097-
case AMDGPU::ENTER_STRICT_WWM:
1098-
case AMDGPU::EXIT_STRICT_WWM:
1099-
StrictEntry = nullptr;
1100-
break;
11011057
default:
11021058
break;
11031059
}
@@ -1251,11 +1207,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
12511207
}
12521208
LIS->InsertMachineInstrInMaps(*MI);
12531209
StateTransition[MI] = StrictStateNeeded;
1254-
1255-
// Mark block as needing lower so it will be checked for unnecessary transitions.
1256-
auto BII = Blocks.find(&MBB);
1257-
if (BII != Blocks.end())
1258-
BII->second.NeedsLowering = true;
12591210
}
12601211

12611212
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,

llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2872,18 +2872,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
28722872
; GFX9-W64: ; %bb.0: ; %main_body
28732873
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
28742874
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2875+
; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
2876+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
28752877
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2878+
; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
28762879
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
28772880
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
28782881
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
28792882
; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2
28802883
; GFX9-W64-NEXT: ; %bb.1: ; %IF
2884+
; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec
2885+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
28812886
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
28822887
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
28832888
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
28842889
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
28852890
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
28862891
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2892+
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
28872893
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
28882894
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
28892895
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -2896,18 +2902,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
28962902
; GFX10-W32: ; %bb.0: ; %main_body
28972903
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
28982904
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2905+
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2906+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
28992907
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2908+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
29002909
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
29012910
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
29022911
; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
29032912
; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2
29042913
; GFX10-W32-NEXT: ; %bb.1: ; %IF
2914+
; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo
2915+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
29052916
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
29062917
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
29072918
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
29082919
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
29092920
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
29102921
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2922+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
29112923
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
29122924
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
29132925
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -2955,7 +2967,10 @@ define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rs
29552967
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB51_2
29562968
; GFX9-W64-NEXT: ; %bb.1: ; %main_body
29572969
; GFX9-W64-NEXT: s_and_b64 exec, exec, vcc
2970+
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2971+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
29582972
; GFX9-W64-NEXT: ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
2973+
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
29592974
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
29602975
; GFX9-W64-NEXT: v_mov_b32_e32 v1, v3
29612976
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v1
@@ -2987,7 +3002,10 @@ define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rs
29873002
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB51_2
29883003
; GFX10-W32-NEXT: ; %bb.1: ; %main_body
29893004
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
3005+
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3006+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
29903007
; GFX10-W32-NEXT: ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
3008+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
29913009
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
29923010
; GFX10-W32-NEXT: v_mov_b32_e32 v1, v3
29933011
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v1
@@ -3275,8 +3293,10 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
32753293
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
32763294
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
32773295
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
3278-
; GFX9-W64-NEXT: s_nop 0
3296+
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
3297+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
32793298
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3299+
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
32803300
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
32813301
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
32823302
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3317,9 +3337,11 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
33173337
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
33183338
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
33193339
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3320-
; GFX10-W32-NEXT: s_clause 0x1
33213340
; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
3341+
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3342+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
33223343
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3344+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
33233345
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
33243346
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
33253347
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)