Skip to content

Commit 5d6e4c9

Browse files
committed
[AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization
This optimization does not work correctly in divergent control flow. It can be replaced by a later exec mask manipulation optimizer. This reverts commit a3646ec.
1 parent 0310f7f commit 5d6e4c9

File tree

5 files changed

+22
-82
lines changed

5 files changed

+22
-82
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
25192519
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
25202520
break;
25212521
}
2522-
case AMDGPU::ENTER_PSEUDO_WM:
2523-
case AMDGPU::EXIT_PSEUDO_WM: {
2524-
// These do nothing.
2525-
MI.eraseFromParent();
2526-
break;
2527-
}
25282522
case AMDGPU::SI_RETURN: {
25292523
const MachineFunction *MF = MBB.getParent();
25302524
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 0 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$m
217217
def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
218218
} // End usesCustomInserter = 1
219219

220-
// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
221-
def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
222-
let Uses = [EXEC];
223-
let Defs = [EXEC];
224-
let hasSideEffects = 0;
225-
let mayLoad = 0;
226-
let mayStore = 0;
227-
}
228-
229-
def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
230-
let hasSideEffects = 0;
231-
let mayLoad = 0;
232-
let mayStore = 0;
233-
}
234-
235220
// Pseudo instructions used for @llvm.fptrunc.round upward
236221
// and @llvm.fptrunc.round downward.
237222
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD

llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp

Lines changed: 4 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
165165

166166
unsigned Opc = MI.getOpcode();
167167

168-
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
169-
Opc == AMDGPU::ENTER_PSEUDO_WM) {
168+
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
170169
dbgs() << "Entering ";
171170
} else {
172-
assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
173-
Opc == AMDGPU::EXIT_PSEUDO_WM);
171+
assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
174172
dbgs() << "Exiting ";
175173
}
176174

177175
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
178176
dbgs() << "Strict WWM ";
179-
} else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
180-
dbgs() << "Pseudo WWM/WQM ";
181177
} else {
182178
assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
183179
dbgs() << "Strict WQM ";
@@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
230226
}
231227

232228
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
233-
MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
234-
MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
229+
MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
235230
LLVM_DEBUG(printWWMInfo(MI));
236231
InWWM = true;
237232
continue;
238233
}
239234

240235
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
241-
MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
242-
MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
236+
MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
243237
LLVM_DEBUG(printWWMInfo(MI));
244238
InWWM = false;
245239
}

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 0 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -214,8 +214,6 @@ class SIWholeQuadMode : public MachineFunctionPass {
214214
MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
215215
bool IsWQM);
216216
MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
217-
void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
218-
MachineInstr *Exit);
219217

220218
void lowerBlock(MachineBasicBlock &MBB);
221219
void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -1019,31 +1017,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
10191017
return NewTerm;
10201018
}
10211019

1022-
// Convert a strict mode transition to a pseudo transition.
1023-
// This still pre-allocates registers to prevent clobbering,
1024-
// but avoids any EXEC mask changes.
1025-
void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
1026-
MachineInstr *Entry,
1027-
MachineInstr *Exit) {
1028-
assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
1029-
assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
1030-
1031-
Register SaveOrig = Entry->getOperand(0).getReg();
1032-
1033-
MachineInstr *NewEntry =
1034-
BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
1035-
MachineInstr *NewExit =
1036-
BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
1037-
1038-
LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
1039-
Exit->eraseFromParent();
1040-
1041-
LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
1042-
Entry->eraseFromParent();
1043-
1044-
LIS->removeInterval(SaveOrig);
1045-
}
1046-
10471020
// Replace (or supplement) instructions accessing live mask.
10481021
// This can only happen once all the live mask registers have been created
10491022
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1060,12 +1033,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
10601033

10611034
SmallVector<MachineInstr *, 4> SplitPoints;
10621035
char State = BI.InitialState;
1063-
MachineInstr *StrictEntry = nullptr;
10641036

10651037
for (MachineInstr &MI : llvm::make_early_inc_range(
10661038
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1067-
char PreviousState = State;
1068-
10691039
if (StateTransition.count(&MI))
10701040
State = StateTransition[&MI];
10711041

@@ -1078,20 +1048,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
10781048
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10791049
SplitPoint = lowerKillF32(MBB, MI);
10801050
break;
1081-
case AMDGPU::ENTER_STRICT_WQM:
1082-
StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
1083-
break;
1084-
case AMDGPU::EXIT_STRICT_WQM:
1085-
if (State == StateWQM && StrictEntry) {
1086-
// Transition WQM -> StrictWQM -> WQM detected.
1087-
lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1088-
}
1089-
StrictEntry = nullptr;
1090-
break;
1091-
case AMDGPU::ENTER_STRICT_WWM:
1092-
case AMDGPU::EXIT_STRICT_WWM:
1093-
StrictEntry = nullptr;
1094-
break;
10951051
default:
10961052
break;
10971053
}
@@ -1245,11 +1201,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
12451201
}
12461202
LIS->InsertMachineInstrInMaps(*MI);
12471203
StateTransition[MI] = StrictStateNeeded;
1248-
1249-
// Mark block as needing lower so it will be checked for unnecessary transitions.
1250-
auto BII = Blocks.find(&MBB);
1251-
if (BII != Blocks.end())
1252-
BII->second.NeedsLowering = true;
12531204
}
12541205

12551206
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,

llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2872,18 +2872,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
28722872
; GFX9-W64: ; %bb.0: ; %main_body
28732873
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
28742874
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2875+
; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
2876+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
28752877
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2878+
; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
28762879
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
28772880
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
28782881
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
28792882
; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2
28802883
; GFX9-W64-NEXT: ; %bb.1: ; %IF
2884+
; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec
2885+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
28812886
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
28822887
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
28832888
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
28842889
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
28852890
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
28862891
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2892+
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
28872893
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
28882894
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
28892895
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -2896,18 +2902,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
28962902
; GFX10-W32: ; %bb.0: ; %main_body
28972903
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
28982904
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2905+
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2906+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
28992907
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2908+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
29002909
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
29012910
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
29022911
; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
29032912
; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2
29042913
; GFX10-W32-NEXT: ; %bb.1: ; %IF
2914+
; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo
2915+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
29052916
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
29062917
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
29072918
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
29082919
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
29092920
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
29102921
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2922+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
29112923
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
29122924
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
29132925
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -3192,8 +3204,10 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
31923204
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
31933205
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
31943206
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
3195-
; GFX9-W64-NEXT: s_nop 0
3207+
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
3208+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
31963209
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3210+
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
31973211
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
31983212
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
31993213
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3234,9 +3248,11 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
32343248
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
32353249
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
32363250
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3237-
; GFX10-W32-NEXT: s_clause 0x1
32383251
; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
3252+
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3253+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
32393254
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3255+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
32403256
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
32413257
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
32423258
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)