[AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization #94133
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Carl Ritson (perlfu)

Changes

This does not work correctly in divergent control flow. Can be replaced with a later exec mask manipulation optimizer. (An illustrative IR sketch of the failing shape follows the diff below.)

This reverts commit a3646ec.

Full diff: https://github.com/llvm/llvm-project/pull/94133.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d8e21da8019a7..0edcdb337b5af 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
- case AMDGPU::ENTER_PSEUDO_WM:
- case AMDGPU::EXIT_PSEUDO_WM: {
- // These do nothing.
- MI.eraseFromParent();
- break;
- }
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c32..89c96791bfd1c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$m
def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
} // End usesCustomInserter = 1
-// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
-def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
- let Uses = [EXEC];
- let Defs = [EXEC];
- let hasSideEffects = 0;
- let mayLoad = 0;
- let mayStore = 0;
-}
-
-def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
- let hasSideEffects = 0;
- let mayLoad = 0;
- let mayStore = 0;
-}
-
// Pseudo instructions used for @llvm.fptrunc.round upward
// and @llvm.fptrunc.round downward.
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 398f870a9f531..5837dbeb3f986 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
- Opc == AMDGPU::ENTER_PSEUDO_WM) {
+ if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
dbgs() << "Entering ";
} else {
- assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
- Opc == AMDGPU::EXIT_PSEUDO_WM);
+ assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Exiting ";
}
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
dbgs() << "Strict WWM ";
- } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
- dbgs() << "Pseudo WWM/WQM ";
} else {
assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Strict WQM ";
@@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
}
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
- MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
+ MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = true;
continue;
}
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
- MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
+ MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f3..e95f1924b45d9 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -214,8 +214,6 @@ class SIWholeQuadMode : public MachineFunctionPass {
MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
bool IsWQM);
MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
- void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
- MachineInstr *Exit);
void lowerBlock(MachineBasicBlock &MBB);
void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -1019,31 +1017,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
return NewTerm;
}
-// Convert a strict mode transition to a pseudo transition.
-// This still pre-allocates registers to prevent clobbering,
-// but avoids any EXEC mask changes.
-void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
- MachineInstr *Entry,
- MachineInstr *Exit) {
- assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
- assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
-
- Register SaveOrig = Entry->getOperand(0).getReg();
-
- MachineInstr *NewEntry =
- BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
- MachineInstr *NewExit =
- BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
-
- LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
- Exit->eraseFromParent();
-
- LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
- Entry->eraseFromParent();
-
- LIS->removeInterval(SaveOrig);
-}
-
// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1060,12 +1033,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
SmallVector<MachineInstr *, 4> SplitPoints;
char State = BI.InitialState;
- MachineInstr *StrictEntry = nullptr;
for (MachineInstr &MI : llvm::make_early_inc_range(
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
- char PreviousState = State;
-
if (StateTransition.count(&MI))
State = StateTransition[&MI];
@@ -1078,20 +1048,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
- case AMDGPU::ENTER_STRICT_WQM:
- StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
- break;
- case AMDGPU::EXIT_STRICT_WQM:
- if (State == StateWQM && StrictEntry) {
- // Transition WQM -> StrictWQM -> WQM detected.
- lowerPseudoStrictMode(MBB, StrictEntry, &MI);
- }
- StrictEntry = nullptr;
- break;
- case AMDGPU::ENTER_STRICT_WWM:
- case AMDGPU::EXIT_STRICT_WWM:
- StrictEntry = nullptr;
- break;
default:
break;
}
@@ -1245,11 +1201,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = StrictStateNeeded;
-
- // Mark block as needing lower so it will be checked for unnecessary transitions.
- auto BII = Blocks.find(&MBB);
- if (BII != Blocks.end())
- BII->second.NeedsLowering = true;
}
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 95dfb12c8dbae..a2d6c4c10a5ba 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2872,18 +2872,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
+; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
+; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
+; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -2896,18 +2902,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
+; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -3192,8 +3204,10 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
+; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
+; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3234,9 +3248,11 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
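For illustration, here is a minimal IR sketch of the failing shape, assuming the usual AMDGPU WQM intrinsics (`llvm.amdgcn.wqm`, `llvm.amdgcn.strict.wqm`); the function and value names are made up, and this is not a test from the tree. The reverted optimization replaced the ENTER/EXIT_STRICT_WQM pair with exec-preserving pseudos whenever the tracked state around the strict region was already WQM, but inside a divergent branch exec is narrower than the whole-quad mask, so the save/`s_wqm`/restore sequence (the instructions added in the wqm.ll hunks above) is still required.

```llvm
; Sketch only: strict WQM inside a divergent branch of a WQM shader.
define amdgpu_ps float @strict_wqm_in_divergent_cf(float %val, i32 %cond) {
entry:
  ; Keep the shader body in whole-quad mode (the in-tree tests use
  ; image sample instructions for the same purpose).
  %wqm = call float @llvm.amdgcn.wqm.f32(float %val)
  %cc = icmp eq i32 %cond, 0
  br i1 %cc, label %if, label %end

if:                     ; divergent: exec here is a subset of the WQM mask
  %strict = call float @llvm.amdgcn.strict.wqm.f32(float %wqm)
  br label %end

end:
  %r = phi float [ %strict, %if ], [ %wqm, %entry ]
  ret float %r
}

declare float @llvm.amdgcn.wqm.f32(float)
declare float @llvm.amdgcn.strict.wqm.f32(float)
```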
Code changes look good as this is a clean revert. Would it be possible to post a simple test case?
This does not work correctly in divergent control flow. Can be replaced with a later exec mask manipulation optimizer. This reverts commit a3646ec.
Force-pushed from 5d6e4c9 to 9f7b45f.
It is technically one line different from a direct revert, because the original commit fixed a one-line bug.
The existing if-endif cases partially cover this, but I have added a new test which shows the issue with kill calls in WQM shaders that use strict WQM (a sketch of the shape follows below).
I think there is scope for adding an exec mask analysis which could detect regions where the mask would be the same and remove unnecessary changes.
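A rough sketch of the shape such a test can take, again with made-up names and not the pre-committed test itself: a kill removes lanes from the live mask, so the exec mask in force afterwards no longer matches the whole-quad mask, and a following strict-WQM region still needs a real exec transition rather than an elided pseudo one.

```llvm
; Sketch only: kill followed by strict WQM in a WQM shader.
define amdgpu_ps float @kill_then_strict_wqm(float %val, i32 %cond) {
entry:
  %wqm = call float @llvm.amdgcn.wqm.f32(float %val)
  %live = icmp sgt i32 %cond, 0
  ; Demote lanes that fail the test; the live mask (and hence the
  ; derived WQM exec mask) changes from here on.
  call void @llvm.amdgcn.kill(i1 %live)
  %strict = call float @llvm.amdgcn.strict.wqm.f32(float %wqm)
  ret float %strict
}

declare float @llvm.amdgcn.wqm.f32(float)
declare float @llvm.amdgcn.strict.wqm.f32(float)
declare void @llvm.amdgcn.kill(i1)
```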
I don't see the new test?
Pre-committed in a44d740. |
LGTM