[AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization #94133

perlfu · 2024-06-02T01:37:29Z

This does not work correctly in divergent control flow. Can be replaced with a later exec mask manipulation optimizer.

This reverts commit a3646ec.

llvmbot · 2024-06-02T01:38:00Z

@llvm/pr-subscribers-backend-amdgpu

Author: Carl Ritson (perlfu)

Changes

This does not work correctly in divergent control flow. Can be replaced with a later exec mask manipulation optimizer.

This reverts commit a3646ec.

Full diff: https://github.com/llvm/llvm-project/pull/94133.diff

5 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (-6)
(modified) llvm/lib/Target/AMDGPU/SIInstructions.td (-15)
(modified) llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp (+4-10)
(modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (-49)
(modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+18-2)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d8e21da8019a7..0edcdb337b5af 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
-  case AMDGPU::ENTER_PSEUDO_WM:
-  case AMDGPU::EXIT_PSEUDO_WM: {
-    // These do nothing.
-    MI.eraseFromParent();
-    break;
-  }
   case AMDGPU::SI_RETURN: {
     const MachineFunction *MF = MBB.getParent();
     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c32..89c96791bfd1c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$m
 def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
 } // End usesCustomInserter = 1
 
-// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
-def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
-  let Uses = [EXEC];
-  let Defs = [EXEC];
-  let hasSideEffects = 0;
-  let mayLoad = 0;
-  let mayStore = 0;
-}
-
-def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
-  let hasSideEffects = 0;
-  let mayLoad = 0;
-  let mayStore = 0;
-}
-
 // Pseudo instructions used for @llvm.fptrunc.round upward
 // and @llvm.fptrunc.round downward.
 // These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 398f870a9f531..5837dbeb3f986 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
 
   unsigned Opc = MI.getOpcode();
 
-  if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
-      Opc == AMDGPU::ENTER_PSEUDO_WM) {
+  if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
     dbgs() << "Entering ";
   } else {
-    assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
-           Opc == AMDGPU::EXIT_PSEUDO_WM);
+    assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
     dbgs() << "Exiting ";
   }
 
   if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
     dbgs() << "Strict WWM ";
-  } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
-    dbgs() << "Pseudo WWM/WQM ";
   } else {
     assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
     dbgs() << "Strict WQM ";
@@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
       }
 
       if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
-          MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
-          MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
+          MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
         LLVM_DEBUG(printWWMInfo(MI));
         InWWM = true;
         continue;
       }
 
       if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
-          MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
-          MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
+          MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
         LLVM_DEBUG(printWWMInfo(MI));
         InWWM = false;
       }
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f3..e95f1924b45d9 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -214,8 +214,6 @@ class SIWholeQuadMode : public MachineFunctionPass {
   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                             bool IsWQM);
   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
-  void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
-                             MachineInstr *Exit);
 
   void lowerBlock(MachineBasicBlock &MBB);
   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -1019,31 +1017,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
   return NewTerm;
 }
 
-// Convert a strict mode transition to a pseudo transition.
-// This still pre-allocates registers to prevent clobbering,
-// but avoids any EXEC mask changes.
-void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
-                                            MachineInstr *Entry,
-                                            MachineInstr *Exit) {
-  assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
-  assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
-
-  Register SaveOrig = Entry->getOperand(0).getReg();
-
-  MachineInstr *NewEntry =
-    BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
-  MachineInstr *NewExit =
-    BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
-
-  LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
-  Exit->eraseFromParent();
-
-  LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
-  Entry->eraseFromParent();
-
-  LIS->removeInterval(SaveOrig);
-}
-
 // Replace (or supplement) instructions accessing live mask.
 // This can only happen once all the live mask registers have been created
 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1060,12 +1033,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
 
   SmallVector<MachineInstr *, 4> SplitPoints;
   char State = BI.InitialState;
-  MachineInstr *StrictEntry = nullptr;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(
            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
-    char PreviousState = State;
-
     if (StateTransition.count(&MI))
       State = StateTransition[&MI];
 
@@ -1078,20 +1048,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(MBB, MI);
       break;
-    case AMDGPU::ENTER_STRICT_WQM:
-      StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
-      break;
-    case AMDGPU::EXIT_STRICT_WQM:
-      if (State == StateWQM && StrictEntry) {
-        // Transition WQM -> StrictWQM -> WQM detected.
-        lowerPseudoStrictMode(MBB, StrictEntry, &MI);
-      }
-      StrictEntry = nullptr;
-      break;
-    case AMDGPU::ENTER_STRICT_WWM:
-    case AMDGPU::EXIT_STRICT_WWM:
-      StrictEntry = nullptr;
-      break;
     default:
       break;
     }
@@ -1245,11 +1201,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
   }
   LIS->InsertMachineInstrInMaps(*MI);
   StateTransition[MI] = StrictStateNeeded;
-
-  // Mark block as needing lower so it will be checked for unnecessary transitions.
-  auto BII = Blocks.find(&MBB);
-  if (BII != Blocks.end())
-    BII->second.NeedsLowering = true;
 }
 
 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 95dfb12c8dbae..a2d6c4c10a5ba 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2872,18 +2872,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
+; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB50_2
 ; GFX9-W64-NEXT:  ; %bb.1: ; %IF
+; GFX9-W64-NEXT:    s_mov_b64 s[16:17], exec
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
+; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
@@ -2896,18 +2902,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
 ; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB50_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
+; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
@@ -3192,8 +3204,10 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
-; GFX9-W64-NEXT:    s_nop 0
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
+; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3234,9 +3248,11 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
 ; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    s_clause 0x1
 ; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)

piotrAMD · 2024-06-03T16:06:30Z

Code changes look good as this is a clean revert.

Would it be possible to post a simple test case?
What is "a later exec mask manipulation optimizer"?

This does not work correctly in divergent control flow. Can be replaced with a later exec mask manipulation optimizer. This reverts commit a3646ec.

perlfu · 2024-06-05T12:14:44Z

Code changes look good as this is a clean revert.

It is technically one line different from a direct revert, because the original commit fixed a one line bug.

Would it be possible to post a simple test case?

The existing if-endif cases kind of cover this, but I have added a new test which shows the issue with kill call in WQM shaders and strict WQM.

What is "a later exec mask manipulation optimizer"?

I think there is scope for adding a exec mask analysis which could detect regions where the mask would be the same and removed unnecessary changes.
This would be separate follow up work which would safely reimplement the optimization removed with this PR.

arsenm · 2024-06-06T13:18:17Z

The existing if-endif cases kind of cover this, but I have added a new test which shows the issue with kill call in WQM shaders and strict WQM.

I don't see the new test?

piotrAMD · 2024-06-07T08:49:31Z

The existing if-endif cases kind of cover this, but I have added a new test which shows the issue with kill call in WQM shaders and strict WQM.

I don't see the new test?

Pre-committed in a44d740.

piotrAMD

LGTM

perlfu requested review from jayfoad and piotrAMD June 2, 2024 01:37

llvmbot added the backend:AMDGPU label Jun 2, 2024

perlfu added a commit that referenced this pull request Jun 5, 2024

[AMDGPU][NFC] Pre-commit test for PR #94133

a44d740

[AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization

9f7b45f

This does not work correctly in divergent control flow. Can be replaced with a later exec mask manipulation optimizer. This reverts commit a3646ec.

perlfu force-pushed the amdgpu-remove-pseudo-wavemodes branch from 5d6e4c9 to 9f7b45f Compare June 5, 2024 12:10

piotrAMD approved these changes Jun 7, 2024

View reviewed changes

perlfu merged commit db096ad into llvm:main Jun 12, 2024
7 checks passed

perlfu deleted the amdgpu-remove-pseudo-wavemodes branch June 12, 2024 06:51

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization #94133

[AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization #94133

Uh oh!

perlfu commented Jun 2, 2024

Uh oh!

llvmbot commented Jun 2, 2024

Uh oh!

piotrAMD commented Jun 3, 2024 •

edited

Loading

Uh oh!

perlfu commented Jun 5, 2024

Uh oh!

arsenm commented Jun 6, 2024

Uh oh!

piotrAMD commented Jun 7, 2024

Uh oh!

piotrAMD left a comment

Uh oh!

Uh oh!

Uh oh!

[AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization #94133

[AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization #94133

Uh oh!

Conversation

perlfu commented Jun 2, 2024

Uh oh!

llvmbot commented Jun 2, 2024

Uh oh!

piotrAMD commented Jun 3, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

perlfu commented Jun 5, 2024

Uh oh!

arsenm commented Jun 6, 2024

Uh oh!

piotrAMD commented Jun 7, 2024

Uh oh!

piotrAMD left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

piotrAMD commented Jun 3, 2024 •

edited

Loading