diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d8e21da8019a7..0edcdb337b5af 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
-  case AMDGPU::ENTER_PSEUDO_WM:
-  case AMDGPU::EXIT_PSEUDO_WM: {
-    // These do nothing.
-    MI.eraseFromParent();
-    break;
-  }
   case AMDGPU::SI_RETURN: {
     const MachineFunction *MF = MBB.getParent();
     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c32..89c96791bfd1c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$mask)>;
 def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
 } // End usesCustomInserter = 1
 
-// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
-def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
-  let Uses = [EXEC];
-  let Defs = [EXEC];
-  let hasSideEffects = 0;
-  let mayLoad = 0;
-  let mayStore = 0;
-}
-
-def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
-  let hasSideEffects = 0;
-  let mayLoad = 0;
-  let mayStore = 0;
-}
-
 // Pseudo instructions used for @llvm.fptrunc.round upward
 // and @llvm.fptrunc.round downward.
 // These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 398f870a9f531..5837dbeb3f986 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
 
   unsigned Opc = MI.getOpcode();
 
-  if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
-      Opc == AMDGPU::ENTER_PSEUDO_WM) {
+  if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
     dbgs() << "Entering ";
   } else {
-    assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
-           Opc == AMDGPU::EXIT_PSEUDO_WM);
+    assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
     dbgs() << "Exiting ";
   }
 
   if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
     dbgs() << "Strict WWM ";
-  } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
-    dbgs() << "Pseudo WWM/WQM ";
   } else {
     assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
     dbgs() << "Strict WQM ";
@@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
       }
 
       if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
-          MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
-          MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
+          MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
         LLVM_DEBUG(printWWMInfo(MI));
         InWWM = true;
         continue;
       }
 
       if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
-          MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
-          MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
+          MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
         LLVM_DEBUG(printWWMInfo(MI));
         InWWM = false;
       }
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f3..e95f1924b45d9 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -214,8 +214,6 @@ class SIWholeQuadMode : public MachineFunctionPass {
   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                             bool IsWQM);
   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
-  void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
-                             MachineInstr *Exit);
 
   void lowerBlock(MachineBasicBlock &MBB);
   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -1019,31 +1017,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
   return NewTerm;
 }
 
-// Convert a strict mode transition to a pseudo transition.
-// This still pre-allocates registers to prevent clobbering,
-// but avoids any EXEC mask changes.
-void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
-                                            MachineInstr *Entry,
-                                            MachineInstr *Exit) {
-  assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
-  assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
-
-  Register SaveOrig = Entry->getOperand(0).getReg();
-
-  MachineInstr *NewEntry =
-      BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
-  MachineInstr *NewExit =
-      BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
-
-  LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
-  Exit->eraseFromParent();
-
-  LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
-  Entry->eraseFromParent();
-
-  LIS->removeInterval(SaveOrig);
-}
-
 // Replace (or supplement) instructions accessing live mask.
 // This can only happen once all the live mask registers have been created
 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1060,12 +1033,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
 
   SmallVector<MachineInstr *, 4> SplitPoints;
   char State = BI.InitialState;
-  MachineInstr *StrictEntry = nullptr;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(
            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
-    char PreviousState = State;
-
     if (StateTransition.count(&MI))
       State = StateTransition[&MI];
 
@@ -1078,20 +1048,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(MBB, MI);
       break;
-    case AMDGPU::ENTER_STRICT_WQM:
-      StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
-      break;
-    case AMDGPU::EXIT_STRICT_WQM:
-      if (State == StateWQM && StrictEntry) {
-        // Transition WQM -> StrictWQM -> WQM detected.
-        lowerPseudoStrictMode(MBB, StrictEntry, &MI);
-      }
-      StrictEntry = nullptr;
-      break;
-    case AMDGPU::ENTER_STRICT_WWM:
-    case AMDGPU::EXIT_STRICT_WWM:
-      StrictEntry = nullptr;
-      break;
     default:
       break;
     }
@@ -1245,11 +1201,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
   }
   LIS->InsertMachineInstrInMaps(*MI);
   StateTransition[MI] = StrictStateNeeded;
-
-  // Mark block as needing lower so it will be checked for unnecessary transitions.
-  auto BII = Blocks.find(&MBB);
-  if (BII != Blocks.end())
-    BII->second.NeedsLowering = true;
 }
 
 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 6fcf5067b0225..52b0a90560da9 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2872,18 +2872,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
+; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB50_2
 ; GFX9-W64-NEXT:  ; %bb.1: ; %IF
+; GFX9-W64-NEXT:    s_mov_b64 s[16:17], exec
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
+; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
@@ -2896,18 +2902,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
 ; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB50_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
+; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
@@ -2955,7 +2967,10 @@ define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rs
 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB51_2
 ; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, vcc
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
+; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v1
@@ -2987,7 +3002,10 @@ define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rs
 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB51_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, vcc_lo
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v1
@@ -3275,8 +3293,10 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
-; GFX9-W64-NEXT:    s_nop 0
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
+; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3317,9 +3337,11 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
 ; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    s_clause 0x1
 ; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)