From 2c753b0d6d727d4ccd561bd14bd8077483878964 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 15:27:20 +0700 Subject: [PATCH 1/3] new test --- .../AMDGPU/fold-literal-multiple-gfx10.mir | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir diff --git a/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir b/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir new file mode 100644 index 0000000000000..95929b5d97f23 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir @@ -0,0 +1,67 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-fold-operands -o - %s | FileCheck %s + +# The same literal may be used multiple times in different operands, +# as long as it is the same value. + +--- +name: fold_multiple_same_literal_use_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: fold_multiple_same_literal_use_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, 1178657792, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = S_MOV_B32 1178657792 + %2:vgpr_32 = COPY %1 + %3:vgpr_32, %4:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, %2, 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %3 +... + +--- +name: fold_multiple_same_literal_use_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: fold_multiple_same_literal_use_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1178657792, implicit $exec + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = S_MOV_B32 1178657792 + %2:vgpr_32 = COPY %1 + %3:vgpr_32, %4:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %3 +... + +--- +name: no_fold_multiple_same_literal_different_value +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: no_fold_multiple_same_literal_different_value + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1178657793, implicit $exec + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = S_MOV_B32 1178657793 + %2:vgpr_32 = COPY %1 + %3:vgpr_32, %4:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %3 +... From 57468eb659ec2de38386e670b520835ddfafcdae Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 15:12:12 +0700 Subject: [PATCH 2/3] AMDGPU: Correct legal literal operand logic for multiple uses The same literal can be used multiple times in an instruction, not just once. We were not tracking the used value to verify this, so correct this. This helps avoid regressions in a future patch. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 15 ++++++++++-- .../eliminate-frame-index-v-add-co-u32.mir | 24 +++++++------------ .../AMDGPU/fold-literal-multiple-gfx10.mir | 3 +-- llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir | 3 +-- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 4ee5ebd7681b8..33bec4a561622 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5918,11 +5918,16 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (!MO) MO = &MI.getOperand(OpIdx); + const MachineOperand *UsedLiteral = nullptr; + int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { - if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--) - return false; + if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) { + UsedLiteral = MO; + if (!LiteralLimit--) + return false; + } SmallDenseSet SGPRsUsed; if (MO->isReg()) @@ -5943,6 +5948,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } } else if (AMDGPU::isSISrcOperand(InstDesc, i) && !isInlineConstant(Op, InstDesc.operands()[i])) { + // The same literal may be used multiple times. + if (!UsedLiteral) + UsedLiteral = &Op; + else if (UsedLiteral->isIdenticalTo(Op)) + continue; + if (!LiteralLimit--) return false; if (--ConstantBusLimit <= 0) diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir index 12e8d24cb3675..ade7b4266e9e6 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir @@ -2162,8 +2162,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2178,8 +2177,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2315,8 +2313,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 @@ -2332,8 +2329,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 @@ -2469,8 +2465,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2485,8 +2480,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2622,8 +2616,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 @@ -2639,8 +2632,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir b/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir index 95929b5d97f23..e71516e74f17e 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir @@ -35,8 +35,7 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1178657792, implicit $exec - ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, 1178657792, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] %0:vgpr_32 = COPY $vgpr0 %1:sreg_32 = S_MOV_B32 1178657792 diff --git a/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir index 268a8a4783d24..edd5d0a119e5f 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir @@ -55,8 +55,7 @@ body: | # GCN-LABEL: name: fma_sgpr_sgpr_use # GCN: %0:sgpr_32 = IMPLICIT_DEF -# GCN-NEXT: %2:vgpr_32 = V_MOV_B32_e32 1234567, implicit $exec -# GCN-NEXT: %3:vgpr_32 = V_FMAC_F32_e64 0, %0, 0, 1234567, 0, %2, 0, 0, implicit $mode, implicit $exec +# GCN: %3:vgpr_32 = V_FMA_F32_e64 0, %0, 0, 1234567, 0, 1234567, 0, 0, implicit $mode, implicit $exec --- name: fma_sgpr_sgpr_use body: | From 07b9e4e64517e61f8058c92652c718411b8ee9e1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 16:10:45 +0700 Subject: [PATCH 3/3] Moved set of UsedLiteral --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 33bec4a561622..215e15d963133 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5924,9 +5924,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) { - UsedLiteral = MO; if (!LiteralLimit--) return false; + + UsedLiteral = MO; } SmallDenseSet SGPRsUsed;