From a29b0b83bab0dd603c516078d0b927ee0030e12e Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Thu, 17 Oct 2024 09:59:03 +0800 Subject: [PATCH 01/12] [AMDGPU] Implement hasAndNot for scalar bitwise AND-NOT operations. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 ++++++++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 7ed055e8da2b6..a3d176e75e989 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3744,6 +3744,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, return DAG.getBuildVector(VT, DL, Args); } +bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const { + if (Op->isDivergent()) + return false; + + EVT VT = Op.getValueType(); + return VT == MVT::i32 || VT == MVT::i64; +} + //===----------------------------------------------------------------------===// // Custom DAG optimizations //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0dd2183b72b24..edde293500ab1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -102,6 +102,8 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + bool hasAndNot(SDValue Y) const override; + protected: bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; From e0ddab1f55e2c350b286487bdda37bff99d008cf Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Thu, 17 Oct 2024 10:31:26 +0800 Subject: [PATCH 02/12] [AMDGPU] Update value name. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index edde293500ab1..fb4c29d4f0de3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -102,7 +102,7 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - bool hasAndNot(SDValue Y) const override; + bool hasAndNot(SDValue Op) const override; protected: bool shouldCombineMemoryType(EVT VT) const; From 7c900c2fbe3631c2ef30edcb7f6aaf0164f1bc61 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Thu, 17 Oct 2024 12:07:09 +0800 Subject: [PATCH 03/12] [AMDGPU] Update patch. 
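Move the hasAndNot implementation next to the other hooks at the end of AMDGPUISelLowering.cpp and document what the override advertises. For reference (illustrative IR only, mirroring the existing andorn2.ll coverage), a uniform and-not such as

  %nb  = xor i32 %b, -1
  %and = and i32 %a, %nb

selects to a single s_andn2_b32, so the hook returns true only for non-divergent i32/i64 values, matching the scalar s_andn2_b32/s_andn2_b64 forms.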
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 16 ++++++++-------- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 9 +++++++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a3d176e75e989..e2aa94a242dbe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3744,14 +3744,6 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, return DAG.getBuildVector(VT, DL, Args); } -bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const { - if (Op->isDivergent()) - return false; - - EVT VT = Op.getValueType(); - return VT == MVT::i32 || VT == MVT::i64; -} - //===----------------------------------------------------------------------===// // Custom DAG optimizations //===----------------------------------------------------------------------===// @@ -6116,3 +6108,11 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const { return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks } + +bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const { + if (Op->isDivergent()) + return false; + + EVT VT = Op.getValueType(); + return VT == MVT::i32 || VT == MVT::i64; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index fb4c29d4f0de3..3c95bdaa8387c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -102,8 +102,6 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - bool hasAndNot(SDValue Op) const override; - protected: bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -390,6 +388,13 @@ class AMDGPUTargetLowering : public TargetLowering { MVT getFenceOperandTy(const DataLayout &DL) const override { return MVT::i32; } + + /// Return true if the target supports a bitwise and-not operation: + /// X = ~A & B + /// This function checks if the operation can be directly mapped to the + /// target's native instructions, potentially simplifying select or other + /// related instructions by using more efficient hardware-specific operations. + bool hasAndNot(SDValue Op) const override; }; namespace AMDGPUISD { From 563de337813767afc5aa8048c96591710ec627d7 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Thu, 17 Oct 2024 12:54:13 +0800 Subject: [PATCH 04/12] [AMDGPU] Move to SIISelLowering. 
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 -------- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 7 ------- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ++++++++ llvm/lib/Target/AMDGPU/SIISelLowering.h | 7 +++++++ 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e2aa94a242dbe..7ed055e8da2b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -6108,11 +6108,3 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const { return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks } - -bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const { - if (Op->isDivergent()) - return false; - - EVT VT = Op.getValueType(); - return VT == MVT::i32 || VT == MVT::i64; -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 3c95bdaa8387c..0dd2183b72b24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -388,13 +388,6 @@ class AMDGPUTargetLowering : public TargetLowering { MVT getFenceOperandTy(const DataLayout &DL) const override { return MVT::i32; } - - /// Return true if the target supports a bitwise and-not operation: - /// X = ~A & B - /// This function checks if the operation can be directly mapped to the - /// target's native instructions, potentially simplifying select or other - /// related instructions by using more efficient hardware-specific operations. - bool hasAndNot(SDValue Op) const override; }; namespace AMDGPUISD { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ba7e11a853347..54fc06f2ac29d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17577,3 +17577,11 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { AI->eraseFromParent(); return LI; } + +bool SITargetLowering::hasAndNot(SDValue Op) const { + if (Op->isDivergent()) + return false; + + EVT VT = Op.getValueType(); + return VT == MVT::i32 || VT == MVT::i64; +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index c42366a1c04c8..484e0a221b4a8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -611,6 +611,13 @@ class SITargetLowering final : public AMDGPUTargetLowering { MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override; + + /// Return true if the target supports a bitwise and-not operation: + /// X = ~A & B + /// This function checks if the operation can be directly mapped to the + /// target's native instructions, potentially simplifying select or other + /// related instructions by using more efficient hardware-specific operations. + bool hasAndNot(SDValue Op) const override; }; // Returns true if argument is a boolean value which is not serialized into From b06240ea60483cb30f511b2ba045401c10fd9527 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Thu, 17 Oct 2024 14:22:34 +0800 Subject: [PATCH 05/12] [AMDGPU] Update comments. 
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 ++ llvm/lib/Target/AMDGPU/SIISelLowering.h | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 54fc06f2ac29d..4e8305646c0e4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17579,6 +17579,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { } bool SITargetLowering::hasAndNot(SDValue Op) const { + // Return false if the operation is divergent, as AND-NOT optimization + // requires uniform behavior across threads. if (Op->isDivergent()) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 484e0a221b4a8..b348702a5bd8d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -611,12 +611,6 @@ class SITargetLowering final : public AMDGPUTargetLowering { MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override; - - /// Return true if the target supports a bitwise and-not operation: - /// X = ~A & B - /// This function checks if the operation can be directly mapped to the - /// target's native instructions, potentially simplifying select or other - /// related instructions by using more efficient hardware-specific operations. bool hasAndNot(SDValue Op) const override; }; From 244612de314bd3d972159d592bed8d4637e96159 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Fri, 18 Oct 2024 13:21:18 +0800 Subject: [PATCH 06/12] [AMDGPU] Add a lit test for hasAndNot. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 86 +++++++++++++++++++++-- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + llvm/test/CodeGen/AMDGPU/andornot.ll | 39 ++++++++++ 3 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/andornot.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4e8305646c0e4..a033523276992 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7061,6 +7061,81 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) { } } +SDValue SITargetLowering::combineAnd(SDValue Op, + DAGCombinerInfo &DCI) const { + const unsigned Opc = Op.getOpcode(); + assert(Opc == ISD::AND); + + auto &DAG = DCI.DAG; + SDLoc DL(Op); + + if(hasAndNot(Op)) { + SDValue LHS = Op->getOperand(0); + SDValue RHS = Op->getOperand(1); + + // (and LHS, (or Y, ~Z)) + if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) { + SDValue Y = RHS->getOperand(0); + SDValue NotZ = RHS->getOperand(1); + + if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) { + SDValue Z = NotZ->getOperand(0); + + if (!isa(Y)) { + SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType()); + SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z); + SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType()); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ); + return NewAnd; + } + } + } + } + + EVT OpTy = (Opc != ISD::SETCC) ? 
Op.getValueType() + : Op->getOperand(0).getValueType(); + auto ExtTy = OpTy.changeElementType(MVT::i32); + + if (DCI.isBeforeLegalizeOps() || + isNarrowingProfitable(Op.getNode(), ExtTy, OpTy)) + return SDValue(); + + SDValue LHS; + SDValue RHS; + if (Opc == ISD::SELECT) { + LHS = Op->getOperand(1); + RHS = Op->getOperand(2); + } else { + LHS = Op->getOperand(0); + RHS = Op->getOperand(1); + } + + const unsigned ExtOp = getExtOpcodeForPromotedOp(Op); + LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS}); + + // Special case: for shifts, the RHS always needs a zext. + if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) + RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS}); + else + RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS}); + + // setcc always return i1/i1 vec so no need to truncate after. + if (Opc == ISD::SETCC) { + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC); + } + + // For other ops, we extend the operation's return type as well so we need to + // truncate back to the original type. + SDValue NewVal; + if (Opc == ISD::SELECT) + NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS}); + else + NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS}); + + return DAG.getZExtOrTrunc(NewVal, DL, OpTy); +} + SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const { const unsigned Opc = Op.getOpcode(); @@ -15294,13 +15369,17 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { + case ISD::AND: + if (auto Res = combineAnd(SDValue(N, 0), DCI)) + return Res; + break; case ISD::ADD: case ISD::SUB: case ISD::SHL: case ISD::SRL: case ISD::SRA: - case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::MUL: @@ -15408,7 +15487,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::CLAMP: return performClampCombine(N, DCI); case ISD::SCALAR_TO_VECTOR: { - SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) @@ -17579,8 +17657,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { } bool SITargetLowering::hasAndNot(SDValue Op) const { - // Return false if the operation is divergent, as AND-NOT optimization - // requires uniform behavior across threads. + // Return false if the operation is divergent, as AND-NOT is a scalar-only + // instruction. 
if (Op->isDivergent()) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index b348702a5bd8d..a418bae67ebc4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -148,6 +148,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; + SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const; SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const; SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/AMDGPU/andornot.ll b/llvm/test/CodeGen/AMDGPU/andornot.ll new file mode 100644 index 0000000000000..821709847ab8d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/andornot.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: {{^}}scalar_and_or_not_i16 +; GCN: s_not_b32 +; GCN-NEXT: s_lshr_b32 +; GCN-NEXT: s_and_b32 +; GCN-NEXT: s_andn2_b32 +define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) { +entry: + %not_z = xor i16 %z, -1 + %or_y_not_z = or i16 %y, %not_z + %and_result = and i16 %x, %or_y_not_z + store i16 %and_result, ptr addrspace(1) %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}scalar_and_or_not_i32 +; GCN: s_andn2_b32 +; GCN-NEXT: s_andn2_b32 +define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { +entry: + %not_z = xor i32 %z, -1 + %or_y_not_z = or i32 %y, %not_z + %and_result = and i32 %x, %or_y_not_z + store i32 %and_result, ptr addrspace(1) %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}scalar_and_or_not_i64 +; GCN: s_andn2_b64 +; GCN-NEXT: s_andn2_b64 +define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) { +entry: + %not_z = xor i64 %z, -1 + %or_y_not_z = or i64 %y, %not_z + %and_result = and i64 %x, %or_y_not_z + store i64 %and_result, ptr addrspace(1) %out, align 4 + ret void +} From 70d8ac0a21294220985e67826d8971c28f88da1c Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Fri, 18 Oct 2024 13:37:51 +0800 Subject: [PATCH 07/12] [AMDGPU] Fix clang format issue. 
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a033523276992..751b344bae37a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7061,15 +7061,14 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) { } } -SDValue SITargetLowering::combineAnd(SDValue Op, - DAGCombinerInfo &DCI) const { +SDValue SITargetLowering::combineAnd(SDValue Op, DAGCombinerInfo &DCI) const { const unsigned Opc = Op.getOpcode(); assert(Opc == ISD::AND); auto &DAG = DCI.DAG; SDLoc DL(Op); - if(hasAndNot(Op)) { + if (hasAndNot(Op)) { SDValue LHS = Op->getOperand(0); SDValue RHS = Op->getOperand(1); @@ -7078,20 +7077,24 @@ SDValue SITargetLowering::combineAnd(SDValue Op, SDValue Y = RHS->getOperand(0); SDValue NotZ = RHS->getOperand(1); - if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) { + if (NotZ.getOpcode() == ISD::XOR && + isAllOnesConstant(NotZ->getOperand(1))) { SDValue Z = NotZ->getOperand(0); if (!isa(Y)) { SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType()); - SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z); - SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType()); - SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ); + SDValue AndNotYZ = + DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z); + SDValue NotAndNotYZ = + DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType()); + SDValue NewAnd = + DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ); return NewAnd; } } } } - + EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType() : Op->getOperand(0).getValueType(); auto ExtTy = OpTy.changeElementType(MVT::i32); From ee5ca4e00a0c06200fd11901a85d3ec59fb57eef Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Fri, 18 Oct 2024 14:27:15 +0800 Subject: [PATCH 08/12] [AMDGPU] Remove combineAnd. 
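Drop the hand-written (and X, (or Y, ~Z)) combine added earlier in the series, together with its lit coverage (andornot.ll), and instead add uniform (inreg) variants to andorn2.ll so plain scalar and-not/or-not selection stays covered. For reference, the pattern the removed combine targeted was (IR taken from the deleted test):

  %not_z      = xor i32 %z, -1
  %or_y_not_z = or i32 %y, %not_z
  %and_result = and i32 %x, %or_y_not_z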
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 87 +---------------------- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 - llvm/test/CodeGen/AMDGPU/andorn2.ll | 44 ++++++++++++ llvm/test/CodeGen/AMDGPU/andornot.ll | 39 ---------- 4 files changed, 47 insertions(+), 124 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/andornot.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 751b344bae37a..9e1f887449dc3 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7061,84 +7061,6 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) { } } -SDValue SITargetLowering::combineAnd(SDValue Op, DAGCombinerInfo &DCI) const { - const unsigned Opc = Op.getOpcode(); - assert(Opc == ISD::AND); - - auto &DAG = DCI.DAG; - SDLoc DL(Op); - - if (hasAndNot(Op)) { - SDValue LHS = Op->getOperand(0); - SDValue RHS = Op->getOperand(1); - - // (and LHS, (or Y, ~Z)) - if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) { - SDValue Y = RHS->getOperand(0); - SDValue NotZ = RHS->getOperand(1); - - if (NotZ.getOpcode() == ISD::XOR && - isAllOnesConstant(NotZ->getOperand(1))) { - SDValue Z = NotZ->getOperand(0); - - if (!isa(Y)) { - SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType()); - SDValue AndNotYZ = - DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z); - SDValue NotAndNotYZ = - DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType()); - SDValue NewAnd = - DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ); - return NewAnd; - } - } - } - } - - EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType() - : Op->getOperand(0).getValueType(); - auto ExtTy = OpTy.changeElementType(MVT::i32); - - if (DCI.isBeforeLegalizeOps() || - isNarrowingProfitable(Op.getNode(), ExtTy, OpTy)) - return SDValue(); - - SDValue LHS; - SDValue RHS; - if (Opc == ISD::SELECT) { - LHS = Op->getOperand(1); - RHS = Op->getOperand(2); - } else { - LHS = Op->getOperand(0); - RHS = Op->getOperand(1); - } - - const unsigned ExtOp = getExtOpcodeForPromotedOp(Op); - LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS}); - - // Special case: for shifts, the RHS always needs a zext. - if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) - RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS}); - else - RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS}); - - // setcc always return i1/i1 vec so no need to truncate after. - if (Opc == ISD::SETCC) { - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC); - } - - // For other ops, we extend the operation's return type as well so we need to - // truncate back to the original type. 
- SDValue NewVal; - if (Opc == ISD::SELECT) - NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS}); - else - NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS}); - - return DAG.getZExtOrTrunc(NewVal, DL, OpTy); -} - SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const { const unsigned Opc = Op.getOpcode(); @@ -15372,17 +15294,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { - case ISD::AND: - if (auto Res = combineAnd(SDValue(N, 0), DCI)) - return Res; - break; case ISD::ADD: case ISD::SUB: case ISD::SHL: case ISD::SRL: case ISD::SRA: + case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::MUL: @@ -15490,6 +15408,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::CLAMP: return performClampCombine(N, DCI); case ISD::SCALAR_TO_VECTOR: { + SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) @@ -17662,7 +17581,7 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { bool SITargetLowering::hasAndNot(SDValue Op) const { // Return false if the operation is divergent, as AND-NOT is a scalar-only // instruction. - if (Op->isDivergent()) + if (Op->isDivergent() || !Op->isMachineOpcode()) return false; EVT VT = Op.getValueType(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index a418bae67ebc4..b348702a5bd8d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -148,7 +148,6 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; - SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const; SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const; SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll index 3226a77bb9d34..e1fdddf4438b6 100644 --- a/llvm/test/CodeGen/AMDGPU/andorn2.ll +++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll @@ -25,6 +25,28 @@ entry: ret void } +; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr +; GCN: s_andn2_b32 +define amdgpu_kernel void @scalar_andn2_i32_one_sgpr( + ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) { +entry: + %nb = xor i32 %b, -1 + %r0.val = and i32 %a, %nb + store i32 %r0.val, ptr addrspace(1) %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr +; GCN: s_andn2_b64 +define amdgpu_kernel void @scalar_andn2_i64_one_sgpr( + ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) { +entry: + %nb = xor i64 %b, -1 + %r0.val = and i64 %a, %nb + store i64 %r0.val, ptr addrspace(1) %r0 + ret void +} + ; GCN-LABEL: {{^}}scalar_orn2_i32_one_use ; GCN: s_orn2_b32 define amdgpu_kernel void @scalar_orn2_i32_one_use( @@ -47,6 +69,28 @@ entry: ret void } +; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr +; GCN: s_orn2_b32 +define amdgpu_kernel void @scalar_orn2_i32_one_use_sgpr( + ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) { +entry: + %nb = xor i32 %b, -1 + %r0.val = or i32 %a, %nb + store i32 %r0.val, ptr addrspace(1) %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr +; 
GCN: s_orn2_b64 +define amdgpu_kernel void @scalar_orn2_i64_one_use_sgpr( + ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) { +entry: + %nb = xor i64 %b, -1 + %r0.val = or i64 %a, %nb + store i64 %r0.val, ptr addrspace(1) %r0 + ret void +} + ; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use ; GCN: v_not_b32 ; GCN: v_and_b32 diff --git a/llvm/test/CodeGen/AMDGPU/andornot.ll b/llvm/test/CodeGen/AMDGPU/andornot.ll deleted file mode 100644 index 821709847ab8d..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/andornot.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s - -; GCN-LABEL: {{^}}scalar_and_or_not_i16 -; GCN: s_not_b32 -; GCN-NEXT: s_lshr_b32 -; GCN-NEXT: s_and_b32 -; GCN-NEXT: s_andn2_b32 -define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) { -entry: - %not_z = xor i16 %z, -1 - %or_y_not_z = or i16 %y, %not_z - %and_result = and i16 %x, %or_y_not_z - store i16 %and_result, ptr addrspace(1) %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}scalar_and_or_not_i32 -; GCN: s_andn2_b32 -; GCN-NEXT: s_andn2_b32 -define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { -entry: - %not_z = xor i32 %z, -1 - %or_y_not_z = or i32 %y, %not_z - %and_result = and i32 %x, %or_y_not_z - store i32 %and_result, ptr addrspace(1) %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}scalar_and_or_not_i64 -; GCN: s_andn2_b64 -; GCN-NEXT: s_andn2_b64 -define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) { -entry: - %not_z = xor i64 %z, -1 - %or_y_not_z = or i64 %y, %not_z - %and_result = and i64 %x, %or_y_not_z - store i64 %and_result, ptr addrspace(1) %out, align 4 - ret void -} From 2ec01c60b08fcc65c2fa65929e201099843dfcfe Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Mon, 21 Oct 2024 15:59:26 +0800 Subject: [PATCH 09/12] [AMDGPU] Update lit test. 
--- llvm/test/CodeGen/AMDGPU/andorn2.ll | 32 +++++++++++------------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll index e1fdddf4438b6..4fe7e21b2adea 100644 --- a/llvm/test/CodeGen/AMDGPU/andorn2.ll +++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll @@ -27,24 +27,20 @@ entry: ; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr ; GCN: s_andn2_b32 -define amdgpu_kernel void @scalar_andn2_i32_one_sgpr( - ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) { +define i32 @scalar_andn2_i32_one_sgpr(i32 inreg %a, i32 inreg %b) { entry: %nb = xor i32 %b, -1 - %r0.val = and i32 %a, %nb - store i32 %r0.val, ptr addrspace(1) %r0 - ret void + %and = and i32 %a, %nb + ret i32 %and } ; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr ; GCN: s_andn2_b64 -define amdgpu_kernel void @scalar_andn2_i64_one_sgpr( - ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) { +define i64 @scalar_andn2_i64_one_sgpr(i64 inreg %a, i64 inreg %b) { entry: %nb = xor i64 %b, -1 - %r0.val = and i64 %a, %nb - store i64 %r0.val, ptr addrspace(1) %r0 - ret void + %and = and i64 %a, %nb + ret i64 %and } ; GCN-LABEL: {{^}}scalar_orn2_i32_one_use @@ -71,24 +67,20 @@ entry: ; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr ; GCN: s_orn2_b32 -define amdgpu_kernel void @scalar_orn2_i32_one_use_sgpr( - ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) { +define i32 @scalar_orn2_i32_one_use_sgpr(i32 inreg %a, i32 inreg %b) { entry: %nb = xor i32 %b, -1 - %r0.val = or i32 %a, %nb - store i32 %r0.val, ptr addrspace(1) %r0 - ret void + %or = or i32 %a, %nb + ret i32 %or; } ; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr ; GCN: s_orn2_b64 -define amdgpu_kernel void @scalar_orn2_i64_one_use_sgpr( - ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) { +define i64 @scalar_orn2_i64_one_use_sgpr(i64 inreg %a, i64 inreg %b) { entry: %nb = xor i64 %b, -1 - %r0.val = or i64 %a, %nb - store i64 %r0.val, ptr addrspace(1) %r0 - ret void + %or = or i64 %a, %nb + ret i64 %or; } ; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use From 28ea08444b1b5f591f030f7de266949987ccfa19 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Sun, 27 Apr 2025 18:03:24 +0800 Subject: [PATCH 10/12] [AMDGPU] Add unfold test. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- llvm/test/CodeGen/AMDGPU/andorn2.ll | 36 ----------------------- 2 files changed, 1 insertion(+), 37 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9e1f887449dc3..075a6374a5427 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17581,7 +17581,7 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { bool SITargetLowering::hasAndNot(SDValue Op) const { // Return false if the operation is divergent, as AND-NOT is a scalar-only // instruction. 
- if (Op->isDivergent() || !Op->isMachineOpcode()) + if (Op->isDivergent()) return false; EVT VT = Op.getValueType(); diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll index 4fe7e21b2adea..3226a77bb9d34 100644 --- a/llvm/test/CodeGen/AMDGPU/andorn2.ll +++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll @@ -25,24 +25,6 @@ entry: ret void } -; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr -; GCN: s_andn2_b32 -define i32 @scalar_andn2_i32_one_sgpr(i32 inreg %a, i32 inreg %b) { -entry: - %nb = xor i32 %b, -1 - %and = and i32 %a, %nb - ret i32 %and -} - -; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr -; GCN: s_andn2_b64 -define i64 @scalar_andn2_i64_one_sgpr(i64 inreg %a, i64 inreg %b) { -entry: - %nb = xor i64 %b, -1 - %and = and i64 %a, %nb - ret i64 %and -} - ; GCN-LABEL: {{^}}scalar_orn2_i32_one_use ; GCN: s_orn2_b32 define amdgpu_kernel void @scalar_orn2_i32_one_use( @@ -65,24 +47,6 @@ entry: ret void } -; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr -; GCN: s_orn2_b32 -define i32 @scalar_orn2_i32_one_use_sgpr(i32 inreg %a, i32 inreg %b) { -entry: - %nb = xor i32 %b, -1 - %or = or i32 %a, %nb - ret i32 %or; -} - -; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr -; GCN: s_orn2_b64 -define i64 @scalar_orn2_i64_one_use_sgpr(i64 inreg %a, i64 inreg %b) { -entry: - %nb = xor i64 %b, -1 - %or = or i64 %a, %nb - ret i64 %or; -} - ; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use ; GCN: v_not_b32 ; GCN: v_and_b32 From dba615541bbdd6da84068d3450ba07164b8c9f02 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Sun, 27 Apr 2025 18:57:18 +0800 Subject: [PATCH 11/12] [AMDGPU] Update. --- llvm/test/CodeGen/AMDGPU/bfi_int.ll | 109 ++++++++----- llvm/test/CodeGen/AMDGPU/commute-compares.ll | 17 ++- ...unfold-masked-merge-scalar-variablemask.ll | 143 ++++++++---------- 3 files changed, 140 insertions(+), 129 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 201b97d479c68..6e9cd8807b379 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -135,9 +135,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s0, s0, s1 -; GFX7-NEXT: s_xor_b32 s0, s2, s0 +; GFX7-NEXT: s_andn2_b32 s2, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -147,9 +147,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s1 -; GFX8-NEXT: s_xor_b32 s0, s2, s0 +; GFX8-NEXT: s_andn2_b32 s2, s2, s0 +; GFX8-NEXT: s_and_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -163,9 +163,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s0, s0, s1 -; GFX10-NEXT: s_xor_b32 s0, s2, s0 +; GFX10-NEXT: s_andn2_b32 s2, s2, s0 +; GFX10-NEXT: s_and_b32 s0, s1, s0 +; GFX10-NEXT: s_or_b32 
s0, s0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm @@ -317,19 +317,26 @@ entry: define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) { ; GFX7-LABEL: s_s_v_bfi_sha256_ch: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_bfi_b32 v0, v1, s1, v0 +; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_nand_b32 s0, s1, s0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_s_v_bfi_sha256_ch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_bfi_b32 v0, v1, s1, v0 +; GFX8-NEXT: s_not_b32 s1, s1 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: s_nand_b32 s0, s1, s0 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_s_v_bfi_sha256_ch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_bfi_b32 v0, s0, s1, v0 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_nand_b32 s0, s1, s0 +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch: @@ -350,30 +357,40 @@ entry: ret float %cast } -define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) { +define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) { ; GFX7-LABEL: s_v_v_bfi_sha256_ch: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_nand_b32 s0, s1, s0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_v_v_bfi_sha256_ch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: s_not_b32 s1, s1 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: s_nand_b32 s0, s1, s0 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_v_v_bfi_sha256_ch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_nand_b32 s0, s1, s0 +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-GISEL-NEXT: v_bfi_b32 v0, v1, s1, v0 ; GFX8-GISEL-NEXT: ; return to shader part epilog ; ; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch: ; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, s1, v0 ; GFX10-GISEL-NEXT: ; return to shader part epilog entry: %xor0 = xor i32 %y, %z @@ -1008,24 +1025,32 @@ define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i6 define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) { ; GFX7-LABEL: s_s_v_bitselect_i64_pat_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_bfi_b32 v1, s3, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX7-NEXT: s_not_b64 s[0:1], s[0:1] +; GFX7-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX7-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX7-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; GFX7-NEXT: v_and_b32_e32 v1, s1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_s_v_bitselect_i64_pat_1: ; GFX8: ; 
%bb.0: -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_bfi_b32 v1, s3, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX8-NEXT: s_not_b64 s[0:1], s[0:1] +; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX8-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: v_and_b32_e32 v1, s1, v1 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_s_v_bitselect_i64_pat_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_bfi_b32 v0, s2, s0, v0 -; GFX10-NEXT: v_bfi_b32 v1, s3, s1, v1 +; GFX10-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10-NEXT: s_not_b64 s[0:1], s[0:1] +; GFX10-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1: @@ -1495,9 +1520,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -1510,9 +1535,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1526,9 +1551,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1583,9 +1608,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -1598,9 +1623,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_andn2_b64 s[4:5], 
s[4:5], s[2:3] +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1614,9 +1639,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index ae8080cf9f06a..f6deb6c6df422 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -541,19 +541,20 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs ; GCN-LABEL: commute_sgt_neg1_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] -; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[3:4] -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GCN-NEXT: v_not_b32_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll index 69724aa75af4f..56990a7b73310 100644 --- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll @@ -39,11 +39,10 @@ define i32 @s_in32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) { ; GCN-LABEL: s_in32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -56,11 +55,10 @@ define i64 @s_in64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) { ; GCN-LABEL: s_in64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GCN-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[16:17] ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17] -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i64 %x, %y @@ -75,11 +73,10 @@ define i32 @s_in_commutativity_0_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_0_0_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -92,11 +89,10 @@ define i32 @s_in_commutativity_0_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_0_1_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -109,11 +105,10 @@ define i32 @in_commutativity_0_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) ; GCN-LABEL: in_commutativity_0_1_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -126,11 +121,10 @@ define i32 @s_in_commutativity_1_0_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_1_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s0, s0, s2 ; GCN-NEXT: s_and_b32 s1, s1, s2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -143,11 +137,10 @@ define i32 @s_in_commutativity_1_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_1_0_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s0, s1 +; GCN-NEXT: s_and_not1_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s1, s2, s1 -; GCN-NEXT: s_xor_b32 
s0, s1, s0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -160,11 +153,10 @@ define i32 @s_in_commutativity_1_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_1_1_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s0, s0, s2 ; GCN-NEXT: s_and_b32 s1, s1, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -177,11 +169,10 @@ define i32 @s_in_commutativity_1_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_1_1_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s0, s1 +; GCN-NEXT: s_and_not1_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s1, s2, s1 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -197,11 +188,10 @@ define i32 @s_in_complex_y0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_xor_b32 s0, s0, s1 ; GCN-NEXT: s_and_b32 s0, s0, s3 +; GCN-NEXT: s_and_not1_b32 s1, s1, s3 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %y = and i32 %y_hi, %y_low @@ -216,11 +206,10 @@ define i32 @s_in_complex_y1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_xor_b32 s0, s0, s1 ; GCN-NEXT: s_and_b32 s0, s0, s3 +; GCN-NEXT: s_and_not1_b32 s1, s1, s3 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %y = and i32 %y_hi, %y_low @@ -237,10 +226,10 @@ define i32 @s_in_complex_m0(i32 inreg %x, i32 inreg %y, i32 inreg %m_a, i32 inre ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_b32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -256,10 +245,10 @@ define i32 @s_in_complex_m1(i32 inreg %x, i32 inreg %y, i32 inreg 
%m_a, i32 inre ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_b32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -278,10 +267,10 @@ define i32 @s_in_complex_y0_m0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_xor_b32 s2, s3, s16 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -299,10 +288,10 @@ define i32 @s_in_complex_y1_m0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_xor_b32 s2, s3, s16 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -320,10 +309,10 @@ define i32 @s_in_complex_y0_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_xor_b32 s2, s3, s16 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -341,10 +330,10 @@ define i32 @s_in_complex_y1_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_xor_b32 s2, s3, s16 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -444,11 +433,10 @@ define i32 @in_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask) { ; GCN-LABEL: in_constant_varx_42: 
; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, 42, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, 42 @@ -480,11 +468,10 @@ define i32 @s_in_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg ; GCN-LABEL: s_in_constant_varx_42_invmask: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_b32 s1, s2, 42 ; GCN-NEXT: s_and_not1_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 @@ -577,11 +564,10 @@ define i32 @s_in_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) { ; GCN-LABEL: s_in_constant_42_vary: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s1, 42 +; GCN-NEXT: s_and_not1_b32 s0, s1, s2 +; GCN-NEXT: s_and_b32 s1, s2, 42 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 42, %y @@ -613,11 +599,10 @@ define i32 @s_in_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg ; GCN-LABEL: s_in_constant_42_vary_invmask: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s1, 42 +; GCN-NEXT: s_and_b32 s0, s1, s2 +; GCN-NEXT: s_and_not1_b32 s1, 42, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_not1_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 From 9990cfb04e3f7a6078d2039b50d9df8894171eaf Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Mon, 19 May 2025 16:46:40 +0000 Subject: [PATCH 12/12] [AMDGPU] Update comments. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 075a6374a5427..3c564c76e0f9f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17579,8 +17579,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { } bool SITargetLowering::hasAndNot(SDValue Op) const { - // Return false if the operation is divergent, as AND-NOT is a scalar-only - // instruction. + // AND-NOT is only valid on uniform (SGPR) values; divergent values live in + // VGPRs. if (Op->isDivergent()) return false;
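
Note (context for reviewers, not part of any patch above): the generic combiner consults this hook before unfolding masked merges, which is what the unfold-masked-merge-scalar-variablemask.ll changes in PATCH 11 exercise. Below is a minimal sketch of such a hook-gated rewrite, using only DAG helpers that already appear in this series (getNode, getNOT); the function name is made up for illustration, and the in-tree DAGCombiner code differs in structure and handles more cases.

  // Illustrative sketch only -- not the in-tree DAGCombiner implementation.
  // Rewrite ((X ^ Y) & M) ^ Y into (X & M) | (Y & ~M) when the target
  // reports a native and-not for the mask value M.
  static SDValue unfoldMaskedMergeSketch(SelectionDAG &DAG,
                                         const TargetLowering &TLI,
                                         SDValue X, SDValue Y, SDValue M,
                                         const SDLoc &DL) {
    EVT VT = M.getValueType();
    // On AMDGPU this returns false for divergent values and for types other
    // than i32/i64, so the xor/and/xor form is kept for VALU code.
    if (!TLI.hasAndNot(M))
      return SDValue();
    SDValue NotM = DAG.getNOT(DL, M, VT);                  // ~M
    SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);     // X & M
    SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);  // Y & ~M
    return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);         // s_and + s_andn2 + s_or
  }

For uniform operands this is the rewrite visible in the updated GCN checks: the s_xor/s_and/s_xor sequence becomes s_and_not1_b32/s_and_b32/s_or_b32 (or the _b64 equivalents).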