Skip to content

Commit 249fbdf

Browse files
committed
[SDAG][AMDGPU] Allow opting in to OOB-generating PTRADD transforms
This PR adds a TargetLowering hook, canTransformPtrArithOutOfBounds, that targets can use to allow transformations to introduce out-of-bounds pointer arithmetic. It also moves two such transformations from the AMDGPU-specific DAG combines to the generic DAGCombiner. This is motivated by target features like AArch64's checked pointer arithmetic, CPA, which does not tolerate the introduction of out-of-bounds pointer arithmetic.
1 parent 4950255 commit 249fbdf

File tree

4 files changed

+94
-100
lines changed

4 files changed

+94
-100
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3510,6 +3510,13 @@ class LLVM_ABI TargetLoweringBase {
35103510
return false;
35113511
}
35123512

3513+
/// True if the target allows transformations of in-bounds pointer
3514+
/// arithmetic that cause out-of-bounds intermediate results.
3515+
virtual bool canTransformPtrArithOutOfBounds(const Function &F,
3516+
EVT PtrVT) const {
3517+
return false;
3518+
}
3519+
35133520
/// Does this target support complex deinterleaving
35143521
virtual bool isComplexDeinterleavingSupported() const { return false; }
35153522

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 74 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -2689,59 +2689,82 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
26892689
if (PtrVT == IntVT && isNullConstant(N0))
26902690
return N1;
26912691

2692-
if (N0.getOpcode() != ISD::PTRADD ||
2693-
reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1))
2694-
return SDValue();
2695-
2696-
SDValue X = N0.getOperand(0);
2697-
SDValue Y = N0.getOperand(1);
2698-
SDValue Z = N1;
2699-
bool N0OneUse = N0.hasOneUse();
2700-
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
2701-
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
2702-
2703-
// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
2704-
// * y is a constant and (ptradd x, y) has one use; or
2705-
// * y and z are both constants.
2706-
if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
2707-
// If both additions in the original were NUW, the new ones are as well.
2708-
SDNodeFlags Flags =
2709-
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2710-
SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
2711-
AddToWorklist(Add.getNode());
2712-
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
2692+
if (N0.getOpcode() == ISD::PTRADD &&
2693+
!reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) {
2694+
SDValue X = N0.getOperand(0);
2695+
SDValue Y = N0.getOperand(1);
2696+
SDValue Z = N1;
2697+
bool N0OneUse = N0.hasOneUse();
2698+
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
2699+
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
2700+
2701+
// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
2702+
// * y is a constant and (ptradd x, y) has one use; or
2703+
// * y and z are both constants.
2704+
if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
2705+
// If both additions in the original were NUW, the new ones are as well.
2706+
SDNodeFlags Flags =
2707+
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2708+
SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
2709+
AddToWorklist(Add.getNode());
2710+
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
2711+
}
2712+
}
2713+
2714+
// The following combines can turn in-bounds pointer arithmetic out of bounds.
2715+
// That is problematic for settings like AArch64's CPA, which checks that
2716+
// intermediate results of pointer arithmetic remain in bounds. The target
2717+
// therefore needs to opt-in to enable them.
2718+
if (!TLI.canTransformPtrArithOutOfBounds(
2719+
DAG.getMachineFunction().getFunction(), PtrVT))
2720+
return SDValue();
2721+
2722+
if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
2723+
// Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
2724+
// global address GA and constant c, such that c can be folded into GA.
2725+
SDValue GAValue = N0.getOperand(0);
2726+
if (const GlobalAddressSDNode *GA =
2727+
dyn_cast<GlobalAddressSDNode>(GAValue)) {
2728+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2729+
if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
2730+
// If both additions in the original were NUW, reassociation preserves
2731+
// that.
2732+
SDNodeFlags Flags =
2733+
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2734+
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
2735+
AddToWorklist(Inner.getNode());
2736+
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
2737+
}
2738+
}
27132739
}
27142740

2715-
// TODO: There is another possible fold here that was proven useful.
2716-
// It would be this:
2717-
//
2718-
// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if:
2719-
// * (ptradd x, y) has one use; and
2720-
// * y is a constant; and
2721-
// * z is not a constant.
2722-
//
2723-
// In some cases, specifically in AArch64's FEAT_CPA, it exposes the
2724-
// opportunity to select more complex instructions such as SUBPT and
2725-
// MSUBPT. However, a hypothetical corner case has been found that we could
2726-
// not avoid. Consider this (pseudo-POSIX C):
2727-
//
2728-
// char *foo(char *x, int z) {return (x + LARGE_CONSTANT) + z;}
2729-
// char *p = mmap(LARGE_CONSTANT);
2730-
// char *q = foo(p, -LARGE_CONSTANT);
2731-
//
2732-
// Then x + LARGE_CONSTANT is one-past-the-end, so valid, and a
2733-
// further + z takes it back to the start of the mapping, so valid,
2734-
// regardless of the address mmap gave back. However, if mmap gives you an
2735-
// address < LARGE_CONSTANT (ignoring high bits), x - LARGE_CONSTANT will
2736-
// borrow from the high bits (with the subsequent + z carrying back into
2737-
// the high bits to give you a well-defined pointer) and thus trip
2738-
// FEAT_CPA's pointer corruption checks.
2739-
//
2740-
// We leave this fold as an opportunity for future work, addressing the
2741-
// corner case for FEAT_CPA, as well as reconciling the solution with the
2742-
// more general application of pointer arithmetic in other future targets.
2743-
// For now each architecture that wants this fold must implement it in the
2744-
// target-specific code (see e.g. SITargetLowering::performPtrAddCombine)
2741+
if (N1.getOpcode() == ISD::ADD && N1.hasOneUse()) {
2742+
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
2743+
// y is not, and (add y, z) is used only once.
2744+
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
2745+
// z is not, and (add y, z) is used only once.
2746+
// The goal is to move constant offsets to the outermost ptradd, to create
2747+
// more opportunities to fold offsets into memory instructions.
2748+
// Together with the another combine above, this also implements
2749+
// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
2750+
SDValue X = N0;
2751+
SDValue Y = N1.getOperand(0);
2752+
SDValue Z = N1.getOperand(1);
2753+
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
2754+
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
2755+
2756+
// If both additions in the original were NUW, reassociation preserves that.
2757+
SDNodeFlags ReassocFlags =
2758+
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2759+
2760+
if (ZIsConstant != YIsConstant) {
2761+
if (YIsConstant)
2762+
std::swap(Y, Z);
2763+
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
2764+
AddToWorklist(Inner.getNode());
2765+
return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
2766+
}
2767+
}
27452768

27462769
return SDValue();
27472770
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -10602,6 +10602,11 @@ bool SITargetLowering::shouldPreservePtrArith(const Function &F,
1060210602
return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
1060310603
}
1060410604

10605+
bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
10606+
EVT PtrVT) const {
10607+
return true;
10608+
}
10609+
1060510610
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
1060610611
// offset (the offset that is included in bounds checking and swizzling, to be
1060710612
// split between the instruction's voffset and immoffset fields) and soffset
@@ -15139,64 +15144,17 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
1513915144
return Folded;
1514015145
}
1514115146

15142-
if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
15143-
// Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
15144-
// global address GA and constant c, such that c can be folded into GA.
15145-
SDValue GAValue = N0.getOperand(0);
15146-
if (const GlobalAddressSDNode *GA =
15147-
dyn_cast<GlobalAddressSDNode>(GAValue)) {
15148-
if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
15149-
// If both additions in the original were NUW, reassociation preserves
15150-
// that.
15151-
SDNodeFlags Flags =
15152-
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15153-
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
15154-
DCI.AddToWorklist(Inner.getNode());
15155-
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
15156-
}
15157-
}
15158-
}
15159-
1516015147
if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
1516115148
return SDValue();
1516215149

15163-
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15164-
// y is not, and (add y, z) is used only once.
15165-
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15166-
// z is not, and (add y, z) is used only once.
15167-
// The goal is to move constant offsets to the outermost ptradd, to create
15168-
// more opportunities to fold offsets into memory instructions.
15169-
// Together with the generic combines in DAGCombiner.cpp, this also
15170-
// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15171-
//
15172-
// This transform is here instead of in the general DAGCombiner as it can
15173-
// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15174-
// AArch64's CPA.
1517515150
SDValue X = N0;
1517615151
SDValue Y = N1.getOperand(0);
1517715152
SDValue Z = N1.getOperand(1);
1517815153
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
1517915154
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
1518015155

15181-
// If both additions in the original were NUW, reassociation preserves that.
15182-
SDNodeFlags ReassocFlags =
15183-
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15184-
15185-
if (ZIsConstant != YIsConstant) {
15186-
if (YIsConstant)
15187-
std::swap(Y, Z);
15188-
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15189-
DCI.AddToWorklist(Inner.getNode());
15190-
return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
15191-
}
15192-
15193-
// If one of Y and Z is constant, they have been handled above. If both were
15194-
// constant, the addition would have been folded in SelectionDAG::getNode
15195-
// already. This ensures that the generic DAG combines won't undo the
15196-
// following reassociation.
15197-
assert(!YIsConstant && !ZIsConstant);
15198-
15199-
if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15156+
if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
15157+
Y->isDivergent() != Z->isDivergent()) {
1520015158
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
1520115159
// y are uniform and z isn't.
1520215160
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
@@ -15207,6 +15165,9 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
1520715165
// reassociate.
1520815166
if (Y->isDivergent())
1520915167
std::swap(Y, Z);
15168+
// If both additions in the original were NUW, reassociation preserves that.
15169+
SDNodeFlags ReassocFlags =
15170+
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
1521015171
SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
1521115172
DCI.AddToWorklist(UniformInner.getNode());
1521215173
return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
264264

265265
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
266266

267+
bool canTransformPtrArithOutOfBounds(const Function &F,
268+
EVT PtrVT) const override;
269+
267270
private:
268271
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
269272
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]

0 commit comments

Comments
 (0)