Skip to content

Commit 030a405

Browse files
committed
SWDEV-321398: replace hostcall module flag with function attribute
This internal version is currently a squash of four upstream reviews: 1. D119087: [AMDGPU] [NFC] refactor the AMDGPU attributor 2. D119308: [AMDGPU] [NFC] Fix incorrect use of bitwise operator. 3. D119249: [Attributor][NFC] Expose new API in AAPointerInfo 4. D119216: [AMDGPU] replace hostcall module flag with function attribute Of these #1, #2 and #3 are submitted in upstream/main, while #4 is under review. The module flag to indicate use of hostcall is insufficient to catch all cases where hostcall might be in use by a kernel. This is now replaced by a function attribute that gets propagated to top-level kernel functions via their respective call-graph. If the attribute "amdgpu-no-hostcall-ptr" is absent on a kernel, the default behaviour is to emit kernel metadata indicating that the kernel uses the hostcall buffer pointer passed as an implicit argument. The attribute may be placed explicitly by the user, or inferred by the AMDGPU attributor by examining the call-graph. The attribute is inferred only if the function is not being sanitized, and the implicitarg_ptr does not result in a load of any byte in the hostcall pointer argument. Change-Id: I6cc12050602c3f477575c3ca09a883797169e9e3
1 parent 6935430 commit 030a405

27 files changed

+623
-348
lines changed

llvm/include/llvm/Transforms/IPO/Attributor.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4592,6 +4592,48 @@ struct AAPointerInfo : public AbstractAttribute {
45924592
/// See AbstractAttribute::getIdAddr()
45934593
const char *getIdAddr() const override { return &ID; }
45944594

4595+
/// Helper to represent an access offset and size, with logic to deal with
/// uncertainty and check for overlapping accesses.
struct OffsetAndSize : public std::pair<int64_t, int64_t> {
  using BaseTy = std::pair<int64_t, int64_t>;
  OffsetAndSize(int64_t Offset, int64_t Size) : BaseTy(Offset, Size) {}
  OffsetAndSize(const BaseTy &P) : BaseTy(P) {}
  int64_t getOffset() const { return first; }
  int64_t getSize() const { return second; }
  /// Return the sentinel pair representing a fully unknown access.
  static OffsetAndSize getUnknown() { return OffsetAndSize(Unknown, Unknown); }

  /// Return true if offset or size are unknown.
  bool offsetOrSizeAreUnknown() const {
    return getOffset() == OffsetAndSize::Unknown ||
           getSize() == OffsetAndSize::Unknown;
  }

  /// Return true if this offset and size pair might describe an address that
  /// overlaps with \p OAS.
  bool mayOverlap(const OffsetAndSize &OAS) const {
    // Any unknown value and we are giving up -> overlap.
    if (offsetOrSizeAreUnknown() || OAS.offsetOrSizeAreUnknown())
      return true;

    // Check if one offset point is in the other half-open interval
    // [offset, offset+size).
    return OAS.getOffset() + OAS.getSize() > getOffset() &&
           OAS.getOffset() < getOffset() + getSize();
  }

  /// Constant used to represent unknown offset or sizes.
  /// Widen to int64_t *before* shifting: the previous `1 << 31` shifted into
  /// the sign bit of a 32-bit int, which is undefined behavior before C++20.
  static constexpr int64_t Unknown = int64_t(1) << 31;
};
4629+
4630+
/// Call \p CB on all accesses that might interfere with \p OAS and return
/// true if all such accesses were known and the callback returned true for
/// all of them, false otherwise. An access interferes with an offset-size
/// pair if it might read or write that memory region. The bool passed to
/// \p CB appears to indicate whether the access exactly matches the queried
/// range (callers name it IsExact) -- NOTE(review): confirm against the
/// implementing class.
virtual bool forallInterferingAccesses(
    OffsetAndSize OAS, function_ref<bool(const Access &, bool)> CB) const = 0;
4636+
45954637
/// Call \p CB on all accesses that might interfere with \p LI and return true
45964638
/// if all such accesses were known and the callback returned true for all of
45974639
/// them, false otherwise.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
//===--- AMDGPUAttributes.def ---------------------------------*- C++ -*---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains descriptions of the various function attributes
// that indicate *absence* of the corresponding implicit kernel
// arguments.
//
// Each inclusion site defines AMDGPU_ATTRIBUTE(Name, Str) before including
// this file. The textual order of entries determines the bit position
// assigned to each attribute where this file is expanded (see the
// Name##_POS enum in AMDGPUAttributor.cpp), so do not reorder entries
// without auditing those expansions.
//
//===----------------------------------------------------------------------===//

// NOTE: NO INCLUDE GUARD DESIRED!

AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
AMDGPU_ATTRIBUTE(QUEUE_PTR, "amdgpu-no-queue-ptr")
AMDGPU_ATTRIBUTE(DISPATCH_ID, "amdgpu-no-dispatch-id")
AMDGPU_ATTRIBUTE(IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr")
AMDGPU_ATTRIBUTE(HOSTCALL_PTR, "amdgpu-no-hostcall-ptr")
AMDGPU_ATTRIBUTE(WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x")
AMDGPU_ATTRIBUTE(WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y")
AMDGPU_ATTRIBUTE(WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z")
AMDGPU_ATTRIBUTE(WORKITEM_ID_X, "amdgpu-no-workitem-id-x")
AMDGPU_ATTRIBUTE(WORKITEM_ID_Y, "amdgpu-no-workitem-id-y")
AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z")

#undef AMDGPU_ATTRIBUTE

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 127 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include "AMDGPU.h"
1414
#include "GCNSubtarget.h"
15+
#include "Utils/AMDGPUBaseInfo.h"
1516
#include "llvm/CodeGen/TargetPassConfig.h"
1617
#include "llvm/IR/IntrinsicsAMDGPU.h"
1718
#include "llvm/IR/IntrinsicsR600.h"
@@ -22,37 +23,25 @@
2223

2324
using namespace llvm;
2425

26+
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
27+
28+
enum ImplicitArgumentPositions {
29+
#include "AMDGPUAttributes.def"
30+
LAST_ARG_POS
31+
};
32+
33+
#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
34+
2535
enum ImplicitArgumentMask {
2636
NOT_IMPLICIT_INPUT = 0,
27-
28-
// SGPRs
29-
DISPATCH_PTR = 1 << 0,
30-
QUEUE_PTR = 1 << 1,
31-
DISPATCH_ID = 1 << 2,
32-
IMPLICIT_ARG_PTR = 1 << 3,
33-
WORKGROUP_ID_X = 1 << 4,
34-
WORKGROUP_ID_Y = 1 << 5,
35-
WORKGROUP_ID_Z = 1 << 6,
36-
37-
// VGPRS:
38-
WORKITEM_ID_X = 1 << 7,
39-
WORKITEM_ID_Y = 1 << 8,
40-
WORKITEM_ID_Z = 1 << 9,
41-
ALL_ARGUMENT_MASK = (1 << 10) - 1
37+
#include "AMDGPUAttributes.def"
38+
ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
4239
};
4340

41+
#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
4442
static constexpr std::pair<ImplicitArgumentMask,
4543
StringLiteral> ImplicitAttrs[] = {
46-
{DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
47-
{QUEUE_PTR, "amdgpu-no-queue-ptr"},
48-
{DISPATCH_ID, "amdgpu-no-dispatch-id"},
49-
{IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
50-
{WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
51-
{WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
52-
{WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
53-
{WORKITEM_ID_X, "amdgpu-no-workitem-id-x"},
54-
{WORKITEM_ID_Y, "amdgpu-no-workitem-id-y"},
55-
{WORKITEM_ID_Z, "amdgpu-no-workitem-id-z"}
44+
#include "AMDGPUAttributes.def"
5645
};
5746

5847
// We do not need to note the x workitem or workgroup id because they are always
@@ -90,7 +79,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
9079
case Intrinsic::amdgcn_queue_ptr:
9180
case Intrinsic::amdgcn_is_shared:
9281
case Intrinsic::amdgcn_is_private:
93-
// TODO: Does not require queue ptr on gfx9+
82+
// TODO: Does not require the queue pointer on gfx9+
9483
case Intrinsic::trap:
9584
case Intrinsic::debugtrap:
9685
IsQueuePtr = true;
@@ -112,6 +101,17 @@ static bool isDSAddress(const Constant *C) {
112101
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
113102
}
114103

104+
/// Returns true if the function requires the implicit argument be passed
105+
/// regardless of the function contents.
106+
static bool funcRequiresHostcallPtr(const Function &F) {
107+
// Sanitizers require the hostcall buffer passed in the implicit arguments.
108+
return F.hasFnAttribute(Attribute::SanitizeAddress) ||
109+
F.hasFnAttribute(Attribute::SanitizeThread) ||
110+
F.hasFnAttribute(Attribute::SanitizeMemory) ||
111+
F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
112+
F.hasFnAttribute(Attribute::SanitizeMemTag);
113+
}
114+
115115
class AMDGPUInformationCache : public InformationCache {
116116
public:
117117
AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
@@ -129,7 +129,7 @@ class AMDGPUInformationCache : public InformationCache {
129129
}
130130

131131
private:
132-
/// Check if the ConstantExpr \p CE requires queue ptr attribute.
132+
/// Check if the ConstantExpr \p CE requires the queue pointer.
133133
static bool visitConstExpr(const ConstantExpr *CE) {
134134
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
135135
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
@@ -163,7 +163,7 @@ class AMDGPUInformationCache : public InformationCache {
163163
}
164164

165165
public:
166-
/// Returns true if \p Fn needs a queue ptr attribute because of \p C.
166+
/// Returns true if \p Fn needs the queue pointer because of \p C.
167167
bool needsQueuePtr(const Constant *C, Function &Fn) {
168168
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
169169
bool HasAperture = hasApertureRegs(Fn);
@@ -182,7 +182,7 @@ class AMDGPUInformationCache : public InformationCache {
182182
}
183183

184184
private:
185-
/// Used to determine if the Constant needs a queue ptr attribute.
185+
/// Used to determine if the Constant needs the queue pointer.
186186
DenseMap<const Constant *, uint8_t> ConstantStatus;
187187
};
188188

@@ -327,7 +327,20 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
327327

328328
void initialize(Attributor &A) override {
329329
Function *F = getAssociatedFunction();
330+
331+
// If the function requires the implicit arg pointer due to sanitizers,
332+
// assume it's needed even if explicitly marked as not requiring it.
333+
const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
334+
if (NeedsHostcall) {
335+
removeAssumedBits(IMPLICIT_ARG_PTR);
336+
removeAssumedBits(HOSTCALL_PTR);
337+
}
338+
330339
for (auto Attr : ImplicitAttrs) {
340+
if (NeedsHostcall &&
341+
(Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
342+
continue;
343+
331344
if (F->hasFnAttribute(Attr.second))
332345
addKnownBits(Attr.first);
333346
}
@@ -355,7 +368,6 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
355368
return indicatePessimisticFixpoint();
356369

357370
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
358-
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
359371

360372
bool NeedsQueuePtr = false;
361373

@@ -377,13 +389,58 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
377389
}
378390
}
379391

380-
// If we found that we need amdgpu-queue-ptr, nothing else to do.
392+
if (!NeedsQueuePtr) {
393+
NeedsQueuePtr = checkForQueuePtr(A);
394+
}
395+
381396
if (NeedsQueuePtr) {
382397
removeAssumedBits(QUEUE_PTR);
383-
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
384-
ChangeStatus::UNCHANGED;
385398
}
386399

400+
if (funcRetrievesHostcallPtr(A)) {
401+
removeAssumedBits(IMPLICIT_ARG_PTR);
402+
removeAssumedBits(HOSTCALL_PTR);
403+
}
404+
405+
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
406+
: ChangeStatus::UNCHANGED;
407+
}
408+
409+
/// Write the known "absent implicit argument" attributes back to the IR.
ChangeStatus manifest(Attributor &A) override {
  LLVMContext &Ctx = getAssociatedFunction()->getContext();

  // Materialize an attribute for every implicit argument that is known to
  // be unused by this function.
  SmallVector<Attribute, 8> NewAttrs;
  for (auto Attr : ImplicitAttrs)
    if (isKnown(Attr.first))
      NewAttrs.push_back(Attribute::get(Ctx, Attr.second));

  return IRAttributeManifest::manifestAttrs(A, getIRPosition(), NewAttrs,
                                            /* ForceReplace */ true);
}
421+
422+
/// Render the attribute state as "AMDInfo[ <attr> <attr> ... ]" for debug
/// output. Note this prints every implicit attribute name, not only the
/// known ones.
const std::string getAsStr() const override {
  std::string Buffer;
  raw_string_ostream Stream(Buffer);
  Stream << "AMDInfo[";
  for (auto Attr : ImplicitAttrs)
    Stream << ' ' << Attr.second;
  Stream << " ]";
  return Stream.str();
}
431+
432+
/// See AbstractAttribute::trackStatistics()
// Intentionally empty: no statistics are collected for this attribute.
void trackStatistics() const override {}
434+
435+
private:
436+
bool checkForQueuePtr(Attributor &A) {
437+
Function *F = getAssociatedFunction();
438+
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
439+
440+
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
441+
442+
bool NeedsQueuePtr = false;
443+
387444
auto CheckAddrSpaceCasts = [&](Instruction &I) {
388445
unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
389446
if (castRequiresQueuePtr(SrcAS)) {
@@ -398,69 +455,63 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
398455
// `checkForAllInstructions` is much cheaper than going through all
399456
// instructions, try it first.
400457

401-
// amdgpu-queue-ptr is not needed if aperture regs is present.
458+
// The queue pointer is not needed if aperture regs is present.
402459
if (!HasApertureRegs) {
403460
bool UsedAssumedInformation = false;
404461
A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
405462
{Instruction::AddrSpaceCast},
406463
UsedAssumedInformation);
407464
}
408465

409-
// If we found that we need amdgpu-queue-ptr, nothing else to do.
410-
if (NeedsQueuePtr) {
411-
removeAssumedBits(QUEUE_PTR);
412-
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
413-
ChangeStatus::UNCHANGED;
414-
}
466+
// If we found that we need the queue pointer, nothing else to do.
467+
if (NeedsQueuePtr)
468+
return true;
415469

416-
if (!IsNonEntryFunc && HasApertureRegs) {
417-
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
418-
ChangeStatus::UNCHANGED;
419-
}
470+
if (!IsNonEntryFunc && HasApertureRegs)
471+
return false;
420472

421473
for (BasicBlock &BB : *F) {
422474
for (Instruction &I : BB) {
423475
for (const Use &U : I.operands()) {
424476
if (const auto *C = dyn_cast<Constant>(U)) {
425-
if (InfoCache.needsQueuePtr(C, *F)) {
426-
removeAssumedBits(QUEUE_PTR);
427-
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
428-
ChangeStatus::UNCHANGED;
429-
}
477+
if (InfoCache.needsQueuePtr(C, *F))
478+
return true;
430479
}
431480
}
432481
}
433482
}
434483

435-
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
436-
ChangeStatus::UNCHANGED;
484+
return false;
437485
}
438486

439-
ChangeStatus manifest(Attributor &A) override {
440-
SmallVector<Attribute, 8> AttrList;
441-
LLVMContext &Ctx = getAssociatedFunction()->getContext();
442-
443-
for (auto Attr : ImplicitAttrs) {
444-
if (isKnown(Attr.first))
445-
AttrList.push_back(Attribute::get(Ctx, Attr.second));
446-
}
487+
/// Returns true if some call in the function may result in a load of the
/// hostcall pointer slot of the implicit kernel arguments.
bool funcRetrievesHostcallPtr(Attributor &A) {
  // Byte offset of the hostcall pointer within the implicit argument block.
  auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();

  // Check if this is a call to the implicitarg_ptr builtin and it
  // is used to retrieve the hostcall pointer. The implicit arg for
  // hostcall is not used only if every use of the implicitarg_ptr
  // is a load that clearly does not retrieve any byte of the
  // hostcall pointer. We check this by tracing all the uses of the
  // initial call to the implicitarg_ptr intrinsic.
  auto DoesNotLeadToHostcallPtr = [&](Instruction &I) {
    auto &Call = cast<CallBase>(I);
    // Calls other than implicitarg_ptr cannot expose the hostcall slot here.
    if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
      return true;

    const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
        *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

    // The queried region is 8 bytes at offset Pos -- presumably the size of
    // the hostcall buffer pointer; TODO confirm against the ABI definition.
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return PointerInfoAA.forallInterferingAccesses(
        OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) {
          // Only droppable accesses are treated as benign overlaps.
          // NOTE(review): assumes droppable instructions never observe the
          // pointer value -- confirm.
          return Acc.getRemoteInst()->isDroppable();
        });
  };

  bool UsedAssumedInformation = false;
  // Inverted: if any call-like instruction may lead to the hostcall pointer
  // (or the walk was inconclusive), report that it is retrieved.
  return !A.checkForAllCallLikeInstructions(DoesNotLeadToHostcallPtr, *this,
                                            UsedAssumedInformation);
}
451-
452-
const std::string getAsStr() const override {
453-
std::string Str;
454-
raw_string_ostream OS(Str);
455-
OS << "AMDInfo[";
456-
for (auto Attr : ImplicitAttrs)
457-
OS << ' ' << Attr.second;
458-
OS << " ]";
459-
return OS.str();
460-
}
461-
462-
/// See AbstractAttribute::trackStatistics()
463-
void trackStatistics() const override {}
464515
};
465516

466517
AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
@@ -497,7 +548,8 @@ class AMDGPUAttributor : public ModulePass {
497548
BumpPtrAllocator Allocator;
498549
AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
499550
DenseSet<const char *> Allowed(
500-
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, &AACallEdges::ID});
551+
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
552+
&AACallEdges::ID, &AAPointerInfo::ID});
501553

502554
Attributor A(Functions, InfoCache, CGUpdater, &Allowed);
503555

0 commit comments

Comments
 (0)