Skip to content

[llvm][amdgpu] Handle indirect refs to LDS GVs during LDS lowering #124089

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 45 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,20 +141,25 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
FunctionVariableMap DirectMapFunction;
getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

// Collect variables that are used by functions whose address has escaped
DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
// Collect functions whose address has escaped
DenseSet<Function *> AddressTakenFuncs;
for (Function &F : M.functions()) {
if (!isKernelLDS(&F))
if (F.hasAddressTaken(nullptr,
/* IgnoreCallbackUses */ false,
/* IgnoreAssumeLikeCalls */ false,
/* IgnoreLLVMUsed */ true,
/* IgnoreArcAttachedCall */ false)) {
set_union(VariablesReachableThroughFunctionPointer,
DirectMapFunction[&F]);
AddressTakenFuncs.insert(&F);
}
}

// Collect variables that are used by functions whose address has escaped
DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
for (Function *F : AddressTakenFuncs) {
set_union(VariablesReachableThroughFunctionPointer, DirectMapFunction[F]);
}

auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
assert(!F->isDeclaration());
for (const CallGraphNode::CallRecord &R : *CG[F]) {
Expand Down Expand Up @@ -206,6 +211,13 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
}
}

// Collect variables that are transitively used by functions whose address has
// escaped
for (Function *F : AddressTakenFuncs) {
set_union(VariablesReachableThroughFunctionPointer,
TransitiveMapFunction[F]);
}

// DirectMapKernel lists which variables are used by the kernel
// find the variables which are used through a function call
FunctionVariableMap IndirectMapKernel;
Expand All @@ -218,11 +230,37 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
Function *Ith = R.second->getFunction();
if (Ith) {
set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
} else {
set_union(IndirectMapKernel[&Func],
VariablesReachableThroughFunctionPointer);
}
}

// Check if the kernel encounters unknown calls, whether directly or
// indirectly.
bool SeesUnknownCalls = [&]() {
SmallVector<Function *> WorkList = {CG[&Func]->getFunction()};
SmallPtrSet<Function *, 8> Visited;

while (!WorkList.empty()) {
Function *F = WorkList.pop_back_val();

for (const CallGraphNode::CallRecord &CallRecord : *CG[F]) {
if (!CallRecord.second)
continue;

Function *Callee = CallRecord.second->getFunction();
if (!Callee)
return true;

if (Visited.insert(Callee).second)
WorkList.push_back(Callee);
}
}
return false;
}();

if (SeesUnknownCalls) {
set_union(IndirectMapKernel[&Func],
VariablesReachableThroughFunctionPointer);
}
}

// Verify that we fall into one of 2 cases:
Expand Down
44 changes: 44 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

; Tests that the LDS lowering pass handles indirect references to LDS GVs; i.e.
; that it lowers to accesses into the generated LDS struct if these references
; are deep in the call graph starting at the kernel.

@lds_item_to_indirectly_load = internal addrspace(3) global ptr poison, align 8

%store_type = type { i32, ptr }
@place_to_store_indirect_caller = internal addrspace(3) global %store_type poison, align 8

; Kernel entry point. Stores the address of @indirectly_load_lds into the LDS
; global @place_to_store_indirect_caller (so that function's address is taken),
; then calls @call_unknown. The kernel itself never references
; @lds_item_to_indirectly_load.
define amdgpu_kernel void @offloading_kernel() {
store ptr @indirectly_load_lds, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @place_to_store_indirect_caller, i32 0), align 8
call void @call_unknown()
ret void
}

; Makes an indirect call: the callee operand is the SSA value %1 rather than a
; named function, so the call target is not statically known.
define void @call_unknown() {
%1 = alloca ptr, align 8
%2 = call i32 %1()
ret void
}

; Function whose address is stored by @offloading_kernel. It does not touch LDS
; itself; it only forwards to @directly_load_lds.
define void @indirectly_load_lds() {
call void @directly_load_lds()
ret void
}

; Loads directly from the LDS global @lds_item_to_indirectly_load; the loaded
; value is not used afterwards.
define void @directly_load_lds() {
%2 = load ptr, ptr addrspace(3) @lds_item_to_indirectly_load, align 8
ret void
}

; CHECK: %[[LDS_STRUCT_TY:.*]] = type { %store_type, ptr }
; CHECK: @[[LDS_STRUCT:.*]] = {{.*}} %[[LDS_STRUCT_TY]] {{.*}} !absolute_symbol

; CHECK: define amdgpu_kernel void @offloading_kernel() {{.*}} {
; CHECK: store ptr @indirectly_load_lds, {{.*}} @[[LDS_STRUCT]]
; CHECK: call void @call_unknown()
; CHECK: }

; CHECK: define void @directly_load_lds() {
; CHECK: load ptr, {{.*}} (%[[LDS_STRUCT_TY]], {{.*}} @[[LDS_STRUCT]], i32 0, i32 1)
; CHECK: }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ define amdgpu_kernel void @kernel_lds_recursion() {
; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="4" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
Expand Down
Loading