Skip to content

[OMPIRBuilder] - Fix emitTargetTaskProxyFunc to not generate empty functions #126958

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7022,10 +7022,11 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
Function *KernelLaunchFunction = StaleCI->getCalledFunction();

// StaleCI is the CallInst which is the call to the outlined
// target kernel launch function. If there are values that the
// outlined function uses then these are aggregated into a structure
// which is passed as the second argument. If not, then there's
// only one argument, the threadID. So, StaleCI can be
// target kernel launch function. If there are local live-in values
// that the outlined function uses then these are aggregated into a structure
// which is passed as the second argument. If there are no local live-in
// values or if all values used by the outlined kernel are global variables,
// then there's only one argument, the threadID. So, StaleCI can be
//
// %structArg = alloca { ptr, ptr }, align 8
// %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
Expand Down Expand Up @@ -7063,6 +7064,8 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
// host and device.
assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
"StaleCI with shareds should have exactly two arguments.");

Value *ThreadId = ProxyFn->getArg(0);
if (HasShareds) {
auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
assert(ArgStructAlloca &&
Expand All @@ -7073,7 +7076,6 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
AllocaInst *NewArgStructAlloca =
Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
Value *TaskT = ProxyFn->getArg(1);
Value *ThreadId = ProxyFn->getArg(0);
Value *SharedsSize =
Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));

Expand All @@ -7086,7 +7088,10 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);

Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
} else {
Builder.CreateCall(KernelLaunchFunction, {ThreadId});
}

Builder.CreateRetVoid();
return ProxyFn;
}
Expand Down Expand Up @@ -7229,11 +7234,23 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));

Builder.restoreIP(TargetTaskBodyIP);

if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
return Err;

OI.ExitBB = Builder.saveIP().getBlock();
// The outliner (CodeExtractor) extract a sequence or vector of blocks that
// it is given. These blocks are enumerated by
// OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
// to be outside the region. In other words, OI.ExitBlock is expected to be
// the start of the region after the outlining. We used to set OI.ExitBlock
// to the InsertBlock after TaskBodyCB is done. This is fine in most cases
// except when the task body is a single basic block. In that case,
// OI.ExitBlock is set to the single task body block and will get left out of
// the outlining process. So, simply create a new empty block to which we
// uncoditionally branch from where TaskBodyCB left off
OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
/*IsFinished=*/true);

OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
DeviceID](Function &OutlinedFn) mutable {
assert(OutlinedFn.getNumUses() == 1 &&
Expand Down
21 changes: 21 additions & 0 deletions mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,29 @@ module attributes {omp.is_target_device = false} {
// CHECK: define void @omp_target_depend_()
// CHECK-NOT: define {{.*}} @
// CHECK-NOT: call i32 @__tgt_target_kernel({{.*}})
// CHECK: call void @__kmpc_omp_task_begin_if0
// CHECK-NEXT: call void @.omp_target_task_proxy_func
// CHECK: call void @__kmpc_omp_task_complete_if0
// https://github.com/llvm/llvm-project/issues/126949 exposes two issues
// 1. Empty target task proxy functions
// 2. When 1 fixed, it leads to a second problem of calling the omp target kernel twice
// Once via the target task proxy function and a second time after the target task is done.
// The following checks check problem #2.
// functions. The following checks tests the fix for this issue.
// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]]
// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]:
// CHECK-NEXT: ret void

// CHECK: define internal void @omp_target_depend_..omp_par
// CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_depend__l[[LINE:.*]](ptr {{.*}})
// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]]
// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]:
// CHECK-NEXT: ret void


// CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_depend__l[[LINE]](ptr %[[ADDR_A:.*]])
// CHECK: store i32 100, ptr %[[ADDR_A]], align 4

// The following check test for the fix of problem #1 as described in https://github.com/llvm/llvm-project/issues/126949
// CHECK: define internal void @.omp_target_task_proxy_func
// CHECK: call void @omp_target_depend_..omp_par
20 changes: 20 additions & 0 deletions mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,30 @@ module attributes {omp.is_target_device = false} {
// CHECK: define void @omp_target_nowait_()
// CHECK-NOT: define {{.*}} @
// CHECK-NOT: call ptr @__kmpc_omp_target_task_alloc({{.*}})
// CHECK: call void @__kmpc_omp_task_begin_if0
// CHECK-NEXT: call void @.omp_target_task_proxy_func
// CHECK: call void @__kmpc_omp_task_complete_if0
// https://github.com/llvm/llvm-project/issues/126949 exposes two issues
// 1. Empty target task proxy functions
// 2. When 1 fixed, it leads to a second problem of calling the omp target kernel twice
// Once via the target task proxy function and a second time after the target task is done.
// The following checks check problem #2.
// functions. The following checks tests the fix for this issue.
// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]]
// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]:
// CHECK-NEXT: ret void

// Verify that we directly emit a call to the "target" region's body from the
// parent function of the the `omp.target` op.
// CHECK: define internal void @omp_target_nowait_..omp_par
// CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_nowait__l[[LINE:.*]](ptr {{.*}})
// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]]
// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]:
// CHECK-NEXT: ret void

// CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_nowait__l[[LINE]](ptr %[[ADDR_X:.*]])
// CHECK: store float 5{{.*}}, ptr %[[ADDR_X]], align 4

// The following check test for the fix of problem #1 as described in https://github.com/llvm/llvm-project/issues/126949
// CHECK: define internal void @.omp_target_task_proxy_func
// CHECK: call void @omp_target_nowait_..omp_par