Skip to content

Commit c5ea469

Browse files
[OMPIRBuilder] - Fix emitTargetTaskProxyFunc to not generate empty functions (#126958)
This is a fix for #126949 There are two issues being fixed here. First, in some cases, OMPIRBuilder generates empty target task proxy functions. This happens when the target kernel doesn't use any stack-allocated data (either no data or only globals). The second problem is encountered when the target task i.e the code that makes the target call spans a single basic block. This usually happens when we do not generate a target or device kernel launch and instead fall back to the host. In such cases, we end up not outlining the target task entirely. This can cause us to call target kernel twice - once via the target task proxy function and a second time via the host fallback This PR fixes both of these problems and updates some tests to catch these problems should this patch fail.
1 parent 6c62783 commit c5ea469

File tree

3 files changed

+65
-7
lines changed

3 files changed

+65
-7
lines changed

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7099,10 +7099,11 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
70997099
Function *KernelLaunchFunction = StaleCI->getCalledFunction();
71007100

71017101
// StaleCI is the CallInst which is the call to the outlined
7102-
// target kernel launch function. If there are values that the
7103-
// outlined function uses then these are aggregated into a structure
7104-
// which is passed as the second argument. If not, then there's
7105-
// only one argument, the threadID. So, StaleCI can be
7102+
// target kernel launch function. If there are local live-in values
7103+
// that the outlined function uses then these are aggregated into a structure
7104+
// which is passed as the second argument. If there are no local live-in
7105+
// values or if all values used by the outlined kernel are global variables,
7106+
// then there's only one argument, the threadID. So, StaleCI can be
71067107
//
71077108
// %structArg = alloca { ptr, ptr }, align 8
71087109
// %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
@@ -7140,6 +7141,8 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
71407141
// host and device.
71417142
assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
71427143
"StaleCI with shareds should have exactly two arguments.");
7144+
7145+
Value *ThreadId = ProxyFn->getArg(0);
71437146
if (HasShareds) {
71447147
auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
71457148
assert(ArgStructAlloca &&
@@ -7150,7 +7153,6 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
71507153
AllocaInst *NewArgStructAlloca =
71517154
Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
71527155
Value *TaskT = ProxyFn->getArg(1);
7153-
Value *ThreadId = ProxyFn->getArg(0);
71547156
Value *SharedsSize =
71557157
Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
71567158

@@ -7163,7 +7165,10 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
71637165
LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
71647166

71657167
Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
7168+
} else {
7169+
Builder.CreateCall(KernelLaunchFunction, {ThreadId});
71667170
}
7171+
71677172
Builder.CreateRetVoid();
71687173
return ProxyFn;
71697174
}
@@ -7306,11 +7311,23 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
73067311
Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
73077312

73087313
Builder.restoreIP(TargetTaskBodyIP);
7309-
73107314
if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
73117315
return Err;
73127316

7313-
OI.ExitBB = Builder.saveIP().getBlock();
7317+
// The outliner (CodeExtractor) extract a sequence or vector of blocks that
7318+
// it is given. These blocks are enumerated by
7319+
// OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
7320+
// to be outside the region. In other words, OI.ExitBlock is expected to be
7321+
// the start of the region after the outlining. We used to set OI.ExitBlock
7322+
// to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7323+
// except when the task body is a single basic block. In that case,
7324+
// OI.ExitBlock is set to the single task body block and will get left out of
7325+
// the outlining process. So, simply create a new empty block to which we
7326+
// uncoditionally branch from where TaskBodyCB left off
7327+
OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7328+
emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7329+
/*IsFinished=*/true);
7330+
73147331
OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
73157332
DeviceID](Function &OutlinedFn) mutable {
73167333
assert(OutlinedFn.getNumUses() == 1 &&

mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,29 @@ module attributes {omp.is_target_device = false} {
2525
// CHECK: define void @omp_target_depend_()
2626
// CHECK-NOT: define {{.*}} @
2727
// CHECK-NOT: call i32 @__tgt_target_kernel({{.*}})
28+
// CHECK: call void @__kmpc_omp_task_begin_if0
29+
// CHECK-NEXT: call void @.omp_target_task_proxy_func
30+
// CHECK: call void @__kmpc_omp_task_complete_if0
31+
// https://github.com/llvm/llvm-project/issues/126949 exposes two issues
32+
// 1. Empty target task proxy functions
33+
// 2. When 1 fixed, it leads to a second problem of calling the omp target kernel twice
34+
// Once via the target task proxy function and a second time after the target task is done.
35+
// The following checks check problem #2.
36+
// functions. The following checks tests the fix for this issue.
37+
// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]]
38+
// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]:
39+
// CHECK-NEXT: ret void
40+
41+
// CHECK: define internal void @omp_target_depend_..omp_par
2842
// CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_depend__l[[LINE:.*]](ptr {{.*}})
43+
// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]]
44+
// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]:
2945
// CHECK-NEXT: ret void
3046

47+
3148
// CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_depend__l[[LINE]](ptr %[[ADDR_A:.*]])
3249
// CHECK: store i32 100, ptr %[[ADDR_A]], align 4
50+
51+
// The following check test for the fix of problem #1 as described in https://github.com/llvm/llvm-project/issues/126949
52+
// CHECK: define internal void @.omp_target_task_proxy_func
53+
// CHECK: call void @omp_target_depend_..omp_par

mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,30 @@ module attributes {omp.is_target_device = false} {
2020
// CHECK: define void @omp_target_nowait_()
2121
// CHECK-NOT: define {{.*}} @
2222
// CHECK-NOT: call ptr @__kmpc_omp_target_task_alloc({{.*}})
23+
// CHECK: call void @__kmpc_omp_task_begin_if0
24+
// CHECK-NEXT: call void @.omp_target_task_proxy_func
25+
// CHECK: call void @__kmpc_omp_task_complete_if0
26+
// https://github.com/llvm/llvm-project/issues/126949 exposes two issues
27+
// 1. Empty target task proxy functions
28+
// 2. When 1 fixed, it leads to a second problem of calling the omp target kernel twice
29+
// Once via the target task proxy function and a second time after the target task is done.
30+
// The following checks check problem #2.
31+
// functions. The following checks tests the fix for this issue.
32+
// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]]
33+
// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]:
34+
// CHECK-NEXT: ret void
35+
2336
// Verify that we directly emit a call to the "target" region's body from the
2437
// parent function of the the `omp.target` op.
38+
// CHECK: define internal void @omp_target_nowait_..omp_par
2539
// CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_nowait__l[[LINE:.*]](ptr {{.*}})
40+
// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]]
41+
// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]:
2642
// CHECK-NEXT: ret void
2743

2844
// CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_nowait__l[[LINE]](ptr %[[ADDR_X:.*]])
2945
// CHECK: store float 5{{.*}}, ptr %[[ADDR_X]], align 4
46+
47+
// The following check test for the fix of problem #1 as described in https://github.com/llvm/llvm-project/issues/126949
48+
// CHECK: define internal void @.omp_target_task_proxy_func
49+
// CHECK: call void @omp_target_nowait_..omp_par

0 commit comments

Comments
 (0)