diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 91fc16e54c88f..53c6b295fc5e1 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -7022,10 +7022,11 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, Function *KernelLaunchFunction = StaleCI->getCalledFunction(); // StaleCI is the CallInst which is the call to the outlined - // target kernel launch function. If there are values that the - // outlined function uses then these are aggregated into a structure - // which is passed as the second argument. If not, then there's - // only one argument, the threadID. So, StaleCI can be + // target kernel launch function. If there are local live-in values + // that the outlined function uses then these are aggregated into a structure + // which is passed as the second argument. If there are no local live-in + // values or if all values used by the outlined kernel are global variables, + // then there's only one argument, the threadID. So, StaleCI can be // // %structArg = alloca { ptr, ptr }, align 8 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0 @@ -7063,6 +7064,8 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, // host and device. assert((!HasShareds || (StaleCI->arg_size() == 2)) && "StaleCI with shareds should have exactly two arguments."); + + Value *ThreadId = ProxyFn->getArg(0); if (HasShareds) { auto *ArgStructAlloca = dyn_cast(StaleCI->getArgOperand(1)); assert(ArgStructAlloca && @@ -7073,7 +7076,6 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, AllocaInst *NewArgStructAlloca = Builder.CreateAlloca(ArgStructType, nullptr, "structArg"); Value *TaskT = ProxyFn->getArg(1); - Value *ThreadId = ProxyFn->getArg(0); Value *SharedsSize = Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); @@ -7086,7 +7088,10 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize); Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca}); + } else { + Builder.CreateCall(KernelLaunchFunction, {ThreadId}); } + Builder.CreateRetVoid(); return ProxyFn; } @@ -7229,11 +7234,23 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false)); Builder.restoreIP(TargetTaskBodyIP); - if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP)) return Err; - OI.ExitBB = Builder.saveIP().getBlock(); + // The outliner (CodeExtractor) extract a sequence or vector of blocks that + // it is given. These blocks are enumerated by + // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock + // to be outside the region. In other words, OI.ExitBlock is expected to be + // the start of the region after the outlining. We used to set OI.ExitBlock + // to the InsertBlock after TaskBodyCB is done. This is fine in most cases + // except when the task body is a single basic block. In that case, + // OI.ExitBlock is set to the single task body block and will get left out of + // the outlining process. So, simply create a new empty block to which we + // uncoditionally branch from where TaskBodyCB left off + OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont"); + emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(), + /*IsFinished=*/true); + OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait, DeviceID](Function &OutlinedFn) mutable { assert(OutlinedFn.getNumUses() == 1 && diff --git a/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir b/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir index 621a206e18053..ece32bb5419c6 100644 --- a/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir @@ -25,8 +25,29 @@ module attributes {omp.is_target_device = false} { // CHECK: define void @omp_target_depend_() // CHECK-NOT: define {{.*}} @ // CHECK-NOT: call i32 @__tgt_target_kernel({{.*}}) +// CHECK: call void @__kmpc_omp_task_begin_if0 +// CHECK-NEXT: call void @.omp_target_task_proxy_func +// CHECK: call void @__kmpc_omp_task_complete_if0 +// https://github.com/llvm/llvm-project/issues/126949 exposes two issues +// 1. Empty target task proxy functions +// 2. When 1 fixed, it leads to a second problem of calling the omp target kernel twice +// Once via the target task proxy function and a second time after the target task is done. +// The following checks check problem #2. +// functions. The following checks tests the fix for this issue. +// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]] +// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]: +// CHECK-NEXT: ret void + +// CHECK: define internal void @omp_target_depend_..omp_par // CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_depend__l[[LINE:.*]](ptr {{.*}}) +// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]] +// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]: // CHECK-NEXT: ret void + // CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_depend__l[[LINE]](ptr %[[ADDR_A:.*]]) // CHECK: store i32 100, ptr %[[ADDR_A]], align 4 + +// The following check test for the fix of problem #1 as described in https://github.com/llvm/llvm-project/issues/126949 +// CHECK: define internal void @.omp_target_task_proxy_func +// CHECK: call void @omp_target_depend_..omp_par diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir index 6b634226a3568..94d8d052d087e 100644 --- a/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir @@ -20,10 +20,30 @@ module attributes {omp.is_target_device = false} { // CHECK: define void @omp_target_nowait_() // CHECK-NOT: define {{.*}} @ // CHECK-NOT: call ptr @__kmpc_omp_target_task_alloc({{.*}}) +// CHECK: call void @__kmpc_omp_task_begin_if0 +// CHECK-NEXT: call void @.omp_target_task_proxy_func +// CHECK: call void @__kmpc_omp_task_complete_if0 +// https://github.com/llvm/llvm-project/issues/126949 exposes two issues +// 1. Empty target task proxy functions +// 2. When 1 fixed, it leads to a second problem of calling the omp target kernel twice +// Once via the target task proxy function and a second time after the target task is done. +// The following checks check problem #2. +// functions. The following checks tests the fix for this issue. +// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]] +// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]: +// CHECK-NEXT: ret void + // Verify that we directly emit a call to the "target" region's body from the // parent function of the the `omp.target` op. +// CHECK: define internal void @omp_target_nowait_..omp_par // CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_nowait__l[[LINE:.*]](ptr {{.*}}) +// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]] +// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]: // CHECK-NEXT: ret void // CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_nowait__l[[LINE]](ptr %[[ADDR_X:.*]]) // CHECK: store float 5{{.*}}, ptr %[[ADDR_X]], align 4 + +// The following check test for the fix of problem #1 as described in https://github.com/llvm/llvm-project/issues/126949 +// CHECK: define internal void @.omp_target_task_proxy_func +// CHECK: call void @omp_target_nowait_..omp_par