Skip to content

Commit 8293955

Browse files
committed
[OpenACC][CIR] Lower 'num_workers' for combined constructs
Similar to num_gangs, implementation is trivial, so adding tests and adding the small amount of implementation.
1 parent 5c1ef33 commit 8293955

File tree

2 files changed

+69
-5
lines changed

2 files changed

+69
-5
lines changed

clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -229,12 +229,10 @@ class OpenACCClauseCIREmitter final
229229
operation.addNumWorkersOperand(builder.getContext(),
230230
createIntExpr(clause.getIntExpr()),
231231
lastDeviceTypeValues);
232-
} else if constexpr (isOneOfTypes<OpTy, mlir::acc::SerialOp>) {
233-
llvm_unreachable("num_workers not valid on serial");
232+
} else if constexpr (isCombinedType<OpTy>) {
233+
applyToComputeOp(clause);
234234
} else {
235-
// TODO: When we've implemented this for everything, switch this to an
236-
// unreachable. Combined constructs remain.
237-
return clauseNotImplemented(clause);
235+
llvm_unreachable("Unknown construct kind in VisitNumGangsClause");
238236
}
239237
}
240238

clang/test/CIR/CodeGenOpenACC/combined.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -813,4 +813,70 @@ extern "C" void acc_combined(int N, int cond) {
813813
// CHECK-NEXT: acc.yield
814814
// CHECK-NEXT: } loc
815815

816+
#pragma acc parallel loop num_workers(cond)
817+
for(unsigned I = 0; I < N; ++I);
818+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
819+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
820+
// CHECK-NEXT: acc.parallel combined(loop) num_workers(%[[CONV_CAST]] : si32) {
821+
// CHECK-NEXT: acc.loop combined(parallel) {
822+
// CHECK: acc.yield
823+
// CHECK-NEXT: } loc
824+
// CHECK-NEXT: acc.yield
825+
// CHECK-NEXT: } loc
826+
827+
#pragma acc kernels loop num_workers(cond) device_type(nvidia) num_workers(2u)
828+
for(unsigned I = 0; I < N; ++I);
829+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
830+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
831+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
832+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
833+
// CHECK-NEXT: acc.kernels combined(loop) num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
834+
// CHECK-NEXT: acc.loop combined(kernels) {
835+
// CHECK: acc.yield
836+
// CHECK-NEXT: } loc
837+
// CHECK-NEXT: acc.terminator
838+
// CHECK-NEXT: } loc
839+
840+
#pragma acc parallel loop num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3)
841+
for(unsigned I = 0; I < N; ++I);
842+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
843+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
844+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
845+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
846+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
847+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
848+
// CHECK-NEXT: acc.parallel combined(loop) num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
849+
// CHECK-NEXT: acc.loop combined(parallel) {
850+
// CHECK: acc.yield
851+
// CHECK-NEXT: } loc
852+
// CHECK-NEXT: acc.yield
853+
// CHECK-NEXT: } loc
854+
855+
#pragma acc kernels loop num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(4)
856+
for(unsigned I = 0; I < N; ++I);
857+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
858+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
859+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
860+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
861+
// CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i
862+
// CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32
863+
// CHECK-NEXT: acc.kernels combined(loop) num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[FOUR_CAST]] : si32 [#acc.device_type<radeon>], %[[FOUR_CAST]] : si32 [#acc.device_type<multicore>]) {
864+
// CHECK-NEXT: acc.loop combined(kernels) {
865+
// CHECK: acc.yield
866+
// CHECK-NEXT: } loc
867+
// CHECK-NEXT: acc.terminator
868+
// CHECK-NEXT: } loc
869+
870+
#pragma acc parallel loop device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3)
871+
for(unsigned I = 0; I < N; ++I);
872+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
873+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
874+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
875+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
876+
// CHECK-NEXT: acc.parallel combined(loop) num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
877+
// CHECK-NEXT: acc.loop combined(parallel) {
878+
// CHECK: acc.yield
879+
// CHECK-NEXT: } loc
880+
// CHECK-NEXT: acc.yield
881+
// CHECK-NEXT: } loc
816882
}

0 commit comments

Comments
 (0)