Skip to content

Commit 597d2f7

Browse files
tgymnich and jhuber6 authored
[OpenMP] Add Environment Variable to disable Reuse of Blocks for High Loop Trip Counts (#89239)
Sometimes it might be beneficial to spawn more thread blocks instead of reusing existing ones for multiple loop iterations. **Alternatives considered:** Make `DefaultNumBlocks` settable via an environment variable. --------- Co-authored-by: Joseph Huber <[email protected]>
1 parent 43e6f46 commit 597d2f7

File tree

4 files changed

+55
-1
lines changed

4 files changed

+55
-1
lines changed

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -826,6 +826,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
826826
return OMPX_MinThreadsForLowTripCount;
827827
}
828828

829+
/// Whether or not to reuse blocks for high trip count loops. Returns the current value of the OMPX_ReuseBlocksForHighTripCount envar member; GenericKernelTy::getNumBlocks consults it to decide whether the preferred block count is capped at the default number of blocks.
830+
/// @see OMPX_ReuseBlocksForHighTripCount
831+
bool getReuseBlocksForHighTripCount() {
832+
return OMPX_ReuseBlocksForHighTripCount;
833+
}
834+
829835
/// Get the total amount of hardware parallelism supported by the target
830836
/// device. This is the total amount of warps or wavefronts that can be
831837
/// resident on the device simultaneously.
@@ -901,6 +907,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
901907
UInt32Envar OMPX_MinThreadsForLowTripCount =
902908
UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
903909

910+
BoolEnvar OMPX_ReuseBlocksForHighTripCount =
911+
BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
912+
904913
protected:
905914
/// Environment variables defined by the LLVM OpenMP implementation
906915
/// regarding the initial number of streams and events.

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -701,8 +701,11 @@ uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
701701
TripCountNumBlocks = LoopTripCount;
702702
}
703703
}
704+
705+
uint32_t PreferredNumBlocks = TripCountNumBlocks;
704706
// If the loops are long running we rather reuse blocks than spawn too many.
705-
uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
707+
if (GenericDevice.getReuseBlocksForHighTripCount())
708+
PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
706709
return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
707710
}
708711

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=False %libomptarget-run-generic 2>&1 | %fcheck-generic
3+
// RUN: %libomptarget-compilexx-generic && %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
4+
5+
// UNSUPPORTED: aarch64-unknown-linux-gnu
6+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
7+
// UNSUPPORTED: x86_64-pc-linux-gnu
8+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
9+
// UNSUPPORTED: s390x-ibm-linux-gnu
10+
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
11+
// clang-format on
12+
13+
/*
14+
Check if there is a thread for each loop iteration
15+
*/
16+
#include <omp.h>
17+
#include <stdio.h>
18+
19+
int main() {
20+
int N = 819200;
21+
int num_threads[N];
22+
23+
#pragma omp target teams distribute parallel for
24+
for (int j = 0; j < N; j++) {
25+
num_threads[j] = omp_get_num_threads() * omp_get_num_teams();
26+
}
27+
28+
if (num_threads[0] == N)
29+
// CHECK: PASS
30+
printf("PASS\n");
31+
else
32+
// DEFAULT: FAIL
33+
printf("FAIL: num_threads: %d\n != N: %d", num_threads[0], N);
34+
return 0;
35+
}

openmp/docs/design/Runtimes.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,7 @@ variables is defined below.
742742
* ``LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
743743
* ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
744744
* ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
745+
* ``LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=[TRUE/FALSE] (default TRUE)``
745746

746747
LIBOMPTARGET_DEBUG
747748
""""""""""""""""""
@@ -1162,6 +1163,12 @@ of threads possible times the number of teams (aka. blocks) the device prefers
11621163
count to increase outer (team/block) parallelism. The thread count will never
11631164
be reduced below the value passed for this environment variable though.
11641165

1166+
LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT
1167+
"""""""""""""""""""""""""""""""""""""""""""""
1168+
1169+
This environment variable can be used to control how the OpenMP runtime assigns
1170+
blocks to loops with high trip counts. By default we reuse existing blocks
1171+
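rather than spawning new blocks. Setting this variable to ``FALSE`` instead lets the runtime spawn as many blocks as the trip count calls for, up to the device's block limit.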
11651172

11661173

11671174
.. _libomptarget_plugin:

0 commit comments

Comments
 (0)