From 4a8c738990db9475da0d56774603a06d3696832f Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Tue, 21 Sep 2021 02:37:50 -0400 Subject: [PATCH 01/18] Use uv_thread_getaffinity to bound nthreads --- src/sys.c | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/src/sys.c b/src/sys.c index 5080c361494dc..f018904d7331d 100644 --- a/src/sys.c +++ b/src/sys.c @@ -587,6 +587,40 @@ typedef DWORD (WINAPI *GAPC)(WORD); #endif #endif +int num_threads_bound(void) +{ +#ifdef _OS_DARWIN_ + return INT_MAX; +#else + int masksize = uv_cpumask_size(); + uv_thread_t tid = uv_thread_self(); + char *cpumask = (char *)calloc(masksize, sizeof(char)); + int err = uv_thread_getaffinity(&tid, cpumask, masksize); + if (err) { + free(cpumask); + jl_printf(JL_STDERR, "WARNING: failed to get thread affinity (%s %d)\n", + uv_err_name(err), err); + return INT_MAX; + } + int n = 0; + for (size_t i = 0; i < masksize; i++) { + n += cpumask[i]; + } + free(cpumask); + return n; +#endif +} + +int intmin(int a, int b) +{ + if (a < b) { + return a; + } + else { + return b; + } +} + // Apple's M1 processor is a big.LITTLE style processor, with 4x "performance" // cores, and 4x "efficiency" cores. 
Because Julia expects to be able to run // things like heavy linear algebra workloads on all cores, it's best for us @@ -598,6 +632,7 @@ typedef DWORD (WINAPI *GAPC)(WORD); JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT { + int upper_bound = num_threads_bound(); #if defined(HW_AVAILCPU) && defined(HW_NCPU) size_t len = 4; int32_t count; @@ -621,22 +656,22 @@ JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT } } #endif - return count; + return intmin(upper_bound, count); #elif defined(_SC_NPROCESSORS_ONLN) long count = sysconf(_SC_NPROCESSORS_ONLN); if (count < 1) return 1; - return count; + return intmin(upper_bound, count); #elif defined(_OS_WINDOWS_) //Try to get WIN7 API method GAPC gapc; if (jl_dlsym(jl_kernel32_handle, "GetActiveProcessorCount", (void **)&gapc, 0)) { - return gapc(ALL_PROCESSOR_GROUPS); + return intmin(upper_bound, gapc(ALL_PROCESSOR_GROUPS)); } else { //fall back on GetSystemInfo SYSTEM_INFO info; GetSystemInfo(&info); - return info.dwNumberOfProcessors; + return intmin(upper_bound, info.dwNumberOfProcessors); } #else #warning "cpu core detection not defined for this platform" From 4309748532774a9cca2dfb11d0ebd3d34fdf0815 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Wed, 22 Sep 2021 04:06:45 -0400 Subject: [PATCH 02/18] Add some safepoint annotations --- src/sys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sys.c b/src/sys.c index f018904d7331d..251e564217bc3 100644 --- a/src/sys.c +++ b/src/sys.c @@ -587,7 +587,7 @@ typedef DWORD (WINAPI *GAPC)(WORD); #endif #endif -int num_threads_bound(void) +int num_threads_bound(void) JL_NOTSAFEPOINT { #ifdef _OS_DARWIN_ return INT_MAX; @@ -598,8 +598,8 @@ int num_threads_bound(void) int err = uv_thread_getaffinity(&tid, cpumask, masksize); if (err) { free(cpumask); - jl_printf(JL_STDERR, "WARNING: failed to get thread affinity (%s %d)\n", - uv_err_name(err), err); + jl_safe_printf("WARNING: failed to get thread affinity (%s %d)\n", uv_err_name(err), 
+ err); return INT_MAX; } int n = 0; @@ -611,7 +611,7 @@ int num_threads_bound(void) #endif } -int intmin(int a, int b) +int intmin(int a, int b) JL_NOTSAFEPOINT { if (a < b) { return a; From d2df381bc40ee4bf992a0756a0c653787bb8204d Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Thu, 23 Sep 2021 00:03:58 -0400 Subject: [PATCH 03/18] Use ternary expression --- src/sys.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/sys.c b/src/sys.c index 251e564217bc3..a25b5a867a2b3 100644 --- a/src/sys.c +++ b/src/sys.c @@ -587,7 +587,7 @@ typedef DWORD (WINAPI *GAPC)(WORD); #endif #endif -int num_threads_bound(void) JL_NOTSAFEPOINT +static int num_threads_bound(void) JL_NOTSAFEPOINT { #ifdef _OS_DARWIN_ return INT_MAX; @@ -611,16 +611,6 @@ int num_threads_bound(void) JL_NOTSAFEPOINT #endif } -int intmin(int a, int b) JL_NOTSAFEPOINT -{ - if (a < b) { - return a; - } - else { - return b; - } -} - // Apple's M1 processor is a big.LITTLE style processor, with 4x "performance" // cores, and 4x "efficiency" cores. Because Julia expects to be able to run // things like heavy linear algebra workloads on all cores, it's best for us @@ -656,22 +646,24 @@ JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT } } #endif - return intmin(upper_bound, count); + return upper_bound < count ? upper_bound : count; #elif defined(_SC_NPROCESSORS_ONLN) long count = sysconf(_SC_NPROCESSORS_ONLN); if (count < 1) return 1; - return intmin(upper_bound, count); + return upper_bound < count ? upper_bound : count; #elif defined(_OS_WINDOWS_) //Try to get WIN7 API method GAPC gapc; if (jl_dlsym(jl_kernel32_handle, "GetActiveProcessorCount", (void **)&gapc, 0)) { - return intmin(upper_bound, gapc(ALL_PROCESSOR_GROUPS)); + DWORD count = gapc(ALL_PROCESSOR_GROUPS); + return upper_bound < count ? 
upper_bound : count; } else { //fall back on GetSystemInfo SYSTEM_INFO info; GetSystemInfo(&info); - return intmin(upper_bound, info.dwNumberOfProcessors); + DWORD count = info.dwNumberOfProcessors; + return upper_bound < count ? upper_bound : count; } #else #warning "cpu core detection not defined for this platform" From eaf190880d4d78a52e5ab49cb87d7ee02b0022be Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Thu, 23 Sep 2021 00:18:03 -0400 Subject: [PATCH 04/18] Ignore EBADF on Windows --- src/sys.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/sys.c b/src/sys.c index a25b5a867a2b3..d0b63753fb2d2 100644 --- a/src/sys.c +++ b/src/sys.c @@ -598,6 +598,14 @@ static int num_threads_bound(void) JL_NOTSAFEPOINT int err = uv_thread_getaffinity(&tid, cpumask, masksize); if (err) { free(cpumask); +#ifdef _OS_WINDOWS_ +// On windows, it seems like this can fail due to access right, presumably +// because `uv_thread_getaffinity` calls `SetThreadAffinityMask`. +// TODO: Use `GetProcessAffinityMask` directly? + if (err == UV_EBADF) { + return INT_MAX; + } +#endif jl_safe_printf("WARNING: failed to get thread affinity (%s %d)\n", uv_err_name(err), err); return INT_MAX; From 68bc481e5c15b21df8dd7dd6dcf4b386f4f055cb Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Wed, 15 Dec 2021 17:15:46 -0500 Subject: [PATCH 05/18] Revert "Ignore EBADF on Windows" This reverts commit eaf190880d4d78a52e5ab49cb87d7ee02b0022be. --- src/sys.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/sys.c b/src/sys.c index 2e8f114085be1..c944c295eeab1 100644 --- a/src/sys.c +++ b/src/sys.c @@ -608,14 +608,6 @@ static int num_threads_bound(void) JL_NOTSAFEPOINT int err = uv_thread_getaffinity(&tid, cpumask, masksize); if (err) { free(cpumask); -#ifdef _OS_WINDOWS_ -// On windows, it seems like this can fail due to access right, presumably -// because `uv_thread_getaffinity` calls `SetThreadAffinityMask`. -// TODO: Use `GetProcessAffinityMask` directly? 
- if (err == UV_EBADF) { - return INT_MAX; - } -#endif jl_safe_printf("WARNING: failed to get thread affinity (%s %d)\n", uv_err_name(err), err); return INT_MAX; From c9b3ca5f5d2853be0d821b75e6c8783501933492 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Thu, 13 Jan 2022 01:22:21 -0500 Subject: [PATCH 06/18] Add jl_effective_threads --- src/jloptions.c | 2 +- src/julia_internal.h | 1 + src/sys.c | 60 +++++++++++++++++++++----------------------- src/threading.c | 4 +-- 4 files changed, 33 insertions(+), 34 deletions(-) diff --git a/src/jloptions.c b/src/jloptions.c index 1ff4da7c5c10b..0a60d606f26a5 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -441,7 +441,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) case 'p': // procs errno = 0; if (!strcmp(optarg,"auto")) { - jl_options.nprocs = jl_cpu_threads(); + jl_options.nprocs = jl_effective_threads(); } else { long nprocs = strtol(optarg, &endptr, 10); diff --git a/src/julia_internal.h b/src/julia_internal.h index da5195b3cd83c..be5044b4f5464 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -749,6 +749,7 @@ extern JL_DLLEXPORT ssize_t jl_tls_offset; extern JL_DLLEXPORT const int jl_tls_elf_support; void jl_init_threading(void); void jl_start_threads(void); +int jl_effective_threads(void); // Whether the GC is running extern char *jl_safepoint_pages; diff --git a/src/sys.c b/src/sys.c index 5010e90ea64dd..8e7c92dc47463 100644 --- a/src/sys.c +++ b/src/sys.c @@ -595,30 +595,6 @@ typedef DWORD (WINAPI *GAPC)(WORD); #endif #endif -static int num_threads_bound(void) JL_NOTSAFEPOINT -{ -#ifdef _OS_DARWIN_ - return INT_MAX; -#else - int masksize = uv_cpumask_size(); - uv_thread_t tid = uv_thread_self(); - char *cpumask = (char *)calloc(masksize, sizeof(char)); - int err = uv_thread_getaffinity(&tid, cpumask, masksize); - if (err) { - free(cpumask); - jl_safe_printf("WARNING: failed to get thread affinity (%s %d)\n", uv_err_name(err), - err); - return INT_MAX; - } - int n = 0; - for 
(size_t i = 0; i < masksize; i++) { - n += cpumask[i]; - } - free(cpumask); - return n; -#endif -} - // Apple's M1 processor is a big.LITTLE style processor, with 4x "performance" // cores, and 4x "efficiency" cores. Because Julia expects to be able to run // things like heavy linear algebra workloads on all cores, it's best for us @@ -630,7 +606,6 @@ static int num_threads_bound(void) JL_NOTSAFEPOINT JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT { - int upper_bound = num_threads_bound(); #if defined(HW_AVAILCPU) && defined(HW_NCPU) size_t len = 4; int32_t count; @@ -654,24 +629,22 @@ JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT } } #endif - return upper_bound < count ? upper_bound : count; + return count; #elif defined(_SC_NPROCESSORS_ONLN) long count = sysconf(_SC_NPROCESSORS_ONLN); if (count < 1) return 1; - return upper_bound < count ? upper_bound : count; + return count; #elif defined(_OS_WINDOWS_) //Try to get WIN7 API method GAPC gapc; if (jl_dlsym(jl_kernel32_handle, "GetActiveProcessorCount", (void **)&gapc, 0)) { - DWORD count = gapc(ALL_PROCESSOR_GROUPS); - return upper_bound < count ? upper_bound : count; + return gapc(ALL_PROCESSOR_GROUPS); } else { //fall back on GetSystemInfo SYSTEM_INFO info; GetSystemInfo(&info); - DWORD count = info.dwNumberOfProcessors; - return upper_bound < count ? 
upper_bound : count; + return info.dwNumberOfProcessors; } #else #warning "cpu core detection not defined for this platform" @@ -679,6 +652,31 @@ JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT #endif } +int jl_effective_threads(void) JL_NOTSAFEPOINT +{ + int cpu = jl_cpu_threads(); +#ifdef _OS_DARWIN_ + return cpu; +#else + int masksize = uv_cpumask_size(); + uv_thread_t tid = uv_thread_self(); + char *cpumask = (char *)calloc(masksize, sizeof(char)); + int err = uv_thread_getaffinity(&tid, cpumask, masksize); + if (err) { + free(cpumask); + jl_safe_printf("WARNING: failed to get thread affinity (%s %d)\n", uv_err_name(err), + err); + return cpu; + } + int n = 0; + for (size_t i = 0; i < masksize; i++) { + n += cpumask[i]; + } + free(cpumask); + return n; +#endif +} + // -- high resolution timers -- // Returns time in nanosec diff --git a/src/threading.c b/src/threading.c index f10612016ef8a..c69196f7cebb2 100644 --- a/src/threading.c +++ b/src/threading.c @@ -454,7 +454,7 @@ void jl_init_threading(void) // how many threads available, usable jl_n_threads = JULIA_NUM_THREADS; if (jl_options.nthreads < 0) { // --threads=auto - jl_n_threads = jl_cpu_threads(); + jl_n_threads = jl_effective_threads(); } else if (jl_options.nthreads > 0) { // --threads=N jl_n_threads = jl_options.nthreads; @@ -463,7 +463,7 @@ void jl_init_threading(void) if (strcmp(cp, "auto")) jl_n_threads = (uint64_t)strtol(cp, NULL, 10); // ENV[NUM_THREADS_NAME] == "N" else - jl_n_threads = jl_cpu_threads(); // ENV[NUM_THREADS_NAME] == "auto" + jl_n_threads = jl_effective_threads(); // ENV[NUM_THREADS_NAME] == "auto" } if (jl_n_threads <= 0) jl_n_threads = 1; From a1572a2b514e59d83d856ee2ecfedcca4c2924ce Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sat, 15 Jan 2022 21:41:34 -0500 Subject: [PATCH 07/18] Test affinity-based nthreads setup --- test/cmdlineargs.jl | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 
0a03e60f6dd03..50a258e909f49 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -781,3 +781,27 @@ end @test lines[4] == "bar" end end + +function get_nthreads(options = ``; cpus) + cmd = `$(Base.julia_cmd()) --startup-file=no $(options)` + cmd = `$cmd -e "print(Threads.nthreads())"` + cmd = setcpuaffinity(cmd, cpus) + cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto") + return parse(Int, read(cmd, String)) +end + +@testset "nthreads determined based on CPU affinity" begin + if !Sys.isapple() && Sys.CPU_THREADS ≥ 2 + @test get_nthreads(cpus = [1]) == 1 + @test get_nthreads(cpus = [2]) == 1 + @test get_nthreads(cpus = [1, 2]) == 2 + @test get_nthreads(`-t1`, cpus = [1]) == + get_nthreads(`-t1`, cpus = [2]) == + get_nthreads(`-t1`, cpus = [1, 2]) == + 1 + + if Sys.CPU_THREADS ≥ 3 + @test get_nthreads(cpus = [1, 3]) == get_nthreads(cpus = [2, 3]) == 2 + end + end +end From c2d3f9bfb640521ba7aec4ad694ce7408b5b6f1a Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 16 Jan 2022 23:18:48 -0500 Subject: [PATCH 08/18] Tweak testing --- test/cmdlineargs.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 50a258e909f49..6805221c0a5f0 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -784,9 +784,9 @@ end function get_nthreads(options = ``; cpus) cmd = `$(Base.julia_cmd()) --startup-file=no $(options)` - cmd = `$cmd -e "print(Threads.nthreads())"` - cmd = setcpuaffinity(cmd, cpus) + cmd = `$cmd -e "print(Threads.nthreads())"` cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto") + cmd = setcpuaffinity(cmd, cpus) return parse(Int, read(cmd, String)) end @@ -795,13 +795,13 @@ end @test get_nthreads(cpus = [1]) == 1 @test get_nthreads(cpus = [2]) == 1 @test get_nthreads(cpus = [1, 2]) == 2 - @test get_nthreads(`-t1`, cpus = [1]) == - get_nthreads(`-t1`, cpus = [2]) == - get_nthreads(`-t1`, cpus = [1, 2]) == - 1 + @test 
get_nthreads(`-t1`, cpus = [1]) == 1 + @test get_nthreads(`-t1`, cpus = [2]) == 1 + @test get_nthreads(`-t1`, cpus = [1, 2]) == 1 if Sys.CPU_THREADS ≥ 3 - @test get_nthreads(cpus = [1, 3]) == get_nthreads(cpus = [2, 3]) == 2 + @test get_nthreads(cpus = [1, 3]) == 2 + @test get_nthreads(cpus = [2, 3]) == 2 end end end From e64ff3aa7459367862a9575a4cb158a4190518c6 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 16 Jan 2022 23:35:26 -0500 Subject: [PATCH 09/18] Compare output without affinity setting --- test/cmdlineargs.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 6805221c0a5f0..b23ed0f310336 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -782,16 +782,19 @@ end end end -function get_nthreads(options = ``; cpus) +function get_nthreads(options = ``; cpus = nothing) cmd = `$(Base.julia_cmd()) --startup-file=no $(options)` cmd = `$cmd -e "print(Threads.nthreads())"` cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto") - cmd = setcpuaffinity(cmd, cpus) + if cpus !== nothing + cmd = setcpuaffinity(cmd, cpus) + end return parse(Int, read(cmd, String)) end @testset "nthreads determined based on CPU affinity" begin if !Sys.isapple() && Sys.CPU_THREADS ≥ 2 + @test get_nthreads() ≥ 2 @test get_nthreads(cpus = [1]) == 1 @test get_nthreads(cpus = [2]) == 1 @test get_nthreads(cpus = [1, 2]) == 2 From 81b1d664802a15aa94b3727d088f2a2e04e1f71f Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 16 Jan 2022 23:38:15 -0500 Subject: [PATCH 10/18] DEBUG: print input/output of get_nthreads --- test/cmdlineargs.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index b23ed0f310336..324d7bed17ee7 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -785,11 +785,14 @@ end function get_nthreads(options = ``; cpus = nothing) cmd = `$(Base.julia_cmd()) --startup-file=no $(options)` cmd = 
`$cmd -e "print(Threads.nthreads())"` + cmd = `$cmd -e "println(); using InteractiveUtils; versioninfo()"` cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto") if cpus !== nothing cmd = setcpuaffinity(cmd, cpus) end - return parse(Int, read(cmd, String)) + out = read(cmd, String) + @info "`get_nthreads`" options cpus Text(out) + return parse(Int, split(out)[1]) end @testset "nthreads determined based on CPU affinity" begin From 4f65152c6271f1a53b1cfeb7bd9886d97ffcf840 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 16 Jan 2022 23:58:10 -0500 Subject: [PATCH 11/18] Document how `--threads=auto` now works --- src/jloptions.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/jloptions.c b/src/jloptions.c index 0a60d606f26a5..529b0c4dcad02 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -108,8 +108,13 @@ static const char opts[] = " -L, --load Load immediately on all processors\n\n" // parallel options - " -t, --threads {N|auto} Enable N threads; \"auto\" currently sets N to the number of local\n" - " CPU threads but this might change in the future\n" + " -t, --threads {N|auto} Enable N threads; \"auto\" tries to infer a useful default number\n" + " of threads to use but the exact behavior might change in the future.\n" + " Currently, \"auto\" uses the number of CPUs assigned to this julia\n" + " process based on the OS-specific affinity assignment interface, if\n" + " supported (Linux and Windows). 
If this is not supported (macOS) or\n" + " process affinity is not configured, it uses the number of CPU\n" + " threads.\n" " -p, --procs {N|auto} Integer value N launches N additional local worker processes\n" " \"auto\" launches as many workers as the number of local CPU threads (logical cores)\n" " --machine-file Run processes on hosts listed in \n\n" From c3039fcf98218f61a6d68565828d65909077e804 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Mon, 17 Jan 2022 01:13:12 -0500 Subject: [PATCH 12/18] Revert "DEBUG: print input/output of get_nthreads" This reverts commit 81b1d664802a15aa94b3727d088f2a2e04e1f71f. --- test/cmdlineargs.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 324d7bed17ee7..b23ed0f310336 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -785,14 +785,11 @@ end function get_nthreads(options = ``; cpus = nothing) cmd = `$(Base.julia_cmd()) --startup-file=no $(options)` cmd = `$cmd -e "print(Threads.nthreads())"` - cmd = `$cmd -e "println(); using InteractiveUtils; versioninfo()"` cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto") if cpus !== nothing cmd = setcpuaffinity(cmd, cpus) end - out = read(cmd, String) - @info "`get_nthreads`" options cpus Text(out) - return parse(Int, split(out)[1]) + return parse(Int, read(cmd, String)) end @testset "nthreads determined based on CPU affinity" begin From f7252ec17b57f7dd9a6ab183cd9ee6ef4cda16a7 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Mon, 17 Jan 2022 01:14:09 -0500 Subject: [PATCH 13/18] Don't run get_nthreads test under rr --- test/cmdlineargs.jl | 27 --------------------------- test/threads.jl | 27 +++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index b23ed0f310336..0a03e60f6dd03 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -781,30 +781,3 @@ end @test lines[4] == "bar" end end 
- -function get_nthreads(options = ``; cpus = nothing) - cmd = `$(Base.julia_cmd()) --startup-file=no $(options)` - cmd = `$cmd -e "print(Threads.nthreads())"` - cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto") - if cpus !== nothing - cmd = setcpuaffinity(cmd, cpus) - end - return parse(Int, read(cmd, String)) -end - -@testset "nthreads determined based on CPU affinity" begin - if !Sys.isapple() && Sys.CPU_THREADS ≥ 2 - @test get_nthreads() ≥ 2 - @test get_nthreads(cpus = [1]) == 1 - @test get_nthreads(cpus = [2]) == 1 - @test get_nthreads(cpus = [1, 2]) == 2 - @test get_nthreads(`-t1`, cpus = [1]) == 1 - @test get_nthreads(`-t1`, cpus = [2]) == 1 - @test get_nthreads(`-t1`, cpus = [1, 2]) == 1 - - if Sys.CPU_THREADS ≥ 3 - @test get_nthreads(cpus = [1, 3]) == 2 - @test get_nthreads(cpus = [2, 3]) == 2 - end - end -end diff --git a/test/threads.jl b/test/threads.jl index 99f508d42b6c0..27b87d8d8f441 100644 --- a/test/threads.jl +++ b/test/threads.jl @@ -54,6 +54,33 @@ if Sys.islinux() || Sys.iswindows() end end +function get_nthreads(options = ``; cpus = nothing) + cmd = `$(Base.julia_cmd()) --startup-file=no $(options)` + cmd = `$cmd -e "print(Threads.nthreads())"` + cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto") + if cpus !== nothing + cmd = setcpuaffinity(cmd, cpus) + end + return parse(Int, read(cmd, String)) +end + +@testset "nthreads determined based on CPU affinity" begin + if !Sys.isapple() && !running_under_rr() && Sys.CPU_THREADS ≥ 2 + @test get_nthreads() ≥ 2 + @test get_nthreads(cpus = [1]) == 1 + @test get_nthreads(cpus = [2]) == 1 + @test get_nthreads(cpus = [1, 2]) == 2 + @test get_nthreads(`-t1`, cpus = [1]) == 1 + @test get_nthreads(`-t1`, cpus = [2]) == 1 + @test get_nthreads(`-t1`, cpus = [1, 2]) == 1 + + if Sys.CPU_THREADS ≥ 3 + @test get_nthreads(cpus = [1, 3]) == 2 + @test get_nthreads(cpus = [2, 3]) == 2 + end + end +end + # issue #34769 function idle_callback(handle) idle = @Base.handle_as 
handle UvTestIdle From 140654ce3432d68b645e7a89b2b51dda108a7a0d Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Wed, 26 Jan 2022 12:40:54 -0800 Subject: [PATCH 14/18] Better error handling Co-authored-by: Jameson Nash --- src/sys.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/sys.c b/src/sys.c index 8e7c92dc47463..7c06a8d708a72 100644 --- a/src/sys.c +++ b/src/sys.c @@ -655,10 +655,9 @@ JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT int jl_effective_threads(void) JL_NOTSAFEPOINT { int cpu = jl_cpu_threads(); -#ifdef _OS_DARWIN_ - return cpu; -#else int masksize = uv_cpumask_size(); + if (masksize < 0) + return cpu; uv_thread_t tid = uv_thread_self(); char *cpumask = (char *)calloc(masksize, sizeof(char)); int err = uv_thread_getaffinity(&tid, cpumask, masksize); @@ -673,8 +672,7 @@ int jl_effective_threads(void) JL_NOTSAFEPOINT n += cpumask[i]; } free(cpumask); - return n; -#endif + return n < cpu ? n : cpu; } From 8c33dc1e3e27f42f3023160aa1cd970d1d498d9c Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Thu, 27 Jan 2022 17:31:23 -0800 Subject: [PATCH 15/18] Check if running under rr Co-authored-by: Jameson Nash --- src/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sys.c b/src/sys.c index 7c06a8d708a72..a0abad86b3444 100644 --- a/src/sys.c +++ b/src/sys.c @@ -656,7 +656,7 @@ int jl_effective_threads(void) JL_NOTSAFEPOINT { int cpu = jl_cpu_threads(); int masksize = uv_cpumask_size(); - if (masksize < 0) + if (masksize < 0 || jl_running_under_rr(0)) return cpu; uv_thread_t tid = uv_thread_self(); char *cpumask = (char *)calloc(masksize, sizeof(char)); From 23758732a6a88a590a0ef266285ecb1640868deb Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Thu, 27 Jan 2022 23:31:08 -0500 Subject: [PATCH 16/18] Mention -t=auto behavior elsewhere --- doc/man/julia.1 | 8 +++++++- doc/src/manual/command-line-options.md | 4 +++- doc/src/manual/multi-threading.md | 4 ++-- 3 files changed, 12 
insertions(+), 4 deletions(-) diff --git a/doc/man/julia.1 b/doc/man/julia.1 index 0b008619014e1..552c45eb131a2 100644 --- a/doc/man/julia.1 +++ b/doc/man/julia.1 @@ -103,7 +103,13 @@ Load immediately on all processors .TP -t, --threads -Enable n threads +Enable n threads; "auto" tries to infer a useful default number +of threads to use but the exact behavior might change in the future. +Currently, "auto" uses the number of CPUs assigned to this julia +process based on the OS-specific affinity assignment interface, if +supported (Linux and Windows). If this is not supported (macOS) or +process affinity is not configured, it uses the number of CPU +threads. .TP -p, --procs diff --git a/doc/src/manual/command-line-options.md b/doc/src/manual/command-line-options.md index f3ad39a6aed16..387c0d9d896bd 100644 --- a/doc/src/manual/command-line-options.md +++ b/doc/src/manual/command-line-options.md @@ -89,7 +89,7 @@ The following is a complete list of command-line switches available when launchi |`-e`, `--eval ` |Evaluate ``| |`-E`, `--print ` |Evaluate `` and display the result| |`-L`, `--load ` |Load `` immediately on all processors| -|`-t`, `--threads {N\|auto`} |Enable N threads; `auto` currently sets N to the number of local CPU threads but this might change in the future| +|`-t`, `--threads {N\|auto`} |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). 
If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| |`-p`, `--procs {N\|auto`} |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| |`--machine-file <file>` |Run processes on hosts listed in `<file>`| |`-i` |Interactive mode; REPL runs and `isinteractive()` is true| @@ -111,6 +111,8 @@ The following is a complete list of command-line switches available when launchi |`--track-allocation={none\|user\|all}` |Count bytes allocated by each source line| |`--track-allocation` |equivalent to `--track-allocation=user`| + + !!! compat "Julia 1.1" In Julia 1.0, the default `--project=@.` option did not search up from the root directory of a Git repository for the `Project.toml` file. From Julia 1.1 forward, it diff --git a/doc/src/manual/multi-threading.md b/doc/src/manual/multi-threading.md index 1b1f1949d3e01..4fcb4cb1b6eb6 100644 --- a/doc/src/manual/multi-threading.md +++ b/doc/src/manual/multi-threading.md @@ -19,8 +19,8 @@ The number of execution threads is controlled either by using the specified, then `-t`/`--threads` takes precedence. The number of threads can either be specified as an integer (`--threads=4`) or as `auto` -(`--threads=auto`), where `auto` sets the number of threads to the number of local CPU -threads. +(`--threads=auto`), where `auto` tries to infer a useful default number of threads to use +(see [Command-line Options](@ref command-line-options) for more details). !!! compat "Julia 1.5" The `-t`/`--threads` command line argument requires at least Julia 1.5.
From 85cba54c7b2731e32c17d0dd37d7d8f13b8ba9d9 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Thu, 27 Jan 2022 23:44:35 -0500 Subject: [PATCH 17/18] Mention it in NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 3f23aaf573b72..d4b39582c451e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -61,6 +61,8 @@ Command-line option changes code when building a system image. The resulting image will only work if `--compile=all` is used, or if all needed code is precompiled ([#42925]). * When the program file is `-` the code to be executed is read from standard in ([#43191]). +* On Linux and Windows, `--threads=auto` now tries to infer the usable number of CPUs from + the process affinity, which is typically set in HPC and cloud environments ([#42340]). Multi-threading changes ----------------------- From b263a8df42badca391ae974a70f941856af02466 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sat, 29 Jan 2022 21:48:46 -0500 Subject: [PATCH 18/18] Check affinity support in one place --- test/threads.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/threads.jl b/test/threads.jl index 5be1731ae9c5a..dde50590ae08b 100644 --- a/test/threads.jl +++ b/test/threads.jl @@ -93,8 +93,10 @@ else end # Note also that libuv does not support affinity in macOS and it is known to # hang in FreeBSD.
So, it's tested only in Linux and Windows: -if Sys.islinux() || Sys.iswindows() - if Sys.CPU_THREADS > 1 && !running_under_rr() +const AFFINITY_SUPPORTED = (Sys.islinux() || Sys.iswindows()) && !running_under_rr() + +if AFFINITY_SUPPORTED + if Sys.CPU_THREADS > 1 @test run_with_affinity([2]) == "2" @test run_with_affinity([1, 2]) == "1,2" end @@ -111,7 +113,7 @@ function get_nthreads(options = ``; cpus = nothing) end @testset "nthreads determined based on CPU affinity" begin - if !Sys.isapple() && !running_under_rr() && Sys.CPU_THREADS ≥ 2 + if AFFINITY_SUPPORTED && Sys.CPU_THREADS ≥ 2 @test get_nthreads() ≥ 2 @test get_nthreads(cpus = [1]) == 1 @test get_nthreads(cpus = [2]) == 1