From c44a4ba42adc8f8fd7ab959df4d0ce15259d5a60 Mon Sep 17 00:00:00 2001 From: atalman Date: Wed, 15 Feb 2023 06:09:02 -0800 Subject: [PATCH 1/6] Add test for cuda runtime errors --- test/smoke_test/smoke_test.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index a20a94b76..a2957f300 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -55,6 +55,19 @@ def check_nightly_binaries_date(package: str) -> None: f"Expected {module['name']} to be less then {NIGHTLY_ALLOWED_DELTA} days. But its {date_m_delta}" ) +def cuda_runtime_error(): + try: + torch._assert_async(torch.tensor(0, device='cuda')) + torch._assert_async(torch.tensor(0 + 0j, device='cuda')) + raise RuntimeError( f"Expected CUDA RuntimeError but have not received anything") + except RuntimeError as e: + if re.search("CUDA", f'{e}'): + print(f"Caught CUDA exception with success: {e}") + else: + raise(e) + except Exception as e: + raise(e) + def smoke_test_cuda(package: str) -> None: if not torch.cuda.is_available() and is_cuda_system: raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.") @@ -67,6 +80,7 @@ def smoke_test_cuda(package: str) -> None: # todo add cudnn version validation print(f"torch cudnn: {torch.backends.cudnn.version()}") print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") + cuda_runtime_error() if(package == 'all' and is_cuda_system): for module in MODULES: @@ -100,7 +114,6 @@ def smoke_test_conv2d() -> None: with torch.cuda.amp.autocast(): out = conv(x) - def smoke_test_modules(): for module in MODULES: if module["repo"]: From 58a82802390d381a22c356f957a5f7a2a6850fee Mon Sep 17 00:00:00 2001 From: atalman Date: Wed, 15 Feb 2023 06:28:20 -0800 Subject: [PATCH 2/6] Add cuda exception smoke test --- test/smoke_test/smoke_test.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index a2957f300..a34110e80 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -56,17 +56,18 @@ def check_nightly_binaries_date(package: str) -> None: ) def cuda_runtime_error(): + cuda_exception_missed=True try: - torch._assert_async(torch.tensor(0, device='cuda')) - torch._assert_async(torch.tensor(0 + 0j, device='cuda')) - raise RuntimeError( f"Expected CUDA RuntimeError but have not received anything") + torch._assert_async(torch.tensor(0, device="cuda")) + torch._assert_async(torch.tensor(0 + 0j, device="cuda")) except RuntimeError as e: - if re.search("CUDA", f'{e}'): + if re.search("CUDA", f"{e}"): print(f"Caught CUDA exception with success: {e}") + cuda_exception_missed = False else: raise(e) - except Exception as e: - raise(e) + if(cuda_exception_missed): + raise RuntimeError( f"Expected CUDA RuntimeError but have not received!") def smoke_test_cuda(package: str) -> None: if not torch.cuda.is_available() and is_cuda_system: From 6bedc82007f376fdd1c7c1810c6d879c6b68bf13 Mon Sep 17 00:00:00 2001 From: atalman Date: Wed, 15 Feb 2023 07:31:37 -0800 Subject: [PATCH 3/6] Move cuda runtime error to end --- test/smoke_test/smoke_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index a34110e80..3547df40c 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -72,7 +72,7 @@ def cuda_runtime_error(): def smoke_test_cuda(package: str) -> None: if not torch.cuda.is_available() and is_cuda_system: raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.") - if torch.cuda.is_available(): + if torch.version.cuda != gpu_arch_ver: raise RuntimeError( f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}" @@ -81,7 +81,6 @@ def smoke_test_cuda(package: str) -> None: # todo add cudnn version validation print(f"torch cudnn: {torch.backends.cudnn.version()}") print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") - cuda_runtime_error() if(package == 'all' and is_cuda_system): for module in MODULES: @@ -152,6 +151,10 @@ def main() -> None: if installation_str.find("nightly") != -1: check_nightly_binaries_date(options.package) + # This check has to be run last, since its messing up CUDA runtime + if torch.cuda.is_available(): + cuda_runtime_error() + if __name__ == "__main__": main() From 956360029fff914293b24b0e72b72fbe43a260dd Mon Sep 17 00:00:00 2001 From: atalman Date: Wed, 15 Feb 2023 07:34:37 -0800 Subject: [PATCH 4/6] Move cuda runtime error to end --- test/smoke_test/smoke_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 3547df40c..88f7c9230 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -72,7 +72,7 @@ def cuda_runtime_error(): def smoke_test_cuda(package: str) -> None: if not torch.cuda.is_available() and is_cuda_system: raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.") - + if torch.cuda.is_available(): if torch.version.cuda != gpu_arch_ver: raise RuntimeError( f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}" From ad832b8c4944cc4a4930cac9108b7e8e7a46b195 Mon Sep 17 00:00:00 2001 From: atalman Date: Mon, 6 Mar 2023 08:45:54 -0800 Subject: [PATCH 5/6] Address comments --- test/smoke_test/smoke_test.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 88f7c9230..3338dc3c6 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -55,7 +55,7 @@ def check_nightly_binaries_date(package: str) -> None: f"Expected {module['name']} to be less then {NIGHTLY_ALLOWED_DELTA} days. But its {date_m_delta}" ) -def cuda_runtime_error(): +def test_cuda_runtime_errors_captured(): cuda_exception_missed=True try: torch._assert_async(torch.tensor(0, device="cuda")) @@ -65,22 +65,13 @@ def cuda_runtime_error(): print(f"Caught CUDA exception with success: {e}") cuda_exception_missed = False else: - raise(e) + raise e if(cuda_exception_missed): raise RuntimeError( f"Expected CUDA RuntimeError but have not received!") def smoke_test_cuda(package: str) -> None: if not torch.cuda.is_available() and is_cuda_system: raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.") - if torch.cuda.is_available(): - if torch.version.cuda != gpu_arch_ver: - raise RuntimeError( - f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}" - ) - print(f"torch cuda: {torch.version.cuda}") - # todo add cudnn version validation - print(f"torch cudnn: {torch.backends.cudnn.version()}") - print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") if(package == 'all' and is_cuda_system): for module in MODULES: @@ -94,6 +85,19 @@ def smoke_test_cuda(package: str) -> None: version = imported_module._extension._check_cuda_version() print(f"{module['name']} CUDA: {version}") + if torch.cuda.is_available(): + if torch.version.cuda != gpu_arch_ver: + raise RuntimeError( + f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}" + ) + print(f"torch cuda: {torch.version.cuda}") + # todo add cudnn version validation + print(f"torch cudnn: {torch.backends.cudnn.version()}") + print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") + + # This check has to be run last, since its messing up CUDA runtime + test_cuda_runtime_errors_captured() + def smoke_test_conv2d() -> None: import torch.nn as nn @@ -114,6 +118,7 @@ def smoke_test_conv2d() -> None: with torch.cuda.amp.autocast(): out = conv(x) + def smoke_test_modules(): for module in MODULES: if module["repo"]: @@ -141,7 +146,6 @@ def main() -> None: ) options = parser.parse_args() print(f"torch: {torch.__version__}") - smoke_test_cuda(options.package) smoke_test_conv2d() if options.package == "all": @@ -151,9 +155,7 @@ def main() -> None: if installation_str.find("nightly") != -1: check_nightly_binaries_date(options.package) - # This check has to be run last, since its messing up CUDA runtime - if torch.cuda.is_available(): - cuda_runtime_error() + smoke_test_cuda(options.package) if __name__ == "__main__": From 2e2e6ba16d09240d85da8766cefd93417a5c702d Mon Sep 17 00:00:00 2001 From: atalman Date: Mon, 6 Mar 2023 09:23:56 -0800 Subject: [PATCH 6/6] Address comments --- test/smoke_test/smoke_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 3338dc3c6..a271a5a3e 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -55,7 +55,7 @@ def check_nightly_binaries_date(package: str) -> None: f"Expected {module['name']} to be less then {NIGHTLY_ALLOWED_DELTA} days. But its {date_m_delta}" ) -def test_cuda_runtime_errors_captured(): +def test_cuda_runtime_errors_captured() -> None: cuda_exception_missed=True try: torch._assert_async(torch.tensor(0, device="cuda"))