From a4db20b970d334f43fac86d1c388d07ab90ba110 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 5 Nov 2025 02:14:29 +0000
Subject: [PATCH 01/20] add ltoir test support

---
 cuda_core/tests/test_program.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index adc778973c..5c966296fe 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -387,3 +387,33 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
     assert ".visible .entry simple(" in ptx_text
 
     program.close()
+
+@nvvm_available
+@pytest.mark.parametrize(
+    "options",
+    [
+       ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
+       ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
+       ProgramOptions(
+            name="ltoir_test3",
+            arch="sm_90",
+            ftz=True,
+            prec_sqrt=False,
+            prec_div=False,
+            fma=True,
+            device_code_optimize=True,
+            link_time_optimization=True,
+       ),
+    ],
+)
+def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
+    """Test NVVM programs for LTOIR with different options"""
+    program = Program(nvvm_ir, "nvvm", options)
+    assert program.backend == "NVVM"
+
+    ltoir_code = program.compile("ltoir")
+    assert isinstance(ltoir_code, ObjectCode)
+    assert ltoir_code.name == options.name
+    code_content = ltoir_code.code
+    assert len(ltoir_code.code) > 0
+    program.close()

From fb6cfb3902b8a22aba92624ea417616033018852 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 04:10:17 +0000
Subject: [PATCH 02/20] add options for multi-modules

---
 cuda_core/cuda/core/experimental/_program.py | 30 ++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 1db453fed1..2e0115be48 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -298,6 +298,7 @@ class ProgramOptions:
     split_compile: int | None = None
     fdevice_syntax_only: bool | None = None
     minimal: bool | None = None
+    extra_sources: Union[str, bytes, list[Union[str, bytes]], tuple[Union[str, bytes]]] | None = None
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -479,6 +480,9 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
+            
+            if options.extra_sources is not None:
+                raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
             self._mnff.backend = "NVRTC"
@@ -487,6 +491,9 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
 
         elif code_type == "ptx":
             assert_type(code, str)
+            if options.extra_sources is not None:
+                raise ValueError("extra_sources is not supported by the PTX backend.")
+            
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
@@ -502,6 +509,29 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             self._mnff.handle = nvvm.create_program()
             self._mnff.backend = "NVVM"
             nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
+            if options.extra_sources is not None:
+                if isinstance(options.extra_sources, (str, bytes)):
+                    extra_sources = [options.extra_sources]
+                elif isinstance(options.extra_sources, (list, tuple)):
+                    extra_sources = options.extra_sources
+                else:
+                    raise TypeError("extra_sources must be str, bytes, list, or tuple")
+                
+                if len(extra_sources) == 0:
+                    raise ValueError("extra_sources cannot be empty if provided")
+                
+                for i, extra_source in enumerate(extra_sources):
+                    if isinstance(extra_source, str):
+                        extra_source = extra_source.encode("utf-8")
+                    elif not isinstance(extra_source, (bytes, bytearray)):
+                        raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray")
+                    
+                    if len(extra_source) == 0:
+                        raise ValueError(f"Extra source {i} cannot be empty")
+                    
+                    extra_name = f"{options.name}_extra_{i}"
+                    nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name)
+            
             self._backend = "NVVM"
             self._linker = None
 

From 7aaed4e4f36b890b73098430ae39384217e3d4d1 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 11:05:56 +0000
Subject: [PATCH 03/20] add tests

---
 cuda_core/tests/test_program.py | 119 ++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 5c966296fe..70b131556d 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -417,3 +417,122 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
     code_content = ltoir_code.code
     assert len(ltoir_code.code) > 0
     program.close()
+
+
+@nvvm_available
+def test_nvvm_program_with_single_extra_source(nvvm_ir):
+    """Test NVVM program with a single extra source"""
+    from cuda.core.experimental._program import _get_nvvm_module
+    nvvm = _get_nvvm_module()
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+    #helper nvvm ir for multiple module loading
+    helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_add(i32 %x) {{
+entry:
+  %result = add i32 %x, 1
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
+    program = Program(nvvm_ir, "nvvm", options)
+    
+    assert program.backend == "NVVM"
+    
+    ptx_code = program.compile("ptx")
+    assert isinstance(ptx_code, ObjectCode)
+    assert ptx_code.name == "multi_module_test"
+    
+    program.close()
+
+@nvvm_available
+def test_nvvm_program_with_multiple_extra_sources():
+    """Test NVVM program with multiple extra sources"""
+    from cuda.core.experimental._program import _get_nvvm_module
+    nvvm = _get_nvvm_module()
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+    
+    main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+declare i32 @helper_add(i32) nounwind readnone
+declare i32 @helper_mul(i32) nounwind readnone
+
+define void @main_kernel(i32* %data) {{
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %ptr = getelementptr inbounds i32, i32* %data, i32 %tid
+  %val = load i32, i32* %ptr, align 4
+  
+  %val1 = call i32 @helper_add(i32 %val)
+  %val2 = call i32 @helper_mul(i32 %val1)
+  
+  store i32 %val2, i32* %ptr, align 4
+  ret void
+}}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
+
+!nvvm.annotations = !{{!0}}
+!0 = !{{void (i32*)* @main_kernel, !"kernel", i32 1}}
+
+!nvvmir.version = !{{!1}}
+!1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_add(i32 %x) nounwind readnone {{
+entry:
+  %result = add i32 %x, 1
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @helper_mul(i32 %x) nounwind readnone {{
+entry:
+  %result = mul i32 %x, 2
+  ret i32 %result
+}}
+
+!nvvmir.version = !{{!0}}
+!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
+"""
+
+    options = ProgramOptions(
+        name="nvvm_multi_helper_test", 
+        extra_sources=[helper1_ir, helper2_ir]
+    )
+    program = Program(main_nvvm_ir, "nvvm", options)
+    
+    assert program.backend == "NVVM"
+    ptx_code = program.compile("ptx")
+    assert isinstance(ptx_code, ObjectCode)
+    assert ptx_code.name == "nvvm_multi_helper_test"
+    
+    ltoir_code = program.compile("ltoir")
+    assert isinstance(ltoir_code, ObjectCode)
+    assert ltoir_code.name == "nvvm_multi_helper_test"
+    
+    program.close()
+       
+def test_cpp_program_with_extra_sources():
+    #negative test with NVRTC with multiple sources
+    code = 'extern "C" __global__ void my_kernel(){}'
+    helper = 'extern "C" __global__ void helper(){}'
+    options = ProgramOptions(extra_sources = helper)
+    with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
+        Program(code, "c++", options)
+    
\ No newline at end of file

From 64c7f7d3d7acc4785bd475530786777e5babe2ab Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 14:48:43 +0000
Subject: [PATCH 04/20] add bitcode test

---
 cuda_core/tests/test_program.py | 45 +++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 70b131556d..24ae767220 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -527,6 +527,51 @@ def test_nvvm_program_with_multiple_extra_sources():
     assert ltoir_code.name == "nvvm_multi_helper_test"
     
     program.close()
+
+@nvvm_available
+def test_bitcode_format():
+    import os
+    from pathlib import Path
+    bitcode_path = os.environ.get("BITCODE_NVVM_PATH")
+    if not bitcode_path:
+        pytest.skip(f"BITCODE_NVVM_PATH environment variable is not set."
+                    "Disabling the test.")
+    bitcode_file = Path(bitcode_path)
+    if not bitcode_file.exists():
+        pytest.skip(f"Bitcode file not found: {bitcode_path}")
+    
+    if not bitcode_file.suffix == '.bc':
+        pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
+    
+    try:
+        with open(bitcode_file, 'rb') as f:
+            bitcode_data = f.read()
+        
+        if len(bitcode_data) < 4:
+            pytest.skip("Bitcode file appears to be empty or invalid")
+        
+        options = ProgramOptions(
+            name=f"existing_bitcode_{bitcode_file.stem}",
+            arch="sm_90"
+        )
+        program = Program(bitcode_data, "nvvm", options)
+        
+        assert program.backend == "NVVM"
+        ptx_result = program.compile("ptx")
+        assert isinstance(ptx_result, ObjectCode)
+        assert ptx_result.name.startswith("existing_bitcode_")
+        assert len(ptx_result.code) > 0
+        try:
+            ltoir_result = program.compile("ltoir")
+            assert isinstance(ltoir_result, ObjectCode)
+            assert len(ltoir_result.code) > 0
+            print(f"LTOIR size: {len(ltoir_result.code)} bytes")
+        except Exception as e:
+            print(f"LTOIR compilation failed : {e}")
+        program.close()
+    except Exception as e:
+        pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}")
+
        
 def test_cpp_program_with_extra_sources():
     #negative test with NVRTC with multiple sources

From 42ba301078bc17c796cddbe99499c957633eb5d8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 25 Nov 2025 14:51:09 +0000
Subject: [PATCH 05/20] [pre-commit.ci] auto code formatting

---
 cuda_core/cuda/core/experimental/_program.py | 14 ++--
 cuda_core/tests/test_program.py              | 68 ++++++++++----------
 2 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 2e0115be48..eadd37ca95 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -480,7 +480,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-            
+
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
@@ -493,7 +493,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             assert_type(code, str)
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the PTX backend.")
-            
+
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
@@ -516,22 +516,22 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
                     extra_sources = options.extra_sources
                 else:
                     raise TypeError("extra_sources must be str, bytes, list, or tuple")
-                
+
                 if len(extra_sources) == 0:
                     raise ValueError("extra_sources cannot be empty if provided")
-                
+
                 for i, extra_source in enumerate(extra_sources):
                     if isinstance(extra_source, str):
                         extra_source = extra_source.encode("utf-8")
                     elif not isinstance(extra_source, (bytes, bytearray)):
                         raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray")
-                    
+
                     if len(extra_source) == 0:
                         raise ValueError(f"Extra source {i} cannot be empty")
-                    
+
                     extra_name = f"{options.name}_extra_{i}"
                     nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name)
-            
+
             self._backend = "NVVM"
             self._linker = None
 
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 24ae767220..9aca1878d4 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -388,13 +388,14 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
 
     program.close()
 
+
 @nvvm_available
 @pytest.mark.parametrize(
     "options",
     [
-       ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
-       ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
-       ProgramOptions(
+        ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
+        ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
+        ProgramOptions(
             name="ltoir_test3",
             arch="sm_90",
             ftz=True,
@@ -403,7 +404,7 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
             fma=True,
             device_code_optimize=True,
             link_time_optimization=True,
-       ),
+        ),
     ],
 )
 def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
@@ -423,9 +424,10 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
 def test_nvvm_program_with_single_extra_source(nvvm_ir):
     """Test NVVM program with a single extra source"""
     from cuda.core.experimental._program import _get_nvvm_module
+
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
-    #helper nvvm ir for multiple module loading
+    # helper nvvm ir for multiple module loading
     helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
@@ -441,22 +443,24 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 
     options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
     program = Program(nvvm_ir, "nvvm", options)
-    
+
     assert program.backend == "NVVM"
-    
+
     ptx_code = program.compile("ptx")
     assert isinstance(ptx_code, ObjectCode)
     assert ptx_code.name == "multi_module_test"
-    
+
     program.close()
 
+
 @nvvm_available
 def test_nvvm_program_with_multiple_extra_sources():
     """Test NVVM program with multiple extra sources"""
     from cuda.core.experimental._program import _get_nvvm_module
+
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
-    
+
     main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
@@ -468,10 +472,10 @@ def test_nvvm_program_with_multiple_extra_sources():
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %ptr = getelementptr inbounds i32, i32* %data, i32 %tid
   %val = load i32, i32* %ptr, align 4
-  
+
   %val1 = call i32 @helper_add(i32 %val)
   %val2 = call i32 @helper_mul(i32 %val1)
-  
+
   store i32 %val2, i32* %ptr, align 4
   ret void
 }}
@@ -511,51 +515,46 @@ def test_nvvm_program_with_multiple_extra_sources():
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """
 
-    options = ProgramOptions(
-        name="nvvm_multi_helper_test", 
-        extra_sources=[helper1_ir, helper2_ir]
-    )
+    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
     program = Program(main_nvvm_ir, "nvvm", options)
-    
+
     assert program.backend == "NVVM"
     ptx_code = program.compile("ptx")
     assert isinstance(ptx_code, ObjectCode)
     assert ptx_code.name == "nvvm_multi_helper_test"
-    
+
     ltoir_code = program.compile("ltoir")
     assert isinstance(ltoir_code, ObjectCode)
     assert ltoir_code.name == "nvvm_multi_helper_test"
-    
+
     program.close()
 
+
 @nvvm_available
 def test_bitcode_format():
     import os
     from pathlib import Path
+
     bitcode_path = os.environ.get("BITCODE_NVVM_PATH")
     if not bitcode_path:
-        pytest.skip(f"BITCODE_NVVM_PATH environment variable is not set."
-                    "Disabling the test.")
+        pytest.skip("BITCODE_NVVM_PATH environment variable is not set.Disabling the test.")
     bitcode_file = Path(bitcode_path)
     if not bitcode_file.exists():
         pytest.skip(f"Bitcode file not found: {bitcode_path}")
-    
-    if not bitcode_file.suffix == '.bc':
+
+    if not bitcode_file.suffix == ".bc":
         pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
-    
+
     try:
-        with open(bitcode_file, 'rb') as f:
+        with open(bitcode_file, "rb") as f:
             bitcode_data = f.read()
-        
+
         if len(bitcode_data) < 4:
             pytest.skip("Bitcode file appears to be empty or invalid")
-        
-        options = ProgramOptions(
-            name=f"existing_bitcode_{bitcode_file.stem}",
-            arch="sm_90"
-        )
+
+        options = ProgramOptions(name=f"existing_bitcode_{bitcode_file.stem}", arch="sm_90")
         program = Program(bitcode_data, "nvvm", options)
-        
+
         assert program.backend == "NVVM"
         ptx_result = program.compile("ptx")
         assert isinstance(ptx_result, ObjectCode)
@@ -572,12 +571,11 @@ def test_bitcode_format():
     except Exception as e:
         pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}")
 
-       
+
 def test_cpp_program_with_extra_sources():
-    #negative test with NVRTC with multiple sources
+    # negative test with NVRTC with multiple sources
     code = 'extern "C" __global__ void my_kernel(){}'
     helper = 'extern "C" __global__ void helper(){}'
-    options = ProgramOptions(extra_sources = helper)
+    options = ProgramOptions(extra_sources=helper)
     with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
         Program(code, "c++", options)
-    
\ No newline at end of file

From 7ca68992514e63cffbd51642a59a9f277e79540f Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Tue, 25 Nov 2025 18:14:22 +0000
Subject: [PATCH 06/20] fix format

---
 cuda_core/tests/test_program.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 9aca1878d4..7d3e1c8709 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -416,7 +416,7 @@ def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
     assert isinstance(ltoir_code, ObjectCode)
     assert ltoir_code.name == options.name
     code_content = ltoir_code.code
-    assert len(ltoir_code.code) > 0
+    assert len(code_content) > 0
     program.close()
 
 
@@ -439,7 +439,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
     program = Program(nvvm_ir, "nvvm", options)
@@ -487,7 +487,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!1}}
 !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -500,7 +500,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -513,7 +513,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-"""
+""" # noqa: E501
 
     options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
     program = Program(main_nvvm_ir, "nvvm", options)
@@ -542,7 +542,7 @@ def test_bitcode_format():
     if not bitcode_file.exists():
         pytest.skip(f"Bitcode file not found: {bitcode_path}")
 
-    if not bitcode_file.suffix == ".bc":
+    if bitcode_file.suffix != ".bc":
         pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
 
     try:

From 0674ea17dda35f886279bdef2f33a4aa56071562 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 25 Nov 2025 18:15:33 +0000
Subject: [PATCH 07/20] [pre-commit.ci] auto code formatting

---
 cuda_core/tests/test_program.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 7d3e1c8709..1b96a5f094 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -439,7 +439,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
     program = Program(nvvm_ir, "nvvm", options)
@@ -487,7 +487,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!1}}
 !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -500,7 +500,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
@@ -513,7 +513,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 !nvvmir.version = !{{!0}}
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
-""" # noqa: E501
+"""  # noqa: E501
 
     options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
     program = Program(main_nvvm_ir, "nvvm", options)

From 03b122466a0eb10b4adb1b692b4a36b017f7241e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 26 Nov 2025 10:26:42 +0000
Subject: [PATCH 08/20] refresh

---
 cuda_core/cuda/core/experimental/_program.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index eadd37ca95..7d3ea15e46 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -479,11 +479,11 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
         if code_type == "c++":
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
-            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-
+            
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
+            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
             self._mnff.backend = "NVRTC"
             self._backend = "NVRTC"

From 033f11cacf560de6ccf1bf9605070cc912081ddf Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 1 Dec 2025 11:45:49 +0000
Subject: [PATCH 09/20] apply bitcode file from cupy_test helpers

---
 cuda_core/tests/test_program.py               |  61 ++++----
 .../cuda_python_test_helpers/nvvm_bitcode.py  | 136 ++++++++++++++++++
 2 files changed, 163 insertions(+), 34 deletions(-)
 create mode 100644 cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 1b96a5f094..c62e1712f6 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -14,6 +14,13 @@
 cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
 
+try:
+    from cuda_python_test_helpers.nvvm_bitcode import (
+        minimal_nvvmir
+    )
+    _test_helpers_available = True
+except ImportError:
+    _test_helpers_available = False
 
 def _is_nvvm_available():
     """Check if NVVM is available."""
@@ -531,46 +538,32 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-def test_bitcode_format():
+@pytest.mark.skipif(not _test_helpers_available, reason = "cuda_python_test_helpers not accessible")
+def test_bitcode_format(minimal_nvvmir):
     import os
     from pathlib import Path
+    
+    if len(minimal_nvvmir) < 4:
+        pytest.skip("Bitcode file is not valid or empty")
 
-    bitcode_path = os.environ.get("BITCODE_NVVM_PATH")
-    if not bitcode_path:
-        pytest.skip("BITCODE_NVVM_PATH environment variable is not set.Disabling the test.")
-    bitcode_file = Path(bitcode_path)
-    if not bitcode_file.exists():
-        pytest.skip(f"Bitcode file not found: {bitcode_path}")
-
-    if bitcode_file.suffix != ".bc":
-        pytest.skip(f"Expected .bc file, got: {bitcode_file.suffix}")
+    options = ProgramOptions(name=f"minimal_nvvmir_bitcode_test", arch="sm_90")
+    program = Program(minimal_nvvmir, "nvvm", options)
 
+    assert program.backend == "NVVM"
+    ptx_result = program.compile("ptx")
+    assert isinstance(ptx_result, ObjectCode)
+    assert ptx_result.name == "minimal_nvvmir_bitcode_test"
+    assert len(ptx_result.code) > 0
+    prgram_lto = Program(minimal_nvvmir, "nvvm", options)
     try:
-        with open(bitcode_file, "rb") as f:
-            bitcode_data = f.read()
-
-        if len(bitcode_data) < 4:
-            pytest.skip("Bitcode file appears to be empty or invalid")
-
-        options = ProgramOptions(name=f"existing_bitcode_{bitcode_file.stem}", arch="sm_90")
-        program = Program(bitcode_data, "nvvm", options)
-
-        assert program.backend == "NVVM"
-        ptx_result = program.compile("ptx")
-        assert isinstance(ptx_result, ObjectCode)
-        assert ptx_result.name.startswith("existing_bitcode_")
-        assert len(ptx_result.code) > 0
-        try:
-            ltoir_result = program.compile("ltoir")
-            assert isinstance(ltoir_result, ObjectCode)
-            assert len(ltoir_result.code) > 0
-            print(f"LTOIR size: {len(ltoir_result.code)} bytes")
-        except Exception as e:
-            print(f"LTOIR compilation failed : {e}")
-        program.close()
+        ltoir_result = program_lto.compile("ltoir")
+        assert isinstance(ltoir_result, ObjectCode)
+        assert len(ltoir_result.code) > 0
+        print(f"LTOIR size: {len(ltoir_result.code)} bytes")
     except Exception as e:
-        pytest.fail(f"Failed to compile existing bitcode file {bitcode_path}: {str(e)}")
-
+        print(f"LTOIR compilation failed : {e}")
+    finally:
+        program.close()
 
 def test_cpp_program_with_extra_sources():
     # negative test with NVRTC with multiple sources
diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
new file mode 100644
index 0000000000..cba22d5425
--- /dev/null
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
@@ -0,0 +1,136 @@
+import binascii
+import re
+from contextlib import contextmanager
+
+import pytest
+from cuda.bindings import nvvm
+
+MINIMAL_NVVMIR_TXT_TEMPLATE = b"""\
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @kernel() {
+entry:
+  ret void
+}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
+
+!nvvmir.version = !{!1}
+!1 = !{i32 %d, i32 0, i32 %d, i32 0}
+"""  # noqa: E501
+
+MINIMAL_NVVMIR_BITCODE_STATIC = {
+    (1, 3):  # (major, debug_major)
+    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c00007f010000"
+    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
+    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
+    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
+    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
+    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
+    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
+    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
+    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
+    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
+    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
+    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0e86000004016080000"
+    "06000000321e980c19114c908c092647c6044362098c009401000000b1180000ac0000003308801c"
+    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
+    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
+    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
+    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
+    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
+    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
+    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
+    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
+    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
+    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
+    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
+    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
+    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
+    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
+    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
+    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
+    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
+    "e5000000792000001d000000721e482043880c19097232482023818c9191d144a01028643c313242"
+    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c330c4230cc40"
+    "0c4441c84860821272b3b36b730973737ba30ba34b7b739b1b2528d271b3b36b4b9373b12b939b4b"
+    "7b731b2530000000a9180000250000000b0a7228877780077a587098433db8c338b04339d0c382e6"
+    "1cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200fe7500ef4"
+    "b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1cd8211cdc"
+    "e11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe1500ff4800e"
+    "00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c30100000061200000"
+    "06000000130481860301000002000000075010cd14610000000000007120000003000000320e1022"
+    "8400fb020000000000000000650c00001f000000120394f000000000030000000600000006000000"
+    "4c000000010000005800000000000000580000000100000070000000000000000c00000013000000"
+    "1f000000080000000600000000000000700000000000000000000000010000000000000000000000"
+    "060000000000000006000000ffffffff00240000000000005d0c00000d0000001203946700000000"
+    "6b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c737472696e673e00"
+    "00000000",
+    (2, 3):  # (major, debug_major)
+    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c000080010000"
+    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
+    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
+    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
+    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
+    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
+    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
+    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
+    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
+    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
+    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
+    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0286100004016080000"
+    "06000000321e980c19114c908c092647c60443620914c10840190000b1180000ac0000003308801c"
+    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
+    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
+    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
+    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
+    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
+    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
+    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
+    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
+    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
+    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
+    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
+    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
+    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
+    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
+    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
+    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
+    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
+    "e5000000792000001e000000721e482043880c19097232482023818c9191d144a01028643c313242"
+    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c23080431c320"
+    "04c30c045118858c04262821373bbb36973037b737ba30bab437b7b95102231d373bbbb6343917bb"
+    "32b9b9b437b7518203000000a9180000250000000b0a7228877780077a587098433db8c338b04339"
+    "d0c382e61cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200f"
+    "e7500ef4b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1c"
+    "d8211cdce11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe150"
+    "0ff4800e00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c301000000"
+    "6120000006000000130481860301000002000000075010cd14610000000000007120000003000000"
+    "320e10228400fc020000000000000000650c00001f000000120394f0000000000300000006000000"
+    "060000004c000000010000005800000000000000580000000100000070000000000000000c000000"
+    "130000001f0000000800000006000000000000007000000000000000000000000100000000000000"
+    "00000000060000000000000006000000ffffffff00240000000000005d0c00000d00000012039467"
+    "000000006b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c73747269"
+    "6e673e0000000000",
+}
+
+@pytest.fixture(params=("txt", "bitcode_static"))
+def minimal_nvvmir(request):
+    major, minor, debug_major, debug_minor = nvvm.ir_version()
+
+    if request.param == "txt":
+        return MINIMAL_NVVMIR_TXT_TEMPLATE % (major, debug_major)
+
+    bitcode_static_binascii = MINIMAL_NVVMIR_BITCODE_STATIC.get((major, debug_major))
+    if bitcode_static_binascii:
+        return binascii.unhexlify(bitcode_static_binascii)
+    raise RuntimeError(
+        "Static bitcode for NVVM IR version "
+        f"{major}.{debug_major} is not available in this test.\n"
+        "Maintainers: Please run the helper script to generate it and add the "
+        "output to the MINIMAL_NVVMIR_BITCODE_STATIC dict:\n"
+        "  ../../toolshed/build_static_bitcode_input.py"
+    )
\ No newline at end of file

From 6e411ee8619868ec7ac6d4b4304a2e950edf490c Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 1 Dec 2025 17:44:23 +0000
Subject: [PATCH 10/20] use 2 tuples

---
 cuda_core/cuda/core/experimental/_program.py | 58 ++++++++++++--------
 cuda_core/tests/test_program.py              | 11 +++-
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 7d3ea15e46..ba6c4cf0af 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -298,7 +298,10 @@ class ProgramOptions:
     split_compile: int | None = None
     fdevice_syntax_only: bool | None = None
     minimal: bool | None = None
-    extra_sources: Union[str, bytes, list[Union[str, bytes]], tuple[Union[str, bytes]]] | None = None
+    # Creating as 2 tuples ((names, source), (names,source))
+    extra_sources: (
+        Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
+    ) = None
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -468,13 +471,14 @@ def close(self):
                     nvvm.destroy_program(self.handle)
                 self.handle = None
 
-    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
+    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options", "_module_count")
 
     def __init__(self, code, code_type, options: ProgramOptions = None):
         self._mnff = Program._MembersNeededForFinalize(self, None, None)
 
         self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
         code_type = code_type.lower()
+        self._module_count = 0
 
         if code_type == "c++":
             assert_type(code, str)
@@ -509,28 +513,36 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             self._mnff.handle = nvvm.create_program()
             self._mnff.backend = "NVVM"
             nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
+            self._module_count = 1
+             # Add extra modules if provided
             if options.extra_sources is not None:
-                if isinstance(options.extra_sources, (str, bytes)):
-                    extra_sources = [options.extra_sources]
-                elif isinstance(options.extra_sources, (list, tuple)):
-                    extra_sources = options.extra_sources
-                else:
-                    raise TypeError("extra_sources must be str, bytes, list, or tuple")
-
-                if len(extra_sources) == 0:
-                    raise ValueError("extra_sources cannot be empty if provided")
-
-                for i, extra_source in enumerate(extra_sources):
-                    if isinstance(extra_source, str):
-                        extra_source = extra_source.encode("utf-8")
-                    elif not isinstance(extra_source, (bytes, bytearray)):
-                        raise TypeError(f"Extra source {i} must be provided as str, bytes, or bytearray")
-
-                    if len(extra_source) == 0:
-                        raise ValueError(f"Extra source {i} cannot be empty")
-
-                    extra_name = f"{options.name}_extra_{i}"
-                    nvvm.add_module_to_program(self._mnff.handle, extra_source, len(extra_source), extra_name)
+                if not is_sequence(options.extra_sources):
+                    raise TypeError("extra_modules must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)")
+                for i, module in enumerate(options.extra_sources):
+                    if not isinstance(module, tuple) or len(module) != 2:
+                        raise TypeError(
+                            f"Each extra module must be a 2-tuple (name, source), got {type(module).__name__} at index {i}"
+                        )
+                    
+                    module_name, module_source = module
+                    
+                    if not isinstance(module_name, str):
+                        raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
+                    
+                    if isinstance(module_source, str):
+                        # Textual LLVM IR - encode to UTF-8 bytes
+                        module_source = module_source.encode("utf-8")
+                    elif not isinstance(module_source, (bytes, bytearray)):
+                        raise TypeError(
+                            f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
+                            f"or bytearray, got {type(module_source).__name__}"
+                        )
+                    
+                    if len(module_source) == 0:
+                        raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
+                    
+                    nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
+                    self._module_count += 1
 
             self._backend = "NVVM"
             self._linker = None
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index c62e1712f6..aec975e84e 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -435,7 +435,7 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
     nvvm = _get_nvvm_module()
     major, minor, debug_major, debug_minor = nvvm.ir_version()
     # helper nvvm ir for multiple module loading
-    helper_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
+    helper_nvvmir = f"""target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
 define i32 @helper_add(i32 %x) {{
@@ -448,7 +448,12 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """  # noqa: E501
 
-    options = ProgramOptions(name="multi_module_test", extra_sources=helper_nvvm_ir)
+    options = ProgramOptions(
+            name="multi_module_test",
+            extra_sources=[
+                ("helper", helper_nvvmir),
+            ]
+        )   
     program = Program(nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"
@@ -522,7 +527,7 @@ def test_nvvm_program_with_multiple_extra_sources():
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """  # noqa: E501
 
-    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[helper1_ir, helper2_ir])
+    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[("helper1", helper1_ir), ("helper2", helper2_ir),])
     program = Program(main_nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"

From aeb26aa1f8dcdaf8a81f6715faa47c40703ced4e Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 3 Dec 2025 04:16:32 +0000
Subject: [PATCH 11/20] refresh

---
 cuda_core/cuda/core/experimental/_program.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 9f74c3b10a..3aab5de3ae 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -303,7 +303,7 @@ class ProgramOptions:
         Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
     ) = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
-
+    
     def __post_init__(self):
         self._name = self.name.encode()
 
@@ -423,8 +423,6 @@ def __post_init__(self):
             self._formatted_options.append("--fdevice-syntax-only")
         if self.minimal is not None and self.minimal:
             self._formatted_options.append("--minimal")
-        if self.numba_debug:
-            self._formatted_options.append("--numba-debug")
 
     def _as_bytes(self):
         # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved

From b3d6d960faa8f9ff130cd558e91b56213d1b448f Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 3 Dec 2025 07:54:25 +0000
Subject: [PATCH 12/20] format

---
 cuda_core/cuda/core/experimental/_program.py | 27 +++++-----
 cuda_core/tests/test_program.py              | 57 +++++++-------------
 2 files changed, 35 insertions(+), 49 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 3aab5de3ae..efdb39807a 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -7,7 +7,7 @@
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, List, Tuple, Union
 from warnings import warn
 
 if TYPE_CHECKING:
@@ -303,7 +303,7 @@ class ProgramOptions:
         Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
     ) = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
-    
+
     def __post_init__(self):
         self._name = self.name.encode()
 
@@ -484,7 +484,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
         if code_type == "c++":
             assert_type(code, str)
             # TODO: support pre-loaded headers & include names
-            
+
             if options.extra_sources is not None:
                 raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
 
@@ -515,21 +515,24 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             self._mnff.backend = "NVVM"
             nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
             self._module_count = 1
-             # Add extra modules if provided
+            # Add extra modules if provided
             if options.extra_sources is not None:
                 if not is_sequence(options.extra_sources):
-                    raise TypeError("extra_modules must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)")
+                    raise TypeError(
+                        "extra_modules must be a sequence of 2-tuples:((name1, source1), (name2, source2), ...)"
+                    )
                 for i, module in enumerate(options.extra_sources):
                     if not isinstance(module, tuple) or len(module) != 2:
                         raise TypeError(
-                            f"Each extra module must be a 2-tuple (name, source), got {type(module).__name__} at index {i}"
+                            f"Each extra module must be a 2-tuple (name, source)"
+                            f", got {type(module).__name__} at index {i}"
                         )
-                    
+
                     module_name, module_source = module
-                    
+
                     if not isinstance(module_name, str):
-                        raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
-                    
+                        raise TypeError(f"Module name at index {i} must be a string,got {type(module_name).__name__}")
+
                     if isinstance(module_source, str):
                         # Textual LLVM IR - encode to UTF-8 bytes
                         module_source = module_source.encode("utf-8")
@@ -538,10 +541,10 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
                             f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
                             f"or bytearray, got {type(module_source).__name__}"
                         )
-                    
+
                     if len(module_source) == 0:
                         raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
-                    
+
                     nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
                     self._module_count += 1
 
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index e0b9a61023..f9cb8dc5f2 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -15,13 +15,13 @@
 is_culink_backend = _linker._decide_nvjitlink_or_driver()
 
 try:
-    from cuda_python_test_helpers.nvvm_bitcode import (
-        minimal_nvvmir
-    )
+    from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir
+
     _test_helpers_available = True
 except ImportError:
     _test_helpers_available = False
 
+
 def _is_nvvm_available():
     """Check if NVVM is available."""
     try:
@@ -38,29 +38,12 @@ def _is_nvvm_available():
 )
 
 try:
-    from cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc
+    from cuda.core.experimental._utils.cuda_utils import driver, handle_return
 
     _cuda_driver_version = handle_return(driver.cuDriverGetVersion())
 except Exception:
     _cuda_driver_version = 0
 
-
-def _get_nvrtc_version_for_tests():
-    """
-    Get NVRTC version.
-
-    Returns:
-        int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2)
-        None: If NVRTC is not available
-    """
-    try:
-        nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
-        version = nvrtc_major * 1000 + nvrtc_minor * 100
-        return version
-    except Exception:
-        return None
-
-
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 
@@ -200,13 +183,6 @@ def ptx_code_object():
     [
         ProgramOptions(name="abc"),
         ProgramOptions(device_code_optimize=True, debug=True),
-        pytest.param(
-            ProgramOptions(debug=True, numba_debug=True),
-            marks=pytest.mark.skipif(
-                (_get_nvrtc_version_for_tests() or 0) < 13200,
-                reason="numba_debug requires NVRTC >= 13.2",
-            ),
-        ),
         ProgramOptions(relocatable_device_code=True, max_register_count=32),
         ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False),
         ProgramOptions(fma=False, use_fast_math=True),
@@ -473,11 +449,11 @@ def test_nvvm_program_with_single_extra_source(nvvm_ir):
 """  # noqa: E501
 
     options = ProgramOptions(
-            name="multi_module_test",
-            extra_sources=[
-                ("helper", helper_nvvmir),
-            ]
-        )   
+        name="multi_module_test",
+        extra_sources=[
+            ("helper", helper_nvvmir),
+        ],
+    )
     program = Program(nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"
@@ -551,7 +527,13 @@ def test_nvvm_program_with_multiple_extra_sources():
 !0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
 """  # noqa: E501
 
-    options = ProgramOptions(name="nvvm_multi_helper_test", extra_sources=[("helper1", helper1_ir), ("helper2", helper2_ir),])
+    options = ProgramOptions(
+        name="nvvm_multi_helper_test",
+        extra_sources=[
+            ("helper1", helper1_ir),
+            ("helper2", helper2_ir),
+        ],
+    )
     program = Program(main_nvvm_ir, "nvvm", options)
 
     assert program.backend == "NVVM"
@@ -567,11 +549,11 @@ def test_nvvm_program_with_multiple_extra_sources():
 
 
 @nvvm_available
-@pytest.mark.skipif(not _test_helpers_available, reason = "cuda_python_test_helpers not accessible")
+@pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible")
 def test_bitcode_format(minimal_nvvmir):
     import os
     from pathlib import Path
-    
+
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
 
@@ -583,7 +565,7 @@ def test_bitcode_format(minimal_nvvmir):
     assert isinstance(ptx_result, ObjectCode)
     assert ptx_result.name == "minimal_nvvmir_bitcode_test"
     assert len(ptx_result.code) > 0
-    prgram_lto = Program(minimal_nvvmir, "nvvm", options)
+    program_lto = Program(minimal_nvvmir, "nvvm", options)
     try:
         ltoir_result = program_lto.compile("ltoir")
         assert isinstance(ltoir_result, ObjectCode)
@@ -594,6 +576,7 @@ def test_bitcode_format(minimal_nvvmir):
     finally:
         program.close()
 
+
 def test_cpp_program_with_extra_sources():
     # negative test with NVRTC with multiple sources
     code = 'extern "C" __global__ void my_kernel(){}'

From edd6401cc5300c03adb583c021da60b6da18bfe6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 3 Dec 2025 07:55:54 +0000
Subject: [PATCH 13/20] [pre-commit.ci] auto code formatting

---
 cuda_core/tests/test_program.py                              | 5 +----
 .../cuda_python_test_helpers/nvvm_bitcode.py                 | 5 ++---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index f9cb8dc5f2..d0de56a41c 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -551,13 +551,10 @@ def test_nvvm_program_with_multiple_extra_sources():
 @nvvm_available
 @pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible")
 def test_bitcode_format(minimal_nvvmir):
-    import os
-    from pathlib import Path
-
     if len(minimal_nvvmir) < 4:
         pytest.skip("Bitcode file is not valid or empty")
 
-    options = ProgramOptions(name=f"minimal_nvvmir_bitcode_test", arch="sm_90")
+    options = ProgramOptions(name="minimal_nvvmir_bitcode_test", arch="sm_90")
     program = Program(minimal_nvvmir, "nvvm", options)
 
     assert program.backend == "NVVM"
diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
index cba22d5425..e42dbff085 100644
--- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
@@ -1,6 +1,4 @@
 import binascii
-import re
-from contextlib import contextmanager
 
 import pytest
 from cuda.bindings import nvvm
@@ -117,6 +115,7 @@
     "6e673e0000000000",
 }
 
+
 @pytest.fixture(params=("txt", "bitcode_static"))
 def minimal_nvvmir(request):
     major, minor, debug_major, debug_minor = nvvm.ir_version()
@@ -133,4 +132,4 @@ def minimal_nvvmir(request):
         "Maintainers: Please run the helper script to generate it and add the "
         "output to the MINIMAL_NVVMIR_BITCODE_STATIC dict:\n"
         "  ../../toolshed/build_static_bitcode_input.py"
-    )
\ No newline at end of file
+    )

From 8dbbafece0fc088fa8fbb7d042347f6ce905f9b5 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Mon, 15 Dec 2025 17:59:25 +0000
Subject: [PATCH 14/20] fix from upstream

---
 cuda_core/cuda/core/experimental/_program.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index efdb39807a..2610bc1232 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -423,6 +423,8 @@ def __post_init__(self):
             self._formatted_options.append("--fdevice-syntax-only")
         if self.minimal is not None and self.minimal:
             self._formatted_options.append("--minimal")
+        if self.numba_debug:
+            self._formatted_options.append("--numba-debug")
 
     def _as_bytes(self):
         # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved

From b78f0c3e820bce5b6f70903d3d83e10821b02fba Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:22:24 +0000
Subject: [PATCH 15/20] refresh from upstream

---
 cuda_core/tests/test_program.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 91567fdced..02b9899eea 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -184,6 +184,13 @@ def ptx_code_object():
     [
         ProgramOptions(name="abc"),
         ProgramOptions(device_code_optimize=True, debug=True),
+        pytest.param(
+            ProgramOptions(debug=True, numba_debug=True),
+            marks=pytest.mark.skipif(
+                (_get_nvrtc_version_for_tests() or 0) < 13200,
+                reason="numba_debug requires NVRTC >= 13.2",
+            ),
+        ),
         ProgramOptions(relocatable_device_code=True, max_register_count=32),
         ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False),
         ProgramOptions(fma=False, use_fast_math=True),
@@ -714,4 +721,4 @@ def test_program_options_as_bytes_nvvm_unsupported_option():
     """Test that unsupported options raise CUDAError for NVVM backend"""
     options = ProgramOptions(arch="sm_80", lineinfo=True)
     with pytest.raises(CUDAError, match="not supported by NVVM backend"):
-        options.as_bytes("nvvm")
+        options.as_bytes("nvvm")
\ No newline at end of file

From 99a5593d0d82d5de399228eadf4fc6b6ed825ff6 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:27:55 +0000
Subject: [PATCH 16/20] fix tests

---
 cuda_core/tests/test_program.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 02b9899eea..00d0709870 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -45,6 +45,21 @@ def _is_nvvm_available():
 except Exception:
     _cuda_driver_version = 0
 
+
+def _get_nvrtc_version_for_tests():
+    """
+    Get NVRTC version.
+    Returns:
+        int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2)
+        None: If NVRTC is not available
+    """
+    try:
+        nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
+        version = nvrtc_major * 1000 + nvrtc_minor * 100
+        return version
+    except Exception:
+        return None
+
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 

From 783f6e5a419b74757c7620824a7a23ec847321fd Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:29:35 +0000
Subject: [PATCH 17/20] take path_finder from PR 447

---
 .../bindings/_internal/nvjitlink_linux.pyx    |   1 +
 .../cuda/bindings/_path_finder/cuda_paths.py  | 400 ++++++++++++++++++
 .../find_nvidia_dynamic_library.py            | 139 ++++++
 .../cuda/bindings/_path_finder/findlib.py     |  69 +++
 .../load_nvidia_dynamic_library.py            |  92 ++++
 .../_path_finder/sys_path_find_sub_dirs.py    |  40 ++
 cuda_bindings/cuda/bindings/path_finder.py    |  37 ++
 7 files changed, 778 insertions(+)
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/findlib.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
 create mode 100644 cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
 create mode 100644 cuda_bindings/cuda/bindings/path_finder.py

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index 8c96b6d640..d095b3f524 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -12,6 +12,7 @@ from .utils import FunctionNotFoundError, NotSupportedError
 from cuda.pathfinder import load_nvidia_dynamic_lib
 
 
+
 ###############################################################################
 # Extern
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
new file mode 100644
index 0000000000..c827699b3a
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/cuda_paths.py
@@ -0,0 +1,400 @@
+import os
+import platform
+import re
+import site
+import sys
+import traceback
+import warnings
+from collections import namedtuple
+from pathlib import Path
+
+from .findlib import find_file, find_lib
+
+IS_WIN32 = sys.platform.startswith("win32")
+
+_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"])
+
+
+def _get_numba_CUDA_INCLUDE_PATH():
+    # From numba/numba/core/config.py
+
+    def _readenv(name, ctor, default):
+        value = os.environ.get(name)
+        if value is None:
+            return default() if callable(default) else default
+        try:
+            return ctor(value)
+        except Exception:
+            warnings.warn(  # noqa: B028
+                f"Environment variable '{name}' is defined but "
+                f"its associated value '{value}' could not be "
+                "parsed.\nThe parse failed with exception:\n"
+                f"{traceback.format_exc()}",
+                RuntimeWarning,
+            )
+            return default
+
+    if IS_WIN32:
+        cuda_path = os.environ.get("CUDA_PATH")
+        if cuda_path:  # noqa: SIM108
+            default_cuda_include_path = os.path.join(cuda_path, "include")
+        else:
+            default_cuda_include_path = "cuda_include_not_found"
+    else:
+        default_cuda_include_path = os.path.join(os.sep, "usr", "local", "cuda", "include")
+    CUDA_INCLUDE_PATH = _readenv("NUMBA_CUDA_INCLUDE_PATH", str, default_cuda_include_path)
+    return CUDA_INCLUDE_PATH
+
+
+config_CUDA_INCLUDE_PATH = _get_numba_CUDA_INCLUDE_PATH()
+
+
+def _find_valid_path(options):
+    """Find valid path from *options*, which is a list of 2-tuple of
+    (name, path).  Return first pair where *path* is not None.
+    If no valid path is found, return ('<unknown>', None)
+    """
+    for by, data in options:
+        if data is not None:
+            return by, data
+    else:
+        return "<unknown>", None
+
+
+def _get_libdevice_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_libdevice_ctk()),
+        ("CUDA_HOME", get_cuda_home("nvvm", "libdevice")),
+        ("Debian package", get_debian_pkg_libdevice()),
+        ("NVIDIA NVCC Wheel", get_libdevice_wheel()),
+    ]
+    libdevice_ctk_dir = get_system_ctk("nvvm", "libdevice")
+    if libdevice_ctk_dir and os.path.exists(libdevice_ctk_dir):
+        options.append(("System", libdevice_ctk_dir))
+
+    by, libdir = _find_valid_path(options)
+    return by, libdir
+
+
+def _nvvm_lib_dir():
+    if IS_WIN32:
+        return "nvvm", "bin"
+    else:
+        return "nvvm", "lib64"
+
+
+def _get_nvvm_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_nvvm_ctk()),
+        ("CUDA_HOME", get_cuda_home(*_nvvm_lib_dir())),
+        ("NVIDIA NVCC Wheel", _get_nvvm_wheel()),
+    ]
+    # need to ensure nvvm dir actually exists
+    nvvm_ctk_dir = get_system_ctk(*_nvvm_lib_dir())
+    if nvvm_ctk_dir and os.path.exists(nvvm_ctk_dir):
+        options.append(("System", nvvm_ctk_dir))
+
+    by, path = _find_valid_path(options)
+    return by, path
+
+
+def _get_nvvm_wheel():
+    site_paths = [site.getusersitepackages()] + site.getsitepackages() + ["conda", None]
+    for sp in site_paths:
+        # The SONAME is taken based on public CTK 12.x releases
+        if sys.platform.startswith("linux"):
+            dso_dir = "lib64"
+            # Hack: libnvvm from Linux wheel
+            # does not have any soname (CUDAINST-3183)
+            dso_path = "libnvvm.so"
+        elif sys.platform.startswith("win32"):
+            dso_dir = "bin"
+            dso_path = "nvvm64_40_0.dll"
+        else:
+            raise AssertionError()
+
+        if sp is not None:
+            dso_dir = os.path.join(sp, "nvidia", "cuda_nvcc", "nvvm", dso_dir)
+            dso_path = os.path.join(dso_dir, dso_path)
+            if os.path.exists(dso_path):
+                return str(Path(dso_path).parent)
+
+
+def _get_libdevice_paths():
+    by, libdir = _get_libdevice_path_decision()
+    if by == "NVIDIA NVCC Wheel":
+        # The NVVM path is a directory, not a file
+        out = os.path.join(libdir, "libdevice.10.bc")
+    else:
+        # Search for pattern
+        pat = r"libdevice(\.\d+)*\.bc$"
+        candidates = find_file(re.compile(pat), libdir)
+        # Keep only the max (most recent version) of the bitcode files.
+        out = max(candidates, default=None)
+    return _env_path_tuple(by, out)
+
+
+def _cudalib_path():
+    if IS_WIN32:
+        return "bin"
+    else:
+        return "lib64"
+
+
+def _cuda_home_static_cudalib_path():
+    if IS_WIN32:
+        return ("lib", "x64")
+    else:
+        return ("lib64",)
+
+
+def _get_cudalib_dir_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_cudalib_ctk()),
+        ("CUDA_HOME", get_cuda_home(_cudalib_path())),
+        ("System", get_system_ctk(_cudalib_path())),
+    ]
+    by, libdir = _find_valid_path(options)
+    return by, libdir
+
+
+def _get_static_cudalib_dir_path_decision():
+    options = [
+        ("Conda environment", get_conda_ctk()),
+        ("Conda environment (NVIDIA package)", get_nvidia_static_cudalib_ctk()),
+        ("CUDA_HOME", get_cuda_home(*_cuda_home_static_cudalib_path())),
+        ("System", get_system_ctk(_cudalib_path())),
+    ]
+    by, libdir = _find_valid_path(options)
+    return by, libdir
+
+
+def _get_cudalib_dir():
+    by, libdir = _get_cudalib_dir_path_decision()
+    return _env_path_tuple(by, libdir)
+
+
+def _get_static_cudalib_dir():
+    by, libdir = _get_static_cudalib_dir_path_decision()
+    return _env_path_tuple(by, libdir)
+
+
+def get_system_ctk(*subdirs):
+    """Return path to system-wide cudatoolkit; or, None if it doesn't exist."""
+    # Linux?
+    if sys.platform.startswith("linux"):
+        # Is cuda alias to /usr/local/cuda?
+        # We are intentionally not getting versioned cuda installation.
+        base = "/usr/local/cuda"
+        if os.path.exists(base):
+            return os.path.join(base, *subdirs)
+
+
+def get_conda_ctk():
+    """Return path to directory containing the shared libraries of cudatoolkit."""
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
+    if not is_conda_env:
+        return
+    # Assume the existence of NVVM to imply cudatoolkit installed
+    paths = find_lib("nvvm")
+    if not paths:
+        return
+    # Use the directory name of the max path
+    return os.path.dirname(max(paths))
+
+
+def get_nvidia_nvvm_ctk():
+    """Return path to directory containing the NVVM shared library."""
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
+    if not is_conda_env:
+        return
+
+    # Assume the existence of NVVM in the conda env implies that a CUDA toolkit
+    # conda package is installed.
+
+    # First, try the location used on Linux and the Windows 11.x packages
+    libdir = os.path.join(sys.prefix, "nvvm", _cudalib_path())
+    if not os.path.exists(libdir) or not os.path.isdir(libdir):
+        # If that fails, try the location used for Windows 12.x packages
+        libdir = os.path.join(sys.prefix, "Library", "nvvm", _cudalib_path())
+        if not os.path.exists(libdir) or not os.path.isdir(libdir):
+            # If that doesn't exist either, assume we don't have the NVIDIA
+            # conda package
+            return
+
+    paths = find_lib("nvvm", libdir=libdir)
+    if not paths:
+        return
+    # Use the directory name of the max path
+    return os.path.dirname(max(paths))
+
+
+def get_nvidia_libdevice_ctk():
+    """Return path to directory containing the libdevice library."""
+    nvvm_ctk = get_nvidia_nvvm_ctk()
+    if not nvvm_ctk:
+        return
+    nvvm_dir = os.path.dirname(nvvm_ctk)
+    return os.path.join(nvvm_dir, "libdevice")
+
+
+def get_nvidia_cudalib_ctk():
+    """Return path to directory containing the shared libraries of cudatoolkit."""
+    nvvm_ctk = get_nvidia_nvvm_ctk()
+    if not nvvm_ctk:
+        return
+    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
+    subdir = "bin" if IS_WIN32 else "lib"
+    return os.path.join(env_dir, subdir)
+
+
+def get_nvidia_static_cudalib_ctk():
+    """Return path to directory containing the static libraries of cudatoolkit."""
+    nvvm_ctk = get_nvidia_nvvm_ctk()
+    if not nvvm_ctk:
+        return
+
+    if IS_WIN32 and ("Library" not in nvvm_ctk):  # noqa: SIM108
+        # Location specific to CUDA 11.x packages on Windows
+        dirs = ("Lib", "x64")
+    else:
+        # Linux, or Windows with CUDA 12.x packages
+        dirs = ("lib",)
+
+    env_dir = os.path.dirname(os.path.dirname(nvvm_ctk))
+    return os.path.join(env_dir, *dirs)
+
+
+def get_cuda_home(*subdirs):
+    """Get paths of CUDA_HOME.
+    If *subdirs* are the subdirectory name to be appended in the resulting
+    path.
+    """
+    cuda_home = os.environ.get("CUDA_HOME")
+    if cuda_home is None:
+        # Try Windows CUDA installation without Anaconda
+        cuda_home = os.environ.get("CUDA_PATH")
+    if cuda_home is not None:
+        return os.path.join(cuda_home, *subdirs)
+
+
+def _get_nvvm_path():
+    by, path = _get_nvvm_path_decision()
+    if by == "NVIDIA NVCC Wheel":
+        # The NVVM path is a directory, not a file
+        path = os.path.join(path, "libnvvm.so")
+    else:
+        candidates = find_lib("nvvm", path)
+        path = max(candidates) if candidates else None
+    return _env_path_tuple(by, path)
+
+
+def get_cuda_paths():
+    """Returns a dictionary mapping component names to a 2-tuple
+    of (source_variable, info).
+    The returned dictionary will have the following keys and infos:
+    - "nvvm": file_path
+    - "libdevice": List[Tuple[arch, file_path]]
+    - "cudalib_dir": directory_path
+    Note: The result of the function is cached.
+    """
+    # Check cache
+    if hasattr(get_cuda_paths, "_cached_result"):
+        return get_cuda_paths._cached_result
+    else:
+        # Not in cache
+        d = {
+            "nvvm": _get_nvvm_path(),
+            "libdevice": _get_libdevice_paths(),
+            "cudalib_dir": _get_cudalib_dir(),
+            "static_cudalib_dir": _get_static_cudalib_dir(),
+            "include_dir": _get_include_dir(),
+        }
+        # Cache result
+        get_cuda_paths._cached_result = d
+        return d
+
+
+def get_debian_pkg_libdevice():
+    """
+    Return the Debian NVIDIA Maintainers-packaged libdevice location, if it
+    exists.
+    """
+    pkg_libdevice_location = "/usr/lib/nvidia-cuda-toolkit/libdevice"
+    if not os.path.exists(pkg_libdevice_location):
+        return None
+    return pkg_libdevice_location
+
+
+def get_libdevice_wheel():
+    nvvm_path = _get_nvvm_wheel()
+    if nvvm_path is None:
+        return None
+    nvvm_path = Path(nvvm_path)
+    libdevice_path = nvvm_path.parent / "libdevice"
+
+    return str(libdevice_path)
+
+
+def get_current_cuda_target_name():
+    """Determine conda's CTK target folder based on system and machine arch.
+    CTK's conda package delivers headers based on its architecture type. For example,
+    `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and
+    `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
+    nuances at cudart's conda feedstock:
+    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
+    """
+    system = platform.system()
+    machine = platform.machine()
+
+    if system == "Linux":
+        arch_to_targets = {"x86_64": "x86_64-linux", "aarch64": "sbsa-linux"}
+    elif system == "Windows":
+        arch_to_targets = {
+            "AMD64": "x64",
+        }
+    else:
+        arch_to_targets = {}
+
+    return arch_to_targets.get(machine, None)
+
+
+def get_conda_include_dir():
+    """
+    Return the include directory in the current conda environment, if one
+    is active and it exists.
+    """
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
+    if not is_conda_env:
+        return
+
+    if platform.system() == "Windows":
+        include_dir = os.path.join(sys.prefix, "Library", "include")
+    elif target_name := get_current_cuda_target_name():
+        include_dir = os.path.join(sys.prefix, "targets", target_name, "include")
+    else:
+        # A fallback when target cannot determined
+        # though usually it shouldn't.
+        include_dir = os.path.join(sys.prefix, "include")
+
+    if (
+        os.path.exists(include_dir)
+        and os.path.isdir(include_dir)
+        and os.path.exists(os.path.join(include_dir, "cuda_device_runtime_api.h"))
+    ):
+        return include_dir
+    return
+
+
+def _get_include_dir():
+    """Find the root include directory."""
+    options = [
+        ("Conda environment (NVIDIA package)", get_conda_include_dir()),
+        ("CUDA_INCLUDE_PATH Config Entry", config_CUDA_INCLUDE_PATH),
+        # TODO: add others
+    ]
+    by, include_dir = _find_valid_path(options)
+    return _env_path_tuple(by, include_dir)
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
new file mode 100644
index 0000000000..d8413282e6
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py
@@ -0,0 +1,139 @@
+# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import functools
+import glob
+import os
+
+from .cuda_paths import IS_WIN32, get_cuda_paths
+from .sys_path_find_sub_dirs import sys_path_find_sub_dirs
+
+
+def _no_such_file_in_sub_dirs(sub_dirs, file_wild, error_messages, attachments):
+    error_messages.append(f"No such file: {file_wild}")
+    for sub_dir in sys_path_find_sub_dirs(sub_dirs):
+        attachments.append(f'  listdir("{sub_dir}"):')
+        for node in sorted(os.listdir(sub_dir)):
+            attachments.append(f"    {node}")
+
+
+def _find_so_using_nvidia_lib_dirs(libname, so_basename, error_messages, attachments):
+    if libname == "nvvm":  # noqa: SIM108
+        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "lib64")
+    else:
+        nvidia_sub_dirs = ("nvidia", "*", "lib")
+    file_wild = so_basename + "*"
+    for lib_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
+        # First look for an exact match
+        so_name = os.path.join(lib_dir, so_basename)
+        if os.path.isfile(so_name):
+            return so_name
+        # Look for a versioned library
+        # Using sort here mainly to make the result deterministic.
+        for node in sorted(glob.glob(os.path.join(lib_dir, file_wild))):
+            so_name = os.path.join(lib_dir, node)
+            if os.path.isfile(so_name):
+                return so_name
+    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
+    return None
+
+
+def _find_dll_using_nvidia_bin_dirs(libname, error_messages, attachments):
+    if libname == "nvvm":  # noqa: SIM108
+        nvidia_sub_dirs = ("nvidia", "*", "nvvm", "bin")
+    else:
+        nvidia_sub_dirs = ("nvidia", "*", "bin")
+    file_wild = libname + "*.dll"
+    for bin_dir in sys_path_find_sub_dirs(nvidia_sub_dirs):
+        for node in sorted(glob.glob(os.path.join(bin_dir, file_wild))):
+            dll_name = os.path.join(bin_dir, node)
+            if os.path.isfile(dll_name):
+                return dll_name
+    _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments)
+    return None
+
+
+def _get_cuda_paths_info(key, error_messages):
+    env_path_tuple = get_cuda_paths()[key]
+    if not env_path_tuple:
+        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"]')
+        return None
+    if not env_path_tuple.info:
+        error_messages.append(f'Failure obtaining get_cuda_paths()["{key}"].info')
+        return None
+    return env_path_tuple.info
+
+
+def _find_so_using_cudalib_dir(so_basename, error_messages, attachments):
+    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
+    if cudalib_dir is None:
+        return None
+    primary_so_dir = cudalib_dir + "/"
+    candidate_so_dirs = [primary_so_dir]
+    libs = ["/lib/", "/lib64/"]
+    for _ in range(2):
+        alt_dir = libs[0].join(primary_so_dir.rsplit(libs[1], 1))
+        if alt_dir not in candidate_so_dirs:
+            candidate_so_dirs.append(alt_dir)
+        libs.reverse()
+    candidate_so_names = [so_dirname + so_basename for so_dirname in candidate_so_dirs]
+    error_messages = []
+    for so_name in candidate_so_names:
+        if os.path.isfile(so_name):
+            return so_name
+        error_messages.append(f"No such file: {so_name}")
+    for so_dirname in candidate_so_dirs:
+        attachments.append(f'  listdir("{so_dirname}"):')
+        if not os.path.isdir(so_dirname):
+            attachments.append("    DIRECTORY DOES NOT EXIST")
+        else:
+            for node in sorted(os.listdir(so_dirname)):
+                attachments.append(f"    {node}")
+    return None
+
+
+def _find_dll_using_cudalib_dir(libname, error_messages, attachments):
+    cudalib_dir = _get_cuda_paths_info("cudalib_dir", error_messages)
+    if cudalib_dir is None:
+        return None
+    file_wild = libname + "*.dll"
+    for node in sorted(glob.glob(os.path.join(cudalib_dir, file_wild))):
+        dll_name = os.path.join(cudalib_dir, node)
+        if os.path.isfile(dll_name):
+            return dll_name
+    error_messages.append(f"No such file: {file_wild}")
+    attachments.append(f'  listdir("{cudalib_dir}"):')
+    for node in sorted(os.listdir(cudalib_dir)):
+        attachments.append(f"    {node}")
+    return None
+
+
+@functools.cache
+def find_nvidia_dynamic_library(name: str) -> str:
+    error_messages = []
+    attachments = []
+
+    if IS_WIN32:
+        dll_name = _find_dll_using_nvidia_bin_dirs(name, error_messages, attachments)
+        if dll_name is None:
+            if name == "nvvm":
+                dll_name = _get_cuda_paths_info("nvvm", error_messages)
+            else:
+                dll_name = _find_dll_using_cudalib_dir(name, error_messages, attachments)
+        if dll_name is None:
+            attachments = "\n".join(attachments)
+            raise RuntimeError(f"Failure finding {name}*.dll: {', '.join(error_messages)}\n{attachments}")
+        return dll_name
+
+    so_basename = f"lib{name}.so"
+    so_name = _find_so_using_nvidia_lib_dirs(name, so_basename, error_messages, attachments)
+    if so_name is None:
+        if name == "nvvm":
+            so_name = _get_cuda_paths_info("nvvm", error_messages)
+        else:
+            so_name = _find_so_using_cudalib_dir(so_basename, error_messages, attachments)
+    if so_name is None:
+        attachments = "\n".join(attachments)
+        raise RuntimeError(f"Failure finding {so_basename}: {', '.join(error_messages)}\n{attachments}")
+    return so_name
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/findlib.py b/cuda_bindings/cuda/bindings/_path_finder/findlib.py
new file mode 100644
index 0000000000..c64c3c9577
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/findlib.py
@@ -0,0 +1,69 @@
+# Forked from:
+# https://github.com/numba/numba/blob/f0d24824fcd6a454827e3c108882395d00befc04/numba/misc/findlib.py
+
+import os
+import re
+import sys
+
+
+def get_lib_dirs():
+    """
+    Anaconda specific
+    """
+    if sys.platform == "win32":
+        # on windows, historically `DLLs` has been used for CUDA libraries,
+        # since approximately CUDA 9.2, `Library\bin` has been used.
+        dirnames = ["DLLs", os.path.join("Library", "bin")]
+    else:
+        dirnames = [
+            "lib",
+        ]
+    libdirs = [os.path.join(sys.prefix, x) for x in dirnames]
+    return libdirs
+
+
+DLLNAMEMAP = {
+    "linux": r"lib%(name)s\.so\.%(ver)s$",
+    "linux2": r"lib%(name)s\.so\.%(ver)s$",
+    "linux-static": r"lib%(name)s\.a$",
+    "darwin": r"lib%(name)s\.%(ver)s\.dylib$",
+    "win32": r"%(name)s%(ver)s\.dll$",
+    "win32-static": r"%(name)s\.lib$",
+    "bsd": r"lib%(name)s\.so\.%(ver)s$",
+}
+
+RE_VER = r"[0-9]*([_\.][0-9]+)*"
+
+
+def find_lib(libname, libdir=None, platform=None, static=False):
+    platform = platform or sys.platform
+    platform = "bsd" if "bsd" in platform else platform
+    if static:
+        platform = f"{platform}-static"
+    if platform not in DLLNAMEMAP:
+        # Return empty list if platform name is undefined.
+        # Not all platforms define their static library paths.
+        return []
+    pat = DLLNAMEMAP[platform] % {"name": libname, "ver": RE_VER}
+    regex = re.compile(pat)
+    return find_file(regex, libdir)
+
+
+def find_file(pat, libdir=None):
+    if libdir is None:
+        libdirs = get_lib_dirs()
+    elif isinstance(libdir, str):
+        libdirs = [
+            libdir,
+        ]
+    else:
+        libdirs = list(libdir)
+    files = []
+    for ldir in libdirs:
+        try:
+            entries = os.listdir(ldir)
+        except FileNotFoundError:
+            continue
+        candidates = [os.path.join(ldir, ent) for ent in entries if pat.match(ent)]
+        files.extend([c for c in candidates if os.path.isfile(c)])
+    return files
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
new file mode 100644
index 0000000000..69aadabcbf
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/load_nvidia_dynamic_library.py
@@ -0,0 +1,92 @@
+import functools
+import sys
+
+if sys.platform == "win32":
+    import ctypes.wintypes
+
+    import pywintypes
+    import win32api
+
+    # Mirrors WinBase.h (unfortunately not defined already elsewhere)
+    _WINBASE_LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+
+else:
+    import ctypes
+    import os
+
+    _LINUX_CDLL_MODE = os.RTLD_NOW | os.RTLD_GLOBAL
+
+from .find_nvidia_dynamic_library import find_nvidia_dynamic_library
+
+
+@functools.cache
+def _windows_cuDriverGetVersion() -> int:
+    handle = win32api.LoadLibrary("nvcuda.dll")
+
+    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
+    GetProcAddress = kernel32.GetProcAddress
+    GetProcAddress.argtypes = [ctypes.wintypes.HMODULE, ctypes.wintypes.LPCSTR]
+    GetProcAddress.restype = ctypes.c_void_p
+    cuDriverGetVersion = GetProcAddress(handle, b"cuDriverGetVersion")
+    assert cuDriverGetVersion
+
+    FUNC_TYPE = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_int))
+    cuDriverGetVersion_fn = FUNC_TYPE(cuDriverGetVersion)
+    driver_ver = ctypes.c_int()
+    err = cuDriverGetVersion_fn(ctypes.byref(driver_ver))
+    assert err == 0
+    return driver_ver.value
+
+
+@functools.cache
+def _windows_load_with_dll_basename(name: str) -> int:
+    driver_ver = _windows_cuDriverGetVersion()
+    del driver_ver  # Keeping this here because it will probably be needed in the future.
+
+    if name == "nvJitLink":
+        dll_name = "nvJitLink_120_0.dll"
+    elif name == "nvrtc":
+        dll_name = "nvrtc64_120_0.dll"
+    elif name == "nvvm":
+        dll_name = "nvvm64_40_0.dll"
+
+    try:
+        return win32api.LoadLibrary(dll_name)
+    except pywintypes.error:
+        pass
+
+    return None
+
+
+@functools.cache
+def load_nvidia_dynamic_library(name: str) -> int:
+    # First try using the platform-specific dynamic loader search mechanisms
+    if sys.platform == "win32":
+        handle = _windows_load_with_dll_basename(name)
+        if handle:
+            return handle
+    else:
+        dl_path = f"lib{name}.so"  # Version intentionally no specified.
+        try:
+            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
+        except OSError:
+            pass
+        else:
+            # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
+            return handle._handle  # C unsigned int
+
+    dl_path = find_nvidia_dynamic_library(name)
+    if sys.platform == "win32":
+        try:
+            handle = win32api.LoadLibrary(dl_path)
+        except pywintypes.error as e:
+            raise RuntimeError(f"Failed to load DLL at {dl_path}: {e}") from e
+        # Use `cdef void* ptr = <void*><intptr_t>` in cython to convert back to void*
+        return handle  # C signed int, matches win32api.GetProcAddress
+    else:
+        try:
+            handle = ctypes.CDLL(dl_path, _LINUX_CDLL_MODE)
+        except OSError as e:
+            raise RuntimeError(f"Failed to dlopen {dl_path}: {e}") from e
+        # Use `cdef void* ptr = <void*><uintptr_t>` in cython to convert back to void*
+        return handle._handle  # C unsigned int
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
new file mode 100644
index 0000000000..324cdeec30
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_path_finder/sys_path_find_sub_dirs.py
@@ -0,0 +1,40 @@
+# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import functools
+import os
+import sys
+
+
+@functools.cache
+def _impl(sys_path, sub_dirs):
+    results = []
+    for base in sys_path:
+        stack = [(base, 0)]  # (current_path, index into sub_dirs)
+        while stack:
+            current_path, idx = stack.pop()
+            if idx == len(sub_dirs):
+                if os.path.isdir(current_path):
+                    results.append(current_path)
+                continue
+
+            sub = sub_dirs[idx]
+            if sub == "*":
+                try:
+                    entries = sorted(os.listdir(current_path))
+                except OSError:
+                    continue
+                for entry in entries:
+                    entry_path = os.path.join(current_path, entry)
+                    if os.path.isdir(entry_path):
+                        stack.append((entry_path, idx + 1))
+            else:
+                next_path = os.path.join(current_path, sub)
+                if os.path.isdir(next_path):
+                    stack.append((next_path, idx + 1))
+    return results
+
+
+def sys_path_find_sub_dirs(sub_dirs):
+    return _impl(tuple(sys.path), tuple(sub_dirs))
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/path_finder.py b/cuda_bindings/cuda/bindings/path_finder.py
new file mode 100644
index 0000000000..12c02b930b
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/path_finder.py
@@ -0,0 +1,37 @@
+# Copyright 2024-2025 NVIDIA Corporation.  All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+from cuda.bindings._path_finder.cuda_paths import (
+    get_conda_ctk,
+    get_conda_include_dir,
+    get_cuda_home,
+    get_cuda_paths,
+    get_current_cuda_target_name,
+    get_debian_pkg_libdevice,
+    get_libdevice_wheel,
+    get_nvidia_cudalib_ctk,
+    get_nvidia_libdevice_ctk,
+    get_nvidia_nvvm_ctk,
+    get_nvidia_static_cudalib_ctk,
+    get_system_ctk,
+)
+from cuda.bindings._path_finder.find_nvidia_dynamic_library import find_nvidia_dynamic_library
+from cuda.bindings._path_finder.load_nvidia_dynamic_library import load_nvidia_dynamic_library
+
+__all__ = [
+    "find_nvidia_dynamic_library",
+    "load_nvidia_dynamic_library",
+    "get_conda_ctk",
+    "get_conda_include_dir",
+    "get_cuda_home",
+    "get_cuda_paths",
+    "get_current_cuda_target_name",
+    "get_debian_pkg_libdevice",
+    "get_libdevice_wheel",
+    "get_nvidia_cudalib_ctk",
+    "get_nvidia_libdevice_ctk",
+    "get_nvidia_nvvm_ctk",
+    "get_nvidia_static_cudalib_ctk",
+    "get_system_ctk",
+]
\ No newline at end of file

From 5dbfb2d0061d6ca9a656e151d922f4fd9ec255b5 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 13:54:03 +0000
Subject: [PATCH 18/20] add builder files

---
 cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx | 4 ++--
 cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx      | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index d095b3f524..785f587951 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -9,7 +9,7 @@ from libc.stdint cimport intptr_t, uintptr_t
 import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
-from cuda.pathfinder import load_nvidia_dynamic_lib
+from cuda.bindings import path_finder
 
 
 
@@ -77,7 +77,7 @@ cdef void* __nvJitLinkVersion = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvJitLink")._handle_uint
+    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvJitLink")
     return <void*>handle
 
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
index 408a2cb592..15845e3228 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
@@ -10,6 +10,7 @@ import threading
 from .utils import FunctionNotFoundError, NotSupportedError
 
 from cuda.pathfinder import load_nvidia_dynamic_lib
+from cuda.bindings import path_finder
 
 
 ###############################################################################
@@ -75,7 +76,7 @@ cdef void* __nvvmGetProgramLog = NULL
 
 
 cdef void* load_library() except* with gil:
-    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvvm")._handle_uint
+    cdef uintptr_t handle = path_finder.load_nvidia_dynamic_library("nvvm")
     return <void*>handle
 
 

From 0a9eea9967e017bb20eea4801fdbba16e078ce50 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Wed, 17 Dec 2025 14:06:11 +0000
Subject: [PATCH 19/20] use python lists/tuples

---
 cuda_core/cuda/core/experimental/_program.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 4da8191f26..1422ceda5f 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -7,7 +7,7 @@
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Tuple, Union
+from typing import TYPE_CHECKING, Union
 from warnings import warn
 
 if TYPE_CHECKING:

From 79138c0950aaee94426e5858df48010bb6db4530 Mon Sep 17 00:00:00 2001
From: abhilash1910 <debabhi1396@gmail.com>
Date: Thu, 18 Dec 2025 20:01:25 +0000
Subject: [PATCH 20/20] libdevice integration

---
 cuda_core/cuda/core/experimental/_program.py | 58 +++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 1422ceda5f..a5ad2a3be2 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -102,6 +102,47 @@ def _get_nvvm_module():
         _nvvm_module = None
         raise e
 
+def _find_libdevice_path():
+    """
+    Find libdevice.10.bc using cuda.bindings.path_finder.
+    
+    Returns:
+        str: Path to libdevice.10.bc, or None if not found
+    """
+    try:
+        from cuda.bindings.path_finder import (
+            get_nvidia_libdevice_ctk,
+            get_libdevice_wheel,
+            get_debian_pkg_libdevice,
+        )
+        
+        for getter in [get_nvidia_libdevice_ctk, get_libdevice_wheel, get_debian_pkg_libdevice]:
+            try:
+                result = getter()
+                if result is not None and result.info is not None:
+                    return result.info
+            except Exception:
+                continue
+        
+        return None
+    except ImportError:
+        import os
+        
+        # CUDA_HOME
+        cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+        if cuda_home:
+            libdevice_path = os.path.join(cuda_home, "nvvm", "libdevice", "libdevice.10.bc")
+            if os.path.isfile(libdevice_path):
+                return libdevice_path
+        
+        # Linux paths
+        for base in ["/usr/local/cuda", "/opt/cuda"]:
+            libdevice_path = os.path.join(base, "nvvm", "libdevice", "libdevice.10.bc")
+            if os.path.isfile(libdevice_path):
+                return libdevice_path
+        
+        return None
+
 
 def _process_define_macro_inner(formatted_options, macro):
     if isinstance(macro, str):
@@ -352,6 +393,7 @@ class ProgramOptions:
     pch_messages: bool | None = None
     instantiate_templates_in_pch: bool | None = None
     numba_debug: bool | None = None  # Custom option for Numba debugging
+    use_libdevice: bool | None = None # Use libdevice
 
     def __post_init__(self):
         self._name = self.name.encode()
@@ -748,7 +790,8 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
 
                     nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
                     self._module_count += 1
-
+                    
+            self._use_libdevice = options.use_libdevice
             self._backend = "NVVM"
             self._linker = None
 
@@ -866,6 +909,19 @@ def compile(self, target_type, name_expressions=(), logs=None):
             nvvm = _get_nvvm_module()
             with _nvvm_exception_manager(self):
                 nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options)
+                # Invoke libdevice
+                if getattr(self, '_use_libdevice', False):
+                    libdevice_path = _find_libdevice_path()
+                    if libdevice_path is None:
+                        raise RuntimeError(
+                            "use_libdevice=True but could not find libdevice.10.bc. "
+                            "Ensure CUDA toolkit is installed."
+                        )
+                    with open(libdevice_path, "rb") as f:
+                        libdevice_bc = f.read()
+                    # Use lazy_add_module for libdevice bitcode only following numba-cuda
+                    nvvm.lazy_add_module_to_program(self._mnff.handle, libdevice_bc, len(libdevice_bc), None)
+                
                 nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
 
             size = nvvm.get_compiled_result_size(self._mnff.handle)