mikex86
diff --git a/‎driverapi/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎driverapi/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎driverapi/include/librecuda.h‎
Lines changed: 2 additions & 0 deletions b/‎driverapi/include/librecuda.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎driverapi/internal/librecuda_internal.h‎
Lines changed: 32 additions & 8 deletions b/‎driverapi/internal/librecuda_internal.h‎
Lines changed: 32 additions & 8 deletions
diff --git a/‎driverapi/internal/memcopy.h‎
Lines changed: 1 addition & 1 deletion b/‎driverapi/internal/memcopy.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎driverapi/internal/memcopy_kernel.h‎
Lines changed: 0 additions & 408 deletions b/‎driverapi/internal/memcopy_kernel.h‎
Lines changed: 0 additions & 408 deletions
diff --git a/‎driverapi/internal/memcopy_kernels.h‎
Lines changed: 2659 additions & 0 deletions b/‎driverapi/internal/memcopy_kernels.h‎
Lines changed: 2659 additions & 0 deletions
diff --git a/‎driverapi/kernels/memcpy/compile_memcpy.sh‎
Lines changed: 48 additions & 0 deletions b/‎driverapi/kernels/memcpy/compile_memcpy.sh‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎driverapi/kernels/memcpy/generate_header.py‎
Lines changed: 35 additions & 0 deletions b/‎driverapi/kernels/memcpy/generate_header.py‎
Lines changed: 35 additions & 0 deletions
@@ -5,7 +5,7 @@ set(
         src/cmdqueue.cpp
         src/memcopy.cpp
         internal/memcopy.h
-        internal/memcopy_kernel.h
+        internal/memcopy_kernels.h
 )
 if (BUILD_LIBRECUDA_DRVIER_API_STATIC_LIB)
     add_library(driverapi STATIC ${DRIVERAPI_SOURCES})
 
@@ -26,6 +26,8 @@ typedef LibreCUEvent_ *LibreCUEvent;
 
 enum LibreCuDeviceAttribute {
     CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
     CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97
 };
 enum LibreCuFunctionAttribute {
 
@@ -5,6 +5,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+#include <nvidia/ctrl2080gr.h>
 
 #include "nvidia/nvtypes.h"
 #include "nvidia/nvCpuUuid.h"
@@ -14,6 +15,12 @@
 
 struct NvCommandQueue;
 
+enum DriverType {
+    OPEN_KERNEL_MODULES, NVIDIA_PROPRIETARY
+};
+
+extern DriverType driver_type;
+
 struct GPFifo {
     NvU64 *ring;
     NvU32 entries_count;
@@ -28,6 +35,7 @@ struct LibreCUdevice_ {
     NvProcessorUuid uuid;
     NvU32 *gpu_mmio;
     NvU32 compute_class;
+    NV2080_CTRL_GR_INFO device_info[NV2080_CTRL_GR_INFO_INDEX_MAX + 1];
 };
 
 #define UVM_HEAP_START 0x1000000000
@@ -47,6 +55,7 @@ struct LibreCUcontext_ {
     NvHandle channel_group;
 };
 
+extern LibreCUcontext current_ctx;
 
 struct KernelConstantInfo {
     NvU32 const_nr;
@@ -73,6 +82,7 @@ struct LibreCUmodule_ {
     NvU64 module_va_addr{};
 };
 
+
 struct LibreCUFunction_ {
     std::string name;
     NvU64 func_va_addr;
@@ -87,13 +97,19 @@ struct LibreCUFunction_ {
      * Virtual address of shader local memory used for shaders/kernels.
      */
     NvU64 shader_local_memory_va{};
-
 };
 
 struct LibreCUstream_ {
     NvCommandQueue *command_queue;
 };
 
+struct LibreCuRoute {
+    NvBool valid;
+    NvU32 swizId;
+    NvU32 engineId;
+};
+
+
 #define LIBRECUDA_VALIDATE_UVM_IOCTL(ret, data_ptr) {                                    \
     int return_value = ret;                                                              \
     int status = (data_ptr) != nullptr ? (data_ptr)->rmStatus : return_value;            \
@@ -131,12 +147,12 @@ static inline libreCudaStatus_t rm_alloc(int fd, NvV32 clss,
                                          NvHandle *pObjectNew) {
     LIBRECUDA_VALIDATE(fd > 0, LIBRECUDA_ERROR_INVALID_VALUE);
     NVOS21_PARAMETERS parameters{
-            .hRoot=client,
-            .hObjectParent=parent,
-            .hObjectNew=object,
-            .hClass=clss,
-            .pAllocParms=params,
-            .paramsSize=paramSize
+        .hRoot = client,
+        .hObjectParent = parent,
+        .hObjectNew = object,
+        .hClass = clss,
+        .pAllocParms = params,
+        .paramsSize = paramSize
     };
     NV_IOWR(fd, NV_ESC_RM_ALLOC, &parameters, sizeof(parameters));
     if (pObjectNew != nullptr) {
@@ -161,6 +177,7 @@ static inline libreCudaStatus_t rm_ctrl(int fd,
 
 #define RM_CTRL(fd, cmd, client, object, params, paramSize) LIBRECUDA_VALIDATE_RM_CTRL(rm_ctrl(fd, cmd, client, object, params, paramSize))
 
+#define LIBRECUDA_ENSURE_CTX_VALID() LIBRECUDA_VALIDATE(current_ctx != nullptr, LIBRECUDA_ERROR_INVALID_CONTEXT);
 
 libreCudaStatus_t
 gpuAlloc(LibreCUcontext ctx, size_t size, bool physicalContiguity, bool hugePages, bool mapToCpu, NvU32 mapFlags,
@@ -173,9 +190,16 @@ gpuSystemAlloc(LibreCUcontext ctx, size_t size, bool mapToCpu, NvU32 mapFlags,
 libreCudaStatus_t gpuFree(LibreCUcontext ctx, NvU64 virtualAddress);
 
 /**
- * Returns if the pointer is a device pointer.
+ * @param ptr the pointer to check for being a device pointer
+ * @return true if the pointer is a device pointer.
  * This does not mean the ptr is still allocated. It just means it is or was a device pointer.
  */
 bool isDevicePtr(void *ptr);
 
+/**
+ * @param ptr the pointer to check for being mapped to host.
+ * @return true if the pointer is a device pointer and mapped to host. Returns false for host pointers.
+ */
+bool isHostMappedPtr(void *ptr);
+
 #endif //LIBRECUDA_LIBRECUDA_INTERNAL_H
@@ -3,6 +3,6 @@
 #include <librecuda.h>
 #include <cstddef>
 
-libreCudaStatus_t loadMemcpyKernelsIfNeeded();
+libreCudaStatus_t loadMemcpyKernelsIfNeeded(LibreCUdevice device);
 
 libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream, bool async);
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# NOTE: THIS FILE IS NOT PART OF CI
+# Why? Because it would require having nvcc & ptxas installed, which we don't want to assume
+
+# Map each compute capability to its corresponding SM architecture name
+declare -A compute_capabilities=(
+    [5.0]="sm_50"
+    [5.2]="sm_52"
+    [5.3]="sm_53"
+    [6.0]="sm_60"
+    [6.1]="sm_61"
+    [6.2]="sm_62"
+    [7.0]="sm_70"
+    [7.2]="sm_72"
+    [7.5]="sm_75"
+    [8.0]="sm_80"
+    [8.6]="sm_86"
+    [8.7]="sm_87"
+    [8.9]="sm_89"
+    [9.0]="sm_90"
+)
+
+# Define the CUDA source file and the output directory
+source_file="memcpy.cu"
+output_dir="output"
+
+# Create the output directory if it does not exist
+mkdir -p "$output_dir"
+
+# Loop through each compute capability and run nvcc and ptxas
+for capability in "${!compute_capabilities[@]}"; do
+    arch="${compute_capabilities[$capability]}"
+    ptx_file="$output_dir/memcpy_${arch}.ptx"
+    ptxas_file="$output_dir/memcpy_${arch}.cubin"
+
+    # Run nvcc to generate the PTX file
+    nvcc -ptx -arch="$arch" "$source_file" -o "$ptx_file"
+
+    # Run ptxas to compile the PTX file into a cubin (SASS)
+    ptxas -arch="$arch" "$ptx_file" -o "$ptxas_file"
+
+    echo "Processed compute capability $capability ($arch)"
+done
+
+echo "Processing complete."
+
+python3 generate_header.py
@@ -0,0 +1,35 @@
+import os
+import sys
+import binascii
+
+def generate_header_from_cubin(cubin_dir, header_file):
+    # Open the header file for writing
+    with open(header_file, 'w') as header:
+        # Write the include guard (#pragma once) and the <cstdint> include
+        header.write('#pragma once\n\n#include <cstdint>\n\n')
+
+        # Iterate over all .cubin files in the directory
+        for file_name in os.listdir(cubin_dir):
+            if file_name.endswith('.cubin'):
+                # Determine the array name from the file name
+                array_name = file_name.replace('.cubin', '').replace('-', '_').replace(' ', '_').upper()
+                cubin_path = os.path.join(cubin_dir, file_name)
+
+                # Read the contents of the .cubin file
+                with open(cubin_path, 'rb') as cubin_file:
+                    cubin_data = cubin_file.read()
+
+                # Convert binary data to hex and format as uint8_t array
+                hex_data = binascii.hexlify(cubin_data).decode('ascii')
+                hex_data_lines = [hex_data[i:i+64] for i in range(0, len(hex_data), 64)]  # Split into 64-char lines
+
+                # Write array declaration to header file
+                header.write(f'const uint8_t {array_name}[] = {{\n')
+
+                for line in hex_data_lines:
+                    header.write('    ' + ', '.join(f'0x{line[i:i+2]}' for i in range(0, len(line), 2)) + ',\n')
+
+                header.write('};\n\n')
+
+if __name__ == '__main__':
+    generate_header_from_cubin("output", "memcopy_kernels.h")
Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@ set(`
`5`	`5`	`src/cmdqueue.cpp`
`6`	`6`	`src/memcopy.cpp`
`7`	`7`	`internal/memcopy.h`
`8`		`- internal/memcopy_kernel.h`
	`8`	`+ internal/memcopy_kernels.h`
`9`	`9`	`)`
`10`	`10`	`if (BUILD_LIBRECUDA_DRVIER_API_STATIC_LIB)`
`11`	`11`	`add_library(driverapi STATIC ${DRIVERAPI_SOURCES})`