Skip to content

Commit 7712865

Browse files
committed
Implemented memcpy kernel sm variants
1 parent bbd6fb5 commit 7712865

40 files changed

+7549
-706
lines changed

driverapi/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ set(
55
src/cmdqueue.cpp
66
src/memcopy.cpp
77
internal/memcopy.h
8-
internal/memcopy_kernel.h
8+
internal/memcopy_kernels.h
99
)
1010
if (BUILD_LIBRECUDA_DRVIER_API_STATIC_LIB)
1111
add_library(driverapi STATIC ${DRIVERAPI_SOURCES})

driverapi/include/librecuda.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ typedef LibreCUEvent_ *LibreCUEvent;
2626

2727
enum LibreCuDeviceAttribute {
2828
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
29+
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
30+
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
2931
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97
3032
};
3133
enum LibreCuFunctionAttribute {

driverapi/internal/librecuda_internal.h

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <unordered_map>
66
#include <unordered_set>
77
#include <vector>
8+
#include <nvidia/ctrl2080gr.h>
89

910
#include "nvidia/nvtypes.h"
1011
#include "nvidia/nvCpuUuid.h"
@@ -14,6 +15,12 @@
1415

1516
struct NvCommandQueue;
1617

18+
enum DriverType {
19+
OPEN_KERNEL_MODULES, NVIDIA_PROPRIETARY
20+
};
21+
22+
extern DriverType driver_type;
23+
1724
struct GPFifo {
1825
NvU64 *ring;
1926
NvU32 entries_count;
@@ -28,6 +35,7 @@ struct LibreCUdevice_ {
2835
NvProcessorUuid uuid;
2936
NvU32 *gpu_mmio;
3037
NvU32 compute_class;
38+
NV2080_CTRL_GR_INFO device_info[NV2080_CTRL_GR_INFO_INDEX_MAX + 1];
3139
};
3240

3341
#define UVM_HEAP_START 0x1000000000
@@ -47,6 +55,7 @@ struct LibreCUcontext_ {
4755
NvHandle channel_group;
4856
};
4957

58+
extern LibreCUcontext current_ctx;
5059

5160
struct KernelConstantInfo {
5261
NvU32 const_nr;
@@ -73,6 +82,7 @@ struct LibreCUmodule_ {
7382
NvU64 module_va_addr{};
7483
};
7584

85+
7686
struct LibreCUFunction_ {
7787
std::string name;
7888
NvU64 func_va_addr;
@@ -87,13 +97,19 @@ struct LibreCUFunction_ {
8797
* Virtual address of shader local memory used for shaders/kernels.
8898
*/
8999
NvU64 shader_local_memory_va{};
90-
91100
};
92101

93102
struct LibreCUstream_ {
94103
NvCommandQueue *command_queue;
95104
};
96105

106+
struct LibreCuRoute {
107+
NvBool valid;
108+
NvU32 swizId;
109+
NvU32 engineId;
110+
};
111+
112+
97113
#define LIBRECUDA_VALIDATE_UVM_IOCTL(ret, data_ptr) { \
98114
int return_value = ret; \
99115
int status = (data_ptr) != nullptr ? (data_ptr)->rmStatus : return_value; \
@@ -131,12 +147,12 @@ static inline libreCudaStatus_t rm_alloc(int fd, NvV32 clss,
131147
NvHandle *pObjectNew) {
132148
LIBRECUDA_VALIDATE(fd > 0, LIBRECUDA_ERROR_INVALID_VALUE);
133149
NVOS21_PARAMETERS parameters{
134-
.hRoot=client,
135-
.hObjectParent=parent,
136-
.hObjectNew=object,
137-
.hClass=clss,
138-
.pAllocParms=params,
139-
.paramsSize=paramSize
150+
.hRoot = client,
151+
.hObjectParent = parent,
152+
.hObjectNew = object,
153+
.hClass = clss,
154+
.pAllocParms = params,
155+
.paramsSize = paramSize
140156
};
141157
NV_IOWR(fd, NV_ESC_RM_ALLOC, &parameters, sizeof(parameters));
142158
if (pObjectNew != nullptr) {
@@ -161,6 +177,7 @@ static inline libreCudaStatus_t rm_ctrl(int fd,
161177

162178
#define RM_CTRL(fd, cmd, client, object, params, paramSize) LIBRECUDA_VALIDATE_RM_CTRL(rm_ctrl(fd, cmd, client, object, params, paramSize))
163179

180+
#define LIBRECUDA_ENSURE_CTX_VALID() LIBRECUDA_VALIDATE(current_ctx != nullptr, LIBRECUDA_ERROR_INVALID_CONTEXT);
164181

165182
libreCudaStatus_t
166183
gpuAlloc(LibreCUcontext ctx, size_t size, bool physicalContiguity, bool hugePages, bool mapToCpu, NvU32 mapFlags,
@@ -173,9 +190,16 @@ gpuSystemAlloc(LibreCUcontext ctx, size_t size, bool mapToCpu, NvU32 mapFlags,
173190
libreCudaStatus_t gpuFree(LibreCUcontext ctx, NvU64 virtualAddress);
174191

175192
/**
176-
* Returns if the pointer is a device pointer.
193+
* @param ptr the pointer to check for being a device pointer
194+
* @return true if the pointer is a device pointer.
177195
* This does not mean the ptr is still allocated. It just means it is or was a device pointer.
178196
*/
179197
bool isDevicePtr(void *ptr);
180198

199+
/**
200+
* @param ptr the pointer to check for being mapped to host.
201+
* @return true if the pointer is a device pointer and mapped to host. Returns false for host pointers.
202+
*/
203+
bool isHostMappedPtr(void *ptr);
204+
181205
#endif //LIBRECUDA_LIBRECUDA_INTERNAL_H

driverapi/internal/memcopy.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
#include <librecuda.h>
44
#include <cstddef>
55

6-
libreCudaStatus_t loadMemcpyKernelsIfNeeded();
6+
libreCudaStatus_t loadMemcpyKernelsIfNeeded(LibreCUdevice device);
77

88
libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream, bool async);

driverapi/internal/memcopy_kernel.h

Lines changed: 0 additions & 408 deletions
This file was deleted.

driverapi/internal/memcopy_kernels.h

Lines changed: 2659 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/bin/bash

# Compiles memcpy.cu for every supported SM architecture (PTX via nvcc, then
# SASS via ptxas) and finally runs generate_header.py on the results.
#
# NOTE: THIS FILE IS NOT PART OF CI
# Why? Because it would require having nvcc & ptxas installed, which we don't want to assume

# Stop at the first failing command (or unset variable) so that a broken
# nvcc/ptxas run can never be followed by "Processed ..." and by generating a
# header from stale or missing cubin files.
set -euo pipefail

# Define the list of compute capabilities and corresponding architecture
declare -A compute_capabilities=(
    [5.0]="sm_50"
    [5.2]="sm_52"
    [5.3]="sm_53"
    [6.0]="sm_60"
    [6.1]="sm_61"
    [6.2]="sm_62"
    [7.0]="sm_70"
    [7.2]="sm_72"
    [7.5]="sm_75"
    [8.0]="sm_80"
    [8.6]="sm_86"
    [8.7]="sm_87"
    [8.9]="sm_89"
    [9.0]="sm_90"
)

# Define the CUDA source file and the output directory
source_file="memcpy.cu"
output_dir="output"

# Create the output directory if it does not exist
mkdir -p "$output_dir"

# Associative arrays are iterated in unspecified order; sort the keys
# numerically (major, then minor) so every run processes and logs the
# architectures in the same order.
mapfile -t sorted_capabilities < <(
    printf '%s\n' "${!compute_capabilities[@]}" | sort -t. -k1,1n -k2,2n
)

# Loop through each compute capability and run nvcc and ptxas
for capability in "${sorted_capabilities[@]}"; do
    arch="${compute_capabilities[$capability]}"
    ptx_file="$output_dir/memcpy_${arch}.ptx"
    ptxas_file="$output_dir/memcpy_${arch}.cubin"

    # Run nvcc to generate the PTX file
    nvcc -ptx -arch="$arch" "$source_file" -o "$ptx_file"

    # Run ptxas to compile the PTX file to SASS
    ptxas -arch="$arch" "$ptx_file" -o "$ptxas_file"

    echo "Processed compute capability $capability ($arch)"
done

echo "Processing complete."

# Turn the generated cubin files into the embedded C++ header
python3 generate_header.py
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import os
2+
import sys
3+
import binascii
4+
5+
def generate_header_from_cubin(cubin_dir, header_file):
    """Write a C++ header that embeds every ``.cubin`` file found in ``cubin_dir``.

    Each cubin becomes one ``const uint8_t NAME[]`` array. NAME is the file
    name with ``.cubin`` removed, ``-`` and spaces replaced by ``_``, and the
    result upper-cased. Files are emitted in sorted file-name order so that
    regenerating the header from the same inputs yields the same text.

    :param cubin_dir: directory scanned (non-recursively) for ``.cubin`` files.
    :param header_file: path of the header file to create or overwrite.
    """
    bytes_per_line = 32

    # Open the header file for writing
    with open(header_file, 'w') as header:
        # Write the include guard and the include that provides uint8_t
        header.write('#pragma once\n\n#include <cstdint>\n\n')

        # os.listdir() returns entries in arbitrary order; sort them so the
        # generated header is reproducible across machines and runs.
        for file_name in sorted(os.listdir(cubin_dir)):
            if not file_name.endswith('.cubin'):
                continue

            # Determine the array name from the file name
            array_name = file_name.replace('.cubin', '').replace('-', '_').replace(' ', '_').upper()
            cubin_path = os.path.join(cubin_dir, file_name)

            # Read the contents of the .cubin file
            with open(cubin_path, 'rb') as cubin_file:
                cubin_data = cubin_file.read()

            # Write array declaration to header file
            header.write(f'const uint8_t {array_name}[] = {{\n')

            # Emit the bytes as lowercase hex literals, 32 per line
            for offset in range(0, len(cubin_data), bytes_per_line):
                chunk = cubin_data[offset:offset + bytes_per_line]
                header.write(' ' + ', '.join(f'0x{byte:02x}' for byte in chunk) + ',\n')

            header.write('};\n\n')


if __name__ == '__main__':
    generate_header_from_cubin("output", "memcopy_kernels.h")

0 commit comments

Comments
 (0)