#include <librecuda.h>

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
8
+
9
+ inline void cudaCheck (libreCudaStatus_t error, const char *file, int line) {
10
+ if (error != LIBRECUDA_SUCCESS) {
11
+ const char *error_string;
12
+ libreCuGetErrorString (error, &error_string);
13
+ printf (" [CUDA ERROR] at file %s:%d: %s\n " , file, line, error_string);
14
+ exit (EXIT_FAILURE);
15
+ }
16
+ };
17
+ #define CUDA_CHECK (err ) (cudaCheck(err, __FILE__, __LINE__))
18
+
19
+ int main () {
20
+ CUDA_CHECK (libreCuInit (0 ));
21
+
22
+ int device_count{};
23
+ CUDA_CHECK (libreCuDeviceGetCount (&device_count));
24
+ std::cout << " Device count: " + std::to_string (device_count) << std::endl;
25
+
26
+ LibreCUdevice device{};
27
+ CUDA_CHECK (libreCuDeviceGet (&device, 0 ));
28
+
29
+ LibreCUcontext ctx{};
30
+ CUDA_CHECK (libreCuCtxCreate_v2 (&ctx, CU_CTX_SCHED_YIELD, device));
31
+
32
+ char name_buffer[256 ] = {};
33
+ libreCuDeviceGetName (name_buffer, 256 , device);
34
+ std::cout << " Device Name: " + std::string (name_buffer) << std::endl;
35
+
36
+ LibreCUmodule module {};
37
+
38
+ // read cubin file
39
+ uint8_t *image;
40
+ size_t n_bytes;
41
+ {
42
+ std::ifstream input (" write_float.cubin" , std::ios::binary);
43
+ std::vector<uint8_t > bytes (
44
+ (std::istreambuf_iterator<char >(input)),
45
+ (std::istreambuf_iterator<char >()));
46
+ input.close ();
47
+ image = new uint8_t [bytes.size ()];
48
+ memcpy (image, bytes.data (), bytes.size ());
49
+ n_bytes = bytes.size ();
50
+ }
51
+ CUDA_CHECK (libreCuModuleLoadData (&module , image, n_bytes));
52
+
53
+ // read functions
54
+ uint32_t num_funcs{};
55
+ CUDA_CHECK (libreCuModuleGetFunctionCount (&num_funcs, module ));
56
+ std::cout << " Num functions: " << num_funcs << std::endl;
57
+
58
+ auto *functions = new LibreCUFunction[num_funcs];
59
+ CUDA_CHECK (libreCuModuleEnumerateFunctions (functions, num_funcs, module ));
60
+
61
+ for (size_t i = 0 ; i < num_funcs; i++) {
62
+ LibreCUFunction func = functions[i];
63
+ const char *func_name{};
64
+ CUDA_CHECK (libreCuFuncGetName (&func_name, func));
65
+ std::cout << " function \" " << func_name << " \" " << std::endl;
66
+ }
67
+
68
+ delete[] functions;
69
+
70
+ // find function
71
+ LibreCUFunction func{};
72
+ CUDA_CHECK (libreCuModuleGetFunction (&func, module , " write_float" ));
73
+
74
+ // create stream
75
+ LibreCUstream stream{};
76
+ CUDA_CHECK (libreCuStreamCreate (&stream, 0 ));
77
+
78
+ void *float_dst_compute_va{};
79
+ void *float_dst_dma_va{};
80
+ CUDA_CHECK (libreCuMemAlloc (&float_dst_compute_va, sizeof (float ), true ));
81
+ CUDA_CHECK (libreCuMemAlloc (&float_dst_dma_va, sizeof (float ), true ));
82
+ *(float *) float_dst_compute_va = 0 .0f ;
83
+ *(float *) float_dst_dma_va = 0 .0f ;
84
+
85
+ // first time execution of the kernel
86
+ auto start = std::chrono::high_resolution_clock::now ();
87
+ {
88
+ void *params[] = {
89
+ &float_dst_compute_va, &float_dst_dma_va
90
+ };
91
+ CUDA_CHECK (
92
+ libreCuLaunchKernel (func,
93
+ 1 , 1 , 1 ,
94
+ 1 , 1 , 1 ,
95
+ 0 ,
96
+ stream,
97
+ params, sizeof (params) / sizeof (void *),
98
+ nullptr ,
99
+ false
100
+ )
101
+ );
102
+ }
103
+ CUDA_CHECK (libreCuStreamCommence (stream));
104
+ CUDA_CHECK (libreCuStreamAwait (stream));
105
+ auto end = std::chrono::high_resolution_clock::now ();
106
+ std::cout << " Single kernel took: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count ()
107
+ << " ms" << std::endl;
108
+
109
+ start = std::chrono::high_resolution_clock::now ();
110
+ {
111
+ void *params[] = {
112
+ &float_dst_compute_va, &float_dst_dma_va
113
+ };
114
+ for (int i = 0 ; i < 5 ; i++) {
115
+ CUDA_CHECK (
116
+ libreCuLaunchKernel (func,
117
+ 1 , 1 , 1 ,
118
+ 1 , 1 , 1 ,
119
+ 0 ,
120
+ stream,
121
+ params, sizeof (params) / sizeof (void *),
122
+ nullptr ,
123
+ true
124
+ )
125
+ );
126
+ }
127
+ }
128
+ CUDA_CHECK (libreCuStreamCommence (stream));
129
+ CUDA_CHECK (libreCuStreamAwait (stream));
130
+ end = std::chrono::high_resolution_clock::now ();
131
+ std::cout << " 5xParallel kernel took: "
132
+ << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count ()
133
+ << " ms" << std::endl;
134
+
135
+ // free memory
136
+ CUDA_CHECK (libreCuMemFree (float_dst_compute_va));
137
+ CUDA_CHECK (libreCuMemFree (float_dst_dma_va));
138
+
139
+ // destroy stream
140
+ CUDA_CHECK (libreCuStreamDestroy (stream));
141
+
142
+ // unload module
143
+ CUDA_CHECK (libreCuModuleUnload (module ));
144
+
145
+ // destroy ctx
146
+ CUDA_CHECK (libreCuCtxDestroy (ctx));
147
+ return 0 ;
148
+ }
0 commit comments