@@ -83,13 +83,35 @@ endif()
83
83
#
84
84
find_package (Torch REQUIRED )
85
85
86
+ #
87
+ message (STATUS "Enabling core extension." )
88
+
89
+ # # Define _core_C extension
90
+ # # built for (almost) every target platform, (excludes TPU and Neuron)
91
+
92
+ # set(VLLM_EXT_SRC
93
+ # "csrc/core/torch_bindings.cpp")
94
+
95
+ # define_gpu_extension_target(
96
+ # _core_C
97
+ # DESTINATION vllm
98
+ # LANGUAGE CXX
99
+ # SOURCES ${VLLM_EXT_SRC}
100
+ # COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
101
+ # USE_SABI 3
102
+ # WITH_SOABI)
103
+
86
104
#
87
105
# Forward the non-CUDA device extensions to external CMake scripts.
88
106
#
89
107
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
90
108
NOT VLLM_TARGET_DEVICE STREQUAL "rocm" )
91
109
if (VLLM_TARGET_DEVICE STREQUAL "cpu" )
92
110
include (${CMAKE_CURRENT_LIST_DIR} /cmake/cpu_extension.cmake )
111
+ elseif (VLLM_TARGET_DEVICE STREQUAL "xpu" )
112
+ message (STATUS "Building XPU" )
113
+ set (VLLM_GPU_LANG "SYCL" )
114
+ include (${CMAKE_CURRENT_LIST_DIR} /cmake/xpu_extension.cmake )
93
115
else ()
94
116
return ()
95
117
endif ()
@@ -242,112 +264,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
242
264
"csrc/custom_all_reduce.cu"
243
265
"csrc/permute_cols.cu"
244
266
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
245
- "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
246
- "csrc/sparse/cutlass/sparse_compressor_entry.cu"
247
- "csrc/cutlass_extensions/common.cpp" )
248
-
249
- set_gencode_flags_for_srcs (
250
- SRCS "${VLLM_EXT_SRC} "
251
- CUDA_ARCHS "${CUDA_ARCHS} " )
252
-
253
- # Only build Marlin kernels if we are building for at least some compatible archs.
254
- # Keep building Marlin for 9.0 as there are some group sizes and shapes that
255
- # are not supported by Machete yet.
256
- cuda_archs_loose_intersection (MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS} )
257
- if (MARLIN_ARCHS )
258
- set (MARLIN_SRCS
259
- "csrc/quantization/fp8/fp8_marlin.cu"
260
- "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
261
- "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
262
- "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
263
- "csrc/quantization/gptq_marlin/gptq_marlin.cu"
264
- "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
265
- "csrc/quantization/gptq_marlin/awq_marlin_repack.cu" )
266
- set_gencode_flags_for_srcs (
267
- SRCS "${MARLIN_SRCS} "
268
- CUDA_ARCHS "${MARLIN_ARCHS} " )
269
- list (APPEND VLLM_EXT_SRC "${MARLIN_SRCS} " )
270
- message (STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS} " )
271
- else ()
272
- message (STATUS "Not building Marlin kernels as no compatible archs found"
273
- " in CUDA target architectures" )
274
- endif ()
275
-
276
- # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
277
- # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
278
- cuda_archs_loose_intersection (SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS} " )
279
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS )
280
- set (SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" )
281
- set_gencode_flags_for_srcs (
282
- SRCS "${SRCS} "
283
- CUDA_ARCHS "${SCALED_MM_3X_ARCHS} " )
284
- list (APPEND VLLM_EXT_SRC "${SRCS} " )
285
- list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1" )
286
- message (STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS} " )
287
- else ()
288
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS )
289
- message (STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
290
- "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
291
- "later if you intend on running FP8 quantized models on "
292
- "Hopper." )
293
- else ()
294
- message (STATUS "Not building scaled_mm_c3x as no compatible archs found "
295
- "in CUDA target architectures" )
296
- endif ()
297
-
298
- # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
299
- # build any 3x kernels
300
- set (SCALED_MM_3X_ARCHS )
301
- endif ()
267
+ "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
268
+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" )
302
269
303
270
#
304
- # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
305
- # kernels for the remaining archs that are not already built for 3x.
306
- cuda_archs_loose_intersection (SCALED_MM_2X_ARCHS
307
- "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS} " )
308
- # subtract out the archs that are already built for 3x
309
- list (REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS} )
310
- if (SCALED_MM_2X_ARCHS )
311
- set (SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" )
312
- set_gencode_flags_for_srcs (
313
- SRCS "${SRCS} "
314
- CUDA_ARCHS "${SCALED_MM_2X_ARCHS} " )
315
- list (APPEND VLLM_EXT_SRC "${SRCS} " )
316
- list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1" )
317
- message (STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS} " )
318
- else ()
319
- if (SCALED_MM_3X_ARCHS )
320
- message (STATUS "Not building scaled_mm_c2x as all archs are already built"
321
- " for and covered by scaled_mm_c3x" )
322
- else ()
323
- message (STATUS "Not building scaled_mm_c2x as no compatible archs found "
324
- "in CUDA target architectures" )
325
- endif ()
326
- endif ()
327
-
328
- #
329
- # 2:4 Sparse Kernels
330
-
331
- # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
332
- # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
333
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS )
334
- set (SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
335
- "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu" )
336
- set_gencode_flags_for_srcs (
337
- SRCS "${SRCS} "
338
- CUDA_ARCHS "${SCALED_MM_3X_ARCHS} " )
339
- list (APPEND VLLM_EXT_SRC "${SRCS} " )
340
- list (APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1" )
341
- message (STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS} " )
342
- else ()
343
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS )
344
- message (STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
345
- "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
346
- "if you intend on running FP8 sparse quantized models on Hopper." )
347
- else ()
348
- message (STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
349
- "in CUDA target architectures" )
350
- endif ()
271
+ # The CUTLASS kernels for Hopper require sm90a to be enabled.
272
+ # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
273
+ # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
274
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 )
275
+ set_source_files_properties (
276
+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
277
+ PROPERTIES
278
+ COMPILE_FLAGS
279
+ "-gencode arch=compute_90a,code=sm_90a" )
351
280
endif ()
352
281
353
282
0 commit comments