Skip to content

Commit b24e128

Browse files
committed
init 0.6.6
2 parents 2339d59 + 8fb3efa commit b24e128

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

61 files changed

+9897
-515
lines changed

CMakeLists.txt

Lines changed: 33 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,35 @@ endif()
8383
#
8484
find_package(Torch REQUIRED)
8585

86+
#
87+
message(STATUS "Enabling core extension.")
88+
89+
# # Define _core_C extension
90+
# # built for (almost) every target platform, (excludes TPU and Neuron)
91+
92+
# set(VLLM_EXT_SRC
93+
# "csrc/core/torch_bindings.cpp")
94+
95+
# define_gpu_extension_target(
96+
# _core_C
97+
# DESTINATION vllm
98+
# LANGUAGE CXX
99+
# SOURCES ${VLLM_EXT_SRC}
100+
# COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
101+
# USE_SABI 3
102+
# WITH_SOABI)
103+
86104
#
87105
# Forward the non-CUDA device extensions to external CMake scripts.
88106
#
89107
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
90108
NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
91109
if (VLLM_TARGET_DEVICE STREQUAL "cpu")
92110
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
111+
elseif(VLLM_TARGET_DEVICE STREQUAL "xpu")
112+
message(STATUS "Building XPU")
113+
set(VLLM_GPU_LANG "SYCL")
114+
include(${CMAKE_CURRENT_LIST_DIR}/cmake/xpu_extension.cmake)
93115
else()
94116
return()
95117
endif()
@@ -242,112 +264,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
242264
"csrc/custom_all_reduce.cu"
243265
"csrc/permute_cols.cu"
244266
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
245-
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
246-
"csrc/sparse/cutlass/sparse_compressor_entry.cu"
247-
"csrc/cutlass_extensions/common.cpp")
248-
249-
set_gencode_flags_for_srcs(
250-
SRCS "${VLLM_EXT_SRC}"
251-
CUDA_ARCHS "${CUDA_ARCHS}")
252-
253-
# Only build Marlin kernels if we are building for at least some compatible archs.
254-
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
255-
# are not supported by Machete yet.
256-
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
257-
if (MARLIN_ARCHS)
258-
set(MARLIN_SRCS
259-
"csrc/quantization/fp8/fp8_marlin.cu"
260-
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
261-
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
262-
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
263-
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
264-
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
265-
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
266-
set_gencode_flags_for_srcs(
267-
SRCS "${MARLIN_SRCS}"
268-
CUDA_ARCHS "${MARLIN_ARCHS}")
269-
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
270-
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
271-
else()
272-
message(STATUS "Not building Marlin kernels as no compatible archs found"
273-
" in CUDA target architectures")
274-
endif()
275-
276-
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
277-
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
278-
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
279-
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
280-
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
281-
set_gencode_flags_for_srcs(
282-
SRCS "${SRCS}"
283-
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
284-
list(APPEND VLLM_EXT_SRC "${SRCS}")
285-
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
286-
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
287-
else()
288-
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
289-
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
290-
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
291-
"later if you intend on running FP8 quantized models on "
292-
"Hopper.")
293-
else()
294-
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
295-
"in CUDA target architectures")
296-
endif()
297-
298-
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
299-
# build any 3x kernels
300-
set(SCALED_MM_3X_ARCHS)
301-
endif()
267+
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
268+
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
302269

303270
#
304-
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
305-
# kernels for the remaining archs that are not already built for 3x.
306-
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
307-
"7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
308-
# subtract out the archs that are already built for 3x
309-
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
310-
if (SCALED_MM_2X_ARCHS)
311-
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
312-
set_gencode_flags_for_srcs(
313-
SRCS "${SRCS}"
314-
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
315-
list(APPEND VLLM_EXT_SRC "${SRCS}")
316-
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
317-
message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
318-
else()
319-
if (SCALED_MM_3X_ARCHS)
320-
message(STATUS "Not building scaled_mm_c2x as all archs are already built"
321-
" for and covered by scaled_mm_c3x")
322-
else()
323-
message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
324-
"in CUDA target architectures")
325-
endif()
326-
endif()
327-
328-
#
329-
# 2:4 Sparse Kernels
330-
331-
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
332-
# require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
333-
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
334-
set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
335-
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
336-
set_gencode_flags_for_srcs(
337-
SRCS "${SRCS}"
338-
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
339-
list(APPEND VLLM_EXT_SRC "${SRCS}")
340-
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
341-
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
342-
else()
343-
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
344-
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
345-
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
346-
"if you intend on running FP8 sparse quantized models on Hopper.")
347-
else()
348-
message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
349-
"in CUDA target architectures")
350-
endif()
271+
# The CUTLASS kernels for Hopper require sm90a to be enabled.
272+
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
273+
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
274+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
275+
set_source_files_properties(
276+
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
277+
PROPERTIES
278+
COMPILE_FLAGS
279+
"-gencode arch=compute_90a,code=sm_90a")
351280
endif()
352281

353282

benchmarks/backend_request_func.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,8 @@ async def async_request_openai_completions(
240240
"temperature": 0.0,
241241
"best_of": request_func_input.best_of,
242242
"max_tokens": request_func_input.output_len,
243-
"logprobs": request_func_input.logprobs,
243+
"min_tokens": request_func_input.output_len,
244+
"ignore_eos": True,
244245
"stream": True,
245246
"ignore_eos": request_func_input.ignore_eos,
246247
}

benchmarks/benchmark_prefix_caching.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@
3333

3434
from transformers import PreTrainedTokenizerBase
3535

36-
from vllm import LLM, SamplingParams
36+
from vllm import SamplingParams
37+
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
3738
from vllm.engine.arg_utils import EngineArgs
3839
from vllm.utils import FlexibleArgumentParser
3940

@@ -190,7 +191,7 @@ def main(args):
190191

191192
engine_args = EngineArgs.from_cli_args(args)
192193

193-
llm = LLM(**dataclasses.asdict(engine_args))
194+
llm = LLM(**dataclasses.asdict(engine_args), load_in_low_bit=args.load_in_low_bit)
194195

195196
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
196197

@@ -242,6 +243,13 @@ def main(args):
242243
"when dataset-path is not provided.",
243244
)
244245

246+
parser.add_argument(
247+
"--load-in-low-bit",
248+
type=str,
249+
choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"],
250+
default="sym_int4",
251+
help="Low-bit format quantization with IPEX-LLM")
252+
245253
parser = EngineArgs.add_cli_args(parser)
246254
args = parser.parse_args()
247-
main(args)
255+
main(args)

benchmarks/benchmark_serving.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -779,7 +779,8 @@ def main(args: argparse.Namespace):
779779
np.random.seed(args.seed)
780780

781781
backend = args.backend
782-
model_id = args.model
782+
# model_id = args.model
783+
model_id = args.model.split('/')[-1]
783784
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
784785
tokenizer_mode = args.tokenizer_mode
785786

cmake/utils.cmake

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
379379
GPU
380380
"WITH_SOABI"
381381
"DESTINATION;LANGUAGE;USE_SABI"
382-
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
382+
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES;LINK_FLAGS")
383383

384384
# Add hipify preprocessing step when building with HIP/ROCm.
385385
if (GPU_LANGUAGE STREQUAL "HIP")
@@ -421,6 +421,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
421421

422422
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
423423

424+
if (GPU_LANGUAGE STREQUAL "SYCL")
425+
target_compile_options(${GPU_MOD_NAME} PRIVATE ${GPU_COMPILE_FLAGS})
426+
target_link_options(${GPU_MOD_NAME} PRIVATE ${GPU_LINK_FLAGS})
427+
endif()
428+
424429
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
425430
# dependencies that are not necessary and may not be installed.
426431
if (GPU_LANGUAGE STREQUAL "CUDA")

cmake/xpu_extension.cmake

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
2+
3+
#
4+
# Define environment variables for special configurations
5+
#
6+
# TODO: detect Intel GPU Architecture(PVC or Arc) to add AOT flag.
7+
8+
#
9+
# Check the compile flags
10+
#
11+
append_cmake_prefix_path("intel_extension_for_pytorch" "intel_extension_for_pytorch.cmake_prefix_path")
12+
find_package(IPEX REQUIRED)
13+
# IPEX will overwrite TORCH_LIBRARIES, so re-add torch_python lib.
14+
append_torchlib_if_found(torch_python)
15+
include_directories(${IPEX_INCLUDE_DIRS})
16+
set(CMPLR_ROOT $ENV{CMPLR_ROOT})
17+
set(CMAKE_CXX_COMPILER icpx)
18+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
19+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
20+
set(VLLM_EXTRA_INCLUDE_DIRECTORIES ${CMPLR_ROOT}/include/sycl)
21+
22+
list(APPEND VLLM_GPU_FLAGS "-fsycl" "-fsycl-targets=spir64")
23+
list(APPEND VLLM_GPU_LINK_FLAGS "-fsycl" "-fsycl-targets=spir64")
24+
list(APPEND VLLM_LINK_LIBRARIES "sycl" "OpenCL" "pthread" "m" "dl" "dnnl" "intel-ext-pt-gpu" )
25+
26+
#
27+
# Define extension targets
28+
#
29+
30+
#
31+
# _C extension
32+
#
33+
set(VLLM_EXT_SRC
34+
"csrc/xpu/activation_xpu.cpp"
35+
"csrc/xpu/attention_xpu.cpp"
36+
"csrc/xpu/cache_ops_xpu.cpp"
37+
"csrc/xpu/gemm_kernels_xpu.cpp"
38+
"csrc/xpu/layernorm_xpu.cpp"
39+
"csrc/xpu/pos_encoding_xpu.cpp"
40+
"csrc/xpu/utils.cpp"
41+
"csrc/xpu/pybind.cpp")
42+
43+
define_gpu_extension_target(
44+
_C
45+
DESTINATION vllm
46+
LANGUAGE ${VLLM_GPU_LANG}
47+
SOURCES ${VLLM_EXT_SRC}
48+
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
49+
LINK_FLAGS ${VLLM_GPU_LINK_FLAGS}
50+
ARCHITECTURES ${VLLM_GPU_ARCHES}
51+
INCLUDE_DIRECTORIES ${VLLM_EXTRA_INCLUDE_DIRECTORIES}
52+
LIBRARIES ${VLLM_LINK_LIBRARIES}
53+
WITH_SOABI
54+
)
55+
56+
add_custom_target(default_xpu)
57+
message(STATUS "Enabling C extension.")
58+
add_dependencies(default_xpu _C)
59+

0 commit comments

Comments
 (0)