
Commit 33c63e9

Authored by: jinzhen-lin, lengrongfu, sidhpurwala-huzaifa, gemini-code-assist[bot], russellb
[Kernel] [Quantization] Add MXFP4 and bias support for marlin kernel (vllm-project#22428)
Signed-off-by: rongfu.leng <[email protected]> Signed-off-by: Jinzhen Lin <[email protected]> Signed-off-by: Huzaifa Sidhpurwala <[email protected]> Signed-off-by: Varun Sundar Rabindranath <[email protected]> Signed-off-by: Harry Mellor <[email protected]> Signed-off-by: Jee Jee Li <[email protected]> Signed-off-by: mgoin <[email protected]> Signed-off-by: Animesh Jain <[email protected]> Signed-off-by: Rui Qiao <[email protected]> Signed-off-by: Xiongfei Wei <[email protected]> Signed-off-by: Nick Hill <[email protected]> Signed-off-by: yewentao256 <[email protected]> Signed-off-by: kf <[email protected]> Signed-off-by: vllmellm <[email protected]> Signed-off-by: NickLucche <[email protected]> Signed-off-by: Dipika Sikka <[email protected]> Signed-off-by: Sage Moore <[email protected]> Signed-off-by: tjtanaavllm <[email protected]> Signed-off-by: Yong Hoon Shin <[email protected]> Signed-off-by: Chih-Chieh-Yang <[email protected]> Signed-off-by: Roger Wang <[email protected]> Signed-off-by: Vadim Gimpelson <[email protected]> Signed-off-by: Isotr0py <[email protected]> Signed-off-by: zRzRzRzRzRzRzR <[email protected]> Signed-off-by: Chih-Chieh Yang <[email protected]> Signed-off-by: DarkLight1337 <[email protected]> Signed-off-by: Thomas Parnell <[email protected]> Signed-off-by: yan <[email protected]> Signed-off-by: Yan Ma <[email protected]> Signed-off-by: Xiao Liu <[email protected]> Signed-off-by: jiahanc <[email protected]> Signed-off-by: Isotr0py <[email protected]> Signed-off-by: Ye (Charlotte) Qi <[email protected]> Signed-off-by: LopezCastroRoberto <[email protected]> Signed-off-by: Andy Xie <[email protected]> Signed-off-by: Haibin Lin <[email protected]> Signed-off-by: David Ben-David <[email protected]> Signed-off-by: Woosuk Kwon <[email protected]> Signed-off-by: jiang1.li <[email protected]> Signed-off-by: Seiji Eicher <[email protected]> Signed-off-by: zitian.zhao <[email protected]> Signed-off-by: 22quinn <[email protected]> Signed-off-by: Abirdcfly <[email protected]> Signed-off-by: Giancarlo Delfin <[email protected]> Signed-off-by: Tyler Michael Smith <[email protected]> Signed-off-by: huangweixiao <[email protected]> Signed-off-by: alyosha-swamy <[email protected]> Signed-off-by: Eric Hanley <[email protected]> Signed-off-by: Abatom <[email protected]> Signed-off-by: CLFutureX <[email protected]> Signed-off-by: Linkun Chen <[email protected]> Signed-off-by: tjtanaa <[email protected]> Signed-off-by: Gregory Shtrasberg <[email protected]> Signed-off-by: tlipoca9 <[email protected]> Signed-off-by: elvischenv <[email protected]> Signed-off-by: zitian zhao <[email protected]> Signed-off-by: mgoin <[email protected]> Signed-off-by: wang.yuqi <[email protected]> Signed-off-by: Benji Beck <[email protected]> Signed-off-by: Siyuan Liu <[email protected]> Signed-off-by: Benjamin Chislett <[email protected]> Signed-off-by: isotr0py <[email protected]> Signed-off-by: Chen Zhang <[email protected]> Signed-off-by: simon-mo <[email protected]> Signed-off-by: LucasWilkinson <[email protected]> Signed-off-by: Zhang Jason <[email protected]> Signed-off-by: Yongye Zhu <[email protected]> Signed-off-by: asafg <[email protected]> Signed-off-by: Siyuan Fu <[email protected]> Signed-off-by: Lain <[email protected]> Signed-off-by: Max de Bayser <[email protected]> Signed-off-by: Lucas Wilkinson <[email protected]> Signed-off-by: Kunshang Ji <[email protected]> Signed-off-by: Tao He <[email protected]> Signed-off-by: Michael Goin <[email protected]> Signed-off-by: QscQ <[email protected]> 
Signed-off-by: qingjun <[email protected]> Signed-off-by: Syed Muhammad Bin Asif <[email protected]> Signed-off-by: Lionel Villard <[email protected]> Signed-off-by: ycyaw66 <[email protected]> Signed-off-by: David Chen <[email protected]> Signed-off-by: Linkun <[email protected]> Signed-off-by: Moritz Sanft <[email protected]> Signed-off-by: Ming Yang <[email protected]> Signed-off-by: Adrian Garcia <[email protected]> Signed-off-by: shaojunqi <[email protected]> Signed-off-by: Ricardo Decal <[email protected]> Signed-off-by: Andrew Chan <[email protected]> Signed-off-by: Felix Marty <[email protected]> Signed-off-by: Andrew Sansom <[email protected]> Signed-off-by: Zhiyu Cheng <[email protected]> Signed-off-by: Shu Wang <[email protected]> Signed-off-by: Po-Han Huang <[email protected]> Signed-off-by: Shu Wang. <[email protected]> Signed-off-by: XIn Li <[email protected]> Signed-off-by: Junhao Li <[email protected]> Signed-off-by: chaunceyjiang <[email protected]> Signed-off-by: iAmir97 <[email protected]> Signed-off-by: iAmir97 <[email protected]> Signed-off-by: <[email protected]> Signed-off-by: Guy Stone <[email protected]> Signed-off-by: <[email protected]> Signed-off-by: yyw <[email protected]> Signed-off-by: Russell Bryant <[email protected]> Signed-off-by: Pradyun Ramadorai <[email protected]> Signed-off-by: Pradyun92 <[email protected]> Signed-off-by: Jinzhen Lin <[email protected]> Co-authored-by: rongfu.leng <[email protected]> Co-authored-by: Huzaifa Sidhpurwala <[email protected]> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Russell Bryant <[email protected]> Co-authored-by: Varun Sundar Rabindranath <[email protected]> Co-authored-by: Varun Sundar Rabindranath <[email protected]> Co-authored-by: Harry Mellor <[email protected]> Co-authored-by: Jee Jee Li <[email protected]> Co-authored-by: Michael Goin <[email protected]> Co-authored-by: Animesh Jain <[email protected]> Co-authored-by: Rui Qiao <[email protected]> Co-authored-by: XiongfeiWei <[email protected]> Co-authored-by: Nick Hill <[email protected]> Co-authored-by: Wentao Ye <[email protected]> Co-authored-by: JartX <[email protected]> Co-authored-by: fhl2000 <[email protected]> Co-authored-by: vllmellm <[email protected]> Co-authored-by: kf <[email protected]> Co-authored-by: Nicolò Lucchesi <[email protected]> Co-authored-by: Dipika Sikka <[email protected]> Co-authored-by: Sage Moore <[email protected]> Co-authored-by: tjtanaavllm <[email protected]> Co-authored-by: Yong Hoon Shin <[email protected]> Co-authored-by: Chih-Chieh Yang <[email protected]> Co-authored-by: Roger Wang <[email protected]> Co-authored-by: Vadim Gimpelson <[email protected]> Co-authored-by: Yuxuan Zhang <[email protected]> Co-authored-by: Isotr0py <[email protected]> Co-authored-by: Cyrus Leung <[email protected]> Co-authored-by: Thomas Parnell <[email protected]> Co-authored-by: Yan Ma <[email protected]> Co-authored-by: Xiao <[email protected]> Co-authored-by: jiahanc <[email protected]> Co-authored-by: Isotr0py <[email protected]> Co-authored-by: Ye (Charlotte) Qi <[email protected]> Co-authored-by: Roberto L. 
Castro <[email protected]> Co-authored-by: Ning Xie <[email protected]> Co-authored-by: H <[email protected]> Co-authored-by: David Ben-David <[email protected]> Co-authored-by: David Ben-David <[email protected]> Co-authored-by: Woosuk Kwon <[email protected]> Co-authored-by: Li, Jiang <[email protected]> Co-authored-by: TankNee <[email protected]> Co-authored-by: Cyrus Leung <[email protected]> Co-authored-by: Seiji Eicher <[email protected]> Co-authored-by: ZiTian.Zhao <[email protected]> Co-authored-by: 22quinn <[email protected]> Co-authored-by: Abirdcfly <[email protected]> Co-authored-by: Giancarlo Delfin <[email protected]> Co-authored-by: Chenxi Yang <[email protected]> Co-authored-by: Chenxi Yang <[email protected]> Co-authored-by: Tyler Michael Smith <[email protected]> Co-authored-by: Weixiao Huang <[email protected]> Co-authored-by: Raghav Ravishankar <[email protected]> Co-authored-by: ericehanley <[email protected]> Co-authored-by: Zhonghua Deng <[email protected]> Co-authored-by: Po-Han Huang (NVIDIA) <[email protected]> Co-authored-by: PiteXChen <[email protected]> Co-authored-by: lkchen <[email protected]> Co-authored-by: TJian <[email protected]> Co-authored-by: Gregory Shtrasberg <[email protected]> Co-authored-by: tlipoca9 <[email protected]> Co-authored-by: elvischenv <[email protected]> Co-authored-by: wang.yuqi <[email protected]> Co-authored-by: Benji Beck <[email protected]> Co-authored-by: youkaichao <[email protected]> Co-authored-by: Siyuan Liu <[email protected]> Co-authored-by: Benjamin Chislett <[email protected]> Co-authored-by: LiuXiaoxuanPKU <[email protected]> Co-authored-by: simon-mo <[email protected]> Co-authored-by: Chen Zhang <[email protected]> Co-authored-by: Hongxia Yang <[email protected]> Co-authored-by: Minseok Lee <[email protected]> Co-authored-by: Yongye Zhu <[email protected]> Co-authored-by: Lucas Wilkinson <[email protected]> Co-authored-by: Zhang Jason <[email protected]> Co-authored-by: Asaf Joseph Gardin <[email protected]> Co-authored-by: asafg <[email protected]> Co-authored-by: Lain <[email protected]> Co-authored-by: tc-mb <[email protected]> Co-authored-by: imning3 <[email protected]> Co-authored-by: Maximilien de Bayser <[email protected]> Co-authored-by: Kunshang Ji <[email protected]> Co-authored-by: Tao He <[email protected]> Co-authored-by: qscqesze <[email protected]> Co-authored-by: Syed Muhammad Bin Asif <[email protected]> Co-authored-by: Lionel Villard <[email protected]> Co-authored-by: WeiQing Chen <[email protected]> Co-authored-by: ycyaw66 <[email protected]> Co-authored-by: Moritz Sanft <[email protected]> Co-authored-by: Ming Yang <[email protected]> Co-authored-by: Adrián García García <[email protected]> Co-authored-by: Michael Goin <[email protected]> Co-authored-by: JaceyShao <[email protected]> Co-authored-by: shaojunqi <[email protected]> Co-authored-by: Ricardo Decal <[email protected]> Co-authored-by: Andrew Chan <[email protected]> Co-authored-by: fxmarty-amd <[email protected]> Co-authored-by: Andrew Sansom <[email protected]> Co-authored-by: Zhiyu <[email protected]> Co-authored-by: Shu Wang <[email protected]> Co-authored-by: XIn Li <[email protected]> Co-authored-by: Junhao Li <[email protected]> Co-authored-by: Chauncey <[email protected]> Co-authored-by: iAmir97 <[email protected]> Co-authored-by: iAmir97 <[email protected]> Co-authored-by: Hong Hanh <[email protected]> Co-authored-by: Daniel Serebrenik <[email protected]> Co-authored-by: yewentao256 <[email protected]> Co-authored-by: Guy Stone 
<[email protected]> Co-authored-by: yyweiss <[email protected]> Co-authored-by: Pradyun92 <[email protected]> Co-authored-by: Pradyun Ramadorai <[email protected]> Co-authored-by: Nicolò Lucchesi <[email protected]>
1 parent: ab9f2cf · commit: 33c63e9

34 files changed: +1126 lines, −322 lines

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -351,6 +351,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set_gencode_flags_for_srcs(
       SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
       CUDA_ARCHS "${MARLIN_ARCHS}")
+    set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
 
     list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
 
@@ -364,7 +366,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set_gencode_flags_for_srcs(
       SRCS "${MARLIN_SRCS}"
       CUDA_ARCHS "${MARLIN_ARCHS}")
+    set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
     list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
+
     message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
   else()
     message(STATUS "Not building Marlin kernels as no compatible archs found"
@@ -854,6 +859,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set_gencode_flags_for_srcs(
       SRCS "${MOE_WNAA16_MARLIN_SRC}"
       CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+    set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
 
     list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
 
benchmarks/kernels/benchmark_machete.py

Lines changed: 1 addition & 0 deletions
@@ -236,6 +236,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
         a=bt.a,
         c=None,
         b_q_weight=w_q,
+        b_bias=None,
         b_scales=w_s,
         global_scale=None,
         b_zeros=w_zp,
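
For context, `b_bias` is a new optional argument here and the benchmark simply passes `None`. A caller that does want a fused bias would presumably supply a per-output-channel tensor in the activation dtype, roughly as sketched below (the shape and dtype are assumptions for illustration, not taken from this diff):

```python
import torch

# Hypothetical bias for a layer with n = 4096 output channels; assumed to be
# a 1-D tensor in the activation dtype, broadcast over the batch dimension.
n = 4096
bias = torch.randn(n, dtype=torch.float16)

# In the call above, one would then pass b_bias=bias instead of b_bias=None.
```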

csrc/core/scalar_type.hpp

Lines changed: 2 additions & 0 deletions
@@ -321,6 +321,8 @@ static inline constexpr auto kFE3M2f =
     ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
 static inline constexpr auto kFE4M3fn =
     ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
+static inline constexpr auto kFE8M0fnu =
+    ScalarType(8, 0, false, 0, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
 static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2);
 static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7);
 static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10);
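
The new `kFE8M0fnu` entry describes the E8M0 type used for MXFP4 block scales: 8 exponent bits, no mantissa, unsigned, with a single NaN encoding. As a quick illustration of what such a scale byte means (a sketch of the standard OCP MX E8M0 encoding, not code from this commit):

```python
# Sketch of OCP MX E8M0 decoding (illustration only, not part of this commit).
# E8M0 is an unsigned, exponent-only format: value = 2**(byte - 127),
# and the all-ones byte 0xFF is reserved for NaN. MXFP4 pairs one such scale
# with each block of 32 FP4 (E2M1) values.
def decode_e8m0(byte: int) -> float:
    assert 0 <= byte <= 0xFF
    if byte == 0xFF:
        return float("nan")
    return 2.0 ** (byte - 127)

print(decode_e8m0(127))  # 1.0
print(decode_e8m0(132))  # 32.0
```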

csrc/moe/marlin_moe_wna16/generate_kernels.py

Lines changed: 15 additions & 0 deletions
@@ -20,6 +20,7 @@
 TEMPLATE = ("template __global__ void Marlin<"
             "{{scalar_t}}, "
             "{{w_type_id}}, "
+            "{{s_type_id}}, "
             "{{threads}}, "
             "{{thread_m_blocks}}, "
             "{{thread_n_blocks}}, "
@@ -77,6 +78,7 @@ def generate_new_kernels():
             if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
                 continue
             # nvfp4 only supports group_size == 16
+            # mxfp4 only supports group_size == 32
             if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
                 continue
             # other quantization methods don't support group_size = 16
@@ -89,9 +91,22 @@ def generate_new_kernels():
 
             c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
 
+            if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
+                s_type = "vllm::kFE4M3fn"
+            elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
+                s_type = "vllm::kFE8M0fnu"
+                if dtype == "fp16":
+                    # we cannot safely dequantize e8m0 to fp16, so skip this
+                    continue
+            elif dtype == "fp16":
+                s_type = "vllm::kFloat16"
+            elif dtype == "bf16":
+                s_type = "vllm::kBFloat16"
+
             template_str = jinja2.Template(TEMPLATE).render(
                 scalar_t=c_dtype,
                 w_type_id=scalar_type + ".id()",
+                s_type_id=s_type + ".id()",
                 threads=threads,
                 thread_m_blocks=max(m_blocks, 1),
                 thread_n_blocks=n_blocks,
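
Read together with the comments in the hunk above, the new branch picks the scale ScalarType from the weight type and group size (group_blocks appears to correspond to group_size / 16, so nvfp4's group_size 16 maps to group_blocks == 1 and mxfp4's group_size 32 to group_blocks == 2). A small sketch restating that selection, mirroring the generator logic rather than copying it:

```python
# Illustration of the scale-type selection added above; group_size is assumed
# to equal group_blocks * 16, matching the nvfp4/mxfp4 comments in the diff.
def select_scale_type(scalar_type, group_blocks, dtype):
    if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
        return "vllm::kFE4M3fn"   # nvfp4: FP8 (E4M3) scales per 16-element group
    if scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
        if dtype == "fp16":
            return None           # e8m0 cannot be safely dequantized to fp16; kernel skipped
        return "vllm::kFE8M0fnu"  # mxfp4: E8M0 scales per 32-element group
    # all other quantization methods keep scales in the compute dtype
    return "vllm::kFloat16" if dtype == "fp16" else "vllm::kBFloat16"

print(select_scale_type("vllm::kFE2M1f", 2, "bf16"))  # vllm::kFE8M0fnu
```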

csrc/moe/marlin_moe_wna16/kernel.h

Lines changed: 14 additions & 12 deletions
@@ -7,23 +7,25 @@
 #include "quantization/gptq_marlin/marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"
 
-#define MARLIN_KERNEL_PARAMS                                          \
-  const int4 *__restrict__ A, const int4 *__restrict__ B,             \
-      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                 \
-      const int4 *__restrict__ scales_ptr,                            \
-      const uint16_t *__restrict__ scale2_ptr,                        \
-      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
-      const int32_t *__restrict__ sorted_token_ids_ptr,               \
-      const int32_t *__restrict__ expert_ids_ptr,                     \
-      const int32_t *__restrict__ num_tokens_past_padded_ptr,         \
-      const float *__restrict__ topk_weights_ptr, int top_k,          \
-      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,  \
-      int prob_n, int prob_k, int *locks, bool use_atomic_add,        \
+#define MARLIN_KERNEL_PARAMS                                                  \
+  const int4 *__restrict__ A, const int4 *__restrict__ B,                     \
+      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                         \
+      const int4 *__restrict__ b_bias_ptr,                                    \
+      const int4 *__restrict__ scales_ptr,                                    \
+      const uint16_t *__restrict__ scale2_ptr,                                \
+      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,         \
+      const int32_t *__restrict__ sorted_token_ids_ptr,                       \
+      const int32_t *__restrict__ expert_ids_ptr,                             \
+      const int32_t *__restrict__ num_tokens_past_padded_ptr,                 \
+      const float *__restrict__ topk_weights_ptr, int top_k,                  \
+      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,          \
+      int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
       bool use_fp32_reduce, int max_shared_mem
 
 namespace MARLIN_NAMESPACE_NAME {
 template <typename scalar_t,  // compute dtype, half or nv_float16
           const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // weight scale ScalarType id
           const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the