Skip to content

Commit 31f7b45

Browse files
committed
[Deprecation] Mark env vars covered by --moe-backend / --linear-backend
Centralizes deprecation notices in vllm/envs.py via a new `deprecated_env` helper that wraps a getter and emits a `FutureWarning` (visible to end users by default) on first read when the var is explicitly set. Removal target v0.23. Deprecates: - MoE: VLLM_USE_FLASHINFER_MOE_FP8/FP4/FP16, VLLM_FLASHINFER_MOE_BACKEND, VLLM_USE_FLASHINFER_MOE_MXFP4_BF16/MXFP8/MXFP8_CUTLASS, VLLM_MXFP4_USE_MARLIN (covered by --moe-backend and --quantization_config.moe.activation) - Linear: VLLM_USE_FBGEMM, VLLM_USE_NVFP4_CT_EMULATIONS, VLLM_NVFP4_GEMM_BACKEND (covered by --linear-backend; bumps prior v0.21 notices to v0.23 since v0.21 has shipped) Also drops the inline `warnings.warn` calls in vllm/model_executor/kernels/linear/__init__.py since the env-level helper covers them now. Signed-off-by: mgoin <mgoin64@gmail.com>
1 parent 1242196 commit 31f7b45

2 files changed

Lines changed: 111 additions & 53 deletions

File tree

vllm/envs.py

Lines changed: 110 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import sys
99
import tempfile
1010
import uuid
11+
import warnings
1112
from collections.abc import Callable
1213
from typing import TYPE_CHECKING, Any, Literal
1314

@@ -337,6 +338,33 @@ def use_mega_aot_artifact():
337338
return os.environ.get("VLLM_USE_MEGA_AOT_ARTIFACT", default_value) == "1"
338339

339340

341+
def deprecated_env(
    env_name: str,
    replacement: str,
    getter: Callable[[], Any],
    removal_version: str = "v0.23",
) -> Callable[[], Any]:
    """Wrap an env-var getter to emit a FutureWarning when the var is set.

    The returned callable behaves exactly like `getter`, except that the
    first time it is invoked while `env_name` is explicitly present in the
    environment it emits a deprecation notice. The notice is emitted at most
    once per wrapper, so repeated reads do not repeat the warning even when
    the caller does not cache the value or when warning filters such as
    "always" are active.

    FutureWarning is used (rather than DeprecationWarning) so the notice is
    visible to end users by default.

    Args:
        env_name: Name of the deprecated environment variable.
        replacement: Sentence appended to the warning that tells the user
            what to use instead.
        getter: Zero-argument callable that reads and parses the variable.
        removal_version: Version in which the variable is slated for removal.

    Returns:
        A zero-argument callable that returns whatever `getter` returns.
    """
    already_warned = False

    def _read() -> Any:
        nonlocal already_warned
        if not already_warned and env_name in os.environ:
            # Flip the flag before warning so that a filter which turns the
            # warning into an exception does not make every later read fail.
            already_warned = True
            warnings.warn(
                f"{env_name} is deprecated and will be removed in "
                f"{removal_version}. {replacement}",
                FutureWarning,
                stacklevel=2,
            )
        return getter()

    return _read
366+
367+
340368
def env_with_choices(
341369
env_name: str,
342370
default: str | None,
@@ -1229,8 +1257,13 @@ def _get_or_set_default() -> str:
12291257
os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1"
12301258
),
12311259
# Whether to use marlin kernel in mxfp4 quantization method
1232-
"VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
1233-
os.environ.get("VLLM_MXFP4_USE_MARLIN", None)
1260+
# Deprecated: use --moe-backend marlin (MoE) or --linear-backend marlin
1261+
# (linear) instead.
1262+
"VLLM_MXFP4_USE_MARLIN": deprecated_env(
1263+
"VLLM_MXFP4_USE_MARLIN",
1264+
"Use --moe-backend marlin (for MoE) or --linear-backend marlin "
1265+
"(for linear) instead.",
1266+
lambda: maybe_convert_bool(os.environ.get("VLLM_MXFP4_USE_MARLIN", None)),
12341267
),
12351268
# The activation dtype for marlin kernel
12361269
"VLLM_MARLIN_INPUT_DTYPE": env_with_choices(
@@ -1325,37 +1358,63 @@ def _get_or_set_default() -> str:
13251358
int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "1"))
13261359
),
13271360
# Allow use of FlashInfer BF16 MoE kernels for fused moe ops.
1328-
"VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool(
1329-
int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))
1361+
# Deprecated: use --moe-backend to select a kernel explicitly.
1362+
"VLLM_USE_FLASHINFER_MOE_FP16": deprecated_env(
1363+
"VLLM_USE_FLASHINFER_MOE_FP16",
1364+
"Use --moe-backend to select an MoE kernel "
1365+
"(e.g. flashinfer_trtllm, flashinfer_cutlass).",
1366+
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))),
13301367
),
13311368
# Allow use of FlashInfer FP8 MoE kernels for fused moe ops.
1332-
"VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(
1333-
int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))
1369+
# Deprecated: use --moe-backend to select a kernel explicitly.
1370+
"VLLM_USE_FLASHINFER_MOE_FP8": deprecated_env(
1371+
"VLLM_USE_FLASHINFER_MOE_FP8",
1372+
"Use --moe-backend to select an MoE kernel "
1373+
"(e.g. flashinfer_trtllm, flashinfer_cutlass).",
1374+
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))),
13341375
),
13351376
# Allow use of FlashInfer NVFP4 MoE kernels for fused moe ops.
1336-
"VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(
1337-
int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))
1377+
# Deprecated: use --moe-backend to select a kernel explicitly.
1378+
"VLLM_USE_FLASHINFER_MOE_FP4": deprecated_env(
1379+
"VLLM_USE_FLASHINFER_MOE_FP4",
1380+
"Use --moe-backend to select an MoE kernel "
1381+
"(e.g. flashinfer_trtllm, flashinfer_cutlass, flashinfer_cutedsl).",
1382+
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))),
13381383
),
13391384
# Allow use of FlashInfer MxInt4 MoE kernels for fused moe ops.
13401385
"VLLM_USE_FLASHINFER_MOE_INT4": lambda: bool(
13411386
int(os.getenv("VLLM_USE_FLASHINFER_MOE_INT4", "0"))
13421387
),
13431388
# If set to 1, use the FlashInfer
13441389
# MXFP8 (activation) x MXFP4 (weight) MoE backend.
1345-
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool(
1346-
int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))
1390+
# Deprecated: use --moe-backend flashinfer_trtllm combined with
1391+
# --quantization_config.moe.activation mxfp8.
1392+
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": deprecated_env(
1393+
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8",
1394+
"Use --moe-backend flashinfer_trtllm combined with "
1395+
"--quantization_config.moe.activation mxfp8.",
1396+
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),
13471397
),
13481398
# If set to 1, use the FlashInfer CUTLASS backend for
13491399
# MXFP8 (activation) x MXFP4 (weight) MoE.
1350-
# This is separate from the TRTLLMGEN path controlled by
1351-
# VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8.
1352-
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": lambda: bool(
1353-
int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0"))
1400+
# Deprecated: use --moe-backend flashinfer_cutlass combined with
1401+
# --quantization_config.moe.activation mxfp8.
1402+
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": deprecated_env(
1403+
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
1404+
"Use --moe-backend flashinfer_cutlass combined with "
1405+
"--quantization_config.moe.activation mxfp8.",
1406+
lambda: bool(
1407+
int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0"))
1408+
),
13541409
),
13551410
# If set to 1, use the FlashInfer
13561411
# BF16 (activation) x MXFP4 (weight) MoE backend.
1357-
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool(
1358-
int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))
1412+
# Deprecated: use --moe-backend to select a kernel explicitly.
1413+
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": deprecated_env(
1414+
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16",
1415+
"Use --moe-backend to select an MoE kernel "
1416+
"(e.g. flashinfer_trtllm, flashinfer_cutlass).",
1417+
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))),
13591418
),
13601419
# Control the cache size used by the xgrammar compiler. The default
13611420
# of 512 MB should be enough for roughly 1000 JSON schemas.
@@ -1415,10 +1474,16 @@ def _get_or_set_default() -> str:
14151474
# Uses CUTLASS kernels optimized for high-throughput batch inference.
14161475
# - "latency":
14171476
# Uses TensorRT-LLM kernels optimized for low-latency inference.
1418-
"VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
1477+
# Deprecated: pass --moe-backend flashinfer_{trtllm,cutlass,cutedsl} directly.
1478+
"VLLM_FLASHINFER_MOE_BACKEND": deprecated_env(
14191479
"VLLM_FLASHINFER_MOE_BACKEND",
1420-
"latency",
1421-
["throughput", "latency", "masked_gemm"],
1480+
"Pass --moe-backend flashinfer_trtllm, flashinfer_cutlass, or "
1481+
"flashinfer_cutedsl directly instead.",
1482+
env_with_choices(
1483+
"VLLM_FLASHINFER_MOE_BACKEND",
1484+
"latency",
1485+
["throughput", "latency", "masked_gemm"],
1486+
),
14221487
),
14231488
# Override the directory for the FlashInfer autotune config cache.
14241489
"VLLM_FLASHINFER_AUTOTUNE_CACHE_DIR": lambda: os.getenv(
@@ -1500,8 +1565,11 @@ def _get_or_set_default() -> str:
15001565
# Controls whether or not emulations are used for NVFP4
15011566
# generations on machines < 100 for compressed-tensors
15021567
# models
1503-
"VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool(
1504-
int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))
1568+
# Deprecated: use --linear-backend emulation instead.
1569+
"VLLM_USE_NVFP4_CT_EMULATIONS": deprecated_env(
1570+
"VLLM_USE_NVFP4_CT_EMULATIONS",
1571+
"Use --linear-backend emulation instead.",
1572+
lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))),
15051573
),
15061574
# Controls the read mode for the Mori-IO connector
15071575
"VLLM_MORIIO_CONNECTOR_READ_MODE": lambda: (
@@ -1536,17 +1604,22 @@ def _get_or_set_default() -> str:
15361604
# This is only meant for research purposes to run on devices where NVFP4
15371605
# GEMM kernels are not available.
15381606
# - <none>: automatically pick an available backend
1539-
"VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
1607+
# Deprecated: use --linear-backend instead.
1608+
"VLLM_NVFP4_GEMM_BACKEND": deprecated_env(
15401609
"VLLM_NVFP4_GEMM_BACKEND",
1541-
None,
1542-
[
1543-
"flashinfer-cudnn",
1544-
"flashinfer-trtllm",
1545-
"flashinfer-cutlass",
1546-
"cutlass",
1547-
"marlin",
1548-
"emulation",
1549-
],
1610+
"Use --linear-backend instead.",
1611+
env_with_choices(
1612+
"VLLM_NVFP4_GEMM_BACKEND",
1613+
None,
1614+
[
1615+
"flashinfer-cudnn",
1616+
"flashinfer-trtllm",
1617+
"flashinfer-cutlass",
1618+
"cutlass",
1619+
"marlin",
1620+
"emulation",
1621+
],
1622+
),
15501623
),
15511624
# Controls garbage collection during CUDA graph capture.
15521625
# If set to 0 (default), enables GC freezing to speed up capture time.
@@ -1697,7 +1770,12 @@ def _get_or_set_default() -> str:
16971770
# NCCL header path
16981771
"VLLM_NCCL_INCLUDE_PATH": lambda: os.environ.get("VLLM_NCCL_INCLUDE_PATH", None),
16991772
# Flag to enable FBGemm kernels on model execution
1700-
"VLLM_USE_FBGEMM": lambda: bool(int(os.getenv("VLLM_USE_FBGEMM", "0"))),
1773+
# Deprecated: use --linear-backend fbgemm instead.
1774+
"VLLM_USE_FBGEMM": deprecated_env(
1775+
"VLLM_USE_FBGEMM",
1776+
"Use --linear-backend fbgemm instead.",
1777+
lambda: bool(int(os.getenv("VLLM_USE_FBGEMM", "0"))),
1778+
),
17011779
# GC debug config
17021780
# - VLLM_GC_DEBUG=0: disable GC debugger
17031781
# - VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elapsed times

vllm/model_executor/kernels/linear/__init__.py

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
import stability.
1414
"""
1515

16-
import warnings
1716
from typing import TypeVar
1817

1918
import torch
@@ -846,31 +845,12 @@ def init_nvfp4_linear_kernel() -> NvFp4LinearKernel:
846845
force_kernel = EmulationNvFp4LinearKernel
847846
elif linear_backend == "auto":
848847
# Deprecated env-var overrides — only honoured when --linear-backend
849-
# is "auto". Will be removed in v0.21; users should migrate to
850-
# --linear-backend.
848+
# is "auto". Deprecation warnings are emitted from vllm/envs.py.
851849
if envs.VLLM_USE_FBGEMM:
852-
warnings.warn(
853-
"VLLM_USE_FBGEMM is deprecated and will be removed in "
854-
"v0.21. Use --linear-backend fbgemm instead.",
855-
DeprecationWarning,
856-
stacklevel=2,
857-
)
858850
force_kernel = FbgemmNvFp4LinearKernel
859851
elif envs.VLLM_USE_NVFP4_CT_EMULATIONS:
860-
warnings.warn(
861-
"VLLM_USE_NVFP4_CT_EMULATIONS is deprecated and will be "
862-
"removed in v0.21. Use --linear-backend emulation instead.",
863-
DeprecationWarning,
864-
stacklevel=2,
865-
)
866852
force_kernel = EmulationNvFp4LinearKernel
867853
elif envs.VLLM_NVFP4_GEMM_BACKEND is not None:
868-
warnings.warn(
869-
"VLLM_NVFP4_GEMM_BACKEND is deprecated and will be "
870-
"removed in v0.21. Use --linear-backend instead.",
871-
DeprecationWarning,
872-
stacklevel=2,
873-
)
874854
backend_name = envs.VLLM_NVFP4_GEMM_BACKEND
875855
force_kernel = _NVFP4_BACKEND_TO_KERNEL.get(backend_name)
876856
if force_kernel is None:

0 commit comments

Comments
 (0)