|
8 | 8 | import sys |
9 | 9 | import tempfile |
10 | 10 | import uuid |
| 11 | +import warnings |
11 | 12 | from collections.abc import Callable |
12 | 13 | from typing import TYPE_CHECKING, Any, Literal |
13 | 14 |
|
@@ -337,6 +338,33 @@ def use_mega_aot_artifact(): |
337 | 338 | return os.environ.get("VLLM_USE_MEGA_AOT_ARTIFACT", default_value) == "1" |
338 | 339 |
|
339 | 340 |
|
def deprecated_env(
    env_name: str,
    replacement: str,
    getter: Callable[[], Any],
    removal_version: str = "v0.23",
) -> Callable[[], Any]:
    """Return a getter that flags `env_name` as deprecated when it is set.

    The returned zero-argument callable delegates to `getter` and returns
    its result unchanged. Immediately before delegating, it checks whether
    `env_name` is a key in `os.environ`; if so, it emits a `FutureWarning`
    naming the variable, the `removal_version`, and the `replacement` hint.
    When the variable is absent, no warning is issued, so relying on the
    default value stays silent.

    Args:
        env_name: Name of the deprecated environment variable.
        replacement: Sentence appended to the warning telling users what to
            use instead.
        getter: Zero-argument callable that produces the variable's value.
        removal_version: Version string quoted in the warning as the
            planned removal release.

    Returns:
        A zero-argument callable wrapping `getter`.
    """
    # The text depends only on the wrapper's arguments, so build it once.
    message = (
        f"{env_name} is deprecated and will be removed in "
        f"{removal_version}. {replacement}"
    )

    def _getter_with_warning() -> Any:
        # Unset variable: behave exactly like the plain getter.
        if env_name not in os.environ:
            return getter()
        # stacklevel=2 attributes the warning to whoever called this
        # wrapper rather than to this line. How often it is displayed
        # on repeated reads is up to the active warning filters.
        warnings.warn(message, FutureWarning, stacklevel=2)
        return getter()

    return _getter_with_warning
| 366 | + |
| 367 | + |
340 | 368 | def env_with_choices( |
341 | 369 | env_name: str, |
342 | 370 | default: str | None, |
@@ -1229,8 +1257,13 @@ def _get_or_set_default() -> str: |
1229 | 1257 | os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1" |
1230 | 1258 | ), |
1231 | 1259 | # Whether to use marlin kernel in mxfp4 quantization method |
1232 | | - "VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool( |
1233 | | - os.environ.get("VLLM_MXFP4_USE_MARLIN", None) |
| 1260 | + # Deprecated: use --moe-backend marlin (MoE) or --linear-backend marlin |
| 1261 | + # (linear) instead. |
| 1262 | + "VLLM_MXFP4_USE_MARLIN": deprecated_env( |
| 1263 | + "VLLM_MXFP4_USE_MARLIN", |
| 1264 | + "Use --moe-backend marlin (for MoE) or --linear-backend marlin " |
| 1265 | + "(for linear) instead.", |
| 1266 | + lambda: maybe_convert_bool(os.environ.get("VLLM_MXFP4_USE_MARLIN", None)), |
1234 | 1267 | ), |
1235 | 1268 | # The activation dtype for marlin kernel |
1236 | 1269 | "VLLM_MARLIN_INPUT_DTYPE": env_with_choices( |
@@ -1325,37 +1358,63 @@ def _get_or_set_default() -> str: |
1325 | 1358 | int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "1")) |
1326 | 1359 | ), |
1327 | 1360 | # Allow use of FlashInfer BF16 MoE kernels for fused moe ops. |
1328 | | - "VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool( |
1329 | | - int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0")) |
| 1361 | + # Deprecated: use --moe-backend to select a kernel explicitly. |
| 1362 | + "VLLM_USE_FLASHINFER_MOE_FP16": deprecated_env( |
| 1363 | + "VLLM_USE_FLASHINFER_MOE_FP16", |
| 1364 | + "Use --moe-backend to select an MoE kernel " |
| 1365 | + "(e.g. flashinfer_trtllm, flashinfer_cutlass).", |
| 1366 | + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))), |
1330 | 1367 | ), |
1331 | 1368 | # Allow use of FlashInfer FP8 MoE kernels for fused moe ops. |
1332 | | - "VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool( |
1333 | | - int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0")) |
| 1369 | + # Deprecated: use --moe-backend to select a kernel explicitly. |
| 1370 | + "VLLM_USE_FLASHINFER_MOE_FP8": deprecated_env( |
| 1371 | + "VLLM_USE_FLASHINFER_MOE_FP8", |
| 1372 | + "Use --moe-backend to select an MoE kernel " |
| 1373 | + "(e.g. flashinfer_trtllm, flashinfer_cutlass).", |
| 1374 | + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))), |
1334 | 1375 | ), |
1335 | 1376 | # Allow use of FlashInfer NVFP4 MoE kernels for fused moe ops. |
1336 | | - "VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool( |
1337 | | - int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0")) |
| 1377 | + # Deprecated: use --moe-backend to select a kernel explicitly. |
| 1378 | + "VLLM_USE_FLASHINFER_MOE_FP4": deprecated_env( |
| 1379 | + "VLLM_USE_FLASHINFER_MOE_FP4", |
| 1380 | + "Use --moe-backend to select an MoE kernel " |
| 1381 | + "(e.g. flashinfer_trtllm, flashinfer_cutlass, flashinfer_cutedsl).", |
| 1382 | + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))), |
1338 | 1383 | ), |
1339 | 1384 | # Allow use of FlashInfer MxInt4 MoE kernels for fused moe ops. |
1340 | 1385 | "VLLM_USE_FLASHINFER_MOE_INT4": lambda: bool( |
1341 | 1386 | int(os.getenv("VLLM_USE_FLASHINFER_MOE_INT4", "0")) |
1342 | 1387 | ), |
1343 | 1388 | # If set to 1, use the FlashInfer |
1344 | 1389 | # MXFP8 (activation) x MXFP4 (weight) MoE backend. |
1345 | | - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool( |
1346 | | - int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0")) |
| 1390 | + # Deprecated: use --moe-backend flashinfer_trtllm combined with |
| 1391 | + # --quantization_config.moe.activation mxfp8. |
| 1392 | + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": deprecated_env( |
| 1393 | + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", |
| 1394 | + "Use --moe-backend flashinfer_trtllm combined with " |
| 1395 | + "--quantization_config.moe.activation mxfp8.", |
| 1396 | + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))), |
1347 | 1397 | ), |
1348 | 1398 | # If set to 1, use the FlashInfer CUTLASS backend for |
1349 | 1399 | # MXFP8 (activation) x MXFP4 (weight) MoE. |
1350 | | - # This is separate from the TRTLLMGEN path controlled by |
1351 | | - # VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8. |
1352 | | - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": lambda: bool( |
1353 | | - int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0")) |
| 1400 | + # Deprecated: use --moe-backend flashinfer_cutlass combined with |
| 1401 | + # --quantization_config.moe.activation mxfp8. |
| 1402 | + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": deprecated_env( |
| 1403 | + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", |
| 1404 | + "Use --moe-backend flashinfer_cutlass combined with " |
| 1405 | + "--quantization_config.moe.activation mxfp8.", |
| 1406 | + lambda: bool( |
| 1407 | + int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0")) |
| 1408 | + ), |
1354 | 1409 | ), |
1355 | 1410 | # If set to 1, use the FlashInfer |
1356 | 1411 | # BF16 (activation) x MXFP4 (weight) MoE backend. |
1357 | | - "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool( |
1358 | | - int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0")) |
| 1412 | + # Deprecated: use --moe-backend to select a kernel explicitly. |
| 1413 | + "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": deprecated_env( |
| 1414 | + "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", |
| 1415 | + "Use --moe-backend to select an MoE kernel " |
| 1416 | + "(e.g. flashinfer_trtllm, flashinfer_cutlass).", |
| 1417 | + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))), |
1359 | 1418 | ), |
1360 | 1419 | # Control the cache size used by the xgrammar compiler. The default |
1361 | 1420 | # of 512 MB should be enough for roughly 1000 JSON schemas. |
@@ -1415,10 +1474,16 @@ def _get_or_set_default() -> str: |
1415 | 1474 | # Uses CUTLASS kernels optimized for high-throughput batch inference. |
1416 | 1475 | # - "latency": |
1417 | 1476 | # Uses TensorRT-LLM kernels optimized for low-latency inference. |
1418 | | - "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices( |
| 1477 | + # Deprecated: pass --moe-backend flashinfer_{trtllm,cutlass,cutedsl} directly. |
| 1478 | + "VLLM_FLASHINFER_MOE_BACKEND": deprecated_env( |
1419 | 1479 | "VLLM_FLASHINFER_MOE_BACKEND", |
1420 | | - "latency", |
1421 | | - ["throughput", "latency", "masked_gemm"], |
| 1480 | + "Pass --moe-backend flashinfer_trtllm, flashinfer_cutlass, or " |
| 1481 | + "flashinfer_cutedsl directly instead.", |
| 1482 | + env_with_choices( |
| 1483 | + "VLLM_FLASHINFER_MOE_BACKEND", |
| 1484 | + "latency", |
| 1485 | + ["throughput", "latency", "masked_gemm"], |
| 1486 | + ), |
1422 | 1487 | ), |
1423 | 1488 | # Override the directory for the FlashInfer autotune config cache. |
1424 | 1489 | "VLLM_FLASHINFER_AUTOTUNE_CACHE_DIR": lambda: os.getenv( |
@@ -1500,8 +1565,11 @@ def _get_or_set_default() -> str: |
1500 | 1565 | # Controls whether or not emulations are used for NVFP4 |
1501 | 1566 | # generations on machines < 100 for compressed-tensors |
1502 | 1567 | # models |
1503 | | - "VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool( |
1504 | | - int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")) |
| 1568 | + # Deprecated: use --linear-backend emulation instead. |
| 1569 | + "VLLM_USE_NVFP4_CT_EMULATIONS": deprecated_env( |
| 1570 | + "VLLM_USE_NVFP4_CT_EMULATIONS", |
| 1571 | + "Use --linear-backend emulation instead.", |
| 1572 | + lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))), |
1505 | 1573 | ), |
1506 | 1574 | # Controls the read mode for the Mori-IO connector |
1507 | 1575 | "VLLM_MORIIO_CONNECTOR_READ_MODE": lambda: ( |
@@ -1536,17 +1604,22 @@ def _get_or_set_default() -> str: |
1536 | 1604 | # This is only meant for research purposes to run on devices where NVFP4 |
1537 | 1605 | # GEMM kernels are not available. |
1538 | 1606 | # - <none>: automatically pick an available backend |
1539 | | - "VLLM_NVFP4_GEMM_BACKEND": env_with_choices( |
| 1607 | + # Deprecated: use --linear-backend instead. |
| 1608 | + "VLLM_NVFP4_GEMM_BACKEND": deprecated_env( |
1540 | 1609 | "VLLM_NVFP4_GEMM_BACKEND", |
1541 | | - None, |
1542 | | - [ |
1543 | | - "flashinfer-cudnn", |
1544 | | - "flashinfer-trtllm", |
1545 | | - "flashinfer-cutlass", |
1546 | | - "cutlass", |
1547 | | - "marlin", |
1548 | | - "emulation", |
1549 | | - ], |
| 1610 | + "Use --linear-backend instead.", |
| 1611 | + env_with_choices( |
| 1612 | + "VLLM_NVFP4_GEMM_BACKEND", |
| 1613 | + None, |
| 1614 | + [ |
| 1615 | + "flashinfer-cudnn", |
| 1616 | + "flashinfer-trtllm", |
| 1617 | + "flashinfer-cutlass", |
| 1618 | + "cutlass", |
| 1619 | + "marlin", |
| 1620 | + "emulation", |
| 1621 | + ], |
| 1622 | + ), |
1550 | 1623 | ), |
1551 | 1624 | # Controls garbage collection during CUDA graph capture. |
1552 | 1625 | # If set to 0 (default), enables GC freezing to speed up capture time. |
@@ -1697,7 +1770,12 @@ def _get_or_set_default() -> str: |
1697 | 1770 | # NCCL header path |
1698 | 1771 | "VLLM_NCCL_INCLUDE_PATH": lambda: os.environ.get("VLLM_NCCL_INCLUDE_PATH", None), |
1699 | 1772 | # Flag to enable FBGemm kernels on model execution |
1700 | | - "VLLM_USE_FBGEMM": lambda: bool(int(os.getenv("VLLM_USE_FBGEMM", "0"))), |
| 1773 | + # Deprecated: use --linear-backend fbgemm instead. |
| 1774 | + "VLLM_USE_FBGEMM": deprecated_env( |
| 1775 | + "VLLM_USE_FBGEMM", |
| 1776 | + "Use --linear-backend fbgemm instead.", |
| 1777 | + lambda: bool(int(os.getenv("VLLM_USE_FBGEMM", "0"))), |
| 1778 | + ), |
1701 | 1779 | # GC debug config |
1702 | 1780 | # - VLLM_GC_DEBUG=0: disable GC debugger |
1703 | 1781 | # - VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elapsed times |
|
0 commit comments