
Commit 559762f

[https://nvbugs/5448754][fix] Download HF model for all nodes. (#6824)
Signed-off-by: Yuxian Qiu <[email protected]>
1 parent 860589a commit 559762f

4 files changed: +48 additions, -20 deletions


jenkins/L0_Test.groovy

Lines changed: 3 additions & 9 deletions
@@ -1892,15 +1892,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     ]
     fullSet += SBSASlurmTestConfigs.keySet()
 
-    multiNodesSBSAConfigs = [
-        // Each stage test 1 testcase with 8 GPUs and 2 nodes.
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 6, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 6, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 6, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 6, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 6, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-6": ["gb200-multi-node", "l0_gb200_multi_nodes", 6, 6, 8, 2],
-    ]
+    multiNodesSBSAConfigs = (1..7).collectEntries { i ->
+        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, 7, 8, 2]]
+    }
     fullSet += multiNodesSBSAConfigs.keySet()
 
     if (env.targetArch == AARCH64_TRIPLE) {
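
For reference, the refactor above replaces six hand-written post-merge stage entries with a loop that now generates seven stages and raises the per-stage split count from 6 to 7. Below is a minimal Python rendering of the mapping the Groovy `collectEntries` produces, illustrative only; the field names in the comment are an assumption inferred from the stage names and the removed comment, not taken from the pipeline code.

```python
# Illustrative sketch: the stage map generated by the Groovy refactor above,
# written as a Python dict comprehension for readability.
multi_nodes_sbsa_configs = {
    f"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-{i}":
    # Assumed field order: (queue/platform, test-db list, split id,
    # split count, GPUs per stage, nodes per stage).
    ("gb200-multi-node", "l0_gb200_multi_nodes", i, 7, 8, 2)
    for i in range(1, 8)
}

if __name__ == "__main__":
    for stage, cfg in multi_nodes_sbsa_configs.items():
        print(stage, cfg)
```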

tensorrt_llm/llmapi/llm_utils.py

Lines changed: 30 additions & 6 deletions
@@ -7,13 +7,13 @@
 import weakref
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 
 import torch
 from tqdm import tqdm
 
-from .._utils import (global_mpi_rank, mpi_barrier, mpi_broadcast, mpi_rank,
-                      release_gc)
+from .._utils import (global_mpi_rank, local_mpi_rank, mpi_barrier,
+                      mpi_broadcast, mpi_rank, release_gc)
 from ..auto_parallel import AutoParallelConfig
 # yapf: disable
 from ..bindings.executor import (BatchingType, CapacitySchedulerPolicy,
@@ -607,6 +607,17 @@ def workspace(self) -> Path:
             self._workspace, tempfile.TemporaryDirectory) else Path(
                 self._workspace)
 
+    def _submit_to_all_workers(
+        self,
+        task: Callable[..., Any],
+        *args,
+        **kwargs,
+    ) -> List[Any]:
+        if self.llm_args.parallel_config.is_multi_gpu:
+            return self.mpi_session.submit_sync(task, *args, **kwargs)
+        else:
+            return [task(*args, **kwargs)]
+
     def __call__(self) -> Tuple[Path, Union[Path, None]]:
 
         if self.llm_args.model_format is _ModelFormatKind.TLLM_ENGINE:
@@ -627,9 +638,11 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]:
                 f'backend {self.llm_args.backend} is not supported.')
 
         if self.model_loader.model_obj.is_hub_model:
-            self._hf_model_dir = download_hf_model(
-                self.model_loader.model_obj.model_name,
-                self.llm_args.revision)
+            hf_model_dirs = self._submit_to_all_workers(
+                CachedModelLoader._node_download_hf_model,
+                model=self.model_loader.model_obj.model_name,
+                revision=self.llm_args.revision)
+            self._hf_model_dir = hf_model_dirs[0]
         else:
             self._hf_model_dir = self.model_loader.model_obj.model_dir
 
@@ -806,6 +819,17 @@ def build_task(engine_dir: Path):
 
         return self.get_engine_dir()
 
+    @print_traceback_on_error
+    @staticmethod
+    def _node_download_hf_model(
+        model: str,
+        revision: Optional[str] = None,
+    ) -> Optional[Path]:
+        if local_mpi_rank() == 0:
+            return download_hf_model(model, revision)
+        else:
+            return None
+
     @print_traceback_on_error
     @staticmethod
     def _node_build_task(
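
The substantive change is that the Hugging Face download is no longer performed only on the driver process: `_submit_to_all_workers` fans the task out through the MPI session when running multi-GPU, `_node_download_hf_model` lets only local rank 0 of each node hit the network, and the driver keeps the first returned path. Below is a minimal, self-contained sketch of that pattern; the thread pool and the hard-coded rank table stand in for `mpi_session.submit_sync` and `local_mpi_rank()`, and `download_hf_model` is stubbed so the example runs without network access.

```python
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any, Callable, List, Optional

# Assumed layout for illustration: 2 nodes x 2 ranks -> (global_rank, local_rank).
WORLD = [(0, 0), (1, 1), (2, 0), (3, 1)]


def download_hf_model(model: str, revision: Optional[str] = None) -> Path:
    # Stub for the real helper, which downloads the checkpoint into the local
    # HF cache and returns the snapshot directory.
    return Path("/tmp/hf-cache") / model.replace("/", "--")


def node_download_hf_model(local_rank: int,
                           model: str,
                           revision: Optional[str] = None) -> Optional[Path]:
    # Mirrors CachedModelLoader._node_download_hf_model: only local rank 0 of
    # each node downloads; the other ranks return None. (The real method reads
    # local_mpi_rank() instead of taking the rank as an argument.)
    if local_rank == 0:
        return download_hf_model(model, revision)
    return None


def submit_to_all_workers(task: Callable[..., Any], *args,
                          **kwargs) -> List[Any]:
    # Stand-in for mpi_session.submit_sync: run the task once per rank and
    # return the results in rank order.
    with ThreadPoolExecutor(max_workers=len(WORLD)) as pool:
        futures = [
            pool.submit(task, local_rank, *args, **kwargs)
            for _, local_rank in WORLD
        ]
        return [f.result() for f in futures]


if __name__ == "__main__":
    hf_model_dirs = submit_to_all_workers(node_download_hf_model,
                                          "Qwen/Qwen3-8B")
    # Global rank 0 is local rank 0 on its node, so the first entry is a path.
    print(hf_model_dirs)
    print("driver uses:", hf_model_dirs[0])
```

Presumably this is what the commit title refers to: with only a driver-side download, ranks on other nodes would not find the checkpoint on their local disk when loading a hub model.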

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 14 additions & 5 deletions
@@ -2028,16 +2028,25 @@ def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
         task.evaluate(llm)
 
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler",
-        [(1, 1, 1, False, True, True)],
-        ids=["latency"])
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,is_cached",
+        [(1, 1, 1, False, True, True, True),
+         pytest.param(8,
+                      1,
+                      1,
+                      False,
+                      True,
+                      True,
+                      False,
+                      marks=pytest.mark.skip_less_mpi_world_size(8))],
+        ids=["latency", "multi_gpus_no_cache"])
     def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                  overlap_scheduler):
+                  overlap_scheduler, is_cached):
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B",
+        with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B"
+                 if is_cached else "Qwen/Qwen3-8B",
                  tensor_parallel_size=tp_size,
                  pipeline_parallel_size=pp_size,
                  moe_expert_parallel_size=ep_size,
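
The new `multi_gpus_no_cache` variant passes the hub ID instead of a path under the local model cache, so it exercises the download path added in llm_utils.py on an 8-rank run. A small sketch of how the `is_cached` parameter resolves the model argument; `llm_models_root()` is stubbed here, and the cache root path is an assumption made purely for illustration.

```python
# Illustrative only: how the is_cached parameter in the test above selects the
# model source passed to LLM(...).
def llm_models_root() -> str:
    # Stub; assumed cache root for illustration.
    return "/scratch/llm-models"


def resolve_model(is_cached: bool) -> str:
    # The cached run reuses a pre-downloaded checkpoint; multi_gpus_no_cache
    # passes the Hugging Face hub ID so every node has to download it.
    return (f"{llm_models_root()}/Qwen3/Qwen3-8B"
            if is_cached else "Qwen/Qwen3-8B")


if __name__ == "__main__":
    print(resolve_model(True))   # .../Qwen3/Qwen3-8B from the local cache
    print(resolve_model(False))  # Qwen/Qwen3-8B, fetched from the hub
```

The uncached case is the one scheduled on the GB200 two-node stages by the l0_gb200_multi_nodes.yml entry added below.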

tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 1 addition & 0 deletions
@@ -19,3 +19,4 @@ l0_gb200_multi_nodes:
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180)
+- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180)
