Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 19 additions & 6 deletions tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import threading
from contextlib import nullcontext
from multiprocessing import resource_tracker, shared_memory
Expand Down Expand Up @@ -173,9 +174,20 @@ def finalize_layer_weights(self):
total_size += aligned_size

shm_name = self.get_shared_memory_name()
shm = shared_memory.SharedMemory(name=shm_name,
create=True,
size=total_size)
try:
shm = shared_memory.SharedMemory(name=shm_name,
create=True,
size=total_size)
except FileExistsError:
tensorrt_llm.logger.warning(
                f'Found existing EPLB shared memory name: {shm_name}, unlinking...'
)
existing_shm = shared_memory.SharedMemory(name=shm_name)
existing_shm.close()
existing_shm.unlink()
shm = shared_memory.SharedMemory(name=shm_name,
create=True,
size=total_size)
self.own_shm = shm

offset = 0
Expand Down Expand Up @@ -670,15 +682,15 @@ def __init__(self,
ep_rank: int,
ep_size: int,
layer_updates_per_iter: int,
shared_memory_base_name: str = 'moe_shared'):
shared_memory_base_name: Optional[str] = None):
"""
Initialize a MoeLoadBalancer instance.

Args:
ep_rank: The rank of the current process in expert parallelism
ep_size: The total number of processes in expert parallelism
layer_updates_per_iter: The number of layers to update per iteration
shared_memory_base_name: Shared memory base name
            shared_memory_base_name: Shared memory base name; defaults to 'moe_shared' if None
"""
self.is_shutdown = True
self.ep_rank = ep_rank
Expand All @@ -688,7 +700,8 @@ def __init__(self,
layer_updates_per_iter)
self._previous_balancer = None
self.single_layer_load_balancers = []
self.shared_memory_base_name = shared_memory_base_name
self.shared_memory_base_name = shared_memory_base_name or os.getenv(
'TRTLLM_EPLB_SHM_NAME', 'moe_shared')
self._setup_mpi_comm()
self.is_shutdown = False

Expand Down
4 changes: 2 additions & 2 deletions tests/integration/test_lists/test-db/l0_gb200.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ l0_gb200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
- condition:
ranges:
system_gpu_count:
Expand Down Expand Up @@ -64,5 +66,3 @@ l0_gb200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]