Skip to content

Commit 9eecdf2

Browse files
authored
[TRTLLM-7008][fix] cherrypick fix to 1.0 Add automatic shared memory delete if it already exists (#7433)
Signed-off-by: Dongxu Yang <[email protected]>
1 parent 95e0318 commit 9eecdf2

File tree

2 files changed

+21
-8
lines changed

2 files changed

+21
-8
lines changed

tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import threading
23
from contextlib import nullcontext
34
from multiprocessing import resource_tracker, shared_memory
@@ -173,9 +174,20 @@ def finalize_layer_weights(self):
173174
total_size += aligned_size
174175

175176
shm_name = self.get_shared_memory_name()
176-
shm = shared_memory.SharedMemory(name=shm_name,
177-
create=True,
178-
size=total_size)
177+
try:
178+
shm = shared_memory.SharedMemory(name=shm_name,
179+
create=True,
180+
size=total_size)
181+
except FileExistsError:
182+
tensorrt_llm.logger.warning(
183+
f'Found exist EPLB shared memory name: {shm_name}, unlinking...'
184+
)
185+
existing_shm = shared_memory.SharedMemory(name=shm_name)
186+
existing_shm.close()
187+
existing_shm.unlink()
188+
shm = shared_memory.SharedMemory(name=shm_name,
189+
create=True,
190+
size=total_size)
179191
self.own_shm = shm
180192

181193
offset = 0
@@ -670,15 +682,15 @@ def __init__(self,
670682
ep_rank: int,
671683
ep_size: int,
672684
layer_updates_per_iter: int,
673-
shared_memory_base_name: str = 'moe_shared'):
685+
shared_memory_base_name: Optional[str] = None):
674686
"""
675687
Initialize a MoeLoadBalancer instance.
676688
677689
Args:
678690
ep_rank: The rank of the current process in expert parallelism
679691
ep_size: The total number of processes in expert parallelism
680692
layer_updates_per_iter: The number of layers to update per iteration
681-
shared_memory_base_name: Shared memory base name
693+
shared_memory_base_name: Shared memory base name, will use 'moe_shared' if None
682694
"""
683695
self.is_shutdown = True
684696
self.ep_rank = ep_rank
@@ -688,7 +700,8 @@ def __init__(self,
688700
layer_updates_per_iter)
689701
self._previous_balancer = None
690702
self.single_layer_load_balancers = []
691-
self.shared_memory_base_name = shared_memory_base_name
703+
self.shared_memory_base_name = shared_memory_base_name or os.getenv(
704+
'TRTLLM_EPLB_SHM_NAME', 'moe_shared')
692705
self._setup_mpi_comm()
693706
self.is_shutdown = False
694707

tests/integration/test_lists/test-db/l0_gb200.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ l0_gb200:
3333
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
3434
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
3535
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
36+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
37+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
3638
- condition:
3739
ranges:
3840
system_gpu_count:
@@ -64,5 +66,3 @@ l0_gb200:
6466
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
6567
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
6668
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
67-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
68-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]

0 commit comments

Comments
 (0)