1+ import os
12import threading
23from contextlib import nullcontext
34from multiprocessing import resource_tracker , shared_memory
@@ -173,9 +174,20 @@ def finalize_layer_weights(self):
173174 total_size += aligned_size
174175
175176 shm_name = self .get_shared_memory_name ()
176- shm = shared_memory .SharedMemory (name = shm_name ,
177- create = True ,
178- size = total_size )
177+ try :
178+ shm = shared_memory .SharedMemory (name = shm_name ,
179+ create = True ,
180+ size = total_size )
181+ except FileExistsError :
182+ tensorrt_llm .logger .warning (
183+ f'Found exist EPLB shared memory name: { shm_name } , unlinking...'
184+ )
185+ existing_shm = shared_memory .SharedMemory (name = shm_name )
186+ existing_shm .close ()
187+ existing_shm .unlink ()
188+ shm = shared_memory .SharedMemory (name = shm_name ,
189+ create = True ,
190+ size = total_size )
179191 self .own_shm = shm
180192
181193 offset = 0
@@ -670,15 +682,15 @@ def __init__(self,
670682 ep_rank : int ,
671683 ep_size : int ,
672684 layer_updates_per_iter : int ,
673- shared_memory_base_name : str = 'moe_shared' ):
685+ shared_memory_base_name : Optional [ str ] = None ):
674686 """
675687 Initialize a MoeLoadBalancer instance.
676688
677689 Args:
678690 ep_rank: The rank of the current process in expert parallelism
679691 ep_size: The total number of processes in expert parallelism
680692 layer_updates_per_iter: The number of layers to update per iteration
681- shared_memory_base_name: Shared memory base name
693+ shared_memory_base_name: Shared memory base name; if None, falls back to the TRTLLM_EPLB_SHM_NAME environment variable, then to 'moe_shared'
682694 """
683695 self .is_shutdown = True
684696 self .ep_rank = ep_rank
@@ -688,7 +700,8 @@ def __init__(self,
688700 layer_updates_per_iter )
689701 self ._previous_balancer = None
690702 self .single_layer_load_balancers = []
691- self .shared_memory_base_name = shared_memory_base_name
703+ self .shared_memory_base_name = shared_memory_base_name or os .getenv (
704+ 'TRTLLM_EPLB_SHM_NAME' , 'moe_shared' )
692705 self ._setup_mpi_comm ()
693706 self .is_shutdown = False
694707
0 commit comments