From 09e4e91b3c803350f0ab8905303fa4a844920341 Mon Sep 17 00:00:00 2001 From: carryyu <569782149@qq.com> Date: Wed, 17 Sep 2025 15:24:00 +0800 Subject: [PATCH 1/2] optimize clear_low_latency_buffer when mixed_infer --- .../layers/moe/fused_moe_backend_base.py | 14 +++++++++++--- fastdeploy/worker/dcu_model_runner.py | 1 + fastdeploy/worker/gpu_model_runner.py | 1 + fastdeploy/worker/metax_model_runner.py | 1 + 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py index 2a54748afb..802ca9e0f3 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py @@ -161,12 +161,20 @@ def apply( if layer.ep_size > 1: if layer.fd_config.parallel_config.moe_phase.phase == "prefill": if layer.fd_config.parallel_config.splitwise_role == "mixed": - self.ep_prefill_runner.clean_low_latency_buffer() + if ( + layer.fd_config.parallel_config.moe_phase.last_phase + != layer.fd_config.parallel_config.moe_phase.phase + ): + self.ep_prefill_runner.clean_low_latency_buffer() return self.apply_ep_prefill(layer, x, gate) else: if layer.fd_config.parallel_config.splitwise_role == "mixed": - self.ep_decoder_runner.clean_low_latency_buffer() - return self.apply_ep_prefill(layer, x, gate) + if ( + layer.fd_config.parallel_config.moe_phase.last_phase + != layer.fd_config.parallel_config.moe_phase.phase + ): + self.ep_decoder_runner.clean_low_latency_buffer() + return self.apply_ep_decode(layer, x, gate) else: return self.apply_tp(layer, x, gate) diff --git a/fastdeploy/worker/dcu_model_runner.py b/fastdeploy/worker/dcu_model_runner.py index df01b1bd77..bc61350052 100644 --- a/fastdeploy/worker/dcu_model_runner.py +++ b/fastdeploy/worker/dcu_model_runner.py @@ -68,6 +68,7 @@ def initialize_forward_meta(self): prefill_exists = self.exist_prefill() paddle.distributed.all_gather_object(only_decode_batch_list, not prefill_exists) only_decode_batch = all(only_decode_batch_list) + self.fd_config.parallel_config.moe_phase.last_phase = self.fd_config.parallel_config.moe_phase.phase self.fd_config.parallel_config.moe_phase.phase = "decode" if only_decode_batch else "prefill" self.forward_meta.step_use_cudagraph = ( diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 6da0fb5b91..ae88558c20 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -975,6 +975,7 @@ def initialize_forward_meta(self): prefill_exists = self.exist_prefill() paddle.distributed.all_gather_object(only_decode_batch_list, not prefill_exists) only_decode_batch = all(only_decode_batch_list) + self.fd_config.parallel_config.moe_phase.last_phase = self.fd_config.parallel_config.moe_phase.phase self.fd_config.parallel_config.moe_phase.phase = "decode" if only_decode_batch else "prefill" self.forward_meta.step_use_cudagraph = ( diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index b16d3f984f..11a230b6f4 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -910,6 +910,7 @@ def initialize_forward_meta(self): prefill_exists = self.exist_prefill() paddle.distributed.all_gather_object(only_decode_batch_list, not prefill_exists) only_decode_batch = all(only_decode_batch_list) + self.fd_config.parallel_config.moe_phase.last_phase = self.fd_config.parallel_config.moe_phase.phase self.fd_config.parallel_config.moe_phase.phase = "decode" if only_decode_batch else "prefill" self.forward_meta.step_use_cudagraph = ( From 1f2fefd08b728a80926e30bcf8441eb4ca03a081 Mon Sep 17 00:00:00 2001 From: carryyu <569782149@qq.com> Date: Thu, 18 Sep 2025 17:05:22 +0800 Subject: [PATCH 2/2] optimize clear_low_latency_buffer when mixed_infer --- .../model_executor/layers/moe/fused_moe_backend_base.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py index 802ca9e0f3..0c3c834d69 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py @@ -160,16 +160,10 @@ def apply( """ if layer.ep_size > 1: if layer.fd_config.parallel_config.moe_phase.phase == "prefill": - if layer.fd_config.parallel_config.splitwise_role == "mixed": - if ( - layer.fd_config.parallel_config.moe_phase.last_phase - != layer.fd_config.parallel_config.moe_phase.phase - ): - self.ep_prefill_runner.clean_low_latency_buffer() return self.apply_ep_prefill(layer, x, gate) else: if layer.fd_config.parallel_config.splitwise_role == "mixed": - if ( + if layer.layer_idx == 0 and ( layer.fd_config.parallel_config.moe_phase.last_phase != layer.fd_config.parallel_config.moe_phase.phase ):