
Commit 67e6d8c

[Feature] Set prefix caching as default (#3814)
* Set prefix caching as default
* Set prefix caching as default
* Set prefix caching as default
* skip dynamic load scene
* fix kill bug
* fix kill bug
* fix kill bug
* fix
* fix
* fix ci
1 parent de8638b commit 67e6d8c

File tree

fastdeploy/engine/args_utils.py
fastdeploy/engine/engine.py
scripts/coverage_run.sh
tests/model_loader/test_w4a8_model.py
tests/v1/test_schedule_output.py

5 files changed: +23 -8 lines

fastdeploy/engine/args_utils.py (13 additions, 2 deletions)

@@ -14,6 +14,7 @@
 # limitations under the License.
 """
 
+import argparse
 import json
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
@@ -190,7 +191,7 @@ class EngineArgs:
     """
     Flag to indicate whether to use warm-up before inference.
     """
-    enable_prefix_caching: bool = False
+    enable_prefix_caching: bool = True
     """
     Flag to enable prefix caching.
     """
@@ -387,6 +388,16 @@ def __post_init__(self):
         """
         if not self.tokenizer:
             self.tokenizer = self.model
+        if self.splitwise_role == "decode":
+            self.enable_prefix_caching = False
+        if self.speculative_config is not None:
+            self.enable_prefix_caching = False
+        if self.enable_mm:
+            self.enable_prefix_caching = False
+        if not current_platform.is_cuda():
+            self.enable_prefix_caching = False
+        if self.dynamic_load_weight:
+            self.enable_prefix_caching = False
         if self.enable_logprob:
             if self.speculative_config is not None:
                 raise NotImplementedError("Logprob does not support speculation_config.")
@@ -725,7 +736,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         perf_group = parser.add_argument_group("Performance Tuning")
         perf_group.add_argument(
             "--enable-prefix-caching",
-            action="store_true",
+            action=argparse.BooleanOptionalAction,
             default=EngineArgs.enable_prefix_caching,
             help="Flag to enable prefix caching.",
         )
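The CLI switch had to change along with the default: a plain action="store_true" flag can only turn a feature on, so with the default now True there would be no way to disable it. argparse.BooleanOptionalAction (Python 3.9+ standard library) registers a matching --no-enable-prefix-caching switch alongside --enable-prefix-caching. A minimal standalone sketch of that behavior, independent of FastDeploy:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable-prefix-caching",
    action=argparse.BooleanOptionalAction,
    default=True,  # mirrors the new EngineArgs default
    help="Flag to enable prefix caching.",
)

# Passing no flag keeps the default; the auto-generated negative flag disables it.
print(parser.parse_args([]))                              # Namespace(enable_prefix_caching=True)
print(parser.parse_args(["--no-enable-prefix-caching"]))  # Namespace(enable_prefix_caching=False)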

fastdeploy/engine/engine.py (4 additions, 2 deletions)

@@ -369,7 +369,8 @@ def _exit_sub_services(self):
         for p in self.cache_manager_processes:
             llm_logger.info(f"Killing cache manager process {p.pid}")
             try:
-                os.killpg(p.pid, signal.SIGTERM)
+                pgid = os.getpgid(p.pid)
+                os.killpg(pgid, signal.SIGTERM)
             except Exception as e:
                 console_logger.error(
                     f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}"
@@ -381,7 +382,8 @@ def _exit_sub_services(self):
             self.get_profile_block_num_signal.clear()
         if hasattr(self, "worker_proc") and self.worker_proc is not None:
             try:
-                os.killpg(self.worker_proc.pid, signal.SIGTERM)
+                pgid = os.getpgid(self.worker_proc.pid)
+                os.killpg(pgid, signal.SIGTERM)
             except Exception as e:
                 console_logger.error(f"Error extracting sub services: {e}, {str(traceback.format_exc())}")
 
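This is the "fix kill bug" part of the commit: os.killpg() expects a process group id, not a plain pid. Passing a child's pid only works when that pid happens to be its own group leader, so shutdown could fail to signal the cache manager and worker processes. Resolving the group with os.getpgid() first makes the kill reliable. A small self-contained sketch of the pattern on a Unix system (the sleep subprocess is just a stand-in for the engine's child processes, not FastDeploy code):

import os
import signal
import subprocess

# start_new_session=True puts the child into its own process group,
# similar to how long-lived helper processes are typically spawned.
proc = subprocess.Popen(["sleep", "60"], start_new_session=True)

try:
    pgid = os.getpgid(proc.pid)      # resolve the group id first
    os.killpg(pgid, signal.SIGTERM)  # then signal the whole group
except ProcessLookupError:
    pass  # process already exited
proc.wait()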

scripts/coverage_run.sh (1 addition, 0 deletions)

@@ -32,6 +32,7 @@ for file in $TEST_FILES; do
     else
         success_pytest=$((success_pytest+1))
     fi
+    ps -ef | grep "${FD_CACHE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9
 done
 
 ##################################

tests/model_loader/test_w4a8_model.py (2 additions, 1 deletion)

@@ -29,6 +29,7 @@
     [9991, 9992],
 ]
 FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
+FD_CACHE_QUEUE_PORTS = [FD_CACHE_QUEUE_PORT, FD_CACHE_QUEUE_PORT + 1, FD_CACHE_QUEUE_PORT + 2, FD_CACHE_QUEUE_PORT + 3]
 
 
 models = [
@@ -54,7 +55,7 @@ def llm(request):
         max_model_len=8192,
         num_gpu_blocks_override=1024,
         engine_worker_queue_port=FD_ENGINE_QUEUE_PORTS[port_index],
-        cache_queue_port=FD_CACHE_QUEUE_PORT,
+        cache_queue_port=FD_CACHE_QUEUE_PORTS[port_index],
         load_choices="default",
         enable_expert_parallel=True,
     )
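A small illustrative sketch of the port indexing introduced above. The numbers are just the test's defaults, and the rationale (keeping the cache-queue endpoints of consecutive parameterized runs from clashing now that a cache service runs by default) is an inference from the commit, not stated in it:

# Hypothetical standalone illustration, not the test file itself.
FD_CACHE_QUEUE_PORT = 8333
FD_CACHE_QUEUE_PORTS = [FD_CACHE_QUEUE_PORT + i for i in range(4)]

port_index = 1  # chosen per parameterized model/quantization combination
print(FD_CACHE_QUEUE_PORTS[port_index])  # 8334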

tests/v1/test_schedule_output.py (3 additions, 3 deletions)

@@ -30,8 +30,8 @@ def test_normal_schedule():
         max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed"
     )
     req1 = Request.from_dict({"request_id": "req1", "prompt_token_ids": [1] * 3199, "prompt_token_ids_len": 3199})
-    req2 = Request.from_dict({"request_id": "req2", "prompt_token_ids": [1] * 3201, "prompt_token_ids_len": 3201})
-    req3 = Request.from_dict({"request_id": "req3", "prompt_token_ids": [1] * 3200, "prompt_token_ids_len": 3200})
+    req2 = Request.from_dict({"request_id": "req2", "prompt_token_ids": [2] * 3201, "prompt_token_ids_len": 3201})
+    req3 = Request.from_dict({"request_id": "req3", "prompt_token_ids": [3] * 3200, "prompt_token_ids_len": 3200})
     resource_manager_v1.add_request(req1)
     resource_manager_v1.add_request(req2)
     resource_manager_v1.add_request(req3)
@@ -93,7 +93,7 @@ def test_preempted_request():
         max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed"
     )
     req1 = Request.from_dict({"request_id": "req1", "prompt_token_ids": [1] * 3200, "prompt_token_ids_len": 3200})
-    req2 = Request.from_dict({"request_id": "req2", "prompt_token_ids": [1] * 3200, "prompt_token_ids_len": 3200})
+    req2 = Request.from_dict({"request_id": "req2", "prompt_token_ids": [2] * 3200, "prompt_token_ids_len": 3200})
     resource_manager_v1.add_request(req1)
     resource_manager_v1.add_request(req2)
     # step 1
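These test tweaks follow from the new default: with prefix caching enabled, requests built from the same repeated token id share identical token prefixes, so later requests would presumably hit the cache and shift the block counts the scheduler tests assert on. Giving each request its own fill token keeps the prompts prefix-disjoint. A rough, illustrative sketch (not FastDeploy's actual cache code) of why identical token fills collide under block-level prefix caching, where cached KV blocks are keyed by a hash of the token chunk they hold:

import hashlib

BLOCK_SIZE = 64  # assumed block size, for illustration only

def block_keys(token_ids):
    """Hash each full block of token ids, chaining in the previous block's key."""
    keys, prev = [], b""
    for i in range(0, len(token_ids) // BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE):
        chunk = str(token_ids[i:i + BLOCK_SIZE]).encode("utf-8")
        prev = hashlib.sha256(prev + chunk).digest()
        keys.append(prev)
    return keys

# Same fill token: the shorter prompt's full blocks are all reusable by the longer one.
shared = set(block_keys([1] * 3199)) & set(block_keys([1] * 3201))
# Distinct fill tokens: no block is shared, so no cache hits between the requests.
disjoint = set(block_keys([1] * 3199)) & set(block_keys([2] * 3201))
print(len(shared), len(disjoint))  # 49 0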
