
Commit d2d4d10

llama should have None max length (#348)
1 parent: f894c10

File tree

1 file changed (+3, −3)

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 3 additions & 3 deletions
@@ -152,14 +152,14 @@
 # We need a dict where if we need to override we can
 # NOTE: These are in *descending* order of priority. e.g. if you see 'mammoth-coder'
 # you'll use that override and not listen to the 'llama-2' override
-_VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, int]] = {
+_VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, Optional[int]]] = {
     "mammoth-coder": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
     # Based on config here: https://huggingface.co/TIGER-Lab/MAmmoTH-Coder-7B/blob/main/config.json#L12
     # Can also see 13B, 34B there too
     "code-llama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
     # Based on config here: https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json#L12
     # Can also see 13B, 34B there too
-    "llama-2": {"max_model_len": 4096, "max_num_batched_tokens": 4096},
+    "llama-2": {"max_model_len": None, "max_num_batched_tokens": 4096},
     "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
 }
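Widening the value type to Optional[int] lets an override leave max_model_len unset, so vLLM can derive the context length from the model's own config instead of pinning llama-2 to a hard-coded 4096. The loop that consumes this dict appears in the second hunk below, but its body is outside the diff; here is a minimal standalone sketch of how such a priority lookup could work, assuming substring matching on the model name and first-match-wins, consistent with the "descending order of priority" NOTE above (resolve_lengths and model_name are hypothetical names, not from the source file):

from typing import Dict, Optional, Tuple

_VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, Optional[int]]] = {
    "mammoth-coder": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
    "code-llama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
    "llama-2": {"max_model_len": None, "max_num_batched_tokens": 4096},
    "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
}

def resolve_lengths(model_name: str) -> Tuple[Optional[int], Optional[int]]:
    max_num_batched_tokens: Optional[int] = 2560  # vLLM's default
    max_model_len: Optional[int] = None
    # Dicts preserve insertion order (Python 3.7+), so iterating gives
    # descending priority: "mammoth-coder" matches before "llama-2" for
    # a name like "mammoth-coder-llama-2-7b".
    for key, value in _VLLM_MODEL_LENGTH_OVERRIDES.items():
        if key in model_name:
            max_model_len = value["max_model_len"]
            max_num_batched_tokens = value["max_num_batched_tokens"]
            break  # first match wins (assumption; actual loop body not shown)
    return max_model_len, max_num_batched_tokens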

@@ -534,7 +534,7 @@ async def create_vllm_bundle(
 ):
     command = []

-    max_num_batched_tokens: int = 2560  # vLLM's default
+    max_num_batched_tokens: Optional[int] = 2560  # vLLM's default
     max_model_len: Optional[int] = None

     for key, value in _VLLM_MODEL_LENGTH_OVERRIDES.items():
