```diff
@@ -152,14 +152,14 @@
 # We need a dict where if we need to override we can
 # NOTE: These are in *descending* order of priority. e.g. if you see 'mammoth-coder'
 # you'll use that override and not listen to the 'llama-2' override
-_VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, int]] = {
+_VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, Optional[int]]] = {
     "mammoth-coder": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
     # Based on config here: https://huggingface.co/TIGER-Lab/MAmmoTH-Coder-7B/blob/main/config.json#L12
     # Can also see 13B, 34B there too
     "code-llama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
     # Based on config here: https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json#L12
     # Can also see 13B, 34B there too
-    "llama-2": {"max_model_len": 4096, "max_num_batched_tokens": 4096},
+    "llama-2": {"max_model_len": None, "max_num_batched_tokens": 4096},
     "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
 }
```
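Since Python dicts preserve insertion order, the first key in `_VLLM_MODEL_LENGTH_OVERRIDES` that matches the model name wins, which is what the NOTE about descending priority relies on. A minimal sketch of that first-match-wins lookup, assuming substring matching against the model name (`_lookup_override` and `model_name` are illustrative, not names from this PR):

```python
from typing import Dict, Optional

# Hypothetical helper mirroring the first-match-wins behavior; not the
# actual code in this file.
def _lookup_override(
    model_name: str,
    overrides: Dict[str, Dict[str, Optional[int]]],
) -> Optional[Dict[str, Optional[int]]]:
    for key, value in overrides.items():  # insertion order == priority order
        if key in model_name:
            return value  # highest-priority match wins; stop looking
    return None

# e.g. "mammoth-coder-13b" hits the "mammoth-coder" entry first, so it
# gets the 16384 limits even if a lower-priority key would also match.
```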
```diff
@@ -534,7 +534,7 @@ async def create_vllm_bundle(
 ):
     command = []

-    max_num_batched_tokens: int = 2560  # vLLM's default
+    max_num_batched_tokens: Optional[int] = 2560  # vLLM's default
     max_model_len: Optional[int] = None

     for key, value in _VLLM_MODEL_LENGTH_OVERRIDES.items():
```
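The widening to `Optional[int]` above is what lets an override explicitly clear `max_model_len` (as the llama-2 entry now does), so the bundle can omit that flag and let vLLM infer the length from the model's own config. A sketch of None-aware handling continuing the loop above, under the assumption that overrides are substring-matched and translated into CLI flags (`model_name` and the exact flag handling are assumptions, not this file's code):

```python
for key, value in _VLLM_MODEL_LENGTH_OVERRIDES.items():
    if key in model_name:  # assumed: substring match on the model's name
        max_num_batched_tokens = value["max_num_batched_tokens"]
        max_model_len = value["max_model_len"]
        break  # keys are in descending priority, so stop at the first hit

if max_num_batched_tokens is not None:
    command.append(f"--max-num-batched-tokens {max_num_batched_tokens}")
if max_model_len is not None:
    # None (e.g. llama-2) means: don't pass the flag; vLLM derives the
    # max length from the model's own config instead.
    command.append(f"--max-model-len {max_model_len}")
```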