
Commit 60ac144

Validate quantization (#315)
* Validate quantization
* comments

1 parent 65afe0a · commit 60ac144

5 files changed: +83 -4 lines changed

clients/python/llmengine/model.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def create(
 
            num_shards (`int`):
                Number of shards for the LLM. When bigger than 1, LLM will be sharded
-                to multiple GPUs. Number of GPUs must be larger than num_shards.
+                to multiple GPUs. Number of GPUs must be equal or larger than num_shards.
                Only affects behavior for text-generation-inference models
 
            quantize (`Optional[Quantization]`):
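
For context, a minimal, hypothetical sketch of a client call that satisfies the relaxed constraint (num_shards equal to gpus) and requests a quantization this commit marks as supported for text-generation-inference. Every keyword argument and value below is illustrative; the names mirror the request fields used in the conftest fixtures later in this commit, and the exact Model.create() signature should be checked against the client docs rather than taken from this sketch.

import os

from llmengine import Model  # llm-engine Python client; create() is the method whose docstring is edited above

os.environ.setdefault("SCALE_API_KEY", "...")  # placeholder credential, not part of this diff

# Hypothetical endpoint creation: num_shards (2) does not exceed gpus (2), and
# "bitsandbytes" is the quantization this commit allows for text-generation-inference.
# Model name, image tag, and resource values are placeholders.
response = Model.create(
    name="llama-2-7b-4bit",
    model="llama-2-7b",
    inference_framework_image_tag="latest",
    num_shards=2,
    gpus=2,                    # must be equal to or larger than num_shards
    quantize="bitsandbytes",   # string form assumed to coerce to the documented Quantization enum
    cpus=8,
    memory="24Gi",
    storage="40Gi",
    min_workers=1,
    max_workers=1,
    per_worker=1,
    endpoint_type="streaming",
)
print(response)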

model-engine/model_engine_server/common/env_vars.py

Lines changed: 2 additions & 1 deletion
@@ -2,6 +2,7 @@
 A place for defining, setting, and referencing all environment variables used in Launch.
 """
 import os
+import sys
 from typing import Optional, Sequence
 
 from model_engine_server.common.constants import PROJECT_ROOT
@@ -73,5 +74,5 @@ def get_boolean_env_var(name: str) -> bool:
     logger.warning("LOCAL development & testing mode is ON")
 
 GIT_TAG: str = os.environ.get("GIT_TAG", "GIT_TAG_NOT_FOUND")
-if GIT_TAG == "GIT_TAG_NOT_FOUND":
+if GIT_TAG == "GIT_TAG_NOT_FOUND" and "pytest" not in sys.modules:
     raise ValueError("GIT_TAG environment variable must be set")
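
The new condition works because pytest has already imported itself before it imports any module under test, so its presence in sys.modules marks a test run and the import-time GIT_TAG requirement can be skipped. A tiny standalone illustration of the same trick (running_under_pytest is a hypothetical helper, not part of this codebase):

import sys

def running_under_pytest() -> bool:
    # pytest is present in sys.modules by the time it imports modules under test,
    # so import-time guards like the GIT_TAG check above can be relaxed during tests.
    return "pytest" in sys.modules

if __name__ == "__main__":
    # Prints False when run as a script; returns True inside a pytest session.
    print(running_under_pytest())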

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 23 additions & 1 deletion
@@ -136,6 +136,13 @@
     },
 }
 
+_SUPPORTED_QUANTIZATIONS: Dict[LLMInferenceFramework, List[Quantization]] = {
+    LLMInferenceFramework.DEEPSPEED: [],
+    LLMInferenceFramework.TEXT_GENERATION_INFERENCE: [Quantization.BITSANDBYTES],
+    LLMInferenceFramework.VLLM: [Quantization.AWQ],
+    LLMInferenceFramework.LIGHTLLM: [],
+}
+
 
 NUM_DOWNSTREAM_REQUEST_RETRIES = 80  # has to be high enough so that the retries take the 5 minutes
 DOWNSTREAM_REQUEST_TIMEOUT_SECONDS = 5 * 60  # 5 minutes
@@ -198,8 +205,21 @@ def validate_num_shards(
             raise ObjectHasInvalidValueException("DeepSpeed requires more than 1 GPU.")
         if num_shards != gpus:
             raise ObjectHasInvalidValueException(
-                f"DeepSpeed requires num shard {num_shards} to be the same as number of GPUs {gpus}."
+                f"Num shard {num_shards} must be the same as number of GPUs {gpus} for DeepSpeed."
             )
+    if num_shards > gpus:
+        raise ObjectHasInvalidValueException(
+            f"Num shard {num_shards} must be less than or equal to the number of GPUs {gpus}."
+        )
+
+
+def validate_quantization(
+    quantize: Optional[Quantization], inference_framework: LLMInferenceFramework
+) -> None:
+    if quantize is not None and quantize not in _SUPPORTED_QUANTIZATIONS[inference_framework]:
+        raise ObjectHasInvalidValueException(
+            f"Quantization {quantize} is not supported for inference framework {inference_framework}. Supported quantization types are {_SUPPORTED_QUANTIZATIONS[inference_framework]}."
+        )
 
 
 class CreateLLMModelEndpointV1UseCase:
@@ -667,10 +687,12 @@ async def execute(
         validate_post_inference_hooks(user, request.post_inference_hooks)
         validate_model_name(request.model_name, request.inference_framework)
         validate_num_shards(request.num_shards, request.inference_framework, request.gpus)
+        validate_quantization(request.quantize, request.inference_framework)
 
         if request.inference_framework in [
             LLMInferenceFramework.TEXT_GENERATION_INFERENCE,
             LLMInferenceFramework.VLLM,
+            LLMInferenceFramework.LIGHTLLM,
         ]:
             if request.endpoint_type != ModelEndpointType.STREAMING:
                 raise ObjectHasInvalidValueException(
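
To make the new check concrete, here is a self-contained sketch that mirrors the _SUPPORTED_QUANTIZATIONS lookup and validate_quantization added above. The Framework and Quantization enums below are simplified stand-ins for the real LLMInferenceFramework and Quantization entities, and ValueError stands in for ObjectHasInvalidValueException.

from enum import Enum
from typing import Dict, List, Optional


class Framework(str, Enum):  # stand-in for LLMInferenceFramework
    DEEPSPEED = "deepspeed"
    TEXT_GENERATION_INFERENCE = "text_generation_inference"
    VLLM = "vllm"
    LIGHTLLM = "lightllm"


class Quantization(str, Enum):  # stand-in for the real Quantization enum
    BITSANDBYTES = "bitsandbytes"
    AWQ = "awq"


SUPPORTED: Dict[Framework, List[Quantization]] = {
    Framework.DEEPSPEED: [],
    Framework.TEXT_GENERATION_INFERENCE: [Quantization.BITSANDBYTES],
    Framework.VLLM: [Quantization.AWQ],
    Framework.LIGHTLLM: [],
}


def validate_quantization(quantize: Optional[Quantization], framework: Framework) -> None:
    # No quantization always passes; otherwise the pair must appear in the support table.
    if quantize is not None and quantize not in SUPPORTED[framework]:
        raise ValueError(f"{quantize} is not supported for {framework}")


validate_quantization(None, Framework.DEEPSPEED)         # ok: no quantization requested
validate_quantization(Quantization.AWQ, Framework.VLLM)  # ok: supported pair
try:
    validate_quantization(Quantization.BITSANDBYTES, Framework.VLLM)  # vLLM only supports AWQ
except ValueError as e:
    print(e)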

model-engine/tests/unit/domain/conftest.py

Lines changed: 28 additions & 1 deletion
@@ -18,6 +18,7 @@
 )
 from model_engine_server.domain.entities import (
     GpuType,
+    LLMInferenceFramework,
     ModelBundle,
     ModelBundleEnvironmentParams,
     ModelBundleFrameworkType,
@@ -283,7 +284,6 @@ def create_llm_model_endpoint_text_generation_inference_request_streaming() -> (
         inference_framework="deepspeed",
         inference_framework_image_tag="test_tag",
         num_shards=2,
-        quantize=Quantization.BITSANDBYTES,
         endpoint_type=ModelEndpointType.STREAMING,
         metadata={},
         post_inference_hooks=["billing"],
@@ -356,6 +356,33 @@ def create_llm_model_endpoint_request_invalid_model_name() -> CreateLLMModelEndp
     )
 
 
+@pytest.fixture
+def create_llm_model_endpoint_request_invalid_quantization() -> CreateLLMModelEndpointV1Request:
+    return CreateLLMModelEndpointV1Request(
+        name="test_llm_endpoint_name_1",
+        model_name="nonexist",
+        source="hugging_face",
+        inference_framework=LLMInferenceFramework.VLLM,
+        inference_framework_image_tag="test_tag",
+        num_shards=2,
+        quantize=Quantization.BITSANDBYTES,
+        endpoint_type=ModelEndpointType.SYNC,
+        metadata={},
+        post_inference_hooks=["billing"],
+        cpus=1,
+        gpus=2,
+        memory="8G",
+        gpu_type=GpuType.NVIDIA_TESLA_T4,
+        storage=None,
+        min_workers=1,
+        max_workers=3,
+        per_worker=2,
+        labels={"team": "infra", "product": "my_product"},
+        aws_role="test_aws_role",
+        results_s3_bucket="test_s3_bucket",
+    )
+
+
 @pytest.fixture
 def completion_sync_request() -> CompletionSyncV1Request:
     return CompletionSyncV1Request(
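
Note on the fixtures above: the new one deliberately pairs LLMInferenceFramework.VLLM with Quantization.BITSANDBYTES, which the new _SUPPORTED_QUANTIZATIONS map rejects (vLLM only allows AWQ), hence the "invalid_quantization" name. The earlier fixture, which uses inference_framework="deepspeed", drops its quantize=Quantization.BITSANDBYTES argument for the same reason: under the new map DeepSpeed supports no quantization at all.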

model-engine/tests/unit/domain/test_llm_use_cases.py

Lines changed: 29 additions & 0 deletions
@@ -232,6 +232,35 @@ async def test_create_llm_model_endpoint_use_case_raises_invalid_value_exception
     )
 
 
+@pytest.mark.asyncio
+async def test_create_llm_model_endpoint_use_case_quantization_exception(
+    test_api_key: str,
+    fake_model_bundle_repository,
+    fake_model_endpoint_service,
+    fake_docker_repository_image_always_exists,
+    fake_model_primitive_gateway,
+    fake_llm_artifact_gateway,
+    create_llm_model_endpoint_request_invalid_quantization: CreateLLMModelEndpointV1Request,
+):
+    fake_model_endpoint_service.model_bundle_repository = fake_model_bundle_repository
+    bundle_use_case = CreateModelBundleV2UseCase(
+        model_bundle_repository=fake_model_bundle_repository,
+        docker_repository=fake_docker_repository_image_always_exists,
+        model_primitive_gateway=fake_model_primitive_gateway,
+    )
+    use_case = CreateLLMModelEndpointV1UseCase(
+        create_model_bundle_use_case=bundle_use_case,
+        model_bundle_repository=fake_model_bundle_repository,
+        model_endpoint_service=fake_model_endpoint_service,
+        llm_artifact_gateway=fake_llm_artifact_gateway,
+    )
+    user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
+    with pytest.raises(ObjectHasInvalidValueException):
+        await use_case.execute(
+            user=user, request=create_llm_model_endpoint_request_invalid_quantization
+        )
+
+
 @pytest.mark.asyncio
 async def test_get_llm_model_endpoint_use_case_raises_not_found(
     test_api_key: str,
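
The new test reuses the invalid-quantization fixture from conftest.py and only asserts that execute() raises ObjectHasInvalidValueException before any endpoint is created. It can be run in isolation with pytest's -k filter, e.g. pytest model-engine/tests/unit/domain/test_llm_use_cases.py -k quantization_exception (the invocation is illustrative; -k simply matches on the test name).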
