Commit a59cf19

Separate Gateway DTO and engine DTO
1 parent 6764c4b commit a59cf19

3 files changed: 35 additions, 11 deletions

model-engine/model_engine_server/common/dtos/llms.py

Lines changed: 24 additions & 4 deletions
@@ -528,17 +528,37 @@ class CreateBatchCompletionsRequest(BaseModel):
     """
     Maximum runtime of the batch inference in seconds. Default to one day.
     """
-    max_gpu_memory_utilization: Optional[float] = Field(default=0.9, le=1.0)
-    """
-    Maximum GPU memory utilization for the batch inference. Default to 90%.
-    """
     tool_config: Optional[ToolConfig] = None
     """
     Configuration for tool use.
     NOTE: this config is highly experimental and signature will change significantly in future iterations.
     """
 
 
+class CreateBatchCompletionsEngineRequest(CreateBatchCompletionsRequest):
+    """
+    Internal model for representing request to the llm engine. This contains additional fields that we want
+    hidden from the DTO exposed to the client.
+    """
+
+    max_gpu_memory_utilization: Optional[float] = Field(default=0.9, le=1.0)
+    """
+    Maximum GPU memory utilization for the batch inference. Default to 90%.
+    """
+
+    @staticmethod
+    def from_api(request: CreateBatchCompletionsRequest) -> "CreateBatchCompletionsEngineRequest":
+        return CreateBatchCompletionsEngineRequest(
+            input_data_path=request.input_data_path,
+            output_data_path=request.output_data_path,
+            content=request.content,
+            model_config=request.model_config,
+            data_parallelism=request.data_parallelism,
+            max_runtime_sec=request.max_runtime_sec,
+            tool_config=request.tool_config,
+        )
+
+
 class CreateBatchCompletionsResponse(BaseModel):
     job_id: str
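For orientation: clients keep sending CreateBatchCompletionsRequest through the gateway, the engine-only knob max_gpu_memory_utilization moves onto the internal subclass, and from_api copies the public fields across at the boundary. A minimal self-contained sketch of the same pattern, using made-up model and field names (PublicBatchRequest, EngineBatchRequest, and the two path fields) rather than the repository's actual classes:

    from typing import Optional

    from pydantic import BaseModel, Field


    class PublicBatchRequest(BaseModel):
        """Gateway-facing DTO: only the fields a client is allowed to set."""

        input_data_path: str
        output_data_path: str


    class EngineBatchRequest(PublicBatchRequest):
        """Internal DTO: inherits the public fields and adds engine-only knobs."""

        max_gpu_memory_utilization: Optional[float] = Field(default=0.9, le=1.0)

        @staticmethod
        def from_api(request: PublicBatchRequest) -> "EngineBatchRequest":
            # Copy the public fields explicitly, mirroring from_api in the diff above.
            return EngineBatchRequest(
                input_data_path=request.input_data_path,
                output_data_path=request.output_data_path,
            )


    api_request = PublicBatchRequest(input_data_path="in.jsonl", output_data_path="out.jsonl")
    engine_request = EngineBatchRequest.from_api(api_request)
    engine_request.max_gpu_memory_utilization = 0.95  # tunable only on the internal model
    print(engine_request.json())

Because the engine model inherits from the public one, the serialized engine request stays a strict superset of what the client sent, so anything that reads the public fields keeps working.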

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 8 additions & 4 deletions
@@ -21,6 +21,7 @@
     CompletionStreamV1Response,
     CompletionSyncV1Request,
     CompletionSyncV1Response,
+    CreateBatchCompletionsEngineRequest,
     CreateBatchCompletionsRequest,
     CreateBatchCompletionsResponse,
     CreateLLMModelEndpointV1Request,
@@ -2330,13 +2331,15 @@ async def create_batch_job_bundle(
         return batch_bundle
 
     async def execute(
-        self, user: User, request: CreateBatchCompletionsRequest
+        self, user: User, _request: CreateBatchCompletionsRequest
     ) -> CreateBatchCompletionsResponse:
-        hardware = infer_hardware_from_model_name(request.model_config.model)
+        hardware = infer_hardware_from_model_name(_request.model_config.model)
         # Reconcile gpus count with num_shards from request
         assert hardware.gpus is not None
-        if request.model_config.num_shards:
-            hardware.gpus = max(hardware.gpus, request.model_config.num_shards)
+        if _request.model_config.num_shards:
+            hardware.gpus = max(hardware.gpus, _request.model_config.num_shards)
+
+        request = CreateBatchCompletionsEngineRequest.from_api(_request)
         request.model_config.num_shards = hardware.gpus
 
         if request.tool_config and request.tool_config.name != "code_evaluator":
@@ -2347,6 +2350,7 @@ async def execute(
         additional_engine_args = infer_addition_engine_args_from_model_name(
             request.model_config.model
         )
+
         if additional_engine_args.gpu_memory_utilization is not None:
             request.max_gpu_memory_utilization = additional_engine_args.gpu_memory_utilization
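In the rewritten execute(), the client-supplied object is only read (hardware inference, shard reconciliation), and everything engine-specific is set on the converted object afterwards. A rough sketch of that ordering, reusing the hypothetical PublicBatchRequest/EngineBatchRequest stand-ins from the earlier sketch; infer_gpu_count is likewise a made-up placeholder, not the repository's infer_hardware_from_model_name:

    def infer_gpu_count(model_name: str) -> int:
        # Placeholder for hardware inference; pretend every model fits on one GPU.
        return 1


    def to_engine_request(api_request: PublicBatchRequest, model_name: str) -> EngineBatchRequest:
        gpus = infer_gpu_count(model_name)  # read-only use of the public DTO
        engine_request = EngineBatchRequest.from_api(api_request)  # convert at the boundary
        # Engine-only tuning happens on the internal model, invisible to API clients.
        engine_request.max_gpu_memory_utilization = 0.9 if gpus == 1 else 0.95
        return engine_request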
model-engine/model_engine_server/inference/batch_inference/vllm_batch.py

Lines changed: 3 additions & 3 deletions
@@ -15,7 +15,7 @@
 from func_timeout import FunctionTimedOut, func_set_timeout
 from model_engine_server.common.dtos.llms import (
     CompletionOutput,
-    CreateBatchCompletionsRequest,
+    CreateBatchCompletionsEngineRequest,
     CreateBatchCompletionsRequestContent,
     TokenOutput,
     ToolConfig,
@@ -145,7 +145,7 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
 
-def get_vllm_engine(model, request: CreateBatchCompletionsRequest):
+def get_vllm_engine(model: str, request: CreateBatchCompletionsEngineRequest):
     from vllm import AsyncEngineArgs, AsyncLLMEngine
 
     engine_args = AsyncEngineArgs(
@@ -313,7 +313,7 @@ def tool_func(text: str, past_context: Optional[str]):
 async def batch_inference():
     job_index = int(os.getenv("JOB_COMPLETION_INDEX", 0))
 
-    request = CreateBatchCompletionsRequest.parse_file(CONFIG_FILE)
+    request = CreateBatchCompletionsEngineRequest.parse_file(CONFIG_FILE)
 
     if request.model_config.checkpoint_path is not None:
         download_model(request.model_config.checkpoint_path, MODEL_WEIGHTS_FOLDER)
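Since the batch worker now parses the engine DTO, whatever gets written to CONFIG_FILE has to be the serialized engine request rather than the public one. A small round-trip sketch with the same hypothetical EngineBatchRequest stand-in and an illustrative file name; parse_file is the pydantic v1-style call the worker already uses:

    from pathlib import Path

    # Producer side (use case): persist the engine DTO for the worker to pick up.
    config_file = Path("batch_config.json")
    engine_request = EngineBatchRequest(input_data_path="in.jsonl", output_data_path="out.jsonl")
    config_file.write_text(engine_request.json())

    # Consumer side (batch worker): parse and validate against the engine schema.
    loaded = EngineBatchRequest.parse_file(config_file)
    assert loaded.max_gpu_memory_utilization == engine_request.max_gpu_memory_utilization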
