2 changes: 1 addition & 1 deletion clients/python/llmengine/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.0.0b33"
__version__ = "0.0.0b34"

import os
from typing import Sequence
20 changes: 20 additions & 0 deletions clients/python/llmengine/completion.py
@@ -47,6 +47,7 @@ async def acreate(
guided_json: Optional[Dict[str, Any]] = None,
guided_regex: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
+guided_grammar: Optional[str] = None,
timeout: int = COMPLETION_TIMEOUT,
stream: bool = False,
) -> Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]:
@@ -118,6 +119,9 @@ async def acreate(
guided_choice (Optional[List[str]]):
If specified, the output will be exactly one of the choices.

+guided_grammar (Optional[str]):
+If specified, the output will follow the context-free grammar provided.
+
timeout (int):
Timeout in seconds. This is the maximum amount of time you are willing to wait for a response.

@@ -218,6 +222,7 @@ async def _acreate_stream(
guided_json=guided_json,
guided_regex=guided_regex,
guided_choice=guided_choice,
+guided_grammar=guided_grammar,
timeout=timeout,
)

@@ -242,6 +247,11 @@ async def _acreate_sync(**kwargs) -> CompletionSyncResponse:
frequency_penalty=frequency_penalty,
top_k=top_k,
top_p=top_p,
+include_stop_str_in_output=include_stop_str_in_output,
+guided_json=guided_json,
+guided_regex=guided_regex,
+guided_choice=guided_choice,
+guided_grammar=guided_grammar,
)

@classmethod
@@ -261,6 +271,7 @@ def create(
guided_json: Optional[Dict[str, Any]] = None,
guided_regex: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
+guided_grammar: Optional[str] = None,
timeout: int = COMPLETION_TIMEOUT,
stream: bool = False,
) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]:
@@ -333,6 +344,9 @@ def create(
guided_choice (Optional[List[str]]):
If specified, the output will be exactly one of the choices.

+guided_grammar (Optional[str]):
+If specified, the output will follow the context-free grammar provided.
+
timeout (int):
Timeout in seconds. This is the maximum amount of time you are willing to wait for a response.

@@ -419,6 +433,11 @@ def _create_stream(**kwargs):
frequency_penalty=frequency_penalty,
top_k=top_k,
top_p=top_p,
+include_stop_str_in_output=include_stop_str_in_output,
+guided_json=guided_json,
+guided_regex=guided_regex,
+guided_choice=guided_choice,
+guided_grammar=guided_grammar,
)

else:
@@ -436,6 +455,7 @@ def _create_stream(**kwargs):
guided_json=guided_json,
guided_regex=guided_regex,
guided_choice=guided_choice,
+guided_grammar=guided_grammar,
).dict()
response = cls.post_sync(
resource_name=f"v1/llm/completions-sync?model_endpoint_name={model}",
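To illustrate the client-facing change, here is a minimal sketch of a call that exercises the new parameter. The model name, prompt, and grammar string are assumptions for illustration; the grammar dialect that is actually accepted depends on the serving backend's guided-decoding support, not on this client change.

from llmengine import Completion

# Illustrative lark-style EBNF grammar that restricts output to "yes" or "no";
# the exact grammar dialect accepted is backend-dependent and assumed here.
YES_NO_GRAMMAR = r'start: "yes" | "no"'

response = Completion.create(
    model="llama-2-7b",  # hypothetical endpoint name
    prompt="Is the sky blue? Answer yes or no: ",
    max_new_tokens=4,
    temperature=0.0,
    guided_grammar=YES_NO_GRAMMAR,
)
print(response.output.text)

As with guided_json, guided_regex, and guided_choice, the new argument is simply forwarded with the completion request; typically only one guided_* constraint is supplied per call.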
2 changes: 2 additions & 0 deletions clients/python/llmengine/data_types.py
@@ -331,6 +331,7 @@ class CompletionSyncV1Request(BaseModel):
guided_json: Optional[Dict[str, Any]] = Field(default=None)
guided_regex: Optional[str] = Field(default=None)
guided_choice: Optional[List[str]] = Field(default=None)
+guided_grammar: Optional[str] = Field(default=None)


class TokenOutput(BaseModel):
@@ -405,6 +406,7 @@ class CompletionStreamV1Request(BaseModel):
guided_json: Optional[Dict[str, Any]] = Field(default=None)
guided_regex: Optional[str] = Field(default=None)
guided_choice: Optional[List[str]] = Field(default=None)
+guided_grammar: Optional[str] = Field(default=None)


class CompletionStreamOutput(BaseModel):
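On the data-model side, both request schemas gain the same optional field with a None default, so existing payloads are unaffected. A small sketch of how the field rides along in the serialized request follows; the required fields of CompletionSyncV1Request shown here are assumptions based on the client signature, not confirmed by the diff.

from llmengine.data_types import CompletionSyncV1Request

# Assumes prompt, max_new_tokens, and temperature are the model's required fields.
request = CompletionSyncV1Request(
    prompt="Is the sky blue? Answer yes or no: ",
    max_new_tokens=4,
    temperature=0.0,
    guided_grammar=r'start: "yes" | "no"',
)

# Unset guided_* options stay None and can be filtered out of the payload.
payload = {k: v for k, v in request.dict().items() if v is not None}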
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scale-llm-engine"
version = "0.0.0.beta33"
version = "0.0.0.beta34"
description = "Scale LLM Engine Python client"
license = "Apache-2.0"
authors = ["Phil Chen <[email protected]>"]
2 changes: 1 addition & 1 deletion clients/python/setup.py
@@ -3,7 +3,7 @@
setup(
name="scale-llm-engine",
python_requires=">=3.7",
version="0.0.0.beta33",
version="0.0.0.beta34",
packages=find_packages(),
package_data={"llmengine": ["py.typed"]},
)