diff --git a/llm_benchmark.py b/llm_benchmark.py
index 2a440df..eb1061d 100755
--- a/llm_benchmark.py
+++ b/llm_benchmark.py
@@ -39,6 +39,9 @@
     action="append",
     help="JSON file defining tools that can be used",
 )
+parser.add_argument(
+    "--strict", action="store_true", help="Use strict mode when using tools"
+)
 parser.add_argument(
     "--model",
     "-m",
diff --git a/llm_benchmark_suite.py b/llm_benchmark_suite.py
index ac5ae8e..058ee86 100644
--- a/llm_benchmark_suite.py
+++ b/llm_benchmark_suite.py
@@ -431,6 +431,9 @@ def _tools_models():
         _Llm(GPT_4O),
         _Llm(GPT_4O_MINI),
         _Llm(GPT_4_TURBO),
+        _Llm(GPT_4O, GPT_4O + "-strict", strict=None),
+        _Llm(GPT_4O_MINI, GPT_4O_MINI + "-strict", strict=None),
+        _Llm(GPT_4_TURBO, GPT_4_TURBO + "-strict", strict=None),
         _Llm("claude-3-opus-20240229"),
         _Llm("claude-3-5-sonnet-20240620"),
         _Llm("claude-3-sonnet-20240229"),
@@ -476,7 +479,7 @@ def _audio_models():
         _Llm(GEMINI_1_5_PRO),
         _Llm(GEMINI_1_5_FLASH),
         _Llm(
-            "fixie-ai/ultravox-v0.2",
+            "fixie-ai/ultravox-v0.3",
             base_url="https://ultravox.api.fixie.ai/v1",
             api_key=os.getenv("ULTRAVOX_API_KEY"),
         ),
diff --git a/llm_request.py b/llm_request.py
index 64e689f..573da8c 100644
--- a/llm_request.py
+++ b/llm_request.py
@@ -83,6 +83,7 @@ class ApiContext:
     prompt: str
     files: List[InputFile]
     tools: List[Dict]
+    strict: bool
     temperature: float
    max_tokens: int
     detail: Optional[str] = None
@@ -99,6 +100,7 @@ def __init__(self, session, index, name, func, args, prompt, files, tools):
         self.prompt = prompt
         self.files = files
         self.tools = tools
+        self.strict = args.strict
         self.detail = args.detail
         self.temperature = args.temperature
         self.max_tokens = args.max_tokens
@@ -276,14 +278,19 @@ async def openai_chat(ctx: ApiContext, path: str = "/chat/completions") -> ApiRe
     url, headers = make_openai_url_and_headers(ctx, path)
     kwargs = {"messages": make_openai_messages(ctx)}
     if ctx.tools:
-        kwargs["tools"] = ctx.tools
+        tools = ctx.tools[:]
+        if ctx.strict:
+            for t in tools:
+                t["function"]["strict"] = True
+                t["function"]["parameters"]["additionalProperties"] = False
+        kwargs["tools"] = tools
         kwargs["tool_choice"] = "required"
     if ctx.peft:
         kwargs["peft"] = ctx.peft
     # Some providers require opt-in for stream stats, but some providers don't like this opt-in.
-    # Azure, ovh.net, and vLLM don't support stream stats at the moment.
+    # Regardless of opt-in, Azure and ovh.net don't return stream stats at the moment.
     # See https://github.com/Azure/azure-rest-api-specs/issues/25062
-    if not any(p in ctx.name for p in ["azure", "databricks", "fireworks", "ultravox"]):
+    if not any(p in ctx.name for p in ["azure", "databricks", "fireworks"]):
         kwargs["stream_options"] = {"include_usage": True}
     data = make_openai_chat_body(ctx, **kwargs)
     return await post(ctx, url, headers, data, openai_chunk_gen)
diff --git a/media/tools/flights.json b/media/tools/flights.json
index dc7ad49..aba9980 100644
--- a/media/tools/flights.json
+++ b/media/tools/flights.json
@@ -12,7 +12,6 @@
         },
         "date": {
           "type": "string",
-          "format": "date",
           "description": "The date of the flight, e.g., 2024-06-17"
         }
       },