Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,34 @@ When `max_turns` is specified, `aisuite` will:
In addition to `response.choices[0].message`, there is an additional field `response.choices[0].intermediate_messages`: which contains the list of all messages including tool interactions used. This can be used to continue the conversation with the model.
For more detailed examples of tool calling, check out the `examples/tool_calling_abstraction.ipynb` notebook.

## Automatic Speech Recognition (ASR)

`aisuite` supports Automatic Speech Recognition (ASR) with the same unified interface pattern:

```python
import aisuite as ai
client = ai.Client()

# Basic transcription
result = client.audio.transcriptions.create(
model="openai:whisper-1",
file="speech.mp3"
)
print(result.text)

# Provider-specific features
result = client.audio.transcriptions.create(
model="deepgram:nova-2",
file="meeting.mp3",
diarize=True, # Speaker separation
word_confidence=True # Word-level confidence
)
```

**Supported providers:** OpenAI, Deepgram, Google.

**Key features:** Same `provider:model` format • Rich metadata (timestamps, confidence, speakers) • Provider-specific advanced features

## License

aisuite is released under the MIT License. You are free to use, modify, and distribute the code for both commercial and non-commercial purposes.
Expand Down
148 changes: 144 additions & 4 deletions aisuite/client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
from .provider import ProviderFactory
import os
from .utils.tools import Tools
from typing import Union, BinaryIO, Optional, Any
from .framework.message import (
TranscriptionOptions,
TranscriptionResponse,
)


class Client:
Expand All @@ -26,7 +31,7 @@ def __init__(self, provider_configs: dict = {}):
self.providers = {}
self.provider_configs = provider_configs
self._chat = None
self._initialize_providers()
self._audio = None

def _initialize_providers(self):
"""Helper method to initialize or update providers."""
Expand All @@ -50,15 +55,15 @@ def _validate_provider_key(self, provider_key):

return provider_key

def configure(self, provider_configs: dict = None):
def configure(self, provider_configs: Optional[dict] = None):
"""
Configure the client with provider configurations.
"""
if provider_configs is None:
return

self.provider_configs.update(provider_configs)
self._initialize_providers() # NOTE: This will override existing provider instances.
# Providers will be lazily initialized when needed

@property
def chat(self):
Expand All @@ -67,6 +72,13 @@ def chat(self):
self._chat = Chat(self)
return self._chat

@property
def audio(self):
"""Return the audio API interface."""
if not self._audio:
self._audio = Audio(self)
return self._audio


class Chat:
def __init__(self, client: "Client"):
Expand Down Expand Up @@ -116,7 +128,7 @@ def _tool_runner(
provider,
model_name: str,
messages: list,
tools: any,
tools: Any,
max_turns: int,
**kwargs,
):
Expand Down Expand Up @@ -244,3 +256,131 @@ def create(self, model: str, messages: list, **kwargs):
# Delegate the chat completion to the correct provider's implementation
response = provider.chat_completions_create(model_name, messages, **kwargs)
return self._extract_thinking_content(response)


class Audio:
    """Entry point for the audio-related API surface of a :class:`Client`.

    Exposes transcription support through the read-only ``transcriptions``
    property; the underlying interface object is built once at construction
    time and reused for every call.
    """

    def __init__(self, client: "Client"):
        # Keep a handle on the owning client and eagerly build the (cheap)
        # transcriptions interface so property access is a plain attribute read.
        self.client = client
        self._transcriptions = Transcriptions(client)

    @property
    def transcriptions(self):
        """Return the transcriptions interface."""
        return self._transcriptions


class Transcriptions:
    """Transcriptions API interface.

    Routes ``client.audio.transcriptions.create(...)`` calls to the provider
    named in the ``provider:model`` identifier, lazily instantiating the
    provider on first use.
    """

    def __init__(self, client: "Client"):
        # The shared Client owns provider instances and their configurations.
        self.client = client

    def create(
        self,
        *,
        model: str,
        file: Union[str, BinaryIO],
        options: "Optional[TranscriptionOptions]" = None,
        **kwargs,
    ) -> "TranscriptionResponse":
        """
        Create a transcription using the specified model and file.

        Args:
            model: Provider and model in format 'provider:model' (e.g., 'openai:whisper-1')
            file: Audio file to transcribe (file path or file-like object)
            options: TranscriptionOptions instance with unified parameters (includes stream control)
            **kwargs: Additional parameters (used if options is None, assumed to be OpenAI format)

        Returns:
            TranscriptionResponse: Unified response (batch or streaming based on options.stream)

        Raises:
            ValueError: On a malformed model string, missing parameters, an
                unavailable provider, or a provider that lacks the requested
                transcription capability.
        """
        # Validate the options/kwargs combination. When `options` is given,
        # kwargs are dropped here so the "ignoring kwargs" warning matches
        # what is actually forwarded to the provider.
        forward_kwargs = self._validate_parameters(options, kwargs)

        # Check that the 'provider:model' format is used.
        if ":" not in model:
            raise ValueError(
                f"Invalid model format. Expected 'provider:model', got '{model}'"
            )
        provider_key, model_name = model.split(":", 1)

        # Lazily initialize the provider and confirm it supports audio.
        provider = self._resolve_provider(provider_key)

        should_stream = self._wants_stream(options, forward_kwargs)

        transcriptions = getattr(provider.audio, "transcriptions", None)
        try:
            if should_stream:
                if transcriptions is not None and hasattr(
                    transcriptions, "create_stream_output"
                ):
                    return transcriptions.create_stream_output(
                        model_name, file, options=options, **forward_kwargs
                    )
                raise ValueError(
                    f"Provider '{provider_key}' does not support output streaming transcription."
                )
            # Non-streaming (batch) transcription.
            if transcriptions is not None and hasattr(transcriptions, "create"):
                return transcriptions.create(
                    model_name, file, options=options, **forward_kwargs
                )
            raise ValueError(
                f"Provider '{provider_key}' does not support audio transcription."
            )
        except NotImplementedError as exc:
            # Normalize a provider-side "not implemented" into the same
            # ValueError callers already handle, keeping the cause chained.
            raise ValueError(
                f"Provider '{provider_key}' does not support audio transcription."
            ) from exc

    @staticmethod
    def _validate_parameters(options, kwargs) -> dict:
        """Validate the options/kwargs combination; return the kwargs to forward.

        TranscriptionOptions takes precedence: when it is provided, kwargs are
        warned about and NOT forwarded (previously they were forwarded despite
        the warning saying they were ignored).
        """
        if options is not None:
            if not options.has_any_parameters():
                raise ValueError(
                    "TranscriptionOptions provided but no parameters are set. "
                    "Please set at least one parameter or pass None to use kwargs."
                )
            if kwargs:
                import warnings

                warnings.warn(
                    "Both TranscriptionOptions and kwargs provided. Using TranscriptionOptions and ignoring kwargs.",
                    UserWarning,
                )
            return {}
        if not kwargs:
            # Neither options nor kwargs provided.
            raise ValueError(
                "Either TranscriptionOptions or kwargs must be provided for transcription parameters."
            )
        return kwargs

    def _resolve_provider(self, provider_key: str):
        """Return an initialized, audio-capable provider for `provider_key`."""
        if provider_key not in self.client.providers:
            config = self.client.provider_configs.get(provider_key, {})
            try:
                self.client.providers[provider_key] = ProviderFactory.create_provider(
                    provider_key, config
                )
            except ImportError as e:
                raise ValueError(
                    f"Provider '{provider_key}' is not available: {e}"
                ) from e

        provider = self.client.providers.get(provider_key)
        if not provider:
            raise ValueError(f"Could not load provider for '{provider_key}'.")

        if not hasattr(provider, "audio") or provider.audio is None:
            raise ValueError(
                f"Provider '{provider_key}' does not support audio transcription."
            )
        return provider

    @staticmethod
    def _wants_stream(options, kwargs) -> bool:
        """Determine whether streaming output was requested (default: batch)."""
        if options is not None and options.stream is not None:
            return bool(options.stream)
        return bool(kwargs.get("stream", False))
Loading