
Commit 62105d4

Enhance providers with output streaming support
1 parent 1df1e41 commit 62105d4

File tree

5 files changed: +346 −159 lines

aisuite/providers/deepgram_provider.py

Lines changed: 207 additions & 65 deletions
@@ -1,5 +1,9 @@
 import os
-import asyncio
+import json
+import numpy as np
+import queue
+import threading
+import time
 from typing import Union, BinaryIO, Optional, AsyncGenerator
 
 from aisuite.provider import Provider, ASRError, Audio
@@ -99,57 +103,136 @@ async def create_stream_output(
         model: str,
         file: Union[str, BinaryIO],
         options: Optional[TranscriptionOptions] = None,
+        chunk_size_minutes: float = 3.0,
         **kwargs,
     ) -> AsyncGenerator[StreamingTranscriptionChunk, None]:
-        """Create streaming audio transcription using Deepgram SDK."""
+        """Create streaming audio transcription using Deepgram SDK with chunked processing."""
         try:
             from deepgram import LiveOptions
+            from deepgram.clients.listen import LiveTranscriptionEvents
 
-            api_params = self._prepare_api_params(model, options, kwargs)
-            live_options = LiveOptions(**api_params)
-            connection = self.client.listen.websocket.v("1")
+            # Load and prepare audio
+            audio_data, sample_rate = await self._load_and_prepare_audio(file)
 
-            transcript_queue = asyncio.Queue()
-            finished_event = asyncio.Event()
-
-            def on_message(self, result, **kwargs):
-                sentence = result.channel.alternatives[0].transcript
-                if sentence:
-                    chunk = StreamingTranscriptionChunk(
-                        text=sentence,
-                        is_final=result.is_final,
-                        confidence=getattr(
-                            result.channel.alternatives[0], "confidence", None
-                        ),
+            # Calculate chunking strategy
+            duration_seconds = len(audio_data) / sample_rate
+            chunk_duration_seconds = chunk_size_minutes * 60
+
+            if duration_seconds <= chunk_duration_seconds:
+                chunks = [audio_data]
+            else:
+                chunk_size_samples = int(chunk_duration_seconds * sample_rate)
+                chunks = []
+                num_chunks = int(np.ceil(duration_seconds / chunk_duration_seconds))
+                for i in range(num_chunks):
+                    start_sample = i * chunk_size_samples
+                    end_sample = min(
+                        start_sample + chunk_size_samples, len(audio_data)
                     )
-                    asyncio.create_task(transcript_queue.put(chunk))
+                    chunks.append(audio_data[start_sample:end_sample])
+
+            # Setup API parameters
+            api_params = self._prepare_api_params(
+                model, options, kwargs, is_streaming=True
+            )
+            api_params["interim_results"] = (
+                True  # Enable interim results for streaming
+            )
+
+            # Add critical audio format parameters (matching reference)
+            api_params["encoding"] = "linear16"  # PCM16 format
+            api_params["sample_rate"] = 16000  # Match our target sample rate
+            api_params["channels"] = 1  # Mono audio
 
-            def on_error(self, error, **kwargs):
-                asyncio.create_task(
+            live_options = LiveOptions(**api_params)
+
+            # Create single connection for all chunks
+            connection = self.client.listen.websocket.v("1")
+
+            # Use thread-safe queue instead of asyncio.Queue for cross-thread communication
+            transcript_queue = queue.Queue()
+            connection_closed = threading.Event()
+
+            def on_message(*args, **kwargs):
+                """Handle transcript events"""
+                # Extract result from args or kwargs (following reference pattern)
+                result = None
+                if len(args) >= 2:
+                    result = args[1]
+                elif "result" in kwargs:
+                    result = kwargs["result"]
+                else:
+                    return
+
+                if hasattr(result, "channel") and result.channel.alternatives:
+                    alt = result.channel.alternatives[0]
+                    if alt.transcript:
+                        chunk = StreamingTranscriptionChunk(
+                            text=alt.transcript,
+                            is_final=getattr(result, "is_final", False),
+                            confidence=getattr(alt, "confidence", None),
+                        )
+                        transcript_queue.put(chunk)  # Thread-safe put
+
+            def on_error(*args, **kwargs):
+                """Handle error events"""
+                # Extract error from args or kwargs
+                error = None
+                if len(args) >= 2:
+                    error = args[1]
+                elif "error" in kwargs:
+                    error = kwargs["error"]
+
+                if error:
                     transcript_queue.put(
                         ASRError(f"Deepgram streaming error: {error}")
-                    )
-                )
-                finished_event.set()
+                    )  # Thread-safe put
 
-            def on_close(self, close, **kwargs):
-                finished_event.set()
+            def on_close(*args, **kwargs):
+                """Handle connection close events"""
+                connection_closed.set()
 
-            connection.on(connection.event.TRANSCRIPT_RECEIVED, on_message)
-            connection.on(connection.event.ERROR, on_error)
-            connection.on(connection.event.CLOSE, on_close)
+            # Register event handlers
+            connection.on(LiveTranscriptionEvents.Transcript, on_message)
+            connection.on(LiveTranscriptionEvents.Error, on_error)
+            connection.on(LiveTranscriptionEvents.Close, on_close)
 
+            # Start connection
             if not connection.start(live_options):
                 raise ASRError("Failed to start Deepgram streaming connection")
 
-            audio_data = self._read_audio_data(file)
-            await self._send_audio_chunks(connection, audio_data)
-            connection.finish()
-
-            async for chunk in self._yield_transcription_chunks(
-                transcript_queue, finished_event
-            ):
-                yield chunk
+            # Send all chunks through single connection
+            try:
+                for audio_chunk in chunks:
+                    self._send_audio_chunk(connection, audio_chunk)
+
+                # Send CloseStream message to signal end of all chunks
+                close_stream_message = json.dumps({"type": "CloseStream"})
+                connection.send(close_stream_message)
+
+                # Yield results until connection closes naturally
+                while not connection_closed.is_set():
+                    try:
+                        # Use thread-safe queue with timeout
+                        chunk = transcript_queue.get(timeout=0.1)
+                        if isinstance(chunk, Exception):
+                            raise chunk
+                        yield chunk
+                    except queue.Empty:
+                        continue
+
+                # Get any remaining results
+                while not transcript_queue.empty():
+                    try:
+                        chunk = transcript_queue.get_nowait()
+                        if isinstance(chunk, Exception):
+                            raise chunk
+                        yield chunk
+                    except queue.Empty:
+                        break
+
+            except Exception as e:
+                raise ASRError(f"Error during audio streaming: {e}")
 
         except Exception as e:
             raise ASRError(f"Deepgram streaming transcription error: {e}")
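
For orientation, a minimal consumer sketch of the reworked method, assuming a constructed DeepgramProvider (credentials via environment) and a local audio file; the model name and path are illustrative. With the default chunk_size_minutes=3.0, a 7-minute file at 16 kHz is split into ceil(420 / 180) = 3 chunks (180 s, 180 s, 60 s), all sent over one websocket connection:

    import asyncio

    async def main():
        provider = DeepgramProvider()  # assumes DEEPGRAM_API_KEY is configured
        # create_stream_output is an async generator: iterate to receive
        # interim and final StreamingTranscriptionChunk objects as they arrive
        async for chunk in provider.create_stream_output(
            model="nova-2",          # illustrative model name
            file="speech.wav",       # illustrative local audio file
            chunk_size_minutes=3.0,  # parameter added in this commit
        ):
            marker = "final" if chunk.is_final else "interim"
            print(f"[{marker}] {chunk.text}")

    asyncio.run(main())
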
@@ -159,14 +242,23 @@ def _extract_model_name(self, model: str) -> str:
         return model
 
     def _prepare_api_params(
-        self, model: str, options: Optional[TranscriptionOptions], kwargs: dict
+        self,
+        model: str,
+        options: Optional[TranscriptionOptions],
+        kwargs: dict,
+        is_streaming: bool = False,
     ) -> dict:
         """Prepare API parameters for Deepgram."""
         if options is not None:
             api_params = ParameterMapper.map_to_deepgram(options)
         else:
             api_params = self._map_openai_to_deepgram_params(kwargs)
 
+        # Remove parameters not supported by LiveOptions (streaming)
+        if is_streaming:
+            # utterances is only supported in batch/prerecorded, not streaming
+            api_params.pop("utterances", None)
+
         model_name = self._extract_model_name(model)
         api_params.setdefault("smart_format", True)
         api_params.setdefault("punctuate", True)
@@ -188,34 +280,80 @@ def _prepare_audio_payload(self, file: Union[str, BinaryIO]) -> dict:
             )
         return {"buffer": buffer_data}
 
-    def _read_audio_data(self, file: Union[str, BinaryIO]) -> bytes:
-        """Read audio data from file or file-like object."""
-        if isinstance(file, str):
-            with open(file, "rb") as audio_file:
-                return audio_file.read()
-        else:
-            return file.read()
-
-    async def _send_audio_chunks(self, connection, audio_data: bytes) -> None:
-        """Send audio data in chunks to Deepgram connection."""
-        chunk_size = 8192
-        for i in range(0, len(audio_data), chunk_size):
-            chunk = audio_data[i : i + chunk_size]
-            connection.send(chunk)
-            await asyncio.sleep(0.01)
-
-    async def _yield_transcription_chunks(
-        self, transcript_queue: asyncio.Queue, finished_event: asyncio.Event
-    ) -> AsyncGenerator[StreamingTranscriptionChunk, None]:
-        """Yield transcription chunks as they arrive."""
-        while not finished_event.is_set():
+    async def _load_and_prepare_audio(
+        self, file: Union[str, BinaryIO]
+    ) -> tuple[np.ndarray, int]:
+        """Load and prepare audio file for streaming.
+
+        Conversions performed only when necessary:
+        - Stereo to mono: Required for multi-channel audio
+        - Sample rate conversion: Required when input != 16kHz
+        - Other formats: Error out as unsupported
+        """
+        try:
             try:
-                chunk = await asyncio.wait_for(transcript_queue.get(), timeout=1.0)
-                if isinstance(chunk, Exception):
-                    raise chunk
-                yield chunk
-            except asyncio.TimeoutError:
-                continue
+                import soundfile as sf
+            except ImportError:
+                raise ASRError(
+                    "soundfile is required for audio processing. Install with: pip install soundfile"
+                )
+
+            if isinstance(file, str):
+                audio_data, original_sample_rate = sf.read(file)
+            else:
+                audio_data, original_sample_rate = sf.read(file)
+
+            audio_data = np.asarray(audio_data, dtype=np.float32)
+
+            # Convert to mono if stereo
+            if len(audio_data.shape) > 1:
+                if audio_data.shape[1] == 2:
+                    audio_data = np.mean(audio_data, axis=1)
+                else:
+                    raise ASRError(
+                        f"Unsupported audio format: {audio_data.shape[1]} channels. Only mono and stereo are supported."
+                    )
+
+            # Resample to 16kHz if needed
+            target_sample_rate = 16000
+            if original_sample_rate != target_sample_rate:
+                try:
+                    from scipy import signal
+
+                    num_samples = int(
+                        len(audio_data) * target_sample_rate / original_sample_rate
+                    )
+                    audio_data = signal.resample(audio_data, num_samples)
+                except ImportError:
+                    raise ASRError(
+                        f"Audio resampling required but scipy not available. "
+                        f"Input is {original_sample_rate}Hz, need {target_sample_rate}Hz. "
+                        f"Install scipy or provide audio at {target_sample_rate}Hz."
+                    )
+
+            return np.asarray(audio_data, dtype=np.float32), target_sample_rate
+
+        except Exception as e:
+            if isinstance(e, ASRError):
+                raise
+            raise ASRError(f"Error loading audio file: {e}")
+
+    def _send_audio_chunk(self, connection, audio_chunk: np.ndarray) -> None:
+        """Send audio chunk data through the connection."""
+        streaming_chunk_size = 8000  # Match reference BLOCKSIZE (~0.5s @16kHz mono)
+        send_delay = 0.01
+
+        for i in range(0, len(audio_chunk), streaming_chunk_size):
+            piece = audio_chunk[i : i + streaming_chunk_size]
+
+            if len(piece) < streaming_chunk_size:
+                piece = np.pad(
+                    piece, (0, streaming_chunk_size - len(piece)), mode="constant"
+                )
+
+            pcm16 = (piece * 32767).astype(np.int16).tobytes()
+            connection.send(pcm16)
+            time.sleep(send_delay)  # Use synchronous sleep like reference
 
     def _map_openai_to_deepgram_params(self, openai_params: dict) -> dict:
         """Map OpenAI-style parameters to Deepgram parameters."""
@@ -229,7 +367,11 @@ def _map_openai_to_deepgram_params(self, openai_params: dict) -> dict:
             granularities = openai_params["timestamp_granularities"]
             if "word" in granularities:
                 deepgram_params["punctuate"] = True
-                deepgram_params["utterances"] = True
+                # Note: utterances is only for batch/prerecorded, not streaming
+
+        # Essential for streaming - map interim_results
+        if "interim_results" in openai_params:
+            deepgram_params["interim_results"] = openai_params["interim_results"]
 
         return deepgram_params
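
Because interim_results is now mapped through, callers see both provisional and finalized text for the same audio. A self-contained sketch of collapsing such a stream, with Chunk as a stand-in for StreamingTranscriptionChunk:

    from dataclasses import dataclass

    @dataclass
    class Chunk:
        text: str
        is_final: bool

    stream = [
        Chunk("hello wor", is_final=False),   # interim: may still be revised
        Chunk("hello world", is_final=True),  # final: safe to commit
        Chunk("how are", is_final=False),
        Chunk("how are you", is_final=True),
    ]
    print(" ".join(c.text for c in stream if c.is_final))  # hello world how are you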

aisuite/providers/google_provider.py

Lines changed: 1 addition & 5 deletions
@@ -398,7 +398,7 @@ async def create_stream_output(
             )
 
             responses = self.provider.speech_client.streaming_recognize(
-                requests=request_generator
+                config=streaming_config, requests=request_generator
             )
 
             for response in responses:
@@ -498,10 +498,6 @@ def _create_streaming_requests(
         """Create streaming requests generator for Google Speech API."""
 
         def request_generator():
-            yield speech.StreamingRecognizeRequest(
-                streaming_config=streaming_config
-            )
-
             chunk_size = 8192
             for i in range(0, len(audio_data), chunk_size):
                 chunk = audio_data[i : i + chunk_size]
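
Both google_provider.py hunks make the same fix: the streaming_recognize helper in google-cloud-speech 2.x accepts the streaming config as a config= argument and prepends the config request itself, which is why the explicit first StreamingRecognizeRequest was removed from the generator. A minimal sketch of the corrected call shape, assuming that helper; the client setup and audio bytes are illustrative:

    from google.cloud import speech

    client = speech.SpeechClient()
    streaming_config = speech.StreamingRecognitionConfig(
        config=speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
        )
    )

    def request_generator(audio_data: bytes, chunk_size: int = 8192):
        # Audio-only requests; the helper injects the config request first
        for i in range(0, len(audio_data), chunk_size):
            yield speech.StreamingRecognizeRequest(
                audio_content=audio_data[i : i + chunk_size]
            )

    responses = client.streaming_recognize(
        config=streaming_config, requests=request_generator(b"\x00\x00" * 16000)
    )
    for response in responses:
        for result in response.results:
            print(result.alternatives[0].transcript)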

0 commit comments