Commit 18dda54

feat: Update google provider for ASR

1 parent c8b963b

File tree

6 files changed: +353, -7 lines

README.md

Lines changed: 1 addition & 1 deletion

@@ -237,7 +237,7 @@ result = client.audio.transcriptions.create(
 )
 ```

-**Supported providers:** OpenAI, Deepgram
+**Supported providers:** OpenAI, Deepgram, Google

 **Key features:** Same `provider:model` format • Rich metadata (timestamps, confidence, speakers) • Provider-specific advanced features


aisuite/client.py

Lines changed: 1 addition & 2 deletions

@@ -28,7 +28,6 @@ def __init__(self, provider_configs: dict = {}):
         self.provider_configs = provider_configs
         self._chat = None
         self._audio = None
-        self._initialize_providers()

     def _initialize_providers(self):
         """Helper method to initialize or update providers."""
@@ -60,7 +59,7 @@ def configure(self, provider_configs: dict = None):
             return

         self.provider_configs.update(provider_configs)
-        self._initialize_providers()  # NOTE: This will override existing provider instances.
+        # Providers will be lazily initialized when needed

     @property
     def chat(self):
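
With the eager `_initialize_providers()` calls removed from `__init__` and `configure()`, providers are presumably built on first access to the `chat`/`audio` properties. A minimal sketch of that lazy pattern, using a simplified stand-in class rather than aisuite's actual implementation:

```python
# Illustrative only: LazyClient mimics the initialization flow implied by the
# diff; the real aisuite.Client builds concrete provider objects here.
class LazyClient:
    def __init__(self, provider_configs: dict = None):
        self.provider_configs = provider_configs or {}
        self.providers = {}
        self._chat = None

    def _initialize_providers(self):
        # Stand-in for the real provider factory.
        for name, config in self.provider_configs.items():
            self.providers[name] = f"<{name} provider: {config}>"

    def configure(self, provider_configs: dict = None):
        if provider_configs is None:
            return
        self.provider_configs.update(provider_configs)
        # No eager rebuild: initialization is deferred until first use.

    @property
    def chat(self):
        if self._chat is None:
            self._initialize_providers()  # runs once, on first access
            self._chat = "<chat interface>"
        return self._chat


client = LazyClient()
client.configure({"google": {"project_id": "your-project-id"}})
print(client.chat)  # providers are constructed only now
```

This also sidesteps the behavior the removed NOTE warned about, where each eager re-initialization overwrote existing provider instances.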

aisuite/providers/google_provider.py

Lines changed: 116 additions & 1 deletion

@@ -2,7 +2,7 @@

 import os
 import json
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Union, BinaryIO

 import vertexai
 from vertexai.generative_models import (
@@ -16,6 +16,8 @@
 import pprint

 from aisuite.framework import ProviderInterface, ChatCompletionResponse, Message
+from aisuite.framework.message import TranscriptionResult, Word, Segment, Alternative
+from aisuite.provider import ASRError


 DEFAULT_TEMPERATURE = 0.7
@@ -211,6 +213,9 @@ def __init__(self, **config):

         self.transformer = GoogleMessageConverter()

+        # Initialize Speech client lazily
+        self._speech_client = None
+
     def chat_completions_create(self, model, messages, **kwargs):
         """Request chat completions from the Google AI API.

@@ -296,3 +301,113 @@ def chat_completions_create(self, model, messages, **kwargs):

         # Convert and return the response
         return self.transformer.convert_response(response)
+
+    @property
+    def speech_client(self):
+        """Lazy initialization of Google Cloud Speech client."""
+        if self._speech_client is None:
+            try:
+                from google.cloud import speech
+
+                self._speech_client = speech.SpeechClient()
+            except ImportError:
+                raise ImportError(
+                    "google-cloud-speech is required for ASR functionality. "
+                    "Install it with: pip install google-cloud-speech"
+                )
+        return self._speech_client
+
+    def audio_transcriptions_create(
+        self, model: str, file: Union[str, BinaryIO], **kwargs
+    ) -> TranscriptionResult:
+        """Create audio transcription using Google Cloud Speech-to-Text API."""
+        try:
+            from google.cloud import speech
+
+            # Handle file input
+            if isinstance(file, str):
+                with open(file, "rb") as audio_file:
+                    audio_data = audio_file.read()
+            else:
+                audio_data = file.read()
+
+            # Create audio object
+            audio = speech.RecognitionAudio(content=audio_data)
+
+            # Configure recognition settings
+            config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=kwargs.get("sample_rate_hertz", 16000),
+                language_code=kwargs.get("language", "en-US"),
+                enable_word_time_offsets=True,
+                enable_word_confidence=True,
+                enable_automatic_punctuation=kwargs.get("punctuate", True),
+                model=model if model != "default" else "latest_long",
+            )
+
+            # Make API request
+            response = self.speech_client.recognize(config=config, audio=audio)
+            return self._parse_google_response(response)
+
+        except ImportError:
+            raise ASRError(
+                "google-cloud-speech is required for ASR functionality. "
+                "Install it with: pip install google-cloud-speech"
+            )
+        except Exception as e:
+            raise ASRError(f"Google Speech-to-Text error: {e}")
+
+    def _parse_google_response(self, response) -> TranscriptionResult:
+        """Convert Google Speech-to-Text response to unified TranscriptionResult."""
+        if not response.results:
+            return TranscriptionResult(text="", language=None)
+
+        # Get the best result
+        best_result = response.results[0]
+        if not best_result.alternatives:
+            return TranscriptionResult(text="", language=None)
+
+        # Get the best alternative
+        best_alternative = best_result.alternatives[0]
+        text = best_alternative.transcript
+        confidence = getattr(best_alternative, "confidence", None)
+
+        # Parse words if available
+        words = []
+        if hasattr(best_alternative, "words") and best_alternative.words:
+            for word in best_alternative.words:
+                words.append(
+                    Word(
+                        word=word.word,
+                        start=(
+                            word.start_time.total_seconds()
+                            if hasattr(word, "start_time")
+                            else 0.0
+                        ),
+                        end=(
+                            word.end_time.total_seconds()
+                            if hasattr(word, "end_time")
+                            else 0.0
+                        ),
+                        confidence=getattr(word, "confidence", None),
+                    )
+                )
+
+        # Create alternatives list
+        alternatives = []
+        for alt in best_result.alternatives:
+            alternatives.append(
+                Alternative(
+                    transcript=alt.transcript,
+                    confidence=getattr(alt, "confidence", None),
+                )
+            )
+
+        return TranscriptionResult(
+            text=text,
+            language=None,  # Google doesn't return detected language in this format
+            confidence=confidence,
+            task="transcribe",
+            words=words if words else None,
+            alternatives=alternatives if alternatives else None,
+        )
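
For orientation, here is a minimal usage sketch of the new path through the unified client. It assumes the `provider:model` routing described in the README dispatches `google:latest_long` to `audio_transcriptions_create`; `audio.wav` and the credential values are placeholders:

```python
import aisuite as ai

client = ai.Client()
client.configure({
    "google": {
        "project_id": "your-project-id",                        # placeholder
        "region": "us-central1",
        "application_credentials": "path/to/credentials.json",  # placeholder
    }
})

# language and sample_rate_hertz map onto the RecognitionConfig built in
# audio_transcriptions_create; the encoding there is fixed to LINEAR16
# (16-bit PCM), so uncompressed WAV input is assumed.
result = client.audio.transcriptions.create(
    model="google:latest_long",
    file="audio.wav",  # a path (str) or an open binary file object
    language="en-US",
    sample_rate_hertz=16000,
)
print(result.text)
print(result.confidence)
```

Because the encoding is hard-coded to LINEAR16, compressed formats such as MP3 or FLAC would need conversion (or an extended config) before this path can handle them.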

examples/asr_example.ipynb

Lines changed: 110 additions & 0 deletions

@@ -60,6 +60,116 @@
     "for word in result.words[:3]:\n",
     "    print(f\"{word.word}: {word.start:.1f}s-{word.end:.1f}s\")\n"
    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "32ed3f0f",
+    "metadata": {},
+    "source": [
+     "## Google Cloud Speech-to-Text\n",
+     "\n",
+     "The Google provider supports ASR via the Google Cloud Speech-to-Text API. Make sure you have:\n",
+     "- the `google-cloud-speech` library installed: `pip install google-cloud-speech`\n",
+     "- Google Cloud credentials configured\n",
+     "- the required environment variables: `GOOGLE_PROJECT_ID`, `GOOGLE_REGION`, `GOOGLE_APPLICATION_CREDENTIALS`\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "0540ca09",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Configure Google provider\n",
+     "client.configure({\n",
+     "    \"google\": {\n",
+     "        \"project_id\": \"your-project-id\",\n",
+     "        \"region\": \"us-central1\",\n",
+     "        \"application_credentials\": \"path/to/credentials.json\"\n",
+     "    }\n",
+     "})\n",
+     "\n",
+     "# Basic Google transcription\n",
+     "result = client.audio.transcriptions.create(\n",
+     "    model=\"google:latest_long\",\n",
+     "    file=audio_file,\n",
+     "    language=\"en-US\"\n",
+     ")\n",
+     "print(f\"Google transcription: {result.text}\")\n",
+     "print(f\"Confidence: {result.confidence}\")\n",
+     "print(f\"Task: {result.task}\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "2bd3aa6a",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Google transcription with advanced options\n",
+     "result = client.audio.transcriptions.create(\n",
+     "    model=\"google:latest_long\",\n",
+     "    file=audio_file,\n",
+     "    language=\"en-US\",\n",
+     "    sample_rate_hertz=44100,\n",
+     "    punctuate=True\n",
+     ")\n",
+     "\n",
+     "print(f\"Text: {result.text}\")\n",
+     "print(f\"Language: {result.language}\")\n",
+     "\n",
+     "# Show word-level timestamps if available\n",
+     "if result.words:\n",
+     "    print(f\"Words with timestamps: {len(result.words)}\")\n",
+     "    for word in result.words[:5]:  # Show first 5 words\n",
+     "        print(f\"  {word.word}: {word.start:.1f}s-{word.end:.1f}s (confidence: {word.confidence:.2f})\")\n",
+     "\n",
+     "# Show alternatives if available\n",
+     "if result.alternatives:\n",
+     "    print(f\"Alternatives: {len(result.alternatives)}\")\n",
+     "    for i, alt in enumerate(result.alternatives[:3]):  # Show first 3 alternatives\n",
+     "        print(f\"  Alt {i+1}: {alt.transcript} (confidence: {alt.confidence:.2f})\")\n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "f777a880",
+    "metadata": {},
+    "source": [
+     "## Deepgram Provider\n",
+     "\n",
+     "You can also use Deepgram for ASR, with advanced features like speaker diarization.\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "bd51b0ed",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Deepgram transcription with speaker diarization\n",
+     "result = client.audio.transcriptions.create(\n",
+     "    model=\"deepgram:nova-2\",\n",
+     "    file=audio_file,\n",
+     "    diarize=True,\n",
+     "    punctuate=True,\n",
+     "    language=\"en-US\"\n",
+     ")\n",
+     "\n",
+     "print(f\"Deepgram transcription: {result.text}\")\n",
+     "print(f\"Confidence: {result.confidence}\")\n",
+     "\n",
+     "# Show speaker information if available\n",
+     "if result.words:\n",
+     "    speakers = set(word.speaker for word in result.words if word.speaker is not None)\n",
+     "    print(f\"Detected speakers: {len(speakers)}\")\n",
+     "    for word in result.words[:5]:\n",
+     "        if word.speaker is not None:\n",
+     "            print(f\"  {word.word} (Speaker {word.speaker}): {word.start:.1f}s-{word.end:.1f}s\")\n"
+    ]
   }
  ],
  "metadata": {

pyproject.toml

Lines changed: 3 additions & 1 deletion

@@ -11,6 +11,7 @@ anthropic = { version = "^0.30.1", optional = true }
 boto3 = { version = "^1.34.144", optional = true }
 cohere = { version = "^5.12.0", optional = true }
 vertexai = { version = "^1.63.0", optional = true }
+google-cloud-speech = { version = "^2.33.0", optional = true }
 groq = { version = "^0.9.0", optional = true }
 mistralai = { version = "^1.0.3", optional = true }
 openai = { version = "^1.35.8", optional = true }
@@ -27,7 +28,7 @@ azure = []
 cerebras = ["cerebras_cloud_sdk"]
 cohere = ["cohere"]
 deepseek = ["openai"]
-google = ["vertexai"]
+google = ["vertexai", "google-cloud-speech"]
 groq = ["groq"]
 huggingface = []
 mistral = ["mistralai"]
@@ -52,6 +53,7 @@ chromadb = "^0.5.4"
 sentence-transformers = "^3.0.1"
 datasets = "^2.20.0"
 vertexai = "^1.63.0"
+google-cloud-speech = "^2.33.0"
 ibm-watsonx-ai = "^1.1.16"
 cerebras_cloud_sdk = "^1.19.0"

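
With the `google` extra updated as above, installing the provider's full ASR stack should be a single command, e.g. `pip install 'aisuite[google]'`, which pulls in both `vertexai` and `google-cloud-speech` (assuming the published package exposes the extra exactly as declared in the `[tool.poetry.extras]` table).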
