|
| 1 | +"""Implements embeddings from [Voyage AI](https://voyageai.com). |
| 2 | +""" |
| 3 | + |
| 4 | +import importlib |
| 5 | + |
| 6 | +from kotaemon.base import Document, DocumentWithEmbedding, Param |
| 7 | + |
| 8 | +from .base import BaseEmbeddings |
| 9 | + |
# Module-level cache for the lazily imported `voyageai` package; stays `None`
# until `_import_voyageai` is called for the first time.
vo = None


def _import_voyageai():
    """Return the `voyageai` module, importing it on first use.

    The import is deferred so that this file can be loaded without the
    `voyageai` package being imported up front. The resolved module is
    stored in the module-level `vo` global and reused by later calls.

    Returns:
        The imported `voyageai` module object.
    """
    global vo
    if vo:
        # Already resolved by an earlier call.
        return vo
    vo = importlib.import_module("voyageai")
    return vo
| 18 | + |
| 19 | + |
def _format_output(texts: list[str], embeddings: list[list]):
    """Pair every input text with its embedding vector.

    Args:
        texts: List of original documents.
        embeddings: Embeddings corresponding to each document, in the same
            order as `texts`.

    Returns:
        A list with one `DocumentWithEmbedding` per (text, embedding) pair.
        Pairing is done with `zip`, so it stops at the shorter of the two
        inputs.
    """
    documents = []
    for content, vector in zip(texts, embeddings):
        documents.append(DocumentWithEmbedding(content=content, embedding=vector))
    return documents
| 30 | + |
| 31 | + |
class VoyageAIEmbeddings(BaseEmbeddings):
    """Voyage AI provides best-in-class embedding models and rerankers.

    Holds one synchronous and one asynchronous `voyageai` client, both
    created in `__init__` from `api_key`, and embeds inputs with `model`.
    """

    api_key: str = Param(None, help="Voyage API key", required=False)
    model: str = Param(
        "voyage-3",
        help=(
            "Model name to use. The Voyage "
            "[documentation](https://docs.voyageai.com/docs/embeddings) "
            "provides a list of all available embedding models."
        ),
        required=True,
    )

    def __init__(self, *args, **kwargs):
        """Validate the API key and build the sync and async clients.

        Raises:
            ValueError: If `api_key` is unset or empty.
        """
        super().__init__(*args, **kwargs)
        # Fail at construction time rather than on the first embed call.
        if not self.api_key:
            raise ValueError("API key must be provided for VoyageAIEmbeddings.")

        # Resolve the lazily imported module once and reuse it for both clients.
        voyageai = _import_voyageai()
        self._client = voyageai.Client(api_key=self.api_key)
        self._aclient = voyageai.AsyncClient(api_key=self.api_key)

    def invoke(
        self, text: str | list[str] | Document | list[Document], *args, **kwargs
    ) -> list[DocumentWithEmbedding]:
        """Embed the given input(s) with the synchronous client.

        Args:
            text: A string, a `Document`, or a list of either; normalised
                through `self.prepare_input`.
            *args: Accepted for interface compatibility; not used here.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            One `DocumentWithEmbedding` per prepared input, pairing the
            input content with the embedding returned by the client.
        """
        texts = [t.content for t in self.prepare_input(text)]
        embeddings = self._client.embed(texts, model=self.model).embeddings
        return _format_output(texts, embeddings)

    async def ainvoke(
        self, text: str | list[str] | Document | list[Document], *args, **kwargs
    ) -> list[DocumentWithEmbedding]:
        """Embed the given input(s) with the asynchronous client.

        Args:
            text: A string, a `Document`, or a list of either; normalised
                through `self.prepare_input`.
            *args: Accepted for interface compatibility; not used here.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            One `DocumentWithEmbedding` per prepared input, pairing the
            input content with the embedding returned by the client.
        """
        texts = [t.content for t in self.prepare_input(text)]
        # Attribute access binds tighter than `await`, so the call must be
        # awaited first and `.embeddings` read from its result; without the
        # parentheses `.embeddings` is looked up on the coroutine object.
        response = await self._aclient.embed(texts, model=self.model)
        embeddings = response.embeddings
        return _format_output(texts, embeddings)
0 commit comments