From 02d34cd1f20dac05fb8e169fe7056ea3349c2ee7 Mon Sep 17 00:00:00 2001 From: Jeffrey Huber Date: Mon, 30 Oct 2023 07:37:02 -0700 Subject: [PATCH 1/4] folders prototype --- docs/api/index.md | 6 ++ docs/hosted/test.mdx | 5 + docs/integrations/index.md | 13 +++ docs/integrations/langchain.md | 56 +++++++++++ docs/integrations/llama-index.md | 13 +++ docs/{integrations.md => integrationsold.md} | 1 + docs/intro.md | 1 - docs/production.md | 6 ++ docusaurus.config.js | 2 +- package.json | 2 +- sidebars.js | 100 +++++++++++++++++-- src/css/custom.css | 30 ++++++ 12 files changed, 224 insertions(+), 11 deletions(-) create mode 100644 docs/api/index.md create mode 100644 docs/hosted/test.mdx create mode 100644 docs/integrations/index.md create mode 100644 docs/integrations/langchain.md create mode 100644 docs/integrations/llama-index.md rename docs/{integrations.md => integrationsold.md} (99%) create mode 100644 docs/production.md diff --git a/docs/api/index.md b/docs/api/index.md new file mode 100644 index 0000000..456632d --- /dev/null +++ b/docs/api/index.md @@ -0,0 +1,6 @@ +--- +slug: /api +title: 🔧 API +--- + +# 🔧 API diff --git a/docs/hosted/test.mdx b/docs/hosted/test.mdx new file mode 100644 index 0000000..576d631 --- /dev/null +++ b/docs/hosted/test.mdx @@ -0,0 +1,5 @@ +--- +title: Test +--- + +Can this work? \ No newline at end of file diff --git a/docs/integrations/index.md b/docs/integrations/index.md new file mode 100644 index 0000000..efaebcf --- /dev/null +++ b/docs/integrations/index.md @@ -0,0 +1,13 @@ +--- +slug: /integrations +title: 🔌 Integrations +hide_title: true +--- + +# 🔌 Integrations + +Chroma currently has integrations with + +- LangChain +- LlamaIndex +- ... 
and many more \ No newline at end of file diff --git a/docs/integrations/langchain.md b/docs/integrations/langchain.md new file mode 100644 index 0000000..c5d9eb1 --- /dev/null +++ b/docs/integrations/langchain.md @@ -0,0 +1,56 @@ +--- +slug: /integrations/langchain +title: 🦜️🔗 Langchain +--- + +## Langchain - Python + +- [LangChain + Chroma](https://blog.langchain.dev/langchain-chroma/) on the LangChain blog +- [Harrison's `chroma-langchain` demo repo](https://github.com/hwchase17/chroma-langchain) + - [question answering over documents](https://github.com/hwchase17/chroma-langchain/blob/master/qa.ipynb) - ([Replit version](https://replit.com/@swyx/LangChainChromaStarter#main.py)) + - [to use Chroma as a persistent database](https://github.com/hwchase17/chroma-langchain/blob/master/persistent-qa.ipynb) +- Tutorials + - [Chroma and LangChain tutorial](https://github.com/grumpyp/chroma-langchain-tutorial) - The demo showcases how to pull data from the English Wikipedia using their API. The project also demonstrates how to vectorize data in chunks and get embeddings using OpenAI embeddings model. 
+ - [Create a Voice-based ChatGPT Clone That Can Search on the Internet and local files](https://betterprogramming.pub/how-to-create-a-voice-based-chatgpt-clone-that-can-search-on-the-internet-24d7f570ea8) +- [LangChain's Chroma Documentation](https://python.langchain.com/en/latest/modules/indexes/vectorstores.html?highlight=chroma#langchain.vectorstores.Chroma) + + +## Langchain - JS + +Here is an [example in LangChainJS](https://github.com/hwchase17/langchainjs/blob/main/examples/src/chains/chat_vector_db_chroma.ts) + +```javascript +import { OpenAI } from "langchain/llms/openai"; +import { ConversationalRetrievalQAChain } from "langchain/chains"; +import { Chroma } from "langchain/vectorstores/chroma"; +import { OpenAIEmbeddings } from "langchain/embeddings/openai"; +import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import * as fs from "fs"; + +// to run this first run a chroma server with `chroma run --path /path/to/data` + +export const run = async () => { + /* Initialize the LLM to use to answer the question */ + const model = new OpenAI(); + /* Load in the file we want to do question answering over */ + const text = fs.readFileSync("state_of_the_union.txt", "utf8"); + /* Split the text into chunks */ + const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 }); + const docs = await textSplitter.createDocuments([text]); + /* Create the vectorstore */ + const vectorStore = await Chroma.fromDocuments(docs, new OpenAIEmbeddings(), { + collectionName: "state_of_the_union", + }); + /* Create the chain */ + const chain = ConversationalRetrievalQAChain.fromLLM( + model, + vectorStore.asRetriever() + ); + /* Ask it a question */ + const question = "What did the president say about Justice Breyer?"; + const res = await chain.call({ question, chat_history: [] }); + console.log(res); +}; +``` + +- [LangChainJS Chroma Documentation](https://js.langchain.com/docs/modules/indexes/vector_stores/integrations/chroma) \ No newline at 
end of file diff --git a/docs/integrations/llama-index.md b/docs/integrations/llama-index.md new file mode 100644 index 0000000..972d300 --- /dev/null +++ b/docs/integrations/llama-index.md @@ -0,0 +1,13 @@ +--- +slug: /integrations/llama-index +title: 🦙 LlamaIndex +--- + +## 🦙 LlamaIndex + +> _formerly known as GPT-index_ + +- `LlamaIndex` [Vector Store page](https://gpt-index.readthedocs.io/en/latest/how_to/integrations/vector_stores.html) +- Demo: https://github.com/jerryjliu/llama_index/blob/main/docs/examples/vector_stores/ChromaIndexDemo.ipynb +- [Chroma Loader on Llamahub](https://llamahub.ai/l/chroma) + diff --git a/docs/integrations.md b/docs/integrationsold.md similarity index 99% rename from docs/integrations.md rename to docs/integrationsold.md index ddb363f..c5b36b5 100644 --- a/docs/integrations.md +++ b/docs/integrationsold.md @@ -1,5 +1,6 @@ --- sidebar_position: 7 +title: Integrationsold --- # 🔌 Integrations diff --git a/docs/intro.md b/docs/intro.md index 93120f2..a2cd227 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -1,7 +1,6 @@ --- sidebar_position: 0 slug: / -id: my-home-doc title: 🏡 Home hide_title: true --- diff --git a/docs/production.md b/docs/production.md new file mode 100644 index 0000000..3f39bc7 --- /dev/null +++ b/docs/production.md @@ -0,0 +1,6 @@ +--- +sidebar_position: 5 +title: "☁️ In Production" +--- + +# ☁️ In Production \ No newline at end of file diff --git a/docusaurus.config.js b/docusaurus.config.js index 3fe4fee..8dbccc7 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -94,7 +94,7 @@ const config = { position: 'right', }, { - href: 'https://github.com/chroma-core/chroma', + href: 'https://github.com/chroma-core/chroma', label: 'GitHub', position: 'right', }, diff --git a/package.json b/package.json index 77fffa0..0a5115e 100644 --- a/package.json +++ b/package.json @@ -4,7 +4,7 @@ "private": true, "scripts": { "docusaurus": "docusaurus", - "dev": "docusaurus start", + "dev": "docusaurus start --port 
3001", "start": "docusaurus start", "build": "docusaurus build", "swizzle": "docusaurus swizzle", diff --git a/sidebars.js b/sidebars.js index 9ab54c2..e2fc749 100644 --- a/sidebars.js +++ b/sidebars.js @@ -14,20 +14,104 @@ /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */ const sidebars = { // By default, Docusaurus generates a sidebar from the docs folder structure - tutorialSidebar: [{type: 'autogenerated', dirName: '.'}], + // tutorialSidebar: [{type: 'autogenerated', dirName: '.'}], - // But you can create a sidebar manually - /* - tutorialSidebar: [ + // But we want to manually specify the sidebar for more control + docs: [ 'intro', - 'hello', + 'getting-started', + 'usage-guide', + 'embeddings', + 'about', + 'api-reference', + 'telemetry', + 'roadmap', + 'contributing', + 'troubleshooting', + { + type: 'link', + href: '/production', + label: '☁️ Production', + className: 'category-link', + }, + { + type: 'link', + href: '/integrations', + label: '🔌 Integrations', + className: 'category-link', + }, + { + type: 'link', + href: '/api', + label: '🔧 API Docs', + className: 'category-link', + }, + ], + integrations: [ + { + type: 'link', + label: '← Home', + href: '/' + }, + 'integrations/index', { type: 'category', - label: 'Tutorial', - items: ['tutorial-basics/create-a-document'], + label: 'Integrations', + collapsed: false, + className: 'category-header', + items: [ + 'integrations/langchain', + 'integrations/llama-index', + ], }, ], - */ + api: [ + { + type: 'link', + label: '← Home', + href: '/' + }, + 'api/index', + { + type: 'category', + label: 'Python Client', + collapsed: false, + className: 'category-header', + items: [ + 'reference/Client', + 'reference/Collection', + ], + }, + { + type: 'category', + label: 'JS/TS Client', + collapsed: false, + className: 'category-header', + items: [ + 'js_reference/Client', + 'js_reference/Collection', + ], + }, + ], + production: [ + { + type: 'link', + label: '← Home', + href: '/' + }, + 
'production', + { + type: 'category', + label: 'Topics', + collapsed: false, + className: 'category-header', + items: [ + 'migration', + 'deployment', + // 'observability', + ], + }, + ] }; module.exports = sidebars; diff --git a/src/css/custom.css b/src/css/custom.css index fa1de07..b512705 100644 --- a/src/css/custom.css +++ b/src/css/custom.css @@ -558,4 +558,34 @@ div.special_table + table, th, td { .em { font-style: italic; +} + +.category-header .menu__list-item-collapsible { + pointer-events: none; +} + +.category-header .menu__link--sublist { + text-transform: uppercase; + font-size: small; + font-weight: bold; + margin-top: 40px; +} + +.category-header .menu__link--sublist-caret:after { + display: none; +} + +.category-link a::after { + content: "\2192"; + font-size: 20px; + margin-left: 5px; /* for spacing */ + transform: scaleX(-1) rotate(180deg); + margin-left: 9px; + position: absolute; + right: 30px; + opacity: 0.3; +} + +.main-wrapper { + min-height: 100vh; } \ No newline at end of file From a79665f68dd0e8a125e2a6069532bc6db02057ac Mon Sep 17 00:00:00 2001 From: Jeffrey Huber Date: Mon, 30 Oct 2023 07:39:04 -0700 Subject: [PATCH 2/4] folders protoytpe --- docs/observability.md | 1 - sidebars.js | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/observability.md b/docs/observability.md index 9c17151..78cb3d4 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -1,5 +1,4 @@ --- -sidebar_position: 8 title: "👀 Observability" --- diff --git a/sidebars.js b/sidebars.js index e2fc749..6305aa2 100644 --- a/sidebars.js +++ b/sidebars.js @@ -108,7 +108,7 @@ const sidebars = { items: [ 'migration', 'deployment', - // 'observability', + 'observability', ], }, ] From 2407e09c8edb913a25d5f4f02f07bb0c99bd98e9 Mon Sep 17 00:00:00 2001 From: Jeff Huber Date: Fri, 3 Nov 2023 18:19:10 -0700 Subject: [PATCH 3/4] Delete docs/hosted/test.mdx --- docs/hosted/test.mdx | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 
docs/hosted/test.mdx diff --git a/docs/hosted/test.mdx b/docs/hosted/test.mdx deleted file mode 100644 index 576d631..0000000 --- a/docs/hosted/test.mdx +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Test ---- - -Can this work? \ No newline at end of file From e36b5026fa2b7c51e752784a0f0149e0d26d4b2b Mon Sep 17 00:00:00 2001 From: Jeffrey Huber Date: Mon, 6 Nov 2023 07:21:15 -0800 Subject: [PATCH 4/4] folders that is mergable --- docs/api/index.md | 28 ++++ docs/embeddings.md | 228 +++----------------------------- docs/embeddings/cohere.md | 96 ++++++++++++++ docs/embeddings/google-palm.md | 14 ++ docs/embeddings/hugging-face.md | 35 +++++ docs/embeddings/instructor.md | 19 +++ docs/embeddings/openai.md | 61 +++++++++ docs/integrations/index.md | 21 ++- sidebars.js | 57 ++++---- 9 files changed, 322 insertions(+), 237 deletions(-) create mode 100644 docs/embeddings/cohere.md create mode 100644 docs/embeddings/google-palm.md create mode 100644 docs/embeddings/hugging-face.md create mode 100644 docs/embeddings/instructor.md create mode 100644 docs/embeddings/openai.md diff --git a/docs/api/index.md b/docs/api/index.md index 456632d..7ad0d2c 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -4,3 +4,31 @@ title: 🔧 API --- # 🔧 API + +## Client APIs + +Chroma currently maintains first-party clients for Python and JavaScript. For other clients in other languages, use their repos for documentation. + +`Client` - is the object that wraps a connection to a backing Chroma DB + +`Collection` - is the object that wraps a collection + + +
+ +| | Client | Collection | +|--------------|-----------|---------------| +| Python | [Client](/reference/Client) | [Collection](/reference/Collection) | +| Javascript | [Client](/js_reference/Client) | [Collection](/js_reference/Collection) | + +*** + +## Backend API + +Chroma's backend Swagger REST API docs are viewable by running Chroma and navigating to `http://localhost:8000/docs`. + +``` +pip install chromadb +chroma run +open http://localhost:8000/docs +``` \ No newline at end of file diff --git a/docs/embeddings.md b/docs/embeddings.md index c6031f2..96f5163 100644 --- a/docs/embeddings.md +++ b/docs/embeddings.md @@ -4,41 +4,35 @@ sidebar_position: 4 # 🧬 Embeddings -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -
Select a language
- - - - - - -*** - Embeddings are the A.I-native way to represent any kind of data, making them the perfect fit for working with all kinds of A.I-powered tools and algorithms. They can represent text, images, and soon audio and video. There are many options for creating embeddings, whether locally using an installed library, or by calling an API. Chroma provides lightweight wrappers around popular embedding providers, making it easy to use them in your apps. You can set an embedding function when you create a Chroma collection, which will be used automatically, or you can call them directly yourself. - - +
+ -To get Chroma's embedding functions, import the `chromadb.utils.embedding_functions` module. +| | Python | JS | +|--------------|-----------|---------------| +| [Default](#default-all-minilm-l6-v2) | ✅ | ➖ | +| [OpenAI](/embeddings/openai) | ✅ | ✅ | +| [Cohere](/embeddings/cohere) | ✅ | ✅ | +| [Google PaLM](/embeddings/google-palm) | ✅ | ➖ | +| [Hugging Face](/embeddings/hugging-face) | ✅ | ➖ | +| [Instructor](/embeddings/instructor) | ✅ | ➖ | -```python -from chromadb.utils import embedding_functions -``` +We welcome pull requests to add new Embedding Functions to the community. +*** ## Default: all-MiniLM-L6-v2 By default, Chroma uses the [Sentence Transformers](https://www.sbert.net/) `all-MiniLM-L6-v2` model to create embeddings. This embedding model can create sentence and document embeddings that can be used for a wide variety of tasks. This embedding function runs locally on your machine, and may require you download the model files (this will happen automatically). ```python +from chromadb.utils import embedding_functions default_ef = embedding_functions.DefaultEmbeddingFunction() ``` -:::tip +:::note Embedding functions can linked to a collection, which are used whenever you call `add`, `update`, `upsert` or `query`. You can also be use them directly which can be handy for debugging. ```py val = default_ef(["foo"]) @@ -46,11 +40,6 @@ val = default_ef(["foo"]) -> [[0.05035809800028801, 0.0626462921500206, -0.061827320605516434...]] ::: -
- - - - - -
- @@ -105,192 +91,21 @@ You can pass in an optional `model_name` argument, which lets you choose which S -## OpenAI - -Chroma provides a convenient wrapper around OpenAI's embedding API. This embedding function runs remotely on OpenAI's servers, and requires an API key. You can get an API key by signing up for an account at [OpenAI](https://openai.com/api/). - - - - -This embedding function relies on the `openai` python package, which you can install with `pip install openai`. - -```python -openai_ef = embedding_functions.OpenAIEmbeddingFunction( - api_key="YOUR_API_KEY", - model_name="text-embedding-ada-002" - ) -``` - -To use the OpenAI embedding models on other platforms such as Azure, you can use the `api_base` and `api_type` parameters: -```python -openai_ef = embedding_functions.OpenAIEmbeddingFunction( - api_key="YOUR_API_KEY", - api_base="YOUR_API_BASE_PATH", - api_type="azure", - api_version="YOUR_API_VERSION", - model_name="text-embedding-ada-002" - ) -``` - - - - -```javascript -const {OpenAIEmbeddingFunction} = require('chromadb'); -const embedder = new OpenAIEmbeddingFunction({openai_api_key: "apiKey"}) - -// use directly -const embeddings = embedder.generate(["document1","document2"]) - -// pass documents to query for .add and .query -const collection = await client.createCollection({name: "name", embeddingFunction: embedder}) -const collection = await client.getCollection({name: "name", embeddingFunction: embedder}) -``` - - - - - - -You can pass in an optional `model_name` argument, which lets you choose which OpenAI embeddings model to use. By default, Chroma uses `text-embedding-ada-002`. You can see a list of all available models [here](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings). - -## Cohere - -Chroma also provides a convenient wrapper around Cohere's embedding API. This embedding function runs remotely on Cohere’s servers, and requires an API key. 
You can get an API key by signing up for an account at [Cohere](https://dashboard.cohere.ai/welcome/register). - - - - -This embedding function relies on the `cohere` python package, which you can install with `pip install cohere`. - -```python -cohere_ef = embedding_functions.CohereEmbeddingFunction(api_key="YOUR_API_KEY", model_name="large") -cohere_ef(texts=["document1","document2"]) -``` - - - - -```javascript -const {CohereEmbeddingFunction} = require('chromadb'); -const embedder = new CohereEmbeddingFunction("apiKey") - -// use directly -const embeddings = embedder.generate(["document1","document2"]) - -// pass documents to query for .add and .query -const collection = await client.createCollection({name: "name", embeddingFunction: embedder}) -const collectionGet = await client.getCollection({name:"name", embeddingFunction: embedder}) -``` - - - - - - - -You can pass in an optional `model_name` argument, which lets you choose which Cohere embeddings model to use. By default, Chroma uses `large` model. You can see the available models under `Get embeddings` section [here](https://docs.cohere.ai/reference/embed). - -### Multilingual model example - - - - -```python -cohere_ef = embedding_functions.CohereEmbeddingFunction( - api_key="YOUR_API_KEY", - model_name="multilingual-22-12") - -multilingual_texts = [ 'Hello from Cohere!', 'مرحبًا من كوهير!', - 'Hallo von Cohere!', 'Bonjour de Cohere!', - '¡Hola desde Cohere!', 'Olá do Cohere!', - 'Ciao da Cohere!', '您好,来自 Cohere!', - 'कोहेरे से नमस्ते!' ] - -cohere_ef(texts=multilingual_texts) - -``` - - - - -```javascript -const {CohereEmbeddingFunction} = require('chromadb'); -const embedder = new CohereEmbeddingFunction("apiKey") - -multilingual_texts = [ 'Hello from Cohere!', 'مرحبًا من كوهير!', - 'Hallo von Cohere!', 'Bonjour de Cohere!', - '¡Hola desde Cohere!', 'Olá do Cohere!', - 'Ciao da Cohere!', '您好,来自 Cohere!', - 'कोहेरे से नमस्ते!' 
] - -const embeddings = embedder.generate(multilingual_texts) - -``` - - - - - - - - -For more information on multilingual model you can read [here](https://docs.cohere.ai/docs/multilingual-language-models). - -## Instructor models - -The [instructor-embeddings](https://github.com/HKUNLP/instructor-embedding) library is another option, especially when running on a machine with a cuda-capable GPU. They are a good local alternative to OpenAI (see the [Massive Text Embedding Benchmark](https://huggingface.co/blog/mteb) rankings). The embedding function requires the InstructorEmbedding package. To install it, run ```pip install InstructorEmbedding```. - -There are three models available. The default is `hkunlp/instructor-base`, and for better performance you can use `hkunlp/instructor-large` or `hkunlp/instructor-xl`. You can also specify whether to use `cpu` (default) or `cuda`. For example: - -```python -#uses base model and cpu -ef = embedding_functions.InstructorEmbeddingFunction() -``` -or -```python -ef = embedding_functions.InstructorEmbeddingFunction( -model_name="hkunlp/instructor-xl", device="cuda") -``` -Keep in mind that the large and xl models are 1.5GB and 5GB respectively, and are best suited to running on a GPU. - -## Google PaLM API models - -[Google PaLM APIs](https://developers.googleblog.com/2023/03/announcing-palm-api-and-makersuite.html) are currently in private preview, but if you are part of this preview, you can use them with Chroma via the `GooglePalmEmbeddingFunction`. - -To use the PaLM embedding API, you must have `google.generativeai` Python package installed and have the API key. To use: - -```python -palm_embedding = embedding_functions.GooglePalmEmbeddingFunction( - api_key=api_key, model=model_name) - -``` - -## HuggingFace - -Chroma also provides a convenient wrapper around HuggingFace's embedding API. This embedding function runs remotely on HuggingFace's servers, and requires an API key. 
You can get an API key by signing up for an account at [HuggingFace](https://huggingface.co/). +*** - - -This embedding function relies on the `requests` python package, which you can install with `pip install requests`. +## Custom Embedding Functions -```python -huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction( - api_key="YOUR_API_KEY", - model_name="sentence-transformers/all-MiniLM-L6-v2" -) -``` +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; -You can pass in an optional `model_name` argument, which lets you choose which HuggingFace model to use. By default, Chroma uses `sentence-transformers/all-MiniLM-L6-v2`. You can see a list of all available models [here](https://huggingface.co/models). +
Select a language
-
- - + + + -## Custom Embedding Functions - @@ -332,4 +147,3 @@ class MyEmbeddingFunction { -We welcome pull requests to add new Embedding Functions to the community. diff --git a/docs/embeddings/cohere.md b/docs/embeddings/cohere.md new file mode 100644 index 0000000..8831c2c --- /dev/null +++ b/docs/embeddings/cohere.md @@ -0,0 +1,96 @@ +--- +--- + +# Cohere + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +
Select a language
+ + + + + + +Chroma also provides a convenient wrapper around Cohere's embedding API. This embedding function runs remotely on Cohere’s servers, and requires an API key. You can get an API key by signing up for an account at [Cohere](https://dashboard.cohere.ai/welcome/register). + + + + +This embedding function relies on the `cohere` python package, which you can install with `pip install cohere`. + +```python +cohere_ef = embedding_functions.CohereEmbeddingFunction(api_key="YOUR_API_KEY", model_name="large") +cohere_ef(texts=["document1","document2"]) +``` + + + + +```javascript +const {CohereEmbeddingFunction} = require('chromadb'); +const embedder = new CohereEmbeddingFunction("apiKey") + +// use directly +const embeddings = embedder.generate(["document1","document2"]) + +// pass documents to query for .add and .query +const collection = await client.createCollection({name: "name", embeddingFunction: embedder}) +const collectionGet = await client.getCollection({name:"name", embeddingFunction: embedder}) +``` + + + + + + + +You can pass in an optional `model_name` argument, which lets you choose which Cohere embeddings model to use. By default, Chroma uses `large` model. You can see the available models under `Get embeddings` section [here](https://docs.cohere.ai/reference/embed). + + +### Multilingual model example + + + + +```python +cohere_ef = embedding_functions.CohereEmbeddingFunction( + api_key="YOUR_API_KEY", + model_name="multilingual-22-12") + +multilingual_texts = [ 'Hello from Cohere!', 'مرحبًا من كوهير!', + 'Hallo von Cohere!', 'Bonjour de Cohere!', + '¡Hola desde Cohere!', 'Olá do Cohere!', + 'Ciao da Cohere!', '您好,来自 Cohere!', + 'कोहेरे से नमस्ते!' 
] + +cohere_ef(texts=multilingual_texts) + +``` + + + + +```javascript +const {CohereEmbeddingFunction} = require('chromadb'); +const embedder = new CohereEmbeddingFunction("apiKey") + +multilingual_texts = [ 'Hello from Cohere!', 'مرحبًا من كوهير!', + 'Hallo von Cohere!', 'Bonjour de Cohere!', + '¡Hola desde Cohere!', 'Olá do Cohere!', + 'Ciao da Cohere!', '您好,来自 Cohere!', + 'कोहेरे से नमस्ते!' ] + +const embeddings = embedder.generate(multilingual_texts) + +``` + + + + + + + + +For more information on multilingual model you can read [here](https://docs.cohere.ai/docs/multilingual-language-models). \ No newline at end of file diff --git a/docs/embeddings/google-palm.md b/docs/embeddings/google-palm.md new file mode 100644 index 0000000..2cfb6cf --- /dev/null +++ b/docs/embeddings/google-palm.md @@ -0,0 +1,14 @@ +--- +--- + +# Google PaLM + +[Google PaLM APIs](https://developers.googleblog.com/2023/03/announcing-palm-api-and-makersuite.html) are currently in private preview, but if you are part of this preview, you can use them with Chroma via the `GooglePalmEmbeddingFunction`. + +To use the PaLM embedding API, you must have `google.generativeai` Python package installed and have the API key. To use: + +```python +palm_embedding = embedding_functions.GooglePalmEmbeddingFunction( + api_key=api_key, model=model_name) + +``` \ No newline at end of file diff --git a/docs/embeddings/hugging-face.md b/docs/embeddings/hugging-face.md new file mode 100644 index 0000000..2303fe7 --- /dev/null +++ b/docs/embeddings/hugging-face.md @@ -0,0 +1,35 @@ +--- +--- + +# Hugging Face + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +
Select a language
+ + + + + + +Chroma also provides a convenient wrapper around HuggingFace's embedding API. This embedding function runs remotely on HuggingFace's servers, and requires an API key. You can get an API key by signing up for an account at [HuggingFace](https://huggingface.co/). + + + + +This embedding function relies on the `requests` python package, which you can install with `pip install requests`. + +```python +huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction( + api_key="YOUR_API_KEY", + model_name="sentence-transformers/all-MiniLM-L6-v2" +) +``` + +You can pass in an optional `model_name` argument, which lets you choose which HuggingFace model to use. By default, Chroma uses `sentence-transformers/all-MiniLM-L6-v2`. You can see a list of all available models [here](https://huggingface.co/models). + + + + + \ No newline at end of file diff --git a/docs/embeddings/instructor.md b/docs/embeddings/instructor.md new file mode 100644 index 0000000..6029ed4 --- /dev/null +++ b/docs/embeddings/instructor.md @@ -0,0 +1,19 @@ +--- +--- + +# Instructor + +The [instructor-embeddings](https://github.com/HKUNLP/instructor-embedding) library is another option, especially when running on a machine with a cuda-capable GPU. They are a good local alternative to OpenAI (see the [Massive Text Embedding Benchmark](https://huggingface.co/blog/mteb) rankings). The embedding function requires the InstructorEmbedding package. To install it, run ```pip install InstructorEmbedding```. + +There are three models available. The default is `hkunlp/instructor-base`, and for better performance you can use `hkunlp/instructor-large` or `hkunlp/instructor-xl`. You can also specify whether to use `cpu` (default) or `cuda`. 
For example: + +```python +#uses base model and cpu +ef = embedding_functions.InstructorEmbeddingFunction() +``` +or +```python +ef = embedding_functions.InstructorEmbeddingFunction( +model_name="hkunlp/instructor-xl", device="cuda") +``` +Keep in mind that the large and xl models are 1.5GB and 5GB respectively, and are best suited to running on a GPU. \ No newline at end of file diff --git a/docs/embeddings/openai.md b/docs/embeddings/openai.md new file mode 100644 index 0000000..8510205 --- /dev/null +++ b/docs/embeddings/openai.md @@ -0,0 +1,61 @@ +--- +--- + +# OpenAI + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +
Select a language
+ + + + + + +Chroma provides a convenient wrapper around OpenAI's embedding API. This embedding function runs remotely on OpenAI's servers, and requires an API key. You can get an API key by signing up for an account at [OpenAI](https://openai.com/api/). + + + + +This embedding function relies on the `openai` python package, which you can install with `pip install openai`. + +```python +openai_ef = embedding_functions.OpenAIEmbeddingFunction( + api_key="YOUR_API_KEY", + model_name="text-embedding-ada-002" + ) +``` + +To use the OpenAI embedding models on other platforms such as Azure, you can use the `api_base` and `api_type` parameters: +```python +openai_ef = embedding_functions.OpenAIEmbeddingFunction( + api_key="YOUR_API_KEY", + api_base="YOUR_API_BASE_PATH", + api_type="azure", + api_version="YOUR_API_VERSION", + model_name="text-embedding-ada-002" + ) +``` + + + + +```javascript +const {OpenAIEmbeddingFunction} = require('chromadb'); +const embedder = new OpenAIEmbeddingFunction({openai_api_key: "apiKey"}) + +// use directly +const embeddings = embedder.generate(["document1","document2"]) + +// pass documents to query for .add and .query +const collection = await client.createCollection({name: "name", embeddingFunction: embedder}) +const collectionGet = await client.getCollection({name: "name", embeddingFunction: embedder}) +``` + + + + + + +You can pass in an optional `model_name` argument, which lets you choose which OpenAI embeddings model to use. By default, Chroma uses `text-embedding-ada-002`. You can see a list of all available models [here](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings). \ No newline at end of file diff --git a/docs/integrations/index.md b/docs/integrations/index.md index efaebcf..3f2f272 100644 --- a/docs/integrations/index.md +++ b/docs/integrations/index.md @@ -6,8 +6,21 @@ hide_title: true # 🔌 Integrations -Chroma currently has integrations with +Chroma maintains integrations with many popular tools.
These tools can be used to define the business logic of an AI-native application, curate data, fine-tune embedding spaces and more. -- LangChain -- LlamaIndex -- ... and many more \ No newline at end of file +We welcome pull requests to add new Integrations to the community. + +
+ +| | Python | JS | +|--------------|-----------|---------------| +| [🦜️🔗 Langchain](/integrations/langchain) | ✅ | ✅ | +| [🦙 LlamaIndex](/integrations/llama-index) | ✅ | :soon: | + +*Coming soon* - integrations with LangSmith, JinaAI, Braintrust and more. + +*** + +### 🧬 Embeddings + +Are you looking for how Chroma integrates with embeddings providers like OpenAI, Cohere, Google PaLM and others? View [🧬 Embeddings](/embeddings) integrations. \ No newline at end of file diff --git a/sidebars.js b/sidebars.js index 6305aa2..d097e3d 100644 --- a/sidebars.js +++ b/sidebars.js @@ -21,19 +21,22 @@ const sidebars = { 'intro', 'getting-started', 'usage-guide', - 'embeddings', + { + type: 'link', + href: '/embeddings', + label: '🧬 Embeddings', + className: 'category-link', + }, 'about', 'api-reference', 'telemetry', 'roadmap', 'contributing', 'troubleshooting', - { - type: 'link', - href: '/production', - label: '☁️ Production', - className: 'category-link', - }, + 'migration', + 'deployment', + 'observability', + { type: 'link', href: '/integrations', @@ -65,6 +68,27 @@ const sidebars = { ], }, ], + embeddings: [ + { + type: 'link', + label: '← Home', + href: '/' + }, + 'embeddings', + { + type: 'category', + label: 'Integrations', + collapsed: false, + className: 'category-header', + items: [ + 'embeddings/openai', + 'embeddings/cohere', + 'embeddings/hugging-face', + 'embeddings/google-palm', + 'embeddings/instructor' + ], + }, + ], api: [ { type: 'link', @@ -93,25 +117,6 @@ const sidebars = { ], }, ], - production: [ - { - type: 'link', - label: '← Home', - href: '/' - }, - 'production', - { - type: 'category', - label: 'Topics', - collapsed: false, - className: 'category-header', - items: [ - 'migration', - 'deployment', - 'observability', - ], - }, - ] }; module.exports = sidebars;