mirror of
https://github.com/placeholder-soft/chroma.git
synced 2026-01-12 22:44:55 +08:00
## Description of changes Base PR to release sqlite refactor, which spans many stacked PRs. Remaining - [x] Merge this to main - [x] Layered Persistent Index #761 - [x] Remove old impls (In #781 ) - [x] Remove persist() API (In #787) - [x] Add telemetry to SegmentAPI, it was not included. (#788) - [x] New clients #805 - [x] locking and soak tests for thread-safety - [x] Migration tool - [x] Fix #739 - [x] Fix metadata None vs empty - [x] Fix persist directory (addressed in #761) - [x] Leave files open in #761 (merge stacked PR) Post Release - [ ] Un xfail cross version tests once we cut the release - [x] Documentation updates for new silent ADD failure. - [x] Update all documentation for new API instantiation - [x] Update all documentation for settings changes - [ ] Update terraform deployment - [ ] Update cloudformation deployment --------- Co-authored-by: Luke VanderHart <luke@vanderhart.net> Co-authored-by: Jeffrey Huber <jeff@trychroma.com> Co-authored-by: Anton Troynikov <atroyn@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sebastian Sosa <37946988+CakeCrusher@users.noreply.github.com> Co-authored-by: Russell Pollari <russell@sharpestminds.com> Co-authored-by: russell-pollari <pollarir@mgail.com>
326 lines
9.9 KiB
Plaintext
326 lines
9.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
" # Alternative Embeddings\n",
|
|
" \n",
|
|
" This notebook demonstrates how to use alternative embedding functions.\n",
|
|
" "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import chromadb"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"client = chromadb.Client()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from chromadb.utils import embedding_functions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Using OpenAI Embeddings. This assumes you have the openai package installed\n",
|
|
"openai_ef = embedding_functions.OpenAIEmbeddingFunction(\n",
|
|
" api_key=\"OPENAI_KEY\", # Replace with your own OpenAI API key\n",
|
|
" model_name=\"text-embedding-ada-002\"\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create a new chroma collection\n",
|
|
"openai_collection = client.get_or_create_collection(name=\"openai_embeddings\", embedding_function=openai_ef)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"openai_collection.add(\n",
|
|
" documents=[\"This is a document\", \"This is another document\"],\n",
|
|
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
|
|
" ids=[\"id1\", \"id2\"]\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'ids': [['id1', 'id2']],\n",
|
|
" 'distances': [[0.1385088860988617, 0.2017185091972351]],\n",
|
|
" 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n",
|
|
" 'embeddings': None,\n",
|
|
" 'documents': [['This is a document', 'This is another document']]}"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"results = openai_collection.query(\n",
|
|
" query_texts=[\"This is a query document\"],\n",
|
|
" n_results=2\n",
|
|
")\n",
|
|
"results"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Using Cohere Embeddings. This assumes you have the cohere package installed\n",
|
|
"cohere_ef = embedding_functions.CohereEmbeddingFunction(\n",
|
|
" api_key=\"COHERE_API_KEY\", \n",
|
|
" model_name=\"large\"\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create a new chroma collection\n",
|
|
"cohere_collection = client.create_collection(name=\"cohere_embeddings\", embedding_function=cohere_ef)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cohere_collection.add(\n",
|
|
" documents=[\"This is a document\", \"This is another document\"],\n",
|
|
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
|
|
" ids=[\"id1\", \"id2\"]\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'ids': [['id1', 'id2']],\n",
|
|
" 'embeddings': None,\n",
|
|
" 'documents': [['This is a document', 'This is another document']],\n",
|
|
" 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n",
|
|
" 'distances': [[4343.1328125, 5653.28759765625]]}"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"results = cohere_collection.query(\n",
|
|
" query_texts=[\"This is a query document\"],\n",
|
|
" n_results=2\n",
|
|
")\n",
|
|
"results"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Using Instructor models. The embedding function requires the InstructorEmbedding package. \n",
|
|
"# To install it, run pip install InstructorEmbedding\n",
|
|
"\n",
|
|
"\n",
|
|
"# Uses the base model and CPU\n",
|
|
"instructor_ef = embedding_functions.InstructorEmbeddingFunction() \n",
|
|
"\n",
|
|
"# For task specific embeddings, add an instruction\n",
|
|
"# instructor_ef = embedding_functions.InstructorEmbeddingFunction(\n",
|
|
"# instruction=\"Represent the Wikipedia document for retrieval: \"\n",
|
|
"# )\n",
|
|
"\n",
|
|
"# Uses hkunlp/instructor-xl model and GPU\n",
|
|
"#instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name=\"hkunlp/instructor-xl\", device=\"cuda\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create a collection with the instructor embedding function\n",
|
|
"instructor_collection = client.create_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"instructor_collection.add(\n",
|
|
" documents=[\"This is a document\", \"This is another document\"],\n",
|
|
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
|
|
" ids=[\"id1\", \"id2\"]\n",
|
|
")\n",
|
|
"\n",
|
|
"# Adding documents with an instruction\n",
|
|
"# instructor_ef = embedding_functions.InstructorEmbeddingFunction(\n",
|
|
"# instruction=\"Represent the Science sentence: \"\n",
|
|
"# )\n",
|
|
"# instructor_collection = client.create_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)\n",
|
|
"# instructor_collection.add(documents=[\"Parton energy loss in QCD matter\"], ids=[\"id1\"])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"results = instructor_collection.query(\n",
|
|
" query_texts=[\"This is a query document\"],\n",
|
|
" n_results=2\n",
|
|
")\n",
|
|
"results\n",
|
|
"\n",
|
|
"# Querying with an instruction\n",
|
|
"# instructor_ef = embedding_functions.InstructorEmbeddingFunction(instruction=\"Represent the Wikipedia question for retrieving supporting documents: \")\n",
|
|
"# instructor_collection = client.get_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)\n",
|
|
"# results = instructor_collection.query(query_texts=[\"where is the food stored in a yam plant\"])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Using HuggingFace models. The embedding function requires a HuggingFace api_key\n",
|
|
"huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(\n",
|
|
" api_key=\"HUGGINGFACE_API_KEY\", # Replace with your own HuggingFace API key\n",
|
|
" model_name=\"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create a new HuggingFace collection\n",
|
|
"huggingface_collection = client.create_collection(name=\"huggingface_embeddings\", embedding_function=huggingface_ef)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"huggingface_collection.add(\n",
|
|
" documents=[\"This is a document\", \"This is another document\"],\n",
|
|
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
|
|
" ids=[\"id1\", \"id2\"]\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'ids': [['id1', 'id2']],\n",
|
|
" 'embeddings': None,\n",
|
|
" 'documents': [['This is a document', 'This is another document']],\n",
|
|
" 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n",
|
|
" 'distances': [[0.7111215591430664, 1.010978102684021]]}"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"results = huggingface_collection.query(\n",
|
|
" query_texts=[\"This is a query document\"],\n",
|
|
" n_results=2\n",
|
|
")\n",
|
|
"results"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.8"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|