chroma/examples/basic_functionality/alternative_embeddings.ipynb
Hammad Bashir 46de47945a SQLite Release PR (#808)
## Description of changes
Base PR to release sqlite refactor, which spans many stacked PRs.

Remaining
- [x] Merge this to main
- [x] Layered Persistent Index #761 
- [x] Remove old impls (In #781 )
- [x] Remove persist() API (In #787)
- [x] Add telemetry to SegmentAPI, it was not included. (#788)
- [x] New clients #805 
- [x] locking and soak tests for thread-safety 
- [x] Migration tool
- [x] Fix #739 
- [x] Fix metadata None vs empty
- [x] Fix persist directory (addressed in #761)
- [x] Leave files open in #761 (merge stacked PR)

Post Release
- [ ] Un-xfail cross-version tests once we cut the release
- [x] Documentation updates for new silent ADD failure.
- [x] Update all documentation for new API instantiation (see the sketch after this list)
- [x] Update all documentation for settings changes
- [ ] Update terraform deployment
- [ ] Update cloudformation deployment
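
For reference, a minimal sketch of the new client instantiation that the documentation items above refer to (illustrative only, not the authoritative docs wording):

```python
import chromadb

# New-style persistent client: data is stored on disk in the given directory (SQLite-backed).
client = chromadb.PersistentClient(path="./chroma")

# New-style client for a Chroma server reached over HTTP.
# client = chromadb.HttpClient(host="localhost", port=8000)
```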

---------

Co-authored-by: Luke VanderHart <luke@vanderhart.net>
Co-authored-by: Jeffrey Huber <jeff@trychroma.com>
Co-authored-by: Anton Troynikov <atroyn@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Sebastian Sosa <37946988+CakeCrusher@users.noreply.github.com>
Co-authored-by: Russell Pollari <russell@sharpestminds.com>
Co-authored-by: russell-pollari <pollarir@mgail.com>
2023-07-17 14:21:34 -07:00


{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
" # Alternative Embeddings\n",
" \n",
" This notebook demonstrates how to use alternative embedding functions.\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import chromadb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"client = chromadb.Client()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from chromadb.utils import embedding_functions"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Using OpenAI Embeddings. This assumes you have the openai package installed\n",
"openai_ef = embedding_functions.OpenAIEmbeddingFunction(\n",
" api_key=\"OPENAI_KEY\", # Replace with your own OpenAI API key\n",
" model_name=\"text-embedding-ada-002\"\n",
")"
]
},
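{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (not part of the original walkthrough): Chroma embedding functions are plain callables that map a list of texts to a list of embedding vectors, so any of the functions in this notebook can be invoked directly to inspect their output. The sketch below assumes the OpenAI key configured above is valid."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: call the embedding function directly on a list of texts.\n",
"# Requires a valid OpenAI API key in the cell above.\n",
"embeddings = openai_ef([\"This is a document\"])\n",
"print(len(embeddings), len(embeddings[0]))  # number of vectors, embedding dimensionality"
]
},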
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Create a new chroma collection\n",
"openai_collection = client.get_or_create_collection(name=\"openai_embeddings\", embedding_function=openai_ef)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"openai_collection.add(\n",
" documents=[\"This is a document\", \"This is another document\"],\n",
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
" ids=[\"id1\", \"id2\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ids': [['id1', 'id2']],\n",
" 'distances': [[0.1385088860988617, 0.2017185091972351]],\n",
" 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n",
" 'embeddings': None,\n",
" 'documents': [['This is a document', 'This is another document']]}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results = openai_collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2\n",
")\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Using Cohere Embeddings. This assumes you have the cohere package installed\n",
"cohere_ef = embedding_functions.CohereEmbeddingFunction(\n",
" api_key=\"COHERE_API_KEY\", \n",
" model_name=\"large\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Create a new chroma collection\n",
"cohere_collection = client.create_collection(name=\"cohere_embeddings\", embedding_function=cohere_ef)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"cohere_collection.add(\n",
" documents=[\"This is a document\", \"This is another document\"],\n",
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
" ids=[\"id1\", \"id2\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ids': [['id1', 'id2']],\n",
" 'embeddings': None,\n",
" 'documents': [['This is a document', 'This is another document']],\n",
" 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n",
" 'distances': [[4343.1328125, 5653.28759765625]]}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results = cohere_collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2\n",
")\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Using Instructor models. The embedding function requires the InstructorEmbedding package. \n",
"# To install it, run pip install InstructorEmbedding\n",
"\n",
"\n",
"#uses base model and cpu\n",
"instructor_ef = embedding_functions.InstructorEmbeddingFunction() \n",
"\n",
"# For task specific embeddings, add an instruction\n",
"# instructor_ef = embedding_functions.InstructorEmbeddingFunction(\n",
"# instruction=\"Represent the Wikipedia document for retrieval: \"\n",
"# )\n",
"\n",
"# Uses hkunlp/instructor-xl model and GPU\n",
"#instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name=\"hkunlp/instructor-xl\", device=\"cuda\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a collection with the instructor embedding function\n",
"instructor_collection = client.create_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"instructor_collection.add(\n",
" documents=[\"This is a document\", \"This is another document\"],\n",
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
" ids=[\"id1\", \"id2\"]\n",
")\n",
"\n",
"# Adding documents with an instruction\n",
"# instructor_ef = embedding_functions.InstructorEmbeddingFunction(\n",
"# instruction=\"Represent the Science sentence: \"\n",
"# )\n",
"# instructor_collection = client.create_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)\n",
"# instructor_collection.add(documents=[\"Parton energy loss in QCD matter\"], ids=[\"id1\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = instructor_collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2\n",
")\n",
"results\n",
"\n",
"# Querying with an instruction\n",
"# instructor_ef = embedding_functions.InstructorEmbeddingFunction(instruction=\"Represent the Wikipedia question for retrieving supporting documents: \")\n",
"# instructor_collection = client.get_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)\n",
"# results = instructor_collection.query(query_texts=[\"where is the food stored in a yam plant\"])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Using HuggingFace models. The embedding function a huggingface api_key\n",
"huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(\n",
" api_key=\"HUGGINGFACE_API_KEY\", # Replace with your own HuggingFace API key\n",
" model_name=\"sentence-transformers/all-MiniLM-L6-v2\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Create a new HuggingFace collection\n",
"huggingface_collection = client.create_collection(name=\"huggingface_embeddings\", embedding_function=huggingface_ef)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"huggingface_collection.add(\n",
" documents=[\"This is a document\", \"This is another document\"],\n",
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
" ids=[\"id1\", \"id2\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ids': [['id1', 'id2']],\n",
" 'embeddings': None,\n",
" 'documents': [['This is a document', 'This is another document']],\n",
" 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n",
" 'distances': [[0.7111215591430664, 1.010978102684021]]}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results = huggingface_collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2\n",
")\n",
"results"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}