chroma/examples/basic_functionality/alternative_embeddings.ipynb
Hammad Bashir 46de47945a SQLite Release PR (#808)
## Description of changes
Base PR to release sqlite refactor, which spans many stacked PRs.

Remaining
- [x] Merge this to main
- [x] Layered Persistent Index #761 
- [x] Remove old impls (In #781 )
- [x] Remove persist() API (In #787)
- [x] Add telemetry to SegmentAPI, it was not included. (#788)
- [x] New clients #805 
- [x] locking and soak tests for thread-safety 
- [x] Migration tool
- [x] Fix #739 
- [x] Fix metadata None vs empty
- [x] Fix persist directory (addressed in #761)
- [x] Leave files open in #761 (merge stacked PR)

Post Release
- [ ] Un-xfail cross-version tests once we cut the release
- [x] Documentation updates for new silent ADD failure.
- [x] Update all documentation for new API instantiation (see the sketch after this list)
- [x] Update all documentation for settings changes
- [ ] Update terraform deployment
- [ ] Update cloudformation deployment
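
For reference, a minimal sketch of the new client instantiation that the documentation items above refer to (illustrative only, not the authoritative docs wording):

```python
import chromadb

# New-style persistent client: data is stored on disk in the given directory (SQLite-backed).
client = chromadb.PersistentClient(path="./chroma")

# New-style client for a Chroma server reached over HTTP.
# client = chromadb.HttpClient(host="localhost", port=8000)
```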

---------

Co-authored-by: Luke VanderHart <luke@vanderhart.net>
Co-authored-by: Jeffrey Huber <jeff@trychroma.com>
Co-authored-by: Anton Troynikov <atroyn@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Sebastian Sosa <37946988+CakeCrusher@users.noreply.github.com>
Co-authored-by: Russell Pollari <russell@sharpestminds.com>
Co-authored-by: russell-pollari <pollarir@mgail.com>
2023-07-17 14:21:34 -07:00


{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
" # Alternative Embeddings\n",
" \n",
" This notebook demonstrates how to use alternative embedding functions.\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import chromadb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"client = chromadb.Client()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from chromadb.utils import embedding_functions"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Using OpenAI Embeddings. This assumes you have the openai package installed\n",
"openai_ef = embedding_functions.OpenAIEmbeddingFunction(\n",
" api_key=\"OPENAI_KEY\", # Replace with your own OpenAI API key\n",
" model_name=\"text-embedding-ada-002\"\n",
")"
]
},
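{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (not part of the original walkthrough): Chroma embedding functions are plain callables that map a list of texts to a list of embedding vectors, so any of the functions in this notebook can be invoked directly to inspect their output. The sketch below assumes the OpenAI key configured above is valid."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: call the embedding function directly on a list of texts.\n",
"# Requires a valid OpenAI API key in the cell above.\n",
"embeddings = openai_ef([\"This is a document\"])\n",
"print(len(embeddings), len(embeddings[0]))  # number of vectors, embedding dimensionality"
]
},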
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Create a new chroma collection\n",
"openai_collection = client.get_or_create_collection(name=\"openai_embeddings\", embedding_function=openai_ef)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"openai_collection.add(\n",
" documents=[\"This is a document\", \"This is another document\"],\n",
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
" ids=[\"id1\", \"id2\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ids': [['id1', 'id2']],\n",
" 'distances': [[0.1385088860988617, 0.2017185091972351]],\n",
" 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n",
" 'embeddings': None,\n",
" 'documents': [['This is a document', 'This is another document']]}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results = openai_collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2\n",
")\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Using Cohere Embeddings. This assumes you have the cohere package installed\n",
"cohere_ef = embedding_functions.CohereEmbeddingFunction(\n",
" api_key=\"COHERE_API_KEY\", \n",
" model_name=\"large\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Create a new chroma collection\n",
"cohere_collection = client.create_collection(name=\"cohere_embeddings\", embedding_function=cohere_ef)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"cohere_collection.add(\n",
" documents=[\"This is a document\", \"This is another document\"],\n",
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
" ids=[\"id1\", \"id2\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ids': [['id1', 'id2']],\n",
" 'embeddings': None,\n",
" 'documents': [['This is a document', 'This is another document']],\n",
" 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n",
" 'distances': [[4343.1328125, 5653.28759765625]]}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results = cohere_collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2\n",
")\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Using Instructor models. The embedding function requires the InstructorEmbedding package. \n",
"# To install it, run pip install InstructorEmbedding\n",
"\n",
"\n",
"#uses base model and cpu\n",
"instructor_ef = embedding_functions.InstructorEmbeddingFunction() \n",
"\n",
"# For task specific embeddings, add an instruction\n",
"# instructor_ef = embedding_functions.InstructorEmbeddingFunction(\n",
"# instruction=\"Represent the Wikipedia document for retrieval: \"\n",
"# )\n",
"\n",
"# Uses hkunlp/instructor-xl model and GPU\n",
"#instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name=\"hkunlp/instructor-xl\", device=\"cuda\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a collection with the instructor embedding function\n",
"instructor_collection = client.create_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"instructor_collection.add(\n",
" documents=[\"This is a document\", \"This is another document\"],\n",
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
" ids=[\"id1\", \"id2\"]\n",
")\n",
"\n",
"# Adding documents with an instruction\n",
"# instructor_ef = embedding_functions.InstructorEmbeddingFunction(\n",
"# instruction=\"Represent the Science sentence: \"\n",
"# )\n",
"# instructor_collection = client.create_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)\n",
"# instructor_collection.add(documents=[\"Parton energy loss in QCD matter\"], ids=[\"id1\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = instructor_collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2\n",
")\n",
"results\n",
"\n",
"# Querying with an instruction\n",
"# instructor_ef = embedding_functions.InstructorEmbeddingFunction(instruction=\"Represent the Wikipedia question for retrieving supporting documents: \")\n",
"# instructor_collection = client.get_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)\n",
"# results = instructor_collection.query(query_texts=[\"where is the food stored in a yam plant\"])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Using HuggingFace models. The embedding function a huggingface api_key\n",
"huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(\n",
" api_key=\"HUGGINGFACE_API_KEY\", # Replace with your own HuggingFace API key\n",
" model_name=\"sentence-transformers/all-MiniLM-L6-v2\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Create a new HuggingFace collection\n",
"huggingface_collection = client.create_collection(name=\"huggingface_embeddings\", embedding_function=huggingface_ef)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"huggingface_collection.add(\n",
" documents=[\"This is a document\", \"This is another document\"],\n",
" metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n",
" ids=[\"id1\", \"id2\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ids': [['id1', 'id2']],\n",
" 'embeddings': None,\n",
" 'documents': [['This is a document', 'This is another document']],\n",
" 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n",
" 'distances': [[0.7111215591430664, 1.010978102684021]]}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results = huggingface_collection.query(\n",
" query_texts=[\"This is a query document\"],\n",
" n_results=2\n",
")\n",
"results"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}