[ENH]: CIP-4: In and Not In Metadata Filters (#1081)

Cherry-picked from #1029

## Description of changes

*Summarize the changes made by this PR.*
 - Improvements & Bug fixes
	 - Added support for `$in` and `$nin` metadata filters

> Note: See CIP in `docs/` or example notebook for more info

## Test plan
*How are these changes tested?*

- [x] Tests pass locally with `pytest` for python

## Documentation Changes
TBD

---------

Co-authored-by: Hammad Bashir <HammadB@users.noreply.github.com>
This commit is contained in:
Trayan Azarov
2023-09-05 20:42:01 +03:00
committed by GitHub
parent 750f2edbfa
commit 6dd2d4af0b
8 changed files with 354 additions and 20 deletions

View File

@@ -0,0 +1,149 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2023-08-30T12:48:38.227653Z",
"start_time": "2023-08-30T12:48:27.744069Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ids': [['1', '3']], 'distances': [[0.28824201226234436, 1.017508625984192]], 'metadatas': [[{'author': 'john'}, {'author': 'jill'}]], 'embeddings': None, 'documents': [['Article by john', 'Article by Jill']]}\n",
"{'ids': ['1', '3'], 'embeddings': None, 'metadatas': [{'author': 'john'}, {'author': 'jill'}], 'documents': ['Article by john', 'Article by Jill']}\n"
]
}
],
"source": [
"import chromadb\n",
"\n",
"from chromadb.utils import embedding_functions\n",
"\n",
"sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=\"all-MiniLM-L6-v2\")\n",
"\n",
"\n",
"client = chromadb.Client()\n",
"# client.heartbeat()\n",
"# client.reset()\n",
"collection = client.get_or_create_collection(\"test-where-list\", embedding_function=sentence_transformer_ef)\n",
"collection.add(documents=[\"Article by john\", \"Article by Jack\", \"Article by Jill\"],\n",
" metadatas=[{\"author\": \"john\"}, {\"author\": \"jack\"}, {\"author\": \"jill\"}], ids=[\"1\", \"2\", \"3\"])\n",
"\n",
"query = [\"Give me articles by john\"]\n",
"res = collection.query(query_texts=query,where={'author': {'$in': ['john', 'jill']}}, n_results=10)\n",
"print(res)\n",
"\n",
"res_get = collection.get(where={'author': {'$in': ['john', 'jill']}})\n",
"print(res_get)\n"
]
},
{
"cell_type": "markdown",
"source": [
"# Interactions with existing Where operators"
],
"metadata": {
"collapsed": false
},
"id": "752cef843ba2f900"
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"data": {
"text/plain": "{'ids': [['1']],\n 'distances': [[0.28824201226234436]],\n 'metadatas': [[{'article_type': 'blog', 'author': 'john'}]],\n 'embeddings': None,\n 'documents': [['Article by john']]}"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"collection.upsert(documents=[\"Article by john\", \"Article by Jack\", \"Article by Jill\"],\n",
" metadatas=[{\"author\": \"john\",\"article_type\":\"blog\"}, {\"author\": \"jack\",\"article_type\":\"social\"}, {\"author\": \"jill\",\"article_type\":\"paper\"}], ids=[\"1\", \"2\", \"3\"])\n",
"\n",
"collection.query(query_texts=query,where={\"$and\":[{\"author\": {'$in': ['john', 'jill']}},{\"article_type\":{\"$eq\":\"blog\"}}]}, n_results=3)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-08-30T12:48:49.974353Z",
"start_time": "2023-08-30T12:48:49.938985Z"
}
},
"id": "ca56cda318f9e94d"
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"data": {
"text/plain": "{'ids': [['1', '3']],\n 'distances': [[0.28824201226234436, 1.017508625984192]],\n 'metadatas': [[{'article_type': 'blog', 'author': 'john'},\n {'article_type': 'paper', 'author': 'jill'}]],\n 'embeddings': None,\n 'documents': [['Article by john', 'Article by Jill']]}"
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"collection.query(query_texts=query,where={\"$or\":[{\"author\": {'$in': ['john']}},{\"article_type\":{\"$in\":[\"paper\"]}}]}, n_results=3)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-08-30T12:48:53.501431Z",
"start_time": "2023-08-30T12:48:53.481571Z"
}
},
"id": "f10e79ec90c797c1"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
},
"id": "d97b8b6dd96261d0"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}