From e1711ce5f95b7a6004d1c48a306699359296646b Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 10 May 2024 14:12:24 -0400 Subject: [PATCH] Adds Unit and Integration tests for MongoDBAtlasVectorSearch (#12854) --- .../llama_index/embeddings/__init__.py | 0 .../README.md | 131 +++++++++++++++++- .../llama_index/vector_stores/mongodb/base.py | 10 +- .../pyproject.toml | 9 +- .../tests/BUILD | 4 + .../tests/conftest.py | 81 +++++++++++ .../tests/test_integration.py | 62 +++++++++ .../tests/test_vector_stores_mongodb.py | 2 +- .../tests/test_vectorstore.py | 89 ++++++++++++ 9 files changed, 379 insertions(+), 9 deletions(-) create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-openai/llama_index/embeddings/__init__.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/conftest.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_integration.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_vectorstore.py diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-openai/llama_index/embeddings/__init__.py b/llama-index-integrations/embeddings/llama-index-embeddings-openai/llama_index/embeddings/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/README.md b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/README.md index 540c24de2c1dc..69619e63a1c05 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/README.md +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/README.md @@ -1 +1,130 @@ -# LlamaIndex Vector_Stores Integration: Mongodb +# LlamaIndex Vector_Stores Integration: MongoDB + +## Setting up MongoDB Atlas as the Datastore Provider + +MongoDB Atlas is a multi-cloud database service 
made by the same people that build MongoDB. +Atlas simplifies deploying and managing your databases while offering the versatility you need +to build resilient and performant global applications on the cloud providers of your choice. + +You can perform semantic search on data in your Atlas cluster running MongoDB v6.0.11, v7.0.2, +or later using Atlas Vector Search. You can store vector embeddings for any kind of data along +with other data in your collection on the Atlas cluster. + +In this section, we provide detailed instructions to run the tests. + +### Deploy a Cluster + +Follow the [Getting-Started](https://www.mongodb.com/basics/mongodb-atlas-tutorial) documentation +to create an account, deploy an Atlas cluster, and connect to a database. + +### Retrieve the URI used by Python to connect to the Cluster + +When you deploy, this will be stored as the environment variable: `MONGODB_URI`. +It will look something like the following. The username and password, if not provided, +can be configured in _Database Access_ under Security in the left panel. + +``` +export MONGODB_URI="mongodb+srv://:@cluster0.foo.mongodb.net/?retryWrites=true&w=majority" +``` + +There are a number of ways to navigate the Atlas UI. Keep your eye out for "Connect" and "driver". + +On the left panel, navigate and click 'Database' under DEPLOYMENT. +Click the Connect button that appears, then Drivers. Select Python. +(Have no concern for the version. This is the PyMongo, not Python, version.) +Once you have got the Connect Window open, you will see an instruction to `pip install pymongo`. +You will also see a **connection string**. +This is the `uri` that a `pymongo.MongoClient` uses to connect to the Database. + +### Test the connection + +Atlas provides a simple check. Once you have your `uri` and `pymongo` installed, +try the following in a python console. 
+ +```python +from pymongo.mongo_client import MongoClient + +client = MongoClient(uri) # Create a new client and connect to the server +try: + client.admin.command( + "ping" + ) # Send a ping to confirm a successful connection + print("Pinged your deployment. You successfully connected to MongoDB!") +except Exception as e: + print(e) +``` + +**Troubleshooting** + +- You can edit a Database's users and passwords on the 'Database Access' page, under Security. +- Remember to add your IP address. (Try `curl -4 ifconfig.co`) + +### Create a Database and Collection + +As mentioned, Vector Databases provide two functions. In addition to being the data store, +they provide very efficient search based on natural language queries. +With Vector Search, one will index and query data with a powerful vector search algorithm +using Hierarchical Navigable Small World (HNSW) graphs to find vector similarity. + +The indexing runs beside the data as a separate service asynchronously. +The Search index monitors changes to the Collection that it applies to. +Consequently, one need not upload the data first. +We will create an empty collection now, which will simplify setup in the example notebook. + +Back in the UI, navigate to the Database Deployments page by clicking Database on the left panel. +Click the "Browse Collections" and then "+ Create Database" buttons. +This will open a window where you choose Database and Collection names. (No additional preferences.) +Remember these values as they will be used as the environment variables, +`MONGODB_DATABASE` and `MONGODB_COLLECTION`. + +### Set Datastore Environment Variables + +To establish a connection to the MongoDB Cluster, Database, and Collection, plus create a Vector Search Index, +define the following environment variables. 
+You can confirm that the required ones have been set like this: `assert "MONGODB_URI" in os.environ` + +**IMPORTANT** It is crucial that the choices are consistent between setup in Atlas and Python environment(s). + +| Name | Description | Example | +| -------------------- | ----------------- | ------------------------------------------------------------------- | +| `MONGODB_URI` | Connection String | mongodb+srv://``:``@llama-index.zeatahb.mongodb.net | +| `MONGODB_DATABASE` | Database name | llama_index_test_db | +| `MONGODB_COLLECTION` | Collection name | llama_index_test_vectorstore | +| `MONGODB_INDEX` | Search index name | vector_index | + +The following will be required to authenticate with OpenAI. + +| Name | Description | +| ---------------- | ------------------------------------------------------------ | +| `OPENAI_API_KEY` | OpenAI token created at https://platform.openai.com/api-keys | + +### Create an Atlas Vector Search Index + +The final step to configure MongoDB as the Datastore is to create a Vector Search Index. +The procedure is described [here](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure). + +Under Services on the left panel, choose Atlas Search > Create Search Index > +Atlas Vector Search JSON Editor. + +The Plugin expects an index definition like the following. +To begin, choose `numDimensions: 1536` along with the suggested EMBEDDING variables above. +You can experiment with these later. + +```json +{ + "fields": [ + { + "numDimensions": 1536, + "path": "embedding", + "similarity": "cosine", + "type": "vector" + } + ] +} +``` + +### Running MongoDB Integration Tests + +In addition to the Jupyter Notebook in `examples/`, +a suite of integration tests is available to verify the MongoDB integration. +The test suite needs the cluster up and running, and the environment variables defined above. 
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/llama_index/vector_stores/mongodb/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/llama_index/vector_stores/mongodb/base.py index f4fb685b08536..6778ace317329 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/llama_index/vector_stores/mongodb/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/llama_index/vector_stores/mongodb/base.py @@ -103,7 +103,7 @@ def __init__( db_name: str = "default_db", collection_name: str = "default_collection", index_name: str = "default", - id_key: str = "id", + id_key: str = "_id", embedding_key: str = "embedding", text_key: str = "text", metadata_key: str = "metadata", @@ -128,13 +128,13 @@ def __init__( if mongodb_client is not None: self._mongodb_client = cast(MongoClient, mongodb_client) else: - if "MONGO_URI" not in os.environ: + if "MONGODB_URI" not in os.environ: raise ValueError( - "Must specify MONGO_URI via env variable " + "Must specify MONGODB_URI via env variable " "if not directly passing in client." 
) self._mongodb_client = MongoClient( - os.environ["MONGO_URI"], + os.environ["MONGODB_URI"], driver=DriverInfo(name="llama-index", version=version("llama-index")), ) @@ -193,7 +193,7 @@ def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None: """ # delete by filtering on the doc_id metadata - self._collection.delete_one( + self._collection.delete_many( filter={self._metadata_key + ".ref_doc_id": ref_doc_id}, **delete_kwargs ) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/pyproject.toml index 64344ed28242b..9254276d1b723 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/pyproject.toml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/pyproject.toml @@ -21,13 +21,15 @@ ignore_missing_imports = true python_version = "3.8" [tool.poetry] -authors = ["Your Name "] +authors = [ + "The MongoDB Python Team", +] description = "llama-index vector_stores mongodb integration" exclude = ["**/BUILD"] license = "MIT" name = "llama-index-vector-stores-mongodb" readme = "README.md" -version = "0.1.4" +version = "0.1.5" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" @@ -37,6 +39,9 @@ pymongo = "^4.6.1" [tool.poetry.group.dev.dependencies] ipython = "8.10.0" jupyter = "^1.0.0" +llama-index-embeddings-openai = "^0.1.5" +llama-index-llms-openai = "^0.1.13" +llama-index-readers-file = "^0.1.4" mypy = "0.991" pre-commit = "3.2.0" pylint = "2.15.10" diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/BUILD index dabf212d7e716..45d59ac8248a2 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/BUILD +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/BUILD @@ -1 +1,5 @@ python_tests() + 
+python_test_utils( + name="test_utils", +) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/conftest.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/conftest.py new file mode 100644 index 0000000000000..e3af605d4e50f --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/conftest.py @@ -0,0 +1,81 @@ +import os +from typing import List +import pytest +from llama_index.core.ingestion import IngestionPipeline +from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.schema import Document, TextNode +from llama_index.embeddings.openai import OpenAIEmbedding +from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch +from pymongo import MongoClient + +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") + +import threading + +lock = threading.Lock() + + +@pytest.fixture(scope="session") +def documents() -> List[Document]: + """List of documents represents data to be embedded in the datastore. + Minimum requirements for Documents in the /upsert endpoint's UpsertRequest. 
+ """ + text = Document.example().text + metadata = Document.example().metadata + texts = text.split("\n") + return [Document(text=text, metadata=metadata) for text in texts] + + +@pytest.fixture(scope="session") +def nodes(documents) -> List[TextNode]: + if OPENAI_API_KEY is None: + return None + + pipeline = IngestionPipeline( + transformations=[ + SentenceSplitter(chunk_size=1024, chunk_overlap=200), + OpenAIEmbedding(), + ], + ) + + return pipeline.run(documents=documents) + + +db_name = os.environ.get("MONGODB_DATABASE", "llama_index_test_db") +collection_name = os.environ.get("MONGODB_COLLECTION", "llama_index_test_vectorstore") +index_name = os.environ.get("MONGODB_INDEX", "vector_index") +MONGODB_URI = os.environ.get("MONGODB_URI") + + +@pytest.fixture(scope="session") +def atlas_client() -> MongoClient: + if MONGODB_URI is None: + return None + + client = MongoClient(MONGODB_URI) + + assert db_name in client.list_database_names() + assert collection_name in client[db_name].list_collection_names() + + # TODO error: $listSearchIndexes is not allowed or the syntax is incorrect + # assert index_name in [ + # idx["name"] for idx in client[db_name][collection_name].list_search_indexes() + # ] + + # Clear the collection for the tests + client[db_name][collection_name].delete_many({}) + + return client + + +@pytest.fixture(scope="session") +def vector_store(atlas_client: MongoClient) -> MongoDBAtlasVectorSearch: + if MONGODB_URI is None: + return None + + return MongoDBAtlasVectorSearch( + mongodb_client=atlas_client, + db_name=db_name, + collection_name=collection_name, + index_name=index_name, + ) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_integration.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_integration.py new file mode 100644 index 0000000000000..6020935ff5254 --- /dev/null +++ 
b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_integration.py @@ -0,0 +1,62 @@ +"""Integration Tests of llama-index-vector-stores-mongodb +with MongoDB Atlas Vector Datastore and OPENAI Embedding model. + +As described in docs/providers/mongodb/setup.md, to run this, one must +have a running MongoDB Atlas Cluster, and +provide a valid OPENAI_API_KEY. +""" + +import os +from time import sleep +from typing import List +import pytest +from llama_index.core import StorageContext, VectorStoreIndex +from llama_index.core.schema import Document +from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch +from pymongo import MongoClient + +from .conftest import lock + + +@pytest.mark.skipif( + os.environ.get("MONGODB_URI") is None, reason="Requires MONGODB_URI in os.environ" +) +def test_mongodb_connection(atlas_client: MongoClient) -> None: + """Confirm that the connection to the datastore works.""" + assert atlas_client.admin.command("ping")["ok"] + + +@pytest.mark.skipif( + os.environ.get("MONGODB_URI") is None or os.environ.get("OPENAI_API_KEY") is None, + reason="Requires MONGODB_URI and OPENAI_API_KEY in os.environ", +) +def test_index( + documents: List[Document], vector_store: MongoDBAtlasVectorSearch +) -> None: + """End-to-end example from essay and query to response. + + via NodeParser, LLM Embedding, VectorStore, and Synthesizer. + """ + with lock: + vector_store._collection.delete_many({}) + sleep(2) + storage_context = StorageContext.from_defaults(vector_store=vector_store) + index = VectorStoreIndex.from_documents( + documents, storage_context=storage_context + ) + query_engine = index.as_query_engine() + + question = "What are LLMs useful for?" 
+ no_response = True + response = None + retries = 5 + search_limit = query_engine.retriever.similarity_top_k + while no_response and retries: + response = query_engine.query(question) + if len(response.source_nodes) == search_limit: + no_response = False + else: + retries -= 1 + sleep(5) + assert retries + assert "LLM" in response.response diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_vector_stores_mongodb.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_vector_stores_mongodb.py index 4066dae44415f..39fde58f0a546 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_vector_stores_mongodb.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_vector_stores_mongodb.py @@ -2,6 +2,6 @@ from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch -def test_class(): +def test_class() -> None: names_of_base_classes = [b.__name__ for b in MongoDBAtlasVectorSearch.__mro__] assert BasePydanticVectorStore.__name__ in names_of_base_classes diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_vectorstore.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_vectorstore.py new file mode 100644 index 0000000000000..dbb3dc3b6181a --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb/tests/test_vectorstore.py @@ -0,0 +1,89 @@ +import os +from time import sleep +import pytest +from typing import List + +from llama_index.core.schema import Document, TextNode +from llama_index.core.vector_stores.types import VectorStoreQuery +from llama_index.embeddings.openai import OpenAIEmbedding +from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch + +from .conftest import lock + + +def test_documents(documents: List[Document]) -> None: + """Sanity check essay was found and documents 
loaded.""" + assert len(documents) == 25 + assert isinstance(documents[0], Document) + + +@pytest.mark.skipif( + os.environ.get("OPENAI_API_KEY") is None, + reason="Requires OPENAI_API_KEY in os.environ", +) +def test_nodes(nodes: List[TextNode]) -> None: + """Test Ingestion Pipeline transforming documents into nodes with embeddings.""" + assert isinstance(nodes, list) + assert isinstance(nodes[0], TextNode) + + +@pytest.mark.skipif( + os.environ.get("MONGODB_URI") is None or os.environ.get("OPENAI_API_KEY") is None, + reason="Requires MONGODB_URI and OPENAI_API_KEY in os.environ", +) +def test_vectorstore( + nodes: List[TextNode], vector_store: MongoDBAtlasVectorSearch +) -> None: + """Test add, query, delete API of MongoDBAtlasVectorSearch.""" + with lock: + # 0. Clean up the collection + vector_store._collection.delete_many({}) + sleep(2) + + # 1. Test add() + ids = vector_store.add(nodes) + assert set(ids) == {node.node_id for node in nodes} + + # 2. test query() + query_str = "What are LLMs useful for?" + n_similar = 2 + query_embedding = OpenAIEmbedding().get_text_embedding(query_str) + query = VectorStoreQuery( + query_str=query_str, + query_embedding=query_embedding, + similarity_top_k=n_similar, + ) + result_found = False + query_responses = None + retries = 5 + while retries and not result_found: + query_responses = vector_store.query(query=query) + if len(query_responses.nodes) == n_similar: + result_found = True + else: + sleep(2) + retries -= 1 + + assert all(score > 0.89 for score in query_responses.similarities) + assert any("LLM" in node.text for node in query_responses.nodes) + assert all(id_res in ids for id_res in query_responses.ids) + + # 3. Test delete() + # Remember, the current API deletes by *ref_doc_id*, not *node_id*. + # In our case, we began with only one document, + # so deleting the ref_doc_id from any node + # should delete ALL the nodes. 
+ n_docs = vector_store._collection.count_documents({}) + assert n_docs == len(ids) + remove_id = query_responses.nodes[0].ref_doc_id + sleep(2) + retries = 5 + while retries: + vector_store.delete(remove_id) + n_remaining = vector_store._collection.count_documents({}) + if n_remaining == n_docs: + sleep(2) + retries -= 1 + else: + retries = 0 + assert n_remaining == n_docs - 1