Adds Unit and Integration tests for MongoDBAtlasVectorSearch #12854

Merged

24 commits
7a85dd5
PYTHON-4160 MongoDBAtlasVectorSearch Cleanup. id -> _id. delete_one -…
caseyclements Mar 26, 2024
9019ffb
PYTHON-4160 Created unit and integration tests
caseyclements Mar 26, 2024
6511be8
Removed unused imports
caseyclements Mar 27, 2024
0089e8c
Switch dependency in pyproject from llama-index-core to llama-index t…
caseyclements Mar 27, 2024
a229bf5
Removed hardcode in test
caseyclements Mar 27, 2024
a923060
Removed unused import
caseyclements Mar 27, 2024
1e93f61
[PYTHON-4307] Retries query until response contains number requested
caseyclements Apr 1, 2024
f774b86
[PYTHON-4307] assert response contains number requested AND retries
caseyclements Apr 1, 2024
0d0e9ce
[PYTHON-4307] Loosened assertion in test
caseyclements Apr 3, 2024
126fb61
Added markdown to describe Atlas setup.
caseyclements Apr 3, 2024
e724632
Moved setup.md to llama_index/vector_stores/mongodb
caseyclements Apr 3, 2024
8d264d7
Added __init__ to embeddings as it was not properly set up as a package
caseyclements Apr 3, 2024
8c48295
Linting
caseyclements Apr 15, 2024
c90490c
Bump micro version of llama-index-vector-stores-mongodb
caseyclements Apr 15, 2024
ccd1ac9
Updated dependencies. llama-index-embeddings-openai is now a dev.depe…
caseyclements Apr 15, 2024
d30558f
Added llama-index-llms-openai and +llama-index-readers-file to dev de…
caseyclements Apr 16, 2024
74f3a76
Update stopping condition in test_vectorstore
caseyclements Apr 19, 2024
f2b29ee
Moved setup.md into README
caseyclements Apr 23, 2024
a61f8dd
add build file
logan-markewich May 2, 2024
7376e7f
Standardized environ variable naming: MONGODB_URI
caseyclements May 7, 2024
c0e0559
Skip tests if appropriate environment variable, OPENAI_API_KEY or MON…
caseyclements May 7, 2024
1f22c8a
fix tests
logan-markewich May 9, 2024
45d667c
Added typehints to tests
caseyclements May 9, 2024
39f23d2
fix integration tests
logan-markewich May 10, 2024
@@ -103,7 +103,7 @@ def __init__(
         db_name: str = "default_db",
         collection_name: str = "default_collection",
         index_name: str = "default",
-        id_key: str = "id",
+        id_key: str = "_id",
         embedding_key: str = "embedding",
         text_key: str = "text",
         metadata_key: str = "metadata",
@@ -193,7 +193,7 @@ def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:

         """
         # delete by filtering on the doc_id metadata
-        self._collection.delete_one(
+        self._collection.delete_many(
             filter={self._metadata_key + ".ref_doc_id": ref_doc_id}, **delete_kwargs
         )

@@ -21,13 +21,15 @@ ignore_missing_imports = true
 python_version = "3.8"

 [tool.poetry]
-authors = ["Your Name <[email protected]>"]
+authors = [
+    "The MongoDB Python Team",
+]
 description = "llama-index vector_stores mongodb integration"
 exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-vector-stores-mongodb"
 readme = "README.md"
-version = "0.1.4"
+version = "0.1.5"

 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
@@ -37,6 +39,9 @@ pymongo = "^4.6.1"
 [tool.poetry.group.dev.dependencies]
 ipython = "8.10.0"
 jupyter = "^1.0.0"
+llama-index-embeddings-openai = "^0.1.5"
+llama-index-llms-openai = "^0.1.13"
+llama-index-readers-file = "^0.1.4"
 mypy = "0.991"
 pre-commit = "3.2.0"
 pylint = "2.15.10"
Collaborator review comment: Should this just be in the readme instead?

@@ -0,0 +1,128 @@
# Setting up MongoDB Atlas as the Datastore Provider

MongoDB Atlas is a multi-cloud database service made by the same people who build MongoDB.
Atlas simplifies deploying and managing your databases while offering the versatility you need
to build resilient and performant global applications on the cloud providers of your choice.

You can perform semantic search on data in your Atlas cluster running MongoDB v6.0.11, v7.0.2,
or later using Atlas Vector Search. You can store vector embeddings for any kind of data along
with other data in your collection on the Atlas cluster.
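For example, with the default keys used by `MongoDBAtlasVectorSearch` (`_id`, `embedding`, `text`, `metadata`), a stored document might look like the following sketch (all values are illustrative):

```python
# Illustrative document shape using the vector store's default keys.
sample_doc = {
    "_id": "node-123",  # node id
    "embedding": [0.12, -0.03, 0.88],  # vector (1536 floats for OpenAI embeddings)
    "text": "What I Worked On ...",  # chunk text
    "metadata": {"ref_doc_id": "essay-1"},  # reference back to the source document
}
```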

In this section, we set up a cluster and a database, test the connection, and finally create an Atlas Vector Search index.

### Deploy a Cluster

Follow the [Getting-Started](https://www.mongodb.com/basics/mongodb-atlas-tutorial) documentation
to create an account, deploy an Atlas cluster, and connect to a database.

### Retrieve the URI used by Python to connect to the Cluster

Store the connection string as the environment variable `MONGO_URI`.
It will look something like the following. The username and password, if not provided,
can be configured in _Database Access_ under Security in the left panel.

```
export MONGO_URI="mongodb+srv://<username>:<password>@chatgpt-retrieval-plugin.zeatahb.mongodb.net/?retryWrites=true&w=majority"
```

There are a number of ways to navigate the Atlas UI. Keep your eye out for "Connect" and "driver".

On the left panel, navigate and click 'Database' under DEPLOYMENT.
Click the Connect button that appears, then Drivers. Select Python.
(Don't worry about the version selector; it refers to the PyMongo driver version, not the Python version.)
Once you have the Connect window open, you will see an instruction to `pip install pymongo`.
You will also see a **connection string**.
This is the `uri` that a `pymongo.MongoClient` uses to connect to the Database.

### Test the connection

Atlas provides a simple check. Once you have your `uri` and `pymongo` installed,
try the following in a Python console.

```python
from pymongo.mongo_client import MongoClient

client = MongoClient(uri)  # Create a new client and connect to the server
try:
    # Send a ping to confirm a successful connection
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)
```

**Troubleshooting**

- You can edit a Database's users and passwords on the 'Database Access' page, under Security.
- Remember to add your IP address. (Try `curl -4 ifconfig.co`)

### Create a Database and Collection

As mentioned, vector databases provide two functions. In addition to being the data store,
they provide very efficient search based on natural-language queries.
With Vector Search, you index and query data with a powerful vector search algorithm
that uses Hierarchical Navigable Small World (HNSW) graphs to find vector similarity.

The indexing runs beside the data as a separate, asynchronous service.
The search index monitors changes to the collection that it applies to,
so you need not upload the data first.
We will create an empty collection now, which will simplify setup in the example notebook.

Back in the UI, navigate to the Database Deployments page by clicking Database on the left panel.
Click the "Browse Collections" and then "+ Create Database" buttons.
This will open a window where you choose Database and Collection names. (No additional preferences.)
Remember these values, as they will be used as the environment variables
`MONGODB_DATABASE` and `MONGODB_COLLECTION`.
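If you prefer to script this step, the same empty collection can be created with `pymongo`. This is a sketch: the database and collection names default to the examples used in this guide, and the connection only runs when `MONGO_URI` is set.

```python
import os

# Names default to the examples used in this guide; override via environment.
db_name = os.environ.get("MONGODB_DATABASE", "llama_index_test_db")
collection_name = os.environ.get("MONGODB_COLLECTION", "llama_index_test_vectorstore")

if "MONGO_URI" in os.environ:
    from pymongo import MongoClient

    client = MongoClient(os.environ["MONGO_URI"])
    # create_collection makes the empty collection explicitly;
    # a later insert would also create it implicitly.
    if collection_name not in client[db_name].list_collection_names():
        client[db_name].create_collection(collection_name)
```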

### Set Datastore Environment Variables

To establish a connection to the MongoDB Cluster, Database, and Collection, plus create a Vector Search Index,
define the following environment variables.
You can confirm that the required ones have been set like this: `assert "MONGO_URI" in os.environ`

**IMPORTANT:** The names you choose must be consistent between your Atlas setup and your Python environment(s).

| Name | Description | Example |
| -------------------- | ----------------- | ------------------------------------------------------------------- |
| `MONGO_URI` | Connection String | mongodb+srv://`<user>`:`<password>`@llama-index.zeatahb.mongodb.net |
| `MONGODB_DATABASE` | Database name | llama_index_test_db |
| `MONGODB_COLLECTION` | Collection name | llama_index_test_vectorstore |
| `MONGODB_INDEX` | Search index name | vector_index |

The following will be required to authenticate with OpenAI.

| Name | Description |
| ---------------- | ------------------------------------------------------------ |
| `OPENAI_API_KEY` | OpenAI token created at https://platform.openai.com/api-keys |
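In a shell, the variables from the two tables above can be exported like this (every value below is a placeholder to substitute with your own):

```shell
# Placeholder values — substitute your own cluster, credentials, and key.
export MONGO_URI="mongodb+srv://<user>:<password>@llama-index.zeatahb.mongodb.net"
export MONGODB_DATABASE="llama_index_test_db"
export MONGODB_COLLECTION="llama_index_test_vectorstore"
export MONGODB_INDEX="vector_index"
export OPENAI_API_KEY="<your-openai-key>"
```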

### Create an Atlas Vector Search Index

The final step to configure MongoDB as the Datastore is to create a Vector Search Index.
The procedure is described [here](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure).

Under Services on the left panel, choose Atlas Search > Create Search Index >
Atlas Vector Search JSON Editor.

The vector store expects an index definition like the following.
To begin, choose `numDimensions: 1536`, which matches the dimensionality of OpenAI's default text embeddings.
You can experiment with these values later.

```json
{
"fields": [
{
"numDimensions": 1536,
"path": "embedding",
"similarity": "cosine",
"type": "vector"
}
]
}
```
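The index can also be created programmatically. The sketch below builds the same definition as the JSON above; the `create_search_index` call with `type="vectorSearch"` assumes a recent `pymongo` (4.7+) and a live Atlas cluster, so it only runs when `MONGO_URI` is set.

```python
import os

# Same definition as in the JSON editor example above.
definition = {
    "fields": [
        {
            "numDimensions": 1536,
            "path": "embedding",
            "similarity": "cosine",
            "type": "vector",
        }
    ]
}

if "MONGO_URI" in os.environ:
    from pymongo import MongoClient
    from pymongo.operations import SearchIndexModel

    client = MongoClient(os.environ["MONGO_URI"])
    collection = client[os.environ.get("MONGODB_DATABASE", "llama_index_test_db")][
        os.environ.get("MONGODB_COLLECTION", "llama_index_test_vectorstore")
    ]
    # type="vectorSearch" requires pymongo >= 4.7 and an Atlas cluster.
    collection.create_search_index(
        model=SearchIndexModel(
            definition=definition,
            name=os.environ.get("MONGODB_INDEX", "vector_index"),
            type="vectorSearch",
        )
    )
```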

### Running MongoDB Integration Tests

In addition to the Jupyter Notebook in `examples/`,
a suite of integration tests is available to verify the MongoDB integration.
The test suite needs the cluster up and running, and the environment variables defined above.
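Per the commit history, the tests skip themselves when prerequisites are missing. The guard looks roughly like this sketch (the marker and test names here are illustrative, not the suite's exact names):

```python
import os

import pytest

# Skip integration tests when no Atlas cluster is configured
# (marker name is illustrative).
needs_atlas = pytest.mark.skipif(
    "MONGO_URI" not in os.environ,
    reason="Requires a MongoDB Atlas cluster; set MONGO_URI",
)


@needs_atlas
def test_ping():
    from pymongo import MongoClient

    assert MongoClient(os.environ["MONGO_URI"]).admin.command("ping")["ok"]
```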
@@ -0,0 +1,70 @@
import os
import threading
from pathlib import Path

import openai
import pytest
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient

openai.api_key = os.environ["OPENAI_API_KEY"]

lock = threading.Lock()


@pytest.fixture(scope="session")
def documents(tmp_path_factory):
    """List of documents represents data to be embedded in the datastore.
    Minimum requirements for Documents in the /upsert endpoint's UpsertRequest.
    """
    data_dir = Path(__file__).parents[4] / "docs/docs/examples/data/paul_graham"
    return SimpleDirectoryReader(data_dir).load_data()


@pytest.fixture(scope="session")
def nodes(documents):
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=1024, chunk_overlap=200),
            OpenAIEmbedding(),
        ],
    )

    return pipeline.run(documents=documents)


db_name = os.environ.get("MONGODB_DATABASE", "llama_index_test_db")
collection_name = os.environ.get("MONGODB_COLLECTION", "llama_index_test_vectorstore")
index_name = os.environ.get("MONGODB_INDEX", "vector_index")
cluster_uri = os.environ["MONGO_URI"]


@pytest.fixture(scope="session")
def atlas_client():
    client = MongoClient(cluster_uri)

    assert db_name in client.list_database_names()
    assert collection_name in client[db_name].list_collection_names()
    assert index_name in [
        idx["name"] for idx in client[db_name][collection_name].list_search_indexes()
    ]

    # Clear the collection for the tests
    client[db_name][collection_name].delete_many({})

    return client


@pytest.fixture(scope="session")
def vector_store(atlas_client):
    return MongoDBAtlasVectorSearch(
        mongodb_client=atlas_client,
        db_name=db_name,
        collection_name=collection_name,
        index_name=index_name,
    )
@@ -0,0 +1,60 @@
"""Integration Tests of llama-index-vector-stores-mongodb
with MongoDB Atlas Vector Datastore and OPENAI Embedding model.

As described in docs/providers/mongodb/setup.md, to run this, one must
have a running MongoDB Atlas Cluster, and
provide a valid OPENAI_API_KEY.
"""

import os
from time import sleep

import pytest
from llama_index.core import StorageContext, VectorStoreIndex

from .conftest import lock


def test_required_vars():
    """Confirm that the environment has all it needs."""
    required_vars = ["OPENAI_API_KEY", "MONGO_URI"]
    for var in required_vars:
        try:
            os.environ[var]
        except KeyError:
            pytest.fail(f"Required var '{var}' not in os.environ")


def test_mongodb_connection(atlas_client):
    """Confirm that the connection to the datastore works."""
    assert atlas_client.admin.command("ping")["ok"]


def test_index(documents, vector_store):
    """End-to-end example from essay and query to response.

    via NodeParser, LLM Embedding, VectorStore, and Synthesizer.
    """
    with lock:
        vector_store._collection.delete_many({})
        sleep(2)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents, storage_context=storage_context
        )
        query_engine = index.as_query_engine()

        question = "Who is the author of this essay?"
        no_response = True
        response = None
        retries = 5
        search_limit = query_engine.retriever.similarity_top_k
        while no_response and retries:
            response = query_engine.query(question)
            if len(response.source_nodes) == search_limit:
                no_response = False
            else:
                retries -= 1
                sleep(5)
        assert retries
        assert "Paul Graham" in response.response
@@ -0,0 +1,81 @@
import os
from time import sleep
from typing import List

import openai
from llama_index.core.schema import Document, TextNode
from llama_index.core.vector_stores.types import VectorStoreQuery
from llama_index.embeddings.openai import OpenAIEmbedding

from .conftest import lock

openai.api_key = os.environ["OPENAI_API_KEY"]


def test_documents(documents: List[Document]):
    """Sanity check essay was found and documents loaded."""
    assert len(documents) == 1
    assert isinstance(documents[0], Document)


def test_nodes(nodes):
    """Test Ingestion Pipeline transforming documents into nodes with embeddings."""
    assert isinstance(nodes, list)
    assert isinstance(nodes[0], TextNode)


def test_vectorstore(nodes, vector_store):
    """Test add, query, delete API of MongoDBAtlasVectorSearch."""
    with lock:
        # 0. Clean up the collection
        vector_store._collection.delete_many({})
        sleep(2)

        # 1. Test add()
        ids = vector_store.add(nodes)
        assert set(ids) == {node.node_id for node in nodes}

        # 2. Test query()
        query_str = "Who is the author of this essay?"
        n_similar = 2
        query_embedding = OpenAIEmbedding().get_text_embedding(query_str)
        query = VectorStoreQuery(
            query_str=query_str,
            query_embedding=query_embedding,
            similarity_top_k=n_similar,
        )
        result_found = False
        query_responses = None
        retries = 5
        while retries and not result_found:
            query_responses = vector_store.query(query=query)
            if len(query_responses.nodes) == n_similar:
                result_found = True
            else:
                sleep(2)
                retries -= 1

        assert all(score > 0.89 for score in query_responses.similarities)
        assert any(
            "seem more like rants" in node.text for node in query_responses.nodes
        )
        assert all(id_res in ids for id_res in query_responses.ids)

        # 3. Test delete()
        # Remember, the current API deletes by *ref_doc_id*, not *node_id*.
        # In our case, we began with only one document,
        # so deleting the ref_doc_id from any node
        # should delete ALL the nodes.
        n_docs = vector_store._collection.count_documents({})
        assert n_docs == len(ids)
        remove_id = query_responses.nodes[0].ref_doc_id
        sleep(2)
        retries = 5
        while retries:
            vector_store.delete(remove_id)
            n_remaining = vector_store._collection.count_documents({})
            if n_remaining == n_docs:
                # Deletes propagate asynchronously; wait and retry.
                sleep(2)
                retries -= 1
            else:
                retries = 0
        assert n_remaining == 0