
Adds Unit and Integration tests for MongoDBAtlasVectorSearch #12854

Merged
24 commits

- `7a85dd5` PYTHON-4160 MongoDBAtlasVectorSearch Cleanup. id -> _id. delete_one -… (caseyclements, Mar 26, 2024)
- `9019ffb` PYTHON-4160 Created unit and integration tests (caseyclements, Mar 26, 2024)
- `6511be8` Removed unused imports (caseyclements, Mar 27, 2024)
- `0089e8c` Switch dependency in pyproject from llama-index-core to llama-index t… (caseyclements, Mar 27, 2024)
- `a229bf5` Removed hardcode in test (caseyclements, Mar 27, 2024)
- `a923060` Removed unused import (caseyclements, Mar 27, 2024)
- `1e93f61` [PYTHON-4307] Retries query until response contains number requested (caseyclements, Apr 1, 2024)
- `f774b86` [PYTHON-4307] assert response contains number requested AND retries (caseyclements, Apr 1, 2024)
- `0d0e9ce` [PYTHON-4307] Loosened assertion in test (caseyclements, Apr 3, 2024)
- `126fb61` Added markdown to describe Atlas setup. (caseyclements, Apr 3, 2024)
- `e724632` Moved setup.md to llama_index/vector_stores/mongodb (caseyclements, Apr 3, 2024)
- `8d264d7` Added `__init__` to embeddings as it was not properly set up as a package (caseyclements, Apr 3, 2024)
- `8c48295` Linting (caseyclements, Apr 15, 2024)
- `c90490c` Bump micro version of llama-index-vector-stores-mongodb (caseyclements, Apr 15, 2024)
- `ccd1ac9` Updated dependencies. llama-index-embeddings-openai is now a dev.depe… (caseyclements, Apr 15, 2024)
- `d30558f` Added llama-index-llms-openai and +llama-index-readers-file to dev de… (caseyclements, Apr 16, 2024)
- `74f3a76` Update stopping condition in test_vectorstore (caseyclements, Apr 19, 2024)
- `f2b29ee` Moved setup.md into README (caseyclements, Apr 23, 2024)
- `a61f8dd` add build file (logan-markewich, May 2, 2024)
- `7376e7f` Standardized environ variable naming: MONGODB_URI (caseyclements, May 7, 2024)
- `c0e0559` Skip tests if appropriate environment variable, OPENAI_API_KEY or MON… (caseyclements, May 7, 2024)
- `1f22c8a` fix tests (logan-markewich, May 9, 2024)
- `45d667c` Added typehints to tests (caseyclements, May 9, 2024)
- `39f23d2` fix integration tests (logan-markewich, May 10, 2024)
@@ -1 +1,130 @@
-# LlamaIndex Vector_Stores Integration: Mongodb
+# LlamaIndex Vector_Stores Integration: MongoDB

## Setting up MongoDB Atlas as the Datastore Provider

MongoDB Atlas is a multi-cloud database service made by the same people that build MongoDB.
Atlas simplifies deploying and managing your databases while offering the versatility you need
to build resilient and performant global applications on the cloud providers of your choice.

You can perform semantic search on data in your Atlas cluster running MongoDB v6.0.11, v7.0.2,
or later using Atlas Vector Search. You can store vector embeddings for any kind of data along
with other data in your collection on the Atlas cluster.

In this section, we provide detailed instructions to run the tests.

### Deploy a Cluster

Follow the [Getting-Started](https://www.mongodb.com/basics/mongodb-atlas-tutorial) documentation
to create an account, deploy an Atlas cluster, and connect to a database.

### Retrieve the URI used by Python to connect to the Cluster

When you deploy, store the connection string as the environment variable `MONGODB_URI`.
It will look something like the following. The username and password, if not provided,
can be configured in _Database Access_ under Security in the left panel.

```shell
export MONGODB_URI="mongodb+srv://<username>:<password>@cluster0.foo.mongodb.net/?retryWrites=true&w=majority"
```

There are a number of ways to navigate the Atlas UI. Keep your eye out for "Connect" and "driver".

On the left panel, click 'Database' under DEPLOYMENT.
Click the Connect button that appears, then Drivers, and select Python.
(Don't worry about the version; it refers to the PyMongo driver version, not the Python version.)
Once the Connect window is open, you will see an instruction to `pip install pymongo`.
You will also see a **connection string**.
This is the `uri` that a `pymongo.MongoClient` uses to connect to the database.

### Test the connection

Atlas provides a simple check. Once you have your `uri` and `pymongo` installed,
try the following in a Python console.

```python
import os

from pymongo.mongo_client import MongoClient

uri = os.environ["MONGODB_URI"]
client = MongoClient(uri)  # Create a new client and connect to the server
try:
    # Send a ping to confirm a successful connection
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)
```

**Troubleshooting**

- You can edit a Database's users and passwords on the 'Database Access' page, under Security.
- Remember to add your IP address. (Try `curl -4 ifconfig.co`)

### Create a Database and Collection

As mentioned, vector databases provide two functions. In addition to being the data store,
they provide very efficient search based on natural-language queries.
With Vector Search, one indexes and queries data with a powerful vector search algorithm
that uses Hierarchical Navigable Small World (HNSW) graphs to find vector similarity.
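The index definition later in this README selects cosine similarity as its scoring function. For intuition, here is a minimal pure-Python sketch of the cosine similarity computed between two embedding vectors (illustrative only; Atlas computes this inside the search service):

```python
import math
from typing import Sequence


def cosine_similarity(a: Sequence[float], b: Sequence[float]) -> float:
    """Cosine of the angle between two vectors: dot(a, b) / (|a| * |b|)."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)


# Identical directions score 1.0; orthogonal vectors score 0.0.
print(cosine_similarity([1.0, 0.0], [1.0, 0.0]))  # 1.0
print(cosine_similarity([1.0, 0.0], [0.0, 1.0]))  # 0.0
```

Higher scores mean the stored embedding points in a direction closer to the query embedding, which is why semantically similar text ranks higher.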

The indexing runs beside the data as a separate, asynchronous service.
The Search Index monitors changes to the Collection that it applies to.
Consequently, one need not upload the data first.
We will create an empty collection now, which will simplify setup in the example notebook.

Back in the UI, navigate to the Database Deployments page by clicking Database on the left panel.
Click the "Browse Collections" and then "+ Create Database" buttons.
This will open a window where you choose Database and Collection names. (No additional preferences.)
Remember these values, as they will be used as the environment variables
`MONGODB_DATABASE` and `MONGODB_COLLECTION`.
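If you prefer to script this step instead of using the UI, the sketch below shows the idea. The helper name `ensure_collection` is ours, and the client is duck-typed so anything exposing the same small interface as a `pymongo.MongoClient` will work:

```python
from typing import Any


def ensure_collection(client: Any, db_name: str, collection_name: str) -> bool:
    """Create `collection_name` in `db_name` if it does not already exist.

    `client` is expected to behave like a pymongo.MongoClient.
    Returns True if the collection was created, False if it already existed.
    """
    db = client[db_name]
    if collection_name in db.list_collection_names():
        return False
    db.create_collection(collection_name)
    return True
```

With a real client this would be called as, e.g., `ensure_collection(MongoClient(os.environ["MONGODB_URI"]), "llama_index_test_db", "llama_index_test_vectorstore")`.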

### Set Datastore Environment Variables

To establish a connection to the MongoDB Cluster, Database, and Collection, plus create a Vector Search Index,
define the following environment variables.
You can confirm that the required ones have been set like this: `assert "MONGODB_URI" in os.environ`

**IMPORTANT:** These values must be consistent between the Atlas setup and your Python environment(s).

| Name | Description | Example |
| -------------------- | ----------------- | ------------------------------------------------------------------- |
| `MONGODB_URI` | Connection String | mongodb+srv://`<user>`:`<password>`@llama-index.zeatahb.mongodb.net |
| `MONGODB_DATABASE` | Database name | llama_index_test_db |
| `MONGODB_COLLECTION` | Collection name | llama_index_test_vectorstore |
| `MONGODB_INDEX` | Search index name | vector_index |

The following will be required to authenticate with OpenAI.

| Name | Description |
| ---------------- | ------------------------------------------------------------ |
| `OPENAI_API_KEY` | OpenAI token created at https://platform.openai.com/api-keys |
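The `assert` shown above can be extended into a quick check over all of the variables in the two tables. A sketch (the helper name `missing_vars` is ours):

```python
import os
from typing import List, Mapping

REQUIRED_VARS = (
    "MONGODB_URI",
    "MONGODB_DATABASE",
    "MONGODB_COLLECTION",
    "MONGODB_INDEX",
    "OPENAI_API_KEY",
)


def missing_vars(environ: Mapping[str, str] = os.environ) -> List[str]:
    """Return the names of required variables that are not set."""
    return [name for name in REQUIRED_VARS if name not in environ]


if missing_vars():
    print("Missing:", ", ".join(missing_vars()))
```

Note that the test suite skips (rather than fails) tests when `MONGODB_URI` or `OPENAI_API_KEY` is absent, so this check is a convenience, not a requirement.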

### Create an Atlas Vector Search Index

The final step to configure MongoDB as the Datastore is to create a Vector Search Index.
The procedure is described [here](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure).

Under Services on the left panel, choose Atlas Search > Create Search Index >
Atlas Vector Search JSON Editor.

The Plugin expects an index definition like the following.
To begin, choose `numDimensions: 1536`, which matches the dimensionality of OpenAI's
default embedding model. You can experiment with these values later.

```json
{
  "fields": [
    {
      "numDimensions": 1536,
      "path": "embedding",
      "similarity": "cosine",
      "type": "vector"
    }
  ]
}
```
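The same definition can also be built programmatically. The helper below is our own sketch; recent versions of `pymongo` expose `Collection.create_search_index`, which accepts a `SearchIndexModel` built from such a definition, but that call needs a live Atlas cluster and is therefore shown only as a comment:

```python
from typing import Any, Dict


def vector_index_definition(
    num_dimensions: int = 1536,
    path: str = "embedding",
    similarity: str = "cosine",
) -> Dict[str, Any]:
    """Build the Atlas Vector Search index definition shown above."""
    return {
        "fields": [
            {
                "numDimensions": num_dimensions,
                "path": path,
                "similarity": similarity,
                "type": "vector",
            }
        ]
    }


# With a live cluster and a recent pymongo, one could then create the index:
# from pymongo.operations import SearchIndexModel
# collection.create_search_index(
#     SearchIndexModel(
#         definition=vector_index_definition(),
#         name="vector_index",
#         type="vectorSearch",
#     )
# )
```

Parameterizing `num_dimensions` makes it easy to switch embedding models later without editing JSON by hand.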

### Running MongoDB Integration Tests

In addition to the Jupyter Notebook in `examples/`,
a suite of integration tests is available to verify the MongoDB integration.
The test suite requires the cluster to be up and running and the environment variables defined above.
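Putting it together, a typical invocation from the package directory might look like the following. The URI placeholder must be replaced with your own connection string, and `pytest` is assumed to be installed; the run itself is left commented out:

```shell
# Point the tests at your cluster (values from the tables above).
export MONGODB_URI="mongodb+srv://<username>:<password>@cluster0.foo.mongodb.net"
export MONGODB_DATABASE="llama_index_test_db"
export MONGODB_COLLECTION="llama_index_test_vectorstore"
export MONGODB_INDEX="vector_index"
# export OPENAI_API_KEY="sk-..."  # needed for the end-to-end test

# Then run the suite; tests skip themselves when a variable is missing:
# pytest -v tests/
```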
@@ -103,7 +103,7 @@ def __init__(
        db_name: str = "default_db",
        collection_name: str = "default_collection",
        index_name: str = "default",
-       id_key: str = "id",
+       id_key: str = "_id",
        embedding_key: str = "embedding",
        text_key: str = "text",
        metadata_key: str = "metadata",
@@ -128,13 +128,13 @@ def __init__(
        if mongodb_client is not None:
            self._mongodb_client = cast(MongoClient, mongodb_client)
        else:
-           if "MONGO_URI" not in os.environ:
+           if "MONGODB_URI" not in os.environ:
                raise ValueError(
-                   "Must specify MONGO_URI via env variable "
+                   "Must specify MONGODB_URI via env variable "
                    "if not directly passing in client."
                )
            self._mongodb_client = MongoClient(
-               os.environ["MONGO_URI"],
+               os.environ["MONGODB_URI"],
                driver=DriverInfo(name="llama-index", version=version("llama-index")),
            )

@@ -193,7 +193,7 @@ def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:

        """
        # delete by filtering on the doc_id metadata
-       self._collection.delete_one(
+       self._collection.delete_many(
            filter={self._metadata_key + ".ref_doc_id": ref_doc_id}, **delete_kwargs
        )

@@ -21,13 +21,15 @@ ignore_missing_imports = true
python_version = "3.8"

[tool.poetry]
-authors = ["Your Name <[email protected]>"]
+authors = [
+    "The MongoDB Python Team",
+]
description = "llama-index vector_stores mongodb integration"
exclude = ["**/BUILD"]
license = "MIT"
name = "llama-index-vector-stores-mongodb"
readme = "README.md"
-version = "0.1.4"
+version = "0.1.5"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
@@ -37,6 +39,9 @@ pymongo = "^4.6.1"
[tool.poetry.group.dev.dependencies]
ipython = "8.10.0"
jupyter = "^1.0.0"
llama-index-embeddings-openai = "^0.1.5"
llama-index-llms-openai = "^0.1.13"
llama-index-readers-file = "^0.1.4"
mypy = "0.991"
pre-commit = "3.2.0"
pylint = "2.15.10"
@@ -1 +1,5 @@
python_tests()

python_test_utils(
    name="test_utils",
)
@@ -0,0 +1,81 @@
import os
from typing import List
import pytest
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document, TextNode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

import threading

lock = threading.Lock()


@pytest.fixture(scope="session")
def documents() -> List[Document]:
    """List of documents represents data to be embedded in the datastore.
    Minimum requirements for Documents in the /upsert endpoint's UpsertRequest.
    """
    text = Document.example().text
    metadata = Document.example().metadata
    texts = text.split("\n")
    return [Document(text=text, metadata=metadata) for text in texts]


@pytest.fixture(scope="session")
def nodes(documents) -> List[TextNode]:
    if OPENAI_API_KEY is None:
        return None

    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=1024, chunk_overlap=200),
            OpenAIEmbedding(),
        ],
    )

    return pipeline.run(documents=documents)


db_name = os.environ.get("MONGODB_DATABASE", "llama_index_test_db")
collection_name = os.environ.get("MONGODB_COLLECTION", "llama_index_test_vectorstore")
index_name = os.environ.get("MONGODB_INDEX", "vector_index")
MONGODB_URI = os.environ.get("MONGODB_URI")


@pytest.fixture(scope="session")
def atlas_client() -> MongoClient:
    if MONGODB_URI is None:
        return None

    client = MongoClient(MONGODB_URI)

    assert db_name in client.list_database_names()
    assert collection_name in client[db_name].list_collection_names()

    # TODO error: $listSearchIndexes is not allowed or the syntax is incorrect
    # assert index_name in [
    #     idx["name"] for idx in client[db_name][collection_name].list_search_indexes()
    # ]

    # Clear the collection for the tests
    client[db_name][collection_name].delete_many({})

    return client


@pytest.fixture(scope="session")
def vector_store(atlas_client: MongoClient) -> MongoDBAtlasVectorSearch:
    if MONGODB_URI is None:
        return None

    return MongoDBAtlasVectorSearch(
        mongodb_client=atlas_client,
        db_name=db_name,
        collection_name=collection_name,
        index_name=index_name,
    )
@@ -0,0 +1,62 @@
"""Integration Tests of llama-index-vector-stores-mongodb
with MongoDB Atlas Vector Datastore and OPENAI Embedding model.

As described in docs/providers/mongodb/setup.md, to run this, one must
have a running MongoDB Atlas Cluster, and
provide a valid OPENAI_API_KEY.
"""

import os
from time import sleep
from typing import List
import pytest
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.schema import Document
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient

from .conftest import lock


@pytest.mark.skipif(
    os.environ.get("MONGODB_URI") is None, reason="Requires MONGODB_URI in os.environ"
)
def test_mongodb_connection(atlas_client: MongoClient) -> None:
    """Confirm that the connection to the datastore works."""
    assert atlas_client.admin.command("ping")["ok"]


@pytest.mark.skipif(
    os.environ.get("MONGODB_URI") is None or os.environ.get("OPENAI_API_KEY") is None,
    reason="Requires MONGODB_URI and OPENAI_API_KEY in os.environ",
)
def test_index(
    documents: List[Document], vector_store: MongoDBAtlasVectorSearch
) -> None:
    """End-to-end example from essay and query to response.

    via NodeParser, LLM Embedding, VectorStore, and Synthesizer.
    """
    with lock:
        vector_store._collection.delete_many({})
        sleep(2)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents, storage_context=storage_context
        )
        query_engine = index.as_query_engine()

        question = "What are LLMs useful for?"
        no_response = True
        response = None
        retries = 5
        search_limit = query_engine.retriever.similarity_top_k
        while no_response and retries:
            response = query_engine.query(question)
            if len(response.source_nodes) == search_limit:
                no_response = False
            else:
                retries -= 1
                sleep(5)
        assert retries
        assert "LLM" in response.response
@@ -2,6 +2,6 @@
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch


-def test_class():
+def test_class() -> None:
names_of_base_classes = [b.__name__ for b in MongoDBAtlasVectorSearch.__mro__]
assert BasePydanticVectorStore.__name__ in names_of_base_classes