[community] Added SentenceWindowRetriever #21260

Open · wants to merge 21 commits into master
68 changes: 52 additions & 16 deletions libs/community/langchain_community/vectorstores/chroma.py
@@ -33,21 +33,34 @@
DEFAULT_K = 4 # Number of Documents to return.


def _results_to_docs(results: Any) -> List[Document]:
    return [doc for doc, _ in _results_to_docs_and_scores(results)]


def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
    return [
        # TODO: Chroma can do batch querying,
        # we shouldn't hard code to the 1st result
        (Document(page_content=result[0], metadata=result[1] or {}), result[2])
        for result in zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )
    ]
def _results_to_docs(
    results: Any, include_id: Optional[bool] = False
) -> List[Document]:
    return [doc for doc, _ in _results_to_docs_and_scores(results, include_id)]


def _results_to_docs_and_scores(
    results: Any, include_id: Optional[bool] = False
Collaborator: nit: code is not properly typed, what is `results`?

) -> List[Tuple[Document, float]]:
    # TODO: Chroma can do batch querying,
    # we shouldn't hard code to the 1st result
    output = []

    for result in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0],
        results["ids"][0],
    ):
        metadata = result[1] or {}
        if include_id:
            metadata["id"] = result[3]
Collaborator: search logic should not be modifying metadata. It's OK if it's present during indexing, but it shouldn't be mutated on the search path, as the vectorstore should be returning the document as it was indexed.

Contributor (author): Yeah, this was an iffy change. I didn't want to create an entirely new key to store the ID, so I went with just adding it to the metadata. But if we go with your suggestion from the first comment and set up a new ID attribute for documents, that would resolve such issues.
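For illustration, the alternative under discussion (surfacing the store's ID on the Document itself rather than mutating metadata) might look roughly like this. This is a hypothetical sketch, not part of the diff; it assumes a first-class `id` field on `Document`, which langchain_core later added:

```python
# Hypothetical sketch: return Chroma's ID on the Document itself so the
# search path never mutates indexed metadata.
from typing import Any, List, Tuple

from langchain_core.documents import Document


def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
    # Chroma returns ids alongside documents/metadatas/distances, so the
    # ID can be surfaced without an include_id flag or metadata changes.
    return [
        (
            Document(page_content=content, metadata=metadata or {}, id=doc_id),
            distance,
        )
        for content, metadata, distance, doc_id in zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
            results["ids"][0],
        )
    ]
```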


        doc = Document(page_content=result[0], metadata=metadata)

        output.append((doc, result[2]))

    return output


class Chroma(VectorStore):
@@ -357,13 +370,15 @@ def similarity_search_by_vector(
        k: int = DEFAULT_K,
        filter: Optional[Dict[str, str]] = None,
        where_document: Optional[Dict[str, str]] = None,
        include_id: Optional[bool] = False,
Collaborator: We'll need to be careful in terms of how we deal with the ID, so it can be rolled out throughout the various integrations.

        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding (List[float]): Embedding to look up documents similar to.
            k (int): Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
            include_id (Optional[bool]): If True, includes the vector's ID in the
                returned document metadata. Defaults to False.

        Returns:
            List of Documents most similar to the query vector.
        """
@@ -374,7 +389,7 @@
            where_document=where_document,
            **kwargs,
        )
        return _results_to_docs(results)
        return _results_to_docs(results, include_id)
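A brief usage sketch of the proposed flag (hypothetical, assuming this PR is applied; `FakeEmbeddings` stands in for a real embedding model):

```python
# Usage sketch for the proposed include_id flag (not part of the diff).
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = FakeEmbeddings(size=8)
store = Chroma.from_texts(
    texts=["alpha", "beta"],
    embedding=embeddings,
    ids=["doc-1", "doc-2"],
)

docs = store.similarity_search_by_vector(
    embedding=embeddings.embed_query("alpha"),
    k=1,
    include_id=True,
)
print(docs[0].metadata["id"])  # ID of the nearest vector, e.g. "doc-1"
```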

    def similarity_search_by_vector_with_relevance_scores(
        self,
@@ -811,3 +826,24 @@ def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
    def __len__(self) -> int:
        """Count the number of documents in the collection."""
        return self._collection.count()

    def get_documents_by_ids(
        self, ids: Union[int, str, List[Union[int, str]]]
    ) -> List[Document]:
        """Fetch documents from the collection by their IDs."""
        if isinstance(ids, list):
            output = self._collection.get(ids=[str(x) for x in ids])
        else:
            output = self._collection.get(ids=[str(ids)])

        num_results = len(output["ids"])

        output_docs = []

        if num_results > 0:
            for i in range(num_results):
                metadata = output["metadatas"][i]  # type: ignore[index]
                page_content = output["documents"][i]  # type: ignore[index]

                output_docs.append(
                    Document(page_content=page_content, metadata=metadata)
                )

        return output_docs
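And a usage sketch for the new lookup helper, under the same assumptions as above:

```python
# Fetch previously indexed documents directly by ID; a single ID is
# coerced to a one-element batch internally.
docs = store.get_documents_by_ids(["doc-1", "doc-2"])
single = store.get_documents_by_ids("doc-1")
```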
43 changes: 43 additions & 0 deletions libs/community/langchain_community/vectorstores/milvus.py
@@ -739,6 +739,7 @@ def similarity_search_with_score_by_vector(
        param: Optional[dict] = None,
        expr: Optional[str] = None,
        timeout: Optional[float] = None,
        include_id: Optional[bool] = False,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Perform a search on a query string and return results with score.
@@ -755,6 +756,8 @@
            expr (str, optional): Filtering expression. Defaults to None.
            timeout (float, optional): How long to wait before timeout error.
                Defaults to None.
            include_id (bool, optional): If True, includes the vector's ID in
                the document metadata. Defaults to False.
            kwargs: Collection.search() keyword arguments.

        Returns:
@@ -786,6 +789,9 @@
        ret = []
        for result in res[0]:
            data = {x: result.entity.get(x) for x in output_fields}
            if include_id:
                data["id"] = result.id

            doc = self._parse_document(data)
            pair = (doc, result.score)
            ret.append(pair)
@@ -1081,3 +1087,40 @@ def upsert(
"Failed to upsert entities: %s error: %s", self.collection_name, exc
)
raise exc

    def get_documents_by_ids(
        self, ids: Union[int, str, List[Union[int, str]]]
    ) -> List[Document]:
        """Fetch documents from the collection by their primary keys."""
        # Generate a filtering expr to pass to the query function
Collaborator: nit: map into the same code path, or force users to always think in terms of batch (it's a good bias, since the code involves round trips between client code and the server).

        if not isinstance(ids, (list, tuple)):
            ids = [ids]

        quoted_ids = ", ".join(f"'{id_value}'" for id_value in ids)
        expr = f"pk in [{quoted_ids}]"

        output_fields = list(
            set(self.fields).intersection(["source", "text", "pk", "page", "chunk_id"])
        )

        results = self.col.query(expr=expr, output_fields=output_fields)  # type: ignore[union-attr]

        output_docs = []

        if len(results) > 0:
            for i in range(len(results)):
                page_content = results[i]["text"]

                metadata = {"pk": results[i]["pk"]}

                for metadata_field in ["source", "page", "chunk_id"]:
                    if metadata_field in output_fields:
                        metadata[metadata_field] = results[i][metadata_field]

                output_docs.append(
                    Document(page_content=page_content, metadata=metadata)
                )

        return output_docs
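For reference, the filter the method builds is a Milvus boolean expression over the primary-key field. A sketch of what the query sees (values illustrative; `milvus_store` is an assumed existing instance):

```python
# For ids = [1, "2"] the method builds roughly:
#   expr = "pk in ['1', '2']"
# which self.col.query(...) evaluates server-side against the pk field.
docs = milvus_store.get_documents_by_ids([1, "2"])
```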
Binary file added libs/langchain/file::memory:?cache=shared
Binary file not shown.
1 change: 1 addition & 0 deletions libs/langchain/langchain/retrievers/__init__.py
@@ -17,6 +17,7 @@
Document, Serializable, Callbacks,
CallbackManagerForRetrieverRun, AsyncCallbackManagerForRetrieverRun
"""

from typing import TYPE_CHECKING, Any

from langchain._api.module_import import create_importer
32 changes: 32 additions & 0 deletions libs/partners/pinecone/langchain_pinecone/vectorstores.py
@@ -200,6 +200,7 @@ def similarity_search_by_vector_with_score(
        k: int = 4,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
        include_id: Optional[bool] = False,
Collaborator: We don't want to the search API right now.

Contributor (author): You might have missed a word in there. Are you suggesting not to include the 'include_id' argument?

Is this related to your first comment, which said that we need to add an 'id' attribute to Document? Which I guess would make the 'include_id' argument unnecessary?
    ) -> List[Tuple[Document, float]]:
        """Return pinecone documents most similar to embedding, along with scores."""

@@ -215,6 +216,10 @@
        )
        for res in results["matches"]:
            metadata = res["metadata"]

            if include_id:
                metadata["id"] = res["id"]

            if self._text_key in metadata:
                text = metadata.pop(self._text_key)
                score = res["score"]
@@ -493,6 +498,33 @@ def delete(

        return None

    def get_documents_by_ids(
        self, ids: Union[int, str, List[Union[int, str]]]
    ) -> List[Document]:
        """Fetch vectors from the index by their IDs.

        Args:
            ids: ID or list of IDs of the vectors to retrieve.
        """
        if isinstance(ids, list):
            results = self._index.fetch(ids=[str(x) for x in ids])
        else:
            results = self._index.fetch(ids=[str(ids)])

        output_docs = []

        if len(results.vectors) > 0:
            for id_value in results.vectors.keys():
                metadata = results.vectors[id_value].get("metadata")
                page_content = metadata.pop(self._text_key)

                metadata["id"] = id_value

                output_docs.append(
                    Document(page_content=page_content, metadata=metadata)
                )

        return output_docs
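A usage sketch (hypothetical; assumes an existing `PineconeVectorStore` named `store` with indexed vectors):

```python
# Fetch vectors by ID; each returned Document carries its ID in metadata.
docs = store.get_documents_by_ids(["vec-1", "vec-2"])
for doc in docs:
    print(doc.metadata["id"], doc.page_content)
```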


@deprecated(since="0.0.3", removal="0.3.0", alternative="PineconeVectorStore")
class Pinecone(PineconeVectorStore):
21 changes: 21 additions & 0 deletions libs/text-splitters/langchain_text_splitters/base.py
@@ -38,6 +38,7 @@ def __init__(
        keep_separator: bool = False,
        add_start_index: bool = False,
        strip_whitespace: bool = True,
        add_chunk_id: bool = False,
    ) -> None:
        """Create a new TextSplitter.

@@ -49,6 +50,8 @@
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                every document
            add_chunk_id: If `True`, adds a sequential `chunk_id` to each chunk's
                metadata; the counter resets whenever the source changes
        """
        if chunk_overlap > chunk_size:
            raise ValueError(
@@ -61,6 +64,7 @@
        self._keep_separator = keep_separator
        self._add_start_index = add_start_index
        self._strip_whitespace = strip_whitespace
        self._add_chunk_id = add_chunk_id

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
@@ -72,16 +76,33 @@ def create_documents(
"""Create documents from a list of texts."""
_metadatas = metadatas or [{}] * len(texts)
documents = []
prev_text_source = None

for i, text in enumerate(texts):
index = 0
previous_chunk_len = 0

# To check if current text has the same source
# as previous text
if prev_text_source is None:
chunk_id = 0
prev_text_source = _metadatas[i].get("source")
else:
if _metadatas[i].get("source") != prev_text_source:
chunk_id = 0

for chunk in self.split_text(text):
metadata = copy.deepcopy(_metadatas[i])
if self._add_start_index:
offset = index + previous_chunk_len - self._chunk_overlap
index = text.find(chunk, max(0, offset))
metadata["start_index"] = index
previous_chunk_len = len(chunk)

if self._add_chunk_id:
metadata["chunk_id"] = chunk_id
chunk_id += 1

new_doc = Document(page_content=chunk, metadata=metadata)
documents.append(new_doc)
return documents
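A usage sketch of the proposed flag (hypothetical, assuming this PR is applied), showing the counter running across texts that share a source:

```python
# chunk_id is sequential per source: chunks from the same file keep
# counting, and the counter resets when the source changes.
from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=12,
    chunk_overlap=0,
    add_chunk_id=True,
)
docs = splitter.create_documents(
    texts=["alpha beta gamma delta", "epsilon zeta eta theta"],
    metadatas=[{"source": "a.txt"}, {"source": "a.txt"}],
)
print([d.metadata["chunk_id"] for d in docs])  # e.g. [0, 1, 2, 3]
```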