[community] Added SentenceWindowRetriever #21260

Open · wants to merge 21 commits into master
68 changes: 52 additions & 16 deletions libs/community/langchain_community/vectorstores/chroma.py
@@ -33,21 +33,34 @@
DEFAULT_K = 4 # Number of Documents to return.


def _results_to_docs(results: Any) -> List[Document]:
    return [doc for doc, _ in _results_to_docs_and_scores(results)]


def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
    return [
        # TODO: Chroma can do batch querying,
        # we shouldn't hard code to the 1st result
        (Document(page_content=result[0], metadata=result[1] or {}), result[2])
        for result in zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )
    ]
def _results_to_docs(
    results: Any, include_id: Optional[bool] = False
) -> List[Document]:
    return [doc for doc, _ in _results_to_docs_and_scores(results, include_id)]


def _results_to_docs_and_scores(
    results: Any, include_id: Optional[bool] = False
Collaborator: nit: code is not properly typed, what is `results`?

) -> List[Tuple[Document, float]]:
    # TODO: Chroma can do batch querying,
    # we shouldn't hard code to the 1st result
    output = []

    for result in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0],
        results["ids"][0],
    ):
        metadata = result[1] or {}
        if include_id:
            metadata["id"] = result[3]
Collaborator: search logic should not be modifying metadata. It's OK if it's present during indexing, but it shouldn't be mutated on the search path, as the vectorstore should be returning the document as it was indexed.

Contributor (author): Yeah, this was an iffy change. I didn't want to create an entirely new key to store the ID, so I went with just adding it to the metadata. But if we go with your suggestion from the first comment and set up a new ID attribute for documents, that would resolve such issues.
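For illustration, the alternative under discussion (surfacing the store's ID on the Document itself rather than mutating metadata) might look roughly like this. This is a hypothetical sketch, not part of the diff; it assumes a first-class `id` field on `Document`, which langchain_core later added:

```python
# Hypothetical sketch: return Chroma's ID on the Document itself so the
# search path never mutates indexed metadata.
from typing import Any, List, Tuple

from langchain_core.documents import Document


def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
    # Chroma returns ids alongside documents/metadatas/distances, so the
    # ID can be surfaced without an include_id flag or metadata changes.
    return [
        (
            Document(page_content=content, metadata=metadata or {}, id=doc_id),
            distance,
        )
        for content, metadata, distance, doc_id in zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
            results["ids"][0],
        )
    ]
```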


        doc = Document(page_content=result[0], metadata=metadata)

        output.append((doc, result[2]))

    return output


class Chroma(VectorStore):
@@ -357,13 +370,15 @@ def similarity_search_by_vector(
        k: int = DEFAULT_K,
        filter: Optional[Dict[str, str]] = None,
        where_document: Optional[Dict[str, str]] = None,
        include_id: Optional[bool] = False,
Collaborator: We'll need to be careful in terms of how we deal with the ID, so it can be rolled out throughout the various integrations.

        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding (List[float]): Embedding to look up documents similar to.
            k (int): Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
            include_id (Optional[bool]): If True, includes the vector's ID in the
                returned document metadata. Defaults to False.

        Returns:
            List of Documents most similar to the query vector.
        """
@@ -374,7 +389,7 @@
            where_document=where_document,
            **kwargs,
        )
        return _results_to_docs(results)
        return _results_to_docs(results, include_id)
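A brief usage sketch of the proposed flag (hypothetical, assuming this PR is applied; `FakeEmbeddings` stands in for a real embedding model):

```python
# Usage sketch for the proposed include_id flag (not part of the diff).
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = FakeEmbeddings(size=8)
store = Chroma.from_texts(
    texts=["alpha", "beta"],
    embedding=embeddings,
    ids=["doc-1", "doc-2"],
)

docs = store.similarity_search_by_vector(
    embedding=embeddings.embed_query("alpha"),
    k=1,
    include_id=True,
)
print(docs[0].metadata["id"])  # ID of the nearest vector, e.g. "doc-1"
```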

    def similarity_search_by_vector_with_relevance_scores(
        self,
@@ -811,3 +826,24 @@ def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
    def __len__(self) -> int:
        """Count the number of documents in the collection."""
        return self._collection.count()

    def get_documents_by_ids(
        self, ids: Union[int, str, List[Union[int, str]]]
    ) -> List[Document]:
        """Fetch documents from the collection by their IDs."""
        if isinstance(ids, list):
            output = self._collection.get(ids=[str(x) for x in ids])
        else:
            output = self._collection.get(ids=[str(ids)])

        num_results = len(output["ids"])

        output_docs = []

        if num_results > 0:
            for i in range(num_results):
                metadata = output["metadatas"][i]  # type: ignore[index]
                page_content = output["documents"][i]  # type: ignore[index]

                output_docs.append(
                    Document(page_content=page_content, metadata=metadata)
                )

        return output_docs
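And a usage sketch for the new lookup helper, under the same assumptions as above:

```python
# Fetch previously indexed documents directly by ID; a single ID is
# coerced to a one-element batch internally.
docs = store.get_documents_by_ids(["doc-1", "doc-2"])
single = store.get_documents_by_ids("doc-1")
```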
43 changes: 43 additions & 0 deletions libs/community/langchain_community/vectorstores/milvus.py
@@ -739,6 +739,7 @@ def similarity_search_with_score_by_vector(
        param: Optional[dict] = None,
        expr: Optional[str] = None,
        timeout: Optional[float] = None,
        include_id: Optional[bool] = False,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Perform a search on a query string and return results with score.
@@ -755,6 +756,8 @@
            expr (str, optional): Filtering expression. Defaults to None.
            timeout (float, optional): How long to wait before timeout error.
                Defaults to None.
            include_id (bool, optional): If True, includes the vector's ID in
                the document metadata. Defaults to False.
            kwargs: Collection.search() keyword arguments.

        Returns:
@@ -786,6 +789,9 @@
        ret = []
        for result in res[0]:
            data = {x: result.entity.get(x) for x in output_fields}
            if include_id:
                data["id"] = result.id

            doc = self._parse_document(data)
            pair = (doc, result.score)
            ret.append(pair)
@@ -1081,3 +1087,40 @@ def upsert(
"Failed to upsert entities: %s error: %s", self.collection_name, exc
)
raise exc

    def get_documents_by_ids(
        self, ids: Union[int, str, List[Union[int, str]]]
    ) -> List[Document]:
        """Fetch documents from the collection by their primary keys."""
        # Generate a filtering expr to pass to the query function
Collaborator: nit: map into the same code path, or force users to always think in terms of batch (it's a good bias, since the code involves round trips between client code and the server).

        if not isinstance(ids, (list, tuple)):
            ids = [ids]

        quoted_ids = ", ".join(f"'{id_value}'" for id_value in ids)
        expr = f"pk in [{quoted_ids}]"

        output_fields = list(
            set(self.fields).intersection(["source", "text", "pk", "page", "chunk_id"])
        )

        results = self.col.query(expr=expr, output_fields=output_fields)  # type: ignore[union-attr]

        output_docs = []

        if len(results) > 0:
            for i in range(len(results)):
                page_content = results[i]["text"]

                metadata = {"pk": results[i]["pk"]}

                for metadata_field in ["source", "page", "chunk_id"]:
                    if metadata_field in output_fields:
                        metadata[metadata_field] = results[i][metadata_field]

                output_docs.append(
                    Document(page_content=page_content, metadata=metadata)
                )

        return output_docs
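For reference, the filter the method builds is a Milvus boolean expression over the primary-key field. A sketch of what the query sees (values illustrative; `milvus_store` is an assumed existing instance):

```python
# For ids = [1, "2"] the method builds roughly:
#   expr = "pk in ['1', '2']"
# which self.col.query(...) evaluates server-side against the pk field.
docs = milvus_store.get_documents_by_ids([1, "2"])
```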
Binary file added libs/langchain/file::memory:?cache=shared
Binary file not shown.
1 change: 1 addition & 0 deletions libs/langchain/langchain/retrievers/__init__.py
@@ -17,6 +17,7 @@
Document, Serializable, Callbacks,
CallbackManagerForRetrieverRun, AsyncCallbackManagerForRetrieverRun
"""

from typing import TYPE_CHECKING, Any

from langchain._api.module_import import create_importer
32 changes: 32 additions & 0 deletions libs/partners/pinecone/langchain_pinecone/vectorstores.py
@@ -200,6 +200,7 @@ def similarity_search_by_vector_with_score(
        k: int = 4,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
        include_id: Optional[bool] = False,
Collaborator: We don't want to the search API right now.

Contributor (author): You might have missed a word in there. Are you suggesting not to include the 'include_id' argument?

Is this related to your first comment, which said that we need to add an 'id' attribute to Document? Which I guess would make the 'include_id' argument unnecessary?
    ) -> List[Tuple[Document, float]]:
        """Return pinecone documents most similar to embedding, along with scores."""

@@ -215,6 +216,10 @@
        )
        for res in results["matches"]:
            metadata = res["metadata"]

            if include_id:
                metadata["id"] = res["id"]

            if self._text_key in metadata:
                text = metadata.pop(self._text_key)
                score = res["score"]
@@ -493,6 +498,33 @@ def delete(

        return None

    def get_documents_by_ids(
        self, ids: Union[int, str, List[Union[int, str]]]
    ) -> List[Document]:
        """Fetch vectors from the index by their IDs.

        Args:
            ids: ID or list of IDs of the vectors to retrieve.
        """
        if isinstance(ids, list):
            results = self._index.fetch(ids=[str(x) for x in ids])
        else:
            results = self._index.fetch(ids=[str(ids)])

        output_docs = []

        if len(results.vectors) > 0:
            for id_value in results.vectors.keys():
                metadata = results.vectors[id_value].get("metadata")
                page_content = metadata.pop(self._text_key)

                metadata["id"] = id_value

                output_docs.append(
                    Document(page_content=page_content, metadata=metadata)
                )

        return output_docs
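A usage sketch (hypothetical; assumes an existing `PineconeVectorStore` named `store` with indexed vectors):

```python
# Fetch vectors by ID; each returned Document carries its ID in metadata.
docs = store.get_documents_by_ids(["vec-1", "vec-2"])
for doc in docs:
    print(doc.metadata["id"], doc.page_content)
```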


@deprecated(since="0.0.3", removal="0.3.0", alternative="PineconeVectorStore")
class Pinecone(PineconeVectorStore):
21 changes: 21 additions & 0 deletions libs/text-splitters/langchain_text_splitters/base.py
@@ -38,6 +38,7 @@ def __init__(
        keep_separator: bool = False,
        add_start_index: bool = False,
        strip_whitespace: bool = True,
        add_chunk_id: bool = False,
    ) -> None:
        """Create a new TextSplitter.

@@ -49,6 +50,8 @@
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                every document
            add_chunk_id: If `True`, adds a sequential `chunk_id` to each chunk's
                metadata; the counter resets whenever the source changes
        """
        if chunk_overlap > chunk_size:
            raise ValueError(
@@ -61,6 +64,7 @@
        self._keep_separator = keep_separator
        self._add_start_index = add_start_index
        self._strip_whitespace = strip_whitespace
        self._add_chunk_id = add_chunk_id

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
@@ -72,16 +76,33 @@ def create_documents(
"""Create documents from a list of texts."""
_metadatas = metadatas or [{}] * len(texts)
documents = []
prev_text_source = None

for i, text in enumerate(texts):
index = 0
previous_chunk_len = 0

# To check if current text has the same source
# as previous text
if prev_text_source is None:
chunk_id = 0
prev_text_source = _metadatas[i].get("source")
else:
if _metadatas[i].get("source") != prev_text_source:
chunk_id = 0

for chunk in self.split_text(text):
metadata = copy.deepcopy(_metadatas[i])
if self._add_start_index:
offset = index + previous_chunk_len - self._chunk_overlap
index = text.find(chunk, max(0, offset))
metadata["start_index"] = index
previous_chunk_len = len(chunk)

if self._add_chunk_id:
metadata["chunk_id"] = chunk_id
chunk_id += 1

new_doc = Document(page_content=chunk, metadata=metadata)
documents.append(new_doc)
return documents
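A usage sketch of the proposed flag (hypothetical, assuming this PR is applied), showing the counter running across texts that share a source:

```python
# chunk_id is sequential per source: chunks from the same file keep
# counting, and the counter resets when the source changes.
from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=12,
    chunk_overlap=0,
    add_chunk_id=True,
)
docs = splitter.create_documents(
    texts=["alpha beta gamma delta", "epsilon zeta eta theta"],
    metadatas=[{"source": "a.txt"}, {"source": "a.txt"}],
)
print([d.metadata["chunk_id"] for d in docs])  # e.g. [0, 1, 2, 3]
```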