-
Notifications
You must be signed in to change notification settings - Fork 13.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[community] Added SentenceWindowRetriever #21260
base: master
Are you sure you want to change the base?
Changes from all commits
a71b6b8
68cef24
a9889ee
efb848d
679e5d3
a77318f
5e2e6ca
f38fd95
03d6c35
c1a1fb5
273fdb1
2752876
b3a8df1
06488e0
46f91af
ad2c3df
a0508b2
d82b6bc
36dc48d
72c6dba
c57ac20
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,21 +33,34 @@ | |
DEFAULT_K = 4 # Number of Documents to return. | ||
|
||
|
||
def _results_to_docs(results: Any) -> List[Document]:
    """Convert a raw query result into Documents, discarding the scores.

    Args:
        results: Raw query result, passed unchanged to
            ``_results_to_docs_and_scores``.

    Returns:
        The Documents from each (Document, score) pair, in the same order.
    """
    documents = []
    for document, _score in _results_to_docs_and_scores(results):
        documents.append(document)
    return documents
|
||
|
||
def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
    """Pair every Document in a raw query result with its distance.

    Args:
        results: Mapping with "documents", "metadatas" and "distances"
            entries; only index 0 of each entry is read.

    Returns:
        A list of (Document, distance) tuples. A falsy metadata entry is
        replaced by an empty dict.
    """
    # TODO: Chroma can do batch querying,
    # we shouldn't hard code to the 1st result
    docs_and_scores = []
    for text, metadata, distance in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0],
    ):
        document = Document(page_content=text, metadata=metadata or {})
        docs_and_scores.append((document, distance))
    return docs_and_scores
def _results_to_docs(
    results: Any, include_id: Optional[bool] = False
) -> List[Document]:
    """Convert a raw query result into Documents, discarding the scores.

    Args:
        results: Raw query result, passed unchanged to
            ``_results_to_docs_and_scores``.
        include_id: Forwarded unchanged to ``_results_to_docs_and_scores``.

    Returns:
        The Documents from each (Document, score) pair, in the same order.
    """
    docs_and_scores = _results_to_docs_and_scores(results, include_id)
    documents = []
    for document, _score in docs_and_scores:
        documents.append(document)
    return documents
|
||
|
||
def _results_to_docs_and_scores( | ||
results: Any, include_id: Optional[bool] = False | ||
) -> List[Tuple[Document, float]]: | ||
# TODO: Chroma can do batch querying, | ||
# we shouldn't hard code to the 1st result | ||
output = [] | ||
|
||
for result in zip( | ||
results["documents"][0], | ||
results["metadatas"][0], | ||
results["distances"][0], | ||
results["ids"][0], | ||
): | ||
metadata = result[1] or {} | ||
if include_id: | ||
metadata["id"] = result[3] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Search logic should not be modifying metadata. It's OK if it's present during indexing, but it shouldn't be mutated on the search path, as the vectorstore should be returning the document as it was indexed. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, this was an iffy change. I didn't want to create an entirely new key to store the ID, so I went with just adding it to the metadata. But if we go with your suggestion from the first comment and set up a new ID attribute for documents, that would resolve such issues. |
||
|
||
doc = Document(page_content=result[0], metadata=metadata) | ||
|
||
output.append((doc, result[2])) | ||
|
||
return output | ||
|
||
|
||
class Chroma(VectorStore): | ||
|
@@ -357,13 +370,15 @@ def similarity_search_by_vector( | |
k: int = DEFAULT_K, | ||
filter: Optional[Dict[str, str]] = None, | ||
where_document: Optional[Dict[str, str]] = None, | ||
include_id: Optional[bool] = False, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We'll need to be careful in terms of how we deal with the ID, so it can be rolled out throughout the various integrations. |
||
**kwargs: Any, | ||
) -> List[Document]: | ||
"""Return docs most similar to embedding vector. | ||
Args: | ||
embedding (List[float]): Embedding to look up documents similar to. | ||
k (int): Number of Documents to return. Defaults to 4. | ||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. | ||
include_id : Returns id of the vector as metadata if set to True | ||
Returns: | ||
List of Documents most similar to the query vector. | ||
""" | ||
|
@@ -374,7 +389,7 @@ def similarity_search_by_vector( | |
where_document=where_document, | ||
**kwargs, | ||
) | ||
return _results_to_docs(results) | ||
return _results_to_docs(results, include_id) | ||
|
||
def similarity_search_by_vector_with_relevance_scores( | ||
self, | ||
|
@@ -811,3 +826,24 @@ def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None: | |
def __len__(self) -> int:
    """Return the number of documents reported by the collection."""
    collection = self._collection
    return collection.count()
|
||
def get_documents_by_ids(self, ids: int | str | List[int | str]) -> List[Document]:
    """Fetch documents from the collection by their IDs.

    Args:
        ids: A single ID, or a list/tuple of IDs. Every ID is converted
            to ``str`` before the lookup.

    Returns:
        One Document per entry returned by the collection, in the order
        the collection returned them; an empty list when nothing matches.
    """
    # Funnel the scalar case into the batch code path. Tuples are treated
    # as batches too; previously a tuple was stringified into one bogus ID.
    if not isinstance(ids, (list, tuple)):
        ids = [ids]

    output = self._collection.get(ids=[str(x) for x in ids])

    # A missing metadata entry becomes an empty dict, matching how the
    # search helpers in this module build Documents (`metadata or {}`).
    return [
        Document(page_content=page_content, metadata=metadata or {})
        for page_content, metadata in zip(
            output["documents"],  # type: ignore[arg-type]
            output["metadatas"],  # type: ignore[arg-type]
        )
    ]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -739,6 +739,7 @@ def similarity_search_with_score_by_vector( | |
param: Optional[dict] = None, | ||
expr: Optional[str] = None, | ||
timeout: Optional[float] = None, | ||
include_id: Optional[bool] = False, | ||
**kwargs: Any, | ||
) -> List[Tuple[Document, float]]: | ||
"""Perform a search on a query string and return results with score. | ||
|
@@ -755,6 +756,8 @@ def similarity_search_with_score_by_vector( | |
expr (str, optional): Filtering expression. Defaults to None. | ||
timeout (float, optional): How long to wait before timeout error. | ||
Defaults to None. | ||
include_id (bool, optional): Returns id of the vector as metadata | ||
if set to True | ||
kwargs: Collection.search() keyword arguments. | ||
|
||
Returns: | ||
|
@@ -786,6 +789,9 @@ def similarity_search_with_score_by_vector( | |
ret = [] | ||
for result in res[0]: | ||
data = {x: result.entity.get(x) for x in output_fields} | ||
if include_id: | ||
data["id"] = result.id | ||
|
||
doc = self._parse_document(data) | ||
pair = (doc, result.score) | ||
ret.append(pair) | ||
|
@@ -1081,3 +1087,40 @@ def upsert( | |
"Failed to upsert entities: %s error: %s", self.collection_name, exc | ||
) | ||
raise exc | ||
|
||
def get_documents_by_ids(self, ids: int | str | List[int | str]) -> List[Document]: | ||
# Generating filtering expr for passing to query function | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: map both cases into the same code path, or force users to always think in terms of batches (it's a good bias, since the code involves round trips between client code and server), e.g. `if not isinstance(ids, (list, tuple)):` |
||
if isinstance(ids, list): | ||
expr = "pk in [" | ||
|
||
for id_value in ids: | ||
expr += f"'{id_value}'," | ||
|
||
expr += "]" | ||
|
||
else: | ||
expr = "pk in ['{id}']".format(id=ids) | ||
|
||
output_fields = list( | ||
set(self.fields).intersection(["source", "text", "pk", "page", "chunk_id"]) | ||
) | ||
|
||
results = self.col.query(expr=expr, output_fields=output_fields) # type: ignore[union-attr] | ||
|
||
output_docs = [] | ||
|
||
if len(results) > 0: | ||
for i in range(len(results)): | ||
page_content = results[i]["text"] | ||
|
||
metadata = {"pk": results[i]["pk"]} | ||
|
||
for metadata_field in ["source", "page", "chunk_id"]: | ||
if metadata_field in output_fields: | ||
metadata[metadata_field] = results[i][metadata_field] | ||
|
||
output_docs.append( | ||
Document(page_content=page_content, metadata=metadata) | ||
) | ||
|
||
return output_docs |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -200,6 +200,7 @@ def similarity_search_by_vector_with_score( | |
k: int = 4, | ||
filter: Optional[dict] = None, | ||
namespace: Optional[str] = None, | ||
include_id: Optional[bool] = False, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We don't want to [change] the search API right now. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You might have missed a word in there. Are you suggesting not to include the 'include_id' argument? Is this related to your first comment, which said that we need to add an 'id' attribute to Document? I guess that would make the include_id argument unnecessary? |
||
) -> List[Tuple[Document, float]]: | ||
"""Return pinecone documents most similar to embedding, along with scores.""" | ||
|
||
|
@@ -215,6 +216,10 @@ def similarity_search_by_vector_with_score( | |
) | ||
for res in results["matches"]: | ||
metadata = res["metadata"] | ||
|
||
if include_id: | ||
metadata["id"] = res["id"] | ||
|
||
if self._text_key in metadata: | ||
text = metadata.pop(self._text_key) | ||
score = res["score"] | ||
|
@@ -493,6 +498,33 @@ def delete( | |
|
||
return None | ||
|
||
def get_documents_by_ids(self, ids: int | str | List[int | str]) -> List[Document]:
    """Fetch vectors by ID and return them as Documents.

    Args:
        ids: A single ID, or a list/tuple of IDs. Every ID is converted
            to ``str`` before the fetch.

    Returns:
        One Document per fetched vector. The page content is taken from
        the metadata entry stored under ``self._text_key``; the remaining
        metadata is kept and the vector ID is added under ``"id"``.

    Raises:
        KeyError: If a fetched vector's metadata has no entry under
            ``self._text_key``.
    """
    # Funnel the scalar case into the batch code path. Tuples are treated
    # as batches too; previously a tuple was stringified into one bogus ID.
    if not isinstance(ids, (list, tuple)):
        ids = [ids]

    # NOTE(review): `results` is whatever `self._index.fetch` returns; this
    # method relies only on `.vectors` being a mapping of ID -> vector whose
    # values support `.get("metadata")` -- confirm against the client in use.
    results = self._index.fetch(ids=[str(x) for x in ids])

    output_docs = []
    for id_value, vector in results.vectors.items():
        # Copy so the fetch response itself is never mutated.
        metadata = dict(vector.get("metadata") or {})
        # Use the configured text key, as the similarity-search path of
        # this class does, instead of a hard-coded "text".
        page_content = metadata.pop(self._text_key)
        metadata["id"] = id_value
        output_docs.append(Document(page_content=page_content, metadata=metadata))

    return output_docs
|
||
|
||
@deprecated(since="0.0.3", removal="0.3.0", alternative="PineconeVectorStore") | ||
class Pinecone(PineconeVectorStore): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: code is not properly typed, what is results?