
Merge pull request #702 from BBC-Esq/update-langchain-embedding-classes
Automatically select the correct langchain embedding class for the configured embedding model.
PromtEngineer committed Feb 3, 2024
2 parents 040c69a + 747a9b4 commit 8450efc
Showing 2 changed files with 57 additions and 16 deletions.
43 changes: 30 additions & 13 deletions ingest.py
@@ -153,26 +153,43 @@ def main(device_type):
logging.info(f"Loaded {len(documents)} documents from {SOURCE_DIRECTORY}")
logging.info(f"Split into {len(texts)} chunks of text")

# Create embeddings
embeddings = HuggingFaceInstructEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": device_type},
)
# change the embedding type here if you are running into issues.
# These are much smaller embeddings and will work for most applications
# If you use HuggingFaceEmbeddings, make sure to also use the same in the
# run_localGPT.py file.

# embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
"""
(1) Chooses an appropriate langchain library based on the enbedding model name. Matching code is contained within fun_localGPT.py.
(2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained on
their respective huggingface repository, project page or github repository.
"""

if "instructor" in EMBEDDING_MODEL_NAME:
return HuggingFaceInstructEmbeddings(
model_name=EMBEDDING_MODEL_NAME,
model_kwargs={"device": compute_device},
embed_instruction='Represent the document for retrieval:',
query_instruction='Represent the question for retrieving supporting documents:'
)

elif "bge" in EMBEDDING_MODEL_NAME:
query_instruction = 'Represent this sentence for searching relevant passages:'

return HuggingFaceBgeEmbeddings(
model_name=EMBEDDING_MODEL_NAME,
model_kwargs={"device": compute_device},
query_instruction='Represent this sentence for searching relevant passages:'
)
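# Note: for the BGE class, query_instruction should only be prepended to queries; documents
# are embedded without an instruction (the class's embed_instruction defaults to an empty string).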

else:
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={"device": device_type},
    )

db = Chroma.from_documents(
    texts,
    embeddings,
    persist_directory=PERSIST_DIRECTORY,
    client_settings=CHROMA_SETTINGS,
)
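# PERSIST_DIRECTORY and CHROMA_SETTINGS (like EMBEDDING_MODEL_NAME) are expected to come from constants.py;
# run_localGPT.py should reopen the store with the same settings so queries hit the vectors written here.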



if __name__ == "__main__":
logging.basicConfig(
30 changes: 27 additions & 3 deletions run_localGPT.py
@@ -119,9 +119,33 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
- The QA system retrieves relevant documents using the retriever and then answers questions based on those documents.
"""

embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": device_type})
# uncomment the following line if you used HuggingFaceEmbeddings in ingest.py
# embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
"""
(1) Chooses an appropriate langchain library based on the enbedding model name. Matching code is contained within ingest.py.
(2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained on
their respective huggingface repository, project page or github repository.
"""

if "instructor" in EMBEDDING_MODEL_NAME:
return HuggingFaceInstructEmbeddings(
model_name=EMBEDDING_MODEL_NAME,
model_kwargs={"device": compute_device},
embed_instruction='Represent the document for retrieval:',
query_instruction='Represent the question for retrieving supporting documents:'
)

elif "bge" in EMBEDDING_MODEL_NAME:
return HuggingFaceBgeEmbeddings(
model_name=EMBEDDING_MODEL_NAME,
model_kwargs={"device": compute_device},
query_instruction='Represent this sentence for searching relevant passages:'
)

else:
return HuggingFaceEmbeddings(
model_name=EMBEDDING_MODEL_NAME,
model_kwargs={"device": compute_device},
)

# load the vectorstore
db = Chroma(
