-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
105 lines (86 loc) · 4.06 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pickle
from flask import Flask, render_template, request, jsonify
import time
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain_core.documents import Document
app = Flask(__name__)
# Function to print execution time
def print_execution_time(message, start_time):
    """Print *message* followed by the seconds elapsed since *start_time*.

    Args:
        message: Label to prefix the timing line with.
        start_time: Epoch timestamp (from time.time()) when the work began.
    """
    elapsed = time.time() - start_time
    print(f"{message}: {elapsed} seconds")
# Function to load documents from pickle object
def load_documents_from_pickle(pickle_file):
    """Deserialize and return the document list stored in *pickle_file*.

    NOTE(review): pickle.load executes arbitrary code when given a
    malicious file — only use with trusted, locally-produced pickles.

    Args:
        pickle_file: Path to the pickle file to read.

    Returns:
        Whatever object was pickled (expected: a list of Documents).
    """
    t0 = time.time()
    with open(pickle_file, 'rb') as handle:
        docs = pickle.load(handle)
    print_execution_time("\nLoaded Documents", t0)
    return docs
# Function to split text into chunks
def split_text_into_chunks(documents):
    """Split *documents* into overlapping text chunks for embedding.

    Uses a recursive character splitter with 500-character chunks and a
    50-character overlap between consecutive chunks.

    Args:
        documents: Iterable of langchain Documents to split.

    Returns:
        The list of chunked Documents.
    """
    t0 = time.time()
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(documents)
    print_execution_time("\nSplit Text", t0)
    return chunks
# Function to create embeddings
def create_embeddings(device="cuda:0"):
    """Create the sentence-transformer embedding model.

    Generalized: the torch device is now a parameter instead of a
    hard-coded constant (default preserves the original behavior).

    Args:
        device: Torch device string for the model, e.g. "cuda:0" or
            "cpu" when no GPU is available.

    Returns:
        A HuggingFaceEmbeddings instance backed by
        sentence-transformers/all-MiniLM-L6-v2.
    """
    start_time = time.time()
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': device},
    )
    print_execution_time("\nCreated Embeddings", start_time)
    return embeddings
# Function to create vector store
def create_vector_store(text_chunks, embeddings):
    """Build a FAISS vector store over *text_chunks* using *embeddings*.

    Args:
        text_chunks: Chunked Documents to index.
        embeddings: Embedding model used to vectorize each chunk.

    Returns:
        The populated FAISS vector store.
    """
    t0 = time.time()
    store = FAISS.from_documents(text_chunks, embeddings)
    print_execution_time("\nCreated vector store", t0)
    return store
# Function to create LLMS model
def create_llms_model(
    model_path="/path/to/file/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    max_new_tokens=1000,
    temperature=0.01,
    context_length=2048,
):
    """Load the local GGUF LLM via CTransformers.

    Generalized: the model path and generation config are now parameters
    instead of hard-coded constants (defaults preserve the original
    behavior, including the placeholder path that must be edited to point
    at a real model file).

    Args:
        model_path: Filesystem path to the GGUF model file.
        max_new_tokens: Maximum tokens to generate per response.
        temperature: Sampling temperature (near-0 = near-deterministic).
        context_length: Model context window in tokens.

    Returns:
        A CTransformers LLM instance.
    """
    start_time = time.time()
    llm = CTransformers(
        model=model_path,
        config={
            'max_new_tokens': max_new_tokens,
            'temperature': temperature,
            'context_length': context_length,
        },
    )
    print_execution_time("\nCreated LLM", start_time)
    return llm
# --- Import-time pipeline setup -------------------------------------------
# NOTE(review): everything below runs at import time and is heavyweight
# (disk I/O, embedding-model download, FAISS indexing, LLM load) — the
# Flask app will not serve requests until it completes.

# Load the pre-serialized documents from "./output.pkl".
documents = load_documents_from_pickle("./output.pkl")
# Split documents into overlapping chunks sized for embedding.
text_chunks = split_text_into_chunks(documents)
# Build the sentence-transformer embedding model.
embeddings = create_embeddings()
# Index the chunks into a FAISS vector store.
vector_store = create_vector_store(text_chunks, embeddings)
# Load the local Mistral GGUF model.
llm = create_llms_model()
# Conversation memory keyed the way ConversationalRetrievalChain expects.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# Retrieval-augmented chat chain: top-2 chunks are stuffed into the prompt.
chain = ConversationalRetrievalChain.from_llm(llm=llm, chain_type='stuff',
retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
memory=memory)
# Define chat function
def conversation_chat(query):
    """Answer *query* with the retrieval chain and time the call.

    Appends a fixed instruction suffix to the user's question before
    invoking the chain.

    Args:
        query: The raw user question.

    Returns:
        Tuple of (answer string, elapsed seconds for the chain call).
    """
    t0 = time.time()
    prompt = query + " Be concise. Stick to the context. If a question that is asked is not found in the context, just say \"I could not find that in the given context.\" Be helpful. Avoid speculation. Use natural language. Provide accurate information. Use a maximum of 5 sentences"
    # chat_history is reset to empty on every call even though the chain
    # carries a ConversationBufferMemory — preserving original behavior.
    result = chain({"question": prompt, "chat_history": []})
    elapsed = time.time() - t0
    return result["answer"], elapsed
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the chat page (GET) or answer a submitted query (POST).

    POST expects a form field named 'query' and responds with JSON
    containing the query, the model's answer, and the query time.
    """
    # Guard clause: plain page load.
    if request.method != 'POST':
        return render_template('index.html')
    query = request.form['query']
    answer, query_time = conversation_chat(query)
    return jsonify({'query': query, 'response': answer, 'query_time': query_time})
if __name__ == '__main__':
    # NOTE(review): debug=True enables the Werkzeug reloader and the
    # interactive debugger (arbitrary code execution) — development only;
    # disable before any production deployment.
    app.run(debug=True)