Skip to content

Commit

Permalink
Revert "feat(celery): moved assistant summary to celery" (#2558)
Browse files Browse the repository at this point in the history
Reverts #2557
  • Loading branch information
StanGirard committed May 7, 2024
1 parent 5583436 commit cd73412
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 197 deletions.
3 changes: 0 additions & 3 deletions backend/celery_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

celery = Celery(__name__)


if CELERY_BROKER_URL.startswith("sqs"):
broker_transport_options = {
CELERY_BROKER_QUEUE_NAME: {
Expand Down Expand Up @@ -37,5 +36,3 @@
)
else:
raise ValueError(f"Unsupported broker URL: {CELERY_BROKER_URL}")

celery.autodiscover_tasks(["modules.assistant.ito"])
77 changes: 31 additions & 46 deletions backend/modules/assistant/ito/ito.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@

from fastapi import UploadFile
from logger import get_logger
from modules.user.service.user_usage import UserUsage
from modules.assistant.dto.inputs import InputAssistant
from modules.assistant.ito.utils.pdf_generator import PDFGenerator, PDFModel
from modules.chat.controller.chat.utils import update_user_usage
from modules.contact_support.controller.settings import ContactsSettings
from modules.upload.controller.upload_routes import upload_file
from modules.user.entity.user_identity import UserIdentity
from modules.user.service.user_usage import UserUsage
from packages.emails.send_email import send_email
from pydantic import BaseModel
from unidecode import unidecode
Expand Down Expand Up @@ -62,36 +62,31 @@ def increase_usage_user(self):
def calculate_pricing(self):
return 20

def generate_pdf(self, filename: str, title: str, content: str):
    """Produce a PDF from ``title`` and ``content`` under ``filename``.

    A ``PDFModel`` carrying the title and content is wrapped in a
    ``PDFGenerator``; the generator is asked to print the document and
    then to emit it via ``output(filename, "F")``.
    """
    document = PDFGenerator(PDFModel(title=title, content=content))
    document.print_pdf()
    document.output(filename, "F")

@abstractmethod
async def process_assistant(self):
    """Abstract coroutine hook; this base declaration has no behaviour of its own."""


async def uploadfile_to_file(uploadFile: UploadFile):
    """Copy the bytes of ``uploadFile`` into a fresh named temporary file.

    The temporary file is created with ``delete=False`` and is returned
    still open; this function neither closes nor removes it.
    """
    spooled = NamedTemporaryFile(delete=False)
    payload = uploadFile.file.read()
    spooled.write(payload)
    # Flush buffered bytes so the full content is on disk before returning.
    spooled.flush()
    return spooled


class OutputHandler(BaseModel):
async def send_output_by_email(
self,
filename: str,
file: UploadFile,
filename: str,
task_name: str,
custom_message: str,
brain_id: str = None,
user_email: str = None,
):
settings = ContactsSettings()
file = await uploadfile_to_file(file)
file = await self.uploadfile_to_file(file)
domain_quivr = os.getenv("QUIVR_DOMAIN", "https://chat.quivr.app/")

with open(file.name, "rb") as f:
mail_from = settings.resend_contact_sales_from
mail_to = user_email
mail_to = self.current_user.email
body = f"""
<div style="text-align: center;">
<img src="https://quivr-cms.s3.eu-west-3.amazonaws.com/logo_quivr_white_7e3c72620f.png" alt="Quivr Logo" style="width: 100px; height: 100px; border-radius: 50%; margin: 0 auto; display: block;">
Expand Down Expand Up @@ -121,34 +116,20 @@ async def send_output_by_email(
"subject": "Quivr Ingestion Processed",
"reply_to": "[email protected]",
"html": body,
"attachments": [
{
"filename": filename,
"content": list(f.read()),
"type": "application/pdf",
}
],
"attachments": [{"filename": filename, "content": list(f.read())}],
}
logger.info(f"Sending email to {mail_to} with file {filename}")
send_email(params)

def generate_pdf(self, filename: str, title: str, content: str):
    """Build a PDF document from a title and body text.

    The title and content are packed into a ``PDFModel``, handed to a
    ``PDFGenerator``, printed, and finally passed to
    ``output(filename, "F")``.
    """
    generator = PDFGenerator(PDFModel(title=title, content=content))
    generator.print_pdf()
    generator.output(filename, "F")
async def uploadfile_to_file(self, uploadFile: UploadFile):
    """Write the content of ``uploadFile`` to a new named temporary file.

    The file is opened with ``delete=False`` and handed back still open,
    so it is neither closed nor removed here.
    """
    temp_copy = NamedTemporaryFile(delete=False)
    raw_bytes = uploadFile.file.read()
    temp_copy.write(raw_bytes)
    # Flush so everything written so far actually reaches the disk.
    temp_copy.flush()
    return temp_copy

async def create_and_upload_processed_file(
self,
processed_content: str,
original_filename: str,
file_description: str,
content: str,
task_name: str,
custom_message: str,
brain_id: str = None,
email_activated: bool = False,
current_user: UserIdentity = None,
self, processed_content: str, original_filename: str, file_description: str
) -> dict:
"""Handles creation and uploading of the processed file."""
# remove any special characters from the filename that aren't http safe
Expand Down Expand Up @@ -183,25 +164,29 @@ async def create_and_upload_processed_file(
headers={"content-type": "application/pdf"},
)

logger.info(f"current_user: {current_user}")
if email_activated:
if self.input.outputs.email.activated:
await self.send_output_by_email(
new_filename,
file_to_upload,
new_filename,
"Summary",
f"{file_description} of {original_filename}",
brain_id=brain_id,
user_email=current_user["email"],
brain_id=(
self.input.outputs.brain.value
if (
self.input.outputs.brain.activated
and self.input.outputs.brain.value
)
else None
),
)

# Reset to start of file before upload
file_to_upload.file.seek(0)
UserIdentity(**current_user)
if brain_id:
if self.input.outputs.brain.activated:
await upload_file(
uploadFile=file_to_upload,
brain_id=brain_id,
current_user=current_user,
brain_id=self.input.outputs.brain.value,
current_user=self.current_user,
chat_id=None,
)

Expand Down
188 changes: 82 additions & 106 deletions backend/modules/assistant/ito/summary.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import tempfile
from typing import List

from celery_config import celery
from fastapi import UploadFile
from langchain.chains import (
MapReduceDocumentsChain,
Expand All @@ -24,12 +23,9 @@
Outputs,
)
from modules.assistant.ito.ito import ITO
from modules.notification.dto.inputs import CreateNotification
from modules.notification.service.notification_service import NotificationService
from modules.user.entity.user_identity import UserIdentity

logger = get_logger(__name__)
notification_service = NotificationService()


class SummaryAssistant(ITO):
Expand Down Expand Up @@ -73,117 +69,97 @@ def check_input(self):
return True

async def process_assistant(self):

try:
notification_service.add_notification(
CreateNotification(
user_id=self.current_user.id,
status="info",
title=f"Creating Summary for {self.files[0].filename}",
)
)
# Create a temporary file with the uploaded file as a temporary file and then pass it to the loader
tmp_file = tempfile.NamedTemporaryFile(delete=False)
self.increase_usage_user()
except Exception as e:
logger.error(f"Error increasing usage: {e}")
return {"error": str(e)}

# Write the file to the temporary file
tmp_file.write(self.files[0].file.read())
# Create a temporary file with the uploaded file as a temporary file and then pass it to the loader
tmp_file = tempfile.NamedTemporaryFile(delete=False)

# Now pass the path of the temporary file to the loader
# Write the file to the temporary file
tmp_file.write(self.files[0].file.read())

loader = UnstructuredPDFLoader(tmp_file.name)
# Now pass the path of the temporary file to the loader

tmp_file.close()
loader = UnstructuredPDFLoader(tmp_file.name)

data = loader.load()
tmp_file.close()

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
chunk_size=1000, chunk_overlap=100
)
split_docs = text_splitter.split_documents(data)
logger.info(f"Split {len(split_docs)} documents")
# Jsonify the split docs
split_docs = [doc.to_json() for doc in split_docs]
## Turn this into a task
brain_id = (
self.input.outputs.brain.id
if self.input.outputs.brain.activated
else None
)
email_activated = self.input.outputs.email.activated
celery.send_task(
name="task_summary",
args=(
split_docs,
self.files[0].filename,
brain_id,
email_activated,
self.current_user.model_dump(mode="json"),
),
)
except Exception as e:
logger.error(f"Error processing summary: {e}")


def map_reduce_chain():
    """Assemble and return the map-reduce summarisation chain.

    Both the map step and the reduce step run on the same ``ChatLiteLLM``
    instance (``gpt-3.5-turbo``, ``max_tokens=2000``). The map prompt asks
    for themes, key points, people, reasoning and chapters of each section;
    the reduce prompt consolidates those per-section results into a final
    markdown summary. Documents are grouped up to ``token_max=4000`` when
    reducing, and intermediate map results are not returned.

    Returns:
        The configured ``MapReduceDocumentsChain``.
    """
    chat_model = ChatLiteLLM(model="gpt-3.5-turbo", max_tokens=2000)

    # Map step: analyse every section of the document on its own.
    section_prompt = PromptTemplate.from_template(
        """The following is a document that has been divided into multiple sections:
{docs}
Please carefully analyze each section and identify the following:
1. Main Themes: What are the overarching ideas or topics in this section?
2. Key Points: What are the most important facts, arguments, or ideas presented in this section?
3. Important Information: Are there any crucial details that stand out? This could include data, quotes, specific events, entity, or other relevant information.
4. People: Who are the key individuals mentioned in this section? What roles do they play?
5. Reasoning: What logic or arguments are used to support the key points?
6. Chapters: If the document is divided into chapters, what is the main focus of each chapter?
Remember to consider the language and context of the document. This will help in understanding the nuances and subtleties of the text."""
    )
    section_chain = LLMChain(llm=chat_model, prompt=section_prompt)

    # Reduce step: merge the per-section analyses into one summary.
    summary_prompt = PromptTemplate.from_template(
        """The following is a set of summaries for parts of the document :
{docs}
Take these and distill it into a final, consolidated summary of the document. Make sure to include the main themes, key points, and important information such as data, quotes,people and specific events.
Use markdown such as bold, italics, underlined. For example, **bold**, *italics*, and _underlined_ to highlight key points.
Please provide the final summary with sections using bold headers.
Sections should always be Summary and Key Points, but feel free to add more sections as needed.
Always use bold text for the sections headers.
Keep the same language as the documents.
Answer:"""
    )
    summary_chain = LLMChain(llm=chat_model, prompt=summary_prompt)

    # Joins a list of documents into one string and feeds it to the reduce LLMChain.
    stuff_chain = StuffDocumentsChain(
        llm_chain=summary_chain, document_variable_name="docs"
    )

    # The same stuffing chain serves as the final combine step and as the
    # collapse step used when the documents exceed the token budget.
    reducer = ReduceDocumentsChain(
        combine_documents_chain=stuff_chain,
        collapse_documents_chain=stuff_chain,
        token_max=4000,
    )

    # Map each document through the section chain, then reduce the results;
    # only the final output is returned, not the intermediate map steps.
    return MapReduceDocumentsChain(
        llm_chain=section_chain,
        reduce_documents_chain=reducer,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )
map_template = """The following is a document that has been divided into multiple sections:
{docs}
Please carefully analyze each section and identify the following:
1. Main Themes: What are the overarching ideas or topics in this section?
2. Key Points: What are the most important facts, arguments, or ideas presented in this section?
3. Important Information: Are there any crucial details that stand out? This could include data, quotes, specific events, entity, or other relevant information.
4. People: Who are the key individuals mentioned in this section? What roles do they play?
5. Reasoning: What logic or arguments are used to support the key points?
6. Chapters: If the document is divided into chapters, what is the main focus of each chapter?
Remember to consider the language and context of the document. This will help in understanding the nuances and subtleties of the text."""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce
reduce_template = """The following is a set of summaries for parts of the document:
{docs}
Take these and distill it into a final, consolidated summary of the document. Make sure to include the main themes, key points, and important information such as data, quotes,people and specific events.
Use markdown such as bold, italics, underlined. For example, **bold**, *italics*, and _underlined_ to highlight key points.
Please provide the final summary with sections using bold headers.
Sections should always be Summary and Key Points, but feel free to add more sections as needed.
Always use bold text for the sections headers.
Keep the same language as the documents.
Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
# This is final chain that is called.
combine_documents_chain=combine_documents_chain,
# If documents exceed context for `StuffDocumentsChain`
collapse_documents_chain=combine_documents_chain,
# The maximum number of tokens to group documents into.
token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
# Map chain
llm_chain=map_chain,
# Reduce chain
reduce_documents_chain=reduce_documents_chain,
# The variable name in the llm_chain to put the documents in
document_variable_name="docs",
# Return the results of the map steps in the output
return_intermediate_steps=False,
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
chunk_size=1000, chunk_overlap=100
)
split_docs = text_splitter.split_documents(data)

content = map_reduce_chain.run(split_docs)

return await self.create_and_upload_processed_file(
content, self.files[0].filename, "Summary"
)


def summary_inputs():
Expand Down

0 comments on commit cd73412

Please sign in to comment.