-
Notifications
You must be signed in to change notification settings - Fork 219
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added a Use Case with Azure Open AI+Pinecone+Chainlit+LangChain for p…
…df processing and QA (#81) Co-authored-by: milanju <[email protected]>
- Loading branch information
Showing
12 changed files
with
375 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
FROM python:3.11

# Create a non-root user
RUN useradd -m -u 1000 user

# Set user environment
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set working directory
WORKDIR $HOME/app

# Copy the dependency manifest first so the pip layer is cached and only
# rebuilt when requirements.txt itself changes (not on every source edit).
COPY --chown=user requirements.txt .

# Install dependencies; --no-cache-dir keeps the image smaller.
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source
COPY --chown=user . .

# Command to run the application
CMD ["chainlit", "run", "app.py", "--port", "8000"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
from pathlib import Path | ||
from typing import List | ||
from langchain_openai import AzureOpenAIEmbeddings,AzureChatOpenAI | ||
from dotenv import load_dotenv | ||
from langchain.schema import Document | ||
from langchain_pinecone import Pinecone | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from langchain_community.document_loaders import ( | ||
PyMuPDFLoader, | ||
) | ||
import os | ||
import uuid | ||
from pinecone import Pinecone, ServerlessSpec | ||
import chainlit as cl | ||
|
||
chunk_size = 1024  # NOTE(review): unused — process_pdfs hardcodes its own 1000/100 split; confirm intent
chunk_overlap = 50  # NOTE(review): unused, see chunk_size above
PDF_STORAGE_PATH = "./pdfs"  # directory scanned for *.pdf files to index

# Load environment variables from a local .env file
load_dotenv()

# Azure OpenAI configuration (all values come from the environment)
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_ADA_DEPLOYMENT_VERSION = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT_VERSION")
AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION= os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION")
AZURE_OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
AZURE_OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("AZURE_OPENAI_ADA_EMBEDDING_MODEL_NAME")
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")

# Pinecone configuration
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
index_name = 'primer'

# Initialize the Pinecone client and create the index if it does not exist.
# NOTE(review): `Pinecone` here is the client class from the `pinecone`
# package — the second import at the top of the file shadows
# langchain_pinecone's class of the same name.
pc = Pinecone(api_key=PINECONE_API_KEY)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # embedding size of text-embedding-ada-002 (see README)
        metric="cosine",
        spec=ServerlessSpec(cloud="aws",region="us-west-2")
    )

# Initialize Azure OpenAI embeddings, used both for indexing PDF chunks
# and for embedding user queries at retrieval time.
embeddings = AzureOpenAIEmbeddings(
    deployment=AZURE_OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    model=AZURE_OPENAI_ADA_EMBEDDING_MODEL_NAME,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_key=AZURE_OPENAI_API_KEY,
    openai_api_version=AZURE_OPENAI_ADA_DEPLOYMENT_VERSION
)
def process_pdfs(pdf_storage_path: str):
    """Chunk every PDF under *pdf_storage_path*, embed the chunks with
    Azure OpenAI, and upsert the vectors into the Pinecone index.

    Returns the Pinecone index handle used for the upserts.
    """
    pdf_directory = Path(pdf_storage_path)
    docs = []
    # NOTE(review): hardcoded 1000/100 ignores the module-level
    # chunk_size/chunk_overlap constants — kept as-is to preserve behavior.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

    # Load PDFs and split into overlapping text chunks
    for pdf_path in pdf_directory.glob("*.pdf"):
        loader = PyMuPDFLoader(str(pdf_path))
        documents = loader.load()
        docs += text_splitter.split_documents(documents)

    # Open the index handle once, outside the loop. The original code only
    # bound doc_search inside the loop, so an empty pdfs/ directory raised
    # NameError at the return statement below.
    doc_search = pc.Index(index_name)

    # Embed each chunk and upsert all vectors in a single batched call
    # (one network round-trip instead of one per chunk).
    vectors = []
    for doc in docs:
        embedding = embeddings.embed_query(doc.page_content)
        vectors.append(
            {
                "id": str(uuid.uuid4()),
                "values": embedding,
                "metadata": {"source": doc.page_content},
            }
        )
    if vectors:
        doc_search.upsert(vectors=vectors)
        print("Vector stored in Pinecone index successfully.")
    return doc_search
|
||
# Index all PDFs at import time, before the chat handlers are registered.
doc_search = process_pdfs(PDF_STORAGE_PATH)

welcome_message = "Welcome to the Chainlit Pinecone demo! Ask anything about documents you vectorized and stored in your Pinecone DB."
namespace = None  # use the default Pinecone namespace

# NOTE(review): mid-file imports. `Pinecone` below is LangChain's vector
# store wrapper and shadows the pinecone client class imported at the top
# of the file; only import order makes this work. The
# langchain.vectorstores.pinecone path may be deprecated — verify against
# the installed langchain version.
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores.pinecone import Pinecone
import pinecone
|
||
@cl.on_chat_start
async def start():
    """Greet the user and wire a conversational retrieval chain into the session."""
    await cl.Message(content=welcome_message).send()

    # Vector store backed by the Pinecone index populated at startup.
    docsearch = Pinecone.from_existing_index(
        index_name=index_name, embedding=embeddings, namespace=namespace
    )

    # Conversation memory shaped the way ConversationalRetrievalChain expects.
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=ChatMessageHistory(),
        return_messages=True,
    )

    # Streaming Azure OpenAI chat model used to generate the answers.
    llm = AzureChatOpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_version=AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION,
        openai_api_type="azure",
        azure_deployment=AZURE_OPENAI_CHAT_DEPLOYMENT_NAME,
        streaming=True,
    )

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )
    cl.user_session.set("chain", chain)
|
||
@cl.on_message
async def main(message: cl.Message):
    """Answer a user question with the session's retrieval chain and cite sources."""
    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
    callback = cl.AsyncLangchainCallbackHandler()

    response = await chain.acall(message.content, callbacks=[callback])
    answer = response["answer"]
    source_documents = response["source_documents"]  # type: List[Document]

    text_elements = []  # type: List[cl.Text]

    if source_documents:
        # One cl.Text element per retrieved chunk, referenced by name below.
        text_elements = [
            cl.Text(content=doc.page_content, name=f"source_{idx}")
            for idx, doc in enumerate(source_documents)
        ]
        source_names = [element.name for element in text_elements]

        if source_names:
            answer += f"\nSources: {', '.join(source_names)}"
        else:
            answer += "\nNo sources found"

    await cl.Message(content=answer, elements=text_elements).send()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash

# Load shared image/registry settings.
source ./variables.sh

# Build the docker image. Expansions are quoted so names/tags containing
# unusual characters do not word-split. Uncomment the line that matches
# your host architecture (arm64 vs amd64).
docker build --platform=linux/arm64 -t "$docImageName:$tag" -f Dockerfile --build-arg FILENAME="$docAppFile" --build-arg PORT="$port" .
#docker build --platform=linux/amd64 -t "$docImageName:$tag" -f Dockerfile --build-arg FILENAME="$docAppFile" --build-arg PORT="$port" .
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Welcome to Chainlit! 🚀🤖 | ||
|
||
Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs. | ||
|
||
## Useful Links 🔗 | ||
|
||
- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚 | ||
- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬 | ||
|
||
We can't wait to see what you create with Chainlit! Happy coding! 💻😊 | ||
|
||
## Welcome screen | ||
|
||
To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty. |
Binary file added
BIN
+359 KB
azure-openai-pinecone-pdf-qa/src/images/Screenshot 2024-02-18 at 16.58.21.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+324 KB
azure-openai-pinecone-pdf-qa/src/images/Screenshot 2024-02-18 at 17.13.36.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/bin/sh

# Load variables
. ./variables.sh

# Registry and repository names must be lowercase; compute each once
# instead of repeating the tr pipeline at every use site.
to_lower() {
    echo "$1" | tr '[:upper:]' '[:lower:]'
}
acrNameLower=$(to_lower "$acrName")
imageLower=$(to_lower "$docImageName")

# Login to ACR
echo "Logging in to [$acrName] container registry..."
az acr login --name "$acrNameLower"

# Retrieve ACR login server
echo "Retrieving login server for the [$acrName] container registry..."
loginServer=$(az acr show --name "$acrNameLower" --query loginServer --output tsv)

# Push the local docker images to the Azure Container Registry
echo "Pushing the local docker images to the [$acrName] container registry..."
docker tag "$imageLower:$tag" "$loginServer/$imageLower:$tag"
docker push "$loginServer/$imageLower:$tag"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Azure OpenAI, Pinecone, Chainlit, LangChain, PDF Processing, Question Answering | ||
|
||
|
||
This repository contains a Chainlit application that provides a question-answering service using documents stored in a Pinecone vector store. It allows users to upload PDF documents, which are then chunked, embedded using Azure Open AI service, and indexed for efficient retrieval. When a user asks a question, the application retrieves relevant document chunks and uses Azure OpenAI's language model to generate an answer. | ||
|
||
## High Level Description | ||
|
||
The app.py script performs the following functions: | ||
|
||
- PDF Processing (process_pdfs): Chunks PDF files into smaller text segments. | ||
- Creates embeddings for each chunk using Azure Open AI service, and stores them in Pinecone. | ||
- Question Answering (on_message): When a user asks a question, the application retrieves relevant document chunks and generates an answer using Azure OpenAI's language model, providing the sources for transparency. | ||
|
||
The following files are also included in the repository: | ||
- requirements.txt: Lists the required Python packages. | ||
- Dockerfile: Used to build a Docker image for the application. | ||
- .env: Contains the environment variables. | ||
- build-docker-image.sh: A script to build the Docker image. | ||
- run-docker-image.sh: A script to run the Docker image locally. | ||
- push-docker-image.sh: A script to push the Docker image to an Azure Container Registry | ||
- variables.sh: contains the variables for the Azure Container Registry, and the Docker image. | ||
|
||
## Quickstart | ||
|
||
### Prerequisites: | ||
- An active [Azure Subscription](https://learn.microsoft.com/en-us/azure/guides/developer/azure-developer-guide#understanding-accounts-subscriptions-and-billing). If you don't have one, create a [free Azure account](https://azure.microsoft.com/en-gb/free/) before you begin. | ||
- [VS Code](https://code.visualstudio.com/) as a code editor. | ||
- [Docker](https://www.docker.com/) installed on your local machine. | ||
- [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) installed on your local machine. | ||
- [Pinecone account](https://www.pinecone.io/) and API key. | ||
- [Azure OpenAI account](https://azure.microsoft.com/en-us/services/cognitive-services/openai/). You will need to create a resource and obtain your OpenAI Endpoint, API Key, deploy text-embedding-ada-002 and gpt-35-turbo-16k model. | ||
- Pdf document to be uploaded in the folder `pdfs`. This document will be indexed and used for question answering. | ||
- Python 3.11 or higher installed on your local machine. | ||
- (Optional) [Azure Container Registry](https://docs.microsoft.com/en-us/azure/container-registry/) to store the Docker image. This step is optional, if you want to deploy the application to Azure Container Apps for example. | ||
|
||
### Setup the environment variables | ||
|
||
1. Create an .env file and update the following environment variables: | ||
|
||
``` | ||
AZURE_OPENAI_API_KEY=<your-azure-openai-api-key>
# replace with your Azure OpenAI API Key (never commit a real key)
AZURE_OPENAI_ENDPOINT=https://pineconellmdemoopenai.openai.azure.com/ | ||
# replace with your Azure OpenAI Endpoint | ||
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=gpt-35-turbo-16k | ||
#Create a deployment for the gpt-35-turbo-16k model and place the deployment name here. You can name the deployment as per your choice and put the name here. #In my case, I have named it as `gpt-35-turbo-16k`. | ||
AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION=2023-07-01-preview | ||
#You don't need to change this unless you are willing to try other versions. | ||
PINECONE_API_KEY=<your-pinecone-api-key>
#Change this to your Pinecone API Key (never commit a real key)
AZURE_OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME=text-embedding-ada-002 | ||
#Create a new deployment in the Azure Open AI Studio using the text-embedding-ada-002 #model and place the deployment name here. You can name the deployment #as per your #choice and put the name here. In my case, I have named it as `text-embedding-ada-002`. | ||
AZURE_OPENAI_ADA_EMBEDDING_MODEL_NAME=text-embedding-ada-002 | ||
#This is the model name of the text-embedding-ada-002 deployment model from above. You don't need to change it as it will be the same in your case. | ||
AZURE_OPENAI_ADA_DEPLOYMENT_VERSION=2024-02-15-preview | ||
#You don't need to change this unless you are willing to try earlier versions. | ||
``` | ||
|
||
Once you have updated the .env file, please save the changes and you are ready to proceed to the next step. | ||
|
||
### Option 1: Run the application locally | ||
|
||
1. Install dependencies: | ||
Open the terminal and navigate to the src folder of the repository. Then run the following command to install the necessary Python packages:
|
||
```pip | ||
pip install -r requirements.txt | ||
``` | ||
|
||
2. Process pdf files: In the folder 'pdfs', place the pdf document that you want to use for answering questions. | ||
|
||
3. Run the application: Run the following command to start the application: | ||
|
||
```chainlit | ||
chainlit run app.py -w | ||
``` | ||
4. Test the application: Open a new terminal and run the following command to test the application: | ||
|
||
```chainlit | ||
http://localhost:8000/ | ||
``` | ||
You can now upload a pdf document and ask questions to test the application. | ||
|
||
![Screen](src/images/Screenshot%202024-02-18%20at%2016.58.21.png)
|
||
### Option 2: Run the application in a Docker container | ||
|
||
1. Navigate to the src folder of the repository | ||
|
||
2. Open the file build-docker-image.sh and depending on the architecture of your local machine (linux/arm64 or linux/amd64), uncomment the respective line and comment the other line. Then save the file. In my case I built the image to run it locally on my M1 Mac, so I have uncommented the line for linux/arm64 and commented the line for linux/amd64. If you plan to build the image for a different architecture, you can uncomment the respective line and comment the other line. | ||
|
||
3. Run the following command to build the Docker image: | ||
|
||
```build-docker-image | ||
./build-docker-image.sh | ||
``` | ||
4. Run the following command to run the Docker image: | ||
|
||
```run-docker-image | ||
./run-docker-image.sh | ||
``` | ||
5. Test the application: Open a new terminal and run the following command to test the application: | ||
|
||
```chainlit | ||
http://localhost:8000/ | ||
``` | ||
![Screen](src/images/Screenshot%202024-02-18%20at%2017.13.36.png)
|
||
6. (optional) Push the Docker image to an Azure Container Registry | ||
|
||
If you want to deploy the application to Azure, you can push the Docker image to an Azure Container Registry. To do this, you need to have an Azure Container Registry and the Docker image name and the Azure Container Registry name in the variables.sh file. Once you have updated the variables.sh file, run the following Azure CLI command to connect to your Azure Subscription: | ||
|
||
```azure | ||
az login | ||
``` | ||
|
||
Then run the following command to push the Docker image to the Azure Container Registry: | ||
|
||
```push-docker-image | ||
./push-docker-image.sh | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
langchain | ||
chainlit | ||
langchain_openai | ||
openai | ||
tiktoken | ||
pymupdf | ||
pinecone-client | ||
langchain_pinecone |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash

# Variables
source ./variables.sh

# Print the text
echo "Running the docker Container "

# Run the docker container.
# The app reads AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT,
# PINECONE_API_KEY and the deployment-name variables via load_dotenv/os.getenv.
# The original passed only a misnamed AZURE_OPENAI_KEY, so none of the
# required settings reached the container; forward the whole .env instead.
docker run -it \
    --rm \
    -p "$port:$port" \
    --env-file .env \
    --name "$docImageName" \
    "$docImageName:$tag"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Variables shared by the build/run/push docker scripts.

# Azure Container Registry
prefix="PineconeLLMDemo"
acrName="${prefix}Registry"
# NOTE(review): "GrougName" looks like a typo for "GroupName"; kept as-is
# because other (unseen) scripts may reference this exact variable name.
acrResourceGrougName="${prefix}-RG"
location="EastUS"

# Python application entry point baked into the image
docAppFile="app.py"

# Docker Image name, tag, and the port the app listens on
docImageName="doc"
tag="v1"
port="8000"