Skip to content

Commit

Permalink
Added a Use Case with Azure Open AI+Pinecone+Chainlit+LangChain for p…
Browse files Browse the repository at this point in the history
…df processing and QA (#81)

Co-authored-by: milanju <[email protected]>
  • Loading branch information
bbmilan and bbmilan committed Feb 21, 2024
1 parent f60dc79 commit 1ba1c1a
Show file tree
Hide file tree
Showing 12 changed files with 375 additions and 0 deletions.
21 changes: 21 additions & 0 deletions azure-openai-pinecone-pdf-qa/src/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
FROM python:3.11

# Create a non-root user so the application does not run as root.
RUN useradd -m -u 1000 user

# Set user environment
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set working directory
WORKDIR $HOME/app

# Copy files (owned by the non-root user so user-level pip installs work)
COPY --chown=user . .

# Install dependencies; --no-cache-dir avoids baking pip's download cache
# into the image layer.
RUN pip install --no-cache-dir -r requirements.txt

# Document the port the CMD below binds chainlit to.
EXPOSE 8000

# Command to run the application
CMD ["chainlit", "run", "app.py", "--port", "8000"]
148 changes: 148 additions & 0 deletions azure-openai-pinecone-pdf-qa/src/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
from pathlib import Path
from typing import List
from langchain_openai import AzureOpenAIEmbeddings,AzureChatOpenAI
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_pinecone import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
PyMuPDFLoader,
)
import os
import uuid
from pinecone import Pinecone, ServerlessSpec
import chainlit as cl

# Text-splitting configuration: characters per chunk and the overlap
# between consecutive chunks.
chunk_size = 1024
chunk_overlap = 50
# Directory scanned for *.pdf files to embed and index at startup.
PDF_STORAGE_PATH = "./pdfs"

# Load environment variables from a local .env file, if present.
load_dotenv()

# OpenAI configuration — all values come from the environment; any of them
# is None when the corresponding variable is unset.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_ADA_DEPLOYMENT_VERSION = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT_VERSION")
AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION= os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION")
AZURE_OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
AZURE_OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("AZURE_OPENAI_ADA_EMBEDDING_MODEL_NAME")
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")

# Pinecone configuration
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
index_name = 'primer'

# Initialize the Pinecone client and create the serverless index on first
# run (module-level side effect: this talks to Pinecone at import time).
pc = Pinecone(api_key=PINECONE_API_KEY)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # 1536 is the output dimension of the text-embedding-ada-002 model
        # configured via the env vars above.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws",region="us-west-2")
    )

# Initialize the Azure OpenAI embedding client used both for indexing
# chunks and for embedding user queries.

embeddings = AzureOpenAIEmbeddings(
    deployment=AZURE_OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    model=AZURE_OPENAI_ADA_EMBEDDING_MODEL_NAME,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_key=AZURE_OPENAI_API_KEY,
    openai_api_version=AZURE_OPENAI_ADA_DEPLOYMENT_VERSION
)
def process_pdfs(pdf_storage_path: str):
    """Load every PDF under *pdf_storage_path*, split it into text chunks,
    embed each chunk with Azure OpenAI, and upsert the vectors into the
    Pinecone index.

    Returns the Pinecone index handle so callers can query it afterwards.
    """
    pdf_directory = Path(pdf_storage_path)
    docs: List[Document] = []
    # Use the module-level chunking configuration; the previous hard-coded
    # 1000/100 values silently ignored chunk_size / chunk_overlap.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    # Load PDFs and split into documents
    for pdf_path in pdf_directory.glob("*.pdf"):
        loader = PyMuPDFLoader(str(pdf_path))
        docs += text_splitter.split_documents(loader.load())

    # Hoisted out of the per-document loop: the handle is the same every
    # iteration, and binding it up front fixes an UnboundLocalError on the
    # return below when the directory contains no PDFs.
    doc_search = pc.Index(index_name)

    # Embed each chunk; buffer the vectors so they can be upserted in
    # batches instead of one network round-trip per chunk.
    vectors = []
    for doc in docs:
        vectors.append(
            {
                "id": str(uuid.uuid4()),
                "values": embeddings.embed_query(doc.page_content),
                "metadata": {"source": doc.page_content},
            }
        )

    # Store the vectors in the Pinecone index in batches of 100 (the size
    # Pinecone's client docs recommend as an upper bound per upsert call).
    for batch_start in range(0, len(vectors), 100):
        doc_search.upsert(vectors=vectors[batch_start : batch_start + 100])
    if vectors:
        print("Vector stored in Pinecone index successfully.")
    return doc_search

# Build the index from the PDFs on disk at import time.
# NOTE(review): this runs on every process start and re-embeds every PDF
# under fresh UUIDs, so restarts insert duplicate vectors — confirm this
# is intended.
doc_search = process_pdfs(PDF_STORAGE_PATH)

welcome_message = "Welcome to the Chainlit Pinecone demo! Ask anything about documents you vectorized and stored in your Pinecone DB."
# Pinecone namespace to query; None selects the default namespace.
namespace = None

# NOTE(review): mid-file imports. `Pinecone` below shadows the
# `pinecone.Pinecone` client imported at the top of the file — from this
# point on the name refers to the LangChain vector-store wrapper (the
# client instance `pc` created earlier is unaffected).
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores.pinecone import Pinecone
import pinecone  # NOTE(review): appears unused below — candidate for removal

@cl.on_chat_start
async def start():
    """Chainlit session initializer: greet the user, then wire up a
    conversational retrieval chain backed by the existing Pinecone index
    and stash it in the user session."""
    # Greet the user as soon as the chat session opens.
    await cl.Message(content=welcome_message).send()

    # Wrap the already-populated Pinecone index as a LangChain vector store.
    vector_store = Pinecone.from_existing_index(
        index_name=index_name, embedding=embeddings, namespace=namespace
    )

    # Conversation memory: keep the full chat history and track the
    # chain's "answer" output key.
    history = ChatMessageHistory()
    conversation_memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=history,
        return_messages=True,
    )

    # Streaming Azure OpenAI chat model used to answer questions.
    llm = AzureChatOpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_version=AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION,
        openai_api_type="azure",
        azure_deployment=AZURE_OPENAI_CHAT_DEPLOYMENT_NAME,
        streaming=True,
    )

    retrieval_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        memory=conversation_memory,
        return_source_documents=True,
    )

    # Stash the chain so the on_message handler can reuse it.
    cl.user_session.set("chain", retrieval_chain)

@cl.on_message
async def main(message: cl.Message):
    """Answer a user question via the session's retrieval chain, appending
    the names of the source chunks that were used to the answer."""
    retrieval_chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain

    callback_handler = cl.AsyncLangchainCallbackHandler()

    response = await retrieval_chain.acall(message.content, callbacks=[callback_handler])
    answer = response["answer"]
    sources = response["source_documents"]  # type: List[Document]

    text_elements = []  # type: List[cl.Text]

    if sources:
        # One cl.Text element per retrieved chunk; the element names are
        # referenced from the answer text so the UI can link them.
        text_elements = [
            cl.Text(content=source_doc.page_content, name=f"source_{idx}")
            for idx, source_doc in enumerate(sources)
        ]
        source_names = [element.name for element in text_elements]

        if source_names:
            answer += f"\nSources: {', '.join(source_names)}"
        else:
            answer += "\nNo sources found"

    await cl.Message(content=answer, elements=text_elements).send()
8 changes: 8 additions & 0 deletions azure-openai-pinecone-pdf-qa/src/build-docker-image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash

# Load shared configuration (image name, tag, port, ...).
source ./variables.sh

# Build the docker image.
# NOTE: the Dockerfile declares no ARG instructions, so the previous
# `--build-arg FILENAME=... --build-arg PORT=...` flags were ignored and
# have been dropped. Uncomment the --platform line that matches your host
# (arm64 for Apple Silicon, amd64 for x86_64).
docker build --platform=linux/arm64 -t "$docImageName:$tag" -f Dockerfile .
#docker build --platform=linux/amd64 -t "$docImageName:$tag" -f Dockerfile .
14 changes: 14 additions & 0 deletions azure-openai-pinecone-pdf-qa/src/chainlit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Welcome to Chainlit! 🚀🤖

Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.

## Useful Links 🔗

- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬

We can't wait to see what you create with Chainlit! Happy coding! 💻😊

## Welcome screen

To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
17 changes: 17 additions & 0 deletions azure-openai-pinecone-pdf-qa/src/push-docker-image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/sh

# Load shared configuration (registry name, image name, tag, ...).
. ./variables.sh

# Registry and repository names must be lowercase; compute the lowered
# forms once instead of repeating the tr pipeline at every use site.
acrNameLower=$(echo "$acrName" | tr '[:upper:]' '[:lower:]')
imageNameLower=$(echo "$docImageName" | tr '[:upper:]' '[:lower:]')

# Login to ACR
echo "Logging in to [$acrName] container registry..."
az acr login --name "$acrNameLower"

# Retrieve ACR login server
echo "Retrieving login server for the [$acrName] container registry..."
loginServer=$(az acr show --name "$acrNameLower" --query loginServer --output tsv)

# Push the local docker images to the Azure Container Registry
echo "Pushing the local docker images to the [$acrName] container registry..."
docker tag "$imageNameLower:$tag" "$loginServer/$imageNameLower:$tag"
docker push "$loginServer/$imageNameLower:$tag"
128 changes: 128 additions & 0 deletions azure-openai-pinecone-pdf-qa/src/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Azure OpenAI, Pinecone, Chainlit, LangChain, PDF Processing, Question Answering


This repository contains a Chainlit application that provides a question-answering service using documents stored in a Pinecone vector store. It allows users to upload PDF documents, which are then chunked, embedded using Azure Open AI service, and indexed for efficient retrieval. When a user asks a question, the application retrieves relevant document chunks and uses Azure OpenAI's language model to generate an answer.

## High Level Description

The app.py script performs the following functions:

- PDF Processing (process_pdfs): Chunks PDF files into smaller text segments.
- Creates embeddings for each chunk using Azure Open AI service, and stores them in Pinecone.
- Question Answering (on_message): When a user asks a question, the application retrieves relevant document chunks and generates an answer using Azure OpenAI's language model, providing the sources for transparency.

The following files are also included in the repository:
- requirements.txt: Lists the required Python packages.
- Dockerfile: Used to build a Docker image for the application.
- .env: Contains the environment variables.
- build-docker-image.sh: A script to build the Docker image.
- run-docker-image.sh: A script to run the Docker image locally.
- push-docker-image.sh: A script to push the Docker image to an Azure Container Registry
- variables.sh: contains the variables for the Azure Container Registry, and the Docker image.

## Quickstart

### Prerequisites:
- An active [Azure Subscription](https://learn.microsoft.com/en-us/azure/guides/developer/azure-developer-guide#understanding-accounts-subscriptions-and-billing). If you don't have one, create a [free Azure account](https://azure.microsoft.com/en-gb/free/) before you begin.
- [VS Code](https://code.visualstudio.com/) as a code editor.
- [Docker](https://www.docker.com/) installed on your local machine.
- [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) installed on your local machine.
- [Pinecone account](https://www.pinecone.io/) and API key.
- [Azure OpenAI account](https://azure.microsoft.com/en-us/services/cognitive-services/openai/). You will need to create a resource and obtain your OpenAI Endpoint, API Key, deploy text-embedding-ada-002 and gpt-35-turbo-16k model.
- A PDF document placed in the `pdfs` folder. This document will be indexed and used for question answering.
- Python 3.11 or higher installed on your local machine.
- (Optional) [Azure Container Registry](https://docs.microsoft.com/en-us/azure/container-registry/) to store the Docker image. This step is optional, if you want to deploy the application to Azure Container Apps for example.

### Setup the environment variables

1. Create an .env file and update the following environment variables:

```
AZURE_OPENAI_API_KEY=<your-azure-openai-api-key>
# replace with your Azure OpenAI API Key
AZURE_OPENAI_ENDPOINT=https://<your-resource-name>.openai.azure.com/
# replace with your Azure OpenAI Endpoint
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=gpt-35-turbo-16k
#Create a deployment for the gpt-35-turbo-16k model and place the deployment name here. You can name the deployment as per your choice and put the name here. #In my case, I have named it as `gpt-35-turbo-16k`.
AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION=2023-07-01-preview
#You don't need to change this unless you are willing to try other versions.
PINECONE_API_KEY=<your-pinecone-api-key>
#Change this to your Pinecone API Key
AZURE_OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME=text-embedding-ada-002
#Create a new deployment in the Azure Open AI Studio using the text-embedding-ada-002 #model and place the deployment name here. You can name the deployment #as per your #choice and put the name here. In my case, I have named it as `text-embedding-ada-002`.
AZURE_OPENAI_ADA_EMBEDDING_MODEL_NAME=text-embedding-ada-002
#This is the model name of the text-embedding-ada-002 deployment model from above. You don't need to change it as it will be the same in your case.
AZURE_OPENAI_ADA_DEPLOYMENT_VERSION=2024-02-15-preview
#You don't need to change this unless you are willing to try earlier versions.
```

Once you have updated the .env file, please save the changes and you are ready to proceed to the next step.

### Option 1: Run the application locally

1. Install dependencies:
Open the terminal and navigate to the src folder of the repository. Then run the following command to install the necessary Python packages:

```pip
pip install -r requirements.txt
```

2. Process pdf files: In the folder 'pdfs', place the pdf document that you want to use for answering questions.

3. Run the application: Run the following command to start the application:

```chainlit
chainlit run app.py -w
```
4. Test the application: Open the following URL in your browser:

```chainlit
http://localhost:8000/
```
You can now upload a pdf document and ask questions to test the application.

![Screen](src/images/Screenshot%202024-02-18%20at%2016.58.21.png)

### Option 2: Run the application in a Docker container

1. Navigate to the src folder of the repository

2. Open the file build-docker-image.sh and depending on the architecture of your local machine (linux/arm64 or linux/amd64), uncomment the respective line and comment the other line. Then save the file. In my case I built the image to run it locally on my M1 Mac, so I have uncommented the line for linux/arm64 and commented the line for linux/amd64. If you plan to build the image for a different architecture, you can uncomment the respective line and comment the other line.

3. Run the following command to build the Docker image:

```build-docker-image
./build-docker-image.sh
```
4. Run the following command to run the Docker image:

```run-docker-image
./run-docker-image.sh
```
5. Test the application: Open the following URL in your browser:

```chainlit
http://localhost:8000/
```
![Screen](src/images/Screenshot%202024-02-18%20at%2017.13.36.png)

6. (optional) Push the Docker image to an Azure Container Registry

If you want to deploy the application to Azure, you can push the Docker image to an Azure Container Registry. To do this, you need to have an Azure Container Registry and the Docker image name and the Azure Container Registry name in the variables.sh file. Once you have updated the variables.sh file, run the following Azure CLI command to connect to your Azure Subscription:

```azure
az login
```

Then run the following command to push the Docker image to the Azure Container Registry:

```push-docker-image
./push-docker-image.sh
```
8 changes: 8 additions & 0 deletions azure-openai-pinecone-pdf-qa/src/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
langchain
chainlit
langchain_openai
openai
tiktoken
pymupdf
pinecone-client
langchain_pinecone
16 changes: 16 additions & 0 deletions azure-openai-pinecone-pdf-qa/src/run-docker-container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

# Load shared configuration (image name, tag, port, ...).
source ./variables.sh

# Print the text
echo "Running the docker Container "

# Run the docker container.
# app.py reads AZURE_OPENAI_API_KEY — the previous AZURE_OPENAI_KEY name
# was never consumed by the application. The remaining settings come from
# the .env file copied into the image at build time; passing the key here
# lets a host-side value override the baked-in one.
docker run -it \
    --rm \
    -p "$port:$port" \
    -e AZURE_OPENAI_API_KEY="$AZURE_OPENAI_API_KEY" \
    --name "$docImageName" \
    "$docImageName:$tag"
15 changes: 15 additions & 0 deletions azure-openai-pinecone-pdf-qa/src/variables.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Shared variables sourced by the build/run/push scripts.

# Azure Container Registry
prefix="PineconeLLMDemo"
acrName="${prefix}Registry"
# NOTE(review): "Groug" looks like a typo for "Group"; kept as-is because
# scripts outside this view may reference this exact variable name.
acrResourceGrougName="${prefix}-RG"
location="EastUS"

# Python File
docAppFile="app.py"

# Docker Image
docImageName="doc"
tag="v1"
port="8000"

0 comments on commit 1ba1c1a

Please sign in to comment.