Skip to content
This repository has been archived by the owner on Jun 7, 2024. It is now read-only.

Commit

Permalink
Merge pull request #28 from athrael-soju/18-update-pinecone-to-v101-1
Browse files Browse the repository at this point in the history
18-update-pinecone-to-v101-1
  • Loading branch information
athrael-soju committed Oct 9, 2023
2 parents 3bad44b + 9c69436 commit 2a292c2
Show file tree
Hide file tree
Showing 10 changed files with 698 additions and 664 deletions.
1 change: 1 addition & 0 deletions next.config.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/** @type {import('next').NextConfig} */
module.exports = {
webpack: (config, { isServer }) => {
if (!isServer) {
Expand Down
1,118 changes: 546 additions & 572 deletions package-lock.json

Large diffs are not rendered by default.

39 changes: 20 additions & 19 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,56 +9,57 @@
"lint": "next lint"
},
"dependencies": {
"@fortawesome/fontawesome-free": "^6.4.0",
"@fortawesome/fontawesome-free": "^6.4.2",
"@fortawesome/fontawesome-svg-core": "^6.4.2",
"@fortawesome/free-solid-svg-icons": "^6.4.2",
"@fortawesome/react-fontawesome": "^0.2.0",
"@pinecone-database/doc-splitter": "^0.0.1",
"@pinecone-database/pinecone": "^0.1.6",
"@pinecone-database/pinecone": "^1.1.0",
"@speechly/speech-recognition-polyfill": "^1.3.0",
"@vercel/analytics": "^1.0.2",
"ai": "^2.1.16",
"ai": "^2.2.14",
"cheerio": "^1.0.0-rc.12",
"eslint": "8.44.0",
"eslint-config-next": "^13.5.3",
"eslint": "8.50.0",
"eslint-config-next": "^13.5.4",
"filepond-plugin-image-exif-orientation": "^1.0.11",
"filepond-plugin-image-preview": "^4.6.11",
"font-awesome": "^4.7.0",
"fs": "^0.0.1-security",
"fs.promises": "^0.1.2",
"langchain": "^0.0.150",
"langchain": "^0.0.158",
"mammoth": "^1.6.0",
"md5": "^2.3.0",
"next": "^13.5.3",
"next": "^13.5.4",
"next-connect": "^1.0.0",
"node-html-markdown": "^1.3.0",
"openai-edge": "^1.2.0",
"openai-edge": "^1.2.2",
"path": "^0.12.7",
"pdf-parse": "^1.1.1",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-filepond": "^7.1.2",
"react-icons": "^4.10.1",
"react-markdown": "^8.0.7",
"react-icons": "^4.11.0",
"react-markdown": "^9.0.0",
"react-speech-recognition": "^3.10.0",
"regenerator-runtime": "^0.14.0",
"sswr": "^2.0.0",
"stream": "^0.0.2",
"svelte": "^4.0.5",
"tailwindcss": "3.3.2",
"typescript": "5.1.6",
"unified": "^10.1.2",
"svelte": "^4.2.1",
"tailwindcss": "3.3.3",
"typescript": "5.2.2",
"unified": "^11.0.3",
"vue": "^3.3.4",
"wink-eng-lite-web-model": "^1.5.2",
"wink-nlp": "^1.14.3"
},
"devDependencies": {
"@types/md5": "^2.3.2",
"@types/node": "20.4.0",
"@types/react": "18.2.14",
"@types/react-dom": "18.2.6",
"@types/md5": "^2.3.3",
"@types/node": "20.8.2",
"@types/react": "18.2.24",
"@types/react-dom": "18.2.8",
"@types/react-speech-recognition": "^3.9.2",
"@types/regenerator-runtime": "^0.13.1",
"@types/regenerator-runtime": "^0.13.2",
"cross-env": "^7.0.3",
"encoding": "^0.1.13"
}
}
31 changes: 19 additions & 12 deletions src/app/api/clearIndex/route.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import { NextRequest, NextResponse } from "next/server";
import { getPineconeClient } from "@/utils/pinecone";

export async function POST(req: Request) {
const pinecone = await getPineconeClient()
const index = pinecone.Index(process.env.PINECONE_INDEX!)
await index.delete1({
deleteAll: true
});
import { NextResponse } from 'next/server';
import { Pinecone } from '@pinecone-database/pinecone';

export async function POST() {
// Instantiate a new Pinecone client
const pinecone = new Pinecone();
// Select the desired index
const index = pinecone.Index(process.env.PINECONE_INDEX!);

// Use the custom namespace, if provided, otherwise use the default
const namespaceName = process.env.PINECONE_NAMESPACE ?? '';
const namespace = index.namespace(namespaceName);

// Delete everything within the namespace
await namespace.deleteAll();

return NextResponse.json({
success: true
})
}
success: true,
});
}
4 changes: 2 additions & 2 deletions src/app/api/context/route.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import { NextResponse } from "next/server";
import { getContext } from "@/utils/context";
import { ScoredVector } from "@pinecone-database/pinecone";
import { ScoredPineconeRecord } from "@pinecone-database/pinecone";

export async function POST(req: Request) {
try {
const { messages } = await req.json()
const lastMessage = messages.length > 1 ? messages[messages.length - 1] : messages[0]
const context = await getContext(lastMessage.content, '', 10000, 0.7, false) as ScoredVector[]
const context = await getContext(lastMessage.content, '', 10000, 0.7, false) as ScoredPineconeRecord[]
return NextResponse.json({ context })
} catch (e) {
console.log(e)
Expand Down
27 changes: 17 additions & 10 deletions src/app/api/crawl/seed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@ import {
MarkdownTextSplitter,
RecursiveCharacterTextSplitter,
} from '@pinecone-database/doc-splitter';
import { utils as PineconeUtils, Vector } from '@pinecone-database/pinecone';
import { Pinecone, PineconeRecord } from '@pinecone-database/pinecone';
import { chunkedUpsert } from '../../utils/chunkedUpsert';
import md5 from 'md5';
import { getPineconeClient } from '@/utils/pinecone';
import { Crawler, Page } from './crawler';
import { truncateStringByBytes } from '@/utils/truncateString';

const { chunkedUpsert, createIndexIfNotExists } = PineconeUtils;

interface SeedOptions {
splittingMethod: string;
chunkSize: number;
Expand All @@ -28,7 +26,7 @@ async function seed(
) {
try {
// Initialize the Pinecone client
const pinecone = await getPineconeClient();
const pinecone = new Pinecone();

// Destructure the options object
const { splittingMethod, chunkSize, chunkOverlap } = options;
Expand All @@ -51,8 +49,17 @@ async function seed(
);

// Create Pinecone index if it does not exist
await createIndexIfNotExists(pinecone, indexName, 1536);
const index = pinecone?.Index(indexName);
const indexList = await pinecone.listIndexes();
const indexExists = indexList.some((index) => index.name === indexName);
if (!indexExists) {
await pinecone.createIndex({
name: indexName,
dimension: 1536,
waitUntilReady: true,
});
}

const index = pinecone.Index(indexName);

// Get the vector embeddings for the documents
const vectors = await Promise.all(documents.flat().map(embedDocument));
Expand All @@ -68,7 +75,7 @@ async function seed(
}
}

async function embedDocument(doc: Document): Promise<Vector> {
async function embedDocument(doc: Document): Promise<PineconeRecord> {
try {
// Generate OpenAI embeddings for the document content
const embedding = await getEmbeddings(doc.pageContent);
Expand All @@ -87,7 +94,7 @@ async function embedDocument(doc: Document): Promise<Vector> {
url: doc.metadata.url as string, // The URL where the document was found
hash: doc.metadata.hash as string, // The hash of the document content
},
} as Vector;
} as PineconeRecord;
} catch (error) {
console.log('Error embedding document: ', error);
throw error;
Expand All @@ -109,7 +116,7 @@ async function prepareDocument(
url: page.url,
// Truncate the text to a maximum byte length
text: truncateStringByBytes(pageContent, 36000),
},
},
}),
]);

Expand Down
26 changes: 16 additions & 10 deletions src/app/api/ingest/seed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { DirectoryLoader } from 'langchain/document_loaders/fs/directory';
import { DocxLoader } from 'langchain/document_loaders/fs/docx';
import { TextLoader } from 'langchain/document_loaders/fs/text';
import { getPineconeClient } from '@/utils/pinecone';
import { utils as PineconeUtils, Vector } from '@pinecone-database/pinecone';
import { Pinecone, PineconeRecord } from '@pinecone-database/pinecone';
import { chunkedUpsert } from '../../utils/chunkedUpsert';
import { truncateStringByBytes } from '@/utils/truncateString';
import md5 from 'md5';

Expand All @@ -22,20 +22,18 @@ interface SeedOptions {
}
type DocumentSplitter = RecursiveCharacterTextSplitter | MarkdownTextSplitter;

const { chunkedUpsert, createIndexIfNotExists } = PineconeUtils;

async function seed(
filename: string,
path: string,
topK: string,
index: string,
indexName: string,
options: SeedOptions
) {
try {
//TODO: Add topK support

// Initialize the Pinecone client
const pinecone = await getPineconeClient();
const pinecone = new Pinecone();

// Destructure the options object
const { splittingMethod, chunkSize, chunkOverlap } = options;
Expand All @@ -58,14 +56,22 @@ async function seed(
docs.map((doc) => prepareDocument(doc, filename, splitter))
).then((docs) => docs.flat());

await createIndexIfNotExists(pinecone, index, 1536);
const indexList = await pinecone.listIndexes();
const indexExists = indexList.some((index) => index.name === indexName);
if (!indexExists) {
await pinecone.createIndex({
name: indexName,
dimension: 1536,
waitUntilReady: true,
});
}

// Get the vector embeddings for the documents
// Warning: For larger files, the chunk size should be increased accordingly.
const vectors = await Promise.all(documents.map(embedDocument));

// Upsert vectors into the Pinecone index
await chunkedUpsert(pinecone?.Index(index)!, vectors, '', 10);
await chunkedUpsert(pinecone?.Index(indexName)!, vectors, '', 10);
const filesToDelete = readdirSync(path);

filesToDelete.forEach((file) => {
Expand All @@ -80,7 +86,7 @@ async function seed(
}
}

async function embedDocument(doc: Document): Promise<Vector> {
async function embedDocument(doc: Document): Promise<PineconeRecord> {
try {
// Generate OpenAI embeddings for the document content
const embedding = await getEmbeddings(doc.pageContent);
Expand All @@ -99,7 +105,7 @@ async function embedDocument(doc: Document): Promise<Vector> {
filename: doc.metadata.filename as string, // The name of the file the document came from
hash: doc.metadata.hash as string, // The hash of the document content
},
} as Vector;
} as PineconeRecord;
} catch (error) {
console.log('Error embedding document: ', error);
throw error;
Expand Down
7 changes: 4 additions & 3 deletions src/app/layout.tsx
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import { Analytics } from '@vercel/analytics/react';

export const metadata = {
title: 'Title',
description: 'Description',
title: 'Iridium.AI',
description:
'Iridium-AI is an Open Source application, heavily inspired by pinecone-vercel-starter. Feel free to clone/Fork, or even use as a template',
};

import '../global.css';

export default function RootLayout({
children,
}: {
children: React.ReactNode;
readonly children: React.ReactNode;
}) {
return (
<html lang="en">
Expand Down
34 changes: 34 additions & 0 deletions src/app/utils/chunkedUpsert.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import type { Index, PineconeRecord } from '@pinecone-database/pinecone';

/**
 * Splits an array into consecutive sub-arrays of at most `chunkSize` items.
 *
 * The input is not mutated; every chunk is a fresh slice. The final chunk
 * holds whatever remains and may be shorter than `chunkSize`.
 *
 * @param arr - Items to partition.
 * @param chunkSize - Maximum number of items per chunk.
 * @returns The chunks, in the same order as the input.
 */
const sliceIntoChunks = <T>(arr: T[], chunkSize: number) => {
  // Number of chunks needed to hold every element.
  const chunkCount = Math.ceil(arr.length / chunkSize);

  const takeChunk = (_: unknown, chunkIndex: number) => {
    const start = chunkIndex * chunkSize;
    const end = (chunkIndex + 1) * chunkSize;
    return arr.slice(start, end);
  };

  return Array.from({ length: chunkCount }, takeChunk);
};

/**
 * Upserts records into a namespace of a Pinecone index in batches.
 *
 * The records are split into chunks of at most `chunkSize` items and each
 * chunk is sent as its own upsert request; the requests run concurrently.
 * This is best-effort: a chunk that fails is logged and skipped, it does not
 * abort the remaining chunks and does not cause a rejection.
 *
 * @param index - Pinecone index handle to write to.
 * @param vectors - Records to upsert.
 * @param namespace - Namespace within the index that receives the records.
 * @param chunkSize - Maximum number of records per upsert request.
 * @returns `true` once every chunk has been attempted.
 */
export const chunkedUpsert = async (
  index: Index,
  vectors: Array<PineconeRecord>,
  namespace: string,
  chunkSize = 10
): Promise<boolean> => {
  // Split the vectors into chunks
  const chunks = sliceIntoChunks<PineconeRecord>(vectors, chunkSize);

  // Each callback handles its own failure, so allSettled never rejects and
  // no outer try/catch is needed.
  await Promise.allSettled(
    chunks.map(async (chunk) => {
      try {
        // Send only this chunk. Passing the full `vectors` array here would
        // re-upload every record once per chunk and defeat the batching.
        await index.namespace(namespace).upsert(chunk);
      } catch (e) {
        console.log('Error upserting chunk', e);
      }
    })
  );

  return true;
};
Loading

0 comments on commit 2a292c2

Please sign in to comment.