Skip to content

Commit

Permalink
Nick: max num tokens for llm extract (for now) + slice the max
Browse files Browse the repository at this point in the history
  • Loading branch information
nickscamara committed May 21, 2024
1 parent d5d0d48 commit 77a79b5
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 13 deletions.
3 changes: 3 additions & 0 deletions apps/api/src/controllers/scrape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ export async function scrapeController(req: Request, res: Response) {
const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown"
}
if (extractorOptions.mode === "llm-extraction") {
pageOptions.onlyMainContent = true;
}
const origin = req.body.origin ?? "api";
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds

Expand Down
24 changes: 19 additions & 5 deletions apps/api/src/lib/LLM-extraction/models.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,38 @@
import OpenAI from "openai";
import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";

// Result of one LLM-extraction pass over a single scraped page.
export type ScraperCompletionResult = {
// Extracted structured payload; null when extraction produced nothing.
data: any | null;
// URL of the page the data was extracted from.
url: string;
};

// Hard cap on tokens sent to the model for extraction (see prepareOpenAIDoc).
const maxTokens = 32000;
// Rough characters-per-token ratio used when trimming by character count —
// a heuristic, not an exact tokenizer-based bound.
const modifier = 4;
// Fallback system prompt used when the caller supplies none.
const defaultPrompt =
"You are a professional web scraper. Extract the contents of the webpage";

function prepareOpenAIDoc(
document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
// Check if the markdown content exists in the document
if (!document.markdown) {
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
let markdown = document.markdown;

// Check if the markdown content exists in the document
if (!markdown) {
throw new Error(
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to [email protected]"
);
}

return [{ type: "text", text: document.markdown }];
// count number of tokens
const numTokens = numTokensFromString(document.markdown, "gpt-4");

if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters
markdown = markdown.slice(0, (maxTokens * modifier));
}

return [[{ type: "text", text: markdown }], numTokens];
}

export async function generateOpenAICompletions({
Expand All @@ -38,7 +51,7 @@ export async function generateOpenAICompletions({
temperature?: number;
}): Promise<Document> {
const openai = client as OpenAI;
const content = prepareOpenAIDoc(document);
const [content, numTokens] = prepareOpenAIDoc(document);

const completion = await openai.chat.completions.create({
model,
Expand Down Expand Up @@ -72,6 +85,7 @@ export async function generateOpenAICompletions({
return {
...document,
llm_extraction: llmExtraction,
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attempted: ${numTokens} tokens). If results are not good, email us at [email protected] so we can help you.` : undefined,
};
}

1 change: 1 addition & 0 deletions apps/api/src/lib/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ export class Document {
};
childrenLinks?: string[];
provider?: string;
warning?: string;

constructor(data: Partial<Document>) {
if (!data.content) {
Expand Down
10 changes: 2 additions & 8 deletions apps/api/src/scraper/WebScraper/utils/excludeTags.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ export const excludeNonMainTags = [
"#nav",
".breadcrumbs",
"#breadcrumbs",
".form",
"form",
"#search-form",
".search",
"#search",
Expand All @@ -51,10 +49,6 @@ export const excludeNonMainTags = [
"#tag",
".category",
"#category",
".comment",
"#comment",
".reply",
"#reply",
".author",
"#author",
".cookie",
"#cookie"
];

0 comments on commit 77a79b5

Please sign in to comment.