Skip to content

Commit

Permalink
Nick: max num tokens for llm extract (for now) + slice the max
Browse files Browse the repository at this point in the history
  • Loading branch information
nickscamara committed May 21, 2024
1 parent d5d0d48 commit 77a79b5
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 13 deletions.
3 changes: 3 additions & 0 deletions apps/api/src/controllers/scrape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ export async function scrapeController(req: Request, res: Response) {
const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown"
}
if (extractorOptions.mode === "llm-extraction") {
pageOptions.onlyMainContent = true;
}
const origin = req.body.origin ?? "api";
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds

Expand Down
24 changes: 19 additions & 5 deletions apps/api/src/lib/LLM-extraction/models.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,38 @@
import OpenAI from "openai";
import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";

// Result of one LLM-extraction pass over a single scraped page.
export type ScraperCompletionResult = {
// Extracted structured payload; null when extraction produced nothing.
data: any | null;
// URL of the page the data was extracted from.
url: string;
};

// Hard cap on tokens sent to the model for extraction (see prepareOpenAIDoc).
const maxTokens = 32000;
// Rough characters-per-token ratio used when trimming by character count —
// a heuristic, not an exact tokenizer-based bound.
const modifier = 4;
// Fallback system prompt used when the caller supplies none.
const defaultPrompt =
"You are a professional web scraper. Extract the contents of the webpage";

function prepareOpenAIDoc(
document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
// Check if the markdown content exists in the document
if (!document.markdown) {
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
let markdown = document.markdown;

// Check if the markdown content exists in the document
if (!markdown) {
throw new Error(
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to [email protected]"
);
}

return [{ type: "text", text: document.markdown }];
// count number of tokens
const numTokens = numTokensFromString(document.markdown, "gpt-4");

if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters
markdown = markdown.slice(0, (maxTokens * modifier));
}

return [[{ type: "text", text: markdown }], numTokens];
}

export async function generateOpenAICompletions({
Expand All @@ -38,7 +51,7 @@ export async function generateOpenAICompletions({
temperature?: number;
}): Promise<Document> {
const openai = client as OpenAI;
const content = prepareOpenAIDoc(document);
const [content, numTokens] = prepareOpenAIDoc(document);

const completion = await openai.chat.completions.create({
model,
Expand Down Expand Up @@ -72,6 +85,7 @@ export async function generateOpenAICompletions({
return {
...document,
llm_extraction: llmExtraction,
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attempted: ${numTokens} tokens). If results are not good, email us at [email protected] so we can help you.` : undefined,
};
}

1 change: 1 addition & 0 deletions apps/api/src/lib/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ export class Document {
};
childrenLinks?: string[];
provider?: string;
warning?: string;

constructor(data: Partial<Document>) {
if (!data.content) {
Expand Down
10 changes: 2 additions & 8 deletions apps/api/src/scraper/WebScraper/utils/excludeTags.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ export const excludeNonMainTags = [
"#nav",
".breadcrumbs",
"#breadcrumbs",
".form",
"form",
"#search-form",
".search",
"#search",
Expand All @@ -51,10 +49,6 @@ export const excludeNonMainTags = [
"#tag",
".category",
"#category",
".comment",
"#comment",
".reply",
"#reply",
".author",
"#author",
".cookie",
"#cookie"
];

0 comments on commit 77a79b5

Please sign in to comment.