pinecone-io · dougwithseismic · Sep 15, 2023 · Sep 25, 2023 · Sep 25, 2023 · HarounAns
diff --git a/src/app/api/crawl/crawler.ts b/src/app/api/crawl/crawler.ts
@@ -1,5 +1,5 @@
-import cheerio from 'cheerio';
-import { NodeHtmlMarkdown } from 'node-html-markdown';
+import cheerio from "cheerio";
+import { NodeHtmlMarkdown } from "node-html-markdown";
 
 interface Page {
  url: string;
@@ -11,9 +11,12 @@ class Crawler {
  private pages: Page[] = [];
  private queue: { url: string; depth: number }[] = [];
 
- constructor(private maxDepth = 2, private maxPages = 1) { }
+ constructor(private maxDepth = 2, private maxPages = 1) {}
 
  async crawl(startUrl: string): Promise<Page[]> {
+ // Capture the start URL's hostname so queued links can be limited to the same host
+ const startHostname = new URL(startUrl).hostname;
+
  // Add the start URL to the queue
  this.addToQueue(startUrl);
 
@@ -34,8 +37,8 @@ class Crawler {
  // Parse the HTML and add the page to the list of crawled pages
  this.pages.push({ url, content: this.parseHtml(html) });
 
- // Extract new URLs from the page HTML and add them to the queue
- this.addNewUrlsToQueue(this.extractUrls(html, url), depth);
+ // Extract URLs from the page HTML and queue only those on the start hostname
+ this.addNewUrlsToQueue(this.extractUrls(html, url), depth, startHostname);
  }
 
  // Return the list of crawled pages
@@ -58,8 +61,16 @@ class Crawler {
  this.queue.push({ url, depth });
  }
 
- private addNewUrlsToQueue(urls: string[], depth: number) {
- this.queue.push(...urls.map(url => ({ url, depth: depth + 1 })));
+ private addNewUrlsToQueue(
+ urls: string[],
+ depth: number,
+ startHostname: string
+ ) {
+ const filteredUrls = urls.filter((url) => {
+ const hostname = new URL(url).hostname;
+ return hostname === startHostname;
+ });
+ this.queue.push(...filteredUrls.map((url) => ({ url, depth: depth + 1 })));
  }
 
  private async fetchPage(url: string): Promise<string> {
@@ -68,20 +79,24 @@ class Crawler {
  return await response.text();
  } catch (error) {
  console.error(`Failed to fetch ${url}: ${error}`);
- return '';
+ return "";
  }
  }
 
  private parseHtml(html: string): string {
  const $ = cheerio.load(html);
- $('a').removeAttr('href');
+ $("a").removeAttr("href");
  return NodeHtmlMarkdown.translate($.html());
  }
 
  private extractUrls(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
- const relativeUrls = $('a').map((_, link) => $(link).attr('href')).get() as string[];
- return relativeUrls.map(relativeUrl => new URL(relativeUrl, baseUrl).href);
+ const relativeUrls = $("a")
+ .map((_, link) => $(link).attr("href"))
+ .get() as string[];
+ return relativeUrls.map(
+ (relativeUrl) => new URL(relativeUrl, baseUrl).href
+ );
  }
 }