This repository has been archived by the owner on Jun 7, 2024. It is now read-only.

Commit

Merge pull request #33 from athrael-soju/32-update-crawlerts-add-hostname-check-to-keep-crawler-on-the-same-domain

32-update-crawlerts-add-hostname-check-to-keep-crawler-on-the-same-domain
athrael-soju committed Oct 9, 2023
2 parents c594e8a + 03b2bec commit 813b310
Showing 1 changed file with 44 additions and 10 deletions.
54 changes: 44 additions & 10 deletions src/app/api/crawl/crawler.ts
@@ -1,4 +1,4 @@
-import * as cheerio from 'cheerio';
+import { load } from 'cheerio';
import { NodeHtmlMarkdown } from 'node-html-markdown';

interface Page {
@@ -11,9 +11,17 @@ class Crawler {
private pages: Page[] = [];
private queue: { url: string; depth: number }[] = [];

-constructor(private maxDepth = 2, private maxPages = 1) { }
+constructor(private maxDepth = 2, private maxPages = 1) {}

async crawl(startUrl: string): Promise<Page[]> {
+// Capture the hostname of the start URL
+let startHostname;
+try {
+startHostname = new URL(startUrl).hostname;
+} catch (error) {
+throw new Error(`Invalid URL: ${startUrl}`);
+}
+
// Add the start URL to the queue
this.addToQueue(startUrl);

@@ -34,8 +42,8 @@ class Crawler {
// Parse the HTML and add the page to the list of crawled pages
this.pages.push({ url, content: this.parseHtml(html) });

-// Extract new URLs from the page HTML and add them to the queue
-this.addNewUrlsToQueue(this.extractUrls(html, url), depth);
+// Pass startHostname to addNewUrlsToQueue
+this.addNewUrlsToQueue(this.extractUrls(html, url), depth, startHostname);
}

// Return the list of crawled pages
@@ -58,8 +66,22 @@ class Crawler {
this.queue.push({ url, depth });
}

-private addNewUrlsToQueue(urls: string[], depth: number) {
-this.queue.push(...urls.map(url => ({ url, depth: depth + 1 })));
+private addNewUrlsToQueue(
+urls: string[],
+depth: number,
+startHostname: string
+) {
+const filteredUrls = urls.filter((url) => {
+try {
+const hostname = new URL(url).hostname;
+return hostname === startHostname;
+} catch (e) {
+console.error(`Invalid URL: ${url}`);
+return false;
+}
+});
+
+this.queue.push(...filteredUrls.map((url) => ({ url, depth: depth + 1 })));
}

private async fetchPage(url: string): Promise<string> {
@@ -73,15 +95,27 @@ class Crawler {
}

private parseHtml(html: string): string {
-const $ = cheerio.load(html);
+const $ = load(html);
$('a').removeAttr('href');
return NodeHtmlMarkdown.translate($.html());
}

private extractUrls(html: string, baseUrl: string): string[] {
-const $ = cheerio.load(html);
-const relativeUrls = $('a').map((_, link) => $(link).attr('href')).get() as string[];
-return relativeUrls.map(relativeUrl => new URL(relativeUrl, baseUrl).href);
+const $ = load(html);
+const relativeUrls = $('a')
+.map((_, link) => $(link).attr('href'))
+.get();
+
+try {
+const relativeMap = relativeUrls.map(
+(relativeUrl) => new URL(relativeUrl, baseUrl).href
+);
+
+return relativeMap;
+} catch (error) {
+console.error('Error extracting URLs:', error);
+return [];
+}
}
}
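
Below is a minimal, standalone TypeScript sketch of the same-domain filter this PR adds, useful for seeing its behavior in isolation. The function name sameHostname and the example URLs are illustrative only, not part of the repository.

// Standalone sketch of the hostname filter introduced by this PR (illustrative names).
function sameHostname(urls: string[], startHostname: string): string[] {
  return urls.filter((url) => {
    try {
      // Keep only links whose hostname matches the page the crawl started from.
      return new URL(url).hostname === startHostname;
    } catch {
      // Malformed URLs are skipped rather than aborting the crawl.
      return false;
    }
  });
}

// Example: only the first link shares example.com's hostname; the last entry is not a valid URL.
console.log(
  sameHostname(
    ['https://example.com/about', 'https://other.org/page', 'not a url'],
    'example.com'
  )
); // -> ['https://example.com/about']

This mirrors the try/catch in addNewUrlsToQueue above: the hostname comparison is what keeps the crawler on the start URL's domain, and catching URL-parsing errors keeps a single bad href from stopping the run.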

