From 4400157c11f0c4e5998376c81e783bdb72299de6 Mon Sep 17 00:00:00 2001
From: Dardan Bujupaj
Date: Fri, 8 Mar 2024 23:39:51 +0100
Subject: [PATCH 1/3] add configuration option for additional request headers

add a pre-navigation hook to add those headers to the request
---
 src/crawler.ts | 20 ++++++++++++++++++--
 src/types.ts   |  1 +
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index e42da1b..8d10f80 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -5,6 +5,7 @@
 import {
   PuppeteerCrawlingContext,
   PuppeteerCrawlerOptions,
   RequestQueue,
+  PuppeteerHook,
 } from 'crawlee'
 import { minimatch } from 'minimatch'
@@ -50,8 +51,8 @@ export class Crawler {
     this.config.strategy == 'docssearch'
       ? new DocsearchScraper(this.sender, this.config)
       : this.config.strategy == 'schema'
-      ? new SchemaScraper(this.sender, this.config)
-      : new DefaultScraper(this.sender, this.config)
+        ? new SchemaScraper(this.sender, this.config)
+        : new DefaultScraper(this.sender, this.config)
   }

   async run() {
@@ -62,11 +63,26 @@

     //Create the router
     const router = createPuppeteerRouter()
+    // type DefaultHandler = Parameters[0];
     router.addDefaultHandler(this.defaultHandler.bind(this))

+    const preNavigationHooks: PuppeteerHook[] = this.config.additional_request_headers ? [
+      async (crawlingContext) => {
+        crawlingContext.addInterceptRequestHandler(async (request) => {
+          request.continue({
+            headers: {
+              ...request.headers(),
+              ...this.config.additional_request_headers,
+            }
+          });
+        })
+      },
+    ] : []
+
     const puppeteerCrawlerOptions: PuppeteerCrawlerOptions = {
       requestQueue,
       requestHandler: router,
+      preNavigationHooks: preNavigationHooks,
       launchContext: {
         launchOptions: {
           headless: this.config.headless || true,
diff --git a/src/types.ts b/src/types.ts
index 2da52ee..4bf5568 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -9,6 +9,7 @@ export type Config = {
   meilisearch_api_key: string
   start_urls: string[]
   urls_to_exclude?: string[]
+  additional_request_headers?: Record<string, string>
   queue?: string[]
   primary_key?: string
   batch_size?: number

From de3a8682b64bdafa9cece7e53784d92e29ad7d83 Mon Sep 17 00:00:00 2001
From: Dardan Bujupaj
Date: Mon, 18 Mar 2024 12:22:12 +0100
Subject: [PATCH 2/3] fix lint errors

---
 src/crawler.ts | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 8d10f80..ff39441 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -51,8 +51,8 @@ export class Crawler {
     this.config.strategy == 'docssearch'
       ? new DocsearchScraper(this.sender, this.config)
       : this.config.strategy == 'schema'
-        ? new SchemaScraper(this.sender, this.config)
-        : new DefaultScraper(this.sender, this.config)
+      ? new SchemaScraper(this.sender, this.config)
+      : new DefaultScraper(this.sender, this.config)
   }

   async run() {
@@ -63,21 +63,25 @@

     //Create the router
     const router = createPuppeteerRouter()
-    // type DefaultHandler = Parameters[0];
     router.addDefaultHandler(this.defaultHandler.bind(this))

-    const preNavigationHooks: PuppeteerHook[] = this.config.additional_request_headers ? [
-      async (crawlingContext) => {
-        crawlingContext.addInterceptRequestHandler(async (request) => {
-          request.continue({
-            headers: {
-              ...request.headers(),
-              ...this.config.additional_request_headers,
-            }
-          });
-        })
-      },
-    ] : []
+    const preNavigationHooks: PuppeteerHook[] = this.config
+      .additional_request_headers
+      ? [
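+          // preNavigationHooks run once per page, just before Puppeteer navigates.
+          // This hook registers a request interceptor that merges the configured
+          // additional_request_headers into the headers of every outgoing request
+          // (the initial page load as well as its subresources).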
+          async (crawlingContext) => {
+            await crawlingContext.addInterceptRequestHandler(
+              async (request) => {
+                return await request.continue({
+                  headers: {
+                    ...request.headers(),
+                    ...this.config.additional_request_headers,
+                  },
+                })
+              }
+            )
+          },
+        ]
+      : []

     const puppeteerCrawlerOptions: PuppeteerCrawlerOptions = {
       requestQueue,

From 8a4e9237ff826be4f4061c695b24a53710079b9e Mon Sep 17 00:00:00 2001
From: Dardan Bujupaj
Date: Tue, 19 Mar 2024 21:46:15 +0100
Subject: [PATCH 3/3] add description of additional_request_headers to readme

---
 README.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/README.md b/README.md
index 82b1bda..0e745e9 100644
--- a/README.md
+++ b/README.md
@@ -191,6 +191,19 @@ In the case [webhooks](#webhooks) are enabled, the webhook_payload option gives
 `webhook_url`
 The URL on which the webhook calls are made.

+`additional_request_headers`
+An object containing headers to be added to every request the crawler makes.
+This can be useful for adding authentication headers when crawling protected sites.
+
+E.g. to authenticate the crawler with Basic auth (`dXNlcjpwYXNzd29yZA==` is the Base64 encoding of `user:password`):
+```
+{
+  "additional_request_headers": {
+    "Authorization": "Basic dXNlcjpwYXNzd29yZA=="
+  }
+}
+```
+
 ## Webhooks

 To be able to receive updates on the state of the crawler, you need to create a webhook. To do so, you absolutely need to have a public URL that can be reached by the crawler. This URL will be called by the crawler to send you updates.