diff --git a/README.md b/README.md index 82b1bda..0e745e9 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,19 @@ In the case [webhooks](#webhooks) are enabled, the webhook_payload option gives `webhook_url` The URL on which the webhook calls are made. +`additional_request_headers` +An object containing headers to be added to every request the crawler makes. +This can be useful to add authentication headers to crawl protected sites. + +E.g. authenticate crawler with basic auth: +``` +{ + "additional_request_headers": { + "Authorization": "Basic dXNlcjpwYXNzd29yZA==" + } +} +``` + ## Webhooks To be able to receive updates on the state of the crawler, you need to create a webhook. To do so, you absolutely need to have a public URL that can be reached by the crawler. This URL will be called by the crawler to send you updates. diff --git a/src/crawler.ts b/src/crawler.ts index e42da1b..ff39441 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -5,6 +5,7 @@ import { PuppeteerCrawlingContext, PuppeteerCrawlerOptions, RequestQueue, + PuppeteerHook, } from 'crawlee' import { minimatch } from 'minimatch' @@ -65,9 +66,28 @@ export class Crawler { // type DefaultHandler = Parameters[0]; router.addDefaultHandler(this.defaultHandler.bind(this)) + const preNavigationHooks: PuppeteerHook[] = this.config + .additional_request_headers + ? 
[ + async (crawlingContext) => { + await crawlingContext.addInterceptRequestHandler( + async (request) => { + return await request.continue({ + headers: { + ...request.headers(), + ...this.config.additional_request_headers, + }, + }) + } + ) + }, + ] + : [] + const puppeteerCrawlerOptions: PuppeteerCrawlerOptions = { requestQueue, requestHandler: router, + preNavigationHooks: preNavigationHooks, launchContext: { launchOptions: { headless: this.config.headless || true, diff --git a/src/types.ts b/src/types.ts index 2da52ee..4bf5568 100644 --- a/src/types.ts +++ b/src/types.ts @@ -9,6 +9,7 @@ export type Config = { meilisearch_api_key: string start_urls: string[] urls_to_exclude?: string[] + additional_request_headers?: Record<string, string> queue?: string[] primary_key?: string batch_size?: number