From 4400157c11f0c4e5998376c81e783bdb72299de6 Mon Sep 17 00:00:00 2001
From: Dardan Bujupaj
Date: Fri, 8 Mar 2024 23:39:51 +0100
Subject: [PATCH 1/3] add configuration option for additional request headers

add a pre-navigation hook to add those headers to the request
---
 src/crawler.ts | 20 ++++++++++++++++++--
 src/types.ts   |  1 +
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index e42da1b..8d10f80 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -5,6 +5,7 @@
 import {
   PuppeteerCrawlingContext,
   PuppeteerCrawlerOptions,
   RequestQueue,
+  PuppeteerHook,
 } from 'crawlee'
 import { minimatch } from 'minimatch'
@@ -50,8 +51,8 @@ export class Crawler {
     this.config.strategy == 'docssearch'
       ? new DocsearchScraper(this.sender, this.config)
       : this.config.strategy == 'schema'
-      ? new SchemaScraper(this.sender, this.config)
-      : new DefaultScraper(this.sender, this.config)
+        ? new SchemaScraper(this.sender, this.config)
+        : new DefaultScraper(this.sender, this.config)
   }

   async run() {
@@ -62,11 +63,26 @@

     //Create the router
     const router = createPuppeteerRouter()
+    // type DefaultHandler = Parameters[0];
     router.addDefaultHandler(this.defaultHandler.bind(this))

+    const preNavigationHooks: PuppeteerHook[] = this.config.additional_request_headers ? [
+      async (crawlingContext) => {
+        crawlingContext.addInterceptRequestHandler(async (request) => {
+          request.continue({
+            headers: {
+              ...request.headers(),
+              ...this.config.additional_request_headers,
+            }
+          });
+        })
+      },
+    ] : []
+
     const puppeteerCrawlerOptions: PuppeteerCrawlerOptions = {
       requestQueue,
       requestHandler: router,
+      preNavigationHooks: preNavigationHooks,
       launchContext: {
         launchOptions: {
           headless: this.config.headless || true,
diff --git a/src/types.ts b/src/types.ts
index 2da52ee..4bf5568 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -9,6 +9,7 @@ export type Config = {
   meilisearch_api_key: string
   start_urls: string[]
   urls_to_exclude?: string[]
+  additional_request_headers?: Record<string, string>
   queue?: string[]
   primary_key?: string
   batch_size?: number

From de3a8682b64bdafa9cece7e53784d92e29ad7d83 Mon Sep 17 00:00:00 2001
From: Dardan Bujupaj
Date: Mon, 18 Mar 2024 12:22:12 +0100
Subject: [PATCH 2/3] fix lint errors

---
 src/crawler.ts | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 8d10f80..ff39441 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -51,8 +51,8 @@ export class Crawler {
     this.config.strategy == 'docssearch'
       ? new DocsearchScraper(this.sender, this.config)
       : this.config.strategy == 'schema'
-        ? new SchemaScraper(this.sender, this.config)
-        : new DefaultScraper(this.sender, this.config)
+      ? new SchemaScraper(this.sender, this.config)
+      : new DefaultScraper(this.sender, this.config)
   }

   async run() {
@@ -63,21 +63,25 @@

     //Create the router
     const router = createPuppeteerRouter()
-    // type DefaultHandler = Parameters[0];
     router.addDefaultHandler(this.defaultHandler.bind(this))

-    const preNavigationHooks: PuppeteerHook[] = this.config.additional_request_headers ? [
-      async (crawlingContext) => {
-        crawlingContext.addInterceptRequestHandler(async (request) => {
-          request.continue({
-            headers: {
-              ...request.headers(),
-              ...this.config.additional_request_headers,
-            }
-          });
-        })
-      },
-    ] : []
+    const preNavigationHooks: PuppeteerHook[] = this.config
+      .additional_request_headers
+      ? [
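+          // preNavigationHooks run once per page, just before Puppeteer navigates.
+          // This hook registers a request interceptor that merges the configured
+          // additional_request_headers into the headers of every outgoing request
+          // (the initial page load as well as its subresources).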
+          async (crawlingContext) => {
+            await crawlingContext.addInterceptRequestHandler(
+              async (request) => {
+                return await request.continue({
+                  headers: {
+                    ...request.headers(),
+                    ...this.config.additional_request_headers,
+                  },
+                })
+              }
+            )
+          },
+        ]
+      : []

     const puppeteerCrawlerOptions: PuppeteerCrawlerOptions = {
       requestQueue,

From 8a4e9237ff826be4f4061c695b24a53710079b9e Mon Sep 17 00:00:00 2001
From: Dardan Bujupaj
Date: Tue, 19 Mar 2024 21:46:15 +0100
Subject: [PATCH 3/3] add description of additional_request_headers to readme

---
 README.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/README.md b/README.md
index 82b1bda..0e745e9 100644
--- a/README.md
+++ b/README.md
@@ -191,6 +191,19 @@ In the case [webhooks](#webhooks) are enabled, the webhook_payload option gives
 `webhook_url`
 The URL on which the webhook calls are made.

+`additional_request_headers`
+An object containing headers to be added to every request the crawler makes.
+This can be useful for adding authentication headers when crawling protected sites.
+
+E.g. to authenticate the crawler with Basic auth (`dXNlcjpwYXNzd29yZA==` is the Base64 encoding of `user:password`):
+```
+{
+  "additional_request_headers": {
+    "Authorization": "Basic dXNlcjpwYXNzd29yZA=="
+  }
+}
+```
+
 ## Webhooks

 To be able to receive updates on the state of the crawler, you need to create a webhook. To do so, you absolutely need to have a public URL that can be reached by the crawler. This URL will be called by the crawler to send you updates.