Merge pull request #158 from mendableai/nsc/docx-support
feat: Docx Support
nickscamara committed May 16, 2024
2 parents d407ec7 + 9d635cb commit 5c1e6d1
Showing 6 changed files with 182 additions and 13 deletions.
1 change: 1 addition & 0 deletions apps/api/package.json
@@ -33,6 +33,7 @@
"express": "^4.18.2",
"jest": "^29.6.3",
"jest-fetch-mock": "^3.0.3",
"mammoth": "^1.7.2",
"nodemon": "^2.0.20",
"supabase": "^1.77.9",
"supertest": "^6.3.3",
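The new dependency, mammoth, handles the .docx-to-text conversion. As a rough illustration of its Node API (not part of this diff), extracting the plain text of a document looks roughly like this:

import mammoth from "mammoth";

async function docxToText(path: string): Promise<string> {
  // extractRawText resolves to { value, messages }; value is the document's plain text,
  // messages carries any conversion warnings.
  const result = await mammoth.extractRawText({ path });
  return result.value;
}

docxToText("example.docx").then((text) => console.log(text));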
111 changes: 104 additions & 7 deletions apps/api/pnpm-lock.yaml

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion apps/api/src/scraper/WebScraper/crawler.ts
@@ -321,7 +321,7 @@ export class WebCrawler {
".mp4",
".mp3",
".pptx",
".docx",
// ".docx",
".xlsx",
".xml",
];
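Commenting out ".docx" drops it from the crawler's list of file extensions to skip, so links to Word documents are no longer discarded during a crawl and can reach the new docx handling in index.ts below. The exact filtering code is not shown in this diff; a hedged sketch of how such an exclusion list is typically applied:

// Hypothetical helper; the real WebCrawler check may be named and structured differently.
const excludedExtensions = [".mp4", ".mp3", ".pptx", ".xlsx", ".xml"]; // ".docx" no longer listed

function isExcludedFile(url: string): boolean {
  return excludedExtensions.some((ext) => url.toLowerCase().endsWith(ext));
}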
27 changes: 22 additions & 5 deletions apps/api/src/scraper/WebScraper/index.ts
@@ -17,6 +17,7 @@ import {
 } from "./utils/replacePaths";
 import { generateCompletions } from "../../lib/LLM-extraction";
 import { getWebScraperQueue } from "../../../src/services/queue-service";
+import { fetchAndProcessDocx } from "./utils/docxProcessor";

 export class WebScraperDataProvider {
   private bullJobId: string;
@@ -157,7 +158,7 @@ export class WebScraperDataProvider {
private async handleCrawlMode(
inProgress?: (progress: Progress) => void
): Promise<Document[]> {

const crawler = new WebCrawler({
initialUrl: this.urls[0],
includes: this.includes,
@@ -237,9 +238,13 @@
     inProgress?: (progress: Progress) => void,
     allHtmls?: string[]
   ): Promise<Document[]> {
-    let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
-    let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-    links = links.filter((link) => !link.endsWith(".pdf"));
+    const pdfLinks = links.filter(link => link.endsWith(".pdf"));
+    const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
+
+    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+    const docxDocuments = await this.fetchDocxDocuments(docLinks);
+
+    links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));

     let documents = await this.convertUrlsToDocuments(
       links,
@@ -257,7 +262,7 @@
     ) {
       documents = await generateCompletions(documents, this.extractorOptions);
     }
-    return documents.concat(pdfDocuments);
+    return documents.concat(pdfDocuments).concat(docxDocuments);
   }

   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
@@ -272,6 +277,18 @@
       })
     );
   }
+  private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
+    return Promise.all(
+      docxLinks.map(async (p) => {
+        const docXDocument = await fetchAndProcessDocx(p);
+        return {
+          content: docXDocument,
+          metadata: { sourceURL: p },
+          provider: "web-scraper",
+        };
+      })
+    );
+  }

   private applyPathReplacements(documents: Document[]): Document[] {
     return this.replaceAllPathsWithAbsolutePaths
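The new fetchAndProcessDocx helper is imported from ./utils/docxProcessor, presumably one of the changed files whose diff is not rendered above. Based on the mammoth dependency added in package.json, a minimal sketch of what it might look like, assuming axios is used for the download (the actual implementation may differ):

// Hypothetical sketch of apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
import axios from "axios";
import mammoth from "mammoth";

export async function fetchAndProcessDocx(url: string): Promise<string> {
  // Download the .docx as a binary buffer.
  const response = await axios.get(url, { responseType: "arraybuffer" });
  const buffer = Buffer.from(response.data);

  // Convert the Word document to plain text; result.value holds the extracted text.
  const result = await mammoth.extractRawText({ buffer });
  return result.value;
}

With this in place, fetchDocxDocuments maps each .doc/.docx link to a Document whose content is the extracted text, mirroring the existing fetchPdfDocuments path.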
