Merge pull request #158 from mendableai/nsc/docx-support
feat: Docx Support
nickscamara committed May 16, 2024
2 parents d407ec7 + 9d635cb commit 5c1e6d1
Showing 6 changed files with 182 additions and 13 deletions.
1 change: 1 addition & 0 deletions apps/api/package.json
@@ -33,6 +33,7 @@
"express": "^4.18.2",
"jest": "^29.6.3",
"jest-fetch-mock": "^3.0.3",
"mammoth": "^1.7.2",
"nodemon": "^2.0.20",
"supabase": "^1.77.9",
"supertest": "^6.3.3",
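The new dependency, mammoth, handles the .docx-to-text conversion. As a rough illustration of its Node API (not part of this diff), extracting the plain text of a document looks roughly like this:

import mammoth from "mammoth";

async function docxToText(path: string): Promise<string> {
  // extractRawText resolves to { value, messages }; value is the document's plain text,
  // messages carries any conversion warnings.
  const result = await mammoth.extractRawText({ path });
  return result.value;
}

docxToText("example.docx").then((text) => console.log(text));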
111 changes: 104 additions & 7 deletions apps/api/pnpm-lock.yaml

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion apps/api/src/scraper/WebScraper/crawler.ts
@@ -321,7 +321,7 @@ export class WebCrawler {
".mp4",
".mp3",
".pptx",
".docx",
// ".docx",
".xlsx",
".xml",
];
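Commenting out ".docx" drops it from the crawler's list of file extensions to skip, so links to Word documents are no longer discarded during a crawl and can reach the new docx handling in index.ts below. The exact filtering code is not shown in this diff; a hedged sketch of how such an exclusion list is typically applied:

// Hypothetical helper; the real WebCrawler check may be named and structured differently.
const excludedExtensions = [".mp4", ".mp3", ".pptx", ".xlsx", ".xml"]; // ".docx" no longer listed

function isExcludedFile(url: string): boolean {
  return excludedExtensions.some((ext) => url.toLowerCase().endsWith(ext));
}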
27 changes: 22 additions & 5 deletions apps/api/src/scraper/WebScraper/index.ts
@@ -17,6 +17,7 @@ import {
 } from "./utils/replacePaths";
 import { generateCompletions } from "../../lib/LLM-extraction";
 import { getWebScraperQueue } from "../../../src/services/queue-service";
+import { fetchAndProcessDocx } from "./utils/docxProcessor";

 export class WebScraperDataProvider {
   private bullJobId: string;
@@ -157,7 +158,7 @@ export class WebScraperDataProvider {
private async handleCrawlMode(
inProgress?: (progress: Progress) => void
): Promise<Document[]> {

const crawler = new WebCrawler({
initialUrl: this.urls[0],
includes: this.includes,
@@ -237,9 +238,13 @@
     inProgress?: (progress: Progress) => void,
     allHtmls?: string[]
   ): Promise<Document[]> {
-    let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
-    let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-    links = links.filter((link) => !link.endsWith(".pdf"));
+    const pdfLinks = links.filter(link => link.endsWith(".pdf"));
+    const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
+
+    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+    const docxDocuments = await this.fetchDocxDocuments(docLinks);
+
+    links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));

     let documents = await this.convertUrlsToDocuments(
       links,
@@ -257,7 +262,7 @@
     ) {
       documents = await generateCompletions(documents, this.extractorOptions);
     }
-    return documents.concat(pdfDocuments);
+    return documents.concat(pdfDocuments).concat(docxDocuments);
   }

   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
@@ -272,6 +277,18 @@
       })
     );
   }
+  private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
+    return Promise.all(
+      docxLinks.map(async (p) => {
+        const docXDocument = await fetchAndProcessDocx(p);
+        return {
+          content: docXDocument,
+          metadata: { sourceURL: p },
+          provider: "web-scraper",
+        };
+      })
+    );
+  }

   private applyPathReplacements(documents: Document[]): Document[] {
     return this.replaceAllPathsWithAbsolutePaths
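The new fetchAndProcessDocx helper is imported from ./utils/docxProcessor, presumably one of the changed files whose diff is not rendered above. Based on the mammoth dependency added in package.json, a minimal sketch of what it might look like, assuming axios is used for the download (the actual implementation may differ):

// Hypothetical sketch of apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
import axios from "axios";
import mammoth from "mammoth";

export async function fetchAndProcessDocx(url: string): Promise<string> {
  // Download the .docx as a binary buffer.
  const response = await axios.get(url, { responseType: "arraybuffer" });
  const buffer = Buffer.from(response.data);

  // Convert the Word document to plain text; result.value holds the extracted text.
  const result = await mammoth.extractRawText({ buffer });
  return result.value;
}

With this in place, fetchDocxDocuments maps each .doc/.docx link to a Document whose content is the extracted text, mirroring the existing fetchPdfDocuments path.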
