From 4028efa01b7bad19e2874993eac54aa04c437843 Mon Sep 17 00:00:00 2001
From: Daniel Andersen
Date: Sat, 27 Sep 2025 22:32:42 +0200
Subject: [PATCH] fix: prevent PDF binary content from being included in
 scrape output

Add PDF detection to skip processing PDF files in fetch and playwright
scrapers. This prevents raw PDF binary data from being dumped into
HTML/markdown fields.

Fixes #28
---
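Notes (not part of the commit message): the snippet below is a rough,
standalone sanity check of the detection heuristic added by this patch. It
mirrors the isPDFContent logic (the helper itself stays module-private in
each scraper file) and exercises it with console.assert; the file name and
sample inputs are made up for illustration only.

// pdf-detection-check.ts (illustrative): a copy of the heuristic added in
// this patch, exercised against a few representative inputs.
function isPDFContent(content: string): boolean {
  if (!content || typeof content !== "string") {
    return false;
  }
  const trimmedContent = content.trim();
  // PDF files start with the "%PDF-" magic marker.
  if (trimmedContent.startsWith("%PDF-")) {
    return true;
  }
  // PDF object/stream keywords appearing together suggest raw PDF internals.
  if (
    trimmedContent.includes("obj") &&
    trimmedContent.includes("endobj") &&
    trimmedContent.includes("stream") &&
    trimmedContent.includes("endstream")
  ) {
    return true;
  }
  // A high ratio of non-printable characters indicates binary content.
  const nonPrintableChars =
    (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g) || []).length;
  return content.length > 100 && nonPrintableChars / content.length > 0.1;
}

// A real PDF body is flagged via the %PDF- header.
console.assert(isPDFContent("%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\nendobj") === true);
// Ordinary HTML should pass through untouched.
console.assert(isPDFContent("<!DOCTYPE html><html><body>hello</body></html>") === false);
// Mostly non-printable binary data (longer than 100 chars) is also flagged.
console.assert(isPDFContent("\x00\x01\x02\x03".repeat(40)) === true);
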
 .../src/scraper/WebScraper/scrapers/fetch.ts  | 46 +++++++++++++++++++
 .../scraper/WebScraper/scrapers/playwright.ts | 46 +++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
index 34e9fcd693..db3da8a119 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
@@ -2,6 +2,39 @@ import axios from "axios";
 import { universalTimeout } from "../global";
 import { Logger } from "../../../lib/logger";
 
+/**
+ * Detects if the content is a PDF file
+ * @param content The content to check
+ * @returns true if the content is a PDF
+ */
+function isPDFContent(content: string): boolean {
+  if (!content || typeof content !== 'string') {
+    return false;
+  }
+
+  const trimmedContent = content.trim();
+
+  // Check for PDF header signature
+  if (trimmedContent.startsWith('%PDF-')) {
+    return true;
+  }
+
+  // Check for PDF binary content indicators
+  if (trimmedContent.includes('obj') && trimmedContent.includes('endobj') &&
+      trimmedContent.includes('stream') && trimmedContent.includes('endstream')) {
+    return true;
+  }
+
+  // Check for high ratio of non-printable characters (typical of binary PDF content)
+  const nonPrintableChars = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g) || []).length;
+  const totalChars = content.length;
+  if (totalChars > 100 && nonPrintableChars / totalChars > 0.1) {
+    return true;
+  }
+
+  return false;
+}
+
 /**
  * Scrapes a URL with Axios
  * @param url The URL to scrape
@@ -44,6 +77,19 @@ export async function scrapeWithFetch(
     }
 
     const text = response.data;
+
+    // Check if the content is a PDF file
+    if (isPDFContent(text)) {
+      Logger.debug(`⛏️ fetch: Detected PDF content for ${url}, skipping PDF processing`);
+      logParams.error_message = "PDF content detected - not suitable for text extraction";
+      logParams.response_code = response.status;
+      return {
+        content: "",
+        pageStatusCode: response.status,
+        pageError: "PDF content detected - not suitable for text extraction",
+      };
+    }
+
     logParams.success = true;
     logParams.html = text;
     logParams.response_code = response.status;
diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
index 9f60bd5c2d..f44e53c6ed 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
@@ -3,6 +3,39 @@ import { generateRequestParams } from "../single_url";
 import { universalTimeout } from "../global";
 import { Logger } from "../../../lib/logger";
 
+/**
+ * Detects if the content is a PDF file
+ * @param content The content to check
+ * @returns true if the content is a PDF
+ */
+function isPDFContent(content: string): boolean {
+  if (!content || typeof content !== 'string') {
+    return false;
+  }
+
+  const trimmedContent = content.trim();
+
+  // Check for PDF header signature
+  if (trimmedContent.startsWith('%PDF-')) {
+    return true;
+  }
+
+  // Check for PDF binary content indicators
+  if (trimmedContent.includes('obj') && trimmedContent.includes('endobj') &&
+      trimmedContent.includes('stream') && trimmedContent.includes('endstream')) {
+    return true;
+  }
+
+  // Check for high ratio of non-printable characters (typical of binary PDF content)
+  const nonPrintableChars = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g) || []).length;
+  const totalChars = content.length;
+  if (totalChars > 100 && nonPrintableChars / totalChars > 0.1) {
+    return true;
+  }
+
+  return false;
+}
+
 /**
  * Scrapes a URL with Playwright
  * @param url The URL to scrape
@@ -64,6 +97,19 @@ export async function scrapeWithPlaywright(
   try {
     const data = JSON.parse(textData);
     const html = data.content;
+
+    // Check if the content is a PDF file
+    if (isPDFContent(html)) {
+      Logger.debug(`⛏️ Playwright: Detected PDF content for ${url}, skipping PDF processing`);
+      logParams.error_message = "PDF content detected - not suitable for text extraction";
+      logParams.response_code = data.pageStatusCode;
+      return {
+        content: "",
+        pageStatusCode: data.pageStatusCode,
+        pageError: "PDF content detected - not suitable for text extraction",
+      };
+    }
+
     logParams.success = true;
     logParams.html = html;
     logParams.response_code = data.pageStatusCode;