Feat/pdf ingest fix #835
Changes from 9 commits
server/pdfChunks.ts
@@ -16,8 +16,7 @@ const qcmsWasmPath =
  path.join(__dirname, "../node_modules/pdfjs-dist/wasm/") + "/"
const seenHashDescriptions = new Map<string, string>()
const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || "150", 10)
// Minimum line height used for calculating line break detection tolerance (in PDF units)
const MIN_LINE_HEIGHT_FOR_TOLERANCE = 10

const Logger = getLogger(Subsystem.Integrations).child({
  module: "pdfChunks",
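Context for MIN_LINE_HEIGHT_FOR_TOLERANCE: line-break detection during PDF text extraction typically compares the vertical gap between consecutive text items against a tolerance derived from line height, and a floor like this keeps the tolerance reasonable for tiny glyphs. A minimal sketch of that idea follows; the helper name, item shape, and 0.5 multiplier are illustrative assumptions, not code from this PR.

```typescript
// Illustrative sketch only, not the PR's implementation.
// PDF.js text items expose `transform` (transform[5] is the y position) and `height`.
interface TextItemLike {
  transform: number[]
  height: number
}

const MIN_LINE_HEIGHT_FOR_TOLERANCE = 10 // same floor as the constant above

// Treat a vertical jump larger than the tolerance as a line break.
function isLineBreak(prev: TextItemLike, curr: TextItemLike): boolean {
  const lineHeight = Math.max(prev.height, MIN_LINE_HEIGHT_FOR_TOLERANCE)
  const tolerance = lineHeight * 0.5 // multiplier is an assumption for illustration
  return Math.abs(prev.transform[5] - curr.transform[5]) > tolerance
}
```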
@@ -200,42 +199,7 @@ function validateTextItem(item: any): boolean {
  )
}

/**
 * Extract text from various PDF.js text operators with enhanced validation
 */
function extractTextFromArgs(args: any[]): string {
  let text = ""

  if (!args || args.length === 0) {
    return text
  }

  const firstArg = args[0]

  if (typeof firstArg === "string") {
    text = firstArg
  } else if (Array.isArray(firstArg)) {
    for (const item of firstArg) {
      if (typeof item === "string") {
        text += item
      } else if (typeof item === "number") {
        // Skip spacing numbers in text arrays
        continue
      } else if (item && typeof item === "object") {
        // Enhanced validation using validateTextItem function
        if (validateTextItem(item)) {
          text += item.str
        } else if ("unicode" in item && typeof item.unicode === "string") {
          text += item.unicode
        }
      }
    }
  }

  // Additional validation: ensure we return clean, valid text
  const result = typeof text === "string" ? text : ""
  return result
}

/**
 * Process collected paragraphs into chunks and add to results
@@ -423,62 +387,7 @@ export async function extractTextAndImagesWithChunksFromPDF(
    return paragraphs.filter((p) => p.trim().length > 0)
  }

  // Extract text from operators as fallback for edge cases
  const extractFallbackTextFromOperators = (opList: any): string[] => {
    const fallbackLines: string[] = []

    for (let i = 0; i < opList.fnArray.length; i++) {
      const fnId = opList.fnArray[i]
      const args = opList.argsArray[i]

      // Handle text operators
      if (
        fnId === PDFJS.OPS.showText ||
        fnId === PDFJS.OPS.showSpacedText ||
        fnId === PDFJS.OPS.nextLineShowText ||
        fnId === PDFJS.OPS.nextLineSetSpacingShowText
      ) {
        const extractedText = extractTextFromArgs(args)
        if (extractedText.trim()) {
          fallbackLines.push(extractedText.trim())
        }
      }
    }

    return fallbackLines
  }

  // Combine and deduplicate text from multiple sources
  const combineTextSources = (
    primaryParagraphs: string[],
    fallbackLines: string[],
  ): string[] => {
    if (fallbackLines.length === 0) {
      return primaryParagraphs
    }

    const primaryText = primaryParagraphs.join(" ").toLowerCase()
    const additionalLines: string[] = []

    // Add fallback lines that aren't already covered by primary extraction
    for (const line of fallbackLines) {
      const cleanLine = line.trim()
      if (
        cleanLine.length > 2 && // Skip very short strings
        !primaryText.includes(cleanLine.toLowerCase())
      ) {
        additionalLines.push(cleanLine)
      }
    }

    // If we found additional text, append it as a new paragraph
    if (additionalLines.length > 0) {
      const additionalParagraph = additionalLines.join(" ")
      return [...primaryParagraphs, additionalParagraph]
    }

    return primaryParagraphs
  }

  for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
    Logger.debug(`Processing page ${pageNum}`)
@@ -488,16 +397,9 @@ export async function extractTextAndImagesWithChunksFromPDF(
    const opList = await page.getOperatorList()

    // Use textContent-based paragraphs for this page as primary source
    let primaryParagraphs: string[] = await buildParagraphsFromPage(page)

    // Extract fallback text from operators for edge cases
    const fallbackLines = extractFallbackTextFromOperators(opList)
    let paragraphs: string[] = await buildParagraphsFromPage(page)

    // Combine both sources, prioritizing primary extraction
    let paragraphs: string[] = combineTextSources(
      primaryParagraphs,
      fallbackLines,
    )

    let textOperatorCount = (await page.getTextContent()).items.length
@@ -512,8 +414,8 @@ export async function extractTextAndImagesWithChunksFromPDF(
    Logger.debug("Text extraction summary for page", {
      pageNum,
      primaryParagraphs: primaryParagraphs.length,
      fallbackLines: fallbackLines.length,
      primaryParagraphs: paragraphs.length,
      finalParagraphs: paragraphs.length,
      textOperatorCount,
      initialPageOverlap: pageOverlap,
@@ -1394,6 +1296,7 @@ export async function extractTextAndImagesWithChunksFromPDF(
    describeImages,
  })

  console.log("All text chunks", { text_chunks })
🤖 Prompt for AI Agents
In server/pdfChunks.ts around line 1299, remove the plain console.log that dumps
all text_chunks to stdout; instead use the application's structured logger at
debug/trace level (or guard behind a verbose env flag) and log either a safe
summary (e.g., count of chunks, lengths, or hashes) or a sampled/truncated
subset to avoid exposing PII and high-volume output. Ensure the log call does
not include full chunk contents, and add a comment noting the change for
auditing.
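A minimal sketch of the kind of replacement the prompt describes, assuming text_chunks is a string[] and reusing the module-level Logger shown earlier in the diff; the PDF_CHUNK_DEBUG flag name is an illustrative assumption, not an existing setting:

```typescript
// Sketch only: replace the console.log with a guarded, structured summary.
// Assumes `text_chunks: string[]` and the module-level `Logger` from pdfChunks.ts.
if (process.env.PDF_CHUNK_DEBUG === "true") {
  Logger.debug("Text chunk summary", {
    chunkCount: text_chunks.length,
    totalChars: text_chunks.reduce((sum, chunk) => sum + chunk.length, 0),
    // Truncated sample instead of full contents, to avoid dumping document text
    firstChunkPreview: text_chunks[0]?.slice(0, 80),
  })
}
```

This keeps the log useful for debugging chunking behaviour (counts and sizes) without writing entire document contents to stdout.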