Skip to content
Merged
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 6 additions & 103 deletions server/pdfChunks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ const qcmsWasmPath =
path.join(__dirname, "../node_modules/pdfjs-dist/wasm/") + "/"
const seenHashDescriptions = new Map<string, string>()
const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || "150", 10)
// Minimum line height used for calculating line break detection tolerance (in PDF units)
const MIN_LINE_HEIGHT_FOR_TOLERANCE = 10


const Logger = getLogger(Subsystem.Integrations).child({
module: "pdfChunks",
Expand Down Expand Up @@ -200,42 +199,7 @@ function validateTextItem(item: any): boolean {
)
}

/**
 * Extract text from the argument list of a PDF.js text-showing operator.
 *
 * Handles the two argument shapes PDF.js produces: a plain string, or an
 * array mixing strings, spacing numbers (TJ adjustments), and glyph objects.
 *
 * @param args - operator argument array taken from `opList.argsArray`
 * @returns the concatenated text, or "" when nothing extractable is present
 */
function extractTextFromArgs(args: any[]): string {
  if (!args || args.length === 0) {
    return ""
  }

  const firstArg = args[0]

  if (typeof firstArg === "string") {
    return firstArg
  }

  if (!Array.isArray(firstArg)) {
    // Non-string, non-array first args (e.g. numeric spacing values) carry no text.
    return ""
  }

  let text = ""
  for (const item of firstArg) {
    if (typeof item === "string") {
      text += item
    } else if (item && typeof item === "object") {
      // Prefer validated `str` payloads; fall back to raw `unicode` glyph data.
      if (validateTextItem(item)) {
        text += item.str
      } else if ("unicode" in item && typeof item.unicode === "string") {
        text += item.unicode
      }
    }
    // Plain numbers are TJ spacing adjustments — skipped, they carry no text.
  }

  // NOTE: the previous trailing `typeof text === "string"` guard was dead code
  // (`text` is statically a string) and has been removed.
  return text
}

/**
* Process collected paragraphs into chunks and add to results
Expand Down Expand Up @@ -423,62 +387,7 @@ export async function extractTextAndImagesWithChunksFromPDF(
return paragraphs.filter((p) => p.trim().length > 0)
}

// Fallback pass: pull raw text straight out of the operator list, for pages
// where the primary textContent-based extraction misses content.
const extractFallbackTextFromOperators = (opList: any): string[] => {
  const recovered: string[] = []

  opList.fnArray.forEach((fnId: number, idx: number) => {
    // Only the four text-showing operators carry extractable strings.
    const isTextOp =
      fnId === PDFJS.OPS.showText ||
      fnId === PDFJS.OPS.showSpacedText ||
      fnId === PDFJS.OPS.nextLineShowText ||
      fnId === PDFJS.OPS.nextLineSetSpacingShowText
    if (!isTextOp) return

    const text = extractTextFromArgs(opList.argsArray[idx]).trim()
    if (text) {
      recovered.push(text)
    }
  })

  return recovered
}

// Merge fallback-extracted lines into the primary paragraphs, keeping only
// fallback text not already present (case-insensitively) in the primary set.
const combineTextSources = (
  primaryParagraphs: string[],
  fallbackLines: string[],
): string[] => {
  if (fallbackLines.length === 0) {
    return primaryParagraphs
  }

  // Case-insensitive haystack of everything the primary pass already found.
  const haystack = primaryParagraphs.join(" ").toLowerCase()

  const novelLines = fallbackLines
    .map((line) => line.trim())
    .filter(
      (line) =>
        line.length > 2 && // ignore very short fragments
        !haystack.includes(line.toLowerCase()),
    )

  // Genuinely new fallback text is appended as one extra paragraph.
  return novelLines.length > 0
    ? [...primaryParagraphs, novelLines.join(" ")]
    : primaryParagraphs
}

for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
Logger.debug(`Processing page ${pageNum}`)
Expand All @@ -488,16 +397,9 @@ export async function extractTextAndImagesWithChunksFromPDF(
const opList = await page.getOperatorList()

// Use textContent-based paragraphs for this page as primary source
let primaryParagraphs: string[] = await buildParagraphsFromPage(page)

// Extract fallback text from operators for edge cases
const fallbackLines = extractFallbackTextFromOperators(opList)
let paragraphs: string[] = await buildParagraphsFromPage(page)

// Combine both sources, prioritizing primary extraction
let paragraphs: string[] = combineTextSources(
primaryParagraphs,
fallbackLines,
)


let textOperatorCount = (await page.getTextContent()).items.length

Expand All @@ -512,8 +414,8 @@ export async function extractTextAndImagesWithChunksFromPDF(

Logger.debug("Text extraction summary for page", {
pageNum,
primaryParagraphs: primaryParagraphs.length,
fallbackLines: fallbackLines.length,
primaryParagraphs: paragraphs.length,

finalParagraphs: paragraphs.length,
textOperatorCount,
initialPageOverlap: pageOverlap,
Expand Down Expand Up @@ -1394,6 +1296,7 @@ export async function extractTextAndImagesWithChunksFromPDF(
describeImages,
})

console.log("All text chunks", { text_chunks })
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Remove console.log dumping all text chunks (PII/log volume risk)

This prints full extracted content to stdout in production. Use structured debug logs with sampling or summaries instead.

-    console.log("All text chunks", { text_chunks })
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
console.log("All text chunks", { text_chunks })
🤖 Prompt for AI Agents
In server/pdfChunks.ts around line 1299, remove the plain console.log that dumps
all text_chunks to stdout; instead use the application's structured logger at
debug/trace level (or guard behind a verbose env flag) and log either a safe
summary (e.g., count of chunks, lengths, or hashes) or a sampled/truncated
subset to avoid exposing PII and high-volume output. Ensure the log call does
not include full chunk contents, and add a comment noting the change for
auditing.

Logger.debug("All text chunks", { text_chunks })
Logger.debug("All text chunk positions", { text_chunk_pos })
Logger.debug("All image chunks", { image_chunks })
Expand Down