From feb056f72125f500691c310101d95d07a217d787 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 13:07:18 +0530 Subject: [PATCH 1/9] pdfChunks file updated for better text processing --- server/pdfChunks.ts | 1321 +++++++++++++++++++++++++------ server/scripts/testPdfDirect.ts | 89 +++ 2 files changed, 1158 insertions(+), 252 deletions(-) create mode 100644 server/scripts/testPdfDirect.ts diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index c8ab942b9..0f7f11483 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -15,6 +15,7 @@ const openjpegWasmPath = const qcmsWasmPath = path.join(__dirname, "../node_modules/pdfjs-dist/wasm/") + "/" const seenHashDescriptions = new Map() +const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || "150", 10) const Logger = getLogger(Subsystem.Integrations).child({ module: "pdfChunks", @@ -23,14 +24,144 @@ const Logger = getLogger(Subsystem.Integrations).child({ const PDFJS = pdfjsLib // Utility function to clean text consistent with chunkTextByParagraph -const cleanText = (str: string): string => { - const normalized = str.replace(/\r\n|\r/g, "\n") - return normalized.replace( - /[\u0000-\u0008\u000B-\u000C\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF]/g, - "", - ) +// const cleanText = (str: string): string => { +// console.log('CLEAN TEXT DEBUG: Input string length:', str.length) +// console.log('CLEAN TEXT DEBUG: Input string:', str) + +// const normalized = str.replace(/\r\n|\r/g, "\n") +// console.log('CLEAN TEXT DEBUG: After normalization length:', normalized.length) +// console.log('CLEAN TEXT DEBUG: After normalization:', normalized) + +// const cleaned = normalized.replace( +// /[\u0000-\u0008\u000B-\u000C\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF]/g, +// "", +// ) +// console.log('CLEAN TEXT DEBUG: After cleaning length:', cleaned.length) +// console.log('CLEAN TEXT DEBUG: Cleaned string:', cleaned) + +// return cleaned +// } + +//=== + +export function normalizeText(input: string): string { + if (!input) return "" + + let normalized = input.normalize("NFC") + + // Strip control chars except newline/tab + normalized = normalized.replace(/[^\P{C}\n\t]/gu, "") + + // Normalize whitespace + normalized = normalized.replace(/\u00A0/g, " ") // nbsp → space + normalized = normalized.replace(/\u200B/g, "") // zero-width space + normalized = normalized.replace(/\t+/g, " ") // tabs → single space + + return normalized.trim() +} + +// ========================================= +// 2. 
Smart letter-spacing collapse (per line) +// ========================================= +function smartDespaceLine(line: string): string { + if (!line) return line + + const parts = line.split(/(\s+)/) + const out: string[] = [] + + const isSingleAllowed = (s: string) => + s.length === 1 && /[\p{L}\p{N}'’]/u.test(s) + + const isSingleLowerLetter = (s: string) => s.length === 1 && /\p{Ll}/u.test(s) + + let i = 0 + while (i < parts.length) { + const tok = parts[i] + + if (!/\s+/.test(tok) && isSingleAllowed(tok)) { + const runTokens: string[] = [tok] + let j = i + 1 + + while ( + j + 1 < parts.length && + parts[j] === " " && + !/\s+/.test(parts[j + 1]) && + isSingleAllowed(parts[j + 1]) + ) { + runTokens.push(parts[j + 1]) + j += 2 + } + + // Join spaced letters like "N A S A" -> "NASA" + if (runTokens.length >= 3) { + out.push(runTokens.join("")) + i = j + continue + } + + // Join two-letter lowercase sequences like "i s" -> "is" + if ( + runTokens.length === 2 && + isSingleLowerLetter(runTokens[0]) && + isSingleLowerLetter(runTokens[1]) + ) { + out.push(runTokens.join("")) + i = j + continue + } + } + + out.push(tok) + i += 1 + } + + return out.join("") } +// ============================= +// 3. High-level text cleaner +// ============================= +export function cleanText(input: string): string { + let s = normalizeText(input) + + // Fix hyphenation across line breaks + s = s.replace(/(\p{L})-\n(\p{L})/gu, "$1$2") + + // Trim spaces around newlines + s = s.replace(/[ \t]*\n[ \t]*/g, "\n") + + // Turn intra-paragraph newlines into spaces, preserve paragraph breaks + // 1) Mark paragraph breaks + s = s.replace(/\n{2,}/g, "[[PARA]]") + // 2) Collapse remaining newlines (soft wraps) into spaces + s = s.replace(/\n+/g, " ") + // 3) Restore paragraph breaks + s = s.replace(/\[\[PARA\]\]/g, "\n\n") + + // Apply line-wise despacing + s = s + .split("\n") + .map((line) => smartDespaceLine(line)) + .join("\n") + + // Remove spaces before punctuation + s = s.replace(/\s+([.,;:!?])/g, "$1") + + // Cap extreme space runs, preserve 2–4 spaces + s = s.replace(/[ ]{5,}/g, " ") + + // Trim lines & drop empties + s = s + .split("\n") + .map((l) => l.trim()) + .filter((l) => l.length > 0) + .join("\n") + + return s.trim() +} + +//=== + /** * Validate text item */ @@ -77,7 +208,9 @@ function extractTextFromArgs(args: any[]): string { } // Additional validation: ensure we return clean, valid text - return typeof text === "string" ? text : "" + const result = typeof text === "string" ? 
text : "" + console.log("EXTRACT TEXT DEBUG: Final extracted text:", result) + return result } /** @@ -91,19 +224,32 @@ function processTextParagraphs( globalSeq: { value: number }, overlapBytes: number = 32, ): string { - if (paragraphs.length === 0) return "" + console.log("TEXT DEBUG: Processing paragraphs count:", paragraphs.length) + + if (paragraphs.length === 0) { + console.log("TEXT DEBUG: No paragraphs to process") + return "" + } const cleanedParagraphs = paragraphs .map(cleanText) .filter((p) => p.length > 0) - if (cleanedParagraphs.length === 0) return "" + if (cleanedParagraphs.length === 0) { + console.log("TEXT DEBUG: No cleaned paragraphs after filtering") + return "" + } const cleanedText = cleanedParagraphs.join("\n") + // console.log('TEXT DEBUG: Cleaned text length:', cleanedText.length) + // console.log('TEXT DEBUG: Full cleaned text:', cleanedText) + const chunks = chunkTextByParagraph(cleanedText, 512, 128) + // console.log('TEXT DEBUG: Generated chunks count:', chunks.length) for (const chunk of chunks) { text_chunks.push(chunk) text_chunk_pos.push(globalSeq.value) + // console.log('TEXT DEBUG: Added chunk at position', globalSeq.value, 'content:', chunk) globalSeq.value++ } @@ -111,13 +257,25 @@ function processTextParagraphs( // Take the last overlapBytes from the processed text let overlapText = "" let overlapLen = 0 + + // Logger.info(`OVERLAP DEBUG: Calculating overlap text from cleanedText of length ${cleanedText.length}, target bytes: ${overlapBytes}`) + // console.log('OVERLAP DEBUG: Full cleanedText for overlap calculation:', cleanedText) + for (let i = cleanedText.length - 1; i >= 0; i--) { const charBytes = Buffer.byteLength(cleanedText[i], "utf8") - if (overlapLen + charBytes > overlapBytes) break + if (overlapLen + charBytes > overlapBytes) { + // console.log('OVERLAP DEBUG: Stopping overlap calculation at char', i, 'would exceed', overlapBytes, 'bytes (current:', overlapLen, 'char bytes:', charBytes, ')') + break + } overlapText = cleanedText[i] + overlapText overlapLen += charBytes + // console.log('OVERLAP DEBUG: Added char', cleanedText[i], 'to overlap. 
Current overlap length:', overlapLen, 'bytes, text:', overlapText) } + // console.log('OVERLAP DEBUG: Final calculated overlap text:', overlapText) + // console.log('OVERLAP DEBUG: Final overlap length:', overlapLen, 'bytes') + // Logger.info(`OVERLAP DEBUG: processTextParagraphs returning overlap text: "${overlapText}" (${overlapLen} bytes)`) + return overlapText } @@ -133,6 +291,12 @@ export async function extractTextAndImagesWithChunksFromPDF( image_chunk_pos: number[] }> { Logger.info(`Starting PDF processing for: ${docid}`) + console.log("PDF DEBUG: Starting processing with parameters:", { + docid, + extractImages, + describeImages, + dataSize: data.length, + }) const loadingTask = PDFJS.getDocument({ data, @@ -171,8 +335,69 @@ export async function extractTextAndImagesWithChunksFromPDF( let globalSeq = { value: 0 } let crossImageOverlap = "" // Track overlap across images + // Logger.info("OVERLAP DEBUG: Initialized crossImageOverlap as empty string") + // console.log('OVERLAP DEBUG: Starting PDF processing with initial crossImageOverlap:', crossImageOverlap) + Logger.info(`PDF has ${pdfDocument.numPages} pages`) + // Robust text extraction using PDF.js textContent API + const buildParagraphsFromPage = async ( + page: pdfjsLib.PDFPageProxy, + ): Promise => { + const textContent = await page.getTextContent({ + includeMarkedContent: false, + disableNormalization: false, + }) + + // Build lines using hasEOL and Y-position changes (handles PPT/DOC exports) + const lines: string[] = [] + let current = "" + let prevY: number | null = null + let prevH: number | null = null + for (const item of textContent.items as any[]) { + const str: string = item && typeof item.str === "string" ? item.str : "" + if (!str) continue + + const tr = Array.isArray(item.transform) ? item.transform : [] + const y = typeof tr[5] === "number" ? tr[5] : null + const h = typeof item.height === "number" ? 
item.height : null + + let newLine = false + if (prevY != null && y != null) { + const tol = Math.max(prevH || 0, h || 0, 10) * 0.4 // dynamic tolerance + if (Math.abs(y - prevY) > tol) newLine = true + } + + if (newLine || (item as any).hasEOL) { + if (current.length > 0) lines.push(current) + current = str + } else { + current += str + } + + prevY = y + prevH = h + } + if (current.trim().length > 0) lines.push(current) + + // Group lines into paragraphs separated by blank lines + const paragraphs: string[] = [] + let buf: string[] = [] + const pushPara = () => { + if (buf.length === 0) return + paragraphs.push(buf.join("\n")) + buf = [] + } + for (const ln of lines) { + if (ln.trim().length === 0) pushPara() + else buf.push(ln) + } + pushPara() + + // Clean and filter + return paragraphs.map(cleanText).filter((p) => p.length > 0) + } + for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) { Logger.debug(`Processing page ${pageNum}`) @@ -180,17 +405,87 @@ export async function extractTextAndImagesWithChunksFromPDF( try { const opList = await page.getOperatorList() - // Hold paragraphs for current page - let paragraphs: string[] = [] - let currentParagraph = "" - let textOperatorCount = 0 + // Use textContent-based paragraphs for this page + let paragraphs: string[] = await buildParagraphsFromPage(page) + let currentParagraph = "" // kept for image-flow flush, but not used for text + let textOperatorCount = (await page.getTextContent()).items.length - // Start with cross-image overlap if available - if (crossImageOverlap && extractImages) { - currentParagraph = crossImageOverlap + " " - crossImageOverlap = "" // Clear after using + // Helper: try to resolve image object by name directly from page.objs + const resolveImageByName = async ( + name: string, + ): Promise => { + try { + // Some builds expose has method + // @ts-ignore + if ( + typeof (page.objs as any).has === "function" && + (page.objs as any).has(name) + ) { + // @ts-ignore + return (page.objs as any).get(name) + } + const obj = (page.objs as any).get(name) + return obj || null + } catch (e) { + return null + } } + // Track CTM to compute image bounds when image data is not directly retrievable + let currentCTM: [number, number, number, number, number, number] = [ + 1, 0, 0, 1, 0, 0, + ] + const ctmStack: [number, number, number, number, number, number][] = [] + + const mul = ( + m1: number[], + m2: number[], + ): [number, number, number, number, number, number] => { + const [a1, b1, c1, d1, e1, f1] = m1 as [ + number, + number, + number, + number, + number, + number, + ] + const [a2, b2, c2, d2, e2, f2] = m2 as [ + number, + number, + number, + number, + number, + number, + ] + return [ + a1 * a2 + c1 * b2, + b1 * a2 + d1 * b2, + a1 * c2 + c1 * d2, + b1 * c2 + d1 * d2, + a1 * e2 + c1 * f2 + e1, + b1 * e2 + d1 * f2 + f1, + ] + } + + const applyToPoint = ( + m: number[], + x: number, + y: number, + ): { x: number; y: number } => { + const [a, b, c, d, e, f] = m as [ + number, + number, + number, + number, + number, + number, + ] + return { x: a * x + c * y + e, y: b * x + d * y + f } + } + + // Do not inject crossImageOverlap into text paragraphs here + // console.log('OVERLAP DEBUG: Page', pageNum, 'crossImageOverlap at start:', crossImageOverlap) + // Helper to flush currentParagraph into paragraphs array const flushParagraph = () => { if (currentParagraph.trim().length > 0) { @@ -199,33 +494,44 @@ export async function extractTextAndImagesWithChunksFromPDF( } } + let imagesOnPage = 0 + let vectorOpsDetected = 
false for (let i = 0; i < opList.fnArray.length; i++) { const fnId = opList.fnArray[i] const args = opList.argsArray[i] + // console.log(PDFJS.OPS.paintImageXObject , "PDFJS.OPS.paintImageXObject") + // console.log(PDFJS.OPS.paintImageXObjectRepeat , "PDFJS.OPS.paintImageXObjectRepeat") + // console.log(PDFJS.OPS.paintInlineImageXObject , "PDFJS.OPS.paintInlineImageXObject") + // console.log(PDFJS.OPS.paintImageMaskXObject , "PDFJS.OPS.paintImageMaskXObject") + + // Track vector drawing operators (paths, fills, form XObjects) + const isVectorOp = + fnId === PDFJS.OPS.constructPath || + fnId === PDFJS.OPS.stroke || + fnId === PDFJS.OPS.closeStroke || + fnId === PDFJS.OPS.fill || + fnId === PDFJS.OPS.eoFill || + fnId === PDFJS.OPS.fillStroke || + fnId === PDFJS.OPS.eoFillStroke || + fnId === PDFJS.OPS.closeFillStroke || + fnId === PDFJS.OPS.closeEOFillStroke || + fnId === PDFJS.OPS.clip || + fnId === PDFJS.OPS.eoClip || + fnId === PDFJS.OPS.rectangle || + fnId === PDFJS.OPS.shadingFill || + fnId === PDFJS.OPS.rawFillPath || + fnId === PDFJS.OPS.paintFormXObjectBegin || + fnId === PDFJS.OPS.paintFormXObjectEnd + if (isVectorOp) vectorOpsDetected = true + switch (fnId) { case PDFJS.OPS.showText: - case PDFJS.OPS.showSpacedText: { - const text = extractTextFromArgs(args) - if (text) { - currentParagraph += text + " " - textOperatorCount++ - } - break - } - // Handle line break operators - case PDFJS.OPS.nextLine: { - flushParagraph() - break - } + case PDFJS.OPS.showSpacedText: + case PDFJS.OPS.nextLine: case PDFJS.OPS.nextLineShowText: case PDFJS.OPS.nextLineSetSpacingShowText: { - const text = extractTextFromArgs(args) - if (text) { - currentParagraph += text + " " - textOperatorCount++ - } - flushParagraph() + // Text handled via getTextContent; ignore operator-driven text break } // Handle matrix and positioning operators that might indicate paragraph breaks @@ -234,171 +540,467 @@ export async function extractTextAndImagesWithChunksFromPDF( case PDFJS.OPS.moveText: { // These might indicate significant positioning changes // For now, we'll be conservative and not flush, but this could be adjusted + if (fnId === PDFJS.OPS.transform) { + try { + if ( + Array.isArray(args) && + args.length >= 6 && + args.every((n: any) => typeof n === "number") + ) { + currentCTM = mul(currentCTM, args as number[]) + } + } catch {} + } + break + } + case PDFJS.OPS.save: { + ctmStack.push([...currentCTM]) + break + } + case PDFJS.OPS.restore: { + if (ctmStack.length) currentCTM = ctmStack.pop()! break } - // Handle image operators + // Handle image operators - be more comprehensive case extractImages ? PDFJS.OPS.paintImageXObject : null: case extractImages ? PDFJS.OPS.paintImageXObjectRepeat : null: case extractImages ? PDFJS.OPS.paintInlineImageXObject : null: - case extractImages ? PDFJS.OPS.paintImageMaskXObject : null: { - // Flush any pending text paragraphs before image - flushParagraph() - - // Process accumulated paragraphs and capture overlap - const overlapText = processTextParagraphs( - paragraphs, - text_chunks, - text_chunk_pos, - globalSeq, + case extractImages ? PDFJS.OPS.paintImageMaskXObject : null: + case extractImages ? 83 : null: + case extractImages ? 85 : null: + case extractImages ? 86 : null: + case extractImages ? 
88 : null: { + console.log( + "IMAGE DEBUG: Image operator detected on page", + pageNum, + { + extractImages, + operatorType: fnId, + imageName: args[0], + knownOperators: { + paintImageXObject: PDFJS.OPS.paintImageXObject, + paintImageXObjectRepeat: PDFJS.OPS.paintImageXObjectRepeat, + paintInlineImageXObject: PDFJS.OPS.paintInlineImageXObject, + paintImageMaskXObject: PDFJS.OPS.paintImageMaskXObject, + }, + }, ) - paragraphs = [] // Clear paragraphs after processing - // Store overlap for continuation after image - crossImageOverlap = overlapText + // Do not process text per-image anymore; text is processed once per page. + // Maintain crossImageOverlap continuity by keeping placeholders only. + flushParagraph() // Extract image buffer - const imageName = args[0] - // Small delay to ensure image object has a chance to resolve - let imageDict - try { - imageDict = page.objs.get(imageName) - } catch (err) { - Logger.debug( - `Image ${imageName} not resolved or failed to decode on page ${pageNum}: ${err instanceof Error ? err.message : err}`, - ) - continue + const imageName = + typeof args?.[0] === "string" + ? args[0] + : args?.[0] && + typeof args[0] === "object" && + typeof args[0].name === "string" + ? args[0].name + : args?.[0] + console.log("IMAGE DEBUG: Processing image:", imageName) + let imageDict: any | null = null + let isInline = false + // Inline image may directly carry data in args + console.log("IMAGE DEBUG: Initial args for image operator:", args) + console.log("IMAGE DEBUG: fnId for image operator:", fnId) + console.log( + PDFJS.OPS.paintInlineImageXObject, + "PDFJS.OPS.paintInlineImageXObject", + ) + if (fnId === PDFJS.OPS.paintInlineImageXObject || fnId === 86) { + console.log("IMAGE DEBUG: Detected inline image data in args") + const candidate = Array.isArray(args) + ? args.find( + (a: any) => + a && + typeof a === "object" && + ("data" in a || "imgData" in a) && + "width" in a && + "height" in a, + ) + : null + if (candidate) { + imageDict = candidate + isInline = true + } } - if (!imageDict || !imageDict.data) { + console.log( + "IMAGE DEBUG: Initial imageDict resolved from args:", + imageDict, + ) + if ( + !imageDict && + (typeof imageName === "string" || + (imageName && + typeof imageName === "object" && + typeof imageName.name === "string")) + ) { + const name = + typeof imageName === "string" ? 
imageName : imageName.name + imageDict = await resolveImageByName(name) + } + + // If we cannot get the raw image object, skip this image + if (!imageDict) { Logger.debug( - `No image data found for ${imageName} on page ${pageNum}`, + `No image object available for ${imageName} on page ${pageNum} — skipping`, ) continue } + console.log("IMAGE DEBUG: Resolved imageDict:", { + imageDict, + isInline, + }) - try { - const { width, height, kind, data } = imageDict - - if (!width || !height || width <= 0 || height <= 0) { - Logger.debug( - `Invalid image dimensions for ${imageName}: ${width}x${height}`, - ) - continue - } - - if ( - data.length > - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 - ) { - Logger.warn( - `Skipping large image (${(data.length / (1024 * 1024)).toFixed(2)} MB): ${imageName}`, - ) - continue - } - - if (width < 250 || height < 250) continue // Skip small images - - let uint8Data: Uint8Array - if (data instanceof Uint8Array) { - uint8Data = data - } else if ( - data && - typeof data === "object" && - data.length !== undefined - ) { - uint8Data = new Uint8Array(data) - } else { - Logger.debug(`Invalid image data format for ${imageName}`) - continue - } + // Ensure imageDict is valid before processing + if (!imageDict || typeof imageDict !== "object") { + console.log( + "IMAGE DEBUG: imageDict is null or invalid, skipping to crop fallback", + ) + // This will fall through to the crop fallback logic below + } else { + try { + // Fast paths for Canvas or Image-like objects returned by page.objs + const isCanvasLike = (obj: any) => + obj && + typeof obj.getContext === "function" && + typeof obj.width === "number" && + typeof obj.height === "number" + const isImageLike = (obj: any) => + obj && + typeof obj.width === "number" && + typeof obj.height === "number" && + typeof obj.getContext !== "function" - const canvas = createCanvas(width, height) - const ctx = canvas.getContext("2d") - let imageProcessed = false - - switch (kind) { - case pdfjsLib.ImageKind.GRAYSCALE_1BPP: - case pdfjsLib.ImageKind.RGB_24BPP: - case pdfjsLib.ImageKind.RGBA_32BPP: { - const bytesPerPixel = - kind === pdfjsLib.ImageKind.RGBA_32BPP - ? 4 - : kind === pdfjsLib.ImageKind.RGB_24BPP - ? 3 - : 1 - const expectedLength = width * height * bytesPerPixel - - if (uint8Data.length >= expectedLength) { - const rgbaData = new Uint8ClampedArray(width * height * 4) - for (let i = 0; i < width * height; i++) { - const srcIdx = i * bytesPerPixel - const dstIdx = i * 4 - if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { - const gray = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 - rgbaData[dstIdx] = gray // R - rgbaData[dstIdx + 1] = gray // G - rgbaData[dstIdx + 2] = gray // B - rgbaData[dstIdx + 3] = 255 // A - } else if (kind === pdfjsLib.ImageKind.RGB_24BPP) { - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = 255 // A - } else { - // RGBA_32BPP - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = - srcIdx + 3 < uint8Data.length - ? 
uint8Data[srcIdx + 3] - : 255 // A + if (isCanvasLike(imageDict)) { + const c: any = imageDict + const width: number = c.width + const height: number = c.height + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + console.log( + "IMAGE DEBUG: SKIPPED - Small dimensions from canvas for", + imageName, + { width, height }, + ) + } else { + const buffer = c.toBuffer("image/png") + if ( + buffer.length <= + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + // @ts-ignore + let type = await imageType(buffer) + if (!type) type = { mime: "image/png", ext: "png" } + if ( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) + ) { + const imageHash = crypto + .createHash("md5") + .update(new Uint8Array(buffer)) + .digest("hex") + let description = "This is an image." + if (seenHashDescriptions.has(imageHash)) { + description = seenHashDescriptions.get(imageHash)! + } else { + try { + description = describeImages + ? await describeImageWithllm(buffer) + : description + } catch { + // ignore + } + if ( + !description || + description === "No description returned." || + description === "Image is not worth describing." + ) { + description = "Image extracted from PDF page." + } + seenHashDescriptions.set(imageHash, description) + } + try { + const baseDir = path.resolve( + process.env.IMAGE_DIR || + "downloads/xyne_images_db", + ) + const outputDir = path.join(baseDir, docid) + await fsPromises.mkdir(outputDir, { + recursive: true, + }) + const imageFilename = `${globalSeq.value}.${type.ext || "png"}` + const imagePath = path.join( + outputDir, + imageFilename, + ) + await fsPromises.writeFile( + imagePath, + buffer as NodeJS.ArrayBufferView, + ) + Logger.info( + `Saved image (objs/canvas) to: ${imagePath}`, + ) + } catch (e) { + Logger.error( + `Failed to save objs/canvas image for ${imageName} on page ${pageNum}: ${e instanceof Error ? 
e.message : e}`, + ) + // Skip on failure + break + } + image_chunks.push(description) + image_chunk_pos.push(globalSeq.value) + crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + globalSeq.value++ + imagesOnPage += 1 + Logger.debug( + `Successfully processed objs/canvas image ${imageName} on page ${pageNum}`, + ) + break } + } else { + Logger.warn( + `Skipping objs/canvas image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, + ) } - const imageData = new ImageData(rgbaData, width) - ctx.putImageData(imageData, 0, 0) - imageProcessed = true } - break } - default: { - try { - const imgBuffer = Buffer.from(uint8Data.buffer) - const img = new CanvasImage() - await new Promise((resolve, reject) => { - img.onload = () => resolve() - img.onerror = (err) => reject(err) - img.src = imgBuffer - }) - ctx.drawImage(img, 0, 0) - imageProcessed = true - } catch (err) { + + if (isImageLike(imageDict)) { + const imgLike: any = imageDict + const width: number = imgLike.width + const height: number = imgLike.height + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + console.log( + "IMAGE DEBUG: SKIPPED - Small dimensions from image-like for", + imageName, + { width, height }, + ) + } else { + const cnv = createCanvas(width, height) + const cctx = cnv.getContext("2d") try { + // @ts-ignore draw directly + cctx.drawImage(imgLike, 0, 0) + const buffer = cnv.toBuffer("image/png") + // @ts-ignore + let type = await imageType(buffer) + if (!type) type = { mime: "image/png", ext: "png" } + if ( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) + ) { + const imageHash = crypto + .createHash("md5") + .update(new Uint8Array(buffer)) + .digest("hex") + let description = "This is an image." + if (seenHashDescriptions.has(imageHash)) { + description = seenHashDescriptions.get(imageHash)! + } else { + try { + description = describeImages + ? await describeImageWithllm(buffer) + : description + } catch { + // ignore + } + if ( + !description || + description === "No description returned." || + description === "Image is not worth describing." + ) { + description = "Image extracted from PDF page." + } + seenHashDescriptions.set(imageHash, description) + } + try { + const baseDir = path.resolve( + process.env.IMAGE_DIR || + "downloads/xyne_images_db", + ) + const outputDir = path.join(baseDir, docid) + await fsPromises.mkdir(outputDir, { + recursive: true, + }) + const imageFilename = `${globalSeq.value}.${type.ext || "png"}` + const imagePath = path.join( + outputDir, + imageFilename, + ) + await fsPromises.writeFile( + imagePath, + buffer as NodeJS.ArrayBufferView, + ) + Logger.info( + `Saved image (objs/image) to: ${imagePath}`, + ) + } catch (e) { + Logger.error( + `Failed to save objs/image image for ${imageName} on page ${pageNum}: ${e instanceof Error ? e.message : e}`, + ) + break + } + image_chunks.push(description) + image_chunk_pos.push(globalSeq.value) + crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + globalSeq.value++ + imagesOnPage += 1 + Logger.debug( + `Successfully processed objs/image image ${imageName} on page ${pageNum}`, + ) + break + } + } catch (e) { + Logger.debug( + `Drawing objs image failed for ${imageName} on page ${pageNum}: ${e instanceof Error ? e.message : e}`, + ) + } + } + } + + const width: number = (imageDict.width ?? + imageDict.w) as number + const height: number = (imageDict.height ?? + imageDict.h) as number + const kind = + imageDict.kind ?? imageDict.imageKind ?? 
imageDict.ImageKind + // data may live in imageDict.data, imageDict.imgData.data, or imageDict.bytes + let rawData: any = + imageDict.data ?? + imageDict.bytes ?? + (imageDict.imgData ? imageDict.imgData.data : undefined) + + console.log( + "IMAGE DEBUG: Full image details for", + imageName, + { + width, + height, + kind, + dataLength: rawData ? rawData.length : null, + dataSizeMB: rawData + ? (rawData.length / (1024 * 1024)).toFixed(2) + : null, + maxAllowedSizeMB: + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + minDimension: MIN_IMAGE_DIM_PX, + isValidDimensions: width > 0 && height > 0, + meetsMinSize: + width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, + withinSizeLimit: rawData + ? rawData.length <= + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + : false, + isInline, + }, + ) + + if (!width || !height || width <= 0 || height <= 0) { + console.log( + "IMAGE DEBUG: SKIPPED - Invalid dimensions for", + imageName, + "width:", + width, + "height:", + height, + ) + Logger.debug( + `Invalid image dimensions for ${imageName}: ${width}x${height}`, + ) + continue + } + + if ( + rawData && + rawData.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + console.log( + "IMAGE DEBUG: SKIPPED - Large file size for", + imageName, + { + actualSizeMB: (rawData.length / (1024 * 1024)).toFixed( + 2, + ), + maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + actualBytes: rawData.length, + maxAllowedBytes: + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * + 1024 * + 1024, + }, + ) + Logger.warn( + `Skipping large image (${(rawData.length / (1024 * 1024)).toFixed(2)} MB): ${imageName}`, + ) + continue + } + + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + console.log( + "IMAGE DEBUG: SKIPPED - Small dimensions for", + imageName, + { + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + widthTooSmall: width < MIN_IMAGE_DIM_PX, + heightTooSmall: height < MIN_IMAGE_DIM_PX, + }, + ) + continue // Skip small images + } + + console.log( + "IMAGE DEBUG: Image passed all filters, proceeding with processing for", + imageName, + ) + + let uint8Data: Uint8Array + if (rawData instanceof Uint8Array) { + uint8Data = rawData + } else if ( + rawData && + typeof rawData === "object" && + rawData.length !== undefined + ) { + uint8Data = new Uint8Array(rawData) + } else { + Logger.debug(`Invalid image data format for ${imageName}`) + continue + } + + const canvas = createCanvas(width, height) + const ctx = canvas.getContext("2d") + let imageProcessed = false + + switch (kind) { + case pdfjsLib.ImageKind.GRAYSCALE_1BPP: + case pdfjsLib.ImageKind.RGB_24BPP: + case pdfjsLib.ImageKind.RGBA_32BPP: { + const bytesPerPixel = + kind === pdfjsLib.ImageKind.RGBA_32BPP + ? 4 + : kind === pdfjsLib.ImageKind.RGB_24BPP + ? 3 + : 1 + const expectedLength = width * height * bytesPerPixel + + if (uint8Data.length >= expectedLength) { const rgbaData = new Uint8ClampedArray( width * height * 4, ) - const bytesPerPixel = Math.floor( - uint8Data.length / (width * height), - ) - - if (bytesPerPixel >= 3) { - for (let i = 0; i < width * height; i++) { - const srcIdx = i * bytesPerPixel - const dstIdx = i * 4 + for (let i = 0; i < width * height; i++) { + const srcIdx = i * bytesPerPixel + const dstIdx = i * 4 + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { + const gray = + srcIdx < uint8Data.length ? 
uint8Data[srcIdx] : 0 + rgbaData[dstIdx] = gray // R + rgbaData[dstIdx + 1] = gray // G + rgbaData[dstIdx + 2] = gray // B + rgbaData[dstIdx + 3] = 255 // A + } else if (kind === pdfjsLib.ImageKind.RGB_24BPP) { rgbaData[dstIdx] = srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R rgbaData[dstIdx + 1] = @@ -410,107 +1012,295 @@ export async function extractTextAndImagesWithChunksFromPDF( ? uint8Data[srcIdx + 2] : 0 // B rgbaData[dstIdx + 3] = 255 // A + } else { + // RGBA_32BPP + rgbaData[dstIdx] = + srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = + srcIdx + 3 < uint8Data.length + ? uint8Data[srcIdx + 3] + : 255 // A } - const imageData = new ImageData(rgbaData, width) - ctx.putImageData(imageData, 0, 0) - imageProcessed = true } - } catch { - Logger.debug( - `Failed to process image ${imageName} with fallback method`, - ) + const imageData = new ImageData(rgbaData, width) + ctx.putImageData(imageData, 0, 0) + imageProcessed = true + } + break + } + default: { + try { + const imgBuffer = Buffer.from(uint8Data.buffer) + const img = new CanvasImage() + await new Promise((resolve, reject) => { + img.onload = () => resolve() + img.onerror = (err) => reject(err) + img.src = imgBuffer + }) + ctx.drawImage(img, 0, 0) + imageProcessed = true + } catch (err) { + try { + const rgbaData = new Uint8ClampedArray( + width * height * 4, + ) + const bytesPerPixel = Math.floor( + uint8Data.length / (width * height), + ) + + if (bytesPerPixel >= 3) { + for (let i = 0; i < width * height; i++) { + const srcIdx = i * bytesPerPixel + const dstIdx = i * 4 + rgbaData[dstIdx] = + srcIdx < uint8Data.length + ? uint8Data[srcIdx] + : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = 255 // A + } + const imageData = new ImageData(rgbaData, width) + ctx.putImageData(imageData, 0, 0) + imageProcessed = true + } + } catch { + Logger.debug( + `Failed to process image ${imageName} with fallback method`, + ) + } } } } - } - if (imageProcessed) { - const buffer = canvas.toBuffer("image/png") - // @ts-ignore - let type = await imageType(buffer) - if (!type) { - Logger.warn( - `Could not determine MIME type for ${imageName}. Defaulting to image/png`, + console.log( + "IMAGE DEBUG: Image processing result for", + imageName, + { + imageProcessed, + canvasWidth: canvas.width, + canvasHeight: canvas.height, + }, + ) + + if (imageProcessed) { + console.log( + "IMAGE DEBUG: Converting to PNG buffer for", + imageName, ) - type = { mime: "image/png", ext: "png" } - } - if ( - !type || - !DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) - ) { - Logger.warn( - `Unsupported or unknown image MIME type: ${type?.mime}. 
Skipping image: ${imageName}`, + const buffer = canvas.toBuffer("image/png") + console.log( + "IMAGE DEBUG: PNG buffer created for", + imageName, + "size:", + buffer.length, + "bytes", ) - continue - } - // buffer already created above - const imageHash = crypto - .createHash("md5") - .update(new Uint8Array(buffer)) - .digest("hex") + // @ts-ignore + let type = await imageType(buffer) + console.log( + "IMAGE DEBUG: Image type detection result for", + imageName, + type, + ) - let description: string + if (!type) { + console.log( + "IMAGE DEBUG: Could not determine MIME type for", + imageName, + "using default image/png", + ) + Logger.warn( + `Could not determine MIME type for ${imageName}. Defaulting to image/png`, + ) + type = { mime: "image/png", ext: "png" } + } - if (seenHashDescriptions.has(imageHash)) { - description = seenHashDescriptions.get(imageHash)! - Logger.warn( - `Reusing description for repeated image ${imageName} on page ${pageNum}`, + console.log( + "IMAGE DEBUG: Checking MIME type support for", + imageName, + { + detectedMime: type.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + isSupported: + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( + type.mime, + ), + }, ) - } else { - if(describeImages) { - description = await describeImageWithllm(buffer) - } else { - description = "This is an image." - } + if ( - description === "No description returned." || - description === "Image is not worth describing." + !type || + !DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) ) { + console.log( + "IMAGE DEBUG: SKIPPED - Unsupported MIME type for", + imageName, + { + detectedMime: type?.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + }, + ) Logger.warn( - `${description} ${imageName} on page ${pageNum}`, + `Unsupported or unknown image MIME type: ${type?.mime}. Skipping image: ${imageName}`, ) continue } - seenHashDescriptions.set(imageHash, description) - } - try { - // Save image to Downloads/xyne_images_db with improved error handling - const baseDir = path.resolve( - process.env.IMAGE_DIR || "downloads/xyne_images_db", + console.log( + "IMAGE DEBUG: MIME type check passed for", + imageName, + "proceeding with hash and description", ) - const outputDir = path.join(baseDir, docid) - await fsPromises.mkdir(outputDir, { recursive: true }) - const imageFilename = `${globalSeq.value}.${type.ext || "png"}` - const imagePath = path.join(outputDir, imageFilename) + // buffer already created above + const imageHash = crypto + .createHash("md5") + .update(new Uint8Array(buffer)) + .digest("hex") + + let description: string - await fsPromises.writeFile( - imagePath, - buffer as NodeJS.ArrayBufferView, + if (seenHashDescriptions.has(imageHash)) { + description = seenHashDescriptions.get(imageHash)! + console.log( + "IMAGE DEBUG: Reusing cached description for", + imageName, + "description:", + description, + ) + Logger.warn( + `Reusing description for repeated image ${imageName} on page ${pageNum}`, + ) + } else { + console.log( + "IMAGE DEBUG: Generating new description for", + imageName, + "describeImages:", + describeImages, + ) + if (describeImages) { + try { + console.log( + "AI DEBUG: Calling describeImageWithllm for image", + imageName, + ) + description = await describeImageWithllm(buffer) + console.log( + "AI DEBUG: Got description from AI for", + imageName, + "description:", + description, + ) + } catch (e) { + Logger.warn( + `describeImageWithllm failed for ${imageName}: ${e instanceof Error ? 
e.message : e}`, + ) + description = "This is an image from the PDF." + console.log( + "IMAGE DEBUG: Fallback description used due to AI error", + ) + } + } else { + description = "This is an image." + console.log( + "IMAGE DEBUG: Using default description (describeImages=false)", + ) + } + if ( + description === "No description returned." || + description === "Image is not worth describing." + ) { + console.log( + "IMAGE DEBUG: Replacing insufficient description for", + imageName, + "previous:", + description, + ) + Logger.warn( + `${description} ${imageName} on page ${pageNum}`, + ) + description = "Image extracted from PDF page." + } + seenHashDescriptions.set(imageHash, description) + console.log( + "IMAGE DEBUG: Cached new description for", + imageName, + "description:", + description, + ) + } + + try { + // Save image to Downloads/xyne_images_db with improved error handling + const baseDir = path.resolve( + process.env.IMAGE_DIR || "downloads/xyne_images_db", + ) + const outputDir = path.join(baseDir, docid) + await fsPromises.mkdir(outputDir, { recursive: true }) + + const imageFilename = `${globalSeq.value}.${type.ext || "png"}` + const imagePath = path.join(outputDir, imageFilename) + + await fsPromises.writeFile( + imagePath, + buffer as NodeJS.ArrayBufferView, + ) + Logger.info(`Saved image to: ${imagePath}`) + } catch (saveError) { + Logger.error( + `Failed to save image for ${imageName} on page ${pageNum}: ${saveError instanceof Error ? saveError.message : saveError}`, + ) + // Skip adding to chunks if save failed + continue + } + + image_chunks.push(description) + image_chunk_pos.push(globalSeq.value) + // Logger.info(`OVERLAP DEBUG: Adding image placeholder to crossImageOverlap. Before: "${crossImageOverlap}"`) + // console.log('OVERLAP DEBUG: crossImageOverlap before adding image placeholder:', crossImageOverlap) + crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + // Logger.info(`OVERLAP DEBUG: Added image placeholder to crossImageOverlap. After: "${crossImageOverlap}"`) + // console.log('OVERLAP DEBUG: crossImageOverlap after adding image placeholder:', crossImageOverlap) + console.log( + "IMAGE DEBUG: Added image chunk at position", + globalSeq.value, + { + imageName, + description, + crossImageOverlap, + }, ) - Logger.info(`Saved image to: ${imagePath}`) - } catch (saveError) { - Logger.error( - `Failed to save image for ${imageName} on page ${pageNum}: ${saveError instanceof Error ? 
saveError.message : saveError}`, + globalSeq.value++ + imagesOnPage += 1 + Logger.debug( + `Successfully processed image ${imageName} on page ${pageNum}`, ) - // Skip adding to chunks if save failed - continue } - - image_chunks.push(description) - image_chunk_pos.push(globalSeq.value) - crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` - globalSeq.value++ - Logger.debug( - `Successfully processed image ${imageName} on page ${pageNum}`, + } catch (error) { + Logger.warn( + `Failed to process image ${imageName} on page ${pageNum}: ${(error as Error).message}`, ) } - } catch (error) { - Logger.warn( - `Failed to process image ${imageName} on page ${pageNum}: ${(error as Error).message}`, - ) } break } @@ -520,6 +1310,8 @@ export async function extractTextAndImagesWithChunksFromPDF( } } + // Vector snapshot functionality removed (no longer creating fallback canvas) + // End of page: flush remaining paragraph and process paragraphs flushParagraph() const overlapText = processTextParagraphs( @@ -530,10 +1322,20 @@ export async function extractTextAndImagesWithChunksFromPDF( ) // Update cross-image overlap - APPEND instead of REPLACE to preserve image placeholders + // Logger.info(`OVERLAP DEBUG: End of page ${pageNum} - processing final overlap update`) + // console.log('OVERLAP DEBUG: Page', pageNum, 'end - overlapText from processTextParagraphs:', overlapText) + // console.log('OVERLAP DEBUG: Page', pageNum, 'end - crossImageOverlap before final update:', crossImageOverlap) if (overlapText.trim()) { + // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - overlapText has content, updating crossImageOverlap`) + const previousCrossImageOverlap = crossImageOverlap crossImageOverlap = crossImageOverlap ? `${crossImageOverlap} ${overlapText}` : overlapText + // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - crossImageOverlap updated from "${previousCrossImageOverlap}" to "${crossImageOverlap}"`) + // console.log('OVERLAP DEBUG: Page', pageNum, 'end - crossImageOverlap after final update:', crossImageOverlap) + } else { + // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - overlapText is empty, no update to crossImageOverlap`) + // console.log('OVERLAP DEBUG: Page', pageNum, 'end - no update to crossImageOverlap (overlapText empty)') } Logger.debug( @@ -549,6 +1351,20 @@ export async function extractTextAndImagesWithChunksFromPDF( `PDF processing completed. 
Total text chunks: ${text_chunks.length}, Total image chunks: ${image_chunks.length}`, ) + console.log("FINAL DEBUG: PDF processing completed for", docid) + console.log("FINAL DEBUG: Processing summary:", { + totalTextChunks: text_chunks.length, + totalImageChunks: image_chunks.length, + textChunkPositions: text_chunk_pos.length, + imageChunkPositions: image_chunk_pos.length, + extractImages, + describeImages, + }) + + console.log("FINAL DEBUG: All text chunks:", text_chunks) + console.log("FINAL DEBUG: All text chunk positions:", text_chunk_pos) + console.log("FINAL DEBUG: All image chunks:", image_chunks) + console.log("FINAL DEBUG: All image chunk positions:", image_chunk_pos) return { text_chunks, image_chunks, @@ -556,6 +1372,7 @@ export async function extractTextAndImagesWithChunksFromPDF( image_chunk_pos, } } finally { + console.log("Calling destroy") await pdfDocument.destroy() } } diff --git a/server/scripts/testPdfDirect.ts b/server/scripts/testPdfDirect.ts new file mode 100644 index 000000000..0f0adac30 --- /dev/null +++ b/server/scripts/testPdfDirect.ts @@ -0,0 +1,89 @@ +import { readFileSync } from "fs" +import { resolve } from "path" +import { FileProcessorService } from "@/services/fileProcessor" +import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks" + +async function testPdfDirect() { + let pdfPath = "/Users/aayush.shah/Downloads/small2.pdf" + // const pdfPath = "/Users/aayush.shah/Downloads/Aayush_Resume_2025.pdf" + pdfPath = "/Users/aayush.shah/Downloads/somatosensory.pdf" + try { + console.log("=== DIRECT PDF PROCESSING TEST ===") + console.log("PDF Path:", pdfPath) + + // Read the PDF file + console.log("\n1. Reading PDF file...") + const pdfBuffer = readFileSync(pdfPath) + console.log("File size:", pdfBuffer.length, "bytes") + + console.log("\n2. Testing direct PDF processing (current knowledge base flow)...") + console.log("This simulates exactly what happens in the knowledge base upload:") + console.log("- FileProcessorService.processFile() is called") + console.log("- extractImages defaults to false") + console.log("- describeImages defaults to false") + + // Test the exact flow used in knowledge base + const result = await FileProcessorService.processFile( + pdfBuffer, + "application/pdf", + "small2.pdf", + "test-doc-id", + pdfPath + // extractImages and describeImages default to false + ) + + console.log("\n=== RESULTS FROM KNOWLEDGE BASE FLOW ===") + console.log("Text chunks:", result.chunks.length) + console.log("Image chunks:", result.image_chunks.length) + console.log("Text chunk positions:", result.chunks_pos.length) + console.log("Image chunk positions:", result.image_chunks_pos.length) + + console.log("\n3. 
Testing with image processing enabled...") + console.log("Parameters: extractImages=true, describeImages=true") + + // Test with images enabled to see the difference + const imageResult = await extractTextAndImagesWithChunksFromPDF( + new Uint8Array(pdfBuffer), + "test-doc-with-images", + true, // extractImages enabled + true // describeImages enabled + ) + + console.log("\n=== RESULTS WITH IMAGES ENABLED ===") + console.log("Text chunks:", imageResult.text_chunks.length) + console.log("Image chunks:", imageResult.image_chunks.length) + console.log("Text chunk positions:", imageResult.text_chunk_pos.length) + console.log("Image chunk positions:", imageResult.image_chunk_pos.length) + + console.log("\n=== COMPARISON ===") + console.log("Current KB flow - Text chunks:", result.chunks.length, "Image chunks:", result.image_chunks.length) + console.log("With images - Text chunks:", imageResult.text_chunks.length, "Image chunks:", imageResult.image_chunks.length) + + if (result.chunks.length > 0) { + console.log("\n=== SAMPLE TEXT CHUNKS ===") + result.chunks.slice(0, 2).forEach((chunk, idx) => { + console.log(`\nText Chunk ${idx + 1}:`) + console.log(chunk) + }) + } + + if (imageResult.image_chunks.length > 0) { + console.log("\n=== SAMPLE IMAGE DESCRIPTIONS ===") + imageResult.image_chunks.forEach((chunk, idx) => { + console.log(`\nImage ${idx + 1}:`) + console.log(chunk) + }) + } + + console.log("\n=== TEST COMPLETED ===") + console.log("✓ Check the debug logs above from pdfChunks.ts") + console.log("✓ You can see exactly what's being processed in the current knowledge base flow") + + } catch (error) { + console.error("Error processing PDF:", error) + process.exit(1) + } +} + +// Run the test +testPdfDirect().catch(console.error) \ No newline at end of file From 60bfdb9babb0f9039543ab25512843ef8701b4e0 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 14:07:45 +0530 Subject: [PATCH 2/9] comment fixes --- server/pdfChunks.ts | 343 +++++++++++++++++++++++++++----------------- 1 file changed, 211 insertions(+), 132 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 0f7f11483..43c7f210e 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -16,6 +16,8 @@ const qcmsWasmPath = path.join(__dirname, "../node_modules/pdfjs-dist/wasm/") + "/" const seenHashDescriptions = new Map() const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || "150", 10) +// Minimum line height used for calculating line break detection tolerance (in PDF units) +const MIN_LINE_HEIGHT_FOR_TOLERANCE = 10 const Logger = getLogger(Subsystem.Integrations).child({ module: "pdfChunks", @@ -23,26 +25,7 @@ const Logger = getLogger(Subsystem.Integrations).child({ const PDFJS = pdfjsLib -// Utility function to clean text consistent with chunkTextByParagraph -// const cleanText = (str: string): string => { -// console.log('CLEAN TEXT DEBUG: Input string length:', str.length) -// console.log('CLEAN TEXT DEBUG: Input string:', str) -// const normalized = str.replace(/\r\n|\r/g, "\n") -// console.log('CLEAN TEXT DEBUG: After normalization length:', normalized.length) -// console.log('CLEAN TEXT DEBUG: After normalization:', normalized) - -// const cleaned = normalized.replace( -// /[\u0000-\u0008\u000B-\u000C\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF]/g, -// "", -// ) -// console.log('CLEAN TEXT DEBUG: After cleaning length:', cleaned.length) -// console.log('CLEAN TEXT DEBUG: Cleaned string:', cleaned) - -// return cleaned -// } - -//=== export 
function normalizeText(input: string): string { if (!input) return "" @@ -131,12 +114,13 @@ export function cleanText(input: string): string { s = s.replace(/[ \t]*\n[ \t]*/g, "\n") // Turn intra-paragraph newlines into spaces, preserve paragraph breaks - // 1) Mark paragraph breaks - s = s.replace(/\n{2,}/g, "[[PARA]]") + // 1) Mark paragraph breaks with a unique placeholder + const uniqueParaPlaceholder = `\uE000XYNE_PARA_BREAK_${Math.random().toString(36).substring(2)}\uE001` + s = s.replace(/\n{2,}/g, uniqueParaPlaceholder) // 2) Collapse remaining newlines (soft wraps) into spaces s = s.replace(/\n+/g, " ") // 3) Restore paragraph breaks - s = s.replace(/\[\[PARA\]\]/g, "\n\n") + s = s.replace(new RegExp(uniqueParaPlaceholder.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'), "\n\n") // Apply line-wise despacing s = s @@ -160,6 +144,47 @@ export function cleanText(input: string): string { return s.trim() } +// ============================= +// 4. Matrix transformation utilities +// ============================= + +/** + * Multiply two 2D transformation matrices + * Each matrix is represented as [a, b, c, d, e, f] corresponding to: + * [a c e] + * [b d f] + * [0 0 1] + */ +function multiplyMatrices( + m1: number[], + m2: number[], +): [number, number, number, number, number, number] { + const [a1, b1, c1, d1, e1, f1] = m1 as [ + number, + number, + number, + number, + number, + number, + ] + const [a2, b2, c2, d2, e2, f2] = m2 as [ + number, + number, + number, + number, + number, + number, + ] + return [ + a1 * a2 + c1 * b2, + b1 * a2 + d1 * b2, + a1 * c2 + c1 * d2, + b1 * c2 + d1 * d2, + a1 * e2 + c1 * f2 + e1, + b1 * e2 + d1 * f2 + f1, + ] +} + //=== /** @@ -290,6 +315,8 @@ export async function extractTextAndImagesWithChunksFromPDF( text_chunk_pos: number[] image_chunk_pos: number[] }> { + // Sanitize docid for safe filesystem use + const safeDocId = docid.replace(/[^a-zA-Z0-9._-]/g, "_") Logger.info(`Starting PDF processing for: ${docid}`) console.log("PDF DEBUG: Starting processing with parameters:", { docid, @@ -398,6 +425,65 @@ export async function extractTextAndImagesWithChunksFromPDF( return paragraphs.map(cleanText).filter((p) => p.length > 0) } + // Extract text from operators as fallback for edge cases + const extractFallbackTextFromOperators = ( + opList: any, + ): string[] => { + const fallbackLines: string[] = [] + + for (let i = 0; i < opList.fnArray.length; i++) { + const fnId = opList.fnArray[i] + const args = opList.argsArray[i] + + // Handle text operators + if ( + fnId === PDFJS.OPS.showText || + fnId === PDFJS.OPS.showSpacedText || + fnId === PDFJS.OPS.nextLineShowText || + fnId === PDFJS.OPS.nextLineSetSpacingShowText + ) { + const extractedText = extractTextFromArgs(args) + if (extractedText.trim()) { + fallbackLines.push(extractedText.trim()) + } + } + } + + return fallbackLines + } + + // Combine and deduplicate text from multiple sources + const combineTextSources = ( + primaryParagraphs: string[], + fallbackLines: string[], + ): string[] => { + if (fallbackLines.length === 0) { + return primaryParagraphs + } + + const primaryText = primaryParagraphs.join(" ").toLowerCase() + const additionalLines: string[] = [] + + // Add fallback lines that aren't already covered by primary extraction + for (const line of fallbackLines) { + const cleanLine = line.trim() + if ( + cleanLine.length > 2 && // Skip very short strings + !primaryText.includes(cleanLine.toLowerCase()) + ) { + additionalLines.push(cleanLine) + } + } + + // If we found additional text, append it as a 
new paragraph + if (additionalLines.length > 0) { + const additionalParagraph = additionalLines.join(" ") + return [...primaryParagraphs, additionalParagraph] + } + + return primaryParagraphs + } + for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) { Logger.debug(`Processing page ${pageNum}`) @@ -405,11 +491,25 @@ export async function extractTextAndImagesWithChunksFromPDF( try { const opList = await page.getOperatorList() - // Use textContent-based paragraphs for this page - let paragraphs: string[] = await buildParagraphsFromPage(page) + // Use textContent-based paragraphs for this page as primary source + let primaryParagraphs: string[] = await buildParagraphsFromPage(page) + + // Extract fallback text from operators for edge cases + const fallbackLines = extractFallbackTextFromOperators(opList) + + // Combine both sources, prioritizing primary extraction + let paragraphs: string[] = combineTextSources(primaryParagraphs, fallbackLines) + let currentParagraph = "" // kept for image-flow flush, but not used for text let textOperatorCount = (await page.getTextContent()).items.length + console.log("TEXT DEBUG: Text extraction summary for page", pageNum, { + primaryParagraphs: primaryParagraphs.length, + fallbackLines: fallbackLines.length, + finalParagraphs: paragraphs.length, + textOperatorCount, + }) + // Helper: try to resolve image object by name directly from page.objs const resolveImageByName = async ( name: string, @@ -437,51 +537,6 @@ export async function extractTextAndImagesWithChunksFromPDF( ] const ctmStack: [number, number, number, number, number, number][] = [] - const mul = ( - m1: number[], - m2: number[], - ): [number, number, number, number, number, number] => { - const [a1, b1, c1, d1, e1, f1] = m1 as [ - number, - number, - number, - number, - number, - number, - ] - const [a2, b2, c2, d2, e2, f2] = m2 as [ - number, - number, - number, - number, - number, - number, - ] - return [ - a1 * a2 + c1 * b2, - b1 * a2 + d1 * b2, - a1 * c2 + c1 * d2, - b1 * c2 + d1 * d2, - a1 * e2 + c1 * f2 + e1, - b1 * e2 + d1 * f2 + f1, - ] - } - - const applyToPoint = ( - m: number[], - x: number, - y: number, - ): { x: number; y: number } => { - const [a, b, c, d, e, f] = m as [ - number, - number, - number, - number, - number, - number, - ] - return { x: a * x + c * y + e, y: b * x + d * y + f } - } // Do not inject crossImageOverlap into text paragraphs here // console.log('OVERLAP DEBUG: Page', pageNum, 'crossImageOverlap at start:', crossImageOverlap) @@ -500,10 +555,6 @@ export async function extractTextAndImagesWithChunksFromPDF( const fnId = opList.fnArray[i] const args = opList.argsArray[i] - // console.log(PDFJS.OPS.paintImageXObject , "PDFJS.OPS.paintImageXObject") - // console.log(PDFJS.OPS.paintImageXObjectRepeat , "PDFJS.OPS.paintImageXObjectRepeat") - // console.log(PDFJS.OPS.paintInlineImageXObject , "PDFJS.OPS.paintInlineImageXObject") - // console.log(PDFJS.OPS.paintImageMaskXObject , "PDFJS.OPS.paintImageMaskXObject") // Track vector drawing operators (paths, fills, form XObjects) const isVectorOp = @@ -531,7 +582,8 @@ export async function extractTextAndImagesWithChunksFromPDF( case PDFJS.OPS.nextLine: case PDFJS.OPS.nextLineShowText: case PDFJS.OPS.nextLineSetSpacingShowText: { - // Text handled via getTextContent; ignore operator-driven text + // Text is now handled by combined extraction approach + // Operator-level extraction happens in extractFallbackTextFromOperators break } // Handle matrix and positioning operators that might indicate paragraph 
breaks @@ -547,7 +599,7 @@ export async function extractTextAndImagesWithChunksFromPDF( args.length >= 6 && args.every((n: any) => typeof n === "number") ) { - currentCTM = mul(currentCTM, args as number[]) + currentCTM = multiplyMatrices(currentCTM, args as number[]) } } catch {} } @@ -561,15 +613,11 @@ export async function extractTextAndImagesWithChunksFromPDF( if (ctmStack.length) currentCTM = ctmStack.pop()! break } - // Handle image operators - be more comprehensive + // Handle image operators case extractImages ? PDFJS.OPS.paintImageXObject : null: case extractImages ? PDFJS.OPS.paintImageXObjectRepeat : null: case extractImages ? PDFJS.OPS.paintInlineImageXObject : null: - case extractImages ? PDFJS.OPS.paintImageMaskXObject : null: - case extractImages ? 83 : null: - case extractImages ? 85 : null: - case extractImages ? 86 : null: - case extractImages ? 88 : null: { + case extractImages ? PDFJS.OPS.paintImageMaskXObject : null: { console.log( "IMAGE DEBUG: Image operator detected on page", pageNum, @@ -609,7 +657,7 @@ export async function extractTextAndImagesWithChunksFromPDF( PDFJS.OPS.paintInlineImageXObject, "PDFJS.OPS.paintInlineImageXObject", ) - if (fnId === PDFJS.OPS.paintInlineImageXObject || fnId === 86) { + if (fnId === PDFJS.OPS.paintInlineImageXObject) { console.log("IMAGE DEBUG: Detected inline image data in args") const candidate = Array.isArray(args) ? args.find( @@ -725,7 +773,7 @@ export async function extractTextAndImagesWithChunksFromPDF( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, docid) + const outputDir = path.join(baseDir, safeDocId) await fsPromises.mkdir(outputDir, { recursive: true, }) @@ -818,7 +866,7 @@ export async function extractTextAndImagesWithChunksFromPDF( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, docid) + const outputDir = path.join(baseDir, safeDocId) await fsPromises.mkdir(outputDir, { recursive: true, }) @@ -978,59 +1026,81 @@ export async function extractTextAndImagesWithChunksFromPDF( case pdfjsLib.ImageKind.GRAYSCALE_1BPP: case pdfjsLib.ImageKind.RGB_24BPP: case pdfjsLib.ImageKind.RGBA_32BPP: { - const bytesPerPixel = - kind === pdfjsLib.ImageKind.RGBA_32BPP - ? 4 - : kind === pdfjsLib.ImageKind.RGB_24BPP - ? 3 - : 1 - const expectedLength = width * height * bytesPerPixel + let expectedLength: number + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { + // 1 bit per pixel, packed into bytes + expectedLength = Math.ceil((width * height) / 8) + } else { + const bytesPerPixel = + kind === pdfjsLib.ImageKind.RGBA_32BPP + ? 4 + : 3 // RGB_24BPP + expectedLength = width * height * bytesPerPixel + } if (uint8Data.length >= expectedLength) { const rgbaData = new Uint8ClampedArray( width * height * 4, ) - for (let i = 0; i < width * height; i++) { - const srcIdx = i * bytesPerPixel - const dstIdx = i * 4 - if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { - const gray = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 - rgbaData[dstIdx] = gray // R - rgbaData[dstIdx + 1] = gray // G - rgbaData[dstIdx + 2] = gray // B - rgbaData[dstIdx + 3] = 255 // A - } else if (kind === pdfjsLib.ImageKind.RGB_24BPP) { - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? 
uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = 255 // A - } else { - // RGBA_32BPP - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = - srcIdx + 3 < uint8Data.length - ? uint8Data[srcIdx + 3] - : 255 // A + + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { + // Handle 1 bit per pixel grayscale (bit-packed data) + let pixelIndex = 0 + for (let y = 0; y < height; y++) { + for (let x = 0; x < width; x++) { + const byteIndex = Math.floor(pixelIndex / 8) + const bitIndex = 7 - (pixelIndex % 8) // MSB first + const bit = byteIndex < uint8Data.length + ? (uint8Data[byteIndex] >> bitIndex) & 1 + : 0 + const gray = bit ? 255 : 0 // Convert bit to full pixel value + + const dstIdx = pixelIndex * 4 + rgbaData[dstIdx] = gray // R + rgbaData[dstIdx + 1] = gray // G + rgbaData[dstIdx + 2] = gray // B + rgbaData[dstIdx + 3] = 255 // A + pixelIndex++ + } + } + } else { + // Handle RGB_24BPP and RGBA_32BPP (byte-per-channel data) + const bytesPerPixel = kind === pdfjsLib.ImageKind.RGBA_32BPP ? 4 : 3 + for (let i = 0; i < width * height; i++) { + const srcIdx = i * bytesPerPixel + const dstIdx = i * 4 + if (kind === pdfjsLib.ImageKind.RGB_24BPP) { + rgbaData[dstIdx] = + srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = 255 // A + } else { + // RGBA_32BPP + rgbaData[dstIdx] = + srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = + srcIdx + 3 < uint8Data.length + ? 
uint8Data[srcIdx + 3] + : 255 // A + } } } - const imageData = new ImageData(rgbaData, width) + const imageData = new ImageData(rgbaData, width, height) ctx.putImageData(imageData, 0, 0) imageProcessed = true } @@ -1074,7 +1144,7 @@ export async function extractTextAndImagesWithChunksFromPDF( : 0 // B rgbaData[dstIdx + 3] = 255 // A } - const imageData = new ImageData(rgbaData, width) + const imageData = new ImageData(rgbaData, width, height) ctx.putImageData(imageData, 0, 0) imageProcessed = true } @@ -1103,6 +1173,15 @@ export async function extractTextAndImagesWithChunksFromPDF( imageName, ) const buffer = canvas.toBuffer("image/png") + if ( + buffer.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn( + `Skipping encoded image > ${DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB} MB (size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB)`, + ) + continue + } console.log( "IMAGE DEBUG: PNG buffer created for", imageName, @@ -1255,7 +1334,7 @@ export async function extractTextAndImagesWithChunksFromPDF( const baseDir = path.resolve( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, docid) + const outputDir = path.join(baseDir, safeDocId) await fsPromises.mkdir(outputDir, { recursive: true }) const imageFilename = `${globalSeq.value}.${type.ext || "png"}` From 29508fa9a7d91ed4759156b969b7c264d16fec78 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 15:18:01 +0530 Subject: [PATCH 3/9] comment fixes --- server/pdfChunks.ts | 406 +++++++++++++++++++------------------------- 1 file changed, 170 insertions(+), 236 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 43c7f210e..f3b5675e0 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -25,8 +25,6 @@ const Logger = getLogger(Subsystem.Integrations).child({ const PDFJS = pdfjsLib - - export function normalizeText(input: string): string { if (!input) return "" @@ -43,9 +41,9 @@ export function normalizeText(input: string): string { return normalized.trim() } -// ========================================= + // 2. Smart letter-spacing collapse (per line) -// ========================================= + function smartDespaceLine(line: string): string { if (!line) return line @@ -101,9 +99,9 @@ function smartDespaceLine(line: string): string { return out.join("") } -// ============================= + // 3. High-level text cleaner -// ============================= + export function cleanText(input: string): string { let s = normalizeText(input) @@ -120,7 +118,13 @@ export function cleanText(input: string): string { // 2) Collapse remaining newlines (soft wraps) into spaces s = s.replace(/\n+/g, " ") // 3) Restore paragraph breaks - s = s.replace(new RegExp(uniqueParaPlaceholder.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'), "\n\n") + s = s.replace( + new RegExp( + uniqueParaPlaceholder.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), + "g", + ), + "\n\n", + ) // Apply line-wise despacing s = s @@ -144,9 +148,9 @@ export function cleanText(input: string): string { return s.trim() } -// ============================= + // 4. Matrix transformation utilities -// ============================= + /** * Multiply two 2D transformation matrices @@ -234,7 +238,6 @@ function extractTextFromArgs(args: any[]): string { // Additional validation: ensure we return clean, valid text const result = typeof text === "string" ? 
text : "" - console.log("EXTRACT TEXT DEBUG: Final extracted text:", result) return result } @@ -249,10 +252,10 @@ function processTextParagraphs( globalSeq: { value: number }, overlapBytes: number = 32, ): string { - console.log("TEXT DEBUG: Processing paragraphs count:", paragraphs.length) + Logger.debug("Processing paragraphs", { count: paragraphs.length }) if (paragraphs.length === 0) { - console.log("TEXT DEBUG: No paragraphs to process") + Logger.debug("No paragraphs to process") return "" } @@ -260,7 +263,7 @@ function processTextParagraphs( .map(cleanText) .filter((p) => p.length > 0) if (cleanedParagraphs.length === 0) { - console.log("TEXT DEBUG: No cleaned paragraphs after filtering") + Logger.debug("No cleaned paragraphs after filtering") return "" } @@ -317,8 +320,7 @@ export async function extractTextAndImagesWithChunksFromPDF( }> { // Sanitize docid for safe filesystem use const safeDocId = docid.replace(/[^a-zA-Z0-9._-]/g, "_") - Logger.info(`Starting PDF processing for: ${docid}`) - console.log("PDF DEBUG: Starting processing with parameters:", { + Logger.debug("Starting processing with parameters", { docid, extractImages, describeImages, @@ -426,11 +428,9 @@ export async function extractTextAndImagesWithChunksFromPDF( } // Extract text from operators as fallback for edge cases - const extractFallbackTextFromOperators = ( - opList: any, - ): string[] => { + const extractFallbackTextFromOperators = (opList: any): string[] => { const fallbackLines: string[] = [] - + for (let i = 0; i < opList.fnArray.length; i++) { const fnId = opList.fnArray[i] const args = opList.argsArray[i] @@ -493,17 +493,21 @@ export async function extractTextAndImagesWithChunksFromPDF( // Use textContent-based paragraphs for this page as primary source let primaryParagraphs: string[] = await buildParagraphsFromPage(page) - + // Extract fallback text from operators for edge cases const fallbackLines = extractFallbackTextFromOperators(opList) - + // Combine both sources, prioritizing primary extraction - let paragraphs: string[] = combineTextSources(primaryParagraphs, fallbackLines) - + let paragraphs: string[] = combineTextSources( + primaryParagraphs, + fallbackLines, + ) + let currentParagraph = "" // kept for image-flow flush, but not used for text let textOperatorCount = (await page.getTextContent()).items.length - console.log("TEXT DEBUG: Text extraction summary for page", pageNum, { + Logger.debug("Text extraction summary for page", { + pageNum, primaryParagraphs: primaryParagraphs.length, fallbackLines: fallbackLines.length, finalParagraphs: paragraphs.length, @@ -537,7 +541,6 @@ export async function extractTextAndImagesWithChunksFromPDF( ] const ctmStack: [number, number, number, number, number, number][] = [] - // Do not inject crossImageOverlap into text paragraphs here // console.log('OVERLAP DEBUG: Page', pageNum, 'crossImageOverlap at start:', crossImageOverlap) @@ -555,7 +558,6 @@ export async function extractTextAndImagesWithChunksFromPDF( const fnId = opList.fnArray[i] const args = opList.argsArray[i] - // Track vector drawing operators (paths, fills, form XObjects) const isVectorOp = fnId === PDFJS.OPS.constructPath || @@ -618,21 +620,12 @@ export async function extractTextAndImagesWithChunksFromPDF( case extractImages ? PDFJS.OPS.paintImageXObjectRepeat : null: case extractImages ? PDFJS.OPS.paintInlineImageXObject : null: case extractImages ? 
PDFJS.OPS.paintImageMaskXObject : null: { - console.log( - "IMAGE DEBUG: Image operator detected on page", + Logger.debug("Image operator detected", { pageNum, - { - extractImages, - operatorType: fnId, - imageName: args[0], - knownOperators: { - paintImageXObject: PDFJS.OPS.paintImageXObject, - paintImageXObjectRepeat: PDFJS.OPS.paintImageXObjectRepeat, - paintInlineImageXObject: PDFJS.OPS.paintInlineImageXObject, - paintImageMaskXObject: PDFJS.OPS.paintImageMaskXObject, - }, - }, - ) + extractImages, + operatorType: fnId, + imageName: args[0], + }) // Do not process text per-image anymore; text is processed once per page. // Maintain crossImageOverlap continuity by keeping placeholders only. @@ -647,18 +640,17 @@ export async function extractTextAndImagesWithChunksFromPDF( typeof args[0].name === "string" ? args[0].name : args?.[0] - console.log("IMAGE DEBUG: Processing image:", imageName) + Logger.debug("Processing image", { imageName }) let imageDict: any | null = null let isInline = false // Inline image may directly carry data in args - console.log("IMAGE DEBUG: Initial args for image operator:", args) - console.log("IMAGE DEBUG: fnId for image operator:", fnId) - console.log( - PDFJS.OPS.paintInlineImageXObject, - "PDFJS.OPS.paintInlineImageXObject", - ) + Logger.debug("Image operator details", { + args: args.length, + fnId, + paintInlineImageXObject: PDFJS.OPS.paintInlineImageXObject, + }) if (fnId === PDFJS.OPS.paintInlineImageXObject) { - console.log("IMAGE DEBUG: Detected inline image data in args") + Logger.debug("Detected inline image data in args") const candidate = Array.isArray(args) ? args.find( (a: any) => @@ -674,10 +666,10 @@ export async function extractTextAndImagesWithChunksFromPDF( isInline = true } } - console.log( - "IMAGE DEBUG: Initial imageDict resolved from args:", - imageDict, - ) + Logger.debug("Initial imageDict resolved", { + hasImageDict: !!imageDict, + isInline, + }) if ( !imageDict && (typeof imageName === "string" || @@ -697,16 +689,14 @@ export async function extractTextAndImagesWithChunksFromPDF( ) continue } - console.log("IMAGE DEBUG: Resolved imageDict:", { - imageDict, + Logger.debug("Resolved imageDict", { + hasImageDict: !!imageDict, isInline, }) // Ensure imageDict is valid before processing if (!imageDict || typeof imageDict !== "object") { - console.log( - "IMAGE DEBUG: imageDict is null or invalid, skipping to crop fallback", - ) + Logger.debug("imageDict is null or invalid, skipping to crop fallback") // This will fall through to the crop fallback logic below } else { try { @@ -727,11 +717,12 @@ export async function extractTextAndImagesWithChunksFromPDF( const width: number = c.width const height: number = c.height if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { - console.log( - "IMAGE DEBUG: SKIPPED - Small dimensions from canvas for", + Logger.debug("Skipped small canvas image", { imageName, - { width, height }, - ) + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) } else { const buffer = c.toBuffer("image/png") if ( @@ -819,11 +810,12 @@ export async function extractTextAndImagesWithChunksFromPDF( const width: number = imgLike.width const height: number = imgLike.height if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { - console.log( - "IMAGE DEBUG: SKIPPED - Small dimensions from image-like for", + Logger.debug("Skipped small image-like object", { imageName, - { width, height }, - ) + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) } else { const cnv = createCanvas(width, height) const cctx = 
cnv.getContext("2d") @@ -918,43 +910,33 @@ export async function extractTextAndImagesWithChunksFromPDF( imageDict.bytes ?? (imageDict.imgData ? imageDict.imgData.data : undefined) - console.log( - "IMAGE DEBUG: Full image details for", + Logger.debug("Full image details", { imageName, - { - width, - height, - kind, - dataLength: rawData ? rawData.length : null, - dataSizeMB: rawData - ? (rawData.length / (1024 * 1024)).toFixed(2) - : null, - maxAllowedSizeMB: - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, - minDimension: MIN_IMAGE_DIM_PX, - isValidDimensions: width > 0 && height > 0, - meetsMinSize: - width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, - withinSizeLimit: rawData - ? rawData.length <= - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 - : false, - isInline, - }, - ) + width, + height, + kind, + dataLength: rawData ? rawData.length : null, + dataSizeMB: rawData + ? (rawData.length / (1024 * 1024)).toFixed(2) + : null, + maxAllowedSizeMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + minDimension: MIN_IMAGE_DIM_PX, + isValidDimensions: width > 0 && height > 0, + meetsMinSize: + width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, + withinSizeLimit: rawData + ? rawData.length <= + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + : false, + isInline, + }) if (!width || !height || width <= 0 || height <= 0) { - console.log( - "IMAGE DEBUG: SKIPPED - Invalid dimensions for", + Logger.debug("Skipped image with invalid dimensions", { imageName, - "width:", width, - "height:", height, - ) - Logger.debug( - `Invalid image dimensions for ${imageName}: ${width}x${height}`, - ) + }) continue } @@ -963,46 +945,27 @@ export async function extractTextAndImagesWithChunksFromPDF( rawData.length > DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 ) { - console.log( - "IMAGE DEBUG: SKIPPED - Large file size for", + Logger.warn("Skipped large image", { imageName, - { - actualSizeMB: (rawData.length / (1024 * 1024)).toFixed( - 2, - ), - maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, - actualBytes: rawData.length, - maxAllowedBytes: - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * - 1024 * - 1024, - }, - ) - Logger.warn( - `Skipping large image (${(rawData.length / (1024 * 1024)).toFixed(2)} MB): ${imageName}`, - ) + actualSizeMB: (rawData.length / (1024 * 1024)).toFixed(2), + maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + }) continue } if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { - console.log( - "IMAGE DEBUG: SKIPPED - Small dimensions for", + Logger.debug("Skipped small image", { imageName, - { - width, - height, - minRequired: MIN_IMAGE_DIM_PX, - widthTooSmall: width < MIN_IMAGE_DIM_PX, - heightTooSmall: height < MIN_IMAGE_DIM_PX, - }, - ) + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) continue // Skip small images } - console.log( - "IMAGE DEBUG: Image passed all filters, proceeding with processing for", + Logger.debug("Image passed all filters, proceeding with processing", { imageName, - ) + }) let uint8Data: Uint8Array if (rawData instanceof Uint8Array) { @@ -1032,9 +995,7 @@ export async function extractTextAndImagesWithChunksFromPDF( expectedLength = Math.ceil((width * height) / 8) } else { const bytesPerPixel = - kind === pdfjsLib.ImageKind.RGBA_32BPP - ? 4 - : 3 // RGB_24BPP + kind === pdfjsLib.ImageKind.RGBA_32BPP ? 
4 : 3 // RGB_24BPP expectedLength = width * height * bytesPerPixel } @@ -1042,7 +1003,7 @@ export async function extractTextAndImagesWithChunksFromPDF( const rgbaData = new Uint8ClampedArray( width * height * 4, ) - + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { // Handle 1 bit per pixel grayscale (bit-packed data) let pixelIndex = 0 @@ -1050,28 +1011,32 @@ export async function extractTextAndImagesWithChunksFromPDF( for (let x = 0; x < width; x++) { const byteIndex = Math.floor(pixelIndex / 8) const bitIndex = 7 - (pixelIndex % 8) // MSB first - const bit = byteIndex < uint8Data.length - ? (uint8Data[byteIndex] >> bitIndex) & 1 - : 0 + const bit = + byteIndex < uint8Data.length + ? (uint8Data[byteIndex] >> bitIndex) & 1 + : 0 const gray = bit ? 255 : 0 // Convert bit to full pixel value - + const dstIdx = pixelIndex * 4 - rgbaData[dstIdx] = gray // R + rgbaData[dstIdx] = gray // R rgbaData[dstIdx + 1] = gray // G rgbaData[dstIdx + 2] = gray // B - rgbaData[dstIdx + 3] = 255 // A + rgbaData[dstIdx + 3] = 255 // A pixelIndex++ } } } else { // Handle RGB_24BPP and RGBA_32BPP (byte-per-channel data) - const bytesPerPixel = kind === pdfjsLib.ImageKind.RGBA_32BPP ? 4 : 3 + const bytesPerPixel = + kind === pdfjsLib.ImageKind.RGBA_32BPP ? 4 : 3 for (let i = 0; i < width * height; i++) { const srcIdx = i * bytesPerPixel const dstIdx = i * 4 if (kind === pdfjsLib.ImageKind.RGB_24BPP) { rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + srcIdx < uint8Data.length + ? uint8Data[srcIdx] + : 0 // R rgbaData[dstIdx + 1] = srcIdx + 1 < uint8Data.length ? uint8Data[srcIdx + 1] @@ -1084,7 +1049,9 @@ export async function extractTextAndImagesWithChunksFromPDF( } else { // RGBA_32BPP rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + srcIdx < uint8Data.length + ? uint8Data[srcIdx] + : 0 // R rgbaData[dstIdx + 1] = srcIdx + 1 < uint8Data.length ? 
uint8Data[srcIdx + 1] @@ -1144,7 +1111,11 @@ export async function extractTextAndImagesWithChunksFromPDF( : 0 // B rgbaData[dstIdx + 3] = 255 // A } - const imageData = new ImageData(rgbaData, width, height) + const imageData = new ImageData( + rgbaData, + width, + height, + ) ctx.putImageData(imageData, 0, 0) imageProcessed = true } @@ -1157,21 +1128,15 @@ export async function extractTextAndImagesWithChunksFromPDF( } } - console.log( - "IMAGE DEBUG: Image processing result for", + Logger.debug("Image processing result", { imageName, - { - imageProcessed, - canvasWidth: canvas.width, - canvasHeight: canvas.height, - }, - ) + imageProcessed, + canvasWidth: canvas.width, + canvasHeight: canvas.height, + }) if (imageProcessed) { - console.log( - "IMAGE DEBUG: Converting to PNG buffer for", - imageName, - ) + Logger.debug("Converting to PNG buffer", { imageName }) const buffer = canvas.toBuffer("image/png") if ( buffer.length > @@ -1182,74 +1147,61 @@ export async function extractTextAndImagesWithChunksFromPDF( ) continue } - console.log( - "IMAGE DEBUG: PNG buffer created for", + Logger.debug("PNG buffer created", { imageName, - "size:", - buffer.length, - "bytes", - ) + size: buffer.length, + }) // @ts-ignore let type = await imageType(buffer) - console.log( - "IMAGE DEBUG: Image type detection result for", + Logger.debug("Image type detection result", { imageName, type, - ) + }) if (!type) { - console.log( - "IMAGE DEBUG: Could not determine MIME type for", + Logger.debug("Could not determine MIME type, using default", { imageName, - "using default image/png", - ) + default: "image/png", + }) Logger.warn( `Could not determine MIME type for ${imageName}. Defaulting to image/png`, ) type = { mime: "image/png", ext: "png" } } - console.log( - "IMAGE DEBUG: Checking MIME type support for", + Logger.debug("Checking MIME type support", { imageName, - { - detectedMime: type.mime, - supportedMimes: Array.from( - DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + detectedMime: type.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + isSupported: + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( + type.mime, ), - isSupported: - DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( - type.mime, - ), - }, - ) + }) if ( !type || !DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) ) { - console.log( - "IMAGE DEBUG: SKIPPED - Unsupported MIME type for", + Logger.debug("Skipped image with unsupported MIME type", { imageName, - { - detectedMime: type?.mime, - supportedMimes: Array.from( - DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, - ), - }, - ) + detectedMime: type?.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + }) Logger.warn( `Unsupported or unknown image MIME type: ${type?.mime}. Skipping image: ${imageName}`, ) continue } - console.log( - "IMAGE DEBUG: MIME type check passed for", + Logger.debug("MIME type check passed, proceeding with processing", { imageName, - "proceeding with hash and description", - ) + }) // buffer already created above const imageHash = crypto @@ -1261,72 +1213,57 @@ export async function extractTextAndImagesWithChunksFromPDF( if (seenHashDescriptions.has(imageHash)) { description = seenHashDescriptions.get(imageHash)! 
- console.log( - "IMAGE DEBUG: Reusing cached description for", + Logger.debug("Reusing cached description for image", { imageName, - "description:", description, - ) + }) Logger.warn( `Reusing description for repeated image ${imageName} on page ${pageNum}`, ) } else { - console.log( - "IMAGE DEBUG: Generating new description for", + Logger.debug("Generating new description for image", { imageName, - "describeImages:", describeImages, - ) + }) if (describeImages) { try { - console.log( - "AI DEBUG: Calling describeImageWithllm for image", + Logger.debug("Calling describeImageWithllm for image", { imageName, - ) + }) description = await describeImageWithllm(buffer) - console.log( - "AI DEBUG: Got description from AI for", + Logger.debug("Got description from AI for image", { imageName, - "description:", description, - ) + }) } catch (e) { Logger.warn( `describeImageWithllm failed for ${imageName}: ${e instanceof Error ? e.message : e}`, ) description = "This is an image from the PDF." - console.log( - "IMAGE DEBUG: Fallback description used due to AI error", - ) + Logger.debug("Using fallback description due to AI error") } } else { description = "This is an image." - console.log( - "IMAGE DEBUG: Using default description (describeImages=false)", - ) + Logger.debug("Using default description (describeImages=false)") } if ( description === "No description returned." || description === "Image is not worth describing." ) { - console.log( - "IMAGE DEBUG: Replacing insufficient description for", + Logger.debug("Replacing insufficient description", { imageName, - "previous:", - description, - ) + previousDescription: description, + }) Logger.warn( `${description} ${imageName} on page ${pageNum}`, ) description = "Image extracted from PDF page." } seenHashDescriptions.set(imageHash, description) - console.log( - "IMAGE DEBUG: Cached new description for", + Logger.debug("Cached new description for image", { imageName, - "description:", description, - ) + }) } try { @@ -1360,15 +1297,12 @@ export async function extractTextAndImagesWithChunksFromPDF( crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` // Logger.info(`OVERLAP DEBUG: Added image placeholder to crossImageOverlap. After: "${crossImageOverlap}"`) // console.log('OVERLAP DEBUG: crossImageOverlap after adding image placeholder:', crossImageOverlap) - console.log( - "IMAGE DEBUG: Added image chunk at position", - globalSeq.value, - { - imageName, - description, - crossImageOverlap, - }, - ) + Logger.debug("Added image chunk at position", { + position: globalSeq.value, + imageName, + description, + crossImageOverlap, + }) globalSeq.value++ imagesOnPage += 1 Logger.debug( @@ -1430,8 +1364,8 @@ export async function extractTextAndImagesWithChunksFromPDF( `PDF processing completed. 
Total text chunks: ${text_chunks.length}, Total image chunks: ${image_chunks.length}`, ) - console.log("FINAL DEBUG: PDF processing completed for", docid) - console.log("FINAL DEBUG: Processing summary:", { + Logger.debug("PDF processing completed for document", { docid }) + Logger.debug("Processing summary", { totalTextChunks: text_chunks.length, totalImageChunks: image_chunks.length, textChunkPositions: text_chunk_pos.length, @@ -1440,10 +1374,10 @@ export async function extractTextAndImagesWithChunksFromPDF( describeImages, }) - console.log("FINAL DEBUG: All text chunks:", text_chunks) - console.log("FINAL DEBUG: All text chunk positions:", text_chunk_pos) - console.log("FINAL DEBUG: All image chunks:", image_chunks) - console.log("FINAL DEBUG: All image chunk positions:", image_chunk_pos) + Logger.debug("All text chunks", { text_chunks }) + Logger.debug("All text chunk positions", { text_chunk_pos }) + Logger.debug("All image chunks", { image_chunks }) + Logger.debug("All image chunk positions", { image_chunk_pos }) return { text_chunks, image_chunks, @@ -1451,7 +1385,7 @@ export async function extractTextAndImagesWithChunksFromPDF( image_chunk_pos, } } finally { - console.log("Calling destroy") + Logger.debug("Calling PDF document destroy") await pdfDocument.destroy() } } From 498c544dd79a4c6dd996248bb35fcce91400bdd0 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 15:18:12 +0530 Subject: [PATCH 4/9] comment fixes --- server/pdfChunks.ts | 63 +++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index f3b5675e0..9a40135be 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -41,7 +41,6 @@ export function normalizeText(input: string): string { return normalized.trim() } - // 2. Smart letter-spacing collapse (per line) function smartDespaceLine(line: string): string { @@ -99,7 +98,6 @@ function smartDespaceLine(line: string): string { return out.join("") } - // 3. High-level text cleaner export function cleanText(input: string): string { @@ -148,10 +146,8 @@ export function cleanText(input: string): string { return s.trim() } - // 4. 
Matrix transformation utilities - /** * Multiply two 2D transformation matrices * Each matrix is represented as [a, b, c, d, e, f] corresponding to: @@ -644,7 +640,7 @@ export async function extractTextAndImagesWithChunksFromPDF( let imageDict: any | null = null let isInline = false // Inline image may directly carry data in args - Logger.debug("Image operator details", { + Logger.debug("Image operator details", { args: args.length, fnId, paintInlineImageXObject: PDFJS.OPS.paintInlineImageXObject, @@ -696,7 +692,9 @@ export async function extractTextAndImagesWithChunksFromPDF( // Ensure imageDict is valid before processing if (!imageDict || typeof imageDict !== "object") { - Logger.debug("imageDict is null or invalid, skipping to crop fallback") + Logger.debug( + "imageDict is null or invalid, skipping to crop fallback", + ) // This will fall through to the crop fallback logic below } else { try { @@ -963,9 +961,12 @@ export async function extractTextAndImagesWithChunksFromPDF( continue // Skip small images } - Logger.debug("Image passed all filters, proceeding with processing", { - imageName, - }) + Logger.debug( + "Image passed all filters, proceeding with processing", + { + imageName, + }, + ) let uint8Data: Uint8Array if (rawData instanceof Uint8Array) { @@ -1160,10 +1161,13 @@ export async function extractTextAndImagesWithChunksFromPDF( }) if (!type) { - Logger.debug("Could not determine MIME type, using default", { - imageName, - default: "image/png", - }) + Logger.debug( + "Could not determine MIME type, using default", + { + imageName, + default: "image/png", + }, + ) Logger.warn( `Could not determine MIME type for ${imageName}. Defaulting to image/png`, ) @@ -1176,10 +1180,9 @@ export async function extractTextAndImagesWithChunksFromPDF( supportedMimes: Array.from( DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, ), - isSupported: - DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( - type.mime, - ), + isSupported: DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( + type.mime, + ), }) if ( @@ -1199,9 +1202,12 @@ export async function extractTextAndImagesWithChunksFromPDF( continue } - Logger.debug("MIME type check passed, proceeding with processing", { - imageName, - }) + Logger.debug( + "MIME type check passed, proceeding with processing", + { + imageName, + }, + ) // buffer already created above const imageHash = crypto @@ -1227,9 +1233,12 @@ export async function extractTextAndImagesWithChunksFromPDF( }) if (describeImages) { try { - Logger.debug("Calling describeImageWithllm for image", { - imageName, - }) + Logger.debug( + "Calling describeImageWithllm for image", + { + imageName, + }, + ) description = await describeImageWithllm(buffer) Logger.debug("Got description from AI for image", { imageName, @@ -1240,11 +1249,15 @@ export async function extractTextAndImagesWithChunksFromPDF( `describeImageWithllm failed for ${imageName}: ${e instanceof Error ? e.message : e}`, ) description = "This is an image from the PDF." - Logger.debug("Using fallback description due to AI error") + Logger.debug( + "Using fallback description due to AI error", + ) } } else { description = "This is an image." - Logger.debug("Using default description (describeImages=false)") + Logger.debug( + "Using default description (describeImages=false)", + ) } if ( description === "No description returned." 
|| From 5ab007e3cf3354aa1dd67bc5f183adf100588cd9 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 15:21:03 +0530 Subject: [PATCH 5/9] comment fixes --- server/integrations/microsoft/index.ts | 5 +++-- server/pdfChunks.ts | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/server/integrations/microsoft/index.ts b/server/integrations/microsoft/index.ts index 1e7926146..6b0042d8c 100644 --- a/server/integrations/microsoft/index.ts +++ b/server/integrations/microsoft/index.ts @@ -216,11 +216,12 @@ const insertCalendarEvents = async ( } // Check for next page - deltaToken = (response["@odata.deltaLink"])? response["@odata.deltaLink"] : deltaToken + deltaToken = response["@odata.deltaLink"] + ? response["@odata.deltaLink"] + : deltaToken if (response["@odata.nextLink"]) { // More pages available, continue with next page nextLink = response["@odata.nextLink"] - } else { // No more data nextLink = undefined diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 9a40135be..2f33a9a4f 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -419,8 +419,8 @@ export async function extractTextAndImagesWithChunksFromPDF( } pushPara() - // Clean and filter - return paragraphs.map(cleanText).filter((p) => p.length > 0) + // Filter raw paragraphs - check trimmed length but don't apply full cleaning yet + return paragraphs.filter((p) => p.trim().length > 0) } // Extract text from operators as fallback for edge cases From 9f816a0b106ce771603eaa6c7f90a9ad4c88813e Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 21:35:33 +0530 Subject: [PATCH 6/9] comment fixes --- server/pdfChunks.ts | 99 +++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 53 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 2f33a9a4f..cc00df987 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -308,18 +308,18 @@ export async function extractTextAndImagesWithChunksFromPDF( docid: string = crypto.randomUUID(), extractImages: boolean = false, describeImages: boolean = true, + includeImageMarkersInText: boolean = true, ): Promise<{ text_chunks: string[] image_chunks: string[] text_chunk_pos: number[] image_chunk_pos: number[] }> { - // Sanitize docid for safe filesystem use - const safeDocId = docid.replace(/[^a-zA-Z0-9._-]/g, "_") Logger.debug("Starting processing with parameters", { docid, extractImages, describeImages, + includeImageMarkersInText, dataSize: data.length, }) @@ -358,10 +358,10 @@ export async function extractTextAndImagesWithChunksFromPDF( // Use object to pass by reference for sequence counter let globalSeq = { value: 0 } - let crossImageOverlap = "" // Track overlap across images + // Track overlap across pages to maintain continuity + let pageOverlap = "" - // Logger.info("OVERLAP DEBUG: Initialized crossImageOverlap as empty string") - // console.log('OVERLAP DEBUG: Starting PDF processing with initial crossImageOverlap:', crossImageOverlap) + // Overlap is now tracked page-to-page only Logger.info(`PDF has ${pdfDocument.numPages} pages`) @@ -499,15 +499,24 @@ export async function extractTextAndImagesWithChunksFromPDF( fallbackLines, ) - let currentParagraph = "" // kept for image-flow flush, but not used for text let textOperatorCount = (await page.getTextContent()).items.length + // Prepend previous page overlap to the first paragraph for continuity + if (pageOverlap && paragraphs.length > 0) { + paragraphs[0] = `${pageOverlap} 
${paragraphs[0]}` + pageOverlap = "" + } else if (pageOverlap) { + paragraphs = [pageOverlap] + pageOverlap = "" + } + Logger.debug("Text extraction summary for page", { pageNum, primaryParagraphs: primaryParagraphs.length, fallbackLines: fallbackLines.length, finalParagraphs: paragraphs.length, textOperatorCount, + initialPageOverlap: pageOverlap, }) // Helper: try to resolve image object by name directly from page.objs @@ -537,16 +546,7 @@ export async function extractTextAndImagesWithChunksFromPDF( ] const ctmStack: [number, number, number, number, number, number][] = [] - // Do not inject crossImageOverlap into text paragraphs here - // console.log('OVERLAP DEBUG: Page', pageNum, 'crossImageOverlap at start:', crossImageOverlap) - // Helper to flush currentParagraph into paragraphs array - const flushParagraph = () => { - if (currentParagraph.trim().length > 0) { - paragraphs.push(currentParagraph.trim()) - currentParagraph = "" - } - } let imagesOnPage = 0 let vectorOpsDetected = false @@ -623,10 +623,6 @@ export async function extractTextAndImagesWithChunksFromPDF( imageName: args[0], }) - // Do not process text per-image anymore; text is processed once per page. - // Maintain crossImageOverlap continuity by keeping placeholders only. - flushParagraph() - // Extract image buffer const imageName = typeof args?.[0] === "string" @@ -753,7 +749,10 @@ export async function extractTextAndImagesWithChunksFromPDF( description === "No description returned." || description === "Image is not worth describing." ) { - description = "Image extracted from PDF page." + Logger.warn( + `Skipping image with poor description: ${imageName} on page ${pageNum}`, + ) + break } seenHashDescriptions.set(imageHash, description) } @@ -762,7 +761,7 @@ export async function extractTextAndImagesWithChunksFromPDF( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, safeDocId) + const outputDir = path.join(baseDir, docid) await fsPromises.mkdir(outputDir, { recursive: true, }) @@ -787,7 +786,10 @@ export async function extractTextAndImagesWithChunksFromPDF( } image_chunks.push(description) image_chunk_pos.push(globalSeq.value) - crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + if (includeImageMarkersInText) { + text_chunks.push(`[[IMG#${globalSeq.value}]]`) + text_chunk_pos.push(globalSeq.value) + } globalSeq.value++ imagesOnPage += 1 Logger.debug( @@ -847,7 +849,10 @@ export async function extractTextAndImagesWithChunksFromPDF( description === "No description returned." || description === "Image is not worth describing." ) { - description = "Image extracted from PDF page." 
+ Logger.warn( + `Skipping image with poor description: ${imageName} on page ${pageNum}`, + ) + break } seenHashDescriptions.set(imageHash, description) } @@ -856,7 +861,7 @@ export async function extractTextAndImagesWithChunksFromPDF( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, safeDocId) + const outputDir = path.join(baseDir, docid) await fsPromises.mkdir(outputDir, { recursive: true, }) @@ -880,7 +885,10 @@ export async function extractTextAndImagesWithChunksFromPDF( } image_chunks.push(description) image_chunk_pos.push(globalSeq.value) - crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + if (includeImageMarkersInText) { + text_chunks.push(`[[IMG#${globalSeq.value}]]`) + text_chunk_pos.push(globalSeq.value) + } globalSeq.value++ imagesOnPage += 1 Logger.debug( @@ -1260,17 +1268,18 @@ export async function extractTextAndImagesWithChunksFromPDF( ) } if ( + !description || description === "No description returned." || description === "Image is not worth describing." ) { - Logger.debug("Replacing insufficient description", { + Logger.debug("Skipping image with insufficient description", { imageName, previousDescription: description, }) Logger.warn( - `${description} ${imageName} on page ${pageNum}`, + `Skipping image with poor description: ${imageName} on page ${pageNum}`, ) - description = "Image extracted from PDF page." + continue } seenHashDescriptions.set(imageHash, description) Logger.debug("Cached new description for image", { @@ -1284,7 +1293,7 @@ export async function extractTextAndImagesWithChunksFromPDF( const baseDir = path.resolve( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, safeDocId) + const outputDir = path.join(baseDir, docid) await fsPromises.mkdir(outputDir, { recursive: true }) const imageFilename = `${globalSeq.value}.${type.ext || "png"}` @@ -1305,16 +1314,15 @@ export async function extractTextAndImagesWithChunksFromPDF( image_chunks.push(description) image_chunk_pos.push(globalSeq.value) - // Logger.info(`OVERLAP DEBUG: Adding image placeholder to crossImageOverlap. Before: "${crossImageOverlap}"`) - // console.log('OVERLAP DEBUG: crossImageOverlap before adding image placeholder:', crossImageOverlap) - crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` - // Logger.info(`OVERLAP DEBUG: Added image placeholder to crossImageOverlap. 
After: "${crossImageOverlap}"`) - // console.log('OVERLAP DEBUG: crossImageOverlap after adding image placeholder:', crossImageOverlap) + if (includeImageMarkersInText) { + text_chunks.push(`[[IMG#${globalSeq.value}]]`) + text_chunk_pos.push(globalSeq.value) + } + // Removed cross-image overlap placeholder handling Logger.debug("Added image chunk at position", { position: globalSeq.value, imageName, description, - crossImageOverlap, }) globalSeq.value++ imagesOnPage += 1 @@ -1338,8 +1346,7 @@ export async function extractTextAndImagesWithChunksFromPDF( // Vector snapshot functionality removed (no longer creating fallback canvas) - // End of page: flush remaining paragraph and process paragraphs - flushParagraph() + // End of page: process paragraphs const overlapText = processTextParagraphs( paragraphs, text_chunks, @@ -1347,22 +1354,8 @@ export async function extractTextAndImagesWithChunksFromPDF( globalSeq, ) - // Update cross-image overlap - APPEND instead of REPLACE to preserve image placeholders - // Logger.info(`OVERLAP DEBUG: End of page ${pageNum} - processing final overlap update`) - // console.log('OVERLAP DEBUG: Page', pageNum, 'end - overlapText from processTextParagraphs:', overlapText) - // console.log('OVERLAP DEBUG: Page', pageNum, 'end - crossImageOverlap before final update:', crossImageOverlap) - if (overlapText.trim()) { - // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - overlapText has content, updating crossImageOverlap`) - const previousCrossImageOverlap = crossImageOverlap - crossImageOverlap = crossImageOverlap - ? `${crossImageOverlap} ${overlapText}` - : overlapText - // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - crossImageOverlap updated from "${previousCrossImageOverlap}" to "${crossImageOverlap}"`) - // console.log('OVERLAP DEBUG: Page', pageNum, 'end - crossImageOverlap after final update:', crossImageOverlap) - } else { - // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - overlapText is empty, no update to crossImageOverlap`) - // console.log('OVERLAP DEBUG: Page', pageNum, 'end - no update to crossImageOverlap (overlapText empty)') - } + // Store overlap for continuity to the next page + pageOverlap = overlapText.trim() Logger.debug( `Page ${pageNum} completed. Text operators found: ${textOperatorCount}, Current text chunks: ${text_chunks.length}, Current image chunks: ${image_chunks.length}`, From 2f497d00d4810e996bb63cc49782ea84e0a181ae Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Mon, 15 Sep 2025 14:49:51 +0530 Subject: [PATCH 7/9] comment fixes --- server/pdfChunks.ts | 188 ++++++++++++++++++++++++-------------------- 1 file changed, 101 insertions(+), 87 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index cc00df987..0f01a1b55 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -546,8 +546,6 @@ export async function extractTextAndImagesWithChunksFromPDF( ] const ctmStack: [number, number, number, number, number, number][] = [] - - let imagesOnPage = 0 let vectorOpsDetected = false for (let i = 0; i < opList.fnArray.length; i++) { @@ -694,6 +692,77 @@ export async function extractTextAndImagesWithChunksFromPDF( // This will fall through to the crop fallback logic below } else { try { + const width: number = (imageDict.width ?? + imageDict.w) as number + const height: number = (imageDict.height ?? + imageDict.h) as number + const kind = + imageDict.kind ?? imageDict.imageKind ?? 
imageDict.ImageKind + // data may live in imageDict.data, imageDict.imgData.data, or imageDict.bytes + let rawData: any = + imageDict.data ?? + imageDict.bytes ?? + (imageDict.imgData ? imageDict.imgData.data : undefined) + + Logger.debug("Full image details", { + imageName, + width, + height, + kind, + dataLength: rawData ? rawData.length : null, + dataSizeMB: rawData + ? (rawData.length / (1024 * 1024)).toFixed(2) + : null, + maxAllowedSizeMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + minDimension: MIN_IMAGE_DIM_PX, + isValidDimensions: width > 0 && height > 0, + meetsMinSize: + width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, + withinSizeLimit: rawData + ? rawData.length <= + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + : false, + isInline, + }) + + if (!width || !height || width <= 0 || height <= 0) { + Logger.debug("Skipped image with invalid dimensions", { + imageName, + width, + height, + }) + continue + } + + if ( + rawData && + rawData.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn("Skipped large image", { + imageName, + actualSizeMB: (rawData.length / (1024 * 1024)).toFixed(2), + maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + }) + continue + } + + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + Logger.debug("Skipped small image", { + imageName, + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) + continue // Skip small images + } + + Logger.debug( + "Image passed all filters, proceeding with processing", + { + imageName, + }, + ) // Fast paths for Canvas or Image-like objects returned by page.objs const isCanvasLike = (obj: any) => obj && @@ -719,10 +788,15 @@ export async function extractTextAndImagesWithChunksFromPDF( }) } else { const buffer = c.toBuffer("image/png") + // Run all filters BEFORE attempting LLM description if ( - buffer.length <= + buffer.length > DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 ) { + Logger.warn( + `Skipping objs/canvas image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, + ) + } else { // @ts-ignore let type = await imageType(buffer) if (!type) type = { mime: "image/png", ext: "png" } @@ -744,6 +818,7 @@ export async function extractTextAndImagesWithChunksFromPDF( } catch { // ignore } + // Check description quality after LLM call if ( !description || description === "No description returned." || @@ -797,10 +872,6 @@ export async function extractTextAndImagesWithChunksFromPDF( ) break } - } else { - Logger.warn( - `Skipping objs/canvas image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, - ) } } } @@ -819,10 +890,22 @@ export async function extractTextAndImagesWithChunksFromPDF( } else { const cnv = createCanvas(width, height) const cctx = cnv.getContext("2d") + try { + // @ts-ignore draw directly cctx.drawImage(imgLike, 0, 0) const buffer = cnv.toBuffer("image/png") + // Run all filters BEFORE attempting LLM description + if ( + buffer.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn( + `Skipping objs/image image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, + ) + break + } // @ts-ignore let type = await imageType(buffer) if (!type) type = { mime: "image/png", ext: "png" } @@ -844,6 +927,7 @@ export async function extractTextAndImagesWithChunksFromPDF( } catch { // ignore } + // Check description quality after LLM call if ( !description || description === "No description returned." 
|| @@ -904,78 +988,6 @@ export async function extractTextAndImagesWithChunksFromPDF( } } - const width: number = (imageDict.width ?? - imageDict.w) as number - const height: number = (imageDict.height ?? - imageDict.h) as number - const kind = - imageDict.kind ?? imageDict.imageKind ?? imageDict.ImageKind - // data may live in imageDict.data, imageDict.imgData.data, or imageDict.bytes - let rawData: any = - imageDict.data ?? - imageDict.bytes ?? - (imageDict.imgData ? imageDict.imgData.data : undefined) - - Logger.debug("Full image details", { - imageName, - width, - height, - kind, - dataLength: rawData ? rawData.length : null, - dataSizeMB: rawData - ? (rawData.length / (1024 * 1024)).toFixed(2) - : null, - maxAllowedSizeMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, - minDimension: MIN_IMAGE_DIM_PX, - isValidDimensions: width > 0 && height > 0, - meetsMinSize: - width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, - withinSizeLimit: rawData - ? rawData.length <= - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 - : false, - isInline, - }) - - if (!width || !height || width <= 0 || height <= 0) { - Logger.debug("Skipped image with invalid dimensions", { - imageName, - width, - height, - }) - continue - } - - if ( - rawData && - rawData.length > - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 - ) { - Logger.warn("Skipped large image", { - imageName, - actualSizeMB: (rawData.length / (1024 * 1024)).toFixed(2), - maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, - }) - continue - } - - if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { - Logger.debug("Skipped small image", { - imageName, - width, - height, - minRequired: MIN_IMAGE_DIM_PX, - }) - continue // Skip small images - } - - Logger.debug( - "Image passed all filters, proceeding with processing", - { - imageName, - }, - ) - let uint8Data: Uint8Array if (rawData instanceof Uint8Array) { uint8Data = rawData @@ -1211,7 +1223,7 @@ export async function extractTextAndImagesWithChunksFromPDF( } Logger.debug( - "MIME type check passed, proceeding with processing", + "All filters passed, proceeding with image description", { imageName, }, @@ -1267,15 +1279,20 @@ export async function extractTextAndImagesWithChunksFromPDF( "Using default description (describeImages=false)", ) } + + // Check description quality after LLM call if ( !description || description === "No description returned." || description === "Image is not worth describing." 
) { - Logger.debug("Skipping image with insufficient description", { - imageName, - previousDescription: description, - }) + Logger.debug( + "Skipping image with insufficient description", + { + imageName, + previousDescription: description, + }, + ) Logger.warn( `Skipping image with poor description: ${imageName} on page ${pageNum}`, ) @@ -1343,9 +1360,6 @@ export async function extractTextAndImagesWithChunksFromPDF( break } } - - // Vector snapshot functionality removed (no longer creating fallback canvas) - // End of page: process paragraphs const overlapText = processTextParagraphs( paragraphs, From 0c98b7a32e9ce042f5682ccc979a3c3569579d38 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Mon, 15 Sep 2025 18:30:11 +0530 Subject: [PATCH 8/9] removing old logic --- server/pdfChunks.ts | 109 +++----------------------------------------- 1 file changed, 6 insertions(+), 103 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 0f01a1b55..31fa9a129 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -16,8 +16,7 @@ const qcmsWasmPath = path.join(__dirname, "../node_modules/pdfjs-dist/wasm/") + "/" const seenHashDescriptions = new Map() const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || "150", 10) -// Minimum line height used for calculating line break detection tolerance (in PDF units) -const MIN_LINE_HEIGHT_FOR_TOLERANCE = 10 + const Logger = getLogger(Subsystem.Integrations).child({ module: "pdfChunks", @@ -200,42 +199,7 @@ function validateTextItem(item: any): boolean { ) } -/** - * Extract text from various PDF.js text operators with enhanced validation - */ -function extractTextFromArgs(args: any[]): string { - let text = "" - - if (!args || args.length === 0) { - return text - } - - const firstArg = args[0] - - if (typeof firstArg === "string") { - text = firstArg - } else if (Array.isArray(firstArg)) { - for (const item of firstArg) { - if (typeof item === "string") { - text += item - } else if (typeof item === "number") { - // Skip spacing numbers in text arrays - continue - } else if (item && typeof item === "object") { - // Enhanced validation using validateTextItem function - if (validateTextItem(item)) { - text += item.str - } else if ("unicode" in item && typeof item.unicode === "string") { - text += item.unicode - } - } - } - } - // Additional validation: ensure we return clean, valid text - const result = typeof text === "string" ? 
text : "" - return result -} /** * Process collected paragraphs into chunks and add to results @@ -423,62 +387,7 @@ export async function extractTextAndImagesWithChunksFromPDF( return paragraphs.filter((p) => p.trim().length > 0) } - // Extract text from operators as fallback for edge cases - const extractFallbackTextFromOperators = (opList: any): string[] => { - const fallbackLines: string[] = [] - - for (let i = 0; i < opList.fnArray.length; i++) { - const fnId = opList.fnArray[i] - const args = opList.argsArray[i] - - // Handle text operators - if ( - fnId === PDFJS.OPS.showText || - fnId === PDFJS.OPS.showSpacedText || - fnId === PDFJS.OPS.nextLineShowText || - fnId === PDFJS.OPS.nextLineSetSpacingShowText - ) { - const extractedText = extractTextFromArgs(args) - if (extractedText.trim()) { - fallbackLines.push(extractedText.trim()) - } - } - } - - return fallbackLines - } - - // Combine and deduplicate text from multiple sources - const combineTextSources = ( - primaryParagraphs: string[], - fallbackLines: string[], - ): string[] => { - if (fallbackLines.length === 0) { - return primaryParagraphs - } - - const primaryText = primaryParagraphs.join(" ").toLowerCase() - const additionalLines: string[] = [] - - // Add fallback lines that aren't already covered by primary extraction - for (const line of fallbackLines) { - const cleanLine = line.trim() - if ( - cleanLine.length > 2 && // Skip very short strings - !primaryText.includes(cleanLine.toLowerCase()) - ) { - additionalLines.push(cleanLine) - } - } - // If we found additional text, append it as a new paragraph - if (additionalLines.length > 0) { - const additionalParagraph = additionalLines.join(" ") - return [...primaryParagraphs, additionalParagraph] - } - - return primaryParagraphs - } for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) { Logger.debug(`Processing page ${pageNum}`) @@ -488,16 +397,9 @@ export async function extractTextAndImagesWithChunksFromPDF( const opList = await page.getOperatorList() // Use textContent-based paragraphs for this page as primary source - let primaryParagraphs: string[] = await buildParagraphsFromPage(page) - - // Extract fallback text from operators for edge cases - const fallbackLines = extractFallbackTextFromOperators(opList) + let paragraphs: string[] = await buildParagraphsFromPage(page) - // Combine both sources, prioritizing primary extraction - let paragraphs: string[] = combineTextSources( - primaryParagraphs, - fallbackLines, - ) + let textOperatorCount = (await page.getTextContent()).items.length @@ -512,8 +414,8 @@ export async function extractTextAndImagesWithChunksFromPDF( Logger.debug("Text extraction summary for page", { pageNum, - primaryParagraphs: primaryParagraphs.length, - fallbackLines: fallbackLines.length, + primaryParagraphs: paragraphs.length, + finalParagraphs: paragraphs.length, textOperatorCount, initialPageOverlap: pageOverlap, @@ -1394,6 +1296,7 @@ export async function extractTextAndImagesWithChunksFromPDF( describeImages, }) + console.log("All text chunks", { text_chunks }) Logger.debug("All text chunks", { text_chunks }) Logger.debug("All text chunk positions", { text_chunk_pos }) Logger.debug("All image chunks", { image_chunks }) From bb602696dde65a02df9bef9bd09db5901de10f3b Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Mon, 15 Sep 2025 18:32:41 +0530 Subject: [PATCH 9/9] removing old logic --- server/pdfChunks.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/pdfChunks.ts 
b/server/pdfChunks.ts index 31fa9a129..cfe0b67db 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -1296,7 +1296,7 @@ export async function extractTextAndImagesWithChunksFromPDF( describeImages, }) - console.log("All text chunks", { text_chunks }) + Logger.debug("All text chunks", { text_chunks }) Logger.debug("All text chunks", { text_chunks }) Logger.debug("All text chunk positions", { text_chunk_pos }) Logger.debug("All image chunks", { image_chunks })
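
A minimal standalone sketch of the GRAYSCALE_1BPP handling introduced in PATCH 1/9: pdf.js hands back 1-bit-per-pixel grayscale data as a bit-packed buffer (MSB first), and the patch expands each bit into a full RGBA pixel before calling `ctx.putImageData(new ImageData(rgbaData, width, height), 0, 0)` on the node-canvas context. The helper below is illustrative only — the name `unpackGray1bppToRgba` does not appear in the patch — and it mirrors the patch's assumption that the buffer is a continuous bit stream of `Math.ceil((width * height) / 8)` bytes with no per-row byte padding; a source that pads each row to a byte boundary would need the byte index computed per row instead.

// Sketch only: expand MSB-first 1-bpp grayscale bits into RGBA bytes.
function unpackGray1bppToRgba(
  packed: Uint8Array,
  width: number,
  height: number,
): Uint8ClampedArray {
  const rgba = new Uint8ClampedArray(width * height * 4)
  for (let pixelIndex = 0; pixelIndex < width * height; pixelIndex++) {
    const byteIndex = Math.floor(pixelIndex / 8)
    const bitIndex = 7 - (pixelIndex % 8) // MSB first, as in the patch
    const bit =
      byteIndex < packed.length ? (packed[byteIndex] >> bitIndex) & 1 : 0
    const gray = bit ? 255 : 0 // one bit becomes a full black/white pixel
    const dst = pixelIndex * 4
    rgba[dst] = gray // R
    rgba[dst + 1] = gray // G
    rgba[dst + 2] = gray // B
    rgba[dst + 3] = 255 // A
  }
  return rgba
}

Usage mirrors the patched code path: draw the result onto the canvas with `ctx.putImageData(new ImageData(unpackGray1bppToRgba(data, width, height), width, height), 0, 0)` and then encode it via `canvas.toBuffer("image/png")`.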