diff --git a/server/integrations/microsoft/index.ts b/server/integrations/microsoft/index.ts index 1e7926146..6b0042d8c 100644 --- a/server/integrations/microsoft/index.ts +++ b/server/integrations/microsoft/index.ts @@ -216,11 +216,12 @@ const insertCalendarEvents = async ( } // Check for next page - deltaToken = (response["@odata.deltaLink"])? response["@odata.deltaLink"] : deltaToken + deltaToken = response["@odata.deltaLink"] + ? response["@odata.deltaLink"] + : deltaToken if (response["@odata.nextLink"]) { // More pages available, continue with next page nextLink = response["@odata.nextLink"] - } else { // No more data nextLink = undefined diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index c8ab942b9..0f01a1b55 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -15,6 +15,9 @@ const openjpegWasmPath = const qcmsWasmPath = path.join(__dirname, "../node_modules/pdfjs-dist/wasm/") + "/" const seenHashDescriptions = new Map() +const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || "150", 10) +// Minimum line height used for calculating line break detection tolerance (in PDF units) +const MIN_LINE_HEIGHT_FOR_TOLERANCE = 10 const Logger = getLogger(Subsystem.Integrations).child({ module: "pdfChunks", @@ -22,15 +25,168 @@ const Logger = getLogger(Subsystem.Integrations).child({ const PDFJS = pdfjsLib -// Utility function to clean text consistent with chunkTextByParagraph -const cleanText = (str: string): string => { - const normalized = str.replace(/\r\n|\r/g, "\n") - return normalized.replace( - /[\u0000-\u0008\u000B-\u000C\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF]/g, - "", +export function normalizeText(input: string): string { + if (!input) return "" + + let normalized = input.normalize("NFC") + + // Strip control chars except newline/tab + normalized = normalized.replace(/[^\P{C}\n\t]/gu, "") + + // Normalize whitespace + normalized = normalized.replace(/\u00A0/g, " ") // nbsp → space + normalized = 
normalized.replace(/\u200B/g, "") // zero-width space + normalized = normalized.replace(/\t+/g, " ") // tabs → single space + + return normalized.trim() +} + +// 2. Smart letter-spacing collapse (per line) + +function smartDespaceLine(line: string): string { + if (!line) return line + + const parts = line.split(/(\s+)/) + const out: string[] = [] + + const isSingleAllowed = (s: string) => + s.length === 1 && /[\p{L}\p{N}'’]/u.test(s) + + const isSingleLowerLetter = (s: string) => s.length === 1 && /\p{Ll}/u.test(s) + + let i = 0 + while (i < parts.length) { + const tok = parts[i] + + if (!/\s+/.test(tok) && isSingleAllowed(tok)) { + const runTokens: string[] = [tok] + let j = i + 1 + + while ( + j + 1 < parts.length && + parts[j] === " " && + !/\s+/.test(parts[j + 1]) && + isSingleAllowed(parts[j + 1]) + ) { + runTokens.push(parts[j + 1]) + j += 2 + } + + // Join spaced letters like "N A S A" -> "NASA" + if (runTokens.length >= 3) { + out.push(runTokens.join("")) + i = j + continue + } + + // Join two-letter lowercase sequences like "i s" -> "is" + if ( + runTokens.length === 2 && + isSingleLowerLetter(runTokens[0]) && + isSingleLowerLetter(runTokens[1]) + ) { + out.push(runTokens.join("")) + i = j + continue + } + } + + out.push(tok) + i += 1 + } + + return out.join("") +} + +// 3. 
// High-level text cleaner

/**
 * Clean raw extracted text into compact, paragraph-oriented plain text.
 *
 * Steps, in order:
 *  1. Normalize the input with `normalizeText`.
 *  2. Trim spaces/tabs around newlines.
 *  3. Re-join words that were hyphenated across a line break.
 *  4. Turn single newlines (soft wraps) into spaces; runs of two or more
 *     newlines are kept as paragraph breaks.
 *  5. Run `smartDespaceLine` on every line.
 *  6. Remove horizontal whitespace in front of `. , ; : ! ?`.
 *  7. Cap very long runs of spaces.
 *  8. Trim every line and drop empty ones, so paragraphs end up separated
 *     by a single "\n".
 *
 * @param input Raw text to clean.
 * @returns The cleaned text, without leading/trailing whitespace and
 *          without blank lines.
 */
export function cleanText(input: string): string {
  let s = normalizeText(input)

  // Trim spaces/tabs around newlines first, so a hyphen followed by trailing
  // spaces ("exam- \nple") or a wrapped line with leading spaces is still
  // recognised as line-break hyphenation in the next step.
  s = s.replace(/[ \t]*\n[ \t]*/g, "\n")

  // Fix hyphenation across line breaks
  s = s.replace(/(\p{L})-\n(\p{L})/gu, "$1$2")

  // Turn intra-paragraph newlines into spaces, preserve paragraph breaks.
  // One deterministic pass: a run of 2+ newlines is a paragraph break, a
  // lone newline is a soft wrap. No random placeholder round-trip needed.
  s = s.replace(/\n+/g, (run) => (run.length > 1 ? "\n\n" : " "))

  // Apply line-wise despacing
  s = s
    .split("\n")
    .map((line) => smartDespaceLine(line))
    .join("\n")

  // Remove spaces before punctuation. Newlines are deliberately excluded:
  // with a plain `\s+` a paragraph starting with punctuation (".NET", "...")
  // would be glued onto the previous paragraph.
  s = s.replace(/[^\S\n]+([.,;:!?])/g, "$1")

  // Cap extreme space runs, preserve 2–4 spaces.
  // NOTE(review): the replacement literal was whitespace-collapsed in the
  // reviewed copy; capping at 4 spaces matches the stated intent — confirm.
  s = s.replace(/ {5,}/g, " ".repeat(4))

  // Trim lines & drop empties
  s = s
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l.length > 0)
    .join("\n")

  return s.trim()
}

// 4.
Matrix transformation utilities + +/** + * Multiply two 2D transformation matrices + * Each matrix is represented as [a, b, c, d, e, f] corresponding to: + * [a c e] + * [b d f] + * [0 0 1] + */ +function multiplyMatrices( + m1: number[], + m2: number[], +): [number, number, number, number, number, number] { + const [a1, b1, c1, d1, e1, f1] = m1 as [ + number, + number, + number, + number, + number, + number, + ] + const [a2, b2, c2, d2, e2, f2] = m2 as [ + number, + number, + number, + number, + number, + number, + ] + return [ + a1 * a2 + c1 * b2, + b1 * a2 + d1 * b2, + a1 * c2 + c1 * d2, + b1 * c2 + d1 * d2, + a1 * e2 + c1 * f2 + e1, + b1 * e2 + d1 * f2 + f1, + ] +} + +//=== + /** * Validate text item */ @@ -77,7 +233,8 @@ function extractTextFromArgs(args: any[]): string { } // Additional validation: ensure we return clean, valid text - return typeof text === "string" ? text : "" + const result = typeof text === "string" ? text : "" + return result } /** @@ -91,19 +248,32 @@ function processTextParagraphs( globalSeq: { value: number }, overlapBytes: number = 32, ): string { - if (paragraphs.length === 0) return "" + Logger.debug("Processing paragraphs", { count: paragraphs.length }) + + if (paragraphs.length === 0) { + Logger.debug("No paragraphs to process") + return "" + } const cleanedParagraphs = paragraphs .map(cleanText) .filter((p) => p.length > 0) - if (cleanedParagraphs.length === 0) return "" + if (cleanedParagraphs.length === 0) { + Logger.debug("No cleaned paragraphs after filtering") + return "" + } const cleanedText = cleanedParagraphs.join("\n") + // console.log('TEXT DEBUG: Cleaned text length:', cleanedText.length) + // console.log('TEXT DEBUG: Full cleaned text:', cleanedText) + const chunks = chunkTextByParagraph(cleanedText, 512, 128) + // console.log('TEXT DEBUG: Generated chunks count:', chunks.length) for (const chunk of chunks) { text_chunks.push(chunk) text_chunk_pos.push(globalSeq.value) + // console.log('TEXT DEBUG: Added chunk at 
position', globalSeq.value, 'content:', chunk) globalSeq.value++ } @@ -111,13 +281,25 @@ function processTextParagraphs( // Take the last overlapBytes from the processed text let overlapText = "" let overlapLen = 0 + + // Logger.info(`OVERLAP DEBUG: Calculating overlap text from cleanedText of length ${cleanedText.length}, target bytes: ${overlapBytes}`) + // console.log('OVERLAP DEBUG: Full cleanedText for overlap calculation:', cleanedText) + for (let i = cleanedText.length - 1; i >= 0; i--) { const charBytes = Buffer.byteLength(cleanedText[i], "utf8") - if (overlapLen + charBytes > overlapBytes) break + if (overlapLen + charBytes > overlapBytes) { + // console.log('OVERLAP DEBUG: Stopping overlap calculation at char', i, 'would exceed', overlapBytes, 'bytes (current:', overlapLen, 'char bytes:', charBytes, ')') + break + } overlapText = cleanedText[i] + overlapText overlapLen += charBytes + // console.log('OVERLAP DEBUG: Added char', cleanedText[i], 'to overlap. Current overlap length:', overlapLen, 'bytes, text:', overlapText) } + // console.log('OVERLAP DEBUG: Final calculated overlap text:', overlapText) + // console.log('OVERLAP DEBUG: Final overlap length:', overlapLen, 'bytes') + // Logger.info(`OVERLAP DEBUG: processTextParagraphs returning overlap text: "${overlapText}" (${overlapLen} bytes)`) + return overlapText } @@ -126,13 +308,20 @@ export async function extractTextAndImagesWithChunksFromPDF( docid: string = crypto.randomUUID(), extractImages: boolean = false, describeImages: boolean = true, + includeImageMarkersInText: boolean = true, ): Promise<{ text_chunks: string[] image_chunks: string[] text_chunk_pos: number[] image_chunk_pos: number[] }> { - Logger.info(`Starting PDF processing for: ${docid}`) + Logger.debug("Starting processing with parameters", { + docid, + extractImages, + describeImages, + includeImageMarkersInText, + dataSize: data.length, + }) const loadingTask = PDFJS.getDocument({ data, @@ -169,10 +358,128 @@ export async function 
extractTextAndImagesWithChunksFromPDF( // Use object to pass by reference for sequence counter let globalSeq = { value: 0 } - let crossImageOverlap = "" // Track overlap across images + // Track overlap across pages to maintain continuity + let pageOverlap = "" + + // Overlap is now tracked page-to-page only Logger.info(`PDF has ${pdfDocument.numPages} pages`) + // Robust text extraction using PDF.js textContent API + const buildParagraphsFromPage = async ( + page: pdfjsLib.PDFPageProxy, + ): Promise => { + const textContent = await page.getTextContent({ + includeMarkedContent: false, + disableNormalization: false, + }) + + // Build lines using hasEOL and Y-position changes (handles PPT/DOC exports) + const lines: string[] = [] + let current = "" + let prevY: number | null = null + let prevH: number | null = null + for (const item of textContent.items as any[]) { + const str: string = item && typeof item.str === "string" ? item.str : "" + if (!str) continue + + const tr = Array.isArray(item.transform) ? item.transform : [] + const y = typeof tr[5] === "number" ? tr[5] : null + const h = typeof item.height === "number" ? 
item.height : null + + let newLine = false + if (prevY != null && y != null) { + const tol = Math.max(prevH || 0, h || 0, 10) * 0.4 // dynamic tolerance + if (Math.abs(y - prevY) > tol) newLine = true + } + + if (newLine || (item as any).hasEOL) { + if (current.length > 0) lines.push(current) + current = str + } else { + current += str + } + + prevY = y + prevH = h + } + if (current.trim().length > 0) lines.push(current) + + // Group lines into paragraphs separated by blank lines + const paragraphs: string[] = [] + let buf: string[] = [] + const pushPara = () => { + if (buf.length === 0) return + paragraphs.push(buf.join("\n")) + buf = [] + } + for (const ln of lines) { + if (ln.trim().length === 0) pushPara() + else buf.push(ln) + } + pushPara() + + // Filter raw paragraphs - check trimmed length but don't apply full cleaning yet + return paragraphs.filter((p) => p.trim().length > 0) + } + + // Extract text from operators as fallback for edge cases + const extractFallbackTextFromOperators = (opList: any): string[] => { + const fallbackLines: string[] = [] + + for (let i = 0; i < opList.fnArray.length; i++) { + const fnId = opList.fnArray[i] + const args = opList.argsArray[i] + + // Handle text operators + if ( + fnId === PDFJS.OPS.showText || + fnId === PDFJS.OPS.showSpacedText || + fnId === PDFJS.OPS.nextLineShowText || + fnId === PDFJS.OPS.nextLineSetSpacingShowText + ) { + const extractedText = extractTextFromArgs(args) + if (extractedText.trim()) { + fallbackLines.push(extractedText.trim()) + } + } + } + + return fallbackLines + } + + // Combine and deduplicate text from multiple sources + const combineTextSources = ( + primaryParagraphs: string[], + fallbackLines: string[], + ): string[] => { + if (fallbackLines.length === 0) { + return primaryParagraphs + } + + const primaryText = primaryParagraphs.join(" ").toLowerCase() + const additionalLines: string[] = [] + + // Add fallback lines that aren't already covered by primary extraction + for (const line of 
fallbackLines) { + const cleanLine = line.trim() + if ( + cleanLine.length > 2 && // Skip very short strings + !primaryText.includes(cleanLine.toLowerCase()) + ) { + additionalLines.push(cleanLine) + } + } + + // If we found additional text, append it as a new paragraph + if (additionalLines.length > 0) { + const additionalParagraph = additionalLines.join(" ") + return [...primaryParagraphs, additionalParagraph] + } + + return primaryParagraphs + } + for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) { Logger.debug(`Processing page ${pageNum}`) @@ -180,52 +487,99 @@ export async function extractTextAndImagesWithChunksFromPDF( try { const opList = await page.getOperatorList() - // Hold paragraphs for current page - let paragraphs: string[] = [] - let currentParagraph = "" - let textOperatorCount = 0 + // Use textContent-based paragraphs for this page as primary source + let primaryParagraphs: string[] = await buildParagraphsFromPage(page) + + // Extract fallback text from operators for edge cases + const fallbackLines = extractFallbackTextFromOperators(opList) + + // Combine both sources, prioritizing primary extraction + let paragraphs: string[] = combineTextSources( + primaryParagraphs, + fallbackLines, + ) - // Start with cross-image overlap if available - if (crossImageOverlap && extractImages) { - currentParagraph = crossImageOverlap + " " - crossImageOverlap = "" // Clear after using + let textOperatorCount = (await page.getTextContent()).items.length + + // Prepend previous page overlap to the first paragraph for continuity + if (pageOverlap && paragraphs.length > 0) { + paragraphs[0] = `${pageOverlap} ${paragraphs[0]}` + pageOverlap = "" + } else if (pageOverlap) { + paragraphs = [pageOverlap] + pageOverlap = "" } - // Helper to flush currentParagraph into paragraphs array - const flushParagraph = () => { - if (currentParagraph.trim().length > 0) { - paragraphs.push(currentParagraph.trim()) - currentParagraph = "" + Logger.debug("Text 
extraction summary for page", { + pageNum, + primaryParagraphs: primaryParagraphs.length, + fallbackLines: fallbackLines.length, + finalParagraphs: paragraphs.length, + textOperatorCount, + initialPageOverlap: pageOverlap, + }) + + // Helper: try to resolve image object by name directly from page.objs + const resolveImageByName = async ( + name: string, + ): Promise => { + try { + // Some builds expose has method + // @ts-ignore + if ( + typeof (page.objs as any).has === "function" && + (page.objs as any).has(name) + ) { + // @ts-ignore + return (page.objs as any).get(name) + } + const obj = (page.objs as any).get(name) + return obj || null + } catch (e) { + return null } } + // Track CTM to compute image bounds when image data is not directly retrievable + let currentCTM: [number, number, number, number, number, number] = [ + 1, 0, 0, 1, 0, 0, + ] + const ctmStack: [number, number, number, number, number, number][] = [] + + let imagesOnPage = 0 + let vectorOpsDetected = false for (let i = 0; i < opList.fnArray.length; i++) { const fnId = opList.fnArray[i] const args = opList.argsArray[i] + // Track vector drawing operators (paths, fills, form XObjects) + const isVectorOp = + fnId === PDFJS.OPS.constructPath || + fnId === PDFJS.OPS.stroke || + fnId === PDFJS.OPS.closeStroke || + fnId === PDFJS.OPS.fill || + fnId === PDFJS.OPS.eoFill || + fnId === PDFJS.OPS.fillStroke || + fnId === PDFJS.OPS.eoFillStroke || + fnId === PDFJS.OPS.closeFillStroke || + fnId === PDFJS.OPS.closeEOFillStroke || + fnId === PDFJS.OPS.clip || + fnId === PDFJS.OPS.eoClip || + fnId === PDFJS.OPS.rectangle || + fnId === PDFJS.OPS.shadingFill || + fnId === PDFJS.OPS.rawFillPath || + fnId === PDFJS.OPS.paintFormXObjectBegin || + fnId === PDFJS.OPS.paintFormXObjectEnd + if (isVectorOp) vectorOpsDetected = true + switch (fnId) { case PDFJS.OPS.showText: - case PDFJS.OPS.showSpacedText: { - const text = extractTextFromArgs(args) - if (text) { - currentParagraph += text + " " - textOperatorCount++ - } 
- break - } - // Handle line break operators - case PDFJS.OPS.nextLine: { - flushParagraph() - break - } + case PDFJS.OPS.showSpacedText: + case PDFJS.OPS.nextLine: case PDFJS.OPS.nextLineShowText: case PDFJS.OPS.nextLineSetSpacingShowText: { - const text = extractTextFromArgs(args) - if (text) { - currentParagraph += text + " " - textOperatorCount++ - } - flushParagraph() + // Text is now handled by combined extraction approach + // Operator-level extraction happens in extractFallbackTextFromOperators break } // Handle matrix and positioning operators that might indicate paragraph breaks @@ -234,6 +588,25 @@ export async function extractTextAndImagesWithChunksFromPDF( case PDFJS.OPS.moveText: { // These might indicate significant positioning changes // For now, we'll be conservative and not flush, but this could be adjusted + if (fnId === PDFJS.OPS.transform) { + try { + if ( + Array.isArray(args) && + args.length >= 6 && + args.every((n: any) => typeof n === "number") + ) { + currentCTM = multiplyMatrices(currentCTM, args as number[]) + } + } catch {} + } + break + } + case PDFJS.OPS.save: { + ctmStack.push([...currentCTM]) + break + } + case PDFJS.OPS.restore: { + if (ctmStack.length) currentCTM = ctmStack.pop()! break } // Handle image operators @@ -241,276 +614,744 @@ export async function extractTextAndImagesWithChunksFromPDF( case extractImages ? PDFJS.OPS.paintImageXObjectRepeat : null: case extractImages ? PDFJS.OPS.paintInlineImageXObject : null: case extractImages ? 
PDFJS.OPS.paintImageMaskXObject : null: { - // Flush any pending text paragraphs before image - flushParagraph() - - // Process accumulated paragraphs and capture overlap - const overlapText = processTextParagraphs( - paragraphs, - text_chunks, - text_chunk_pos, - globalSeq, - ) - paragraphs = [] // Clear paragraphs after processing - - // Store overlap for continuation after image - crossImageOverlap = overlapText + Logger.debug("Image operator detected", { + pageNum, + extractImages, + operatorType: fnId, + imageName: args[0], + }) // Extract image buffer - const imageName = args[0] - // Small delay to ensure image object has a chance to resolve - let imageDict - try { - imageDict = page.objs.get(imageName) - } catch (err) { + const imageName = + typeof args?.[0] === "string" + ? args[0] + : args?.[0] && + typeof args[0] === "object" && + typeof args[0].name === "string" + ? args[0].name + : args?.[0] + Logger.debug("Processing image", { imageName }) + let imageDict: any | null = null + let isInline = false + // Inline image may directly carry data in args + Logger.debug("Image operator details", { + args: args.length, + fnId, + paintInlineImageXObject: PDFJS.OPS.paintInlineImageXObject, + }) + if (fnId === PDFJS.OPS.paintInlineImageXObject) { + Logger.debug("Detected inline image data in args") + const candidate = Array.isArray(args) + ? args.find( + (a: any) => + a && + typeof a === "object" && + ("data" in a || "imgData" in a) && + "width" in a && + "height" in a, + ) + : null + if (candidate) { + imageDict = candidate + isInline = true + } + } + Logger.debug("Initial imageDict resolved", { + hasImageDict: !!imageDict, + isInline, + }) + if ( + !imageDict && + (typeof imageName === "string" || + (imageName && + typeof imageName === "object" && + typeof imageName.name === "string")) + ) { + const name = + typeof imageName === "string" ? 
imageName : imageName.name + imageDict = await resolveImageByName(name) + } + + // If we cannot get the raw image object, skip this image + if (!imageDict) { Logger.debug( - `Image ${imageName} not resolved or failed to decode on page ${pageNum}: ${err instanceof Error ? err.message : err}`, + `No image object available for ${imageName} on page ${pageNum} — skipping`, ) continue } - if (!imageDict || !imageDict.data) { + Logger.debug("Resolved imageDict", { + hasImageDict: !!imageDict, + isInline, + }) + + // Ensure imageDict is valid before processing + if (!imageDict || typeof imageDict !== "object") { Logger.debug( - `No image data found for ${imageName} on page ${pageNum}`, + "imageDict is null or invalid, skipping to crop fallback", ) - continue - } + // This will fall through to the crop fallback logic below + } else { + try { + const width: number = (imageDict.width ?? + imageDict.w) as number + const height: number = (imageDict.height ?? + imageDict.h) as number + const kind = + imageDict.kind ?? imageDict.imageKind ?? imageDict.ImageKind + // data may live in imageDict.data, imageDict.imgData.data, or imageDict.bytes + let rawData: any = + imageDict.data ?? + imageDict.bytes ?? + (imageDict.imgData ? imageDict.imgData.data : undefined) - try { - const { width, height, kind, data } = imageDict + Logger.debug("Full image details", { + imageName, + width, + height, + kind, + dataLength: rawData ? rawData.length : null, + dataSizeMB: rawData + ? (rawData.length / (1024 * 1024)).toFixed(2) + : null, + maxAllowedSizeMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + minDimension: MIN_IMAGE_DIM_PX, + isValidDimensions: width > 0 && height > 0, + meetsMinSize: + width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, + withinSizeLimit: rawData + ? 
rawData.length <= + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + : false, + isInline, + }) - if (!width || !height || width <= 0 || height <= 0) { - Logger.debug( - `Invalid image dimensions for ${imageName}: ${width}x${height}`, - ) - continue - } + if (!width || !height || width <= 0 || height <= 0) { + Logger.debug("Skipped image with invalid dimensions", { + imageName, + width, + height, + }) + continue + } - if ( - data.length > - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 - ) { - Logger.warn( - `Skipping large image (${(data.length / (1024 * 1024)).toFixed(2)} MB): ${imageName}`, - ) - continue - } + if ( + rawData && + rawData.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn("Skipped large image", { + imageName, + actualSizeMB: (rawData.length / (1024 * 1024)).toFixed(2), + maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + }) + continue + } - if (width < 250 || height < 250) continue // Skip small images - - let uint8Data: Uint8Array - if (data instanceof Uint8Array) { - uint8Data = data - } else if ( - data && - typeof data === "object" && - data.length !== undefined - ) { - uint8Data = new Uint8Array(data) - } else { - Logger.debug(`Invalid image data format for ${imageName}`) - continue - } + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + Logger.debug("Skipped small image", { + imageName, + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) + continue // Skip small images + } - const canvas = createCanvas(width, height) - const ctx = canvas.getContext("2d") - let imageProcessed = false - - switch (kind) { - case pdfjsLib.ImageKind.GRAYSCALE_1BPP: - case pdfjsLib.ImageKind.RGB_24BPP: - case pdfjsLib.ImageKind.RGBA_32BPP: { - const bytesPerPixel = - kind === pdfjsLib.ImageKind.RGBA_32BPP - ? 4 - : kind === pdfjsLib.ImageKind.RGB_24BPP - ? 
3 - : 1 - const expectedLength = width * height * bytesPerPixel - - if (uint8Data.length >= expectedLength) { - const rgbaData = new Uint8ClampedArray(width * height * 4) - for (let i = 0; i < width * height; i++) { - const srcIdx = i * bytesPerPixel - const dstIdx = i * 4 - if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { - const gray = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 - rgbaData[dstIdx] = gray // R - rgbaData[dstIdx + 1] = gray // G - rgbaData[dstIdx + 2] = gray // B - rgbaData[dstIdx + 3] = 255 // A - } else if (kind === pdfjsLib.ImageKind.RGB_24BPP) { - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = 255 // A - } else { - // RGBA_32BPP - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = - srcIdx + 3 < uint8Data.length - ? 
uint8Data[srcIdx + 3] - : 255 // A + Logger.debug( + "Image passed all filters, proceeding with processing", + { + imageName, + }, + ) + // Fast paths for Canvas or Image-like objects returned by page.objs + const isCanvasLike = (obj: any) => + obj && + typeof obj.getContext === "function" && + typeof obj.width === "number" && + typeof obj.height === "number" + const isImageLike = (obj: any) => + obj && + typeof obj.width === "number" && + typeof obj.height === "number" && + typeof obj.getContext !== "function" + + if (isCanvasLike(imageDict)) { + const c: any = imageDict + const width: number = c.width + const height: number = c.height + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + Logger.debug("Skipped small canvas image", { + imageName, + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) + } else { + const buffer = c.toBuffer("image/png") + // Run all filters BEFORE attempting LLM description + if ( + buffer.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn( + `Skipping objs/canvas image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, + ) + } else { + // @ts-ignore + let type = await imageType(buffer) + if (!type) type = { mime: "image/png", ext: "png" } + if ( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) + ) { + const imageHash = crypto + .createHash("md5") + .update(new Uint8Array(buffer)) + .digest("hex") + let description = "This is an image." + if (seenHashDescriptions.has(imageHash)) { + description = seenHashDescriptions.get(imageHash)! + } else { + try { + description = describeImages + ? await describeImageWithllm(buffer) + : description + } catch { + // ignore + } + // Check description quality after LLM call + if ( + !description || + description === "No description returned." || + description === "Image is not worth describing." 
+ ) { + Logger.warn( + `Skipping image with poor description: ${imageName} on page ${pageNum}`, + ) + break + } + seenHashDescriptions.set(imageHash, description) + } + try { + const baseDir = path.resolve( + process.env.IMAGE_DIR || + "downloads/xyne_images_db", + ) + const outputDir = path.join(baseDir, docid) + await fsPromises.mkdir(outputDir, { + recursive: true, + }) + const imageFilename = `${globalSeq.value}.${type.ext || "png"}` + const imagePath = path.join( + outputDir, + imageFilename, + ) + await fsPromises.writeFile( + imagePath, + buffer as NodeJS.ArrayBufferView, + ) + Logger.info( + `Saved image (objs/canvas) to: ${imagePath}`, + ) + } catch (e) { + Logger.error( + `Failed to save objs/canvas image for ${imageName} on page ${pageNum}: ${e instanceof Error ? e.message : e}`, + ) + // Skip on failure + break + } + image_chunks.push(description) + image_chunk_pos.push(globalSeq.value) + if (includeImageMarkersInText) { + text_chunks.push(`[[IMG#${globalSeq.value}]]`) + text_chunk_pos.push(globalSeq.value) + } + globalSeq.value++ + imagesOnPage += 1 + Logger.debug( + `Successfully processed objs/canvas image ${imageName} on page ${pageNum}`, + ) + break } } - const imageData = new ImageData(rgbaData, width) - ctx.putImageData(imageData, 0, 0) - imageProcessed = true } - break } - default: { - try { - const imgBuffer = Buffer.from(uint8Data.buffer) - const img = new CanvasImage() - await new Promise((resolve, reject) => { - img.onload = () => resolve() - img.onerror = (err) => reject(err) - img.src = imgBuffer + + if (isImageLike(imageDict)) { + const imgLike: any = imageDict + const width: number = imgLike.width + const height: number = imgLike.height + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + Logger.debug("Skipped small image-like object", { + imageName, + width, + height, + minRequired: MIN_IMAGE_DIM_PX, }) - ctx.drawImage(img, 0, 0) - imageProcessed = true - } catch (err) { + } else { + const cnv = createCanvas(width, height) + 
const cctx = cnv.getContext("2d") + try { + + // @ts-ignore draw directly + cctx.drawImage(imgLike, 0, 0) + const buffer = cnv.toBuffer("image/png") + // Run all filters BEFORE attempting LLM description + if ( + buffer.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn( + `Skipping objs/image image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, + ) + break + } + // @ts-ignore + let type = await imageType(buffer) + if (!type) type = { mime: "image/png", ext: "png" } + if ( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) + ) { + const imageHash = crypto + .createHash("md5") + .update(new Uint8Array(buffer)) + .digest("hex") + let description = "This is an image." + if (seenHashDescriptions.has(imageHash)) { + description = seenHashDescriptions.get(imageHash)! + } else { + try { + description = describeImages + ? await describeImageWithllm(buffer) + : description + } catch { + // ignore + } + // Check description quality after LLM call + if ( + !description || + description === "No description returned." || + description === "Image is not worth describing." + ) { + Logger.warn( + `Skipping image with poor description: ${imageName} on page ${pageNum}`, + ) + break + } + seenHashDescriptions.set(imageHash, description) + } + try { + const baseDir = path.resolve( + process.env.IMAGE_DIR || + "downloads/xyne_images_db", + ) + const outputDir = path.join(baseDir, docid) + await fsPromises.mkdir(outputDir, { + recursive: true, + }) + const imageFilename = `${globalSeq.value}.${type.ext || "png"}` + const imagePath = path.join( + outputDir, + imageFilename, + ) + await fsPromises.writeFile( + imagePath, + buffer as NodeJS.ArrayBufferView, + ) + Logger.info( + `Saved image (objs/image) to: ${imagePath}`, + ) + } catch (e) { + Logger.error( + `Failed to save objs/image image for ${imageName} on page ${pageNum}: ${e instanceof Error ? 
e.message : e}`, + ) + break + } + image_chunks.push(description) + image_chunk_pos.push(globalSeq.value) + if (includeImageMarkersInText) { + text_chunks.push(`[[IMG#${globalSeq.value}]]`) + text_chunk_pos.push(globalSeq.value) + } + globalSeq.value++ + imagesOnPage += 1 + Logger.debug( + `Successfully processed objs/image image ${imageName} on page ${pageNum}`, + ) + break + } + } catch (e) { + Logger.debug( + `Drawing objs image failed for ${imageName} on page ${pageNum}: ${e instanceof Error ? e.message : e}`, + ) + } + } + } + + let uint8Data: Uint8Array + if (rawData instanceof Uint8Array) { + uint8Data = rawData + } else if ( + rawData && + typeof rawData === "object" && + rawData.length !== undefined + ) { + uint8Data = new Uint8Array(rawData) + } else { + Logger.debug(`Invalid image data format for ${imageName}`) + continue + } + + const canvas = createCanvas(width, height) + const ctx = canvas.getContext("2d") + let imageProcessed = false + + switch (kind) { + case pdfjsLib.ImageKind.GRAYSCALE_1BPP: + case pdfjsLib.ImageKind.RGB_24BPP: + case pdfjsLib.ImageKind.RGBA_32BPP: { + let expectedLength: number + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { + // 1 bit per pixel, packed into bytes + expectedLength = Math.ceil((width * height) / 8) + } else { + const bytesPerPixel = + kind === pdfjsLib.ImageKind.RGBA_32BPP ? 4 : 3 // RGB_24BPP + expectedLength = width * height * bytesPerPixel + } + + if (uint8Data.length >= expectedLength) { const rgbaData = new Uint8ClampedArray( width * height * 4, ) - const bytesPerPixel = Math.floor( - uint8Data.length / (width * height), - ) - if (bytesPerPixel >= 3) { + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { + // Handle 1 bit per pixel grayscale (bit-packed data) + let pixelIndex = 0 + for (let y = 0; y < height; y++) { + for (let x = 0; x < width; x++) { + const byteIndex = Math.floor(pixelIndex / 8) + const bitIndex = 7 - (pixelIndex % 8) // MSB first + const bit = + byteIndex < uint8Data.length + ? 
(uint8Data[byteIndex] >> bitIndex) & 1 + : 0 + const gray = bit ? 255 : 0 // Convert bit to full pixel value + + const dstIdx = pixelIndex * 4 + rgbaData[dstIdx] = gray // R + rgbaData[dstIdx + 1] = gray // G + rgbaData[dstIdx + 2] = gray // B + rgbaData[dstIdx + 3] = 255 // A + pixelIndex++ + } + } + } else { + // Handle RGB_24BPP and RGBA_32BPP (byte-per-channel data) + const bytesPerPixel = + kind === pdfjsLib.ImageKind.RGBA_32BPP ? 4 : 3 for (let i = 0; i < width * height; i++) { const srcIdx = i * bytesPerPixel const dstIdx = i * 4 - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = 255 // A + if (kind === pdfjsLib.ImageKind.RGB_24BPP) { + rgbaData[dstIdx] = + srcIdx < uint8Data.length + ? uint8Data[srcIdx] + : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = 255 // A + } else { + // RGBA_32BPP + rgbaData[dstIdx] = + srcIdx < uint8Data.length + ? uint8Data[srcIdx] + : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = + srcIdx + 3 < uint8Data.length + ? 
uint8Data[srcIdx + 3] + : 255 // A + } } - const imageData = new ImageData(rgbaData, width) - ctx.putImageData(imageData, 0, 0) - imageProcessed = true } - } catch { - Logger.debug( - `Failed to process image ${imageName} with fallback method`, - ) + const imageData = new ImageData(rgbaData, width, height) + ctx.putImageData(imageData, 0, 0) + imageProcessed = true } + break } - } - } + default: { + try { + const imgBuffer = Buffer.from(uint8Data.buffer) + const img = new CanvasImage() + await new Promise((resolve, reject) => { + img.onload = () => resolve() + img.onerror = (err) => reject(err) + img.src = imgBuffer + }) + ctx.drawImage(img, 0, 0) + imageProcessed = true + } catch (err) { + try { + const rgbaData = new Uint8ClampedArray( + width * height * 4, + ) + const bytesPerPixel = Math.floor( + uint8Data.length / (width * height), + ) - if (imageProcessed) { - const buffer = canvas.toBuffer("image/png") - // @ts-ignore - let type = await imageType(buffer) - if (!type) { - Logger.warn( - `Could not determine MIME type for ${imageName}. Defaulting to image/png`, - ) - type = { mime: "image/png", ext: "png" } - } - if ( - !type || - !DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) - ) { - Logger.warn( - `Unsupported or unknown image MIME type: ${type?.mime}. Skipping image: ${imageName}`, - ) - continue + if (bytesPerPixel >= 3) { + for (let i = 0; i < width * height; i++) { + const srcIdx = i * bytesPerPixel + const dstIdx = i * 4 + rgbaData[dstIdx] = + srcIdx < uint8Data.length + ? uint8Data[srcIdx] + : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? 
uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = 255 // A + } + const imageData = new ImageData( + rgbaData, + width, + height, + ) + ctx.putImageData(imageData, 0, 0) + imageProcessed = true + } + } catch { + Logger.debug( + `Failed to process image ${imageName} with fallback method`, + ) + } + } + } } - // buffer already created above - const imageHash = crypto - .createHash("md5") - .update(new Uint8Array(buffer)) - .digest("hex") + Logger.debug("Image processing result", { + imageName, + imageProcessed, + canvasWidth: canvas.width, + canvasHeight: canvas.height, + }) - let description: string + if (imageProcessed) { + Logger.debug("Converting to PNG buffer", { imageName }) + const buffer = canvas.toBuffer("image/png") + if ( + buffer.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn( + `Skipping encoded image > ${DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB} MB (size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB)`, + ) + continue + } + Logger.debug("PNG buffer created", { + imageName, + size: buffer.length, + }) - if (seenHashDescriptions.has(imageHash)) { - description = seenHashDescriptions.get(imageHash)! - Logger.warn( - `Reusing description for repeated image ${imageName} on page ${pageNum}`, - ) - } else { - if(describeImages) { - description = await describeImageWithllm(buffer) - } else { - description = "This is an image." + // @ts-ignore + let type = await imageType(buffer) + Logger.debug("Image type detection result", { + imageName, + type, + }) + + if (!type) { + Logger.debug( + "Could not determine MIME type, using default", + { + imageName, + default: "image/png", + }, + ) + Logger.warn( + `Could not determine MIME type for ${imageName}. 
Defaulting to image/png`, + ) + type = { mime: "image/png", ext: "png" } } + + Logger.debug("Checking MIME type support", { + imageName, + detectedMime: type.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + isSupported: DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( + type.mime, + ), + }) + if ( - description === "No description returned." || - description === "Image is not worth describing." + !type || + !DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) ) { + Logger.debug("Skipped image with unsupported MIME type", { + imageName, + detectedMime: type?.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + }) Logger.warn( - `${description} ${imageName} on page ${pageNum}`, + `Unsupported or unknown image MIME type: ${type?.mime}. Skipping image: ${imageName}`, ) continue } - seenHashDescriptions.set(imageHash, description) - } - try { - // Save image to Downloads/xyne_images_db with improved error handling - const baseDir = path.resolve( - process.env.IMAGE_DIR || "downloads/xyne_images_db", + Logger.debug( + "All filters passed, proceeding with image description", + { + imageName, + }, ) - const outputDir = path.join(baseDir, docid) - await fsPromises.mkdir(outputDir, { recursive: true }) - const imageFilename = `${globalSeq.value}.${type.ext || "png"}` - const imagePath = path.join(outputDir, imageFilename) + // buffer already created above + const imageHash = crypto + .createHash("md5") + .update(new Uint8Array(buffer)) + .digest("hex") - await fsPromises.writeFile( - imagePath, - buffer as NodeJS.ArrayBufferView, - ) - Logger.info(`Saved image to: ${imagePath}`) - } catch (saveError) { - Logger.error( - `Failed to save image for ${imageName} on page ${pageNum}: ${saveError instanceof Error ? saveError.message : saveError}`, + let description: string + + if (seenHashDescriptions.has(imageHash)) { + description = seenHashDescriptions.get(imageHash)! 
+ Logger.debug("Reusing cached description for image", { + imageName, + description, + }) + Logger.warn( + `Reusing description for repeated image ${imageName} on page ${pageNum}`, + ) + } else { + Logger.debug("Generating new description for image", { + imageName, + describeImages, + }) + if (describeImages) { + try { + Logger.debug( + "Calling describeImageWithllm for image", + { + imageName, + }, + ) + description = await describeImageWithllm(buffer) + Logger.debug("Got description from AI for image", { + imageName, + description, + }) + } catch (e) { + Logger.warn( + `describeImageWithllm failed for ${imageName}: ${e instanceof Error ? e.message : e}`, + ) + description = "This is an image from the PDF." + Logger.debug( + "Using fallback description due to AI error", + ) + } + } else { + description = "This is an image." + Logger.debug( + "Using default description (describeImages=false)", + ) + } + + // Check description quality after LLM call + if ( + !description || + description === "No description returned." || + description === "Image is not worth describing." 
+ ) { + Logger.debug( + "Skipping image with insufficient description", + { + imageName, + previousDescription: description, + }, + ) + Logger.warn( + `Skipping image with poor description: ${imageName} on page ${pageNum}`, + ) + continue + } + seenHashDescriptions.set(imageHash, description) + Logger.debug("Cached new description for image", { + imageName, + description, + }) + } + + try { + // Save image to Downloads/xyne_images_db with improved error handling + const baseDir = path.resolve( + process.env.IMAGE_DIR || "downloads/xyne_images_db", + ) + const outputDir = path.join(baseDir, docid) + await fsPromises.mkdir(outputDir, { recursive: true }) + + const imageFilename = `${globalSeq.value}.${type.ext || "png"}` + const imagePath = path.join(outputDir, imageFilename) + + await fsPromises.writeFile( + imagePath, + buffer as NodeJS.ArrayBufferView, + ) + Logger.info(`Saved image to: ${imagePath}`) + } catch (saveError) { + Logger.error( + `Failed to save image for ${imageName} on page ${pageNum}: ${saveError instanceof Error ? 
saveError.message : saveError}`, + ) + // Skip adding to chunks if save failed + continue + } + + image_chunks.push(description) + image_chunk_pos.push(globalSeq.value) + if (includeImageMarkersInText) { + text_chunks.push(`[[IMG#${globalSeq.value}]]`) + text_chunk_pos.push(globalSeq.value) + } + // Removed cross-image overlap placeholder handling + Logger.debug("Added image chunk at position", { + position: globalSeq.value, + imageName, + description, + }) + globalSeq.value++ + imagesOnPage += 1 + Logger.debug( + `Successfully processed image ${imageName} on page ${pageNum}`, ) - // Skip adding to chunks if save failed - continue } - - image_chunks.push(description) - image_chunk_pos.push(globalSeq.value) - crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` - globalSeq.value++ - Logger.debug( - `Successfully processed image ${imageName} on page ${pageNum}`, + } catch (error) { + Logger.warn( + `Failed to process image ${imageName} on page ${pageNum}: ${(error as Error).message}`, ) } - } catch (error) { - Logger.warn( - `Failed to process image ${imageName} on page ${pageNum}: ${(error as Error).message}`, - ) } break } @@ -519,9 +1360,7 @@ export async function extractTextAndImagesWithChunksFromPDF( break } } - - // End of page: flush remaining paragraph and process paragraphs - flushParagraph() + // End of page: process paragraphs const overlapText = processTextParagraphs( paragraphs, text_chunks, @@ -529,12 +1368,8 @@ export async function extractTextAndImagesWithChunksFromPDF( globalSeq, ) - // Update cross-image overlap - APPEND instead of REPLACE to preserve image placeholders - if (overlapText.trim()) { - crossImageOverlap = crossImageOverlap - ? `${crossImageOverlap} ${overlapText}` - : overlapText - } + // Store overlap for continuity to the next page + pageOverlap = overlapText.trim() Logger.debug( `Page ${pageNum} completed. 
Text operators found: ${textOperatorCount}, Current text chunks: ${text_chunks.length}, Current image chunks: ${image_chunks.length}`, @@ -549,6 +1384,20 @@ export async function extractTextAndImagesWithChunksFromPDF( `PDF processing completed. Total text chunks: ${text_chunks.length}, Total image chunks: ${image_chunks.length}`, ) + Logger.debug("PDF processing completed for document", { docid }) + Logger.debug("Processing summary", { + totalTextChunks: text_chunks.length, + totalImageChunks: image_chunks.length, + textChunkPositions: text_chunk_pos.length, + imageChunkPositions: image_chunk_pos.length, + extractImages, + describeImages, + }) + + Logger.debug("All text chunks", { text_chunks }) + Logger.debug("All text chunk positions", { text_chunk_pos }) + Logger.debug("All image chunks", { image_chunks }) + Logger.debug("All image chunk positions", { image_chunk_pos }) return { text_chunks, image_chunks, @@ -556,6 +1405,7 @@ export async function extractTextAndImagesWithChunksFromPDF( image_chunk_pos, } } finally { + Logger.debug("Calling PDF document destroy") await pdfDocument.destroy() } } diff --git a/server/scripts/testPdfDirect.ts b/server/scripts/testPdfDirect.ts new file mode 100644 index 000000000..0f0adac30 --- /dev/null +++ b/server/scripts/testPdfDirect.ts @@ -0,0 +1,89 @@ +import { readFileSync } from "fs" +import { resolve } from "path" +import { FileProcessorService } from "@/services/fileProcessor" +import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks" + +async function testPdfDirect() { + let pdfPath = "/Users/aayush.shah/Downloads/small2.pdf" + // const pdfPath = "/Users/aayush.shah/Downloads/Aayush_Resume_2025.pdf" + pdfPath = "/Users/aayush.shah/Downloads/somatosensory.pdf" + try { + console.log("=== DIRECT PDF PROCESSING TEST ===") + console.log("PDF Path:", pdfPath) + + // Read the PDF file + console.log("\n1. 
Reading PDF file...") + const pdfBuffer = readFileSync(pdfPath) + console.log("File size:", pdfBuffer.length, "bytes") + + console.log("\n2. Testing direct PDF processing (current knowledge base flow)...") + console.log("This simulates exactly what happens in the knowledge base upload:") + console.log("- FileProcessorService.processFile() is called") + console.log("- extractImages defaults to false") + console.log("- describeImages defaults to false") + + // Test the exact flow used in knowledge base + const result = await FileProcessorService.processFile( + pdfBuffer, + "application/pdf", + "small2.pdf", + "test-doc-id", + pdfPath + // extractImages and describeImages default to false + ) + + console.log("\n=== RESULTS FROM KNOWLEDGE BASE FLOW ===") + console.log("Text chunks:", result.chunks.length) + console.log("Image chunks:", result.image_chunks.length) + console.log("Text chunk positions:", result.chunks_pos.length) + console.log("Image chunk positions:", result.image_chunks_pos.length) + + console.log("\n3. 
Testing with image processing enabled...") + console.log("Parameters: extractImages=true, describeImages=true") + + // Test with images enabled to see the difference + const imageResult = await extractTextAndImagesWithChunksFromPDF( + new Uint8Array(pdfBuffer), + "test-doc-with-images", + true, // extractImages enabled + true // describeImages enabled + ) + + console.log("\n=== RESULTS WITH IMAGES ENABLED ===") + console.log("Text chunks:", imageResult.text_chunks.length) + console.log("Image chunks:", imageResult.image_chunks.length) + console.log("Text chunk positions:", imageResult.text_chunk_pos.length) + console.log("Image chunk positions:", imageResult.image_chunk_pos.length) + + console.log("\n=== COMPARISON ===") + console.log("Current KB flow - Text chunks:", result.chunks.length, "Image chunks:", result.image_chunks.length) + console.log("With images - Text chunks:", imageResult.text_chunks.length, "Image chunks:", imageResult.image_chunks.length) + + if (result.chunks.length > 0) { + console.log("\n=== SAMPLE TEXT CHUNKS ===") + result.chunks.slice(0, 2).forEach((chunk, idx) => { + console.log(`\nText Chunk ${idx + 1}:`) + console.log(chunk) + }) + } + + if (imageResult.image_chunks.length > 0) { + console.log("\n=== SAMPLE IMAGE DESCRIPTIONS ===") + imageResult.image_chunks.forEach((chunk, idx) => { + console.log(`\nImage ${idx + 1}:`) + console.log(chunk) + }) + } + + console.log("\n=== TEST COMPLETED ===") + console.log("✓ Check the debug logs above from pdfChunks.ts") + console.log("✓ You can see exactly what's being processed in the current knowledge base flow") + + } catch (error) { + console.error("Error processing PDF:", error) + process.exit(1) + } +} + +// Run the test +testPdfDirect().catch(console.error) \ No newline at end of file