From feb056f72125f500691c310101d95d07a217d787 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 13:07:18 +0530 Subject: [PATCH 1/9] pdfChunks file updated for better text processing --- server/pdfChunks.ts | 1321 +++++++++++++++++++++++++------ server/scripts/testPdfDirect.ts | 89 +++ 2 files changed, 1158 insertions(+), 252 deletions(-) create mode 100644 server/scripts/testPdfDirect.ts diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index c8ab942b9..0f7f11483 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -15,6 +15,7 @@ const openjpegWasmPath = const qcmsWasmPath = path.join(__dirname, "../node_modules/pdfjs-dist/wasm/") + "/" const seenHashDescriptions = new Map() +const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || "150", 10) const Logger = getLogger(Subsystem.Integrations).child({ module: "pdfChunks", @@ -23,14 +24,144 @@ const Logger = getLogger(Subsystem.Integrations).child({ const PDFJS = pdfjsLib // Utility function to clean text consistent with chunkTextByParagraph -const cleanText = (str: string): string => { - const normalized = str.replace(/\r\n|\r/g, "\n") - return normalized.replace( - /[\u0000-\u0008\u000B-\u000C\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF]/g, - "", - ) +// const cleanText = (str: string): string => { +// console.log('CLEAN TEXT DEBUG: Input string length:', str.length) +// console.log('CLEAN TEXT DEBUG: Input string:', str) + +// const normalized = str.replace(/\r\n|\r/g, "\n") +// console.log('CLEAN TEXT DEBUG: After normalization length:', normalized.length) +// console.log('CLEAN TEXT DEBUG: After normalization:', normalized) + +// const cleaned = normalized.replace( +// /[\u0000-\u0008\u000B-\u000C\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF]/g, +// "", +// ) +// console.log('CLEAN TEXT DEBUG: After cleaning length:', cleaned.length) +// console.log('CLEAN TEXT DEBUG: Cleaned string:', cleaned) + +// return cleaned +// } + +//=== + +export function normalizeText(input: string): string { + if (!input) return "" + + let normalized = input.normalize("NFC") + + // Strip control chars except newline/tab + normalized = normalized.replace(/[^\P{C}\n\t]/gu, "") + + // Normalize whitespace + normalized = normalized.replace(/\u00A0/g, " ") // nbsp → space + normalized = normalized.replace(/\u200B/g, "") // zero-width space + normalized = normalized.replace(/\t+/g, " ") // tabs → single space + + return normalized.trim() +} + +// ========================================= +// 2. 
Smart letter-spacing collapse (per line) +// ========================================= +function smartDespaceLine(line: string): string { + if (!line) return line + + const parts = line.split(/(\s+)/) + const out: string[] = [] + + const isSingleAllowed = (s: string) => + s.length === 1 && /[\p{L}\p{N}'’]/u.test(s) + + const isSingleLowerLetter = (s: string) => s.length === 1 && /\p{Ll}/u.test(s) + + let i = 0 + while (i < parts.length) { + const tok = parts[i] + + if (!/\s+/.test(tok) && isSingleAllowed(tok)) { + const runTokens: string[] = [tok] + let j = i + 1 + + while ( + j + 1 < parts.length && + parts[j] === " " && + !/\s+/.test(parts[j + 1]) && + isSingleAllowed(parts[j + 1]) + ) { + runTokens.push(parts[j + 1]) + j += 2 + } + + // Join spaced letters like "N A S A" -> "NASA" + if (runTokens.length >= 3) { + out.push(runTokens.join("")) + i = j + continue + } + + // Join two-letter lowercase sequences like "i s" -> "is" + if ( + runTokens.length === 2 && + isSingleLowerLetter(runTokens[0]) && + isSingleLowerLetter(runTokens[1]) + ) { + out.push(runTokens.join("")) + i = j + continue + } + } + + out.push(tok) + i += 1 + } + + return out.join("") } +// ============================= +// 3. High-level text cleaner +// ============================= +export function cleanText(input: string): string { + let s = normalizeText(input) + + // Fix hyphenation across line breaks + s = s.replace(/(\p{L})-\n(\p{L})/gu, "$1$2") + + // Trim spaces around newlines + s = s.replace(/[ \t]*\n[ \t]*/g, "\n") + + // Turn intra-paragraph newlines into spaces, preserve paragraph breaks + // 1) Mark paragraph breaks + s = s.replace(/\n{2,}/g, "[[PARA]]") + // 2) Collapse remaining newlines (soft wraps) into spaces + s = s.replace(/\n+/g, " ") + // 3) Restore paragraph breaks + s = s.replace(/\[\[PARA\]\]/g, "\n\n") + + // Apply line-wise despacing + s = s + .split("\n") + .map((line) => smartDespaceLine(line)) + .join("\n") + + // Remove spaces before punctuation + s = s.replace(/\s+([.,;:!?])/g, "$1") + + // Cap extreme space runs, preserve 2–4 spaces + s = s.replace(/[ ]{5,}/g, " ") + + // Trim lines & drop empties + s = s + .split("\n") + .map((l) => l.trim()) + .filter((l) => l.length > 0) + .join("\n") + + return s.trim() +} + +//=== + /** * Validate text item */ @@ -77,7 +208,9 @@ function extractTextFromArgs(args: any[]): string { } // Additional validation: ensure we return clean, valid text - return typeof text === "string" ? text : "" + const result = typeof text === "string" ? 
text : "" + console.log("EXTRACT TEXT DEBUG: Final extracted text:", result) + return result } /** @@ -91,19 +224,32 @@ function processTextParagraphs( globalSeq: { value: number }, overlapBytes: number = 32, ): string { - if (paragraphs.length === 0) return "" + console.log("TEXT DEBUG: Processing paragraphs count:", paragraphs.length) + + if (paragraphs.length === 0) { + console.log("TEXT DEBUG: No paragraphs to process") + return "" + } const cleanedParagraphs = paragraphs .map(cleanText) .filter((p) => p.length > 0) - if (cleanedParagraphs.length === 0) return "" + if (cleanedParagraphs.length === 0) { + console.log("TEXT DEBUG: No cleaned paragraphs after filtering") + return "" + } const cleanedText = cleanedParagraphs.join("\n") + // console.log('TEXT DEBUG: Cleaned text length:', cleanedText.length) + // console.log('TEXT DEBUG: Full cleaned text:', cleanedText) + const chunks = chunkTextByParagraph(cleanedText, 512, 128) + // console.log('TEXT DEBUG: Generated chunks count:', chunks.length) for (const chunk of chunks) { text_chunks.push(chunk) text_chunk_pos.push(globalSeq.value) + // console.log('TEXT DEBUG: Added chunk at position', globalSeq.value, 'content:', chunk) globalSeq.value++ } @@ -111,13 +257,25 @@ function processTextParagraphs( // Take the last overlapBytes from the processed text let overlapText = "" let overlapLen = 0 + + // Logger.info(`OVERLAP DEBUG: Calculating overlap text from cleanedText of length ${cleanedText.length}, target bytes: ${overlapBytes}`) + // console.log('OVERLAP DEBUG: Full cleanedText for overlap calculation:', cleanedText) + for (let i = cleanedText.length - 1; i >= 0; i--) { const charBytes = Buffer.byteLength(cleanedText[i], "utf8") - if (overlapLen + charBytes > overlapBytes) break + if (overlapLen + charBytes > overlapBytes) { + // console.log('OVERLAP DEBUG: Stopping overlap calculation at char', i, 'would exceed', overlapBytes, 'bytes (current:', overlapLen, 'char bytes:', charBytes, ')') + break + } overlapText = cleanedText[i] + overlapText overlapLen += charBytes + // console.log('OVERLAP DEBUG: Added char', cleanedText[i], 'to overlap. 
Current overlap length:', overlapLen, 'bytes, text:', overlapText) } + // console.log('OVERLAP DEBUG: Final calculated overlap text:', overlapText) + // console.log('OVERLAP DEBUG: Final overlap length:', overlapLen, 'bytes') + // Logger.info(`OVERLAP DEBUG: processTextParagraphs returning overlap text: "${overlapText}" (${overlapLen} bytes)`) + return overlapText } @@ -133,6 +291,12 @@ export async function extractTextAndImagesWithChunksFromPDF( image_chunk_pos: number[] }> { Logger.info(`Starting PDF processing for: ${docid}`) + console.log("PDF DEBUG: Starting processing with parameters:", { + docid, + extractImages, + describeImages, + dataSize: data.length, + }) const loadingTask = PDFJS.getDocument({ data, @@ -171,8 +335,69 @@ export async function extractTextAndImagesWithChunksFromPDF( let globalSeq = { value: 0 } let crossImageOverlap = "" // Track overlap across images + // Logger.info("OVERLAP DEBUG: Initialized crossImageOverlap as empty string") + // console.log('OVERLAP DEBUG: Starting PDF processing with initial crossImageOverlap:', crossImageOverlap) + Logger.info(`PDF has ${pdfDocument.numPages} pages`) + // Robust text extraction using PDF.js textContent API + const buildParagraphsFromPage = async ( + page: pdfjsLib.PDFPageProxy, + ): Promise => { + const textContent = await page.getTextContent({ + includeMarkedContent: false, + disableNormalization: false, + }) + + // Build lines using hasEOL and Y-position changes (handles PPT/DOC exports) + const lines: string[] = [] + let current = "" + let prevY: number | null = null + let prevH: number | null = null + for (const item of textContent.items as any[]) { + const str: string = item && typeof item.str === "string" ? item.str : "" + if (!str) continue + + const tr = Array.isArray(item.transform) ? item.transform : [] + const y = typeof tr[5] === "number" ? tr[5] : null + const h = typeof item.height === "number" ? 
item.height : null + + let newLine = false + if (prevY != null && y != null) { + const tol = Math.max(prevH || 0, h || 0, 10) * 0.4 // dynamic tolerance + if (Math.abs(y - prevY) > tol) newLine = true + } + + if (newLine || (item as any).hasEOL) { + if (current.length > 0) lines.push(current) + current = str + } else { + current += str + } + + prevY = y + prevH = h + } + if (current.trim().length > 0) lines.push(current) + + // Group lines into paragraphs separated by blank lines + const paragraphs: string[] = [] + let buf: string[] = [] + const pushPara = () => { + if (buf.length === 0) return + paragraphs.push(buf.join("\n")) + buf = [] + } + for (const ln of lines) { + if (ln.trim().length === 0) pushPara() + else buf.push(ln) + } + pushPara() + + // Clean and filter + return paragraphs.map(cleanText).filter((p) => p.length > 0) + } + for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) { Logger.debug(`Processing page ${pageNum}`) @@ -180,17 +405,87 @@ export async function extractTextAndImagesWithChunksFromPDF( try { const opList = await page.getOperatorList() - // Hold paragraphs for current page - let paragraphs: string[] = [] - let currentParagraph = "" - let textOperatorCount = 0 + // Use textContent-based paragraphs for this page + let paragraphs: string[] = await buildParagraphsFromPage(page) + let currentParagraph = "" // kept for image-flow flush, but not used for text + let textOperatorCount = (await page.getTextContent()).items.length - // Start with cross-image overlap if available - if (crossImageOverlap && extractImages) { - currentParagraph = crossImageOverlap + " " - crossImageOverlap = "" // Clear after using + // Helper: try to resolve image object by name directly from page.objs + const resolveImageByName = async ( + name: string, + ): Promise => { + try { + // Some builds expose has method + // @ts-ignore + if ( + typeof (page.objs as any).has === "function" && + (page.objs as any).has(name) + ) { + // @ts-ignore + return (page.objs as any).get(name) + } + const obj = (page.objs as any).get(name) + return obj || null + } catch (e) { + return null + } } + // Track CTM to compute image bounds when image data is not directly retrievable + let currentCTM: [number, number, number, number, number, number] = [ + 1, 0, 0, 1, 0, 0, + ] + const ctmStack: [number, number, number, number, number, number][] = [] + + const mul = ( + m1: number[], + m2: number[], + ): [number, number, number, number, number, number] => { + const [a1, b1, c1, d1, e1, f1] = m1 as [ + number, + number, + number, + number, + number, + number, + ] + const [a2, b2, c2, d2, e2, f2] = m2 as [ + number, + number, + number, + number, + number, + number, + ] + return [ + a1 * a2 + c1 * b2, + b1 * a2 + d1 * b2, + a1 * c2 + c1 * d2, + b1 * c2 + d1 * d2, + a1 * e2 + c1 * f2 + e1, + b1 * e2 + d1 * f2 + f1, + ] + } + + const applyToPoint = ( + m: number[], + x: number, + y: number, + ): { x: number; y: number } => { + const [a, b, c, d, e, f] = m as [ + number, + number, + number, + number, + number, + number, + ] + return { x: a * x + c * y + e, y: b * x + d * y + f } + } + + // Do not inject crossImageOverlap into text paragraphs here + // console.log('OVERLAP DEBUG: Page', pageNum, 'crossImageOverlap at start:', crossImageOverlap) + // Helper to flush currentParagraph into paragraphs array const flushParagraph = () => { if (currentParagraph.trim().length > 0) { @@ -199,33 +494,44 @@ export async function extractTextAndImagesWithChunksFromPDF( } } + let imagesOnPage = 0 + let vectorOpsDetected = 
false for (let i = 0; i < opList.fnArray.length; i++) { const fnId = opList.fnArray[i] const args = opList.argsArray[i] + // console.log(PDFJS.OPS.paintImageXObject , "PDFJS.OPS.paintImageXObject") + // console.log(PDFJS.OPS.paintImageXObjectRepeat , "PDFJS.OPS.paintImageXObjectRepeat") + // console.log(PDFJS.OPS.paintInlineImageXObject , "PDFJS.OPS.paintInlineImageXObject") + // console.log(PDFJS.OPS.paintImageMaskXObject , "PDFJS.OPS.paintImageMaskXObject") + + // Track vector drawing operators (paths, fills, form XObjects) + const isVectorOp = + fnId === PDFJS.OPS.constructPath || + fnId === PDFJS.OPS.stroke || + fnId === PDFJS.OPS.closeStroke || + fnId === PDFJS.OPS.fill || + fnId === PDFJS.OPS.eoFill || + fnId === PDFJS.OPS.fillStroke || + fnId === PDFJS.OPS.eoFillStroke || + fnId === PDFJS.OPS.closeFillStroke || + fnId === PDFJS.OPS.closeEOFillStroke || + fnId === PDFJS.OPS.clip || + fnId === PDFJS.OPS.eoClip || + fnId === PDFJS.OPS.rectangle || + fnId === PDFJS.OPS.shadingFill || + fnId === PDFJS.OPS.rawFillPath || + fnId === PDFJS.OPS.paintFormXObjectBegin || + fnId === PDFJS.OPS.paintFormXObjectEnd + if (isVectorOp) vectorOpsDetected = true + switch (fnId) { case PDFJS.OPS.showText: - case PDFJS.OPS.showSpacedText: { - const text = extractTextFromArgs(args) - if (text) { - currentParagraph += text + " " - textOperatorCount++ - } - break - } - // Handle line break operators - case PDFJS.OPS.nextLine: { - flushParagraph() - break - } + case PDFJS.OPS.showSpacedText: + case PDFJS.OPS.nextLine: case PDFJS.OPS.nextLineShowText: case PDFJS.OPS.nextLineSetSpacingShowText: { - const text = extractTextFromArgs(args) - if (text) { - currentParagraph += text + " " - textOperatorCount++ - } - flushParagraph() + // Text handled via getTextContent; ignore operator-driven text break } // Handle matrix and positioning operators that might indicate paragraph breaks @@ -234,171 +540,467 @@ export async function extractTextAndImagesWithChunksFromPDF( case PDFJS.OPS.moveText: { // These might indicate significant positioning changes // For now, we'll be conservative and not flush, but this could be adjusted + if (fnId === PDFJS.OPS.transform) { + try { + if ( + Array.isArray(args) && + args.length >= 6 && + args.every((n: any) => typeof n === "number") + ) { + currentCTM = mul(currentCTM, args as number[]) + } + } catch {} + } + break + } + case PDFJS.OPS.save: { + ctmStack.push([...currentCTM]) + break + } + case PDFJS.OPS.restore: { + if (ctmStack.length) currentCTM = ctmStack.pop()! break } - // Handle image operators + // Handle image operators - be more comprehensive case extractImages ? PDFJS.OPS.paintImageXObject : null: case extractImages ? PDFJS.OPS.paintImageXObjectRepeat : null: case extractImages ? PDFJS.OPS.paintInlineImageXObject : null: - case extractImages ? PDFJS.OPS.paintImageMaskXObject : null: { - // Flush any pending text paragraphs before image - flushParagraph() - - // Process accumulated paragraphs and capture overlap - const overlapText = processTextParagraphs( - paragraphs, - text_chunks, - text_chunk_pos, - globalSeq, + case extractImages ? PDFJS.OPS.paintImageMaskXObject : null: + case extractImages ? 83 : null: + case extractImages ? 85 : null: + case extractImages ? 86 : null: + case extractImages ? 
88 : null: { + console.log( + "IMAGE DEBUG: Image operator detected on page", + pageNum, + { + extractImages, + operatorType: fnId, + imageName: args[0], + knownOperators: { + paintImageXObject: PDFJS.OPS.paintImageXObject, + paintImageXObjectRepeat: PDFJS.OPS.paintImageXObjectRepeat, + paintInlineImageXObject: PDFJS.OPS.paintInlineImageXObject, + paintImageMaskXObject: PDFJS.OPS.paintImageMaskXObject, + }, + }, ) - paragraphs = [] // Clear paragraphs after processing - // Store overlap for continuation after image - crossImageOverlap = overlapText + // Do not process text per-image anymore; text is processed once per page. + // Maintain crossImageOverlap continuity by keeping placeholders only. + flushParagraph() // Extract image buffer - const imageName = args[0] - // Small delay to ensure image object has a chance to resolve - let imageDict - try { - imageDict = page.objs.get(imageName) - } catch (err) { - Logger.debug( - `Image ${imageName} not resolved or failed to decode on page ${pageNum}: ${err instanceof Error ? err.message : err}`, - ) - continue + const imageName = + typeof args?.[0] === "string" + ? args[0] + : args?.[0] && + typeof args[0] === "object" && + typeof args[0].name === "string" + ? args[0].name + : args?.[0] + console.log("IMAGE DEBUG: Processing image:", imageName) + let imageDict: any | null = null + let isInline = false + // Inline image may directly carry data in args + console.log("IMAGE DEBUG: Initial args for image operator:", args) + console.log("IMAGE DEBUG: fnId for image operator:", fnId) + console.log( + PDFJS.OPS.paintInlineImageXObject, + "PDFJS.OPS.paintInlineImageXObject", + ) + if (fnId === PDFJS.OPS.paintInlineImageXObject || fnId === 86) { + console.log("IMAGE DEBUG: Detected inline image data in args") + const candidate = Array.isArray(args) + ? args.find( + (a: any) => + a && + typeof a === "object" && + ("data" in a || "imgData" in a) && + "width" in a && + "height" in a, + ) + : null + if (candidate) { + imageDict = candidate + isInline = true + } } - if (!imageDict || !imageDict.data) { + console.log( + "IMAGE DEBUG: Initial imageDict resolved from args:", + imageDict, + ) + if ( + !imageDict && + (typeof imageName === "string" || + (imageName && + typeof imageName === "object" && + typeof imageName.name === "string")) + ) { + const name = + typeof imageName === "string" ? 
imageName : imageName.name + imageDict = await resolveImageByName(name) + } + + // If we cannot get the raw image object, skip this image + if (!imageDict) { Logger.debug( - `No image data found for ${imageName} on page ${pageNum}`, + `No image object available for ${imageName} on page ${pageNum} — skipping`, ) continue } + console.log("IMAGE DEBUG: Resolved imageDict:", { + imageDict, + isInline, + }) - try { - const { width, height, kind, data } = imageDict - - if (!width || !height || width <= 0 || height <= 0) { - Logger.debug( - `Invalid image dimensions for ${imageName}: ${width}x${height}`, - ) - continue - } - - if ( - data.length > - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 - ) { - Logger.warn( - `Skipping large image (${(data.length / (1024 * 1024)).toFixed(2)} MB): ${imageName}`, - ) - continue - } - - if (width < 250 || height < 250) continue // Skip small images - - let uint8Data: Uint8Array - if (data instanceof Uint8Array) { - uint8Data = data - } else if ( - data && - typeof data === "object" && - data.length !== undefined - ) { - uint8Data = new Uint8Array(data) - } else { - Logger.debug(`Invalid image data format for ${imageName}`) - continue - } + // Ensure imageDict is valid before processing + if (!imageDict || typeof imageDict !== "object") { + console.log( + "IMAGE DEBUG: imageDict is null or invalid, skipping to crop fallback", + ) + // This will fall through to the crop fallback logic below + } else { + try { + // Fast paths for Canvas or Image-like objects returned by page.objs + const isCanvasLike = (obj: any) => + obj && + typeof obj.getContext === "function" && + typeof obj.width === "number" && + typeof obj.height === "number" + const isImageLike = (obj: any) => + obj && + typeof obj.width === "number" && + typeof obj.height === "number" && + typeof obj.getContext !== "function" - const canvas = createCanvas(width, height) - const ctx = canvas.getContext("2d") - let imageProcessed = false - - switch (kind) { - case pdfjsLib.ImageKind.GRAYSCALE_1BPP: - case pdfjsLib.ImageKind.RGB_24BPP: - case pdfjsLib.ImageKind.RGBA_32BPP: { - const bytesPerPixel = - kind === pdfjsLib.ImageKind.RGBA_32BPP - ? 4 - : kind === pdfjsLib.ImageKind.RGB_24BPP - ? 3 - : 1 - const expectedLength = width * height * bytesPerPixel - - if (uint8Data.length >= expectedLength) { - const rgbaData = new Uint8ClampedArray(width * height * 4) - for (let i = 0; i < width * height; i++) { - const srcIdx = i * bytesPerPixel - const dstIdx = i * 4 - if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { - const gray = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 - rgbaData[dstIdx] = gray // R - rgbaData[dstIdx + 1] = gray // G - rgbaData[dstIdx + 2] = gray // B - rgbaData[dstIdx + 3] = 255 // A - } else if (kind === pdfjsLib.ImageKind.RGB_24BPP) { - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = 255 // A - } else { - // RGBA_32BPP - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = - srcIdx + 3 < uint8Data.length - ? 
uint8Data[srcIdx + 3] - : 255 // A + if (isCanvasLike(imageDict)) { + const c: any = imageDict + const width: number = c.width + const height: number = c.height + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + console.log( + "IMAGE DEBUG: SKIPPED - Small dimensions from canvas for", + imageName, + { width, height }, + ) + } else { + const buffer = c.toBuffer("image/png") + if ( + buffer.length <= + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + // @ts-ignore + let type = await imageType(buffer) + if (!type) type = { mime: "image/png", ext: "png" } + if ( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) + ) { + const imageHash = crypto + .createHash("md5") + .update(new Uint8Array(buffer)) + .digest("hex") + let description = "This is an image." + if (seenHashDescriptions.has(imageHash)) { + description = seenHashDescriptions.get(imageHash)! + } else { + try { + description = describeImages + ? await describeImageWithllm(buffer) + : description + } catch { + // ignore + } + if ( + !description || + description === "No description returned." || + description === "Image is not worth describing." + ) { + description = "Image extracted from PDF page." + } + seenHashDescriptions.set(imageHash, description) + } + try { + const baseDir = path.resolve( + process.env.IMAGE_DIR || + "downloads/xyne_images_db", + ) + const outputDir = path.join(baseDir, docid) + await fsPromises.mkdir(outputDir, { + recursive: true, + }) + const imageFilename = `${globalSeq.value}.${type.ext || "png"}` + const imagePath = path.join( + outputDir, + imageFilename, + ) + await fsPromises.writeFile( + imagePath, + buffer as NodeJS.ArrayBufferView, + ) + Logger.info( + `Saved image (objs/canvas) to: ${imagePath}`, + ) + } catch (e) { + Logger.error( + `Failed to save objs/canvas image for ${imageName} on page ${pageNum}: ${e instanceof Error ? 
e.message : e}`, + ) + // Skip on failure + break + } + image_chunks.push(description) + image_chunk_pos.push(globalSeq.value) + crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + globalSeq.value++ + imagesOnPage += 1 + Logger.debug( + `Successfully processed objs/canvas image ${imageName} on page ${pageNum}`, + ) + break } + } else { + Logger.warn( + `Skipping objs/canvas image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, + ) } - const imageData = new ImageData(rgbaData, width) - ctx.putImageData(imageData, 0, 0) - imageProcessed = true } - break } - default: { - try { - const imgBuffer = Buffer.from(uint8Data.buffer) - const img = new CanvasImage() - await new Promise((resolve, reject) => { - img.onload = () => resolve() - img.onerror = (err) => reject(err) - img.src = imgBuffer - }) - ctx.drawImage(img, 0, 0) - imageProcessed = true - } catch (err) { + + if (isImageLike(imageDict)) { + const imgLike: any = imageDict + const width: number = imgLike.width + const height: number = imgLike.height + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + console.log( + "IMAGE DEBUG: SKIPPED - Small dimensions from image-like for", + imageName, + { width, height }, + ) + } else { + const cnv = createCanvas(width, height) + const cctx = cnv.getContext("2d") try { + // @ts-ignore draw directly + cctx.drawImage(imgLike, 0, 0) + const buffer = cnv.toBuffer("image/png") + // @ts-ignore + let type = await imageType(buffer) + if (!type) type = { mime: "image/png", ext: "png" } + if ( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) + ) { + const imageHash = crypto + .createHash("md5") + .update(new Uint8Array(buffer)) + .digest("hex") + let description = "This is an image." + if (seenHashDescriptions.has(imageHash)) { + description = seenHashDescriptions.get(imageHash)! + } else { + try { + description = describeImages + ? await describeImageWithllm(buffer) + : description + } catch { + // ignore + } + if ( + !description || + description === "No description returned." || + description === "Image is not worth describing." + ) { + description = "Image extracted from PDF page." + } + seenHashDescriptions.set(imageHash, description) + } + try { + const baseDir = path.resolve( + process.env.IMAGE_DIR || + "downloads/xyne_images_db", + ) + const outputDir = path.join(baseDir, docid) + await fsPromises.mkdir(outputDir, { + recursive: true, + }) + const imageFilename = `${globalSeq.value}.${type.ext || "png"}` + const imagePath = path.join( + outputDir, + imageFilename, + ) + await fsPromises.writeFile( + imagePath, + buffer as NodeJS.ArrayBufferView, + ) + Logger.info( + `Saved image (objs/image) to: ${imagePath}`, + ) + } catch (e) { + Logger.error( + `Failed to save objs/image image for ${imageName} on page ${pageNum}: ${e instanceof Error ? e.message : e}`, + ) + break + } + image_chunks.push(description) + image_chunk_pos.push(globalSeq.value) + crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + globalSeq.value++ + imagesOnPage += 1 + Logger.debug( + `Successfully processed objs/image image ${imageName} on page ${pageNum}`, + ) + break + } + } catch (e) { + Logger.debug( + `Drawing objs image failed for ${imageName} on page ${pageNum}: ${e instanceof Error ? e.message : e}`, + ) + } + } + } + + const width: number = (imageDict.width ?? + imageDict.w) as number + const height: number = (imageDict.height ?? + imageDict.h) as number + const kind = + imageDict.kind ?? imageDict.imageKind ?? 
imageDict.ImageKind + // data may live in imageDict.data, imageDict.imgData.data, or imageDict.bytes + let rawData: any = + imageDict.data ?? + imageDict.bytes ?? + (imageDict.imgData ? imageDict.imgData.data : undefined) + + console.log( + "IMAGE DEBUG: Full image details for", + imageName, + { + width, + height, + kind, + dataLength: rawData ? rawData.length : null, + dataSizeMB: rawData + ? (rawData.length / (1024 * 1024)).toFixed(2) + : null, + maxAllowedSizeMB: + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + minDimension: MIN_IMAGE_DIM_PX, + isValidDimensions: width > 0 && height > 0, + meetsMinSize: + width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, + withinSizeLimit: rawData + ? rawData.length <= + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + : false, + isInline, + }, + ) + + if (!width || !height || width <= 0 || height <= 0) { + console.log( + "IMAGE DEBUG: SKIPPED - Invalid dimensions for", + imageName, + "width:", + width, + "height:", + height, + ) + Logger.debug( + `Invalid image dimensions for ${imageName}: ${width}x${height}`, + ) + continue + } + + if ( + rawData && + rawData.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + console.log( + "IMAGE DEBUG: SKIPPED - Large file size for", + imageName, + { + actualSizeMB: (rawData.length / (1024 * 1024)).toFixed( + 2, + ), + maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + actualBytes: rawData.length, + maxAllowedBytes: + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * + 1024 * + 1024, + }, + ) + Logger.warn( + `Skipping large image (${(rawData.length / (1024 * 1024)).toFixed(2)} MB): ${imageName}`, + ) + continue + } + + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + console.log( + "IMAGE DEBUG: SKIPPED - Small dimensions for", + imageName, + { + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + widthTooSmall: width < MIN_IMAGE_DIM_PX, + heightTooSmall: height < MIN_IMAGE_DIM_PX, + }, + ) + continue // Skip small images + } + + console.log( + "IMAGE DEBUG: Image passed all filters, proceeding with processing for", + imageName, + ) + + let uint8Data: Uint8Array + if (rawData instanceof Uint8Array) { + uint8Data = rawData + } else if ( + rawData && + typeof rawData === "object" && + rawData.length !== undefined + ) { + uint8Data = new Uint8Array(rawData) + } else { + Logger.debug(`Invalid image data format for ${imageName}`) + continue + } + + const canvas = createCanvas(width, height) + const ctx = canvas.getContext("2d") + let imageProcessed = false + + switch (kind) { + case pdfjsLib.ImageKind.GRAYSCALE_1BPP: + case pdfjsLib.ImageKind.RGB_24BPP: + case pdfjsLib.ImageKind.RGBA_32BPP: { + const bytesPerPixel = + kind === pdfjsLib.ImageKind.RGBA_32BPP + ? 4 + : kind === pdfjsLib.ImageKind.RGB_24BPP + ? 3 + : 1 + const expectedLength = width * height * bytesPerPixel + + if (uint8Data.length >= expectedLength) { const rgbaData = new Uint8ClampedArray( width * height * 4, ) - const bytesPerPixel = Math.floor( - uint8Data.length / (width * height), - ) - - if (bytesPerPixel >= 3) { - for (let i = 0; i < width * height; i++) { - const srcIdx = i * bytesPerPixel - const dstIdx = i * 4 + for (let i = 0; i < width * height; i++) { + const srcIdx = i * bytesPerPixel + const dstIdx = i * 4 + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { + const gray = + srcIdx < uint8Data.length ? 
uint8Data[srcIdx] : 0 + rgbaData[dstIdx] = gray // R + rgbaData[dstIdx + 1] = gray // G + rgbaData[dstIdx + 2] = gray // B + rgbaData[dstIdx + 3] = 255 // A + } else if (kind === pdfjsLib.ImageKind.RGB_24BPP) { rgbaData[dstIdx] = srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R rgbaData[dstIdx + 1] = @@ -410,107 +1012,295 @@ export async function extractTextAndImagesWithChunksFromPDF( ? uint8Data[srcIdx + 2] : 0 // B rgbaData[dstIdx + 3] = 255 // A + } else { + // RGBA_32BPP + rgbaData[dstIdx] = + srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = + srcIdx + 3 < uint8Data.length + ? uint8Data[srcIdx + 3] + : 255 // A } - const imageData = new ImageData(rgbaData, width) - ctx.putImageData(imageData, 0, 0) - imageProcessed = true } - } catch { - Logger.debug( - `Failed to process image ${imageName} with fallback method`, - ) + const imageData = new ImageData(rgbaData, width) + ctx.putImageData(imageData, 0, 0) + imageProcessed = true + } + break + } + default: { + try { + const imgBuffer = Buffer.from(uint8Data.buffer) + const img = new CanvasImage() + await new Promise((resolve, reject) => { + img.onload = () => resolve() + img.onerror = (err) => reject(err) + img.src = imgBuffer + }) + ctx.drawImage(img, 0, 0) + imageProcessed = true + } catch (err) { + try { + const rgbaData = new Uint8ClampedArray( + width * height * 4, + ) + const bytesPerPixel = Math.floor( + uint8Data.length / (width * height), + ) + + if (bytesPerPixel >= 3) { + for (let i = 0; i < width * height; i++) { + const srcIdx = i * bytesPerPixel + const dstIdx = i * 4 + rgbaData[dstIdx] = + srcIdx < uint8Data.length + ? uint8Data[srcIdx] + : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = 255 // A + } + const imageData = new ImageData(rgbaData, width) + ctx.putImageData(imageData, 0, 0) + imageProcessed = true + } + } catch { + Logger.debug( + `Failed to process image ${imageName} with fallback method`, + ) + } } } } - } - if (imageProcessed) { - const buffer = canvas.toBuffer("image/png") - // @ts-ignore - let type = await imageType(buffer) - if (!type) { - Logger.warn( - `Could not determine MIME type for ${imageName}. Defaulting to image/png`, + console.log( + "IMAGE DEBUG: Image processing result for", + imageName, + { + imageProcessed, + canvasWidth: canvas.width, + canvasHeight: canvas.height, + }, + ) + + if (imageProcessed) { + console.log( + "IMAGE DEBUG: Converting to PNG buffer for", + imageName, ) - type = { mime: "image/png", ext: "png" } - } - if ( - !type || - !DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) - ) { - Logger.warn( - `Unsupported or unknown image MIME type: ${type?.mime}. 
Skipping image: ${imageName}`, + const buffer = canvas.toBuffer("image/png") + console.log( + "IMAGE DEBUG: PNG buffer created for", + imageName, + "size:", + buffer.length, + "bytes", ) - continue - } - // buffer already created above - const imageHash = crypto - .createHash("md5") - .update(new Uint8Array(buffer)) - .digest("hex") + // @ts-ignore + let type = await imageType(buffer) + console.log( + "IMAGE DEBUG: Image type detection result for", + imageName, + type, + ) - let description: string + if (!type) { + console.log( + "IMAGE DEBUG: Could not determine MIME type for", + imageName, + "using default image/png", + ) + Logger.warn( + `Could not determine MIME type for ${imageName}. Defaulting to image/png`, + ) + type = { mime: "image/png", ext: "png" } + } - if (seenHashDescriptions.has(imageHash)) { - description = seenHashDescriptions.get(imageHash)! - Logger.warn( - `Reusing description for repeated image ${imageName} on page ${pageNum}`, + console.log( + "IMAGE DEBUG: Checking MIME type support for", + imageName, + { + detectedMime: type.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + isSupported: + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( + type.mime, + ), + }, ) - } else { - if(describeImages) { - description = await describeImageWithllm(buffer) - } else { - description = "This is an image." - } + if ( - description === "No description returned." || - description === "Image is not worth describing." + !type || + !DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) ) { + console.log( + "IMAGE DEBUG: SKIPPED - Unsupported MIME type for", + imageName, + { + detectedMime: type?.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + }, + ) Logger.warn( - `${description} ${imageName} on page ${pageNum}`, + `Unsupported or unknown image MIME type: ${type?.mime}. Skipping image: ${imageName}`, ) continue } - seenHashDescriptions.set(imageHash, description) - } - try { - // Save image to Downloads/xyne_images_db with improved error handling - const baseDir = path.resolve( - process.env.IMAGE_DIR || "downloads/xyne_images_db", + console.log( + "IMAGE DEBUG: MIME type check passed for", + imageName, + "proceeding with hash and description", ) - const outputDir = path.join(baseDir, docid) - await fsPromises.mkdir(outputDir, { recursive: true }) - const imageFilename = `${globalSeq.value}.${type.ext || "png"}` - const imagePath = path.join(outputDir, imageFilename) + // buffer already created above + const imageHash = crypto + .createHash("md5") + .update(new Uint8Array(buffer)) + .digest("hex") + + let description: string - await fsPromises.writeFile( - imagePath, - buffer as NodeJS.ArrayBufferView, + if (seenHashDescriptions.has(imageHash)) { + description = seenHashDescriptions.get(imageHash)! + console.log( + "IMAGE DEBUG: Reusing cached description for", + imageName, + "description:", + description, + ) + Logger.warn( + `Reusing description for repeated image ${imageName} on page ${pageNum}`, + ) + } else { + console.log( + "IMAGE DEBUG: Generating new description for", + imageName, + "describeImages:", + describeImages, + ) + if (describeImages) { + try { + console.log( + "AI DEBUG: Calling describeImageWithllm for image", + imageName, + ) + description = await describeImageWithllm(buffer) + console.log( + "AI DEBUG: Got description from AI for", + imageName, + "description:", + description, + ) + } catch (e) { + Logger.warn( + `describeImageWithllm failed for ${imageName}: ${e instanceof Error ? 
e.message : e}`, + ) + description = "This is an image from the PDF." + console.log( + "IMAGE DEBUG: Fallback description used due to AI error", + ) + } + } else { + description = "This is an image." + console.log( + "IMAGE DEBUG: Using default description (describeImages=false)", + ) + } + if ( + description === "No description returned." || + description === "Image is not worth describing." + ) { + console.log( + "IMAGE DEBUG: Replacing insufficient description for", + imageName, + "previous:", + description, + ) + Logger.warn( + `${description} ${imageName} on page ${pageNum}`, + ) + description = "Image extracted from PDF page." + } + seenHashDescriptions.set(imageHash, description) + console.log( + "IMAGE DEBUG: Cached new description for", + imageName, + "description:", + description, + ) + } + + try { + // Save image to Downloads/xyne_images_db with improved error handling + const baseDir = path.resolve( + process.env.IMAGE_DIR || "downloads/xyne_images_db", + ) + const outputDir = path.join(baseDir, docid) + await fsPromises.mkdir(outputDir, { recursive: true }) + + const imageFilename = `${globalSeq.value}.${type.ext || "png"}` + const imagePath = path.join(outputDir, imageFilename) + + await fsPromises.writeFile( + imagePath, + buffer as NodeJS.ArrayBufferView, + ) + Logger.info(`Saved image to: ${imagePath}`) + } catch (saveError) { + Logger.error( + `Failed to save image for ${imageName} on page ${pageNum}: ${saveError instanceof Error ? saveError.message : saveError}`, + ) + // Skip adding to chunks if save failed + continue + } + + image_chunks.push(description) + image_chunk_pos.push(globalSeq.value) + // Logger.info(`OVERLAP DEBUG: Adding image placeholder to crossImageOverlap. Before: "${crossImageOverlap}"`) + // console.log('OVERLAP DEBUG: crossImageOverlap before adding image placeholder:', crossImageOverlap) + crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + // Logger.info(`OVERLAP DEBUG: Added image placeholder to crossImageOverlap. After: "${crossImageOverlap}"`) + // console.log('OVERLAP DEBUG: crossImageOverlap after adding image placeholder:', crossImageOverlap) + console.log( + "IMAGE DEBUG: Added image chunk at position", + globalSeq.value, + { + imageName, + description, + crossImageOverlap, + }, ) - Logger.info(`Saved image to: ${imagePath}`) - } catch (saveError) { - Logger.error( - `Failed to save image for ${imageName} on page ${pageNum}: ${saveError instanceof Error ? 
saveError.message : saveError}`, + globalSeq.value++ + imagesOnPage += 1 + Logger.debug( + `Successfully processed image ${imageName} on page ${pageNum}`, ) - // Skip adding to chunks if save failed - continue } - - image_chunks.push(description) - image_chunk_pos.push(globalSeq.value) - crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` - globalSeq.value++ - Logger.debug( - `Successfully processed image ${imageName} on page ${pageNum}`, + } catch (error) { + Logger.warn( + `Failed to process image ${imageName} on page ${pageNum}: ${(error as Error).message}`, ) } - } catch (error) { - Logger.warn( - `Failed to process image ${imageName} on page ${pageNum}: ${(error as Error).message}`, - ) } break } @@ -520,6 +1310,8 @@ export async function extractTextAndImagesWithChunksFromPDF( } } + // Vector snapshot functionality removed (no longer creating fallback canvas) + // End of page: flush remaining paragraph and process paragraphs flushParagraph() const overlapText = processTextParagraphs( @@ -530,10 +1322,20 @@ export async function extractTextAndImagesWithChunksFromPDF( ) // Update cross-image overlap - APPEND instead of REPLACE to preserve image placeholders + // Logger.info(`OVERLAP DEBUG: End of page ${pageNum} - processing final overlap update`) + // console.log('OVERLAP DEBUG: Page', pageNum, 'end - overlapText from processTextParagraphs:', overlapText) + // console.log('OVERLAP DEBUG: Page', pageNum, 'end - crossImageOverlap before final update:', crossImageOverlap) if (overlapText.trim()) { + // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - overlapText has content, updating crossImageOverlap`) + const previousCrossImageOverlap = crossImageOverlap crossImageOverlap = crossImageOverlap ? `${crossImageOverlap} ${overlapText}` : overlapText + // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - crossImageOverlap updated from "${previousCrossImageOverlap}" to "${crossImageOverlap}"`) + // console.log('OVERLAP DEBUG: Page', pageNum, 'end - crossImageOverlap after final update:', crossImageOverlap) + } else { + // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - overlapText is empty, no update to crossImageOverlap`) + // console.log('OVERLAP DEBUG: Page', pageNum, 'end - no update to crossImageOverlap (overlapText empty)') } Logger.debug( @@ -549,6 +1351,20 @@ export async function extractTextAndImagesWithChunksFromPDF( `PDF processing completed. 
Total text chunks: ${text_chunks.length}, Total image chunks: ${image_chunks.length}`, ) + console.log("FINAL DEBUG: PDF processing completed for", docid) + console.log("FINAL DEBUG: Processing summary:", { + totalTextChunks: text_chunks.length, + totalImageChunks: image_chunks.length, + textChunkPositions: text_chunk_pos.length, + imageChunkPositions: image_chunk_pos.length, + extractImages, + describeImages, + }) + + console.log("FINAL DEBUG: All text chunks:", text_chunks) + console.log("FINAL DEBUG: All text chunk positions:", text_chunk_pos) + console.log("FINAL DEBUG: All image chunks:", image_chunks) + console.log("FINAL DEBUG: All image chunk positions:", image_chunk_pos) return { text_chunks, image_chunks, @@ -556,6 +1372,7 @@ export async function extractTextAndImagesWithChunksFromPDF( image_chunk_pos, } } finally { + console.log("Calling destroy") await pdfDocument.destroy() } } diff --git a/server/scripts/testPdfDirect.ts b/server/scripts/testPdfDirect.ts new file mode 100644 index 000000000..0f0adac30 --- /dev/null +++ b/server/scripts/testPdfDirect.ts @@ -0,0 +1,89 @@ +import { readFileSync } from "fs" +import { resolve } from "path" +import { FileProcessorService } from "@/services/fileProcessor" +import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks" + +async function testPdfDirect() { + let pdfPath = "/Users/aayush.shah/Downloads/small2.pdf" + // const pdfPath = "/Users/aayush.shah/Downloads/Aayush_Resume_2025.pdf" + pdfPath = "/Users/aayush.shah/Downloads/somatosensory.pdf" + try { + console.log("=== DIRECT PDF PROCESSING TEST ===") + console.log("PDF Path:", pdfPath) + + // Read the PDF file + console.log("\n1. Reading PDF file...") + const pdfBuffer = readFileSync(pdfPath) + console.log("File size:", pdfBuffer.length, "bytes") + + console.log("\n2. Testing direct PDF processing (current knowledge base flow)...") + console.log("This simulates exactly what happens in the knowledge base upload:") + console.log("- FileProcessorService.processFile() is called") + console.log("- extractImages defaults to false") + console.log("- describeImages defaults to false") + + // Test the exact flow used in knowledge base + const result = await FileProcessorService.processFile( + pdfBuffer, + "application/pdf", + "small2.pdf", + "test-doc-id", + pdfPath + // extractImages and describeImages default to false + ) + + console.log("\n=== RESULTS FROM KNOWLEDGE BASE FLOW ===") + console.log("Text chunks:", result.chunks.length) + console.log("Image chunks:", result.image_chunks.length) + console.log("Text chunk positions:", result.chunks_pos.length) + console.log("Image chunk positions:", result.image_chunks_pos.length) + + console.log("\n3. 
Testing with image processing enabled...") + console.log("Parameters: extractImages=true, describeImages=true") + + // Test with images enabled to see the difference + const imageResult = await extractTextAndImagesWithChunksFromPDF( + new Uint8Array(pdfBuffer), + "test-doc-with-images", + true, // extractImages enabled + true // describeImages enabled + ) + + console.log("\n=== RESULTS WITH IMAGES ENABLED ===") + console.log("Text chunks:", imageResult.text_chunks.length) + console.log("Image chunks:", imageResult.image_chunks.length) + console.log("Text chunk positions:", imageResult.text_chunk_pos.length) + console.log("Image chunk positions:", imageResult.image_chunk_pos.length) + + console.log("\n=== COMPARISON ===") + console.log("Current KB flow - Text chunks:", result.chunks.length, "Image chunks:", result.image_chunks.length) + console.log("With images - Text chunks:", imageResult.text_chunks.length, "Image chunks:", imageResult.image_chunks.length) + + if (result.chunks.length > 0) { + console.log("\n=== SAMPLE TEXT CHUNKS ===") + result.chunks.slice(0, 2).forEach((chunk, idx) => { + console.log(`\nText Chunk ${idx + 1}:`) + console.log(chunk) + }) + } + + if (imageResult.image_chunks.length > 0) { + console.log("\n=== SAMPLE IMAGE DESCRIPTIONS ===") + imageResult.image_chunks.forEach((chunk, idx) => { + console.log(`\nImage ${idx + 1}:`) + console.log(chunk) + }) + } + + console.log("\n=== TEST COMPLETED ===") + console.log("✓ Check the debug logs above from pdfChunks.ts") + console.log("✓ You can see exactly what's being processed in the current knowledge base flow") + + } catch (error) { + console.error("Error processing PDF:", error) + process.exit(1) + } +} + +// Run the test +testPdfDirect().catch(console.error) \ No newline at end of file From 60bfdb9babb0f9039543ab25512843ef8701b4e0 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 14:07:45 +0530 Subject: [PATCH 2/9] comment fixes --- server/pdfChunks.ts | 343 +++++++++++++++++++++++++++----------------- 1 file changed, 211 insertions(+), 132 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 0f7f11483..43c7f210e 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -16,6 +16,8 @@ const qcmsWasmPath = path.join(__dirname, "../node_modules/pdfjs-dist/wasm/") + "/" const seenHashDescriptions = new Map() const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || "150", 10) +// Minimum line height used for calculating line break detection tolerance (in PDF units) +const MIN_LINE_HEIGHT_FOR_TOLERANCE = 10 const Logger = getLogger(Subsystem.Integrations).child({ module: "pdfChunks", @@ -23,26 +25,7 @@ const Logger = getLogger(Subsystem.Integrations).child({ const PDFJS = pdfjsLib -// Utility function to clean text consistent with chunkTextByParagraph -// const cleanText = (str: string): string => { -// console.log('CLEAN TEXT DEBUG: Input string length:', str.length) -// console.log('CLEAN TEXT DEBUG: Input string:', str) -// const normalized = str.replace(/\r\n|\r/g, "\n") -// console.log('CLEAN TEXT DEBUG: After normalization length:', normalized.length) -// console.log('CLEAN TEXT DEBUG: After normalization:', normalized) - -// const cleaned = normalized.replace( -// /[\u0000-\u0008\u000B-\u000C\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF]/g, -// "", -// ) -// console.log('CLEAN TEXT DEBUG: After cleaning length:', cleaned.length) -// console.log('CLEAN TEXT DEBUG: Cleaned string:', cleaned) - -// return cleaned -// } - -//=== export 
function normalizeText(input: string): string { if (!input) return "" @@ -131,12 +114,13 @@ export function cleanText(input: string): string { s = s.replace(/[ \t]*\n[ \t]*/g, "\n") // Turn intra-paragraph newlines into spaces, preserve paragraph breaks - // 1) Mark paragraph breaks - s = s.replace(/\n{2,}/g, "[[PARA]]") + // 1) Mark paragraph breaks with a unique placeholder + const uniqueParaPlaceholder = `\uE000XYNE_PARA_BREAK_${Math.random().toString(36).substring(2)}\uE001` + s = s.replace(/\n{2,}/g, uniqueParaPlaceholder) // 2) Collapse remaining newlines (soft wraps) into spaces s = s.replace(/\n+/g, " ") // 3) Restore paragraph breaks - s = s.replace(/\[\[PARA\]\]/g, "\n\n") + s = s.replace(new RegExp(uniqueParaPlaceholder.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'), "\n\n") // Apply line-wise despacing s = s @@ -160,6 +144,47 @@ export function cleanText(input: string): string { return s.trim() } +// ============================= +// 4. Matrix transformation utilities +// ============================= + +/** + * Multiply two 2D transformation matrices + * Each matrix is represented as [a, b, c, d, e, f] corresponding to: + * [a c e] + * [b d f] + * [0 0 1] + */ +function multiplyMatrices( + m1: number[], + m2: number[], +): [number, number, number, number, number, number] { + const [a1, b1, c1, d1, e1, f1] = m1 as [ + number, + number, + number, + number, + number, + number, + ] + const [a2, b2, c2, d2, e2, f2] = m2 as [ + number, + number, + number, + number, + number, + number, + ] + return [ + a1 * a2 + c1 * b2, + b1 * a2 + d1 * b2, + a1 * c2 + c1 * d2, + b1 * c2 + d1 * d2, + a1 * e2 + c1 * f2 + e1, + b1 * e2 + d1 * f2 + f1, + ] +} + //=== /** @@ -290,6 +315,8 @@ export async function extractTextAndImagesWithChunksFromPDF( text_chunk_pos: number[] image_chunk_pos: number[] }> { + // Sanitize docid for safe filesystem use + const safeDocId = docid.replace(/[^a-zA-Z0-9._-]/g, "_") Logger.info(`Starting PDF processing for: ${docid}`) console.log("PDF DEBUG: Starting processing with parameters:", { docid, @@ -398,6 +425,65 @@ export async function extractTextAndImagesWithChunksFromPDF( return paragraphs.map(cleanText).filter((p) => p.length > 0) } + // Extract text from operators as fallback for edge cases + const extractFallbackTextFromOperators = ( + opList: any, + ): string[] => { + const fallbackLines: string[] = [] + + for (let i = 0; i < opList.fnArray.length; i++) { + const fnId = opList.fnArray[i] + const args = opList.argsArray[i] + + // Handle text operators + if ( + fnId === PDFJS.OPS.showText || + fnId === PDFJS.OPS.showSpacedText || + fnId === PDFJS.OPS.nextLineShowText || + fnId === PDFJS.OPS.nextLineSetSpacingShowText + ) { + const extractedText = extractTextFromArgs(args) + if (extractedText.trim()) { + fallbackLines.push(extractedText.trim()) + } + } + } + + return fallbackLines + } + + // Combine and deduplicate text from multiple sources + const combineTextSources = ( + primaryParagraphs: string[], + fallbackLines: string[], + ): string[] => { + if (fallbackLines.length === 0) { + return primaryParagraphs + } + + const primaryText = primaryParagraphs.join(" ").toLowerCase() + const additionalLines: string[] = [] + + // Add fallback lines that aren't already covered by primary extraction + for (const line of fallbackLines) { + const cleanLine = line.trim() + if ( + cleanLine.length > 2 && // Skip very short strings + !primaryText.includes(cleanLine.toLowerCase()) + ) { + additionalLines.push(cleanLine) + } + } + + // If we found additional text, append it as a 
new paragraph + if (additionalLines.length > 0) { + const additionalParagraph = additionalLines.join(" ") + return [...primaryParagraphs, additionalParagraph] + } + + return primaryParagraphs + } + for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) { Logger.debug(`Processing page ${pageNum}`) @@ -405,11 +491,25 @@ export async function extractTextAndImagesWithChunksFromPDF( try { const opList = await page.getOperatorList() - // Use textContent-based paragraphs for this page - let paragraphs: string[] = await buildParagraphsFromPage(page) + // Use textContent-based paragraphs for this page as primary source + let primaryParagraphs: string[] = await buildParagraphsFromPage(page) + + // Extract fallback text from operators for edge cases + const fallbackLines = extractFallbackTextFromOperators(opList) + + // Combine both sources, prioritizing primary extraction + let paragraphs: string[] = combineTextSources(primaryParagraphs, fallbackLines) + let currentParagraph = "" // kept for image-flow flush, but not used for text let textOperatorCount = (await page.getTextContent()).items.length + console.log("TEXT DEBUG: Text extraction summary for page", pageNum, { + primaryParagraphs: primaryParagraphs.length, + fallbackLines: fallbackLines.length, + finalParagraphs: paragraphs.length, + textOperatorCount, + }) + // Helper: try to resolve image object by name directly from page.objs const resolveImageByName = async ( name: string, @@ -437,51 +537,6 @@ export async function extractTextAndImagesWithChunksFromPDF( ] const ctmStack: [number, number, number, number, number, number][] = [] - const mul = ( - m1: number[], - m2: number[], - ): [number, number, number, number, number, number] => { - const [a1, b1, c1, d1, e1, f1] = m1 as [ - number, - number, - number, - number, - number, - number, - ] - const [a2, b2, c2, d2, e2, f2] = m2 as [ - number, - number, - number, - number, - number, - number, - ] - return [ - a1 * a2 + c1 * b2, - b1 * a2 + d1 * b2, - a1 * c2 + c1 * d2, - b1 * c2 + d1 * d2, - a1 * e2 + c1 * f2 + e1, - b1 * e2 + d1 * f2 + f1, - ] - } - - const applyToPoint = ( - m: number[], - x: number, - y: number, - ): { x: number; y: number } => { - const [a, b, c, d, e, f] = m as [ - number, - number, - number, - number, - number, - number, - ] - return { x: a * x + c * y + e, y: b * x + d * y + f } - } // Do not inject crossImageOverlap into text paragraphs here // console.log('OVERLAP DEBUG: Page', pageNum, 'crossImageOverlap at start:', crossImageOverlap) @@ -500,10 +555,6 @@ export async function extractTextAndImagesWithChunksFromPDF( const fnId = opList.fnArray[i] const args = opList.argsArray[i] - // console.log(PDFJS.OPS.paintImageXObject , "PDFJS.OPS.paintImageXObject") - // console.log(PDFJS.OPS.paintImageXObjectRepeat , "PDFJS.OPS.paintImageXObjectRepeat") - // console.log(PDFJS.OPS.paintInlineImageXObject , "PDFJS.OPS.paintInlineImageXObject") - // console.log(PDFJS.OPS.paintImageMaskXObject , "PDFJS.OPS.paintImageMaskXObject") // Track vector drawing operators (paths, fills, form XObjects) const isVectorOp = @@ -531,7 +582,8 @@ export async function extractTextAndImagesWithChunksFromPDF( case PDFJS.OPS.nextLine: case PDFJS.OPS.nextLineShowText: case PDFJS.OPS.nextLineSetSpacingShowText: { - // Text handled via getTextContent; ignore operator-driven text + // Text is now handled by combined extraction approach + // Operator-level extraction happens in extractFallbackTextFromOperators break } // Handle matrix and positioning operators that might indicate paragraph 
breaks @@ -547,7 +599,7 @@ export async function extractTextAndImagesWithChunksFromPDF( args.length >= 6 && args.every((n: any) => typeof n === "number") ) { - currentCTM = mul(currentCTM, args as number[]) + currentCTM = multiplyMatrices(currentCTM, args as number[]) } } catch {} } @@ -561,15 +613,11 @@ export async function extractTextAndImagesWithChunksFromPDF( if (ctmStack.length) currentCTM = ctmStack.pop()! break } - // Handle image operators - be more comprehensive + // Handle image operators case extractImages ? PDFJS.OPS.paintImageXObject : null: case extractImages ? PDFJS.OPS.paintImageXObjectRepeat : null: case extractImages ? PDFJS.OPS.paintInlineImageXObject : null: - case extractImages ? PDFJS.OPS.paintImageMaskXObject : null: - case extractImages ? 83 : null: - case extractImages ? 85 : null: - case extractImages ? 86 : null: - case extractImages ? 88 : null: { + case extractImages ? PDFJS.OPS.paintImageMaskXObject : null: { console.log( "IMAGE DEBUG: Image operator detected on page", pageNum, @@ -609,7 +657,7 @@ export async function extractTextAndImagesWithChunksFromPDF( PDFJS.OPS.paintInlineImageXObject, "PDFJS.OPS.paintInlineImageXObject", ) - if (fnId === PDFJS.OPS.paintInlineImageXObject || fnId === 86) { + if (fnId === PDFJS.OPS.paintInlineImageXObject) { console.log("IMAGE DEBUG: Detected inline image data in args") const candidate = Array.isArray(args) ? args.find( @@ -725,7 +773,7 @@ export async function extractTextAndImagesWithChunksFromPDF( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, docid) + const outputDir = path.join(baseDir, safeDocId) await fsPromises.mkdir(outputDir, { recursive: true, }) @@ -818,7 +866,7 @@ export async function extractTextAndImagesWithChunksFromPDF( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, docid) + const outputDir = path.join(baseDir, safeDocId) await fsPromises.mkdir(outputDir, { recursive: true, }) @@ -978,59 +1026,81 @@ export async function extractTextAndImagesWithChunksFromPDF( case pdfjsLib.ImageKind.GRAYSCALE_1BPP: case pdfjsLib.ImageKind.RGB_24BPP: case pdfjsLib.ImageKind.RGBA_32BPP: { - const bytesPerPixel = - kind === pdfjsLib.ImageKind.RGBA_32BPP - ? 4 - : kind === pdfjsLib.ImageKind.RGB_24BPP - ? 3 - : 1 - const expectedLength = width * height * bytesPerPixel + let expectedLength: number + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { + // 1 bit per pixel, packed into bytes + expectedLength = Math.ceil((width * height) / 8) + } else { + const bytesPerPixel = + kind === pdfjsLib.ImageKind.RGBA_32BPP + ? 4 + : 3 // RGB_24BPP + expectedLength = width * height * bytesPerPixel + } if (uint8Data.length >= expectedLength) { const rgbaData = new Uint8ClampedArray( width * height * 4, ) - for (let i = 0; i < width * height; i++) { - const srcIdx = i * bytesPerPixel - const dstIdx = i * 4 - if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { - const gray = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 - rgbaData[dstIdx] = gray // R - rgbaData[dstIdx + 1] = gray // G - rgbaData[dstIdx + 2] = gray // B - rgbaData[dstIdx + 3] = 255 // A - } else if (kind === pdfjsLib.ImageKind.RGB_24BPP) { - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? 
uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = 255 // A - } else { - // RGBA_32BPP - rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R - rgbaData[dstIdx + 1] = - srcIdx + 1 < uint8Data.length - ? uint8Data[srcIdx + 1] - : 0 // G - rgbaData[dstIdx + 2] = - srcIdx + 2 < uint8Data.length - ? uint8Data[srcIdx + 2] - : 0 // B - rgbaData[dstIdx + 3] = - srcIdx + 3 < uint8Data.length - ? uint8Data[srcIdx + 3] - : 255 // A + + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { + // Handle 1 bit per pixel grayscale (bit-packed data) + let pixelIndex = 0 + for (let y = 0; y < height; y++) { + for (let x = 0; x < width; x++) { + const byteIndex = Math.floor(pixelIndex / 8) + const bitIndex = 7 - (pixelIndex % 8) // MSB first + const bit = byteIndex < uint8Data.length + ? (uint8Data[byteIndex] >> bitIndex) & 1 + : 0 + const gray = bit ? 255 : 0 // Convert bit to full pixel value + + const dstIdx = pixelIndex * 4 + rgbaData[dstIdx] = gray // R + rgbaData[dstIdx + 1] = gray // G + rgbaData[dstIdx + 2] = gray // B + rgbaData[dstIdx + 3] = 255 // A + pixelIndex++ + } + } + } else { + // Handle RGB_24BPP and RGBA_32BPP (byte-per-channel data) + const bytesPerPixel = kind === pdfjsLib.ImageKind.RGBA_32BPP ? 4 : 3 + for (let i = 0; i < width * height; i++) { + const srcIdx = i * bytesPerPixel + const dstIdx = i * 4 + if (kind === pdfjsLib.ImageKind.RGB_24BPP) { + rgbaData[dstIdx] = + srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = 255 // A + } else { + // RGBA_32BPP + rgbaData[dstIdx] = + srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + rgbaData[dstIdx + 1] = + srcIdx + 1 < uint8Data.length + ? uint8Data[srcIdx + 1] + : 0 // G + rgbaData[dstIdx + 2] = + srcIdx + 2 < uint8Data.length + ? uint8Data[srcIdx + 2] + : 0 // B + rgbaData[dstIdx + 3] = + srcIdx + 3 < uint8Data.length + ? 
uint8Data[srcIdx + 3] + : 255 // A + } } } - const imageData = new ImageData(rgbaData, width) + const imageData = new ImageData(rgbaData, width, height) ctx.putImageData(imageData, 0, 0) imageProcessed = true } @@ -1074,7 +1144,7 @@ export async function extractTextAndImagesWithChunksFromPDF( : 0 // B rgbaData[dstIdx + 3] = 255 // A } - const imageData = new ImageData(rgbaData, width) + const imageData = new ImageData(rgbaData, width, height) ctx.putImageData(imageData, 0, 0) imageProcessed = true } @@ -1103,6 +1173,15 @@ export async function extractTextAndImagesWithChunksFromPDF( imageName, ) const buffer = canvas.toBuffer("image/png") + if ( + buffer.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn( + `Skipping encoded image > ${DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB} MB (size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB)`, + ) + continue + } console.log( "IMAGE DEBUG: PNG buffer created for", imageName, @@ -1255,7 +1334,7 @@ export async function extractTextAndImagesWithChunksFromPDF( const baseDir = path.resolve( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, docid) + const outputDir = path.join(baseDir, safeDocId) await fsPromises.mkdir(outputDir, { recursive: true }) const imageFilename = `${globalSeq.value}.${type.ext || "png"}` From 29508fa9a7d91ed4759156b969b7c264d16fec78 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 15:18:01 +0530 Subject: [PATCH 3/9] comment fixes --- server/pdfChunks.ts | 406 +++++++++++++++++++------------------------- 1 file changed, 170 insertions(+), 236 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 43c7f210e..f3b5675e0 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -25,8 +25,6 @@ const Logger = getLogger(Subsystem.Integrations).child({ const PDFJS = pdfjsLib - - export function normalizeText(input: string): string { if (!input) return "" @@ -43,9 +41,9 @@ export function normalizeText(input: string): string { return normalized.trim() } -// ========================================= + // 2. Smart letter-spacing collapse (per line) -// ========================================= + function smartDespaceLine(line: string): string { if (!line) return line @@ -101,9 +99,9 @@ function smartDespaceLine(line: string): string { return out.join("") } -// ============================= + // 3. High-level text cleaner -// ============================= + export function cleanText(input: string): string { let s = normalizeText(input) @@ -120,7 +118,13 @@ export function cleanText(input: string): string { // 2) Collapse remaining newlines (soft wraps) into spaces s = s.replace(/\n+/g, " ") // 3) Restore paragraph breaks - s = s.replace(new RegExp(uniqueParaPlaceholder.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'), "\n\n") + s = s.replace( + new RegExp( + uniqueParaPlaceholder.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), + "g", + ), + "\n\n", + ) // Apply line-wise despacing s = s @@ -144,9 +148,9 @@ export function cleanText(input: string): string { return s.trim() } -// ============================= + // 4. Matrix transformation utilities -// ============================= + /** * Multiply two 2D transformation matrices @@ -234,7 +238,6 @@ function extractTextFromArgs(args: any[]): string { // Additional validation: ensure we return clean, valid text const result = typeof text === "string" ? 
text : "" - console.log("EXTRACT TEXT DEBUG: Final extracted text:", result) return result } @@ -249,10 +252,10 @@ function processTextParagraphs( globalSeq: { value: number }, overlapBytes: number = 32, ): string { - console.log("TEXT DEBUG: Processing paragraphs count:", paragraphs.length) + Logger.debug("Processing paragraphs", { count: paragraphs.length }) if (paragraphs.length === 0) { - console.log("TEXT DEBUG: No paragraphs to process") + Logger.debug("No paragraphs to process") return "" } @@ -260,7 +263,7 @@ function processTextParagraphs( .map(cleanText) .filter((p) => p.length > 0) if (cleanedParagraphs.length === 0) { - console.log("TEXT DEBUG: No cleaned paragraphs after filtering") + Logger.debug("No cleaned paragraphs after filtering") return "" } @@ -317,8 +320,7 @@ export async function extractTextAndImagesWithChunksFromPDF( }> { // Sanitize docid for safe filesystem use const safeDocId = docid.replace(/[^a-zA-Z0-9._-]/g, "_") - Logger.info(`Starting PDF processing for: ${docid}`) - console.log("PDF DEBUG: Starting processing with parameters:", { + Logger.debug("Starting processing with parameters", { docid, extractImages, describeImages, @@ -426,11 +428,9 @@ export async function extractTextAndImagesWithChunksFromPDF( } // Extract text from operators as fallback for edge cases - const extractFallbackTextFromOperators = ( - opList: any, - ): string[] => { + const extractFallbackTextFromOperators = (opList: any): string[] => { const fallbackLines: string[] = [] - + for (let i = 0; i < opList.fnArray.length; i++) { const fnId = opList.fnArray[i] const args = opList.argsArray[i] @@ -493,17 +493,21 @@ export async function extractTextAndImagesWithChunksFromPDF( // Use textContent-based paragraphs for this page as primary source let primaryParagraphs: string[] = await buildParagraphsFromPage(page) - + // Extract fallback text from operators for edge cases const fallbackLines = extractFallbackTextFromOperators(opList) - + // Combine both sources, prioritizing primary extraction - let paragraphs: string[] = combineTextSources(primaryParagraphs, fallbackLines) - + let paragraphs: string[] = combineTextSources( + primaryParagraphs, + fallbackLines, + ) + let currentParagraph = "" // kept for image-flow flush, but not used for text let textOperatorCount = (await page.getTextContent()).items.length - console.log("TEXT DEBUG: Text extraction summary for page", pageNum, { + Logger.debug("Text extraction summary for page", { + pageNum, primaryParagraphs: primaryParagraphs.length, fallbackLines: fallbackLines.length, finalParagraphs: paragraphs.length, @@ -537,7 +541,6 @@ export async function extractTextAndImagesWithChunksFromPDF( ] const ctmStack: [number, number, number, number, number, number][] = [] - // Do not inject crossImageOverlap into text paragraphs here // console.log('OVERLAP DEBUG: Page', pageNum, 'crossImageOverlap at start:', crossImageOverlap) @@ -555,7 +558,6 @@ export async function extractTextAndImagesWithChunksFromPDF( const fnId = opList.fnArray[i] const args = opList.argsArray[i] - // Track vector drawing operators (paths, fills, form XObjects) const isVectorOp = fnId === PDFJS.OPS.constructPath || @@ -618,21 +620,12 @@ export async function extractTextAndImagesWithChunksFromPDF( case extractImages ? PDFJS.OPS.paintImageXObjectRepeat : null: case extractImages ? PDFJS.OPS.paintInlineImageXObject : null: case extractImages ? 
PDFJS.OPS.paintImageMaskXObject : null: { - console.log( - "IMAGE DEBUG: Image operator detected on page", + Logger.debug("Image operator detected", { pageNum, - { - extractImages, - operatorType: fnId, - imageName: args[0], - knownOperators: { - paintImageXObject: PDFJS.OPS.paintImageXObject, - paintImageXObjectRepeat: PDFJS.OPS.paintImageXObjectRepeat, - paintInlineImageXObject: PDFJS.OPS.paintInlineImageXObject, - paintImageMaskXObject: PDFJS.OPS.paintImageMaskXObject, - }, - }, - ) + extractImages, + operatorType: fnId, + imageName: args[0], + }) // Do not process text per-image anymore; text is processed once per page. // Maintain crossImageOverlap continuity by keeping placeholders only. @@ -647,18 +640,17 @@ export async function extractTextAndImagesWithChunksFromPDF( typeof args[0].name === "string" ? args[0].name : args?.[0] - console.log("IMAGE DEBUG: Processing image:", imageName) + Logger.debug("Processing image", { imageName }) let imageDict: any | null = null let isInline = false // Inline image may directly carry data in args - console.log("IMAGE DEBUG: Initial args for image operator:", args) - console.log("IMAGE DEBUG: fnId for image operator:", fnId) - console.log( - PDFJS.OPS.paintInlineImageXObject, - "PDFJS.OPS.paintInlineImageXObject", - ) + Logger.debug("Image operator details", { + args: args.length, + fnId, + paintInlineImageXObject: PDFJS.OPS.paintInlineImageXObject, + }) if (fnId === PDFJS.OPS.paintInlineImageXObject) { - console.log("IMAGE DEBUG: Detected inline image data in args") + Logger.debug("Detected inline image data in args") const candidate = Array.isArray(args) ? args.find( (a: any) => @@ -674,10 +666,10 @@ export async function extractTextAndImagesWithChunksFromPDF( isInline = true } } - console.log( - "IMAGE DEBUG: Initial imageDict resolved from args:", - imageDict, - ) + Logger.debug("Initial imageDict resolved", { + hasImageDict: !!imageDict, + isInline, + }) if ( !imageDict && (typeof imageName === "string" || @@ -697,16 +689,14 @@ export async function extractTextAndImagesWithChunksFromPDF( ) continue } - console.log("IMAGE DEBUG: Resolved imageDict:", { - imageDict, + Logger.debug("Resolved imageDict", { + hasImageDict: !!imageDict, isInline, }) // Ensure imageDict is valid before processing if (!imageDict || typeof imageDict !== "object") { - console.log( - "IMAGE DEBUG: imageDict is null or invalid, skipping to crop fallback", - ) + Logger.debug("imageDict is null or invalid, skipping to crop fallback") // This will fall through to the crop fallback logic below } else { try { @@ -727,11 +717,12 @@ export async function extractTextAndImagesWithChunksFromPDF( const width: number = c.width const height: number = c.height if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { - console.log( - "IMAGE DEBUG: SKIPPED - Small dimensions from canvas for", + Logger.debug("Skipped small canvas image", { imageName, - { width, height }, - ) + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) } else { const buffer = c.toBuffer("image/png") if ( @@ -819,11 +810,12 @@ export async function extractTextAndImagesWithChunksFromPDF( const width: number = imgLike.width const height: number = imgLike.height if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { - console.log( - "IMAGE DEBUG: SKIPPED - Small dimensions from image-like for", + Logger.debug("Skipped small image-like object", { imageName, - { width, height }, - ) + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) } else { const cnv = createCanvas(width, height) const cctx = 
cnv.getContext("2d") @@ -918,43 +910,33 @@ export async function extractTextAndImagesWithChunksFromPDF( imageDict.bytes ?? (imageDict.imgData ? imageDict.imgData.data : undefined) - console.log( - "IMAGE DEBUG: Full image details for", + Logger.debug("Full image details", { imageName, - { - width, - height, - kind, - dataLength: rawData ? rawData.length : null, - dataSizeMB: rawData - ? (rawData.length / (1024 * 1024)).toFixed(2) - : null, - maxAllowedSizeMB: - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, - minDimension: MIN_IMAGE_DIM_PX, - isValidDimensions: width > 0 && height > 0, - meetsMinSize: - width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, - withinSizeLimit: rawData - ? rawData.length <= - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 - : false, - isInline, - }, - ) + width, + height, + kind, + dataLength: rawData ? rawData.length : null, + dataSizeMB: rawData + ? (rawData.length / (1024 * 1024)).toFixed(2) + : null, + maxAllowedSizeMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + minDimension: MIN_IMAGE_DIM_PX, + isValidDimensions: width > 0 && height > 0, + meetsMinSize: + width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, + withinSizeLimit: rawData + ? rawData.length <= + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + : false, + isInline, + }) if (!width || !height || width <= 0 || height <= 0) { - console.log( - "IMAGE DEBUG: SKIPPED - Invalid dimensions for", + Logger.debug("Skipped image with invalid dimensions", { imageName, - "width:", width, - "height:", height, - ) - Logger.debug( - `Invalid image dimensions for ${imageName}: ${width}x${height}`, - ) + }) continue } @@ -963,46 +945,27 @@ export async function extractTextAndImagesWithChunksFromPDF( rawData.length > DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 ) { - console.log( - "IMAGE DEBUG: SKIPPED - Large file size for", + Logger.warn("Skipped large image", { imageName, - { - actualSizeMB: (rawData.length / (1024 * 1024)).toFixed( - 2, - ), - maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, - actualBytes: rawData.length, - maxAllowedBytes: - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * - 1024 * - 1024, - }, - ) - Logger.warn( - `Skipping large image (${(rawData.length / (1024 * 1024)).toFixed(2)} MB): ${imageName}`, - ) + actualSizeMB: (rawData.length / (1024 * 1024)).toFixed(2), + maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + }) continue } if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { - console.log( - "IMAGE DEBUG: SKIPPED - Small dimensions for", + Logger.debug("Skipped small image", { imageName, - { - width, - height, - minRequired: MIN_IMAGE_DIM_PX, - widthTooSmall: width < MIN_IMAGE_DIM_PX, - heightTooSmall: height < MIN_IMAGE_DIM_PX, - }, - ) + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) continue // Skip small images } - console.log( - "IMAGE DEBUG: Image passed all filters, proceeding with processing for", + Logger.debug("Image passed all filters, proceeding with processing", { imageName, - ) + }) let uint8Data: Uint8Array if (rawData instanceof Uint8Array) { @@ -1032,9 +995,7 @@ export async function extractTextAndImagesWithChunksFromPDF( expectedLength = Math.ceil((width * height) / 8) } else { const bytesPerPixel = - kind === pdfjsLib.ImageKind.RGBA_32BPP - ? 4 - : 3 // RGB_24BPP + kind === pdfjsLib.ImageKind.RGBA_32BPP ? 
4 : 3 // RGB_24BPP expectedLength = width * height * bytesPerPixel } @@ -1042,7 +1003,7 @@ export async function extractTextAndImagesWithChunksFromPDF( const rgbaData = new Uint8ClampedArray( width * height * 4, ) - + if (kind === pdfjsLib.ImageKind.GRAYSCALE_1BPP) { // Handle 1 bit per pixel grayscale (bit-packed data) let pixelIndex = 0 @@ -1050,28 +1011,32 @@ export async function extractTextAndImagesWithChunksFromPDF( for (let x = 0; x < width; x++) { const byteIndex = Math.floor(pixelIndex / 8) const bitIndex = 7 - (pixelIndex % 8) // MSB first - const bit = byteIndex < uint8Data.length - ? (uint8Data[byteIndex] >> bitIndex) & 1 - : 0 + const bit = + byteIndex < uint8Data.length + ? (uint8Data[byteIndex] >> bitIndex) & 1 + : 0 const gray = bit ? 255 : 0 // Convert bit to full pixel value - + const dstIdx = pixelIndex * 4 - rgbaData[dstIdx] = gray // R + rgbaData[dstIdx] = gray // R rgbaData[dstIdx + 1] = gray // G rgbaData[dstIdx + 2] = gray // B - rgbaData[dstIdx + 3] = 255 // A + rgbaData[dstIdx + 3] = 255 // A pixelIndex++ } } } else { // Handle RGB_24BPP and RGBA_32BPP (byte-per-channel data) - const bytesPerPixel = kind === pdfjsLib.ImageKind.RGBA_32BPP ? 4 : 3 + const bytesPerPixel = + kind === pdfjsLib.ImageKind.RGBA_32BPP ? 4 : 3 for (let i = 0; i < width * height; i++) { const srcIdx = i * bytesPerPixel const dstIdx = i * 4 if (kind === pdfjsLib.ImageKind.RGB_24BPP) { rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + srcIdx < uint8Data.length + ? uint8Data[srcIdx] + : 0 // R rgbaData[dstIdx + 1] = srcIdx + 1 < uint8Data.length ? uint8Data[srcIdx + 1] @@ -1084,7 +1049,9 @@ export async function extractTextAndImagesWithChunksFromPDF( } else { // RGBA_32BPP rgbaData[dstIdx] = - srcIdx < uint8Data.length ? uint8Data[srcIdx] : 0 // R + srcIdx < uint8Data.length + ? uint8Data[srcIdx] + : 0 // R rgbaData[dstIdx + 1] = srcIdx + 1 < uint8Data.length ? 
uint8Data[srcIdx + 1] @@ -1144,7 +1111,11 @@ export async function extractTextAndImagesWithChunksFromPDF( : 0 // B rgbaData[dstIdx + 3] = 255 // A } - const imageData = new ImageData(rgbaData, width, height) + const imageData = new ImageData( + rgbaData, + width, + height, + ) ctx.putImageData(imageData, 0, 0) imageProcessed = true } @@ -1157,21 +1128,15 @@ export async function extractTextAndImagesWithChunksFromPDF( } } - console.log( - "IMAGE DEBUG: Image processing result for", + Logger.debug("Image processing result", { imageName, - { - imageProcessed, - canvasWidth: canvas.width, - canvasHeight: canvas.height, - }, - ) + imageProcessed, + canvasWidth: canvas.width, + canvasHeight: canvas.height, + }) if (imageProcessed) { - console.log( - "IMAGE DEBUG: Converting to PNG buffer for", - imageName, - ) + Logger.debug("Converting to PNG buffer", { imageName }) const buffer = canvas.toBuffer("image/png") if ( buffer.length > @@ -1182,74 +1147,61 @@ export async function extractTextAndImagesWithChunksFromPDF( ) continue } - console.log( - "IMAGE DEBUG: PNG buffer created for", + Logger.debug("PNG buffer created", { imageName, - "size:", - buffer.length, - "bytes", - ) + size: buffer.length, + }) // @ts-ignore let type = await imageType(buffer) - console.log( - "IMAGE DEBUG: Image type detection result for", + Logger.debug("Image type detection result", { imageName, type, - ) + }) if (!type) { - console.log( - "IMAGE DEBUG: Could not determine MIME type for", + Logger.debug("Could not determine MIME type, using default", { imageName, - "using default image/png", - ) + default: "image/png", + }) Logger.warn( `Could not determine MIME type for ${imageName}. Defaulting to image/png`, ) type = { mime: "image/png", ext: "png" } } - console.log( - "IMAGE DEBUG: Checking MIME type support for", + Logger.debug("Checking MIME type support", { imageName, - { - detectedMime: type.mime, - supportedMimes: Array.from( - DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + detectedMime: type.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + isSupported: + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( + type.mime, ), - isSupported: - DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( - type.mime, - ), - }, - ) + }) if ( !type || !DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime) ) { - console.log( - "IMAGE DEBUG: SKIPPED - Unsupported MIME type for", + Logger.debug("Skipped image with unsupported MIME type", { imageName, - { - detectedMime: type?.mime, - supportedMimes: Array.from( - DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, - ), - }, - ) + detectedMime: type?.mime, + supportedMimes: Array.from( + DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, + ), + }) Logger.warn( `Unsupported or unknown image MIME type: ${type?.mime}. Skipping image: ${imageName}`, ) continue } - console.log( - "IMAGE DEBUG: MIME type check passed for", + Logger.debug("MIME type check passed, proceeding with processing", { imageName, - "proceeding with hash and description", - ) + }) // buffer already created above const imageHash = crypto @@ -1261,72 +1213,57 @@ export async function extractTextAndImagesWithChunksFromPDF( if (seenHashDescriptions.has(imageHash)) { description = seenHashDescriptions.get(imageHash)! 
- console.log( - "IMAGE DEBUG: Reusing cached description for", + Logger.debug("Reusing cached description for image", { imageName, - "description:", description, - ) + }) Logger.warn( `Reusing description for repeated image ${imageName} on page ${pageNum}`, ) } else { - console.log( - "IMAGE DEBUG: Generating new description for", + Logger.debug("Generating new description for image", { imageName, - "describeImages:", describeImages, - ) + }) if (describeImages) { try { - console.log( - "AI DEBUG: Calling describeImageWithllm for image", + Logger.debug("Calling describeImageWithllm for image", { imageName, - ) + }) description = await describeImageWithllm(buffer) - console.log( - "AI DEBUG: Got description from AI for", + Logger.debug("Got description from AI for image", { imageName, - "description:", description, - ) + }) } catch (e) { Logger.warn( `describeImageWithllm failed for ${imageName}: ${e instanceof Error ? e.message : e}`, ) description = "This is an image from the PDF." - console.log( - "IMAGE DEBUG: Fallback description used due to AI error", - ) + Logger.debug("Using fallback description due to AI error") } } else { description = "This is an image." - console.log( - "IMAGE DEBUG: Using default description (describeImages=false)", - ) + Logger.debug("Using default description (describeImages=false)") } if ( description === "No description returned." || description === "Image is not worth describing." ) { - console.log( - "IMAGE DEBUG: Replacing insufficient description for", + Logger.debug("Replacing insufficient description", { imageName, - "previous:", - description, - ) + previousDescription: description, + }) Logger.warn( `${description} ${imageName} on page ${pageNum}`, ) description = "Image extracted from PDF page." } seenHashDescriptions.set(imageHash, description) - console.log( - "IMAGE DEBUG: Cached new description for", + Logger.debug("Cached new description for image", { imageName, - "description:", description, - ) + }) } try { @@ -1360,15 +1297,12 @@ export async function extractTextAndImagesWithChunksFromPDF( crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` // Logger.info(`OVERLAP DEBUG: Added image placeholder to crossImageOverlap. After: "${crossImageOverlap}"`) // console.log('OVERLAP DEBUG: crossImageOverlap after adding image placeholder:', crossImageOverlap) - console.log( - "IMAGE DEBUG: Added image chunk at position", - globalSeq.value, - { - imageName, - description, - crossImageOverlap, - }, - ) + Logger.debug("Added image chunk at position", { + position: globalSeq.value, + imageName, + description, + crossImageOverlap, + }) globalSeq.value++ imagesOnPage += 1 Logger.debug( @@ -1430,8 +1364,8 @@ export async function extractTextAndImagesWithChunksFromPDF( `PDF processing completed. 
Total text chunks: ${text_chunks.length}, Total image chunks: ${image_chunks.length}`, ) - console.log("FINAL DEBUG: PDF processing completed for", docid) - console.log("FINAL DEBUG: Processing summary:", { + Logger.debug("PDF processing completed for document", { docid }) + Logger.debug("Processing summary", { totalTextChunks: text_chunks.length, totalImageChunks: image_chunks.length, textChunkPositions: text_chunk_pos.length, @@ -1440,10 +1374,10 @@ export async function extractTextAndImagesWithChunksFromPDF( describeImages, }) - console.log("FINAL DEBUG: All text chunks:", text_chunks) - console.log("FINAL DEBUG: All text chunk positions:", text_chunk_pos) - console.log("FINAL DEBUG: All image chunks:", image_chunks) - console.log("FINAL DEBUG: All image chunk positions:", image_chunk_pos) + Logger.debug("All text chunks", { text_chunks }) + Logger.debug("All text chunk positions", { text_chunk_pos }) + Logger.debug("All image chunks", { image_chunks }) + Logger.debug("All image chunk positions", { image_chunk_pos }) return { text_chunks, image_chunks, @@ -1451,7 +1385,7 @@ export async function extractTextAndImagesWithChunksFromPDF( image_chunk_pos, } } finally { - console.log("Calling destroy") + Logger.debug("Calling PDF document destroy") await pdfDocument.destroy() } } From 498c544dd79a4c6dd996248bb35fcce91400bdd0 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 15:18:12 +0530 Subject: [PATCH 4/9] comment fixes --- server/pdfChunks.ts | 63 +++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index f3b5675e0..9a40135be 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -41,7 +41,6 @@ export function normalizeText(input: string): string { return normalized.trim() } - // 2. Smart letter-spacing collapse (per line) function smartDespaceLine(line: string): string { @@ -99,7 +98,6 @@ function smartDespaceLine(line: string): string { return out.join("") } - // 3. High-level text cleaner export function cleanText(input: string): string { @@ -148,10 +146,8 @@ export function cleanText(input: string): string { return s.trim() } - // 4. 
Matrix transformation utilities - /** * Multiply two 2D transformation matrices * Each matrix is represented as [a, b, c, d, e, f] corresponding to: @@ -644,7 +640,7 @@ export async function extractTextAndImagesWithChunksFromPDF( let imageDict: any | null = null let isInline = false // Inline image may directly carry data in args - Logger.debug("Image operator details", { + Logger.debug("Image operator details", { args: args.length, fnId, paintInlineImageXObject: PDFJS.OPS.paintInlineImageXObject, @@ -696,7 +692,9 @@ export async function extractTextAndImagesWithChunksFromPDF( // Ensure imageDict is valid before processing if (!imageDict || typeof imageDict !== "object") { - Logger.debug("imageDict is null or invalid, skipping to crop fallback") + Logger.debug( + "imageDict is null or invalid, skipping to crop fallback", + ) // This will fall through to the crop fallback logic below } else { try { @@ -963,9 +961,12 @@ export async function extractTextAndImagesWithChunksFromPDF( continue // Skip small images } - Logger.debug("Image passed all filters, proceeding with processing", { - imageName, - }) + Logger.debug( + "Image passed all filters, proceeding with processing", + { + imageName, + }, + ) let uint8Data: Uint8Array if (rawData instanceof Uint8Array) { @@ -1160,10 +1161,13 @@ export async function extractTextAndImagesWithChunksFromPDF( }) if (!type) { - Logger.debug("Could not determine MIME type, using default", { - imageName, - default: "image/png", - }) + Logger.debug( + "Could not determine MIME type, using default", + { + imageName, + default: "image/png", + }, + ) Logger.warn( `Could not determine MIME type for ${imageName}. Defaulting to image/png`, ) @@ -1176,10 +1180,9 @@ export async function extractTextAndImagesWithChunksFromPDF( supportedMimes: Array.from( DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES, ), - isSupported: - DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( - type.mime, - ), + isSupported: DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has( + type.mime, + ), }) if ( @@ -1199,9 +1202,12 @@ export async function extractTextAndImagesWithChunksFromPDF( continue } - Logger.debug("MIME type check passed, proceeding with processing", { - imageName, - }) + Logger.debug( + "MIME type check passed, proceeding with processing", + { + imageName, + }, + ) // buffer already created above const imageHash = crypto @@ -1227,9 +1233,12 @@ export async function extractTextAndImagesWithChunksFromPDF( }) if (describeImages) { try { - Logger.debug("Calling describeImageWithllm for image", { - imageName, - }) + Logger.debug( + "Calling describeImageWithllm for image", + { + imageName, + }, + ) description = await describeImageWithllm(buffer) Logger.debug("Got description from AI for image", { imageName, @@ -1240,11 +1249,15 @@ export async function extractTextAndImagesWithChunksFromPDF( `describeImageWithllm failed for ${imageName}: ${e instanceof Error ? e.message : e}`, ) description = "This is an image from the PDF." - Logger.debug("Using fallback description due to AI error") + Logger.debug( + "Using fallback description due to AI error", + ) } } else { description = "This is an image." - Logger.debug("Using default description (describeImages=false)") + Logger.debug( + "Using default description (describeImages=false)", + ) } if ( description === "No description returned." 
|| From 5ab007e3cf3354aa1dd67bc5f183adf100588cd9 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 15:21:03 +0530 Subject: [PATCH 5/9] comment fixes --- server/integrations/microsoft/index.ts | 5 +++-- server/pdfChunks.ts | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/server/integrations/microsoft/index.ts b/server/integrations/microsoft/index.ts index 1e7926146..6b0042d8c 100644 --- a/server/integrations/microsoft/index.ts +++ b/server/integrations/microsoft/index.ts @@ -216,11 +216,12 @@ const insertCalendarEvents = async ( } // Check for next page - deltaToken = (response["@odata.deltaLink"])? response["@odata.deltaLink"] : deltaToken + deltaToken = response["@odata.deltaLink"] + ? response["@odata.deltaLink"] + : deltaToken if (response["@odata.nextLink"]) { // More pages available, continue with next page nextLink = response["@odata.nextLink"] - } else { // No more data nextLink = undefined diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 9a40135be..2f33a9a4f 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -419,8 +419,8 @@ export async function extractTextAndImagesWithChunksFromPDF( } pushPara() - // Clean and filter - return paragraphs.map(cleanText).filter((p) => p.length > 0) + // Filter raw paragraphs - check trimmed length but don't apply full cleaning yet + return paragraphs.filter((p) => p.trim().length > 0) } // Extract text from operators as fallback for edge cases From 9f816a0b106ce771603eaa6c7f90a9ad4c88813e Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Fri, 12 Sep 2025 21:35:33 +0530 Subject: [PATCH 6/9] comment fixes --- server/pdfChunks.ts | 99 +++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 53 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 2f33a9a4f..cc00df987 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -308,18 +308,18 @@ export async function extractTextAndImagesWithChunksFromPDF( docid: string = crypto.randomUUID(), extractImages: boolean = false, describeImages: boolean = true, + includeImageMarkersInText: boolean = true, ): Promise<{ text_chunks: string[] image_chunks: string[] text_chunk_pos: number[] image_chunk_pos: number[] }> { - // Sanitize docid for safe filesystem use - const safeDocId = docid.replace(/[^a-zA-Z0-9._-]/g, "_") Logger.debug("Starting processing with parameters", { docid, extractImages, describeImages, + includeImageMarkersInText, dataSize: data.length, }) @@ -358,10 +358,10 @@ export async function extractTextAndImagesWithChunksFromPDF( // Use object to pass by reference for sequence counter let globalSeq = { value: 0 } - let crossImageOverlap = "" // Track overlap across images + // Track overlap across pages to maintain continuity + let pageOverlap = "" - // Logger.info("OVERLAP DEBUG: Initialized crossImageOverlap as empty string") - // console.log('OVERLAP DEBUG: Starting PDF processing with initial crossImageOverlap:', crossImageOverlap) + // Overlap is now tracked page-to-page only Logger.info(`PDF has ${pdfDocument.numPages} pages`) @@ -499,15 +499,24 @@ export async function extractTextAndImagesWithChunksFromPDF( fallbackLines, ) - let currentParagraph = "" // kept for image-flow flush, but not used for text let textOperatorCount = (await page.getTextContent()).items.length + // Prepend previous page overlap to the first paragraph for continuity + if (pageOverlap && paragraphs.length > 0) { + paragraphs[0] = `${pageOverlap} 
${paragraphs[0]}` + pageOverlap = "" + } else if (pageOverlap) { + paragraphs = [pageOverlap] + pageOverlap = "" + } + Logger.debug("Text extraction summary for page", { pageNum, primaryParagraphs: primaryParagraphs.length, fallbackLines: fallbackLines.length, finalParagraphs: paragraphs.length, textOperatorCount, + initialPageOverlap: pageOverlap, }) // Helper: try to resolve image object by name directly from page.objs @@ -537,16 +546,7 @@ export async function extractTextAndImagesWithChunksFromPDF( ] const ctmStack: [number, number, number, number, number, number][] = [] - // Do not inject crossImageOverlap into text paragraphs here - // console.log('OVERLAP DEBUG: Page', pageNum, 'crossImageOverlap at start:', crossImageOverlap) - // Helper to flush currentParagraph into paragraphs array - const flushParagraph = () => { - if (currentParagraph.trim().length > 0) { - paragraphs.push(currentParagraph.trim()) - currentParagraph = "" - } - } let imagesOnPage = 0 let vectorOpsDetected = false @@ -623,10 +623,6 @@ export async function extractTextAndImagesWithChunksFromPDF( imageName: args[0], }) - // Do not process text per-image anymore; text is processed once per page. - // Maintain crossImageOverlap continuity by keeping placeholders only. - flushParagraph() - // Extract image buffer const imageName = typeof args?.[0] === "string" @@ -753,7 +749,10 @@ export async function extractTextAndImagesWithChunksFromPDF( description === "No description returned." || description === "Image is not worth describing." ) { - description = "Image extracted from PDF page." + Logger.warn( + `Skipping image with poor description: ${imageName} on page ${pageNum}`, + ) + break } seenHashDescriptions.set(imageHash, description) } @@ -762,7 +761,7 @@ export async function extractTextAndImagesWithChunksFromPDF( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, safeDocId) + const outputDir = path.join(baseDir, docid) await fsPromises.mkdir(outputDir, { recursive: true, }) @@ -787,7 +786,10 @@ export async function extractTextAndImagesWithChunksFromPDF( } image_chunks.push(description) image_chunk_pos.push(globalSeq.value) - crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + if (includeImageMarkersInText) { + text_chunks.push(`[[IMG#${globalSeq.value}]]`) + text_chunk_pos.push(globalSeq.value) + } globalSeq.value++ imagesOnPage += 1 Logger.debug( @@ -847,7 +849,10 @@ export async function extractTextAndImagesWithChunksFromPDF( description === "No description returned." || description === "Image is not worth describing." ) { - description = "Image extracted from PDF page." 
+ Logger.warn( + `Skipping image with poor description: ${imageName} on page ${pageNum}`, + ) + break } seenHashDescriptions.set(imageHash, description) } @@ -856,7 +861,7 @@ export async function extractTextAndImagesWithChunksFromPDF( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, safeDocId) + const outputDir = path.join(baseDir, docid) await fsPromises.mkdir(outputDir, { recursive: true, }) @@ -880,7 +885,10 @@ export async function extractTextAndImagesWithChunksFromPDF( } image_chunks.push(description) image_chunk_pos.push(globalSeq.value) - crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` + if (includeImageMarkersInText) { + text_chunks.push(`[[IMG#${globalSeq.value}]]`) + text_chunk_pos.push(globalSeq.value) + } globalSeq.value++ imagesOnPage += 1 Logger.debug( @@ -1260,17 +1268,18 @@ export async function extractTextAndImagesWithChunksFromPDF( ) } if ( + !description || description === "No description returned." || description === "Image is not worth describing." ) { - Logger.debug("Replacing insufficient description", { + Logger.debug("Skipping image with insufficient description", { imageName, previousDescription: description, }) Logger.warn( - `${description} ${imageName} on page ${pageNum}`, + `Skipping image with poor description: ${imageName} on page ${pageNum}`, ) - description = "Image extracted from PDF page." + continue } seenHashDescriptions.set(imageHash, description) Logger.debug("Cached new description for image", { @@ -1284,7 +1293,7 @@ export async function extractTextAndImagesWithChunksFromPDF( const baseDir = path.resolve( process.env.IMAGE_DIR || "downloads/xyne_images_db", ) - const outputDir = path.join(baseDir, safeDocId) + const outputDir = path.join(baseDir, docid) await fsPromises.mkdir(outputDir, { recursive: true }) const imageFilename = `${globalSeq.value}.${type.ext || "png"}` @@ -1305,16 +1314,15 @@ export async function extractTextAndImagesWithChunksFromPDF( image_chunks.push(description) image_chunk_pos.push(globalSeq.value) - // Logger.info(`OVERLAP DEBUG: Adding image placeholder to crossImageOverlap. Before: "${crossImageOverlap}"`) - // console.log('OVERLAP DEBUG: crossImageOverlap before adding image placeholder:', crossImageOverlap) - crossImageOverlap += ` [[IMG#${globalSeq.value}]] ` - // Logger.info(`OVERLAP DEBUG: Added image placeholder to crossImageOverlap. 
After: "${crossImageOverlap}"`) - // console.log('OVERLAP DEBUG: crossImageOverlap after adding image placeholder:', crossImageOverlap) + if (includeImageMarkersInText) { + text_chunks.push(`[[IMG#${globalSeq.value}]]`) + text_chunk_pos.push(globalSeq.value) + } + // Removed cross-image overlap placeholder handling Logger.debug("Added image chunk at position", { position: globalSeq.value, imageName, description, - crossImageOverlap, }) globalSeq.value++ imagesOnPage += 1 @@ -1338,8 +1346,7 @@ export async function extractTextAndImagesWithChunksFromPDF( // Vector snapshot functionality removed (no longer creating fallback canvas) - // End of page: flush remaining paragraph and process paragraphs - flushParagraph() + // End of page: process paragraphs const overlapText = processTextParagraphs( paragraphs, text_chunks, @@ -1347,22 +1354,8 @@ export async function extractTextAndImagesWithChunksFromPDF( globalSeq, ) - // Update cross-image overlap - APPEND instead of REPLACE to preserve image placeholders - // Logger.info(`OVERLAP DEBUG: End of page ${pageNum} - processing final overlap update`) - // console.log('OVERLAP DEBUG: Page', pageNum, 'end - overlapText from processTextParagraphs:', overlapText) - // console.log('OVERLAP DEBUG: Page', pageNum, 'end - crossImageOverlap before final update:', crossImageOverlap) - if (overlapText.trim()) { - // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - overlapText has content, updating crossImageOverlap`) - const previousCrossImageOverlap = crossImageOverlap - crossImageOverlap = crossImageOverlap - ? `${crossImageOverlap} ${overlapText}` - : overlapText - // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - crossImageOverlap updated from "${previousCrossImageOverlap}" to "${crossImageOverlap}"`) - // console.log('OVERLAP DEBUG: Page', pageNum, 'end - crossImageOverlap after final update:', crossImageOverlap) - } else { - // Logger.info(`OVERLAP DEBUG: Page ${pageNum} - overlapText is empty, no update to crossImageOverlap`) - // console.log('OVERLAP DEBUG: Page', pageNum, 'end - no update to crossImageOverlap (overlapText empty)') - } + // Store overlap for continuity to the next page + pageOverlap = overlapText.trim() Logger.debug( `Page ${pageNum} completed. Text operators found: ${textOperatorCount}, Current text chunks: ${text_chunks.length}, Current image chunks: ${image_chunks.length}`, From 2f497d00d4810e996bb63cc49782ea84e0a181ae Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Mon, 15 Sep 2025 14:49:51 +0530 Subject: [PATCH 7/9] comment fixes --- server/pdfChunks.ts | 188 ++++++++++++++++++++++++-------------------- 1 file changed, 101 insertions(+), 87 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index cc00df987..0f01a1b55 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -546,8 +546,6 @@ export async function extractTextAndImagesWithChunksFromPDF( ] const ctmStack: [number, number, number, number, number, number][] = [] - - let imagesOnPage = 0 let vectorOpsDetected = false for (let i = 0; i < opList.fnArray.length; i++) { @@ -694,6 +692,77 @@ export async function extractTextAndImagesWithChunksFromPDF( // This will fall through to the crop fallback logic below } else { try { + const width: number = (imageDict.width ?? + imageDict.w) as number + const height: number = (imageDict.height ?? + imageDict.h) as number + const kind = + imageDict.kind ?? imageDict.imageKind ?? 
imageDict.ImageKind + // data may live in imageDict.data, imageDict.imgData.data, or imageDict.bytes + let rawData: any = + imageDict.data ?? + imageDict.bytes ?? + (imageDict.imgData ? imageDict.imgData.data : undefined) + + Logger.debug("Full image details", { + imageName, + width, + height, + kind, + dataLength: rawData ? rawData.length : null, + dataSizeMB: rawData + ? (rawData.length / (1024 * 1024)).toFixed(2) + : null, + maxAllowedSizeMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + minDimension: MIN_IMAGE_DIM_PX, + isValidDimensions: width > 0 && height > 0, + meetsMinSize: + width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, + withinSizeLimit: rawData + ? rawData.length <= + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + : false, + isInline, + }) + + if (!width || !height || width <= 0 || height <= 0) { + Logger.debug("Skipped image with invalid dimensions", { + imageName, + width, + height, + }) + continue + } + + if ( + rawData && + rawData.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn("Skipped large image", { + imageName, + actualSizeMB: (rawData.length / (1024 * 1024)).toFixed(2), + maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, + }) + continue + } + + if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { + Logger.debug("Skipped small image", { + imageName, + width, + height, + minRequired: MIN_IMAGE_DIM_PX, + }) + continue // Skip small images + } + + Logger.debug( + "Image passed all filters, proceeding with processing", + { + imageName, + }, + ) // Fast paths for Canvas or Image-like objects returned by page.objs const isCanvasLike = (obj: any) => obj && @@ -719,10 +788,15 @@ export async function extractTextAndImagesWithChunksFromPDF( }) } else { const buffer = c.toBuffer("image/png") + // Run all filters BEFORE attempting LLM description if ( - buffer.length <= + buffer.length > DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 ) { + Logger.warn( + `Skipping objs/canvas image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, + ) + } else { // @ts-ignore let type = await imageType(buffer) if (!type) type = { mime: "image/png", ext: "png" } @@ -744,6 +818,7 @@ export async function extractTextAndImagesWithChunksFromPDF( } catch { // ignore } + // Check description quality after LLM call if ( !description || description === "No description returned." || @@ -797,10 +872,6 @@ export async function extractTextAndImagesWithChunksFromPDF( ) break } - } else { - Logger.warn( - `Skipping objs/canvas image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, - ) } } } @@ -819,10 +890,22 @@ export async function extractTextAndImagesWithChunksFromPDF( } else { const cnv = createCanvas(width, height) const cctx = cnv.getContext("2d") + try { + // @ts-ignore draw directly cctx.drawImage(imgLike, 0, 0) const buffer = cnv.toBuffer("image/png") + // Run all filters BEFORE attempting LLM description + if ( + buffer.length > + DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 + ) { + Logger.warn( + `Skipping objs/image image due to size ${(buffer.length / (1024 * 1024)).toFixed(2)} MB: ${imageName}`, + ) + break + } // @ts-ignore let type = await imageType(buffer) if (!type) type = { mime: "image/png", ext: "png" } @@ -844,6 +927,7 @@ export async function extractTextAndImagesWithChunksFromPDF( } catch { // ignore } + // Check description quality after LLM call if ( !description || description === "No description returned." 
|| @@ -904,78 +988,6 @@ export async function extractTextAndImagesWithChunksFromPDF( } } - const width: number = (imageDict.width ?? - imageDict.w) as number - const height: number = (imageDict.height ?? - imageDict.h) as number - const kind = - imageDict.kind ?? imageDict.imageKind ?? imageDict.ImageKind - // data may live in imageDict.data, imageDict.imgData.data, or imageDict.bytes - let rawData: any = - imageDict.data ?? - imageDict.bytes ?? - (imageDict.imgData ? imageDict.imgData.data : undefined) - - Logger.debug("Full image details", { - imageName, - width, - height, - kind, - dataLength: rawData ? rawData.length : null, - dataSizeMB: rawData - ? (rawData.length / (1024 * 1024)).toFixed(2) - : null, - maxAllowedSizeMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, - minDimension: MIN_IMAGE_DIM_PX, - isValidDimensions: width > 0 && height > 0, - meetsMinSize: - width >= MIN_IMAGE_DIM_PX && height >= MIN_IMAGE_DIM_PX, - withinSizeLimit: rawData - ? rawData.length <= - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 - : false, - isInline, - }) - - if (!width || !height || width <= 0 || height <= 0) { - Logger.debug("Skipped image with invalid dimensions", { - imageName, - width, - height, - }) - continue - } - - if ( - rawData && - rawData.length > - DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB * 1024 * 1024 - ) { - Logger.warn("Skipped large image", { - imageName, - actualSizeMB: (rawData.length / (1024 * 1024)).toFixed(2), - maxAllowedMB: DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB, - }) - continue - } - - if (width < MIN_IMAGE_DIM_PX || height < MIN_IMAGE_DIM_PX) { - Logger.debug("Skipped small image", { - imageName, - width, - height, - minRequired: MIN_IMAGE_DIM_PX, - }) - continue // Skip small images - } - - Logger.debug( - "Image passed all filters, proceeding with processing", - { - imageName, - }, - ) - let uint8Data: Uint8Array if (rawData instanceof Uint8Array) { uint8Data = rawData @@ -1211,7 +1223,7 @@ export async function extractTextAndImagesWithChunksFromPDF( } Logger.debug( - "MIME type check passed, proceeding with processing", + "All filters passed, proceeding with image description", { imageName, }, @@ -1267,15 +1279,20 @@ export async function extractTextAndImagesWithChunksFromPDF( "Using default description (describeImages=false)", ) } + + // Check description quality after LLM call if ( !description || description === "No description returned." || description === "Image is not worth describing." 
) { - Logger.debug("Skipping image with insufficient description", { - imageName, - previousDescription: description, - }) + Logger.debug( + "Skipping image with insufficient description", + { + imageName, + previousDescription: description, + }, + ) Logger.warn( `Skipping image with poor description: ${imageName} on page ${pageNum}`, ) @@ -1343,9 +1360,6 @@ export async function extractTextAndImagesWithChunksFromPDF( break } } - - // Vector snapshot functionality removed (no longer creating fallback canvas) - // End of page: process paragraphs const overlapText = processTextParagraphs( paragraphs, From 0c98b7a32e9ce042f5682ccc979a3c3569579d38 Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Mon, 15 Sep 2025 18:30:11 +0530 Subject: [PATCH 8/9] removing old logic --- server/pdfChunks.ts | 109 +++----------------------------------------- 1 file changed, 6 insertions(+), 103 deletions(-) diff --git a/server/pdfChunks.ts b/server/pdfChunks.ts index 0f01a1b55..31fa9a129 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -16,8 +16,7 @@ const qcmsWasmPath = path.join(__dirname, "../node_modules/pdfjs-dist/wasm/") + "/" const seenHashDescriptions = new Map() const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || "150", 10) -// Minimum line height used for calculating line break detection tolerance (in PDF units) -const MIN_LINE_HEIGHT_FOR_TOLERANCE = 10 + const Logger = getLogger(Subsystem.Integrations).child({ module: "pdfChunks", @@ -200,42 +199,7 @@ function validateTextItem(item: any): boolean { ) } -/** - * Extract text from various PDF.js text operators with enhanced validation - */ -function extractTextFromArgs(args: any[]): string { - let text = "" - - if (!args || args.length === 0) { - return text - } - - const firstArg = args[0] - - if (typeof firstArg === "string") { - text = firstArg - } else if (Array.isArray(firstArg)) { - for (const item of firstArg) { - if (typeof item === "string") { - text += item - } else if (typeof item === "number") { - // Skip spacing numbers in text arrays - continue - } else if (item && typeof item === "object") { - // Enhanced validation using validateTextItem function - if (validateTextItem(item)) { - text += item.str - } else if ("unicode" in item && typeof item.unicode === "string") { - text += item.unicode - } - } - } - } - // Additional validation: ensure we return clean, valid text - const result = typeof text === "string" ? 
text : "" - return result -} /** * Process collected paragraphs into chunks and add to results @@ -423,62 +387,7 @@ export async function extractTextAndImagesWithChunksFromPDF( return paragraphs.filter((p) => p.trim().length > 0) } - // Extract text from operators as fallback for edge cases - const extractFallbackTextFromOperators = (opList: any): string[] => { - const fallbackLines: string[] = [] - - for (let i = 0; i < opList.fnArray.length; i++) { - const fnId = opList.fnArray[i] - const args = opList.argsArray[i] - - // Handle text operators - if ( - fnId === PDFJS.OPS.showText || - fnId === PDFJS.OPS.showSpacedText || - fnId === PDFJS.OPS.nextLineShowText || - fnId === PDFJS.OPS.nextLineSetSpacingShowText - ) { - const extractedText = extractTextFromArgs(args) - if (extractedText.trim()) { - fallbackLines.push(extractedText.trim()) - } - } - } - - return fallbackLines - } - - // Combine and deduplicate text from multiple sources - const combineTextSources = ( - primaryParagraphs: string[], - fallbackLines: string[], - ): string[] => { - if (fallbackLines.length === 0) { - return primaryParagraphs - } - - const primaryText = primaryParagraphs.join(" ").toLowerCase() - const additionalLines: string[] = [] - - // Add fallback lines that aren't already covered by primary extraction - for (const line of fallbackLines) { - const cleanLine = line.trim() - if ( - cleanLine.length > 2 && // Skip very short strings - !primaryText.includes(cleanLine.toLowerCase()) - ) { - additionalLines.push(cleanLine) - } - } - // If we found additional text, append it as a new paragraph - if (additionalLines.length > 0) { - const additionalParagraph = additionalLines.join(" ") - return [...primaryParagraphs, additionalParagraph] - } - - return primaryParagraphs - } for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) { Logger.debug(`Processing page ${pageNum}`) @@ -488,16 +397,9 @@ export async function extractTextAndImagesWithChunksFromPDF( const opList = await page.getOperatorList() // Use textContent-based paragraphs for this page as primary source - let primaryParagraphs: string[] = await buildParagraphsFromPage(page) - - // Extract fallback text from operators for edge cases - const fallbackLines = extractFallbackTextFromOperators(opList) + let paragraphs: string[] = await buildParagraphsFromPage(page) - // Combine both sources, prioritizing primary extraction - let paragraphs: string[] = combineTextSources( - primaryParagraphs, - fallbackLines, - ) + let textOperatorCount = (await page.getTextContent()).items.length @@ -512,8 +414,8 @@ export async function extractTextAndImagesWithChunksFromPDF( Logger.debug("Text extraction summary for page", { pageNum, - primaryParagraphs: primaryParagraphs.length, - fallbackLines: fallbackLines.length, + primaryParagraphs: paragraphs.length, + finalParagraphs: paragraphs.length, textOperatorCount, initialPageOverlap: pageOverlap, @@ -1394,6 +1296,7 @@ export async function extractTextAndImagesWithChunksFromPDF( describeImages, }) + console.log("All text chunks", { text_chunks }) Logger.debug("All text chunks", { text_chunks }) Logger.debug("All text chunk positions", { text_chunk_pos }) Logger.debug("All image chunks", { image_chunks }) From bb602696dde65a02df9bef9bd09db5901de10f3b Mon Sep 17 00:00:00 2001 From: Aayushjshah <2001aayushshah@gmail.com> Date: Mon, 15 Sep 2025 18:32:41 +0530 Subject: [PATCH 9/9] removing old logic --- server/pdfChunks.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/pdfChunks.ts 
b/server/pdfChunks.ts index 31fa9a129..cfe0b67db 100644 --- a/server/pdfChunks.ts +++ b/server/pdfChunks.ts @@ -1296,7 +1296,7 @@ export async function extractTextAndImagesWithChunksFromPDF( describeImages, }) - console.log("All text chunks", { text_chunks }) + Logger.debug("All text chunks", { text_chunks }) Logger.debug("All text chunks", { text_chunks }) Logger.debug("All text chunk positions", { text_chunk_pos }) Logger.debug("All image chunks", { image_chunks })
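
A minimal standalone sketch of the GRAYSCALE_1BPP handling introduced in PATCH 1/9: pdf.js hands back 1-bit-per-pixel grayscale data as a bit-packed buffer (MSB first), and the patch expands each bit into a full RGBA pixel before calling `ctx.putImageData(new ImageData(rgbaData, width, height), 0, 0)` on the node-canvas context. The helper below is illustrative only — the name `unpackGray1bppToRgba` does not appear in the patch — and it mirrors the patch's assumption that the buffer is a continuous bit stream of `Math.ceil((width * height) / 8)` bytes with no per-row byte padding; a source that pads each row to a byte boundary would need the byte index computed per row instead.

// Sketch only: expand MSB-first 1-bpp grayscale bits into RGBA bytes.
function unpackGray1bppToRgba(
  packed: Uint8Array,
  width: number,
  height: number,
): Uint8ClampedArray {
  const rgba = new Uint8ClampedArray(width * height * 4)
  for (let pixelIndex = 0; pixelIndex < width * height; pixelIndex++) {
    const byteIndex = Math.floor(pixelIndex / 8)
    const bitIndex = 7 - (pixelIndex % 8) // MSB first, as in the patch
    const bit =
      byteIndex < packed.length ? (packed[byteIndex] >> bitIndex) & 1 : 0
    const gray = bit ? 255 : 0 // one bit becomes a full black/white pixel
    const dst = pixelIndex * 4
    rgba[dst] = gray // R
    rgba[dst + 1] = gray // G
    rgba[dst + 2] = gray // B
    rgba[dst + 3] = 255 // A
  }
  return rgba
}

Usage mirrors the patched code path: draw the result onto the canvas with `ctx.putImageData(new ImageData(unpackGray1bppToRgba(data, width, height), width, height), 0, 0)` and then encode it via `canvas.toBuffer("image/png")`.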