24 changes: 15 additions & 9 deletions server/integrations/dataSource/index.ts
@@ -32,7 +32,7 @@ import {
} from "./errors"
import { describeImageWithllm } from "@/lib/describeImageWithllm"
import { promises as fsPromises } from "fs"
import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
import { PdfProcessor } from "@/lib/pdfProcessor"
import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
import imageType from "image-type"
@@ -208,22 +208,28 @@ const processPdfContent = async (
): Promise<VespaDataSourceFile> => {
try {
const docId = `dsf-${createId()}`
const { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos } =
await extractTextAndImagesWithChunksFromPDFviaGemini(pdfBuffer, docId)
if (text_chunks.length === 0 && image_chunks.length === 0) {
const result = await PdfProcessor.processWithFallback(
Buffer.from(pdfBuffer),
options.fileName,
docId,
true,
true,
)

if (result.chunks.length === 0 && result.image_chunks.length === 0) {
throw new ContentExtractionError(
"No chunks generated from PDF content",
"PDF",
)
}

return createVespaDataSourceFile(
text_chunks,
result.chunks,
options,
"pdf_processing",
image_chunks,
text_chunk_pos,
image_chunk_pos,
result.processingMethod || "pdf_processing",
result.image_chunks,
result.chunks_pos,
result.image_chunks_pos,
docId,
)
} catch (error) {
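All three call sites in this PR swap `extractTextAndImagesWithChunksFromPDFviaGemini` for `PdfProcessor.processWithFallback`. A minimal sketch of the interface the new call sites appear to assume — the field names are read off the usage above, and the meaning of the two trailing booleans is a guess (they look like image-handling toggles, enabled for data-source files and disabled for mail attachments):

```ts
// Sketch only: inferred from the call sites in this diff, not from the
// actual server/lib/pdfProcessor source.
interface PdfProcessResult {
  chunks: string[]            // replaces text_chunks
  image_chunks: string[]      // replaces image_chunks
  chunks_pos: number[]        // replaces text_chunk_pos
  image_chunks_pos: number[]  // replaces image_chunk_pos
  processingMethod?: string   // e.g. "gemini" or an OCR fallback label (assumed)
}

declare const PdfProcessor: {
  processWithFallback(
    buffer: Buffer,
    fileName: string,
    docId: string,
    extractImages: boolean,   // assumed meaning of the 4th argument
    describeImages: boolean,  // assumed meaning of the 5th argument
  ): Promise<PdfProcessResult>
}
```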
12 changes: 7 additions & 5 deletions server/integrations/google/worker-utils.ts
@@ -18,7 +18,7 @@ import {
import * as XLSX from "xlsx"
import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
import { PdfProcessor } from "@/lib/pdfProcessor"
import { chunkSheetWithHeaders } from "@/sheetChunk"
import { checkFileSize } from "../dataSource"

@@ -49,12 +49,14 @@ const processPdfFile = async (
attachmentId: string,
): Promise<string[]> => {
try {
// Handle non-spreadsheet files as before
const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
pdfBuffer,
const result = await PdfProcessor.processWithFallback(
Buffer.from(pdfBuffer),
`attachment-${attachmentId}`,
attachmentId,
false,
false,
)
return pdfResult.text_chunks.filter((v) => v.trim())
return result.chunks.filter((v) => v.trim())
} catch (error) {
Logger.error(error, `Error processing PDF buffer`)
return []
11 changes: 7 additions & 4 deletions server/integrations/microsoft/attachment-utils.ts
@@ -16,7 +16,7 @@ import {
import * as XLSX from "xlsx"
import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
import { PdfProcessor } from "@/lib/pdfProcessor"
import { makeGraphApiCall, type MicrosoftGraphClient } from "./client"
import { chunkSheetWithHeaders } from "@/sheetChunk"
import { checkFileSize } from "../dataSource"
@@ -49,11 +49,14 @@ const processPdfFile = async (
attachmentId: string,
): Promise<string[]> => {
try {
const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
pdfBuffer,
const result = await PdfProcessor.processWithFallback(
Buffer.from(pdfBuffer),
`attachment-${attachmentId}`,
attachmentId,
false,
false,
)
return pdfResult.text_chunks.filter((v) => v.trim())
return result.chunks.filter((v) => v.trim())
} catch (error) {
Logger.error(error, `Error processing PDF buffer`)
return []
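After this change the Google and Microsoft attachment handlers are effectively identical: both run the fallback pipeline with the image flags off and keep only non-empty text chunks. A sketch of that shared shape, reusing the assumed `PdfProcessor` interface above:

```ts
// Shared attachment path (sketch; flag semantics are an assumption).
async function processPdfAttachment(
  pdfBuffer: Uint8Array,
  attachmentId: string,
): Promise<string[]> {
  const result = await PdfProcessor.processWithFallback(
    Buffer.from(pdfBuffer),
    `attachment-${attachmentId}`,
    attachmentId,
    false, // no image extraction for mail attachments (assumed)
    false,
  )
  // Drop whitespace-only chunks so empty regions never reach the index.
  return result.chunks.filter((v) => v.trim())
}
```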
82 changes: 52 additions & 30 deletions server/lib/chunkByOCR.ts
@@ -17,8 +17,7 @@ const DEFAULT_LAYOUT_PARSING_VISUALIZE = false
const LAYOUT_PARSING_API_PATH = "/v2/models/layout-parsing/infer"
const DEFAULT_MAX_PAGES_PER_LAYOUT_REQUEST = 100
const TEXT_CHUNK_OVERLAP_CHARS = 32
const USE_SEQUENTIAL_BATCH_PROCESSING=true

const USE_SEQUENTIAL_BATCH_PROCESSING = true

type LayoutParsingBlock = {
block_label?: string
@@ -139,7 +138,10 @@ function trimChunkToByteLimit(content: string, byteLimit: number): string {

let endIndex = content.length

while (endIndex > 0 && getByteLength(content.slice(0, endIndex)) > byteLimit) {
while (
endIndex > 0 &&
getByteLength(content.slice(0, endIndex)) > byteLimit
) {
endIndex -= 1
}

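The reflowed loop in `trimChunkToByteLimit` walks the string back one UTF-16 code unit at a time until its UTF-8 encoding fits the limit, which matters for multi-byte characters. A self-contained sketch, assuming `getByteLength` measures UTF-8 bytes:

```ts
const getByteLength = (s: string): number => Buffer.byteLength(s, "utf8")

function trimToByteLimit(content: string, byteLimit: number): string {
  let end = content.length
  // Shrink by one code unit at a time until the UTF-8 size fits the limit.
  while (end > 0 && getByteLength(content.slice(0, end)) > byteLimit) {
    end -= 1
  }
  return content.slice(0, end)
}

// "é" is 2 UTF-8 bytes, so a 3-byte limit keeps "aé" (1 + 2 bytes):
// trimToByteLimit("aéé", 3) === "aé"
```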
@@ -322,9 +324,11 @@ async function callLayoutParsingApi(
buffer: Buffer,
fileName: string,
): Promise<LayoutParsingApiPayload> {
const baseUrl = (process.env.LAYOUT_PARSING_BASE_URL || DEFAULT_LAYOUT_PARSING_BASE_URL).replace(/\/+$/, '')

const apiUrl = baseUrl + '/' + LAYOUT_PARSING_API_PATH.replace(/^\/+/, '')
const baseUrl = (
process.env.LAYOUT_PARSING_BASE_URL || DEFAULT_LAYOUT_PARSING_BASE_URL
).replace(/\/+$/, "")

const apiUrl = baseUrl + "/" + LAYOUT_PARSING_API_PATH.replace(/^\/+/, "")
const fileType = DEFAULT_LAYOUT_PARSING_FILE_TYPE
const visualize = DEFAULT_LAYOUT_PARSING_VISUALIZE
const timeoutMs = Number.parseInt(
@@ -407,17 +411,25 @@ async function callLayoutParsingApi(
return result
} catch (error) {
// Log the layout parsing API failure with context
Logger.error(error, `Layout parsing API call failed for file: ${fileName}`, {
fileName,
fileSize: buffer.length,
apiUrl,
})

Logger.error(
error,
`Layout parsing API call failed for file: ${fileName}`,
{
fileName,
fileSize: buffer.length,
apiUrl,
},
)

// Re-throw with enhanced error message for better debugging
if (error instanceof Error) {
throw new Error(`Layout parsing API failed for "${fileName}": ${error.message}`)
throw new Error(
`Layout parsing API failed for "${fileName}": ${error.message}`,
)
} else {
throw new Error(`Layout parsing API failed for "${fileName}": Unknown error occurred`)
throw new Error(
`Layout parsing API failed for "${fileName}": Unknown error occurred`,
)
}
} finally {
if (timer) {
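The URL construction above strips trailing slashes from the base and leading slashes from the path before joining, so a `LAYOUT_PARSING_BASE_URL` with or without a trailing slash resolves to the same endpoint. In isolation:

```ts
// Slash-safe join as used above; either side may carry a "/" without
// producing "//" in the result.
const joinUrl = (base: string, path: string): string =>
  base.replace(/\/+$/, "") + "/" + path.replace(/^\/+/, "")

// joinUrl("http://layout-host:8080/", "/v2/models/layout-parsing/infer")
//   === "http://layout-host:8080/v2/models/layout-parsing/infer"
```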
@@ -521,7 +533,11 @@ async function splitPdfIntoBatches(

const batches: Buffer[] = []

for (let startPage = 0; startPage < totalPages; startPage += maxPagesPerBatch) {
for (
let startPage = 0;
startPage < totalPages;
startPage += maxPagesPerBatch
) {
const endPage = Math.min(startPage + maxPagesPerBatch, totalPages)
const pageCount = endPage - startPage

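The reflowed `for` header in `splitPdfIntoBatches` steps through the document in windows of `maxPagesPerBatch` (default 100), clamping the final window to the page count. The ranges it produces, as a standalone sketch:

```ts
// Page windows for the batching loop above: half-open [start, end) ranges.
function batchRanges(
  totalPages: number,
  maxPagesPerBatch: number,
): Array<[number, number]> {
  const ranges: Array<[number, number]> = []
  for (let start = 0; start < totalPages; start += maxPagesPerBatch) {
    ranges.push([start, Math.min(start + maxPagesPerBatch, totalPages)])
  }
  return ranges
}

// batchRanges(250, 100) => [[0, 100], [100, 200], [200, 250]]
```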
@@ -563,7 +579,7 @@ function mergeLayoutParsingResults(
// Adjust page indices to maintain correct ordering across batches
const adjustedResults = layoutResults.map((layout, localPageIndex) => ({
...layout,
// We don't need to modify the layout structure itself since
// We don't need to modify the layout structure itself since
// transformLayoutParsingResults handles page indexing correctly
}))

@@ -601,8 +617,7 @@ async function processBatchesConcurrently(
Logger.info("Completed PDF batch", {
fileName,
batchIndex,
layoutResultsCount:
batchResult.layoutParsingResults?.length ?? 0,
layoutResultsCount: batchResult.layoutParsingResults?.length ?? 0,
})
return batchResult
}),
@@ -623,7 +638,7 @@ async function processBatchesSequentially(
for (let index = 0; index < batches.length; index++) {
const batch = batches[index]
const batchIndex = index + 1

Logger.info("Processing PDF batch sequentially", {
fileName,
batchIndex,
@@ -639,8 +654,7 @@
Logger.info("Completed PDF batch sequentially", {
fileName,
batchIndex,
layoutResultsCount:
batchResult.layoutParsingResults?.length ?? 0,
layoutResultsCount: batchResult.layoutParsingResults?.length ?? 0,
})

batchResults.push(batchResult)
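With `USE_SEQUENTIAL_BATCH_PROCESSING = true`, batches now flow through `processBatchesSequentially` one at a time instead of `Promise.all`. The dispatch boils down to the following (a sketch; the real helpers also thread `fileName` and per-batch logging through):

```ts
// Sequential mode bounds in-flight layout-parsing calls to one, trading
// throughput for predictable load on the parsing service.
async function runBatches<T>(
  batches: Buffer[],
  run: (batch: Buffer, index: number) => Promise<T>,
  sequential: boolean,
): Promise<T[]> {
  if (!sequential) {
    return Promise.all(batches.map((b, i) => run(b, i))) // concurrent
  }
  const results: T[] = []
  for (let i = 0; i < batches.length; i++) {
    results.push(await run(batches[i], i)) // one batch at a time
  }
  return results
}
```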
@@ -686,7 +700,9 @@ export async function chunkByOCRFromBuffer(
fileName,
totalPages,
batches: batches.length,
processingMode: USE_SEQUENTIAL_BATCH_PROCESSING ? 'sequential' : 'concurrent',
processingMode: USE_SEQUENTIAL_BATCH_PROCESSING
? "sequential"
: "concurrent",
})

const batchResults = USE_SEQUENTIAL_BATCH_PROCESSING
@@ -696,17 +712,19 @@
finalApiResult = mergeLayoutParsingResults(batchResults)
Logger.info("Merged batch results", {
totalBatches: batches.length,
layoutResultsCount:
finalApiResult.layoutParsingResults?.length || 0,
layoutResultsCount: finalApiResult.layoutParsingResults?.length || 0,
})
} else {
finalApiResult = await callLayoutParsingApi(buffer, fileName)
}
} catch (error) {
Logger.warn("Failed to analyze PDF for batching, processing as single file", {
fileName,
error: (error as Error).message,
})
Logger.warn(
"Failed to analyze PDF for batching, processing as single file",
{
fileName,
error: (error as Error).message,
},
)
finalApiResult = await callLayoutParsingApi(buffer, fileName)
}
} else {
@@ -788,14 +806,18 @@ export async function chunkByOCR(
const overlap = lastTextChunk.slice(-TEXT_CHUNK_OVERLAP_CHARS)
if (overlap && !chunkContent.startsWith(overlap)) {
const needsSeparator =
!/\s$/.test(overlap) && chunkContent.length > 0 && !/^\s/.test(chunkContent)
!/\s$/.test(overlap) &&
chunkContent.length > 0 &&
!/^\s/.test(chunkContent)
chunkContent = `${overlap}${needsSeparator ? " " : ""}${chunkContent}`
}
}

chunkContent = trimChunkToByteLimit(chunkContent, chunkSizeLimit)

const pageNumbersArray = Array.from(currentPageNumbers).sort((a, b) => a - b)
const pageNumbersArray = Array.from(currentPageNumbers).sort(
(a, b) => a - b,
)
const blockLabelsArray = Array.from(new Set(currentBlockLabels))

chunks.push(chunkContent)
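The overlap block reformatted above stitches consecutive text chunks together: the previous chunk's last `TEXT_CHUNK_OVERLAP_CHARS` (32) characters are prepended to the next chunk unless it already starts with them, inserting a space only when neither side supplies whitespace at the seam. As a standalone function mirroring that logic:

```ts
const TEXT_CHUNK_OVERLAP_CHARS = 32

// Prepend the previous chunk's tail so retrieval doesn't lose context at
// chunk boundaries; mirrors the logic in chunkByOCR above.
function withOverlap(prevChunk: string, next: string): string {
  const overlap = prevChunk.slice(-TEXT_CHUNK_OVERLAP_CHARS)
  if (!overlap || next.startsWith(overlap)) return next
  const needsSeparator =
    !/\s$/.test(overlap) && next.length > 0 && !/^\s/.test(next)
  return `${overlap}${needsSeparator ? " " : ""}${next}`
}
```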
3 changes: 0 additions & 3 deletions server/lib/chunkPdfWithGemini.ts
@@ -348,9 +348,6 @@ export function parseGeminiChunkBlocks(raw: string): string[] {
}

/**
* Gemini-backed PDF extractor that returns the same shape as
* extractTextAndImagesWithChunksFromPDF in server/pdfChunks.ts.
*
* Notes:
* - image_chunks and image_chunk_pos are intentionally empty.
* - Maintains chunk positions sequentially (0..n-1), equivalent to
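The doc-comment trimmed here keeps the notes that still hold for the Gemini extractor: image fields are intentionally empty and chunk positions stay sequential. That retained contract, as a small sketch:

```ts
// Shape the Gemini extractor keeps returning, per the retained notes:
// image fields intentionally empty, text positions sequential 0..n-1.
function toGeminiResult(text_chunks: string[]) {
  return {
    text_chunks,
    text_chunk_pos: text_chunks.map((_, i) => i), // 0..n-1
    image_chunks: [] as string[],                 // intentionally empty
    image_chunk_pos: [] as number[],              // intentionally empty
  }
}
```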