xynehq · junaid-shirur · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
@@ -32,7 +32,7 @@ import {
 } from "./errors"
 import { describeImageWithllm } from "@/lib/describeImageWithllm"
 import { promises as fsPromises } from "fs"
-import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
+import { PdfProcessor } from "@/lib/pdfProcessor"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
 import imageType from "image-type"
@@ -208,22 +208,28 @@ const processPdfContent = async (
 ): Promise<VespaDataSourceFile> => {
   try {
     const docId = `dsf-${createId()}`
-    const { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos } =
-      await extractTextAndImagesWithChunksFromPDFviaGemini(pdfBuffer, docId)
-    if (text_chunks.length === 0 && image_chunks.length === 0) {
+    const result = await PdfProcessor.processWithFallback(
+      Buffer.from(pdfBuffer),
+      options.fileName,
+      docId,
+      true,
+      true,
+    )
+
+    if (result.chunks.length === 0 && result.image_chunks.length === 0) {
       throw new ContentExtractionError(
         "No chunks generated from PDF content",
         "PDF",
       )
     }
 
     return createVespaDataSourceFile(
-      text_chunks,
+      result.chunks,
       options,
-      "pdf_processing",
-      image_chunks,
-      text_chunk_pos,
-      image_chunk_pos,
+      result.processingMethod || "pdf_processing",
+      result.image_chunks,
+      result.chunks_pos,
+      result.image_chunks_pos,
       docId,
     )
   } catch (error) {

@@ -18,7 +18,7 @@ import {
 import * as XLSX from "xlsx"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
+import { PdfProcessor } from "@/lib/pdfProcessor"
 import { chunkSheetWithHeaders } from "@/sheetChunk"
 import { checkFileSize } from "../dataSource"
 
@@ -49,12 +49,14 @@ const processPdfFile = async (
   attachmentId: string,
 ): Promise<string[]> => {
   try {
-    // Handle non-spreadsheet files as before
-    const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
-      pdfBuffer,
+    const result = await PdfProcessor.processWithFallback(
+      Buffer.from(pdfBuffer),
+      `attachment-${attachmentId}`,
       attachmentId,
+      false,
+      false,
     )
-    return pdfResult.text_chunks.filter((v) => v.trim())
+    return result.chunks.filter((v) => v.trim())
   } catch (error) {
     Logger.error(error, `Error processing PDF buffer`)
     return []

@@ -16,7 +16,7 @@ import {
 import * as XLSX from "xlsx"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
+import { PdfProcessor } from "@/lib/pdfProcessor"
 import { makeGraphApiCall, type MicrosoftGraphClient } from "./client"
 import { chunkSheetWithHeaders } from "@/sheetChunk"
 import { checkFileSize } from "../dataSource"
@@ -49,11 +49,14 @@ const processPdfFile = async (
   attachmentId: string,
 ): Promise<string[]> => {
   try {
-    const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
-      pdfBuffer,
+    const result = await PdfProcessor.processWithFallback(
+      Buffer.from(pdfBuffer),
+      `attachment-${attachmentId}`,
       attachmentId,
+      false,
+      false,
     )
-    return pdfResult.text_chunks.filter((v) => v.trim())
+    return result.chunks.filter((v) => v.trim())
   } catch (error) {
     Logger.error(error, `Error processing PDF buffer`)
     return []