diff --git a/server/integrations/dataSource/index.ts b/server/integrations/dataSource/index.ts index ee3f6708c..b6a4de193 100644 --- a/server/integrations/dataSource/index.ts +++ b/server/integrations/dataSource/index.ts @@ -32,7 +32,7 @@ import { } from "./errors" import { describeImageWithllm } from "@/lib/describeImageWithllm" import { promises as fsPromises } from "fs" -import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini" +import { PdfProcessor } from "@/lib/pdfProcessor" import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" import imageType from "image-type" @@ -208,9 +208,15 @@ const processPdfContent = async ( ): Promise => { try { const docId = `dsf-${createId()}` - const { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos } = - await extractTextAndImagesWithChunksFromPDFviaGemini(pdfBuffer, docId) - if (text_chunks.length === 0 && image_chunks.length === 0) { + const result = await PdfProcessor.processWithFallback( + Buffer.from(pdfBuffer), + options.fileName, + docId, + true, + true, + ) + + if (result.chunks.length === 0 && result.image_chunks.length === 0) { throw new ContentExtractionError( "No chunks generated from PDF content", "PDF", @@ -218,12 +224,12 @@ const processPdfContent = async ( } return createVespaDataSourceFile( - text_chunks, + result.chunks, options, - "pdf_processing", - image_chunks, - text_chunk_pos, - image_chunk_pos, + result.processingMethod || "pdf_processing", + result.image_chunks, + result.chunks_pos, + result.image_chunks_pos, docId, ) } catch (error) { diff --git a/server/integrations/google/worker-utils.ts b/server/integrations/google/worker-utils.ts index 746dd8353..79088a3ce 100644 --- a/server/integrations/google/worker-utils.ts +++ b/server/integrations/google/worker-utils.ts @@ -18,7 +18,7 @@ import { import * as XLSX from "xlsx" import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" -import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini" +import { PdfProcessor } from "@/lib/pdfProcessor" import { chunkSheetWithHeaders } from "@/sheetChunk" import { checkFileSize } from "../dataSource" @@ -49,12 +49,14 @@ const processPdfFile = async ( attachmentId: string, ): Promise => { try { - // Handle non-spreadsheet files as before - const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini( - pdfBuffer, + const result = await PdfProcessor.processWithFallback( + Buffer.from(pdfBuffer), + `attachment-${attachmentId}`, attachmentId, + false, + false, ) - return pdfResult.text_chunks.filter((v) => v.trim()) + return result.chunks.filter((v) => v.trim()) } catch (error) { Logger.error(error, `Error processing PDF buffer`) return [] diff --git a/server/integrations/microsoft/attachment-utils.ts b/server/integrations/microsoft/attachment-utils.ts index 1b62276b2..16a4bf63c 100644 --- a/server/integrations/microsoft/attachment-utils.ts +++ b/server/integrations/microsoft/attachment-utils.ts @@ -16,7 +16,7 @@ import { import * as XLSX from "xlsx" import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" -import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini" +import { PdfProcessor } from "@/lib/pdfProcessor" import { makeGraphApiCall, type MicrosoftGraphClient } from "./client" import { chunkSheetWithHeaders } from "@/sheetChunk" import { checkFileSize } from "../dataSource" @@ -49,11 +49,14 @@ const processPdfFile = async ( attachmentId: string, ): Promise => { try { - const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini( - pdfBuffer, + const result = await PdfProcessor.processWithFallback( + Buffer.from(pdfBuffer), + `attachment-${attachmentId}`, attachmentId, + false, + false, ) - return pdfResult.text_chunks.filter((v) => v.trim()) + return result.chunks.filter((v) => v.trim()) } catch (error) { Logger.error(error, `Error processing PDF buffer`) return [] diff --git a/server/lib/chunkByOCR.ts b/server/lib/chunkByOCR.ts index ce943447b..63409cea2 100644 --- a/server/lib/chunkByOCR.ts +++ b/server/lib/chunkByOCR.ts @@ -17,8 +17,7 @@ const DEFAULT_LAYOUT_PARSING_VISUALIZE = false const LAYOUT_PARSING_API_PATH = "/v2/models/layout-parsing/infer" const DEFAULT_MAX_PAGES_PER_LAYOUT_REQUEST = 100 const TEXT_CHUNK_OVERLAP_CHARS = 32 -const USE_SEQUENTIAL_BATCH_PROCESSING=true - +const USE_SEQUENTIAL_BATCH_PROCESSING = true type LayoutParsingBlock = { block_label?: string @@ -139,7 +138,10 @@ function trimChunkToByteLimit(content: string, byteLimit: number): string { let endIndex = content.length - while (endIndex > 0 && getByteLength(content.slice(0, endIndex)) > byteLimit) { + while ( + endIndex > 0 && + getByteLength(content.slice(0, endIndex)) > byteLimit + ) { endIndex -= 1 } @@ -322,9 +324,11 @@ async function callLayoutParsingApi( buffer: Buffer, fileName: string, ): Promise { - const baseUrl = (process.env.LAYOUT_PARSING_BASE_URL || DEFAULT_LAYOUT_PARSING_BASE_URL).replace(/\/+$/, '') - - const apiUrl = baseUrl + '/' + LAYOUT_PARSING_API_PATH.replace(/^\/+/, '') + const baseUrl = ( + process.env.LAYOUT_PARSING_BASE_URL || DEFAULT_LAYOUT_PARSING_BASE_URL + ).replace(/\/+$/, "") + + const apiUrl = baseUrl + "/" + LAYOUT_PARSING_API_PATH.replace(/^\/+/, "") const fileType = DEFAULT_LAYOUT_PARSING_FILE_TYPE const visualize = DEFAULT_LAYOUT_PARSING_VISUALIZE const timeoutMs = Number.parseInt( @@ -407,17 +411,25 @@ async function callLayoutParsingApi( return result } catch (error) { // Log the layout parsing API failure with context - Logger.error(error, `Layout parsing API call failed for file: ${fileName}`, { - fileName, - fileSize: buffer.length, - apiUrl, - }) - + Logger.error( + error, + `Layout parsing API call failed for file: ${fileName}`, + { + fileName, + fileSize: buffer.length, + apiUrl, + }, + ) + // Re-throw with enhanced error message for better debugging if (error instanceof Error) { - throw new Error(`Layout parsing API failed for "${fileName}": ${error.message}`) + throw new Error( + `Layout parsing API failed for "${fileName}": ${error.message}`, + ) } else { - throw new Error(`Layout parsing API failed for "${fileName}": Unknown error occurred`) + throw new Error( + `Layout parsing API failed for "${fileName}": Unknown error occurred`, + ) } } finally { if (timer) { @@ -521,7 +533,11 @@ async function splitPdfIntoBatches( const batches: Buffer[] = [] - for (let startPage = 0; startPage < totalPages; startPage += maxPagesPerBatch) { + for ( + let startPage = 0; + startPage < totalPages; + startPage += maxPagesPerBatch + ) { const endPage = Math.min(startPage + maxPagesPerBatch, totalPages) const pageCount = endPage - startPage @@ -563,7 +579,7 @@ function mergeLayoutParsingResults( // Adjust page indices to maintain correct ordering across batches const adjustedResults = layoutResults.map((layout, localPageIndex) => ({ ...layout, - // We don't need to modify the layout structure itself since + // We don't need to modify the layout structure itself since // transformLayoutParsingResults handles page indexing correctly })) @@ -601,8 +617,7 @@ async function processBatchesConcurrently( Logger.info("Completed PDF batch", { fileName, batchIndex, - layoutResultsCount: - batchResult.layoutParsingResults?.length ?? 0, + layoutResultsCount: batchResult.layoutParsingResults?.length ?? 0, }) return batchResult }), @@ -623,7 +638,7 @@ async function processBatchesSequentially( for (let index = 0; index < batches.length; index++) { const batch = batches[index] const batchIndex = index + 1 - + Logger.info("Processing PDF batch sequentially", { fileName, batchIndex, @@ -639,8 +654,7 @@ async function processBatchesSequentially( Logger.info("Completed PDF batch sequentially", { fileName, batchIndex, - layoutResultsCount: - batchResult.layoutParsingResults?.length ?? 0, + layoutResultsCount: batchResult.layoutParsingResults?.length ?? 0, }) batchResults.push(batchResult) @@ -686,7 +700,9 @@ export async function chunkByOCRFromBuffer( fileName, totalPages, batches: batches.length, - processingMode: USE_SEQUENTIAL_BATCH_PROCESSING ? 'sequential' : 'concurrent', + processingMode: USE_SEQUENTIAL_BATCH_PROCESSING + ? "sequential" + : "concurrent", }) const batchResults = USE_SEQUENTIAL_BATCH_PROCESSING @@ -696,17 +712,19 @@ export async function chunkByOCRFromBuffer( finalApiResult = mergeLayoutParsingResults(batchResults) Logger.info("Merged batch results", { totalBatches: batches.length, - layoutResultsCount: - finalApiResult.layoutParsingResults?.length || 0, + layoutResultsCount: finalApiResult.layoutParsingResults?.length || 0, }) } else { finalApiResult = await callLayoutParsingApi(buffer, fileName) } } catch (error) { - Logger.warn("Failed to analyze PDF for batching, processing as single file", { - fileName, - error: (error as Error).message, - }) + Logger.warn( + "Failed to analyze PDF for batching, processing as single file", + { + fileName, + error: (error as Error).message, + }, + ) finalApiResult = await callLayoutParsingApi(buffer, fileName) } } else { @@ -788,14 +806,18 @@ export async function chunkByOCR( const overlap = lastTextChunk.slice(-TEXT_CHUNK_OVERLAP_CHARS) if (overlap && !chunkContent.startsWith(overlap)) { const needsSeparator = - !/\s$/.test(overlap) && chunkContent.length > 0 && !/^\s/.test(chunkContent) + !/\s$/.test(overlap) && + chunkContent.length > 0 && + !/^\s/.test(chunkContent) chunkContent = `${overlap}${needsSeparator ? " " : ""}${chunkContent}` } } chunkContent = trimChunkToByteLimit(chunkContent, chunkSizeLimit) - const pageNumbersArray = Array.from(currentPageNumbers).sort((a, b) => a - b) + const pageNumbersArray = Array.from(currentPageNumbers).sort( + (a, b) => a - b, + ) const blockLabelsArray = Array.from(new Set(currentBlockLabels)) chunks.push(chunkContent) diff --git a/server/lib/chunkPdfWithGemini.ts b/server/lib/chunkPdfWithGemini.ts index 0627b394d..ae4dc535b 100644 --- a/server/lib/chunkPdfWithGemini.ts +++ b/server/lib/chunkPdfWithGemini.ts @@ -348,9 +348,6 @@ export function parseGeminiChunkBlocks(raw: string): string[] { } /** - * Gemini-backed PDF extractor that returns the same shape as - * extractTextAndImagesWithChunksFromPDF in server/pdfChunks.ts. - * * Notes: * - image_chunks and image_chunk_pos are intentionally empty. * - Maintains chunk positions sequentially (0..n-1), equivalent to