diff --git a/server/ai/prompts.ts b/server/ai/prompts.ts
index c5612df1b..a7b4a0312 100644
--- a/server/ai/prompts.ts
+++ b/server/ai/prompts.ts
@@ -2414,4 +2414,48 @@ Without these connections, I can only provide general assistance and cannot acce
 - Project-specific data
 - Company knowledge bases
 
-I'm still here to help with general questions, explanations, and tasks that don't require access to your personal workspace data. How can I assist you today?`
\ No newline at end of file
+I'm still here to help with general questions, explanations, and tasks that don't require access to your personal workspace data. How can I assist you today?`
+
+// PDF Chunking Prompt
+// This prompt is used for OCR and semantic chunking of PDF pages using Gemini.
+export const CHUNKING_PROMPT = `\
+OCR the provided PDF page(s) into clean Markdown with enriched table and image handling, then segment into coherent RAG-ready chunks.
+
+GLOBAL RULES:
+- Preserve text structure as Markdown (headings, paragraphs, lists, footnotes).
+- Keep reading order across pages; prefer natural section boundaries.
+- No hallucinations. If content is unreadable, write [illegible].
+- Do not surround output with triple backticks or any code fences.
+- Output ONLY a sequence of <chunk>...</chunk> blocks. No extra commentary.
+
+TABLES (including tables shown inside images):
+- Extract ALL tables completely; never summarize or omit cells.
+- Represent EVERY table as HTML: <table>…</table>.
+- Keep the entire table within a single chunk when possible.
+- If a table must be split across chunks due to limits:
+  - Split on complete rows only; never split a cell.
+  - Repeat the full header row (<thead>) at the start of the next chunk.
+  - Add "(table continues)" at the end of the first part and "(table continued)" at the start of the next part.
+
+IMAGES, FIGURES, CHARTS, DIAGRAMS:
+- Insert an inline marker at the exact location where the image appears:
+  - Begin a new paragraph starting with "Image:" and provide a rich, thorough description.
+  - Describe the scene, axes, legends, units, labels, key values, trends, colors, shapes, and any text in the image.
+- If the image contains tabular data, transcribe it immediately after the description as an HTML <table> (same structure as above).
+- For charts, add 1–2 sentences summarizing key insights after the description.
+
+CHUNKING:
+- Group content by semantic theme (e.g., subsection, self-contained explanation, contiguous table).
+- Target 250–512 words per chunk with a hard maximum of 1024 bytes (UTF-8).
+- If 250–512 words would exceed 1024 bytes, end early to respect the byte limit and continue in the next chunk.
+- Do not break sentences, list items, or table rows across chunks unless unavoidable due to the byte limit.
+- When continuing content in the next chunk, begin with a brief "(continued)" cue to retain context.
+- Maintain flow: image descriptions and any extracted tables must appear inline where the image occurs so readers know an image was present there.
+
+FORMATTING:
+- Surround each chunk with <chunk>...</chunk> tags.
+- Inside chunks, use valid Markdown and HTML (<table> only).
+- Keep whitespace clean; avoid double spaces and stray line breaks.
+
+Begin now and emit only <chunk> blocks.
+`
diff --git a/server/api/dataSource.ts b/server/api/dataSource.ts
index d112e7750..2dafe7c8e 100644
--- a/server/api/dataSource.ts
+++ b/server/api/dataSource.ts
@@ -30,6 +30,7 @@ import { DeleteDocument } from "@/search/vespa"
 import type { VespaSchema } from "@xyne/vespa-ts/types"
 import config from "@/config"
 import { getErrorMessage } from "@/utils"
+import { isDataSourceError } from "@/integrations/dataSource/errors"
 import {
   removeAppIntegrationFromAllAgents,
   getAgentsByDataSourceId,
@@ -197,6 +198,10 @@ export async function handleSingleFileUploadToDataSource(
         flag,
       },
     )
+    if (isDataSourceError(error)) {
+      // Preserve DataSourceError so UI can display error.userMessage
+      throw error
+    }
     if (
       error instanceof Error &&
       (error.message.includes("already exists") ||
diff --git a/server/api/files.ts b/server/api/files.ts
index 84269459c..de1ca43d9 100644
--- a/server/api/files.ts
+++ b/server/api/files.ts
@@ -23,6 +23,7 @@ import type { AttachmentMetadata } from "@/shared/types"
 import { FileProcessorService } from "@/services/fileProcessor"
 import { Apps, KbItemsSchema, KnowledgeBaseEntity } from "@xyne/vespa-ts/types"
 import { getBaseMimeType } from "@/integrations/dataSource/config"
+import { isDataSourceError } from "@/integrations/dataSource/errors"
 
 const { JwtPayloadKey } = config
 const loggerWithChild = getLoggerWithChild(Subsystem.Api, { module: "newApps" })
@@ -139,9 +140,11 @@ export const handleFileUpload = async (c: Context) => {
       )
     } catch (error) {
       const errorMessage =
-        error instanceof Error
-          ? error.message
-          : "Unknown error during DataSource processing"
+        isDataSourceError(error)
+          ? error.userMessage
+          : error instanceof Error
+            ? error.message
+            : "Unknown error during DataSource processing"
       loggerWithChild({ email: email }).error(
         error,
         `Error processing file "${file.name}" for DataSource`,
diff --git a/server/integrations/dataSource/errors.ts b/server/integrations/dataSource/errors.ts
index cc1c3dd60..639dbdd47 100644
--- a/server/integrations/dataSource/errors.ts
+++ b/server/integrations/dataSource/errors.ts
@@ -26,6 +26,16 @@ export class FileSizeExceededError extends FileValidationError {
   }
 }
 
+// Specific PDF validation error when a single page exceeds client-side processing limits
+export class PdfPageTooLargeError extends FileValidationError {
+  constructor(pageNumber: number, maxSizeMB: number, actualBytes: number) {
+    const actualMB = actualBytes / (1024 * 1024)
+    const message = `PDF page ${pageNumber} size ${actualMB.toFixed(2)}MB exceeds maximum allowed per-page limit of ${maxSizeMB}MB`
+    const userMessage = `One page in the PDF is too large (${actualMB.toFixed(2)}MB). Please compress or split the PDF so each page is under ${maxSizeMB}MB.`
+    super(message, userMessage)
+  }
+}
+
 export class UnsupportedFileTypeError extends FileValidationError {
   constructor(mimeType: string, supportedTypes: string[]) {
     const message = `Unsupported file type: ${mimeType}`
diff --git a/server/integrations/dataSource/index.ts b/server/integrations/dataSource/index.ts
index 933bd6fde..d186e5797 100644
--- a/server/integrations/dataSource/index.ts
+++ b/server/integrations/dataSource/index.ts
@@ -32,7 +32,7 @@
 } from "./errors"
 import { describeImageWithllm } from "@/lib/describeImageWithllm"
 import { promises as fsPromises } from "fs"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
 import imageType from "image-type"
@@ -208,7 +208,7 @@ const processPdfContent = async (
   try {
     const docId = `dsf-${createId()}`
     const { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos } =
-      await extractTextAndImagesWithChunksFromPDF(pdfBuffer, docId, true)
+      await extractTextAndImagesWithChunksFromPDFviaGemini(pdfBuffer, docId)
     if (text_chunks.length === 0 && image_chunks.length === 0) {
       throw new ContentExtractionError(
         "No chunks generated from PDF content",
diff --git a/server/integrations/google/worker-utils.ts b/server/integrations/google/worker-utils.ts
index 0fcb597a2..e2c55bfa5 100644
--- a/server/integrations/google/worker-utils.ts
+++ b/server/integrations/google/worker-utils.ts
@@ -19,7 +19,7 @@ import {
 import * as XLSX from "xlsx"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
 
 const Logger = getLogger(Subsystem.Integrations).child({ module: "google" })
 
@@ -49,10 +49,9 @@ const processPdfFile = async (
 ): Promise<string[]> => {
   try {
     // Handle non-spreadsheet files as before
-    const pdfResult = await extractTextAndImagesWithChunksFromPDF(
+    const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
       pdfBuffer,
       attachmentId,
-      false, // Don't extract images for email attachments
     )
     return pdfResult.text_chunks.filter((v) => v.trim())
   } catch (error) {
diff --git a/server/integrations/microsoft/attachment-utils.ts b/server/integrations/microsoft/attachment-utils.ts
index 6bd8fe852..228e8087d 100644
--- a/server/integrations/microsoft/attachment-utils.ts
+++ b/server/integrations/microsoft/attachment-utils.ts
@@ -17,7 +17,7 @@ import {
 import * as XLSX from "xlsx"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
 import { makeGraphApiCall, type MicrosoftGraphClient } from "./client"
 
 const Logger = getLogger(Subsystem.Integrations).child({
@@ -48,10 +48,9 @@ const processPdfFile = async (
   pdfBuffer: Buffer,
   attachmentId: string,
 ): Promise<string[]> => {
   try {
-    const pdfResult = await extractTextAndImagesWithChunksFromPDF(
+    const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
       pdfBuffer,
       attachmentId,
-      false, // Don't extract images for email attachments
     )
     return pdfResult.text_chunks.filter((v) => v.trim())
   } catch (error) {
diff --git a/server/lib/chunkPdfWithGemini.ts b/server/lib/chunkPdfWithGemini.ts
new file mode 100644
index 000000000..faf4c0461
--- /dev/null
+++ b/server/lib/chunkPdfWithGemini.ts
@@ -0,0 +1,300 @@
+import * as crypto from "crypto"
+import { VertexAI } from "@google-cloud/vertexai"
+import { getLogger } from "@/logger"
+import { Subsystem } from "@/types"
+import { PDFDocument } from "pdf-lib"
+import { FileSizeExceededError, PdfPageTooLargeError } from "@/integrations/dataSource/errors"
+import { CHUNKING_PROMPT } from "@/ai/prompts"
+
+const Logger = getLogger(Subsystem.AI).child({ module: "chunkPdfWithGemini" })
+
+// Splitting uses pdf-lib only; pdfjs not required here
+
+export type ChunkPdfOptions = {
+  projectId?: string
+  location?: string
+  model?: string
+  gcsUri?: string // Optional GCS URI to use for PDFs >= 15MB
+  maxOutputTokens?: number
+  temperature?: number
+}
+
+// Size limits for PDF processing
+const INLINE_MAX_BYTES = 17 * 1024 * 1024 // 17MB - split into chunks
+const MAX_SUPPORTED_BYTES = 100 * 1024 * 1024 // 100MB - hard limit
+
+// Save [startPageIdxInclusive .. startPageIdxInclusive+count-1] into a new PDF
+async function saveRange(
+  srcPdf: PDFDocument,
+  startPageIdxInclusive: number,
+  count: number,
+): Promise<Uint8Array> {
+  const newPdf = await PDFDocument.create()
+  const indices: number[] = []
+  for (let i = 0; i < count; i++) indices.push(startPageIdxInclusive + i)
+  const copied = await newPdf.copyPages(srcPdf, indices)
+  for (const p of copied) newPdf.addPage(p)
+  return await newPdf.save()
+}
+
+// Find the largest `count` pages starting at `start` that fit under `maxBytes`.
+// Returns { count, bytes }. Uses exponential growth + binary search.
+// Complexity: ~O(log remainingPages) saves.
+async function findMaxFittingCount(
+  srcPdf: PDFDocument,
+  start: number,
+  remainingPages: number,
+  maxBytes: number,
+): Promise<{ count: number; bytes: Uint8Array }> {
+  // 1) At least one page must fit, or we error (single-page too large)
+  let loCount = 1
+  let loBytes = await saveRange(srcPdf, start, loCount)
+  if (loBytes.length > maxBytes) {
+    throw new PdfPageTooLargeError(start + 1, Math.floor(maxBytes / (1024 * 1024)), loBytes.length)
+  }
+
+  // 2) Exponential growth to find an overflow upper bound
+  let hiCount = loCount
+  let hiBytes: Uint8Array | null = null
+  while (hiCount < remainingPages) {
+    // Double, but cap by remaining pages
+    const next = Math.min(hiCount * 2, remainingPages)
+    const tryBytes = await saveRange(srcPdf, start, next)
+    if (tryBytes.length <= maxBytes) {
+      // Still under → move low up
+      loCount = next
+      loBytes = tryBytes
+      hiCount = next
+      if (next === remainingPages) {
+        // Everything fits, done
+        return { count: loCount, bytes: loBytes }
+      }
+    } else {
+      // Overflow found; set high bound and break
+      hiCount = next
+      hiBytes = tryBytes // record overflow marker
+      break
+    }
+  }
+
+  // If we never overflowed (all pages fit via loop), return lo
+  if (!hiBytes && loCount === remainingPages) {
+    return { count: loCount, bytes: loBytes }
+  }
+
+  // 3) Binary search between (loCount, hiCount-1)
+  let left = loCount + 1
+  let right = hiCount - 1
+  let bestCount = loCount
+  let bestBytes = loBytes
+
+  while (left <= right) {
+    const mid = (left + right) >> 1
+    const bytes = await saveRange(srcPdf, start, mid)
+    if (bytes.length <= maxBytes) {
+      bestCount = mid
+      bestBytes = bytes
+      left = mid + 1
+    } else {
+      right = mid - 1
+    }
+  }
+
+  return { count: bestCount, bytes: bestBytes }
+}
+
+// Public splitter: O(k log n) saves for k chunks total
+export async function splitPdfIntoInlineSizedChunks(
+  data: Uint8Array,
+  maxBytes: number,
+  logger?: { info: Function; warn: Function },
+): Promise<Uint8Array[]> {
+  const srcPdf = await PDFDocument.load(data)
+  const totalPages = srcPdf.getPageCount()
+
+  const chunks: Uint8Array[] = []
+  let start = 0
+
+  while (start < totalPages) {
+    const remaining = totalPages - start
+    const { count, bytes } = await findMaxFittingCount(srcPdf, start, remaining, maxBytes)
+
+    if (logger) {
+      logger.info(
+        {
+          startPage: start + 1,
+          endPage: start + count,
+          pagesInChunk: count,
+          subSizeBytes: bytes.length,
+          maxBytes,
+        },
+        "Prepared sub-PDF chunk",
+      )
+    }
+
+    chunks.push(bytes)
+    start += count
+  }
+  return chunks
+}
+
+/**
+ * Extract semantic chunks from a PDF using Gemini Flash on Vertex AI.
+ * - If the data passed to this function is < 17MB, it is sent as inlineData (base64-encoded).
+ * - Callers should split larger PDFs into sub-PDFs <= 17MB and call this per part.
+ */
+export async function extractSemanticChunksFromPdf(
+  pdfData: Uint8Array,
+  opts: ChunkPdfOptions = {},
+): Promise<string> {
+  if (!pdfData || pdfData.length === 0) throw new Error("pdfData is required")
+
+  const dataSize = pdfData.length
+
+  const projectId =
+    process.env.VERTEX_PROJECT_ID ||
+    ""
+
+  const location =
+    process.env.VERTEX_REGION ||
+    "us-central1"
+
+  if (!projectId) {
+    throw new Error(
+      "Missing GCP project ID. Set VERTEX_PROJECT_ID or GOOGLE_CLOUD_PROJECT (or GCLOUD_PROJECT/GCP_PROJECT_ID) or pass options.projectId.",
+    )
+  }
+
+  const modelId = opts.model || process.env.VERTEX_AI_MODEL_PDF_PROCESSING || "gemini-2.5-flash"
+  const maxOutputTokens = opts.maxOutputTokens ?? 8192
+  const temperature = opts.temperature ?? 0.1
+
+  const vertex = new VertexAI({ project: projectId, location })
+  const model = vertex.getGenerativeModel({
+    model: modelId,
+    generationConfig: { maxOutputTokens, temperature },
+  })
+
+  // Build message parts - always inlineData (callers split before calling)
+  const messageParts: any[] = [{ text: CHUNKING_PROMPT }]
+  const pdfBase64 = Buffer.from(pdfData).toString("base64")
+  messageParts.push({
+    inlineData: {
+      mimeType: "application/pdf",
+      data: pdfBase64,
+    },
+  })
+
+  Logger.debug(
+    {
+      model: modelId,
+      projectId,
+      location,
+      mode: "inlineData",
+      sizeBytes: dataSize,
+    },
+    "Sending PDF to Gemini Flash via Vertex AI",
+  )
+
+  // Call Vertex AI Gemini Flash
+  const result = await model.generateContent({
+    contents: [
+      {
+        role: "user",
+        parts: messageParts,
+      },
+    ],
+  })
+
+  // Parse and return raw text
+  const candidates = result.response?.candidates ?? []
+  const parts = candidates[0]?.content?.parts ?? []
+  const text = parts
+    .filter((p: any) => typeof p?.text === "string")
+    .map((p: any) => p.text as string)
+    .join("")
+    .trim()
+
+  return text
+}
+
+/**
+ * Parse Gemini's raw output into an ordered list of chunk strings.
+ * Looks for <chunk>...</chunk> blocks, preserving order.
+ */
+export function parseGeminiChunkBlocks(raw: string): string[] {
+  if (!raw) return []
+  const chunks: string[] = []
+  const re = /<chunk[^>]*>([\s\S]*?)<\/chunk>/gi
+  let match: RegExpExecArray | null
+  while ((match = re.exec(raw)) !== null) {
+    const content = (match[1] || "").trim()
+    if (content) chunks.push(content)
+  }
+  return chunks
+}
+
+/**
+ * Gemini-backed PDF extractor that returns the same shape as
+ * extractTextAndImagesWithChunksFromPDF in server/pdfChunks.ts.
+ *
+ * Notes:
+ * - image_chunks and image_chunk_pos are intentionally empty.
+ * - Maintains chunk positions sequentially (0..n-1), equivalent to
+ *   the globalSeq logic in pdfChunks.ts.
+ * - Accepts a PDF as Uint8Array and processes it directly with Gemini.
+ */
+export async function extractTextAndImagesWithChunksFromPDFviaGemini(
+  data: Uint8Array,
+  docid: string = crypto.randomUUID(), // will be used to parse images if we extract it later
+  opts: Partial<ChunkPdfOptions> = {},
+): Promise<{
+  text_chunks: string[]
+  image_chunks: string[]
+  text_chunk_pos: number[]
+  image_chunk_pos: number[]
+}> {
+  if (!data || data.length === 0) {
+    return { text_chunks: [], image_chunks: [], text_chunk_pos: [], image_chunk_pos: [] }
+  }
+
+  if (data.length > MAX_SUPPORTED_BYTES) {
+    const actualMB = data.length / (1024 * 1024)
+    const maxMB = MAX_SUPPORTED_BYTES / (1024 * 1024)
+    throw new FileSizeExceededError(maxMB, actualMB)
+  }
+
+  const text_chunks: string[] = []
+  const text_chunk_pos: number[] = []
+  let globalSeq = 0
+
+  if (data.length <= INLINE_MAX_BYTES) {
+    // Single call path
+    Logger.info("Sending single PDF to Gemini, no splitting needed")
+    const raw = await extractSemanticChunksFromPdf(data, opts as ChunkPdfOptions)
+    const chunks = parseGeminiChunkBlocks(raw)
+    for (const c of chunks) {
+      text_chunks.push(c)
+      text_chunk_pos.push(globalSeq++)
+    }
+  } else {
+    // Split into page-based sub-PDFs that are each <= 17MB
+    const subPdfs = await splitPdfIntoInlineSizedChunks(data, INLINE_MAX_BYTES, Logger)
+    for (let i = 0; i < subPdfs.length; i++) {
+      const part = subPdfs[i]
+      Logger.info({ index: i + 1, bytes: part.length }, "Sending sub-PDF to Gemini")
+      const raw = await extractSemanticChunksFromPdf(part, opts as ChunkPdfOptions)
+      const chunks = parseGeminiChunkBlocks(raw)
+      for (const c of chunks) {
+        text_chunks.push(c)
+        text_chunk_pos.push(globalSeq++)
+      }
+    }
+  }
+
+  // As requested: image arrays are always empty/unified
+  const image_chunks: string[] = []
+  const image_chunk_pos: number[] = []
+
+  return { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos }
+}
diff --git a/server/package.json b/server/package.json
index b80ce4deb..d92167477 100644
--- a/server/package.json
+++ b/server/package.json
@@ -90,6 +90,7 @@
     "ora": "^8.1.1",
     "p-limit": "^6.2.0",
     "partial-json": "^0.1.7",
+    "pdf-lib": "^1.17.1",
     "pdf-parse": "^1.1.1",
     "pdfjs-dist": "^5.3.31",
     "pg-boss": "^10.1.5",
diff --git a/server/scripts/testGeminiFromProcessFile.ts b/server/scripts/testGeminiFromProcessFile.ts
new file mode 100644
index 000000000..80d5a4e28
--- /dev/null
+++ b/server/scripts/testGeminiFromProcessFile.ts
@@ -0,0 +1,109 @@
+#!/usr/bin/env bun
+
+import { promises as fs } from "fs"
+import path from "path"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
+
+type EnvMap = Record<string, string>
+const DEFAULT_TEST_PDF = ""
+
+async function loadEnvFile(envPath: string): Promise<EnvMap> {
+  try {
+    const raw = await fs.readFile(envPath, "utf8")
+    const map: EnvMap = {}
+    for (const line of raw.split(/\r?\n/)) {
+      const trimmed = line.trim()
+      if (!trimmed || trimmed.startsWith("#")) continue
+      const eq = trimmed.indexOf("=")
+      if (eq === -1) continue
+      const key = trimmed.slice(0, eq).trim()
+      let val = trimmed.slice(eq + 1).trim()
+      // Strip surrounding quotes if present
+      if ((val.startsWith('"') && val.endsWith('"')) || (val.startsWith("'") && val.endsWith("'"))) {
+        val = val.slice(1, -1)
+      }
+      map[key] = val
+      if (!(key in process.env)) {
+        process.env[key] = val
+      }
+    }
+    return map
+  } catch (err: any) {
+    if (err?.code !== "ENOENT") {
+      console.warn(`Warning: failed to read env file at ${envPath}:`, err)
+    }
+    return {}
+  }
+}
+
+function resolvePdfPath(args: string[], envs: EnvMap): string {
+  // Priority: CLI arg -> TEST_PDF_PATH ->
PDF_PATH + const cli = args[0] + const fromEnv = envs["TEST_PDF_PATH"] || envs["PDF_PATH"] || process.env.TEST_PDF_PATH || process.env.PDF_PATH + const p = cli || fromEnv || DEFAULT_TEST_PDF + return path.resolve(p) +} + +async function main() { + console.log("=== Gemini PDF Chunker (processFile simulation) ===") + + // Load env from server/.env (preferred) then from project .env (optional) + const cwd = process.cwd() + const serverEnvPath = path.resolve(cwd, "server/.env") + const rootEnvPath = path.resolve(cwd, ".env") + const envs = { + ...(await loadEnvFile(serverEnvPath)), + ...(await loadEnvFile(rootEnvPath)), + } + + // Resolve PDF path + const argv = process.argv.slice(2) + const pdfPath = resolvePdfPath(argv, envs) + console.log("PDF Path:", pdfPath) + + // Read the PDF file into a Buffer (simulate FileProcessorService.processFile input) + const buffer = await fs.readFile(pdfPath) + console.log("File size:", buffer.length, "bytes") + + // Simulate processFile -> extractTextAndImagesWithChunksFromPDFviaGemini call + const vespaDocId = "test-docid-gemini" + + console.log("\nCalling Gemini-backed extractor...") + const result = await extractTextAndImagesWithChunksFromPDFviaGemini( + new Uint8Array(buffer), + vespaDocId, + ) + + // Map to FileProcessorService result naming for clarity + const chunks = result.text_chunks + const chunks_pos = result.text_chunk_pos + const image_chunks = result.image_chunks + const image_chunks_pos = result.image_chunk_pos + + console.log("\n=== Results ===") + console.log("Text chunks:", chunks.length) + console.log("Text chunk positions:", chunks_pos.length) + console.log("Image chunks (should be 0):", image_chunks.length) + console.log("Image chunk positions (should be 0):", image_chunks_pos.length) + + console.log("All text chunks", { chunks }) + console.log("All text chunk positions", { chunks_pos }) + console.log("All image chunks", { image_chunks }) + console.log("All image chunk positions", { image_chunks_pos }) + + // Print chunks with their positions + // console.log("\n=== Text Chunks with Positions ===") + // for (let i = 0; i < chunks.length; i++) { + // const chunk = chunks[i] + // const pos = chunks_pos[i] + // console.log(`\n[${i}] pos=${pos}`) + // console.log(chunk) + // } + + console.log("\n=== Done ===") +} + +await main().catch((err) => { + console.error("Test failed:", err) + process.exit(1) +}) diff --git a/server/scripts/testPdfDirect.ts b/server/scripts/testPdfDirect.ts deleted file mode 100644 index 0f0adac30..000000000 --- a/server/scripts/testPdfDirect.ts +++ /dev/null @@ -1,89 +0,0 @@ -import { readFileSync } from "fs" -import { resolve } from "path" -import { FileProcessorService } from "@/services/fileProcessor" -import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks" - -async function testPdfDirect() { - let pdfPath = "/Users/aayush.shah/Downloads/small2.pdf" - // const pdfPath = "/Users/aayush.shah/Downloads/Aayush_Resume_2025.pdf" - pdfPath = "/Users/aayush.shah/Downloads/somatosensory.pdf" - try { - console.log("=== DIRECT PDF PROCESSING TEST ===") - console.log("PDF Path:", pdfPath) - - // Read the PDF file - console.log("\n1. Reading PDF file...") - const pdfBuffer = readFileSync(pdfPath) - console.log("File size:", pdfBuffer.length, "bytes") - - console.log("\n2. 
Testing direct PDF processing (current knowledge base flow)...") - console.log("This simulates exactly what happens in the knowledge base upload:") - console.log("- FileProcessorService.processFile() is called") - console.log("- extractImages defaults to false") - console.log("- describeImages defaults to false") - - // Test the exact flow used in knowledge base - const result = await FileProcessorService.processFile( - pdfBuffer, - "application/pdf", - "small2.pdf", - "test-doc-id", - pdfPath - // extractImages and describeImages default to false - ) - - console.log("\n=== RESULTS FROM KNOWLEDGE BASE FLOW ===") - console.log("Text chunks:", result.chunks.length) - console.log("Image chunks:", result.image_chunks.length) - console.log("Text chunk positions:", result.chunks_pos.length) - console.log("Image chunk positions:", result.image_chunks_pos.length) - - console.log("\n3. Testing with image processing enabled...") - console.log("Parameters: extractImages=true, describeImages=true") - - // Test with images enabled to see the difference - const imageResult = await extractTextAndImagesWithChunksFromPDF( - new Uint8Array(pdfBuffer), - "test-doc-with-images", - true, // extractImages enabled - true // describeImages enabled - ) - - console.log("\n=== RESULTS WITH IMAGES ENABLED ===") - console.log("Text chunks:", imageResult.text_chunks.length) - console.log("Image chunks:", imageResult.image_chunks.length) - console.log("Text chunk positions:", imageResult.text_chunk_pos.length) - console.log("Image chunk positions:", imageResult.image_chunk_pos.length) - - console.log("\n=== COMPARISON ===") - console.log("Current KB flow - Text chunks:", result.chunks.length, "Image chunks:", result.image_chunks.length) - console.log("With images - Text chunks:", imageResult.text_chunks.length, "Image chunks:", imageResult.image_chunks.length) - - if (result.chunks.length > 0) { - console.log("\n=== SAMPLE TEXT CHUNKS ===") - result.chunks.slice(0, 2).forEach((chunk, idx) => { - console.log(`\nText Chunk ${idx + 1}:`) - console.log(chunk) - }) - } - - if (imageResult.image_chunks.length > 0) { - console.log("\n=== SAMPLE IMAGE DESCRIPTIONS ===") - imageResult.image_chunks.forEach((chunk, idx) => { - console.log(`\nImage ${idx + 1}:`) - console.log(chunk) - }) - } - - console.log("\n=== TEST COMPLETED ===") - console.log("✓ Check the debug logs above from pdfChunks.ts") - console.log("✓ You can see exactly what's being processed in the current knowledge base flow") - - } catch (error) { - console.error("Error processing PDF:", error) - process.exit(1) - } -} - -// Run the test -testPdfDirect().catch(console.error) \ No newline at end of file diff --git a/server/services/fileProcessor.ts b/server/services/fileProcessor.ts index d15fca2bb..466bd3a93 100644 --- a/server/services/fileProcessor.ts +++ b/server/services/fileProcessor.ts @@ -1,6 +1,8 @@ import { getErrorMessage } from "@/utils" import { chunkDocument } from "@/chunks" -import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks" +// import { extractTextAndImagesWithChunksFromPDF } from "@/pdf + +import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini" import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" import * as XLSX from "xlsx" @@ -38,11 +40,9 @@ export class FileProcessorService { try { if (baseMimeType === "application/pdf") { // Process PDF - const result = await extractTextAndImagesWithChunksFromPDF( + const 
result = await extractTextAndImagesWithChunksFromPDFviaGemini( new Uint8Array(buffer), vespaDocId, - extractImages, - describeImages, ) chunks = result.text_chunks chunks_pos = result.text_chunk_pos