diff --git a/server/ai/prompts.ts b/server/ai/prompts.ts
index c5612df1b..a7b4a0312 100644
--- a/server/ai/prompts.ts
+++ b/server/ai/prompts.ts
@@ -2414,4 +2414,48 @@ Without these connections, I can only provide general assistance and cannot acce
- Project-specific data
- Company knowledge bases
-I'm still here to help with general questions, explanations, and tasks that don't require access to your personal workspace data. How can I assist you today?`
\ No newline at end of file
+I'm still here to help with general questions, explanations, and tasks that don't require access to your personal workspace data. How can I assist you today?`
+
+// PDF Chunking Prompt
+// This prompt is used for OCR and semantic chunking of PDF pages using Gemini.
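+// The model is instructed to return <chunk>...</chunk> blocks; parseGeminiChunkBlocks
+// in server/lib/chunkPdfWithGemini.ts extracts them in order.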
+export const CHUNKING_PROMPT = `\
+OCR the provided PDF page(s) into clean Markdown with enriched table and image handling, then segment into coherent RAG-ready chunks.
+
+GLOBAL RULES:
+- Preserve text structure as Markdown (headings, paragraphs, lists, footnotes).
+- Keep reading order across pages; prefer natural section boundaries.
+- No hallucinations. If content is unreadable, write [illegible].
+- Do not surround output with triple backticks or any code fences.
+- Output ONLY a sequence of <chunk>...</chunk> blocks. No extra commentary.
+
+TABLES (including tables shown inside images):
+- Extract ALL tables completely; never summarize or omit cells.
+- Represent EVERY table as HTML: <table>...</table>.
+- Keep the entire table within a single chunk when possible.
+- If a table must be split across chunks due to limits:
+ - Split on complete rows only; never split a cell.
+ - Repeat the full header row (<thead>) at the start of the next chunk.
+ - Add "(table continues)" at the end of the first part and "(table continued)" at the start of the next part.
+
+IMAGES, FIGURES, CHARTS, DIAGRAMS:
+- Insert an inline marker at the exact location where the image appears:
+ - Begin a new paragraph starting with "Image:" and provide a rich, thorough description.
+ - Describe the scene, axes, legends, units, labels, key values, trends, colors, shapes, and any text in the image.
+- If the image contains tabular data, transcribe it immediately after the description as an HTML <table> (same structure as above).
+- For charts, add 1–2 sentences summarizing key insights after the description.
+
+CHUNKING:
+- Group content by semantic theme (e.g., subsection, self-contained explanation, contiguous table).
+- Target 250–512 words per chunk with a hard maximum of 1024 bytes (UTF-8).
+- If 250–512 words would exceed 1024 bytes, end early to respect the byte limit and continue in the next chunk.
+- Do not break sentences, list items, or table rows across chunks unless unavoidable due to the byte limit.
+- When continuing content in the next chunk, begin with a brief "(continued)" cue to retain context.
+- Maintain flow: image descriptions and any extracted tables must appear inline where the image occurs so readers know an image was present there.
+
+FORMATTING:
+- Surround each chunk with <chunk>...</chunk> tags.
+- Inside chunks, use valid Markdown and HTML (<table> only).
+- Keep whitespace clean; avoid double spaces and stray line breaks.
+
+Begin now and emit only <chunk> blocks.
+`
diff --git a/server/api/dataSource.ts b/server/api/dataSource.ts
index d112e7750..2dafe7c8e 100644
--- a/server/api/dataSource.ts
+++ b/server/api/dataSource.ts
@@ -30,6 +30,7 @@ import { DeleteDocument } from "@/search/vespa"
import type { VespaSchema } from "@xyne/vespa-ts/types"
import config from "@/config"
import { getErrorMessage } from "@/utils"
+import { isDataSourceError } from "@/integrations/dataSource/errors"
import {
removeAppIntegrationFromAllAgents,
getAgentsByDataSourceId,
@@ -197,6 +198,10 @@ export async function handleSingleFileUploadToDataSource(
flag,
},
)
+ if (isDataSourceError(error)) {
+ // Preserve DataSourceError so UI can display error.userMessage
+ throw error
+ }
if (
error instanceof Error &&
(error.message.includes("already exists") ||
diff --git a/server/api/files.ts b/server/api/files.ts
index 84269459c..de1ca43d9 100644
--- a/server/api/files.ts
+++ b/server/api/files.ts
@@ -23,6 +23,7 @@ import type { AttachmentMetadata } from "@/shared/types"
import { FileProcessorService } from "@/services/fileProcessor"
import { Apps, KbItemsSchema, KnowledgeBaseEntity } from "@xyne/vespa-ts/types"
import { getBaseMimeType } from "@/integrations/dataSource/config"
+import { isDataSourceError } from "@/integrations/dataSource/errors"
const { JwtPayloadKey } = config
const loggerWithChild = getLoggerWithChild(Subsystem.Api, { module: "newApps" })
@@ -139,9 +140,11 @@ export const handleFileUpload = async (c: Context) => {
)
} catch (error) {
const errorMessage =
- error instanceof Error
- ? error.message
- : "Unknown error during DataSource processing"
+ isDataSourceError(error)
+ ? error.userMessage
+ : error instanceof Error
+ ? error.message
+ : "Unknown error during DataSource processing"
loggerWithChild({ email: email }).error(
error,
`Error processing file "${file.name}" for DataSource`,
diff --git a/server/integrations/dataSource/errors.ts b/server/integrations/dataSource/errors.ts
index cc1c3dd60..639dbdd47 100644
--- a/server/integrations/dataSource/errors.ts
+++ b/server/integrations/dataSource/errors.ts
@@ -26,6 +26,16 @@ export class FileSizeExceededError extends FileValidationError {
}
}
+// Specific PDF validation error when a single page exceeds client-side processing limits
+export class PdfPageTooLargeError extends FileValidationError {
+ constructor(pageNumber: number, maxSizeMB: number, actualBytes: number) {
+ const actualMB = actualBytes / (1024 * 1024)
+ const message = `PDF page ${pageNumber} size ${actualMB.toFixed(2)}MB exceeds maximum allowed per-page limit of ${maxSizeMB}MB`
+ const userMessage = `One page in the PDF is too large (${actualMB.toFixed(2)}MB). Please compress or split the PDF so each page is under ${maxSizeMB}MB.`
+ super(message, userMessage)
+ }
+}
+
export class UnsupportedFileTypeError extends FileValidationError {
constructor(mimeType: string, supportedTypes: string[]) {
const message = `Unsupported file type: ${mimeType}`
diff --git a/server/integrations/dataSource/index.ts b/server/integrations/dataSource/index.ts
index 933bd6fde..d186e5797 100644
--- a/server/integrations/dataSource/index.ts
+++ b/server/integrations/dataSource/index.ts
@@ -32,7 +32,7 @@ import {
} from "./errors"
import { describeImageWithllm } from "@/lib/describeImageWithllm"
import { promises as fsPromises } from "fs"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
import imageType from "image-type"
@@ -208,7 +208,7 @@ const processPdfContent = async (
try {
const docId = `dsf-${createId()}`
const { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos } =
- await extractTextAndImagesWithChunksFromPDF(pdfBuffer, docId, true)
+ await extractTextAndImagesWithChunksFromPDFviaGemini(pdfBuffer, docId)
if (text_chunks.length === 0 && image_chunks.length === 0) {
throw new ContentExtractionError(
"No chunks generated from PDF content",
diff --git a/server/integrations/google/worker-utils.ts b/server/integrations/google/worker-utils.ts
index 0fcb597a2..e2c55bfa5 100644
--- a/server/integrations/google/worker-utils.ts
+++ b/server/integrations/google/worker-utils.ts
@@ -19,7 +19,7 @@ import {
import * as XLSX from "xlsx"
import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
const Logger = getLogger(Subsystem.Integrations).child({ module: "google" })
@@ -49,10 +49,9 @@ const processPdfFile = async (
): Promise<string[]> => {
try {
// Handle non-spreadsheet files as before
- const pdfResult = await extractTextAndImagesWithChunksFromPDF(
+ const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
pdfBuffer,
attachmentId,
- false, // Don't extract images for email attachments
)
return pdfResult.text_chunks.filter((v) => v.trim())
} catch (error) {
diff --git a/server/integrations/microsoft/attachment-utils.ts b/server/integrations/microsoft/attachment-utils.ts
index 6bd8fe852..228e8087d 100644
--- a/server/integrations/microsoft/attachment-utils.ts
+++ b/server/integrations/microsoft/attachment-utils.ts
@@ -17,7 +17,7 @@ import {
import * as XLSX from "xlsx"
import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
import { makeGraphApiCall, type MicrosoftGraphClient } from "./client"
const Logger = getLogger(Subsystem.Integrations).child({
@@ -48,10 +48,9 @@ const processPdfFile = async (
attachmentId: string,
): Promise<string[]> => {
try {
- const pdfResult = await extractTextAndImagesWithChunksFromPDF(
+ const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
pdfBuffer,
attachmentId,
- false, // Don't extract images for email attachments
)
return pdfResult.text_chunks.filter((v) => v.trim())
} catch (error) {
diff --git a/server/lib/chunkPdfWithGemini.ts b/server/lib/chunkPdfWithGemini.ts
new file mode 100644
index 000000000..faf4c0461
--- /dev/null
+++ b/server/lib/chunkPdfWithGemini.ts
@@ -0,0 +1,300 @@
+import * as crypto from "crypto"
+import { VertexAI } from "@google-cloud/vertexai"
+import { getLogger } from "@/logger"
+import { Subsystem } from "@/types"
+import { PDFDocument } from "pdf-lib"
+import { FileSizeExceededError, PdfPageTooLargeError } from "@/integrations/dataSource/errors"
+import { CHUNKING_PROMPT } from "@/ai/prompts"
+
+const Logger = getLogger(Subsystem.AI).child({ module: "chunkPdfWithGemini" })
+
+// Splitting uses pdf-lib only; pdfjs not required here
+
+export type ChunkPdfOptions = {
+ projectId?: string
+ location?: string
+ model?: string
+ gcsUri?: string // Optional GCS URI for large PDFs (currently unused; oversized PDFs are split into sub-PDFs instead)
+ maxOutputTokens?: number
+ temperature?: number
+}
+
+// Size limits for PDF processing
+const INLINE_MAX_BYTES = 17 * 1024 * 1024 // 17MB: PDFs larger than this are split into sub-PDFs
+const MAX_SUPPORTED_BYTES = 100 * 1024 * 1024 // 100MB - hard limit
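+// Note: 17MB is an assumed safety margin under Vertex AI's inline request size limit.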
+
+// Save [startPageIdxInclusive .. startPageIdxInclusive+count-1] into a new PDF
+async function saveRange(
+ srcPdf: PDFDocument,
+ startPageIdxInclusive: number,
+ count: number,
+): Promise<Uint8Array> {
+ const newPdf = await PDFDocument.create()
+ const indices: number[] = []
+ for (let i = 0; i < count; i++) indices.push(startPageIdxInclusive + i)
+ const copied = await newPdf.copyPages(srcPdf, indices)
+ for (const p of copied) newPdf.addPage(p)
+ return await newPdf.save()
+}
+
+// Find the largest `count` pages starting at `start` that fit under `maxBytes`.
+// Returns { count, bytes }. Uses exponential growth + binary search.
+// Complexity: ~O(log remainingPages) saves.
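+// Example: with 30 pages remaining, counts 1, 2, 4, 8, 16 are tried; if 16 overflows,
+// a binary search over 9..15 finds the largest count that still fits.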
+async function findMaxFittingCount(
+ srcPdf: PDFDocument,
+ start: number,
+ remainingPages: number,
+ maxBytes: number,
+): Promise<{ count: number; bytes: Uint8Array }> {
+ // 1) At least one page must fit, or we error (single-page too large)
+ let loCount = 1
+ let loBytes = await saveRange(srcPdf, start, loCount)
+ if (loBytes.length > maxBytes) {
+ throw new PdfPageTooLargeError(start + 1, Math.floor(maxBytes / (1024 * 1024)), loBytes.length)
+ }
+
+ // 2) Exponential growth to find an overflow upper bound
+ let hiCount = loCount
+ let hiBytes: Uint8Array | null = null
+ while (hiCount < remainingPages) {
+ // Double, but cap by remaining pages
+ const next = Math.min(hiCount * 2, remainingPages)
+ const tryBytes = await saveRange(srcPdf, start, next)
+ if (tryBytes.length <= maxBytes) {
+ // Still under → move low up
+ loCount = next
+ loBytes = tryBytes
+ hiCount = next
+ if (next === remainingPages) {
+ // Everything fits, done
+ return { count: loCount, bytes: loBytes }
+ }
+ } else {
+ // Overflow found; set high bound and break
+ hiCount = next
+ hiBytes = tryBytes // record overflow marker
+ break
+ }
+ }
+
+ // If we never overflowed (all pages fit via loop), return lo
+ if (!hiBytes && loCount === remainingPages) {
+ return { count: loCount, bytes: loBytes }
+ }
+
+ // 3) Binary search between (loCount, hiCount-1)
+ let left = loCount + 1
+ let right = hiCount - 1
+ let bestCount = loCount
+ let bestBytes = loBytes
+
+ while (left <= right) {
+ const mid = (left + right) >> 1
+ const bytes = await saveRange(srcPdf, start, mid)
+ if (bytes.length <= maxBytes) {
+ bestCount = mid
+ bestBytes = bytes
+ left = mid + 1
+ } else {
+ right = mid - 1
+ }
+ }
+
+ return { count: bestCount, bytes: bestBytes }
+}
+
+// Public splitter: O(k log n) saves for k chunks total
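+// (k = number of sub-PDFs produced, n = pages in the source PDF)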
+export async function splitPdfIntoInlineSizedChunks(
+ data: Uint8Array,
+ maxBytes: number,
+ logger?: { info: Function; warn: Function },
+): Promise<Uint8Array[]> {
+ const srcPdf = await PDFDocument.load(data)
+ const totalPages = srcPdf.getPageCount()
+
+ const chunks: Uint8Array[] = []
+ let start = 0
+
+ while (start < totalPages) {
+ const remaining = totalPages - start
+ const { count, bytes } = await findMaxFittingCount(srcPdf, start, remaining, maxBytes)
+
+ if (logger) {
+ logger.info(
+ {
+ startPage: start + 1,
+ endPage: start + count,
+ pagesInChunk: count,
+ subSizeBytes: bytes.length,
+ maxBytes,
+ },
+ "Prepared sub-PDF chunk",
+ )
+ }
+
+ chunks.push(bytes)
+ start += count
+ }
+ return chunks
+}
+
+/**
+ * Extract semantic chunks from a PDF using Gemini Flash on Vertex AI.
+ * - If the data passed to this function is < 17MB, it is sent as inlineData (base64-encoded).
+ * - Callers should split larger PDFs into sub-PDFs <= 17MB and call this per part.
+ */
+export async function extractSemanticChunksFromPdf(
+ pdfData: Uint8Array,
+ opts: ChunkPdfOptions = {},
+): Promise<string> {
+ if (!pdfData || pdfData.length === 0) throw new Error("pdfData is required")
+
+ const dataSize = pdfData.length
+
+ const projectId =
+ opts.projectId ||
+ process.env.VERTEX_PROJECT_ID ||
+ process.env.GOOGLE_CLOUD_PROJECT ||
+ ""
+
+ const location =
+ opts.location ||
+ process.env.VERTEX_REGION ||
+ "us-central1"
+
+ if (!projectId) {
+ throw new Error(
+ "Missing GCP project ID. Set VERTEX_PROJECT_ID or GOOGLE_CLOUD_PROJECT, or pass options.projectId.",
+ )
+ }
+
+ const modelId = opts.model || process.env.VERTEX_AI_MODEL_PDF_PROCESSING || "gemini-2.5-flash"
+ const maxOutputTokens = opts.maxOutputTokens ?? 8192
+ const temperature = opts.temperature ?? 0.1
+
+ const vertex = new VertexAI({ project: projectId, location })
+ const model = vertex.getGenerativeModel({
+ model: modelId,
+ generationConfig: { maxOutputTokens, temperature },
+ })
+
+ // Build message parts - always inlineData (callers split before calling)
+ const messageParts: any[] = [{ text: CHUNKING_PROMPT }]
+ const pdfBase64 = Buffer.from(pdfData).toString("base64")
+ messageParts.push({
+ inlineData: {
+ mimeType: "application/pdf",
+ data: pdfBase64,
+ },
+ })
+
+ Logger.debug(
+ {
+ model: modelId,
+ projectId,
+ location,
+ mode: "inlineData",
+ sizeBytes: dataSize,
+ },
+ "Sending PDF to Gemini Flash via Vertex AI",
+ )
+
+ // Call Vertex AI Gemini Flash
+ const result = await model.generateContent({
+ contents: [
+ {
+ role: "user",
+ parts: messageParts,
+ },
+ ],
+ })
+
+ // Parse and return raw text
+ const candidates = result.response?.candidates ?? []
+ const parts = candidates[0]?.content?.parts ?? []
+ const text = parts
+ .filter((p: any) => typeof p?.text === "string")
+ .map((p: any) => p.text as string)
+ .join("")
+ .trim()
+
+ return text
+}
+
+/**
+ * Parse Gemini's raw output into an ordered list of chunk strings.
+ * Looks for <chunk>...</chunk> blocks, preserving order.
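+ * Example: "<chunk>Intro</chunk>\n<chunk>Details</chunk>" yields ["Intro", "Details"].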
+ */
+export function parseGeminiChunkBlocks(raw: string): string[] {
+ if (!raw) return []
+ const chunks: string[] = []
+ const re = /<chunk[^>]*>([\s\S]*?)<\/chunk>/gi
+ let match: RegExpExecArray | null
+ while ((match = re.exec(raw)) !== null) {
+ const content = (match[1] || "").trim()
+ if (content) chunks.push(content)
+ }
+ return chunks
+}
+
+/**
+ * Gemini-backed PDF extractor that returns the same shape as
+ * extractTextAndImagesWithChunksFromPDF in server/pdfChunks.ts.
+ *
+ * Notes:
+ * - image_chunks and image_chunk_pos are intentionally empty.
+ * - Maintains chunk positions sequentially (0..n-1), equivalent to
+ * the globalSeq logic in pdfChunks.ts.
+ * - Accepts a PDF as Uint8Array and processes it directly with Gemini.
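+ *
+ * Usage (sketch; assumes a PDF buffer read via node:fs/promises):
+ *   const buf = await readFile("sample.pdf")
+ *   const { text_chunks } = await extractTextAndImagesWithChunksFromPDFviaGemini(new Uint8Array(buf))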
+ */
+export async function extractTextAndImagesWithChunksFromPDFviaGemini(
+ data: Uint8Array,
+ docid: string = crypto.randomUUID(), // reserved for image references if image extraction is added later
+ opts: Partial<ChunkPdfOptions> = {},
+): Promise<{
+ text_chunks: string[]
+ image_chunks: string[]
+ text_chunk_pos: number[]
+ image_chunk_pos: number[]
+}> {
+ if (!data || data.length === 0) {
+ return { text_chunks: [], image_chunks: [], text_chunk_pos: [], image_chunk_pos: [] }
+ }
+
+ if (data.length > MAX_SUPPORTED_BYTES) {
+ const actualMB = data.length / (1024 * 1024)
+ const maxMB = MAX_SUPPORTED_BYTES / (1024 * 1024)
+ throw new FileSizeExceededError(maxMB, actualMB)
+ }
+
+ const text_chunks: string[] = []
+ const text_chunk_pos: number[] = []
+ let globalSeq = 0
+
+ if (data.length <= INLINE_MAX_BYTES) {
+ // Single call path
+ Logger.info("Sending single PDF to Gemini , no splitting needed")
+ const raw = await extractSemanticChunksFromPdf(data, opts as ChunkPdfOptions)
+ const chunks = parseGeminiChunkBlocks(raw)
+ for (const c of chunks) {
+ text_chunks.push(c)
+ text_chunk_pos.push(globalSeq++)
+ }
+ } else {
+ // Split into page-based sub-PDFs that are each <= 17MB
+ const subPdfs = await splitPdfIntoInlineSizedChunks(data, INLINE_MAX_BYTES, Logger)
+ for (let i = 0; i < subPdfs.length; i++) {
+ const part = subPdfs[i]
+ Logger.info({ index: i + 1, bytes: part.length }, "Sending sub-PDF to Gemini")
+ const raw = await extractSemanticChunksFromPdf(part, opts as ChunkPdfOptions)
+ const chunks = parseGeminiChunkBlocks(raw)
+ for (const c of chunks) {
+ text_chunks.push(c)
+ text_chunk_pos.push(globalSeq++)
+ }
+ }
+ }
+
+ // Image arrays are intentionally left empty (the Gemini path does not extract images)
+ const image_chunks: string[] = []
+ const image_chunk_pos: number[] = []
+
+ return { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos }
+}
diff --git a/server/package.json b/server/package.json
index b80ce4deb..d92167477 100644
--- a/server/package.json
+++ b/server/package.json
@@ -90,6 +90,7 @@
"ora": "^8.1.1",
"p-limit": "^6.2.0",
"partial-json": "^0.1.7",
+ "pdf-lib": "^1.17.1",
"pdf-parse": "^1.1.1",
"pdfjs-dist": "^5.3.31",
"pg-boss": "^10.1.5",
diff --git a/server/scripts/testGeminiFromProcessFile.ts b/server/scripts/testGeminiFromProcessFile.ts
new file mode 100644
index 000000000..80d5a4e28
--- /dev/null
+++ b/server/scripts/testGeminiFromProcessFile.ts
@@ -0,0 +1,109 @@
+#!/usr/bin/env bun
+
+import { promises as fs } from "fs"
+import path from "path"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
+
+type EnvMap = Record<string, string>
+const DEFAULT_TEST_PDF = ""
+
+async function loadEnvFile(envPath: string): Promise {
+ try {
+ const raw = await fs.readFile(envPath, "utf8")
+ const map: EnvMap = {}
+ for (const line of raw.split(/\r?\n/)) {
+ const trimmed = line.trim()
+ if (!trimmed || trimmed.startsWith("#")) continue
+ const eq = trimmed.indexOf("=")
+ if (eq === -1) continue
+ const key = trimmed.slice(0, eq).trim()
+ let val = trimmed.slice(eq + 1).trim()
+ // Strip surrounding quotes if present
+ if ((val.startsWith('"') && val.endsWith('"')) || (val.startsWith("'") && val.endsWith("'"))) {
+ val = val.slice(1, -1)
+ }
+ map[key] = val
+ if (!(key in process.env)) {
+ process.env[key] = val
+ }
+ }
+ return map
+ } catch (err: any) {
+ if (err?.code !== "ENOENT") {
+ console.warn(`Warning: failed to read env file at ${envPath}:`, err)
+ }
+ return {}
+ }
+}
+
+function resolvePdfPath(args: string[], envs: EnvMap): string {
+ // Priority: CLI arg -> TEST_PDF_PATH -> PDF_PATH
+ const cli = args[0]
+ const fromEnv = envs["TEST_PDF_PATH"] || envs["PDF_PATH"] || process.env.TEST_PDF_PATH || process.env.PDF_PATH
+ const p = cli || fromEnv || DEFAULT_TEST_PDF
+ return path.resolve(p)
+}
+
+async function main() {
+ console.log("=== Gemini PDF Chunker (processFile simulation) ===")
+
+ // Load env from server/.env (preferred) then from project .env (optional)
+ const cwd = process.cwd()
+ const serverEnvPath = path.resolve(cwd, "server/.env")
+ const rootEnvPath = path.resolve(cwd, ".env")
+ const envs = {
+ ...(await loadEnvFile(serverEnvPath)),
+ ...(await loadEnvFile(rootEnvPath)),
+ }
+
+ // Resolve PDF path
+ const argv = process.argv.slice(2)
+ const pdfPath = resolvePdfPath(argv, envs)
+ console.log("PDF Path:", pdfPath)
+
+ // Read the PDF file into a Buffer (simulate FileProcessorService.processFile input)
+ const buffer = await fs.readFile(pdfPath)
+ console.log("File size:", buffer.length, "bytes")
+
+ // Simulate processFile -> extractTextAndImagesWithChunksFromPDFviaGemini call
+ const vespaDocId = "test-docid-gemini"
+
+ console.log("\nCalling Gemini-backed extractor...")
+ const result = await extractTextAndImagesWithChunksFromPDFviaGemini(
+ new Uint8Array(buffer),
+ vespaDocId,
+ )
+
+ // Map to FileProcessorService result naming for clarity
+ const chunks = result.text_chunks
+ const chunks_pos = result.text_chunk_pos
+ const image_chunks = result.image_chunks
+ const image_chunks_pos = result.image_chunk_pos
+
+ console.log("\n=== Results ===")
+ console.log("Text chunks:", chunks.length)
+ console.log("Text chunk positions:", chunks_pos.length)
+ console.log("Image chunks (should be 0):", image_chunks.length)
+ console.log("Image chunk positions (should be 0):", image_chunks_pos.length)
+
+ console.log("All text chunks", { chunks })
+ console.log("All text chunk positions", { chunks_pos })
+ console.log("All image chunks", { image_chunks })
+ console.log("All image chunk positions", { image_chunks_pos })
+
+ // Print chunks with their positions
+ // console.log("\n=== Text Chunks with Positions ===")
+ // for (let i = 0; i < chunks.length; i++) {
+ // const chunk = chunks[i]
+ // const pos = chunks_pos[i]
+ // console.log(`\n[${i}] pos=${pos}`)
+ // console.log(chunk)
+ // }
+
+ console.log("\n=== Done ===")
+}
+
+await main().catch((err) => {
+ console.error("Test failed:", err)
+ process.exit(1)
+})
diff --git a/server/scripts/testPdfDirect.ts b/server/scripts/testPdfDirect.ts
deleted file mode 100644
index 0f0adac30..000000000
--- a/server/scripts/testPdfDirect.ts
+++ /dev/null
@@ -1,89 +0,0 @@
-import { readFileSync } from "fs"
-import { resolve } from "path"
-import { FileProcessorService } from "@/services/fileProcessor"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
-
-async function testPdfDirect() {
- let pdfPath = "/Users/aayush.shah/Downloads/small2.pdf"
- // const pdfPath = "/Users/aayush.shah/Downloads/Aayush_Resume_2025.pdf"
- pdfPath = "/Users/aayush.shah/Downloads/somatosensory.pdf"
- try {
- console.log("=== DIRECT PDF PROCESSING TEST ===")
- console.log("PDF Path:", pdfPath)
-
- // Read the PDF file
- console.log("\n1. Reading PDF file...")
- const pdfBuffer = readFileSync(pdfPath)
- console.log("File size:", pdfBuffer.length, "bytes")
-
- console.log("\n2. Testing direct PDF processing (current knowledge base flow)...")
- console.log("This simulates exactly what happens in the knowledge base upload:")
- console.log("- FileProcessorService.processFile() is called")
- console.log("- extractImages defaults to false")
- console.log("- describeImages defaults to false")
-
- // Test the exact flow used in knowledge base
- const result = await FileProcessorService.processFile(
- pdfBuffer,
- "application/pdf",
- "small2.pdf",
- "test-doc-id",
- pdfPath
- // extractImages and describeImages default to false
- )
-
- console.log("\n=== RESULTS FROM KNOWLEDGE BASE FLOW ===")
- console.log("Text chunks:", result.chunks.length)
- console.log("Image chunks:", result.image_chunks.length)
- console.log("Text chunk positions:", result.chunks_pos.length)
- console.log("Image chunk positions:", result.image_chunks_pos.length)
-
- console.log("\n3. Testing with image processing enabled...")
- console.log("Parameters: extractImages=true, describeImages=true")
-
- // Test with images enabled to see the difference
- const imageResult = await extractTextAndImagesWithChunksFromPDF(
- new Uint8Array(pdfBuffer),
- "test-doc-with-images",
- true, // extractImages enabled
- true // describeImages enabled
- )
-
- console.log("\n=== RESULTS WITH IMAGES ENABLED ===")
- console.log("Text chunks:", imageResult.text_chunks.length)
- console.log("Image chunks:", imageResult.image_chunks.length)
- console.log("Text chunk positions:", imageResult.text_chunk_pos.length)
- console.log("Image chunk positions:", imageResult.image_chunk_pos.length)
-
- console.log("\n=== COMPARISON ===")
- console.log("Current KB flow - Text chunks:", result.chunks.length, "Image chunks:", result.image_chunks.length)
- console.log("With images - Text chunks:", imageResult.text_chunks.length, "Image chunks:", imageResult.image_chunks.length)
-
- if (result.chunks.length > 0) {
- console.log("\n=== SAMPLE TEXT CHUNKS ===")
- result.chunks.slice(0, 2).forEach((chunk, idx) => {
- console.log(`\nText Chunk ${idx + 1}:`)
- console.log(chunk)
- })
- }
-
- if (imageResult.image_chunks.length > 0) {
- console.log("\n=== SAMPLE IMAGE DESCRIPTIONS ===")
- imageResult.image_chunks.forEach((chunk, idx) => {
- console.log(`\nImage ${idx + 1}:`)
- console.log(chunk)
- })
- }
-
- console.log("\n=== TEST COMPLETED ===")
- console.log("✓ Check the debug logs above from pdfChunks.ts")
- console.log("✓ You can see exactly what's being processed in the current knowledge base flow")
-
- } catch (error) {
- console.error("Error processing PDF:", error)
- process.exit(1)
- }
-}
-
-// Run the test
-testPdfDirect().catch(console.error)
\ No newline at end of file
diff --git a/server/services/fileProcessor.ts b/server/services/fileProcessor.ts
index d15fca2bb..466bd3a93 100644
--- a/server/services/fileProcessor.ts
+++ b/server/services/fileProcessor.ts
@@ -1,6 +1,8 @@
import { getErrorMessage } from "@/utils"
import { chunkDocument } from "@/chunks"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+// import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
import * as XLSX from "xlsx"
@@ -38,11 +40,9 @@ export class FileProcessorService {
try {
if (baseMimeType === "application/pdf") {
// Process PDF
- const result = await extractTextAndImagesWithChunksFromPDF(
+ const result = await extractTextAndImagesWithChunksFromPDFviaGemini(
new Uint8Array(buffer),
vespaDocId,
- extractImages,
- describeImages,
)
chunks = result.text_chunks
chunks_pos = result.text_chunk_pos