diff --git a/server/ai/prompts.ts b/server/ai/prompts.ts
index c5612df1b..a7b4a0312 100644
--- a/server/ai/prompts.ts
+++ b/server/ai/prompts.ts
@@ -2414,4 +2414,48 @@ Without these connections, I can only provide general assistance and cannot acce
 - Project-specific data
 - Company knowledge bases
 
-I'm still here to help with general questions, explanations, and tasks that don't require access to your personal workspace data. How can I assist you today?`
\ No newline at end of file
+I'm still here to help with general questions, explanations, and tasks that don't require access to your personal workspace data. How can I assist you today?`
+
+// PDF Chunking Prompt
+// This prompt is used for OCR and semantic chunking of PDF pages using Gemini.
+export const CHUNKING_PROMPT = `\
+OCR the provided PDF page(s) into clean Markdown with enriched table and image handling, then segment into coherent RAG-ready chunks.
+
+GLOBAL RULES:
+- Preserve text structure as Markdown (headings, paragraphs, lists, footnotes).
+- Keep reading order across pages; prefer natural section boundaries.
+- No hallucinations. If content is unreadable, write [illegible].
+- Do not surround output with triple backticks or any code fences.
+- Output ONLY a sequence of <chunk>...</chunk> blocks. No extra commentary.
+
+TABLES (including tables shown inside images):
+- Extract ALL tables completely; never summarize or omit cells.
+- Represent EVERY table as HTML: <table>…</table>.
+- Keep the entire table within a single chunk when possible.
+- If a table must be split across chunks due to limits:
+  - Split on complete rows only; never split a cell.
+  - Repeat the full header row (<thead>) at the start of the next chunk.
+  - Add "(table continues)" at the end of the first part and "(table continued)" at the start of the next part.
+
+IMAGES, FIGURES, CHARTS, DIAGRAMS:
+- Insert an inline marker at the exact location where the image appears:
+  - Begin a new paragraph starting with "Image:" and provide a rich, thorough description.
+  - Describe the scene, axes, legends, units, labels, key values, trends, colors, shapes, and any text in the image.
+- If the image contains tabular data, transcribe it immediately after the description as an HTML <table> (same structure as above).
+- For charts, add 1–2 sentences summarizing key insights after the description.
+
+CHUNKING:
+- Group content by semantic theme (e.g., subsection, self-contained explanation, contiguous table).
+- Target 250–512 words per chunk with a hard maximum of 1024 bytes (UTF-8).
+- If 250–512 words would exceed 1024 bytes, end early to respect the byte limit and continue in the next chunk.
+- Do not break sentences, list items, or table rows across chunks unless unavoidable due to the byte limit.
+- When continuing content in the next chunk, begin with a brief "(continued)" cue to retain context.
+- Maintain flow: image descriptions and any extracted tables must appear inline where the image occurs so readers know an image was present there.
+
+FORMATTING:
+- Surround each chunk with <chunk>...</chunk> tags.
+- Inside chunks, use valid Markdown and HTML (<table> only).
+- Keep whitespace clean; avoid double spaces and stray line breaks.
+
+Begin now and emit only <chunk> blocks.
+`
diff --git a/server/api/dataSource.ts b/server/api/dataSource.ts
index d112e7750..2dafe7c8e 100644
--- a/server/api/dataSource.ts
+++ b/server/api/dataSource.ts
@@ -30,6 +30,7 @@ import { DeleteDocument } from "@/search/vespa"
 import type { VespaSchema } from "@xyne/vespa-ts/types"
 import config from "@/config"
 import { getErrorMessage } from "@/utils"
+import { isDataSourceError } from "@/integrations/dataSource/errors"
 import {
   removeAppIntegrationFromAllAgents,
   getAgentsByDataSourceId,
@@ -197,6 +198,10 @@ export async function handleSingleFileUploadToDataSource(
         flag,
       },
     )
+    if (isDataSourceError(error)) {
+      // Preserve DataSourceError so UI can display error.userMessage
+      throw error
+    }
     if (
       error instanceof Error &&
       (error.message.includes("already exists") ||
diff --git a/server/api/files.ts b/server/api/files.ts
index 84269459c..de1ca43d9 100644
--- a/server/api/files.ts
+++ b/server/api/files.ts
@@ -23,6 +23,7 @@ import type { AttachmentMetadata } from "@/shared/types"
 import { FileProcessorService } from "@/services/fileProcessor"
 import { Apps, KbItemsSchema, KnowledgeBaseEntity } from "@xyne/vespa-ts/types"
 import { getBaseMimeType } from "@/integrations/dataSource/config"
+import { isDataSourceError } from "@/integrations/dataSource/errors"
 
 const { JwtPayloadKey } = config
 const loggerWithChild = getLoggerWithChild(Subsystem.Api, { module: "newApps" })
@@ -139,9 +140,11 @@ export const handleFileUpload = async (c: Context) => {
       )
     } catch (error) {
       const errorMessage =
-        error instanceof Error
-          ? error.message
-          : "Unknown error during DataSource processing"
+        isDataSourceError(error)
+          ? error.userMessage
+          : error instanceof Error
+            ? error.message
+            : "Unknown error during DataSource processing"
       loggerWithChild({ email: email }).error(
         error,
         `Error processing file "${file.name}" for DataSource`,
diff --git a/server/integrations/dataSource/errors.ts b/server/integrations/dataSource/errors.ts
index cc1c3dd60..639dbdd47 100644
--- a/server/integrations/dataSource/errors.ts
+++ b/server/integrations/dataSource/errors.ts
@@ -26,6 +26,16 @@ export class FileSizeExceededError extends FileValidationError {
   }
 }
 
+// Specific PDF validation error when a single page exceeds client-side processing limits
+export class PdfPageTooLargeError extends FileValidationError {
+  constructor(pageNumber: number, maxSizeMB: number, actualBytes: number) {
+    const actualMB = actualBytes / (1024 * 1024)
+    const message = `PDF page ${pageNumber} size ${actualMB.toFixed(2)}MB exceeds maximum allowed per-page limit of ${maxSizeMB}MB`
+    const userMessage = `One page in the PDF is too large (${actualMB.toFixed(2)}MB). Please compress or split the PDF so each page is under ${maxSizeMB}MB.`
+    super(message, userMessage)
+  }
+}
+
 export class UnsupportedFileTypeError extends FileValidationError {
   constructor(mimeType: string, supportedTypes: string[]) {
     const message = `Unsupported file type: ${mimeType}`
diff --git a/server/integrations/dataSource/index.ts b/server/integrations/dataSource/index.ts
index 933bd6fde..d186e5797 100644
--- a/server/integrations/dataSource/index.ts
+++ b/server/integrations/dataSource/index.ts
@@ -32,7 +32,7 @@
 } from "./errors"
 import { describeImageWithllm } from "@/lib/describeImageWithllm"
 import { promises as fsPromises } from "fs"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
 import imageType from "image-type"
@@ -208,7 +208,7 @@ const processPdfContent = async (
   try {
     const docId = `dsf-${createId()}`
     const { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos } =
-      await extractTextAndImagesWithChunksFromPDF(pdfBuffer, docId, true)
+      await extractTextAndImagesWithChunksFromPDFviaGemini(pdfBuffer, docId)
     if (text_chunks.length === 0 && image_chunks.length === 0) {
       throw new ContentExtractionError(
         "No chunks generated from PDF content",
diff --git a/server/integrations/google/worker-utils.ts b/server/integrations/google/worker-utils.ts
index 0fcb597a2..e2c55bfa5 100644
--- a/server/integrations/google/worker-utils.ts
+++ b/server/integrations/google/worker-utils.ts
@@ -19,7 +19,7 @@ import {
 import * as XLSX from "xlsx"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
 
 const Logger = getLogger(Subsystem.Integrations).child({ module: "google" })
 
@@ -49,10 +49,9 @@ const processPdfFile = async (
 ): Promise<string[]> => {
   try {
     // Handle non-spreadsheet files as before
-    const pdfResult = await extractTextAndImagesWithChunksFromPDF(
+    const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
       pdfBuffer,
       attachmentId,
-      false, // Don't extract images for email attachments
     )
     return pdfResult.text_chunks.filter((v) => v.trim())
   } catch (error) {
diff --git a/server/integrations/microsoft/attachment-utils.ts b/server/integrations/microsoft/attachment-utils.ts
index 6bd8fe852..228e8087d 100644
--- a/server/integrations/microsoft/attachment-utils.ts
+++ b/server/integrations/microsoft/attachment-utils.ts
@@ -17,7 +17,7 @@ import {
 import * as XLSX from "xlsx"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
 import { makeGraphApiCall, type MicrosoftGraphClient } from "./client"
 
 const Logger = getLogger(Subsystem.Integrations).child({
@@ -48,10 +48,9 @@ const processPdfFile = async (
   pdfBuffer: Buffer,
   attachmentId: string,
 ): Promise<string[]> => {
   try {
-    const pdfResult = await extractTextAndImagesWithChunksFromPDF(
+    const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
       pdfBuffer,
       attachmentId,
-      false, // Don't extract images for email attachments
     )
     return pdfResult.text_chunks.filter((v) => v.trim())
   } catch (error) {
diff --git a/server/lib/chunkPdfWithGemini.ts b/server/lib/chunkPdfWithGemini.ts
new file mode 100644
index 000000000..faf4c0461
--- /dev/null
+++ b/server/lib/chunkPdfWithGemini.ts
@@ -0,0 +1,300 @@
+import * as crypto from "crypto"
+import { VertexAI } from "@google-cloud/vertexai"
+import { getLogger } from "@/logger"
+import { Subsystem } from "@/types"
+import { PDFDocument } from "pdf-lib"
+import { FileSizeExceededError, PdfPageTooLargeError } from "@/integrations/dataSource/errors"
+import { CHUNKING_PROMPT } from "@/ai/prompts"
+
+const Logger = getLogger(Subsystem.AI).child({ module: "chunkPdfWithGemini" })
+
+// Splitting uses pdf-lib only; pdfjs not required here
+
+export type ChunkPdfOptions = {
+  projectId?: string
+  location?: string
+  model?: string
+  gcsUri?: string // Optional GCS URI to use for PDFs >= 15MB
+  maxOutputTokens?: number
+  temperature?: number
+}
+
+// Size limits for PDF processing
+const INLINE_MAX_BYTES = 17 * 1024 * 1024 // 17MB - split into chunks
+const MAX_SUPPORTED_BYTES = 100 * 1024 * 1024 // 100MB - hard limit
+
+// Save [startPageIdxInclusive .. startPageIdxInclusive+count-1] into a new PDF
+async function saveRange(
+  srcPdf: PDFDocument,
+  startPageIdxInclusive: number,
+  count: number,
+): Promise<Uint8Array> {
+  const newPdf = await PDFDocument.create()
+  const indices: number[] = []
+  for (let i = 0; i < count; i++) indices.push(startPageIdxInclusive + i)
+  const copied = await newPdf.copyPages(srcPdf, indices)
+  for (const p of copied) newPdf.addPage(p)
+  return await newPdf.save()
+}
+
+// Find the largest `count` pages starting at `start` that fit under `maxBytes`.
+// Returns { count, bytes }. Uses exponential growth + binary search.
+// Complexity: ~O(log remainingPages) saves.
+async function findMaxFittingCount(
+  srcPdf: PDFDocument,
+  start: number,
+  remainingPages: number,
+  maxBytes: number,
+): Promise<{ count: number; bytes: Uint8Array }> {
+  // 1) At least one page must fit, or we error (single-page too large)
+  let loCount = 1
+  let loBytes = await saveRange(srcPdf, start, loCount)
+  if (loBytes.length > maxBytes) {
+    throw new PdfPageTooLargeError(start + 1, Math.floor(maxBytes / (1024 * 1024)), loBytes.length)
+  }
+
+  // 2) Exponential growth to find an overflow upper bound
+  let hiCount = loCount
+  let hiBytes: Uint8Array | null = null
+  while (hiCount < remainingPages) {
+    // Double, but cap by remaining pages
+    const next = Math.min(hiCount * 2, remainingPages)
+    const tryBytes = await saveRange(srcPdf, start, next)
+    if (tryBytes.length <= maxBytes) {
+      // Still under → move low up
+      loCount = next
+      loBytes = tryBytes
+      hiCount = next
+      if (next === remainingPages) {
+        // Everything fits, done
+        return { count: loCount, bytes: loBytes }
+      }
+    } else {
+      // Overflow found; set high bound and break
+      hiCount = next
+      hiBytes = tryBytes // record overflow marker
+      break
+    }
+  }
+
+  // If we never overflowed (all pages fit via loop), return lo
+  if (!hiBytes && loCount === remainingPages) {
+    return { count: loCount, bytes: loBytes }
+  }
+
+  // 3) Binary search between (loCount, hiCount-1)
+  let left = loCount + 1
+  let right = hiCount - 1
+  let bestCount = loCount
+  let bestBytes = loBytes
+
+  while (left <= right) {
+    const mid = (left + right) >> 1
+    const bytes = await saveRange(srcPdf, start, mid)
+    if (bytes.length <= maxBytes) {
+      bestCount = mid
+      bestBytes = bytes
+      left = mid + 1
+    } else {
+      right = mid - 1
+    }
+  }
+
+  return { count: bestCount, bytes: bestBytes }
+}
+
+// Public splitter: O(k log n) saves for k chunks total
+export async function splitPdfIntoInlineSizedChunks(
+  data: Uint8Array,
+  maxBytes: number,
+  logger?: { info: Function; warn: Function },
+): Promise<Uint8Array[]> {
+  const srcPdf = await PDFDocument.load(data)
+  const totalPages = srcPdf.getPageCount()
+
+  const chunks: Uint8Array[] = []
+  let start = 0
+
+  while (start < totalPages) {
+    const remaining = totalPages - start
+    const { count, bytes } = await findMaxFittingCount(srcPdf, start, remaining, maxBytes)
+
+    if (logger) {
+      logger.info(
+        {
+          startPage: start + 1,
+          endPage: start + count,
+          pagesInChunk: count,
+          subSizeBytes: bytes.length,
+          maxBytes,
+        },
+        "Prepared sub-PDF chunk",
+      )
+    }
+
+    chunks.push(bytes)
+    start += count
+  }
+  return chunks
+}
+
+/**
+ * Extract semantic chunks from a PDF using Gemini Flash on Vertex AI.
+ * - If the data passed to this function is < 17MB, it is sent as inlineData (base64-encoded).
+ * - Callers should split larger PDFs into sub-PDFs <= 17MB and call this per part.
+ */
+export async function extractSemanticChunksFromPdf(
+  pdfData: Uint8Array,
+  opts: ChunkPdfOptions = {},
+): Promise<string> {
+  if (!pdfData || pdfData.length === 0) throw new Error("pdfData is required")
+
+  const dataSize = pdfData.length
+
+  const projectId =
+    process.env.VERTEX_PROJECT_ID ||
+    ""
+
+  const location =
+    process.env.VERTEX_REGION ||
+    "us-central1"
+
+  if (!projectId) {
+    throw new Error(
+      "Missing GCP project ID. Set VERTEX_PROJECT_ID or GOOGLE_CLOUD_PROJECT (or GCLOUD_PROJECT/GCP_PROJECT_ID) or pass options.projectId.",
+    )
+  }
+
+  const modelId = opts.model || process.env.VERTEX_AI_MODEL_PDF_PROCESSING || "gemini-2.5-flash"
+  const maxOutputTokens = opts.maxOutputTokens ?? 8192
+  const temperature = opts.temperature ?? 0.1
+
+  const vertex = new VertexAI({ project: projectId, location })
+  const model = vertex.getGenerativeModel({
+    model: modelId,
+    generationConfig: { maxOutputTokens, temperature },
+  })
+
+  // Build message parts - always inlineData (callers split before calling)
+  const messageParts: any[] = [{ text: CHUNKING_PROMPT }]
+  const pdfBase64 = Buffer.from(pdfData).toString("base64")
+  messageParts.push({
+    inlineData: {
+      mimeType: "application/pdf",
+      data: pdfBase64,
+    },
+  })
+
+  Logger.debug(
+    {
+      model: modelId,
+      projectId,
+      location,
+      mode: "inlineData",
+      sizeBytes: dataSize,
+    },
+    "Sending PDF to Gemini Flash via Vertex AI",
+  )
+
+  // Call Vertex AI Gemini Flash
+  const result = await model.generateContent({
+    contents: [
+      {
+        role: "user",
+        parts: messageParts,
+      },
+    ],
+  })
+
+  // Parse and return raw text
+  const candidates = result.response?.candidates ?? []
+  const parts = candidates[0]?.content?.parts ?? []
+  const text = parts
+    .filter((p: any) => typeof p?.text === "string")
+    .map((p: any) => p.text as string)
+    .join("")
+    .trim()
+
+  return text
+}
+
+/**
+ * Parse Gemini's raw output into an ordered list of chunk strings.
+ * Looks for <chunk>...</chunk> blocks, preserving order.
+ */
+export function parseGeminiChunkBlocks(raw: string): string[] {
+  if (!raw) return []
+  const chunks: string[] = []
+  const re = /<chunk[^>]*>([\s\S]*?)<\/chunk>/gi
+  let match: RegExpExecArray | null
+  while ((match = re.exec(raw)) !== null) {
+    const content = (match[1] || "").trim()
+    if (content) chunks.push(content)
+  }
+  return chunks
+}
+
+/**
+ * Gemini-backed PDF extractor that returns the same shape as
+ * extractTextAndImagesWithChunksFromPDF in server/pdfChunks.ts.
+ *
+ * Notes:
+ * - image_chunks and image_chunk_pos are intentionally empty.
+ * - Maintains chunk positions sequentially (0..n-1), equivalent to
+ *   the globalSeq logic in pdfChunks.ts.
+ * - Accepts a PDF as Uint8Array and processes it directly with Gemini.
+ */
+export async function extractTextAndImagesWithChunksFromPDFviaGemini(
+  data: Uint8Array,
+  docid: string = crypto.randomUUID(), // will be used to parse images if we extract it later
+  opts: Partial<ChunkPdfOptions> = {},
+): Promise<{
+  text_chunks: string[]
+  image_chunks: string[]
+  text_chunk_pos: number[]
+  image_chunk_pos: number[]
+}> {
+  if (!data || data.length === 0) {
+    return { text_chunks: [], image_chunks: [], text_chunk_pos: [], image_chunk_pos: [] }
+  }
+
+  if (data.length > MAX_SUPPORTED_BYTES) {
+    const actualMB = data.length / (1024 * 1024)
+    const maxMB = MAX_SUPPORTED_BYTES / (1024 * 1024)
+    throw new FileSizeExceededError(maxMB, actualMB)
+  }
+
+  const text_chunks: string[] = []
+  const text_chunk_pos: number[] = []
+  let globalSeq = 0
+
+  if (data.length <= INLINE_MAX_BYTES) {
+    // Single call path
+    Logger.info("Sending single PDF to Gemini, no splitting needed")
+    const raw = await extractSemanticChunksFromPdf(data, opts as ChunkPdfOptions)
+    const chunks = parseGeminiChunkBlocks(raw)
+    for (const c of chunks) {
+      text_chunks.push(c)
+      text_chunk_pos.push(globalSeq++)
+    }
+  } else {
+    // Split into page-based sub-PDFs that are each <= 17MB
+    const subPdfs = await splitPdfIntoInlineSizedChunks(data, INLINE_MAX_BYTES, Logger)
+    for (let i = 0; i < subPdfs.length; i++) {
+      const part = subPdfs[i]
+      Logger.info({ index: i + 1, bytes: part.length }, "Sending sub-PDF to Gemini")
+      const raw = await extractSemanticChunksFromPdf(part, opts as ChunkPdfOptions)
+      const chunks = parseGeminiChunkBlocks(raw)
+      for (const c of chunks) {
+        text_chunks.push(c)
+        text_chunk_pos.push(globalSeq++)
+      }
+    }
+  }
+
+  // As requested: image arrays are always empty/unified
+  const image_chunks: string[] = []
+  const image_chunk_pos: number[] = []
+
+  return { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos }
+}
diff --git a/server/package.json b/server/package.json
index b80ce4deb..d92167477 100644
--- a/server/package.json
+++ b/server/package.json
@@ -90,6 +90,7 @@
     "ora": "^8.1.1",
     "p-limit": "^6.2.0",
     "partial-json": "^0.1.7",
+    "pdf-lib": "^1.17.1",
     "pdf-parse": "^1.1.1",
     "pdfjs-dist": "^5.3.31",
     "pg-boss": "^10.1.5",
diff --git a/server/scripts/testGeminiFromProcessFile.ts b/server/scripts/testGeminiFromProcessFile.ts
new file mode 100644
index 000000000..80d5a4e28
--- /dev/null
+++ b/server/scripts/testGeminiFromProcessFile.ts
@@ -0,0 +1,109 @@
+#!/usr/bin/env bun
+
+import { promises as fs } from "fs"
+import path from "path"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
+
+type EnvMap = Record<string, string>
+const DEFAULT_TEST_PDF = ""
+
+async function loadEnvFile(envPath: string): Promise<EnvMap> {
+  try {
+    const raw = await fs.readFile(envPath, "utf8")
+    const map: EnvMap = {}
+    for (const line of raw.split(/\r?\n/)) {
+      const trimmed = line.trim()
+      if (!trimmed || trimmed.startsWith("#")) continue
+      const eq = trimmed.indexOf("=")
+      if (eq === -1) continue
+      const key = trimmed.slice(0, eq).trim()
+      let val = trimmed.slice(eq + 1).trim()
+      // Strip surrounding quotes if present
+      if ((val.startsWith('"') && val.endsWith('"')) || (val.startsWith("'") && val.endsWith("'"))) {
+        val = val.slice(1, -1)
+      }
+      map[key] = val
+      if (!(key in process.env)) {
+        process.env[key] = val
+      }
+    }
+    return map
+  } catch (err: any) {
+    if (err?.code !== "ENOENT") {
+      console.warn(`Warning: failed to read env file at ${envPath}:`, err)
+    }
+    return {}
+  }
+}
+
+function resolvePdfPath(args: string[], envs: EnvMap): string {
+  // Priority: CLI arg -> TEST_PDF_PATH ->
PDF_PATH + const cli = args[0] + const fromEnv = envs["TEST_PDF_PATH"] || envs["PDF_PATH"] || process.env.TEST_PDF_PATH || process.env.PDF_PATH + const p = cli || fromEnv || DEFAULT_TEST_PDF + return path.resolve(p) +} + +async function main() { + console.log("=== Gemini PDF Chunker (processFile simulation) ===") + + // Load env from server/.env (preferred) then from project .env (optional) + const cwd = process.cwd() + const serverEnvPath = path.resolve(cwd, "server/.env") + const rootEnvPath = path.resolve(cwd, ".env") + const envs = { + ...(await loadEnvFile(serverEnvPath)), + ...(await loadEnvFile(rootEnvPath)), + } + + // Resolve PDF path + const argv = process.argv.slice(2) + const pdfPath = resolvePdfPath(argv, envs) + console.log("PDF Path:", pdfPath) + + // Read the PDF file into a Buffer (simulate FileProcessorService.processFile input) + const buffer = await fs.readFile(pdfPath) + console.log("File size:", buffer.length, "bytes") + + // Simulate processFile -> extractTextAndImagesWithChunksFromPDFviaGemini call + const vespaDocId = "test-docid-gemini" + + console.log("\nCalling Gemini-backed extractor...") + const result = await extractTextAndImagesWithChunksFromPDFviaGemini( + new Uint8Array(buffer), + vespaDocId, + ) + + // Map to FileProcessorService result naming for clarity + const chunks = result.text_chunks + const chunks_pos = result.text_chunk_pos + const image_chunks = result.image_chunks + const image_chunks_pos = result.image_chunk_pos + + console.log("\n=== Results ===") + console.log("Text chunks:", chunks.length) + console.log("Text chunk positions:", chunks_pos.length) + console.log("Image chunks (should be 0):", image_chunks.length) + console.log("Image chunk positions (should be 0):", image_chunks_pos.length) + + console.log("All text chunks", { chunks }) + console.log("All text chunk positions", { chunks_pos }) + console.log("All image chunks", { image_chunks }) + console.log("All image chunk positions", { image_chunks_pos }) + + // Print chunks with their positions + // console.log("\n=== Text Chunks with Positions ===") + // for (let i = 0; i < chunks.length; i++) { + // const chunk = chunks[i] + // const pos = chunks_pos[i] + // console.log(`\n[${i}] pos=${pos}`) + // console.log(chunk) + // } + + console.log("\n=== Done ===") +} + +await main().catch((err) => { + console.error("Test failed:", err) + process.exit(1) +}) diff --git a/server/scripts/testPdfDirect.ts b/server/scripts/testPdfDirect.ts deleted file mode 100644 index 0f0adac30..000000000 --- a/server/scripts/testPdfDirect.ts +++ /dev/null @@ -1,89 +0,0 @@ -import { readFileSync } from "fs" -import { resolve } from "path" -import { FileProcessorService } from "@/services/fileProcessor" -import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks" - -async function testPdfDirect() { - let pdfPath = "/Users/aayush.shah/Downloads/small2.pdf" - // const pdfPath = "/Users/aayush.shah/Downloads/Aayush_Resume_2025.pdf" - pdfPath = "/Users/aayush.shah/Downloads/somatosensory.pdf" - try { - console.log("=== DIRECT PDF PROCESSING TEST ===") - console.log("PDF Path:", pdfPath) - - // Read the PDF file - console.log("\n1. Reading PDF file...") - const pdfBuffer = readFileSync(pdfPath) - console.log("File size:", pdfBuffer.length, "bytes") - - console.log("\n2. 
Testing direct PDF processing (current knowledge base flow)...") - console.log("This simulates exactly what happens in the knowledge base upload:") - console.log("- FileProcessorService.processFile() is called") - console.log("- extractImages defaults to false") - console.log("- describeImages defaults to false") - - // Test the exact flow used in knowledge base - const result = await FileProcessorService.processFile( - pdfBuffer, - "application/pdf", - "small2.pdf", - "test-doc-id", - pdfPath - // extractImages and describeImages default to false - ) - - console.log("\n=== RESULTS FROM KNOWLEDGE BASE FLOW ===") - console.log("Text chunks:", result.chunks.length) - console.log("Image chunks:", result.image_chunks.length) - console.log("Text chunk positions:", result.chunks_pos.length) - console.log("Image chunk positions:", result.image_chunks_pos.length) - - console.log("\n3. Testing with image processing enabled...") - console.log("Parameters: extractImages=true, describeImages=true") - - // Test with images enabled to see the difference - const imageResult = await extractTextAndImagesWithChunksFromPDF( - new Uint8Array(pdfBuffer), - "test-doc-with-images", - true, // extractImages enabled - true // describeImages enabled - ) - - console.log("\n=== RESULTS WITH IMAGES ENABLED ===") - console.log("Text chunks:", imageResult.text_chunks.length) - console.log("Image chunks:", imageResult.image_chunks.length) - console.log("Text chunk positions:", imageResult.text_chunk_pos.length) - console.log("Image chunk positions:", imageResult.image_chunk_pos.length) - - console.log("\n=== COMPARISON ===") - console.log("Current KB flow - Text chunks:", result.chunks.length, "Image chunks:", result.image_chunks.length) - console.log("With images - Text chunks:", imageResult.text_chunks.length, "Image chunks:", imageResult.image_chunks.length) - - if (result.chunks.length > 0) { - console.log("\n=== SAMPLE TEXT CHUNKS ===") - result.chunks.slice(0, 2).forEach((chunk, idx) => { - console.log(`\nText Chunk ${idx + 1}:`) - console.log(chunk) - }) - } - - if (imageResult.image_chunks.length > 0) { - console.log("\n=== SAMPLE IMAGE DESCRIPTIONS ===") - imageResult.image_chunks.forEach((chunk, idx) => { - console.log(`\nImage ${idx + 1}:`) - console.log(chunk) - }) - } - - console.log("\n=== TEST COMPLETED ===") - console.log("✓ Check the debug logs above from pdfChunks.ts") - console.log("✓ You can see exactly what's being processed in the current knowledge base flow") - - } catch (error) { - console.error("Error processing PDF:", error) - process.exit(1) - } -} - -// Run the test -testPdfDirect().catch(console.error) \ No newline at end of file diff --git a/server/services/fileProcessor.ts b/server/services/fileProcessor.ts index d15fca2bb..466bd3a93 100644 --- a/server/services/fileProcessor.ts +++ b/server/services/fileProcessor.ts @@ -1,6 +1,8 @@ import { getErrorMessage } from "@/utils" import { chunkDocument } from "@/chunks" -import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks" +// import { extractTextAndImagesWithChunksFromPDF } from "@/pdf + +import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini" import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" import * as XLSX from "xlsx" @@ -38,11 +40,9 @@ export class FileProcessorService { try { if (baseMimeType === "application/pdf") { // Process PDF - const result = await extractTextAndImagesWithChunksFromPDF( + const 
result = await extractTextAndImagesWithChunksFromPDFviaGemini( new Uint8Array(buffer), vespaDocId, - extractImages, - describeImages, ) chunks = result.text_chunks chunks_pos = result.text_chunk_pos