From 8a15aa727942fe393d9236d6981afa64f7b86895 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Mon, 6 Oct 2025 14:37:36 +0530 Subject: [PATCH 1/7] fix: sheet ingestion and rag on sheets --- server/ai/context.ts | 181 ++++++- server/api/chat/agents.ts | 60 ++- server/api/chat/chat.ts | 88 ++-- server/api/files.ts | 117 +++-- server/api/knowledgeBase.ts | 12 +- server/integrations/dataSource/config.ts | 13 +- server/integrations/dataSource/index.ts | 86 +--- server/integrations/google/config.ts | 6 +- server/integrations/google/index.ts | 118 ++--- server/integrations/google/worker-utils.ts | 171 ++++--- .../microsoft/attachment-utils.ts | 170 ++++--- server/integrations/ribbie/index.ts | 14 +- server/lib/duckdb.ts | 218 ++++++++ server/lib/sqlInference.ts | 126 +++++ server/package.json | 1 + server/queue/fileProcessor.ts | 129 +++-- server/services/fileProcessor.ts | 80 +-- server/sheetChunk.ts | 465 ++++++++++++++++++ server/types.ts | 25 + 19 files changed, 1589 insertions(+), 491 deletions(-) create mode 100644 server/lib/duckdb.ts create mode 100644 server/lib/sqlInference.ts create mode 100644 server/sheetChunk.ts diff --git a/server/ai/context.ts b/server/ai/context.ts index baec0421f..dd2fe3013 100644 --- a/server/ai/context.ts +++ b/server/ai/context.ts @@ -32,6 +32,151 @@ import { getSortedScoredImageChunks, } from "@xyne/vespa-ts/mappers" import type { UserMetadataType } from "@/types" +import { querySheetChunks } from "@/lib/duckdb" +import { chunkSheetWithHeaders } from "@/sheetChunk" + +// Utility function to extract header from chunks and remove headers from each chunk +const extractHeaderAndDataChunks = ( + chunks_summary: (string | { chunk: string; score: number; index: number })[] | undefined, + matchfeatures?: any +): { + chunks_summary: (string | { chunk: string; score: number; index: number })[]; + matchfeatures?: any; +} => { + if (!chunks_summary || chunks_summary.length === 0) { + return { chunks_summary: [], matchfeatures }; + } + + // Find the header from the first chunk + let headerChunk = ''; + if (chunks_summary.length > 0) { + const firstChunk = typeof chunks_summary[0] === "string" ? chunks_summary[0] : chunks_summary[0].chunk; + const lines = firstChunk.split('\n'); + if (lines.length > 0 && lines[0].includes('\t')) { + headerChunk = lines[0]; // Extract the header line + } + } + + // Process all chunks: remove header from each and keep only data rows + const processedChunks: (string | { chunk: string; score: number; index: number })[] = []; + let newMatchfeatures = matchfeatures; + + // Add header as first chunk if found, using the same structure as original + if (headerChunk) { + if (typeof chunks_summary[0] === "string") { + processedChunks.push(headerChunk); + } else { + processedChunks.push({ + chunk: headerChunk, + score: 1, + index: chunks_summary.length, + }); + } + + // Update matchfeatures to include the header chunk score + if (newMatchfeatures) { + const existingCells = newMatchfeatures.chunk_scores?.cells || {}; + const maxScore = Object.values(existingCells).length > 0 ? 
Math.max(...Object.values(existingCells as number[])) : 0; + // Create new chunk_scores that match the new chunks + const newChunkScores: Record = {} + newChunkScores["0"] = maxScore + 1 + Object.entries(existingCells).forEach(([idx, score]) => { + newChunkScores[(parseInt(idx) + 1).toString()] = score as number + }) + + newMatchfeatures = { + ...newMatchfeatures, + chunk_scores: { + cells: newChunkScores + } + }; + } + } + + // Process each original chunk: remove header and add data rows + for (let i = 0; i < chunks_summary.length; i++) { + const originalChunk = chunks_summary[i]; + const chunkContent = typeof originalChunk === "string" ? originalChunk : originalChunk.chunk; + const lines = chunkContent.split('\n'); + + // Skip the first line (header) and keep only data rows + const dataRows = lines.slice(1).filter(line => line.trim().length > 0); + if (dataRows.length > 0) { + const dataContent = dataRows.join('\n'); + + if (typeof originalChunk === "string") { + processedChunks.push(dataContent); + } else { + processedChunks.push({ + chunk: dataContent, + score: originalChunk.score, + index: originalChunk.index + }); + } + } + } + + return { chunks_summary: processedChunks, matchfeatures: newMatchfeatures }; +}; + +// Utility function to process sheet queries for spreadsheet files +const processSheetQuery = async ( + chunks_summary: (string | { chunk: string; score: number; index: number })[] | undefined, + query: string, + matchfeatures: any +): Promise<{ + chunks_summary: { chunk: string; score: number; index: number }[]; + matchfeatures: any; + maxSummaryChunks: number; +} | null> => { + const duckDBResult = await querySheetChunks( + chunks_summary?.map((c) => typeof c === "string" ? c : c.chunk) || [], + query + ) + + // If DuckDB query failed (null means not metric-related or SQL generation failed), return null to fallback to original approach + if (!duckDBResult) { + return null; + } + + // Create metadata chunk with query information (excluding data) + const metadataChunk = JSON.stringify({ + assumptions: duckDBResult.assumptions, + schema_fragment: duckDBResult.schema_fragment + }, null, 2) + + // Use chunkSheetWithHeaders to chunk the 2D array data + const dataChunks = chunkSheetWithHeaders(duckDBResult.data.rows, {headerRows: 1}) + + // Combine metadata chunk with data chunks + const allChunks = [metadataChunk, ...dataChunks] + + const newChunksSummary = allChunks.map((c, idx) => ({chunk: c, score: 0, index: idx})) + + // Update matchfeatures to correspond to the new chunks + let newMatchfeatures = matchfeatures + if (matchfeatures) { + // Create new chunk_scores that match the new chunks + const newChunkScores: Record = {} + allChunks.forEach((_, idx) => { + newChunkScores[idx.toString()] = 0 // All new chunks get score 0 + }) + + // Update the matchfeatures with new chunk_scores + newMatchfeatures = { + ...matchfeatures, + chunk_scores: { + cells: newChunkScores + } + } + } + + return { + chunks_summary: newChunksSummary, + matchfeatures: newMatchfeatures, + maxSummaryChunks: allChunks.length + } +} // Utility to capitalize the first letter of a string const capitalize = (str: string) => str.charAt(0).toUpperCase() + str.slice(1) @@ -573,7 +718,7 @@ const constructCollectionFileContext = ( isMsgWithSources?: boolean, ): string => { - if ((!maxSummaryChunks && !isSelectedFiles) || isMsgWithSources) { + if ((!maxSummaryChunks && !isSelectedFiles)) { maxSummaryChunks = fields.chunks_summary?.length } let chunks: ScoredChunk[] = [] @@ -738,13 +883,43 @@ export const 
answerColoredContextMap = ( } type AiContext = string -export const answerContextMap = ( +export const answerContextMap = async ( searchResult: VespaSearchResults, userMetadata: UserMetadataType, maxSummaryChunks?: number, isSelectedFiles?: boolean, isMsgWithSources?: boolean, -): AiContext => { + query?: string, +): Promise => { + if(searchResult.fields.sddocname === fileSchema || searchResult.fields.sddocname === dataSourceFileSchema || searchResult.fields.sddocname === KbItemsSchema || searchResult.fields.sddocname === mailAttachmentSchema) { + let mimeType + if(searchResult.fields.sddocname === mailAttachmentSchema) { + mimeType = searchResult.fields.fileType + } else { + mimeType = searchResult.fields.mimeType + } + if(mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || + mimeType === "application/vnd.ms-excel" || + mimeType === "text/csv") { + const result = extractHeaderAndDataChunks(searchResult.fields.chunks_summary, searchResult.fields.matchfeatures); + searchResult.fields.chunks_summary = result.chunks_summary; + if (result.matchfeatures) { + searchResult.fields.matchfeatures = result.matchfeatures; + } + + if (query) { + const sheetResult = await processSheetQuery(searchResult.fields.chunks_summary, query, searchResult.fields.matchfeatures) + if (sheetResult) { + const { chunks_summary, matchfeatures, maxSummaryChunks: newMaxSummaryChunks } = sheetResult + searchResult.fields.chunks_summary = chunks_summary + searchResult.fields.matchfeatures = matchfeatures + maxSummaryChunks = newMaxSummaryChunks + } else { + maxSummaryChunks = Math.min(searchResult.fields.chunks_summary?.length || 0, 100) + } + } + } + } if (searchResult.fields.sddocname === fileSchema) { return constructFileContext( searchResult.fields, diff --git a/server/api/chat/agents.ts b/server/api/chat/agents.ts index a23a9d361..963601f4c 100644 --- a/server/api/chat/agents.ts +++ b/server/api/chat/agents.ts @@ -135,6 +135,7 @@ import { getModelValueFromLabel } from "@/ai/modelConfig" import { buildContext, buildUserQuery, + expandSheetIds, getThreadContext, isContextSelected, UnderstandMessageAndAnswer, @@ -530,13 +531,21 @@ const checkAndYieldCitationsForAgent = async function* ( } } -const vespaResultToMinimalAgentFragment = ( +const vespaResultToMinimalAgentFragment = async ( child: VespaSearchResult, idx: number, userMetadata: UserMetadataType, -): MinimalAgentFragment => ({ + query: string, +): Promise => ({ id: `${(child.fields as any)?.docId || `Frangment_id_${idx}`}`, - content: answerContextMap(child as VespaSearchResults, userMetadata, 0, true), + content: await answerContextMap( + child as VespaSearchResults, + userMetadata, + 0, + true, + undefined, + query, + ), source: searchToCitation(child as VespaSearchResults), confidence: 1.0, }) @@ -839,15 +848,12 @@ export const MessageWithToolsApi = async (c: Context) => { } const attachmentMetadata = parseAttachmentMetadata(c) - const attachmentFileIds = attachmentMetadata.map( - (m: AttachmentMetadata) => m.fileId, - ) const imageAttachmentFileIds = attachmentMetadata .filter((m) => m.isImage) .map((m) => m.fileId) const nonImageAttachmentFileIds = attachmentMetadata .filter((m) => !m.isImage) - .map((m) => m.fileId) + .flatMap((m) => expandSheetIds(m.fileId)) let attachmentStorageError: Error | null = null const contextExtractionSpan = initSpan.startSpan("context_extraction") @@ -866,7 +872,7 @@ export const MessageWithToolsApi = async (c: Context) => { extractedInfo?.totalValidFileIdsFromLinkCount loggerWithChild({ email: 
email }).info(`Extracted ${fileIds} extractedInfo`) loggerWithChild({ email: email }).info( - `Total attachment files received: ${attachmentFileIds.length}`, + `Total attachment files received: ${attachmentMetadata.length}`, ) const hasReferencedContext = fileIds && fileIds.length > 0 contextExtractionSpan.setAttribute("file_ids_count", fileIds?.length || 0) @@ -1521,11 +1527,13 @@ export const MessageWithToolsApi = async (c: Context) => { if (results?.root?.children && results.root.children.length > 0) { const contextPromises = results?.root?.children?.map( async (v, i) => { - let content = answerContextMap( + let content = await answerContextMap( v as VespaSearchResults, userMetadata, 0, true, + undefined, + message, ) const chatContainerFields = isChatContainerFields(v.fields) && @@ -1593,22 +1601,24 @@ export const MessageWithToolsApi = async (c: Context) => { planningContext += "\n" + buildContext(threadContexts, 10, userMetadata) } - gatheredFragments = results.root.children.map( - (child: VespaSearchResult, idx) => - vespaResultToMinimalAgentFragment(child, idx, userMetadata), + gatheredFragments = await Promise.all( + results.root.children.map( + async (child: VespaSearchResult, idx) => + await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message), + ) ) if (chatContexts.length > 0) { gatheredFragments.push( - ...chatContexts.map((child, idx) => - vespaResultToMinimalAgentFragment(child, idx, userMetadata), - ), + ...(await Promise.all(chatContexts.map(async (child, idx) => + await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message), + ))), ) } if (threadContexts.length > 0) { gatheredFragments.push( - ...threadContexts.map((child, idx) => - vespaResultToMinimalAgentFragment(child, idx, userMetadata), - ), + ...(await Promise.all(threadContexts.map(async (child, idx) => + await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message), + ))), ) } const parseSynthesisOutput = await performSynthesis( @@ -2881,9 +2891,9 @@ export const AgentMessageApiRagOff = async (c: Context) => { chunksSpan.end() if (allChunks?.root?.children) { const startIndex = 0 - fragments = allChunks.root.children.map((child, idx) => - vespaResultToMinimalAgentFragment(child, idx, userMetadata), - ) + fragments = await Promise.all(allChunks.root.children.map(async (child, idx) => + await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message) + )) context = answerContextMapFromFragments( fragments, maxDefaultSummary, @@ -3149,9 +3159,9 @@ export const AgentMessageApiRagOff = async (c: Context) => { if (docIds.length > 0) { const allChunks = await GetDocumentsByDocIds(docIds, chunksSpan) if (allChunks?.root?.children) { - fragments = allChunks.root.children.map((child, idx) => - vespaResultToMinimalAgentFragment(child, idx, userMetadata), - ) + fragments = await Promise.all(allChunks.root.children.map(async (child, idx) => + await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message), + )) context = answerContextMapFromFragments( fragments, maxDefaultSummary, @@ -3365,7 +3375,7 @@ export const AgentMessageApi = async (c: Context) => { .map((m) => m.fileId) const nonImageAttachmentFileIds = attachmentMetadata .filter((m) => !m.isImage) - .map((m) => m.fileId) + .flatMap((m) => expandSheetIds(m.fileId)) let attachmentStorageError: Error | null = null let { message, diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index 33a480f2f..fde97d412 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -217,6 +217,26 @@ 
import { getChunkCountPerDoc } from "./chunk-selection" const METADATA_NO_DOCUMENTS_FOUND = "METADATA_NO_DOCUMENTS_FOUND_INTERNAL" const METADATA_FALLBACK_TO_RAG = "METADATA_FALLBACK_TO_RAG_INTERNAL" +export function expandSheetIds(fileId: string): string[] { + // Check if the fileId matches the pattern docId_sheet_number + const sheetMatch = fileId.match(/^(.+)_sheet_(\d+)$/) + + if (!sheetMatch) { + // Not a sheet ID, return as is + return [fileId] + } + + const [, docId, sheetNumberStr] = sheetMatch + const sheetNumber = parseInt(sheetNumberStr, 10) + // Generate IDs from docId_sheet_0 to docId_sheet_number + const expandedIds: string[] = [] + for (let i = 0; i < Math.max(sheetNumber, 1); i++) { + expandedIds.push(`${docId}_sheet_${i}`) + } + + return expandedIds +} + export async function resolveNamesToEmails( intent: Intent, email: string, @@ -853,10 +873,13 @@ export const ChatDeleteApi = async (c: Context) => { for (const fileId of nonImageAttachmentFileIds) { try { // Delete from Vespa kb_items schema using the proper Vespa function - await DeleteDocument(fileId, KbItemsSchema) - loggerWithChild({ email: email }).info( - `Successfully deleted non-image attachment ${fileId} from Vespa kb_items schema`, - ) + const vespaIds = expandSheetIds(fileId) + vespaIds.forEach(async (id) => { + await DeleteDocument(id, KbItemsSchema) + loggerWithChild({ email: email }).info( + `Successfully deleted non-image attachment ${id} from Vespa kb_items schema`, + ) + }) } catch (error) { const errorMessage = getErrorMessage(error) if (errorMessage.includes("404 Not Found")) { @@ -1189,24 +1212,26 @@ export const replaceDocIdwithUserDocId = async ( return userMap[email] ?? docId } -export function buildContext( +export async function buildContext( results: VespaSearchResult[], maxSummaryCount: number | undefined, userMetadata: UserMetadataType, startIndex: number = 0, -): string { - return cleanContext( - results - ?.map( - (v, i) => - `Index ${i + startIndex} \n ${answerContextMap( - v as VespaSearchResults, - userMetadata, - maxSummaryCount, - )}`, - ) - ?.join("\n"), + builtUserQuery?: string, +): Promise { + const contextPromises = results?.map( + async (v, i) => + `Index ${i + startIndex} \n ${await answerContextMap( + v as VespaSearchResults, + userMetadata, + maxSummaryCount, + undefined, + undefined, + builtUserQuery, + )}`, ) + const contexts = await Promise.all(contextPromises || []) + return cleanContext(contexts.join("\n")) } async function* generateIterativeTimeFilterAndQueryRewrite( @@ -1563,10 +1588,12 @@ async function* generateIterativeTimeFilterAndQueryRewrite( ) vespaSearchSpan?.end() - const initialContext = buildContext( + const initialContext = await buildContext( results?.root?.children, maxSummaryCount, - userMetadata + userMetadata, + 0, + message, ) const queryRewriteSpan = rewriteSpan?.startSpan("query_rewriter") @@ -1695,7 +1722,7 @@ async function* generateIterativeTimeFilterAndQueryRewrite( ) totalResultsSpan?.end() const contextSpan = querySpan?.startSpan("build_context") - const initialContext = buildContext(totalResults, maxSummaryCount, userMetadata) + const initialContext = await buildContext(totalResults, maxSummaryCount, userMetadata, 0, message) const { imageFileNames } = extractImageFileNames( initialContext, @@ -1883,11 +1910,12 @@ async function* generateIterativeTimeFilterAndQueryRewrite( pageSearchSpan?.end() const startIndex = isReasoning ? 
previousResultsLength : 0 const contextSpan = pageSpan?.startSpan("build_context") - const initialContext = buildContext( + const initialContext = await buildContext( results?.root?.children, maxSummaryCount, userMetadata, startIndex, + message, ) const { imageFileNames } = extractImageFileNames( @@ -2172,12 +2200,13 @@ async function* generateAnswerFromGivenContext( const startIndex = isReasoning ? previousResultsLength : 0 const contextPromises = combinedSearchResponse?.map(async (v, i) => { - let content = answerContextMap( + let content = await answerContextMap( v as VespaSearchResults, userMetadata, i < chunksPerDocument.length ? chunksPerDocument[i] : 0, true, isMsgWithSources, + message, ) if ( v.fields && @@ -2808,11 +2837,12 @@ async function* generatePointQueryTimeExpansion( // Prepare context for LLM const contextSpan = iterationSpan?.startSpan("build_context") const startIndex = isReasoning ? previousResultsLength : 0 - const initialContext = buildContext( + const initialContext = await buildContext( combinedResults?.root?.children, maxSummaryCount, userMetadata, startIndex, + message, ) const { imageFileNames } = extractImageFileNames( @@ -2943,7 +2973,7 @@ async function* processResultsForMetadata( "Document chunk size", `full_context maxed to ${chunksCount}`, ) - const context = buildContext(items, chunksCount, userMetadata) + const context = await buildContext(items, chunksCount, userMetadata, 0, input) const { imageFileNames } = extractImageFileNames(context, items) const streamOptions = { stream: true, @@ -3276,7 +3306,7 @@ async function* generateMetadataQueryAnswer( ), ) - pageSpan?.setAttribute("context", buildContext(items, 20, userMetadata)) + pageSpan?.setAttribute("context", await buildContext(items, 20, userMetadata, 0, input)) if (!items.length) { loggerWithChild({ email: email }).info( `No documents found on iteration ${iteration}${ @@ -3447,7 +3477,7 @@ async function* generateMetadataQueryAnswer( ), ) - span?.setAttribute("context", buildContext(items, 20, userMetadata)) + span?.setAttribute("context", await buildContext(items, 20, userMetadata, 0, input)) span?.end() loggerWithChild({ email: email }).info( `Retrieved Documents : ${QueryType.GetItems} - ${items.length}`, @@ -3583,7 +3613,7 @@ async function* generateMetadataQueryAnswer( items.map((v: VespaSearchResult) => (v.fields as any).docId), ), ) - iterationSpan?.setAttribute(`context`, buildContext(items, 20, userMetadata)) + iterationSpan?.setAttribute(`context`, await buildContext(items, 20, userMetadata, 0, input)) iterationSpan?.end() loggerWithChild({ email: email }).info( @@ -4172,7 +4202,7 @@ export const MessageApi = async (c: Context) => { .map((m) => m.fileId) const nonImageAttachmentFileIds = attachmentMetadata .filter((m) => !m.isImage) - .map((m) => m.fileId) + .flatMap((m) => expandSheetIds(m.fileId)) if (agentPromptValue) { const userAndWorkspaceCheck = await getUserAndWorkspaceByEmail( @@ -4229,7 +4259,7 @@ export const MessageApi = async (c: Context) => { try { const resp = await getCollectionFilesVespaIds(JSON.parse(kbItems), db) fileIds = resp - .map((file) => file.vespaDocId || "") + .flatMap((file) => expandSheetIds(file.vespaDocId || "")) .filter((id) => id !== "") } catch { fileIds = [] diff --git a/server/api/files.ts b/server/api/files.ts index 17d0ac751..f1932f903 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -20,7 +20,7 @@ import { HTTPException } from "hono/http-exception" import { isValidFile, isImageFile } from "shared/fileUtils" import { generateThumbnail, 
getThumbnailPath } from "@/utils/image" import type { AttachmentMetadata } from "@/shared/types" -import { FileProcessorService } from "@/services/fileProcessor" +import { FileProcessorService, type SheetProcessingResult } from "@/services/fileProcessor" import { Apps, KbItemsSchema, KnowledgeBaseEntity } from "@xyne/vespa-ts/types" import { getBaseMimeType } from "@/integrations/dataSource/config" import { isDataSourceError } from "@/integrations/dataSource/errors" @@ -218,6 +218,7 @@ export const handleAttachmentUpload = async (c: Context) => { } const attachmentMetadata: AttachmentMetadata[] = [] + let vespaId : string = "" for (const file of files) { const fileBuffer = await file.arrayBuffer() @@ -247,7 +248,7 @@ export const handleAttachmentUpload = async (c: Context) => { // For non-images: process through FileProcessorService and ingest into Vespa // Process the file content using FileProcessorService - const processingResult = await FileProcessorService.processFile( + const processingResults = await FileProcessorService.processFile( Buffer.from(fileBuffer), file.type, file.name, @@ -257,61 +258,75 @@ export const handleAttachmentUpload = async (c: Context) => { false, ) - // TODO: Ingest the processed content into Vespa - // This would typically involve calling your Vespa ingestion service - // For now, we'll log the processing result - loggerWithChild({ email }).info( - `Processed non-image file "${file.name}" with ${processingResult.chunks.length} text chunks and ${processingResult.image_chunks.length} image chunks`, - ) - - const { chunks, chunks_pos, image_chunks, image_chunks_pos } = - processingResult - - const vespaDoc = { - docId: fileId, - clId: "attachment", - itemId: fileId, - fileName: file.name, - app: Apps.KnowledgeBase as const, - entity: KnowledgeBaseEntity.Attachment, - description: "", - storagePath: "", - chunks: chunks, - chunks_pos: chunks_pos, - image_chunks: image_chunks, - image_chunks_pos: image_chunks_pos, - chunks_map: chunks.map((_, index) => ({ - chunk_index: index, - page_number: 0, - block_labels: [], - })), - image_chunks_map: image_chunks.map((_, index) => ({ - chunk_index: index, - page_number: 0, - block_labels: [], - })), - metadata: JSON.stringify({ - originalFileName: file.name, - uploadedBy: email, - chunksCount: chunks.length, - imageChunksCount: image_chunks.length, - processingMethod: getBaseMimeType(file.type || "text/plain"), - lastModified: Date.now(), - }), - createdBy: email, - duration: 0, - mimeType: getBaseMimeType(file.type || "text/plain"), - fileSize: file.size, - createdAt: Date.now(), - updatedAt: Date.now(), + if(processingResults.length > 0 && 'totalSheets' in processingResults[0]) { + vespaId = `${fileId}_sheet_${(processingResults[0] as SheetProcessingResult).totalSheets}` + } else { + vespaId = fileId } + // Handle multiple processing results (e.g., for spreadsheets with multiple sheets) + for (const [resultIndex, processingResult] of processingResults.entries()) { + let docId = fileId + let fileName = file.name + + // For sheet processing results, append sheet information + if ('sheetName' in processingResult) { + const sheetResult = processingResult as SheetProcessingResult + fileName = processingResults.length > 1 + ? 
`${file.name} / ${sheetResult.sheetName}` + : file.name + docId = sheetResult.docId + } + + loggerWithChild({ email }).info( + `Processed non-image file "${fileName}" with ${processingResult.chunks.length} text chunks and ${processingResult.image_chunks.length} image chunks`, + ) - await insert(vespaDoc, KbItemsSchema) + const { chunks, chunks_pos, image_chunks, image_chunks_pos } = + processingResult + + const vespaDoc = { + docId: docId, + clId: "attachment", + itemId: fileId, + fileName: fileName, + app: Apps.KnowledgeBase as const, + entity: KnowledgeBaseEntity.Attachment, + description: "", + storagePath: "", + chunks: chunks, + chunks_pos: chunks_pos, + image_chunks: image_chunks, + image_chunks_pos: image_chunks_pos, + chunks_map: processingResult.chunks_map, + image_chunks_map: processingResult.image_chunks_map, + metadata: JSON.stringify({ + originalFileName: file.name, + uploadedBy: email, + chunksCount: chunks.length, + imageChunksCount: image_chunks.length, + processingMethod: getBaseMimeType(file.type || "text/plain"), + lastModified: Date.now(), + ...(('sheetName' in processingResult) && { + sheetName: (processingResult as SheetProcessingResult).sheetName, + sheetIndex: (processingResult as SheetProcessingResult).sheetIndex, + totalSheets: (processingResult as SheetProcessingResult).totalSheets, + }), + }), + createdBy: email, + duration: 0, + mimeType: getBaseMimeType(file.type || "text/plain"), + fileSize: file.size, + createdAt: Date.now(), + updatedAt: Date.now(), + } + + await insert(vespaDoc, KbItemsSchema) + } } // Create attachment metadata const metadata: AttachmentMetadata = { - fileId, + fileId: vespaId, fileName: file.name, fileType: file.type, fileSize: file.size, diff --git a/server/api/knowledgeBase.ts b/server/api/knowledgeBase.ts index 39878d233..3fa7a1622 100644 --- a/server/api/knowledgeBase.ts +++ b/server/api/knowledgeBase.ts @@ -54,6 +54,7 @@ import { } from "@/integrations/dataSource/config" import { getAuth, safeGet } from "./agent" import { ApiKeyScopes, UploadStatus } from "@/shared/types" +import { expandSheetIds } from "./chat/chat" const EXTENSION_MIME_MAP: Record = { ".pdf": "application/pdf", @@ -1583,10 +1584,13 @@ export const DeleteItemApi = async (c: Context) => { try { // Delete from Vespa if (itemToDelete.vespaDocId) { - await DeleteDocument(itemToDelete.vespaDocId, KbItemsSchema) - loggerWithChild({ email: userEmail }).info( - `Deleted file from Vespa: ${itemToDelete.vespaDocId}`, - ) + const vespaDocIds = expandSheetIds(itemToDelete.vespaDocId) + vespaDocIds.forEach(async (id) => { + await DeleteDocument(id, KbItemsSchema) + loggerWithChild({ email: userEmail }).info( + `Deleted file from Vespa: ${id}`, + ) + }) } } catch (error) { loggerWithChild({ email: userEmail }).warn( diff --git a/server/integrations/dataSource/config.ts b/server/integrations/dataSource/config.ts index b79b6180a..6c98f272a 100644 --- a/server/integrations/dataSource/config.ts +++ b/server/integrations/dataSource/config.ts @@ -1,4 +1,3 @@ -import path from "path" export const DATASOURCE_CONFIG = { // File size limits @@ -18,19 +17,15 @@ export const DATASOURCE_CONFIG = { process.env.DATASOURCE_MAX_PPTX_TEXT_LEN || "300000", 10, ), + MAX_SPREADSHEET_FILE_SIZE_MB: parseInt( + process.env.DATASOURCE_MAX_SPREADSHEET_FILE_SIZE_MB || "15", + 10, + ), MAX_TEXT_FILE_SIZE_MB: parseInt( process.env.DATASOURCE_MAX_TEXT_FILE_SIZE_MB || "40", 10, ), MAX_CHUNK_SIZE: parseInt(process.env.DATASOURCE_MAX_CHUNK_SIZE || "512", 10), - MAX_ATTACHMENT_SHEET_ROWS: parseInt( - 
process.env.DATASOURCE_MAX_ATTACHMENT_SHEET_ROWS || "3000", - 10, - ), - MAX_ATTACHMENT_SHEET_TEXT_LEN: parseInt( - process.env.DATASOURCE_MAX_ATTACHMENT_SHEET_TEXT_LEN || "300000", - 10, - ), MAX_IMAGE_FILE_SIZE_MB: parseInt( process.env.DATASOURCE_MAX_IMAGE_FILE_SIZE_MB || "40", 10, diff --git a/server/integrations/dataSource/index.ts b/server/integrations/dataSource/index.ts index d186e5797..bd6a94520 100644 --- a/server/integrations/dataSource/index.ts +++ b/server/integrations/dataSource/index.ts @@ -37,6 +37,7 @@ import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" import imageType from "image-type" import { NAMESPACE } from "@/config" +import { chunkSheetWithHeaders } from "@/sheetChunk" const Logger = getLogger(Subsystem.Integrations).child({ module: "dataSourceIntegration", @@ -374,27 +375,28 @@ const processSpreadsheetFile = async ( const worksheet = workbook.Sheets[sheetName] if (!worksheet) continue - const sheetData: string[][] = XLSX.utils.sheet_to_json(worksheet, { - header: 1, - defval: "", - raw: false, - }) + // const sheetData: string[][] = XLSX.utils.sheet_to_json(worksheet, { + // header: 1, + // defval: "", + // raw: false, + // }) - const validRows = sheetData.filter((row) => - row.some((cell) => cell && cell.toString().trim().length > 0), - ) + // const validRows = sheetData.filter((row) => + // row.some((cell) => cell && cell.toString().trim().length > 0), + // ) - if (validRows.length === 0) continue + // if (validRows.length === 0) continue - if (validRows?.length > DATASOURCE_CONFIG.MAX_ATTACHMENT_SHEET_ROWS) { - // If there are more rows than MAX_GD_SHEET_ROWS, still index it but with empty content - // Logger.warn( - // `Large no. of rows in ${spreadsheet.name} -> ${sheet.sheetTitle}, indexing with empty content`, - // ) - return [] - } + // if (validRows?.length > DATASOURCE_CONFIG.MAX_ATTACHMENT_SHEET_ROWS) { + // // If there are more rows than MAX_GD_SHEET_ROWS, still index it but with empty content + // // Logger.warn( + // // `Large no. 
of rows in ${spreadsheet.name} -> ${sheet.sheetTitle}, indexing with empty content`, + // // ) + // return [] + // } - const sheetChunks = chunkSheetRows(validRows) + // Use the new header-preserving chunking function + const sheetChunks = chunkSheetWithHeaders(worksheet) const filteredChunks = sheetChunks.filter( (chunk) => chunk.trim().length > 0, @@ -467,55 +469,6 @@ const processSpreadsheetFile = async ( } } -// Function to chunk sheet rows (simplified version of chunkFinalRows) -const chunkSheetRows = (allRows: string[][]): string[] => { - const chunks: string[] = [] - let currentChunk = "" - let totalTextLength = 0 - const MAX_CHUNK_SIZE = 512 - - for (const row of allRows) { - // Filter out numerical cells and empty strings, join textual cells - const textualCells = row - .filter( - (cell) => - cell && isNaN(Number(cell)) && cell.toString().trim().length > 0, - ) - .map((cell) => cell.toString().trim()) - - if (textualCells.length === 0) continue - - const rowText = textualCells.join(" ") - - // Check if adding this rowText would exceed the maximum text length - if ( - totalTextLength + rowText.length > - DATASOURCE_CONFIG.MAX_ATTACHMENT_SHEET_TEXT_LEN - ) { - // Logger.warn(`Text length excedded, indexing with empty content`) - // Return an empty array if the total text length exceeds the limit - return [] - } - - totalTextLength += rowText.length - - if ((currentChunk + " " + rowText).trim().length > MAX_CHUNK_SIZE) { - if (currentChunk.trim().length > 0) { - chunks.push(currentChunk.trim()) - } - currentChunk = rowText - } else { - currentChunk += (currentChunk ? " " : "") + rowText - } - } - - if (currentChunk.trim().length > 0) { - chunks.push(currentChunk.trim()) - } - - return chunks -} - // Main export function export const handleDataSourceFileUpload = async ( file: File, @@ -596,6 +549,7 @@ export const handleDataSourceFileUpload = async ( const processedFile = await processPptxContent(fileBuffer, options) processedFiles = [processedFile] } else if (isSheetFile(mimeType)) { + checkFileSize(file, DATASOURCE_CONFIG.MAX_SPREADSHEET_FILE_SIZE_MB) const fileBuffer = Buffer.from(await file.arrayBuffer()) processedFiles = await processSheetContent(fileBuffer, options) } else if (isTextFile(mimeType)) { diff --git a/server/integrations/google/config.ts b/server/integrations/google/config.ts index b5ef691c0..475b658f3 100644 --- a/server/integrations/google/config.ts +++ b/server/integrations/google/config.ts @@ -12,8 +12,7 @@ export const scopes = [ ] export const MAX_GD_PDF_SIZE = 15 // In MB -export const MAX_GD_SHEET_ROWS = 3000 -export const MAX_GD_SHEET_TEXT_LEN = 300000 +export const MAX_GD_SHEET_SIZE = 15 // In MB export const MAX_GD_SLIDES_TEXT_LEN = 300000 export const ServiceAccountUserConcurrency = 2 export const GoogleDocsConcurrency = 8 @@ -24,8 +23,7 @@ export const MAX_ATTACHMENT_PDF_SIZE = 15 export const MAX_ATTACHMENT_TEXT_SIZE = 10 export const MAX_ATTACHMENT_DOCX_SIZE = 15 export const MAX_ATTACHMENT_PPTX_SIZE = 15 -export const MAX_ATTACHMENT_SHEET_ROWS = 3000 -export const MAX_ATTACHMENT_SHEET_TEXT_LEN = 300000 +export const MAX_ATTACHMENT_SHEET_SIZE = 15 // if true will directly ingest the data without checking // if false will check for its existance in vespa diff --git a/server/integrations/google/index.ts b/server/integrations/google/index.ts index 3c086ea57..373f3cbc0 100644 --- a/server/integrations/google/index.ts +++ b/server/integrations/google/index.ts @@ -100,8 +100,7 @@ import { unlink } from "node:fs/promises" import type { Document } from 
"@langchain/core/documents" import { MAX_GD_PDF_SIZE, - MAX_GD_SHEET_ROWS, - MAX_GD_SHEET_TEXT_LEN, + MAX_GD_SHEET_SIZE, MAX_GD_SLIDES_TEXT_LEN, PDFProcessingConcurrency, ServiceAccountUserConcurrency, @@ -1074,6 +1073,7 @@ import { totalAttachmentIngested, totalIngestedMails, } from "@/metrics/google/gmail-metrics" +import { chunkSheetWithHeaders } from "@/sheetChunk" const stats = z.object({ type: z.literal(WorkerResponseTypes.Stats), @@ -2186,49 +2186,49 @@ export const getSpreadsheet = async ( // Adds rows' string data to a chunk until the 512-character limit is exceeded // If adding a row exceeds the limit, the chunk is added to the next chunk // Otherwise, the row is added to the current chunk -const chunkFinalRows = (allRows: string[][]): string[] => { - const chunks: string[] = [] - let currentChunk = "" - let totalTextLength = 0 - - for (const row of allRows) { - // Filter out numerical cells and empty strings - const textualCells = row.filter( - (cell) => isNaN(Number(cell)) && cell.trim().length > 0, - ) - - if (textualCells.length === 0) continue // Skip if no textual data - - const rowText = textualCells.join(" ") - - // Check if adding this rowText would exceed the maximum text length - if (totalTextLength + rowText.length > MAX_GD_SHEET_TEXT_LEN) { - // Logger.warn(`Text length excedded, indexing with empty content`) - // Return an empty array if the total text length exceeds the limit - return [] - } - - totalTextLength += rowText.length - - if ((currentChunk + " " + rowText).trim().length > 512) { - // Add the current chunk to the list and start a new chunk - if (currentChunk.trim().length > 0) { - chunks.push(currentChunk.trim()) - } - currentChunk = rowText - } else { - // Append the row text to the current chunk - currentChunk += " " + rowText - } - } - - if (currentChunk.trim().length > 0) { - // Add any remaining text as the last chunk - chunks.push(currentChunk.trim()) - } - - return chunks -} +// const chunkFinalRows = (allRows: string[][]): string[] => { +// const chunks: string[] = [] +// let currentChunk = "" +// let totalTextLength = 0 + +// for (const row of allRows) { +// // Filter out numerical cells and empty strings +// const textualCells = row.filter( +// (cell) => isNaN(Number(cell)) && cell.trim().length > 0, +// ) + +// if (textualCells.length === 0) continue // Skip if no textual data + +// const rowText = textualCells.join(" ") + +// // Check if adding this rowText would exceed the maximum text length +// if (totalTextLength + rowText.length > MAX_GD_SHEET_TEXT_LEN) { +// // Logger.warn(`Text length excedded, indexing with empty content`) +// // Return an empty array if the total text length exceeds the limit +// return [] +// } + +// totalTextLength += rowText.length + +// if ((currentChunk + " " + rowText).trim().length > 512) { +// // Add the current chunk to the list and start a new chunk +// if (currentChunk.trim().length > 0) { +// chunks.push(currentChunk.trim()) +// } +// currentChunk = rowText +// } else { +// // Append the row text to the current chunk +// currentChunk += " " + rowText +// } +// } + +// if (currentChunk.trim().length > 0) { +// // Add any remaining text as the last chunk +// chunks.push(currentChunk.trim()) +// } + +// return chunks +// } export const getSheetsListFromOneSpreadsheet = async ( sheets: sheets_v4.Sheets, @@ -2245,6 +2245,13 @@ export const getSheetsListFromOneSpreadsheet = async ( userEmail, ) + if (spreadsheet.size && parseInt(spreadsheet.size) > MAX_GD_SHEET_SIZE) { + loggerWithChild({ email: userEmail 
}).warn( + `Ignoring ${spreadsheet.name} as its more than ${MAX_GD_SHEET_SIZE} MB`, + ) + return [] + } + if (spreadSheetData) { // Now we should get all sheets inside this spreadsheet using the spreadSheetData const allSheetsFromSpreadSheet = await getAllSheetsFromSpreadSheet( @@ -2282,15 +2289,15 @@ export const getSheetsListFromOneSpreadsheet = async ( let chunks: string[] = [] - if (finalRows?.length > MAX_GD_SHEET_ROWS) { - // If there are more rows than MAX_GD_SHEET_ROWS, still index it but with empty content - // Logger.warn( - // `Large no. of rows in ${spreadsheet.name} -> ${sheet.sheetTitle}, indexing with empty content`, - // ) - chunks = [] - } else { - chunks = chunkFinalRows(finalRows) - } + // if (finalRows?.length > MAX_GD_SHEET_ROWS) { + // // If there are more rows than MAX_GD_SHEET_ROWS, still index it but with empty content + // // Logger.warn( + // // `Large no. of rows in ${spreadsheet.name} -> ${sheet.sheetTitle}, indexing with empty content`, + // // ) + // chunks = [] + // } else { + chunks = chunkSheetWithHeaders(finalRows) + // } const sheetDataToBeIngested = { title: `${spreadsheet.name} / ${sheet?.sheetTitle}`, @@ -2965,12 +2972,13 @@ export async function* listFiles( client: GoogleClient, startDate?: string, endDate?: string, + q?: string, ): AsyncIterableIterator { const drive = google.drive({ version: "v3", auth: client }) let nextPageToken = "" // Build the query with date filters if provided - let query = "trashed = false" + let query = `${q} and trashed = false` const dateFilters: string[] = [] if (startDate) { diff --git a/server/integrations/google/worker-utils.ts b/server/integrations/google/worker-utils.ts index e2c55bfa5..b6874da87 100644 --- a/server/integrations/google/worker-utils.ts +++ b/server/integrations/google/worker-utils.ts @@ -13,13 +13,13 @@ import { MAX_ATTACHMENT_TEXT_SIZE, MAX_ATTACHMENT_DOCX_SIZE, MAX_ATTACHMENT_PPTX_SIZE, - MAX_ATTACHMENT_SHEET_ROWS, - MAX_ATTACHMENT_SHEET_TEXT_LEN, + MAX_ATTACHMENT_SHEET_SIZE, } from "@/integrations/google/config" import * as XLSX from "xlsx" import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini" +import { chunkSheetWithHeaders } from "@/sheetChunk" const Logger = getLogger(Subsystem.Integrations).child({ module: "google" }) @@ -322,6 +322,13 @@ export const getGmailSpreadsheetSheets = async ( mimeType === "application/vnd.ms-excel" || mimeType === "text/csv" ) { + const fileSizeMB = size.value / (1024 * 1024) + if (fileSizeMB > MAX_ATTACHMENT_SHEET_SIZE) { + Logger.error( + `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_SHEET_SIZE} MB`, + ) + return null + } const sheetsData = await processSpreadsheetFileWithSheetInfo( attachmentBuffer, filename, @@ -412,41 +419,43 @@ export const processSpreadsheetFileWithSheetInfo = async ( if (!worksheet) continue // Get the range of the worksheet - const range = XLSX.utils.decode_range(worksheet["!ref"] || "A1") - const totalRows = range.e.r - range.s.r + 1 + // const range = XLSX.utils.decode_range(worksheet["!ref"] || "A1") + // const totalRows = range.e.r - range.s.r + 1 // Skip sheets with too many rows - if (totalRows > MAX_ATTACHMENT_SHEET_ROWS) { - Logger.warn( - `Sheet "${sheetName}" in ${filename} has ${totalRows} rows (max: ${MAX_ATTACHMENT_SHEET_ROWS}), skipping`, - ) - continue - } - - // Convert sheet to JSON array of arrays with a row limit - const sheetData: 
string[][] = XLSX.utils.sheet_to_json(worksheet, { - header: 1, - defval: "", - raw: false, - range: 0, // Start from first row - blankrows: false, - }) - - // Clean and get valid rows - const validRows = sheetData.filter((row) => - row.some((cell) => cell && cell.toString().trim().length > 0), - ) - - if (validRows.length === 0) { - Logger.debug(`Sheet "${sheetName}" has no valid content, skipping`) - continue - } - - // Chunk the rows for this specific sheet - const sheetChunks = chunkSheetRows(validRows) - const filteredSheetChunks = sheetChunks.filter( - (chunk) => chunk.trim().length > 0, - ) + // if (totalRows > MAX_ATTACHMENT_SHEET_ROWS) { + // Logger.warn( + // `Sheet "${sheetName}" in ${filename} has ${totalRows} rows (max: ${MAX_ATTACHMENT_SHEET_ROWS}), skipping`, + // ) + // continue + // } + + // // Convert sheet to JSON array of arrays with a row limit + // const sheetData: string[][] = XLSX.utils.sheet_to_json(worksheet, { + // header: 1, + // defval: "", + // raw: false, + // range: 0, // Start from first row + // blankrows: false, + // }) + + // // Clean and get valid rows + // const validRows = sheetData.filter((row) => + // row.some((cell) => cell && cell.toString().trim().length > 0), + // ) + + // if (validRows.length === 0) { + // Logger.debug(`Sheet "${sheetName}" has no valid content, skipping`) + // continue + // } + + // // Chunk the rows for this specific sheet + // const sheetChunks = chunkSheetRows(validRows) + // const filteredSheetChunks = sheetChunks.filter( + // (chunk) => chunk.trim().length > 0, + // ) + + const filteredSheetChunks = chunkSheetWithHeaders(worksheet); if (filteredSheetChunks.length === 0) { Logger.debug( @@ -526,49 +535,49 @@ export const processSpreadsheetFileWithSheetInfo = async ( } // Function to chunk sheet rows (simplified version of chunkFinalRows) -const chunkSheetRows = (allRows: string[][]): string[] => { - const chunks: string[] = [] - let currentChunk = "" - let totalTextLength = 0 - const MAX_CHUNK_SIZE = 512 - - for (const row of allRows) { - // Filter out numerical cells and empty strings, join textual cells - const textualCells = row - .filter( - (cell) => - cell && isNaN(Number(cell)) && cell.toString().trim().length > 0, - ) - .map((cell) => cell.toString().trim()) - - if (textualCells.length === 0) continue - - const rowText = textualCells.join(" ") - - // Check if adding this rowText would exceed the maximum text length - if (totalTextLength + rowText.length > MAX_ATTACHMENT_SHEET_TEXT_LEN) { - Logger.warn( - `Text length exceeded for spreadsheet, stopping at ${totalTextLength} characters`, - ) - // If we have some chunks, return them; otherwise return empty - return chunks.length > 0 ? chunks : [] - } - - totalTextLength += rowText.length - - if ((currentChunk + " " + rowText).trim().length > MAX_CHUNK_SIZE) { - if (currentChunk.trim().length > 0) { - chunks.push(currentChunk.trim()) - } - currentChunk = rowText - } else { - currentChunk += (currentChunk ? 
" " : "") + rowText - } - } - - if (currentChunk.trim().length > 0) { - chunks.push(currentChunk.trim()) - } - - return chunks -} +// const chunkSheetRows = (allRows: string[][]): string[] => { +// const chunks: string[] = [] +// let currentChunk = "" +// let totalTextLength = 0 +// const MAX_CHUNK_SIZE = 512 + +// for (const row of allRows) { +// // Filter out numerical cells and empty strings, join textual cells +// const textualCells = row +// .filter( +// (cell) => +// cell && isNaN(Number(cell)) && cell.toString().trim().length > 0, +// ) +// .map((cell) => cell.toString().trim()) + +// if (textualCells.length === 0) continue + +// const rowText = textualCells.join(" ") + +// // Check if adding this rowText would exceed the maximum text length +// if (totalTextLength + rowText.length > MAX_ATTACHMENT_SHEET_TEXT_LEN) { +// Logger.warn( +// `Text length exceeded for spreadsheet, stopping at ${totalTextLength} characters`, +// ) +// // If we have some chunks, return them; otherwise return empty +// return chunks.length > 0 ? chunks : [] +// } + +// totalTextLength += rowText.length + +// if ((currentChunk + " " + rowText).trim().length > MAX_CHUNK_SIZE) { +// if (currentChunk.trim().length > 0) { +// chunks.push(currentChunk.trim()) +// } +// currentChunk = rowText +// } else { +// currentChunk += (currentChunk ? " " : "") + rowText +// } +// } + +// if (currentChunk.trim().length > 0) { +// chunks.push(currentChunk.trim()) +// } + +// return chunks +// } diff --git a/server/integrations/microsoft/attachment-utils.ts b/server/integrations/microsoft/attachment-utils.ts index 228e8087d..fa5f82a49 100644 --- a/server/integrations/microsoft/attachment-utils.ts +++ b/server/integrations/microsoft/attachment-utils.ts @@ -11,14 +11,14 @@ import { MAX_ATTACHMENT_TEXT_SIZE, MAX_ATTACHMENT_DOCX_SIZE, MAX_ATTACHMENT_PPTX_SIZE, - MAX_ATTACHMENT_SHEET_ROWS, - MAX_ATTACHMENT_SHEET_TEXT_LEN, + MAX_ATTACHMENT_SHEET_SIZE, } from "@/integrations/google/config" import * as XLSX from "xlsx" import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini" import { makeGraphApiCall, type MicrosoftGraphClient } from "./client" +import { chunkSheetWithHeaders } from "@/sheetChunk" const Logger = getLogger(Subsystem.Integrations).child({ module: "microsoft-attachments", @@ -360,6 +360,13 @@ export const getOutlookSpreadsheetSheets = async ( mimeType === "application/vnd.ms-excel" || mimeType === "text/csv" ) { + const fileSizeMB = size / (1024 * 1024) + if (fileSizeMB > MAX_ATTACHMENT_SHEET_SIZE) { + Logger.error( + `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_SHEET_SIZE} MB`, + ) + return null + } const sheetsData = await processSpreadsheetFileWithSheetInfo( attachmentBuffer, filename, @@ -441,39 +448,40 @@ export const processSpreadsheetFileWithSheetInfo = async ( const worksheet = workbook.Sheets[sheetName] if (!worksheet) continue - // Get the range of the worksheet - const range = XLSX.utils.decode_range(worksheet["!ref"] || "A1") - const totalRows = range.e.r - range.s.r + 1 - - // Skip sheets with too many rows - if (totalRows > MAX_ATTACHMENT_SHEET_ROWS) { - Logger.warn( - `Sheet "${sheetName}" in ${filename} has ${totalRows} rows (max: ${MAX_ATTACHMENT_SHEET_ROWS}), skipping`, - ) - continue - } - - // Convert sheet to JSON array of arrays with a row limit - const sheetData: string[][] = XLSX.utils.sheet_to_json(worksheet, { - 
header: 1, - defval: "", - raw: false, - range: 0, // Start from first row - blankrows: false, - }) - - // Clean and get valid rows - const validRows = sheetData.filter((row) => - row.some((cell) => cell && cell.toString().trim().length > 0), - ) - - if (validRows.length === 0) { - Logger.debug(`Sheet "${sheetName}" has no valid content, skipping`) - continue - } - - // Chunk the rows for this specific sheet - const sheetChunks = chunkSheetRows(validRows) + // // Get the range of the worksheet + // const range = XLSX.utils.decode_range(worksheet["!ref"] || "A1") + // const totalRows = range.e.r - range.s.r + 1 + + // // Skip sheets with too many rows + // if (totalRows > MAX_ATTACHMENT_SHEET_ROWS) { + // Logger.warn( + // `Sheet "${sheetName}" in ${filename} has ${totalRows} rows (max: ${MAX_ATTACHMENT_SHEET_ROWS}), skipping`, + // ) + // continue + // } + + // // Convert sheet to JSON array of arrays with a row limit + // const sheetData: string[][] = XLSX.utils.sheet_to_json(worksheet, { + // header: 1, + // defval: "", + // raw: false, + // range: 0, // Start from first row + // blankrows: false, + // }) + + // // Clean and get valid rows + // const validRows = sheetData.filter((row) => + // row.some((cell) => cell && cell.toString().trim().length > 0), + // ) + + // if (validRows.length === 0) { + // Logger.debug(`Sheet "${sheetName}" has no valid content, skipping`) + // continue + // } + + // // Chunk the rows for this specific sheet + // const sheetChunks = chunkSheetRows(validRows) + const sheetChunks = chunkSheetWithHeaders(worksheet) const filteredSheetChunks = sheetChunks.filter( (chunk) => chunk.trim().length > 0, ) @@ -556,52 +564,52 @@ export const processSpreadsheetFileWithSheetInfo = async ( } // Function to chunk sheet rows (simplified version of chunkFinalRows) -const chunkSheetRows = (allRows: string[][]): string[] => { - const chunks: string[] = [] - let currentChunk = "" - let totalTextLength = 0 - const MAX_CHUNK_SIZE = 512 - - for (const row of allRows) { - // Filter out numerical cells and empty strings, join textual cells - const textualCells = row - .filter( - (cell) => - cell && isNaN(Number(cell)) && cell.toString().trim().length > 0, - ) - .map((cell) => cell.toString().trim()) - - if (textualCells.length === 0) continue - - const rowText = textualCells.join(" ") - - // Check if adding this rowText would exceed the maximum text length - if (totalTextLength + rowText.length > MAX_ATTACHMENT_SHEET_TEXT_LEN) { - Logger.warn( - `Text length exceeded for spreadsheet, stopping at ${totalTextLength} characters`, - ) - // If we have some chunks, return them; otherwise return empty - return chunks.length > 0 ? chunks : [] - } - - totalTextLength += rowText.length - - if ((currentChunk + " " + rowText).trim().length > MAX_CHUNK_SIZE) { - if (currentChunk.trim().length > 0) { - chunks.push(currentChunk.trim()) - } - currentChunk = rowText - } else { - currentChunk += (currentChunk ? 
" " : "") + rowText - } - } - - if (currentChunk.trim().length > 0) { - chunks.push(currentChunk.trim()) - } - - return chunks -} +// const chunkSheetRows = (allRows: string[][]): string[] => { +// const chunks: string[] = [] +// let currentChunk = "" +// let totalTextLength = 0 +// const MAX_CHUNK_SIZE = 512 + +// for (const row of allRows) { +// // Filter out numerical cells and empty strings, join textual cells +// const textualCells = row +// .filter( +// (cell) => +// cell && isNaN(Number(cell)) && cell.toString().trim().length > 0, +// ) +// .map((cell) => cell.toString().trim()) + +// if (textualCells.length === 0) continue + +// const rowText = textualCells.join(" ") + +// // Check if adding this rowText would exceed the maximum text length +// if (totalTextLength + rowText.length > MAX_ATTACHMENT_SHEET_TEXT_LEN) { +// Logger.warn( +// `Text length exceeded for spreadsheet, stopping at ${totalTextLength} characters`, +// ) +// // If we have some chunks, return them; otherwise return empty +// return chunks.length > 0 ? chunks : [] +// } + +// totalTextLength += rowText.length + +// if ((currentChunk + " " + rowText).trim().length > MAX_CHUNK_SIZE) { +// if (currentChunk.trim().length > 0) { +// chunks.push(currentChunk.trim()) +// } +// currentChunk = rowText +// } else { +// currentChunk += (currentChunk ? " " : "") + rowText +// } +// } + +// if (currentChunk.trim().length > 0) { +// chunks.push(currentChunk.trim()) +// } + +// return chunks +// } // Helper function to check if a file is a spreadsheet export const isSpreadsheetFile = (mimeType: string): boolean => { diff --git a/server/integrations/ribbie/index.ts b/server/integrations/ribbie/index.ts index 3cebc975b..855c677c9 100644 --- a/server/integrations/ribbie/index.ts +++ b/server/integrations/ribbie/index.ts @@ -435,6 +435,8 @@ class RIBBIECircularDownloader { chunks_pos: [], image_chunks: [], image_chunks_pos: [], + chunks_map: [], + image_chunks_map: [], metadata: JSON.stringify({ source: 'RIBBIE-automation', vespaDocId: vespaDocId }), createdBy: userEmail, duration: 0, @@ -486,7 +488,7 @@ class RIBBIECircularDownloader { // STEP 3: Process PDF into chunks Logger.info('⚙️ Extracting text and chunks from PDF...'); - const processingResult = await FileProcessorService.processFile( + const processingResults = await FileProcessorService.processFile( pdfBuffer, 'application/pdf', fileName, @@ -496,6 +498,12 @@ class RIBBIECircularDownloader { false // Don't describe images ); + // For PDFs, we expect only one result, but handle array for consistency + const processingResult = processingResults[0]; + if (!processingResult) { + throw new Error('No processing result returned for PDF'); + } + Logger.info(`✅ Extracted ${processingResult.chunks.length} text chunks and ${processingResult.image_chunks.length} image chunks`); // STEP 4: Create proper storage path (following your app's pattern) @@ -556,6 +564,8 @@ class RIBBIECircularDownloader { chunks_pos: processingResult.chunks_pos, image_chunks: processingResult.image_chunks || [], image_chunks_pos: processingResult.image_chunks_pos || [], + chunks_map: [], + image_chunks_map: [], metadata: JSON.stringify({ source: 'RIBBIE-automation', circularNumber: circular.id, @@ -745,4 +755,4 @@ if (import.meta.main) { } testCompleteFlow().catch(console.error); -} \ No newline at end of file +} diff --git a/server/lib/duckdb.ts b/server/lib/duckdb.ts new file mode 100644 index 000000000..79d81e062 --- /dev/null +++ b/server/lib/duckdb.ts @@ -0,0 +1,218 @@ +import { Database } from 
"duckdb-async"; +import { getLogger } from "@/logger"; +import { Subsystem } from "@/types"; +import type { DuckDBResult } from "@/types"; +import { analyzeQueryAndGenerateSQL } from "./sqlInference"; +import { writeFileSync, unlinkSync, createWriteStream } from "fs"; +import { join } from "path"; +import { tmpdir } from "os"; + +const Logger = getLogger(Subsystem.Integrations).child({ + module: "duckdb", +}); + +// Simple SQL validation function +function validateSQL(sql: string): void { + const disallowedKeywords = [ + 'INSTALL', 'LOAD', 'PRAGMA', 'COPY', 'EXPORT', 'ATTACH', 'DETACH', + 'CALL', 'CREATE', 'ALTER', 'DROP', 'INSERT', 'UPDATE', 'DELETE', + 'SET', 'RESET' + ]; + + const upperSQL = sql.toUpperCase(); + + for (const keyword of disallowedKeywords) { + if (upperSQL.includes(keyword)) { + throw new Error(`Disallowed SQL keyword detected: ${keyword}`); + } + } + + // Ensure it's a SELECT statement + if (!upperSQL.trim().startsWith('SELECT')) { + throw new Error('Only SELECT statements are allowed'); + } + + // Ensure there's a LIMIT clause + if (!upperSQL.includes('LIMIT')) { + throw new Error('LIMIT clause is required for all queries'); + } +} + +export const querySheetChunks = async ( + sheetChunks: string[], + userQuery: string, +): Promise => { + if (!sheetChunks.length) { + return null; + } + + Logger.debug("Processing sheet chunks with DuckDB"); + + // Clean HTML tags from sheet chunks + const cleanedSheetChunks = sheetChunks.map(chunk => + chunk.replace(/<\/?hi>/g, '') + ); + + // Create a temporary CSV file using streaming for large data + const tempFilePath = join(tmpdir(), `duckdb_temp_${Date.now()}.tsv`); + Logger.debug(`Writing ${cleanedSheetChunks.length} chunks to temporary file: ${tempFilePath}`); + + if (cleanedSheetChunks.length > 100) { + // Use streaming for large datasets + const ws = createWriteStream(tempFilePath, { encoding: "utf8" }); + for (const chunk of cleanedSheetChunks) { + ws.write(chunk); + ws.write('\n'); + } + await new Promise((resolve, reject) => { + ws.on('finish', resolve); + ws.on('error', reject); + ws.end(); + }); + Logger.debug("Large dataset written using streaming"); + } else { + // Use simple write for small datasets + const combinedData = cleanedSheetChunks.join('\n'); + writeFileSync(tempFilePath, combinedData); + Logger.debug("Small dataset written using simple write"); + } + + // Use on-disk DB and tune pragmas for large files + const db = await Database.create(join(tmpdir(), `xyne_${Date.now()}.duckdb`)); + const connection = await db.connect(); + + Logger.debug("Setting up DuckDB pragmas for large file processing"); + await connection.run(`PRAGMA temp_directory='${tmpdir()}'`); + await connection.run(`PRAGMA threads=${Math.max(1, Math.floor(require('os').cpus().length / 2))}`); + await connection.run(`PRAGMA memory_limit='4GB'`); + + const tableName = `v_${Date.now().toString(36)}`; + const startTime = Date.now(); + + try { + Logger.debug(`Creating VIEW ${tableName} over CSV file: ${tempFilePath}`); + + // 1) Create a VIEW over the CSV (no materialization) + const escapedPath = tempFilePath.replace(/'/g, "''"); + Logger.debug(`Escaped path: ${escapedPath}`); + + try { + await connection.run(` + CREATE OR REPLACE VIEW ${tableName} AS + SELECT * FROM read_csv( + '${escapedPath}', + delim='\t', + header=true, + quote='"', + escape='"', + null_padding=true, + ignore_errors=true, + strict_mode=false, + sample_size=100000 + ) + `); + Logger.debug(`VIEW ${tableName} created successfully`); + } catch (viewError) { + console.error(`Failed 
to create VIEW ${tableName}:`, viewError); + throw viewError; + } + + // 2) Get schema without loading all rows + Logger.debug(`Getting schema for ${tableName}`); + const schemaResult = await connection.all(`DESCRIBE ${tableName}`); + const schema = schemaResult + .map((col: any) => `${col.column_name}: ${col.column_type}`) + .join('\n'); + Logger.debug(`Schema obtained: ${schema}`); + + // 3) Get sample rows from the source (small scan only) + Logger.debug(`Getting sample rows from ${tableName}`); + const sampleRowsRes = await connection.all( + `SELECT * FROM ${tableName} LIMIT 5` + ); + Logger.debug(`Sample rows obtained: ${sampleRowsRes.length} rows`); + + // Build sample rows text for prompt + const sampleRowsHeader = schemaResult.map((c: any) => c.column_name).join('\t'); + const sampleRowsBody = sampleRowsRes + .map((r: any) => schemaResult.map((c: any) => String(r[c.column_name] ?? '')).join('\t')) + .join('\n'); + const sampleRows = `${sampleRowsHeader}\n${sampleRowsBody}`; + Logger.debug(`Sample rows text prepared: ${sampleRows.length} characters`); + + // 4) Generate SQL using the schema + samples + Logger.debug(`Generating SQL for query: ${userQuery}`); + const duckDBQuery = await analyzeQueryAndGenerateSQL( + userQuery, + tableName, + schema, + sampleRows, + ); + + if (!duckDBQuery) { + Logger.warn("Failed to generate DuckDB query, returning null"); + return null; + } + Logger.debug(`Generated SQL: ${duckDBQuery.sql}`); + + // 5) Validate and run + Logger.debug("Validating generated SQL"); + validateSQL(duckDBQuery.sql); + + Logger.debug(`Executing DuckDB query: ${duckDBQuery.sql}`); + const result = await connection.all(duckDBQuery.sql); + const elapsedMs = Date.now() - startTime; + Logger.debug(`Query executed successfully, returned ${result.length} rows in ${elapsedMs}ms`); + + if (result.length === 0) { + Logger.warn("DuckDB query returned no results, returning null"); + return null; + } + + const columns = Object.keys(result[0] ?? 
{}); + const rows = [columns, ...result.map((row: any) => Object.values(row))]; + + const resultPackage: DuckDBResult = { + user_question: userQuery, + sql: duckDBQuery.sql, + execution_meta: { + row_count: result.length, + elapsed_ms: elapsedMs, + as_of: new Date().toISOString(), + }, + schema_fragment: { + table: tableName, // it's a VIEW + columns: schemaResult.reduce((acc: Record, col: any) => { + acc[col.column_name] = col.column_type; + return acc; + }, {}), + }, + assumptions: [duckDBQuery.notes], + data: { rows }, + }; + + Logger.debug("DuckDB processing completed successfully"); + return resultPackage; + } catch (error) { + Logger.error("Error querying with DuckDB:", error); + return null; + } finally { + // Clean up + Logger.debug("Cleaning up DuckDB resources"); + try { + await connection.close(); + await db.close(); + Logger.debug("DuckDB connection and database closed"); + } catch (e) { + Logger.warn("Error closing DuckDB resources:", e); + } + + // Clean up temporary file + try { + unlinkSync(tempFilePath); + Logger.debug(`Temporary file deleted: ${tempFilePath}`); + } catch (e) { + Logger.warn(`Failed to delete temporary file ${tempFilePath}:`, e); + } + } +}; diff --git a/server/lib/sqlInference.ts b/server/lib/sqlInference.ts new file mode 100644 index 000000000..3ec74bf0e --- /dev/null +++ b/server/lib/sqlInference.ts @@ -0,0 +1,126 @@ +import { getLogger } from "@/logger" +import { Subsystem } from "@/types" +import type { DuckDBQuery } from "@/types"; +import { getProviderByModel } from "@/ai/provider" +import { Models } from "@/ai/types" +import { type Message } from "@aws-sdk/client-bedrock-runtime" + +const Logger = getLogger(Subsystem.Integrations).child({ + module: "sqlInference", +}) + +/** + * Combined function that classifies if a query is metric-related and generates SQL if it is + * @param query The user's query to analyze + * @param tableName The name of the table to query + * @param schema The schema of the table + * @param fewShotSamples Example rows for few-shot learning + * @returns DuckDBQuery if metric-related, null if not + */ +export const analyzeQueryAndGenerateSQL = async ( + query: string, + tableName: string, + schema: string, + fewShotSamples: string +): Promise => { + Logger.debug(`Analyzing query and generating SQL: ${query}`); + + const stripNoise = (s: string) => { + let t = s.trim(); + // remove all code fences + t = t.replace(/```(?:json)?/gi, "").replace(/```/g, ""); + // remove leading/trailing non-JSON text + const start = t.indexOf("{"); + const end = t.lastIndexOf("}"); + if (start !== -1 && end !== -1 && end > start) t = t.slice(start, end + 1); + return t.trim(); + }; + + const prompt = `You are a query analyzer and DuckDB SQL generator. + +First, determine if the user is asking for metrics/statistics/numerical data. +- Metric-related queries: count, counts, sums, averages, KPIs, financial figures, quantitative analysis +- Non-metric queries: descriptive information, definitions, qualitative info, names, categories, context, text-only attributes + +If the query is NOT metric-related, respond with: {"isMetric": false, "sql": null, "notes": "Query is not metric-related"} + +If the query IS metric-related, generate DuckDB SQL following this schema: +{ + "isMetric": true, + "sql": "SELECT ...", + "notes": "brief reasoning in 1-2 lines" +} + +Rules for SQL generation: +- Target database: DuckDB (SQL dialect = DuckDB) +- Use ONLY the provided schema and column names. Do NOT invent fields +- Output a SINGLE statement. 
No CTEs with CREATE/INSERT/UPDATE/DELETE. SELECT-only +- Disallow: INSTALL, LOAD, PRAGMA, COPY, EXPORT, ATTACH, DETACH, CALL, CREATE/ALTER/DROP, SET/RESET +- Output must be a single-line minified JSON object. Do NOT include markdown, code fences, comments, or any prose +- If ambiguous, choose the simplest interpretation and state the assumption in "notes" + +Context: +- User question: ${query} +- Available tables and columns with types and short descriptions: +table name: ${tableName} +schema: ${schema} +- Example rows (up to 5 per table; strings truncated): +${fewShotSamples}`; + + try { + const provider = getProviderByModel(Models.Vertex_Claude_Sonnet_4); + + const messages: Message[] = [ + { + role: "user", + content: [{ text: prompt }] + } + ] + + const modelParams = { + modelId: Models.Vertex_Claude_Sonnet_4, + temperature: 0.1, + max_new_tokens: 512, + stream: false, + systemPrompt: "You are a helpful assistant that analyzes queries and generates SQL when appropriate." + } + + const response = await provider.converse(messages, modelParams); + const responseText = response.text || ""; + + console.log("DuckDB query response:", responseText); + + const cleaned = stripNoise(responseText); + let parsedResponse: { isMetric: boolean; sql: string | null; notes: string }; + + try { + parsedResponse = JSON.parse(cleaned); + } catch (e) { + Logger.error("Failed to parse cleaned LLM response as JSON", { cleaned }); + throw e; + } + + if (!parsedResponse.isMetric) { + Logger.debug(`Query is not metric-related: ${parsedResponse.notes}`); + return null; + } + + if (!parsedResponse.sql) { + Logger.warn("LLM indicated metric query but provided no SQL"); + return null; + } + + const result: DuckDBQuery = { + sql: parsedResponse.sql, + notes: parsedResponse.notes + }; + + console.log("DuckDB query generated:", result); + return result; + } catch (error) { + Logger.error("Failed to analyze query and generate SQL:", error); + return null; + } +} + + diff --git a/server/package.json b/server/package.json index e7d44147a..8e8423e92 100644 --- a/server/package.json +++ b/server/package.json @@ -88,6 +88,7 @@ "cors": "^2.8.5", "drizzle-orm": "^0.44.5", "drizzle-zod": "^0.8.3", + "duckdb-async": "^1.3.2", "fast-xml-parser": "^5.2.5", "file-type": "^21.0.0", "google-auth-library": "^9.14.0", diff --git a/server/queue/fileProcessor.ts b/server/queue/fileProcessor.ts index 9dea88d8c..7d94f28f2 100644 --- a/server/queue/fileProcessor.ts +++ b/server/queue/fileProcessor.ts @@ -1,7 +1,7 @@ import { getLogger } from "@/logger" import { Subsystem, ProcessingJobType } from "@/types" import { getErrorMessage } from "@/utils" -import { FileProcessorService } from "@/services/fileProcessor" +import { FileProcessorService, type SheetProcessingResult } from "@/services/fileProcessor" import { insert } from "@/search/vespa" import { Apps, KbItemsSchema, KnowledgeBaseEntity } from "@xyne/vespa-ts/types" import { getBaseMimeType } from "@/integrations/dataSource/config" @@ -195,7 +195,7 @@ async function processFileJob(jobData: FileProcessingJob, startTime: number) { const fileBuffer = await readFile(file.storagePath) // Process file to extract content - const processingResult = await FileProcessorService.processFile( + const processingResults = await FileProcessorService.processFile( fileBuffer, file.mimeType || "application/octet-stream", file.fileName, @@ -203,60 +203,91 @@ async function processFileJob(jobData: FileProcessingJob, startTime: number) { file.storagePath, ) - // Create Vespa document with proper fileName 
(matching original logic) - const targetPath = file.path - - // Reconstruct the original filePath (full path from collection root) - const reconstructedFilePath = targetPath === "/" - ? file.fileName - : targetPath.substring(1) + file.fileName // Remove leading "/" and add filename - - const vespaFileName = - targetPath === "/" - ? file.collectionName + targetPath + reconstructedFilePath // Uses full path for root - : file.collectionName + targetPath + file.fileName // Uses filename for nested - - const vespaDoc = { - docId: file.vespaDocId, - clId: file.collectionId, - itemId: file.id, - fileName: vespaFileName, - app: Apps.KnowledgeBase as const, - entity: KnowledgeBaseEntity.File, - description: "", - storagePath: file.storagePath, - chunks: processingResult.chunks, - chunks_pos: processingResult.chunks_pos, - image_chunks: processingResult.image_chunks, - image_chunks_pos: processingResult.image_chunks_pos, - chunks_map: processingResult.chunks_map, - image_chunks_map: processingResult.image_chunks_map, - metadata: JSON.stringify({ - originalFileName: file.originalName || file.fileName, - uploadedBy: file.uploadedByEmail || "system", - chunksCount: processingResult.chunks.length, - imageChunksCount: processingResult.image_chunks.length, - processingMethod: getBaseMimeType(file.mimeType || "text/plain"), - lastModified: Date.now(), - }), - createdBy: file.uploadedByEmail || "system", - duration: 0, - mimeType: getBaseMimeType(file.mimeType || "text/plain"), - fileSize: file.fileSize || 0, - createdAt: Date.now(), - updatedAt: Date.now(), - clFd: file.parentId, + // Handle multiple processing results (e.g., for spreadsheets with multiple sheets) + let totalChunksCount = 0 + let newVespaDocId = "" + if(processingResults.length > 0 && 'totalSheets' in processingResults[0]) { + newVespaDocId = `${file.vespaDocId}_sheet_${(processingResults[0] as SheetProcessingResult).totalSheets}` + } else { + newVespaDocId = file.vespaDocId } + for (const [resultIndex, processingResult] of processingResults.entries()) { + // Create Vespa document with proper fileName (matching original logic) + const targetPath = file.path + + // Reconstruct the original filePath (full path from collection root) + const reconstructedFilePath = targetPath === "/" + ? file.fileName + : targetPath.substring(1) + file.fileName // Remove leading "/" and add filename + + let vespaFileName = + targetPath === "/" + ? file.collectionName + targetPath + reconstructedFilePath // Uses full path for root + : file.collectionName + targetPath + file.fileName // Uses filename for nested + + // For sheet processing results, append sheet information to fileName + let docId = file.vespaDocId + if ('sheetName' in processingResult) { + const sheetResult = processingResult as SheetProcessingResult + vespaFileName = processingResults.length > 1 + ? 
`${vespaFileName} / ${sheetResult.sheetName}` + : vespaFileName + docId = sheetResult.docId + } else if (processingResults.length > 1) { + // For non-sheet files with multiple results, append index + vespaFileName = `${vespaFileName} (${resultIndex + 1})` + docId = `${file.vespaDocId}_${resultIndex}` + } - // Insert into Vespa - await insert(vespaDoc, KbItemsSchema) + const vespaDoc = { + docId: docId, + clId: file.collectionId, + itemId: file.id, + fileName: vespaFileName, + app: Apps.KnowledgeBase as const, + entity: KnowledgeBaseEntity.File, + description: "", + storagePath: file.storagePath, + chunks: processingResult.chunks, + chunks_pos: processingResult.chunks_pos, + image_chunks: processingResult.image_chunks, + image_chunks_pos: processingResult.image_chunks_pos, + chunks_map: processingResult.chunks_map, + image_chunks_map: processingResult.image_chunks_map, + metadata: JSON.stringify({ + originalFileName: file.originalName || file.fileName, + uploadedBy: file.uploadedByEmail || "system", + chunksCount: processingResult.chunks.length, + imageChunksCount: processingResult.image_chunks.length, + processingMethod: getBaseMimeType(file.mimeType || "text/plain"), + lastModified: Date.now(), + ...(('sheetName' in processingResult) && { + sheetName: (processingResult as SheetProcessingResult).sheetName, + sheetIndex: (processingResult as SheetProcessingResult).sheetIndex, + totalSheets: (processingResult as SheetProcessingResult).totalSheets, + }), + }), + createdBy: file.uploadedByEmail || "system", + duration: 0, + mimeType: getBaseMimeType(file.mimeType || "text/plain"), + fileSize: file.fileSize || 0, + createdAt: Date.now(), + updatedAt: Date.now(), + clFd: file.parentId, + } + + // Insert into Vespa + await insert(vespaDoc, KbItemsSchema) + + totalChunksCount += processingResult.chunks.length + processingResult.image_chunks.length + } // Update status to completed - const chunksCount = - processingResult.chunks.length + processingResult.image_chunks.length + const chunksCount = totalChunksCount await db .update(collectionItems) .set({ + vespaDocId: newVespaDocId, uploadStatus: UploadStatus.COMPLETED, statusMessage: `Successfully processed: ${chunksCount} chunks extracted from ${file.fileName}`, processedAt: new Date(), diff --git a/server/services/fileProcessor.ts b/server/services/fileProcessor.ts index 04fc8dd59..e73a8c465 100644 --- a/server/services/fileProcessor.ts +++ b/server/services/fileProcessor.ts @@ -6,6 +6,7 @@ import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" import { chunkByOCRFromBuffer } from "@/lib/chunkByOCR" import { type ChunkMetadata } from "@/types" +import { chunkSheetWithHeaders } from "@/sheetChunk" import * as XLSX from "xlsx" import { getBaseMimeType, @@ -29,6 +30,13 @@ export interface ProcessingResult { image_chunks_map: ChunkMetadata[] } +export interface SheetProcessingResult extends ProcessingResult { + sheetName: string + sheetIndex: number + totalSheets: number + docId: string +} + export class FileProcessorService { static async processFile( @@ -39,7 +47,7 @@ export class FileProcessorService { storagePath?: string, extractImages: boolean = false, describeImages: boolean = false, - ): Promise { + ): Promise<(ProcessingResult | SheetProcessingResult)[]> { const baseMimeType = getBaseMimeType(mimeType || "text/plain") let chunks: string[] = [] let chunks_pos: number[] = [] @@ -50,10 +58,7 @@ export class FileProcessorService { if (baseMimeType === 
"application/pdf") { // Redirect PDF processing to OCR const result = await chunkByOCRFromBuffer(buffer, fileName, vespaDocId) - - - - return result + return [result] } else if (isDocxFile(baseMimeType)) { // Process DOCX const result = await extractTextAndImagesWithChunksFromDocx( @@ -86,40 +91,51 @@ export class FileProcessorService { } else { workbook = XLSX.readFile(storagePath) } - const allChunks: string[] = [] - for (const sheetName of workbook.SheetNames) { + if (!workbook.SheetNames || workbook.SheetNames.length === 0) { + throw new Error("No worksheets found in spreadsheet") + } + + const sheetResults: SheetProcessingResult[] = [] + + for (const [sheetIndex, sheetName] of workbook.SheetNames.entries()) { const worksheet = workbook.Sheets[sheetName] if (!worksheet) continue - const sheetData: string[][] = XLSX.utils.sheet_to_json(worksheet, { - header: 1, - defval: "", - raw: false, - }) - - const validRows = sheetData.filter((row) => - row.some((cell) => cell && cell.toString().trim().length > 0), + // Use the same header-preserving chunking function as dataSource integration + const sheetChunks = chunkSheetWithHeaders(worksheet) + + const filteredChunks = sheetChunks.filter( + (chunk) => chunk.trim().length > 0, ) - for (const row of validRows) { - const textualCells = row - .filter( - (cell) => - cell && - isNaN(Number(cell)) && - cell.toString().trim().length > 0, - ) - .map((cell) => cell.toString().trim()) - - if (textualCells.length > 0) { - allChunks.push(textualCells.join(" ")) - } + // Skip sheets with no valid content + if (filteredChunks.length === 0) continue + + // Generate a unique docId for each sheet + const sheetDocId = `${vespaDocId}_sheet_${sheetIndex}` + + const sheetResult: SheetProcessingResult = { + chunks: filteredChunks, + chunks_pos: filteredChunks.map((_, idx) => idx), + image_chunks: [], + image_chunks_pos: [], + chunks_map: [], + image_chunks_map: [], + sheetName, + sheetIndex, + totalSheets: workbook.SheetNames.length, + docId: sheetDocId, } + + sheetResults.push(sheetResult) } - chunks = allChunks - chunks_pos = allChunks.map((_, idx) => idx) + if (sheetResults.length === 0) { + throw new Error("No valid content found in any worksheet") + } + + return sheetResults } else if (isTextFile(baseMimeType)) { // Process text file const content = buffer.toString("utf-8") @@ -165,13 +181,13 @@ export class FileProcessorService { block_labels: ["image"], // Default block label })); - return { + return [{ chunks, chunks_pos, image_chunks, image_chunks_pos, chunks_map, image_chunks_map, - } + }] } } diff --git a/server/sheetChunk.ts b/server/sheetChunk.ts new file mode 100644 index 000000000..1624a9624 --- /dev/null +++ b/server/sheetChunk.ts @@ -0,0 +1,465 @@ +import * as XLSX from "xlsx" + +// Type checking utilities for spreadsheet data +function isTimestamp(value: any): boolean { + if (typeof value === 'string') { + // Check for ISO timestamp format (YYYY-MM-DDTHH:mm:ss.sssZ) + const timestampRegex = /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{3})?(Z|[+-]\d{2}:\d{2})?$/ + if (timestampRegex.test(value)) { + return !isNaN(Date.parse(value)) + } + + // Check for Unix timestamp (seconds or milliseconds since epoch) + const numValue = Number(value) + if (!isNaN(numValue)) { + // Unix timestamp should be reasonable (between 1970 and 2100) + const minTimestamp = 0 // 1970-01-01 + const maxTimestamp = 4102444800000 // 2100-01-01 in milliseconds + return numValue >= minTimestamp && numValue <= maxTimestamp + } + } + + // Check if it's a Date object + if (value 
instanceof Date) { + return !isNaN(value.getTime()) + } + + return false +} + +function isDate(value: any): boolean { + if (typeof value === 'string') { + // Check for date-only format (YYYY-MM-DD, MM/DD/YYYY, DD/MM/YYYY, etc.) + const dateRegex = /^\d{4}-\d{2}-\d{2}$|^\d{1,2}\/\d{1,2}\/\d{4}$|^\d{1,2}-\d{1,2}-\d{4}$/ + if (dateRegex.test(value)) { + return !isNaN(Date.parse(value)) + } + } + + if (value instanceof Date) { + return !isNaN(value.getTime()) + } + + return false +} + +function isTime(value: any): boolean { + if (typeof value === 'string') { + // Check for time-only format (HH:mm:ss, HH:mm, etc.) + const timeRegex = /^([01]?[0-9]|2[0-3]):[0-5][0-9](:[0-5][0-9])?$/ + return timeRegex.test(value) + } + + return false +} + +function isBoolean(value: any): boolean { + if (typeof value === 'boolean') { + return true + } + + if (typeof value === 'string') { + const lowerValue = value.toLowerCase().trim() + return ['true', 'false', 'yes', 'no', 'y', 'n'].includes(lowerValue) + } + + return false +} + +interface ChunkConfig { + maxChunkSize?: number + maxRowsPerChunk?: number + headerRows?: number +} + +interface ChunkingState { + headerRow: string + maxRowsPerChunk: number + maxChunkSize: number + columnCount: number +} + +interface ProcessedSheetData { + headerRow: string[] + dataRows: string[][] +} + +// XLSX Processing Functions + +function unmerge(sheet: XLSX.WorkSheet): void { + (sheet['!merges'] ?? []).forEach((rng) => { + const v = sheet[XLSX.utils.encode_cell({ r: rng.s.r, c: rng.s.c })]?.v + for (let R = rng.s.r; R <= rng.e.r; R++) { + for (let C = rng.s.c; C <= rng.e.c; C++) { + sheet[XLSX.utils.encode_cell({ r: R, c: C })] = { t: "s", v } + } + } + }) +} + +function buildHeaders(rows: any[][], headerRows = 1): { header: string[], dataRows: any[][] } { + if (rows.length === 0) { + return { header: [], dataRows: [] } + } + + const header = rows.slice(0, headerRows) + .reduce((acc, row) => + acc.map((prev, i) => `${prev}_${(row[i] ?? 
"").toString().trim()}`), + new Array(rows[0].length).fill("") + ) + .map(h => h.replace(/_{2,}/g, "_").replace(/^_+|_+$/g, "")) + + return { + header, + dataRows: rows.slice(headerRows) + } +} + +function guessHeaderRowsByDataTypes(rows: any[][], maxSearchRows = 3): number { + const isHeterogeneousRow = (row: any[]) => { + const types = row + .filter(cell => cell !== null && cell !== undefined && cell.toString().trim() !== '') + .map(cell => { + if (typeof cell === 'number' || !isNaN(Number(cell))) + return 'number' + if (isDate(cell)) + return 'date' + if (isTimestamp(cell)) + return 'timestamp' + if (isTime(cell)) + return 'time' + if (isBoolean(cell)) + return 'boolean' + return 'string' + }) + + const uniqueTypes = new Set(types) + return uniqueTypes.size >= 2 // Consider it heterogeneous if at least 2 types + } + + for (let i = 0; i < Math.min(maxSearchRows, rows.length); i++) { + if (isHeterogeneousRow(rows[i])) { + return i // rows before this are likely headers + } + } + + return 1 +} + + +function guessHeaderRowsByKeywords(rows: any[][], maxSearchRows = 3): number { + const headerKeywords = ['name', 'id', 'date', 'type', 'category', 'description', 'amount', 'total', 'value', 'region', 'country', 'state', 'city', 'zip', 'address', 'phone', 'email', 'website', 'url', 'link', 'title', 'subtitle', 'summary', 'description', 'notes', 'comments', 'remarks', 'details', 'information', 'data', 'statistics', 'metrics', 'measures'] + const lowerKeywords = headerKeywords.map(k => k.toLowerCase()) + + for (let i = 0; i < Math.min(maxSearchRows, rows.length); i++) { + const row = rows[i] + if (!row) continue + + const rowText = row.map(cell => (cell ?? '').toString().toLowerCase()) + + // Count how many cells contain header keywords + const keywordMatches = rowText.filter(cell => + lowerKeywords.some(kw => cell.includes(kw)) + ).length + + // Only consider it a header row if MOST cells contain keywords (not just one) + const totalCells = rowText.filter(cell => cell.trim().length > 0).length + if (totalCells > 0 && keywordMatches >= Math.ceil(totalCells * 0.6)) { + return i + 1 + } + } + return 1 +} + +function inferHeaderRows(input: XLSX.WorkSheet, rows: any[][], isDummyHeader = false): number { + let mergedHeaderRows = 1 + + // Check actual merged cells in XLSX + const merges = input['!merges'] ?? [] + let maxHeaderMergeRow = -1 + + merges.forEach(rng => { + // Only consider merges that START in the header area + if (rng.s.r < 4 && rng.s.r > maxHeaderMergeRow) { + maxHeaderMergeRow = rng.s.r + } + }) + mergedHeaderRows = maxHeaderMergeRow >= 0 ? maxHeaderMergeRow + 2 : 1 + mergedHeaderRows += isDummyHeader ? 1 : 0 + + if (rows.length === 0) return 1 + + const MAX_HEADER_ROWS = isDummyHeader ? 
4 : 3 + + // Heuristic 2: Analyze data type patterns + const dataTypeHeaderRows = guessHeaderRowsByDataTypes(rows, MAX_HEADER_ROWS) + + // Heuristic 3: Look for header keywords + const keywordHeaderRows = guessHeaderRowsByKeywords(rows, MAX_HEADER_ROWS) + + // Choose the maximum of these heuristics, but cap at reasonable limit + const inferredRows = Math.max(mergedHeaderRows, dataTypeHeaderRows, keywordHeaderRows, 1) + return Math.min(inferredRows, MAX_HEADER_ROWS) +} + +function processSheetData(input: XLSX.WorkSheet, headerRowsParam?: number): ProcessedSheetData { + let rows: any[][] = [] + try { + // Use sheet_to_json with proper options to preserve empty cells and formatting + rows = XLSX.utils.sheet_to_json(input, { + header: 1, // Generate array of arrays + raw: false, // Use formatted strings (not raw values) + defval: "", // Use empty string for null/undefined values + }) + } catch (error) { + console.error("Error converting sheet to JSON:", error) + return { headerRow: [], dataRows: [] } + } + + let headerRows = headerRowsParam ?? inferHeaderRows(input, rows) + + if (rows.length === 0) { + return { headerRow: [], dataRows: [] } + } + + const isHeaderValid = rows.slice(0, headerRows).every(row => isHeaderRowValid(row)) + if (!isHeaderValid) { + const maxColumns = Math.max(...rows.map(row => row.length)) + const header = Array.from({ length: maxColumns }, (_, i) => `C${i + 1}`) + rows = [header, ...rows] + headerRows = inferHeaderRows(input, rows, true) + } + + // Build composite headers and extract data normally + const result = buildHeaders(rows, headerRows) + const header = result.header + const dataRows = result.dataRows + + // Filter out completely empty rows BEFORE adding row IDs + const validDataRows = dataRows.filter(isRowValid) + + // Add row_id as first column and normalize data + const fullHeader = ["row_id", ...header] + const rowsWithId = validDataRows.map((row, index) => [ + (index + 1).toString(), + ...row.map(cell => (cell ?? 
"").toString()) + ]) + + // Clear references to help garbage collection + rows = [] + + return { + headerRow: fullHeader, + dataRows: rowsWithId + } +} + +// Helper Functions + +/** + * Calculates byte length of a string using UTF-8 encoding + */ +const getByteLength = (str: string): number => Buffer.byteLength(str, "utf8") + +/** + * Cleans illegal UTF-8 characters and normalizes line endings + */ +const cleanText = (str: string): string => { + const normalized = str.replace(/\r\n|\r/g, "\n") + return normalized.replace( + /[\u0000-\u0008\u000B-\u000C\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF]/g, + "", + ) +} + +/** + * Normalizes a row to ensure consistent column count and clean data + */ +function normalizeRow(row: string[], columnCount: number): string { + const normalizedCells: string[] = [] + + for (let i = 0; i < columnCount; i++) { + const cell = row[i] + if (cell === undefined || cell === null) { + normalizedCells.push("") + } else { + const cellStr = cell.toString() + const cleanedCell = cleanText(cellStr) + normalizedCells.push(cleanedCell) + } + } + + return normalizedCells.join("\t") +} + +/** + * Validates if a row contains meaningful content + */ +function isRowValid(row: string[]): boolean { + if (!Array.isArray(row) || row.length === 0) return false + + return row.some(cell => { + if (cell === undefined || cell === null || cell === "") return false + const cellStr = cell.toString().trim() + return cellStr.length > 0 + }) +} + +/** + * Validates if a header row has all cells filled (no empty, undefined, or null cells) + */ +function isHeaderRowValid(row: any[]): boolean { + if (!Array.isArray(row) || row.length === 0) return false + + return row.every(cell => { + if (cell === undefined || cell === null) return false + const cellStr = cell.toString().trim() + return cellStr.length > 0 + }) +} + +/** + * Truncates string to specified byte length while preserving character boundaries + */ +function truncateToByteLength(str: string, limit: number): string { + let bytes = 0 + let result = '' + + for (const char of str) { + const charBytes = getByteLength(char) + if (bytes + charBytes > limit) break + result += char + bytes += charBytes + } + + return result +} + +/** + * Creates chunks from data rows with size and row limits + */ +function createChunks(dataRows: string[][], state: ChunkingState): string[] { + const chunks: string[] = [] + let currentBatch: string[] = [] + + for (const row of dataRows) { + const normalizedRow = normalizeRow(row, state.columnCount) + + const potentialChunk = createChunkFromBatch( + [...currentBatch, normalizedRow], + state.headerRow + ) + + const wouldExceedRowLimit = currentBatch.length >= state.maxRowsPerChunk + const wouldExceedSizeLimit = getByteLength(potentialChunk) > state.maxChunkSize + + if ((wouldExceedRowLimit || wouldExceedSizeLimit) && currentBatch.length > 0) { + chunks.push(createChunkFromBatch(currentBatch, state.headerRow)) + + // Handle rows that exceed size limit + if (getByteLength(normalizedRow) > state.maxChunkSize) { + const truncatedRow = truncateToByteLength( + normalizedRow, + state.maxChunkSize - getByteLength(state.headerRow) - 1 + ) + chunks.push(createChunkFromBatch([truncatedRow], state.headerRow)) + currentBatch = [] + } else { + currentBatch = [normalizedRow] + } + } else { + currentBatch.push(normalizedRow) + } + } + + if (currentBatch.length > 0) { + chunks.push(createChunkFromBatch(currentBatch, state.headerRow)) + } + + return chunks +} + +/** + * Creates a single chunk from batch of rows and header + 
*/ +function createChunkFromBatch(batch: string[], headerRow: string): string { + if (batch.length === 0) return headerRow + return [headerRow, ...batch].join("\n") +} + +function normalizeToWorksheet(input: string[][] | XLSX.WorkSheet): XLSX.WorkSheet { + if (Array.isArray(input)) { + return XLSX.utils.aoa_to_sheet(input) + } + return input +} + +// Main Export Functions + +/** + * Chunks spreadsheet data with intelligent header preservation + * Applies smart processing to both XLSX WorkSheet objects and string[][] arrays + * - Smart header detection with multiple heuristics + * - Multi-row header flattening + * - Row ID addition for traceability + * - Merged cell handling (XLSX only) + * - Adaptive chunking for wide spreadsheets + */ +export function chunkSheetWithHeaders( + input: string[][] | XLSX.WorkSheet, + config?: ChunkConfig, +): string[] { + let worksheet: XLSX.WorkSheet | null = null + let processedData: ProcessedSheetData | null = null + + try { + // Process input with unified smart logic + worksheet = normalizeToWorksheet(input) + unmerge(worksheet) + processedData = processSheetData(worksheet, config?.headerRows) + const { headerRow, dataRows } = processedData + + if (headerRow.length === 0) { + return [] + } + + // Configuration with sensible defaults + const maxChunkSize = config?.maxChunkSize ?? 1024 + const maxRowsPerChunk = config?.maxRowsPerChunk ?? 10 + + const columnCount = headerRow.length + + // Adaptive chunking for wide spreadsheets + const adaptiveMaxRowsPerChunk = columnCount > 15 + ? Math.max(3, Math.floor(maxRowsPerChunk * 0.6)) + : maxRowsPerChunk + + const state: ChunkingState = { + headerRow: normalizeRow(headerRow, columnCount), + maxRowsPerChunk: adaptiveMaxRowsPerChunk, + maxChunkSize, + columnCount, + } + + if (dataRows.length === 0) { + return [state.headerRow] + } + + const chunks = createChunks(dataRows, state) + + // Clear references to help garbage collection + processedData = null + + return chunks + } finally { + // Clean up worksheet reference if it was created from array + if (Array.isArray(input) && worksheet) { + // Clear the worksheet to help garbage collection + const keys = Object.keys(worksheet) + for (const key of keys) { + delete worksheet[key] + } + worksheet = null + } + } +} diff --git a/server/types.ts b/server/types.ts index b87652b40..a0581fef1 100644 --- a/server/types.ts +++ b/server/types.ts @@ -610,3 +610,28 @@ export type ChunkMetadata = { page_number: number; block_labels: string[]; }; + +// DuckDB related types +export interface DuckDBQuery { + sql: string + notes: string +} + +export interface DuckDBResult { + user_question: string + resolved_metric?: string + sql: string + execution_meta: { + row_count: number + elapsed_ms: number + as_of: string + } + schema_fragment?: { + table: string + columns: Record + } + assumptions: string[] + data: { + rows: any[][] + } +} From 6614da0559fdda05bb884bdb781e9c630b753e8c Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Mon, 6 Oct 2025 15:44:15 +0530 Subject: [PATCH 2/7] fix: resolved ai comments --- server/api/chat/chat.ts | 13 +++--- server/api/files.ts | 4 +- server/api/knowledgeBase.ts | 13 +++--- server/integrations/google/index.ts | 2 +- server/lib/duckdb.ts | 70 +++++++++++------------------ server/lib/sqlInference.ts | 3 -- server/package.json | 2 +- server/sheetChunk.ts | 2 +- server/types.ts | 2 +- 9 files changed, 49 insertions(+), 62 deletions(-) diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index fde97d412..a98c79461 100644 --- 
a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -874,12 +874,15 @@ export const ChatDeleteApi = async (c: Context) => { try { // Delete from Vespa kb_items schema using the proper Vespa function const vespaIds = expandSheetIds(fileId) - vespaIds.forEach(async (id) => { - await DeleteDocument(id, KbItemsSchema) - loggerWithChild({ email: email }).info( - `Successfully deleted non-image attachment ${id} from Vespa kb_items schema`, + await Promise.all( + vespaIds.map(id => + DeleteDocument(id, KbItemsSchema).then(() => + loggerWithChild({ email }).info( + `Successfully deleted non-image attachment ${id} from Vespa kb_items schema`, + ) + ) ) - }) + ) } catch (error) { const errorMessage = getErrorMessage(error) if (errorMessage.includes("404 Not Found")) { diff --git a/server/api/files.ts b/server/api/files.ts index f1932f903..3e44e8624 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -218,11 +218,11 @@ export const handleAttachmentUpload = async (c: Context) => { } const attachmentMetadata: AttachmentMetadata[] = [] - let vespaId : string = "" - + for (const file of files) { const fileBuffer = await file.arrayBuffer() const fileId = `att_${crypto.randomUUID()}` + let vespaId = fileId const ext = file.name.split(".").pop()?.toLowerCase() || "" const fullFileName = `${0}.${ext}` const isImage = isImageFile(file.type) diff --git a/server/api/knowledgeBase.ts b/server/api/knowledgeBase.ts index 3fa7a1622..fb10df2ff 100644 --- a/server/api/knowledgeBase.ts +++ b/server/api/knowledgeBase.ts @@ -1585,12 +1585,15 @@ export const DeleteItemApi = async (c: Context) => { // Delete from Vespa if (itemToDelete.vespaDocId) { const vespaDocIds = expandSheetIds(itemToDelete.vespaDocId) - vespaDocIds.forEach(async (id) => { - await DeleteDocument(id, KbItemsSchema) - loggerWithChild({ email: userEmail }).info( - `Deleted file from Vespa: ${id}`, + await Promise.all( + vespaDocIds.map(id => + DeleteDocument(id, KbItemsSchema).then(() => + loggerWithChild({ email: userEmail }).info( + `Deleted file from Vespa: ${id}`, + ) + ) ) - }) + ) } } catch (error) { loggerWithChild({ email: userEmail }).warn( diff --git a/server/integrations/google/index.ts b/server/integrations/google/index.ts index 373f3cbc0..0af5133d7 100644 --- a/server/integrations/google/index.ts +++ b/server/integrations/google/index.ts @@ -2978,7 +2978,7 @@ export async function* listFiles( let nextPageToken = "" // Build the query with date filters if provided - let query = `${q} and trashed = false` + let query = q ? 
`${q} and trashed = false` : "trashed = false" const dateFilters: string[] = [] if (startDate) { diff --git a/server/lib/duckdb.ts b/server/lib/duckdb.ts index 79d81e062..2cdf3b6c6 100644 --- a/server/lib/duckdb.ts +++ b/server/lib/duckdb.ts @@ -3,41 +3,14 @@ import { getLogger } from "@/logger"; import { Subsystem } from "@/types"; import type { DuckDBResult } from "@/types"; import { analyzeQueryAndGenerateSQL } from "./sqlInference"; -import { writeFileSync, unlinkSync, createWriteStream } from "fs"; +import { writeFileSync, unlinkSync, createWriteStream, promises as fs } from "fs"; import { join } from "path"; -import { tmpdir } from "os"; +import { tmpdir, cpus } from "os"; const Logger = getLogger(Subsystem.Integrations).child({ module: "duckdb", }); -// Simple SQL validation function -function validateSQL(sql: string): void { - const disallowedKeywords = [ - 'INSTALL', 'LOAD', 'PRAGMA', 'COPY', 'EXPORT', 'ATTACH', 'DETACH', - 'CALL', 'CREATE', 'ALTER', 'DROP', 'INSERT', 'UPDATE', 'DELETE', - 'SET', 'RESET' - ]; - - const upperSQL = sql.toUpperCase(); - - for (const keyword of disallowedKeywords) { - if (upperSQL.includes(keyword)) { - throw new Error(`Disallowed SQL keyword detected: ${keyword}`); - } - } - - // Ensure it's a SELECT statement - if (!upperSQL.trim().startsWith('SELECT')) { - throw new Error('Only SELECT statements are allowed'); - } - - // Ensure there's a LIMIT clause - if (!upperSQL.includes('LIMIT')) { - throw new Error('LIMIT clause is required for all queries'); - } -} - export const querySheetChunks = async ( sheetChunks: string[], userQuery: string, @@ -54,7 +27,8 @@ export const querySheetChunks = async ( ); // Create a temporary CSV file using streaming for large data - const tempFilePath = join(tmpdir(), `duckdb_temp_${Date.now()}.tsv`); + const tmpDir = tmpdir().replace(/'/g, "''"); + const tempFilePath = join(tmpDir, `duckdb_temp_${Date.now()}.tsv`); Logger.debug(`Writing ${cleanedSheetChunks.length} chunks to temporary file: ${tempFilePath}`); if (cleanedSheetChunks.length > 100) { @@ -78,12 +52,13 @@ export const querySheetChunks = async ( } // Use on-disk DB and tune pragmas for large files - const db = await Database.create(join(tmpdir(), `xyne_${Date.now()}.duckdb`)); + const dbPath = join(tmpDir, `xyne_${Date.now()}.duckdb`); + const db = await Database.create(dbPath); const connection = await db.connect(); Logger.debug("Setting up DuckDB pragmas for large file processing"); - await connection.run(`PRAGMA temp_directory='${tmpdir()}'`); - await connection.run(`PRAGMA threads=${Math.max(1, Math.floor(require('os').cpus().length / 2))}`); + await connection.run(`PRAGMA temp_directory='${tmpDir}'`); + await connection.run(`PRAGMA threads=${Math.max(1, Math.floor(cpus().length / 2))}`); await connection.run(`PRAGMA memory_limit='4GB'`); const tableName = `v_${Date.now().toString(36)}`; @@ -113,7 +88,7 @@ export const querySheetChunks = async ( `); Logger.debug(`VIEW ${tableName} created successfully`); } catch (viewError) { - console.error(`Failed to create VIEW ${tableName}:`, viewError); + Logger.error(`Failed to create VIEW ${tableName}:`, viewError); throw viewError; } @@ -154,10 +129,6 @@ export const querySheetChunks = async ( return null; } Logger.debug(`Generated SQL: ${duckDBQuery.sql}`); - - // 5) Validate and run - Logger.debug("Validating generated SQL"); - validateSQL(duckDBQuery.sql); Logger.debug(`Executing DuckDB query: ${duckDBQuery.sql}`); const result = await connection.all(duckDBQuery.sql); @@ -200,19 +171,32 @@ export const 
querySheetChunks = async ( // Clean up Logger.debug("Cleaning up DuckDB resources"); try { - await connection.close(); - await db.close(); + if (connection) await connection.close(); + if (db) await db.close(); Logger.debug("DuckDB connection and database closed"); } catch (e) { Logger.warn("Error closing DuckDB resources:", e); } - // Clean up temporary file + // Clean up temporary TSV file try { unlinkSync(tempFilePath); - Logger.debug(`Temporary file deleted: ${tempFilePath}`); + Logger.debug(`Temporary TSV file deleted: ${tempFilePath}`); + } catch (e) { + Logger.warn(`Failed to delete temporary TSV file ${tempFilePath}:`, e); + } + + // Clean up temporary DuckDB file + try { + await fs.stat(dbPath); + await fs.unlink(dbPath); + Logger.debug(`Temporary DuckDB file deleted: ${dbPath}`); } catch (e) { - Logger.warn(`Failed to delete temporary file ${tempFilePath}:`, e); + if (e instanceof Error && 'code' in e && e.code === 'ENOENT') { + Logger.debug(`Temporary DuckDB file already removed: ${dbPath}`); + } else { + Logger.warn(`Failed to delete temporary DuckDB file ${dbPath}:`, e); + } } } }; diff --git a/server/lib/sqlInference.ts b/server/lib/sqlInference.ts index 3ec74bf0e..6ee428698 100644 --- a/server/lib/sqlInference.ts +++ b/server/lib/sqlInference.ts @@ -87,8 +87,6 @@ ${fewShotSamples}`; const response = await provider.converse(messages, modelParams); const responseText = response.text || ""; - - console.log("DuckDB query response:", responseText); const cleaned = stripNoise(responseText); let parsedResponse: { isMetric: boolean; sql: string | null; notes: string }; @@ -115,7 +113,6 @@ ${fewShotSamples}`; notes: parsedResponse.notes }; - console.log("DuckDB query generated:", result); return result; } catch (error) { Logger.error("Failed to analyze query and generate SQL:", error); diff --git a/server/package.json b/server/package.json index 8e8423e92..70e96427a 100644 --- a/server/package.json +++ b/server/package.json @@ -88,7 +88,7 @@ "cors": "^2.8.5", "drizzle-orm": "^0.44.5", "drizzle-zod": "^0.8.3", - "duckdb-async": "^1.3.2", + "duckdb-async": "1.4.0", "fast-xml-parser": "^5.2.5", "file-type": "^21.0.0", "google-auth-library": "^9.14.0", diff --git a/server/sheetChunk.ts b/server/sheetChunk.ts index 1624a9624..9646e4b0e 100644 --- a/server/sheetChunk.ts +++ b/server/sheetChunk.ts @@ -13,7 +13,7 @@ function isTimestamp(value: any): boolean { const numValue = Number(value) if (!isNaN(numValue)) { // Unix timestamp should be reasonable (between 1970 and 2100) - const minTimestamp = 0 // 1970-01-01 + const minTimestamp = 946684800000 // 2000-01-01 in milliseconds const maxTimestamp = 4102444800000 // 2100-01-01 in milliseconds return numValue >= minTimestamp && numValue <= maxTimestamp } diff --git a/server/types.ts b/server/types.ts index a0581fef1..846b53c57 100644 --- a/server/types.ts +++ b/server/types.ts @@ -632,6 +632,6 @@ export interface DuckDBResult { } assumptions: string[] data: { - rows: any[][] + rows: unknown[][] } } From 2911321b7c08b0b6095fcaad2d9e7ec45302c428 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Mon, 6 Oct 2025 16:07:37 +0530 Subject: [PATCH 3/7] fix: resolved ai comments 1 --- server/ai/context.ts | 3 +- server/api/chat/chat.ts | 20 +++-- server/api/files.ts | 2 - server/api/knowledgeBase.ts | 20 +++-- server/integrations/dataSource/config.ts | 2 +- server/integrations/dataSource/index.ts | 20 ----- server/integrations/google/config.ts | 4 +- server/integrations/google/index.ts | 70 ++------------- server/integrations/google/worker-utils.ts | 
85 ------------------- .../microsoft/attachment-utils.ts | 81 ------------------ server/sheetChunk.ts | 2 +- 11 files changed, 37 insertions(+), 272 deletions(-) diff --git a/server/ai/context.ts b/server/ai/context.ts index dd2fe3013..2f898071d 100644 --- a/server/ai/context.ts +++ b/server/ai/context.ts @@ -76,7 +76,8 @@ const extractHeaderAndDataChunks = ( // Update matchfeatures to include the header chunk score if (newMatchfeatures) { const existingCells = newMatchfeatures.chunk_scores?.cells || {}; - const maxScore = Object.values(existingCells).length > 0 ? Math.max(...Object.values(existingCells as number[])) : 0; + const scores = Object.values(existingCells) as number[]; + const maxScore = scores.length > 0 ? Math.max(...scores) : 0; // Create new chunk_scores that match the new chunks const newChunkScores: Record = {} newChunkScores["0"] = maxScore + 1 diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index a98c79461..b893ae462 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -874,15 +874,19 @@ export const ChatDeleteApi = async (c: Context) => { try { // Delete from Vespa kb_items schema using the proper Vespa function const vespaIds = expandSheetIds(fileId) - await Promise.all( - vespaIds.map(id => - DeleteDocument(id, KbItemsSchema).then(() => - loggerWithChild({ email }).info( - `Successfully deleted non-image attachment ${id} from Vespa kb_items schema`, - ) + for (const id of vespaIds) { + try { + await DeleteDocument(id, KbItemsSchema) + loggerWithChild({ email }).info( + `Successfully deleted non-image attachment ${id} from Vespa kb_items schema`, ) - ) - ) + } catch (error) { + loggerWithChild({ email }).error( + `Failed to delete non-image attachment ${id} from Vespa kb_items schema`, + { error: getErrorMessage(error) } + ) + } + } } catch (error) { const errorMessage = getErrorMessage(error) if (errorMessage.includes("404 Not Found")) { diff --git a/server/api/files.ts b/server/api/files.ts index 3e44e8624..fd13effdb 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -260,8 +260,6 @@ export const handleAttachmentUpload = async (c: Context) => { if(processingResults.length > 0 && 'totalSheets' in processingResults[0]) { vespaId = `${fileId}_sheet_${(processingResults[0] as SheetProcessingResult).totalSheets}` - } else { - vespaId = fileId } // Handle multiple processing results (e.g., for spreadsheets with multiple sheets) for (const [resultIndex, processingResult] of processingResults.entries()) { diff --git a/server/api/knowledgeBase.ts b/server/api/knowledgeBase.ts index fb10df2ff..e3884fd28 100644 --- a/server/api/knowledgeBase.ts +++ b/server/api/knowledgeBase.ts @@ -1585,15 +1585,19 @@ export const DeleteItemApi = async (c: Context) => { // Delete from Vespa if (itemToDelete.vespaDocId) { const vespaDocIds = expandSheetIds(itemToDelete.vespaDocId) - await Promise.all( - vespaDocIds.map(id => - DeleteDocument(id, KbItemsSchema).then(() => - loggerWithChild({ email: userEmail }).info( - `Deleted file from Vespa: ${id}`, - ) + for (const id of vespaDocIds) { + try { + await DeleteDocument(id, KbItemsSchema) + loggerWithChild({ email: userEmail }).info( + `Deleted file from Vespa: ${id}`, ) - ) - ) + } catch (error) { + loggerWithChild({ email: userEmail }).error( + `Failed to delete file from Vespa: ${id}`, + { error: getErrorMessage(error) } + ) + } + } } } catch (error) { loggerWithChild({ email: userEmail }).warn( diff --git a/server/integrations/dataSource/config.ts b/server/integrations/dataSource/config.ts index 
6c98f272a..b835278bb 100644 --- a/server/integrations/dataSource/config.ts +++ b/server/integrations/dataSource/config.ts @@ -18,7 +18,7 @@ export const DATASOURCE_CONFIG = { 10, ), MAX_SPREADSHEET_FILE_SIZE_MB: parseInt( - process.env.DATASOURCE_MAX_SPREADSHEET_FILE_SIZE_MB || "15", + process.env.DATASOURCE_MAX_SPREADSHEET_FILE_SIZE_MB || "10", 10, ), MAX_TEXT_FILE_SIZE_MB: parseInt( diff --git a/server/integrations/dataSource/index.ts b/server/integrations/dataSource/index.ts index bd6a94520..67ce63831 100644 --- a/server/integrations/dataSource/index.ts +++ b/server/integrations/dataSource/index.ts @@ -375,26 +375,6 @@ const processSpreadsheetFile = async ( const worksheet = workbook.Sheets[sheetName] if (!worksheet) continue - // const sheetData: string[][] = XLSX.utils.sheet_to_json(worksheet, { - // header: 1, - // defval: "", - // raw: false, - // }) - - // const validRows = sheetData.filter((row) => - // row.some((cell) => cell && cell.toString().trim().length > 0), - // ) - - // if (validRows.length === 0) continue - - // if (validRows?.length > DATASOURCE_CONFIG.MAX_ATTACHMENT_SHEET_ROWS) { - // // If there are more rows than MAX_GD_SHEET_ROWS, still index it but with empty content - // // Logger.warn( - // // `Large no. of rows in ${spreadsheet.name} -> ${sheet.sheetTitle}, indexing with empty content`, - // // ) - // return [] - // } - // Use the new header-preserving chunking function const sheetChunks = chunkSheetWithHeaders(worksheet) diff --git a/server/integrations/google/config.ts b/server/integrations/google/config.ts index 475b658f3..8c97312ee 100644 --- a/server/integrations/google/config.ts +++ b/server/integrations/google/config.ts @@ -12,7 +12,7 @@ export const scopes = [ ] export const MAX_GD_PDF_SIZE = 15 // In MB -export const MAX_GD_SHEET_SIZE = 15 // In MB +export const MAX_GD_SHEET_SIZE = 10 // In MB export const MAX_GD_SLIDES_TEXT_LEN = 300000 export const ServiceAccountUserConcurrency = 2 export const GoogleDocsConcurrency = 8 @@ -23,7 +23,7 @@ export const MAX_ATTACHMENT_PDF_SIZE = 15 export const MAX_ATTACHMENT_TEXT_SIZE = 10 export const MAX_ATTACHMENT_DOCX_SIZE = 15 export const MAX_ATTACHMENT_PPTX_SIZE = 15 -export const MAX_ATTACHMENT_SHEET_SIZE = 15 +export const MAX_ATTACHMENT_SHEET_SIZE = 10 // if true will directly ingest the data without checking // if false will check for its existance in vespa diff --git a/server/integrations/google/index.ts b/server/integrations/google/index.ts index 0af5133d7..bce7014db 100644 --- a/server/integrations/google/index.ts +++ b/server/integrations/google/index.ts @@ -2180,56 +2180,6 @@ export const getSpreadsheet = async ( } } -// Function to chunk rows of text data into manageable batches -// Excludes numerical data, assuming users do not typically search by numbers -// Concatenates all textual cells in a row into a single string -// Adds rows' string data to a chunk until the 512-character limit is exceeded -// If adding a row exceeds the limit, the chunk is added to the next chunk -// Otherwise, the row is added to the current chunk -// const chunkFinalRows = (allRows: string[][]): string[] => { -// const chunks: string[] = [] -// let currentChunk = "" -// let totalTextLength = 0 - -// for (const row of allRows) { -// // Filter out numerical cells and empty strings -// const textualCells = row.filter( -// (cell) => isNaN(Number(cell)) && cell.trim().length > 0, -// ) - -// if (textualCells.length === 0) continue // Skip if no textual data - -// const rowText = textualCells.join(" ") - -// // Check if adding 
this rowText would exceed the maximum text length -// if (totalTextLength + rowText.length > MAX_GD_SHEET_TEXT_LEN) { -// // Logger.warn(`Text length excedded, indexing with empty content`) -// // Return an empty array if the total text length exceeds the limit -// return [] -// } - -// totalTextLength += rowText.length - -// if ((currentChunk + " " + rowText).trim().length > 512) { -// // Add the current chunk to the list and start a new chunk -// if (currentChunk.trim().length > 0) { -// chunks.push(currentChunk.trim()) -// } -// currentChunk = rowText -// } else { -// // Append the row text to the current chunk -// currentChunk += " " + rowText -// } -// } - -// if (currentChunk.trim().length > 0) { -// // Add any remaining text as the last chunk -// chunks.push(currentChunk.trim()) -// } - -// return chunks -// } - export const getSheetsListFromOneSpreadsheet = async ( sheets: sheets_v4.Sheets, client: GoogleClient, @@ -2245,9 +2195,13 @@ export const getSheetsListFromOneSpreadsheet = async ( userEmail, ) - if (spreadsheet.size && parseInt(spreadsheet.size) > MAX_GD_SHEET_SIZE) { + // Early size check before fetching spreadsheet data + const sizeInBytes = spreadsheet.size ? parseInt(spreadsheet.size, 10) : 0 + if (!isNaN(sizeInBytes) && sizeInBytes > MAX_GD_SHEET_SIZE) { + const sizeInMB = (sizeInBytes / (1024 * 1024)).toFixed(2) + const maxSizeInMB = (MAX_GD_SHEET_SIZE / (1024 * 1024)).toFixed(2) loggerWithChild({ email: userEmail }).warn( - `Ignoring ${spreadsheet.name} as its more than ${MAX_GD_SHEET_SIZE} MB`, + `Ignoring ${spreadsheet.name} as its size (${sizeInMB} MB) exceeds the limit of ${maxSizeInMB} MB`, ) return [] } @@ -2287,17 +2241,7 @@ export const getSheetsListFromOneSpreadsheet = async ( continue } - let chunks: string[] = [] - - // if (finalRows?.length > MAX_GD_SHEET_ROWS) { - // // If there are more rows than MAX_GD_SHEET_ROWS, still index it but with empty content - // // Logger.warn( - // // `Large no. 
of rows in ${spreadsheet.name} -> ${sheet.sheetTitle}, indexing with empty content`, - // // ) - // chunks = [] - // } else { - chunks = chunkSheetWithHeaders(finalRows) - // } + const chunks: string[] = chunkSheetWithHeaders(finalRows) const sheetDataToBeIngested = { title: `${spreadsheet.name} / ${sheet?.sheetTitle}`, diff --git a/server/integrations/google/worker-utils.ts b/server/integrations/google/worker-utils.ts index b6874da87..dcd7d3e1e 100644 --- a/server/integrations/google/worker-utils.ts +++ b/server/integrations/google/worker-utils.ts @@ -418,43 +418,6 @@ export const processSpreadsheetFileWithSheetInfo = async ( const worksheet = workbook.Sheets[sheetName] if (!worksheet) continue - // Get the range of the worksheet - // const range = XLSX.utils.decode_range(worksheet["!ref"] || "A1") - // const totalRows = range.e.r - range.s.r + 1 - - // Skip sheets with too many rows - // if (totalRows > MAX_ATTACHMENT_SHEET_ROWS) { - // Logger.warn( - // `Sheet "${sheetName}" in ${filename} has ${totalRows} rows (max: ${MAX_ATTACHMENT_SHEET_ROWS}), skipping`, - // ) - // continue - // } - - // // Convert sheet to JSON array of arrays with a row limit - // const sheetData: string[][] = XLSX.utils.sheet_to_json(worksheet, { - // header: 1, - // defval: "", - // raw: false, - // range: 0, // Start from first row - // blankrows: false, - // }) - - // // Clean and get valid rows - // const validRows = sheetData.filter((row) => - // row.some((cell) => cell && cell.toString().trim().length > 0), - // ) - - // if (validRows.length === 0) { - // Logger.debug(`Sheet "${sheetName}" has no valid content, skipping`) - // continue - // } - - // // Chunk the rows for this specific sheet - // const sheetChunks = chunkSheetRows(validRows) - // const filteredSheetChunks = sheetChunks.filter( - // (chunk) => chunk.trim().length > 0, - // ) - const filteredSheetChunks = chunkSheetWithHeaders(worksheet); if (filteredSheetChunks.length === 0) { @@ -533,51 +496,3 @@ export const processSpreadsheetFileWithSheetInfo = async ( } } } - -// Function to chunk sheet rows (simplified version of chunkFinalRows) -// const chunkSheetRows = (allRows: string[][]): string[] => { -// const chunks: string[] = [] -// let currentChunk = "" -// let totalTextLength = 0 -// const MAX_CHUNK_SIZE = 512 - -// for (const row of allRows) { -// // Filter out numerical cells and empty strings, join textual cells -// const textualCells = row -// .filter( -// (cell) => -// cell && isNaN(Number(cell)) && cell.toString().trim().length > 0, -// ) -// .map((cell) => cell.toString().trim()) - -// if (textualCells.length === 0) continue - -// const rowText = textualCells.join(" ") - -// // Check if adding this rowText would exceed the maximum text length -// if (totalTextLength + rowText.length > MAX_ATTACHMENT_SHEET_TEXT_LEN) { -// Logger.warn( -// `Text length exceeded for spreadsheet, stopping at ${totalTextLength} characters`, -// ) -// // If we have some chunks, return them; otherwise return empty -// return chunks.length > 0 ? chunks : [] -// } - -// totalTextLength += rowText.length - -// if ((currentChunk + " " + rowText).trim().length > MAX_CHUNK_SIZE) { -// if (currentChunk.trim().length > 0) { -// chunks.push(currentChunk.trim()) -// } -// currentChunk = rowText -// } else { -// currentChunk += (currentChunk ? 
" " : "") + rowText -// } -// } - -// if (currentChunk.trim().length > 0) { -// chunks.push(currentChunk.trim()) -// } - -// return chunks -// } diff --git a/server/integrations/microsoft/attachment-utils.ts b/server/integrations/microsoft/attachment-utils.ts index fa5f82a49..3a6a6c9b5 100644 --- a/server/integrations/microsoft/attachment-utils.ts +++ b/server/integrations/microsoft/attachment-utils.ts @@ -448,39 +448,6 @@ export const processSpreadsheetFileWithSheetInfo = async ( const worksheet = workbook.Sheets[sheetName] if (!worksheet) continue - // // Get the range of the worksheet - // const range = XLSX.utils.decode_range(worksheet["!ref"] || "A1") - // const totalRows = range.e.r - range.s.r + 1 - - // // Skip sheets with too many rows - // if (totalRows > MAX_ATTACHMENT_SHEET_ROWS) { - // Logger.warn( - // `Sheet "${sheetName}" in ${filename} has ${totalRows} rows (max: ${MAX_ATTACHMENT_SHEET_ROWS}), skipping`, - // ) - // continue - // } - - // // Convert sheet to JSON array of arrays with a row limit - // const sheetData: string[][] = XLSX.utils.sheet_to_json(worksheet, { - // header: 1, - // defval: "", - // raw: false, - // range: 0, // Start from first row - // blankrows: false, - // }) - - // // Clean and get valid rows - // const validRows = sheetData.filter((row) => - // row.some((cell) => cell && cell.toString().trim().length > 0), - // ) - - // if (validRows.length === 0) { - // Logger.debug(`Sheet "${sheetName}" has no valid content, skipping`) - // continue - // } - - // // Chunk the rows for this specific sheet - // const sheetChunks = chunkSheetRows(validRows) const sheetChunks = chunkSheetWithHeaders(worksheet) const filteredSheetChunks = sheetChunks.filter( (chunk) => chunk.trim().length > 0, @@ -563,54 +530,6 @@ export const processSpreadsheetFileWithSheetInfo = async ( } } -// Function to chunk sheet rows (simplified version of chunkFinalRows) -// const chunkSheetRows = (allRows: string[][]): string[] => { -// const chunks: string[] = [] -// let currentChunk = "" -// let totalTextLength = 0 -// const MAX_CHUNK_SIZE = 512 - -// for (const row of allRows) { -// // Filter out numerical cells and empty strings, join textual cells -// const textualCells = row -// .filter( -// (cell) => -// cell && isNaN(Number(cell)) && cell.toString().trim().length > 0, -// ) -// .map((cell) => cell.toString().trim()) - -// if (textualCells.length === 0) continue - -// const rowText = textualCells.join(" ") - -// // Check if adding this rowText would exceed the maximum text length -// if (totalTextLength + rowText.length > MAX_ATTACHMENT_SHEET_TEXT_LEN) { -// Logger.warn( -// `Text length exceeded for spreadsheet, stopping at ${totalTextLength} characters`, -// ) -// // If we have some chunks, return them; otherwise return empty -// return chunks.length > 0 ? chunks : [] -// } - -// totalTextLength += rowText.length - -// if ((currentChunk + " " + rowText).trim().length > MAX_CHUNK_SIZE) { -// if (currentChunk.trim().length > 0) { -// chunks.push(currentChunk.trim()) -// } -// currentChunk = rowText -// } else { -// currentChunk += (currentChunk ? 
" " : "") + rowText -// } -// } - -// if (currentChunk.trim().length > 0) { -// chunks.push(currentChunk.trim()) -// } - -// return chunks -// } - // Helper function to check if a file is a spreadsheet export const isSpreadsheetFile = (mimeType: string): boolean => { return ( diff --git a/server/sheetChunk.ts b/server/sheetChunk.ts index 9646e4b0e..968c0fb45 100644 --- a/server/sheetChunk.ts +++ b/server/sheetChunk.ts @@ -13,7 +13,7 @@ function isTimestamp(value: any): boolean { const numValue = Number(value) if (!isNaN(numValue)) { // Unix timestamp should be reasonable (between 1970 and 2100) - const minTimestamp = 946684800000 // 2000-01-01 in milliseconds + const minTimestamp = 0 // 1970-01-01 in milliseconds (Unix epoch) const maxTimestamp = 4102444800000 // 2100-01-01 in milliseconds return numValue >= minTimestamp && numValue <= maxTimestamp } From 2f553750c73b4f03e3be78c3b7f4ad6b74ddd2fb Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Mon, 6 Oct 2025 17:51:20 +0530 Subject: [PATCH 4/7] fix: added sql validation --- server/ai/context.ts | 2 +- server/api/chat/chat.ts | 3 +- server/api/files.ts | 6 +- server/integrations/google/index.ts | 2 +- server/lib/duckdb.ts | 34 ++- server/lib/sqlValidator.ts | 319 ++++++++++++++++++++++++++++ server/package.json | 1 + 7 files changed, 355 insertions(+), 12 deletions(-) create mode 100644 server/lib/sqlValidator.ts diff --git a/server/ai/context.ts b/server/ai/context.ts index 2f898071d..4c7490b6a 100644 --- a/server/ai/context.ts +++ b/server/ai/context.ts @@ -69,7 +69,7 @@ const extractHeaderAndDataChunks = ( processedChunks.push({ chunk: headerChunk, score: 1, - index: chunks_summary.length, + index: 0, }); } diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index b893ae462..d3e67fb57 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -230,7 +230,8 @@ export function expandSheetIds(fileId: string): string[] { const sheetNumber = parseInt(sheetNumberStr, 10) // Generate IDs from docId_sheet_0 to docId_sheet_number const expandedIds: string[] = [] - for (let i = 0; i < Math.max(sheetNumber, 1); i++) { + const upper = Number.isFinite(sheetNumber) ? sheetNumber : 1 + for (let i = 0; i < upper; i++) { expandedIds.push(`${docId}_sheet_${i}`) } diff --git a/server/api/files.ts b/server/api/files.ts index fd13effdb..189e22e01 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -285,7 +285,7 @@ export const handleAttachmentUpload = async (c: Context) => { const vespaDoc = { docId: docId, clId: "attachment", - itemId: fileId, + itemId: docId, fileName: fileName, app: Apps.KnowledgeBase as const, entity: KnowledgeBaseEntity.Attachment, @@ -334,13 +334,13 @@ export const handleAttachmentUpload = async (c: Context) => { ? path.relative(outputDir, thumbnailPath) : "", createdAt: new Date(), - url: `/api/v1/attachments/${fileId}`, + url: `/api/v1/attachments/${vespaId}`, } attachmentMetadata.push(metadata) loggerWithChild({ email }).info( - `Attachment "${file.name}" processed with ID ${fileId}${isImage ? " (saved to disk with thumbnail)" : " (processed and ingested into Vespa)"}`, + `Attachment "${file.name}" processed with ID ${vespaId}${isImage ? 
" (saved to disk with thumbnail)" : " (processed and ingested into Vespa)"}`, ) } catch (error) { // Cleanup: remove the directory if file write fails (only for images) diff --git a/server/integrations/google/index.ts b/server/integrations/google/index.ts index bce7014db..1d41e7a9c 100644 --- a/server/integrations/google/index.ts +++ b/server/integrations/google/index.ts @@ -2922,7 +2922,7 @@ export async function* listFiles( let nextPageToken = "" // Build the query with date filters if provided - let query = q ? `${q} and trashed = false` : "trashed = false" + let query = q ? `(${q}) and trashed = false` : "trashed = false" const dateFilters: string[] = [] if (startDate) { diff --git a/server/lib/duckdb.ts b/server/lib/duckdb.ts index 2cdf3b6c6..2a6688157 100644 --- a/server/lib/duckdb.ts +++ b/server/lib/duckdb.ts @@ -3,7 +3,8 @@ import { getLogger } from "@/logger"; import { Subsystem } from "@/types"; import type { DuckDBResult } from "@/types"; import { analyzeQueryAndGenerateSQL } from "./sqlInference"; -import { writeFileSync, unlinkSync, createWriteStream, promises as fs } from "fs"; +import { validateSQLQuery } from "./sqlValidator"; +import { writeFileSync, createWriteStream, promises as fs } from "fs"; import { join } from "path"; import { tmpdir, cpus } from "os"; @@ -81,7 +82,7 @@ export const querySheetChunks = async ( quote='"', escape='"', null_padding=true, - ignore_errors=true, + ignore_errors=false, strict_mode=false, sample_size=100000 ) @@ -130,8 +131,29 @@ export const querySheetChunks = async ( } Logger.debug(`Generated SQL: ${duckDBQuery.sql}`); - Logger.debug(`Executing DuckDB query: ${duckDBQuery.sql}`); - const result = await connection.all(duckDBQuery.sql); + // Validate and sanitize the generated SQL using AST parsing + Logger.debug("Validating generated SQL for security and correctness"); + const validationResult = validateSQLQuery(duckDBQuery.sql, tableName, { + allowSubqueries: true, + allowJoins: false, + allowWindowFunctions: true, + allowCTEs: true, + }); + + if (!validationResult.isValid) { + Logger.error(`SQL validation failed: ${validationResult.error}`); + throw new Error(`SQL validation failed: ${validationResult.error}`); + } + + if (validationResult.warnings && validationResult.warnings.length > 0) { + Logger.warn(`SQL validation warnings: ${validationResult.warnings.join(", ")}`); + } + + const finalSQL = validationResult.sanitizedSQL || duckDBQuery.sql; + Logger.debug(`Final validated SQL: ${finalSQL}`); + + Logger.debug(`Executing DuckDB query: ${finalSQL}`); + const result = await connection.all(finalSQL); const elapsedMs = Date.now() - startTime; Logger.debug(`Query executed successfully, returned ${result.length} rows in ${elapsedMs}ms`); @@ -145,7 +167,7 @@ export const querySheetChunks = async ( const resultPackage: DuckDBResult = { user_question: userQuery, - sql: duckDBQuery.sql, + sql: finalSQL, // Use the validated and sanitized SQL execution_meta: { row_count: result.length, elapsed_ms: elapsedMs, @@ -180,7 +202,7 @@ export const querySheetChunks = async ( // Clean up temporary TSV file try { - unlinkSync(tempFilePath); + await fs.unlink(tempFilePath); Logger.debug(`Temporary TSV file deleted: ${tempFilePath}`); } catch (e) { Logger.warn(`Failed to delete temporary TSV file ${tempFilePath}:`, e); diff --git a/server/lib/sqlValidator.ts b/server/lib/sqlValidator.ts new file mode 100644 index 000000000..a855f95a5 --- /dev/null +++ b/server/lib/sqlValidator.ts @@ -0,0 +1,319 @@ +import { Parser } from "node-sql-parser"; +import { 
getLogger } from "@/logger"; +import { Subsystem } from "@/types"; + +const Logger = getLogger(Subsystem.Integrations).child({ + module: "sqlValidator", +}); + +export interface SQLValidationResult { + isValid: boolean; + sanitizedSQL?: string; + error?: string; + warnings?: string[]; +} + +export interface SQLValidationOptions { + allowedViewName: string; + allowSubqueries?: boolean; + allowJoins?: boolean; + allowWindowFunctions?: boolean; + allowCTEs?: boolean; +} + +/** + * Comprehensive SQL validator using AST parsing for security and correctness + */ +export class SQLValidator { + private parser: Parser; + private options: SQLValidationOptions; + + constructor(options: SQLValidationOptions) { + this.parser = new Parser(); + this.options = { + allowSubqueries: true, + allowJoins: false, + allowWindowFunctions: true, + allowCTEs: true, + ...options, + }; + } + + /** + * Validates and sanitizes SQL query using AST analysis + */ + public validateSQL(sql: string): SQLValidationResult { + try { + Logger.debug(`Validating SQL: ${sql}`); + + // Parse SQL into AST + const ast = this.parseSQL(sql); + if (!ast) { + return { + isValid: false, + error: "Failed to parse SQL syntax", + }; + } + + // Check for multiple statements + if (Array.isArray(ast)) { + return { + isValid: false, + error: "Multiple statements not allowed", + }; + } + + // Validate statement type + const statementTypeValidation = this.validateStatementType(ast); + if (!statementTypeValidation.isValid) { + return statementTypeValidation; + } + + // Validate table access + const tableValidation = this.validateTableAccess(sql); + if (!tableValidation.isValid) { + return tableValidation; + } + + // Validate query structure + const structureValidation = this.validateQueryStructure(ast); + if (!structureValidation.isValid) { + return structureValidation; + } + + Logger.debug(`SQL validation successful: ${sql}`); + return { + isValid: true, + sanitizedSQL: sql, + warnings: this.collectWarnings(ast), + }; + } catch (error) { + Logger.error("SQL validation error:", error); + return { + isValid: false, + error: `Validation error: ${error instanceof Error ? error.message : String(error)}`, + }; + } + } + + private parseSQL(sql: string): any { + try { + return this.parser.astify(sql); + } catch (error) { + Logger.error("SQL parsing failed:", error); + return null; + } + } + + private validateStatementType(ast: any): SQLValidationResult { + const allowedTypes = ["select", "with"]; + + if (!allowedTypes.includes(ast.type?.toLowerCase())) { + return { + isValid: false, + error: `Statement type '${ast.type}' is not allowed. Only SELECT and WITH statements are permitted.`, + }; + } + + return { isValid: true }; + } + + private validateTableAccess(sql: string): SQLValidationResult { + try { + const tableList = this.parser.tableList(sql); + Logger.debug("Raw table list:", tableList); + const allowedViewName = this.options.allowedViewName.toLowerCase(); + + for (const table of tableList) { + // Extract the actual table name from the complex string format + const tableName = this.extractTableNameFromString(table); + Logger.debug(`Extracted table name: "${tableName}" from "${table}"`); + + if (tableName && tableName.toLowerCase() !== allowedViewName) { + return { + isValid: false, + error: `Access to table '${tableName}' is not allowed. 
Only '${this.options.allowedViewName}' is permitted.`, + }; + } + } + + return { isValid: true }; + } catch (error) { + Logger.error("Table access validation failed:", error); + return { + isValid: false, + error: "Failed to validate table access", + }; + } + } + + private extractTableNameFromString(tableString: string): string | null { + if (!tableString) return null; + + // Handle the format "select::null::table_name" or similar + const parts = tableString.split('::'); + if (parts.length >= 3) { + // Return the last part which should be the actual table name + return parts[parts.length - 1]; + } + + // If it's a simple table name, return as is + return tableString; + } + + private validateQueryStructure(ast: any): SQLValidationResult { + const warnings: string[] = []; + + // Check for subqueries + if (this.hasSubqueries(ast) && !this.options.allowSubqueries) { + return { + isValid: false, + error: "Subqueries are not allowed", + }; + } + + // Check for joins + if (this.hasJoins(ast) && !this.options.allowJoins) { + return { + isValid: false, + error: "Joins are not allowed", + }; + } + + // Check for window functions + if (this.hasWindowFunctions(ast) && !this.options.allowWindowFunctions) { + return { + isValid: false, + error: "Window functions are not allowed", + }; + } + + // Check for CTEs + if (ast.type === "with" && !this.options.allowCTEs) { + return { + isValid: false, + error: "Common Table Expressions (CTEs) are not allowed", + }; + } + + return { isValid: true, warnings }; + } + + private hasSubqueries(ast: any): boolean { + if (!ast) return false; + + // Check if any FROM clause contains a subquery + if (ast.from) { + for (const fromItem of Array.isArray(ast.from) ? ast.from : [ast.from]) { + if (fromItem.expr && fromItem.expr.type === "select") { + return true; + } + } + } + + // Check WHERE clause for subqueries + if (ast.where && this.hasSubqueryInExpression(ast.where)) { + return true; + } + + // Check HAVING clause for subqueries + if (ast.having && this.hasSubqueryInExpression(ast.having)) { + return true; + } + + return false; + } + + private hasSubqueryInExpression(expr: any): boolean { + if (!expr) return false; + + if (expr.type === "select") return true; + + if (expr.left && this.hasSubqueryInExpression(expr.left)) return true; + if (expr.right && this.hasSubqueryInExpression(expr.right)) return true; + if (expr.operand && this.hasSubqueryInExpression(expr.operand)) return true; + + return false; + } + + private hasJoins(ast: any): boolean { + if (!ast || !ast.from) return false; + + const fromItems = Array.isArray(ast.from) ? 
ast.from : [ast.from]; + return fromItems.some((item: any) => item.join); + } + + private hasWindowFunctions(ast: any): boolean { + if (!ast) return false; + + // Check SELECT columns for window functions + if (ast.columns) { + for (const column of ast.columns) { + if (this.hasWindowFunctionInExpression(column)) { + return true; + } + } + } + + return false; + } + + private hasWindowFunctionInExpression(expr: any): boolean { + if (!expr) return false; + + if (expr.type === "function" && expr.over) { + return true; + } + + if (expr.left && this.hasWindowFunctionInExpression(expr.left)) return true; + if (expr.right && this.hasWindowFunctionInExpression(expr.right)) return true; + if (expr.operand && this.hasWindowFunctionInExpression(expr.operand)) return true; + + return false; + } + + private astToSQL(ast: any): string | null { + try { + // Convert AST back to SQL without any modifications + return this.parser.sqlify(ast); + } catch (error) { + Logger.error("Failed to convert AST to SQL:", error); + return null; + } + } + + private collectWarnings(ast: any): string[] { + const warnings: string[] = []; + + // Check for potential performance issues + if (this.hasSubqueries(ast)) { + warnings.push("Query contains subqueries which may impact performance"); + } + + if (this.hasJoins(ast)) { + warnings.push("Query contains joins which may impact performance"); + } + + if (this.hasWindowFunctions(ast)) { + warnings.push("Query contains window functions which may impact performance"); + } + + return warnings; + } +} + +/** + * Convenience function to validate SQL with default options + */ +export function validateSQLQuery( + sql: string, + allowedViewName: string, + options?: Partial +): SQLValidationResult { + const validator = new SQLValidator({ + allowedViewName, + ...options, + }); + + return validator.validateSQL(sql); +} diff --git a/server/package.json b/server/package.json index d168cb64a..10f56b35d 100644 --- a/server/package.json +++ b/server/package.json @@ -103,6 +103,7 @@ "livekit-server-sdk": "^2.13.3", "llama3-tokenizer-js": "^1.2.0", "nanoid": "^5.1.5", + "node-sql-parser": "^5.3.12", "ollama": "^0.5.11", "openai": "^5.16.0", "ora": "^8.1.1", From b16d8522c09bf01f319b530350e5b327a2670837 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Mon, 6 Oct 2025 17:51:38 +0530 Subject: [PATCH 5/7] fix: resolved ai comments --- server/integrations/google/index.ts | 22 +++++++++++----------- server/lib/duckdb.ts | 7 +++---- server/lib/sqlInference.ts | 2 +- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/server/integrations/google/index.ts b/server/integrations/google/index.ts index 1d41e7a9c..e28576837 100644 --- a/server/integrations/google/index.ts +++ b/server/integrations/google/index.ts @@ -2186,6 +2186,17 @@ export const getSheetsListFromOneSpreadsheet = async ( spreadsheet: drive_v3.Schema$File, userEmail: string, ): Promise => { + // Early size check before fetching spreadsheet data + const sizeInBytes = spreadsheet.size ? 
parseInt(spreadsheet.size, 10) : 0 + if (!isNaN(sizeInBytes) && sizeInBytes > MAX_GD_SHEET_SIZE) { + const sizeInMB = (sizeInBytes / (1024 * 1024)).toFixed(2) + const maxSizeInMB = (MAX_GD_SHEET_SIZE / (1024 * 1024)).toFixed(2) + loggerWithChild({ email: userEmail }).warn( + `Ignoring ${spreadsheet.name} as its size (${sizeInMB} MB) exceeds the limit of ${maxSizeInMB} MB`, + ) + return [] + } + const sheetsArr = [] try { const spreadSheetData = await getSpreadsheet( @@ -2195,17 +2206,6 @@ export const getSheetsListFromOneSpreadsheet = async ( userEmail, ) - // Early size check before fetching spreadsheet data - const sizeInBytes = spreadsheet.size ? parseInt(spreadsheet.size, 10) : 0 - if (!isNaN(sizeInBytes) && sizeInBytes > MAX_GD_SHEET_SIZE) { - const sizeInMB = (sizeInBytes / (1024 * 1024)).toFixed(2) - const maxSizeInMB = (MAX_GD_SHEET_SIZE / (1024 * 1024)).toFixed(2) - loggerWithChild({ email: userEmail }).warn( - `Ignoring ${spreadsheet.name} as its size (${sizeInMB} MB) exceeds the limit of ${maxSizeInMB} MB`, - ) - return [] - } - if (spreadSheetData) { // Now we should get all sheets inside this spreadsheet using the spreadSheetData const allSheetsFromSpreadSheet = await getAllSheetsFromSpreadSheet( diff --git a/server/lib/duckdb.ts b/server/lib/duckdb.ts index 2a6688157..c18ebdbc8 100644 --- a/server/lib/duckdb.ts +++ b/server/lib/duckdb.ts @@ -28,7 +28,7 @@ export const querySheetChunks = async ( ); // Create a temporary CSV file using streaming for large data - const tmpDir = tmpdir().replace(/'/g, "''"); + const tmpDir = tmpdir() const tempFilePath = join(tmpDir, `duckdb_temp_${Date.now()}.tsv`); Logger.debug(`Writing ${cleanedSheetChunks.length} chunks to temporary file: ${tempFilePath}`); @@ -69,14 +69,13 @@ export const querySheetChunks = async ( Logger.debug(`Creating VIEW ${tableName} over CSV file: ${tempFilePath}`); // 1) Create a VIEW over the CSV (no materialization) - const escapedPath = tempFilePath.replace(/'/g, "''"); - Logger.debug(`Escaped path: ${escapedPath}`); + Logger.debug(`Escaped path: ${tempFilePath}`); try { await connection.run(` CREATE OR REPLACE VIEW ${tableName} AS SELECT * FROM read_csv( - '${escapedPath}', + '${tempFilePath}', delim='\t', header=true, quote='"', diff --git a/server/lib/sqlInference.ts b/server/lib/sqlInference.ts index 6ee428698..ad2235bb3 100644 --- a/server/lib/sqlInference.ts +++ b/server/lib/sqlInference.ts @@ -23,7 +23,7 @@ export const analyzeQueryAndGenerateSQL = async ( schema: string, fewShotSamples: string ): Promise => { - Logger.debug(`Analyzing query and generating SQL: ${query}`); + Logger.debug(`Analyzing query and generating SQL`); const stripNoise = (s: string) => { let t = s.trim(); From 2fbf5774daff62514fb3520099998a91d42a43c7 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Tue, 7 Oct 2025 17:43:33 +0530 Subject: [PATCH 6/7] fix: fixed tests --- server/api/chat/tools.ts | 48 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/server/api/chat/tools.ts b/server/api/chat/tools.ts index 29ad98862..8ff09441c 100644 --- a/server/api/chat/tools.ts +++ b/server/api/chat/tools.ts @@ -446,12 +446,12 @@ async function executeVespaSearch(options: UnifiedSearchOptions): Promise<{ return { result: "No results found.", contexts: [] } } - const fragments: MinimalAgentFragment[] = children.map((r) => { + const fragments: MinimalAgentFragment[] = await Promise.all(children.map(async (r) => { if (r.fields.sddocname === dataSourceFileSchema) { const fields = r.fields as 
VespaDataSourceFile return { id: `${fields.docId}`, - content: answerContextMap(r, userMetadata, maxDefaultSummary), + content: await answerContextMap(r, userMetadata, maxDefaultSummary), source: { docId: fields.docId, title: fields.fileName || "Untitled", @@ -465,11 +465,11 @@ async function executeVespaSearch(options: UnifiedSearchOptions): Promise<{ const citation = searchToCitation(r) return { id: `${citation.docId}`, - content: answerContextMap(r, userMetadata, maxDefaultSummary), + content: await answerContextMap(r, userMetadata, maxDefaultSummary), source: citation, confidence: r.relevance || 0.7, } - }) + })) let summaryText = `Found ${fragments.length} results` if (query) summaryText += ` matching '${query}'` @@ -1034,13 +1034,13 @@ export const getSlackThreads: AgentTool = { !!(item.fields && "sddocname" in item.fields), ) if (threads.length > 0) { - const fragments: MinimalAgentFragment[] = threads.map( - (item: VespaSearchResults): MinimalAgentFragment => { + const fragments: MinimalAgentFragment[] = await Promise.all(threads.map( + async (item: VespaSearchResults): Promise => { const citation = searchToCitation(item) Logger.debug({ item }, "Processing item in metadata_retrieval tool") const content = item.fields - ? answerContextMap(item, userMetadata, maxDefaultSummary) + ? await answerContextMap(item, userMetadata, maxDefaultSummary) : `Context unavailable for ${citation.title || citation.docId}` return { @@ -1050,7 +1050,7 @@ export const getSlackThreads: AgentTool = { confidence: item.relevance || 0.7, // Use item.relevance if available } }, - ) + )) let responseText = `Found ${fragments.length} Slack message${fragments.length !== 1 ? "s" : ""}` if (params.filter_query) { @@ -1248,13 +1248,13 @@ export const getSlackMessagesFromUser: AgentTool = { ) if (items.length > 0) { - const fragments: MinimalAgentFragment[] = items.map( - (item: VespaSearchResults): MinimalAgentFragment => { + const fragments: MinimalAgentFragment[] = await Promise.all(items.map( + async (item: VespaSearchResults): Promise => { const citation = searchToCitation(item) Logger.debug({ item }, "Processing item in metadata_retrieval tool") const content = item.fields - ? answerContextMap(item, userMetadata, maxDefaultSummary) + ? await answerContextMap(item, userMetadata, maxDefaultSummary) : `Context unavailable for ${citation.title || citation.docId}` return { @@ -1264,7 +1264,7 @@ export const getSlackMessagesFromUser: AgentTool = { confidence: item.relevance || 0.7, // Use item.relevance if available } }, - ) + )) let responseText = `Found ${fragments.length} Slack message${fragments.length !== 1 ? "s" : ""}` if (params.filter_query) { @@ -1488,13 +1488,13 @@ export const getSlackRelatedMessages: AgentTool = { } // Process results into fragments - const fragments: MinimalAgentFragment[] = items.map( - (item: VespaSearchResults): MinimalAgentFragment => { + const fragments: MinimalAgentFragment[] = await Promise.all(items.map( + async (item: VespaSearchResults): Promise => { const citation = searchToCitation(item) Logger.debug({ item }, "Processing Slack message item") const content = item.fields - ? answerContextMap(item, userMetadata, maxDefaultSummary) + ? 
await answerContextMap(item, userMetadata, maxDefaultSummary) : `Content unavailable for ${citation.title || citation.docId}` return { @@ -1504,7 +1504,7 @@ export const getSlackRelatedMessages: AgentTool = { confidence: item.relevance || 0.7, } }, - ) + )) // Build response message let responseText = `Found ${fragments.length} Slack message${fragments.length !== 1 ? "s" : ""}` @@ -1825,13 +1825,13 @@ export const getSlackMessagesFromChannel: AgentTool = { contexts: [], } } - const fragments: MinimalAgentFragment[] = items.map( - (item: VespaSearchResults): MinimalAgentFragment => { + const fragments: MinimalAgentFragment[] = await Promise.all(items.map( + async (item: VespaSearchResults): Promise => { const citation = searchToCitation(item) Logger.debug({ item }, "Processing item in metadata_retrieval tool") const content = item.fields - ? answerContextMap(item, userMetadata, maxDefaultSummary) + ? await answerContextMap(item, userMetadata, maxDefaultSummary) : `Context unavailable for ${citation.title || citation.docId}` return { @@ -1841,7 +1841,7 @@ export const getSlackMessagesFromChannel: AgentTool = { confidence: item.relevance || 0.7, // Use item.relevance if available } }, - ) + )) let responseText = `Found ${fragments.length} Slack message${fragments.length !== 1 ? "s" : ""}` if (params.filter_query) { @@ -2032,13 +2032,13 @@ export const getSlackMessagesFromTimeRange: AgentTool = { contexts: [], } } - const fragments: MinimalAgentFragment[] = items.map( - (item: VespaSearchResults): MinimalAgentFragment => { + const fragments: MinimalAgentFragment[] = await Promise.all(items.map( + async (item: VespaSearchResults): Promise => { const citation = searchToCitation(item) Logger.debug({ item }, "Processing item in metadata_retrieval tool") const content = item.fields - ? answerContextMap(item, userMetadata, maxDefaultSummary) + ? await answerContextMap(item, userMetadata, maxDefaultSummary) : `Context unavailable for ${citation.title || citation.docId}` return { @@ -2048,7 +2048,7 @@ export const getSlackMessagesFromTimeRange: AgentTool = { confidence: item.relevance || 0.7, // Use item.relevance if available } }, - ) + )) let responseText = `Found ${fragments.length} Slack message${fragments.length !== 1 ? 
"s" : ""}` if (params.filter_query) { From 931abde3959fca6933b67ea2cb36a186b9984cb7 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 17:51:36 +0530 Subject: [PATCH 7/7] fix: XYNE-47 resolved comments --- server/api/knowledgeBase.ts | 9 ++++--- server/config.ts | 4 +++ server/integrations/dataSource/errors.ts | 4 +-- server/integrations/dataSource/index.ts | 18 ++++++------- server/integrations/google/index.ts | 9 ++++--- server/integrations/google/worker-utils.ts | 26 ++++++++++++------- .../microsoft/attachment-utils.ts | 26 ++++++++++++------- server/lib/sqlInference.ts | 12 ++++++--- server/services/fileProcessor.ts | 4 ++- 9 files changed, 70 insertions(+), 42 deletions(-) diff --git a/server/api/knowledgeBase.ts b/server/api/knowledgeBase.ts index c19aff78c..aea0684c2 100644 --- a/server/api/knowledgeBase.ts +++ b/server/api/knowledgeBase.ts @@ -59,6 +59,7 @@ import { import { getAuth, safeGet } from "./agent" import { ApiKeyScopes, UploadStatus } from "@/shared/types" import { expandSheetIds } from "./chat/chat" +import { checkFileSize } from "@/integrations/dataSource" const EXTENSION_MIME_MAP: Record = { ".pdf": "application/pdf", @@ -102,7 +103,7 @@ const { JwtPayloadKey } = config // Storage configuration for Knowledge Base feature files const KB_STORAGE_ROOT = join(process.cwd(), "storage", "kb_files") -const MAX_FILE_SIZE = 100 * 1024 * 1024 // 100MB max file size +const MAX_FILE_SIZE = 100 // 100MB max file size const MAX_FILES_PER_REQUEST = 100 // Maximum files per upload request // Initialize storage directory for Knowledge Base files @@ -1195,12 +1196,14 @@ export const UploadFilesApi = async (c: Context) => { let storagePath = "" try { // Validate file size - if (file.size > MAX_FILE_SIZE) { + try{ + checkFileSize(file.size, MAX_FILE_SIZE) + } catch (error) { uploadResults.push({ success: false, fileName: file.name, parentId: targetParentId, - message: `Skipped: File too large (${Math.round(file.size / 1024 / 1024)}MB). Maximum size is ${Math.round(MAX_FILE_SIZE / 1024 / 1024)}MB`, + message: `Skipped: File too large (${Math.round(file.size / 1024 / 1024)}MB). Maximum size is ${MAX_FILE_SIZE}MB`, }) loggerWithChild({ email: userEmail }).info( `Skipped large file: ${file.name} (${file.size} bytes)`, diff --git a/server/config.ts b/server/config.ts index a74a741f1..768c01373 100644 --- a/server/config.ts +++ b/server/config.ts @@ -48,6 +48,7 @@ let VertexRegion = "" let VertexAIModel = "" let aiProviderBaseUrl = "" let isReasoning = false +let sqlInferenceModel = "" // File processing worker configuration let fileProcessingWorkerThreads = parseInt(process.env.FILE_PROCESSING_WORKER_THREADS || "4", 10) @@ -74,6 +75,7 @@ if (process.env["AWS_ACCESS_KEY"] && process.env["AWS_SECRET_KEY"]) { AwsSecretKey = process.env["AWS_SECRET_KEY"] defaultFastModel = Models.Claude_3_5_Haiku defaultBestModel = Models.Claude_Sonnet_4 + sqlInferenceModel = Models.Claude_Sonnet_4 } else if (process.env["OPENAI_API_KEY"]) { if (process.env["BASE_URL"]) { if (!isURLValid(process.env["BASE_URL"])) { @@ -136,6 +138,7 @@ if (process.env["AWS_ACCESS_KEY"] && process.env["AWS_SECRET_KEY"]) { defaultBestModel = process.env["VERTEX_BEST_MODEL"] ? 
(process.env["VERTEX_BEST_MODEL"] as Models) : Models.Vertex_Claude_Sonnet_4 // Default best model + sqlInferenceModel = Models.Vertex_Claude_Sonnet_4 } let StartThinkingToken = "" let EndThinkingToken = "" @@ -187,6 +190,7 @@ export default { GeminiAIModel, GeminiApiKey, VertexAIModel, + sqlInferenceModel, VertexProjectId, VertexRegion, aiProviderBaseUrl, diff --git a/server/integrations/dataSource/errors.ts b/server/integrations/dataSource/errors.ts index 639dbdd47..738bedfd4 100644 --- a/server/integrations/dataSource/errors.ts +++ b/server/integrations/dataSource/errors.ts @@ -92,10 +92,10 @@ export const createFileValidationError = (file: File): FileValidationError => { } export const createFileSizeError = ( - file: File, + size: number, maxSizeMB: number, ): FileSizeExceededError => { - const actualSizeMB = file.size / (1024 * 1024) + const actualSizeMB = size / (1024 * 1024) return new FileSizeExceededError(maxSizeMB, actualSizeMB) } diff --git a/server/integrations/dataSource/index.ts b/server/integrations/dataSource/index.ts index 67ce63831..ee3f6708c 100644 --- a/server/integrations/dataSource/index.ts +++ b/server/integrations/dataSource/index.ts @@ -91,10 +91,10 @@ const validateFile = (file: File): void => { } } -const checkFileSize = (file: File, maxFileSizeMB: number): void => { - const fileSizeMB = file.size / (1024 * 1024) +export const checkFileSize = (size: number, maxFileSizeMB: number): void => { + const fileSizeMB = size / (1024 * 1024) if (fileSizeMB > maxFileSizeMB) { - throw createFileSizeError(file, maxFileSizeMB) + throw createFileSizeError(size, maxFileSizeMB) } } @@ -479,7 +479,7 @@ export const handleDataSourceFileUpload = async ( `LLM API endpoint is not set. Skipping image: ${options.fileName}`, ) } - checkFileSize(file, DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB) + checkFileSize(file.size, DATASOURCE_CONFIG.MAX_IMAGE_FILE_SIZE_MB) const imageBuffer = Buffer.from(await file.arrayBuffer()) const type = await imageType(new Uint8Array(imageBuffer)) if (!type || !DATASOURCE_CONFIG.SUPPORTED_IMAGE_TYPES.has(type.mime)) { @@ -514,26 +514,26 @@ export const handleDataSourceFileUpload = async ( } else { // Process based on file type if (mimeType === "application/pdf") { - checkFileSize(file, DATASOURCE_CONFIG.MAX_PDF_FILE_SIZE_MB) + checkFileSize(file.size, DATASOURCE_CONFIG.MAX_PDF_FILE_SIZE_MB) const fileBuffer = new Uint8Array(await file.arrayBuffer()) const processedFile = await processPdfContent(fileBuffer, options) processedFiles = [processedFile] } else if (isDocxFile(mimeType)) { - checkFileSize(file, DATASOURCE_CONFIG.MAX_DOCX_FILE_SIZE_MB) + checkFileSize(file.size, DATASOURCE_CONFIG.MAX_DOCX_FILE_SIZE_MB) const fileBuffer = new Uint8Array(await file.arrayBuffer()) const processedFile = await processDocxContent(fileBuffer, options) processedFiles = [processedFile] } else if (isPptxFile(mimeType)) { - checkFileSize(file, DATASOURCE_CONFIG.MAX_PPTX_FILE_SIZE_MB) + checkFileSize(file.size, DATASOURCE_CONFIG.MAX_PPTX_FILE_SIZE_MB) const fileBuffer = new Uint8Array(await file.arrayBuffer()) const processedFile = await processPptxContent(fileBuffer, options) processedFiles = [processedFile] } else if (isSheetFile(mimeType)) { - checkFileSize(file, DATASOURCE_CONFIG.MAX_SPREADSHEET_FILE_SIZE_MB) + checkFileSize(file.size, DATASOURCE_CONFIG.MAX_SPREADSHEET_FILE_SIZE_MB) const fileBuffer = Buffer.from(await file.arrayBuffer()) processedFiles = await processSheetContent(fileBuffer, options) } else if (isTextFile(mimeType)) { - checkFileSize(file, 
DATASOURCE_CONFIG.MAX_TEXT_FILE_SIZE_MB) + checkFileSize(file.size, DATASOURCE_CONFIG.MAX_TEXT_FILE_SIZE_MB) const content = await file.text() const processedFile = await processTextContent(content, options) processedFiles = [processedFile] diff --git a/server/integrations/google/index.ts b/server/integrations/google/index.ts index e28576837..473494cff 100644 --- a/server/integrations/google/index.ts +++ b/server/integrations/google/index.ts @@ -1074,6 +1074,7 @@ import { totalIngestedMails, } from "@/metrics/google/gmail-metrics" import { chunkSheetWithHeaders } from "@/sheetChunk" +import { checkFileSize } from "../dataSource" const stats = z.object({ type: z.literal(WorkerResponseTypes.Stats), @@ -2188,11 +2189,11 @@ export const getSheetsListFromOneSpreadsheet = async ( ): Promise => { // Early size check before fetching spreadsheet data const sizeInBytes = spreadsheet.size ? parseInt(spreadsheet.size, 10) : 0 - if (!isNaN(sizeInBytes) && sizeInBytes > MAX_GD_SHEET_SIZE) { - const sizeInMB = (sizeInBytes / (1024 * 1024)).toFixed(2) - const maxSizeInMB = (MAX_GD_SHEET_SIZE / (1024 * 1024)).toFixed(2) + try { + checkFileSize(sizeInBytes, MAX_GD_SHEET_SIZE) + } catch (error) { loggerWithChild({ email: userEmail }).warn( - `Ignoring ${spreadsheet.name} as its size (${sizeInMB} MB) exceeds the limit of ${maxSizeInMB} MB`, + `Ignoring ${spreadsheet.name} as its size (${Math.round(sizeInBytes / 1024 / 1024)} MB) exceeds the limit of ${MAX_GD_SHEET_SIZE} MB`, ) return [] } diff --git a/server/integrations/google/worker-utils.ts b/server/integrations/google/worker-utils.ts index dcd7d3e1e..746dd8353 100644 --- a/server/integrations/google/worker-utils.ts +++ b/server/integrations/google/worker-utils.ts @@ -20,6 +20,7 @@ import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks" import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini" import { chunkSheetWithHeaders } from "@/sheetChunk" +import { checkFileSize } from "../dataSource" const Logger = getLogger(Subsystem.Integrations).child({ module: "google" }) @@ -132,8 +133,9 @@ export const getGmailAttachmentChunks = async ( } if (mimeType === "application/pdf") { - const fileSizeMB = size.value / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_PDF_SIZE) { + try { + checkFileSize(size.value, MAX_ATTACHMENT_PDF_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_PDF_SIZE} MB`, ) @@ -153,8 +155,9 @@ export const getGmailAttachmentChunks = async ( "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || mimeType === "application/msword" ) { - const fileSizeMB = size.value / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_DOCX_SIZE) { + try { + checkFileSize(size.value, MAX_ATTACHMENT_DOCX_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_DOCX_SIZE} MB`, ) @@ -174,8 +177,9 @@ export const getGmailAttachmentChunks = async ( "application/vnd.openxmlformats-officedocument.presentationml.presentation" || mimeType === "application/vnd.ms-powerpoint" ) { - const fileSizeMB = size.value / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_PPTX_SIZE) { + try { + checkFileSize(size.value, MAX_ATTACHMENT_PPTX_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_PPTX_SIZE} MB`, ) @@ -195,8 +199,9 @@ export const getGmailAttachmentChunks = async ( mimeType === "text/html" || mimeType === "text/markdown" 
) { - const fileSizeMB = size.value / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_TEXT_SIZE) { + try { + checkFileSize(size.value, MAX_ATTACHMENT_TEXT_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_TEXT_SIZE} MB`, ) @@ -322,8 +327,9 @@ export const getGmailSpreadsheetSheets = async ( mimeType === "application/vnd.ms-excel" || mimeType === "text/csv" ) { - const fileSizeMB = size.value / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_SHEET_SIZE) { + try { + checkFileSize(size.value, MAX_ATTACHMENT_SHEET_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_SHEET_SIZE} MB`, ) diff --git a/server/integrations/microsoft/attachment-utils.ts b/server/integrations/microsoft/attachment-utils.ts index 3a6a6c9b5..1b62276b2 100644 --- a/server/integrations/microsoft/attachment-utils.ts +++ b/server/integrations/microsoft/attachment-utils.ts @@ -19,6 +19,7 @@ import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks" import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini" import { makeGraphApiCall, type MicrosoftGraphClient } from "./client" import { chunkSheetWithHeaders } from "@/sheetChunk" +import { checkFileSize } from "../dataSource" const Logger = getLogger(Subsystem.Integrations).child({ module: "microsoft-attachments", @@ -194,8 +195,9 @@ export const getOutlookAttachmentChunks = async ( // Process based on MIME type if (mimeType === "application/pdf") { - const fileSizeMB = size / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_PDF_SIZE) { + try { + checkFileSize(size, MAX_ATTACHMENT_PDF_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_PDF_SIZE} MB`, ) @@ -215,8 +217,9 @@ export const getOutlookAttachmentChunks = async ( "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || mimeType === "application/msword" ) { - const fileSizeMB = size / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_DOCX_SIZE) { + try { + checkFileSize(size, MAX_ATTACHMENT_DOCX_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_DOCX_SIZE} MB`, ) @@ -236,8 +239,9 @@ export const getOutlookAttachmentChunks = async ( "application/vnd.openxmlformats-officedocument.presentationml.presentation" || mimeType === "application/vnd.ms-powerpoint" ) { - const fileSizeMB = size / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_PPTX_SIZE) { + try { + checkFileSize(size, MAX_ATTACHMENT_PPTX_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_PPTX_SIZE} MB`, ) @@ -257,8 +261,9 @@ export const getOutlookAttachmentChunks = async ( mimeType === "text/html" || mimeType === "text/markdown" ) { - const fileSizeMB = size / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_TEXT_SIZE) { + try { + checkFileSize(size, MAX_ATTACHMENT_TEXT_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_TEXT_SIZE} MB`, ) @@ -360,8 +365,9 @@ export const getOutlookSpreadsheetSheets = async ( mimeType === "application/vnd.ms-excel" || mimeType === "text/csv" ) { - const fileSizeMB = size / (1024 * 1024) - if (fileSizeMB > MAX_ATTACHMENT_SHEET_SIZE) { + try { + checkFileSize(size, MAX_ATTACHMENT_SHEET_SIZE) + } catch (error) { Logger.error( `Ignoring ${filename} as its more than ${MAX_ATTACHMENT_SHEET_SIZE} MB`, ) diff --git a/server/lib/sqlInference.ts b/server/lib/sqlInference.ts index ad2235bb3..0dc47b688 100644 --- 
a/server/lib/sqlInference.ts +++ b/server/lib/sqlInference.ts @@ -2,8 +2,9 @@ import { getLogger } from "@/logger" import { Subsystem } from "@/types" import type { DuckDBQuery } from "@/types"; import { getProviderByModel } from "@/ai/provider" -import { Models } from "@/ai/types" +import type { Models } from "@/ai/types" import { type Message } from "@aws-sdk/client-bedrock-runtime" +import config from "@/config" const Logger = getLogger(Subsystem.Integrations).child({ module: "sqlInference", @@ -23,6 +24,11 @@ export const analyzeQueryAndGenerateSQL = async ( schema: string, fewShotSamples: string ): Promise => { + const model : Models = config.sqlInferenceModel as Models + if (!model) { + Logger.warn("SQL inference model not set, returning null"); + return null; + } Logger.debug(`Analyzing query and generating SQL`); const stripNoise = (s: string) => { @@ -68,7 +74,7 @@ schema: ${schema} ${fewShotSamples}`; try { - const provider = getProviderByModel(Models.Vertex_Claude_Sonnet_4); + const provider = getProviderByModel(model); const messages: Message[] = [ { @@ -78,7 +84,7 @@ ${fewShotSamples}`; ] const modelParams = { - modelId: Models.Vertex_Claude_Sonnet_4, + modelId: model, temperature: 0.1, max_new_tokens: 512, stream: false, diff --git a/server/services/fileProcessor.ts b/server/services/fileProcessor.ts index 999dbc472..a55315356 100644 --- a/server/services/fileProcessor.ts +++ b/server/services/fileProcessor.ts @@ -37,6 +37,8 @@ export interface SheetProcessingResult extends ProcessingResult { docId: string } +type ProcessingResultArray = (ProcessingResult | SheetProcessingResult)[] + export class FileProcessorService { static async processFile( @@ -47,7 +49,7 @@ export class FileProcessorService { storagePath?: string, extractImages: boolean = false, describeImages: boolean = false, - ): Promise<(ProcessingResult | SheetProcessingResult)[]> { + ): Promise { const baseMimeType = getBaseMimeType(mimeType || "text/plain") let chunks: string[] = [] let chunks_pos: number[] = []
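A minimal usage sketch of the validator added in server/lib/sqlValidator.ts, mirroring how server/lib/duckdb.ts calls it before executing the generated query. The view name "sheet_data" and both SQL strings are illustrative assumptions; only validateSQLQuery's signature, options, and result shape come from the patch.

// Guarding an LLM-generated query before it is handed to connection.all()
import { validateSQLQuery } from "@/lib/sqlValidator"

const viewName = "sheet_data" // assumed name of the DuckDB view created over the temp TSV

// Expected to pass: a single SELECT that only touches the allowed view
const ok = validateSQLQuery(
  `SELECT region, SUM(revenue) AS total FROM ${viewName} GROUP BY region`,
  viewName,
  { allowSubqueries: true, allowJoins: false, allowWindowFunctions: true, allowCTEs: true },
)
if (ok.isValid) {
  const finalSQL = ok.sanitizedSQL ?? "" // this is what duckdb.ts passes to connection.all()
}

// Expected to fail: references a table other than the allowed view
const bad = validateSQLQuery("SELECT * FROM other_table", viewName)
// bad.isValid === false; bad.error reports that only the allowed view is permitted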
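The checkFileSize helper exported from server/integrations/dataSource/index.ts takes the raw size in bytes and the limit in megabytes (it divides by 1024 * 1024 before comparing) and throws a FileSizeExceededError when the limit is crossed. A short sketch of the call pattern the attachment and worker hunks above switch to; the 10 MB limit and the byte count are illustrative assumptions.

import { checkFileSize } from "@/integrations/dataSource"

const sizeInBytes = 25 * 1024 * 1024 // assumed: a 25 MB attachment

try {
  checkFileSize(sizeInBytes, 10) // first argument is bytes, second is the limit in MB
} catch (error) {
  // oversized file: the integrations above log a warning and skip ingestion
}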