From ef19354c38b753050adbf58c5bd28746ce3af460 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 15:36:17 +0530 Subject: [PATCH 01/17] refactor: XYNE-103 Attachments are moved to file.sd schema --- frontend/src/components/AttachmentPreview.tsx | 2 +- frontend/src/components/ChatBox.tsx | 30 ++- frontend/src/lib/common.tsx | 26 +-- frontend/src/utils/fileUtils.ts | 26 +-- server/api/chat/agents.ts | 3 +- server/api/chat/chat.ts | 104 +-------- server/api/files.ts | 214 +++++++++++++++--- server/api/search.ts | 6 +- server/server.ts | 3 + server/shared/fileUtils.ts | 24 ++ server/shared/types.ts | 18 +- server/vespa/schemas/file.sd | 63 +++++- 12 files changed, 323 insertions(+), 196 deletions(-) diff --git a/frontend/src/components/AttachmentPreview.tsx b/frontend/src/components/AttachmentPreview.tsx index fe744cc0f..9a739c945 100644 --- a/frontend/src/components/AttachmentPreview.tsx +++ b/frontend/src/components/AttachmentPreview.tsx @@ -10,7 +10,7 @@ import { DialogHeader, DialogTitle, } from "@/components/ui/dialog" -import { getFileType } from "@/utils/fileUtils" +import { getFileType } from "shared/fileUtils" import { getFileIcon } from "@/components/ChatBox" interface AttachmentPreviewProps { diff --git a/frontend/src/components/ChatBox.tsx b/frontend/src/components/ChatBox.tsx index 2af90dbf8..aecde7b7b 100644 --- a/frontend/src/components/ChatBox.tsx +++ b/frontend/src/components/ChatBox.tsx @@ -74,8 +74,8 @@ import { validateAndDeduplicateFiles, createImagePreview, cleanupPreviewUrls, - getFileType, } from "@/utils/fileUtils" +import { getFileType } from "shared/fileUtils" import { authFetch } from "@/utils/authFetch" interface SelectedFile { @@ -977,15 +977,37 @@ export const ChatBox = React.forwardRef( return ext || "file" } - const removeFile = useCallback((id: string) => { + const removeFile = useCallback(async (id: string) => { + const fileToRemove = selectedFiles.find((f) => f.id === id) + + // If the file has metadata with fileId (meaning it's already uploaded), delete it from the server + if (fileToRemove?.metadata?.fileId) { + try { + const response = await api.files.delete.$post({ + json: { + attachment: fileToRemove.metadata + } + }) + + if (!response.ok) { + const errorText = await response.text() + console.error(`Failed to delete attachment: ${errorText}`) + // Still remove from UI even if server deletion fails + } + } catch (error) { + console.error('Error deleting attachment:', error) + // Still remove from UI even if server deletion fails + } + } + + // Remove from UI setSelectedFiles((prev) => { - const fileToRemove = prev.find((f) => f.id === id) if (fileToRemove?.preview) { URL.revokeObjectURL(fileToRemove.preview) } return prev.filter((f) => f.id !== id) }) - }, []) + }, [selectedFiles]) const { handleFileSelect, handleFileChange } = createFileSelectionHandlers( fileInputRef, diff --git a/frontend/src/lib/common.tsx b/frontend/src/lib/common.tsx index 5470fa386..604c045ec 100644 --- a/frontend/src/lib/common.tsx +++ b/frontend/src/lib/common.tsx @@ -43,10 +43,9 @@ import { DataSourceEntity, WebSearchEntity, FileType, - MIME_TYPE_MAPPINGS, - EXTENSION_MAPPINGS, } from "shared/types" import { LoadingSpinner } from "@/routes/_authenticated/admin/integrations/google" +import { getFileType } from "shared/fileUtils" // Define placeholder entities if they don't exist in shared/types const PdfEntity = { Default: "pdf_default" } as const @@ -165,30 +164,9 @@ export const getIcon = ( } } -// Helper function to determine FileType from a file -export const getFileType = (file: File): FileType => { - // First check by MIME type - for (const [fileType, mimeTypes] of Object.entries(MIME_TYPE_MAPPINGS)) { - if ((mimeTypes as readonly string[]).includes(file.type)) { - return fileType as FileType - } - } - - // Fallback to extension checking - const fileName = file.name.toLowerCase() - for (const [fileType, extensions] of Object.entries(EXTENSION_MAPPINGS)) { - if ((extensions as readonly string[]).some(ext => fileName.endsWith(ext))) { - return fileType as FileType - } - } - - // Default fallback - return FileType.FILE -} - // Icon mapping from FileType to SVG component export const getFileIcon = (file: File) => { - const fileType = getFileType(file) + const fileType = getFileType({type: file.type, name: file.name}) switch (fileType) { case FileType.TEXT: diff --git a/frontend/src/utils/fileUtils.ts b/frontend/src/utils/fileUtils.ts index 17ccd976e..98760ef18 100644 --- a/frontend/src/utils/fileUtils.ts +++ b/frontend/src/utils/fileUtils.ts @@ -1,35 +1,11 @@ import { isValidFile, isImageFile } from "shared/fileUtils" import { SelectedFile } from "@/components/ClFileUpload" import { authFetch } from "./authFetch" -import { FileType, MIME_TYPE_MAPPINGS, EXTENSION_MAPPINGS, UploadStatus } from "shared/types" +import { UploadStatus } from "shared/types" // Generate unique ID for files export const generateFileId = () => Math.random().toString(36).substring(2, 9) -export const getFileType = ({ type, name }: { type: string, name: string }): FileType => { - const fileName = name.toLowerCase() - const mimeType = type.toLowerCase() - const baseMime = mimeType.split(";")[0] - - // Check each file type category using the mappings - for (const [fileType, mimeTypes] of Object.entries(MIME_TYPE_MAPPINGS)) { - // Check MIME type first (more reliable) - if (mimeTypes.some(mime => baseMime === mime)) { - return fileType as FileType - } - } - - // Fallback to extension-based detection - for (const [fileType, extensions] of Object.entries(EXTENSION_MAPPINGS)) { - if (extensions.some(ext => fileName.endsWith(ext))) { - return fileType as FileType - } - } - - // Default fallback - return FileType.FILE -} - // Create preview URL for image files export const createImagePreview = (file: File): string | undefined => { if (isImageFile(file.type)) { diff --git a/server/api/chat/agents.ts b/server/api/chat/agents.ts index 701f05e1c..296e089fb 100644 --- a/server/api/chat/agents.ts +++ b/server/api/chat/agents.ts @@ -83,6 +83,7 @@ import { VespaChatUserSchema, type VespaSearchResult, type VespaSearchResults, + AttachmentEntity, } from "@xyne/vespa-ts/types" import { APIError } from "openai" import { insertChatTrace } from "@/db/chatTrace" @@ -391,7 +392,7 @@ const checkAndYieldCitationsForAgent = async function* ( } // we dont want citations for attachments in the chat - if (item.source.entity === KnowledgeBaseEntity.Attachment) { + if (Object.values(AttachmentEntity).includes(item.source.entity as AttachmentEntity)) { continue } diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index 121b2bafe..019aaf7b6 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -128,6 +128,8 @@ import { type VespaSearchResultsSchema, KnowledgeBaseEntity, KbItemsSchema, + AttachmentEntity, + fileSchema, } from "@xyne/vespa-ts/types" import { APIError } from "openai" import { @@ -213,6 +215,7 @@ import { getDateForAI } from "@/utils/index" import type { User } from "@microsoft/microsoft-graph-types" import { getAuth, safeGet } from "../agent" import { getChunkCountPerDoc } from "./chunk-selection" +import { handleAttachmentDelete } from "../files" const METADATA_NO_DOCUMENTS_FOUND = "METADATA_NO_DOCUMENTS_FOUND_INTERNAL" const METADATA_FALLBACK_TO_RAG = "METADATA_FALLBACK_TO_RAG_INTERNAL" @@ -485,7 +488,7 @@ const checkAndYieldCitations = async function* ( const f = (item as any)?.fields if ( f?.sddocname === dataSourceFileSchema || - f?.entity === KnowledgeBaseEntity.Attachment + Object.values(AttachmentEntity).includes(f?.entity) ) { // Skip datasource and attachment files from citations continue @@ -770,106 +773,13 @@ export const ChatDeleteApi = async (c: Context) => { throw new HTTPException(404, { message: "Chat not found" }) } - // Get all messages for the chat to find attachments + // Get all messages for the chat to delete attachments const messagesToDelete = await getChatMessagesWithAuth(tx, chatId, email) - // Collect all attachment file IDs that need to be deleted - const imageAttachmentFileIds: string[] = [] - const nonImageAttachmentFileIds: string[] = [] - for (const message of messagesToDelete) { if (message.attachments && Array.isArray(message.attachments)) { - const attachments = - message.attachments as unknown as AttachmentMetadata[] - for (const attachment of attachments) { - if (attachment && typeof attachment === "object") { - if (attachment.fileId) { - // Check if this is an image attachment using both isImage field and fileType - const isImageAttachment = - attachment.isImage || - (attachment.fileType && isImageFile(attachment.fileType)) - - if (isImageAttachment) { - imageAttachmentFileIds.push(attachment.fileId) - } else { - nonImageAttachmentFileIds.push(attachment.fileId) - } - } - } - } - } - } - - // Delete image attachments and their thumbnails from disk - if (imageAttachmentFileIds.length > 0) { - loggerWithChild({ email: email }).info( - `Deleting ${imageAttachmentFileIds.length} image attachment files and their thumbnails for chat ${chatId}`, - ) - - for (const fileId of imageAttachmentFileIds) { - try { - // Validate fileId to prevent path traversal - if ( - fileId.includes("..") || - fileId.includes("/") || - fileId.includes("\\") - ) { - loggerWithChild({ email: email }).error( - `Invalid fileId detected: ${fileId}. Skipping deletion for security.`, - ) - continue - } - const imageBaseDir = path.resolve( - process.env.IMAGE_DIR || "downloads/xyne_images_db", - ) - - const imageDir = path.join(imageBaseDir, fileId) - try { - await fs.access(imageDir) - await fs.rm(imageDir, { recursive: true, force: true }) - loggerWithChild({ email: email }).info( - `Deleted image attachment directory: ${imageDir}`, - ) - } catch (attachmentError) { - loggerWithChild({ email: email }).warn( - `Image attachment file ${fileId} not found in either directory during chat deletion`, - ) - } - } catch (error) { - loggerWithChild({ email: email }).error( - error, - `Failed to delete image attachment file ${fileId} during chat deletion: ${getErrorMessage(error)}`, - ) - } - } - } - - // Delete non-image attachments from Vespa kb_items schema - if (nonImageAttachmentFileIds.length > 0) { - loggerWithChild({ email: email }).info( - `Deleting ${nonImageAttachmentFileIds.length} non-image attachments from Vespa kb_items schema for chat ${chatId}`, - ) - - for (const fileId of nonImageAttachmentFileIds) { - try { - // Delete from Vespa kb_items schema using the proper Vespa function - await DeleteDocument(fileId, KbItemsSchema) - loggerWithChild({ email: email }).info( - `Successfully deleted non-image attachment ${fileId} from Vespa kb_items schema`, - ) - } catch (error) { - const errorMessage = getErrorMessage(error) - if (errorMessage.includes("404 Not Found")) { - loggerWithChild({ email: email }).warn( - `Non-image attachment ${fileId} not found in Vespa kb_items schema (may have been already deleted)`, - ) - } else { - loggerWithChild({ email: email }).error( - error, - `Failed to delete non-image attachment ${fileId} from Vespa kb_items schema: ${errorMessage}`, - ) - } - } + const attachments = message.attachments as AttachmentMetadata[] + await handleAttachmentDelete(attachments, email) } } diff --git a/server/api/files.ts b/server/api/files.ts index 23e66e967..561d79fb7 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -5,25 +5,31 @@ import { getLogger, getLoggerWithChild } from "@/logger" import { Subsystem } from "@/types" import { type DataSourceUploadResult, + DeleteImages, handleSingleFileUploadToDataSource, } from "@/api/dataSource" import { getUserByEmail } from "@/db/user" import { db } from "@/db/client" import { checkIfDataSourceFileExistsByNameAndId, + DeleteDocument, getDataSourceByNameAndCreator, insert, + GetDocument, } from "../search/vespa" import { NoUserFound } from "@/errors" import config from "@/config" import { HTTPException } from "hono/http-exception" -import { isValidFile, isImageFile } from "shared/fileUtils" +import { isValidFile, isImageFile, getFileType } from "shared/fileUtils" import { generateThumbnail, getThumbnailPath } from "@/utils/image" -import type { AttachmentMetadata } from "@/shared/types" +import { attachmentFileTypeMap, type AttachmentMetadata } from "@/shared/types" import { FileProcessorService } from "@/services/fileProcessor" -import { Apps, KbItemsSchema, KnowledgeBaseEntity } from "@xyne/vespa-ts/types" +import { Apps, fileSchema, KbItemsSchema } from "@xyne/vespa-ts/types" import { getBaseMimeType } from "@/integrations/dataSource/config" import { isDataSourceError } from "@/integrations/dataSource/errors" +import { handleAttachmentDeleteSchema } from "./search" +import { getErrorMessage } from "@/utils" +import { promises as fs } from "node:fs" const { JwtPayloadKey } = config const loggerWithChild = getLoggerWithChild(Subsystem.Api, { module: "newApps" }) @@ -221,7 +227,7 @@ export const handleAttachmentUpload = async (c: Context) => { for (const file of files) { const fileBuffer = await file.arrayBuffer() - const fileId = `att_${crypto.randomUUID()}` + const fileId = `attf_${crypto.randomUUID()}` const ext = file.name.split(".").pop()?.toLowerCase() || "" const fullFileName = `${0}.${ext}` const isImage = isImageFile(file.type) @@ -244,7 +250,7 @@ export const handleAttachmentUpload = async (c: Context) => { thumbnailPath = getThumbnailPath(outputDir, fileId) await generateThumbnail(Buffer.from(fileBuffer), thumbnailPath) } else { - // For non-images: process through FileProcessorService and ingest into Vespa + // For non-images: process through FileProcessorService and ingest into file schema // Process the file content using FileProcessorService const processingResult = await FileProcessorService.processFile( @@ -257,56 +263,39 @@ export const handleAttachmentUpload = async (c: Context) => { false, ) - // TODO: Ingest the processed content into Vespa - // This would typically involve calling your Vespa ingestion service - // For now, we'll log the processing result - loggerWithChild({ email }).info( - `Processed non-image file "${file.name}" with ${processingResult.chunks.length} text chunks and ${processingResult.image_chunks.length} image chunks`, - ) - - const { chunks, chunks_pos, image_chunks, image_chunks_pos } = - processingResult + const { chunks, chunks_pos, image_chunks, image_chunks_pos } = processingResult const vespaDoc = { + title: file.name, + url: "", + app: Apps.Attachment, docId: fileId, - clId: "attachment", - itemId: fileId, - fileName: file.name, - app: Apps.KnowledgeBase as const, - entity: KnowledgeBaseEntity.Attachment, - description: "", - storagePath: "", + parentId: null, + owner: email, + photoLink: "", + ownerEmail: email, + entity: attachmentFileTypeMap[getFileType({ type: file.type, name: file.name })], chunks: chunks, chunks_pos: chunks_pos, image_chunks: image_chunks, image_chunks_pos: image_chunks_pos, - chunks_map: chunks.map((_, index) => ({ - chunk_index: index, - page_numbers: [0], - block_labels: [], - })), - image_chunks_map: image_chunks.map((_, index) => ({ - chunk_index: index, - page_numbers: [0], - block_labels: [], - })), + chunks_map: processingResult.chunks_map, + image_chunks_map: processingResult.image_chunks_map, + permissions: [email], + mimeType: getBaseMimeType(file.type || "text/plain"), metadata: JSON.stringify({ originalFileName: file.name, uploadedBy: email, chunksCount: chunks.length, - imageChunksCount: image_chunks.length, processingMethod: getBaseMimeType(file.type || "text/plain"), lastModified: Date.now(), + isAttachment: true, }), - createdBy: email, - duration: 0, - mimeType: getBaseMimeType(file.type || "text/plain"), - fileSize: file.size, createdAt: Date.now(), updatedAt: Date.now(), } - await insert(vespaDoc, KbItemsSchema) + await insert(vespaDoc, fileSchema) } // Create attachment metadata @@ -362,6 +351,157 @@ export const handleAttachmentUpload = async (c: Context) => { } } +export const handleAttachmentDelete = async (attachments: AttachmentMetadata [], email: string) => { + const imageAttachmentFileIds: string[] = [] + const nonImageAttachmentFileIds: string[] = [] + + for (const attachment of attachments) { + if (attachment && typeof attachment === "object") { + if (attachment.fileId) { + // Check if this is an image attachment using both isImage field and fileType + const isImageAttachment = + attachment.isImage || + (attachment.fileType && isImageFile(attachment.fileType)) + + if (isImageAttachment) { + imageAttachmentFileIds.push(attachment.fileId) + } else { + nonImageAttachmentFileIds.push(attachment.fileId) + } + } + } + } + + // Delete image attachments and their thumbnails from disk + if (imageAttachmentFileIds.length > 0) { + loggerWithChild({ email: email }).info( + `Deleting ${imageAttachmentFileIds.length} image attachment files and their thumbnails`, + ) + + for (const fileId of imageAttachmentFileIds) { + try { + // Validate fileId to prevent path traversal + if ( + fileId.includes("..") || + fileId.includes("/") || + fileId.includes("\\") + ) { + loggerWithChild({ email: email }).error( + `Invalid fileId detected: ${fileId}. Skipping deletion for security.`, + ) + continue + } + const imageBaseDir = path.resolve( + process.env.IMAGE_DIR || "downloads/xyne_images_db", + ) + + const imageDir = path.join(imageBaseDir, fileId) + try { + await fs.access(imageDir) + await fs.rm(imageDir, { recursive: true, force: true }) + loggerWithChild({ email: email }).info( + `Deleted image attachment directory: ${imageDir}`, + ) + } catch (attachmentError) { + loggerWithChild({ email: email }).warn( + `Image attachment file ${fileId} not found in either directory during chat deletion`, + ) + } + } catch (error) { + loggerWithChild({ email: email }).error( + error, + `Failed to delete image attachment file ${fileId} during chat deletion: ${getErrorMessage(error)}`, + ) + } + } + } + + // Delete non-image attachments from Vespa + if (nonImageAttachmentFileIds.length > 0) { + loggerWithChild({ email: email }).info( + `Deleting ${nonImageAttachmentFileIds.length} non-image attachments from Vespa`, + ) + + for (const fileId of nonImageAttachmentFileIds) { + try { + // Delete images from disk + try { + await DeleteImages(fileId) + } catch (error) { + loggerWithChild({ email: email }).warn( + `Failed to delete images from disk: ${fileId} - ${getErrorMessage(error)}`, + ) + } + // Delete from Vespa kb_items or file schema + if(fileId.startsWith("att_")) { + await DeleteDocument(fileId, KbItemsSchema) + } else { + await DeleteDocument(fileId, fileSchema) + } + loggerWithChild({ email: email }).info( + `Successfully deleted non-image attachment ${fileId} from Vespa`, + ) + } catch (error) { + const errorMessage = getErrorMessage(error) + if (errorMessage.includes("404 Not Found")) { + loggerWithChild({ email: email }).warn( + `Non-image attachment ${fileId} not found in Vespa (may have been already deleted)`, + ) + } else { + loggerWithChild({ email: email }).error( + error, + `Failed to delete non-image attachment ${fileId} from Vespa: ${errorMessage}`, + ) + } + } + } + } +} + +export const handleAttachmentDeleteApi = async (c: Context) => { + const { sub } = c.get(JwtPayloadKey) + const email = sub + + const { attachment } = handleAttachmentDeleteSchema.parse(await c.req.json()) + const fileId = attachment.fileId + const isImage = attachment.isImage || (attachment.fileType && isImageFile(attachment.fileType)) + if (!fileId) { + throw new HTTPException(400, { message: "File ID is required" }) + } + + if(isImage) { + await handleAttachmentDelete([attachment], email) + return c.json({ success: true, message: "Attachment deleted successfully" }) + } + + try { + // Get the attachment document from the file schema + const attachmentDoc = await GetDocument(fileSchema, fileId) + + if (!attachmentDoc || !attachmentDoc.fields) { + return c.json({ success: true, message: "Attachment already deleted" }) + } + + // Check permissions - file schema has permissions array + const fields = attachmentDoc.fields as any + const permissions = fields.permissions as string[] || [] + if (!permissions.includes(email)) { + throw new HTTPException(403, { message: "Access denied to this attachment" }) + } + + await handleAttachmentDelete([attachment], email) + + return c.json({ success: true, message: "Attachment deleted successfully" }) + + } catch (error) { + if (error instanceof HTTPException) { + throw error + } + loggerWithChild({ email }).error({ err: error }, "Error checking attachment permissions") + throw new HTTPException(500, { message: "Internal server error" }) + } +} + /** * Serve attachment file by fileId */ diff --git a/server/api/search.ts b/server/api/search.ts index 53bca117c..f05f6ed57 100644 --- a/server/api/search.ts +++ b/server/api/search.ts @@ -48,7 +48,7 @@ import { cleanContext, userContext, } from "@/ai/context" -import { AnswerSSEvents, AuthType, ConnectorStatus } from "@/shared/types" +import { AnswerSSEvents, attachmentMetadataSchema, AuthType, ConnectorStatus } from "@/shared/types" import { agentPromptPayloadSchema } from "@/shared/types" import { streamSSE } from "hono/streaming" import { getLogger, getLoggerWithChild } from "@/logger" @@ -256,6 +256,10 @@ export const generatePromptSchema = z.object({ }) +export const handleAttachmentDeleteSchema = z.object({ + attachment: attachmentMetadataSchema, +}) + export type GeneratePromptPayload = z.infer export const AutocompleteApi = async (c: Context) => { diff --git a/server/server.ts b/server/server.ts index ee2e4d258..c1e6d9ef9 100644 --- a/server/server.ts +++ b/server/server.ts @@ -21,6 +21,7 @@ import { chatTitleSchema, GetDriveItem, GetDriveItemsByDocIds, + handleAttachmentDeleteSchema, } from "@/api/search" import { callNotificationService } from "@/services/callNotifications" import { HighlightApi, highlightSchema } from "@/api/highlight" @@ -238,6 +239,7 @@ import { handleFileUpload, handleAttachmentServe, handleThumbnailServe, + handleAttachmentDeleteApi, } from "@/api/files" import { z } from "zod" // Ensure z is imported if not already at the top for schemas import { @@ -894,6 +896,7 @@ export const AppRoutes = app .post("/files/upload-attachment", handleAttachmentUpload) .get("/attachments/:fileId", handleAttachmentServe) .get("/attachments/:fileId/thumbnail", handleThumbnailServe) + .post("/files/delete", zValidator("json", handleAttachmentDeleteSchema), handleAttachmentDeleteApi) .post("/chat", zValidator("json", chatSchema), GetChatApi) .post( "/chat/generateTitle", diff --git a/server/shared/fileUtils.ts b/server/shared/fileUtils.ts index de0040837..1fcca1476 100644 --- a/server/shared/fileUtils.ts +++ b/server/shared/fileUtils.ts @@ -7,6 +7,30 @@ export const isImageFile = (fileType: string): boolean => { ) } +export const getFileType = ({ type, name }: { type: string, name: string }): FileType => { + const fileName = name.toLowerCase() + const mimeType = type.toLowerCase() + const baseMime = mimeType.split(";")[0] + + // Check each file type category using the mappings + for (const [fileType, mimeTypes] of Object.entries(MIME_TYPE_MAPPINGS)) { + // Check MIME type first (more reliable) + if (mimeTypes.some(mime => baseMime === mime)) { + return fileType as FileType + } + } + + // Fallback to extension-based detection + for (const [fileType, extensions] of Object.entries(EXTENSION_MAPPINGS)) { + if (extensions.some(ext => fileName.endsWith(ext))) { + return fileType as FileType + } + } + + // Default fallback + return FileType.FILE +} + export const isValidFile = (file: File) => { // Set size limits const maxGeneralSize = 40 * 1024 * 1024 // 40MB diff --git a/server/shared/types.ts b/server/shared/types.ts index 66cb6509f..ffa346be8 100644 --- a/server/shared/types.ts +++ b/server/shared/types.ts @@ -27,7 +27,7 @@ import { GooglePeopleEntity, SlackEntity, MicrosoftPeopleEntity, - + AttachmentEntity, } from "@xyne/vespa-ts/types" export { GooglePeopleEntity, @@ -36,6 +36,7 @@ export { CalendarEntity, MailAttachmentEntity, SlackEntity, + AttachmentEntity, Apps, isMailAttachment, SystemEntity, @@ -57,7 +58,10 @@ export type { } from "@xyne/vespa-ts/types" export type VespaFile = z.infer -export const FileEntitySchema = z.nativeEnum(DriveEntity) +export const FileEntitySchema = z.union([ + z.nativeEnum(DriveEntity), + z.nativeEnum(AttachmentEntity), +]) export const MailEntitySchema = z.nativeEnum(MailEntity) export const MailAttachmentEntitySchema = z.nativeEnum(MailAttachmentEntity) export const EventEntitySchema = z.nativeEnum(CalendarEntity) @@ -203,6 +207,16 @@ export const EXTENSION_MAPPINGS = { [FileType.TEXT]: [".txt", ".md"], } as const +export const attachmentFileTypeMap: Record = { + [FileType.DOCUMENT]: AttachmentEntity.Docs, + [FileType.SPREADSHEET]: AttachmentEntity.Sheets, + [FileType.PRESENTATION]: AttachmentEntity.PPT, + [FileType.PDF]: AttachmentEntity.PDF, + [FileType.TEXT]: AttachmentEntity.Text, + [FileType.IMAGE]: AttachmentEntity.Image, + [FileType.FILE]: AttachmentEntity.File, +} + export enum ApiKeyScopes { CREATE_AGENT = "CREATE_AGENT", AGENT_CHAT = "AGENT_CHAT", diff --git a/server/vespa/schemas/file.sd b/server/vespa/schemas/file.sd index 3dc224a89..221299326 100644 --- a/server/vespa/schemas/file.sd +++ b/server/vespa/schemas/file.sd @@ -1,4 +1,11 @@ schema file { + + struct chunk_meta { + field chunk_index type int {} + field page_numbers type array {} + field block_labels type array {} + } + document file { field docId type string { indexing: attribute | summary @@ -40,6 +47,29 @@ schema file { index: enable-bm25 } + field image_chunks type array { + indexing: index | summary + index { enable-bm25 } + } + + field chunks_pos type array { + indexing: attribute | summary + } + + field image_chunks_pos type array { + indexing: attribute | summary + } + + field chunks_map type array { + indexing: summary + struct-field chunk_index { indexing: attribute | summary } + } + + field image_chunks_map type array { + indexing: summary + struct-field chunk_index { indexing: attribute | summary } + } + field owner type string { indexing: attribute | summary } @@ -83,6 +113,12 @@ schema file { distance-metric: angular } } + + field image_chunk_embeddings type tensor(p{}, v[DIMS]) { + indexing: input image_chunks | embed | attribute | index + attribute: paged + attribute { distance-metric: angular } + } field title_fuzzy type string { @@ -134,27 +170,31 @@ schema file { } function vector_score() { - expression: closeness(field, chunk_embeddings) + expression: closeness(field, chunk_embeddings) + closeness(field, image_chunk_embeddings) } function combined_bm25() { - expression: bm25(title) + bm25(chunks) + expression: bm25(title) + bm25(chunks) + bm25(image_chunks) } function matchedFieldCount() { expression { # The `matches` returns 1 if the field contains a match, otherwise 0 - matches(title) + matches(chunks) + matches(title) + matches(chunks) + matches(image_chunks) } } function combined_nativeRank() { - expression: (nativeRank(title) + nativeRank(chunks)) / if(matchedFieldCount == 0, 1, matchedFieldCount) + expression: (nativeRank(title) + nativeRank(chunks) + nativeRank(image_chunks)) / if(matchedFieldCount == 0, 1, matchedFieldCount) } function chunk_scores() { expression: elementwise(bm25(chunks), x, double) } + + function image_chunk_scores() { + expression: elementwise(bm25(image_chunks), x, double) + } } @@ -181,6 +221,7 @@ schema file { nativeRank(title) nativeRank(chunks) chunk_scores + image_chunk_scores doc_recency } } @@ -249,6 +290,7 @@ schema file { nativeRank(title) nativeRank(chunks) chunk_scores + image_chunk_scores document_age_days_gs query(bin_size_days_gs) recency_bin_index_gs @@ -285,6 +327,7 @@ schema file { bm25(title) scale(combined_bm25) chunk_scores + image_chunk_scores } } @@ -311,6 +354,7 @@ schema file { nativeRank(title) nativeRank(chunks) chunk_scores + image_chunk_scores doc_recency } } @@ -325,6 +369,7 @@ schema file { nativeRank(title) nativeRank(chunks) chunk_scores + image_chunk_scores doc_recency } } @@ -349,6 +394,16 @@ schema file { bolding: on source: chunks } + summary image_chunks_summary { + bolding: on + source: image_chunks + } + summary chunks_pos_summary { + source: chunks_pos + } + summary image_chunks_pos_summary { + source: image_chunks_pos + } } document-summary autocomplete { From a9f2e577d651ed237d141aafdf4bce3701a63a61 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 18:02:11 +0530 Subject: [PATCH 02/17] refactor: XYNE-103 updated vespa package version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 6eb34ce86..d56050c1c 100644 --- a/package.json +++ b/package.json @@ -2,4 +2,4 @@ "dependencies": { "zustand": "^5.0.8" } -} \ No newline at end of file +} From 66b52ed7f254b89c5f33bb25c246723337c21a07 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 18:09:01 +0530 Subject: [PATCH 03/17] refactor: XYNE-103 updated vespa version in package --- server/api/files.ts | 2 +- server/package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/api/files.ts b/server/api/files.ts index 561d79fb7..8916ed553 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -484,7 +484,7 @@ export const handleAttachmentDeleteApi = async (c: Context) => { // Check permissions - file schema has permissions array const fields = attachmentDoc.fields as any - const permissions = fields.permissions as string[] || [] + const permissions = Array.isArray(fields.permissions) ? fields.permissions as string[] : [] if (!permissions.includes(email)) { throw new HTTPException(403, { message: "Access denied to this attachment" }) } diff --git a/server/package.json b/server/package.json index b7fba9631..d7cdacbe0 100644 --- a/server/package.json +++ b/server/package.json @@ -78,7 +78,7 @@ "@types/json-schema": "^7.0.15", "@types/jszip": "^3.4.1", "@types/node": "^24.3.0", - "@xyne/vespa-ts": "1.1.0", + "@xyne/vespa-ts": "1.1.2", "@xynehq/jaf": "^0.1.4", "ai": "^5.0.51", "arctic": "^3.3.0", From aa45a8e77505cf8a03354e368b99b2da49cb6266 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 19:20:10 +0530 Subject: [PATCH 04/17] refactor: XYNE-103 fixed build --- server/integrations/google/sync.ts | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/server/integrations/google/sync.ts b/server/integrations/google/sync.ts index 093edff9a..2ef30553e 100644 --- a/server/integrations/google/sync.ts +++ b/server/integrations/google/sync.ts @@ -369,11 +369,15 @@ export const handleGoogleDriveChange = async ( await insertWithRetry(data, fileSchema) } } else { - vespaData.permissions = toPermissionsList( + const permissionsAsString = toPermissionsList( vespaData.permissions, email, - ) - await insertWithRetry(vespaData, fileSchema) + ); + const vespaDataForInsert = { + ...vespaData, + permissions: permissionsAsString, + }; + await insertWithRetry(vespaDataForInsert, fileSchema); } } } catch (err) { From 3a56bc315bf4d19c91c28a010c7047d44305a480 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 19:36:52 +0530 Subject: [PATCH 05/17] refactor: XYNE-90 resolved ai comments --- server/api/chat/chat.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index a488c6f29..ac75c174e 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -216,6 +216,7 @@ import type { User } from "@microsoft/microsoft-graph-types" import { getAuth, safeGet } from "../agent" import { getChunkCountPerDoc } from "./chunk-selection" import { handleAttachmentDelete } from "../files" +import { A } from "ollama/dist/shared/ollama.6319775f.mjs" const METADATA_NO_DOCUMENTS_FOUND = "METADATA_NO_DOCUMENTS_FOUND_INTERNAL" const METADATA_FALLBACK_TO_RAG = "METADATA_FALLBACK_TO_RAG_INTERNAL" @@ -787,6 +788,7 @@ export const ChatDeleteApi = async (c: Context) => { email = sub || "" // @ts-ignore const { chatId } = c.req.valid("json") + const attachmentsToDelete: AttachmentMetadata[] = [] await db.transaction(async (tx) => { // Get the chat's internal ID first const chat = await getChatByExternalIdWithAuth(tx, chatId, email) @@ -800,7 +802,7 @@ export const ChatDeleteApi = async (c: Context) => { for (const message of messagesToDelete) { if (message.attachments && Array.isArray(message.attachments)) { const attachments = message.attachments as AttachmentMetadata[] - await handleAttachmentDelete(attachments, email) + attachmentsToDelete.push(...attachments) } } @@ -813,6 +815,9 @@ export const ChatDeleteApi = async (c: Context) => { await deleteMessagesByChatId(tx, chatId) await deleteChatByExternalIdWithAuth(tx, chatId, email) }) + if (attachmentsToDelete.length) { + await handleAttachmentDelete(attachmentsToDelete, email) + } return c.json({ success: true }) } catch (error) { const errMsg = getErrorMessage(error) From 151208c97d5cce37b1b40c3d6bfb50433907775a Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 19:47:51 +0530 Subject: [PATCH 06/17] refactor: XYNE-90 removed circular dependency --- server/api/chat/agents.ts | 2 +- server/api/chat/chat.ts | 23 +---------------------- server/api/files.ts | 2 +- server/api/knowledgeBase.ts | 2 +- server/search/utils.ts | 21 +++++++++++++++++++++ 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/server/api/chat/agents.ts b/server/api/chat/agents.ts index 10b9040e5..e3bbe5031 100644 --- a/server/api/chat/agents.ts +++ b/server/api/chat/agents.ts @@ -122,7 +122,6 @@ import { getModelValueFromLabel } from "@/ai/modelConfig" import { buildContext, buildUserQuery, - expandSheetIds, getThreadContext, isContextSelected, UnderstandMessageAndAnswer, @@ -157,6 +156,7 @@ import { getDateForAI } from "@/utils/index" import { validateVespaIdInAgentIntegrations } from "@/search/utils" import { getAuth, safeGet } from "../agent" import { applyFollowUpContext } from "@/utils/parseAttachment" +import { expandSheetIds } from "@/search/utils" const { JwtPayloadKey, defaultBestModel, diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index ac75c174e..8e45a79ac 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -216,32 +216,11 @@ import type { User } from "@microsoft/microsoft-graph-types" import { getAuth, safeGet } from "../agent" import { getChunkCountPerDoc } from "./chunk-selection" import { handleAttachmentDelete } from "../files" -import { A } from "ollama/dist/shared/ollama.6319775f.mjs" +import { expandSheetIds } from "@/search/utils" const METADATA_NO_DOCUMENTS_FOUND = "METADATA_NO_DOCUMENTS_FOUND_INTERNAL" const METADATA_FALLBACK_TO_RAG = "METADATA_FALLBACK_TO_RAG_INTERNAL" -export function expandSheetIds(fileId: string): string[] { - // Check if the fileId matches the pattern docId_sheet_number - const sheetMatch = fileId.match(/^(.+)_sheet_(\d+)$/) - - if (!sheetMatch) { - // Not a sheet ID, return as is - return [fileId] - } - - const [, docId, sheetNumberStr] = sheetMatch - const sheetNumber = parseInt(sheetNumberStr, 10) - // Generate IDs from docId_sheet_0 to docId_sheet_number - const expandedIds: string[] = [] - const upper = Number.isFinite(sheetNumber) ? sheetNumber : 1 - for (let i = 0; i < upper; i++) { - expandedIds.push(`${docId}_sheet_${i}`) - } - - return expandedIds -} - export async function resolveNamesToEmails( intent: Intent, email: string, diff --git a/server/api/files.ts b/server/api/files.ts index 7f7b80ac1..2dd38404b 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -30,7 +30,7 @@ import { isDataSourceError } from "@/integrations/dataSource/errors" import { handleAttachmentDeleteSchema } from "./search" import { getErrorMessage } from "@/utils" import { promises as fs } from "node:fs" -import { expandSheetIds } from "./chat/chat" +import { expandSheetIds } from "@/search/utils" const { JwtPayloadKey } = config const loggerWithChild = getLoggerWithChild(Subsystem.Api, { module: "newApps" }) diff --git a/server/api/knowledgeBase.ts b/server/api/knowledgeBase.ts index aea0684c2..e5fe96179 100644 --- a/server/api/knowledgeBase.ts +++ b/server/api/knowledgeBase.ts @@ -58,7 +58,7 @@ import { } from "@/integrations/dataSource/config" import { getAuth, safeGet } from "./agent" import { ApiKeyScopes, UploadStatus } from "@/shared/types" -import { expandSheetIds } from "./chat/chat" +import { expandSheetIds } from "@/search/utils" import { checkFileSize } from "@/integrations/dataSource" const EXTENSION_MIME_MAP: Record = { diff --git a/server/search/utils.ts b/server/search/utils.ts index ecb6ce382..c6072a4d3 100644 --- a/server/search/utils.ts +++ b/server/search/utils.ts @@ -20,6 +20,27 @@ import { sharedVespaService } from "./vespaService" const Logger = getLogger(Subsystem.Vespa).child({ module: "search-utils" }) +export function expandSheetIds(fileId: string): string[] { + // Check if the fileId matches the pattern docId_sheet_number + const sheetMatch = fileId.match(/^(.+)_sheet_(\d+)$/) + + if (!sheetMatch) { + // Not a sheet ID, return as is + return [fileId] + } + + const [, docId, sheetNumberStr] = sheetMatch + const sheetNumber = parseInt(sheetNumberStr, 10) + // Generate IDs from docId_sheet_0 to docId_sheet_number + const expandedIds: string[] = [] + const upper = Number.isFinite(sheetNumber) ? sheetNumber : 1 + for (let i = 0; i < upper; i++) { + expandedIds.push(`${docId}_sheet_${i}`) + } + + return expandedIds +} + export function removePrefixesFromItemIds(itemIds: string[]): string[] { return itemIds.map((itemId) => { // Remove prefixes: clfd-, clf-, cl- From 19986286e235cac8cf07b67663796faa1750df10 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 21:04:54 +0530 Subject: [PATCH 07/17] refactor: XYNE-103 fixed a bug in deletion --- server/api/files.ts | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/server/api/files.ts b/server/api/files.ts index 2dd38404b..62580c850 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -293,7 +293,7 @@ export const handleAttachmentUpload = async (c: Context) => { title: file.name, url: "", app: Apps.Attachment, - docId: fileId, + docId: docId, parentId: null, owner: email, photoLink: "", @@ -497,35 +497,16 @@ export const handleAttachmentDeleteApi = async (c: Context) => { const { attachment } = handleAttachmentDeleteSchema.parse(await c.req.json()) const fileId = attachment.fileId - const isImage = attachment.isImage || (attachment.fileType && isImageFile(attachment.fileType)) if (!fileId) { throw new HTTPException(400, { message: "File ID is required" }) } - if(isImage) { - await handleAttachmentDelete([attachment], email) - return c.json({ success: true, message: "Attachment deleted successfully" }) - } - try { - // Get the attachment document from the file schema - const attachmentDoc = await GetDocument(fileSchema, fileId) - - if (!attachmentDoc || !attachmentDoc.fields) { - return c.json({ success: true, message: "Attachment already deleted" }) - } - - // Check permissions - file schema has permissions array - const fields = attachmentDoc.fields as any - const permissions = Array.isArray(fields.permissions) ? fields.permissions as string[] : [] - if (!permissions.includes(email)) { - throw new HTTPException(403, { message: "Access denied to this attachment" }) + const vespaIds = expandSheetIds(fileId) + for (const vespaId of vespaIds) { + await handleAttachmentDelete([{ ...attachment, fileId: vespaId }], email) } - - await handleAttachmentDelete([attachment], email) - return c.json({ success: true, message: "Attachment deleted successfully" }) - } catch (error) { if (error instanceof HTTPException) { throw error From 6fc4a519fc5b7f006169be51e81f5ab374d0dc76 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 21:17:32 +0530 Subject: [PATCH 08/17] refactor: XYNE-103 fixed a bug in delete api --- server/api/files.ts | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/server/api/files.ts b/server/api/files.ts index 62580c850..39d05cabb 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -501,11 +501,8 @@ export const handleAttachmentDeleteApi = async (c: Context) => { throw new HTTPException(400, { message: "File ID is required" }) } - try { - const vespaIds = expandSheetIds(fileId) - for (const vespaId of vespaIds) { - await handleAttachmentDelete([{ ...attachment, fileId: vespaId }], email) - } + try { + await handleAttachmentDelete([attachment], email) return c.json({ success: true, message: "Attachment deleted successfully" }) } catch (error) { if (error instanceof HTTPException) { From 1b1d6d46704126be0d242eca1e9350edce8d910e Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 21:58:03 +0530 Subject: [PATCH 09/17] refactor: XYNE-103 added permission at delete --- server/api/files.ts | 73 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/server/api/files.ts b/server/api/files.ts index 39d05cabb..abb13f72a 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -251,6 +251,31 @@ export const handleAttachmentUpload = async (c: Context) => { // Generate thumbnail for images thumbnailPath = getThumbnailPath(outputDir, fileId) await generateThumbnail(Buffer.from(fileBuffer), thumbnailPath) + + const vespaDoc = { + title: file.name, + url: "", + app: Apps.Attachment, + docId: fileId, + parentId: null, + owner: email, + photoLink: "", + ownerEmail: email, + entity: attachmentFileTypeMap[getFileType({ type: file.type, name: file.name })], + chunks: [], + chunks_pos: [], + image_chunks: [], + image_chunks_pos: [], + chunks_map: [], + image_chunks_map: [], + permissions: [email], + mimeType: getBaseMimeType(file.type), + metadata: filePath, + createdAt: Date.now(), + updatedAt: Date.now(), + } + + await insert(vespaDoc, fileSchema) } else { // For non-images: process through FileProcessorService and ingest into file schema @@ -427,8 +452,14 @@ export const handleAttachmentDelete = async (attachments: AttachmentMetadata [], const imageDir = path.join(imageBaseDir, fileId) try { - await fs.access(imageDir) - await fs.rm(imageDir, { recursive: true, force: true }) + await db.transaction(async (tx) => { + await fs.access(imageDir) + await fs.rm(imageDir, { recursive: true, force: true }) + if(fileId.startsWith("attf_")) { + await DeleteDocument(fileId, fileSchema) + } + }) + loggerWithChild({ email: email }).info( `Deleted image attachment directory: ${imageDir}`, ) @@ -456,20 +487,16 @@ export const handleAttachmentDelete = async (attachments: AttachmentMetadata [], try { const vespaIds = expandSheetIds(fileId) for (const vespaId of vespaIds) { - // Delete images from disk - try { + await db.transaction(async (tx) => { + // Delete images from disk await DeleteImages(vespaId) - } catch (error) { - loggerWithChild({ email: email }).warn( - `Failed to delete images from disk: ${fileId} - ${getErrorMessage(error)}`, - ) - } - // Delete from Vespa kb_items or file schema - if(vespaId.startsWith("att_")) { - await DeleteDocument(vespaId, KbItemsSchema) - } else { - await DeleteDocument(vespaId, fileSchema) - } + // Delete from Vespa kb_items or file schema + if(vespaId.startsWith("att_")) { + await DeleteDocument(vespaId, KbItemsSchema) + } else { + await DeleteDocument(vespaId, fileSchema) + } + }) loggerWithChild({ email: email }).info( `Successfully deleted non-image attachment ${vespaId} from Vespa`, ) @@ -501,7 +528,21 @@ export const handleAttachmentDeleteApi = async (c: Context) => { throw new HTTPException(400, { message: "File ID is required" }) } - try { + try { + // Get the attachment document from the file schema + const attachmentDoc = await GetDocument(fileSchema, expandSheetIds(fileId)[0]) + + if (!attachmentDoc || !attachmentDoc.fields) { + return c.json({ success: true, message: "Attachment already deleted" }) + } + + // Check permissions - file schema has permissions array + const fields = attachmentDoc.fields as any + const permissions = Array.isArray(fields.permissions) ? fields.permissions as string[] : [] + if (!permissions.includes(email)) { + throw new HTTPException(403, { message: "Access denied to this attachment" }) + } + await handleAttachmentDelete([attachment], email) return c.json({ success: true, message: "Attachment deleted successfully" }) } catch (error) { From fb78401f7de286f2d5297699829e3f57fbb1caa4 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Wed, 8 Oct 2025 22:07:48 +0530 Subject: [PATCH 10/17] refactor: XYNE-103 removed unecessary transaction --- server/api/files.ts | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/server/api/files.ts b/server/api/files.ts index abb13f72a..be0005ec2 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -452,13 +452,9 @@ export const handleAttachmentDelete = async (attachments: AttachmentMetadata [], const imageDir = path.join(imageBaseDir, fileId) try { - await db.transaction(async (tx) => { - await fs.access(imageDir) - await fs.rm(imageDir, { recursive: true, force: true }) - if(fileId.startsWith("attf_")) { - await DeleteDocument(fileId, fileSchema) - } - }) + await fs.access(imageDir) + await fs.rm(imageDir, { recursive: true, force: true }) + await DeleteDocument(fileId, fileSchema) loggerWithChild({ email: email }).info( `Deleted image attachment directory: ${imageDir}`, @@ -487,16 +483,14 @@ export const handleAttachmentDelete = async (attachments: AttachmentMetadata [], try { const vespaIds = expandSheetIds(fileId) for (const vespaId of vespaIds) { - await db.transaction(async (tx) => { - // Delete images from disk - await DeleteImages(vespaId) - // Delete from Vespa kb_items or file schema - if(vespaId.startsWith("att_")) { - await DeleteDocument(vespaId, KbItemsSchema) - } else { - await DeleteDocument(vespaId, fileSchema) - } - }) + // Delete from Vespa kb_items or file schema + if(vespaId.startsWith("att_")) { + await DeleteDocument(vespaId, KbItemsSchema) + } else { + await DeleteDocument(vespaId, fileSchema) + } + // Delete images from disk + await DeleteImages(vespaId) loggerWithChild({ email: email }).info( `Successfully deleted non-image attachment ${vespaId} from Vespa`, ) From 878ad4a1291d004e6c5eff977911cf604ac80b5a Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Thu, 9 Oct 2025 16:30:12 +0530 Subject: [PATCH 11/17] refactor: XYNE-103 added new rank profile for attachments --- server/vespa/schemas/file.sd | 60 +++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/server/vespa/schemas/file.sd b/server/vespa/schemas/file.sd index 221299326..16c4780e6 100644 --- a/server/vespa/schemas/file.sd +++ b/server/vespa/schemas/file.sd @@ -170,27 +170,48 @@ schema file { } function vector_score() { - expression: closeness(field, chunk_embeddings) + closeness(field, image_chunk_embeddings) + expression: closeness(field, chunk_embeddings) } function combined_bm25() { - expression: bm25(title) + bm25(chunks) + bm25(image_chunks) + expression: bm25(title) + bm25(chunks) } function matchedFieldCount() { expression { # The `matches` returns 1 if the field contains a match, otherwise 0 - matches(title) + matches(chunks) + matches(image_chunks) + matches(title) + matches(chunks) } } function combined_nativeRank() { - expression: (nativeRank(title) + nativeRank(chunks) + nativeRank(image_chunks)) / if(matchedFieldCount == 0, 1, matchedFieldCount) + expression: (nativeRank(title) + nativeRank(chunks)) / if(matchedFieldCount == 0, 1, matchedFieldCount) } function chunk_scores() { expression: elementwise(bm25(chunks), x, double) } + } + + rank-profile initial_image inherits initial { + function vector_score_image() { + expression: vector_score() + closeness(field, image_chunk_embeddings) + } + + function combined_bm25_image() { + expression: combined_bm25() + bm25(image_chunks) + } + + function matchedFieldCount_image() { + expression { + # The `matches` returns 1 if the field contains a match, otherwise 0 + matchedFieldCount() + matches(image_chunks) + } + } + + function combined_nativeRank_image() { + expression: (nativeRank(title) + nativeRank(chunks) + nativeRank(image_chunks)) / if(matchedFieldCount_image == 0, 1, matchedFieldCount_image) + } function image_chunk_scores() { expression: elementwise(bm25(image_chunks), x, double) @@ -214,6 +235,33 @@ schema file { rerank-count: 1000 } + match-features { + matchedFieldCount + vector_score + combined_nativeRank + nativeRank(title) + nativeRank(chunks) + chunk_scores + doc_recency + } + } + + rank-profile default_native_image inherits initial_image { + + first-phase { + expression: (query(alpha) * (vector_score + )) + ((1 - query(alpha)) * combined_nativeRank) + } + + global-phase { + expression { + ( + (query(alpha) * vector_score) + + ((1 - query(alpha)) * combined_nativeRank) + ) * doc_recency + } + rerank-count: 1000 + } + match-features { matchedFieldCount vector_score @@ -290,7 +338,6 @@ schema file { nativeRank(title) nativeRank(chunks) chunk_scores - image_chunk_scores document_age_days_gs query(bin_size_days_gs) recency_bin_index_gs @@ -327,7 +374,6 @@ schema file { bm25(title) scale(combined_bm25) chunk_scores - image_chunk_scores } } @@ -354,7 +400,6 @@ schema file { nativeRank(title) nativeRank(chunks) chunk_scores - image_chunk_scores doc_recency } } @@ -369,7 +414,6 @@ schema file { nativeRank(title) nativeRank(chunks) chunk_scores - image_chunk_scores doc_recency } } From bcc0002bfbc1e9104637877d862e0dca6936972a Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Thu, 9 Oct 2025 17:05:20 +0530 Subject: [PATCH 12/17] refactor: XYNE-103 renamed rank profile --- server/api/chat/chat.ts | 1 + server/package.json | 2 +- server/vespa/schemas/file.sd | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index 8e45a79ac..71c507382 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -1970,6 +1970,7 @@ async function* generateAnswerFromGivenContext( results = await searchVespaInFiles(builtUserQuery, email, nonCollectionFileIds, { limit: fileIds?.length, alpha: userAlpha, + rankProfile: SearchModes.attachmentRank, }) if (results.root.children) { combinedSearchResponse.push(...results.root.children) diff --git a/server/package.json b/server/package.json index 21e806bc6..a1202d9f2 100644 --- a/server/package.json +++ b/server/package.json @@ -78,7 +78,7 @@ "@types/json-schema": "^7.0.15", "@types/jszip": "^3.4.1", "@types/node": "^24.3.0", - "@xyne/vespa-ts": "1.1.2", + "@xyne/vespa-ts": "1.1.4", "@xynehq/jaf": "^0.1.4", "ai": "^5.0.51", "arctic": "^3.3.0", diff --git a/server/vespa/schemas/file.sd b/server/vespa/schemas/file.sd index 16c4780e6..0647c4f90 100644 --- a/server/vespa/schemas/file.sd +++ b/server/vespa/schemas/file.sd @@ -106,7 +106,7 @@ schema file { } } - field chunk_embeddings type tensor(p{}, v[DIMS]) { + field chunk_embeddings type tensor(p{}, v[384]) { indexing: input chunks | embed | attribute | index attribute: paged attribute { @@ -114,7 +114,7 @@ schema file { } } - field image_chunk_embeddings type tensor(p{}, v[DIMS]) { + field image_chunk_embeddings type tensor(p{}, v[384]) { indexing: input image_chunks | embed | attribute | index attribute: paged attribute { distance-metric: angular } @@ -143,7 +143,7 @@ schema file { rank-profile initial { # Inputs for the query vector and alpha for hybrid search inputs { - query(e) tensor(v[DIMS]) # Query embedding + query(e) tensor(v[384]) # Query embedding query(alpha) double # Alpha parameter for hybrid weight query(recency_decay_rate) double } @@ -246,26 +246,26 @@ schema file { } } - rank-profile default_native_image inherits initial_image { + rank-profile attachmentRank inherits initial_image { first-phase { - expression: (query(alpha) * (vector_score + )) + ((1 - query(alpha)) * combined_nativeRank) + expression: (query(alpha) * (vector_score_image)) + ((1 - query(alpha)) * combined_nativeRank_image) } global-phase { expression { ( - (query(alpha) * vector_score) + - ((1 - query(alpha)) * combined_nativeRank) + (query(alpha) * vector_score_image) + + ((1 - query(alpha)) * combined_nativeRank_image) ) * doc_recency } rerank-count: 1000 } match-features { - matchedFieldCount - vector_score - combined_nativeRank + matchedFieldCount_image + vector_score_image + combined_nativeRank_image nativeRank(title) nativeRank(chunks) chunk_scores From 86edf8f9046074511ccb0614c80fb5ef3ad204e4 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Thu, 9 Oct 2025 17:06:48 +0530 Subject: [PATCH 13/17] refactor: XYNE-103 reverted to DIMS --- server/vespa/schemas/file.sd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/vespa/schemas/file.sd b/server/vespa/schemas/file.sd index 0647c4f90..5031e6dc8 100644 --- a/server/vespa/schemas/file.sd +++ b/server/vespa/schemas/file.sd @@ -106,7 +106,7 @@ schema file { } } - field chunk_embeddings type tensor(p{}, v[384]) { + field chunk_embeddings type tensor(p{}, v[DIMS]) { indexing: input chunks | embed | attribute | index attribute: paged attribute { @@ -114,7 +114,7 @@ schema file { } } - field image_chunk_embeddings type tensor(p{}, v[384]) { + field image_chunk_embeddings type tensor(p{}, v[DIMS]) { indexing: input image_chunks | embed | attribute | index attribute: paged attribute { distance-metric: angular } @@ -143,7 +143,7 @@ schema file { rank-profile initial { # Inputs for the query vector and alpha for hybrid search inputs { - query(e) tensor(v[384]) # Query embedding + query(e) tensor(v[DIMS]) # Query embedding query(alpha) double # Alpha parameter for hybrid weight query(recency_decay_rate) double } From 83f49eb6f81f4749efbfa2744fd4c9cacddffc92 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Thu, 9 Oct 2025 18:45:55 +0530 Subject: [PATCH 14/17] refactor: XYNE-103 corrected sheetIds in agent flow --- server/api/chat/agents.ts | 8 ++++++-- server/api/chat/chat.ts | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/server/api/chat/agents.ts b/server/api/chat/agents.ts index e3bbe5031..8960f1b9d 100644 --- a/server/api/chat/agents.ts +++ b/server/api/chat/agents.ts @@ -2870,9 +2870,11 @@ export const AgentMessageApiRagOff = async (c: Context) => { }) const dataSourceSpan = streamSpan.startSpan("get_all_data_sources") + const appIntegrations = agentForDb?.appIntegrations as string[] + const appIntegrationsWithSheetIds = appIntegrations.flatMap(expandSheetIds) const allDataSources = await getAllDocumentsForAgent( [Apps.DataSource], - agentForDb?.appIntegrations as string[], + appIntegrationsWithSheetIds, 400, email, ) @@ -3149,9 +3151,11 @@ export const AgentMessageApiRagOff = async (c: Context) => { // Build “context + fragments” (same as streaming path) ----------------------- const dataSourceSpan = nonStreamSpan.startSpan("get_all_data_sources") + const appIntegrations = agentForDb?.appIntegrations as string[] + const appIntegrationsWithSheetIds = appIntegrations.flatMap(expandSheetIds) const allDataSources = await getAllDocumentsForAgent( [Apps.DataSource], - agentForDb?.appIntegrations as string[], + appIntegrationsWithSheetIds, 400, email, ) diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index 71c507382..e580c5dd8 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -1193,7 +1193,7 @@ async function* generateIterativeTimeFilterAndQueryRewrite( lowerIntegration.startsWith("ds_") ) { // ds- is the prefix for datasource externalId - agentSpecificDataSourceIds.push(integration) + agentSpecificDataSourceIds.push(...expandSheetIds(integration)) if (!agentAppEnums.includes(Apps.DataSource)) { agentAppEnums.push(Apps.DataSource) } @@ -1255,6 +1255,14 @@ async function* generateIterativeTimeFilterAndQueryRewrite( agentPromptData.appIntegrations, ) // Use selectedApps and selectedItems + for (const app in selectedItems) { + const fileIds = selectedItems[app]; + // Expand each fileId and flatten + if(app !== Apps.KnowledgeBase) { + selectedItems[app] = fileIds.flatMap(expandSheetIds); + } + } + selectedItem = selectedItems // agentAppEnums = selectedApps.filter(isValidApp); agentAppEnums = [...new Set(selectedApps)] @@ -1275,7 +1283,7 @@ async function* generateIterativeTimeFilterAndQueryRewrite( collectionFolderIds.push(itemId.replace(/^clfd[-_]/, "")) } else if (itemId.startsWith("clf-")) { // Collection file - remove clf- prefix - collectionFileIds.push(itemId.replace(/^clf[-_]/, "")) + collectionFileIds.push(...expandSheetIds(itemId.replace(/^clf[-_]/, ""))) } } @@ -2396,7 +2404,7 @@ async function* generatePointQueryTimeExpansion( lowerIntegration.startsWith("ds-") || lowerIntegration.startsWith("ds_") ) { - agentSpecificDataSourceIds.push(integration) + agentSpecificDataSourceIds.push(...expandSheetIds(integration)) if (!agentAppEnums.includes(Apps.DataSource)) { agentAppEnums.push(Apps.DataSource) } @@ -2457,6 +2465,14 @@ async function* generatePointQueryTimeExpansion( agentPromptData.appIntegrations, ) // Use selectedApps and selectedItems + for (const app in selectedItems) { + const fileIds = selectedItems[app]; + // Expand each fileId and flatten + if(app !== Apps.KnowledgeBase) { + selectedItems[app] = fileIds.flatMap(expandSheetIds); + } + } + selectedItem = selectedItems // agentAppEnums = selectedApps.filter(isValidApp); agentAppEnums = [...new Set(selectedApps)] @@ -2476,7 +2492,7 @@ async function* generatePointQueryTimeExpansion( collectionFolderIds.push(itemId.replace(/^clfd[-_]/, "")) } else if (itemId.startsWith("clf-")) { // Collection file - remove clf- prefix - collectionFileIds.push(itemId.replace(/^clf[-_]/, "")) + collectionFileIds.push(...expandSheetIds(itemId.replace(/^clf[-_]/, ""))) } } @@ -2990,7 +3006,7 @@ async function* generateMetadataQueryAnswer( lowerIntegration.startsWith("ds-") || lowerIntegration.startsWith("ds_") ) { - agentSpecificDataSourceIds.push(integration) + agentSpecificDataSourceIds.push(...expandSheetIds(integration)) if (!agentAppEnums.includes(Apps.DataSource)) { agentAppEnums.push(Apps.DataSource) } @@ -3050,6 +3066,14 @@ async function* generateMetadataQueryAnswer( agentPromptData.appIntegrations, ) // Use selectedApps and selectedItems + for (const app in selectedItems) { + const fileIds = selectedItems[app]; + // Expand each fileId and flatten + if(app !== Apps.KnowledgeBase) { + selectedItems[app] = fileIds.flatMap(expandSheetIds); + } + } + selectedItem = selectedItems // agentAppEnums = selectedApps.filter(isValidApp); agentAppEnums = [...new Set(selectedApps)] @@ -3069,7 +3093,7 @@ async function* generateMetadataQueryAnswer( collectionFolderIds.push(itemId.replace(/^clfd[-_]/, "")) } else if (itemId.startsWith("clf-")) { // Collection file - remove clf- prefix - collectionFileIds.push(itemId.replace(/^clf[-_]/, "")) + collectionFileIds.push(...expandSheetIds(itemId.replace(/^clf[-_]/, ""))) } } From 74e93ca1420c037fa634d52354228ebb967d25f1 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Thu, 9 Oct 2025 18:52:03 +0530 Subject: [PATCH 15/17] refactor: XYNE-103 reverted some code --- server/api/chat/chat.ts | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index e580c5dd8..c619c7ea4 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -1193,7 +1193,7 @@ async function* generateIterativeTimeFilterAndQueryRewrite( lowerIntegration.startsWith("ds_") ) { // ds- is the prefix for datasource externalId - agentSpecificDataSourceIds.push(...expandSheetIds(integration)) + agentSpecificDataSourceIds.push(integration) if (!agentAppEnums.includes(Apps.DataSource)) { agentAppEnums.push(Apps.DataSource) } @@ -1255,14 +1255,6 @@ async function* generateIterativeTimeFilterAndQueryRewrite( agentPromptData.appIntegrations, ) // Use selectedApps and selectedItems - for (const app in selectedItems) { - const fileIds = selectedItems[app]; - // Expand each fileId and flatten - if(app !== Apps.KnowledgeBase) { - selectedItems[app] = fileIds.flatMap(expandSheetIds); - } - } - selectedItem = selectedItems // agentAppEnums = selectedApps.filter(isValidApp); agentAppEnums = [...new Set(selectedApps)] @@ -2404,7 +2396,7 @@ async function* generatePointQueryTimeExpansion( lowerIntegration.startsWith("ds-") || lowerIntegration.startsWith("ds_") ) { - agentSpecificDataSourceIds.push(...expandSheetIds(integration)) + agentSpecificDataSourceIds.push(integration) if (!agentAppEnums.includes(Apps.DataSource)) { agentAppEnums.push(Apps.DataSource) } @@ -2465,14 +2457,6 @@ async function* generatePointQueryTimeExpansion( agentPromptData.appIntegrations, ) // Use selectedApps and selectedItems - for (const app in selectedItems) { - const fileIds = selectedItems[app]; - // Expand each fileId and flatten - if(app !== Apps.KnowledgeBase) { - selectedItems[app] = fileIds.flatMap(expandSheetIds); - } - } - selectedItem = selectedItems // agentAppEnums = selectedApps.filter(isValidApp); agentAppEnums = [...new Set(selectedApps)] @@ -3006,7 +2990,7 @@ async function* generateMetadataQueryAnswer( lowerIntegration.startsWith("ds-") || lowerIntegration.startsWith("ds_") ) { - agentSpecificDataSourceIds.push(...expandSheetIds(integration)) + agentSpecificDataSourceIds.push(integration) if (!agentAppEnums.includes(Apps.DataSource)) { agentAppEnums.push(Apps.DataSource) } @@ -3066,14 +3050,6 @@ async function* generateMetadataQueryAnswer( agentPromptData.appIntegrations, ) // Use selectedApps and selectedItems - for (const app in selectedItems) { - const fileIds = selectedItems[app]; - // Expand each fileId and flatten - if(app !== Apps.KnowledgeBase) { - selectedItems[app] = fileIds.flatMap(expandSheetIds); - } - } - selectedItem = selectedItems // agentAppEnums = selectedApps.filter(isValidApp); agentAppEnums = [...new Set(selectedApps)] From 0b7e061ed8f0d59839411f835c45088113615e78 Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Thu, 9 Oct 2025 18:57:45 +0530 Subject: [PATCH 16/17] refactor: XYNE-103 fixed ai comments --- server/api/chat/agents.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/server/api/chat/agents.ts b/server/api/chat/agents.ts index 8960f1b9d..e3bbe5031 100644 --- a/server/api/chat/agents.ts +++ b/server/api/chat/agents.ts @@ -2870,11 +2870,9 @@ export const AgentMessageApiRagOff = async (c: Context) => { }) const dataSourceSpan = streamSpan.startSpan("get_all_data_sources") - const appIntegrations = agentForDb?.appIntegrations as string[] - const appIntegrationsWithSheetIds = appIntegrations.flatMap(expandSheetIds) const allDataSources = await getAllDocumentsForAgent( [Apps.DataSource], - appIntegrationsWithSheetIds, + agentForDb?.appIntegrations as string[], 400, email, ) @@ -3151,11 +3149,9 @@ export const AgentMessageApiRagOff = async (c: Context) => { // Build “context + fragments” (same as streaming path) ----------------------- const dataSourceSpan = nonStreamSpan.startSpan("get_all_data_sources") - const appIntegrations = agentForDb?.appIntegrations as string[] - const appIntegrationsWithSheetIds = appIntegrations.flatMap(expandSheetIds) const allDataSources = await getAllDocumentsForAgent( [Apps.DataSource], - appIntegrationsWithSheetIds, + agentForDb?.appIntegrations as string[], 400, email, ) From 549d8c26c03c67969f5b01ef0a9ed6de1d3a2b3e Mon Sep 17 00:00:00 2001 From: Himansh Varma Date: Thu, 9 Oct 2025 19:03:23 +0530 Subject: [PATCH 17/17] refactor: XYNE-103 made some changes in file schema --- server/vespa/schemas/file.sd | 51 +++++++++++++++++------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/server/vespa/schemas/file.sd b/server/vespa/schemas/file.sd index 5031e6dc8..73b282bba 100644 --- a/server/vespa/schemas/file.sd +++ b/server/vespa/schemas/file.sd @@ -192,32 +192,6 @@ schema file { expression: elementwise(bm25(chunks), x, double) } } - - rank-profile initial_image inherits initial { - function vector_score_image() { - expression: vector_score() + closeness(field, image_chunk_embeddings) - } - - function combined_bm25_image() { - expression: combined_bm25() + bm25(image_chunks) - } - - function matchedFieldCount_image() { - expression { - # The `matches` returns 1 if the field contains a match, otherwise 0 - matchedFieldCount() + matches(image_chunks) - } - } - - function combined_nativeRank_image() { - expression: (nativeRank(title) + nativeRank(chunks) + nativeRank(image_chunks)) / if(matchedFieldCount_image == 0, 1, matchedFieldCount_image) - } - - function image_chunk_scores() { - expression: elementwise(bm25(image_chunks), x, double) - } - } - rank-profile default_native inherits initial { @@ -246,7 +220,30 @@ schema file { } } - rank-profile attachmentRank inherits initial_image { + rank-profile attachmentRank inherits initial { + + function vector_score_image() { + expression: vector_score() + closeness(field, image_chunk_embeddings) + } + + function combined_bm25_image() { + expression: combined_bm25() + bm25(image_chunks) + } + + function matchedFieldCount_image() { + expression { + # The `matches` returns 1 if the field contains a match, otherwise 0 + matchedFieldCount() + matches(image_chunks) + } + } + + function combined_nativeRank_image() { + expression: (nativeRank(title) + nativeRank(chunks) + nativeRank(image_chunks)) / if(matchedFieldCount_image == 0, 1, matchedFieldCount_image) + } + + function image_chunk_scores() { + expression: elementwise(bm25(image_chunks), x, double) + } first-phase { expression: (query(alpha) * (vector_score_image)) + ((1 - query(alpha)) * combined_nativeRank_image)