diff --git a/frontend/src/components/AttachmentPreview.tsx b/frontend/src/components/AttachmentPreview.tsx index fe744cc0f..9a739c945 100644 --- a/frontend/src/components/AttachmentPreview.tsx +++ b/frontend/src/components/AttachmentPreview.tsx @@ -10,7 +10,7 @@ import { DialogHeader, DialogTitle, } from "@/components/ui/dialog" -import { getFileType } from "@/utils/fileUtils" +import { getFileType } from "shared/fileUtils" import { getFileIcon } from "@/components/ChatBox" interface AttachmentPreviewProps { diff --git a/frontend/src/components/ChatBox.tsx b/frontend/src/components/ChatBox.tsx index 2af90dbf8..aecde7b7b 100644 --- a/frontend/src/components/ChatBox.tsx +++ b/frontend/src/components/ChatBox.tsx @@ -74,8 +74,8 @@ import { validateAndDeduplicateFiles, createImagePreview, cleanupPreviewUrls, - getFileType, } from "@/utils/fileUtils" +import { getFileType } from "shared/fileUtils" import { authFetch } from "@/utils/authFetch" interface SelectedFile { @@ -977,15 +977,37 @@ export const ChatBox = React.forwardRef( return ext || "file" } - const removeFile = useCallback((id: string) => { + const removeFile = useCallback(async (id: string) => { + const fileToRemove = selectedFiles.find((f) => f.id === id) + + // If the file has metadata with fileId (meaning it's already uploaded), delete it from the server + if (fileToRemove?.metadata?.fileId) { + try { + const response = await api.files.delete.$post({ + json: { + attachment: fileToRemove.metadata + } + }) + + if (!response.ok) { + const errorText = await response.text() + console.error(`Failed to delete attachment: ${errorText}`) + // Still remove from UI even if server deletion fails + } + } catch (error) { + console.error('Error deleting attachment:', error) + // Still remove from UI even if server deletion fails + } + } + + // Remove from UI setSelectedFiles((prev) => { - const fileToRemove = prev.find((f) => f.id === id) if (fileToRemove?.preview) { URL.revokeObjectURL(fileToRemove.preview) } return prev.filter((f) => f.id !== id) }) - }, []) + }, [selectedFiles]) const { handleFileSelect, handleFileChange } = createFileSelectionHandlers( fileInputRef, diff --git a/frontend/src/lib/common.tsx b/frontend/src/lib/common.tsx index 5470fa386..604c045ec 100644 --- a/frontend/src/lib/common.tsx +++ b/frontend/src/lib/common.tsx @@ -43,10 +43,9 @@ import { DataSourceEntity, WebSearchEntity, FileType, - MIME_TYPE_MAPPINGS, - EXTENSION_MAPPINGS, } from "shared/types" import { LoadingSpinner } from "@/routes/_authenticated/admin/integrations/google" +import { getFileType } from "shared/fileUtils" // Define placeholder entities if they don't exist in shared/types const PdfEntity = { Default: "pdf_default" } as const @@ -165,30 +164,9 @@ export const getIcon = ( } } -// Helper function to determine FileType from a file -export const getFileType = (file: File): FileType => { - // First check by MIME type - for (const [fileType, mimeTypes] of Object.entries(MIME_TYPE_MAPPINGS)) { - if ((mimeTypes as readonly string[]).includes(file.type)) { - return fileType as FileType - } - } - - // Fallback to extension checking - const fileName = file.name.toLowerCase() - for (const [fileType, extensions] of Object.entries(EXTENSION_MAPPINGS)) { - if ((extensions as readonly string[]).some(ext => fileName.endsWith(ext))) { - return fileType as FileType - } - } - - // Default fallback - return FileType.FILE -} - // Icon mapping from FileType to SVG component export const getFileIcon = (file: File) => { - const fileType = getFileType(file) + const fileType = getFileType({type: file.type, name: file.name}) switch (fileType) { case FileType.TEXT: diff --git a/frontend/src/utils/fileUtils.ts b/frontend/src/utils/fileUtils.ts index 17ccd976e..98760ef18 100644 --- a/frontend/src/utils/fileUtils.ts +++ b/frontend/src/utils/fileUtils.ts @@ -1,35 +1,11 @@ import { isValidFile, isImageFile } from "shared/fileUtils" import { SelectedFile } from "@/components/ClFileUpload" import { authFetch } from "./authFetch" -import { FileType, MIME_TYPE_MAPPINGS, EXTENSION_MAPPINGS, UploadStatus } from "shared/types" +import { UploadStatus } from "shared/types" // Generate unique ID for files export const generateFileId = () => Math.random().toString(36).substring(2, 9) -export const getFileType = ({ type, name }: { type: string, name: string }): FileType => { - const fileName = name.toLowerCase() - const mimeType = type.toLowerCase() - const baseMime = mimeType.split(";")[0] - - // Check each file type category using the mappings - for (const [fileType, mimeTypes] of Object.entries(MIME_TYPE_MAPPINGS)) { - // Check MIME type first (more reliable) - if (mimeTypes.some(mime => baseMime === mime)) { - return fileType as FileType - } - } - - // Fallback to extension-based detection - for (const [fileType, extensions] of Object.entries(EXTENSION_MAPPINGS)) { - if (extensions.some(ext => fileName.endsWith(ext))) { - return fileType as FileType - } - } - - // Default fallback - return FileType.FILE -} - // Create preview URL for image files export const createImagePreview = (file: File): string | undefined => { if (isImageFile(file.type)) { diff --git a/package.json b/package.json index 6eb34ce86..d56050c1c 100644 --- a/package.json +++ b/package.json @@ -2,4 +2,4 @@ "dependencies": { "zustand": "^5.0.8" } -} \ No newline at end of file +} diff --git a/server/api/chat/agents.ts b/server/api/chat/agents.ts index d4375c634..e3bbe5031 100644 --- a/server/api/chat/agents.ts +++ b/server/api/chat/agents.ts @@ -83,6 +83,7 @@ import { VespaChatUserSchema, type VespaSearchResult, type VespaSearchResults, + AttachmentEntity, } from "@xyne/vespa-ts/types" import { APIError } from "openai" import { insertChatTrace } from "@/db/chatTrace" @@ -121,7 +122,6 @@ import { getModelValueFromLabel } from "@/ai/modelConfig" import { buildContext, buildUserQuery, - expandSheetIds, getThreadContext, isContextSelected, UnderstandMessageAndAnswer, @@ -156,6 +156,7 @@ import { getDateForAI } from "@/utils/index" import { validateVespaIdInAgentIntegrations } from "@/search/utils" import { getAuth, safeGet } from "../agent" import { applyFollowUpContext } from "@/utils/parseAttachment" +import { expandSheetIds } from "@/search/utils" const { JwtPayloadKey, defaultBestModel, @@ -392,7 +393,7 @@ const checkAndYieldCitationsForAgent = async function* ( } // we dont want citations for attachments in the chat - if (item.source.entity === KnowledgeBaseEntity.Attachment) { + if (Object.values(AttachmentEntity).includes(item.source.entity as AttachmentEntity)) { continue } diff --git a/server/api/chat/chat.ts b/server/api/chat/chat.ts index 14f527776..c619c7ea4 100644 --- a/server/api/chat/chat.ts +++ b/server/api/chat/chat.ts @@ -128,6 +128,8 @@ import { type VespaSearchResultsSchema, KnowledgeBaseEntity, KbItemsSchema, + AttachmentEntity, + fileSchema, } from "@xyne/vespa-ts/types" import { APIError } from "openai" import { @@ -213,31 +215,12 @@ import { getDateForAI } from "@/utils/index" import type { User } from "@microsoft/microsoft-graph-types" import { getAuth, safeGet } from "../agent" import { getChunkCountPerDoc } from "./chunk-selection" +import { handleAttachmentDelete } from "../files" +import { expandSheetIds } from "@/search/utils" const METADATA_NO_DOCUMENTS_FOUND = "METADATA_NO_DOCUMENTS_FOUND_INTERNAL" const METADATA_FALLBACK_TO_RAG = "METADATA_FALLBACK_TO_RAG_INTERNAL" -export function expandSheetIds(fileId: string): string[] { - // Check if the fileId matches the pattern docId_sheet_number - const sheetMatch = fileId.match(/^(.+)_sheet_(\d+)$/) - - if (!sheetMatch) { - // Not a sheet ID, return as is - return [fileId] - } - - const [, docId, sheetNumberStr] = sheetMatch - const sheetNumber = parseInt(sheetNumberStr, 10) - // Generate IDs from docId_sheet_0 to docId_sheet_number - const expandedIds: string[] = [] - const upper = Number.isFinite(sheetNumber) ? sheetNumber : 1 - for (let i = 0; i < upper; i++) { - expandedIds.push(`${docId}_sheet_${i}`) - } - - return expandedIds -} - export async function resolveNamesToEmails( intent: Intent, email: string, @@ -506,7 +489,7 @@ const checkAndYieldCitations = async function* ( const f = (item as any)?.fields if ( f?.sddocname === dataSourceFileSchema || - f?.entity === KnowledgeBaseEntity.Attachment + Object.values(AttachmentEntity).includes(f?.entity) ) { // Skip datasource and attachment files from citations continue @@ -784,6 +767,7 @@ export const ChatDeleteApi = async (c: Context) => { email = sub || "" // @ts-ignore const { chatId } = c.req.valid("json") + const attachmentsToDelete: AttachmentMetadata[] = [] await db.transaction(async (tx) => { // Get the chat's internal ID first const chat = await getChatByExternalIdWithAuth(tx, chatId, email) @@ -791,116 +775,13 @@ export const ChatDeleteApi = async (c: Context) => { throw new HTTPException(404, { message: "Chat not found" }) } - // Get all messages for the chat to find attachments + // Get all messages for the chat to delete attachments const messagesToDelete = await getChatMessagesWithAuth(tx, chatId, email) - // Collect all attachment file IDs that need to be deleted - const imageAttachmentFileIds: string[] = [] - const nonImageAttachmentFileIds: string[] = [] - for (const message of messagesToDelete) { if (message.attachments && Array.isArray(message.attachments)) { - const attachments = - message.attachments as unknown as AttachmentMetadata[] - for (const attachment of attachments) { - if (attachment && typeof attachment === "object") { - if (attachment.fileId) { - // Check if this is an image attachment using both isImage field and fileType - const isImageAttachment = - attachment.isImage || - (attachment.fileType && isImageFile(attachment.fileType)) - - if (isImageAttachment) { - imageAttachmentFileIds.push(attachment.fileId) - } else { - nonImageAttachmentFileIds.push(attachment.fileId) - } - } - } - } - } - } - - // Delete image attachments and their thumbnails from disk - if (imageAttachmentFileIds.length > 0) { - loggerWithChild({ email: email }).info( - `Deleting ${imageAttachmentFileIds.length} image attachment files and their thumbnails for chat ${chatId}`, - ) - - for (const fileId of imageAttachmentFileIds) { - try { - // Validate fileId to prevent path traversal - if ( - fileId.includes("..") || - fileId.includes("/") || - fileId.includes("\\") - ) { - loggerWithChild({ email: email }).error( - `Invalid fileId detected: ${fileId}. Skipping deletion for security.`, - ) - continue - } - const imageBaseDir = path.resolve( - process.env.IMAGE_DIR || "downloads/xyne_images_db", - ) - - const imageDir = path.join(imageBaseDir, fileId) - try { - await fs.access(imageDir) - await fs.rm(imageDir, { recursive: true, force: true }) - loggerWithChild({ email: email }).info( - `Deleted image attachment directory: ${imageDir}`, - ) - } catch (attachmentError) { - loggerWithChild({ email: email }).warn( - `Image attachment file ${fileId} not found in either directory during chat deletion`, - ) - } - } catch (error) { - loggerWithChild({ email: email }).error( - error, - `Failed to delete image attachment file ${fileId} during chat deletion: ${getErrorMessage(error)}`, - ) - } - } - } - - // Delete non-image attachments from Vespa kb_items schema - if (nonImageAttachmentFileIds.length > 0) { - loggerWithChild({ email: email }).info( - `Deleting ${nonImageAttachmentFileIds.length} non-image attachments from Vespa kb_items schema for chat ${chatId}`, - ) - - for (const fileId of nonImageAttachmentFileIds) { - try { - // Delete from Vespa kb_items schema using the proper Vespa function - const vespaIds = expandSheetIds(fileId) - for (const id of vespaIds) { - try { - await DeleteDocument(id, KbItemsSchema) - loggerWithChild({ email }).info( - `Successfully deleted non-image attachment ${id} from Vespa kb_items schema`, - ) - } catch (error) { - loggerWithChild({ email }).error( - `Failed to delete non-image attachment ${id} from Vespa kb_items schema`, - { error: getErrorMessage(error) } - ) - } - } - } catch (error) { - const errorMessage = getErrorMessage(error) - if (errorMessage.includes("404 Not Found")) { - loggerWithChild({ email: email }).warn( - `Non-image attachment ${fileId} not found in Vespa kb_items schema (may have been already deleted)`, - ) - } else { - loggerWithChild({ email: email }).error( - error, - `Failed to delete non-image attachment ${fileId} from Vespa kb_items schema: ${errorMessage}`, - ) - } - } + const attachments = message.attachments as AttachmentMetadata[] + attachmentsToDelete.push(...attachments) } } @@ -913,6 +794,9 @@ export const ChatDeleteApi = async (c: Context) => { await deleteMessagesByChatId(tx, chatId) await deleteChatByExternalIdWithAuth(tx, chatId, email) }) + if (attachmentsToDelete.length) { + await handleAttachmentDelete(attachmentsToDelete, email) + } return c.json({ success: true }) } catch (error) { const errMsg = getErrorMessage(error) @@ -1391,7 +1275,7 @@ async function* generateIterativeTimeFilterAndQueryRewrite( collectionFolderIds.push(itemId.replace(/^clfd[-_]/, "")) } else if (itemId.startsWith("clf-")) { // Collection file - remove clf- prefix - collectionFileIds.push(itemId.replace(/^clf[-_]/, "")) + collectionFileIds.push(...expandSheetIds(itemId.replace(/^clf[-_]/, ""))) } } @@ -2086,6 +1970,7 @@ async function* generateAnswerFromGivenContext( results = await searchVespaInFiles(builtUserQuery, email, nonCollectionFileIds, { limit: fileIds?.length, alpha: userAlpha, + rankProfile: SearchModes.attachmentRank, }) if (results.root.children) { combinedSearchResponse.push(...results.root.children) @@ -2591,7 +2476,7 @@ async function* generatePointQueryTimeExpansion( collectionFolderIds.push(itemId.replace(/^clfd[-_]/, "")) } else if (itemId.startsWith("clf-")) { // Collection file - remove clf- prefix - collectionFileIds.push(itemId.replace(/^clf[-_]/, "")) + collectionFileIds.push(...expandSheetIds(itemId.replace(/^clf[-_]/, ""))) } } @@ -3184,7 +3069,7 @@ async function* generateMetadataQueryAnswer( collectionFolderIds.push(itemId.replace(/^clfd[-_]/, "")) } else if (itemId.startsWith("clf-")) { // Collection file - remove clf- prefix - collectionFileIds.push(itemId.replace(/^clf[-_]/, "")) + collectionFileIds.push(...expandSheetIds(itemId.replace(/^clf[-_]/, ""))) } } diff --git a/server/api/files.ts b/server/api/files.ts index 189e22e01..be0005ec2 100644 --- a/server/api/files.ts +++ b/server/api/files.ts @@ -5,25 +5,32 @@ import { getLogger, getLoggerWithChild } from "@/logger" import { Subsystem } from "@/types" import { type DataSourceUploadResult, + DeleteImages, handleSingleFileUploadToDataSource, } from "@/api/dataSource" import { getUserByEmail } from "@/db/user" import { db } from "@/db/client" import { checkIfDataSourceFileExistsByNameAndId, + DeleteDocument, getDataSourceByNameAndCreator, insert, + GetDocument, } from "../search/vespa" import { NoUserFound } from "@/errors" import config from "@/config" import { HTTPException } from "hono/http-exception" -import { isValidFile, isImageFile } from "shared/fileUtils" +import { isValidFile, isImageFile, getFileType } from "shared/fileUtils" import { generateThumbnail, getThumbnailPath } from "@/utils/image" -import type { AttachmentMetadata } from "@/shared/types" +import { attachmentFileTypeMap, type AttachmentMetadata } from "@/shared/types" import { FileProcessorService, type SheetProcessingResult } from "@/services/fileProcessor" -import { Apps, KbItemsSchema, KnowledgeBaseEntity } from "@xyne/vespa-ts/types" +import { Apps, fileSchema, KbItemsSchema } from "@xyne/vespa-ts/types" import { getBaseMimeType } from "@/integrations/dataSource/config" import { isDataSourceError } from "@/integrations/dataSource/errors" +import { handleAttachmentDeleteSchema } from "./search" +import { getErrorMessage } from "@/utils" +import { promises as fs } from "node:fs" +import { expandSheetIds } from "@/search/utils" const { JwtPayloadKey } = config const loggerWithChild = getLoggerWithChild(Subsystem.Api, { module: "newApps" }) @@ -221,7 +228,7 @@ export const handleAttachmentUpload = async (c: Context) => { for (const file of files) { const fileBuffer = await file.arrayBuffer() - const fileId = `att_${crypto.randomUUID()}` + const fileId = `attf_${crypto.randomUUID()}` let vespaId = fileId const ext = file.name.split(".").pop()?.toLowerCase() || "" const fullFileName = `${0}.${ext}` @@ -244,8 +251,33 @@ export const handleAttachmentUpload = async (c: Context) => { // Generate thumbnail for images thumbnailPath = getThumbnailPath(outputDir, fileId) await generateThumbnail(Buffer.from(fileBuffer), thumbnailPath) + + const vespaDoc = { + title: file.name, + url: "", + app: Apps.Attachment, + docId: fileId, + parentId: null, + owner: email, + photoLink: "", + ownerEmail: email, + entity: attachmentFileTypeMap[getFileType({ type: file.type, name: file.name })], + chunks: [], + chunks_pos: [], + image_chunks: [], + image_chunks_pos: [], + chunks_map: [], + image_chunks_map: [], + permissions: [email], + mimeType: getBaseMimeType(file.type), + metadata: filePath, + createdAt: Date.now(), + updatedAt: Date.now(), + } + + await insert(vespaDoc, fileSchema) } else { - // For non-images: process through FileProcessorService and ingest into Vespa + // For non-images: process through FileProcessorService and ingest into file schema // Process the file content using FileProcessorService const processingResults = await FileProcessorService.processFile( @@ -283,20 +315,23 @@ export const handleAttachmentUpload = async (c: Context) => { processingResult const vespaDoc = { + title: file.name, + url: "", + app: Apps.Attachment, docId: docId, - clId: "attachment", - itemId: docId, - fileName: fileName, - app: Apps.KnowledgeBase as const, - entity: KnowledgeBaseEntity.Attachment, - description: "", - storagePath: "", + parentId: null, + owner: email, + photoLink: "", + ownerEmail: email, + entity: attachmentFileTypeMap[getFileType({ type: file.type, name: file.name })], chunks: chunks, chunks_pos: chunks_pos, image_chunks: image_chunks, image_chunks_pos: image_chunks_pos, chunks_map: processingResult.chunks_map, image_chunks_map: processingResult.image_chunks_map, + permissions: [email], + mimeType: getBaseMimeType(file.type || "text/plain"), metadata: JSON.stringify({ originalFileName: file.name, uploadedBy: email, @@ -310,15 +345,11 @@ export const handleAttachmentUpload = async (c: Context) => { totalSheets: (processingResult as SheetProcessingResult).totalSheets, }), }), - createdBy: email, - duration: 0, - mimeType: getBaseMimeType(file.type || "text/plain"), - fileSize: file.size, createdAt: Date.now(), updatedAt: Date.now(), } - await insert(vespaDoc, KbItemsSchema) + await insert(vespaDoc, fileSchema) } } @@ -375,6 +406,148 @@ export const handleAttachmentUpload = async (c: Context) => { } } +export const handleAttachmentDelete = async (attachments: AttachmentMetadata [], email: string) => { + const imageAttachmentFileIds: string[] = [] + const nonImageAttachmentFileIds: string[] = [] + + for (const attachment of attachments) { + if (attachment && typeof attachment === "object") { + if (attachment.fileId) { + // Check if this is an image attachment using both isImage field and fileType + const isImageAttachment = + attachment.isImage || + (attachment.fileType && isImageFile(attachment.fileType)) + + if (isImageAttachment) { + imageAttachmentFileIds.push(attachment.fileId) + } else { + nonImageAttachmentFileIds.push(attachment.fileId) + } + } + } + } + + // Delete image attachments and their thumbnails from disk + if (imageAttachmentFileIds.length > 0) { + loggerWithChild({ email: email }).info( + `Deleting ${imageAttachmentFileIds.length} image attachment files and their thumbnails`, + ) + + for (const fileId of imageAttachmentFileIds) { + try { + // Validate fileId to prevent path traversal + if ( + fileId.includes("..") || + fileId.includes("/") || + fileId.includes("\\") + ) { + loggerWithChild({ email: email }).error( + `Invalid fileId detected: ${fileId}. Skipping deletion for security.`, + ) + continue + } + const imageBaseDir = path.resolve( + process.env.IMAGE_DIR || "downloads/xyne_images_db", + ) + + const imageDir = path.join(imageBaseDir, fileId) + try { + await fs.access(imageDir) + await fs.rm(imageDir, { recursive: true, force: true }) + await DeleteDocument(fileId, fileSchema) + + loggerWithChild({ email: email }).info( + `Deleted image attachment directory: ${imageDir}`, + ) + } catch (attachmentError) { + loggerWithChild({ email: email }).warn( + `Image attachment file ${fileId} not found in either directory during chat deletion`, + ) + } + } catch (error) { + loggerWithChild({ email: email }).error( + error, + `Failed to delete image attachment file ${fileId} during chat deletion: ${getErrorMessage(error)}`, + ) + } + } + } + + // Delete non-image attachments from Vespa + if (nonImageAttachmentFileIds.length > 0) { + loggerWithChild({ email: email }).info( + `Deleting ${nonImageAttachmentFileIds.length} non-image attachments from Vespa`, + ) + + for (const fileId of nonImageAttachmentFileIds) { + try { + const vespaIds = expandSheetIds(fileId) + for (const vespaId of vespaIds) { + // Delete from Vespa kb_items or file schema + if(vespaId.startsWith("att_")) { + await DeleteDocument(vespaId, KbItemsSchema) + } else { + await DeleteDocument(vespaId, fileSchema) + } + // Delete images from disk + await DeleteImages(vespaId) + loggerWithChild({ email: email }).info( + `Successfully deleted non-image attachment ${vespaId} from Vespa`, + ) + } + } catch (error) { + const errorMessage = getErrorMessage(error) + if (errorMessage.includes("404 Not Found")) { + loggerWithChild({ email: email }).warn( + `Non-image attachment ${fileId} not found in Vespa (may have been already deleted)`, + ) + } else { + loggerWithChild({ email: email }).error( + error, + `Failed to delete non-image attachment ${fileId} from Vespa: ${errorMessage}`, + ) + } + } + } + } +} + +export const handleAttachmentDeleteApi = async (c: Context) => { + const { sub } = c.get(JwtPayloadKey) + const email = sub + + const { attachment } = handleAttachmentDeleteSchema.parse(await c.req.json()) + const fileId = attachment.fileId + if (!fileId) { + throw new HTTPException(400, { message: "File ID is required" }) + } + + try { + // Get the attachment document from the file schema + const attachmentDoc = await GetDocument(fileSchema, expandSheetIds(fileId)[0]) + + if (!attachmentDoc || !attachmentDoc.fields) { + return c.json({ success: true, message: "Attachment already deleted" }) + } + + // Check permissions - file schema has permissions array + const fields = attachmentDoc.fields as any + const permissions = Array.isArray(fields.permissions) ? fields.permissions as string[] : [] + if (!permissions.includes(email)) { + throw new HTTPException(403, { message: "Access denied to this attachment" }) + } + + await handleAttachmentDelete([attachment], email) + return c.json({ success: true, message: "Attachment deleted successfully" }) + } catch (error) { + if (error instanceof HTTPException) { + throw error + } + loggerWithChild({ email }).error({ err: error }, "Error checking attachment permissions") + throw new HTTPException(500, { message: "Internal server error" }) + } +} + /** * Serve attachment file by fileId */ diff --git a/server/api/knowledgeBase.ts b/server/api/knowledgeBase.ts index aea0684c2..e5fe96179 100644 --- a/server/api/knowledgeBase.ts +++ b/server/api/knowledgeBase.ts @@ -58,7 +58,7 @@ import { } from "@/integrations/dataSource/config" import { getAuth, safeGet } from "./agent" import { ApiKeyScopes, UploadStatus } from "@/shared/types" -import { expandSheetIds } from "./chat/chat" +import { expandSheetIds } from "@/search/utils" import { checkFileSize } from "@/integrations/dataSource" const EXTENSION_MIME_MAP: Record = { diff --git a/server/api/search.ts b/server/api/search.ts index 53bca117c..f05f6ed57 100644 --- a/server/api/search.ts +++ b/server/api/search.ts @@ -48,7 +48,7 @@ import { cleanContext, userContext, } from "@/ai/context" -import { AnswerSSEvents, AuthType, ConnectorStatus } from "@/shared/types" +import { AnswerSSEvents, attachmentMetadataSchema, AuthType, ConnectorStatus } from "@/shared/types" import { agentPromptPayloadSchema } from "@/shared/types" import { streamSSE } from "hono/streaming" import { getLogger, getLoggerWithChild } from "@/logger" @@ -256,6 +256,10 @@ export const generatePromptSchema = z.object({ }) +export const handleAttachmentDeleteSchema = z.object({ + attachment: attachmentMetadataSchema, +}) + export type GeneratePromptPayload = z.infer export const AutocompleteApi = async (c: Context) => { diff --git a/server/integrations/google/sync.ts b/server/integrations/google/sync.ts index 093edff9a..2ef30553e 100644 --- a/server/integrations/google/sync.ts +++ b/server/integrations/google/sync.ts @@ -369,11 +369,15 @@ export const handleGoogleDriveChange = async ( await insertWithRetry(data, fileSchema) } } else { - vespaData.permissions = toPermissionsList( + const permissionsAsString = toPermissionsList( vespaData.permissions, email, - ) - await insertWithRetry(vespaData, fileSchema) + ); + const vespaDataForInsert = { + ...vespaData, + permissions: permissionsAsString, + }; + await insertWithRetry(vespaDataForInsert, fileSchema); } } } catch (err) { diff --git a/server/package.json b/server/package.json index 3a6e20699..a1202d9f2 100644 --- a/server/package.json +++ b/server/package.json @@ -78,7 +78,7 @@ "@types/json-schema": "^7.0.15", "@types/jszip": "^3.4.1", "@types/node": "^24.3.0", - "@xyne/vespa-ts": "1.1.0", + "@xyne/vespa-ts": "1.1.4", "@xynehq/jaf": "^0.1.4", "ai": "^5.0.51", "arctic": "^3.3.0", diff --git a/server/search/utils.ts b/server/search/utils.ts index ecb6ce382..c6072a4d3 100644 --- a/server/search/utils.ts +++ b/server/search/utils.ts @@ -20,6 +20,27 @@ import { sharedVespaService } from "./vespaService" const Logger = getLogger(Subsystem.Vespa).child({ module: "search-utils" }) +export function expandSheetIds(fileId: string): string[] { + // Check if the fileId matches the pattern docId_sheet_number + const sheetMatch = fileId.match(/^(.+)_sheet_(\d+)$/) + + if (!sheetMatch) { + // Not a sheet ID, return as is + return [fileId] + } + + const [, docId, sheetNumberStr] = sheetMatch + const sheetNumber = parseInt(sheetNumberStr, 10) + // Generate IDs from docId_sheet_0 to docId_sheet_number + const expandedIds: string[] = [] + const upper = Number.isFinite(sheetNumber) ? sheetNumber : 1 + for (let i = 0; i < upper; i++) { + expandedIds.push(`${docId}_sheet_${i}`) + } + + return expandedIds +} + export function removePrefixesFromItemIds(itemIds: string[]): string[] { return itemIds.map((itemId) => { // Remove prefixes: clfd-, clf-, cl- diff --git a/server/server.ts b/server/server.ts index ee2e4d258..c1e6d9ef9 100644 --- a/server/server.ts +++ b/server/server.ts @@ -21,6 +21,7 @@ import { chatTitleSchema, GetDriveItem, GetDriveItemsByDocIds, + handleAttachmentDeleteSchema, } from "@/api/search" import { callNotificationService } from "@/services/callNotifications" import { HighlightApi, highlightSchema } from "@/api/highlight" @@ -238,6 +239,7 @@ import { handleFileUpload, handleAttachmentServe, handleThumbnailServe, + handleAttachmentDeleteApi, } from "@/api/files" import { z } from "zod" // Ensure z is imported if not already at the top for schemas import { @@ -894,6 +896,7 @@ export const AppRoutes = app .post("/files/upload-attachment", handleAttachmentUpload) .get("/attachments/:fileId", handleAttachmentServe) .get("/attachments/:fileId/thumbnail", handleThumbnailServe) + .post("/files/delete", zValidator("json", handleAttachmentDeleteSchema), handleAttachmentDeleteApi) .post("/chat", zValidator("json", chatSchema), GetChatApi) .post( "/chat/generateTitle", diff --git a/server/shared/fileUtils.ts b/server/shared/fileUtils.ts index de0040837..1fcca1476 100644 --- a/server/shared/fileUtils.ts +++ b/server/shared/fileUtils.ts @@ -7,6 +7,30 @@ export const isImageFile = (fileType: string): boolean => { ) } +export const getFileType = ({ type, name }: { type: string, name: string }): FileType => { + const fileName = name.toLowerCase() + const mimeType = type.toLowerCase() + const baseMime = mimeType.split(";")[0] + + // Check each file type category using the mappings + for (const [fileType, mimeTypes] of Object.entries(MIME_TYPE_MAPPINGS)) { + // Check MIME type first (more reliable) + if (mimeTypes.some(mime => baseMime === mime)) { + return fileType as FileType + } + } + + // Fallback to extension-based detection + for (const [fileType, extensions] of Object.entries(EXTENSION_MAPPINGS)) { + if (extensions.some(ext => fileName.endsWith(ext))) { + return fileType as FileType + } + } + + // Default fallback + return FileType.FILE +} + export const isValidFile = (file: File) => { // Set size limits const maxGeneralSize = 40 * 1024 * 1024 // 40MB diff --git a/server/shared/types.ts b/server/shared/types.ts index 66cb6509f..ffa346be8 100644 --- a/server/shared/types.ts +++ b/server/shared/types.ts @@ -27,7 +27,7 @@ import { GooglePeopleEntity, SlackEntity, MicrosoftPeopleEntity, - + AttachmentEntity, } from "@xyne/vespa-ts/types" export { GooglePeopleEntity, @@ -36,6 +36,7 @@ export { CalendarEntity, MailAttachmentEntity, SlackEntity, + AttachmentEntity, Apps, isMailAttachment, SystemEntity, @@ -57,7 +58,10 @@ export type { } from "@xyne/vespa-ts/types" export type VespaFile = z.infer -export const FileEntitySchema = z.nativeEnum(DriveEntity) +export const FileEntitySchema = z.union([ + z.nativeEnum(DriveEntity), + z.nativeEnum(AttachmentEntity), +]) export const MailEntitySchema = z.nativeEnum(MailEntity) export const MailAttachmentEntitySchema = z.nativeEnum(MailAttachmentEntity) export const EventEntitySchema = z.nativeEnum(CalendarEntity) @@ -203,6 +207,16 @@ export const EXTENSION_MAPPINGS = { [FileType.TEXT]: [".txt", ".md"], } as const +export const attachmentFileTypeMap: Record = { + [FileType.DOCUMENT]: AttachmentEntity.Docs, + [FileType.SPREADSHEET]: AttachmentEntity.Sheets, + [FileType.PRESENTATION]: AttachmentEntity.PPT, + [FileType.PDF]: AttachmentEntity.PDF, + [FileType.TEXT]: AttachmentEntity.Text, + [FileType.IMAGE]: AttachmentEntity.Image, + [FileType.FILE]: AttachmentEntity.File, +} + export enum ApiKeyScopes { CREATE_AGENT = "CREATE_AGENT", AGENT_CHAT = "AGENT_CHAT", diff --git a/server/vespa/schemas/file.sd b/server/vespa/schemas/file.sd index 3dc224a89..73b282bba 100644 --- a/server/vespa/schemas/file.sd +++ b/server/vespa/schemas/file.sd @@ -1,4 +1,11 @@ schema file { + + struct chunk_meta { + field chunk_index type int {} + field page_numbers type array {} + field block_labels type array {} + } + document file { field docId type string { indexing: attribute | summary @@ -40,6 +47,29 @@ schema file { index: enable-bm25 } + field image_chunks type array { + indexing: index | summary + index { enable-bm25 } + } + + field chunks_pos type array { + indexing: attribute | summary + } + + field image_chunks_pos type array { + indexing: attribute | summary + } + + field chunks_map type array { + indexing: summary + struct-field chunk_index { indexing: attribute | summary } + } + + field image_chunks_map type array { + indexing: summary + struct-field chunk_index { indexing: attribute | summary } + } + field owner type string { indexing: attribute | summary } @@ -83,6 +113,12 @@ schema file { distance-metric: angular } } + + field image_chunk_embeddings type tensor(p{}, v[DIMS]) { + indexing: input image_chunks | embed | attribute | index + attribute: paged + attribute { distance-metric: angular } + } field title_fuzzy type string { @@ -156,7 +192,6 @@ schema file { expression: elementwise(bm25(chunks), x, double) } } - rank-profile default_native inherits initial { @@ -185,6 +220,57 @@ schema file { } } + rank-profile attachmentRank inherits initial { + + function vector_score_image() { + expression: vector_score() + closeness(field, image_chunk_embeddings) + } + + function combined_bm25_image() { + expression: combined_bm25() + bm25(image_chunks) + } + + function matchedFieldCount_image() { + expression { + # The `matches` returns 1 if the field contains a match, otherwise 0 + matchedFieldCount() + matches(image_chunks) + } + } + + function combined_nativeRank_image() { + expression: (nativeRank(title) + nativeRank(chunks) + nativeRank(image_chunks)) / if(matchedFieldCount_image == 0, 1, matchedFieldCount_image) + } + + function image_chunk_scores() { + expression: elementwise(bm25(image_chunks), x, double) + } + + first-phase { + expression: (query(alpha) * (vector_score_image)) + ((1 - query(alpha)) * combined_nativeRank_image) + } + + global-phase { + expression { + ( + (query(alpha) * vector_score_image) + + ((1 - query(alpha)) * combined_nativeRank_image) + ) * doc_recency + } + rerank-count: 1000 + } + + match-features { + matchedFieldCount_image + vector_score_image + combined_nativeRank_image + nativeRank(title) + nativeRank(chunks) + chunk_scores + image_chunk_scores + doc_recency + } + } + # New rank profile designed for sorting: implements recency binning with hybrid scoring within bins. rank-profile global_sorted inherits initial { # No global-phase section to allow sorting by first-phase score directly. @@ -349,6 +435,16 @@ schema file { bolding: on source: chunks } + summary image_chunks_summary { + bolding: on + source: image_chunks + } + summary chunks_pos_summary { + source: chunks_pos + } + summary image_chunks_pos_summary { + source: image_chunks_pos + } } document-summary autocomplete {