Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 138 additions & 19 deletions server/api/knowledgeBase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,48 @@ import { DeleteDocument, GetDocument } from "@/search/vespa"
import { ChunkMetadata, KbItemsSchema } from "@xyne/vespa-ts/types"
import { boss, FileProcessingQueue } from "@/queue/api-server-queue"
import * as crypto from "crypto"
import { fileTypeFromBuffer } from "file-type"
import {
DATASOURCE_CONFIG,
getBaseMimeType,
} from "@/integrations/dataSource/config"
import { getAuth, safeGet } from "./agent"
import { ApiKeyScopes, UploadStatus } from "@/shared/types"

// Fallback lookup table mapping a lower-cased file extension (including the
// leading dot) to its canonical MIME type. Consulted only when magic-byte
// detection cannot identify the content.
const EXTENSION_MIME_MAP: Record<string, string> = {
  // Documents
  ".pdf": "application/pdf",
  ".doc": "application/msword",
  ".docx":
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  ".xls": "application/vnd.ms-excel",
  ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  ".ppt": "application/vnd.ms-powerpoint",
  ".pptx":
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  // Text / structured data
  ".txt": "text/plain",
  ".csv": "text/csv",
  ".html": "text/html",
  ".xml": "application/xml",
  ".json": "application/json",
  // Archives
  ".zip": "application/zip",
  ".rar": "application/vnd.rar",
  ".7z": "application/x-7z-compressed",
  ".tar": "application/x-tar",
  ".gz": "application/gzip",
  // Audio / video
  ".mp3": "audio/mpeg",
  ".wav": "audio/wav",
  ".mp4": "video/mp4",
  ".avi": "video/x-msvideo",
  // Images
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".png": "image/png",
  ".gif": "image/gif",
  ".bmp": "image/bmp",
  ".webp": "image/webp",
  ".svg": "image/svg+xml",
  ".ico": "image/x-icon",
}

const loggerWithChild = getLoggerWithChild(Subsystem.Api, {
module: "knowledgeBaseService",
})
Expand Down Expand Up @@ -127,6 +162,75 @@ function getStoragePath(
)
}

/**
 * Detects the MIME type of an uploaded file using a layered strategy.
 *
 * Priority order (most to least trustworthy):
 *   1. Magic-byte sniffing via the `file-type` library (content-based).
 *   2. Extension lookup in EXTENSION_MIME_MAP (case-insensitive).
 *   3. The browser-reported type, unless it is the generic
 *      "application/octet-stream".
 *   4. "application/octet-stream" as the final default.
 *
 * @param fileName - Original file name; only its extension is consulted.
 * @param buffer - File contents as ArrayBuffer, Uint8Array, or Buffer.
 * @param browserMimeType - Content type reported by the client, if any.
 * @returns Best-guess MIME type. Never throws: on unexpected internal
 *          errors it falls back to the browser type or the generic default.
 */
async function detectMimeType(
  fileName: string,
  buffer: ArrayBuffer | Uint8Array | Buffer,
  browserMimeType?: string,
): Promise<string> {
  try {
    // Node's Buffer is a Uint8Array subclass, so one instanceof check covers
    // both; only a raw ArrayBuffer needs wrapping. (The previous explicit
    // Buffer.isBuffer branch was redundant.)
    const detectionBuffer: Uint8Array =
      buffer instanceof Uint8Array ? buffer : new Uint8Array(buffer)

    // Normalize the extension so lookups are case-insensitive (".PDF" -> ".pdf").
    const ext = extname(fileName).toLowerCase()

    // Content-based detection (magic bytes) — the strongest signal.
    let detectedType: string | undefined
    try {
      const fileTypeResult = await fileTypeFromBuffer(detectionBuffer)
      if (fileTypeResult?.mime) {
        detectedType = fileTypeResult.mime
        loggerWithChild().debug(
          `Magic byte detection for ${fileName}: ${detectedType}`,
        )
      }
    } catch (error) {
      // Sniffing failure is non-fatal; fall through to the weaker hints.
      loggerWithChild().debug(
        `Magic byte detection failed for ${fileName}: ${getErrorMessage(error)}`,
      )
    }

    // Fallback chain: magic bytes -> extension map -> non-generic browser
    // type -> generic default.
    let finalMimeType: string
    if (detectedType) {
      finalMimeType = detectedType
    } else if (EXTENSION_MIME_MAP[ext]) {
      finalMimeType = EXTENSION_MIME_MAP[ext]
    } else if (
      browserMimeType &&
      browserMimeType !== "application/octet-stream"
    ) {
      finalMimeType = browserMimeType
    } else {
      finalMimeType = "application/octet-stream"
    }

    loggerWithChild().debug(
      `MIME detection for ${fileName}: extension=${ext}, magic=${detectedType}, browser=${browserMimeType}, final=${finalMimeType}`,
    )

    return finalMimeType
  } catch (error) {
    loggerWithChild().error(
      error,
      `Error in MIME detection for ${fileName}: ${getErrorMessage(error)}`,
    )
    // Best-effort fallback so file upload never fails on detection alone.
    return browserMimeType || "application/octet-stream"
  }
}

// API Handlers

// Create a new Collection
Expand Down Expand Up @@ -1240,6 +1344,13 @@ export const UploadFilesApi = async (c: Context) => {
// Write file to disk
await writeFile(storagePath, new Uint8Array(buffer))

// Detect MIME type using robust detection with extension normalization and magic bytes
const detectedMimeType = await detectMimeType(
originalFileName,
buffer,
file.type,
)

// Create file record in database first
const item = await db.transaction(async (tx: TxnOrClient) => {
return await createFileItem(
Expand All @@ -1251,7 +1362,7 @@ export const UploadFilesApi = async (c: Context) => {
fileName,
storagePath,
storageKey,
file.type || "application/octet-stream",
detectedMimeType,
file.size,
checksum,
{
Expand Down Expand Up @@ -1651,23 +1762,25 @@ export const GetChunkContentApi = async (c: Context) => {
}

// Handle both legacy number[] format and new ChunkMetadata[] format
const index = resp.fields.chunks_pos.findIndex((pos: number | ChunkMetadata) => {
// If it's a number (legacy format), compare directly
if (typeof pos === "number") {
return pos === chunkIndex
}
// If it's a ChunkMetadata object, compare the index field
if (typeof pos === "object" && pos !== null) {
if (pos.chunk_index !== undefined) {
return pos.chunk_index === chunkIndex
} else {
loggerWithChild({ email: userEmail }).warn(
`Unexpected chunk position object format: ${JSON.stringify(pos)}`,
)
const index = resp.fields.chunks_pos.findIndex(
(pos: number | ChunkMetadata) => {
// If it's a number (legacy format), compare directly
if (typeof pos === "number") {
return pos === chunkIndex
}
}
return false
})
// If it's a ChunkMetadata object, compare the index field
if (typeof pos === "object" && pos !== null) {
if (pos.chunk_index !== undefined) {
return pos.chunk_index === chunkIndex
} else {
loggerWithChild({ email: userEmail }).warn(
`Unexpected chunk position object format: ${JSON.stringify(pos)}`,
)
}
}
return false
},
)
if (index === -1) {
throw new HTTPException(404, { message: "Chunk index not found" })
}
Expand Down Expand Up @@ -1750,8 +1863,14 @@ export const PollCollectionsStatusApi = async (c: Context) => {
const body = await c.req.json()
const collectionIds = body.collectionIds as string[]

if (!collectionIds || !Array.isArray(collectionIds) || collectionIds.length === 0) {
throw new HTTPException(400, { message: "collectionIds array is required" })
if (
!collectionIds ||
!Array.isArray(collectionIds) ||
collectionIds.length === 0
) {
throw new HTTPException(400, {
message: "collectionIds array is required",
})
}

// Fetch items only for collections owned by the user (enforced in DB function)
Expand Down
50 changes: 2 additions & 48 deletions server/lib/chunkByOCR.ts
Original file line number Diff line number Diff line change
Expand Up @@ -554,54 +554,8 @@ export async function chunkByOCRFromBuffer(
docId: string,
): Promise<ProcessingResult> {

// Check if this is a PDF and handle batching if necessary
const isPdf = fileName.toLowerCase().endsWith('.pdf')
let finalApiResult: LayoutParsingApiPayload

if (isPdf) {
try {
const srcPdf = await PDFDocument.load(buffer)
const totalPages = srcPdf.getPageCount()

if (totalPages > 30) {
// Split PDF into batches and process each
const batches = await splitPdfIntoBatches(buffer, 30)
const batchResults: LayoutParsingApiPayload[] = []

for (let i = 0; i < batches.length; i++) {
const batch = batches[i]
Logger.info("Processing PDF batch", {
batchIndex: i + 1,
totalBatches: batches.length,
batchSizeBytes: batch.length,
})

const batchResult = await callLayoutParsingApi(batch, `${fileName}_batch_${i + 1}`)
batchResults.push(batchResult)
}

// Merge all batch results
finalApiResult = mergeLayoutParsingResults(batchResults)

Logger.info("Merged batch results", {
totalBatches: batches.length,
layoutResultsCount: finalApiResult.layoutParsingResults?.length || 0,
})
} else {
// Small PDF, process normally
finalApiResult = await callLayoutParsingApi(buffer, fileName)
}
} catch (error) {
Logger.warn("Failed to analyze PDF for batching, processing as single file", {
fileName,
error: (error as Error).message,
})
finalApiResult = await callLayoutParsingApi(buffer, fileName)
}
} else {
// Not a PDF, process normally
finalApiResult = await callLayoutParsingApi(buffer, fileName)
}
// Process the file directly without splitting
const finalApiResult = await callLayoutParsingApi(buffer, fileName)

Logger.info("API result received", {
layoutResultsCount: finalApiResult.layoutParsingResults?.length || 0,
Expand Down
1 change: 1 addition & 0 deletions server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
"drizzle-orm": "^0.44.5",
"drizzle-zod": "^0.8.3",
"fast-xml-parser": "^5.2.5",
"file-type": "^21.0.0",
"google-auth-library": "^9.14.0",
"googleapis": "^140.0.1",
"gpt-tokenizer": "^2.8.1",
Expand Down
Loading