Merged
Changes from 6 commits
182 changes: 179 additions & 3 deletions server/ai/context.ts
@@ -32,6 +32,152 @@ import {
getSortedScoredImageChunks,
} from "@xyne/vespa-ts/mappers"
import type { UserMetadataType } from "@/types"
import { querySheetChunks } from "@/lib/duckdb"
import { chunkSheetWithHeaders } from "@/sheetChunk"

// Utility to extract the shared header row from sheet chunks and strip it from each data chunk
const extractHeaderAndDataChunks = (
chunks_summary: (string | { chunk: string; score: number; index: number })[] | undefined,
matchfeatures?: any
): {
chunks_summary: (string | { chunk: string; score: number; index: number })[];
matchfeatures?: any;
} => {
if (!chunks_summary || chunks_summary.length === 0) {
return { chunks_summary: [], matchfeatures };
}

// Find the header from the first chunk
let headerChunk = '';
if (chunks_summary.length > 0) {
const firstChunk = typeof chunks_summary[0] === "string" ? chunks_summary[0] : chunks_summary[0].chunk;
const lines = firstChunk.split('\n');
if (lines.length > 0 && lines[0].includes('\t')) {
headerChunk = lines[0]; // Extract the header line
}
}

// Process all chunks: remove header from each and keep only data rows
const processedChunks: (string | { chunk: string; score: number; index: number })[] = [];
let newMatchfeatures = matchfeatures;

// Add the header as the first chunk if found, using the same structure as the original chunks
if (headerChunk) {
if (typeof chunks_summary[0] === "string") {
processedChunks.push(headerChunk);
} else {
processedChunks.push({
chunk: headerChunk,
score: 1,
index: 0,
});
}

// Update matchfeatures to include the header chunk score
if (newMatchfeatures) {
const existingCells = newMatchfeatures.chunk_scores?.cells || {};
const scores = Object.values(existingCells) as number[];
const maxScore = scores.length > 0 ? Math.max(...scores) : 0;
// Create new chunk_scores that match the new chunks
const newChunkScores: Record<string, number> = {}
newChunkScores["0"] = maxScore + 1
Object.entries(existingCells).forEach(([idx, score]) => {
newChunkScores[(parseInt(idx) + 1).toString()] = score as number
})

newMatchfeatures = {
...newMatchfeatures,
chunk_scores: {
cells: newChunkScores
}
};
}
}

// Process each original chunk: remove header and add data rows
for (let i = 0; i < chunks_summary.length; i++) {
const originalChunk = chunks_summary[i];
const chunkContent = typeof originalChunk === "string" ? originalChunk : originalChunk.chunk;
const lines = chunkContent.split('\n');

// Skip the first line (header) and keep only data rows
const dataRows = lines.slice(1).filter(line => line.trim().length > 0);
if (dataRows.length > 0) {
const dataContent = dataRows.join('\n');

if (typeof originalChunk === "string") {
processedChunks.push(dataContent);
} else {
processedChunks.push({
chunk: dataContent,
score: originalChunk.score,
index: originalChunk.index
});
}
}
}

return { chunks_summary: processedChunks, matchfeatures: newMatchfeatures };
};
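// Illustrative example (not part of this change): assuming tab-separated sheet chunks that
// each repeat the header row, the transformation looks roughly like this:
//   input:  ["Name\tAge\nAlice\t30", "Name\tAge\nBob\t25"]
//   output: ["Name\tAge", "Alice\t30", "Bob\t25"]
// The header becomes chunk index 0 and is scored above the existing maximum so it survives
// any later score-based truncation; all existing chunk_scores indices shift up by one.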

// Utility function to process sheet queries for spreadsheet files
const processSheetQuery = async (
chunks_summary: (string | { chunk: string; score: number; index: number })[] | undefined,
query: string,
matchfeatures: any
): Promise<{
chunks_summary: { chunk: string; score: number; index: number }[];
matchfeatures: any;
maxSummaryChunks: number;
} | null> => {
const duckDBResult = await querySheetChunks(
chunks_summary?.map((c) => typeof c === "string" ? c : c.chunk) || [],
query
)

// If the DuckDB query failed (null means the query was not metric-related or SQL generation failed), return null to fall back to the original approach
if (!duckDBResult) {
return null;
}

// Create metadata chunk with query information (excluding data)
const metadataChunk = JSON.stringify({
assumptions: duckDBResult.assumptions,
schema_fragment: duckDBResult.schema_fragment
}, null, 2)

// Use chunkSheetWithHeaders to chunk the 2D array data
const dataChunks = chunkSheetWithHeaders(duckDBResult.data.rows, {headerRows: 1})

// Combine metadata chunk with data chunks
const allChunks = [metadataChunk, ...dataChunks]

const newChunksSummary = allChunks.map((c, idx) => ({chunk: c, score: 0, index: idx}))

// Update matchfeatures to correspond to the new chunks
let newMatchfeatures = matchfeatures
if (matchfeatures) {
// Create new chunk_scores that match the new chunks
const newChunkScores: Record<string, number> = {}
allChunks.forEach((_, idx) => {
newChunkScores[idx.toString()] = 0 // All new chunks get score 0
})

// Update the matchfeatures with new chunk_scores
newMatchfeatures = {
...matchfeatures,
chunk_scores: {
cells: newChunkScores
}
}
}

return {
chunks_summary: newChunksSummary,
matchfeatures: newMatchfeatures,
maxSummaryChunks: allChunks.length
}
}
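// Illustrative sketch (assumptions only, not part of this diff): for a metric-style query such
// as "total revenue per region", the rewritten context would look roughly like
//   [ JSON metadata chunk (assumptions + schema_fragment) at index 0,
//     data chunks produced by chunkSheetWithHeaders from the DuckDB result rows, ... ]
// with every chunk scored 0, so downstream ranking treats the sheet-derived context uniformly.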

// Utility to capitalize the first letter of a string
const capitalize = (str: string) => str.charAt(0).toUpperCase() + str.slice(1)
@@ -573,7 +719,7 @@ const constructCollectionFileContext = (
isMsgWithSources?: boolean,
): string => {

if ((!maxSummaryChunks && !isSelectedFiles) || isMsgWithSources) {
if ((!maxSummaryChunks && !isSelectedFiles)) {
maxSummaryChunks = fields.chunks_summary?.length
}
let chunks: ScoredChunk[] = []
@@ -738,13 +884,43 @@ export const answerColoredContextMap = (
}

type AiContext = string
export const answerContextMap = (
export const answerContextMap = async (
searchResult: VespaSearchResults,
userMetadata: UserMetadataType,
maxSummaryChunks?: number,
isSelectedFiles?: boolean,
isMsgWithSources?: boolean,
): AiContext => {
query?: string,
): Promise<AiContext> => {
if(searchResult.fields.sddocname === fileSchema || searchResult.fields.sddocname === dataSourceFileSchema || searchResult.fields.sddocname === KbItemsSchema || searchResult.fields.sddocname === mailAttachmentSchema) {
let mimeType
if(searchResult.fields.sddocname === mailAttachmentSchema) {
mimeType = searchResult.fields.fileType
} else {
mimeType = searchResult.fields.mimeType
}
if(mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ||
mimeType === "application/vnd.ms-excel" ||
mimeType === "text/csv") {
const result = extractHeaderAndDataChunks(searchResult.fields.chunks_summary, searchResult.fields.matchfeatures);
searchResult.fields.chunks_summary = result.chunks_summary;
if (result.matchfeatures) {
searchResult.fields.matchfeatures = result.matchfeatures;
}

if (query) {
const sheetResult = await processSheetQuery(searchResult.fields.chunks_summary, query, searchResult.fields.matchfeatures)
if (sheetResult) {
const { chunks_summary, matchfeatures, maxSummaryChunks: newMaxSummaryChunks } = sheetResult
searchResult.fields.chunks_summary = chunks_summary
searchResult.fields.matchfeatures = matchfeatures
maxSummaryChunks = newMaxSummaryChunks
} else {
maxSummaryChunks = Math.min(searchResult.fields.chunks_summary?.length || 0, 100)
}
}
}
}
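// Summary of the flow above (comment added for clarity, not in the original diff): for
// spreadsheet-like MIME types the chunks are first normalized by extractHeaderAndDataChunks;
// if a query is present, processSheetQuery may replace them with DuckDB-derived chunks,
// falling back to a cap of 100 summary chunks when it cannot.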
if (searchResult.fields.sddocname === fileSchema) {
return constructFileContext(
searchResult.fields,
60 changes: 35 additions & 25 deletions server/api/chat/agents.ts
@@ -135,6 +135,7 @@ import { getModelValueFromLabel } from "@/ai/modelConfig"
import {
buildContext,
buildUserQuery,
expandSheetIds,
getThreadContext,
isContextSelected,
UnderstandMessageAndAnswer,
@@ -530,13 +531,21 @@ const checkAndYieldCitationsForAgent = async function* (
}
}

const vespaResultToMinimalAgentFragment = (
const vespaResultToMinimalAgentFragment = async (
child: VespaSearchResult,
idx: number,
userMetadata: UserMetadataType,
): MinimalAgentFragment => ({
query: string,
): Promise<MinimalAgentFragment> => ({
id: `${(child.fields as any)?.docId || `Fragment_id_${idx}`}`,
content: answerContextMap(child as VespaSearchResults, userMetadata, 0, true),
content: await answerContextMap(
child as VespaSearchResults,
userMetadata,
0,
true,
undefined,
query,
),
source: searchToCitation(child as VespaSearchResults),
confidence: 1.0,
})
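// Hypothetical usage (query string is illustrative only): the fragment builder is now async and
// threads the user query through to answerContextMap so sheet-aware context can be built, e.g.
//   const fragment = await vespaResultToMinimalAgentFragment(child, 0, userMetadata, "q3 revenue by region")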
@@ -839,15 +848,12 @@ export const MessageWithToolsApi = async (c: Context) => {
}

const attachmentMetadata = parseAttachmentMetadata(c)
const attachmentFileIds = attachmentMetadata.map(
(m: AttachmentMetadata) => m.fileId,
)
const imageAttachmentFileIds = attachmentMetadata
.filter((m) => m.isImage)
.map((m) => m.fileId)
const nonImageAttachmentFileIds = attachmentMetadata
.filter((m) => !m.isImage)
.map((m) => m.fileId)
.flatMap((m) => expandSheetIds(m.fileId))
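// Assumption based on the name (definition not shown in this diff): expandSheetIds presumably
// expands a spreadsheet attachment's fileId into one id per sheet, hence map -> flatMap here.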
let attachmentStorageError: Error | null = null

const contextExtractionSpan = initSpan.startSpan("context_extraction")
Expand All @@ -866,7 +872,7 @@ export const MessageWithToolsApi = async (c: Context) => {
extractedInfo?.totalValidFileIdsFromLinkCount
loggerWithChild({ email: email }).info(`Extracted ${fileIds} extractedInfo`)
loggerWithChild({ email: email }).info(
`Total attachment files received: ${attachmentFileIds.length}`,
`Total attachment files received: ${attachmentMetadata.length}`,
)
const hasReferencedContext = fileIds && fileIds.length > 0
contextExtractionSpan.setAttribute("file_ids_count", fileIds?.length || 0)
@@ -1521,11 +1527,13 @@ export const MessageWithToolsApi = async (c: Context) => {
if (results?.root?.children && results.root.children.length > 0) {
const contextPromises = results?.root?.children?.map(
async (v, i) => {
let content = answerContextMap(
let content = await answerContextMap(
v as VespaSearchResults,
userMetadata,
0,
true,
undefined,
message,
)
const chatContainerFields =
isChatContainerFields(v.fields) &&
@@ -1593,22 +1601,24 @@ export const MessageWithToolsApi = async (c: Context) => {
planningContext += "\n" + buildContext(threadContexts, 10, userMetadata)
}

gatheredFragments = results.root.children.map(
(child: VespaSearchResult, idx) =>
vespaResultToMinimalAgentFragment(child, idx, userMetadata),
gatheredFragments = await Promise.all(
results.root.children.map(
async (child: VespaSearchResult, idx) =>
await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message),
)
)
if (chatContexts.length > 0) {
gatheredFragments.push(
...chatContexts.map((child, idx) =>
vespaResultToMinimalAgentFragment(child, idx, userMetadata),
),
...(await Promise.all(chatContexts.map(async (child, idx) =>
await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message),
))),
)
}
if (threadContexts.length > 0) {
gatheredFragments.push(
...threadContexts.map((child, idx) =>
vespaResultToMinimalAgentFragment(child, idx, userMetadata),
),
...(await Promise.all(threadContexts.map(async (child, idx) =>
await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message),
))),
)
}
const parseSynthesisOutput = await performSynthesis(
Expand Down Expand Up @@ -2881,9 +2891,9 @@ export const AgentMessageApiRagOff = async (c: Context) => {
chunksSpan.end()
if (allChunks?.root?.children) {
const startIndex = 0
fragments = allChunks.root.children.map((child, idx) =>
vespaResultToMinimalAgentFragment(child, idx, userMetadata),
)
fragments = await Promise.all(allChunks.root.children.map(async (child, idx) =>
await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message)
))
context = answerContextMapFromFragments(
fragments,
maxDefaultSummary,
@@ -3149,9 +3159,9 @@ export const AgentMessageApiRagOff = async (c: Context) => {
if (docIds.length > 0) {
const allChunks = await GetDocumentsByDocIds(docIds, chunksSpan)
if (allChunks?.root?.children) {
fragments = allChunks.root.children.map((child, idx) =>
vespaResultToMinimalAgentFragment(child, idx, userMetadata),
)
fragments = await Promise.all(allChunks.root.children.map(async (child, idx) =>
await vespaResultToMinimalAgentFragment(child, idx, userMetadata, message),
))
context = answerContextMapFromFragments(
fragments,
maxDefaultSummary,
@@ -3365,7 +3375,7 @@ export const AgentMessageApi = async (c: Context) => {
.map((m) => m.fileId)
const nonImageAttachmentFileIds = attachmentMetadata
.filter((m) => !m.isImage)
.map((m) => m.fileId)
.flatMap((m) => expandSheetIds(m.fileId))
let attachmentStorageError: Error | null = null
let {
message,