Skip to content

Commit e988c06

Browse files
nsarrazin and antoniora authored
Add metrics for models, tools, websearch (#1186)
* Add custom metrics for messages and conversations
* lint
* Add metrics for: model health, tools, websearch
* Add time window & age buckets to summaries
* Increase max age for tool use duration

---------

Co-authored-by: antoniora <antonio.ramos@adyen.com>
1 parent 309f226 commit e988c06

File tree

7 files changed

+209
-1
lines changed

7 files changed

+209
-1
lines changed

src/lib/server/metrics.ts

Lines changed: 142 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,39 @@
1-
import { collectDefaultMetrics, Registry } from "prom-client";
1+
import { collectDefaultMetrics, Registry, Counter, Summary } from "prom-client";
22
import express from "express";
33
import { logger } from "$lib/server/logger";
44
import { env } from "$env/dynamic/private";
5+
import type { Model } from "$lib/types/Model";
6+
import type { Tool } from "$lib/types/Tool";
7+
8+
interface Metrics {
9+
model: {
10+
conversationsTotal: Counter<Model["id"]>;
11+
messagesTotal: Counter<Model["id"]>;
12+
tokenCountTotal: Counter<Model["id"]>;
13+
timePerOutputToken: Summary<Model["id"]>;
14+
timeToFirstToken: Summary<Model["id"]>;
15+
latency: Summary<Model["id"]>;
16+
};
17+
18+
webSearch: {
19+
requestCount: Counter;
20+
pageFetchCount: Counter;
21+
pageFetchCountError: Counter;
22+
pageFetchDuration: Summary;
23+
embeddingDuration: Summary;
24+
};
25+
26+
tool: {
27+
toolUseCount: Counter<Tool["name"]>;
28+
toolUseCountError: Counter<Tool["name"]>;
29+
toolUseDuration: Summary<Tool["name"]>;
30+
timeToChooseTools: Summary;
31+
};
32+
}
533

634
export class MetricsServer {
735
private static instance: MetricsServer;
36+
private metrics: Metrics;
837

938
private constructor() {
1039
const app = express();
@@ -17,6 +46,114 @@ export class MetricsServer {
1746
const register = new Registry();
1847
collectDefaultMetrics({ register });
1948

49+
this.metrics = {
50+
model: {
51+
conversationsTotal: new Counter({
52+
name: "model_conversations_total",
53+
help: "Total number of conversations",
54+
labelNames: ["model"],
55+
registers: [register],
56+
}),
57+
messagesTotal: new Counter({
58+
name: "model_messages_total",
59+
help: "Total number of messages",
60+
labelNames: ["model"],
61+
registers: [register],
62+
}),
63+
tokenCountTotal: new Counter({
64+
name: "model_token_count_total",
65+
help: "Total number of tokens",
66+
labelNames: ["model"],
67+
registers: [register],
68+
}),
69+
timePerOutputToken: new Summary({
70+
name: "model_time_per_output_token_ms",
71+
help: "Time per output token in ms",
72+
labelNames: ["model"],
73+
registers: [register],
74+
maxAgeSeconds: 5 * 60,
75+
ageBuckets: 5,
76+
}),
77+
timeToFirstToken: new Summary({
78+
name: "model_time_to_first_token_ms",
79+
help: "Time to first token",
80+
labelNames: ["model"],
81+
registers: [register],
82+
maxAgeSeconds: 5 * 60,
83+
ageBuckets: 5,
84+
}),
85+
latency: new Summary({
86+
name: "model_latency_ms",
87+
help: "Total latency until end of answer",
88+
labelNames: ["model"],
89+
registers: [register],
90+
maxAgeSeconds: 5 * 60,
91+
ageBuckets: 5,
92+
}),
93+
},
94+
webSearch: {
95+
requestCount: new Counter({
96+
name: "web_search_request_count",
97+
help: "Total number of web search requests",
98+
registers: [register],
99+
}),
100+
pageFetchCount: new Counter({
101+
name: "web_search_page_fetch_count",
102+
help: "Total number of web search page fetches",
103+
registers: [register],
104+
}),
105+
pageFetchCountError: new Counter({
106+
name: "web_search_page_fetch_count_error",
107+
help: "Total number of web search page fetch errors",
108+
registers: [register],
109+
}),
110+
pageFetchDuration: new Summary({
111+
name: "web_search_page_fetch_duration_ms",
112+
help: "Web search page fetch duration",
113+
registers: [register],
114+
maxAgeSeconds: 5 * 60,
115+
ageBuckets: 5,
116+
}),
117+
embeddingDuration: new Summary({
118+
name: "web_search_embedding_duration_ms",
119+
help: "Web search embedding duration",
120+
registers: [register],
121+
maxAgeSeconds: 5 * 60,
122+
ageBuckets: 5,
123+
}),
124+
},
125+
tool: {
126+
toolUseCount: new Counter({
127+
name: "tool_use_count",
128+
help: "Total number of tool uses",
129+
labelNames: ["tool"],
130+
registers: [register],
131+
}),
132+
toolUseCountError: new Counter({
133+
name: "tool_use_count_error",
134+
help: "Total number of tool use errors",
135+
labelNames: ["tool"],
136+
registers: [register],
137+
}),
138+
toolUseDuration: new Summary({
139+
name: "tool_use_duration_ms",
140+
help: "Tool use duration",
141+
labelNames: ["tool"],
142+
registers: [register],
143+
maxAgeSeconds: 30 * 60, // longer duration since we use this to give feedback to the user
144+
ageBuckets: 5,
145+
}),
146+
timeToChooseTools: new Summary({
147+
name: "time_to_choose_tools_ms",
148+
help: "Time to choose tools",
149+
labelNames: ["model"],
150+
registers: [register],
151+
maxAgeSeconds: 5 * 60,
152+
ageBuckets: 5,
153+
}),
154+
},
155+
};
156+
20157
app.get("/metrics", (req, res) => {
21158
register.metrics().then((metrics) => {
22159
res.set("Content-Type", "text/plain");
@@ -40,4 +177,8 @@ export class MetricsServer {
40177

41178
return MetricsServer.instance;
42179
}
180+
181+
public static getMetrics(): Metrics {
182+
return MetricsServer.getInstance().metrics;
183+
}
43184
}

src/lib/server/textGeneration/tools.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import { logger } from "../logger";
1818
import { toolHasName } from "../tools/utils";
1919
import type { MessageFile } from "$lib/types/Message";
2020
import { mergeAsyncGenerators } from "$lib/utils/mergeAsyncGenerators";
21+
import { MetricsServer } from "../metrics";
2122

2223
function makeFilesPrompt(files: MessageFile[], fileMessageIndex: number): string {
2324
if (files.length === 0) {
@@ -62,6 +63,9 @@ async function* runTool(
6263
// Special case for directly_answer tool where we ignore
6364
if (toolHasName(directlyAnswer.name, tool)) return;
6465

66+
const startTime = Date.now();
67+
MetricsServer.getMetrics().tool.toolUseCount.inc({ tool: call.name });
68+
6569
yield {
6670
type: MessageUpdateType.Tool,
6771
subtype: MessageToolUpdateType.Call,
@@ -92,8 +96,14 @@ async function* runTool(
9296
};
9397
}
9498

99+
MetricsServer.getMetrics().tool.toolUseDuration.observe(
100+
{ tool: call.name },
101+
Date.now() - startTime
102+
);
103+
95104
return { ...toolResult, call } as ToolResult;
96105
} catch (e) {
106+
MetricsServer.getMetrics().tool.toolUseCountError.inc({ tool: call.name });
97107
yield {
98108
type: MessageUpdateType.Tool,
99109
subtype: MessageToolUpdateType.Error,
@@ -102,6 +112,7 @@ async function* runTool(
102112
};
103113
}
104114
} catch (cause) {
115+
MetricsServer.getMetrics().tool.toolUseCountError.inc({ tool: call.name });
105116
console.error(Error(`Failed while running tool ${call.name}`), { cause });
106117
return {
107118
call,
@@ -126,6 +137,8 @@ export async function* runTools(
126137
};
127138
});
128139

140+
const pickToolStartTime = Date.now();
141+
129142
// do the function calling bits here
130143
for await (const output of await endpoint({
131144
messages: messagesWithFilesPrompt,
@@ -163,6 +176,11 @@ export async function* runTools(
163176
}
164177
}
165178

179+
MetricsServer.getMetrics().tool.timeToChooseTools.observe(
180+
{ model: conv.model },
181+
Date.now() - pickToolStartTime
182+
);
183+
166184
const toolContext: BackendToolContext = { conv, messages, preprompt, assistant };
167185
const toolResults: (ToolResult | undefined)[] = yield* mergeAsyncGenerators(
168186
calls.map((call) => runTool(toolContext, tools, call))

src/lib/server/websearch/embed/embed.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { MetricsServer } from "$lib/server/metrics";
12
import type { WebSearchScrapedSource, WebSearchUsedSource } from "$lib/types/WebSearch";
23
import type { EmbeddingBackendModel } from "../../embeddingModels";
34
import { getSentenceSimilarity, innerProduct } from "../../sentenceSimilarity";
@@ -14,6 +15,8 @@ export async function findContextSources(
1415
prompt: string,
1516
embeddingModel: EmbeddingBackendModel
1617
) {
18+
const startTime = Date.now();
19+
1720
const sourcesMarkdownElems = sources.map((source) => flattenTree(source.page.markdownTree));
1821
const markdownElems = sourcesMarkdownElems.flat();
1922

@@ -76,5 +79,7 @@ export async function findContextSources(
7679
})
7780
.filter((contextSource) => contextSource.context.length > 0);
7881

82+
MetricsServer.getMetrics().webSearch.embeddingDuration.observe(Date.now() - startTime);
83+
7984
return contextSources;
8085
}

src/lib/server/websearch/runWebSearch.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import {
1717
makeSourcesUpdate,
1818
} from "./update";
1919
import { mergeAsyncGenerators } from "$lib/utils/mergeAsyncGenerators";
20+
import { MetricsServer } from "../metrics";
2021

2122
const MAX_N_PAGES_TO_SCRAPE = 8 as const;
2223
const MAX_N_PAGES_TO_EMBED = 5 as const;
@@ -31,6 +32,8 @@ export async function* runWebSearch(
3132
const createdAt = new Date();
3233
const updatedAt = new Date();
3334

35+
MetricsServer.getMetrics().webSearch.requestCount.inc();
36+
3437
try {
3538
const embeddingModel =
3639
embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;

src/lib/server/websearch/scrape/scrape.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,24 @@ import { spatialParser } from "./parser";
66
import { htmlToMarkdownTree } from "../markdown/tree";
77
import { timeout } from "$lib/utils/timeout";
88
import { makeErrorUpdate, makeGeneralUpdate } from "../update";
9+
import { MetricsServer } from "$lib/server/metrics";
910

1011
export const scrape = (maxCharsPerElem: number) =>
1112
async function* (
1213
source: WebSearchSource
1314
): AsyncGenerator<MessageWebSearchUpdate, WebSearchScrapedSource | undefined, undefined> {
1415
try {
16+
const startTime = Date.now();
17+
MetricsServer.getMetrics().webSearch.pageFetchCount.inc();
18+
1519
const page = await scrapeUrl(source.link, maxCharsPerElem);
20+
21+
MetricsServer.getMetrics().webSearch.pageFetchDuration.observe(Date.now() - startTime);
22+
1623
yield makeGeneralUpdate({ message: "Browsing webpage", args: [source.link] });
1724
return { ...source, page };
1825
} catch (e) {
26+
MetricsServer.getMetrics().webSearch.pageFetchCountError.inc();
1927
const message = e instanceof Error ? e.message : String(e);
2028
yield makeErrorUpdate({ message: "Failed to parse webpage", args: [message, source.link] });
2129
}

src/routes/conversation/+server.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import { defaultEmbeddingModel } from "$lib/server/embeddingModels";
1010
import { v4 } from "uuid";
1111
import { authCondition } from "$lib/server/auth";
1212
import { usageLimits } from "$lib/server/usageLimits";
13+
import { MetricsServer } from "$lib/server/metrics";
1314

1415
export const POST: RequestHandler = async ({ locals, request }) => {
1516
const body = await request.text();
@@ -115,6 +116,8 @@ export const POST: RequestHandler = async ({ locals, request }) => {
115116
...(values.fromShare ? { meta: { fromShareId: values.fromShare } } : {}),
116117
});
117118

119+
MetricsServer.getMetrics().model.conversationsTotal.inc({ model: values.model });
120+
118121
return new Response(
119122
JSON.stringify({
120123
conversationId: res.insertedId.toString(),

src/routes/conversation/[id]/+server.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import { buildSubtree } from "$lib/utils/tree/buildSubtree.js";
2121
import { addChildren } from "$lib/utils/tree/addChildren.js";
2222
import { addSibling } from "$lib/utils/tree/addSibling.js";
2323
import { usageLimits } from "$lib/server/usageLimits";
24+
import { MetricsServer } from "$lib/server/metrics";
2425
import { textGeneration } from "$lib/server/textGeneration";
2526
import type { TextGenerationContext } from "$lib/server/textGeneration/types";
2627

@@ -293,6 +294,8 @@ export async function POST({ request, locals, params, getClientAddress }) {
293294

294295
let doneStreaming = false;
295296

297+
let lastTokenTimestamp: undefined | Date = undefined;
298+
296299
// we now build the stream
297300
const stream = new ReadableStream({
298301
async start(controller) {
@@ -306,6 +309,25 @@ export async function POST({ request, locals, params, getClientAddress }) {
306309
if (event.type === MessageUpdateType.Stream) {
307310
if (event.token === "") return;
308311
messageToWriteTo.content += event.token;
312+
313+
// add to token total
314+
MetricsServer.getMetrics().model.tokenCountTotal.inc({ model: model?.id });
315+
316+
// if this is the first token, add to time to first token
317+
if (!lastTokenTimestamp) {
318+
MetricsServer.getMetrics().model.timeToFirstToken.observe(
319+
{ model: model?.id },
320+
Date.now() - promptedAt.getTime()
321+
);
322+
lastTokenTimestamp = new Date();
323+
}
324+
325+
// add to time per token
326+
MetricsServer.getMetrics().model.timePerOutputToken.observe(
327+
{ model: model?.id },
328+
Date.now() - (lastTokenTimestamp ?? promptedAt).getTime()
329+
);
330+
lastTokenTimestamp = new Date();
309331
}
310332

311333
// Set the title
@@ -321,6 +343,12 @@ export async function POST({ request, locals, params, getClientAddress }) {
321343
else if (event.type === MessageUpdateType.FinalAnswer) {
322344
messageToWriteTo.interrupted = event.interrupted;
323345
messageToWriteTo.content = initialMessageContent + event.text;
346+
347+
// add to latency
348+
MetricsServer.getMetrics().model.latency.observe(
349+
{ model: model?.id },
350+
Date.now() - promptedAt.getTime()
351+
);
324352
}
325353

326354
// Add file
@@ -428,6 +456,8 @@ export async function POST({ request, locals, params, getClientAddress }) {
428456
);
429457
}
430458

459+
const metrics = MetricsServer.getMetrics();
460+
metrics.model.messagesTotal.inc({ model: model?.id });
431461
// Todo: maybe we should wait for the message to be saved before ending the response - in case of errors
432462
return new Response(stream, {
433463
headers: {

0 commit comments

Comments (0)