Skip to content

Commit cdb860a

Browse files
mongodben (Ben Perlmutter) and nlarew (Nick Larew)
authored
(EAI-1015): Parse Braintrust results to benchmark reporting service format (#700)
* minimal viable functionality * fixes * Tests working * change output path * fix build err * Add model provider to case results * EvalCase data model tweaks * Initial support for multiple_choice * Support output for multiple_choice benchmark results --------- Co-authored-by: Ben Perlmutter <mongodben@mongodb.com> Co-authored-by: Nick Larew <nick.larew@mongodb.com>
1 parent ffe922f commit cdb860a

10 files changed

+801
-4
lines changed

packages/benchmarks/src/nlPromptResponse/NlQuestionAnswerEval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ export type NlPromptResponseEvalCaseInput = {
1313

1414
export type NlPromptResponseTag = string;
1515

16-
export type NlPromptResponseMetadata = Record<string, unknown>;
16+
export type NlPromptResponseMetadata = Record<string, unknown> &
17+
Partial<Omit<LlmOptions, "openAiClient">>;
1718

1819
export interface NlPromptResponseEvalCase
1920
extends EvalCase<

packages/benchmarks/src/nlPromptResponse/bin/techSupport/techSupportPromptCompletionBenchmark.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ async function main() {
6969
additionalMetadata: {
7070
judgeModelsConfig,
7171
...staticLlmOptions,
72+
model: model.label,
7273
},
7374
task: makeNlPromptCompletionTask({
7475
llmOptions,
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import { ObjectId } from "mongodb-rag-core/mongodb";
2+
import { OpenAI } from "mongodb-rag-core/openai";
3+
4+
type ChatCompletionMessageParam = OpenAI.Chat.ChatCompletionMessageParam;
5+
6+
/**
7+
This represents a single prompt that we pass to models for evaluation.
8+
*/
9+
export type BaseCase = {
10+
/*
11+
A unique identifier for the prompt.
12+
*/
13+
_id: ObjectId;
14+
15+
/*
16+
The type of prompt. This determines the format of the prompt, its expected response, and the type of metrics we use to evaluate the results.
17+
*/
18+
type: string;
19+
20+
/*
21+
A list of tags for the prompt that we can filter by.
22+
*/
23+
tags: string[];
24+
25+
/*
26+
Metadata about the prompt.
27+
*/
28+
metadata?: Record<string, unknown>;
29+
30+
/*
31+
A human readable label for the case. For most cases this will just be the natural language query.
32+
*/
33+
name: string;
34+
35+
/*
36+
The prompt messages that we pass to the model.
37+
*/
38+
prompt: ChatCompletionMessageParam[];
39+
40+
/*
41+
The expected response from the model. The format of this depends on the type of prompt.
42+
*/
43+
expected: string;
44+
45+
/*
46+
A list of results for the prompt.
47+
*/
48+
results: {
49+
[modelName: string]: Result;
50+
};
51+
};
52+
53+
/**
54+
This represents the result of running a prompt through a specific model.
55+
*/
56+
export type Result = {
57+
/*
58+
The name of the model that was used to generate the result.
59+
*/
60+
model: string;
61+
62+
/*
63+
The name of the company/lab that created the model.
64+
*/
65+
provider: string;
66+
67+
/*
68+
The date and time the result was generated.
69+
*/
70+
date: Date;
71+
72+
/*
73+
The response returned from the model.
74+
*/
75+
response: string;
76+
77+
/*
78+
Additional metadata about the result. For example, we could use this to map results back to their experiment name in Braintrust
79+
*/
80+
metadata?: Record<string, unknown>;
81+
82+
/*
83+
A list of evaluation metrics that we use to judge the quality of the result.
84+
*/
85+
metrics: {
86+
[metricName: string]: number;
87+
};
88+
};
89+
90+
/**
91+
This represents a multiple choice prompt. Used for evaluating models that can choose from a list of options.
92+
For example, the prompt may be an individual MongoDB University quiz question or skill assessment question.
93+
The response is a string containing one or more choices that correspond to the correct answer.
94+
*/
95+
export type MultipleChoiceCase = BaseCase & {
96+
type: "multiple_choice";
97+
};
98+
99+
/**
100+
This represents a natural language prompt. Used for evaluating models that can generate text.
101+
For example, the prompt may be a technical support question, product knowledge question, or general awareness question.
102+
The response is a single string that, ideally, matches the expected answer.
103+
*/
104+
export type NaturalLanguageCase = BaseCase & {
105+
type: "natural_language";
106+
};
107+
108+
/**
109+
This represents a code generation prompt. Used for evaluating models that can generate code.
110+
For example, the prompt may be a user query on a specific collection, with or without additional context like schemas, indexes, etc.
111+
The response is code representing a valid MongoDB query that returns the expected results.
112+
*/
113+
export type NaturalLanguageToCodeCase = BaseCase & {
114+
type: "natural_language_to_code";
115+
results: (Result & {
116+
/*
117+
The type of code we're generating, e.g. mongosh, Java Driver, etc.
118+
*/
119+
target: string;
120+
121+
/*
122+
The value returned after executing the generated code
123+
*/
124+
codeResult: unknown;
125+
})[];
126+
};
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import { reportBenchmarkResults } from "../reportBenchmarkResults";
2+
import { strict as assert } from "assert";
3+
import fs from "fs";
4+
import path from "path";
5+
import "dotenv/config";
6+
import { BSON } from "mongodb-rag-core/mongodb";
7+
const { EJSON } = BSON;
8+
9+
async function main() {
10+
const pathOut = path.join(
11+
__dirname,
12+
"testData",
13+
"multiple_choice_results.json"
14+
);
15+
16+
console.log(`Reporting multiple choice benchmark results to ${pathOut}`);
17+
18+
const apiKey = process.env.BRAINTRUST_API_KEY;
19+
assert(apiKey, "must have BRAINTRUST_API_KEY set");
20+
21+
const projectName = "mongodb-multiple-choice";
22+
23+
const cases = await reportBenchmarkResults({
24+
apiKey,
25+
projectName,
26+
experimentType: "multiple_choice",
27+
});
28+
console.log(`Reported ${cases.length} cases`);
29+
fs.writeFileSync(pathOut, EJSON.stringify(cases));
30+
}
31+
32+
main();
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import { reportBenchmarkResults } from "../reportBenchmarkResults";
2+
import { strict as assert } from "assert";
3+
import fs from "fs";
4+
import path from "path";
5+
import "dotenv/config";
6+
import { BSON } from "mongodb-rag-core/mongodb";
7+
const { EJSON } = BSON;
8+
9+
async function main() {
10+
const pathOut = path.join("testData", "tech_support_results.json");
11+
12+
console.log(`Reporting tech support benchmark results to ${pathOut}`);
13+
14+
const apiKey = process.env.BRAINTRUST_API_KEY;
15+
assert(apiKey, "must have BRAINTRUST_API_KEY set");
16+
17+
const projectName = "tech-support-prompt-completion";
18+
19+
const cases = await reportBenchmarkResults({
20+
apiKey,
21+
projectName,
22+
experimentType: "prompt_response",
23+
});
24+
console.log(`Reported ${cases.length} cases`);
25+
fs.writeFileSync(pathOut, EJSON.stringify(cases));
26+
}
27+
28+
main();

packages/benchmarks/src/reporting/getBraintrustExperimentSummary.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ export async function getBraintrustExperimentSummary({
1010
projectName,
1111
experimentName,
1212
apiKey,
13-
}: GetBraintrustExperimentSummary): Promise<unknown> {
13+
}: GetBraintrustExperimentSummary) {
1414
const experiment = await init(projectName, {
1515
experiment: experimentName,
1616
apiKey,

packages/benchmarks/src/reporting/listBraintrustExperiments.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ interface ListBraintrustExperimentsParams {
1515
export async function listBraintrustExperiments({
1616
apiKey,
1717
queryParams,
18-
}: ListBraintrustExperimentsParams): Promise<unknown> {
18+
}: ListBraintrustExperimentsParams): Promise<Experiment[]> {
1919
const url = new URL(`https://api.braintrust.dev/v1/experiment/`);
2020
if (queryParams) {
2121
for (const [key, value] of Object.entries(queryParams)) {
@@ -35,7 +35,7 @@ export async function listBraintrustExperiments({
3535
throw new Error(`Failed to list experiments: ${res.status}`);
3636
}
3737
const json = await res.json();
38-
return json as ListBraintrustExperimentsResponse;
38+
return (json as ListBraintrustExperimentsResponse).objects;
3939
}
4040

4141
/**

0 commit comments

Comments (0)