Skip to content

Commit cdb860a

Browse files
mongodben (Ben Perlmutter) and nlarew (Nick Larew)
authored
(EAI-1015): Parse Braintrust results to benchmark reporting service format (#700)
* minimal viable functionality * fixes * Tests working * change output path * fix build err * Add model provider to case results * EvalCase data model tweaks * Initial support for multiple_choice * Support output for multiple_choice benchmark results --------- Co-authored-by: Ben Perlmutter <mongodben@mongodb.com> Co-authored-by: Nick Larew <nick.larew@mongodb.com>
1 parent ffe922f commit cdb860a

10 files changed

+801
-4
lines changed

packages/benchmarks/src/nlPromptResponse/NlQuestionAnswerEval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ export type NlPromptResponseEvalCaseInput = {
1313

1414
export type NlPromptResponseTag = string;
1515

16-
export type NlPromptResponseMetadata = Record<string, unknown>;
16+
export type NlPromptResponseMetadata = Record<string, unknown> &
17+
Partial<Omit<LlmOptions, "openAiClient">>;
1718

1819
export interface NlPromptResponseEvalCase
1920
extends EvalCase<

packages/benchmarks/src/nlPromptResponse/bin/techSupport/techSupportPromptCompletionBenchmark.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ async function main() {
6969
additionalMetadata: {
7070
judgeModelsConfig,
7171
...staticLlmOptions,
72+
model: model.label,
7273
},
7374
task: makeNlPromptCompletionTask({
7475
llmOptions,
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import { ObjectId } from "mongodb-rag-core/mongodb";
2+
import { OpenAI } from "mongodb-rag-core/openai";
3+
4+
type ChatCompletionMessageParam = OpenAI.Chat.ChatCompletionMessageParam;
5+
6+
/**
7+
This represents a single prompt that we pass to models for evaluation.
8+
*/
9+
export type BaseCase = {
10+
/*
11+
A unique identifier for the prompt.
12+
*/
13+
_id: ObjectId;
14+
15+
/*
16+
The type of prompt. This determines the format of the prompt, its expected response, and the type of metrics we use to evaluate the results.
17+
*/
18+
type: string;
19+
20+
/*
21+
A list of tags for the prompt that we can filter by.
22+
*/
23+
tags: string[];
24+
25+
/*
26+
Metadata about the prompt.
27+
*/
28+
metadata?: Record<string, unknown>;
29+
30+
/*
31+
A human readable label for the case. For most cases this will just be the natural language query.
32+
*/
33+
name: string;
34+
35+
/*
36+
The prompt messages that we pass to the model.
37+
*/
38+
prompt: ChatCompletionMessageParam[];
39+
40+
/*
41+
The expected response from the model. The format of this depends on the type of prompt.
42+
*/
43+
expected: string;
44+
45+
/*
46+
A list of results for the prompt.
47+
*/
48+
results: {
49+
[modelName: string]: Result;
50+
};
51+
};
52+
53+
/**
54+
This represents the result of running a prompt through a specific model.
55+
*/
56+
export type Result = {
57+
/*
58+
The name of the model that was used to generate the result.
59+
*/
60+
model: string;
61+
62+
/*
63+
The name of the company/lab that created the model.
64+
*/
65+
provider: string;
66+
67+
/*
68+
The date and time the result was generated.
69+
*/
70+
date: Date;
71+
72+
/*
73+
The response returned from the model.
74+
*/
75+
response: string;
76+
77+
/*
78+
Additional metadata about the result. For example, we could use this to map results back to their experiment name in Braintrust
79+
*/
80+
metadata?: Record<string, unknown>;
81+
82+
/*
83+
A list of evaluation metrics that we use to judge the quality of the result.
84+
*/
85+
metrics: {
86+
[metricName: string]: number;
87+
};
88+
};
89+
90+
/**
91+
This represents a multiple choice prompt. Used for evaluating models that can choose from a list of options.
92+
For example, the prompt may be an individual MongoDB University quiz question or skill assessment question.
93+
The response is a string containing one or more choices that correspond to the correct answer.
94+
*/
95+
export type MultipleChoiceCase = BaseCase & {
96+
type: "multiple_choice";
97+
};
98+
99+
/**
100+
This represents a natural language prompt. Used for evaluating models that can generate text.
101+
For example, the prompt may be a technical support question, product knowledge question, or general awareness question.
102+
The response is a single string that, ideally, matches the expected answer.
103+
*/
104+
export type NaturalLanguageCase = BaseCase & {
105+
type: "natural_language";
106+
};
107+
108+
/**
109+
This represents a code generation prompt. Used for evaluating models that can generate code.
110+
For example, the prompt may be a user query on a specific collection, with or without additional context like schemas, indexes, etc.
111+
The response is code representing a valid MongoDB query that returns the expected results.
112+
*/
113+
export type NaturalLanguageToCodeCase = BaseCase & {
114+
type: "natural_language_to_code";
115+
results: (Result & {
116+
/*
117+
The type of code we're generating, e.g. mongosh, Java Driver, etc.
118+
*/
119+
target: string;
120+
121+
/*
122+
The value returned after executing the generated code
123+
*/
124+
codeResult: unknown;
125+
})[];
126+
};
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import { reportBenchmarkResults } from "../reportBenchmarkResults";
2+
import { strict as assert } from "assert";
3+
import fs from "fs";
4+
import path from "path";
5+
import "dotenv/config";
6+
import { BSON } from "mongodb-rag-core/mongodb";
7+
const { EJSON } = BSON;
8+
9+
async function main() {
10+
const pathOut = path.join(
11+
__dirname,
12+
"testData",
13+
"multiple_choice_results.json"
14+
);
15+
16+
console.log(`Reporting multiple choice benchmark results to ${pathOut}`);
17+
18+
const apiKey = process.env.BRAINTRUST_API_KEY;
19+
assert(apiKey, "must have BRAINTRUST_API_KEY set");
20+
21+
const projectName = "mongodb-multiple-choice";
22+
23+
const cases = await reportBenchmarkResults({
24+
apiKey,
25+
projectName,
26+
experimentType: "multiple_choice",
27+
});
28+
console.log(`Reported ${cases.length} cases`);
29+
fs.writeFileSync(pathOut, EJSON.stringify(cases));
30+
}
31+
32+
main();
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import { reportBenchmarkResults } from "../reportBenchmarkResults";
2+
import { strict as assert } from "assert";
3+
import fs from "fs";
4+
import path from "path";
5+
import "dotenv/config";
6+
import { BSON } from "mongodb-rag-core/mongodb";
7+
const { EJSON } = BSON;
8+
9+
async function main() {
10+
const pathOut = path.join("testData", "tech_support_results.json");
11+
12+
console.log(`Reporting tech support benchmark results to ${pathOut}`);
13+
14+
const apiKey = process.env.BRAINTRUST_API_KEY;
15+
assert(apiKey, "must have BRAINTRUST_API_KEY set");
16+
17+
const projectName = "tech-support-prompt-completion";
18+
19+
const cases = await reportBenchmarkResults({
20+
apiKey,
21+
projectName,
22+
experimentType: "prompt_response",
23+
});
24+
console.log(`Reported ${cases.length} cases`);
25+
fs.writeFileSync(pathOut, EJSON.stringify(cases));
26+
}
27+
28+
main();

packages/benchmarks/src/reporting/getBraintrustExperimentSummary.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ export async function getBraintrustExperimentSummary({
1010
projectName,
1111
experimentName,
1212
apiKey,
13-
}: GetBraintrustExperimentSummary): Promise<unknown> {
13+
}: GetBraintrustExperimentSummary) {
1414
const experiment = await init(projectName, {
1515
experiment: experimentName,
1616
apiKey,

packages/benchmarks/src/reporting/listBraintrustExperiments.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ interface ListBraintrustExperimentsParams {
1515
export async function listBraintrustExperiments({
1616
apiKey,
1717
queryParams,
18-
}: ListBraintrustExperimentsParams): Promise<unknown> {
18+
}: ListBraintrustExperimentsParams): Promise<Experiment[]> {
1919
const url = new URL(`https://api.braintrust.dev/v1/experiment/`);
2020
if (queryParams) {
2121
for (const [key, value] of Object.entries(queryParams)) {
@@ -35,7 +35,7 @@ export async function listBraintrustExperiments({
3535
throw new Error(`Failed to list experiments: ${res.status}`);
3636
}
3737
const json = await res.json();
38-
return json as ListBraintrustExperimentsResponse;
38+
return (json as ListBraintrustExperimentsResponse).objects;
3939
}
4040

4141
/**

0 commit comments

Comments (0)