Skip to content

Commit 271787f

Browse files
mongodben (Ben Perlmutter)
and others authored
(EAI-1038, EAI-1039, EAI-1040, EAI-1041): benchmark on product, docs and TS datasets (#743)
* add product knowledge and docs100 benchmarks
* DRY code
* Clean configs
* benchmark on more models
* inflation + add model heritage
* fix build err oopsies
* add marketing dataset
* add verified_response tag
* fix build err
---------
Co-authored-by: Ben Perlmutter <mongodben@mongodb.com>
1 parent cdb860a commit 271787f

26 files changed

+996
-139
lines changed

package-lock.json

Lines changed: 9 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/benchmarks/package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
"@ai-sdk/openai": "^1.3.6",
5252
"@supercharge/promise-pool": "^3.2.0",
5353
"ai": "^4.2.10",
54-
"autoevals": "^0.0.127",
54+
"autoevals": "^0.0.129",
5555
"csv-writer": "^1.6.0",
5656
"dotenv": "^16",
5757
"mongodb-chatbot-server": "*",
@@ -60,4 +60,4 @@
6060
"yaml": "^2.7.1",
6161
"zod": "^3.23.8"
6262
}
63-
}
63+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import { BenchmarkConfig } from "../../runNlPromptResponseBenchmark";
2+
3+
const projectName = "docs-100-prompt-completion";
4+
5+
export const docs100Config: BenchmarkConfig = {
6+
datasets: [
7+
{
8+
projectName,
9+
datasetName: "docs-100-prompt-completion",
10+
},
11+
],
12+
projectName,
13+
experimentBaseName: "docs-100",
14+
};
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import "dotenv/config";
2+
import {
3+
models,
4+
judgeModelsConfig,
5+
MAX_CONCURRENT_EXPERIMENTS,
6+
MAX_CONCURRENCY,
7+
EXPERIMENT_TYPE,
8+
BRAINTRUST_API_KEY,
9+
} from "../globalConfig";
10+
import { runNlPromptResponseBenchmark } from "../../runNlPromptResponseBenchmark";
11+
12+
import { docs100Config } from "./config";
13+
14+
runNlPromptResponseBenchmark({
15+
...docs100Config,
16+
models,
17+
judgeModelsConfig,
18+
experimentType: EXPERIMENT_TYPE,
19+
maxConcurrentPerExperiment: MAX_CONCURRENCY,
20+
maxConcurrentExperiments: MAX_CONCURRENT_EXPERIMENTS,
21+
braintrustApiKey: BRAINTRUST_API_KEY,
22+
});
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import { uploadDatasetToBraintrust } from "mongodb-rag-core/braintrust";
2+
import { docs100Config } from "./config";
3+
import { BRAINTRUST_ENV_VARS, assertEnvVars } from "mongodb-rag-core";
4+
import path from "path";
5+
import { createOpenAI } from "@ai-sdk/openai";
6+
import { getOpenAiEndpointAndApiKey, models } from "mongodb-rag-core/models";
7+
import { strict as assert } from "assert";
8+
import PromisePool from "@supercharge/promise-pool";
9+
import { loadDocs100QACsv, parseDocs100QARow } from "../../loadDocs100Dataset";
10+
11+
async function main() {
12+
const { BRAINTRUST_API_KEY } = assertEnvVars({
13+
...BRAINTRUST_ENV_VARS,
14+
});
15+
16+
const modelLabel = "gpt-4.1";
17+
const modelConfig = models.find((m) => m.label === modelLabel);
18+
assert(modelConfig, `Model ${modelLabel} not found`);
19+
20+
const openai = createOpenAI({
21+
...(await getOpenAiEndpointAndApiKey(modelConfig)),
22+
});
23+
const csvPath = path.join(
24+
__dirname,
25+
"..",
26+
"..",
27+
"..",
28+
"..",
29+
"testData",
30+
"docs_100_qa.csv"
31+
);
32+
33+
console.log(`Loading dataset from ${csvPath}`);
34+
const { results: dataset } = await PromisePool.withConcurrency(
35+
// Dividing by 3 b/c there are 3 concurrent llm calls
36+
modelConfig.maxConcurrency / 3
37+
)
38+
.for(loadDocs100QACsv(csvPath))
39+
.handleError((error, row) => {
40+
console.error(
41+
`Error processing row for question: ${row.Question}`,
42+
error
43+
);
44+
})
45+
.process(async (row) => {
46+
return await parseDocs100QARow(row, openai.languageModel(modelLabel));
47+
});
48+
49+
console.log(`Loaded ${dataset.length} records`);
50+
console.log(`Total number of records: ${dataset.length}`);
51+
const { datasets, projectName } = docs100Config;
52+
const res = await uploadDatasetToBraintrust({
53+
apiKey: BRAINTRUST_API_KEY,
54+
datasetName: datasets[0].datasetName,
55+
projectName,
56+
description:
57+
"Docs 100 prompt completion dataset. Created by Docs team, May 2025.",
58+
dataset,
59+
});
60+
console.log(res);
61+
}
62+
main();
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import { getModelsFromLabels } from "../../benchmarkModels";
2+
import { assertEnvVars } from "mongodb-rag-core";
3+
4+
const { BRAINTRUST_API_KEY } = assertEnvVars({
5+
BRAINTRUST_API_KEY: "",
6+
});
7+
export { BRAINTRUST_API_KEY };
8+
9+
export const EXPERIMENT_TYPE = "prompt-response";
10+
11+
export const MAX_CONCURRENT_EXPERIMENTS = 2;
12+
13+
// Have to set low to allow for judge token limits :(
14+
export const MAX_CONCURRENCY = 15;
15+
16+
export const judgeModelsConfig = getModelsFromLabels(["gpt-4.1"]);
17+
18+
export const models = getModelsFromLabels([
19+
"gpt-4.1",
20+
"gpt-4.1-mini",
21+
"gpt-4.1-nano",
22+
"claude-37-sonnet",
23+
"gpt-4o",
24+
"gpt-4o-mini",
25+
"claude-35-sonnet-v2",
26+
"claude-35-sonnet",
27+
"llama-3.1-70b",
28+
"llama-3.2-90b",
29+
"llama-3.3-70b",
30+
"o3-mini",
31+
"o3",
32+
"o4-mini",
33+
"gemini-2-flash",
34+
"gemini-2.0-flash-lite",
35+
"gemini-2.5-flash",
36+
"gemini-2.5-pro-preview-03-25",
37+
]);
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import { BenchmarkConfig } from "../../runNlPromptResponseBenchmark";
2+
3+
const projectName = "marketing-prompt-completion";
4+
5+
export const marketingConfig: BenchmarkConfig = {
6+
datasets: [
7+
{
8+
projectName,
9+
datasetName: "marketing-prompt-completion",
10+
},
11+
],
12+
projectName,
13+
experimentBaseName: "marketing",
14+
};
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import "dotenv/config";
2+
import {
3+
models,
4+
judgeModelsConfig,
5+
MAX_CONCURRENT_EXPERIMENTS,
6+
MAX_CONCURRENCY,
7+
EXPERIMENT_TYPE,
8+
BRAINTRUST_API_KEY,
9+
} from "../globalConfig";
10+
import { runNlPromptResponseBenchmark } from "../../runNlPromptResponseBenchmark";
11+
12+
import { marketingConfig } from "./config";
13+
14+
runNlPromptResponseBenchmark({
15+
...marketingConfig,
16+
models,
17+
judgeModelsConfig,
18+
experimentType: EXPERIMENT_TYPE,
19+
maxConcurrentPerExperiment: MAX_CONCURRENCY,
20+
maxConcurrentExperiments: MAX_CONCURRENT_EXPERIMENTS,
21+
braintrustApiKey: BRAINTRUST_API_KEY,
22+
});
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import { uploadDatasetToBraintrust } from "mongodb-rag-core/braintrust";
2+
import { marketingConfig } from "./config";
3+
import { BRAINTRUST_ENV_VARS, assertEnvVars } from "mongodb-rag-core";
4+
import path from "path";
5+
import { createOpenAI } from "@ai-sdk/openai";
6+
import { getOpenAiEndpointAndApiKey, models } from "mongodb-rag-core/models";
7+
import { strict as assert } from "assert";
8+
import PromisePool from "@supercharge/promise-pool";
9+
import {
10+
loadMarketingQACsv,
11+
parseMarketingQARow,
12+
} from "../../loadMarketingDataset";
13+
14+
async function main() {
15+
const { BRAINTRUST_API_KEY } = assertEnvVars({
16+
...BRAINTRUST_ENV_VARS,
17+
});
18+
19+
const modelLabel = "gpt-4.1";
20+
const modelConfig = models.find((m) => m.label === modelLabel);
21+
assert(modelConfig, `Model ${modelLabel} not found`);
22+
23+
const openai = createOpenAI({
24+
...(await getOpenAiEndpointAndApiKey(modelConfig)),
25+
});
26+
const csvPath = path.join(
27+
__dirname,
28+
"..",
29+
"..",
30+
"..",
31+
"..",
32+
"testData",
33+
"marketing_qa.csv"
34+
);
35+
36+
console.log(`Loading dataset from ${csvPath}`);
37+
const { results: dataset } = await PromisePool.withConcurrency(
38+
// Dividing by 3 b/c there are 3 concurrent llm calls
39+
modelConfig.maxConcurrency / 3
40+
)
41+
.for(loadMarketingQACsv(csvPath))
42+
.handleError((error, row) => {
43+
console.error(
44+
`Error processing row for question: ${row.Question}`,
45+
error
46+
);
47+
})
48+
.process(async (row) => {
49+
return await parseMarketingQARow(row, openai.languageModel(modelLabel));
50+
});
51+
52+
console.log(`Loaded ${dataset.length} records`);
53+
console.log(`Total number of records: ${dataset.length}`);
54+
const { datasets, projectName } = marketingConfig;
55+
const res = await uploadDatasetToBraintrust({
56+
apiKey: BRAINTRUST_API_KEY,
57+
datasetName: datasets[0].datasetName,
58+
projectName,
59+
description:
60+
"Marketing prompt completion dataset. Created by Marketing team, spring 2025.",
61+
dataset,
62+
});
63+
console.log(res);
64+
}
65+
main();
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import { BenchmarkConfig } from "../../runNlPromptResponseBenchmark";
2+
3+
const projectName = "product-knowledge-prompt-completion";
4+
5+
export const productKnowledgeConfig: BenchmarkConfig = {
6+
datasets: [
7+
{
8+
projectName,
9+
datasetName: "product-knowledge-prompt-completion",
10+
},
11+
],
12+
projectName,
13+
experimentBaseName: "product-knowledge",
14+
};

0 commit comments

Comments
 (0)