Skip to content

Commit 2d9a2fb

Browse files
evals CLI update (#543)
* allow for concurrency, env, trials to be passed in through the CLI * prettier ignore .json
1 parent 5c2e887 commit 2d9a2fb

File tree

2 files changed

+73
-46
lines changed

2 files changed

+73
-46
lines changed

evals/args.ts

Lines changed: 57 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,58 @@
11
import process from "process";
22
import { EvalCategorySchema } from "@/types/evals";
33

4-
// Extract command-line arguments passed to this script.
5-
const args = process.argv.slice(2);
4+
const rawArgs = process.argv.slice(2);
5+
6+
const parsedArgs: {
7+
env?: string;
8+
trials?: number;
9+
concurrency?: number;
10+
extractMethod?: string;
11+
leftover: string[];
12+
} = {
13+
leftover: [],
14+
};
15+
16+
for (const arg of rawArgs) {
17+
if (arg.startsWith("env=")) {
18+
parsedArgs.env = arg.split("=")[1]?.toLowerCase();
19+
} else if (arg.startsWith("trials=")) {
20+
const val = parseInt(arg.split("=")[1], 10);
21+
if (!isNaN(val)) {
22+
parsedArgs.trials = val;
23+
}
24+
} else if (arg.startsWith("concurrency=")) {
25+
const val = parseInt(arg.split("=")[1], 10);
26+
if (!isNaN(val)) {
27+
parsedArgs.concurrency = val;
28+
}
29+
} else if (arg.startsWith("--extract-method=")) {
30+
parsedArgs.extractMethod = arg.split("=")[1];
31+
} else {
32+
parsedArgs.leftover.push(arg);
33+
}
34+
}
35+
36+
/** Map the CLI `env=` value ("browserbase" or "local") onto EVAL_ENV; any other value leaves EVAL_ENV unchanged */
37+
if (parsedArgs.env === "browserbase") {
38+
process.env.EVAL_ENV = "BROWSERBASE";
39+
} else if (parsedArgs.env === "local") {
40+
process.env.EVAL_ENV = "LOCAL";
41+
}
42+
43+
if (parsedArgs.trials !== undefined) {
44+
process.env.EVAL_TRIAL_COUNT = String(parsedArgs.trials);
45+
}
46+
if (parsedArgs.concurrency !== undefined) {
47+
process.env.EVAL_MAX_CONCURRENCY = String(parsedArgs.concurrency);
48+
}
49+
50+
const extractMethod = parsedArgs.extractMethod || "domExtract";
51+
process.env.EXTRACT_METHOD = extractMethod;
52+
53+
const useTextExtract = extractMethod === "textExtract";
54+
const useAccessibilityTree = extractMethod === "accessibilityTree";
655

7-
/**
8-
* The default categories of evaluations to run if none is specified.
9-
* These categories represent different styles or types of tasks.
10-
*/
1156
const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
1257
? process.env.EVAL_CATEGORIES.split(",")
1358
: [
@@ -19,45 +64,17 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
1964
"text_extract",
2065
];
2166

22-
/**
23-
* Determine which extraction method to use for tasks that involve extraction.
24-
* By default, "domExtract" is used. However, if a `--extract-method=<method>`
25-
* argument is provided, it will override the default.
26-
*/
27-
let extractMethod = "domExtract";
28-
const extractMethodArg = args.find((arg) =>
29-
arg.startsWith("--extract-method="),
30-
);
31-
if (extractMethodArg) {
32-
extractMethod = extractMethodArg.split("=")[1];
33-
}
34-
35-
// Set the extraction method in the process environment so tasks can reference it.
36-
process.env.EXTRACT_METHOD = extractMethod;
37-
const useTextExtract = process.env.EXTRACT_METHOD === "textExtract";
38-
const useAccessibilityTree = process.env.EXTRACT_METHOD === "accessibilityTree";
39-
40-
/**
41-
* Variables for filtering which tasks to run:
42-
* - `filterByCategory`: if provided, only tasks that belong to this category will be run.
43-
* - `filterByEvalName`: if provided, only the task with this name will be run.
44-
*/
67+
// Finally, interpret the leftover arguments: either "category <name>" or a single eval name
4568
let filterByCategory: string | null = null;
4669
let filterByEvalName: string | null = null;
4770

48-
/**
49-
* Check the first argument:
50-
* - If it is "category", the next argument should be the category name.
51-
* - Otherwise, assume it is a specific evaluation (task) name.
52-
*/
53-
if (args.length > 0) {
54-
if (args[0].toLowerCase() === "category") {
55-
filterByCategory = args[1];
71+
if (parsedArgs.leftover.length > 0) {
72+
if (parsedArgs.leftover[0].toLowerCase() === "category") {
73+
filterByCategory = parsedArgs.leftover[1];
5674
if (!filterByCategory) {
5775
console.error("Error: Category name not specified.");
5876
process.exit(1);
5977
}
60-
// Validate that the category is one of the known ones.
6178
try {
6279
EvalCategorySchema.parse(filterByCategory);
6380
} catch {
@@ -67,8 +84,8 @@ if (args.length > 0) {
6784
process.exit(1);
6885
}
6986
} else {
70-
// Otherwise, treat it as a filter by evaluation name.
71-
filterByEvalName = args[0];
87+
// If leftover[0] is not "category", interpret it as a task/eval name
88+
filterByEvalName = parsedArgs.leftover[0];
7289
}
7390
}
7491

evals/index.eval.ts

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,25 +15,35 @@
1515
import fs from "fs";
1616
import path from "path";
1717
import process from "process";
18-
import { env } from "./env";
19-
import { generateExperimentName } from "./utils";
20-
import { exactMatch, errorMatch } from "./scoring";
21-
import { tasksByName, MODELS } from "./taskConfig";
2218
import {
2319
filterByCategory,
2420
filterByEvalName,
2521
useTextExtract,
2622
useAccessibilityTree,
2723
} from "./args";
24+
25+
import { generateExperimentName } from "./utils";
26+
import { exactMatch, errorMatch } from "./scoring";
27+
import { tasksByName, MODELS } from "./taskConfig";
2828
import { Eval } from "braintrust";
2929
import { EvalFunction, SummaryResult, Testcase } from "@/types/evals";
3030
import { EvalLogger } from "./logger";
3131
import { AvailableModel } from "@/dist";
32+
import { env } from "./env";
3233
import dotenv from "dotenv";
3334
dotenv.config();
3435

35-
const MAX_CONCURRENCY = 20;
36-
const TRIAL_COUNT = 5;
36+
/**
37+
* Read max concurrency and trial count from environment variables set in args.ts.
38+
* Fall back to the defaults (20 and 5) when they are not set; a set but non-numeric value is not validated here.
39+
*/
40+
const MAX_CONCURRENCY = process.env.EVAL_MAX_CONCURRENCY
41+
? parseInt(process.env.EVAL_MAX_CONCURRENCY, 10)
42+
: 20;
43+
44+
const TRIAL_COUNT = process.env.EVAL_TRIAL_COUNT
45+
? parseInt(process.env.EVAL_TRIAL_COUNT, 10)
46+
: 5;
3747

3848
/**
3949
* generateSummary:

0 commit comments

Comments
 (0)