
Commit 0047d3f

refactor: Rename rubric to model_judge (#87)
1 parent b4bfd6c commit 0047d3f

14 files changed: +49 -75 lines changed
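Across the suite configs, the rename is a one-line change: every other field in the grader block stays the same and only the `kind` value switches from `rubric` to `model_judge`. An illustrative sketch of an updated grader block (field values mirror the example suites below; this is not a new file in the commit):

```yaml
graders:
  quality:
    kind: model_judge      # was: kind: rubric
    prompt_path: rubric.txt
    model: gpt-5-mini
    temperature: 0.0
```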

examples/multi-model-simple-rubric-grader/suite.yaml
Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ target:
     - openai/gpt-5-mini
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     prompt_path: rubric.txt
     model: gpt-5-mini
     temperature: 0.0

examples/simple-rubric-grader/README.md
Lines changed: 3 additions & 3 deletions

@@ -75,7 +75,7 @@ The suite configuration declares which variables are required:
 ```yaml
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     prompt_path: rubric.txt
     rubric_vars:
       - reference_ascii
@@ -138,7 +138,7 @@ IMPORTANT: Make sure to clearly express your rationale and scoring.
 ```yaml
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     prompt_path: rubric.txt
     model: claude-haiku-4-5-20251001
     temperature: 0.0
@@ -157,7 +157,7 @@ Combine rubric and tool graders for comprehensive evaluation:
 ```yaml
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     display_name: "rubric score"
     prompt_path: rubric.txt
     model: gpt-5-mini

examples/simple-rubric-grader/suite.ascii-only-accuracy.yaml
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ target:
   base_url: http://localhost:8283
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
    display_name: "rubric score"
     prompt_path: rubric.txt
     model: gpt-5-mini

examples/simple-rubric-grader/suite.two-metrics.yaml
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ target:
   base_url: http://localhost:8283
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     display_name: "rubric score"
     prompt_path: rubric.txt
     model: gpt-5-mini

examples/simple-rubric-grader/suite.yaml
Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ target:
   base_url: http://localhost:8283
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     display_name: "rubric score"
     prompt_path: rubric.txt
     model: claude-haiku-4-5-20251001

letta-leaderboard/core-memory-update-agent/suites/core-memory-update.yaml
Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@ target:

 graders:
   contains_check:
-    kind: rubric
+    kind: model_judge
     prompt_path: ../rubric.txt
     model: gpt-5-mini
     temperature: 0.0

letta-leaderboard/filesystem-agent/suites/filesystem.yaml
Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ target:

 graders:
   rubric_check:
-    kind: rubric
+    kind: model_judge
     prompt_path: ../rubric.txt
     model: gpt-5-mini
     temperature: 0.0

letta_evals/cli.py
Lines changed: 1 addition & 1 deletion

@@ -246,7 +246,7 @@ def list_graders():
         table.add_row(name, "tool")

     console.print(table)
-    console.print("\n[dim]You can also use 'rubric' graders with custom prompts[/dim]")
+    console.print("\n[dim]You can also use 'model_judge' or 'letta_judge' graders with custom prompts[/dim]")


 def display_results(result: RunnerResult, verbose: bool = False, cached_mode: bool = False):
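The updated hint names both judge kinds. Based on the validation rules in letta_evals/models.py (next section), a `letta_judge` grader is configured much like a `model_judge` one, except that model/temperature/provider must be left unset; a minimal hypothetical snippet:

```yaml
graders:
  quality:
    kind: letta_judge
    prompt_path: rubric.txt
    # agent_file is optional; per runner.py, a default judge agent file is used when omitted
```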

letta_evals/models.py
Lines changed: 19 additions & 31 deletions

@@ -74,22 +74,22 @@ def __init__(self, **data):
 class GraderSpec(BaseModel):
     """Grader configuration for evaluation."""

-    kind: GraderKind = Field(description="Type of grader (tool or rubric)")
+    kind: GraderKind = Field(description="Type of grader (tool, model_judge, or letta_judge)")

     # Optional display name for UI/CLI output
     display_name: Optional[str] = Field(default=None, description="Human-friendly name for this metric")

     function: Optional[str] = Field(default=None, description="Name of grading function for tool grader")

-    prompt: Optional[str] = Field(default=None, description="Rubric prompt for LLM judge")
-    prompt_path: Optional[Path] = Field(default=None, description="Path to file containing rubric prompt")
-    model: Optional[str] = Field(default="gpt-4o-mini", description="LLM model to use for rubric grading")
-    temperature: Optional[float] = Field(default=0.0, description="Temperature for LLM judge")
-    provider: Optional[LLMProvider] = Field(default=LLMProvider.OPENAI, description="LLM provider for rubric grading")
-    max_retries: Optional[int] = Field(default=5, description="Maximum number of retries for rubric grading")
-    timeout: Optional[float] = Field(default=120.0, description="Timeout for rubric grading in seconds")
+    prompt: Optional[str] = Field(default=None, description="Prompt for model judge or letta judge")
+    prompt_path: Optional[Path] = Field(default=None, description="Path to file containing prompt")
+    model: Optional[str] = Field(default="gpt-4o-mini", description="LLM model to use for model judge")
+    temperature: Optional[float] = Field(default=0.0, description="Temperature for model judge")
+    provider: Optional[LLMProvider] = Field(default=LLMProvider.OPENAI, description="LLM provider for model judge")
+    max_retries: Optional[int] = Field(default=5, description="Maximum number of retries for model judge")
+    timeout: Optional[float] = Field(default=120.0, description="Timeout for model judge in seconds")
     rubric_vars: Optional[List[str]] = Field(
-        default=None, description="List of required custom variables for rubric substitution"
+        default=None, description="List of required custom variables for prompt substitution"
     )

     # Agent-based judge fields
@@ -115,25 +115,13 @@ def __init__(self, **data):
             if not self.function:
                 raise ValueError("Tool grader requires function name")
             if self.rubric_vars:
-                raise ValueError("Tool grader cannot use rubric_vars (only available for rubric graders)")
-        elif self.kind == GraderKind.RUBRIC:
-            # check if agent-based or LLM-based judge
-            if self.agent_file:
-                # agent-based judge validation
-                if not self.prompt and not self.prompt_path:
-                    raise ValueError("Agent judge requires either prompt or prompt_path for rubric text")
-                if self.prompt and self.prompt_path:
-                    raise ValueError("Agent judge cannot have both prompt and prompt_path")
-                if self.model != "gpt-4o-mini" or self.temperature != 0.0 or self.provider != LLMProvider.OPENAI:
-                    raise ValueError(
-                        "Agent judge should not specify model/temperature/provider (those are only for LLM judges)"
-                    )
-            else:
-                # LLM-based judge validation
-                if not self.prompt and not self.prompt_path:
-                    raise ValueError("Rubric grader requires either prompt or prompt_path")
-                if self.prompt and self.prompt_path:
-                    raise ValueError("Rubric grader cannot have both prompt and prompt_path")
+                raise ValueError("Tool grader cannot use rubric_vars (only available for model_judge and letta_judge)")
+        elif self.kind == GraderKind.MODEL_JUDGE:
+            # model judge validation
+            if not self.prompt and not self.prompt_path:
+                raise ValueError("Model judge requires either prompt or prompt_path")
+            if self.prompt and self.prompt_path:
+                raise ValueError("Model judge cannot have both prompt and prompt_path")

             # load prompt from file if needed
             if self.prompt_path:
@@ -142,7 +130,7 @@ def __init__(self, **data):
         elif self.kind == GraderKind.LETTA_JUDGE:
             # letta judge validation
             if not self.prompt and not self.prompt_path:
-                raise ValueError("Letta judge requires either prompt or prompt_path for rubric text")
+                raise ValueError("Letta judge requires either prompt or prompt_path")
             if self.prompt and self.prompt_path:
                 raise ValueError("Letta judge cannot have both prompt and prompt_path")

@@ -153,10 +141,10 @@ def __init__(self, **data):
                     "To use a custom judge_tool_name, provide a custom agent_file."
                 )

-            # disallow LLM-specific fields for letta judge
+            # disallow model-specific fields for letta judge
             if self.model != "gpt-4o-mini" or self.temperature != 0.0 or self.provider != LLMProvider.OPENAI:
                 raise ValueError(
-                    "Letta judge should not specify model/temperature/provider (those are only for LLM judges)"
+                    "Letta judge should not specify model/temperature/provider (those are only for model judges)"
                 )

             # load prompt from file if needed
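A minimal sketch of the new validation behavior, assuming the import path `letta_evals.models` matches the file above (the error string is taken verbatim from the diff):

```python
from letta_evals.models import GraderKind, GraderSpec

# valid: a model judge needs an inline prompt or a prompt_path, but not both
judge = GraderSpec(kind=GraderKind.MODEL_JUDGE, prompt="Score the response from 0 to 100.")

# invalid: neither prompt nor prompt_path is set
try:
    GraderSpec(kind=GraderKind.MODEL_JUDGE)
except ValueError as err:
    print(err)  # "Model judge requires either prompt or prompt_path"
```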

letta_evals/runner.py
Lines changed: 14 additions & 28 deletions

@@ -152,33 +152,19 @@ def _init_graders(self) -> None:
                     extractor_config=gspec.extractor_config,
                     base_dir=gspec.base_dir,
                 )
-            elif gspec.kind == GraderKind.RUBRIC:
-                # check if agent-based or LLM-based judge
-                if gspec.agent_file:
-                    self.graders[key] = AgentJudgeGrader(
-                        agent_file=gspec.agent_file,
-                        prompt=gspec.prompt,
-                        client=self.client,
-                        project_id=self.project_id,
-                        judge_tool_name=gspec.judge_tool_name,
-                        extractor=gspec.extractor,
-                        extractor_config=gspec.extractor_config,
-                        base_dir=gspec.base_dir,
-                        rubric_vars=gspec.rubric_vars,
-                    )
-                else:
-                    self.graders[key] = RubricGrader(
-                        prompt=gspec.prompt,
-                        model=gspec.model,
-                        temperature=gspec.temperature,
-                        provider=gspec.provider,
-                        max_retries=gspec.max_retries,
-                        timeout=gspec.timeout,
-                        extractor=gspec.extractor,
-                        extractor_config=gspec.extractor_config,
-                        base_dir=gspec.base_dir,
-                        rubric_vars=gspec.rubric_vars,
-                    )
+            elif gspec.kind == GraderKind.MODEL_JUDGE:
+                self.graders[key] = RubricGrader(
+                    prompt=gspec.prompt,
+                    model=gspec.model,
+                    temperature=gspec.temperature,
+                    provider=gspec.provider,
+                    max_retries=gspec.max_retries,
+                    timeout=gspec.timeout,
+                    extractor=gspec.extractor,
+                    extractor_config=gspec.extractor_config,
+                    base_dir=gspec.base_dir,
+                    rubric_vars=gspec.rubric_vars,
+                )
             elif gspec.kind == GraderKind.LETTA_JUDGE:
                 # use default agent file if not provided
                 agent_file = gspec.agent_file
@@ -382,7 +368,7 @@ def _validate_rubric_vars(self, samples: List[Sample]) -> None:
             return

         for grader_key, grader_spec in self.suite.graders.items():
-            if grader_spec.kind != GraderKind.RUBRIC or not grader_spec.rubric_vars:
+            if grader_spec.kind != GraderKind.MODEL_JUDGE or not grader_spec.rubric_vars:
                 continue

             for sample in samples:
