
Commit 0047d3f

refactor: Rename rubric to model_judge (#87)
1 parent b4bfd6c commit 0047d3f

14 files changed: +49 -75 lines changed
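Across the suite configs, the rename is a one-line change: every other field in the grader block stays the same and only the `kind` value switches from `rubric` to `model_judge`. An illustrative sketch of an updated grader block (field values mirror the example suites below; this is not a new file in the commit):

```yaml
graders:
  quality:
    kind: model_judge      # was: kind: rubric
    prompt_path: rubric.txt
    model: gpt-5-mini
    temperature: 0.0
```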

examples/multi-model-simple-rubric-grader/suite.yaml
Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ target:
     - openai/gpt-5-mini
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     prompt_path: rubric.txt
     model: gpt-5-mini
     temperature: 0.0

examples/simple-rubric-grader/README.md
Lines changed: 3 additions & 3 deletions

@@ -75,7 +75,7 @@ The suite configuration declares which variables are required:
 ```yaml
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     prompt_path: rubric.txt
     rubric_vars:
       - reference_ascii
@@ -138,7 +138,7 @@ IMPORTANT: Make sure to clearly express your rationale and scoring.
 ```yaml
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     prompt_path: rubric.txt
     model: claude-haiku-4-5-20251001
     temperature: 0.0
@@ -157,7 +157,7 @@ Combine rubric and tool graders for comprehensive evaluation:
 ```yaml
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     display_name: "rubric score"
     prompt_path: rubric.txt
     model: gpt-5-mini

examples/simple-rubric-grader/suite.ascii-only-accuracy.yaml
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ target:
   base_url: http://localhost:8283
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
    display_name: "rubric score"
     prompt_path: rubric.txt
     model: gpt-5-mini

examples/simple-rubric-grader/suite.two-metrics.yaml
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ target:
   base_url: http://localhost:8283
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     display_name: "rubric score"
     prompt_path: rubric.txt
     model: gpt-5-mini

examples/simple-rubric-grader/suite.yaml
Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ target:
   base_url: http://localhost:8283
 graders:
   quality:
-    kind: rubric
+    kind: model_judge
     display_name: "rubric score"
     prompt_path: rubric.txt
     model: claude-haiku-4-5-20251001

letta-leaderboard/core-memory-update-agent/suites/core-memory-update.yaml
Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@ target:

 graders:
   contains_check:
-    kind: rubric
+    kind: model_judge
     prompt_path: ../rubric.txt
     model: gpt-5-mini
     temperature: 0.0

letta-leaderboard/filesystem-agent/suites/filesystem.yaml
Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ target:

 graders:
   rubric_check:
-    kind: rubric
+    kind: model_judge
     prompt_path: ../rubric.txt
     model: gpt-5-mini
     temperature: 0.0

letta_evals/cli.py
Lines changed: 1 addition & 1 deletion

@@ -246,7 +246,7 @@ def list_graders():
         table.add_row(name, "tool")

     console.print(table)
-    console.print("\n[dim]You can also use 'rubric' graders with custom prompts[/dim]")
+    console.print("\n[dim]You can also use 'model_judge' or 'letta_judge' graders with custom prompts[/dim]")


 def display_results(result: RunnerResult, verbose: bool = False, cached_mode: bool = False):
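The updated hint names both judge kinds. Based on the validation rules in letta_evals/models.py (next section), a `letta_judge` grader is configured much like a `model_judge` one, except that model/temperature/provider must be left unset; a minimal hypothetical snippet:

```yaml
graders:
  quality:
    kind: letta_judge
    prompt_path: rubric.txt
    # agent_file is optional; per runner.py, a default judge agent file is used when omitted
```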

letta_evals/models.py
Lines changed: 19 additions & 31 deletions

@@ -74,22 +74,22 @@ def __init__(self, **data):
 class GraderSpec(BaseModel):
     """Grader configuration for evaluation."""

-    kind: GraderKind = Field(description="Type of grader (tool or rubric)")
+    kind: GraderKind = Field(description="Type of grader (tool, model_judge, or letta_judge)")

     # Optional display name for UI/CLI output
     display_name: Optional[str] = Field(default=None, description="Human-friendly name for this metric")

     function: Optional[str] = Field(default=None, description="Name of grading function for tool grader")

-    prompt: Optional[str] = Field(default=None, description="Rubric prompt for LLM judge")
-    prompt_path: Optional[Path] = Field(default=None, description="Path to file containing rubric prompt")
-    model: Optional[str] = Field(default="gpt-4o-mini", description="LLM model to use for rubric grading")
-    temperature: Optional[float] = Field(default=0.0, description="Temperature for LLM judge")
-    provider: Optional[LLMProvider] = Field(default=LLMProvider.OPENAI, description="LLM provider for rubric grading")
-    max_retries: Optional[int] = Field(default=5, description="Maximum number of retries for rubric grading")
-    timeout: Optional[float] = Field(default=120.0, description="Timeout for rubric grading in seconds")
+    prompt: Optional[str] = Field(default=None, description="Prompt for model judge or letta judge")
+    prompt_path: Optional[Path] = Field(default=None, description="Path to file containing prompt")
+    model: Optional[str] = Field(default="gpt-4o-mini", description="LLM model to use for model judge")
+    temperature: Optional[float] = Field(default=0.0, description="Temperature for model judge")
+    provider: Optional[LLMProvider] = Field(default=LLMProvider.OPENAI, description="LLM provider for model judge")
+    max_retries: Optional[int] = Field(default=5, description="Maximum number of retries for model judge")
+    timeout: Optional[float] = Field(default=120.0, description="Timeout for model judge in seconds")
     rubric_vars: Optional[List[str]] = Field(
-        default=None, description="List of required custom variables for rubric substitution"
+        default=None, description="List of required custom variables for prompt substitution"
     )

     # Agent-based judge fields
@@ -115,25 +115,13 @@ def __init__(self, **data):
             if not self.function:
                 raise ValueError("Tool grader requires function name")
             if self.rubric_vars:
-                raise ValueError("Tool grader cannot use rubric_vars (only available for rubric graders)")
-        elif self.kind == GraderKind.RUBRIC:
-            # check if agent-based or LLM-based judge
-            if self.agent_file:
-                # agent-based judge validation
-                if not self.prompt and not self.prompt_path:
-                    raise ValueError("Agent judge requires either prompt or prompt_path for rubric text")
-                if self.prompt and self.prompt_path:
-                    raise ValueError("Agent judge cannot have both prompt and prompt_path")
-                if self.model != "gpt-4o-mini" or self.temperature != 0.0 or self.provider != LLMProvider.OPENAI:
-                    raise ValueError(
-                        "Agent judge should not specify model/temperature/provider (those are only for LLM judges)"
-                    )
-            else:
-                # LLM-based judge validation
-                if not self.prompt and not self.prompt_path:
-                    raise ValueError("Rubric grader requires either prompt or prompt_path")
-                if self.prompt and self.prompt_path:
-                    raise ValueError("Rubric grader cannot have both prompt and prompt_path")
+                raise ValueError("Tool grader cannot use rubric_vars (only available for model_judge and letta_judge)")
+        elif self.kind == GraderKind.MODEL_JUDGE:
+            # model judge validation
+            if not self.prompt and not self.prompt_path:
+                raise ValueError("Model judge requires either prompt or prompt_path")
+            if self.prompt and self.prompt_path:
+                raise ValueError("Model judge cannot have both prompt and prompt_path")

             # load prompt from file if needed
             if self.prompt_path:
@@ -142,7 +130,7 @@ def __init__(self, **data):
         elif self.kind == GraderKind.LETTA_JUDGE:
             # letta judge validation
             if not self.prompt and not self.prompt_path:
-                raise ValueError("Letta judge requires either prompt or prompt_path for rubric text")
+                raise ValueError("Letta judge requires either prompt or prompt_path")
             if self.prompt and self.prompt_path:
                 raise ValueError("Letta judge cannot have both prompt and prompt_path")

@@ -153,10 +141,10 @@ def __init__(self, **data):
                     "To use a custom judge_tool_name, provide a custom agent_file."
                 )

-            # disallow LLM-specific fields for letta judge
+            # disallow model-specific fields for letta judge
             if self.model != "gpt-4o-mini" or self.temperature != 0.0 or self.provider != LLMProvider.OPENAI:
                 raise ValueError(
-                    "Letta judge should not specify model/temperature/provider (those are only for LLM judges)"
+                    "Letta judge should not specify model/temperature/provider (those are only for model judges)"
                 )

             # load prompt from file if needed
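A minimal sketch of the new validation behavior, assuming the import path `letta_evals.models` matches the file above (the error string is taken verbatim from the diff):

```python
from letta_evals.models import GraderKind, GraderSpec

# valid: a model judge needs an inline prompt or a prompt_path, but not both
judge = GraderSpec(kind=GraderKind.MODEL_JUDGE, prompt="Score the response from 0 to 100.")

# invalid: neither prompt nor prompt_path is set
try:
    GraderSpec(kind=GraderKind.MODEL_JUDGE)
except ValueError as err:
    print(err)  # "Model judge requires either prompt or prompt_path"
```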

letta_evals/runner.py
Lines changed: 14 additions & 28 deletions

@@ -152,33 +152,19 @@ def _init_graders(self) -> None:
                     extractor_config=gspec.extractor_config,
                     base_dir=gspec.base_dir,
                 )
-            elif gspec.kind == GraderKind.RUBRIC:
-                # check if agent-based or LLM-based judge
-                if gspec.agent_file:
-                    self.graders[key] = AgentJudgeGrader(
-                        agent_file=gspec.agent_file,
-                        prompt=gspec.prompt,
-                        client=self.client,
-                        project_id=self.project_id,
-                        judge_tool_name=gspec.judge_tool_name,
-                        extractor=gspec.extractor,
-                        extractor_config=gspec.extractor_config,
-                        base_dir=gspec.base_dir,
-                        rubric_vars=gspec.rubric_vars,
-                    )
-                else:
-                    self.graders[key] = RubricGrader(
-                        prompt=gspec.prompt,
-                        model=gspec.model,
-                        temperature=gspec.temperature,
-                        provider=gspec.provider,
-                        max_retries=gspec.max_retries,
-                        timeout=gspec.timeout,
-                        extractor=gspec.extractor,
-                        extractor_config=gspec.extractor_config,
-                        base_dir=gspec.base_dir,
-                        rubric_vars=gspec.rubric_vars,
-                    )
+            elif gspec.kind == GraderKind.MODEL_JUDGE:
+                self.graders[key] = RubricGrader(
+                    prompt=gspec.prompt,
+                    model=gspec.model,
+                    temperature=gspec.temperature,
+                    provider=gspec.provider,
+                    max_retries=gspec.max_retries,
+                    timeout=gspec.timeout,
+                    extractor=gspec.extractor,
+                    extractor_config=gspec.extractor_config,
+                    base_dir=gspec.base_dir,
+                    rubric_vars=gspec.rubric_vars,
+                )
             elif gspec.kind == GraderKind.LETTA_JUDGE:
                 # use default agent file if not provided
                 agent_file = gspec.agent_file
@@ -382,7 +368,7 @@ def _validate_rubric_vars(self, samples: List[Sample]) -> None:
             return

         for grader_key, grader_spec in self.suite.graders.items():
-            if grader_spec.kind != GraderKind.RUBRIC or not grader_spec.rubric_vars:
+            if grader_spec.kind != GraderKind.MODEL_JUDGE or not grader_spec.rubric_vars:
                 continue

             for sample in samples:
