Commit fa9cc5c
expose eval_scores in validate function and improve typing (#87)
* expose eval_thresholds in validate function and improve typing
* release 1.0.19
* link to TLMOptions
1 parent 42d1b62 commit fa9cc5c

6 files changed (+38, -82 lines)

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.0.19] 2025-06-4
+
+- Expose `eval_scores` property for `Validator.validate()` and use Pydantic types from Codex backend
+
 ## [1.0.18] 2025-06-3
 
 - Expose `options` and `quality_preset` properties for `Validator.validate()`
@@ -90,7 +94,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Initial release of the `cleanlab-codex` client library.
 
-[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.18...HEAD
+[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.19...HEAD
+[1.0.19]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.18...v1.0.19
 [1.0.18]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.17...v1.0.18
 [1.0.17]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.16...v1.0.17
 [1.0.16]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.15...v1.0.16

src/cleanlab_codex/__about__.py

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "1.0.18"
+__version__ = "1.0.19"

src/cleanlab_codex/project.py

Lines changed: 2 additions & 2 deletions
@@ -248,7 +248,7 @@ def validate(
         constrain_outputs: Optional[List[str]] = None,
         custom_metadata: Optional[object] = None,
         eval_scores: Optional[Dict[str, float]] = None,
-        custom_eval_thresholds: Optional[Dict[str, float]] = None,
+        eval_thresholds: Optional[Dict[str, float]] = None,
         options: Optional[ProjectValidateOptions] = None,
         quality_preset: Literal["best", "high", "medium", "low", "base"] = "medium",
     ) -> ProjectValidateResponse:
@@ -259,7 +259,7 @@ def validate(
             query=query,
             response=response,
             constrain_outputs=constrain_outputs,
-            custom_eval_thresholds=custom_eval_thresholds,
+            custom_eval_thresholds=eval_thresholds,
             custom_metadata=custom_metadata,
             eval_scores=eval_scores,
             options=options,
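
For reference, a minimal sketch of calling `Project.validate()` after this rename. Only the parameter names and the response attributes come from this diff; the access key, query/context/response strings, and the threshold value are illustrative placeholders.

```python
# Minimal sketch of the renamed keyword on Project.validate().
# Access key, inputs, and threshold are placeholders, not values from this commit.
from cleanlab_codex.project import Project

project = Project.from_access_key(access_key="<project-access-key>")
result = project.validate(
    context="Returns are accepted within 30 days of purchase.",
    prompt="Answer using only the provided context: ...",
    query="What is the return policy?",
    response="You can return items within 30 days.",
    eval_thresholds={"trustworthiness": 0.7},  # previously custom_eval_thresholds
)
print(result.is_bad_response, result.expert_answer)
```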

src/cleanlab_codex/types/validator.py

Lines changed: 0 additions & 35 deletions
This file was deleted.

src/cleanlab_codex/validator.py

Lines changed: 19 additions & 27 deletions
@@ -14,13 +14,14 @@
 
 if _TYPE_CHECKING:
     from codex.types.project_validate_params import Options as ProjectValidateOptions
+    from codex.types.project_validate_response import ProjectValidateResponse
 
 
 class Validator:
     def __init__(
         self,
         codex_access_key: str,
-        custom_eval_thresholds: Optional[dict[str, float]] = None,
+        eval_thresholds: Optional[dict[str, float]] = None,
     ):
         """Real-time detection and remediation of bad responses in RAG applications, powered by Cleanlab's TrustworthyRAG and Codex.
 
@@ -35,7 +36,7 @@ def __init__(
             codex_access_key (str): The [access key](/codex/web_tutorials/create_project/#access-keys) for a Codex project. Used to retrieve expert-provided answers
                 when bad responses are detected, or otherwise log the corresponding queries for SMEs to answer.
 
-            custom_eval_thresholds (dict[str, float], optional): Custom thresholds (between 0 and 1) for specific evals.
+            eval_thresholds (dict[str, float], optional): Custom thresholds (between 0 and 1) for specific evals.
                 Keys should either correspond to an Eval from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag)
                 or a custom eval for your project. If not provided, project settings will be used.
 
@@ -45,9 +46,9 @@ def __init__(
             ValueError: If any threshold value is not between 0 and 1.
         """
         self._project: Project = Project.from_access_key(access_key=codex_access_key)
-        if custom_eval_thresholds is not None:
-            validate_thresholds(custom_eval_thresholds)
-        self._custom_eval_thresholds = custom_eval_thresholds
+        if eval_thresholds is not None:
+            validate_thresholds(eval_thresholds)
+        self._eval_thresholds = eval_thresholds
 
     def validate(
         self,
@@ -58,9 +59,10 @@ def validate(
         prompt: Optional[str] = None,
         form_prompt: Optional[Callable[[str, str], str]] = None,
         metadata: Optional[dict[str, Any]] = None,
+        eval_scores: Optional[dict[str, float]] = None,
         options: Optional[ProjectValidateOptions] = None,
         quality_preset: Literal["best", "high", "medium", "low", "base"] = "medium",
-    ) -> dict[str, Any]:
+    ) -> ProjectValidateResponse:
         """Evaluate whether the AI-generated response is bad, and if so, request an alternate expert answer.
         If no expert answer is available, this query is still logged for SMEs to answer.
 
@@ -71,14 +73,17 @@ def validate(
             prompt (str, optional): Optional prompt representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response.
             form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt, provide one or the other. This function should take query and context as parameters and return a formatted prompt string. If not provided, a default prompt formatter will be used. To include a system prompt or any other special instructions for your LLM, incorporate them directly in your custom form_prompt() function definition.
             metadata (dict, optional): Additional custom metadata to associate with the query logged in the Codex Project.
-            options (ProjectValidateOptions, optional): Typed dict of advanced configuration options for the Trustworthy Language Model.
+            eval_scores (dict[str, float], optional): Scores assessing different aspects of the RAG system. If provided, TLM Trustworthy RAG will not be used to generate scores.
+            options (ProjectValidateOptions, optional): Typed dict of advanced TLM configuration options. See [TLMOptions](/tlm/api/python/tlm/#class-tlmoptions)
             quality_preset (Literal["best", "high", "medium", "low", "base"], optional): The quality preset to use for the TLM or Trustworthy RAG API.
 
         Returns:
-            dict[str, Any]: A dictionary containing:
-                - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found in the Codex Project, or None otherwise.
-                - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. When True, a Codex lookup is performed, which logs this query into the Codex Project for SMEs to answer.
-                - Additional keys from a [`ThresholdedTrustworthyRAGScore`](/codex/api/python/types.validator/#class-thresholdedtrustworthyragscore) dictionary: each corresponds to a [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) evaluation metric, and points to the score for this evaluation as well as a boolean `is_bad` flagging whether the score falls below the corresponding threshold.
+            ProjectValidateResponse: A response object containing:
+                - eval_scores (Dict[str, EvalScores]): Evaluation scores for the original response along with a boolean flag, `failed`,
+                    indicating whether the score is below the threshold.
+                - expert_answer (Optional[str]): Alternate SME-provided answer from Codex if the response was flagged as bad and
+                    an answer was found in the Codex Project, or None otherwise.
+                - is_bad_response (bool): True if the response is flagged as potentially bad and triggered escalation to SMEs.
         """
         formatted_prompt = prompt
         if not formatted_prompt:
@@ -92,27 +97,14 @@
         if not formatted_prompt:
             raise ValueError("Exactly one of prompt or form_prompt is required")  # noqa: TRY003
 
-        result = self._project.validate(
+        return self._project.validate(
             context=context,
             prompt=formatted_prompt,
             query=query,
             response=response,
-            custom_eval_thresholds=self._custom_eval_thresholds,
             custom_metadata=metadata,
+            eval_scores=eval_scores,
+            eval_thresholds=self._eval_thresholds,
             options=options,
             quality_preset=quality_preset,
         )
-
-        formatted_eval_scores = {
-            eval_name: {
-                "score": eval_scores.score,
-                "is_bad": eval_scores.failed,
-            }
-            for eval_name, eval_scores in result.eval_scores.items()
-        }
-
-        return {
-            "expert_answer": result.expert_answer,
-            "is_bad_response": result.is_bad_response,
-            **formatted_eval_scores,
-        }
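
Taken together, the `Validator` changes look roughly like the sketch below: `eval_thresholds` replaces `custom_eval_thresholds` in the constructor, `eval_scores` can be passed through `validate()`, and results are read as attributes of the returned `ProjectValidateResponse` instead of dictionary keys. The access key, inputs, and score values here are placeholders.

```python
# Sketch of post-change Validator usage; access key, inputs, and scores are placeholders.
from cleanlab_codex.validator import Validator

validator = Validator(
    codex_access_key="<project-access-key>",
    eval_thresholds={"trustworthiness": 0.85},  # formerly custom_eval_thresholds
)

result = validator.validate(
    query="How do I reset my password?",
    context="Passwords can be reset from the account settings page.",
    response="Go to account settings and choose 'Reset password'.",
    eval_scores={"trustworthiness": 0.42},  # optional: supply scores instead of running TLM
)

# Attribute access replaces the old dict-style result.
if result.is_bad_response:
    print(result.expert_answer)  # SME-provided answer from Codex, or None
for eval_name, eval_score in result.eval_scores.items():
    print(eval_name, eval_score.score, eval_score.failed)
```

Returning the backend's Pydantic model directly removes the hand-rolled dict formatting that the old implementation applied to `result.eval_scores`.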

tests/test_validator.py

Lines changed: 10 additions & 16 deletions
@@ -52,20 +52,14 @@ def test_validate(self, mock_project: Mock) -> None: # noqa: ARG002
         result = validator.validate(query="test query", context="test context", response="test response")
 
         # Verify expected result structure
-        assert result["is_bad_response"] is True
-        assert result["expert_answer"] is None
-
-        eval_metrics = ["trustworthiness", "response_helpfulness"]
-        for metric in eval_metrics:
-            assert metric in result
-            assert "score" in result[metric]
-            assert "is_bad" in result[metric]
+        assert result.is_bad_response is True
+        assert result.expert_answer is None
 
     def test_validate_expert_answer(self, mock_project: Mock) -> None:
-        validator = Validator(codex_access_key="test", custom_eval_thresholds={"trustworthiness": 1.0})
+        validator = Validator(codex_access_key="test", eval_thresholds={"trustworthiness": 1.0})
         mock_project.from_access_key.return_value.query.return_value = (None, None)
         result = validator.validate(query="test query", context="test context", response="test response")
-        assert result["expert_answer"] is None
+        assert result.expert_answer is None
 
         # Setup mock project query response
         mock_project.from_access_key.return_value.validate.return_value = ProjectValidateResponse(
@@ -78,7 +72,7 @@ def test_validate_expert_answer(self, mock_project: Mock) -> None:
         )
         # Basically any response will be flagged as untrustworthy
         result = validator.validate(query="test query", context="test context", response="test response")
-        assert result["expert_answer"] == "expert answer"
+        assert result.expert_answer == "expert answer"
 
     def test_user_provided_thresholds(self, mock_project_with_custom_thresholds: Mock) -> None:
         """
@@ -87,15 +81,15 @@ def test_user_provided_thresholds(self, mock_project_with_custom_thresholds: Moc
         """
         validator = Validator(
             codex_access_key="test",
-            custom_eval_thresholds={"trustworthiness": 0.4, "non_existent_metric": 0.5},
+            eval_thresholds={"trustworthiness": 0.4, "non_existent_metric": 0.5},
         )
         mock_project_with_custom_thresholds.from_access_key.assert_called_once_with(access_key="test")
         result = validator.validate(query="test query", context="test context", response="test response")
-        assert result["is_bad_response"] is False
-        assert result["expert_answer"] is None
+        assert result.is_bad_response is False
+        assert result.expert_answer is None
 
     def test_default_thresholds(self, mock_project: Mock) -> None:
-        # Test with default thresholds (custom_eval_thresholds is None)
+        # Test with default thresholds (eval_thresholds is None)
         validator = Validator(codex_access_key="test")
         mock_project.from_access_key.assert_called_once_with(access_key="test")
-        assert validator._custom_eval_thresholds is None  # noqa: SLF001
+        assert validator._eval_thresholds is None  # noqa: SLF001
