Commit fa9cc5c
expose eval_scores in validate function and improve typing (#87)
* expose eval_thresholds in validate function and improve typing
* release 1.0.19
* link to TLMOptions
1 parent 42d1b62 commit fa9cc5c

6 files changed (+38, -82 lines)

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.0.19] 2025-06-4
+
+- Expose `eval_scores` property for `Validator.validate()` and use Pydantic types from Codex backend
+
 ## [1.0.18] 2025-06-3
 
 - Expose `options` and `quality_preset` properties for `Validator.validate()`
@@ -90,7 +94,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Initial release of the `cleanlab-codex` client library.
 
-[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.18...HEAD
+[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.19...HEAD
+[1.0.19]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.18...v1.0.19
 [1.0.18]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.17...v1.0.18
 [1.0.17]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.16...v1.0.17
 [1.0.16]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.15...v1.0.16

src/cleanlab_codex/__about__.py

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "1.0.18"
+__version__ = "1.0.19"

src/cleanlab_codex/project.py

Lines changed: 2 additions & 2 deletions
@@ -248,7 +248,7 @@ def validate(
         constrain_outputs: Optional[List[str]] = None,
         custom_metadata: Optional[object] = None,
         eval_scores: Optional[Dict[str, float]] = None,
-        custom_eval_thresholds: Optional[Dict[str, float]] = None,
+        eval_thresholds: Optional[Dict[str, float]] = None,
         options: Optional[ProjectValidateOptions] = None,
         quality_preset: Literal["best", "high", "medium", "low", "base"] = "medium",
     ) -> ProjectValidateResponse:
@@ -259,7 +259,7 @@ def validate(
             query=query,
             response=response,
             constrain_outputs=constrain_outputs,
-            custom_eval_thresholds=custom_eval_thresholds,
+            custom_eval_thresholds=eval_thresholds,
             custom_metadata=custom_metadata,
             eval_scores=eval_scores,
             options=options,
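
For reference, a minimal sketch of calling `Project.validate()` after this rename. Only the parameter names and the response attributes come from this diff; the access key, query/context/response strings, and the threshold value are illustrative placeholders.

```python
# Minimal sketch of the renamed keyword on Project.validate().
# Access key, inputs, and threshold are placeholders, not values from this commit.
from cleanlab_codex.project import Project

project = Project.from_access_key(access_key="<project-access-key>")
result = project.validate(
    context="Returns are accepted within 30 days of purchase.",
    prompt="Answer using only the provided context: ...",
    query="What is the return policy?",
    response="You can return items within 30 days.",
    eval_thresholds={"trustworthiness": 0.7},  # previously custom_eval_thresholds
)
print(result.is_bad_response, result.expert_answer)
```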

src/cleanlab_codex/types/validator.py

Lines changed: 0 additions & 35 deletions
This file was deleted.

src/cleanlab_codex/validator.py

Lines changed: 19 additions & 27 deletions
@@ -14,13 +14,14 @@
 
 if _TYPE_CHECKING:
     from codex.types.project_validate_params import Options as ProjectValidateOptions
+    from codex.types.project_validate_response import ProjectValidateResponse
 
 
 class Validator:
     def __init__(
         self,
         codex_access_key: str,
-        custom_eval_thresholds: Optional[dict[str, float]] = None,
+        eval_thresholds: Optional[dict[str, float]] = None,
     ):
         """Real-time detection and remediation of bad responses in RAG applications, powered by Cleanlab's TrustworthyRAG and Codex.
 
@@ -35,7 +36,7 @@ def __init__(
             codex_access_key (str): The [access key](/codex/web_tutorials/create_project/#access-keys) for a Codex project. Used to retrieve expert-provided answers
                 when bad responses are detected, or otherwise log the corresponding queries for SMEs to answer.
 
-            custom_eval_thresholds (dict[str, float], optional): Custom thresholds (between 0 and 1) for specific evals.
+            eval_thresholds (dict[str, float], optional): Custom thresholds (between 0 and 1) for specific evals.
                 Keys should either correspond to an Eval from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag)
                 or a custom eval for your project. If not provided, project settings will be used.
 
@@ -45,9 +46,9 @@ def __init__(
             ValueError: If any threshold value is not between 0 and 1.
         """
         self._project: Project = Project.from_access_key(access_key=codex_access_key)
-        if custom_eval_thresholds is not None:
-            validate_thresholds(custom_eval_thresholds)
-        self._custom_eval_thresholds = custom_eval_thresholds
+        if eval_thresholds is not None:
+            validate_thresholds(eval_thresholds)
+        self._eval_thresholds = eval_thresholds
 
     def validate(
         self,
@@ -58,9 +59,10 @@ def validate(
         prompt: Optional[str] = None,
         form_prompt: Optional[Callable[[str, str], str]] = None,
         metadata: Optional[dict[str, Any]] = None,
+        eval_scores: Optional[dict[str, float]] = None,
         options: Optional[ProjectValidateOptions] = None,
         quality_preset: Literal["best", "high", "medium", "low", "base"] = "medium",
-    ) -> dict[str, Any]:
+    ) -> ProjectValidateResponse:
         """Evaluate whether the AI-generated response is bad, and if so, request an alternate expert answer.
         If no expert answer is available, this query is still logged for SMEs to answer.
 
@@ -71,14 +73,17 @@ def validate(
             prompt (str, optional): Optional prompt representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response.
             form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt, provide one or the other. This function should take query and context as parameters and return a formatted prompt string. If not provided, a default prompt formatter will be used. To include a system prompt or any other special instructions for your LLM, incorporate them directly in your custom form_prompt() function definition.
             metadata (dict, optional): Additional custom metadata to associate with the query logged in the Codex Project.
-            options (ProjectValidateOptions, optional): Typed dict of advanced configuration options for the Trustworthy Language Model.
+            eval_scores (dict[str, float], optional): Scores assessing different aspects of the RAG system. If provided, TLM Trustworthy RAG will not be used to generate scores.
+            options (ProjectValidateOptions, optional): Typed dict of advanced TLM configuration options. See [TLMOptions](/tlm/api/python/tlm/#class-tlmoptions)
             quality_preset (Literal["best", "high", "medium", "low", "base"], optional): The quality preset to use for the TLM or Trustworthy RAG API.
 
         Returns:
-            dict[str, Any]: A dictionary containing:
-                - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found in the Codex Project, or None otherwise.
-                - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. When True, a Codex lookup is performed, which logs this query into the Codex Project for SMEs to answer.
-                - Additional keys from a [`ThresholdedTrustworthyRAGScore`](/codex/api/python/types.validator/#class-thresholdedtrustworthyragscore) dictionary: each corresponds to a [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) evaluation metric, and points to the score for this evaluation as well as a boolean `is_bad` flagging whether the score falls below the corresponding threshold.
+            ProjectValidateResponse: A response object containing:
+                - eval_scores (Dict[str, EvalScores]): Evaluation scores for the original response along with a boolean flag, `failed`,
+                    indicating whether the score is below the threshold.
+                - expert_answer (Optional[str]): Alternate SME-provided answer from Codex if the response was flagged as bad and
+                    an answer was found in the Codex Project, or None otherwise.
+                - is_bad_response (bool): True if the response is flagged as potentially bad and triggered escalation to SMEs.
         """
         formatted_prompt = prompt
         if not formatted_prompt:
@@ -92,27 +97,14 @@
         if not formatted_prompt:
             raise ValueError("Exactly one of prompt or form_prompt is required")  # noqa: TRY003
 
-        result = self._project.validate(
+        return self._project.validate(
             context=context,
             prompt=formatted_prompt,
             query=query,
             response=response,
-            custom_eval_thresholds=self._custom_eval_thresholds,
             custom_metadata=metadata,
+            eval_scores=eval_scores,
+            eval_thresholds=self._eval_thresholds,
             options=options,
             quality_preset=quality_preset,
         )
-
-        formatted_eval_scores = {
-            eval_name: {
-                "score": eval_scores.score,
-                "is_bad": eval_scores.failed,
-            }
-            for eval_name, eval_scores in result.eval_scores.items()
-        }
-
-        return {
-            "expert_answer": result.expert_answer,
-            "is_bad_response": result.is_bad_response,
-            **formatted_eval_scores,
-        }
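
Taken together, the `Validator` changes look roughly like the sketch below: `eval_thresholds` replaces `custom_eval_thresholds` in the constructor, `eval_scores` can be passed through `validate()`, and results are read as attributes of the returned `ProjectValidateResponse` instead of dictionary keys. The access key, inputs, and score values here are placeholders.

```python
# Sketch of post-change Validator usage; access key, inputs, and scores are placeholders.
from cleanlab_codex.validator import Validator

validator = Validator(
    codex_access_key="<project-access-key>",
    eval_thresholds={"trustworthiness": 0.85},  # formerly custom_eval_thresholds
)

result = validator.validate(
    query="How do I reset my password?",
    context="Passwords can be reset from the account settings page.",
    response="Go to account settings and choose 'Reset password'.",
    eval_scores={"trustworthiness": 0.42},  # optional: supply scores instead of running TLM
)

# Attribute access replaces the old dict-style result.
if result.is_bad_response:
    print(result.expert_answer)  # SME-provided answer from Codex, or None
for eval_name, eval_score in result.eval_scores.items():
    print(eval_name, eval_score.score, eval_score.failed)
```

Returning the backend's Pydantic model directly removes the hand-rolled dict formatting that the old implementation applied to `result.eval_scores`.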

tests/test_validator.py

Lines changed: 10 additions & 16 deletions
@@ -52,20 +52,14 @@ def test_validate(self, mock_project: Mock) -> None: # noqa: ARG002
         result = validator.validate(query="test query", context="test context", response="test response")
 
         # Verify expected result structure
-        assert result["is_bad_response"] is True
-        assert result["expert_answer"] is None
-
-        eval_metrics = ["trustworthiness", "response_helpfulness"]
-        for metric in eval_metrics:
-            assert metric in result
-            assert "score" in result[metric]
-            assert "is_bad" in result[metric]
+        assert result.is_bad_response is True
+        assert result.expert_answer is None
 
     def test_validate_expert_answer(self, mock_project: Mock) -> None:
-        validator = Validator(codex_access_key="test", custom_eval_thresholds={"trustworthiness": 1.0})
+        validator = Validator(codex_access_key="test", eval_thresholds={"trustworthiness": 1.0})
         mock_project.from_access_key.return_value.query.return_value = (None, None)
         result = validator.validate(query="test query", context="test context", response="test response")
-        assert result["expert_answer"] is None
+        assert result.expert_answer is None
 
         # Setup mock project query response
         mock_project.from_access_key.return_value.validate.return_value = ProjectValidateResponse(
@@ -78,7 +72,7 @@ def test_validate_expert_answer(self, mock_project: Mock) -> None:
         )
         # Basically any response will be flagged as untrustworthy
         result = validator.validate(query="test query", context="test context", response="test response")
-        assert result["expert_answer"] == "expert answer"
+        assert result.expert_answer == "expert answer"
 
     def test_user_provided_thresholds(self, mock_project_with_custom_thresholds: Mock) -> None:
         """
@@ -87,15 +81,15 @@ def test_user_provided_thresholds(self, mock_project_with_custom_thresholds: Moc
         """
         validator = Validator(
             codex_access_key="test",
-            custom_eval_thresholds={"trustworthiness": 0.4, "non_existent_metric": 0.5},
+            eval_thresholds={"trustworthiness": 0.4, "non_existent_metric": 0.5},
         )
         mock_project_with_custom_thresholds.from_access_key.assert_called_once_with(access_key="test")
         result = validator.validate(query="test query", context="test context", response="test response")
-        assert result["is_bad_response"] is False
-        assert result["expert_answer"] is None
+        assert result.is_bad_response is False
+        assert result.expert_answer is None
 
     def test_default_thresholds(self, mock_project: Mock) -> None:
-        # Test with default thresholds (custom_eval_thresholds is None)
+        # Test with default thresholds (eval_thresholds is None)
         validator = Validator(codex_access_key="test")
         mock_project.from_access_key.assert_called_once_with(access_key="test")
-        assert validator._custom_eval_thresholds is None  # noqa: SLF001
+        assert validator._eval_thresholds is None  # noqa: SLF001
