
Commit cd3ad3c

[Codex update] Improve how validation happens through the project (#95)
1 parent b719c6f commit cd3ad3c

8 files changed: +224 -292 lines changed

pyproject.toml

Lines changed: 4 additions & 2 deletions
@@ -25,8 +25,8 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-  "cleanlab-tlm~=1.1",
-  "codex-sdk==0.1.0a22",
+  "cleanlab-tlm~=1.1,>=1.1.14",
+  "codex-sdk==0.1.0a23",
   "pydantic>=2.0.0, <3",
 ]
 
@@ -46,6 +46,7 @@ extra-dependencies = [
   "smolagents; python_version >= '3.10'",
   "thefuzz",
   "langchain-core",
+  "openai"
 ]
 
 [[tool.hatch.envs.types.matrix]]
@@ -63,6 +64,7 @@ extra-dependencies = [
   "smolagents; python_version >= '3.10'",
   "thefuzz",
   "langchain-core",
+  "openai",
 ]
 
 [tool.hatch.envs.hatch-test.env-vars]

src/cleanlab_codex/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: MIT
 from cleanlab_codex.client import Client
 from cleanlab_codex.project import Project
-from cleanlab_codex.validator import Validator
 
-__all__ = ["Client", "Project", "Validator"]
+__all__ = ["Client", "Project"]

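Note that this commit removes `Validator` from the public API, so `from cleanlab_codex import Validator` now raises an ImportError. A rough migration sketch follows; the old `Validator` usage shown in the comments is an assumption for illustration, not taken from this diff:

# Before (removed by this commit; the constructor shown is an assumption):
#   from cleanlab_codex import Validator
#   validator = Validator(codex_access_key="<project-access-key>")
#   validator.validate(query=..., context=..., response=...)
#
# After: call validate() directly on a Project instance.
from cleanlab_codex import Project

project = Project.from_access_key("<project-access-key>")  # placeholder key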
src/cleanlab_codex/project.py

Lines changed: 45 additions & 20 deletions
@@ -4,9 +4,10 @@
 
 from datetime import datetime
 from typing import TYPE_CHECKING as _TYPE_CHECKING
-from typing import Dict, Optional
+from typing import Dict, Optional, Union, cast
 
 from codex import AuthenticationError
+from codex.types.project_validate_params import Response
 
 from cleanlab_codex.internal.analytics import _AnalyticsMetadata
 from cleanlab_codex.internal.sdk_client import client_from_access_key
@@ -17,6 +18,7 @@
 
     from codex import Codex as _Codex
     from codex.types.project_validate_response import ProjectValidateResponse
+    from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
 
 
 _ERROR_CREATE_ACCESS_KEY = (
@@ -145,37 +147,60 @@ def create_access_key(
 
     def validate(
         self,
-        context: str,
-        prompt: str,
-        query: str,
-        response: str,
         *,
-        custom_metadata: Optional[object] = None,
+        messages: list[ChatCompletionMessageParam],
+        response: Union[ChatCompletion, str],
+        query: str,
+        context: str,
+        rewritten_query: Optional[str] = None,
+        metadata: Optional[object] = None,
         eval_scores: Optional[Dict[str, float]] = None,
-        eval_thresholds: Optional[Dict[str, float]] = None,
     ) -> ProjectValidateResponse:
-        """Run validation on a query to an AI system.
+        """Evaluate the quality of an AI-generated response using the structured message history, query, and retrieved context.
+
+        This method runs validation on an AI response using the full `messages` history (formatted as OpenAI-style chat messages),
+        which should include the latest user query and any preceding system or assistant messages.
+
+        For single-turn Q&A apps, `messages` can be a minimal list with one user message. For multi-turn conversations, provide the full dialog
+        leading up to the final response.
+
+        This method assesses the trustworthiness and quality of the AI `response` in light of the provided `context` and
+        `query`, which should align with the most recent user message in `messages`.
+
+        If your AI response is flagged as problematic, then this method will:
+        - return an expert answer if one was previously provided for a similar query
+        - otherwise log this query for future SME review (to consider providing an expert answer) in the Web interface.
 
         Args:
-            context (str): The context used by the AI system to generate a response for the query.
-            prompt (str): The full prompt (including system instructions, context, and the original query) used by the AI system to generate a response for the query.
-            query (str): The original user input to the AI system.
-            response (str): The response generated by the AI system for the query.
-            custom_metadata (object, optional): Custom metadata to log in Codex for the query.
-            eval_scores (Dict[str, float], optional): Optional scores to use for the query. When provided, Codex will skip running TrustworthyRAG evaluations on the query and use the provided scores instead.
-            eval_thresholds (Dict[str, float], optional): Optional thresholds to use for evaluating the query. We recommend configuring thresholds on the Project instead and using the same thresholds for all queries.
+            messages (list[ChatCompletionMessageParam]): The full message history from the AI conversation, formatted for OpenAI-style chat completion.
+                This must include the final user message that triggered the AI response. All other arguments (`query`, `context`, and `response`)
+                must correspond specifically to this final user message.
+            response (ChatCompletion | str): Your AI response that was generated based on the given `messages`. This is the response being evaluated, and it should not appear in `messages`.
+            query (str): The user query that the `response` is answering. This query should be the latest user message in `messages`.
+            context (str): The retrieved context (e.g., from your RAG system) that was supplied to the AI when generating the `response` to the final user query in `messages`.
+            rewritten_query (str, optional): An optional reformulation of `query` (e.g., made self-contained w.r.t. multi-turn conversations) to improve retrieval quality.
+            metadata (object, optional): Arbitrary metadata to associate with this validation for logging or analysis inside the Codex project.
+            eval_scores (dict[str, float], optional): Precomputed evaluation scores. Providing `eval_scores` for specific evaluations bypasses automated scoring and uses the supplied scores instead; consider this if you already have scores computed, to reduce runtime.
 
         Returns:
-            ProjectValidateResponse: The response from the validation.
+            ProjectValidateResponse: A structured object with the following fields:
+
+            - should_guardrail (bool): True if the AI system should suppress or modify the response before returning it to the user. When True, the response is considered problematic and may require further review or modification.
+            - escalated_to_sme (bool): True if the query should be escalated to an SME for review. When True, the query is logged and may be answered by an expert.
+            - eval_scores (dict[str, ThresholdedEvalScore]): Evaluation scores for different response attributes (e.g., trustworthiness, helpfulness, ...). Each includes a numeric score and a `failed` flag indicating whether the score falls below the threshold.
+            - expert_answer (str | None): If it was auto-determined that this query should be escalated to an SME, and a prior SME answer for a similar query was found, then this will return that expert answer; otherwise it is None.
+
+            When available, consider swapping your AI response with the expert answer before serving the response to your user.
         """
+
         return self._sdk_client.projects.validate(
             self._id,
+            messages=messages,
+            response=cast(Response, response),
             context=context,
-            prompt=prompt,
             query=query,
-            response=response,
-            custom_eval_thresholds=eval_thresholds,
-            custom_metadata=custom_metadata,
+            rewritten_question=rewritten_query,
+            custom_metadata=metadata,
             eval_scores=eval_scores,
         )

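For orientation, here is a minimal usage sketch of the updated API (not part of this commit; the access key, query, context, and response strings are placeholders, and `Project.from_access_key` is assumed from the existing SDK):

# Minimal sketch of the new validate() call (placeholders throughout).
from cleanlab_codex import Project

project = Project.from_access_key("<project-access-key>")  # assumed existing constructor

query = "What is the capital of France?"
response_text = "Paris is the capital of France."  # a plain str works; a ChatCompletion does too

result = project.validate(
    messages=[{"role": "user", "content": query}],  # single-turn: one user message suffices
    response=response_text,
    query=query,
    context="France's capital city is Paris.",  # retrieved context from your RAG system
)

# Per the docstring: prefer an SME-provided answer when one exists,
# and guardrail the response when it is flagged as problematic.
if result.expert_answer is not None:
    final_response = result.expert_answer
elif result.should_guardrail:
    final_response = "Sorry, I cannot answer that question confidently."
else:
    final_response = response_text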
src/cleanlab_codex/validator.py

Lines changed: 0 additions & 103 deletions
This file was deleted.

tests/conftest.py

Lines changed: 15 additions & 1 deletion
@@ -1,3 +1,17 @@
 from tests.fixtures.client import default_headers, mock_client_from_access_key, mock_client_from_api_key
+from tests.fixtures.validate import (
+    openai_chat_completion,
+    openai_messages_bad_no_user,
+    openai_messages_conversational,
+    openai_messages_single_turn,
+)
 
-__all__ = ["mock_client_from_access_key", "mock_client_from_api_key", "default_headers"]
+__all__ = [
+    "mock_client_from_access_key",
+    "mock_client_from_api_key",
+    "default_headers",
+    "openai_chat_completion",
+    "openai_messages_conversational",
+    "openai_messages_single_turn",
+    "openai_messages_bad_no_user",
+]

tests/fixtures/validate.py

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+from typing import cast
+
+import pytest
+from openai.types.chat import (
+    ChatCompletion,
+    ChatCompletionAssistantMessageParam,
+    ChatCompletionMessageParam,
+    ChatCompletionSystemMessageParam,
+    ChatCompletionUserMessageParam,
+)
+
+
+@pytest.fixture
+def openai_chat_completion() -> ChatCompletion:
+    """Fixture that returns a static fake OpenAI ChatCompletion object."""
+    raw_response = {
+        "id": "chatcmpl-test123",
+        "object": "chat.completion",
+        "created": 1719876543,
+        "model": "gpt-4",
+        "choices": [
+            {
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": "Paris",
+                },
+                "finish_reason": "stop",
+            }
+        ],
+        "usage": {
+            "prompt_tokens": 5,
+            "completion_tokens": 1,
+            "total_tokens": 6,
+        },
+    }
+
+    return ChatCompletion.model_validate(raw_response)
+
+
+@pytest.fixture
+def openai_messages_single_turn() -> list[ChatCompletionMessageParam]:
+    """Fixture that returns a single-turn message format."""
+    return [cast(ChatCompletionUserMessageParam, {"role": "user", "content": "What is the capital of France?"})]
+
+
+@pytest.fixture
+def openai_messages_bad_no_user() -> list[ChatCompletionMessageParam]:
+    """Fixture that returns invalid messages (missing required user message)."""
+    return [
+        cast(ChatCompletionAssistantMessageParam, {"role": "assistant", "content": "hi"}),
+        cast(ChatCompletionSystemMessageParam, {"role": "system", "content": "sys"}),
+    ]
+
+
+@pytest.fixture
+def openai_messages_conversational() -> list[ChatCompletionMessageParam]:
+    """Fixture that returns a conversational message format."""
+    return [
+        cast(ChatCompletionSystemMessageParam, {"role": "system", "content": "You are a helpful assistant."}),
+        cast(ChatCompletionUserMessageParam, {"role": "user", "content": "I love France!"}),
+        cast(ChatCompletionAssistantMessageParam, {"role": "assistant", "content": "That's great!"}),
+        cast(ChatCompletionUserMessageParam, {"role": "user", "content": "What is its capital?"}),
+    ]

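A hypothetical test sketch (not part of this commit) showing how these fixtures could drive the new `Project.validate`; the `project` fixture wrapping a mocked SDK client is an assumption:

# Hypothetical test (names other than the fixtures above are assumptions).
def test_validate_conversational(project, openai_messages_conversational, openai_chat_completion):
    result = project.validate(
        messages=openai_messages_conversational,  # full multi-turn dialog
        response=openai_chat_completion,          # ChatCompletion whose content is "Paris"
        query="What is its capital?",             # latest user message in the fixture
        context="France's capital city is Paris.",
    )
    # With a mocked SDK client, assertions would target the forwarded arguments
    # or the mock's canned ProjectValidateResponse.
    assert result is not None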