@@ -4,9 +4,10 @@

 from datetime import datetime
 from typing import TYPE_CHECKING as _TYPE_CHECKING
-from typing import Dict, Optional
+from typing import Dict, Optional, Union, cast

 from codex import AuthenticationError
+from codex.types.project_validate_params import Response

 from cleanlab_codex.internal.analytics import _AnalyticsMetadata
 from cleanlab_codex.internal.sdk_client import client_from_access_key
@@ -17,6 +18,7 @@

     from codex import Codex as _Codex
     from codex.types.project_validate_response import ProjectValidateResponse
+    from openai.types.chat import ChatCompletion, ChatCompletionMessageParam


 _ERROR_CREATE_ACCESS_KEY = (
@@ -145,37 +147,60 @@ def create_access_key(

     def validate(
         self,
-        context: str,
-        prompt: str,
-        query: str,
-        response: str,
         *,
-        custom_metadata: Optional[object] = None,
+        messages: list[ChatCompletionMessageParam],
+        response: Union[ChatCompletion, str],
+        query: str,
+        context: str,
+        rewritten_query: Optional[str] = None,
+        metadata: Optional[object] = None,
         eval_scores: Optional[Dict[str, float]] = None,
-        eval_thresholds: Optional[Dict[str, float]] = None,
     ) -> ProjectValidateResponse:
-        """Run validation on a query to an AI system.
+        """Evaluate the quality of an AI-generated response using the structured message history, query, and retrieved context.
+
+        This method runs validation on an AI response using the full `messages` history (formatted as OpenAI-style chat messages),
+        which should include the latest user query and any preceding system or assistant messages.
+
+        For single-turn Q&A apps, `messages` can be a minimal list with one user message. For multi-turn conversations, provide the full dialog
+        leading up to the final response.
+
+        The function assesses the trustworthiness and quality of the AI `response` in light of the provided `context` and
+        `query`, which should align with the most recent user message in `messages`.
+
+        If your AI response is flagged as problematic, this method will:
+        - return an expert answer if one was previously provided for a similar query
+        - otherwise log this query for future SME review (to consider providing an expert answer) in the Web interface.

         Args:
-            context (str): The context used by the AI system to generate a response for the query.
-            prompt (str): The full prompt (including system instructions, context, and the original query) used by the AI system to generate a response for the query.
-            query (str): The original user input to the AI system.
-            response (str): The response generated by the AI system for the query.
-            custom_metadata (object, optional): Custom metadata to log in Codex for the query.
-            eval_scores (Dict[str, float], optional): Optional scores to use for the query. When provided, Codex will skip running TrustworthyRAG evaluations on the query and use the provided scores instead.
-            eval_thresholds (Dict[str, float], optional): Optional thresholds to use for evaluating the query. We recommend configuring thresholds on the Project instead and using the same thresholds for all queries.
+            messages (list[ChatCompletionMessageParam]): The full message history from the AI conversation, formatted for OpenAI-style chat completion.
+                This must include the final user message that triggered the AI response. All other arguments (`query`, `context`, and `response`)
+                must correspond specifically to this final user message.
+            response (ChatCompletion | str): Your AI response that was generated based on the given `messages`. This is the response being evaluated, and it should not appear in `messages`.
+            query (str): The user query that the `response` is answering. This query should be the latest user message in `messages`.
+            context (str): The retrieved context (e.g., from your RAG system) that was supplied to the AI when generating the `response` to the final user query in `messages`.
+            rewritten_query (str, optional): An optional reformulation of `query` (e.g., made self-contained with respect to the multi-turn conversation) to improve retrieval quality.
+            metadata (object, optional): Arbitrary metadata to associate with this validation for logging or analysis inside the Codex project.
+            eval_scores (dict[str, float], optional): Precomputed evaluation scores. Providing `eval_scores` for specific evaluations bypasses automated scoring for those evaluations and uses the supplied scores instead; consider this if you already have scores computed, to reduce runtime.

         Returns:
-            ProjectValidateResponse: The response from the validation.
+            ProjectValidateResponse: A structured object with the following fields:
+
+            - should_guardrail (bool): True if the AI system should suppress or modify the response before returning it to the user. When True, the response is considered problematic and may require further review or modification.
+            - escalated_to_sme (bool): True if the query should be escalated to an SME for review. When True, the query is logged and may be answered by an expert.
+            - eval_scores (dict[str, ThresholdedEvalScore]): Evaluation scores for different response attributes (e.g., trustworthiness, helpfulness, ...). Each includes a numeric score and a `failed` flag indicating whether the score falls below the threshold.
+            - expert_answer (str | None): If it was auto-determined that this query should be escalated to an SME, and a prior SME answer for a similar query was found, this field contains that expert answer. Otherwise, it is None.
+
+            When available, consider swapping your AI response with the expert answer before serving the response to your user.
         """
+
         return self._sdk_client.projects.validate(
             self._id,
+            messages=messages,
+            response=cast(Response, response),
             context=context,
-            prompt=prompt,
             query=query,
-            response=response,
-            custom_eval_thresholds=eval_thresholds,
-            custom_metadata=custom_metadata,
+            rewritten_question=rewritten_query,
+            custom_metadata=metadata,
             eval_scores=eval_scores,
         )

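A minimal usage sketch of the updated `validate()` signature is shown below, for illustration only: the access key placeholder, OpenAI model name, hard-coded `context`/`query` strings, and the fallback message are assumptions, not part of this diff.

# Illustrative usage of the updated validate() signature (assumed setup; not part of the diff).
from openai import OpenAI
from cleanlab_codex import Project

project = Project.from_access_key("<project-access-key>")  # hypothetical access key
openai_client = OpenAI()

context = "Our return policy allows refunds within 30 days of purchase."  # retrieved from your RAG system
query = "Can I return an item after three weeks?"
messages = [
    {"role": "system", "content": f"Answer using only this context:\n{context}"},
    {"role": "user", "content": query},
]

# Generate the AI response being validated (a ChatCompletion or a plain string both work).
completion = openai_client.chat.completions.create(model="gpt-4o-mini", messages=messages)

result = project.validate(
    messages=messages,
    response=completion,
    query=query,
    context=context,
)

# Prefer a prior expert answer when one exists; otherwise guardrail or serve the AI response.
if result.expert_answer is not None:
    final_answer = result.expert_answer
elif result.should_guardrail:
    final_answer = "I'm not sure about that; let me connect you with a human."
else:
    final_answer = completion.choices[0].message.content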