
Use evaluation SDK evals #214

Open
Wants to merge 2 commits into base: main
docs/workshop/src/1-build/basic.prompty (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ model:
  configuration:
    type: azure_openai
    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
-    azure_deployment: gpt-35-turbo
+    azure_deployment: gpt-4-evals
  parameters:
    max_tokens: 3000
sample:
infra/ai.yaml (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ deployments:
      version: "0613"
    sku:
      name: Standard
-      capacity: 20
+      capacity: 14
  - name: text-embedding-ada-002
    model:
      format: OpenAI
infra/hooks/postprovision.sh (2 changes: 1 addition & 1 deletion)
@@ -17,7 +17,7 @@ fi
# Set additional environment variables expected by app
# TODO: Standardize these and remove need for setting here
azd env set AZURE_OPENAI_API_VERSION 2023-03-15-preview
-azd env set AZURE_OPENAI_CHAT_DEPLOYMENT gpt-35-turbo
+azd env set AZURE_OPENAI_CHAT_DEPLOYMENT gpt-4-evals
azd env set AZURE_SEARCH_ENDPOINT $AZURE_SEARCH_ENDPOINT

# Output environment variables to .env file using azd env get-values
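Reviewer note: the deployment name exported here is what the new SDK-based evaluation code reads at runtime, for example:

import os

# Populated by azd via postprovision.sh; resolves to "gpt-4-evals" once this change is provisioned
deployment = os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"]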
infra/main.bicep (2 changes: 1 addition & 1 deletion)
@@ -204,7 +204,7 @@ module aca 'app/aca.bicep' = {
    identityId: managedIdentity.outputs.managedIdentityClientId
    containerAppsEnvironmentName: containerApps.outputs.environmentName
    containerRegistryName: containerApps.outputs.registryName
-    openAiDeploymentName: !empty(openAiDeploymentName) ? openAiDeploymentName : 'gpt-35-turbo'
+    openAiDeploymentName: !empty(openAiDeploymentName) ? openAiDeploymentName : 'gpt-4-evals'
    openAiEmbeddingDeploymentName: openAiEmbeddingDeploymentName
    openAiEndpoint: ai.outputs.openAiEndpoint
    openAiType: openAiType
infra/main.parameters.json (2 changes: 1 addition & 1 deletion)
@@ -19,7 +19,7 @@
      "value": "${AZURE_EMBEDDING_NAME=text-embedding-ada-002}"
    },
    "openAiDeploymentName": {
-      "value": "${AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=gpt-35-turbo}"
+      "value": "${AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=gpt-4-evals}"
    },
    "principalId": {
      "value": "${AZURE_PRINCIPAL_ID}"
src/api/contoso_chat/chat.prompty (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ model:
  api: chat
  configuration:
    type: azure_openai
-    azure_deployment: gpt-35-turbo
+    azure_deployment: gpt-4-evals
    azure_endpoint: ${ENV:AZURE_OPENAI_ENDPOINT}
    api_version: 2023-07-01-preview
  parameters:
src/api/contoso_chat/product/product.prompty (2 changes: 1 addition & 1 deletion)
@@ -7,7 +7,7 @@ model:
  api: chat
  configuration:
    type: azure_openai
-    azure_deployment: gpt-35-turbo
+    azure_deployment: gpt-4-evals
    api_version: 2023-07-01-preview
    azure_endpoint: ${ENV:AZURE_OPENAI_ENDPOINT}
  parameters:
src/api/contoso_chat/product/product.py (6 changes: 5 additions & 1 deletion)
@@ -1,5 +1,6 @@
import os
import json
+import re
from typing import Dict, List
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from prompty.tracer import trace
@@ -87,7 +88,10 @@ def find_products(context: str) -> Dict[str, any]:
        configuration=model_config,
        inputs={"context":context}
    )
-    qs = json.loads(queries)
+    if queries.startswith("```"):
+        queries = re.sub(r"^```(?:json)?\n|\n```$", "", queries.strip(), flags=re.MULTILINE)
+    qs = json.loads(queries.strip())
+
    # Generate embeddings
    items = generate_embeddings(qs)
    # Retrieve products
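Reviewer note: the new guard in find_products strips the Markdown code fence that the chat model sometimes wraps around its JSON output before the string is parsed. A minimal standalone sketch of the same logic, using a made-up model response (the real payload shape comes from product.prompty and may differ):

import json
import re

# Hypothetical model output wrapped in a ```json fence
queries = '```json\n["waterproof tent", "sleeping bag for cold weather"]\n```'

if queries.startswith("```"):
    queries = re.sub(r"^```(?:json)?\n|\n```$", "", queries.strip(), flags=re.MULTILINE)

qs = json.loads(queries.strip())
print(qs)  # ['waterproof tent', 'sleeping bag for cold weather']

One small caveat worth checking: startswith runs before strip, so a response that begins with whitespace and then a fence would skip the cleanup.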
src/api/evaluate-chat-flow.ipynb (53 changes: 45 additions & 8 deletions)
@@ -59,9 +59,9 @@
" \n",
" # Add results to list\n",
" result = {\n",
" 'question': question,\n",
" 'query': question,\n",
" 'context': response[\"context\"],\n",
" 'answer': response[\"answer\"]\n",
" 'response': response[\"answer\"]\n",
" }\n",
" results.append(result)\n",
"\n",
@@ -88,9 +88,9 @@
" results.append(json.loads(line))\n",
"\n",
" for result in results:\n",
" question = result['question']\n",
" question = result['query']\n",
" context = result['context']\n",
" answer = result['answer']\n",
" answer = result['response']\n",
" \n",
" groundedness_score = groundedness_evaluation(question=question, answer=answer, context=context)\n",
" fluency_score = fluency_evaluation(question=question, answer=answer, context=context)\n",
@@ -117,6 +117,43 @@
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azure.ai.evaluation import RelevanceEvaluator, FluencyEvaluator, CoherenceEvaluator, GroundednessEvaluator\n",
"from azure.ai.evaluation import ContentSafetyEvaluator\n",
"from azure.ai.evaluation import evaluate as ev\n",
"\n",
"def evaluate_using_sdk():\n",
" # Evaluate results from results file\n",
" results_path = 'result.jsonl'\n",
" model_config = {\n",
" \"azure_endpoint\": os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n",
" \"azure_deployment\": os.environ[\"AZURE_OPENAI_CHAT_DEPLOYMENT\"],\n",
" }\n",
" result = ev(\n",
" data=results_path,\n",
" evaluators={\"relevance\": RelevanceEvaluator(model_config),\n",
" \"fluency\": FluencyEvaluator(model_config),\n",
" \"coherence\": CoherenceEvaluator(model_config),\n",
" \"groundedness\": GroundednessEvaluator(model_config),\n",
" \"content_safety\": ContentSafetyEvaluator(model_config)\n",
" },\n",
" evaluator_config={\"relevance\":{\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}},\n",
" \"fluency\": {\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}},\n",
" \"coherence\": {\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}},\n",
" \"groundedness\": {\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}},\n",
" \"content_safety\": {\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}}},\n",
" )\n",
" result_df = pd.DataFrame(result[\"rows\"])\n",
" result_df.to_json('eval_results.jsonl')\n",
" result_df.head()\n",
" return result_df "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -127,7 +164,7 @@
" print(\"Evaluation summary:\\n\")\n",
" print(df)\n",
" # drop question, context and answer\n",
" mean_df = df.drop([\"question\", \"context\", \"answer\"], axis=1).mean()\n",
" mean_df = df.drop([\"query\", \"context\", \"response\"], axis=1).mean()\n",
" print(\"\\nAverage scores:\")\n",
" print(mean_df)\n",
" df.to_markdown('eval_results.md')\n",
@@ -149,14 +186,14 @@
"\n",
" test_data_df = load_data()\n",
" response_results = create_response_data(test_data_df)\n",
" result_evaluated = evaluate()\n",
" result_evaluated = evaluate_using_sdk()\n",
" create_summary(result_evaluated)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "prompty",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -170,7 +207,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,
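Reviewer note: the new evaluate_using_sdk cell (43 added lines in the hunk above) swaps the hand-rolled Prompty evaluators for the built-in quality evaluators in azure-ai-evaluation, and the earlier question/answer to query/response renames make the result.jsonl columns line up with the input names those evaluators expect, which is why the column_mapping entries can point straight at ${data.query} and ${data.response}. A cut-down sketch of the same call with a single evaluator (assumes AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_CHAT_DEPLOYMENT are set and result.jsonl exists; not a drop-in replacement for the cell above):

import os
import pandas as pd
from azure.ai.evaluation import RelevanceEvaluator, evaluate

# result.jsonl rows look like {"query": ..., "context": ..., "response": ...},
# which matches the evaluator's default input names.
model_config = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "azure_deployment": os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
}
result = evaluate(
    data="result.jsonl",
    evaluators={"relevance": RelevanceEvaluator(model_config)},
)
print(pd.DataFrame(result["rows"]).head())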
src/api/evaluate.py (27 changes: 26 additions & 1 deletion)
@@ -11,6 +11,9 @@
from prompty.tracer import trace
from tracing import init_tracing
from contoso_chat.chat_request import get_response
from azure.ai.evaluation import RelevanceEvaluator, FluencyEvaluator, CoherenceEvaluator, GroundednessEvaluator
from azure.ai.evaluation import ContentSafetyEvaluator
from azure.ai.evaluation import evaluate

# %% [markdown]
# ## Get output from data and save to results jsonl file
@@ -90,6 +93,28 @@ def evaluate():
df.head()

return df
# %%
@trace
def evaluate_using_sdk():
# Evaluate results from results file
results_path = 'result.jsonl'
model_config = {
"azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
"api_version": os.environ["AZURE_OPENAI_API_VERSION"],
}
result = evaluate(
data=results_path,
evaluators={"relevance": RelevanceEvaluator(model_config),
"fluency": FluencyEvaluator(model_config),
"coherence": CoherenceEvaluator(model_config),
"groundedness": GroundednessEvaluator(model_config),
"content_safety": ContentSafetyEvaluator(model_config)
},
)
result_df = pd.DataFrame(result["rows"])
result_df.to_json('eval_results.jsonl')
result_df.head()
return result_df

# %%
@trace
@@ -113,7 +138,7 @@ def create_summary(df):
tracer = init_tracing(local_tracing=True)
test_data_df = load_data()
response_results = create_response_data(test_data_df)
-result_evaluated = evaluate()
+result_evaluated = evaluate_using_sdk()
create_summary(result_evaluated)


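Reviewer note: evaluate.py mirrors the notebook change, but its model_config sets only azure_endpoint and api_version, while the notebook version also sets azure_deployment from AZURE_OPENAI_CHAT_DEPLOYMENT (which postprovision.sh now exports); aligning the two seems safer, since the AI-assisted evaluators need a chat deployment to score with. Two more things worth verifying against the installed SDK version: ContentSafetyEvaluator may expect an Azure AI project and credential rather than a model_config, and the rows returned by evaluate() are typically namespaced (for example inputs.query, outputs.relevance...), so create_summary's drop(["query", "context", "response"]) may not find those columns. A hedged sketch of a summary step that tolerates either column layout:

import pandas as pd

def summarize(df: pd.DataFrame) -> pd.Series:
    # Average only the numeric score columns; no column names are assumed,
    # so this works for both the old frame and the SDK-shaped one.
    return df.select_dtypes("number").mean()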
src/api/requirements.txt (3 changes: 2 additions & 1 deletion)
@@ -18,4 +18,5 @@ opentelemetry-instrumentation-fastapi
jupyter
opentelemetry-instrumentation
azure-identity==1.17.1
-tabulate
+tabulate
+azure-ai-evaluation