
Use evaluation SDK evals #214

Open
Wants to merge 2 commits into base: main
docs/workshop/src/1-build/basic.prompty (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ model:
  configuration:
    type: azure_openai
    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
-    azure_deployment: gpt-35-turbo
+    azure_deployment: gpt-4-evals
  parameters:
    max_tokens: 3000
sample:
infra/ai.yaml (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ deployments:
      version: "0613"
    sku:
      name: Standard
-      capacity: 20
+      capacity: 14
  - name: text-embedding-ada-002
    model:
      format: OpenAI
infra/hooks/postprovision.sh (2 changes: 1 addition & 1 deletion)
@@ -17,7 +17,7 @@ fi
# Set additional environment variables expected by app
# TODO: Standardize these and remove need for setting here
azd env set AZURE_OPENAI_API_VERSION 2023-03-15-preview
-azd env set AZURE_OPENAI_CHAT_DEPLOYMENT gpt-35-turbo
+azd env set AZURE_OPENAI_CHAT_DEPLOYMENT gpt-4-evals
azd env set AZURE_SEARCH_ENDPOINT $AZURE_SEARCH_ENDPOINT

# Output environment variables to .env file using azd env get-values
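Reviewer note: the deployment name exported here is what the new SDK-based evaluation code reads at runtime, for example:

import os

# Populated by azd via postprovision.sh; resolves to "gpt-4-evals" once this change is provisioned
deployment = os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"]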
infra/main.bicep (2 changes: 1 addition & 1 deletion)
@@ -204,7 +204,7 @@ module aca 'app/aca.bicep' = {
    identityId: managedIdentity.outputs.managedIdentityClientId
    containerAppsEnvironmentName: containerApps.outputs.environmentName
    containerRegistryName: containerApps.outputs.registryName
-    openAiDeploymentName: !empty(openAiDeploymentName) ? openAiDeploymentName : 'gpt-35-turbo'
+    openAiDeploymentName: !empty(openAiDeploymentName) ? openAiDeploymentName : 'gpt-4-evals'
    openAiEmbeddingDeploymentName: openAiEmbeddingDeploymentName
    openAiEndpoint: ai.outputs.openAiEndpoint
    openAiType: openAiType
infra/main.parameters.json (2 changes: 1 addition & 1 deletion)
@@ -19,7 +19,7 @@
      "value": "${AZURE_EMBEDDING_NAME=text-embedding-ada-002}"
    },
    "openAiDeploymentName": {
-      "value": "${AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=gpt-35-turbo}"
+      "value": "${AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=gpt-4-evals}"
    },
    "principalId": {
      "value": "${AZURE_PRINCIPAL_ID}"
src/api/contoso_chat/chat.prompty (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ model:
  api: chat
  configuration:
    type: azure_openai
-    azure_deployment: gpt-35-turbo
+    azure_deployment: gpt-4-evals
    azure_endpoint: ${ENV:AZURE_OPENAI_ENDPOINT}
    api_version: 2023-07-01-preview
  parameters:
src/api/contoso_chat/product/product.prompty (2 changes: 1 addition & 1 deletion)
@@ -7,7 +7,7 @@ model:
  api: chat
  configuration:
    type: azure_openai
-    azure_deployment: gpt-35-turbo
+    azure_deployment: gpt-4-evals
    api_version: 2023-07-01-preview
    azure_endpoint: ${ENV:AZURE_OPENAI_ENDPOINT}
  parameters:
src/api/contoso_chat/product/product.py (6 changes: 5 additions & 1 deletion)
@@ -1,5 +1,6 @@
import os
import json
+import re
from typing import Dict, List
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from prompty.tracer import trace
@@ -87,7 +88,10 @@ def find_products(context: str) -> Dict[str, any]:
        configuration=model_config,
        inputs={"context":context}
    )
-    qs = json.loads(queries)
+    if queries.startswith("```"):
+        queries = re.sub(r"^```(?:json)?\n|\n```$", "", queries.strip(), flags=re.MULTILINE)
+    qs = json.loads(queries.strip())
+
    # Generate embeddings
    items = generate_embeddings(qs)
    # Retrieve products
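Reviewer note: the new guard in find_products strips the Markdown code fence that the chat model sometimes wraps around its JSON output before the string is parsed. A minimal standalone sketch of the same logic, using a made-up model response (the real payload shape comes from product.prompty and may differ):

import json
import re

# Hypothetical model output wrapped in a ```json fence
queries = '```json\n["waterproof tent", "sleeping bag for cold weather"]\n```'

if queries.startswith("```"):
    queries = re.sub(r"^```(?:json)?\n|\n```$", "", queries.strip(), flags=re.MULTILINE)

qs = json.loads(queries.strip())
print(qs)  # ['waterproof tent', 'sleeping bag for cold weather']

One small caveat worth checking: startswith runs before strip, so a response that begins with whitespace and then a fence would skip the cleanup.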
src/api/evaluate-chat-flow.ipynb (53 changes: 45 additions & 8 deletions)
@@ -59,9 +59,9 @@
" \n",
" # Add results to list\n",
" result = {\n",
" 'question': question,\n",
" 'query': question,\n",
" 'context': response[\"context\"],\n",
" 'answer': response[\"answer\"]\n",
" 'response': response[\"answer\"]\n",
" }\n",
" results.append(result)\n",
"\n",
@@ -88,9 +88,9 @@
" results.append(json.loads(line))\n",
"\n",
" for result in results:\n",
" question = result['question']\n",
" question = result['query']\n",
" context = result['context']\n",
" answer = result['answer']\n",
" answer = result['response']\n",
" \n",
" groundedness_score = groundedness_evaluation(question=question, answer=answer, context=context)\n",
" fluency_score = fluency_evaluation(question=question, answer=answer, context=context)\n",
@@ -117,6 +117,43 @@
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azure.ai.evaluation import RelevanceEvaluator, FluencyEvaluator, CoherenceEvaluator, GroundednessEvaluator\n",
"from azure.ai.evaluation import ContentSafetyEvaluator\n",
"from azure.ai.evaluation import evaluate as ev\n",
"\n",
"def evaluate_using_sdk():\n",
" # Evaluate results from results file\n",
" results_path = 'result.jsonl'\n",
" model_config = {\n",
" \"azure_endpoint\": os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n",
" \"azure_deployment\": os.environ[\"AZURE_OPENAI_CHAT_DEPLOYMENT\"],\n",
" }\n",
" result = ev(\n",
" data=results_path,\n",
" evaluators={\"relevance\": RelevanceEvaluator(model_config),\n",
" \"fluency\": FluencyEvaluator(model_config),\n",
" \"coherence\": CoherenceEvaluator(model_config),\n",
" \"groundedness\": GroundednessEvaluator(model_config),\n",
" \"content_safety\": ContentSafetyEvaluator(model_config)\n",
" },\n",
" evaluator_config={\"relevance\":{\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}},\n",
" \"fluency\": {\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}},\n",
" \"coherence\": {\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}},\n",
" \"groundedness\": {\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}},\n",
" \"content_safety\": {\"column_mapping\": {\"query\": \"${data.question}\", \"response\": \"${data.answer}\"}}},\n",
" )\n",
" result_df = pd.DataFrame(result[\"rows\"])\n",
" result_df.to_json('eval_results.jsonl')\n",
" result_df.head()\n",
" return result_df "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -127,7 +164,7 @@
" print(\"Evaluation summary:\\n\")\n",
" print(df)\n",
" # drop question, context and answer\n",
" mean_df = df.drop([\"question\", \"context\", \"answer\"], axis=1).mean()\n",
" mean_df = df.drop([\"query\", \"context\", \"response\"], axis=1).mean()\n",
" print(\"\\nAverage scores:\")\n",
" print(mean_df)\n",
" df.to_markdown('eval_results.md')\n",
@@ -149,14 +186,14 @@
"\n",
" test_data_df = load_data()\n",
" response_results = create_response_data(test_data_df)\n",
" result_evaluated = evaluate()\n",
" result_evaluated = evaluate_using_sdk()\n",
" create_summary(result_evaluated)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "prompty",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -170,7 +207,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,
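Reviewer note: the new evaluate_using_sdk cell (43 added lines in the hunk above) swaps the hand-rolled Prompty evaluators for the built-in quality evaluators in azure-ai-evaluation, and the earlier question/answer to query/response renames make the result.jsonl columns line up with the input names those evaluators expect, which is why the column_mapping entries can point straight at ${data.query} and ${data.response}. A cut-down sketch of the same call with a single evaluator (assumes AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_CHAT_DEPLOYMENT are set and result.jsonl exists; not a drop-in replacement for the cell above):

import os
import pandas as pd
from azure.ai.evaluation import RelevanceEvaluator, evaluate

# result.jsonl rows look like {"query": ..., "context": ..., "response": ...},
# which matches the evaluator's default input names.
model_config = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "azure_deployment": os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
}
result = evaluate(
    data="result.jsonl",
    evaluators={"relevance": RelevanceEvaluator(model_config)},
)
print(pd.DataFrame(result["rows"]).head())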
src/api/evaluate.py (27 changes: 26 additions & 1 deletion)
@@ -11,6 +11,9 @@
from prompty.tracer import trace
from tracing import init_tracing
from contoso_chat.chat_request import get_response
from azure.ai.evaluation import RelevanceEvaluator, FluencyEvaluator, CoherenceEvaluator, GroundednessEvaluator
from azure.ai.evaluation import ContentSafetyEvaluator
from azure.ai.evaluation import evaluate

# %% [markdown]
# ## Get output from data and save to results jsonl file
@@ -90,6 +93,28 @@ def evaluate():
df.head()

return df
# %%
@trace
def evaluate_using_sdk():
# Evaluate results from results file
results_path = 'result.jsonl'
model_config = {
"azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
"api_version": os.environ["AZURE_OPENAI_API_VERSION"],
}
result = evaluate(
data=results_path,
evaluators={"relevance": RelevanceEvaluator(model_config),
"fluency": FluencyEvaluator(model_config),
"coherence": CoherenceEvaluator(model_config),
"groundedness": GroundednessEvaluator(model_config),
"content_safety": ContentSafetyEvaluator(model_config)
},
)
result_df = pd.DataFrame(result["rows"])
result_df.to_json('eval_results.jsonl')
result_df.head()
return result_df

# %%
@trace
@@ -113,7 +138,7 @@ def create_summary(df):
tracer = init_tracing(local_tracing=True)
test_data_df = load_data()
response_results = create_response_data(test_data_df)
-result_evaluated = evaluate()
+result_evaluated = evaluate_using_sdk()
create_summary(result_evaluated)


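Reviewer note: evaluate.py mirrors the notebook change, but its model_config sets only azure_endpoint and api_version, while the notebook version also sets azure_deployment from AZURE_OPENAI_CHAT_DEPLOYMENT (which postprovision.sh now exports); aligning the two seems safer, since the AI-assisted evaluators need a chat deployment to score with. Two more things worth verifying against the installed SDK version: ContentSafetyEvaluator may expect an Azure AI project and credential rather than a model_config, and the rows returned by evaluate() are typically namespaced (for example inputs.query, outputs.relevance...), so create_summary's drop(["query", "context", "response"]) may not find those columns. A hedged sketch of a summary step that tolerates either column layout:

import pandas as pd

def summarize(df: pd.DataFrame) -> pd.Series:
    # Average only the numeric score columns; no column names are assumed,
    # so this works for both the old frame and the SDK-shaped one.
    return df.select_dtypes("number").mean()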
src/api/requirements.txt (3 changes: 2 additions & 1 deletion)
@@ -18,4 +18,5 @@ opentelemetry-instrumentation-fastapi
jupyter
opentelemetry-instrumentation
azure-identity==1.17.1
-tabulate
+tabulate
+azure-ai-evaluation