
Commit 5976ec9

Refactor evaluation sample app (#29)

* chore(samples): run formatter, remove unused imports
* refactor(samples): export ragas eval results
* refactor(samples): export deepeval eval results
* refactor(samples): avoid official package name conflict

1 parent 2c55648 · commit 5976ec9

File tree

3 files changed: +58 -35 lines changed

* samples/evaluation/Taskfile.yaml
* samples/evaluation/tests/_deepeval.py (renamed from tests/deepeval.py)
* samples/evaluation/tests/_ragas.py (renamed from tests/ragas.py)

samples/evaluation/Taskfile.yaml
Lines changed: 3 additions & 3 deletions

@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0

 ---
-version: "3"
+version: '3'

 silent: true

@@ -26,13 +26,13 @@ tasks:
     internal: true
     dir: '{{.USER_WORKING_DIR}}'
     cmds:
-      - poetry run python -m tests.deepeval
+      - poetry run python -m tests._deepeval

   run:test-ragas:
     internal: true
     dir: '{{.USER_WORKING_DIR}}'
     cmds:
-      - poetry run python -m tests.ragas
+      - poetry run python -m tests._ragas

   run:test:
     desc: Run tests

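The module rename behind these task changes matches the last commit-message bullet ("avoid official package name conflict"): a local module named deepeval or ragas can shadow the installed package of the same name under some import configurations, for example when the tests directory itself ends up on sys.path. The snippet below is only an illustrative diagnostic, not part of the commit; it checks which files the two names actually resolve to:

# Illustrative check (hypothetical, not in the repository): verify that
# "deepeval" and "ragas" resolve to the installed packages rather than to
# local sample modules of the same name.
import deepeval
import ragas

for pkg in (deepeval, ragas):
    # If a sample module shadowed the package, __file__ would point inside
    # samples/evaluation/tests instead of site-packages.
    print(pkg.__name__, "->", pkg.__file__)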
samples/evaluation/tests/deepeval.py renamed to samples/evaluation/tests/_deepeval.py
Lines changed: 21 additions & 11 deletions

@@ -3,10 +3,16 @@
 # SPDX-License-Identifier: Apache-2.0

 from os import environ
-from deepeval.test_case import LLMTestCase
-from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ToxicityMetric
-from deepeval.cli.main import set_local_model_env, unset_local_model_env, set_azure_openai_env, unset_azure_openai_env
+
+from deepeval.cli.main import (
+    set_azure_openai_env,
+    set_local_model_env,
+    unset_azure_openai_env,
+    unset_local_model_env,
+)
 from deepeval.dataset import EvaluationDataset
+from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ToxicityMetric
+from deepeval.test_case import LLMTestCase
 from model.crew import run_crew

 azure_openai_api_key = environ.get("AZURE_OPENAI_API_KEY", "NA")
@@ -18,6 +24,7 @@
 eval_model_name = environ.get("LOCAL_MODEL_NAME", "llama3.1")
 eval_base_url = environ.get("LOCAL_MODEL_BASE_URL", "http://localhost:11434/v1/")

+
 def eval():
     if azure_openai_api_key != "NA":
         print("Set Azure OpenAI model for evaluation")
@@ -30,10 +37,11 @@ def eval():
         )
     else:
         print("Set local model for evaluation")
-        set_local_model_env(model_name=eval_model_name,
-                            base_url=eval_base_url,
-                            api_key="dummy-key",
-                            format='json',
+        set_local_model_env(
+            model_name=eval_model_name,
+            base_url=eval_base_url,
+            api_key="dummy-key",
+            format="json",
         )

     test_input = "Gather data about new Critical CVEs from todays date"
@@ -44,8 +52,8 @@ def eval():

     test_cases = []
     test_case = LLMTestCase(
-        input = test_input,
-        actual_output = test_output,
+        input=test_input,
+        actual_output=test_output,
     )
     test_cases.append(test_case)

@@ -55,14 +63,16 @@ def eval():
     toxicity_metric = ToxicityMetric(threshold=0.5)

     dataset = EvaluationDataset(test_cases=test_cases)
-    dataset.evaluate([answer_relevancy_metric, bias_metric, toxicity_metric])
-
+    result = dataset.evaluate([answer_relevancy_metric, bias_metric, toxicity_metric])

     print("Test End")
     if azure_openai_api_key != "NA":
         unset_azure_openai_env()
     else:
         unset_local_model_env()

+    return (dataset, result)
+
+
 if __name__ == "__main__":
     eval()
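
With this change eval() returns the dataset together with whatever dataset.evaluate(...) returned, which is what the "export deepeval eval results" bullet refers to: callers can post-process a run instead of relying only on deepeval's console output. A minimal consumer sketch follows; the helper file name is hypothetical, the concrete type of result varies across deepeval releases, and running it executes the crew, so the model environment variables from the script must be set:

# consume_deepeval.py -- hypothetical helper, not part of this commit
import json

from tests._deepeval import eval as run_deepeval_eval

if __name__ == "__main__":
    dataset, result = run_deepeval_eval()
    # Store the result generically via repr(), since its schema depends on
    # the installed deepeval version.
    with open("deepeval_results.json", "w") as fp:
        json.dump({"result": repr(result)}, fp, indent=2)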

samples/evaluation/tests/ragas.py renamed to samples/evaluation/tests/_ragas.py
Lines changed: 34 additions & 21 deletions

@@ -2,17 +2,32 @@
 # SPDX-License-Identifier: Apache-2.0

 from os import environ
-from datasets import Dataset

+from datasets import Dataset
 from langchain_community.chat_models import ChatOllama
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
+from model.crew import run_crew
 from ragas import evaluate
 from ragas.embeddings import LangchainEmbeddingsWrapper
-from model.crew import run_crew
 from ragas.llms import LangchainLLMWrapper
-from ragas.metrics import answer_correctness, answer_relevancy, answer_similarity, context_entity_recall, context_precision, context_recall, faithfulness, summarization_score
-from ragas.metrics._aspect_critic import harmfulness, maliciousness, coherence, correctness, conciseness
+from ragas.metrics import (
+    answer_correctness,
+    answer_relevancy,
+    answer_similarity,
+    context_entity_recall,
+    context_precision,
+    context_recall,
+    faithfulness,
+    summarization_score,
+)
+from ragas.metrics._aspect_critic import (
+    coherence,
+    conciseness,
+    correctness,
+    harmfulness,
+    maliciousness,
+)

 azure_openai_api_key = environ.get("AZURE_OPENAI_API_KEY", "NA")
 azure_openai_endpoint = environ.get("AZURE_OPENAI_ENDPOINT", "NA")
@@ -23,6 +38,7 @@
 eval_model_name = environ.get("LOCAL_MODEL_NAME", "llama3.1")
 eval_base_url = environ.get("LOCAL_MODEL_BASE_URL", "http://localhost:11434/v1/")

+
 def eval():
     if azure_openai_api_key != "NA":
         print("Set Azure OpenAI model for evaluation")
@@ -40,20 +56,14 @@ def eval():
                 api_version=openai_api_version,
                 azure_endpoint=azure_openai_endpoint,
                 azure_deployment=azure_deployment_name,
-                model=azure_model_version
+                model=azure_model_version,
             )
         )
     else:
         print("Set local model for evaluation")
-        evaluator_llm = LangchainLLMWrapper(
-            ChatOllama(
-                model=eval_model_name
-            )
-        )
+        evaluator_llm = LangchainLLMWrapper(ChatOllama(model=eval_model_name))
         evaluator_embeddings = LangchainEmbeddingsWrapper(
-            OllamaEmbeddings(
-                model=eval_model_name
-            )
+            OllamaEmbeddings(model=eval_model_name)
         )

     test_input = "Gather data about new Critical CVEs from todays date"
@@ -63,13 +73,13 @@ def eval():
     print("Task output: " + test_output)

     d = dict()
-    d['question'] = [test_input]
-    d['answer'] = [test_output]
-    d['context'] = [[]]
-    d['retrieval_context'] = [[]]
-    d['reference'] = ['']
-    d['reference_contexts'] = [[]]
-    d['retrieved_contexts'] = [[]]
+    d["question"] = [test_input]
+    d["answer"] = [test_output]
+    d["context"] = [[]]
+    d["retrieval_context"] = [[]]
+    d["reference"] = [""]
+    d["reference_contexts"] = [[]]
+    d["retrieved_contexts"] = [[]]
     dataset = Dataset.from_dict(d)

     score = evaluate(
@@ -90,9 +100,12 @@ def eval():
             summarization_score,
         ],
         evaluator_llm,
-        evaluator_embeddings
+        evaluator_embeddings,
     )
     print("SCORE", score)

+    return (dataset, score)
+
+
 if __name__ == "__main__":
     eval()
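
Likewise, the ragas script now returns the Hugging Face Dataset and the score object produced by evaluate(), covering the "export ragas eval results" bullet. A minimal consumer sketch under the same caveats as above; the helper file name is hypothetical, and the to_pandas() view of per-sample scores is assumed to be available in the installed ragas version, so the call is guarded:

# consume_ragas.py -- hypothetical helper, not part of this commit
from tests._ragas import eval as run_ragas_eval

if __name__ == "__main__":
    dataset, score = run_ragas_eval()
    print(score)  # aggregate per-metric scores, as printed by the sample itself
    # Recent ragas releases expose a DataFrame of per-sample scores; older
    # versions may not, hence the hasattr guard.
    if hasattr(score, "to_pandas"):
        score.to_pandas().to_csv("ragas_results.csv", index=False)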
