
Commit 5976ec9

Refactor evaluation sample app (#29)

* chore(samples): run formatter, remove unused imports
* refactor(samples): export ragas eval results
* refactor(samples): export deepeval eval results
* refactor(samples): avoid official package name conflict

1 parent 2c55648 · commit 5976ec9

File tree

3 files changed: +58 -35 lines changed

* samples/evaluation/Taskfile.yaml
* samples/evaluation/tests/_deepeval.py (renamed from tests/deepeval.py)
* samples/evaluation/tests/_ragas.py (renamed from tests/ragas.py)

samples/evaluation/Taskfile.yaml
Lines changed: 3 additions & 3 deletions

@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0

 ---
-version: "3"
+version: '3'

 silent: true

@@ -26,13 +26,13 @@ tasks:
     internal: true
     dir: '{{.USER_WORKING_DIR}}'
     cmds:
-      - poetry run python -m tests.deepeval
+      - poetry run python -m tests._deepeval

   run:test-ragas:
     internal: true
     dir: '{{.USER_WORKING_DIR}}'
     cmds:
-      - poetry run python -m tests.ragas
+      - poetry run python -m tests._ragas

   run:test:
     desc: Run tests

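The module rename behind these task changes matches the last commit-message bullet ("avoid official package name conflict"): a local module named deepeval or ragas can shadow the installed package of the same name under some import configurations, for example when the tests directory itself ends up on sys.path. The snippet below is only an illustrative diagnostic, not part of the commit; it checks which files the two names actually resolve to:

# Illustrative check (hypothetical, not in the repository): verify that
# "deepeval" and "ragas" resolve to the installed packages rather than to
# local sample modules of the same name.
import deepeval
import ragas

for pkg in (deepeval, ragas):
    # If a sample module shadowed the package, __file__ would point inside
    # samples/evaluation/tests instead of site-packages.
    print(pkg.__name__, "->", pkg.__file__)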
samples/evaluation/tests/deepeval.py renamed to samples/evaluation/tests/_deepeval.py
Lines changed: 21 additions & 11 deletions

@@ -3,10 +3,16 @@
 # SPDX-License-Identifier: Apache-2.0

 from os import environ
-from deepeval.test_case import LLMTestCase
-from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ToxicityMetric
-from deepeval.cli.main import set_local_model_env, unset_local_model_env, set_azure_openai_env, unset_azure_openai_env
+
+from deepeval.cli.main import (
+    set_azure_openai_env,
+    set_local_model_env,
+    unset_azure_openai_env,
+    unset_local_model_env,
+)
 from deepeval.dataset import EvaluationDataset
+from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ToxicityMetric
+from deepeval.test_case import LLMTestCase
 from model.crew import run_crew

 azure_openai_api_key = environ.get("AZURE_OPENAI_API_KEY", "NA")
@@ -18,6 +24,7 @@
 eval_model_name = environ.get("LOCAL_MODEL_NAME", "llama3.1")
 eval_base_url = environ.get("LOCAL_MODEL_BASE_URL", "http://localhost:11434/v1/")

+
 def eval():
     if azure_openai_api_key != "NA":
         print("Set Azure OpenAI model for evaluation")
@@ -30,10 +37,11 @@ def eval():
         )
     else:
         print("Set local model for evaluation")
-        set_local_model_env(model_name=eval_model_name,
-                            base_url=eval_base_url,
-                            api_key="dummy-key",
-                            format='json',
+        set_local_model_env(
+            model_name=eval_model_name,
+            base_url=eval_base_url,
+            api_key="dummy-key",
+            format="json",
         )

     test_input = "Gather data about new Critical CVEs from todays date"
@@ -44,8 +52,8 @@ def eval():

     test_cases = []
     test_case = LLMTestCase(
-        input = test_input,
-        actual_output = test_output,
+        input=test_input,
+        actual_output=test_output,
     )
     test_cases.append(test_case)

@@ -55,14 +63,16 @@ def eval():
     toxicity_metric = ToxicityMetric(threshold=0.5)

     dataset = EvaluationDataset(test_cases=test_cases)
-    dataset.evaluate([answer_relevancy_metric, bias_metric, toxicity_metric])
-
+    result = dataset.evaluate([answer_relevancy_metric, bias_metric, toxicity_metric])

     print("Test End")
     if azure_openai_api_key != "NA":
         unset_azure_openai_env()
     else:
         unset_local_model_env()

+    return (dataset, result)
+
+
 if __name__ == "__main__":
     eval()
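
With this change eval() returns the dataset together with whatever dataset.evaluate(...) returned, which is what the "export deepeval eval results" bullet refers to: callers can post-process a run instead of relying only on deepeval's console output. A minimal consumer sketch follows; the helper file name is hypothetical, the concrete type of result varies across deepeval releases, and running it executes the crew, so the model environment variables from the script must be set:

# consume_deepeval.py -- hypothetical helper, not part of this commit
import json

from tests._deepeval import eval as run_deepeval_eval

if __name__ == "__main__":
    dataset, result = run_deepeval_eval()
    # Store the result generically via repr(), since its schema depends on
    # the installed deepeval version.
    with open("deepeval_results.json", "w") as fp:
        json.dump({"result": repr(result)}, fp, indent=2)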

samples/evaluation/tests/ragas.py renamed to samples/evaluation/tests/_ragas.py
Lines changed: 34 additions & 21 deletions

@@ -2,17 +2,32 @@
 # SPDX-License-Identifier: Apache-2.0

 from os import environ
-from datasets import Dataset

+from datasets import Dataset
 from langchain_community.chat_models import ChatOllama
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
+from model.crew import run_crew
 from ragas import evaluate
 from ragas.embeddings import LangchainEmbeddingsWrapper
-from model.crew import run_crew
 from ragas.llms import LangchainLLMWrapper
-from ragas.metrics import answer_correctness, answer_relevancy, answer_similarity, context_entity_recall, context_precision, context_recall, faithfulness, summarization_score
-from ragas.metrics._aspect_critic import harmfulness, maliciousness, coherence, correctness, conciseness
+from ragas.metrics import (
+    answer_correctness,
+    answer_relevancy,
+    answer_similarity,
+    context_entity_recall,
+    context_precision,
+    context_recall,
+    faithfulness,
+    summarization_score,
+)
+from ragas.metrics._aspect_critic import (
+    coherence,
+    conciseness,
+    correctness,
+    harmfulness,
+    maliciousness,
+)

 azure_openai_api_key = environ.get("AZURE_OPENAI_API_KEY", "NA")
 azure_openai_endpoint = environ.get("AZURE_OPENAI_ENDPOINT", "NA")
@@ -23,6 +38,7 @@
 eval_model_name = environ.get("LOCAL_MODEL_NAME", "llama3.1")
 eval_base_url = environ.get("LOCAL_MODEL_BASE_URL", "http://localhost:11434/v1/")

+
 def eval():
     if azure_openai_api_key != "NA":
         print("Set Azure OpenAI model for evaluation")
@@ -40,20 +56,14 @@ def eval():
                 api_version=openai_api_version,
                 azure_endpoint=azure_openai_endpoint,
                 azure_deployment=azure_deployment_name,
-                model=azure_model_version
+                model=azure_model_version,
             )
         )
     else:
         print("Set local model for evaluation")
-        evaluator_llm = LangchainLLMWrapper(
-            ChatOllama(
-                model=eval_model_name
-            )
-        )
+        evaluator_llm = LangchainLLMWrapper(ChatOllama(model=eval_model_name))
         evaluator_embeddings = LangchainEmbeddingsWrapper(
-            OllamaEmbeddings(
-                model=eval_model_name
-            )
+            OllamaEmbeddings(model=eval_model_name)
         )

     test_input = "Gather data about new Critical CVEs from todays date"
@@ -63,13 +73,13 @@ def eval():
     print("Task output: " + test_output)

     d = dict()
-    d['question'] = [test_input]
-    d['answer'] = [test_output]
-    d['context'] = [[]]
-    d['retrieval_context'] = [[]]
-    d['reference'] = ['']
-    d['reference_contexts'] = [[]]
-    d['retrieved_contexts'] = [[]]
+    d["question"] = [test_input]
+    d["answer"] = [test_output]
+    d["context"] = [[]]
+    d["retrieval_context"] = [[]]
+    d["reference"] = [""]
+    d["reference_contexts"] = [[]]
+    d["retrieved_contexts"] = [[]]
     dataset = Dataset.from_dict(d)

     score = evaluate(
@@ -90,9 +100,12 @@ def eval():
             summarization_score,
         ],
         evaluator_llm,
-        evaluator_embeddings
+        evaluator_embeddings,
     )
     print("SCORE", score)

+    return (dataset, score)
+
+
 if __name__ == "__main__":
     eval()
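
Likewise, the ragas script now returns the Hugging Face Dataset and the score object produced by evaluate(), covering the "export ragas eval results" bullet. A minimal consumer sketch under the same caveats as above; the helper file name is hypothetical, and the to_pandas() view of per-sample scores is assumed to be available in the installed ragas version, so the call is guarded:

# consume_ragas.py -- hypothetical helper, not part of this commit
from tests._ragas import eval as run_ragas_eval

if __name__ == "__main__":
    dataset, score = run_ragas_eval()
    print(score)  # aggregate per-metric scores, as printed by the sample itself
    # Recent ragas releases expose a DataFrame of per-sample scores; older
    # versions may not, hence the hasattr guard.
    if hasattr(score, "to_pandas"):
        score.to_pandas().to_csv("ragas_results.csv", index=False)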
