Skip to content

Commit 8a394a3

Browse files
pjoshi30 and Preetam Joshi authored
Adding instruction adherence and hallucination v0.2 detectors (#9)
* Fixing doc string * Updating documentation --------- Co-authored-by: Preetam Joshi <info@aimon.ai>
1 parent 3e3625b commit 8a394a3

File tree

7 files changed

+226
-131
lines changed

7 files changed

+226
-131
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
# Packages
88
*.egg
99
*.egg-info
10+
.idea
11+
.idea/
1012
build
1113
eggs
1214
parts

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ The following is a list of quality metrics that are currently available and on o
1818
| Completeness | <span style="font-size: 24px; color: green;">&#10003;</span> |
1919
| Conciseness | <span style="font-size: 24px; color: green;">&#10003;</span> |
2020
| Toxicity | <span style="font-size: 24px; color: green;">&#10003;</span> |
21+
| Instruction Adherence | <span style="font-size: 24px; color: green;">&#10003;</span> |
2122
| Semantic Similarity | <span style="font-size: 24px;">⌛</span> |
2223
| Sentiment | <span style="font-size: 24px;">⌛</span> |
2324
| Coherence | <span style="font-size: 24px;">⌛</span> |
@@ -78,7 +79,7 @@ making it a suitable choice for both offline and online detection of hallucinati
7879
<img src="images/hallucination-benchmarks.png" alt="Hallucination Benchmarks">
7980
</div>
8081

81-
### Completeness, Conciseness Detection
82+
### Benchmarks on other Detectors
8283

8384
There is a lack of industry-standard benchmark datasets for these metrics. We will be publishing an evaluation dataset soon.
8485
Stay Tuned! <span style="font-size: 16px;">⌛</span>

aimon/client.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,19 @@ def detect(self, data_to_send: List[Dict[str, Any]], config=Config()):
156156
given query and context documents
157157
"reasoning": An explanation of the score that was provided.
158158
"score": A probability score of how complete the response is for the user query and context documents.
159+
"instruction_adherence": This detector checks whether the response followed the specified instructions.
160+
Results are returned in this JSON format
161+
```json
162+
{
163+
"instruction_adherence": [
164+
{
165+
"instruction": "<String>",
166+
"adherence": "<Boolean>",
167+
"detailed_explanation": "<String>"
168+
}
169+
]
170+
}
171+
```
159172
"toxicity": Indicates whether there was toxic content in the response. It uses 6 different label types for this.
160173
"identity_hate": The response contained hateful content that calls out real or perceived "identity factors" of an individual or a group.
161174
"insult": The response contained insulting content.

aimon/metrics_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
class Config:
55
SUPPORTED_DETECTORS = {'hallucination': 'default', 'toxicity': 'default', 'conciseness': 'default',
6-
'completeness': 'default'}
6+
'completeness': 'default', 'instruction_adherence': 'default', 'hallucination_v0.2': 'default'}
77
SUPPORTED_VALUES = {'default', 'hall_v2'}
88

99
def __init__(self, detectors: Dict[str, str] = None):

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
name='aimon',
55
python_requires='>3.8.0',
66
packages=find_packages(),
7-
version="0.3.1",
7+
version="0.4.0",
88
install_requires=[
99
"requests"
1010
],

test/client_test.py

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
# Before running these tests, install the package with: python3 setup.py install --user
2+
import pytest
3+
from aimon import Client, Config
4+
5+
6+
API_KEY = "YOUR API KEY"
7+
class TestSimpleAimonRelyClient:
    """Tests that send payloads to the Aimon Rely detection API via ``Client.detect``.

    Every test builds a ``Client`` from the module-level ``API_KEY``, submits a
    list of payload dicts and asserts on the structure (and for some detectors
    the values) of the returned per-item results.
    """

    # Literals shared by several tests; the leading underscore keeps pytest from
    # treating them as test items.
    _EMAIL = "preetam@aimon.ai"
    _CONTEXT = "This is the context"
    _SECOND_CONTEXT = "This is the second context"
    _V2 = 'hallucination_v0.2'
    _CENTRELINK_CONTEXT = "the abc have reported that those who receive centrelink payments made up half of radio rental's income last year. Centrelink payments themselves were up 20%."
    _CENTRELINK_TEXT = "those who receive centrelink payments made up half of radio rental's income last year. The Centrelink payments were 20% up."

    def _run(self, payload, config=None):
        """Create a client and return the raw ``detect`` result for ``payload``.

        When ``config`` is ``None`` the ``config`` argument is omitted from the
        call, so ``detect`` uses its own default configuration.
        """
        aimon = Client(api_key=API_KEY, email=self._EMAIL)
        if config is None:
            return aimon.detect(payload)
        return aimon.detect(payload, config=config)

    # Sends an HTTP POST request to the Aimon Rely Hallucination Detection API with valid data and receives a valid response
    def test_valid_data_valid_response(self):
        """Generated text equal to its context is reported as not hallucinated."""
        detector_cfg = Config({'hallucination': 'default'})
        payload = [{"context": self._CONTEXT, "generated_text": self._CONTEXT}]
        result = self._run(payload, detector_cfg)[0]
        assert "hallucination" in result
        report = result['hallucination']
        assert "is_hallucinated" in report
        assert report["is_hallucinated"] == "False"
        assert "score" in report
        assert "sentences" in report
        assert len(report["sentences"]) == 1
        assert "This is the context" in report["sentences"][0]["text"]

    def test_valid_data_valid_response_user_query(self):
        """Same as the basic case, with an additional ``user_query`` field."""
        detector_cfg = Config({'hallucination': 'default'})
        payload = [{
            "context": self._CONTEXT,
            "user_query": "This is the user query",
            "generated_text": self._CONTEXT,
        }]
        result = self._run(payload, detector_cfg)[0]
        assert "hallucination" in result
        report = result['hallucination']
        assert "is_hallucinated" in report
        assert report["is_hallucinated"] == "False"
        assert "score" in report
        assert "sentences" in report
        assert len(report["sentences"]) == 1
        assert "This is the context" in report["sentences"][0]["text"]

    def test_valid_batch_data_valid_response(self):
        """A batch of two payloads yields one hallucination report per payload."""
        detector_cfg = Config({'hallucination': 'default'})
        payload = [
            {"context": self._CONTEXT, "generated_text": self._CONTEXT},
            {"context": self._SECOND_CONTEXT, "generated_text": self._SECOND_CONTEXT},
        ]
        results = self._run(payload, detector_cfg)
        for idx in (0, 1):
            item = results[idx]
            assert "hallucination" in item
            report = item['hallucination']
            assert "is_hallucinated" in report
            assert report["is_hallucinated"] == "False"
            assert "score" in report
            assert "sentences" in report
            assert len(report["sentences"]) == 1
        assert "This is the context" in results[0]['hallucination']["sentences"][0]["text"]
        assert "This is the second context" in results[1]['hallucination']["sentences"][0]["text"]

    # Sends an HTTP POST request to the Aimon Rely Hallucination Detection API with a single dict object containing a valid "context" and "generated_text" but with a very short text and receives a valid response
    def test_short_text_valid_response(self):
        """A one-word generated text still produces a single-sentence report."""
        detector_cfg = Config({'hallucination': 'default'})
        short_text = "Yes"
        payload = [{"context": self._CONTEXT, "generated_text": short_text}]
        result = self._run(payload, detector_cfg)[0]
        assert "hallucination" in result
        report = result['hallucination']
        assert "is_hallucinated" in report
        assert "score" in report
        assert report["score"] >= 0.45
        assert "sentences" in report
        assert len(report["sentences"]) == 1
        assert "Yes" in report["sentences"][0]["text"]

    # Sends an HTTP POST request to the Aimon Rely Hallucination Detection API with a single dict object containing a valid "context" and "generated_text" but with a text containing special characters and receives a valid response
    def test_special_characters_valid_response(self):
        """Punctuation-only text is flagged as hallucinated (default config)."""
        special_text = "!@#$%^&*()_+"
        payload = [{"context": self._CONTEXT, "generated_text": special_text}]
        # No config is passed here, so detect() is exercised with its default.
        result = self._run(payload)[0]
        assert "hallucination" in result
        report = result['hallucination']
        assert "is_hallucinated" in report
        assert report["is_hallucinated"] == "True"
        assert "score" in report
        assert report["score"] >= 0.5
        assert "sentences" in report
        assert len(report["sentences"]) == 1

    def test_valid_data_valid_response_hallucination_v0_2(self):
        """v0.2 detector: text equal to its context is not hallucinated."""
        detector_cfg = Config({'hallucination_v0.2': 'default'})
        payload = [{"context": self._CONTEXT, "generated_text": self._CONTEXT}]
        result = self._run(payload, detector_cfg)[0]
        assert self._V2 in result
        report = result[self._V2]
        assert "is_hallucinated" in report
        assert report["is_hallucinated"] == "False"
        assert "score" in report
        assert "sentences" in report
        assert len(report["sentences"]) == 1
        assert "This is the context" in report["sentences"][0]["text"]

    def test_valid_data_valid_response_user_query_hallucination_v0_2(self):
        """v0.2 detector with a ``user_query``: only the result shape is checked."""
        detector_cfg = Config({'hallucination_v0.2': 'default'})
        payload = [{
            "context": self._CONTEXT,
            "user_query": "This is the user query",
            "generated_text": self._CONTEXT,
        }]
        result = self._run(payload, detector_cfg)[0]
        assert self._V2 in result
        report = result[self._V2]
        assert "is_hallucinated" in report
        assert "score" in report
        assert "sentences" in report
        # Value-level checks are currently disabled for this case:
        # assert report["is_hallucinated"] == "False"
        # assert len(report["sentences"]) == 1
        # assert "This is the context" in report["sentences"][0]["text"]

    def test_valid_batch_data_valid_response_hallucination_v0_2(self):
        """v0.2 detector on a batch of two: only the result shape is checked."""
        detector_cfg = Config({'hallucination_v0.2': 'default'})
        payload = [
            {"context": self._CONTEXT, "generated_text": self._CONTEXT},
            {"context": self._SECOND_CONTEXT, "generated_text": self._SECOND_CONTEXT},
        ]
        results = self._run(payload, detector_cfg)
        for idx in (0, 1):
            item = results[idx]
            assert self._V2 in item
            report = item[self._V2]
            assert "is_hallucinated" in report
            assert "score" in report
            assert "sentences" in report
            # Value-level checks are currently disabled for this case:
            # assert report["is_hallucinated"] == "False"
            # assert len(report["sentences"]) == 1
        # assert "This is the context" in results[0]['hallucination_v0.2']["sentences"][0]["text"]
        # assert "This is the second context" in results[1]['hallucination_v0.2']["sentences"][0]["text"]

    # Sends an HTTP POST request to the Aimon Rely Hallucination Detection API with a single dict object containing a valid "context" and "generated_text" but with a very short text and receives a valid response
    def test_short_text_valid_response_hallucination_v0_2(self):
        """v0.2 detector on a one-word text: score must lie within [0.0, 1.0]."""
        detector_cfg = Config({'hallucination_v0.2': 'default'})
        short_text = "Yes"
        payload = [{"context": self._CONTEXT, "generated_text": short_text}]
        result = self._run(payload, detector_cfg)[0]
        assert self._V2 in result
        report = result[self._V2]
        assert "is_hallucinated" in report
        assert "score" in report
        assert 0.0 <= report["score"] <= 1.0
        assert "sentences" in report
        # Sentence-level checks are currently disabled for this case:
        # assert len(report["sentences"]) == 1
        # assert "Yes" in report["sentences"][0]["text"]

    # Sends an HTTP POST request to the Aimon Rely Hallucination Detection API with a single dict object containing a valid "context" and "generated_text" but with a text containing special characters and receives a valid response
    def test_special_characters_valid_response_hallucination_v0_2(self):
        """v0.2 detector: punctuation-only text is flagged as hallucinated."""
        detector_cfg = Config({'hallucination_v0.2': 'default'})
        special_text = "!@#$%^&*()_+"
        payload = [{"context": self._CONTEXT, "generated_text": special_text}]
        result = self._run(payload, detector_cfg)[0]
        assert self._V2 in result
        report = result[self._V2]
        assert "is_hallucinated" in report
        assert report["is_hallucinated"] == "True"
        assert "score" in report
        assert report["score"] >= 0.5
        assert "sentences" in report
        # Sentence-count check is currently disabled for this case:
        # assert len(report["sentences"]) == 1

    def test_valid_data_valid_response_conciseness(self):
        """Conciseness detector returns reasoning and a score of at least 0.7."""
        detector_cfg = Config({'conciseness': 'default'})
        payload = [{
            "context": self._CENTRELINK_CONTEXT,
            "generated_text": self._CENTRELINK_TEXT,
        }]
        result = self._run(payload, detector_cfg)[0]
        assert "conciseness" in result
        verdict = result["conciseness"]
        assert "reasoning" in verdict
        assert "score" in verdict
        assert verdict["score"] >= 0.7

    def test_valid_data_valid_response_completeness(self):
        """Completeness detector returns reasoning and a score above 0.7."""
        detector_cfg = Config({'completeness': 'default'})
        payload = [{
            "context": self._CENTRELINK_CONTEXT,
            "generated_text": self._CENTRELINK_TEXT,
        }]
        result = self._run(payload, detector_cfg)[0]
        assert "completeness" in result
        verdict = result["completeness"]
        assert "reasoning" in verdict
        assert "score" in verdict
        assert verdict["score"] > 0.7

    def test_valid_data_valid_response_instruction_adherence(self):
        """Three numbered instructions yield three adherence result entries."""
        detector_cfg = Config({'instruction_adherence': 'default'})
        payload = [{
            "context": self._CENTRELINK_CONTEXT,
            "generated_text": self._CENTRELINK_TEXT,
            "instructions": "1. You are helpful chatbot. 2. You are friendly and polite. 3. The number of sentences in your response should not be more than two."
        }]
        result = self._run(payload, detector_cfg)[0]
        assert "instruction_adherence" in result
        adherence = result["instruction_adherence"]
        assert "results" in adherence
        assert len(adherence["results"]) == 3
        # Only the third entry (the sentence-count instruction) is inspected.
        third = adherence["results"][2]
        assert "instruction" in third
        assert "examples" in third
        assert "detailed_explanation" in third
        assert "adherence" in third
        # print(result)
        # assert third["adherence"] is True

    def test_valid_data_valid_response_conciseness_completeness(self):
        """Requesting two detectors at once returns a section for each."""
        detector_cfg = Config({'conciseness': 'default', 'completeness': 'default'})
        payload = [{
            "context": self._CENTRELINK_CONTEXT,
            "generated_text": self._CENTRELINK_TEXT,
        }]
        result = self._run(payload, detector_cfg)[0]
        assert "completeness" in result
        completeness = result["completeness"]
        assert "reasoning" in completeness
        assert "score" in completeness
        assert completeness["score"] > 0.7
        assert "conciseness" in result
        conciseness = result["conciseness"]
        assert "reasoning" in conciseness
        assert "score" in conciseness
        assert conciseness["score"] > 0.7

0 commit comments

Comments
 (0)