Skip to content

Commit df99251

Browse files
fix: update evaluator tests to match latest implementation (#34)
* fix: update evaluator tests to match latest implementation
  Co-Authored-By: Han Xiao <han.xiao@jina.ai>
* fix: update EvaluationResponse type and add comprehensive tests
  Co-Authored-By: Han Xiao <han.xiao@jina.ai>
---------
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Han Xiao <han.xiao@jina.ai>
1 parent 0c74746 commit df99251

File tree

2 files changed

+72
-5
lines changed

2 files changed

+72
-5
lines changed

src/tools/__tests__/evaluator.test.ts

Lines changed: 59 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -26,22 +26,76 @@ describe('evaluateAnswer', () => {
2626
const { response } = await evaluateAnswer(
2727
'What is TypeScript?',
2828
'TypeScript is a strongly typed programming language that builds on JavaScript.',
29+
['definitive'],
2930
tokenTracker
3031
);
31-
expect(response).toHaveProperty('is_definitive');
32-
expect(response).toHaveProperty('reasoning');
32+
expect(response).toHaveProperty('pass');
33+
expect(response).toHaveProperty('think');
34+
expect(response.type).toBe('definitive');
35+
expect(response.pass).toBe(true);
36+
});
37+
38+
it('should evaluate answer freshness', async () => {
39+
const tokenTracker = new TokenTracker();
40+
const { response } = await evaluateAnswer(
41+
'What is the latest version of Node.js?',
42+
'The latest version of Node.js is 14.0.0, released in April 2020.',
43+
['freshness'],
44+
tokenTracker
45+
);
46+
expect(response).toHaveProperty('pass');
47+
expect(response).toHaveProperty('think');
48+
expect(response.type).toBe('freshness');
49+
expect(response.freshness_analysis).toBeDefined();
50+
expect(response.freshness_analysis?.likely_outdated).toBe(true);
51+
expect(response.freshness_analysis?.dates_mentioned).toContain('2020-04');
52+
expect(response.freshness_analysis?.current_time).toBeDefined();
53+
expect(response.pass).toBe(false);
54+
});
55+
56+
it('should evaluate answer plurality', async () => {
57+
const tokenTracker = new TokenTracker();
58+
const { response } = await evaluateAnswer(
59+
'List three programming languages.',
60+
'Python is a programming language.',
61+
['plurality'],
62+
tokenTracker
63+
);
64+
expect(response).toHaveProperty('pass');
65+
expect(response).toHaveProperty('think');
66+
expect(response.type).toBe('plurality');
67+
expect(response.plurality_analysis).toBeDefined();
68+
expect(response.plurality_analysis?.expects_multiple).toBe(true);
69+
expect(response.plurality_analysis?.provides_multiple).toBe(false);
70+
expect(response.plurality_analysis?.count_expected).toBe(3);
71+
expect(response.plurality_analysis?.count_provided).toBe(1);
72+
expect(response.pass).toBe(false);
73+
});
74+
75+
it('should evaluate in order and stop at first failure', async () => {
76+
const tokenTracker = new TokenTracker();
77+
const { response } = await evaluateAnswer(
78+
'List the latest Node.js versions.',
79+
'I am not sure about the Node.js versions.',
80+
['definitive', 'freshness', 'plurality'],
81+
tokenTracker
82+
);
83+
expect(response.type).toBe('definitive');
84+
expect(response.pass).toBe(false);
85+
expect(response.freshness_analysis).toBeUndefined();
86+
expect(response.plurality_analysis).toBeUndefined();
3387
});
3488

3589
it('should track token usage', async () => {
3690
const tokenTracker = new TokenTracker();
3791
const spy = jest.spyOn(tokenTracker, 'trackUsage');
38-
const { tokens } = await evaluateAnswer(
92+
await evaluateAnswer(
3993
'What is TypeScript?',
4094
'TypeScript is a strongly typed programming language that builds on JavaScript.',
95+
['definitive', 'freshness', 'plurality'],
4196
tokenTracker
4297
);
43-
expect(spy).toHaveBeenCalledWith('evaluator', tokens);
44-
expect(tokens).toBeGreaterThan(0);
98+
expect(spy).toHaveBeenCalledWith('evaluator', expect.any(Number));
4599
});
46100
});
47101
});

src/types.ts

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -87,6 +87,19 @@ export interface ReadResponse {
8787
export type EvaluationResponse = {
8888
pass: boolean;
8989
think: string;
90+
type?: 'definitive' | 'freshness' | 'plurality';
91+
freshness_analysis?: {
92+
likely_outdated: boolean;
93+
dates_mentioned: string[];
94+
current_time: string;
95+
max_age_days?: number;
96+
};
97+
plurality_analysis?: {
98+
expects_multiple: boolean;
99+
provides_multiple: boolean;
100+
count_expected?: number;
101+
count_provided: number;
102+
};
90103
};
91104

92105
export type ErrorAnalysisResponse = {

0 commit comments

Comments
 (0)