Skip to content

Commit cdb1738

Browse files
authored
Add semantic similarity metric (#1039)
1 parent f2740d8 commit cdb1738

File tree

9 files changed

+351
-4
lines changed

9 files changed

+351
-4
lines changed
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import {
2+
EvaluationType,
3+
RuleEvaluationMetric,
4+
RuleEvaluationSemanticSimilaritySpecification,
5+
} from '@latitude-data/constants'
6+
import { IconName, NumberInput, Select } from '@latitude-data/web-ui'
7+
import {
8+
ChartConfigurationArgs,
9+
ConfigurationFormProps,
10+
ResultBadgeProps,
11+
ResultPanelProps,
12+
ResultRowCellsProps,
13+
ResultRowHeadersProps,
14+
} from '../index'
15+
16+
// Shared (constants-level) specification of the semantic similarity rule metric.
const specification = RuleEvaluationSemanticSimilaritySpecification

// Web-side specification: the shared fields plus the icon, the React
// components and the chart settings declared in this module.
export default {
  ...specification,
  icon: 'equalApproximately' as IconName,
  ConfigurationForm,
  ResultBadge,
  ResultRowHeaders,
  ResultRowCells,
  resultPanelTabs: [],
  ResultPanelMetadata,
  ResultPanelContent,
  chartConfiguration,
}
29+
30+
// Options for the algorithm <Select>: one entry per value of the `algorithm`
// enum, labelled in upper case with underscores turned into spaces
// (e.g. `cosine_distance` -> `COSINE DISTANCE`).
const ALGORITHM_OPTIONS =
  specification.configuration.shape.algorithm.options.map((algorithm) => {
    const label = algorithm.toUpperCase().replace(/_/g, ' ')
    return { label, value: algorithm }
  })
35+
36+
/**
 * Settings form for the semantic similarity metric.
 *
 * Renders an algorithm selector and two percentage inputs (minimum and
 * maximum similarity). Every change is reported through `setConfiguration`
 * as a copy of the current configuration with the edited field replaced.
 */
function ConfigurationForm({
  configuration,
  setConfiguration,
  disabled,
}: ConfigurationFormProps<
  EvaluationType.Rule,
  RuleEvaluationMetric.SemanticSimilarity
>) {
  const { algorithm, minSimilarity, maxSimilarity } = configuration

  // Props shared by both percentage inputs.
  const percentageInput = {
    min: 0,
    max: 100,
    className: 'w-full',
    disabled,
    required: true,
  }

  return (
    <>
      <Select
        name='algorithm'
        label='Algorithm'
        description='How to measure percentage of similarity'
        placeholder='Select an algorithm'
        options={ALGORITHM_OPTIONS}
        value={algorithm ?? ''}
        onChange={(selected) =>
          setConfiguration({ ...configuration, algorithm: selected })
        }
        disabled={disabled}
        required
      />
      <NumberInput
        {...percentageInput}
        name='minSimilarity'
        label='Minimum similarity'
        description='The minimum percentage of similarity of the response'
        placeholder='No minimum'
        value={minSimilarity ?? undefined}
        onChange={(percentage) =>
          setConfiguration({ ...configuration, minSimilarity: percentage })
        }
      />
      <NumberInput
        {...percentageInput}
        name='maxSimilarity'
        label='Maximum similarity'
        description='The maximum percentage of similarity of the response'
        placeholder='No maximum'
        value={maxSimilarity ?? undefined}
        onChange={(percentage) =>
          setConfiguration({ ...configuration, maxSimilarity: percentage })
        }
      />
    </>
  )
}
92+
93+
/**
 * Badge text for a result: the score, rounded to an integer, followed by
 * "% similar".
 */
function ResultBadge({
  result,
}: ResultBadgeProps<
  EvaluationType.Rule,
  RuleEvaluationMetric.SemanticSimilarity
>) {
  // NOTE(review): the non-null assertion assumes a score is always present
  // here; confirm this badge is never rendered for results without one.
  const percentage = result.score!.toFixed(0)

  return <>{percentage}% similar</>
}
101+
102+
// Renders an empty fragment: this metric supplies no content for this slot
// (presumably extra header cells in the results table; confirm against the
// caller).
function ResultRowHeaders(
  _unused: ResultRowHeadersProps<
    EvaluationType.Rule,
    RuleEvaluationMetric.SemanticSimilarity
  >,
) {
  const nothing = <></>
  return nothing
}
110+
111+
// Renders an empty fragment: this metric supplies no content for this slot
// (presumably extra cells in each results-table row; confirm against the
// caller).
function ResultRowCells(
  _unused: ResultRowCellsProps<
    EvaluationType.Rule,
    RuleEvaluationMetric.SemanticSimilarity
  >,
) {
  const nothing = <></>
  return nothing
}
119+
120+
// Renders an empty fragment: this metric supplies no content for the
// metadata slot of the result panel.
function ResultPanelMetadata(
  _unused: ResultPanelProps<
    EvaluationType.Rule,
    RuleEvaluationMetric.SemanticSimilarity
  >,
) {
  const nothing = <></>
  return nothing
}
128+
129+
// Renders an empty fragment: this metric supplies no content for the
// content slot of the result panel (`resultPanelTabs` is empty as well).
function ResultPanelContent(
  _unused: ResultPanelProps<
    EvaluationType.Rule,
    RuleEvaluationMetric.SemanticSimilarity
  >,
) {
  const nothing = <></>
  return nothing
}
137+
138+
/**
 * Chart settings for the semantic similarity metric.
 *
 * @param evaluation - Evaluation whose configured similarity bounds become
 *   the chart thresholds.
 * @returns An object with a fixed 0-100 range, the configured bounds as
 *   `thresholds` (minimum first, then maximum; unset bounds are omitted), an
 *   identity `scale`, and a `format` that renders a point as an integer
 *   percentage (`"42%"` when `short`, `"42% similar"` otherwise).
 */
function chartConfiguration({
  evaluation,
}: ChartConfigurationArgs<
  EvaluationType.Rule,
  RuleEvaluationMetric.SemanticSimilarity
>) {
  const { minSimilarity, maxSimilarity } = evaluation.configuration

  return {
    min: 0,
    max: 100,
    // Compare against null/undefined instead of relying on truthiness, so an
    // explicitly configured bound of 0 is still reported as a threshold.
    thresholds: [
      ...(minSimilarity != null ? [minSimilarity] : []),
      ...(maxSimilarity != null ? [maxSimilarity] : []),
    ] as const,
    scale: (point: number) => point,
    format: (point: number, short?: boolean) =>
      short ? `${point.toFixed(0)}%` : `${point.toFixed(0)}% similar`,
  }
}

apps/web/src/components/evaluations/rule/index.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import RuleEvaluationLengthCountSpecification from './LengthCount'
1818
import RuleEvaluationLexicalOverlapSpecification from './LexicalOverlap'
1919
import RuleEvaluationRegularExpressionSpecification from './RegularExpression'
2020
import RuleEvaluationSchemaValidationSpecification from './SchemaValidation'
21+
import RuleEvaluationSemanticSimilaritySpecification from './SemanticSimilarity'
2122

2223
// prettier-ignore
2324
const METRICS: {
@@ -28,7 +29,7 @@ const METRICS: {
2829
[RuleEvaluationMetric.SchemaValidation]: RuleEvaluationSchemaValidationSpecification,
2930
[RuleEvaluationMetric.LengthCount]: RuleEvaluationLengthCountSpecification,
3031
[RuleEvaluationMetric.LexicalOverlap]: RuleEvaluationLexicalOverlapSpecification,
31-
[RuleEvaluationMetric.SemanticSimilarity]: undefined as any, // TODO: Implement
32+
[RuleEvaluationMetric.SemanticSimilarity]: RuleEvaluationSemanticSimilaritySpecification,
3233
}
3334

3435
const specification = RuleEvaluationSpecification

packages/constants/src/evaluations/rule.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ export type RuleEvaluationLexicalOverlapResultError = z.infer<
179179

180180
const ruleEvaluationSemanticSimilarityConfiguration =
181181
ruleEvaluationConfiguration.extend({
182-
algorithm: z.literal('cosine_similarity'),
182+
algorithm: z.enum(['cosine_distance']),
183183
minSimilarity: z.number().optional(), // Percentage of similarity
184184
maxSimilarity: z.number().optional(), // Percentage of similarity
185185
})

packages/core/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@
162162
"@modelcontextprotocol/sdk": "^1.6.0",
163163
"@tavily/core": "^0.3.1",
164164
"ajv": "^8.17.1",
165+
"compute-cosine-similarity": "^1.1.0",
165166
"date-fns": "^3.6.0",
166167
"diff-match-patch": "^1.0.5",
167168
"fastest-levenshtein": "^1.0.16",

packages/core/src/repositories/evaluationResultsV2Repository.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,10 @@ export class EvaluationResultsV2Repository extends Repository<EvaluationResultV2
244244
: sql`0`.mapWith(Number),
245245
}
246246

247-
const filter = this.listByEvaluationFilter({ evaluationUuid, params })
247+
const filter = and(
248+
this.listByEvaluationFilter({ evaluationUuid, params }),
249+
isNull(evaluationResultsV2.error),
250+
)
248251

249252
const totalStats = await this.db
250253
.select(stats)

packages/core/src/services/evaluationsV2/rule/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import RuleEvaluationLengthCountSpecification from './lengthCount'
1616
import RuleEvaluationLexicalOverlapSpecification from './lexicalOverlap'
1717
import RuleEvaluationRegularExpressionSpecification from './regularExpression'
1818
import RuleEvaluationSchemaValidationSpecification from './schemaValidation'
19+
import RuleEvaluationSemanticSimilaritySpecification from './semanticSimilarity'
1920

2021
// prettier-ignore
2122
const METRICS: {
@@ -26,7 +27,7 @@ const METRICS: {
2627
[RuleEvaluationMetric.SchemaValidation]: RuleEvaluationSchemaValidationSpecification,
2728
[RuleEvaluationMetric.LengthCount]: RuleEvaluationLengthCountSpecification,
2829
[RuleEvaluationMetric.LexicalOverlap]: RuleEvaluationLexicalOverlapSpecification,
29-
[RuleEvaluationMetric.SemanticSimilarity]: undefined as any, // TODO: Implement
30+
[RuleEvaluationMetric.SemanticSimilarity]: RuleEvaluationSemanticSimilaritySpecification,
3031
}
3132

3233
const specification = RuleEvaluationSpecification
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import { createOpenAI } from '@ai-sdk/openai'
2+
import { env } from '@latitude-data/env'
3+
import { embedMany } from 'ai'
4+
import similarity from 'compute-cosine-similarity'
5+
import {
6+
EvaluationType,
7+
RuleEvaluationMetric,
8+
RuleEvaluationSemanticSimilaritySpecification,
9+
} from '../../../browser'
10+
import { database, Database } from '../../../client'
11+
import { BadRequestError, Result } from '../../../lib'
12+
import {
13+
EvaluationMetricRunArgs,
14+
EvaluationMetricValidateArgs,
15+
normalizeScore,
16+
} from '../shared'
17+
18+
// Shared (constants-level) specification of the semantic similarity rule metric.
const specification = RuleEvaluationSemanticSimilaritySpecification

// Core-side specification: the shared fields plus the `validate` and `run`
// implementations declared in this module.
export default {
  ...specification,
  validate,
  run,
}
24+
25+
/**
 * Validates the configuration of a semantic similarity evaluation.
 *
 * Each similarity bound, when set, must lie within 0-100, and when both are
 * set the minimum must be strictly below the maximum.
 *
 * @returns `Result.error` wrapping a `BadRequestError` for the first rule
 *   that fails, otherwise `Result.ok` with the accepted settings.
 */
async function validate(
  {
    configuration,
  }: EvaluationMetricValidateArgs<
    EvaluationType.Rule,
    RuleEvaluationMetric.SemanticSimilarity
  >,
  _: Database = database,
) {
  const { reverseScale, algorithm, minSimilarity, maxSimilarity } =
    configuration

  // True when a bound is set but falls outside the 0-100 percentage range.
  const isOutOfRange = (percentage: number | undefined) =>
    percentage !== undefined && (percentage < 0 || percentage > 100)

  if (isOutOfRange(minSimilarity)) {
    return Result.error(
      new BadRequestError(
        'Minimum similarity must be a number between 0 and 100',
      ),
    )
  }

  if (isOutOfRange(maxSimilarity)) {
    return Result.error(
      new BadRequestError(
        'Maximum similarity must be a number between 0 and 100',
      ),
    )
  }

  const hasBothBounds =
    minSimilarity !== undefined && maxSimilarity !== undefined
  if (hasBothBounds && minSimilarity >= maxSimilarity) {
    return Result.error(
      new BadRequestError(
        'Minimum similarity must be less than maximum similarity',
      ),
    )
  }

  // Note: every setting is listed explicitly so that no stray fields from
  // the incoming configuration object are carried over.
  return Result.ok({ reverseScale, algorithm, minSimilarity, maxSimilarity })
}
77+
78+
/**
 * Runs the semantic similarity metric for one actual/expected output pair.
 *
 * Both outputs are embedded in a single `embedMany` call using OpenAI's
 * `text-embedding-3-small` model (authenticated with `env.OPENAI_API_KEY`),
 * the two embeddings are compared, and the result is expressed as an integer
 * percentage clamped to 0-100.
 *
 * @returns On success `{ score, normalizedScore, metadata, hasPassed }`,
 *   where `hasPassed` is true when the score lies within the configured
 *   minimum/maximum (defaulting to 0 and 100). On any failure
 *   `{ error: { message } }`; this function does not throw.
 */
async function run(
  {
    evaluation,
    actualOutput,
    expectedOutput,
  }: EvaluationMetricRunArgs<
    EvaluationType.Rule,
    RuleEvaluationMetric.SemanticSimilarity
  >,
  _: Database = database,
) {
  try {
    const metadata = {
      configuration: evaluation.configuration,
      actualOutput: actualOutput,
      expectedOutput: expectedOutput,
    }

    if (!metadata.expectedOutput) {
      throw new BadRequestError('Expected output is required')
    }

    if (!env.OPENAI_API_KEY) {
      throw new BadRequestError('Internal OPENAI_API_KEY is not set')
    }

    const {
      embeddings: [actualEmbedding, expectedEmbedding],
    } = await embedMany({
      model: createOpenAI({
        apiKey: env.OPENAI_API_KEY,
        compatibility: 'strict',
      }).textEmbeddingModel('text-embedding-3-small'),
      values: [metadata.actualOutput, metadata.expectedOutput],
    })

    // Fail with a clear message instead of asserting the embeddings exist
    // and letting the similarity helper choke on `undefined`.
    if (!actualEmbedding || !expectedEmbedding) {
      throw new Error('Failed to compute embeddings for the outputs')
    }

    let score = 0

    switch (metadata.configuration.algorithm) {
      case 'cosine_distance':
        // Despite the option's name, the value used is what the
        // `compute-cosine-similarity` helper returns, scaled to a
        // percentage; a nullish result counts as 0.
        score = (similarity(actualEmbedding, expectedEmbedding) ?? 0) * 100
        break
      default:
        throw new Error('Invalid similarity algorithm')
    }

    // Round to an integer percentage and clamp into the 0-100 range.
    score = Math.min(Math.max(Number(score.toFixed(0)), 0), 100)

    const minSimilarity = metadata.configuration.minSimilarity ?? 0
    const maxSimilarity = metadata.configuration.maxSimilarity ?? 100

    // With `reverseScale` the bounds are handed to `normalizeScore` in
    // swapped order.
    let normalizedScore = normalizeScore(score, minSimilarity, maxSimilarity)
    if (metadata.configuration.reverseScale) {
      normalizedScore = normalizeScore(score, maxSimilarity, minSimilarity)
    }

    const hasPassed = score >= minSimilarity && score <= maxSimilarity

    return { score, normalizedScore, metadata, hasPassed }
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances; always report
    // a string message.
    const message = error instanceof Error ? error.message : String(error)
    return { error: { message } }
  }
}

packages/env/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ export const env = createEnv({
188188
CODESANDBOX_API_KEY: z.string().optional(),
189189
TAVILY_API_KEY: z.string().optional(),
190190
HANDINGER_API_KEY: z.string().optional(),
191+
OPENAI_API_KEY: z.string().optional(),
191192

192193
// Mail settings
193194
FROM_MAILER_EMAIL: z.string(),

0 commit comments

Comments
 (0)