Skip to content
This repository was archived by the owner on Sep 4, 2024. It is now read-only.

Reduce False Positive alarms #420

Merged
merged 5 commits into from
May 13, 2024
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 98 additions & 4 deletions bin/stacks/api-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ export class APIStack extends cdk.Stack {
const apiAlarmLatencySev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-Latency', {
alarmName: 'UnifiedRoutingAPI-SEV2-Latency',
metric: api.metricLatency({
period: Duration.minutes(5),
period: Duration.minutes(20),
statistic: 'p90',
}),
threshold: 8500,
Expand All @@ -332,7 +332,7 @@ export class APIStack extends cdk.Stack {
const apiAlarmLatencySev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-Latency', {
alarmName: 'UnifiedRoutingAPI-SEV3-Latency',
metric: api.metricLatency({
period: Duration.minutes(5),
period: Duration.minutes(20),
statistic: 'p90',
}),
threshold: 5500,
Expand All @@ -342,23 +342,115 @@ export class APIStack extends cdk.Stack {
// SEV2 alarm on the API's own p99 latency metric (`api.metricLatency`).
// It compares the p99 statistic against a threshold of 10000 over 3 evaluation periods.
// No comparison operator is given here, so whatever the Alarm construct defaults to applies.
const apiAlarmLatencyP99Sev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-LatencyP99', {
alarmName: 'UnifiedRoutingAPI-SEV2-LatencyP99',
metric: api.metricLatency({
// NOTE(review): `period` is listed twice because this text is a rendered diff whose +/- markers
// were lost. Presumably the 5-minute line is the removed value and the 20-minute line is its
// replacement (the diff header reports 4 deletions, matching the 4 such pairs) — confirm
// against the merged file, which can only contain one of them.
period: Duration.minutes(5),
period: Duration.minutes(20),
statistic: 'p99',
}),
// Presumably milliseconds: the comment on the neighbouring WithDeps alarm equates 10000 with 10s.
threshold: 10000,
evaluationPeriods: 3,
});

// SEV2 dependency-aware p99 latency alarm.
// Alarm if URA latency is high (> 10s) and Routing API is not (< 4s).
// Usually there's nothing to be done in URA when RoutingAPI latency is high,
// so high URA latency only counts here while the Routing API quoter is still fast.
//
// Both thresholds are named once (milliseconds, per the 10s / 4s figures above) and
// interpolated into the metric-math strings, so retuning the alarm means editing a
// single line per value. The resulting expression strings are unchanged.
const uraP99LatencyHighSev2Ms = 10000;
const routingApiP99LatencyLowSev2Ms = 4000;
// NOTE(review): neither the math expressions nor the metrics below set a `period`, so the
// library default applies — confirm that is intended, given the sibling latency alarms use
// an explicit 20-minute period.
const apiAlarmLatencyP99WithDepsSev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-LatencyP99WithDeps', {
  alarmName: 'UnifiedRoutingAPI-SEV2-LatencyP99WithDeps',
  actionsEnabled: true,
  // All 3 of the 3 evaluated datapoints must breach before the alarm fires.
  evaluationPeriods: 3,
  datapointsToAlarm: 3,
  // The top-level expression yields 1 when both conditions hold and 0 otherwise,
  // so ">= 1" is simply "the combined condition was true".
  threshold: 1,
  comparisonOperator: aws_cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
  metric: new aws_cloudwatch.MathExpression({
    expression: 'IF(ura_high_latency AND low_routing_api_latency, 1, 0)',
    label: 'Latency Alarm',
    usingMetrics: {
      // 1 when the API Gateway p99 latency for the 'UnifiedRouting' API exceeds the URA threshold.
      ura_high_latency: new aws_cloudwatch.MathExpression({
        expression: `IF(overall_latency > ${uraP99LatencyHighSev2Ms}, 1, 0)`,
        label: 'Overall Latency',
        usingMetrics: {
          overall_latency: new aws_cloudwatch.Metric({
            namespace: 'AWS/ApiGateway',
            metricName: 'Latency',
            dimensionsMap: {
              ApiName: 'UnifiedRouting',
            },
            statistic: 'p99',
          }),
        },
      }),
      // 1 when the custom 'RoutingApiQuoterLatency' p99 is below the Routing API threshold.
      low_routing_api_latency: new aws_cloudwatch.MathExpression({
        expression: `IF(routing_api_latency < ${routingApiP99LatencyLowSev2Ms}, 1, 0)`,
        label: 'Routing API Quoter Latency',
        usingMetrics: {
          routing_api_latency: new aws_cloudwatch.Metric({
            namespace: 'Uniswap',
            metricName: 'RoutingApiQuoterLatency',
            dimensionsMap: {
              Service: 'UnifiedRoutingAPI',
            },
            statistic: 'p99',
          }),
        },
      }),
    },
  }),
});

// SEV3 alarm on the API's own p99 latency metric (`api.metricLatency`).
// It compares the p99 statistic against a threshold of 7000 over 3 evaluation periods.
// No comparison operator is given here, so whatever the Alarm construct defaults to applies.
const apiAlarmLatencyP99Sev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-LatencyP99', {
alarmName: 'UnifiedRoutingAPI-SEV3-LatencyP99',
metric: api.metricLatency({
// NOTE(review): `period` is listed twice because this text is a rendered diff whose +/- markers
// were lost. Presumably the 5-minute line is the removed value and the 20-minute line is its
// replacement (the diff header reports 4 deletions, matching the 4 such pairs) — confirm
// against the merged file, which can only contain one of them.
period: Duration.minutes(5),
period: Duration.minutes(20),
statistic: 'p99',
}),
// Presumably milliseconds: the comment on the neighbouring WithDeps alarm equates 7000 with 7s.
threshold: 7000,
evaluationPeriods: 3,
});

// SEV3 dependency-aware p99 latency alarm.
// Alarm if URA latency is high (> 7s) and Routing API is not (< 4s)
// Usually there's nothing to be done in URA when RoutingAPI latency is high
// The top-level expression yields 1 when both conditions hold and 0 otherwise; with
// threshold 1, ">=" comparison, and 3-of-3 datapoints, all three evaluated datapoints must be 1.
// NOTE(review): neither the math expressions nor the metrics below set a `period`, so the
// library default applies — confirm that is intended, given the sibling latency alarms use
// an explicit 20-minute period.
const apiAlarmLatencyP99WithDepsSev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-LatencyP99WithDeps', {
alarmName: 'UnifiedRoutingAPI-SEV3-LatencyP99WithDeps',
actionsEnabled: true,
evaluationPeriods: 3,
datapointsToAlarm: 3,
threshold: 1,
comparisonOperator: aws_cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
metric:
new aws_cloudwatch.MathExpression({
expression: "IF(ura_high_latency AND low_routing_api_latency, 1, 0)",
label: 'Latency Alarm',
usingMetrics: {
// 1 when the API Gateway p99 latency for the 'UnifiedRouting' API exceeds 7000.
ura_high_latency: new aws_cloudwatch.MathExpression({
expression: "IF(overall_latency > 7000, 1, 0)",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: would be nice to put these numbers into constants (URA_LATENCY_SEV3 = 7000, ROUTING_API_LATENCY_SEV3 = 4000, etc.) so we can change them all in one place

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great callout! Moved these magic numbers to the constants file.

label: 'Overall Latency',
usingMetrics: {
overall_latency: new aws_cloudwatch.Metric({
namespace: 'AWS/ApiGateway',
metricName: 'Latency',
dimensionsMap: {
ApiName: 'UnifiedRouting'
},
statistic: 'p99',
}),
},
}),
// 1 when the custom 'RoutingApiQuoterLatency' p99 is below 4000.
low_routing_api_latency: new aws_cloudwatch.MathExpression({
expression: "IF(routing_api_latency < 4000, 1, 0)",
label: 'Routing API Quoter Latency',
usingMetrics: {
routing_api_latency: new aws_cloudwatch.Metric({
namespace: 'Uniswap',
metricName: 'RoutingApiQuoterLatency',
dimensionsMap: {
Service: 'UnifiedRoutingAPI'
},
statistic: 'p99',
}),
},
}),
},
}),
});

// Alarms for 200 rate being too low for each chain
const percent5XXByChainAlarm: cdk.aws_cloudwatch.Alarm[] = _.flatMap(ALL_ALARMED_CHAINS, (chainId) => {
const alarmNameSev3 = `UnifiedRoutingAPI-SEV3-5XXAlarm-ChainId-${chainId.toString()}`;
Expand Down Expand Up @@ -594,9 +686,11 @@ export class APIStack extends cdk.Stack {
// Attach an SNS alarm action targeting `chatBotTopic` to each API-level alarm listed here.
// The list order matches the original call order, and every alarm gets its own SnsAction instance.
for (const apiLevelAlarm of [
  apiAlarm5xxSev3,
  apiAlarm4xxSev3,
  apiAlarmLatencySev2,
  apiAlarmLatencyP99WithDepsSev2,
  apiAlarmLatencySev3,
  apiAlarmLatencyP99Sev2,
  apiAlarmLatencyP99Sev3,
  apiAlarmLatencyP99WithDepsSev3,
]) {
  apiLevelAlarm.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic));
}

percent5XXByChainAlarm.forEach((alarm) => {
alarm.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic));
Expand Down