From fad250affdd1eb5ba7dc20ab3d45c129daef4f70 Mon Sep 17 00:00:00 2001 From: Cody Born Date: Wed, 8 May 2024 20:02:43 +0100 Subject: [PATCH 1/4] Increase duration requirements for latency alarms --- bin/stacks/api-stack.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/stacks/api-stack.ts b/bin/stacks/api-stack.ts index ea44af70..68f6ef66 100644 --- a/bin/stacks/api-stack.ts +++ b/bin/stacks/api-stack.ts @@ -335,7 +335,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencySev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-Latency', { alarmName: 'UnifiedRoutingAPI-SEV2-Latency', metric: api.metricLatency({ - period: Duration.minutes(5), + period: Duration.minutes(15), statistic: 'p90', }), threshold: 8500, @@ -345,7 +345,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencySev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-Latency', { alarmName: 'UnifiedRoutingAPI-SEV3-Latency', metric: api.metricLatency({ - period: Duration.minutes(5), + period: Duration.minutes(15), statistic: 'p90', }), threshold: 5500, @@ -355,7 +355,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencyP99Sev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-LatencyP99', { alarmName: 'UnifiedRoutingAPI-SEV2-LatencyP99', metric: api.metricLatency({ - period: Duration.minutes(5), + period: Duration.minutes(15), statistic: 'p99', }), threshold: 10000, @@ -365,7 +365,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencyP99Sev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-LatencyP99', { alarmName: 'UnifiedRoutingAPI-SEV3-LatencyP99', metric: api.metricLatency({ - period: Duration.minutes(5), + period: Duration.minutes(15), statistic: 'p99', }), threshold: 7000, From f13c26e1916acac7e7929c7baa202580197df56f Mon Sep 17 00:00:00 2001 From: Cody Born Date: Fri, 10 May 2024 20:59:12 +0100 Subject: [PATCH 2/4] Add compound metrics --- bin/stacks/api-stack.ts | 102 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 98 insertions(+), 4 deletions(-) diff --git a/bin/stacks/api-stack.ts b/bin/stacks/api-stack.ts index 5f2271b1..26444cbf 100644 --- a/bin/stacks/api-stack.ts +++ b/bin/stacks/api-stack.ts @@ -322,7 +322,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencySev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-Latency', { alarmName: 'UnifiedRoutingAPI-SEV2-Latency', metric: api.metricLatency({ - period: Duration.minutes(15), + period: Duration.minutes(30), statistic: 'p90', }), threshold: 8500, @@ -332,7 +332,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencySev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-Latency', { alarmName: 'UnifiedRoutingAPI-SEV3-Latency', metric: api.metricLatency({ - period: Duration.minutes(15), + period: Duration.minutes(30), statistic: 'p90', }), threshold: 5500, @@ -342,23 +342,115 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencyP99Sev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-LatencyP99', { alarmName: 'UnifiedRoutingAPI-SEV2-LatencyP99', metric: api.metricLatency({ - period: Duration.minutes(15), + period: Duration.minutes(30), statistic: 'p99', }), threshold: 10000, evaluationPeriods: 3, }); + // Alarm if URA latency is high (> 10s) and Routing API is not (< 4s) + // Usually there's nothing to be done in URA when RoutingAPI latency is high + const apiAlarmLatencyP99WithDepsSev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-LatencyP99WithDeps', { + alarmName: 'UnifiedRoutingAPI-SEV2-LatencyP99WithDeps', + actionsEnabled: true, + evaluationPeriods: 3, + datapointsToAlarm: 3, + threshold: 1, + comparisonOperator: aws_cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD, + metric: + new aws_cloudwatch.MathExpression({ + expression: "IF(ura_high_latency AND low_routing_api_latency, 1, 0)", + label: 'Latency Alarm', + usingMetrics: { + ura_high_latency: new aws_cloudwatch.MathExpression({ + expression: "IF(overall_latency > 10000, 1, 0)", + label: 'Overall Latency', + usingMetrics: { + overall_latency: new aws_cloudwatch.Metric({ + namespace: 'AWS/ApiGateway', + metricName: 'Latency', + dimensionsMap: { + ApiName: 'UnifiedRouting' + }, + statistic: 'p99', + }), + }, + }), + low_routing_api_latency: new aws_cloudwatch.MathExpression({ + expression: "IF(routing_api_latency < 4000, 1, 0)", + label: 'Routing API Quoter Latency', + usingMetrics: { + routing_api_latency: new aws_cloudwatch.Metric({ + namespace: 'Uniswap', + metricName: 'RoutingApiQuoterLatency', + dimensionsMap: { + Service: 'UnifiedRoutingAPI' + }, + statistic: 'p99', + }), + }, + }), + }, + }), + }); + const apiAlarmLatencyP99Sev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-LatencyP99', { alarmName: 'UnifiedRoutingAPI-SEV3-LatencyP99', metric: api.metricLatency({ - period: Duration.minutes(15), + period: Duration.minutes(30), statistic: 'p99', }), threshold: 7000, evaluationPeriods: 3, }); + // Alarm if URA latency is high (> 7s) and Routing API is not (< 4s) + // Usually there's nothing to be done in URA when RoutingAPI latency is high + const apiAlarmLatencyP99WithDepsSev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-LatencyP99WithDeps', { + alarmName: 'UnifiedRoutingAPI-SEV3-LatencyP99WithDeps', + actionsEnabled: true, + evaluationPeriods: 3, + datapointsToAlarm: 3, + threshold: 1, + comparisonOperator: aws_cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD, + metric: + new aws_cloudwatch.MathExpression({ + expression: "IF(ura_high_latency AND low_routing_api_latency, 1, 0)", + label: 'Latency Alarm', + usingMetrics: { + ura_high_latency: new aws_cloudwatch.MathExpression({ + expression: "IF(overall_latency > 7000, 1, 0)", + label: 'Overall Latency', + usingMetrics: { + overall_latency: new aws_cloudwatch.Metric({ + namespace: 'AWS/ApiGateway', + metricName: 'Latency', + dimensionsMap: { + ApiName: 'UnifiedRouting' + }, + statistic: 'p99', + }), + }, + }), + low_routing_api_latency: new aws_cloudwatch.MathExpression({ + expression: "IF(routing_api_latency < 4000, 1, 0)", + label: 'Routing API Quoter Latency', + usingMetrics: { + routing_api_latency: new aws_cloudwatch.Metric({ + namespace: 'Uniswap', + metricName: 'RoutingApiQuoterLatency', + dimensionsMap: { + Service: 'UnifiedRoutingAPI' + }, + statistic: 'p99', + }), + }, + }), + }, + }), + }); + // Alarms for 200 rate being too low for each chain const percent5XXByChainAlarm: cdk.aws_cloudwatch.Alarm[] = _.flatMap(ALL_ALARMED_CHAINS, (chainId) => { const alarmNameSev3 = `UnifiedRoutingAPI-SEV3-5XXAlarm-ChainId-${chainId.toString()}`; @@ -594,9 +686,11 @@ export class APIStack extends cdk.Stack { apiAlarm5xxSev3.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic)); apiAlarm4xxSev3.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic)); apiAlarmLatencySev2.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic)); + apiAlarmLatencyP99WithDepsSev2.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic)); apiAlarmLatencySev3.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic)); apiAlarmLatencyP99Sev2.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic)); apiAlarmLatencyP99Sev3.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic)); + apiAlarmLatencyP99WithDepsSev3.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic)); percent5XXByChainAlarm.forEach((alarm) => { alarm.addAlarmAction(new cdk.aws_cloudwatch_actions.SnsAction(chatBotTopic)); From 263d9a28a58bd67c4a7988127d7a1389c7dd711e Mon Sep 17 00:00:00 2001 From: Cody Born Date: Fri, 10 May 2024 21:13:47 +0100 Subject: [PATCH 3/4] Set simple latency alerts to 20 min --- bin/stacks/api-stack.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/stacks/api-stack.ts b/bin/stacks/api-stack.ts index 26444cbf..e48a8602 100644 --- a/bin/stacks/api-stack.ts +++ b/bin/stacks/api-stack.ts @@ -322,7 +322,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencySev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-Latency', { alarmName: 'UnifiedRoutingAPI-SEV2-Latency', metric: api.metricLatency({ - period: Duration.minutes(30), + period: Duration.minutes(20), statistic: 'p90', }), threshold: 8500, @@ -332,7 +332,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencySev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-Latency', { alarmName: 'UnifiedRoutingAPI-SEV3-Latency', metric: api.metricLatency({ - period: Duration.minutes(30), + period: Duration.minutes(20), statistic: 'p90', }), threshold: 5500, @@ -342,7 +342,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencyP99Sev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-LatencyP99', { alarmName: 'UnifiedRoutingAPI-SEV2-LatencyP99', metric: api.metricLatency({ - period: Duration.minutes(30), + period: Duration.minutes(20), statistic: 'p99', }), threshold: 10000, @@ -398,7 +398,7 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencyP99Sev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-LatencyP99', { alarmName: 'UnifiedRoutingAPI-SEV3-LatencyP99', metric: api.metricLatency({ - period: Duration.minutes(30), + period: Duration.minutes(20), statistic: 'p99', }), threshold: 7000, From ebd7c0bfa2d20fd5f164307e715e0fb65b568812 Mon Sep 17 00:00:00 2001 From: Cody Born Date: Sat, 11 May 2024 11:53:48 +0100 Subject: [PATCH 4/4] Move magic numbers to constants --- bin/constants.ts | 6 ++++++ bin/stacks/api-stack.ts | 30 +++++++++++++++--------------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/bin/constants.ts b/bin/constants.ts index 1cfced4b..5622f14d 100644 --- a/bin/constants.ts +++ b/bin/constants.ts @@ -2,3 +2,9 @@ // do not change again. Changing would cause every piece of infrastructure to change // name, and thus be redeployed. Should be camel case and contain no non-alphanumeric characters. export const SERVICE_NAME = 'UnifiedRouting'; +export const SEV3_P99LATENCY_MS = 7000; +export const SEV2_P99LATENCY_MS = 10000; +export const SEV3_P90LATENCY_MS = 5500; +export const SEV2_P90LATENCY_MS = 8500; +export const ROUTING_API_MAX_LATENCY_MS = 4000; +export const LATENCY_ALARM_DEFAULT_PERIOD_MIN = 20; diff --git a/bin/stacks/api-stack.ts b/bin/stacks/api-stack.ts index e48a8602..352ab743 100644 --- a/bin/stacks/api-stack.ts +++ b/bin/stacks/api-stack.ts @@ -15,7 +15,7 @@ import * as path from 'path'; import _ from 'lodash'; import { ChainConfigManager } from '../../lib/config/ChainConfigManager'; import { STAGE } from '../../lib/util/stage'; -import { SERVICE_NAME } from '../constants'; +import { ROUTING_API_MAX_LATENCY_MS, SERVICE_NAME, SEV2_P99LATENCY_MS, SEV2_P90LATENCY_MS, SEV3_P99LATENCY_MS, SEV3_P90LATENCY_MS, LATENCY_ALARM_DEFAULT_PERIOD_MIN } from '../constants'; import { AnalyticsStack } from './analytics-stack'; import { DashboardStack } from './dashboard-stack'; import { XPairDashboardStack } from './pair-dashboard-stack'; @@ -322,34 +322,34 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencySev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-Latency', { alarmName: 'UnifiedRoutingAPI-SEV2-Latency', metric: api.metricLatency({ - period: Duration.minutes(20), + period: Duration.minutes(LATENCY_ALARM_DEFAULT_PERIOD_MIN), statistic: 'p90', }), - threshold: 8500, + threshold: SEV2_P90LATENCY_MS, evaluationPeriods: 3, }); const apiAlarmLatencySev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-Latency', { alarmName: 'UnifiedRoutingAPI-SEV3-Latency', metric: api.metricLatency({ - period: Duration.minutes(20), + period: Duration.minutes(LATENCY_ALARM_DEFAULT_PERIOD_MIN), statistic: 'p90', }), - threshold: 5500, + threshold: SEV3_P90LATENCY_MS, evaluationPeriods: 3, }); const apiAlarmLatencyP99Sev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-LatencyP99', { alarmName: 'UnifiedRoutingAPI-SEV2-LatencyP99', metric: api.metricLatency({ - period: Duration.minutes(20), + period: Duration.minutes(LATENCY_ALARM_DEFAULT_PERIOD_MIN), statistic: 'p99', }), - threshold: 10000, + threshold: SEV2_P99LATENCY_MS, evaluationPeriods: 3, }); - // Alarm if URA latency is high (> 10s) and Routing API is not (< 4s) + // Alarm if URA latency is high (> SEV2_P99LATENCY_MS) and Routing API is not (< ROUTING_API_MAX_LATENCY_MS) // Usually there's nothing to be done in URA when RoutingAPI latency is high const apiAlarmLatencyP99WithDepsSev2 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV2-LatencyP99WithDeps', { alarmName: 'UnifiedRoutingAPI-SEV2-LatencyP99WithDeps', @@ -364,7 +364,7 @@ export class APIStack extends cdk.Stack { label: 'Latency Alarm', usingMetrics: { ura_high_latency: new aws_cloudwatch.MathExpression({ - expression: "IF(overall_latency > 10000, 1, 0)", + expression: `IF(overall_latency > ${SEV2_P99LATENCY_MS}, 1, 0)`, label: 'Overall Latency', usingMetrics: { overall_latency: new aws_cloudwatch.Metric({ @@ -378,7 +378,7 @@ export class APIStack extends cdk.Stack { }, }), low_routing_api_latency: new aws_cloudwatch.MathExpression({ - expression: "IF(routing_api_latency < 4000, 1, 0)", + expression: `IF(routing_api_latency < ${ROUTING_API_MAX_LATENCY_MS}, 1, 0)`, label: 'Routing API Quoter Latency', usingMetrics: { routing_api_latency: new aws_cloudwatch.Metric({ @@ -398,14 +398,14 @@ export class APIStack extends cdk.Stack { const apiAlarmLatencyP99Sev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-LatencyP99', { alarmName: 'UnifiedRoutingAPI-SEV3-LatencyP99', metric: api.metricLatency({ - period: Duration.minutes(20), + period: Duration.minutes(LATENCY_ALARM_DEFAULT_PERIOD_MIN), statistic: 'p99', }), - threshold: 7000, + threshold: SEV3_P99LATENCY_MS, evaluationPeriods: 3, }); - // Alarm if URA latency is high (> 7s) and Routing API is not (< 4s) + // Alarm if URA latency is high (> SEV3_P99LATENCY_MS) and Routing API is not (< ROUTING_API_MAX_LATENCY_MS) // Usually there's nothing to be done in URA when RoutingAPI latency is high const apiAlarmLatencyP99WithDepsSev3 = new aws_cloudwatch.Alarm(this, 'UnifiedRoutingAPI-SEV3-LatencyP99WithDeps', { alarmName: 'UnifiedRoutingAPI-SEV3-LatencyP99WithDeps', @@ -420,7 +420,7 @@ export class APIStack extends cdk.Stack { label: 'Latency Alarm', usingMetrics: { ura_high_latency: new aws_cloudwatch.MathExpression({ - expression: "IF(overall_latency > 7000, 1, 0)", + expression: `IF(overall_latency > ${SEV3_P99LATENCY_MS}, 1, 0)`, label: 'Overall Latency', usingMetrics: { overall_latency: new aws_cloudwatch.Metric({ @@ -434,7 +434,7 @@ export class APIStack extends cdk.Stack { }, }), low_routing_api_latency: new aws_cloudwatch.MathExpression({ - expression: "IF(routing_api_latency < 4000, 1, 0)", + expression: `IF(routing_api_latency < ${ROUTING_API_MAX_LATENCY_MS}, 1, 0)`, label: 'Routing API Quoter Latency', usingMetrics: { routing_api_latency: new aws_cloudwatch.Metric({