Skip to content

Commit c218be9

Browse files
committed
Initial commit
1 parent d102590 commit c218be9

19 files changed

+1782
-58
lines changed

.github/settings.yml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
11
# Upstream changes from _extends are only recognized when modifications are made to this file in the default branch.
22
_extends: .github
33
repository:
4-
name: template
5-
description: Template for Terraform Components
4+
name: aws-datadog-monitor
5+
description: This component is responsible for provisioning Datadog monitors and assigning Datadog roles to the monitors
66
homepage: https://cloudposse.com/accelerate
77
topics: terraform, terraform-component
8-
9-
10-
11-

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
## PR [#814](https://github.com/cloudposse/terraform-aws-components/pull/814)
2+
3+
### Removed Dead Code, Possible Breaking Change
4+
5+
The following inputs were removed because they no longer have any effect:
6+
7+
- datadog_api_secret_key
8+
- datadog_app_secret_key
9+
- datadog_secrets_source_store_account
10+
- monitors_roles_map
11+
- role_paths
12+
- secrets_store_type
13+
14+
Except for `monitors_roles_map` and `role_paths`, these inputs were deprecated in an earlier PR, and replaced with
15+
outputs from `datadog-configuration`.
16+
17+
The implementation of `monitors_roles_map` and `role_paths` has been lost.

README.yaml

Lines changed: 281 additions & 48 deletions
Large diffs are not rendered by default.

src/catalog/monitors/aurora.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# The official Datadog API documentation with available query parameters & alert types:
2+
# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
3+
4+
aurora-replica-lag:
5+
name: "(RDS) ${tenant} ${stage} - Aurora Replica Lag Detected"
6+
type: metric alert
7+
query: |
8+
min(last_15m):min:aws.rds.aurora_replica_lag{stage:${ stage }} by {dbinstanceidentifier,stage,tenant,environment,team} > 1000
9+
message: |
10+
({{tenant.name}}-{{environment.name}}-{{stage.name}})
11+
{{#is_warning}}
12+
({{dbinstanceidentifier.name}}) Replica lag has been greater than half a second for more than 15 minutes
13+
{{/is_warning}}
14+
{{#is_alert}}
15+
({{dbinstanceidentifier.name}}) Replica lag has been greater than 1s for more than 15 minutes
16+
{{/is_alert}}
17+
escalation_message: ""
18+
tags:
19+
managed-by: Terraform
20+
options:
21+
notify_no_data: false
22+
notify_audit: true
23+
require_full_window: false
24+
include_tags: true
25+
renotify_interval: 60
26+
timeout_h: 24
27+
evaluation_delay: 60
28+
new_host_delay: 300
29+
no_data_timeframe: 10
30+
threshold_windows: { }
31+
thresholds:
32+
critical: 1000
33+
warning: 500
34+
#unknown:
35+
#ok:
36+
#critical_recovery:
37+
#warning_recovery:
38+
priority: 3
39+
restricted_roles: null

src/catalog/monitors/ec2.yaml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# The official Datadog API documentation with available query parameters & alert types:
2+
# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
3+
4+
ec2-failed-status-check:
5+
name: "(EC2) ${tenant} ${ stage } - Failed Status Check"
6+
type: metric alert
7+
query: |
8+
avg(last_10m):avg:aws.ec2.status_check_failed{stage:${ stage }} by {instance_id,stage,tenant,environment,team} > 0
9+
message: |
10+
({{tenant.name}}-{{environment.name}}-{{stage.name}}) {{instance_id}} failed a status check
11+
escalation_message: ""
12+
tags:
13+
managed-by: Terraform
14+
options:
15+
notify_no_data: false
16+
notify_audit: true
17+
require_full_window: true
18+
include_tags: true
19+
renotify_interval: 60
20+
timeout_h: 24
21+
evaluation_delay: 60
22+
new_host_delay: 300
23+
no_data_timeframe: 10
24+
threshold_windows: { }
25+
thresholds:
26+
critical: 0
27+
#warning:
28+
#unknown:
29+
#ok:
30+
#critical_recovery:
31+
#warning_recovery:
32+
priority: 3
33+
restricted_roles: null

src/catalog/monitors/efs.yaml

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# The official Datadog API documentation with available query parameters & alert types:
2+
# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
3+
4+
efs-throughput-utilization-check:
5+
name: "(EFS) ${tenant} ${ stage } - % Throughput Utilization"
6+
type: metric alert
7+
query: |
8+
avg(last_1h):(sum:aws.efs.metered_iobytes{stage:${ stage }} by {filesystemid} * 100 / 1048576) / (sum:aws.efs.permitted_throughput{stage:${ stage }} by {filesystemid,stage,tenant,environment,team} / 1048576) > 75
9+
message: |
10+
({{tenant.name}}-{{environment.name}}-{{stage.name}}) {{filesystemid}} Throughput Utilization is too high
11+
escalation_message: ""
12+
tags:
13+
managed-by: Terraform
14+
options:
15+
notify_no_data: false
16+
notify_audit: true
17+
require_full_window: false
18+
include_tags: true
19+
renotify_interval: 60
20+
timeout_h: 24
21+
evaluation_delay: 60
22+
new_host_delay: 300
23+
no_data_timeframe: 10
24+
threshold_windows: { }
25+
thresholds:
26+
critical: 75
27+
warning: 50
28+
#unknown:
29+
#ok:
30+
#critical_recovery:
31+
#warning_recovery:
32+
priority: 3
33+
restricted_roles: null
34+
35+
# The official Datadog API documentation with available query parameters & alert types:
36+
# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
37+
38+
efs-burst-balance:
39+
name: "(EFS) ${tenant} ${ stage } - Burst Balance Low (< 100 GB)"
40+
type: metric alert
41+
query: |
42+
min(last_1h):avg:aws.efs.burst_credit_balance{stage:${ stage }} by {filesystemid,stage,tenant,environment,team} < 100000000000
43+
message: |
44+
({{tenant.name}}-{{environment.name}}-{{stage.name}}) {{filesystemid}} EFS Burst Balance for {{filesystemid}} dipped below 100 GB.
45+
escalation_message: ""
46+
tags:
47+
managed-by: Terraform
48+
options:
49+
notify_no_data: false
50+
notify_audit: true
51+
require_full_window: false
52+
include_tags: true
53+
renotify_interval: 60
54+
timeout_h: 24
55+
evaluation_delay: 60
56+
new_host_delay: 300
57+
no_data_timeframe: 10
58+
threshold_windows: { }
59+
thresholds:
60+
critical: 100000000000 # 100 GB
61+
warning: 1000000000000 # 1TB
62+
#unknown:
63+
#ok:
64+
#critical_recovery:
65+
#warning_recovery:
66+
priority: 3
67+
restricted_roles: null
68+
69+
efs-io-percent-limit:
70+
name: "(EFS) ${tenant} ${ stage } - I/O limit has been reached (> 90%)"
71+
type: metric alert
72+
query: |
73+
max(last_1h):avg:aws.efs.percent_iolimit{stage:${ stage }} by {filesystemid,stage,tenant,environment,team} > 90
74+
message: |
75+
({{tenant.name}}-{{environment.name}}-{{stage.name}}) {{filesystemid}} EFS I/O limit has been reached for fs {{filesystemid}}.
76+
escalation_message: ""
77+
tags:
78+
managed-by: Terraform
79+
options:
80+
notify_no_data: false
81+
notify_audit: true
82+
require_full_window: false
83+
include_tags: true
84+
renotify_interval: 60
85+
timeout_h: 24
86+
evaluation_delay: 60
87+
new_host_delay: 300
88+
no_data_timeframe: 10
89+
threshold_windows: { }
90+
thresholds:
91+
critical: 90
92+
warning: 50
93+
#unknown:
94+
#ok:
95+
#critical_recovery:
96+
#warning_recovery:
97+
priority: 3
98+
restricted_roles: null
99+
100+
efs-client-connection-anomaly:
101+
name: "(EFS) ${tenant} ${ stage } - Client Connection Anomaly"
102+
type: metric alert
103+
query: |
104+
avg(last_4h):anomalies(avg:aws.efs.client_connections{stage:${ stage }} by {aws_account,filesystemid,name,stage,tenant,environment,team}.as_count(), 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1
105+
message: |
106+
({{tenant.name}}-{{environment.name}}-{{stage.name}}) [{{name}}] EFS Client Connection Anomaly for filesystem {{filesystemid}}.
107+
escalation_message: ""
108+
tags:
109+
managed-by: Terraform
110+
options:
111+
notify_no_data: false
112+
notify_audit: true
113+
require_full_window: false
114+
include_tags: true
115+
renotify_interval: 60
116+
timeout_h: 24
117+
evaluation_delay: 60
118+
new_host_delay: 300
119+
no_data_timeframe: 10
120+
threshold_windows: { }
121+
thresholds:
122+
critical: 1
123+
critical_recovery: 0
124+
#warning:
125+
#unknown:
126+
#ok:
127+
#warning_recovery:
128+
priority: 3
129+
restricted_roles: null

src/catalog/monitors/elb.yaml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
elb-lb-httpcode-5xx-notify:
2+
name: "(ELB) ${tenant} ${ stage } HTTP 5XX server error detected"
3+
type: query alert
4+
query: |
5+
avg(last_15m):max:aws.elb.httpcode_elb_5xx{${context_dd_tags}} by {env,host,stage,tenant,environment,team} > 50
6+
message: |
7+
({{tenant.name}}-{{environment.name}}-{{stage.name}}) lb:[ {{host}} ]
8+
{{#is_warning}}
9+
Number of HTTP 5XX server error codes generated by the load balancer > {{warn_threshold}}
10+
{{/is_warning}}
11+
{{#is_alert}}
12+
Number of HTTP 5XX server error codes generated by the load balancer > {{threshold}}
13+
{{/is_alert}}
14+
Check LB
15+
escalation_message: ""
16+
tags:
17+
managed-by: Terraform
18+
options:
19+
notify_no_data: false
20+
notify_audit: true
21+
require_full_window: true
22+
include_tags: true
23+
renotify_interval: 60
24+
timeout_h: 24
25+
evaluation_delay: 60
26+
new_host_delay: 300
27+
no_data_timeframe: 10
28+
threshold_windows: {}
29+
thresholds:
30+
critical: 50
31+
warning: 20
32+
priority: 3
33+
restricted_roles: null

src/catalog/monitors/host.yaml

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# The official Datadog API documentation with available query parameters & alert types:
2+
# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
3+
4+
host-io-wait-times:
5+
name: "(Host) ${tenant} ${ stage } - I/O Wait Times"
6+
type: metric alert
7+
query: "avg(last_10m):avg:system.cpu.iowait{stage:${ stage }} by {host,stage,tenant,environment,team} > 50"
8+
message: |-
9+
The I/O wait time for ({{host.name}} {{host.ip}}) is very high
10+
escalation_message: ""
11+
tags:
12+
managed-by: Terraform
13+
options:
14+
notify_no_data: false
15+
notify_audit: true
16+
require_full_window: true
17+
include_tags: true
18+
renotify_interval: 60
19+
timeout_h: 24
20+
evaluation_delay: 60
21+
new_host_delay: 300
22+
no_data_timeframe: 10
23+
threshold_windows: { }
24+
thresholds:
25+
critical: 50
26+
warning: 30
27+
priority: 3
28+
restricted_roles: null
29+
30+
host-disk-use:
31+
name: "(Host) ${tenant} ${ stage } - Host Disk Usage"
32+
type: metric alert
33+
query: "avg(last_30m):(avg:system.disk.total{stage:${ stage }} by {host,stage,tenant,environment,team} - avg:system.disk.free{stage:${ stage }} by {host}) / avg:system.disk.total{stage:${ stage }} by {host} * 100 > 90"
34+
message: |-
35+
Disk Usage has been above threshold over 30 minutes on ({{host.name}} {{host.ip}})
36+
escalation_message: ""
37+
tags:
38+
managed-by: Terraform
39+
options:
40+
notify_no_data: false
41+
notify_audit: true
42+
require_full_window: true
43+
include_tags: true
44+
renotify_interval: 60
45+
timeout_h: 24
46+
evaluation_delay: 60
47+
new_host_delay: 300
48+
no_data_timeframe: 10
49+
threshold_windows: { }
50+
thresholds:
51+
critical: 90
52+
warning: 80
53+
#unknown:
54+
#ok:
55+
critical_recovery: 85
56+
warning_recovery: 75
57+
priority: 3
58+
restricted_roles: null
59+
60+
host-high-mem-use:
61+
name: "(Host) ${tenant} ${ stage } - Memory Utilization"
62+
type: query alert
63+
query: "avg(last_15m):avg:system.mem.pct_usable{stage:${ stage }} by {host,stage,tenant,environment,team} < 0.1"
64+
message: |-
65+
Running out of free memory on ({{host.name}} {{host.ip}})
66+
escalation_message: ""
67+
tags:
68+
managed-by: Terraform
69+
options:
70+
notify_no_data: false
71+
notify_audit: true
72+
require_full_window: true
73+
include_tags: true
74+
renotify_interval: 60
75+
timeout_h: 24
76+
evaluation_delay: 60
77+
new_host_delay: 300
78+
no_data_timeframe: 10
79+
threshold_windows: { }
80+
thresholds:
81+
critical: 0.1
82+
warning: 0.15
83+
#unknown:
84+
#ok:
85+
#critical_recovery:
86+
#warning_recovery:
87+
priority: 3
88+
restricted_roles: null
89+
90+
host-high-load-avg:
91+
name: "(Host) ${tenant} ${ stage } - High System Load Average"
92+
type: metric alert
93+
query: "avg(last_30m):avg:system.load.norm.5{stage:${ stage }} by {host,stage,tenant,environment,team} > 0.8"
94+
message: |-
95+
({{tenant.name}}-{{environment.name}}-{{stage.name}}) Load average is high on ({{host.name}} {{host.ip}})
96+
escalation_message: ""
97+
tags:
98+
managed-by: Terraform
99+
options:
100+
notify_no_data: false
101+
notify_audit: true
102+
require_full_window: true
103+
include_tags: true
104+
renotify_interval: 60
105+
timeout_h: 24
106+
evaluation_delay: 60
107+
new_host_delay: 300
108+
no_data_timeframe: 10
109+
threshold_windows: { }
110+
thresholds:
111+
critical: 0.8
112+
warning: 0.75
113+
#unknown:
114+
#ok:
115+
#critical_recovery:
116+
#warning_recovery:
117+
priority: 3
118+
restricted_roles: null

0 commit comments

Comments
 (0)