cloudposse-terraform-components
diff --git a/‎.github/settings.yml
Lines changed: 2 additions & 6 deletions b/‎.github/settings.yml
Lines changed: 2 additions & 6 deletions
diff --git a/‎CHANGELOG.md
Lines changed: 17 additions & 0 deletions b/‎CHANGELOG.md
Lines changed: 17 additions & 0 deletions
diff --git a/‎README.yaml
Lines changed: 281 additions & 48 deletions b/‎README.yaml
Lines changed: 281 additions & 48 deletions
diff --git a/‎src/catalog/monitors/aurora.yaml
Lines changed: 39 additions & 0 deletions b/‎src/catalog/monitors/aurora.yaml
Lines changed: 39 additions & 0 deletions
diff --git a/‎src/catalog/monitors/ec2.yaml
Lines changed: 33 additions & 0 deletions b/‎src/catalog/monitors/ec2.yaml
Lines changed: 33 additions & 0 deletions
diff --git a/‎src/catalog/monitors/efs.yaml
Lines changed: 129 additions & 0 deletions b/‎src/catalog/monitors/efs.yaml
Lines changed: 129 additions & 0 deletions
diff --git a/‎src/catalog/monitors/elb.yaml
Lines changed: 33 additions & 0 deletions b/‎src/catalog/monitors/elb.yaml
Lines changed: 33 additions & 0 deletions
diff --git a/‎src/catalog/monitors/host.yaml
Lines changed: 118 additions & 0 deletions b/‎src/catalog/monitors/host.yaml
Lines changed: 118 additions & 0 deletions
@@ -1,11 +1,7 @@
 # Upstream changes from _extends are only recognized when modifications are made to this file in the default branch.
 _extends: .github
 repository:
-  name: template
-  description: Template for Terraform Components
+  name: aws-datadog-monitor
+  description: This component is responsible for provisioning Datadog monitors and assigning Datadog roles to the monitors
   homepage: https://cloudposse.com/accelerate
   topics: terraform, terraform-component
-
-
-
-
@@ -0,0 +1,17 @@
+## PR [#814](https://github.com/cloudposse/terraform-aws-components/pull/814)
+
+### Removed Dead Code, Possible Breaking Change
+
+The following inputs were removed because they no longer have any effect:
+
+- datadog_api_secret_key
+- datadog_app_secret_key
+- datadog_secrets_source_store_account
+- monitors_roles_map
+- role_paths
+- secrets_store_type
+
+Except for `monitors_roles_map` and `role_paths`, these inputs were deprecated in an earlier PR and replaced with
+outputs from `datadog-configuration`.
+
+The implementation of `monitors_roles_map` and `role_paths` has been lost.
@@ -0,0 +1,39 @@
+# The official Datadog API documentation with available query parameters & alert types:
+# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
+
+aurora-replica-lag:
+  # Alerts per Aurora DB instance when the minimum replica lag over the last
+  # 15 minutes stays above the thresholds below (warning 500, critical 1000;
+  # the message text describes these as half a second and 1s).
+  name: "(RDS) ${tenant} ${stage} - Aurora Replica Lag Detected"
+  type: metric alert
+  query: |
+    min(last_15m):min:aws.rds.aurora_replica_lag{stage:${ stage }} by {dbinstanceidentifier,stage,tenant,environment,team} > 1000
+  # Group-by tags are referenced as double-brace template variables so the
+  # instance identifier is substituted into the notification text.
+  message: |
+    ({{tenant.name}}-{{environment.name}}-{{stage.name}})
+    {{#is_warning}}
+    ({{dbinstanceidentifier.name}}) Replica lag has been greater than half a second for more than 15 minutes
+    {{/is_warning}}
+    {{#is_alert}}
+    ({{dbinstanceidentifier.name}}) Replica lag has been greater than 1s for more than 15 minutes
+    {{/is_alert}}
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  options:
+    notify_no_data: false
+    notify_audit: true
+    require_full_window: false
+    include_tags: true
+    renotify_interval: 60
+    timeout_h: 24
+    evaluation_delay: 60
+    new_host_delay: 300
+    no_data_timeframe: 10
+    threshold_windows: { }
+    thresholds:
+      critical: 1000  # must match the value compared against in `query`
+      warning: 500
+      #unknown:
+      #ok:
+      #critical_recovery:
+      #warning_recovery:
+  priority: 3
+  restricted_roles: null
@@ -0,0 +1,33 @@
+# The official Datadog API documentation with available query parameters & alert types:
+# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
+
+ec2-failed-status-check:
+  # Triggers when the 10-minute average of aws.ec2.status_check_failed is
+  # above 0, evaluated per instance_id within the templated stage.
+  type: metric alert
+  name: "(EC2) ${tenant} ${ stage } - Failed Status Check"
+  priority: 3
+  restricted_roles: null
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  query: |
+    avg(last_10m):avg:aws.ec2.status_check_failed{stage:${ stage }} by {instance_id,stage,tenant,environment,team} > 0
+  message: |
+    ({{tenant.name}}-{{environment.name}}-{{stage.name}}) {{instance_id}} failed a status check
+  options:
+    thresholds:
+      critical: 0
+      # unset here: warning, unknown, ok, critical_recovery, warning_recovery
+    threshold_windows: {}
+    require_full_window: true
+    include_tags: true
+    # no-data handling
+    notify_no_data: false
+    no_data_timeframe: 10
+    # notification behaviour
+    notify_audit: true
+    renotify_interval: 60
+    timeout_h: 24
+    # delays
+    evaluation_delay: 60
+    new_host_delay: 300
@@ -0,0 +1,129 @@
+# The official Datadog API documentation with available query parameters & alert types:
+# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
+
+efs-throughput-utilization-check:
+  # Compares metered I/O bytes (scaled by 100) against permitted throughput
+  # over the last hour and alerts when the resulting ratio exceeds 75.
+  # NOTE(review): the numerator is grouped by {filesystemid} only while the
+  # denominator also groups by stage/tenant/environment/team; confirm this
+  # grouping evaluates as intended.
+  type: metric alert
+  name: "(EFS) ${tenant} ${ stage } - % Throughput Utilization"
+  priority: 3
+  restricted_roles: null
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  query: |
+    avg(last_1h):(sum:aws.efs.metered_iobytes{stage:${ stage }} by {filesystemid} * 100 / 1048576) / (sum:aws.efs.permitted_throughput{stage:${ stage }} by {filesystemid,stage,tenant,environment,team} / 1048576) > 75
+  message: |
+    ({{tenant.name}}-{{environment.name}}-{{stage.name}}) {{filesystemid}} Throughput Utilization is too high
+  options:
+    thresholds:
+      critical: 75
+      warning: 50
+      # unset here: unknown, ok, critical_recovery, warning_recovery
+    threshold_windows: {}
+    require_full_window: false
+    include_tags: true
+    # no-data handling
+    notify_no_data: false
+    no_data_timeframe: 10
+    # notification behaviour
+    notify_audit: true
+    renotify_interval: 60
+    timeout_h: 24
+    # delays
+    evaluation_delay: 60
+    new_host_delay: 300
+
+# The official Datadog API documentation with available query parameters & alert types:
+# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
+
+efs-burst-balance:
+  # "Below" monitor: fires when the 1-hour minimum of the burst credit balance
+  # drops under the thresholds, so warning is the larger value and critical
+  # the smaller one.
+  type: metric alert
+  name: "(EFS) ${tenant} ${ stage } - Burst Balance Low (< 100 GB)"
+  priority: 3
+  restricted_roles: null
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  query: |
+    min(last_1h):avg:aws.efs.burst_credit_balance{stage:${ stage }} by {filesystemid,stage,tenant,environment,team} < 100000000000
+  message: |
+    ({{tenant.name}}-{{environment.name}}-{{stage.name}}) {{filesystemid}} EFS Burst Balance for {{filesystemid}} dipped below 100 GB.
+  options:
+    thresholds:
+      warning: 1000000000000  # 1 TB
+      critical: 100000000000  # 100 GB, same value the query compares against
+      # unset here: unknown, ok, critical_recovery, warning_recovery
+    threshold_windows: {}
+    require_full_window: false
+    include_tags: true
+    # no-data handling
+    notify_no_data: false
+    no_data_timeframe: 10
+    # notification behaviour
+    notify_audit: true
+    renotify_interval: 60
+    timeout_h: 24
+    # delays
+    evaluation_delay: 60
+    new_host_delay: 300
+
+efs-io-percent-limit:
+  # Fires per filesystem when the 1-hour maximum of aws.efs.percent_iolimit
+  # exceeds 90 (warning at 50).
+  type: metric alert
+  name: "(EFS) ${tenant} ${ stage } - I/O limit has been reached (> 90%)"
+  priority: 3
+  restricted_roles: null
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  query: |
+    max(last_1h):avg:aws.efs.percent_iolimit{stage:${ stage }} by {filesystemid,stage,tenant,environment,team} > 90
+  message: |
+    ({{tenant.name}}-{{environment.name}}-{{stage.name}}) {{filesystemid}} EFS I/O limit has been reached for fs {{filesystemid}}.
+  options:
+    thresholds:
+      critical: 90
+      warning: 50
+      # unset here: unknown, ok, critical_recovery, warning_recovery
+    threshold_windows: {}
+    require_full_window: false
+    include_tags: true
+    # no-data handling
+    notify_no_data: false
+    no_data_timeframe: 10
+    # notification behaviour
+    notify_audit: true
+    renotify_interval: 60
+    timeout_h: 24
+    # delays
+    evaluation_delay: 60
+    new_host_delay: 300
+
+efs-client-connection-anomaly:
+  # Anomaly-detection monitor on EFS client connections ('basic' algorithm,
+  # both directions), evaluated over the last 4 hours.
+  name: "(EFS) ${tenant} ${ stage } - Client Connection Anomaly"
+  # The Datadog API lists `query alert` as the monitor type for anomaly queries.
+  type: query alert
+  query: |
+    avg(last_4h):anomalies(avg:aws.efs.client_connections{stage:${ stage }} by {aws_account,filesystemid,name,stage,tenant,environment,team}.as_count(), 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1
+  message: |
+    ({{tenant.name}}-{{environment.name}}-{{stage.name}}) [{{name}}] EFS Client Connection Anomaly for filesystem {{filesystemid}}.
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  options:
+    notify_no_data: false
+    notify_audit: true
+    require_full_window: false
+    include_tags: true
+    renotify_interval: 60
+    timeout_h: 24
+    evaluation_delay: 60
+    new_host_delay: 300
+    no_data_timeframe: 10
+    # Anomaly monitors need explicit trigger/recovery windows; trigger_window
+    # mirrors alert_window='last_15m' in the query above.
+    # NOTE(review): confirm the consuming module forwards these two keys.
+    threshold_windows:
+      trigger_window: last_15m
+      recovery_window: last_15m
+    thresholds:
+      critical: 1
+      critical_recovery: 0
+      #warning:
+      #unknown:
+      #ok:
+      #warning_recovery:
+  priority: 3
+  restricted_roles: null
@@ -0,0 +1,33 @@
+elb-lb-httpcode-5xx-notify:
+  # Alerts per load balancer host when the 15-minute average of the maximum
+  # aws.elb.httpcode_elb_5xx count exceeds the thresholds below.
+  # HTTP 5XX codes are server errors, and the thresholds are compared against
+  # a count, so the text says "server" and carries no percent sign.
+  name: "(ELB) ${tenant} ${ stage } HTTP 5XX server error detected"
+  type: query alert
+  query: |
+    avg(last_15m):max:aws.elb.httpcode_elb_5xx{${context_dd_tags}} by {env,host,stage,tenant,environment,team} > 50
+  message: |
+    ({{tenant.name}}-{{environment.name}}-{{stage.name}}) lb:[ {{host}} ]
+    {{#is_warning}}
+    Number of HTTP 5XX server error codes generated by the load balancer > {{warn_threshold}}
+    {{/is_warning}}
+    {{#is_alert}}
+    Number of HTTP 5XX server error codes generated by the load balancer > {{threshold}}
+    {{/is_alert}}
+    Check LB
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  options:
+    notify_no_data: false
+    notify_audit: true
+    require_full_window: true
+    include_tags: true
+    renotify_interval: 60
+    timeout_h: 24
+    evaluation_delay: 60
+    new_host_delay: 300
+    no_data_timeframe: 10
+    threshold_windows: {}
+    thresholds:
+      critical: 50  # must match the value compared against in `query`
+      warning: 20
+  priority: 3
+  restricted_roles: null
@@ -0,0 +1,118 @@
+# The official Datadog API documentation with available query parameters & alert types:
+# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
+
+host-io-wait-times:
+  # Fires per host when the 10-minute average of system.cpu.iowait exceeds 50
+  # (warning at 30).
+  type: metric alert
+  name: "(Host) ${tenant} ${ stage } - I/O Wait Times"
+  priority: 3
+  restricted_roles: null
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  query: "avg(last_10m):avg:system.cpu.iowait{stage:${ stage }} by {host,stage,tenant,environment,team} > 50"
+  message: |-
+    The I/O wait time for ({{host.name}} {{host.ip}}) is very high
+  options:
+    thresholds:
+      critical: 50
+      warning: 30
+    threshold_windows: {}
+    require_full_window: true
+    include_tags: true
+    # no-data handling
+    notify_no_data: false
+    no_data_timeframe: 10
+    # notification behaviour
+    notify_audit: true
+    renotify_interval: 60
+    timeout_h: 24
+    # delays
+    evaluation_delay: 60
+    new_host_delay: 300
+
+host-disk-use:
+  # Computes used disk as (total - free) / total * 100 over the last 30
+  # minutes and alerts above 90 (warning above 80), with explicit recovery
+  # thresholds 5 points below each level.
+  # NOTE(review): the first term groups by host/stage/tenant/environment/team
+  # while the other two group by {host} only; confirm this evaluates as
+  # intended.
+  type: metric alert
+  name: "(Host) ${tenant} ${ stage } - Host Disk Usage"
+  priority: 3
+  restricted_roles: null
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  query: "avg(last_30m):(avg:system.disk.total{stage:${ stage }} by {host,stage,tenant,environment,team} - avg:system.disk.free{stage:${ stage }} by {host}) / avg:system.disk.total{stage:${ stage }} by {host} * 100 > 90"
+  message: |-
+    Disk Usage has been above threshold over 30 minutes on ({{host.name}} {{host.ip}})
+  options:
+    thresholds:
+      critical: 90
+      critical_recovery: 85
+      warning: 80
+      warning_recovery: 75
+      # unset here: unknown, ok
+    threshold_windows: {}
+    require_full_window: true
+    include_tags: true
+    # no-data handling
+    notify_no_data: false
+    no_data_timeframe: 10
+    # notification behaviour
+    notify_audit: true
+    renotify_interval: 60
+    timeout_h: 24
+    # delays
+    evaluation_delay: 60
+    new_host_delay: 300
+
+host-high-mem-use:
+  # "Below" monitor: fires per host when the 15-minute average of
+  # system.mem.pct_usable falls under 0.1 (warning under 0.15).
+  type: query alert
+  name: "(Host) ${tenant} ${ stage } - Memory Utilization"
+  priority: 3
+  restricted_roles: null
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  query: "avg(last_15m):avg:system.mem.pct_usable{stage:${ stage }} by {host,stage,tenant,environment,team} < 0.1"
+  message: |-
+    Running out of free memory on ({{host.name}} {{host.ip}})
+  options:
+    thresholds:
+      warning: 0.15
+      critical: 0.1
+      # unset here: unknown, ok, critical_recovery, warning_recovery
+    threshold_windows: {}
+    require_full_window: true
+    include_tags: true
+    # no-data handling
+    notify_no_data: false
+    no_data_timeframe: 10
+    # notification behaviour
+    notify_audit: true
+    renotify_interval: 60
+    timeout_h: 24
+    # delays
+    evaluation_delay: 60
+    new_host_delay: 300
+
+host-high-load-avg:
+  # Fires per host when the 30-minute average of system.load.norm.5 exceeds
+  # 0.8 (warning above 0.75).
+  type: metric alert
+  name: "(Host) ${tenant} ${ stage } - High System Load Average"
+  priority: 3
+  restricted_roles: null
+  escalation_message: ""
+  tags:
+    managed-by: Terraform
+  query: "avg(last_30m):avg:system.load.norm.5{stage:${ stage }} by {host,stage,tenant,environment,team} > 0.8"
+  message: |-
+    ({{tenant.name}}-{{environment.name}}-{{stage.name}}) Load average is high on ({{host.name}} {{host.ip}})
+  options:
+    thresholds:
+      critical: 0.8
+      warning: 0.75
+      # unset here: unknown, ok, critical_recovery, warning_recovery
+    threshold_windows: {}
+    require_full_window: true
+    include_tags: true
+    # no-data handling
+    notify_no_data: false
+    no_data_timeframe: 10
+    # notification behaviour
+    notify_audit: true
+    renotify_interval: 60
+    timeout_h: 24
+    # delays
+    evaluation_delay: 60
+    new_host_delay: 300