diff --git a/services/terraform/remote/aws_cloudwatch_alarms.tf b/services/terraform/remote/aws_cloudwatch_alarms.tf
--- a/services/terraform/remote/aws_cloudwatch_alarms.tf
+++ b/services/terraform/remote/aws_cloudwatch_alarms.tf
@@ -14,6 +14,17 @@
     Tunnelbroker = { name = "Tunnelbroker", pattern = "Tunnelbroker Error" }
     Http = { name = "HTTP", pattern = "HTTP Error" }
   }
+
+  # Per-service CloudWatch log groups scanned by the DNS error metric filters.
+  service_log_groups = {
+    Backup         = { name = "Backup", log_group_name = "/ecs/backup-service-task-def" },
+    Blob           = { name = "Blob", log_group_name = "/ecs/blob-service-task-def" },
+    ElectronUpdate = { name = "ElectronUpdate", log_group_name = "/ecs/electron-update-service-task-def" },
+    FeatureFlags   = { name = "FeatureFlags", log_group_name = "/ecs/feature-flags-service-task-def" },
+    Identity       = { name = "Identity", log_group_name = "/ecs/identity-service-task-def" },
+    Reports        = { name = "Reports", log_group_name = "/ecs/reports-service-task-def" },
+    Tunnelbroker   = { name = "Tunnelbroker", log_group_name = "/ecs/tunnelbroker-service-task-def" }
+  }
 }
 
 resource "aws_sns_topic" "lambda_alarm_topic" {
@@ -127,3 +138,44 @@
   actions_enabled     = true
   alarm_actions       = [aws_sns_topic.ecs_task_stop_topic.arn]
 }
+
+# SNS topic that receives notifications from the per-service DNS error alarms.
+resource "aws_sns_topic" "service_dns_error_topic" {
+  name = "service-dns-error-topic"
+}
+
+# One metric filter per entry in local.service_log_groups: emits a count of 1
+# for every log event whose JSON "message" field matches the pattern below.
+resource "aws_cloudwatch_log_metric_filter" "service_dns_error_filters" {
+  for_each = local.service_log_groups
+
+  name    = "DNS${each.value.name}ErrorCount"
+  pattern = "{ $.message = \"*service connection error*\" }"
+  # Read each service's own log group; a fixed group would make every filter count the same events.
+  log_group_name = each.value.log_group_name
+
+  metric_transformation {
+    name      = "DNS${each.value.name}ErrorCount"
+    namespace = "DNSServiceMetricFilters"
+    value     = "1"
+  }
+}
+
+# One alarm per service: triggers when the matching filter metric sums to at
+# least 1 within a 300-second period and notifies the DNS error SNS topic.
+resource "aws_cloudwatch_metric_alarm" "service_dns_error_alarms" {
+  for_each = local.service_log_groups
+
+  alarm_name          = "DNS${each.value.name}ErrorAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "DNS${each.value.name}ErrorCount"
+  namespace           = "DNSServiceMetricFilters"
+  period              = "300"
+  statistic           = "Sum"
+  threshold           = "1"
+  alarm_description   = "Alarm when ${each.value.name} dns errors exceed threshold"
+  actions_enabled     = true
+  alarm_actions       = [aws_sns_topic.service_dns_error_topic.arn]
+}
+