diff --git a/services/terraform/remote/aws_cloudwatch_alarms.tf b/services/terraform/remote/aws_cloudwatch_alarms.tf
--- a/services/terraform/remote/aws_cloudwatch_alarms.tf
+++ b/services/terraform/remote/aws_cloudwatch_alarms.tf
@@ -13,6 +13,18 @@
     Tunnelbroker = { name = "Tunnelbroker", pattern = "Tunnelbroker Error" }
     Http = { name = "HTTP", pattern = "HTTP Error" }
   }
+
+  # Services monitored for connection errors. `name` prefixes the metric filter,
+  # metric and alarm names; `log_group_name` is the log group the filter reads.
+  service_log_groups = {
+    Backup         = { name = "Backup", log_group_name = "/ecs/backup-service-task-def" }
+    Blob           = { name = "Blob", log_group_name = "/ecs/blob-service-task-def" }
+    ElectronUpdate = { name = "ElectronUpdate", log_group_name = "/ecs/electron-update-task-def" }
+    FeatureFlags   = { name = "FeatureFlags", log_group_name = "/ecs/feature-flags-task-def" }
+    Identity       = { name = "Identity", log_group_name = "/ecs/identity-service-task-def" }
+    Reports        = { name = "Reports", log_group_name = "/ecs/reports-service-task-def" }
+    Tunnelbroker   = { name = "Tunnelbroker", log_group_name = "/ecs/tunnelbroker-task-def" }
+  }
 }
 
 resource "aws_sns_topic" "lambda_alarm_topic" {
@@ -126,3 +138,54 @@
   actions_enabled = true
   alarm_actions = [aws_sns_topic.ecs_task_stop_topic.arn]
 }
+
+# SNS topic that the service connection error alarms publish to.
+resource "aws_sns_topic" "service_connection_error_topic" {
+  name = "service-connection-error-topic"
+}
+
+# Email subscription on that topic, addressed to local.error_reports_subscribed_email.
+resource "aws_sns_topic_subscription" "service_connection_error_email_subscription" {
+  topic_arn = aws_sns_topic.service_connection_error_topic.arn
+  protocol  = "email"
+  endpoint  = local.error_reports_subscribed_email
+}
+
+# Per-service metric filter: every log event matching the pattern adds 1 to
+# <name>ConnectionErrorCount in the ServiceConnectionMetricFilters namespace.
+# NOTE(review): an unquoted two-word pattern matches events containing both
+# terms anywhere, not only the exact phrase - confirm that breadth is intended.
+resource "aws_cloudwatch_log_metric_filter" "service_connection_error_filters" {
+  for_each = local.service_log_groups
+
+  name           = "${each.value.name}ConnectionErrorCount"
+  pattern        = "dns error"
+  log_group_name = each.value.log_group_name
+
+  metric_transformation {
+    name      = "${each.value.name}ConnectionErrorCount"
+    namespace = "ServiceConnectionMetricFilters"
+    value     = "1"
+  }
+}
+
+# Per-service alarm: notifies the connection error topic when the 300-second
+# sum of that service's error count is at least 1. The metric name and
+# namespace are read from the matching metric filter so the two cannot drift
+# apart and Terraform orders the filter before the alarm.
+resource "aws_cloudwatch_metric_alarm" "service_connection_error_alarms" {
+  for_each = local.service_log_groups
+
+  alarm_name          = "${each.value.name}ConnectionErrorAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.service_connection_error_filters[each.key].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.service_connection_error_filters[each.key].metric_transformation[0].namespace
+  period              = "300"
+  statistic           = "Sum"
+  threshold           = "1"
+  alarm_description   = "Alarm when ${each.value.name} connection errors exceed threshold"
+  actions_enabled     = true
+  alarm_actions       = [aws_sns_topic.service_connection_error_topic.arn]
+}
+