diff --git a/services/terraform/remote/aws_cloudwatch_alarms.tf b/services/terraform/remote/aws_cloudwatch_alarms.tf
--- a/services/terraform/remote/aws_cloudwatch_alarms.tf
+++ b/services/terraform/remote/aws_cloudwatch_alarms.tf
@@ -14,6 +14,15 @@
     Http = { name = "HTTP", pattern = "HTTP Error" }
   }
 
+  # Error categories tracked for the Blob service. Every entry yields one
+  # metric filter and one alarm; `pattern` is compared to $.fields.errorType.
+  blob_error_patterns = {
+    S3    = { name = "S3", pattern = "S3 Error" },
+    DDB   = { name = "DDB", pattern = "DDB Error" },
+    HTTP  = { name = "HTTP", pattern = "HTTP Error" },
+    Other = { name = "Other", pattern = "Other Error" },
+  }
+
   service_log_groups = {
     Backup = { name = "Backup", log_group_name = "/ecs/backup-service-task-def" },
     Blob = { name = "Blob", log_group_name = "/ecs/blob-service-task-def" },
@@ -128,6 +137,7 @@
 }
 
 resource "aws_cloudwatch_event_target" "ecs_task_stop" {
+  rule = aws_cloudwatch_event_rule.ecs_task_stop.name
   arn = aws_cloudwatch_log_group.ecs_task_stop.arn
 }
 
@@ -199,3 +209,52 @@
 
   alarm_actions = [aws_sns_topic.service_connection_error_topic.arn]
 }
+
+# SNS topic that the Blob error alarms publish to.
+resource "aws_sns_topic" "blob_error_topic" {
+  name = "blob-error-topic"
+}
+
+# Delivers messages from the Blob error topic by email.
+resource "aws_sns_topic_subscription" "blob_email_subscription" {
+  topic_arn = aws_sns_topic.blob_error_topic.arn
+  protocol  = "email"
+  endpoint  = local.error_reports_subscribed_email
+}
+
+# One metric filter per entry in local.blob_error_patterns. Each filter emits
+# the value 1 for every JSON log event whose level is ERROR and whose
+# fields.errorType equals the entry's pattern.
+resource "aws_cloudwatch_log_metric_filter" "blob_error_filters" {
+  for_each = local.blob_error_patterns
+
+  name    = "Blob${each.value.name}ErrorCount"
+  pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
+  # Reuse the shared map entry rather than repeating the log group path.
+  log_group_name = local.service_log_groups.Blob.log_group_name
+
+  metric_transformation {
+    name      = "Blob${each.value.name}ErrorCount"
+    namespace = "BlobServiceMetricFilters"
+    value     = "1"
+  }
+}
+
+# One alarm per entry in local.blob_error_patterns. It fires when the matching
+# error-count metric sums to at least 1 over a single 300-second period.
+resource "aws_cloudwatch_metric_alarm" "blob_error_alarms" {
+  for_each = local.blob_error_patterns
+
+  alarm_name          = "Blob${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = 1
+  metric_name         = "Blob${each.value.name}ErrorCount"
+  namespace           = "BlobServiceMetricFilters"
+  period              = 300
+  statistic           = "Sum"
+  threshold           = 1
+  alarm_description   = "Alarm when Blob ${each.value.name} errors exceed threshold"
+  actions_enabled     = true
+  alarm_actions       = [aws_sns_topic.blob_error_topic.arn]
+}
+
diff --git a/services/terraform/remote/main.tf b/services/terraform/remote/main.tf
--- a/services/terraform/remote/main.tf
+++ b/services/terraform/remote/main.tf
@@ -23,6 +23,8 @@
 
   target_account_id  = lookup(local.secrets.accountIDs, local.environment)
   terraform_role_arn = "arn:aws:iam::${local.target_account_id}:role/Terraform"
+
+  comm_services_use_json_logs = "true"
 }
 
 provider "aws" {
diff --git a/services/terraform/remote/service_identity.tf b/services/terraform/remote/service_identity.tf
--- a/services/terraform/remote/service_identity.tf
+++ b/services/terraform/remote/service_identity.tf
@@ -34,8 +34,6 @@
     http://localhost:3009
   EOT
   production_allow_origin_list = "https://web.comm.app"
-
-  comm_services_use_json_logs = "true"
 }
 
 data "aws_secretsmanager_secret" "identity_server_setup" {