diff --git a/services/terraform/remote/aws_cloudwatch_alarms.tf b/services/terraform/remote/aws_cloudwatch_alarms.tf index 80a2b34ae..aa4a83ca2 100644 --- a/services/terraform/remote/aws_cloudwatch_alarms.tf +++ b/services/terraform/remote/aws_cloudwatch_alarms.tf @@ -1,201 +1,250 @@ locals { error_reports_subscribed_email = "error-reports@comm.app" lambda_error_threshold = "2" identity_error_threshold = "1" identity_error_patterns = { Search = { name = "Search", pattern = "Search Error" }, Sync = { name = "Sync", pattern = "Sync Error" }, Database = { name = "DB", pattern = "*DB Error" }, GrpcServices = { name = "GrpcServices", pattern = "gRPC Services Error" }, Siwe = { name = "Siwe", pattern = "SIWE Error" }, Tunnelbroker = { name = "Tunnelbroker", pattern = "Tunnelbroker Error" } Http = { name = "HTTP", pattern = "HTTP Error" } } + blob_error_patterns = { + S3 = { name = "S3", pattern = "S3 Error" }, + DDB = { name = "DDB", pattern = "DDB Error" }, + HTTP = { name = "HTTP", pattern = "HTTP Error" }, + Other = { name = "Other", pattern = "Other Error" }, + } + service_log_groups = { Backup = { name = "Backup", log_group_name = "/ecs/backup-service-task-def" }, Blob = { name = "Blob", log_group_name = "/ecs/blob-service-task-def" }, ElectronUpdate = { name = "ElectronUpdate", log_group_name = "/ecs/electron-update-task-def" }, FeatureFlags = { name = "FeatureFlags", log_group_name = "/ecs/feature-flags-task-def" }, Identity = { name = "Identity", log_group_name = "/ecs/identity-service-task-def" }, Reports = { name = "Reports", log_group_name = "/ecs/reports-service-task-def" }, Tunnelbroker = { name = "Tunnelbroker", log_group_name = "/ecs/tunnelbroker-task-def" } } } resource "aws_sns_topic" "lambda_alarm_topic" { name = "lambda-error-alarm-topic" } resource "aws_sns_topic_subscription" "email_subscription" { topic_arn = aws_sns_topic.lambda_alarm_topic.arn protocol = "email" endpoint = local.error_reports_subscribed_email } resource "aws_cloudwatch_metric_alarm" "lambda_error_alarm" { alarm_name = "SearchIndexLambdaErrorAlarm" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "1" metric_name = "LambdaErrors" namespace = "AWS/Lambda" period = "300" statistic = "Sum" threshold = local.lambda_error_threshold alarm_description = "Alarm tracking search index lambda function failure" actions_enabled = true alarm_actions = [aws_sns_topic.lambda_alarm_topic.arn] dimensions = { FunctionName = module.shared.search_index_lambda.function_name } } resource "aws_sns_topic" "identity_error_topic" { name = "identity-error-topic" } resource "aws_sns_topic_subscription" "identity_email_subscription" { topic_arn = aws_sns_topic.identity_error_topic.arn protocol = "email" endpoint = local.error_reports_subscribed_email } resource "aws_sns_topic_subscription" "ecs_task_stop_subscription" { topic_arn = aws_sns_topic.ecs_task_stop_topic.arn protocol = "email" endpoint = local.error_reports_subscribed_email } resource "aws_cloudwatch_log_metric_filter" "identity_error_filters" { for_each = local.identity_error_patterns name = "Identity${each.value.name}ErrorCount" pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }" log_group_name = "/ecs/identity-service-task-def" metric_transformation { name = "Identity${each.value.name}ErrorCount" namespace = "IdentityServiceMetricFilters" value = "1" } } resource "aws_cloudwatch_metric_alarm" "identity_error_alarms" { for_each = local.identity_error_patterns alarm_name = "Identity${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "1" metric_name = "Identity${each.value.name}ErrorCount" namespace = "IdentityServiceMetricFilters" period = "300" statistic = "Sum" threshold = local.identity_error_threshold alarm_description = "Alarm when Identity ${each.value.name} errors exceed threshold" actions_enabled = true alarm_actions = [aws_sns_topic.identity_error_topic.arn] } resource "aws_sns_topic" "ecs_task_stop_topic" { name = "ecs-task-stop-topic" } resource "aws_cloudwatch_event_rule" "ecs_task_stop" { name = "ecs-task-stop-rule" description = "Filters for ecs task stop events" event_pattern = jsonencode({ source = ["aws.ecs"], detail = { lastStatus = ["STOPPED"], clusterArn = [aws_ecs_cluster.comm_services.arn], containers = { exitCode = [ { anything-but = 0 } ] } } }) } resource "aws_cloudwatch_log_group" "ecs_task_stop" { name = "/aws/events/ecs_task_stop" retention_in_days = 1 } resource "aws_cloudwatch_event_target" "ecs_task_stop" { + rule = aws_cloudwatch_event_rule.ecs_task_stop.name arn = aws_cloudwatch_log_group.ecs_task_stop.arn } resource "aws_cloudwatch_log_metric_filter" "ecs_task_stop" { name = "ECSTaskStopCount" log_group_name = aws_cloudwatch_log_group.ecs_task_stop.name pattern = "{ $.detail.stopCode = \"EssentialContainerExited\" }" metric_transformation { name = "ECSTaskStopCount" namespace = "ECSMetrics" value = "1" default_value = 0 } } resource "aws_cloudwatch_metric_alarm" "ecs_task_stop" { alarm_name = "ECS${local.is_staging ? "Staging" : "Production"}TaskStopAlarm" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "1" metric_name = "ECSTaskStopCount" namespace = "ECSMetrics" period = "300" statistic = "Sum" threshold = "1" alarm_description = "This metric monitors ECS tasks stops" actions_enabled = true alarm_actions = [aws_sns_topic.ecs_task_stop_topic.arn] } resource "aws_sns_topic" "service_connection_error_topic" { name = "service-connection-error-topic" } resource "aws_sns_topic_subscription" "service_connection_error_email_subscription" { topic_arn = aws_sns_topic.service_connection_error_topic.arn protocol = "email" endpoint = local.error_reports_subscribed_email } resource "aws_cloudwatch_log_metric_filter" "service_connection_error_filters" { for_each = local.service_log_groups name = "${each.value.name}ConnectionErrorCount" pattern = "dns error" log_group_name = each.value.log_group_name metric_transformation { name = "${each.value.name}ConnectionErrorCount" namespace = "ServiceConnectionMetricFilters" value = "1" } } resource "aws_cloudwatch_metric_alarm" "service_connection_error_alarms" { for_each = local.service_log_groups alarm_name = "${each.value.name}ConnectionErrorAlarm" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "1" metric_name = "${each.value.name}ConnectionErrorCount" namespace = "ServiceConnectionMetricFilters" period = "300" statistic = "Sum" threshold = "1" alarm_description = "Alarm when ${each.value.name} connection errors exceed threshold" actions_enabled = true alarm_actions = [aws_sns_topic.service_connection_error_topic.arn] } + +resource "aws_sns_topic" "blob_error_topic" { + name = "blob-error-topic" +} + +resource "aws_sns_topic_subscription" "blob_email_subscription" { + topic_arn = aws_sns_topic.blob_error_topic.arn + protocol = "email" + endpoint = local.error_reports_subscribed_email +} + +resource "aws_cloudwatch_log_metric_filter" "blob_error_filters" { + for_each = local.blob_error_patterns + + name = "Blob${each.value.name}ErrorCount" + pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }" + log_group_name = "/ecs/blob-service-task-def" + + metric_transformation { + name = "Blob${each.value.name}ErrorCount" + namespace = "BlobServiceMetricFilters" + value = "1" + } +} + +resource "aws_cloudwatch_metric_alarm" "blob_error_alarms" { + for_each = local.blob_error_patterns + + alarm_name = "Blob${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = "Blob${each.value.name}ErrorCount" + namespace = "BlobServiceMetricFilters" + period = "300" + statistic = "Sum" + threshold = 1 + alarm_description = "Alarm when Blob ${each.value.name} errors exceed threshold" + actions_enabled = true + alarm_actions = [aws_sns_topic.blob_error_topic.arn] +} + diff --git a/services/terraform/remote/main.tf b/services/terraform/remote/main.tf index 4160e922e..e75e96516 100644 --- a/services/terraform/remote/main.tf +++ b/services/terraform/remote/main.tf @@ -1,68 +1,70 @@ terraform { backend "s3" { region = "us-east-2" key = "terraform.tfstate" bucket = "commapp-terraform" dynamodb_table = "terraform-lock" encrypt = true } } provider "random" {} provider "sops" {} data "sops_file" "secrets_json" { source_file = "secrets.json" } locals { environment = terraform.workspace is_staging = local.environment == "staging" secrets = jsondecode(data.sops_file.secrets_json.raw) target_account_id = lookup(local.secrets.accountIDs, local.environment) terraform_role_arn = "arn:aws:iam::${local.target_account_id}:role/Terraform" + + comm_services_use_json_logs = "true" } provider "aws" { region = "us-east-2" assume_role { role_arn = local.terraform_role_arn external_id = "terraform" } # automatically add these tags to all resources default_tags { tags = { # Helps to distinguish which resources are managed by Terraform managed_by = "terraform" } } } locals { # S3 bucket names are globally unique so we add a suffix to staging buckets s3_bucket_name_suffix = local.is_staging ? "-staging" : "" } # Shared resources between local dev environment and remote AWS module "shared" { source = "../modules/shared" bucket_name_suffix = local.s3_bucket_name_suffix vpc_id = aws_vpc.default.id search_index_lambda_iam_role_arn = aws_iam_role.search_index_lambda.arn cidr_block = aws_vpc.default.cidr_block subnet_ids = [ aws_subnet.public_a.id, ] } check "workspace_check" { assert { condition = terraform.workspace == "staging" || terraform.workspace == "production" error_message = "Terraform workspace must be either 'staging' or 'production'!" } } diff --git a/services/terraform/remote/service_identity.tf b/services/terraform/remote/service_identity.tf index d79866182..a565d2fd6 100644 --- a/services/terraform/remote/service_identity.tf +++ b/services/terraform/remote/service_identity.tf @@ -1,309 +1,307 @@ locals { identity_service_image_tag = local.is_staging ? "0.38" : "0.37" identity_service_server_image = "commapp/identity-server:${local.identity_service_image_tag}" identity_service_container_name = "identity-server" # Port that the container is listening on identity_service_container_grpc_port = 50054 identity_sc_port_name = "identity-service-ecs-grpc" identity_sc_dns_name = "identity-service" # Port that Websocket server listens on identity_service_container_ws_port = 51004 identity_sc_ws_port_name = "identity-service-ecs-ws" # URL accessible by other services in the same Service Connect namespace # This renders to e.g. 'http://identity-service:50054' identity_local_url = "http://${local.identity_sc_dns_name}:${local.identity_service_container_grpc_port}" # Port that is exposed to the public SSL endpoint (appended to domain name) identity_service_grpc_public_port = 50054 identity_service_domain_name = "identity.${local.root_domain}" opaque_server_setup_secret_name = "identity/ServerSetup" staging_allow_origin_list = <