Page MenuHomePhorge

D13943.1765145664.diff
No OneTemporary

Size
20 KB
Referenced Files
None
Subscribers
None

D13943.1765145664.diff

diff --git a/services/terraform/remote/alarms_blob.tf b/services/terraform/remote/alarms_blob.tf
new file mode 100644
--- /dev/null
+++ b/services/terraform/remote/alarms_blob.tf
@@ -0,0 +1,82 @@
+locals {
+ blob_error_patterns = { # per entry: name is interpolated into metric/alarm names, pattern is compared against $.fields.errorType
+ S3 = { name = "S3", pattern = "S3 Error" },
+ DDB = { name = "DDB", pattern = "DDB Error" },
+ HTTP = { name = "HTTP", pattern = "HTTP Error" },
+ Other = { name = "Other", pattern = "Other Error" },
+ }
+}
+
+resource "aws_sns_topic" "blob_error_topic" { # SNS topic referenced by alarm_actions of every Blob alarm in this file
+ name = "blob-error-topic"
+}
+
+resource "aws_sns_topic_subscription" "blob_email_subscription" { # email subscription to the Blob error topic
+ topic_arn = aws_sns_topic.blob_error_topic.arn
+ protocol = "email"
+ endpoint = local.error_reports_subscribed_email # local declared in aws_cloudwatch_alarms.tf
+}
+
+resource "aws_cloudwatch_log_metric_filter" "blob_error_filters" { # one log metric filter per entry in local.blob_error_patterns
+ for_each = local.blob_error_patterns
+
+ name = "Blob${each.value.name}ErrorCount"
+ pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }" # selects events with level ERROR whose fields.errorType matches the entry's pattern
+ log_group_name = "/ecs/blob-service-task-def"
+
+ metric_transformation {
+ name = "Blob${each.value.name}ErrorCount" # metric name read by aws_cloudwatch_metric_alarm.blob_error_alarms
+ namespace = "BlobServiceMetricFilters"
+ value = "1" # value published to the metric for each matching log event
+ }
+}
+
+resource "aws_cloudwatch_metric_alarm" "blob_error_alarms" { # one alarm per entry in local.blob_error_patterns
+ for_each = local.blob_error_patterns
+
+ alarm_name = "Blob${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm" # name embeds Staging or Production depending on local.is_staging
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluation_periods = "1"
+ metric_name = "Blob${each.value.name}ErrorCount" # same name and namespace as the metric_transformation in blob_error_filters
+ namespace = "BlobServiceMetricFilters"
+ period = "300" # length of one evaluation period, in seconds
+ statistic = "Sum"
+ threshold = 1 # with Sum and >=, a summed value of 1 within one period puts the alarm in ALARM
+ alarm_description = "Alarm when Blob ${each.value.name} errors exceed threshold"
+ actions_enabled = true
+ alarm_actions = [aws_sns_topic.blob_error_topic.arn] # notify the Blob error SNS topic
+}
+
+resource "aws_cloudwatch_metric_alarm" "blob_memory_utilization" { # alarm on the AWS/ECS MemoryUtilization metric of the Blob ECS service
+ alarm_name = "BlobMemoryUtilizationAlarm"
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluation_periods = 1
+ metric_name = "MemoryUtilization"
+ namespace = "AWS/ECS"
+ period = 60 # length of one evaluation period, in seconds
+ statistic = "Average"
+ threshold = 90 # compared with >= against the Average statistic
+ alarm_description = "Alarm when Blob service memory utilization exceeds 90%"
+ dimensions = { # scope the metric to the blob_service ECS service in the comm_services cluster
+ ClusterName = aws_ecs_cluster.comm_services.name
+ ServiceName = aws_ecs_service.blob_service.name
+ }
+ alarm_actions = [aws_sns_topic.blob_error_topic.arn] # notify the Blob error SNS topic
+}
+
+resource "aws_cloudwatch_metric_alarm" "blob_cpu_utilization" { # alarm on the AWS/ECS CPUUtilization metric of the Blob ECS service
+ alarm_name = "BlobCPUUtilizationAlarm"
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluation_periods = 1
+ metric_name = "CPUUtilization"
+ namespace = "AWS/ECS"
+ period = 60 # length of one evaluation period, in seconds
+ statistic = "Average"
+ threshold = 90 # compared with >= against the Average statistic
+ alarm_description = "Alarm when Blob service CPU utilization exceeds 90%"
+ dimensions = { # scope the metric to the blob_service ECS service in the comm_services cluster
+ ClusterName = aws_ecs_cluster.comm_services.name
+ ServiceName = aws_ecs_service.blob_service.name
+ }
+ alarm_actions = [aws_sns_topic.blob_error_topic.arn] # notify the Blob error SNS topic
+}
diff --git a/services/terraform/remote/alarms_identity.tf b/services/terraform/remote/alarms_identity.tf
new file mode 100644
--- /dev/null
+++ b/services/terraform/remote/alarms_identity.tf
@@ -0,0 +1,54 @@
+locals {
+ identity_error_patterns = { # per entry: name is interpolated into metric/alarm names, pattern is compared against $.fields.errorType
+ Search = { name = "Search", pattern = "Search Error" },
+ Sync = { name = "Sync", pattern = "Sync Error" },
+ Database = { name = "DB", pattern = "*DB Error" }, # NOTE(review): leading * presumably acts as a wildcard in the filter pattern - confirm
+ GrpcServices = { name = "GrpcServices", pattern = "gRPC Services Error" },
+ Siwe = { name = "Siwe", pattern = "SIWE Error" },
+ Tunnelbroker = { name = "Tunnelbroker", pattern = "Tunnelbroker Error" }
+ Http = { name = "HTTP", pattern = "HTTP Error" }
+ }
+
+ identity_error_threshold = 1 # threshold used by aws_cloudwatch_metric_alarm.identity_error_alarms
+}
+
+resource "aws_sns_topic" "identity_error_topic" { # SNS topic referenced by alarm_actions of identity_error_alarms
+ name = "identity-error-topic"
+}
+
+resource "aws_sns_topic_subscription" "identity_email_subscription" { # email subscription to the Identity error topic
+ topic_arn = aws_sns_topic.identity_error_topic.arn
+ protocol = "email"
+ endpoint = local.error_reports_subscribed_email # local declared in aws_cloudwatch_alarms.tf
+}
+
+resource "aws_cloudwatch_log_metric_filter" "identity_error_filters" { # one log metric filter per entry in local.identity_error_patterns
+ for_each = local.identity_error_patterns
+
+ name = "Identity${each.value.name}ErrorCount"
+ pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }" # selects events with level ERROR whose fields.errorType matches the entry's pattern
+ log_group_name = "/ecs/identity-service-task-def"
+
+ metric_transformation {
+ name = "Identity${each.value.name}ErrorCount" # metric name read by aws_cloudwatch_metric_alarm.identity_error_alarms
+ namespace = "IdentityServiceMetricFilters"
+ value = "1" # value published to the metric for each matching log event
+ }
+}
+
+resource "aws_cloudwatch_metric_alarm" "identity_error_alarms" { # one alarm per entry in local.identity_error_patterns
+ for_each = local.identity_error_patterns
+
+ alarm_name = "Identity${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm" # name embeds Staging or Production depending on local.is_staging
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluation_periods = "1"
+ metric_name = "Identity${each.value.name}ErrorCount" # same name and namespace as the metric_transformation in identity_error_filters
+ namespace = "IdentityServiceMetricFilters"
+ period = "300" # length of one evaluation period, in seconds
+ statistic = "Sum"
+ threshold = local.identity_error_threshold # compared with >= against the per-period Sum
+ alarm_description = "Alarm when Identity ${each.value.name} errors exceed threshold"
+ actions_enabled = true
+ alarm_actions = [aws_sns_topic.identity_error_topic.arn] # notify the Identity error SNS topic
+}
+
diff --git a/services/terraform/remote/alarms_lambda.tf b/services/terraform/remote/alarms_lambda.tf
new file mode 100644
--- /dev/null
+++ b/services/terraform/remote/alarms_lambda.tf
@@ -0,0 +1,30 @@
+locals {
+ lambda_error_threshold = 2 # numeric, like identity_error_threshold; used as threshold of aws_cloudwatch_metric_alarm.lambda_error_alarm
+}
+
+resource "aws_sns_topic" "lambda_alarm_topic" { # SNS topic referenced by alarm_actions of lambda_error_alarm
+ name = "lambda-error-alarm-topic"
+}
+
+resource "aws_sns_topic_subscription" "email_subscription" { # email subscription to the Lambda alarm topic
+ topic_arn = aws_sns_topic.lambda_alarm_topic.arn
+ protocol = "email"
+ endpoint = local.error_reports_subscribed_email # local declared in aws_cloudwatch_alarms.tf
+}
+
+resource "aws_cloudwatch_metric_alarm" "lambda_error_alarm" { # alarm on invocation errors of the search index Lambda function
+ alarm_name = "SearchIndexLambdaErrorAlarm"
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluation_periods = "1"
+ metric_name = "Errors" # AWS/Lambda reports function errors as "Errors"; "LambdaErrors" is not a metric in that namespace, so the alarm never received data
+ namespace = "AWS/Lambda"
+ period = "300" # length of one evaluation period, in seconds
+ statistic = "Sum"
+ threshold = local.lambda_error_threshold # compared with >= against the per-period Sum
+ alarm_description = "Alarm tracking search index lambda function failure"
+ actions_enabled = true
+ alarm_actions = [aws_sns_topic.lambda_alarm_topic.arn] # notify the Lambda alarm SNS topic
+ dimensions = {
+ FunctionName = module.shared.search_index_lambda.function_name # restrict the metric to this one function
+ }
+}
diff --git a/services/terraform/remote/alarms_tunnelbroker.tf b/services/terraform/remote/alarms_tunnelbroker.tf
new file mode 100644
--- /dev/null
+++ b/services/terraform/remote/alarms_tunnelbroker.tf
@@ -0,0 +1,88 @@
+locals {
+ tunnelbroker_error_patterns = { # per entry: name is interpolated into metric/alarm names, pattern is compared against $.fields.errorType
+ AMQP = { name = "AMQP", pattern = "AMQP Error" },
+ DDB = { name = "DDB", pattern = "DDB Error" },
+ FCM = { name = "FCM", pattern = "FCM Error" },
+ APNs = { name = "APNs", pattern = "APNs Error" },
+ WebPush = { name = "WebPush", pattern = "Web Push Error" },
+ WNS = { name = "WNS", pattern = "WNS Error" },
+ Identity = { name = "Identity", pattern = "Identity Error" },
+ Websocket = { name = "Websocket", pattern = "Websocket Error" },
+ Server = { name = "Server", pattern = "Server Error" },
+ }
+}
+
+resource "aws_sns_topic" "tunnelbroker_error_topic" { # SNS topic referenced by alarm_actions of every Tunnelbroker alarm in this file
+ name = "tunnelbroker-error-topic"
+}
+
+resource "aws_sns_topic_subscription" "tunnelbroker_email_subscription" { # email subscription to the Tunnelbroker error topic
+ topic_arn = aws_sns_topic.tunnelbroker_error_topic.arn
+ protocol = "email"
+ endpoint = local.error_reports_subscribed_email # local declared in aws_cloudwatch_alarms.tf
+}
+
+resource "aws_cloudwatch_log_metric_filter" "tunnelbroker_error_filters" { # one log metric filter per entry in local.tunnelbroker_error_patterns
+ for_each = local.tunnelbroker_error_patterns
+
+ name = "Tunnelbroker${each.value.name}ErrorCount"
+ pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }" # selects events with level ERROR whose fields.errorType matches the entry's pattern
+ log_group_name = "/ecs/tunnelbroker-task-def"
+
+ metric_transformation {
+ name = "Tunnelbroker${each.value.name}ErrorCount" # metric name read by aws_cloudwatch_metric_alarm.tunnelbroker_error_alarms
+ namespace = "TunnelbrokerServiceMetricFilters"
+ value = "1" # value published to the metric for each matching log event
+ }
+}
+
+resource "aws_cloudwatch_metric_alarm" "tunnelbroker_error_alarms" { # one alarm per entry in local.tunnelbroker_error_patterns
+ for_each = local.tunnelbroker_error_patterns
+
+ alarm_name = "Tunnelbroker${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm" # name embeds Staging or Production depending on local.is_staging
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluation_periods = "1"
+ metric_name = "Tunnelbroker${each.value.name}ErrorCount" # same name and namespace as the metric_transformation in tunnelbroker_error_filters
+ namespace = "TunnelbrokerServiceMetricFilters"
+ period = "300" # length of one evaluation period, in seconds
+ statistic = "Sum"
+ threshold = 1 # with Sum and >=, a summed value of 1 within one period puts the alarm in ALARM
+ alarm_description = "Alarm when Tunnelbroker ${each.value.name} errors exceed threshold"
+ actions_enabled = true
+ alarm_actions = [aws_sns_topic.tunnelbroker_error_topic.arn] # notify the Tunnelbroker error SNS topic
+}
+
+resource "aws_cloudwatch_metric_alarm" "tunnelbroker_memory_utilization" { # alarm on the AWS/ECS MemoryUtilization metric of the Tunnelbroker ECS service
+ alarm_name = "TunnelbrokerMemoryUtilizationAlarm"
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluation_periods = 1
+ metric_name = "MemoryUtilization"
+ period = 60 # length of one evaluation period, in seconds
+ statistic = "Average"
+ threshold = 90 # compared with >= against the Average statistic
+ alarm_description = "Alarm when Tunnelbroker service memory utilization exceeds 90%"
+ alarm_actions = [aws_sns_topic.tunnelbroker_error_topic.arn] # notify the Tunnelbroker error SNS topic
+ namespace = "AWS/ECS"
+ dimensions = { # scope the metric to the tunnelbroker ECS service in the comm_services cluster
+ ClusterName = aws_ecs_cluster.comm_services.name
+ ServiceName = aws_ecs_service.tunnelbroker.name
+ }
+}
+
+
+resource "aws_cloudwatch_metric_alarm" "tunnelbroker_cpu_utilization" { # alarm on the AWS/ECS CPUUtilization metric of the Tunnelbroker ECS service
+ alarm_name = "TunnelbrokerCPUUtilizationAlarm"
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluation_periods = 1
+ metric_name = "CPUUtilization"
+ period = 60 # length of one evaluation period, in seconds
+ statistic = "Average"
+ threshold = 90 # compared with >= against the Average statistic
+ alarm_description = "Alarm when Tunnelbroker service CPU utilization exceeds 90%"
+ alarm_actions = [aws_sns_topic.tunnelbroker_error_topic.arn] # notify the Tunnelbroker error SNS topic
+ namespace = "AWS/ECS"
+ dimensions = { # scope the metric to the tunnelbroker ECS service in the comm_services cluster
+ ClusterName = aws_ecs_cluster.comm_services.name
+ ServiceName = aws_ecs_service.tunnelbroker.name
+ }
+}
diff --git a/services/terraform/remote/aws_cloudwatch_alarms.tf b/services/terraform/remote/aws_cloudwatch_alarms.tf
--- a/services/terraform/remote/aws_cloudwatch_alarms.tf
+++ b/services/terraform/remote/aws_cloudwatch_alarms.tf
@@ -1,38 +1,6 @@
locals {
error_reports_subscribed_email = "error-reports@comm.app"
- lambda_error_threshold = "2"
- identity_error_threshold = "1"
-
- identity_error_patterns = {
- Search = { name = "Search", pattern = "Search Error" },
- Sync = { name = "Sync", pattern = "Sync Error" },
- Database = { name = "DB", pattern = "*DB Error" },
- GrpcServices = { name = "GrpcServices", pattern = "gRPC Services Error" },
- Siwe = { name = "Siwe", pattern = "SIWE Error" },
- Tunnelbroker = { name = "Tunnelbroker", pattern = "Tunnelbroker Error" }
- Http = { name = "HTTP", pattern = "HTTP Error" }
- }
-
- blob_error_patterns = {
- S3 = { name = "S3", pattern = "S3 Error" },
- DDB = { name = "DDB", pattern = "DDB Error" },
- HTTP = { name = "HTTP", pattern = "HTTP Error" },
- Other = { name = "Other", pattern = "Other Error" },
- }
-
- tunnelbroker_error_patterns = {
- AMQP = { name = "AMQP", pattern = "AMQP Error" },
- DDB = { name = "DDB", pattern = "DDB Error" },
- FCM = { name = "FCM", pattern = "FCM Error" },
- APNs = { name = "APNs", pattern = "APNs Error" },
- WebPush = { name = "WebPush", pattern = "Web Push Error" },
- WNS = { name = "WNS", pattern = "WNS Error" },
- Identity = { name = "Identity", pattern = "Identity Error" },
- Websocket = { name = "Websocket", pattern = "Websocket Error" },
- Server = { name = "Server", pattern = "Server Error" },
- }
-
service_log_groups = {
Backup = { name = "Backup", log_group_name = "/ecs/backup-service-task-def" },
Blob = { name = "Blob", log_group_name = "/ecs/blob-service-task-def" },
@@ -44,79 +12,12 @@
}
}
-resource "aws_sns_topic" "lambda_alarm_topic" {
- name = "lambda-error-alarm-topic"
-}
-
-resource "aws_sns_topic_subscription" "email_subscription" {
- topic_arn = aws_sns_topic.lambda_alarm_topic.arn
- protocol = "email"
- endpoint = local.error_reports_subscribed_email
-}
-
-resource "aws_cloudwatch_metric_alarm" "lambda_error_alarm" {
- alarm_name = "SearchIndexLambdaErrorAlarm"
- comparison_operator = "GreaterThanOrEqualToThreshold"
- evaluation_periods = "1"
- metric_name = "LambdaErrors"
- namespace = "AWS/Lambda"
- period = "300"
- statistic = "Sum"
- threshold = local.lambda_error_threshold
- alarm_description = "Alarm tracking search index lambda function failure"
- actions_enabled = true
- alarm_actions = [aws_sns_topic.lambda_alarm_topic.arn]
- dimensions = {
- FunctionName = module.shared.search_index_lambda.function_name
- }
-}
-
-resource "aws_sns_topic" "identity_error_topic" {
- name = "identity-error-topic"
-}
-
-resource "aws_sns_topic_subscription" "identity_email_subscription" {
- topic_arn = aws_sns_topic.identity_error_topic.arn
- protocol = "email"
- endpoint = local.error_reports_subscribed_email
-}
-
resource "aws_sns_topic_subscription" "ecs_task_stop_subscription" {
topic_arn = aws_sns_topic.ecs_task_stop_topic.arn
protocol = "email"
endpoint = local.error_reports_subscribed_email
}
-resource "aws_cloudwatch_log_metric_filter" "identity_error_filters" {
- for_each = local.identity_error_patterns
-
- name = "Identity${each.value.name}ErrorCount"
- pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
- log_group_name = "/ecs/identity-service-task-def"
-
- metric_transformation {
- name = "Identity${each.value.name}ErrorCount"
- namespace = "IdentityServiceMetricFilters"
- value = "1"
- }
-}
-
-resource "aws_cloudwatch_metric_alarm" "identity_error_alarms" {
- for_each = local.identity_error_patterns
-
- alarm_name = "Identity${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
- comparison_operator = "GreaterThanOrEqualToThreshold"
- evaluation_periods = "1"
- metric_name = "Identity${each.value.name}ErrorCount"
- namespace = "IdentityServiceMetricFilters"
- period = "300"
- statistic = "Sum"
- threshold = local.identity_error_threshold
- alarm_description = "Alarm when Identity ${each.value.name} errors exceed threshold"
- actions_enabled = true
- alarm_actions = [aws_sns_topic.identity_error_topic.arn]
-}
-
resource "aws_sns_topic" "ecs_task_stop_topic" {
name = "ecs-task-stop-topic"
}
@@ -217,154 +118,3 @@
actions_enabled = true
alarm_actions = [aws_sns_topic.service_connection_error_topic.arn]
}
-
-
-resource "aws_sns_topic" "blob_error_topic" {
- name = "blob-error-topic"
-}
-
-resource "aws_sns_topic_subscription" "blob_email_subscription" {
- topic_arn = aws_sns_topic.blob_error_topic.arn
- protocol = "email"
- endpoint = local.error_reports_subscribed_email
-}
-
-resource "aws_cloudwatch_log_metric_filter" "blob_error_filters" {
- for_each = local.blob_error_patterns
-
- name = "Blob${each.value.name}ErrorCount"
- pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
- log_group_name = "/ecs/blob-service-task-def"
-
- metric_transformation {
- name = "Blob${each.value.name}ErrorCount"
- namespace = "BlobServiceMetricFilters"
- value = "1"
- }
-}
-
-resource "aws_cloudwatch_metric_alarm" "blob_error_alarms" {
- for_each = local.blob_error_patterns
-
- alarm_name = "Blob${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
- comparison_operator = "GreaterThanOrEqualToThreshold"
- evaluation_periods = "1"
- metric_name = "Blob${each.value.name}ErrorCount"
- namespace = "BlobServiceMetricFilters"
- period = "300"
- statistic = "Sum"
- threshold = 1
- alarm_description = "Alarm when Blob ${each.value.name} errors exceed threshold"
- actions_enabled = true
- alarm_actions = [aws_sns_topic.blob_error_topic.arn]
-}
-
-resource "aws_cloudwatch_metric_alarm" "blob_memory_utilization" {
- alarm_name = "BlobMemoryUtilizationAlarm"
- comparison_operator = "GreaterThanOrEqualToThreshold"
- evaluation_periods = 1
- metric_name = "MemoryUtilization"
- namespace = "AWS/ECS"
- period = 60
- statistic = "Average"
- threshold = 90
- alarm_description = "Alarm when Blob service memory utilization exceeds 90%"
- dimensions = {
- ClusterName = aws_ecs_cluster.comm_services.name
- ServiceName = aws_ecs_service.blob_service.name
- }
- alarm_actions = [aws_sns_topic.blob_error_topic.arn]
-}
-
-
-resource "aws_cloudwatch_metric_alarm" "blob_cpu_utilization" {
- alarm_name = "BlobCPUUtilizationAlarm"
- comparison_operator = "GreaterThanOrEqualToThreshold"
- evaluation_periods = 1
- metric_name = "CPUUtilization"
- namespace = "AWS/ECS"
- period = 60
- statistic = "Average"
- threshold = 90
- alarm_description = "Alarm when Blob service CPU utilization exceeds 90%"
- dimensions = {
- ClusterName = aws_ecs_cluster.comm_services.name
- ServiceName = aws_ecs_service.blob_service.name
- }
- alarm_actions = [aws_sns_topic.blob_error_topic.arn]
-}
-
-resource "aws_sns_topic" "tunnelbroker_error_topic" {
- name = "tunnelbroker-error-topic"
-}
-
-resource "aws_sns_topic_subscription" "tunnelbroker_email_subscription" {
- topic_arn = aws_sns_topic.tunnelbroker_error_topic.arn
- protocol = "email"
- endpoint = local.error_reports_subscribed_email
-}
-
-resource "aws_cloudwatch_log_metric_filter" "tunnelbroker_error_filters" {
- for_each = local.tunnelbroker_error_patterns
-
- name = "Tunnelbroker${each.value.name}ErrorCount"
- pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
- log_group_name = "/ecs/tunnelbroker-task-def"
-
- metric_transformation {
- name = "Tunnelbroker${each.value.name}ErrorCount"
- namespace = "TunnelbrokerServiceMetricFilters"
- value = "1"
- }
-}
-
-resource "aws_cloudwatch_metric_alarm" "tunnelbroker_error_alarms" {
- for_each = local.tunnelbroker_error_patterns
-
- alarm_name = "Tunnelbroker${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
- comparison_operator = "GreaterThanOrEqualToThreshold"
- evaluation_periods = "1"
- metric_name = "Tunnelbroker${each.value.name}ErrorCount"
- namespace = "TunnelbrokerServiceMetricFilters"
- period = "300"
- statistic = "Sum"
- threshold = 1
- alarm_description = "Alarm when Tunnelbroker ${each.value.name} errors exceed threshold"
- actions_enabled = true
- alarm_actions = [aws_sns_topic.tunnelbroker_error_topic.arn]
-}
-
-resource "aws_cloudwatch_metric_alarm" "tunnelbroker_memory_utilization" {
- alarm_name = "TunnelbrokerMemoryUtilizationAlarm"
- comparison_operator = "GreaterThanOrEqualToThreshold"
- evaluation_periods = 1
- metric_name = "MemoryUtilization"
- period = 60
- statistic = "Average"
- threshold = 90
- alarm_description = "Alarm when Tunnelbroker service memory utilization exceeds 90%"
- alarm_actions = [aws_sns_topic.tunnelbroker_error_topic.arn]
- namespace = "AWS/ECS"
- dimensions = {
- ClusterName = aws_ecs_cluster.comm_services.name
- ServiceName = aws_ecs_service.tunnelbroker.name
- }
-}
-
-
-resource "aws_cloudwatch_metric_alarm" "tunnelbroker_cpu_utilization" {
- alarm_name = "TunnelbrokerCPUUtilizationAlarm"
- comparison_operator = "GreaterThanOrEqualToThreshold"
- evaluation_periods = 1
- metric_name = "CPUUtilization"
- period = 60
- statistic = "Average"
- threshold = 90
- alarm_description = "Alarm when Tunnelbroker service CPU utilization exceeds 90%"
- alarm_actions = [aws_sns_topic.tunnelbroker_error_topic.arn]
- namespace = "AWS/ECS"
- dimensions = {
- ClusterName = aws_ecs_cluster.comm_services.name
- ServiceName = aws_ecs_service.tunnelbroker.name
- }
-}

File Metadata

Mime Type
text/plain
Expires
Sun, Dec 7, 10:14 PM (4 h, 19 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
5843235
Default Alt Text
D13943.1765145664.diff (20 KB)

Event Timeline