diff --git a/services/terraform/self-host/aws-deploy.sh b/services/terraform/self-host/aws-deploy.sh
new file mode 100755
index 000000000..6ae6098bb
--- /dev/null
+++ b/services/terraform/self-host/aws-deploy.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+
+# Disable AWS CLI command pager output
+export AWS_PAGER=""
+
+
+# Do not change without replacing keyserver_cluster name in aws_ecs.tf
+cluster_name="keyserver-cluster"
+
+# Do not change without replacing keyserver_primary_service
+# name in keyserver_primary.tf
+primary_service_name="keyserver-primary-service"
+
+# Do not change without replacing keyserver_secondary_service
+# name in keyserver_secondary.tf
+secondary_service_name="keyserver-secondary-service"
+
+# Grab user configuration variables from terraform.tfvars
+health_check_domain=$(echo "var.domain_name" | terraform console -var-file terraform.tfvars.json | tr -d '"')
+health_check_url="https://${health_check_domain}/health"
+num_desired_secondary_nodes=$(echo "var.desired_secondary_nodes" | terraform console -var-file terraform.tfvars.json)
+
+if [[ -z "${AWS_ACCESS_KEY_ID}" || -z "${AWS_SECRET_ACCESS_KEY}" ]]; then
+  echo "Error: AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables must be set to run migration."
+  exit 1
+fi
+
+if ! aws sts get-caller-identity > /dev/null; then
+  echo "Error: Invalid AWS credentials or not in nix shell. Please check your AWS configuration and/or run nix develop."
+  exit 1
+fi
+
+convert_seconds() {
+  total_seconds="$1"
+  minutes="$((total_seconds / 60))"
+  seconds="$((total_seconds % 60))"
+
+  if (( minutes > 0 )); then
+    echo "${minutes} minute(s) and ${seconds} seconds"
+  else
+    echo "${seconds} seconds"
+  fi
+}
+
+check_health() {
+  local retry_interval=10
+  local total_elapsed_time=0
+
+  while true; do
+    http_code=$(curl -s -o /dev/null -w "%{http_code}" "$health_check_url")
+    if [[ "$http_code" -eq 200 ]]; then
+      echo "Health check returned status 200 OK. Primary keyserver node ready"
+      return 0
+    fi
+    total_elapsed_time="$(( total_elapsed_time + retry_interval ))"
+    converted_time="$(convert_seconds $total_elapsed_time)"
+
+    echo "Health check returned status $http_code. Elapsed time: ${converted_time}."
+    sleep $retry_interval
+  done
+}
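+
+# The two helpers below toggle public HTTPS access by revoking/restoring the
+# 0.0.0.0/0 ingress rule on the "lb-sg" security group; the separate
+# allowed_ip rule defined in aws_lb.tf is never revoked, so the operator's
+# own IP keeps reaching the health check while the migration runs.
+# A sketch for inspecting the rules currently attached to the group (not
+# invoked by this script):
+#   aws ec2 describe-security-groups \
+#     --filters "Name=group-name,Values=lb-sg" \
+#     --query "SecurityGroups[0].IpPermissions"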
+ echo "Ensure terraform variable allowed_ip is properly configured and run terraform apply" + + echo "Re-enabling traffic to load balancer until domain is accessible and migration script is rerun" + enable_lb_traffic + exit 1 +fi + +echo "Set desired count of secondary service to 0" +aws ecs update-service --cluster "$cluster_name" --service "$secondary_service_name" --desired-count 0 > /dev/null + +echo "Taking down all secondary nodes in $cluster_name" + +task_arns=$(aws ecs list-tasks --cluster "$cluster_name" --service-name "$secondary_service_name" --query 'taskArns[*]' --output text) + +for task_arn in $task_arns; do + echo "Stopping secondary node running on task $task_arn" + aws ecs stop-task --cluster "$cluster_name" --task "$task_arn" > /dev/null +done + +echo "Set desired count of primary service to 0" +aws ecs update-service --cluster "$cluster_name" --service "$primary_service_name" --desired-count 0 > /dev/null + +echo "Taking down primary node in $cluster_name" +primary_task_arns=$(aws ecs list-tasks --cluster "$cluster_name" --service-name "$primary_service_name" --query 'taskArns[*]' --output text) + +for primary_task_arn in $primary_task_arns; do + echo "Stopping primary node running on task $primary_task_arn" + aws ecs stop-task --cluster "$cluster_name" --task "$primary_task_arn" > /dev/null +done + +echo "Waiting until primary and secondary nodes have been shutdown" +total_elapsed_time=0 +retry_interval=10 +while true; do + http_code=$(curl -s -o /dev/null -w "%{http_code}" "$health_check_url") + + echo "Health check returned status $http_code. Elapsed time: $(convert_seconds $total_elapsed_time)" + + if [[ "$http_code" -ne 200 ]]; then + echo "Stopping primary and secondary nodes was successful. Continuing with migration." + break + fi + + total_elapsed_time=$(( total_elapsed_time + retry_interval )) + sleep $retry_interval +done + +echo "Redeploying primary service in $cluster_name" +aws ecs update-service --cluster "$cluster_name" --service "$primary_service_name" --force-new-deployment --desired-count 1 > /dev/null + +echo "Waiting for health check at $health_check_domain to return status 200 OK" +check_health + +echo "Primary node successfully running. Re-enabling public access to load balancer" +enable_lb_traffic + +echo "Setting desired count of secondary service to $num_desired_secondary_nodes". +aws ecs update-service --cluster "$cluster_name" --service "$secondary_service_name" --desired-count "$num_desired_secondary_nodes" --force-new-deployment > /dev/null + +echo "Successfully ran migration" diff --git a/services/terraform/self-host/aws_ecs.tf b/services/terraform/self-host/aws_ecs.tf index f9376d01c..868778701 100644 --- a/services/terraform/self-host/aws_ecs.tf +++ b/services/terraform/self-host/aws_ecs.tf @@ -1,23 +1,24 @@ resource "aws_ecs_cluster" "keyserver_cluster" { + # Do not change without replacing cluster_name in aws-deploy.sh name = "keyserver-cluster" configuration { execute_command_configuration { logging = "DEFAULT" } } } # Namespace for services to be able to communicate with each other # by their hostnames. Similar to docker compose network. 
resource "aws_service_discovery_http_namespace" "keyserver_cluster" { name = "keyserver-cluster-http-namespace" tags = { "AmazonECSManaged" = "true" } } resource "aws_ecs_cluster_capacity_providers" "keyserver_cluster" { cluster_name = aws_ecs_cluster.keyserver_cluster.name capacity_providers = ["FARGATE"] } diff --git a/services/terraform/self-host/aws_lb.tf b/services/terraform/self-host/aws_lb.tf index f85eb2252..1e1872e98 100644 --- a/services/terraform/self-host/aws_lb.tf +++ b/services/terraform/self-host/aws_lb.tf @@ -1,82 +1,89 @@ resource "aws_security_group" "lb_sg" { name = "lb-sg" description = "Security group for keyserver load balancer" vpc_id = local.vpc_id ingress { from_port = 443 to_port = 443 protocol = "tcp" cidr_blocks = ["0.0.0.0/0"] } + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = ["${var.allowed_ip}/32"] + } + egress { from_port = 0 to_port = 0 protocol = "-1" cidr_blocks = ["0.0.0.0/0"] } } resource "aws_lb_target_group" "keyserver_service" { name = "keyserver-service-ecs-tg" port = 3000 protocol = "HTTP" vpc_id = local.vpc_id # "awsvpc" network mode requires target type set to ip target_type = "ip" stickiness { type = "lb_cookie" cookie_duration = 86500 enabled = true } health_check { enabled = true healthy_threshold = 2 unhealthy_threshold = 3 protocol = "HTTP" path = "/health" matcher = "200-299" } } resource "aws_lb" "keyserver_service" { load_balancer_type = "application" name = "keyserver-service-lb" security_groups = [aws_security_group.lb_sg.id] internal = false subnets = local.vpc_subnets } resource "aws_lb_listener" "keyserver_service" { load_balancer_arn = aws_lb.keyserver_service.arn port = "443" protocol = "HTTPS" ssl_policy = "ELBSecurityPolicy-2016-08" certificate_arn = data.aws_acm_certificate.keyserver_service.arn default_action { type = "forward" target_group_arn = aws_lb_target_group.keyserver_service.arn } lifecycle { ignore_changes = [default_action[0].forward[0].stickiness[0].duration] replace_triggered_by = [aws_lb_target_group.keyserver_service] } } data "aws_acm_certificate" "keyserver_service" { domain = var.domain_name statuses = ["ISSUED"] } output "keyserver_service_load_balancer_dns_name" { value = aws_lb.keyserver_service.dns_name } diff --git a/services/terraform/self-host/keyserver_primary.tf b/services/terraform/self-host/keyserver_primary.tf index a681e5dac..8ddcd9c77 100644 --- a/services/terraform/self-host/keyserver_primary.tf +++ b/services/terraform/self-host/keyserver_primary.tf @@ -1,202 +1,203 @@ locals { keyserver_service_image_tag = "0.1" keyserver_service_server_image = "commapp/keyserver:${local.keyserver_service_image_tag}" keyserver_primary_container_name = "keyserver-primary" } resource "aws_cloudwatch_log_group" "keyserver_primary_service" { name = "/ecs/keyserver-primary-task-def" retention_in_days = 7 } output "mariadb_address" { value = aws_db_instance.mariadb.address } resource "aws_ecs_task_definition" "keyserver_primary_service" { network_mode = "awsvpc" family = "keyserver-primary-task-def" requires_compatibilities = ["FARGATE"] task_role_arn = aws_iam_role.ecs_task_role.arn execution_role_arn = aws_iam_role.ecs_task_execution.arn cpu = "2048" memory = "4096" ephemeral_storage { size_in_gib = 40 } container_definitions = jsonencode([ { name = local.keyserver_primary_container_name image = local.keyserver_service_server_image essential = true portMappings = [ { name = "keyserver-port" containerPort = 3000 hostPort = 3000, protocol = "tcp" }, ] environment = [ { name = 
"REDIS_URL" value = "rediss://${aws_elasticache_serverless_cache.redis.endpoint[0].address}:6379" }, { name = "COMM_NODE_ROLE" value = "primary" }, { name = "COMM_LISTEN_ADDR" value = "0.0.0.0" }, { name = "COMM_DATABASE_HOST" value = "${aws_db_instance.mariadb.address}" }, { name = "COMM_DATABASE_DATABASE" value = "comm" }, { name = "COMM_DATABASE_PORT" value = "3307" }, { name = "COMM_DATABASE_USER" value = "${var.mariadb_username}" }, { name = "COMM_DATABASE_PASSWORD" value = "${var.mariadb_password}" }, { name = "COMM_JSONCONFIG_secrets_user_credentials" value = jsonencode(var.keyserver_user_credentials) }, { name = "COMM_JSONCONFIG_facts_webapp_cors" value = jsonencode({ "domain" : "https://web.comm.app" }) }, { name = "COMM_JSONCONFIG_facts_keyserver_url" value = jsonencode({ "baseDomain" : "https://${var.domain_name}", "basePath" : "/", "baseRoutePath" : "/", "https" : true, "proxy" : "aws" }) }, { name = "COMM_JSONCONFIG_secrets_identity_service_config", value = jsonencode({ "identitySocketAddr" : "${var.identity_socket_address}" }) }, { name = "COMM_JSONCONFIG_facts_authoritative_keyserver", value = jsonencode(var.authoritative_keyserver_config), }, { name = "COMM_JSONCONFIG_facts_tunnelbroker", value = jsonencode({ "url" : "${var.tunnelbroker_url}" }) } ] logConfiguration = { "logDriver" = "awslogs" "options" = { "awslogs-create-group" = "true" "awslogs-group" = aws_cloudwatch_log_group.keyserver_primary_service.name "awslogs-stream-prefix" = "ecs" "awslogs-region" = "${var.region}" } } linuxParameters = { initProcessEnabled = true } } ]) runtime_platform { cpu_architecture = "ARM64" operating_system_family = "LINUX" } skip_destroy = false } resource "aws_ecs_service" "keyserver_primary_service" { depends_on = [null_resource.create_comm_database] + # Do not change name without replacing primary_service_name in aws-deploy.sh name = "keyserver-primary-service" cluster = aws_ecs_cluster.keyserver_cluster.id task_definition = aws_ecs_task_definition.keyserver_primary_service.arn launch_type = "FARGATE" enable_execute_command = true enable_ecs_managed_tags = true force_new_deployment = true desired_count = 1 deployment_maximum_percent = 100 deployment_minimum_healthy_percent = 0 network_configuration { subnets = local.vpc_subnets security_groups = [aws_security_group.keyserver_service.id] assign_public_ip = true } load_balancer { target_group_arn = aws_lb_target_group.keyserver_service.arn container_name = local.keyserver_primary_container_name container_port = 3000 } deployment_circuit_breaker { enable = true rollback = true } } resource "aws_security_group" "keyserver_service" { name = "keyserver-service-ecs-sg" vpc_id = local.vpc_id # Allow all inbound traffic on port 3000 ingress { from_port = 3000 to_port = 3000 protocol = "tcp" cidr_blocks = ["0.0.0.0/0"] } ingress { description = "Allow inbound traffic from any IPv6 address" from_port = 3000 to_port = 3000 protocol = "tcp" ipv6_cidr_blocks = ["::/0"] } # Allow all outbound traffic egress { from_port = 0 to_port = 0 protocol = "-1" cidr_blocks = ["0.0.0.0/0"] } lifecycle { create_before_destroy = true } } diff --git a/services/terraform/self-host/keyserver_secondary.tf b/services/terraform/self-host/keyserver_secondary.tf index 3af9b2996..a36371a57 100644 --- a/services/terraform/self-host/keyserver_secondary.tf +++ b/services/terraform/self-host/keyserver_secondary.tf @@ -1,175 +1,176 @@ locals { keyserver_secondary_container_name = "keyserver-secondary" } resource "aws_cloudwatch_log_group" "keyserver_secondary_service" { name 
= "/ecs/keyserver-secondary-task-def" retention_in_days = 7 } resource "aws_ecs_task_definition" "keyserver_secondary_service" { depends_on = [aws_ecs_service.keyserver_primary_service] network_mode = "awsvpc" family = "keyserver-secondary-task-def" requires_compatibilities = ["FARGATE"] task_role_arn = aws_iam_role.ecs_task_role.arn execution_role_arn = aws_iam_role.ecs_task_execution.arn cpu = "2048" memory = "4096" ephemeral_storage { size_in_gib = 40 } container_definitions = jsonencode([ { name = local.keyserver_secondary_container_name image = local.keyserver_service_server_image essential = true portMappings = [ { name = "keyserver-port" containerPort = 3000 hostPort = 3000, protocol = "tcp" }, ] environment = [ { name = "REDIS_URL" value = "rediss://${aws_elasticache_serverless_cache.redis.endpoint[0].address}:6379" }, { name = "COMM_NODE_ROLE" value = "secondary" }, { name = "COMM_LISTEN_ADDR" value = "0.0.0.0" }, { name = "COMM_DATABASE_HOST" value = "${aws_db_instance.mariadb.address}" }, { name = "COMM_DATABASE_DATABASE" value = "comm" }, { name = "COMM_DATABASE_PORT" value = "3307" }, { name = "COMM_DATABASE_USER" value = "${var.mariadb_username}" }, { name = "COMM_DATABASE_PASSWORD" value = "${var.mariadb_password}" }, { name = "COMM_JSONCONFIG_secrets_user_credentials" value = jsonencode(var.keyserver_user_credentials) }, { name = "COMM_JSONCONFIG_facts_keyserver_url" value = jsonencode({ "baseDomain" : "https://${var.domain_name}", "basePath" : "/", "baseRoutePath" : "/", "https" : true, "proxy" : "aws" }) }, { name = "COMM_JSONCONFIG_facts_webapp_cors" value = jsonencode({ "domain" : "https://web.comm.app" }) }, { name = "COMM_JSONCONFIG_facts_tunnelbroker", value = jsonencode({ "url" : "${var.tunnelbroker_url}" }) }, { name = "COMM_JSONCONFIG_secrets_identity_service_config", value = jsonencode({ "identitySocketAddr" : "${var.identity_socket_address}" }) }, { name = "COMM_JSONCONFIG_facts_authoritative_keyserver", value = jsonencode(var.authoritative_keyserver_config), } ] logConfiguration = { "logDriver" = "awslogs" "options" = { "awslogs-create-group" = "true" "awslogs-group" = aws_cloudwatch_log_group.keyserver_secondary_service.name "awslogs-stream-prefix" = "ecs" "awslogs-region" = "${var.region}" } } linuxParameters = { initProcessEnabled = true } } ]) runtime_platform { cpu_architecture = "ARM64" operating_system_family = "LINUX" } # Wait indefinitely for primary service to become healthy before deploying secondary service provisioner "local-exec" { command = <