I work at an AWS/Docker/ECS/Terraform shop. Our ECS cluster environments are pretty much automated at this point. One of the last remaining tasks we want to accomplish is setting up autoscaling on our containers accordingly. Here is our setup for AWS ECS Autoscale.
- Modules
- Environment
Modules ALB Service
main.tf
In addition to the usual alb resources, we define the following for autoscale:
- aws_cloudwatch_metric_alarm
- aws_appautoscaling_target
- aws_appautoscaling_policy
We first define the cloudwatch alarms for RPM high/low.
# High-traffic alarm on the target group's RequestCountPerTarget metric.
# Created only when var.autoscale_rpm_enabled is "true". When the metric is at
# or above var.alarm_rpm_high_threshold, the alarm invokes the RPM scale-up
# policy and, if configured, notifies the PagerDuty SNS topic.
resource "aws_cloudwatch_metric_alarm" "cloudwatch_metric_alarm_rpm_high" {
  count = "${var.autoscale_rpm_enabled == "true" ? 1 : 0}"

  alarm_name        = "${var.name}-RequestCountPerTarget-High"
  alarm_description = "Managed by Terraform"

  # var.alarm_pagerduty_sns defaults to "", which is not a valid action ARN.
  # compact() drops empty strings so the alarm can still be created when no
  # SNS topic is supplied; ok_actions then becomes an empty list.
  alarm_actions = ["${compact(list(aws_appautoscaling_policy.appautoscaling_policy_rpm_scale_up.arn, var.alarm_pagerduty_sns))}"]
  ok_actions    = ["${compact(list(var.alarm_pagerduty_sns))}"]

  # Metric being watched and how it is evaluated.
  namespace           = "AWS/ApplicationELB"
  metric_name         = "RequestCountPerTarget"
  statistic           = "${var.alarm_rpm_high_statistic}"
  period              = "${var.alarm_rpm_high_period}"
  evaluation_periods  = "${var.alarm_rpm_high_evaluation_periods}"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "${var.alarm_rpm_high_threshold}"

  dimensions {
    # NOTE(review): despite its name, var.alb_listener_arn is fed the ALB's
    # arn_suffix by the environment's module call - confirm for other callers.
    LoadBalancer = "${var.alb_listener_arn}"
    TargetGroup  = "${aws_alb_target_group.alb_target_group.arn_suffix}"
  }
}
# Low-traffic alarm on the target group's RequestCountPerTarget metric.
# Created only when var.autoscale_rpm_enabled is "true". When the metric is at
# or below var.alarm_rpm_low_threshold, the alarm invokes the RPM scale-down
# policy and, if configured, notifies the PagerDuty SNS topic.
resource "aws_cloudwatch_metric_alarm" "cloudwatch_metric_alarm_rpm_low" {
  count = "${var.autoscale_rpm_enabled == "true" ? 1 : 0}"

  alarm_name        = "${var.name}-RequestCountPerTarget-Low"
  alarm_description = "Managed by Terraform"

  # var.alarm_pagerduty_sns defaults to "", which is not a valid action ARN.
  # compact() drops empty strings so the alarm can still be created when no
  # SNS topic is supplied; ok_actions then becomes an empty list.
  alarm_actions = ["${compact(list(aws_appautoscaling_policy.appautoscaling_policy_rpm_scale_down.arn, var.alarm_pagerduty_sns))}"]
  ok_actions    = ["${compact(list(var.alarm_pagerduty_sns))}"]

  # Metric being watched and how it is evaluated.
  namespace           = "AWS/ApplicationELB"
  metric_name         = "RequestCountPerTarget"
  statistic           = "${var.alarm_rpm_low_statistic}"
  period              = "${var.alarm_rpm_low_period}"
  evaluation_periods  = "${var.alarm_rpm_low_evaluation_periods}"
  comparison_operator = "LessThanOrEqualToThreshold"
  threshold           = "${var.alarm_rpm_low_threshold}"

  dimensions {
    # NOTE(review): despite its name, var.alb_listener_arn is fed the ALB's
    # arn_suffix by the environment's module call - confirm for other callers.
    LoadBalancer = "${var.alb_listener_arn}"
    TargetGroup  = "${aws_alb_target_group.alb_target_group.arn_suffix}"
  }
}
Second, we need an auto scaling target.
# Registers the ECS service's DesiredCount as a scalable target with
# Application Auto Scaling. Created only when var.autoscale_enabled is "true".
resource "aws_appautoscaling_target" "appautoscaling_target" {
  count = "${var.autoscale_enabled == "true" ? 1 : 0}"

  # What is being scaled.
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${var.cluster}/${var.name}"

  # Scaling bounds: the service's desired count doubles as the floor.
  min_capacity = "${var.service_desired_count}"
  max_capacity = "${var.autoscale_max_capacity}"

  # Role ARN assembled from the account id and the
  # AWSServiceRoleForApplicationAutoScaling_ECSService role path.
  role_arn = "arn:aws:iam::${var.account_id}:role/aws-service-role/ecs.application-autoscaling.amazonaws.com/AWSServiceRoleForApplicationAutoScaling_ECSService"
}
Lastly, we need to set up the policies that run when the CloudWatch alarms are triggered.
# Step-scaling policy invoked by the RPM "high" alarm to raise the ECS
# service's desired count. Created only when var.autoscale_rpm_enabled is "true".
resource "aws_appautoscaling_policy" "appautoscaling_policy_rpm_scale_up" {
  count = "${var.autoscale_rpm_enabled == "true" ? 1 : 0}"

  # The scalable target must exist before a policy can be attached to it.
  depends_on = ["aws_appautoscaling_target.appautoscaling_target"]

  name        = "rpm-scale-up"
  policy_type = "StepScaling"

  # Must identify the same service/dimension as the scalable target.
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${var.cluster}/${var.name}"

  step_scaling_policy_configuration {
    # scaling_adjustment is applied as a delta to the current capacity.
    adjustment_type         = "ChangeInCapacity"
    metric_aggregation_type = "${var.autoscale_up_rpm_aggregation_type}"
    cooldown                = "${var.autoscale_up_rpm_cooldown}"

    step_adjustment {
      scaling_adjustment          = "${var.autoscale_up_rpm_adjustment}"
      metric_interval_lower_bound = "${var.autoscale_up_rpm_interval_lower_bound}"
    }
  }
}
# Step-scaling policy invoked by the RPM "low" alarm to lower the ECS
# service's desired count. Created only when var.autoscale_rpm_enabled is "true".
resource "aws_appautoscaling_policy" "appautoscaling_policy_rpm_scale_down" {
  count = "${var.autoscale_rpm_enabled == "true" ? 1 : 0}"

  # The scalable target must exist before a policy can be attached to it.
  depends_on = ["aws_appautoscaling_target.appautoscaling_target"]

  name        = "rpm-scale-down"
  policy_type = "StepScaling"

  # Must identify the same service/dimension as the scalable target.
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${var.cluster}/${var.name}"

  step_scaling_policy_configuration {
    # scaling_adjustment is applied as a delta to the current capacity.
    adjustment_type         = "ChangeInCapacity"
    metric_aggregation_type = "${var.autoscale_down_rpm_aggregation_type}"
    cooldown                = "${var.autoscale_down_rpm_cooldown}"

    step_adjustment {
      scaling_adjustment          = "${var.autoscale_down_rpm_adjustment}"
      metric_interval_upper_bound = "${var.autoscale_down_rpm_interval_upper_bound}"
    }
  }
}
variables.tf
We keep autoscale_enabled
and autoscale_rpm_enabled
as separate flags so that each service can enable only the autoscale policies it needs (CPU/Memory/RPM/etc.) …
# Feature flags. Both are string booleans compared against "true" in main.tf.

# Gates creation of the Application Auto Scaling target.
variable "autoscale_enabled" {
  default     = "false"
  description = "Setup autoscale."
}

# Gates creation of the RPM alarms and the RPM scale-up/scale-down policies.
variable "autoscale_rpm_enabled" {
  default     = "false"
  description = "Setup autoscale for RPM."
}
# Settings for the RequestCountPerTarget "high" alarm (scale-up trigger).

variable "alarm_rpm_high_evaluation_periods" {
  default     = "1"
  description = "The number of periods over which data is compared to the specified threshold."
}

variable "alarm_rpm_high_period" {
  default     = "300"
  description = "The period in seconds over which the specified statistic is applied."
}

variable "alarm_rpm_high_statistic" {
  default     = "Sum"
  description = "The statistic to apply to the alarm's associated metric. Either of the following is supported: SampleCount, Average, Sum, Minimum, Maximum"
}

# Compared with GreaterThanOrEqualToThreshold by the high alarm.
variable "alarm_rpm_high_threshold" {
  default     = "100000"
  description = "The value against which the specified statistic is compared."
}
# Settings for the RequestCountPerTarget "low" alarm (scale-down trigger).

variable "alarm_rpm_low_evaluation_periods" {
  default     = "1"
  description = "The number of periods over which data is compared to the specified threshold."
}

variable "alarm_rpm_low_period" {
  default     = "300"
  description = "The period in seconds over which the specified statistic is applied."
}

variable "alarm_rpm_low_statistic" {
  default     = "Sum"
  description = "The statistic to apply to the alarm's associated metric. Either of the following is supported: SampleCount, Average, Sum, Minimum, Maximum"
}

# Compared with LessThanOrEqualToThreshold by the low alarm.
variable "alarm_rpm_low_threshold" {
  default     = "0"
  description = "The value against which the specified statistic is compared."
}
# Upper bound on the service's desired count while autoscaling.
variable "autoscale_max_capacity" {
  description = "Max containers count for autoscale."
  default     = "4"
}

# Settings for the RPM scale-up step-scaling policy.

variable "autoscale_up_rpm_cooldown" {
  description = "Seconds between scaling actions."
  default     = "300"
}

variable "autoscale_up_rpm_aggregation_type" {
  description = "Valid values are Minimum, Maximum, and Average."
  default     = "Average"
}

# NOTE(review): with a non-zero lower bound, a breach that exceeds the alarm
# threshold by less than this value matches no step adjustment - confirm the
# default of 5 is intentional.
variable "autoscale_up_rpm_interval_lower_bound" {
  description = "Lower bound for the difference between the alarm threshold and the CloudWatch metric at which the scale-up step applies."
  default     = "5"
}

variable "autoscale_up_rpm_adjustment" {
  description = "Change in the service's desired count applied by the scale-up step (ChangeInCapacity)."
  default     = "1"
}
# Settings for the RPM scale-down step-scaling policy.

variable "autoscale_down_rpm_cooldown" {
  description = "Seconds between scaling actions."
  default     = "300"
}

variable "autoscale_down_rpm_aggregation_type" {
  description = "Valid values are Minimum, Maximum, and Average."
  default     = "Average"
}

variable "autoscale_down_rpm_interval_upper_bound" {
  description = "Upper bound for the difference between the alarm threshold and the CloudWatch metric at which the scale-down step applies."
  default     = "0"
}

# Negative so that the ChangeInCapacity adjustment reduces the desired count.
variable "autoscale_down_rpm_adjustment" {
  description = "Change in the service's desired count applied by the scale-down step (ChangeInCapacity)."
  default     = "-1"
}
# Used in both alarm_actions and ok_actions of the RPM high/low alarms.
variable "alarm_pagerduty_sns" {
  description = "ARN of the SNS topic notified when the RPM alarms change state."
  default     = ""
}
Environment
webservice.tf
Call the aws_alb_service
module as shown below:
# Default web service for cluster 01, built from the shared ALB service module
# with RPM-based autoscaling switched on.
module "webservice_default_c01" {
  source = "../../modules/aws_alb_service"

  # Identity and placement.
  name        = "${var.env}-default"
  environment = "${var.env}"
  region      = "${var.region}"
  cluster_num = "01"

  # Networking.
  vpc_id              = "${module.vpc.vpc_id}"
  subnets             = "${module.vpc.backend_subnets_list}"
  allowed_cidr_blocks = ["${var.default_allowed_ips}"]

  # Autoscale settings. alb_listener_arn receives the ALB's arn_suffix, which
  # the module uses as the LoadBalancer dimension of its CloudWatch alarms.
  autoscale_enabled     = "true"
  autoscale_rpm_enabled = "true"
  alb_listener_arn      = "${module.alb_webservice_c01.arn_suffix}"
  alarm_pagerduty_sns   = "${aws_sns_topic.cloudwatch_pagerduty_ecs_autoscale.arn}"
}
Cloudwatch/PagerDuty Integration
Assuming you have your PagerDuty alert service set up with the given ${var.pagerduty_service_cloudwatch_alert}
, create the following resources in the environment's main.tf
, since they can be shared by all services.
# Per-environment SNS topic; its ARN is handed to the service modules as
# alarm_pagerduty_sns so their autoscale alarms can publish to it.
resource "aws_sns_topic" "cloudwatch_pagerduty_ecs_autoscale" {
  name = "${var.env}-cloudwatch-pagerduty-ecs-autoscale"
}
# Subscribes the PagerDuty endpoint to the autoscale SNS topic over HTTPS.
resource "aws_sns_topic_subscription" "cloudwatch_pagerduty_ecs_autoscale" {
  topic_arn = "${aws_sns_topic.cloudwatch_pagerduty_ecs_autoscale.arn}"

  protocol = "https"
  endpoint = "${var.pagerduty_service_cloudwatch_alert}"

  # NOTE(review): presumably the PagerDuty endpoint confirms the subscription
  # on its own - verify against the integration in use.
  endpoint_auto_confirms = "true"
}