Terraform and AWS ECS Autoscale

25 Jun 2018

Reading time ~4 minutes

I work at an AWS/Docker/ECS/Terraform shop. Our ECS cluster environments are pretty much automated at this point. One of the last remaining tasks we want to accomplish is setting up autoscaling for our containers. Here is our setup for AWS ECS Autoscale.

Modules
- Modules ALB Service
Environment
- webservice.tf

Modules ALB Service

main.tf

In addition to the usual ALB resources, we define the following for autoscaling:

aws_cloudwatch_metric_alarm
aws_appautoscaling_target
aws_appautoscaling_policy

First, we define the CloudWatch alarms for high and low RPM.

# CloudWatch alarm on the ALB target group's RequestCountPerTarget metric.
# It enters ALARM when the chosen statistic is greater than or equal to
# var.alarm_rpm_high_threshold, and is created only when
# var.autoscale_rpm_enabled is "true".
resource "aws_cloudwatch_metric_alarm" "cloudwatch_metric_alarm_rpm_high" {
  count             = "${var.autoscale_rpm_enabled == "true" ? 1 : 0}"
  alarm_name        = "${var.name}-RequestCountPerTarget-High"
  alarm_description = "Managed by Terraform"

  # On ALARM: run the scale-up policy and, if one is configured, notify the
  # SNS topic. var.alarm_pagerduty_sns defaults to "", and an empty string is
  # not a valid action ARN, so compact() drops it from both action lists.
  alarm_actions = ["${compact(list(aws_appautoscaling_policy.appautoscaling_policy_rpm_scale_up.arn, var.alarm_pagerduty_sns))}"]
  ok_actions    = ["${compact(list(var.alarm_pagerduty_sns))}"]

  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = "${var.alarm_rpm_high_evaluation_periods}"
  metric_name         = "RequestCountPerTarget"
  namespace           = "AWS/ApplicationELB"
  period              = "${var.alarm_rpm_high_period}"
  statistic           = "${var.alarm_rpm_high_statistic}"
  threshold           = "${var.alarm_rpm_high_threshold}"

  dimensions {
    # Despite its name, var.alb_listener_arn is used as the LoadBalancer
    # dimension; the example environment passes the ALB's arn_suffix here.
    LoadBalancer = "${var.alb_listener_arn}"
    TargetGroup  = "${aws_alb_target_group.alb_target_group.arn_suffix}"
  }
}

# CloudWatch alarm on the ALB target group's RequestCountPerTarget metric.
# It enters ALARM when the chosen statistic is less than or equal to
# var.alarm_rpm_low_threshold, and is created only when
# var.autoscale_rpm_enabled is "true".
resource "aws_cloudwatch_metric_alarm" "cloudwatch_metric_alarm_rpm_low" {
  count             = "${var.autoscale_rpm_enabled == "true" ? 1 : 0}"
  alarm_name        = "${var.name}-RequestCountPerTarget-Low"
  alarm_description = "Managed by Terraform"

  # On ALARM: run the scale-down policy and, if one is configured, notify the
  # SNS topic. var.alarm_pagerduty_sns defaults to "", and an empty string is
  # not a valid action ARN, so compact() drops it from both action lists.
  alarm_actions = ["${compact(list(aws_appautoscaling_policy.appautoscaling_policy_rpm_scale_down.arn, var.alarm_pagerduty_sns))}"]
  ok_actions    = ["${compact(list(var.alarm_pagerduty_sns))}"]

  comparison_operator = "LessThanOrEqualToThreshold"
  evaluation_periods  = "${var.alarm_rpm_low_evaluation_periods}"
  metric_name         = "RequestCountPerTarget"
  namespace           = "AWS/ApplicationELB"
  period              = "${var.alarm_rpm_low_period}"
  statistic           = "${var.alarm_rpm_low_statistic}"
  threshold           = "${var.alarm_rpm_low_threshold}"

  dimensions {
    # Despite its name, var.alb_listener_arn is used as the LoadBalancer
    # dimension; the example environment passes the ALB's arn_suffix here.
    LoadBalancer = "${var.alb_listener_arn}"
    TargetGroup  = "${aws_alb_target_group.alb_target_group.arn_suffix}"
  }
}

Second, we need an auto scaling target.

# Registers the ECS service's desired task count as an Application Auto
# Scaling target. Created only when var.autoscale_enabled is "true".
resource "aws_appautoscaling_target" "appautoscaling_target" {
  count = "${var.autoscale_enabled == "true" ? 1 : 0}"

  # What is scaled: the DesiredCount of service var.name in cluster var.cluster.
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${var.cluster}/${var.name}"

  # Capacity bounds: the service's configured desired count is the floor.
  min_capacity = "${var.service_desired_count}"
  max_capacity = "${var.autoscale_max_capacity}"

  # Role under which scaling runs, built from var.account_id.
  role_arn = "arn:aws:iam::${var.account_id}:role/aws-service-role/ecs.application-autoscaling.amazonaws.com/AWSServiceRoleForApplicationAutoScaling_ECSService"
}

Lastly, we need to set up the scaling policies that run when the CloudWatch alarms are triggered.

# Step-scaling policy that changes the ECS service's desired count by
# var.autoscale_up_rpm_adjustment. Its ARN is used as an alarm action by the
# high-RPM CloudWatch alarm.
#
# NOTE(review): this policy is gated on var.autoscale_rpm_enabled, while the
# scalable target it depends on is gated on var.autoscale_enabled. Enabling
# RPM alone presumably fails because no target is registered - confirm.
resource "aws_appautoscaling_policy" "appautoscaling_policy_rpm_scale_up" {
  count = "${var.autoscale_rpm_enabled == "true" ? 1 : 0}"

  name        = "rpm-scale-up"
  policy_type = "StepScaling"

  # Must identify the same scalable target as aws_appautoscaling_target.
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${var.cluster}/${var.name}"

  step_scaling_policy_configuration {
    adjustment_type         = "ChangeInCapacity"
    metric_aggregation_type = "${var.autoscale_up_rpm_aggregation_type}"
    cooldown                = "${var.autoscale_up_rpm_cooldown}"

    # Single step with only a lower bound set.
    step_adjustment {
      scaling_adjustment          = "${var.autoscale_up_rpm_adjustment}"
      metric_interval_lower_bound = "${var.autoscale_up_rpm_interval_lower_bound}"
    }
  }

  # resource_id is a plain string, so the ordering against the target must be
  # declared explicitly.
  depends_on = ["aws_appautoscaling_target.appautoscaling_target"]
}

# Step-scaling policy that changes the ECS service's desired count by
# var.autoscale_down_rpm_adjustment. Its ARN is used as an alarm action by
# the low-RPM CloudWatch alarm.
#
# NOTE(review): this policy is gated on var.autoscale_rpm_enabled, while the
# scalable target it depends on is gated on var.autoscale_enabled. Enabling
# RPM alone presumably fails because no target is registered - confirm.
resource "aws_appautoscaling_policy" "appautoscaling_policy_rpm_scale_down" {
  count = "${var.autoscale_rpm_enabled == "true" ? 1 : 0}"

  name        = "rpm-scale-down"
  policy_type = "StepScaling"

  # Must identify the same scalable target as aws_appautoscaling_target.
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${var.cluster}/${var.name}"

  step_scaling_policy_configuration {
    adjustment_type         = "ChangeInCapacity"
    metric_aggregation_type = "${var.autoscale_down_rpm_aggregation_type}"
    cooldown                = "${var.autoscale_down_rpm_cooldown}"

    # Single step with only an upper bound set.
    step_adjustment {
      scaling_adjustment          = "${var.autoscale_down_rpm_adjustment}"
      metric_interval_upper_bound = "${var.autoscale_down_rpm_interval_upper_bound}"
    }
  }

  # resource_id is a plain string, so the ordering against the target must be
  # declared explicitly.
  depends_on = ["aws_appautoscaling_target.appautoscaling_target"]
}

variables.tf

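We keep autoscale_enabled and autoscale_rpm_enabled as separate flags so that each service can enable only the autoscale policies it needs (CPU, memory, RPM, etc.). Because the RPM policies attach to the scalable target, autoscale_enabled must also be "true" whenever autoscale_rpm_enabled is.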

# Flag compared against the string "true"; gates creation of the
# aws_appautoscaling_target resource.
variable "autoscale_enabled" {
  type        = "string"
  description = "Setup autoscale."
  default     = "false"
}

# Flag compared against the string "true"; gates creation of the RPM
# CloudWatch alarms and the RPM step-scaling policies.
variable "autoscale_rpm_enabled" {
  type        = "string"
  description = "Setup autoscale for RPM."
  default     = "false"
}

# evaluation_periods of the high-RPM alarm.
variable "alarm_rpm_high_evaluation_periods" {
  type        = "string"
  description = "The number of periods over which data is compared to the specified threshold."
  default     = "1"
}

# period of the high-RPM alarm, in seconds.
variable "alarm_rpm_high_period" {
  type        = "string"
  description = "The period in seconds over which the specified statistic is applied."
  default     = "300"
}

# statistic of the high-RPM alarm.
# NOTE(review): AWS documents Sum as the only valid statistic for the
# RequestCountPerTarget metric - confirm before overriding this default.
variable "alarm_rpm_high_statistic" {
  type        = "string"
  description = "The statistic to apply to the alarm's associated metric. Either of the following is supported: SampleCount, Average, Sum, Minimum, Maximum"
  default     = "Sum"
}

# threshold of the high-RPM alarm; the alarm compares with
# GreaterThanOrEqualToThreshold.
variable "alarm_rpm_high_threshold" {
  type        = "string"
  description = "The value against which the specified statistic is compared."
  default     = "100000"
}

# evaluation_periods of the low-RPM alarm.
variable "alarm_rpm_low_evaluation_periods" {
  type        = "string"
  description = "The number of periods over which data is compared to the specified threshold."
  default     = "1"
}

# period of the low-RPM alarm, in seconds.
variable "alarm_rpm_low_period" {
  type        = "string"
  description = "The period in seconds over which the specified statistic is applied."
  default     = "300"
}

# statistic of the low-RPM alarm.
# NOTE(review): AWS documents Sum as the only valid statistic for the
# RequestCountPerTarget metric - confirm before overriding this default.
variable "alarm_rpm_low_statistic" {
  type        = "string"
  description = "The statistic to apply to the alarm's associated metric. Either of the following is supported: SampleCount, Average, Sum, Minimum, Maximum"
  default     = "Sum"
}

# threshold of the low-RPM alarm; the alarm compares with
# LessThanOrEqualToThreshold.
variable "alarm_rpm_low_threshold" {
  type        = "string"
  description = "The value against which the specified statistic is compared."
  default     = "0"
}

# max_capacity of the scalable target; the matching min_capacity comes from
# var.service_desired_count.
variable "autoscale_max_capacity" {
  type        = "string"
  description = "Max containers count for autoscale."
  default     = "4"
}

# cooldown of the scale-up step-scaling policy.
variable "autoscale_up_rpm_cooldown" {
  type        = "string"
  description = "Seconds between scaling actions."
  default     = "300"
}

# metric_aggregation_type of the scale-up step-scaling policy.
variable "autoscale_up_rpm_aggregation_type" {
  type        = "string"
  description = "Valid values are Minimum, Maximum, and Average."
  default     = "Average"
}

# metric_interval_lower_bound of the scale-up policy's single step.
# NOTE(review): step bounds are presumably offsets from the alarm threshold,
# so a non-zero default would leave breaches just above the threshold without
# a matching step - confirm whether "0" is intended.
variable "autoscale_up_rpm_interval_lower_bound" {
  type        = "string"
  description = "Difference between the alarm threshold and the CloudWatch metric."
  default     = "5"
}

# scaling_adjustment of the scale-up policy's step. The policy uses
# adjustment_type "ChangeInCapacity" on the ECS service's DesiredCount.
variable "autoscale_up_rpm_adjustment" {
  description = "Change applied to the service's desired count on each scale-up action."
  default     = "1"
}

# cooldown of the scale-down step-scaling policy.
variable "autoscale_down_rpm_cooldown" {
  type        = "string"
  description = "Seconds between scaling actions."
  default     = "300"
}

# metric_aggregation_type of the scale-down step-scaling policy.
variable "autoscale_down_rpm_aggregation_type" {
  type        = "string"
  description = "Valid values are Minimum, Maximum, and Average."
  default     = "Average"
}

# metric_interval_upper_bound of the scale-down policy's single step.
variable "autoscale_down_rpm_interval_upper_bound" {
  type        = "string"
  description = "Difference between the alarm threshold and the CloudWatch metric."
  default     = "0"
}

# scaling_adjustment of the scale-down policy's step. The policy uses
# adjustment_type "ChangeInCapacity" on the ECS service's DesiredCount, so
# the value is negative to remove tasks.
variable "autoscale_down_rpm_adjustment" {
  description = "Change applied to the service's desired count on each scale-down action (negative to remove tasks)."
  default     = "-1"
}

# SNS topic referenced by both RPM alarms in their alarm_actions and
# ok_actions lists. Defaults to an empty string when no topic is supplied.
variable "alarm_pagerduty_sns" {
  description = "ARN of the SNS topic added to the RPM alarms' alarm_actions and ok_actions."
  default     = ""
}

Environment

webservice.tf

Call the aws_alb_service module as shown below:

# Instantiates the aws_alb_service module for the "<env>-default" service
# with both autoscale flags switched on.
module "webservice_default_c01" {
  source = "../../modules/aws_alb_service"

  # Identity and placement
  name        = "${var.env}-default"
  environment = "${var.env}"
  region      = "${var.region}"
  cluster_num = "01"

  # Networking
  vpc_id              = "${module.vpc.vpc_id}"
  subnets             = "${module.vpc.backend_subnets_list}"
  allowed_cidr_blocks = ["${var.default_allowed_ips}"]

  # Autoscale settings
  autoscale_enabled     = "true"
  autoscale_rpm_enabled = "true"

  # Receives the arn_suffix output of module.alb_webservice_c01, despite the
  # "listener_arn" in the argument name.
  alb_listener_arn = "${module.alb_webservice_c01.arn_suffix}"

  # SNS topic notified by the RPM alarms.
  alarm_pagerduty_sns = "${aws_sns_topic.cloudwatch_pagerduty_ecs_autoscale.arn}"
}

Cloudwatch/PagerDuty Integration

Assuming you have your PagerDuty alert service set up, with its integration URL in ${var.pagerduty_service_cloudwatch_alert}, create the following resources in the environment's main.tf, since they can be shared by all services.

# Per-environment SNS topic. Its ARN is passed to the service module as
# alarm_pagerduty_sns and is the topic the PagerDuty subscription attaches to.
resource "aws_sns_topic" "cloudwatch_pagerduty_ecs_autoscale" {
  name = "${var.env}-cloudwatch-pagerduty-ecs-autoscale"
}

# Subscribes the endpoint held in var.pagerduty_service_cloudwatch_alert to
# the autoscale SNS topic over HTTPS.
resource "aws_sns_topic_subscription" "cloudwatch_pagerduty_ecs_autoscale" {
  topic_arn = "${aws_sns_topic.cloudwatch_pagerduty_ecs_autoscale.arn}"

  # Delivery target
  protocol = "https"
  endpoint = "${var.pagerduty_service_cloudwatch_alert}"

  # Declares that the endpoint confirms the subscription automatically.
  endpoint_auto_confirms = "true"
}