Configuring AWS CloudWatch with Terraform

A comprehensive guide to setting up Amazon CloudWatch monitoring and alerting using Terraform Infrastructure as Code

Configuring AWS CloudWatch with Terraform

Amazon CloudWatch is a monitoring and observability service. This guide shows how to set up CloudWatch using Terraform.

Prerequisites

  • AWS CLI configured
  • Terraform installed
  • Basic understanding of monitoring concepts
  • Existing AWS resources to monitor (for example, an EC2 instance)

Project Structure

aws-cloudwatch-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
└── terraform.tfvars

Basic CloudWatch Configuration

# main.tf
# AWS provider configuration; the target region comes from var.aws_region.
provider "aws" {
  region = var.aws_region
}

# Project-wide log group, named "/aws/<project_name>".
resource "aws_cloudwatch_log_group" "main" {
  name = "/aws/${var.project_name}"

  # Log events are retained for 30 days.
  retention_in_days = 30

  tags = {
    Environment = var.environment
  }
}

# High-CPU alarm for the monitored EC2 instance.
# Goes into ALARM when the 5-minute average CPUUtilization stays above 80
# for 2 evaluation periods, then notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "high_cpu" {
  alarm_name        = "${var.project_name}-high-cpu"
  alarm_description = "This metric monitors EC2 CPU utilization"

  # Which metric to watch.
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"

  dimensions = {
    InstanceId = var.instance_id
  }

  # How the metric is evaluated.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "80"

  # Where notifications are sent when the alarm fires.
  alarm_actions = [aws_sns_topic.alerts.arn]

  tags = {
    Environment = var.environment
  }
}

# SNS topic that the alarms in this configuration publish to
# (referenced as aws_sns_topic.alerts.arn in their alarm_actions).
resource "aws_sns_topic" "alerts" {
  name = "${var.project_name}-alerts"
}

# Email subscription on the alerts topic; messages go to var.alert_email.
# NOTE(review): email subscriptions normally have to be confirmed by the
# recipient before delivery starts - verify after the first apply.
resource "aws_sns_topic_subscription" "email" {
  protocol  = "email"
  endpoint  = var.alert_email
  topic_arn = aws_sns_topic.alerts.arn
}

Dashboard Configuration

# Project dashboard with two 12-unit-wide metric widgets placed side by side
# on the top row: CPU on the left (x = 0), network traffic on the right (x = 12).
resource "aws_cloudwatch_dashboard" "main" {
  dashboard_name = "${var.project_name}-dashboard"

  # The dashboard definition is written as an HCL object and serialized to JSON.
  dashboard_body = jsonencode({
    widgets = [
      # Left widget: average CPU utilization of the monitored instance.
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          title  = "EC2 CPU Utilization"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["AWS/EC2", "CPUUtilization", "InstanceId", var.instance_id]
          ]
        }
      },
      # Right widget: inbound and outbound network traffic of the same instance.
      {
        type   = "metric"
        x      = 12
        y      = 0
        width  = 12
        height = 6

        properties = {
          title  = "EC2 Network Traffic"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["AWS/EC2", "NetworkIn", "InstanceId", var.instance_id],
            ["AWS/EC2", "NetworkOut", "InstanceId", var.instance_id]
          ]
        }
      }
    ]
  })
}

Log Metrics Configuration

# Metric filter on the main log group: every log event matching the pattern
# "ERROR" adds 1 to the custom metric "<project_name>/Errors" / "ErrorCount".
resource "aws_cloudwatch_log_metric_filter" "error_count" {
  name           = "${var.project_name}-error-count"
  log_group_name = aws_cloudwatch_log_group.main.name
  pattern        = "ERROR"

  metric_transformation {
    namespace = "${var.project_name}/Errors"
    name      = "ErrorCount"
    value     = "1"
  }
}

# Metric Alarm for Log Errors
# Fires when more than 10 ERROR log events are counted within one 5-minute
# period, and notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "error_count" {
  alarm_name          = "${var.project_name}-error-count"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"

  # Read the metric identity from the filter itself so the alarm can never
  # drift from the filter's name/namespace, and so Terraform creates the
  # filter before the alarm.
  metric_name = aws_cloudwatch_log_metric_filter.error_count.metric_transformation[0].name
  namespace   = aws_cloudwatch_log_metric_filter.error_count.metric_transformation[0].namespace

  period            = "300"
  statistic         = "Sum"
  threshold         = "10"
  alarm_description = "This metric monitors error count in logs"
  alarm_actions     = [aws_sns_topic.alerts.arn]

  # The filter publishes a datapoint only when a log event matches, so quiet
  # periods have no data at all. Treat those as healthy; otherwise the alarm
  # drops to INSUFFICIENT_DATA instead of returning to OK once errors stop.
  treat_missing_data = "notBreaching"
}

Variables Configuration

# variables.tf
# Input variables for the CloudWatch configuration. Variables without a
# default (project_name, instance_id, alert_email) must be supplied by the
# caller, e.g. through terraform.tfvars.

# Region the provider and the dashboard widgets use.
variable "aws_region" {
  type        = string
  description = "AWS region"
  default     = "us-west-2"
}

# Prefix used in resource names (alarms, topic, dashboard, log group).
variable "project_name" {
  type        = string
  description = "Project name"
}

# Value of the Environment tag on tagged resources.
variable "environment" {
  type        = string
  description = "Environment name"
  default     = "dev"
}

# Instance whose metrics the alarms and dashboard widgets reference.
variable "instance_id" {
  type        = string
  description = "EC2 instance ID to monitor"
}

# Endpoint of the email subscription on the alerts topic.
variable "alert_email" {
  type        = string
  description = "Email address for alerts"
}

Best Practices

  1. Monitoring Strategy

    • Define clear monitoring objectives
    • Use appropriate metrics and thresholds
    • Implement proper alerting
    • Create comprehensive dashboards
  2. Log Management

    • Set appropriate retention periods
    • Use log metric filters effectively
    • Implement structured logging
    • Monitor log volume
  3. Alerting

    • Avoid alert fatigue
    • Set meaningful thresholds
    • Use proper evaluation periods
    • Implement escalation policies
  4. Cost Optimization

    • Monitor log storage usage
    • Clean up unused metrics
    • Use appropriate retention periods
    • Consider metric resolution

Composite Alarms

# Primary Alarm
# CPU half of the composite alarm: 5-minute average CPUUtilization above 80
# for 2 evaluation periods. It has no alarm_actions of its own; only the
# composite alarm that combines it with the secondary alarm notifies.
resource "aws_cloudwatch_metric_alarm" "primary" {
  alarm_name          = "${var.project_name}-primary"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  metric_name         = "CPUUtilization"
  namespace           = "AWS/EC2"
  period              = "300"
  statistic           = "Average"
  threshold           = "80"
  alarm_description   = "CPU utilization input for the composite alarm"

  # Scope the alarm to the monitored instance. Without a dimension the alarm
  # would watch the dimensionless AWS/EC2 series rather than this instance.
  dimensions = {
    InstanceId = var.instance_id
  }
}

# Secondary Alarm
# Memory half of the composite alarm: 5-minute average memory usage above 80
# for 2 evaluation periods.
#
# EC2 does not publish memory metrics in the AWS/EC2 namespace. They come from
# the CloudWatch agent running on the instance, which reports to the "CWAgent"
# namespace by default (metric "mem_used_percent" on Linux).
# NOTE(review): the InstanceId dimension assumes the agent is configured to
# append it (append_dimensions). Adjust metric_name, namespace and dimensions
# to match your agent configuration.
resource "aws_cloudwatch_metric_alarm" "secondary" {
  alarm_name          = "${var.project_name}-secondary"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  metric_name         = "mem_used_percent"
  namespace           = "CWAgent"
  period              = "300"
  statistic           = "Average"
  threshold           = "80"
  alarm_description   = "Memory utilization input for the composite alarm"

  dimensions = {
    InstanceId = var.instance_id
  }
}

# Composite alarm: in ALARM only while BOTH the primary and the secondary
# metric alarms are in ALARM; notifies the alerts SNS topic.
resource "aws_cloudwatch_composite_alarm" "composite" {
  alarm_name    = "${var.project_name}-composite"
  alarm_actions = [aws_sns_topic.alerts.arn]

  # Produces "ALARM(<primary name>) AND ALARM(<secondary name>)".
  alarm_rule = format(
    "ALARM(%s) AND ALARM(%s)",
    aws_cloudwatch_metric_alarm.primary.alarm_name,
    aws_cloudwatch_metric_alarm.secondary.alarm_name,
  )
}

Anomaly Detection

# Anomaly Detection Alarm
# Fires when the instance's average CPUUtilization rises above the upper edge
# of the anomaly-detection band for 2 evaluation periods, and notifies the
# alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "anomaly" {
  alarm_name          = "${var.project_name}-anomaly"
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = "2"

  # The threshold is not a fixed number; it is the band produced by query "e1".
  threshold_metric_id = "e1"
  alarm_description   = "This metric monitors for anomalous behavior"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # e1: expected-value band computed from the raw metric m1.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1)"
    label       = "CPUUtilization (Expected)"
    return_data = true
  }

  # m1: the raw metric that is compared against the band.
  metric_query {
    id = "m1"

    # An anomaly-detection alarm needs both the metric and the band to return
    # data; leaving this unset (false) makes the alarm definition invalid.
    return_data = true

    metric {
      metric_name = "CPUUtilization"
      namespace   = "AWS/EC2"
      period      = "300"
      stat        = "Average"

      dimensions = {
        InstanceId = var.instance_id
      }
    }
  }
}

Deployment Steps

  1. Initialize Terraform:
terraform init
  2. Plan the deployment:
terraform plan
  3. Apply the configuration:
terraform apply

Clean Up

Remove all resources when done:

terraform destroy

Common Use Cases

  1. Application Monitoring
# Dedicated log group for application logs, named
# "/aws/<project_name>/application"; events are retained for 30 days.
resource "aws_cloudwatch_log_group" "application" {
  retention_in_days = 30
  name              = "/aws/${var.project_name}/application"
}

# Inputs used only by this example. They are not part of variables.tf above,
# so they are declared here; without them Terraform rejects the references to
# var.api_name and var.api_stage as undeclared input variables.
variable "api_name" {
  description = "Name of the API Gateway API to monitor"
  type        = string
}

variable "api_stage" {
  description = "API Gateway stage to monitor"
  type        = string
}

# Alarm on API latency: fires when the 5-minute average Latency of the given
# API and stage stays above 1000 (API Gateway reports Latency in milliseconds)
# for 2 evaluation periods, and notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "api_latency" {
  alarm_name          = "${var.project_name}-api-latency"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  metric_name         = "Latency"
  namespace           = "AWS/ApiGateway"
  period              = "300"
  statistic           = "Average"
  threshold           = "1000"
  alarm_description   = "This metric monitors API latency"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    ApiName = var.api_name
    Stage   = var.api_stage
  }
}
  2. Infrastructure Monitoring
# Infrastructure dashboard: a single metric widget plotting average
# CPUUtilization from the EC2, RDS and ElastiCache namespaces together.
# NOTE(review): the metrics are listed without dimensions - confirm that these
# dimensionless series exist in your account, or add dimensions to each entry.
resource "aws_cloudwatch_dashboard" "infrastructure" {
  dashboard_name = "${var.project_name}-infrastructure"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          title  = "System CPU Utilization"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["AWS/EC2", "CPUUtilization"],
            ["AWS/RDS", "CPUUtilization"],
            ["AWS/ElastiCache", "CPUUtilization"]
          ]
        }
      }
    ]
  })
}

Container Monitoring

# ECS cluster with the "containerInsights" cluster setting enabled, so that
# container-level metrics are available for the dashboard defined below it.
resource "aws_ecs_cluster" "main" {
  name = "${var.project_name}-cluster"

  # Cluster setting that switches Container Insights on.
  setting {
    value = "enabled"
    name  = "containerInsights"
  }
}

# Container dashboard: one metric widget showing the average CpuUtilized and
# MemoryUtilized Container Insights metrics for the ECS cluster, selected by
# the ClusterName dimension.
resource "aws_cloudwatch_dashboard" "containers" {
  dashboard_name = "${var.project_name}-containers"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          title  = "Container Resource Utilization"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["ECS/ContainerInsights", "CpuUtilized", "ClusterName", aws_ecs_cluster.main.name],
            ["ECS/ContainerInsights", "MemoryUtilized", "ClusterName", aws_ecs_cluster.main.name]
          ]
        }
      }
    ]
  })
}

Conclusion

This setup provides a comprehensive foundation for deploying CloudWatch using Terraform. Remember to:

  • Plan your monitoring strategy carefully
  • Implement proper alerting thresholds
  • Create meaningful dashboards
  • Keep your configurations versioned
  • Test thoroughly before production deployment

The complete code can be customized based on your specific requirements and use cases.