Configuring AWS CloudWatch with Terraform

Amazon CloudWatch is AWS's monitoring and observability service. This guide shows how to use Terraform to provision CloudWatch log groups, metric alarms, dashboards, and the SNS notifications that deliver alerts.

Prerequisites

AWS CLI configured
Terraform installed
Basic understanding of monitoring concepts
Existing AWS resources to monitor (for example, the ID of a running EC2 instance)

Project Structure

aws-cloudwatch-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
└── terraform.tfvars

Basic CloudWatch Configuration

# main.tf
# AWS provider configuration. The region every resource in this file is
# created in is taken from var.aws_region.
provider "aws" {
  region = var.aws_region
}

# Project-wide CloudWatch log group.
# Named after the project and tagged with the environment; log events are
# retained for 30 days.
resource "aws_cloudwatch_log_group" "main" {
  retention_in_days = 30
  name              = "/aws/${var.project_name}"

  tags = { Environment = var.environment }
}

# CloudWatch Metric Alarm: high CPU on a single EC2 instance.
#
# Enters ALARM when the 5-minute average CPUUtilization of var.instance_id is
# above 80 for 2 consecutive periods, and then notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "high_cpu" {
  alarm_name        = "${var.project_name}-high-cpu"
  alarm_description = "This metric monitors EC2 CPU utilization"

  # Which metric to watch.
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  statistic   = "Average"

  dimensions = {
    InstanceId = var.instance_id
  }

  # How it is evaluated. These arguments are numbers, so they are written as
  # numeric literals rather than quoted strings that rely on implicit
  # type conversion.
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80
  period              = 300 # seconds
  evaluation_periods  = 2

  # Who gets notified when the alarm fires.
  alarm_actions = [aws_sns_topic.alerts.arn]

  tags = {
    Environment = var.environment
  }
}

# SNS Topic for Alerts
# Referenced by the alarm_actions of the alarms in this file; the email
# subscription below is attached to this topic.
resource "aws_sns_topic" "alerts" {
  name = "${var.project_name}-alerts"
}

# Email subscription that delivers messages published to the alerts topic
# to var.alert_email.
# NOTE: SNS email subscriptions remain pending until the recipient confirms
# them via the link in the confirmation email.
resource "aws_sns_topic_subscription" "email" {
  protocol  = "email"
  endpoint  = var.alert_email
  topic_arn = aws_sns_topic.alerts.arn
}

Dashboard Configuration

# Main project dashboard: two 12x6 metric widgets placed side by side on the
# first row, both scoped to var.instance_id.
#   - left  (x = 0):  EC2 CPU utilization
#   - right (x = 12): EC2 network traffic in and out
resource "aws_cloudwatch_dashboard" "main" {
  dashboard_name = "${var.project_name}-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      # Left widget: CPU utilization.
      {
        type = "metric"

        # Position and size on the dashboard grid.
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          title  = "EC2 CPU Utilization"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["AWS/EC2", "CPUUtilization", "InstanceId", var.instance_id]
          ]
        }
      },

      # Right widget: inbound and outbound network traffic.
      {
        type = "metric"

        # Position and size on the dashboard grid.
        x      = 12
        y      = 0
        width  = 12
        height = 6

        properties = {
          title  = "EC2 Network Traffic"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["AWS/EC2", "NetworkIn", "InstanceId", var.instance_id],
            ["AWS/EC2", "NetworkOut", "InstanceId", var.instance_id]
          ]
        }
      }
    ]
  })
}

Log Metrics Configuration

# Metric filter on the main log group.
# Every log event matching the pattern "ERROR" contributes a value of 1 to
# the ErrorCount metric in the "<project_name>/Errors" namespace.
resource "aws_cloudwatch_log_metric_filter" "error_count" {
  log_group_name = aws_cloudwatch_log_group.main.name
  name           = "${var.project_name}-error-count"
  pattern        = "ERROR"

  metric_transformation {
    namespace = "${var.project_name}/Errors"
    name      = "ErrorCount"
    value     = "1"
  }
}

# Metric Alarm for Log Errors
#
# Enters ALARM when the ErrorCount metric sums to more than 10 within a single
# 5-minute period, and then notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "error_count" {
  alarm_name        = "${var.project_name}-error-count"
  alarm_description = "This metric monitors error count in logs"

  # Custom metric; namespace and name must match the metric_transformation of
  # the log metric filter that feeds it.
  namespace   = "${var.project_name}/Errors"
  metric_name = "ErrorCount"
  statistic   = "Sum"

  # Numeric arguments are written as numbers rather than quoted strings.
  comparison_operator = "GreaterThanThreshold"
  threshold           = 10
  period              = 300 # seconds
  evaluation_periods  = 1

  # A log metric filter emits no datapoints while nothing matches its pattern.
  # With the default missing-data handling the alarm would sit in
  # INSUFFICIENT_DATA (and never return to OK after an incident); treating
  # missing data as not breaching lets "no errors logged" evaluate as OK.
  treat_missing_data = "notBreaching"

  alarm_actions = [aws_sns_topic.alerts.arn]
}

Variables Configuration

# variables.tf
# Input variables for the configuration. Variables without a default
# (project_name, instance_id, alert_email) must be supplied by the caller,
# e.g. through terraform.tfvars.

# Region used by the provider and by the dashboard widgets.
variable "aws_region" {
  type        = string
  default     = "us-west-2"
  description = "AWS region"
}

# Prefix used in resource names (log groups, alarms, dashboards, SNS topic).
variable "project_name" {
  type        = string
  description = "Project name"
}

# Value of the Environment tag applied to tagged resources.
variable "environment" {
  type        = string
  default     = "dev"
  description = "Environment name"
}

# Instance targeted by the EC2 alarms and dashboard widgets.
variable "instance_id" {
  type        = string
  description = "EC2 instance ID to monitor"
}

# Endpoint of the email subscription on the alerts SNS topic.
variable "alert_email" {
  type        = string
  description = "Email address for alerts"
}

Best Practices

Monitoring Strategy
- Define clear monitoring objectives
- Use appropriate metrics and thresholds
- Implement proper alerting
- Create comprehensive dashboards
Log Management
- Set appropriate retention periods
- Use log metric filters effectively
- Implement structured logging
- Monitor log volume
Alerting
- Avoid alert fatigue
- Set meaningful thresholds
- Use proper evaluation periods
- Implement escalation policies
Cost Optimization
- Monitor log storage usage
- Clean up unused metrics
- Use appropriate retention periods
- Consider metric resolution

Composite Alarms

# Primary Alarm: high CPU on the monitored EC2 instance.
#
# Enters ALARM when the 5-minute average CPUUtilization is above 80 for 2
# consecutive periods. It has no alarm_actions of its own; it is only an
# input to the composite alarm's rule.
resource "aws_cloudwatch_metric_alarm" "primary" {
  alarm_name        = "${var.project_name}-primary"
  alarm_description = "High CPU utilization (input to the composite alarm)"

  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  statistic   = "Average"

  # Scope the alarm to the monitored instance, consistent with the other EC2
  # alarms in this guide. Without a dimension the alarm would evaluate the
  # dimensionless AWS/EC2 aggregate instead of this instance's metric.
  dimensions = {
    InstanceId = var.instance_id
  }

  # Numeric arguments are written as numbers rather than quoted strings.
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80
  period              = 300 # seconds
  evaluation_periods  = 2
}

# Secondary Alarm: high memory usage on the monitored EC2 instance.
#
# EC2 does not publish memory metrics in the AWS/EC2 namespace, so an alarm on
# "AWS/EC2" / "MemoryUtilization" would never receive data. Memory usage is
# reported by the CloudWatch agent running on the instance, which by default
# publishes to the "CWAgent" namespace ("mem_used_percent" on Linux).
#
# NOTE(review): assumes the agent is installed, collects the mem_used_percent
# measurement, and appends the InstanceId dimension (append_dimensions) -
# confirm against your agent configuration. On Windows the corresponding
# metric is "Memory % Committed Bytes In Use".
#
# Like the primary alarm, it has no alarm_actions; it is only an input to the
# composite alarm's rule.
resource "aws_cloudwatch_metric_alarm" "secondary" {
  alarm_name        = "${var.project_name}-secondary"
  alarm_description = "High memory utilization (input to the composite alarm)"

  namespace   = "CWAgent"
  metric_name = "mem_used_percent"
  statistic   = "Average"

  dimensions = {
    InstanceId = var.instance_id
  }

  # Numeric arguments are written as numbers rather than quoted strings.
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80
  period              = 300 # seconds
  evaluation_periods  = 2
}

# Composite alarm built from the primary and secondary metric alarms.
# Its rule is true only while BOTH child alarms are in the ALARM state; when
# it fires it notifies the alerts SNS topic.
resource "aws_cloudwatch_composite_alarm" "composite" {
  alarm_name    = "${var.project_name}-composite"
  alarm_actions = [aws_sns_topic.alerts.arn]

  # The rule refers to the child alarms by their alarm names.
  alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.primary.alarm_name}) AND ALARM(${aws_cloudwatch_metric_alarm.secondary.alarm_name})"
}

Anomaly Detection

# Anomaly Detection Alarm
#
# Compares the instance's average CPUUtilization (query "m1") against the
# anomaly detection band computed from it (query "e1") and enters ALARM when
# the metric is above the band's upper bound for 2 consecutive periods.
resource "aws_cloudwatch_metric_alarm" "anomaly" {
  alarm_name          = "${var.project_name}-anomaly"
  alarm_description   = "This metric monitors for anomalous behavior"
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = 2

  # The threshold is not a fixed number: it is the band returned by "e1".
  threshold_metric_id = "e1"

  alarm_actions = [aws_sns_topic.alerts.arn]

  # Expected-value band derived from m1.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1)"
    label       = "CPUUtilization (Expected)"
    return_data = true
  }

  # The raw metric being evaluated.
  metric_query {
    id = "m1"

    # An anomaly detection alarm needs both the evaluated metric and the band
    # to return data; return_data defaults to false, so it must be set here
    # as well as on "e1".
    return_data = true

    metric {
      namespace   = "AWS/EC2"
      metric_name = "CPUUtilization"
      stat        = "Average"
      period      = 300 # seconds

      dimensions = {
        InstanceId = var.instance_id
      }
    }
  }
}

Deployment Steps

Initialize Terraform:

terraform init

Plan the deployment:

terraform plan

Apply the configuration:

terraform apply

Clean Up

Remove all resources when done:

terraform destroy

Common Use Cases

Application Monitoring

# Dedicated log group for application logs, nested under the project's
# log group prefix. Log events are retained for 30 days.
resource "aws_cloudwatch_log_group" "application" {
  retention_in_days = 30
  name              = "/aws/${var.project_name}/application"
}

# Inputs used only by the API Gateway latency alarm below. They are not part
# of the base variables.tf shown earlier, so they are declared here to keep
# this example self-contained (move them into variables.tf if you prefer).
variable "api_name" {
  description = "Name of the API Gateway API to monitor"
  type        = string
}

variable "api_stage" {
  description = "API Gateway stage to monitor"
  type        = string
}

# API latency alarm.
#
# Enters ALARM when the 5-minute average Latency of the given API and stage
# is above 1000 for 2 consecutive periods, and then notifies the alerts SNS
# topic.
resource "aws_cloudwatch_metric_alarm" "api_latency" {
  alarm_name        = "${var.project_name}-api-latency"
  alarm_description = "This metric monitors API latency"

  namespace   = "AWS/ApiGateway"
  metric_name = "Latency"
  statistic   = "Average"

  dimensions = {
    ApiName = var.api_name
    Stage   = var.api_stage
  }

  # Numeric arguments are written as numbers rather than quoted strings.
  comparison_operator = "GreaterThanThreshold"
  threshold           = 1000 # API Gateway reports Latency in milliseconds
  period              = 300  # seconds
  evaluation_periods  = 2

  alarm_actions = [aws_sns_topic.alerts.arn]
}

Infrastructure Monitoring

# Infrastructure overview dashboard: a single 12x6 metric widget that plots
# average CPUUtilization from the EC2, RDS and ElastiCache namespaces.
# The metrics are listed without dimensions, i.e. not tied to one resource.
resource "aws_cloudwatch_dashboard" "infrastructure" {
  dashboard_name = "${var.project_name}-infrastructure"

  dashboard_body = jsonencode({
    widgets = [
      {
        type = "metric"

        # Position and size on the dashboard grid.
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          title  = "System CPU Utilization"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["AWS/EC2", "CPUUtilization"],
            ["AWS/RDS", "CPUUtilization"],
            ["AWS/ElastiCache", "CPUUtilization"]
          ]
        }
      }
    ]
  })
}

Container Monitoring

# ECS cluster with the "containerInsights" cluster setting enabled.
# The containers dashboard reads its ClusterName dimension from this
# cluster's name.
resource "aws_ecs_cluster" "main" {
  name = "${var.project_name}-cluster"

  setting {
    value = "enabled"
    name  = "containerInsights"
  }
}

# Container dashboard: a single 12x6 metric widget plotting the average
# CpuUtilized and MemoryUtilized metrics from the ECS/ContainerInsights
# namespace for the ECS cluster defined in this configuration.
resource "aws_cloudwatch_dashboard" "containers" {
  dashboard_name = "${var.project_name}-containers"

  dashboard_body = jsonencode({
    widgets = [
      {
        type = "metric"

        # Position and size on the dashboard grid.
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          title  = "Container Resource Utilization"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["ECS/ContainerInsights", "CpuUtilized", "ClusterName", aws_ecs_cluster.main.name],
            ["ECS/ContainerInsights", "MemoryUtilized", "ClusterName", aws_ecs_cluster.main.name]
          ]
        }
      }
    ]
  })
}

Conclusion

This setup provides a solid foundation for managing CloudWatch monitoring with Terraform. Remember to:

Plan your monitoring strategy carefully
Implement proper alerting thresholds
Create meaningful dashboards
Keep your configurations versioned
Test thoroughly before production deployment

The complete code can be customized based on your specific requirements and use cases.