Monitoring AWS Resources with CloudWatch and Terraform

Learn how to implement comprehensive monitoring for AWS resources using CloudWatch and Terraform, including metrics, logs, alarms, and dashboards.

Monitoring AWS Resources with CloudWatch and Terraform

Effective monitoring is crucial for maintaining reliable and performant AWS infrastructure. This guide demonstrates how to implement comprehensive monitoring using CloudWatch and Terraform.

Video Tutorial

Learn more about implementing monitoring with Terraform on AWS in this comprehensive video tutorial:

Prerequisites

  • AWS CLI configured with appropriate permissions
  • Terraform installed (version 1.0.0 or later)
  • Basic understanding of CloudWatch concepts
  • Existing AWS resources to monitor

Project Structure

terraform-monitoring/
├── main.tf
├── variables.tf
├── outputs.tf
├── modules/
│   └── monitoring/
│       ├── main.tf
│       ├── variables.tf
│       └── outputs.tf
└── dashboards/
    └── main.json

CloudWatch Configuration

Create modules/monitoring/main.tf:

# Log Group
# Central log group for the application. The name encodes project and
# environment so multiple deployments can coexist in one AWS account.
resource "aws_cloudwatch_log_group" "main" {
  name              = "/aws/${var.project_name}/${var.environment}"
  # Configurable retention balances audit requirements against storage cost.
  retention_in_days = var.log_retention_days

  # Merge module-wide tags with a resource-specific Name tag.
  tags = merge(
    var.tags,
    {
      Name = "${var.project_name}-logs"
    }
  )
}

# Metric Filter
# Counts ERROR-level log lines and publishes them as a custom metric so
# alarms and dashboards can track application errors.
# NOTE(review): the pattern assumes space-delimited log lines shaped like
# "<timestamp> <requestid> <level> <message>" — confirm this matches the
# application's actual log format.
resource "aws_cloudwatch_log_metric_filter" "error_count" {
  name           = "${var.project_name}-error-count"
  pattern        = "[timestamp, requestid, level = ERROR, message]"
  log_group_name = aws_cloudwatch_log_group.main.name

  metric_transformation {
    name          = "ErrorCount"
    namespace     = "${var.project_name}/Application"
    # Each matching log event contributes 1 to the metric.
    value         = "1"
    # Publish 0 when nothing matches so the metric has no gaps.
    default_value = "0"
  }
}

# CloudWatch Dashboard
# Renders dashboards/main.json as a template. The vars map must supply every
# placeholder the template interpolates; main.json references ${region},
# ${log_group_name}, ${project_name} AND ${asg_name}.
resource "aws_cloudwatch_dashboard" "main" {
  dashboard_name = "${var.project_name}-dashboard"
  dashboard_body = templatefile("${path.module}/../../dashboards/main.json", {
    region         = data.aws_region.current.name
    log_group_name = aws_cloudwatch_log_group.main.name
    project_name   = var.project_name
    # Fix: the template interpolates ${asg_name} in its metric widgets;
    # omitting it makes templatefile() fail at plan time with
    # "vars map does not contain key asg_name".
    asg_name       = var.asg_name
  })
}

# CPU Utilization Alarm
# Fires when the ASG's average EC2 CPU stays above the configured threshold
# for two consecutive 5-minute periods, and notifies the alerts topic.
resource "aws_cloudwatch_metric_alarm" "cpu" {
  alarm_name          = "${var.project_name}-cpu-utilization"
  alarm_description   = "Monitor CPU utilization"
  namespace           = "AWS/EC2"
  metric_name         = "CPUUtilization"
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.cpu_threshold
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # Scope the metric to the monitored Auto Scaling group.
  dimensions = {
    AutoScalingGroupName = var.asg_name
  }
}

# Memory Utilization Alarm
# Memory is not a built-in EC2 metric: this depends on the CloudWatch agent
# publishing mem_used_percent into the CWAgent namespace on each instance.
# NOTE(review): the AutoScalingGroupName dimension only exists if the agent
# configuration appends it (append_dimensions) — confirm the agent setup.
resource "aws_cloudwatch_metric_alarm" "memory" {
  alarm_name          = "${var.project_name}-memory-utilization"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  metric_name        = "mem_used_percent"
  namespace          = "CWAgent"
  period             = "300"
  statistic          = "Average"
  threshold          = var.memory_threshold
  alarm_description  = "Monitor memory utilization"
  alarm_actions      = [aws_sns_topic.alerts.arn]

  dimensions = {
    AutoScalingGroupName = var.asg_name
  }
}

# Disk Space Alarm
# Unlike the other alarms this uses LessThanThreshold: it fires when the
# FREE disk space reported by the CloudWatch agent drops below the
# threshold. A single breaching period is enough (evaluation_periods = 1).
# NOTE(review): device/fstype/path must match exactly what the agent
# reports; Nitro-based instances expose nvme* device names rather than
# xvda1 — verify against the instance type in use.
resource "aws_cloudwatch_metric_alarm" "disk" {
  alarm_name          = "${var.project_name}-disk-space"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = "1"
  metric_name        = "disk_free"
  namespace          = "CWAgent"
  period             = "300"
  statistic          = "Average"
  threshold          = var.disk_threshold
  alarm_description  = "Monitor disk space"
  alarm_actions      = [aws_sns_topic.alerts.arn]

  dimensions = {
    AutoScalingGroupName = var.asg_name
    device              = "xvda1"
    fstype              = "ext4"
    path                = "/"
  }
}

# API Gateway 4XX Error Alarm
# Sums client-error (4XX) responses per 5-minute window; five consecutive
# breaching periods are required before alarming, which filters out brief
# error bursts.
resource "aws_cloudwatch_metric_alarm" "api_4xx" {
  alarm_name          = "${var.project_name}-api-4xx"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "5"
  metric_name        = "4XXError"
  namespace          = "AWS/ApiGateway"
  period             = "300"
  statistic          = "Sum"
  threshold          = var.api_error_threshold
  alarm_description  = "Monitor API Gateway 4XX errors"
  alarm_actions      = [aws_sns_topic.alerts.arn]

  # Scope to a specific API and deployment stage.
  dimensions = {
    ApiName = var.api_name
    Stage   = var.api_stage
  }
}

# RDS CPU Alarm
# Raises when the database instance averages above the configured CPU
# threshold for two consecutive 5-minute windows.
resource "aws_cloudwatch_metric_alarm" "rds_cpu" {
  alarm_name          = "${var.project_name}-rds-cpu"
  alarm_description   = "Monitor RDS CPU utilization"
  namespace           = "AWS/RDS"
  metric_name         = "CPUUtilization"
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.rds_cpu_threshold
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # Scope the metric to the monitored database instance.
  dimensions = {
    DBInstanceIdentifier = var.db_instance_id
  }
}

# Composite Alarm
# Enters ALARM only when BOTH the CPU and memory alarms are in ALARM, and
# routes to the critical SNS topic rather than the standard alerts topic.
# NOTE(review): the plain <<EOF heredoc preserves the leading indentation
# inside alarm_rule; CloudWatch tolerates the whitespace, but an indented
# heredoc (<<-EOF) would produce a cleaner rule string — confirm and tidy.
resource "aws_cloudwatch_composite_alarm" "critical" {
  alarm_name = "${var.project_name}-critical-state"
  alarm_description = "Composite alarm for critical system state"
  alarm_actions    = [aws_sns_topic.critical_alerts.arn]

  alarm_rule = <<EOF
    ALARM(${aws_cloudwatch_metric_alarm.cpu.alarm_name}) AND
    ALARM(${aws_cloudwatch_metric_alarm.memory.alarm_name})
  EOF
}

# SNS Topic for Alerts
# Standard-severity notification channel; subscriptions (email, chat, etc.)
# are expected to be attached outside this module.
resource "aws_sns_topic" "alerts" {
  name = "${var.project_name}-alerts"
}

# Separate topic so the composite (critical) alarm can page a different
# audience than routine alerts.
resource "aws_sns_topic" "critical_alerts" {
  name = "${var.project_name}-critical-alerts"
}

# CloudWatch Logs Insights Query
# Saved query that buckets ERROR log lines into 30-minute counts for quick
# triage from the Logs Insights console.
# NOTE(review): `filter level == "ERROR"` assumes Logs Insights discovers a
# `level` field (e.g. structured/JSON logs) — confirm the log format.
resource "aws_cloudwatch_query_definition" "error_analysis" {
  name = "${var.project_name}-error-analysis"

  log_group_names = [aws_cloudwatch_log_group.main.name]

  query_string = <<EOF
fields @timestamp, @message
| filter level == "ERROR"
| stats count(*) as error_count by bin(30m)
| sort @timestamp desc
EOF
}

# Anomaly Detection Alarm
# Alarms when CPU rises above the upper bound of a band learned from the
# metric's history (band width = 2 standard deviations).
resource "aws_cloudwatch_metric_alarm" "anomaly" {
  alarm_name          = "${var.project_name}-anomaly"
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = "2"
  # The threshold is the e1 expression (the expected band), not a constant.
  threshold_metric_id = "e1"
  alarm_description  = "Monitor for anomalous behavior"
  alarm_actions      = [aws_sns_topic.alerts.arn]

  # e1: the expected band computed from the tracked metric m1.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1, 2)"
    label       = "CPUUtilization (Expected)"
    return_data = true
  }

  # m1: the monitored metric itself.
  metric_query {
    id = "m1"
    # Fix: anomaly-detection alarms require the tracked metric to return
    # data alongside the band expression; without return_data = true here
    # CloudWatch rejects the alarm definition.
    return_data = true
    metric {
      metric_name = "CPUUtilization"
      namespace   = "AWS/EC2"
      period     = "300"
      stat       = "Average"
      unit       = "Percent"

      dimensions = {
        AutoScalingGroupName = var.asg_name
      }
    }
  }
}

Dashboard Configuration

Create dashboards/main.json:

{
  "widgets": [
    {
      "type": "metric",
      "x": 0,
      "y": 0,
      "width": 12,
      "height": 6,
      "properties": {
        "metrics": [
          ["AWS/EC2", "CPUUtilization", "AutoScalingGroupName", "${asg_name}"]
        ],
        "period": 300,
        "stat": "Average",
        "region": "${region}",
        "title": "CPU Utilization"
      }
    },
    {
      "type": "metric",
      "x": 12,
      "y": 0,
      "width": 12,
      "height": 6,
      "properties": {
        "metrics": [
          ["CWAgent", "mem_used_percent", "AutoScalingGroupName", "${asg_name}"]
        ],
        "period": 300,
        "stat": "Average",
        "region": "${region}",
        "title": "Memory Utilization"
      }
    },
    {
      "type": "log",
      "x": 0,
      "y": 6,
      "width": 24,
      "height": 6,
      "properties": {
        "query": "SOURCE '${log_group_name}' | fields @timestamp, @message\n| filter level == 'ERROR'\n| sort @timestamp desc",
        "region": "${region}",
        "title": "Error Logs",
        "view": "table"
      }
    }
  ]
}

Monitoring Best Practices

  1. Metric Collection

    • Use CloudWatch agent
    • Collect custom metrics
    • Enable detailed monitoring
    • Implement proper aggregation
  2. Log Management

    • Define retention periods
    • Use log insights
    • Create metric filters
    • Implement structured logging
  3. Alerting

    • Set appropriate thresholds
    • Use composite alarms
    • Implement proper notification
    • Configure alert routing
  4. Dashboard Design

    • Group related metrics
    • Use appropriate widgets
    • Include key performance indicators
    • Enable auto-refresh

Advanced Features

  1. Anomaly Detection
# Latency anomaly alarm: fires when API Gateway latency exceeds the upper
# bound of a band learned from history (width = 2 standard deviations).
resource "aws_cloudwatch_metric_alarm" "latency_anomaly" {
  alarm_name          = "${var.project_name}-latency-anomaly"
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = "2"
  # The threshold is the e1 expression (the expected band), not a constant.
  threshold_metric_id = "e1"
  alarm_description  = "Monitor for anomalous latency"
  alarm_actions      = [aws_sns_topic.alerts.arn]

  # e1: the expected band computed from the tracked metric m1.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1, 2)"
    label       = "Latency (Expected)"
    return_data = true
  }

  # m1: the monitored latency metric.
  metric_query {
    id = "m1"
    # Fix: anomaly-detection alarms require the tracked metric to return
    # data alongside the band expression; without return_data = true here
    # CloudWatch rejects the alarm definition.
    return_data = true
    metric {
      metric_name = "Latency"
      namespace   = "AWS/ApiGateway"
      period     = "300"
      stat       = "Average"

      dimensions = {
        ApiName = var.api_name
        Stage   = var.api_stage
      }
    }
  }
}
  2. Metric Math
# Error-rate alarm via metric math: computes 5XX errors as a percentage of
# total requests (e1 = m2/m1*100) and alarms when the rate exceeds 5% for
# two consecutive 5-minute periods.
# NOTE(review): when RequestCount is 0 the expression yields no data point
# for the period — confirm the desired treat_missing_data behavior.
resource "aws_cloudwatch_metric_alarm" "error_rate" {
  alarm_name          = "${var.project_name}-error-rate"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  threshold          = 5
  alarm_description  = "Monitor error rate percentage"
  alarm_actions      = [aws_sns_topic.alerts.arn]

  # e1: the derived error-rate percentage; the only query returning data.
  metric_query {
    id          = "e1"
    expression  = "m2/m1*100"
    label       = "Error Rate"
    return_data = true
  }

  # m1: total requests in the period (denominator).
  metric_query {
    id = "m1"
    metric {
      metric_name = "RequestCount"
      namespace   = "AWS/ApiGateway"
      period     = "300"
      stat       = "Sum"
      dimensions = {
        ApiName = var.api_name
        Stage   = var.api_stage
      }
    }
  }

  # m2: server-side errors in the period (numerator).
  metric_query {
    id = "m2"
    metric {
      metric_name = "5XXError"
      namespace   = "AWS/ApiGateway"
      period     = "300"
      stat       = "Sum"
      dimensions = {
        ApiName = var.api_name
        Stage   = var.api_stage
      }
    }
  }
}
  3. Synthetic Monitoring
# Synthetic canary: runs the API-check script every 5 minutes and stores
# run artifacts (screenshots, HAR files, logs) in the monitoring bucket.
resource "aws_synthetics_canary" "api" {
  name                 = "${var.project_name}-api-canary"
  artifact_s3_location = "s3://${aws_s3_bucket.monitoring.id}/canary-artifacts/"
  execution_role_arn   = aws_iam_role.canary.arn
  handler             = "index.handler"
  runtime_version     = "syn-nodejs-puppeteer-3.9"

  schedule {
    expression = "rate(5 minutes)"
  }

  # Fix: the Terraform AWS provider's aws_synthetics_canary has no `code {}`
  # block (that is CloudFormation syntax); the script is supplied via
  # zip_file. For Node.js runtimes the zip must contain the script at
  # nodejs/node_modules/index.js.
  zip_file = "${path.module}/canary.zip"
}

Integration with Other Services

  1. X-Ray Tracing
# X-Ray sampling rule: guarantees 1 traced request per second (reservoir),
# then samples 5% of additional requests, across all hosts, methods,
# services, and paths. Priority 1000 is low (lower numbers win in X-Ray).
resource "aws_xray_sampling_rule" "main" {
  rule_name      = "${var.project_name}-sampling"
  priority       = 1000
  reservoir_size = 1
  fixed_rate     = 0.05
  host           = "*"
  http_method    = "*"
  service_name   = "*"
  service_type   = "*"
  url_path       = "*"
  version        = 1
}
  2. ServiceLens Integration
# HTTP API using quick-create: setting `target` makes API Gateway create a
# default route and stage that proxy straight to the Lambda function.
# NOTE(review): allow_origins/methods/headers = ["*"] is wide-open CORS —
# tighten these for production use.
resource "aws_apigatewayv2_api" "main" {
  name          = "${var.project_name}-api"
  protocol_type = "HTTP"

  cors_configuration {
    allow_origins = ["*"]
    allow_methods = ["*"]
    allow_headers = ["*"]
  }

  target = aws_lambda_function.main.arn
}

# Lambda backend with X-Ray active tracing enabled so its invocations
# appear in the service map / ServiceLens views.
# NOTE(review): without source_code_hash, changes to function.zip will not
# trigger a redeploy — consider filebase64sha256("function.zip").
resource "aws_lambda_function" "main" {
  filename         = "function.zip"
  function_name    = "${var.project_name}-function"
  role            = aws_iam_role.lambda.arn
  handler         = "index.handler"
  runtime         = "nodejs18.x"

  tracing_config {
    mode = "Active"
  }
}

Conclusion

You’ve learned how to implement comprehensive monitoring using CloudWatch and Terraform. This setup provides:

  • Real-time monitoring
  • Automated alerting
  • Performance insights
  • Troubleshooting capabilities

Remember to:

  • Monitor key metrics
  • Configure appropriate alerts
  • Review dashboards regularly
  • Optimize monitoring costs