Configuring AWS CloudWatch with Terraform
A comprehensive guide to setting up Amazon CloudWatch monitoring and alerting using Terraform Infrastructure as Code
Configuring AWS CloudWatch with Terraform
Amazon CloudWatch is a monitoring and observability service. This guide shows how to set up CloudWatch using Terraform.
Prerequisites
- AWS CLI configured
- Terraform installed
- Basic understanding of monitoring concepts
- Existing AWS resources to monitor (for example, an EC2 instance whose ID you can supply)
Project Structure
aws-cloudwatch-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
└── terraform.tfvars
Basic CloudWatch Configuration
# main.tf
# AWS provider configuration; the target region is taken from the
# aws_region input variable.
provider "aws" {
  region = var.aws_region
}
# CloudWatch Log Group
# Project-wide log group named "/aws/<project_name>"; log events are
# retained for 30 days.
resource "aws_cloudwatch_log_group" "main" {
  name              = format("/aws/%s", var.project_name)
  retention_in_days = 30

  tags = {
    Environment = var.environment
  }
}
# CloudWatch Metric Alarm
# Raises an alarm when the monitored instance's average CPU utilization is
# above 80 for two 5-minute evaluation periods, and notifies the alerts
# SNS topic.
resource "aws_cloudwatch_metric_alarm" "high_cpu" {
  alarm_name        = "${var.project_name}-high-cpu"
  alarm_description = "This metric monitors EC2 CPU utilization"

  # Which metric to watch
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  dimensions = {
    InstanceId = var.instance_id
  }

  # How the metric is evaluated
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80

  # Where notifications go
  alarm_actions = [aws_sns_topic.alerts.arn]

  tags = {
    Environment = var.environment
  }
}
# SNS Topic for Alerts
# Shared notification topic; alarms in this configuration reference its ARN
# in their alarm_actions.
resource "aws_sns_topic" "alerts" {
  name = format("%s-alerts", var.project_name)
}
# Email subscription that delivers messages published to the alerts topic
# to the address given in var.alert_email.
# NOTE: email endpoints normally have to be confirmed by the recipient
# before SNS delivers to them.
resource "aws_sns_topic_subscription" "email" {
  protocol  = "email"
  endpoint  = var.alert_email
  topic_arn = aws_sns_topic.alerts.arn
}
Dashboard Configuration
# CloudWatch Dashboard
# Two side-by-side metric widgets for the monitored EC2 instance:
# CPU utilization on the left, network in/out on the right.
resource "aws_cloudwatch_dashboard" "main" {
  dashboard_name = "${var.project_name}-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      # Left half of the row: CPU
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6
        properties = {
          title  = "EC2 CPU Utilization"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["AWS/EC2", "CPUUtilization", "InstanceId", var.instance_id]
          ]
        }
      },
      # Right half of the row: network traffic
      {
        type   = "metric"
        x      = 12
        y      = 0
        width  = 12
        height = 6
        properties = {
          title  = "EC2 Network Traffic"
          region = var.aws_region
          stat   = "Average"
          period = 300
          # One series per direction, both scoped to the same instance.
          metrics = [
            for direction in ["NetworkIn", "NetworkOut"] :
            ["AWS/EC2", direction, "InstanceId", var.instance_id]
          ]
        }
      }
    ]
  })
}
Log Metrics Configuration
# Log Metric Filter
# Publishes a value of 1 to the "<project_name>/Errors" ErrorCount metric
# for every log event in the main log group that matches the pattern "ERROR".
resource "aws_cloudwatch_log_metric_filter" "error_count" {
  name           = format("%s-error-count", var.project_name)
  log_group_name = aws_cloudwatch_log_group.main.name
  pattern        = "ERROR"

  metric_transformation {
    namespace = "${var.project_name}/Errors"
    name      = "ErrorCount"
    value     = "1"
  }
}
# Metric Alarm for Log Errors
# Alarms when more than 10 matching log events are counted within a single
# 5-minute period, and notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "error_count" {
  alarm_name        = "${var.project_name}-error-count"
  alarm_description = "This metric monitors error count in logs"

  # Read the metric identity from the filter that produces it, so the two
  # cannot drift apart and Terraform creates the filter before the alarm.
  metric_name = aws_cloudwatch_log_metric_filter.error_count.metric_transformation[0].name
  namespace   = aws_cloudwatch_log_metric_filter.error_count.metric_transformation[0].namespace

  statistic           = "Sum"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanThreshold"
  threshold           = 10

  # The metric filter only emits datapoints when a log event matches, so
  # "no data" means "no errors". Treat it as healthy rather than leaving
  # the alarm in INSUFFICIENT_DATA.
  treat_missing_data = "notBreaching"

  alarm_actions = [aws_sns_topic.alerts.arn]
}
Variables Configuration
# variables.tf
# Input variables for the CloudWatch configuration. Variables without a
# default must be supplied (for example via terraform.tfvars).

variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-west-2"
}

variable "project_name" {
  description = "Project name"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "dev"
}

variable "instance_id" {
  description = "EC2 instance ID to monitor"
  type        = string
}

variable "alert_email" {
  description = "Email address for alerts"
  type        = string
}

# The two variables below are referenced by the API Gateway latency alarm
# (dimensions ApiName / Stage) and must be declared for the configuration
# to validate.
variable "api_name" {
  description = "API Gateway API name used as the ApiName alarm dimension"
  type        = string
}

variable "api_stage" {
  description = "API Gateway stage name used as the Stage alarm dimension"
  type        = string
}
Best Practices
- Monitoring Strategy
- Define clear monitoring objectives
- Use appropriate metrics and thresholds
- Implement proper alerting
- Create comprehensive dashboards
- Log Management
- Set appropriate retention periods
- Use log metric filters effectively
- Implement structured logging
- Monitor log volume
- Alerting
- Avoid alert fatigue
- Set meaningful thresholds
- Use proper evaluation periods
- Implement escalation policies
- Cost Optimization
- Monitor log storage usage
- Clean up unused metrics
- Use appropriate retention periods
- Consider metric resolution
Composite Alarms
# Primary Alarm
# CPU leg of the composite alarm: average CPU utilization of the monitored
# instance above 80 for two 5-minute periods. It has no alarm_actions of
# its own; it is referenced by name from the composite alarm rule.
resource "aws_cloudwatch_metric_alarm" "primary" {
  alarm_name = "${var.project_name}-primary"

  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"

  # Scope the alarm to the monitored instance. Without a dimension the
  # alarm evaluates the cross-instance aggregate, which is only populated
  # for instances with detailed monitoring enabled.
  dimensions = {
    InstanceId = var.instance_id
  }

  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80
}
# Secondary Alarm
# Memory leg of the composite alarm: average memory usage above 80% for
# two 5-minute periods.
#
# EC2 does not publish a memory metric in the AWS/EC2 namespace. Memory
# usage is collected by the CloudWatch agent, which by default publishes
# mem_used_percent under the CWAgent namespace. The agent must be
# installed and configured on the instance for this alarm to receive data.
resource "aws_cloudwatch_metric_alarm" "secondary" {
  alarm_name = "${var.project_name}-secondary"

  namespace   = "CWAgent"
  metric_name = "mem_used_percent"

  # NOTE(review): the dimension set must match exactly what the agent
  # publishes. This assumes the agent config appends only InstanceId
  # (append_dimensions); adjust if your agent adds further dimensions.
  dimensions = {
    InstanceId = var.instance_id
  }

  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80
}
# Composite Alarm
# Goes into ALARM only while both the primary and the secondary metric
# alarms are in ALARM, then notifies the alerts SNS topic.
resource "aws_cloudwatch_composite_alarm" "composite" {
  alarm_name = format("%s-composite", var.project_name)

  # Builds "ALARM(<primary name>) AND ALARM(<secondary name>)".
  alarm_rule = join(" AND ", [
    format("ALARM(%s)", aws_cloudwatch_metric_alarm.primary.alarm_name),
    format("ALARM(%s)", aws_cloudwatch_metric_alarm.secondary.alarm_name),
  ])

  alarm_actions = [aws_sns_topic.alerts.arn]
}
Anomaly Detection
# Anomaly Detection Alarm
# Alarms when the instance's average CPU utilization rises above the upper
# edge of the anomaly-detection band for two evaluation periods, and
# notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "anomaly" {
  alarm_name          = "${var.project_name}-anomaly"
  alarm_description   = "This metric monitors for anomalous behavior"
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = 2

  # The threshold is not a fixed number; it is the band produced by the
  # metric_query with id "e1".
  threshold_metric_id = "e1"

  alarm_actions = [aws_sns_topic.alerts.arn]

  # e1: expected-value band computed from the raw metric m1.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1)"
    label       = "CPUUtilization (Expected)"
    return_data = true
  }

  # m1: the raw metric being compared against the band.
  metric_query {
    id = "m1"

    # An anomaly-detection alarm needs both the band and the underlying
    # metric to return data; omitting this on m1 leaves the alarm with
    # nothing to compare against the band.
    return_data = true

    metric {
      namespace   = "AWS/EC2"
      metric_name = "CPUUtilization"
      stat        = "Average"
      period      = 300

      dimensions = {
        InstanceId = var.instance_id
      }
    }
  }
}
Deployment Steps
- Initialize Terraform:
terraform init
- Plan the deployment:
terraform plan
- Apply the configuration:
terraform apply
Clean Up
Remove all resources when done:
terraform destroy
Common Use Cases
- Application Monitoring
# Application log group, named "/aws/<project_name>/application";
# log events are retained for 30 days.
resource "aws_cloudwatch_log_group" "application" {
  name              = format("/aws/%s/application", var.project_name)
  retention_in_days = 30
}
# API Gateway latency alarm: average Latency above 1000 for two 5-minute
# periods on the API/stage given by var.api_name and var.api_stage (both
# must be declared as input variables). Notifies the alerts SNS topic.
resource "aws_cloudwatch_metric_alarm" "api_latency" {
  alarm_name        = "${var.project_name}-api-latency"
  alarm_description = "This metric monitors API latency"

  # Which metric to watch
  namespace   = "AWS/ApiGateway"
  metric_name = "Latency"
  dimensions = {
    ApiName = var.api_name
    Stage   = var.api_stage
  }

  # How the metric is evaluated
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 2
  comparison_operator = "GreaterThanThreshold"
  threshold           = 1000

  alarm_actions = [aws_sns_topic.alerts.arn]
}
- Infrastructure Monitoring
# Infrastructure dashboard: a single widget plotting the CPUUtilization
# metric from the EC2, RDS and ElastiCache namespaces (no dimensions).
resource "aws_cloudwatch_dashboard" "infrastructure" {
  dashboard_name = format("%s-infrastructure", var.project_name)

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6
        properties = {
          title  = "System CPU Utilization"
          region = var.aws_region
          stat   = "Average"
          period = 300
          # One CPUUtilization series per service namespace.
          metrics = [
            for service_namespace in ["AWS/EC2", "AWS/RDS", "AWS/ElastiCache"] :
            [service_namespace, "CPUUtilization"]
          ]
        }
      }
    ]
  })
}
Container Monitoring
# ECS Container Insights
# ECS cluster with the containerInsights cluster setting switched on.
resource "aws_ecs_cluster" "main" {
  name = format("%s-cluster", var.project_name)

  setting {
    name  = "containerInsights"
    value = "enabled"
  }
}
# Container Metrics Dashboard
# A single widget plotting the CpuUtilized and MemoryUtilized metrics from
# the ECS/ContainerInsights namespace for the cluster defined in this
# configuration.
resource "aws_cloudwatch_dashboard" "containers" {
  dashboard_name = format("%s-containers", var.project_name)

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6
        properties = {
          title  = "Container Resource Utilization"
          region = var.aws_region
          stat   = "Average"
          period = 300
          # One series per metric, both keyed by the cluster name.
          metrics = [
            for insight_metric in ["CpuUtilized", "MemoryUtilized"] :
            ["ECS/ContainerInsights", insight_metric, "ClusterName", aws_ecs_cluster.main.name]
          ]
        }
      }
    ]
  })
}
Conclusion
This setup provides a comprehensive foundation for configuring CloudWatch monitoring and alerting with Terraform. Remember to:
- Plan your monitoring strategy carefully
- Implement proper alerting thresholds
- Create meaningful dashboards
- Keep your configurations versioned
- Test thoroughly before production deployment
The complete code can be customized based on your specific requirements and use cases.