Monitoring AWS Resources with CloudWatch and Terraform
Learn how to implement comprehensive monitoring for AWS resources using CloudWatch and Terraform, including metrics, logs, alarms, and dashboards
Monitoring AWS Resources with CloudWatch and Terraform
Effective monitoring is crucial for maintaining reliable and performant AWS infrastructure. This guide demonstrates how to implement comprehensive monitoring using CloudWatch and Terraform.
Video Tutorial
Learn more about implementing monitoring with Terraform on AWS in this comprehensive video tutorial:
Prerequisites
- AWS CLI configured with appropriate permissions
- Terraform installed (version 1.0.0 or later)
- Basic understanding of CloudWatch concepts
- Existing AWS resources to monitor
Project Structure
terraform-monitoring/
├── main.tf
├── variables.tf
├── outputs.tf
├── modules/
│ └── monitoring/
│ ├── main.tf
│ ├── variables.tf
│ └── outputs.tf
└── dashboards/
└── main.json
CloudWatch Configuration
Create modules/monitoring/main.tf:
# Log Group
resource "aws_cloudwatch_log_group" "main" {
  # Central application log group, named /aws/<project>/<environment>.
  name              = "/aws/${var.project_name}/${var.environment}"
  retention_in_days = var.log_retention_days # bounded retention avoids indefinite (and billable) log storage

  # Module-wide tags plus a resource-specific Name tag.
  tags = merge(
    var.tags,
    {
      Name = "${var.project_name}-logs"
    }
  )
}
# Metric Filter
resource "aws_cloudwatch_log_metric_filter" "error_count" {
  name = "${var.project_name}-error-count"
  # Space-delimited filter: matches lines whose third field equals ERROR.
  # NOTE(review): assumes logs are written as "timestamp requestid level message"
  # — confirm against the application's actual log format.
  pattern        = "[timestamp, requestid, level = ERROR, message]"
  log_group_name = aws_cloudwatch_log_group.main.name

  metric_transformation {
    name          = "ErrorCount"
    namespace     = "${var.project_name}/Application"
    value         = "1" # each matching line contributes 1 to the metric
    default_value = "0" # publish 0 when nothing matches, so the metric has no gaps
  }
}
# CloudWatch Dashboard
resource "aws_cloudwatch_dashboard" "main" {
  dashboard_name = "${var.project_name}-dashboard"

  # Renders dashboards/main.json. Every ${...} placeholder used in the
  # template MUST have a matching key in this vars map, otherwise
  # `terraform plan` fails with "vars map does not contain key ...".
  dashboard_body = templatefile("${path.module}/../../dashboards/main.json", {
    region         = data.aws_region.current.name # NOTE(review): assumes a data "aws_region" "current" {} block exists elsewhere in this module — confirm
    log_group_name = aws_cloudwatch_log_group.main.name
    project_name   = var.project_name
    asg_name       = var.asg_name # fix: the template's metric widgets reference ${asg_name}, but it was never passed
  })
}
# CPU Utilization Alarm
resource "aws_cloudwatch_metric_alarm" "cpu" {
  # Fires when the ASG's average CPU exceeds the threshold for two
  # consecutive 5-minute periods.
  alarm_name          = "${var.project_name}-cpu-utilization"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 2   # numeric, not the quoted string "2" (pre-0.12 idiom)
  metric_name         = "CPUUtilization"
  namespace           = "AWS/EC2"
  period              = 300 # seconds
  statistic           = "Average"
  threshold           = var.cpu_threshold
  alarm_description   = "Monitor CPU utilization"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    AutoScalingGroupName = var.asg_name
  }
}
# Memory Utilization Alarm
resource "aws_cloudwatch_metric_alarm" "memory" {
  # Memory is not a native EC2 metric; mem_used_percent is published by the
  # CloudWatch agent into the CWAgent namespace.
  alarm_name          = "${var.project_name}-memory-utilization"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 2   # numeric, not the quoted string "2" (pre-0.12 idiom)
  metric_name         = "mem_used_percent"
  namespace           = "CWAgent"
  period              = 300 # seconds
  statistic           = "Average"
  threshold           = var.memory_threshold
  alarm_description   = "Monitor memory utilization"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    AutoScalingGroupName = var.asg_name
  }
}
# Disk Space Alarm
resource "aws_cloudwatch_metric_alarm" "disk" {
  # Fires when free disk space on the root volume drops BELOW the threshold
  # (note the LessThanThreshold operator, unlike the other alarms).
  alarm_name          = "${var.project_name}-disk-space"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 1   # numeric, not the quoted string "1" (pre-0.12 idiom)
  metric_name         = "disk_free"
  namespace           = "CWAgent"
  period              = 300 # seconds
  statistic           = "Average"
  threshold           = var.disk_threshold
  alarm_description   = "Monitor disk space"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # NOTE(review): these dimensions must exactly match what the CloudWatch
  # agent publishes (its append_dimensions/aggregation config); otherwise the
  # alarm stays in INSUFFICIENT_DATA. Confirm against the agent config.
  dimensions = {
    AutoScalingGroupName = var.asg_name
    device               = "xvda1"
    fstype               = "ext4"
    path                 = "/"
  }
}
# API Gateway 4XX Error Alarm
resource "aws_cloudwatch_metric_alarm" "api_4xx" {
  # Fires when the SUM of client errors per 5-minute period exceeds the
  # threshold for five consecutive periods (25 minutes of sustained errors).
  alarm_name          = "${var.project_name}-api-4xx"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 5   # numeric, not the quoted string "5" (pre-0.12 idiom)
  metric_name         = "4XXError"
  namespace           = "AWS/ApiGateway"
  period              = 300 # seconds
  statistic           = "Sum"
  threshold           = var.api_error_threshold
  alarm_description   = "Monitor API Gateway 4XX errors"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    ApiName = var.api_name
    Stage   = var.api_stage
  }
}
# RDS CPU Alarm
resource "aws_cloudwatch_metric_alarm" "rds_cpu" {
  # Fires when the DB instance's average CPU exceeds the threshold for two
  # consecutive 5-minute periods.
  alarm_name          = "${var.project_name}-rds-cpu"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 2   # numeric, not the quoted string "2" (pre-0.12 idiom)
  metric_name         = "CPUUtilization"
  namespace           = "AWS/RDS"
  period              = 300 # seconds
  statistic           = "Average"
  threshold           = var.rds_cpu_threshold
  alarm_description   = "Monitor RDS CPU utilization"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    DBInstanceIdentifier = var.db_instance_id
  }
}
# Composite Alarm
resource "aws_cloudwatch_composite_alarm" "critical" {
  alarm_name        = "${var.project_name}-critical-state"
  alarm_description = "Composite alarm for critical system state"
  alarm_actions     = [aws_sns_topic.critical_alerts.arn]

  # Fires only when the CPU alarm AND the memory alarm are both in ALARM
  # state — routed to the separate critical-alerts topic.
  # Written as a single-line expression: the previous heredoc form embedded
  # leading whitespace and a trailing newline into the rule string.
  alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.cpu.alarm_name}) AND ALARM(${aws_cloudwatch_metric_alarm.memory.alarm_name})"
}
# SNS Topic for Alerts
resource "aws_sns_topic" "alerts" {
  # Standard-severity notification channel. Subscriptions (email, chat, etc.)
  # are presumably attached outside this module — confirm.
  name = "${var.project_name}-alerts"
}
resource "aws_sns_topic" "critical_alerts" {
  # Separate topic so critical pages can be routed/escalated differently
  # from routine alerts.
  name = "${var.project_name}-critical-alerts"
}
# CloudWatch Logs Insights Query
resource "aws_cloudwatch_query_definition" "error_analysis" {
  # Saved Logs Insights query: count of ERROR-level lines per 30-minute
  # bucket over the application log group, newest first.
  name            = "${var.project_name}-error-analysis"
  log_group_names = [aws_cloudwatch_log_group.main.name]
  query_string = <<EOF
fields @timestamp, @message
| filter level == "ERROR"
| stats count(*) as error_count by bin(30m)
| sort @timestamp desc
EOF
}
# Anomaly Detection Alarm
resource "aws_cloudwatch_metric_alarm" "anomaly" {
  # Anomaly-detection alarm: fires when CPU rises above the upper edge of a
  # model-predicted band (2 standard deviations wide) instead of a static threshold.
  alarm_name          = "${var.project_name}-anomaly"
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = 2    # numeric, not the quoted string "2" (pre-0.12 idiom)
  threshold_metric_id = "e1" # compare m1 against the band, not a fixed number
  alarm_description   = "Monitor for anomalous behavior"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # e1: the expected-value band around m1.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1, 2)"
    label       = "CPUUtilization (Expected)"
    return_data = true
  }

  # m1: raw average CPU for the ASG.
  metric_query {
    id          = "m1"
    return_data = true # fix: anomaly-detection alarms require the evaluated metric to return data too, not just the band
    metric {
      metric_name = "CPUUtilization"
      namespace   = "AWS/EC2"
      period      = 300
      stat        = "Average"
      unit        = "Percent"
      dimensions = {
        AutoScalingGroupName = var.asg_name
      }
    }
  }
}
Dashboard Configuration
Create dashboards/main.json:
{
"widgets": [
{
"type": "metric",
"x": 0,
"y": 0,
"width": 12,
"height": 6,
"properties": {
"metrics": [
["AWS/EC2", "CPUUtilization", "AutoScalingGroupName", "${asg_name}"]
],
"period": 300,
"stat": "Average",
"region": "${region}",
"title": "CPU Utilization"
}
},
{
"type": "metric",
"x": 12,
"y": 0,
"width": 12,
"height": 6,
"properties": {
"metrics": [
["CWAgent", "mem_used_percent", "AutoScalingGroupName", "${asg_name}"]
],
"period": 300,
"stat": "Average",
"region": "${region}",
"title": "Memory Utilization"
}
},
{
"type": "log",
"x": 0,
"y": 6,
"width": 24,
"height": 6,
"properties": {
"query": "SOURCE '${log_group_name}' | fields @timestamp, @message\n| filter level == 'ERROR'\n| sort @timestamp desc",
"region": "${region}",
"title": "Error Logs",
"view": "table"
}
}
]
}
Monitoring Best Practices
-
Metric Collection
- Use CloudWatch agent
- Collect custom metrics
- Enable detailed monitoring
- Implement proper aggregation
-
Log Management
- Define retention periods
- Use log insights
- Create metric filters
- Implement structured logging
-
Alerting
- Set appropriate thresholds
- Use composite alarms
- Implement proper notification
- Configure alert routing
-
Dashboard Design
- Group related metrics
- Use appropriate widgets
- Include key performance indicators
- Enable auto-refresh
Advanced Features
- Anomaly Detection
resource "aws_cloudwatch_metric_alarm" "latency_anomaly" {
  # Fires when API Gateway latency rises above the upper edge of a
  # model-predicted band (2 standard deviations wide).
  alarm_name          = "${var.project_name}-latency-anomaly"
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = 2    # numeric, not the quoted string "2" (pre-0.12 idiom)
  threshold_metric_id = "e1" # compare m1 against the band, not a fixed number
  alarm_description   = "Monitor for anomalous latency"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # e1: the expected-latency band around m1.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1, 2)"
    label       = "Latency (Expected)"
    return_data = true
  }

  # m1: raw average latency for the API stage.
  metric_query {
    id          = "m1"
    return_data = true # fix: anomaly-detection alarms require the evaluated metric to return data too, not just the band
    metric {
      metric_name = "Latency"
      namespace   = "AWS/ApiGateway"
      period      = 300
      stat        = "Average"
      dimensions = {
        ApiName = var.api_name
        Stage   = var.api_stage
      }
    }
  }
}
- Metric Math
resource "aws_cloudwatch_metric_alarm" "error_rate" {
  # Metric-math alarm: fires when server errors exceed 5% of all requests
  # for two consecutive 5-minute periods.
  alarm_name          = "${var.project_name}-error-rate"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 2 # numeric, not the quoted string "2" (pre-0.12 idiom)
  threshold           = 5 # percent
  alarm_description   = "Monitor error rate percentage"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # e1: error percentage. Only the expression returns data; m1/m2 are inputs.
  # When m1 is 0 (no traffic) the division produces no data point for that period.
  metric_query {
    id          = "e1"
    expression  = "m2/m1*100"
    label       = "Error Rate"
    return_data = true
  }

  # m1: total requests per period.
  metric_query {
    id = "m1"
    metric {
      metric_name = "RequestCount"
      namespace   = "AWS/ApiGateway"
      period      = 300
      stat        = "Sum"
      dimensions = {
        ApiName = var.api_name
        Stage   = var.api_stage
      }
    }
  }

  # m2: server errors per period.
  metric_query {
    id = "m2"
    metric {
      metric_name = "5XXError"
      namespace   = "AWS/ApiGateway"
      period      = 300
      stat        = "Sum"
      dimensions = {
        ApiName = var.api_name
        Stage   = var.api_stage
      }
    }
  }
}
- Synthetic Monitoring
resource "aws_synthetics_canary" "api" {
  # Synthetic check hitting the API every 5 minutes; run artifacts (logs,
  # screenshots) land in the monitoring bucket.
  name                 = "${var.project_name}-api-canary"
  artifact_s3_location = "s3://${aws_s3_bucket.monitoring.id}/canary-artifacts/"
  execution_role_arn   = aws_iam_role.canary.arn
  handler              = "index.handler"
  runtime_version      = "syn-nodejs-puppeteer-3.9"

  schedule {
    expression = "rate(5 minutes)"
  }

  # fix: the AWS provider has no nested "code" block for canaries — the
  # script is supplied via zip_file (local path) or s3_bucket/s3_key. The
  # zip must contain nodejs/node_modules/index.js exporting `handler`.
  zip_file = "${path.module}/canary.zip"
}
Integration with Other Services
- X-Ray Tracing
resource "aws_xray_sampling_rule" "main" {
  # Catch-all sampling rule: trace 5% of requests plus a 1 request/second
  # reservoir, across all hosts, methods, services, and paths.
  rule_name      = "${var.project_name}-sampling"
  priority       = 1000 # low priority so more specific rules can take precedence
  reservoir_size = 1
  fixed_rate     = 0.05
  host           = "*"
  http_method    = "*"
  service_name   = "*"
  service_type   = "*"
  url_path       = "*"
  version        = 1
}
- ServiceLens Integration
resource "aws_apigatewayv2_api" "main" {
  # HTTP API using "quick create": `target` wires a default route and stage
  # straight to the Lambda function.
  name          = "${var.project_name}-api"
  protocol_type = "HTTP"
  # NOTE(review): wildcard CORS on all origins/methods/headers — tighten for production.
  cors_configuration {
    allow_origins = ["*"]
    allow_methods = ["*"]
    allow_headers = ["*"]
  }
  target = aws_lambda_function.main.arn
  # NOTE(review): quick-create does not grant API Gateway permission to invoke
  # the function; an aws_lambda_permission resource is presumably defined
  # elsewhere — confirm.
}
resource "aws_lambda_function" "main" {
  # NOTE(review): relative path resolves against the working directory, not
  # this module — confirm the zip lives where plan/apply is run.
  filename      = "function.zip"
  function_name = "${var.project_name}-function"
  role          = aws_iam_role.lambda.arn
  handler       = "index.handler"
  runtime       = "nodejs18.x"
  # Active tracing sends traces to X-Ray, enabling the ServiceLens views
  # this section describes.
  tracing_config {
    mode = "Active"
  }
}
Conclusion
You’ve learned how to implement comprehensive monitoring using CloudWatch and Terraform. This setup provides:
- Real-time monitoring
- Automated alerting
- Performance insights
- Troubleshooting capabilities
Remember to:
- Monitor key metrics
- Configure appropriate alerts
- Review dashboards regularly
- Optimize monitoring costs