Managing AWS EMR with Terraform
A comprehensive guide to setting up Amazon EMR (Elastic MapReduce) clusters using Terraform Infrastructure as Code
Managing AWS EMR with Terraform
Amazon EMR (Elastic MapReduce) is a cloud big data platform for processing vast amounts of data using open source tools. This guide shows how to set up EMR using Terraform.
Prerequisites
- AWS CLI configured
- Terraform installed
- Basic understanding of big data processing
- Data processing requirements defined
Project Structure
aws-emr-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
└── terraform.tfvars
Basic EMR Configuration
# main.tf
provider "aws" {
region = var.aws_region
}
# EMR Cluster
resource "aws_emr_cluster" "main" {
  # Primary EMR cluster running Spark, Hive, and Hadoop on EMR 6.10.0.
  name          = "${var.project_name}-cluster"
  release_label = "emr-6.10.0"
  applications  = ["Spark", "Hive", "Hadoop"]
  service_role  = aws_iam_role.emr_service_role.arn

  # Persist cluster and step logs to the scripts/logs bucket; without
  # log_uri, EMR logs are lost when the cluster terminates.
  log_uri = "s3://${aws_s3_bucket.scripts.id}/logs/"

  termination_protection            = false
  keep_job_flow_alive_when_no_steps = true

  # Network placement, EMR-managed security groups, and the EC2
  # instance profile used by cluster nodes.
  ec2_attributes {
    subnet_id                         = var.subnet_id
    emr_managed_master_security_group = aws_security_group.master.id
    emr_managed_slave_security_group  = aws_security_group.slave.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
  }

  master_instance_group {
    instance_type = "m5.xlarge"
  }

  core_instance_group {
    instance_type  = "m5.xlarge"
    instance_count = 2

    ebs_config {
      size                 = 40 # GiB per volume; numeric, not a string
      type                 = "gp2"
      volumes_per_instance = 1
    }
  }

  tags = {
    Environment = var.environment
  }

  # Runs on every node at launch before applications start.
  bootstrap_action {
    path = "s3://${aws_s3_bucket.scripts.id}/bootstrap.sh"
    name = "Custom Bootstrap Action"
  }

  # Spark defaults applied cluster-wide.
  configurations_json = jsonencode([
    {
      Classification = "spark-defaults"
      Properties = {
        "spark.driver.memory"      = "5g"
        "spark.executor.memory"    = "5g"
        "spark.executor.instances" = "2"
      }
    }
  ])
}
# S3 Bucket for Scripts and Logs
resource "aws_s3_bucket" "scripts" {
  # Holds bootstrap scripts and job artifacts referenced by the cluster.
  # NOTE(review): S3 bucket names are globally unique -- verify that
  # "${var.project_name}-emr-scripts" is not already taken.
  bucket = "${var.project_name}-emr-scripts"

  tags = {
    Environment = var.environment
  }
}
# Security Groups
resource "aws_security_group" "master" {
  # Security group for the EMR master node.
  name        = "${var.project_name}-emr-master"
  description = "Security group for EMR master node"
  vpc_id      = var.vpc_id

  # EMR injects cross-referencing rules into its managed security
  # groups; without this flag those rules block `terraform destroy`.
  revoke_rules_on_delete = true

  # SSH access to the master node.
  # NOTE(review): 0.0.0.0/0 exposes SSH to the entire internet --
  # restrict to a trusted CIDR (bastion/VPN range) before production use.
  ingress {
    from_port   = 22
    to_port     = 22
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
  }

  # Allow all outbound traffic.
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}
resource "aws_security_group" "slave" {
  # Security group for EMR core/task (worker) nodes.
  name        = "${var.project_name}-emr-slave"
  description = "Security group for EMR slave nodes"
  vpc_id      = var.vpc_id

  # EMR injects cross-referencing rules into its managed security
  # groups; without this flag those rules block `terraform destroy`.
  revoke_rules_on_delete = true

  # Accept all traffic originating from the master security group.
  ingress {
    from_port       = 0
    to_port         = 0
    protocol        = "-1"
    security_groups = [aws_security_group.master.id]
  }

  # Allow all outbound traffic.
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}
IAM Configuration
# EMR Service Role
resource "aws_iam_role" "emr_service_role" {
  # Role assumed by the EMR control plane to manage cluster resources.
  name = "${var.project_name}-emr-service-role"

  # Trust policy: only the EMR service may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Action    = "sts:AssumeRole"
      Principal = { Service = "elasticmapreduce.amazonaws.com" }
    }]
  })
}
# Attach the AWS-managed EMR service policy to the service role.
# NOTE(review): AmazonElasticMapReduceRole is the legacy managed policy;
# AWS now steers new clusters toward AmazonEMRServicePolicy_v2, which
# requires additional cluster tagging -- confirm before migrating.
resource "aws_iam_role_policy_attachment" "emr_service_role" {
  role       = aws_iam_role.emr_service_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole"
}
# EMR Instance Profile
resource "aws_iam_role" "emr_profile_role" {
  # Role assumed by the EC2 instances that make up the cluster.
  name = "${var.project_name}-emr-profile-role"

  # Trust policy: only EC2 may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Action    = "sts:AssumeRole"
      Principal = { Service = "ec2.amazonaws.com" }
    }]
  })
}
# Instance profile that binds the EC2 role to the cluster's nodes
# (referenced from ec2_attributes.instance_profile).
resource "aws_iam_instance_profile" "emr_profile" {
  name = "${var.project_name}-emr-profile"
  role = aws_iam_role.emr_profile_role.name
}
# Grant cluster EC2 instances the standard EMR-for-EC2 permissions.
resource "aws_iam_role_policy_attachment" "emr_profile" {
  role       = aws_iam_role.emr_profile_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
}
Variables Configuration
# variables.tf
variable "aws_region" {
  description = "AWS region to deploy into"
  type        = string
  default     = "us-west-2"
}

variable "project_name" {
  description = "Project name used as a prefix for all resource names"
  type        = string
}

variable "environment" {
  description = "Environment name (e.g. dev, staging, prod)"
  type        = string
  default     = "dev"
}

variable "vpc_id" {
  description = "ID of the VPC in which the EMR security groups are created"
  type        = string
}

variable "subnet_id" {
  description = "ID of the subnet in which the EMR cluster runs"
  type        = string
}

# Referenced by the unhealthy-nodes CloudWatch alarm's alarm_actions;
# previously used but never declared, which fails `terraform validate`.
variable "sns_topic_arn" {
  description = "ARN of the SNS topic that receives CloudWatch alarm notifications"
  type        = string
}
Best Practices
1. **Cluster Management**
   - Use appropriate instance types
   - Configure proper scaling
   - Implement proper monitoring
   - Perform regular maintenance
2. **Security**
   - Implement proper IAM roles
   - Use security groups effectively
   - Enable encryption
   - Conduct regular security reviews
3. **Cost Optimization**
   - Use Spot Instances when possible
   - Implement auto-termination
   - Monitor cluster usage
   - Clean up unused resources
4. **Performance**
   - Optimize cluster configuration
   - Use appropriate storage
   - Monitor job performance
   - Conduct regular performance reviews
Instance Groups Configuration
# EMR Cluster with Multiple Instance Groups
resource "aws_emr_cluster" "advanced" {
  # EMR cluster demonstrating master/core/task instance groups with
  # Spot pricing and instance-group autoscaling.
  name          = "${var.project_name}-advanced-cluster"
  release_label = "emr-6.10.0"
  applications  = ["Spark", "Hive", "Hadoop"]
  service_role  = aws_iam_role.emr_service_role.arn

  # Required: EMR cannot launch without network placement, managed
  # security groups, and the node instance profile.
  ec2_attributes {
    subnet_id                         = var.subnet_id
    emr_managed_master_security_group = aws_security_group.master.id
    emr_managed_slave_security_group  = aws_security_group.slave.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
  }

  master_instance_group {
    instance_type = "m5.xlarge"

    ebs_config {
      size                 = 100
      type                 = "gp2"
      volumes_per_instance = 1
    }
  }

  core_instance_group {
    instance_type  = "m5.2xlarge"
    instance_count = 3
    bid_price      = "0.30" # Spot instances: max hourly bid in USD

    ebs_config {
      size                 = 200
      type                 = "gp2"
      volumes_per_instance = 2
    }
  }

  task_instance_group {
    instance_type  = "m5.xlarge"
    instance_count = 2
    bid_price      = "0.20" # Spot instances

    ebs_config {
      size                 = 100
      type                 = "gp2"
      volumes_per_instance = 1
    }

    # Scale out by one node (bounded 1..5) when available YARN memory
    # drops below 15% for one 5-minute period.
    # NOTE(review): instance-group autoscaling also requires the cluster
    # `autoscaling_role` argument (e.g. EMR_AutoScaling_DefaultRole) --
    # define and set that role before applying this policy.
    autoscaling_policy = jsonencode({
      Constraints = {
        MinCapacity = 1
        MaxCapacity = 5
      }
      Rules = [
        {
          Name        = "ScaleOutMemoryPercentage"
          Description = "Scale out if YARNMemoryAvailablePercentage is less than 15"
          Action = {
            SimpleScalingPolicyConfiguration = {
              AdjustmentType    = "CHANGE_IN_CAPACITY"
              ScalingAdjustment = 1
              CoolDown          = 300
            }
          }
          Trigger = {
            CloudWatchAlarmDefinition = {
              ComparisonOperator = "LESS_THAN"
              EvaluationPeriods  = 1
              MetricName         = "YARNMemoryAvailablePercentage"
              Namespace          = "AWS/ElasticMapReduce"
              Period             = 300
              Threshold          = 15
              Statistic          = "AVERAGE"
            }
          }
        }
      ]
    })
  }
}
Step Configuration
# EMR Steps
resource "aws_emr_step" "spark_step" {
  # Submit a Spark application from the scripts bucket via
  # command-runner.jar (executes spark-submit on the master node).
  name              = "Spark Application Step"
  cluster_id        = aws_emr_cluster.main.id
  action_on_failure = "CONTINUE"

  hadoop_jar_step {
    jar = "command-runner.jar"
    args = concat(
      ["spark-submit", "--class", "com.example.SparkApp", "--master", "yarn"],
      ["s3://${aws_s3_bucket.scripts.id}/app.jar", "arg1", "arg2"],
    )
  }
}
resource "aws_emr_step" "hive_step" {
  # Run a Hive script stored in S3 using EMR's hive-script command
  # through command-runner.jar.
  name              = "Hive Script Step"
  cluster_id        = aws_emr_cluster.main.id
  action_on_failure = "CONTINUE"

  hadoop_jar_step {
    jar = "command-runner.jar"
    args = concat(
      ["hive-script", "--run-hive-script", "--args"],
      ["-f", "s3://${aws_s3_bucket.scripts.id}/query.hql"],
    )
  }
}
Deployment Steps
- Initialize Terraform:
terraform init
- Plan the deployment:
terraform plan
- Apply the configuration:
terraform apply
Clean Up
Remove all resources when done:
terraform destroy
Common Use Cases
- Data Processing Pipeline
resource "aws_emr_cluster" "pipeline" {
  # Example: a cluster with inline steps forming a processing pipeline.
  name          = "${var.project_name}-pipeline"
  release_label = "emr-6.10.0"
  applications  = ["Spark", "Hive"]

  # ... other configuration ...

  # Enable the EMR debugging console for this cluster's steps.
  step {
    name              = "Setup Hadoop Debugging"
    action_on_failure = "CONTINUE"

    hadoop_jar_step {
      jar  = "command-runner.jar"
      args = ["state-pusher-script"]
    }
  }

  # Main processing job; on failure, cancel remaining steps and keep
  # the cluster waiting for inspection.
  step {
    name              = "Data Processing Step"
    action_on_failure = "CANCEL_AND_WAIT"

    hadoop_jar_step {
      jar = "command-runner.jar"
      args = [
        "spark-submit",
        "--class", "com.example.DataProcessor",
        "s3://${aws_s3_bucket.scripts.id}/processor.jar",
      ]
    }
  }
}
- Scheduled Processing
resource "aws_cloudwatch_event_rule" "emr_schedule" {
  name        = "${var.project_name}-emr-schedule"
  description = "Schedule for EMR cluster creation"

  # Fires once per day at 00:00 UTC.
  schedule_expression = "cron(0 0 * * ? *)"
}
resource "aws_cloudwatch_event_target" "emr_target" {
  rule      = aws_cloudwatch_event_rule.emr_schedule.name
  target_id = "EMRClusterCreation"

  # NOTE(review): aws_lambda_function.create_emr is not defined in this
  # file -- it must exist elsewhere in the configuration.
  arn = aws_lambda_function.create_emr.arn

  # Payload handed to the Lambda that creates the scheduled cluster.
  input = jsonencode({
    cluster_name   = "${var.project_name}-scheduled"
    instance_count = 3
    instance_type  = "m5.xlarge"
  })
}
Monitoring Configuration
# CloudWatch Dashboard
resource "aws_cloudwatch_dashboard" "emr" {
  dashboard_name = "${var.project_name}-emr-dashboard"

  # Single 12x6 widget plotting idle state and running-application
  # count for the main cluster, averaged over 5-minute periods.
  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6
        properties = {
          title  = "EMR Cluster Metrics"
          region = var.aws_region
          stat   = "Average"
          period = 300
          metrics = [
            ["AWS/ElasticMapReduce", "IsIdle", "JobFlowId", aws_emr_cluster.main.id],
            ["AWS/ElasticMapReduce", "AppsRunning", "JobFlowId", aws_emr_cluster.main.id],
          ]
        }
      }
    ]
  })
}
# CloudWatch Alarms
resource "aws_cloudwatch_metric_alarm" "cluster_error" {
  alarm_name        = "${var.project_name}-cluster-error"
  alarm_description = "This metric monitors unhealthy EMR nodes"

  # Fire as soon as any node in the main cluster reports unhealthy
  # (average > 0 over a single 5-minute period).
  namespace           = "AWS/ElasticMapReduce"
  metric_name         = "MRUnhealthyNodes"
  statistic           = "Average"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "0"
  period              = "300"
  evaluation_periods  = "1"

  dimensions = {
    JobFlowId = aws_emr_cluster.main.id
  }

  # NOTE(review): requires a declared `sns_topic_arn` input variable.
  alarm_actions = [var.sns_topic_arn]
}
Conclusion
This setup provides a comprehensive foundation for deploying EMR using Terraform. Remember to:
- Plan your cluster architecture carefully
- Implement proper security measures
- Monitor cluster performance
- Keep your configurations versioned
- Test thoroughly before production deployment
The complete code can be customized based on your specific requirements and use cases.