Managing AWS EMR with Terraform

A comprehensive guide to setting up Amazon EMR (Elastic MapReduce) clusters using Terraform Infrastructure as Code

Amazon EMR (Elastic MapReduce) is a managed cloud platform for processing large amounts of data with open-source tools such as Apache Spark, Hive, and Hadoop. This guide walks through provisioning an EMR cluster with Terraform.

Prerequisites

  • AWS CLI configured
  • Terraform installed
  • Basic understanding of big data processing
  • Data processing requirements defined

Project Structure

aws-emr-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
└── terraform.tfvars
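
The structure above includes a terraform.tfvars file for variable values; a minimal version might look like the following (all values are placeholders for your own environment):

# terraform.tfvars
aws_region    = "us-west-2"
project_name  = "my-emr-project"
environment   = "dev"
vpc_id        = "vpc-0123456789abcdef0"
subnet_id     = "subnet-0123456789abcdef0"
sns_topic_arn = "arn:aws:sns:us-west-2:123456789012:emr-alerts"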

Basic EMR Configuration

# main.tf
provider "aws" {
  region = var.aws_region
}

# EMR Cluster
resource "aws_emr_cluster" "main" {
  name          = "${var.project_name}-cluster"
  release_label = "emr-6.10.0"
  applications  = ["Spark", "Hive", "Hadoop"]

  service_role = aws_iam_role.emr_service_role.arn

  termination_protection            = false
  keep_job_flow_alive_when_no_steps = true

  ec2_attributes {
    subnet_id                         = var.subnet_id
    emr_managed_master_security_group = aws_security_group.master.id
    emr_managed_slave_security_group  = aws_security_group.slave.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
  }

  master_instance_group {
    instance_type = "m5.xlarge"
  }

  core_instance_group {
    instance_type  = "m5.xlarge"
    instance_count = 2

    ebs_config {
      size                 = 40
      type                 = "gp2"
      volumes_per_instance = 1
    }
  }

  tags = {
    Environment = var.environment
  }

  bootstrap_action {
    path = "s3://${aws_s3_bucket.scripts.id}/bootstrap.sh"
    name = "Custom Bootstrap Action"
  }

  configurations_json = jsonencode([
    {
      Classification = "spark-defaults"
      Properties = {
        "spark.driver.memory"      = "5g"
        "spark.executor.memory"    = "5g"
        "spark.executor.instances" = "2"
      }
    }
  ])
}

# S3 Bucket for Scripts and Logs
resource "aws_s3_bucket" "scripts" {
  bucket = "${var.project_name}-emr-scripts"

  tags = {
    Environment = var.environment
  }
}

# Security Groups
resource "aws_security_group" "master" {
  name        = "${var.project_name}-emr-master"
  description = "Security group for EMR master node"
  vpc_id      = var.vpc_id

  # SSH access for administration; restrict this CIDR range in production
  ingress {
    from_port   = 22
    to_port     = 22
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}

resource "aws_security_group" "slave" {
  name        = "${var.project_name}-emr-slave"
  description = "Security group for EMR slave nodes"
  vpc_id      = var.vpc_id

  ingress {
    from_port       = 0
    to_port         = 0
    protocol        = "-1"
    security_groups = [aws_security_group.master.id]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}
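
The cluster's bootstrap_action above points at bootstrap.sh in the scripts bucket, but nothing uploads it yet. One way to manage the script together with the infrastructure is an aws_s3_object resource; the local scripts/bootstrap.sh path below is an assumption for illustration:

# Upload the bootstrap script referenced by the cluster's bootstrap_action
resource "aws_s3_object" "bootstrap" {
  bucket = aws_s3_bucket.scripts.id
  key    = "bootstrap.sh"
  source = "${path.module}/scripts/bootstrap.sh" # assumed local path
  etag   = filemd5("${path.module}/scripts/bootstrap.sh")
}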

IAM Configuration

# EMR Service Role
resource "aws_iam_role" "emr_service_role" {
  name = "${var.project_name}-emr-service-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "elasticmapreduce.amazonaws.com"
        }
      }
    ]
  })
}

resource "aws_iam_role_policy_attachment" "emr_service_role" {
  role       = aws_iam_role.emr_service_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole"
}

# EMR Instance Profile
resource "aws_iam_role" "emr_profile_role" {
  name = "${var.project_name}-emr-profile-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "ec2.amazonaws.com"
        }
      }
    ]
  })
}

resource "aws_iam_instance_profile" "emr_profile" {
  name = "${var.project_name}-emr-profile"
  role = aws_iam_role.emr_profile_role.name
}

resource "aws_iam_role_policy_attachment" "emr_profile" {
  role       = aws_iam_role.emr_profile_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
}
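
The custom automatic scaling policy shown later in this guide requires an EMR autoscaling role, referenced through the cluster's autoscaling_role argument. A role equivalent to the default EMR_AutoScaling_DefaultRole can be managed in Terraform as follows:

# EMR Autoscaling Role (needed for custom automatic scaling policies)
resource "aws_iam_role" "emr_autoscaling_role" {
  name = "${var.project_name}-emr-autoscaling-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = [
            "elasticmapreduce.amazonaws.com",
            "application-autoscaling.amazonaws.com"
          ]
        }
      }
    ]
  })
}

resource "aws_iam_role_policy_attachment" "emr_autoscaling_role" {
  role       = aws_iam_role.emr_autoscaling_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforAutoScalingRole"
}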

Variables Configuration

# variables.tf
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-west-2"
}

variable "project_name" {
  description = "Project name"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "dev"
}

variable "vpc_id" {
  description = "VPC ID"
  type        = string
}

variable "subnet_id" {
  description = "Subnet ID"
  type        = string
}
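
The project structure also lists an outputs.tf that has not been shown; a minimal version might expose the cluster ID and the master node's DNS name:

# outputs.tf
output "cluster_id" {
  description = "ID of the EMR cluster"
  value       = aws_emr_cluster.main.id
}

output "master_public_dns" {
  description = "Public DNS name of the EMR master node"
  value       = aws_emr_cluster.main.master_public_dns
}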

Best Practices

  1. Cluster Management

    • Use appropriate instance types
    • Configure proper scaling
    • Implement proper monitoring
    • Regular maintenance
  2. Security

    • Implement proper IAM roles
    • Use security groups effectively
    • Enable encryption
    • Regular security reviews
  3. Cost Optimization

    • Use spot instances when possible
    • Implement auto-termination (see the sketch after this list)
    • Monitor cluster usage
    • Clean up unused resources
  4. Performance

    • Optimize cluster configuration
    • Use appropriate storage
    • Monitor job performance
    • Regular performance reviews
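
As referenced in the cost optimization item above, EMR can shut down idle clusters automatically on recent release labels such as the emr-6.10.0 used in this guide. A minimal sketch, with a purely illustrative one-hour timeout:

resource "aws_emr_cluster" "main" {
  # ... configuration as shown earlier ...

  # Terminate the cluster after one hour of inactivity
  auto_termination_policy {
    idle_timeout = 3600
  }
}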

Instance Groups Configuration

# EMR Cluster with Multiple Instance Groups
resource "aws_emr_cluster" "advanced" {
  name          = "${var.project_name}-advanced-cluster"
  release_label = "emr-6.10.0"
  applications  = ["Spark", "Hive", "Hadoop"]

  service_role     = aws_iam_role.emr_service_role.arn
  autoscaling_role = aws_iam_role.emr_autoscaling_role.arn # required for the custom scaling policy below

  # Same EC2 attributes as the basic cluster
  ec2_attributes {
    subnet_id                         = var.subnet_id
    emr_managed_master_security_group = aws_security_group.master.id
    emr_managed_slave_security_group  = aws_security_group.slave.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
  }

  master_instance_group {
    instance_type = "m5.xlarge"
    
    ebs_config {
      size                 = 100
      type                 = "gp2"
      volumes_per_instance = 1
    }
  }

  core_instance_group {
    instance_type  = "m5.2xlarge"
    instance_count = 3

    ebs_config {
      size                 = 200
      type                 = "gp2"
      volumes_per_instance = 2
    }

    bid_price = "0.30"  # For Spot instances
  }

}

# Task nodes cannot be declared inside aws_emr_cluster; the AWS provider
# manages them with the separate aws_emr_instance_group resource.
resource "aws_emr_instance_group" "task" {
  cluster_id     = aws_emr_cluster.advanced.id
  name           = "${var.project_name}-task-group"
  instance_type  = "m5.xlarge"
  instance_count = 2

  bid_price = "0.20"

  ebs_config {
    size                 = 100
    type                 = "gp2"
    volumes_per_instance = 1
  }

  # Custom scaling rules require autoscaling_role to be set on the cluster,
  # as configured above.
  autoscaling_policy = jsonencode({
    Constraints = {
      MinCapacity = 1
      MaxCapacity = 5
    }
    Rules = [
      {
        Name        = "ScaleOutMemoryPercentage"
        Description = "Scale out if YARNMemoryAvailablePercentage is less than 15"
        Action = {
          SimpleScalingPolicyConfiguration = {
            AdjustmentType    = "CHANGE_IN_CAPACITY"
            ScalingAdjustment = 1
            CoolDown          = 300
          }
        }
        Trigger = {
          CloudWatchAlarmDefinition = {
            ComparisonOperator = "LESS_THAN"
            EvaluationPeriods  = 1
            MetricName         = "YARNMemoryAvailablePercentage"
            Namespace          = "AWS/ElasticMapReduce"
            Period             = 300
            Threshold          = 15
            Statistic          = "AVERAGE"
          }
        }
      }
    ]
  })
}
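
As a simpler alternative to the custom scaling rules above, EMR managed scaling lets the service adjust capacity between limits you define. It is configured with a separate resource; the limits below are illustrative:

# EMR managed scaling as an alternative to custom autoscaling rules
resource "aws_emr_managed_scaling_policy" "advanced" {
  cluster_id = aws_emr_cluster.advanced.id

  compute_limits {
    unit_type                       = "Instances"
    minimum_capacity_units          = 2
    maximum_capacity_units          = 10
    maximum_ondemand_capacity_units = 5
    maximum_core_capacity_units     = 10
  }
}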

Step Configuration

# EMR Steps
# The AWS provider does not expose a standalone step resource; steps are
# declared with step blocks on the aws_emr_cluster resource itself.
resource "aws_emr_cluster" "main" {
  # ... cluster configuration shown earlier ...

  step {
    name              = "Spark Application Step"
    action_on_failure = "CONTINUE"

    hadoop_jar_step {
      jar = "command-runner.jar"
      args = [
        "spark-submit",
        "--class", "com.example.SparkApp",
        "--master", "yarn",
        "s3://${aws_s3_bucket.scripts.id}/app.jar",
        "arg1",
        "arg2"
      ]
    }
  }

  step {
    name              = "Hive Script Step"
    action_on_failure = "CONTINUE"

    hadoop_jar_step {
      jar = "command-runner.jar"
      args = [
        "hive-script",
        "--run-hive-script",
        "--args",
        "-f",
        "s3://${aws_s3_bucket.scripts.id}/query.hql"
      ]
    }
  }
}
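
If steps are also submitted outside of Terraform (for example, by an orchestration tool calling the EMR API), they will show up as drift on the next plan. A common pattern, sketched here rather than taken from the original configuration, is to ignore changes to the step list:

resource "aws_emr_cluster" "main" {
  # ... configuration as shown earlier ...

  lifecycle {
    ignore_changes = [step]
  }
}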

Deployment Steps

  1. Initialize Terraform:
terraform init
  2. Plan the deployment:
terraform plan
  3. Apply the configuration:
terraform apply

Clean Up

Remove all resources when done:

terraform destroy

Common Use Cases

  1. Data Processing Pipeline
resource "aws_emr_cluster" "pipeline" {
  name          = "${var.project_name}-pipeline"
  release_label = "emr-6.10.0"
  applications  = ["Spark", "Hive"]

  # ... other configuration ...

  step {
    action_on_failure = "CONTINUE"
    name              = "Setup Hadoop Debugging"

    hadoop_jar_step {
      jar  = "command-runner.jar"
      args = ["state-pusher-script"]
    }
  }

  step {
    action_on_failure = "CANCEL_AND_WAIT"
    name              = "Data Processing Step"

    hadoop_jar_step {
      jar  = "command-runner.jar"
      args = [
        "spark-submit",
        "--class", "com.example.DataProcessor",
        "s3://${aws_s3_bucket.scripts.id}/processor.jar"
      ]
    }
  }
}
  2. Scheduled Processing
resource "aws_cloudwatch_event_rule" "emr_schedule" {
  name                = "${var.project_name}-emr-schedule"
  description         = "Schedule for EMR cluster creation"
  schedule_expression = "cron(0 0 * * ? *)" # Daily at 00:00 UTC
}

resource "aws_cloudwatch_event_target" "emr_target" {
  rule      = aws_cloudwatch_event_rule.emr_schedule.name
  target_id = "EMRClusterCreation"
  arn       = aws_lambda_function.create_emr.arn

  input = jsonencode({
    cluster_name = "${var.project_name}-scheduled"
    instance_count = 3
    instance_type = "m5.xlarge"
  })
}
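
For the schedule to work end to end, EventBridge must also be allowed to invoke the (assumed) create_emr Lambda function:

# Allow the EventBridge rule to invoke the cluster-creation Lambda function
resource "aws_lambda_permission" "allow_eventbridge" {
  statement_id  = "AllowExecutionFromEventBridge"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.create_emr.function_name
  principal     = "events.amazonaws.com"
  source_arn    = aws_cloudwatch_event_rule.emr_schedule.arn
}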

Monitoring Configuration

# CloudWatch Dashboard
resource "aws_cloudwatch_dashboard" "emr" {
  dashboard_name = "${var.project_name}-emr-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["AWS/ElasticMapReduce", "IsIdle", "JobFlowId", aws_emr_cluster.main.id],
            ["AWS/ElasticMapReduce", "AppsRunning", "JobFlowId", aws_emr_cluster.main.id]
          ]
          period = 300
          stat   = "Average"
          region = var.aws_region
          title  = "EMR Cluster Metrics"
        }
      }
    ]
  })
}

# CloudWatch Alarms
resource "aws_cloudwatch_metric_alarm" "cluster_error" {
  alarm_name          = "${var.project_name}-cluster-error"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"
  metric_name         = "MRUnhealthyNodes"
  namespace           = "AWS/ElasticMapReduce"
  period             = "300"
  statistic          = "Average"
  threshold          = "0"
  alarm_description  = "This metric monitors unhealthy EMR nodes"
  alarm_actions      = [var.sns_topic_arn]

  dimensions = {
    JobFlowId = aws_emr_cluster.main.id
  }
}
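
CloudWatch metrics cover cluster health, but step and application logs stay on the cluster nodes unless a log destination is configured. Setting log_uri on the cluster persists logs to S3; reusing the scripts bucket here is just one option:

resource "aws_emr_cluster" "main" {
  # ... configuration as shown earlier ...

  # Persist bootstrap, step, and node logs to S3 for debugging
  log_uri = "s3://${aws_s3_bucket.scripts.id}/logs/"
}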

Conclusion

This setup provides a comprehensive foundation for deploying EMR using Terraform. Remember to:

  • Plan your cluster architecture carefully
  • Implement proper security measures
  • Monitor cluster performance
  • Keep your configurations versioned
  • Test thoroughly before production deployment

The complete code can be customized based on your specific requirements and use cases.