Managing AWS EMR with Terraform

A comprehensive guide to setting up Amazon EMR (Elastic MapReduce) clusters using Terraform Infrastructure as Code

Amazon EMR (Elastic MapReduce) is a managed cloud platform for processing large amounts of data with open-source tools such as Apache Spark, Hive, and Hadoop. This guide walks through provisioning an EMR cluster with Terraform.

Prerequisites

  • AWS CLI configured
  • Terraform installed
  • Basic understanding of big data processing
  • Data processing requirements defined

Project Structure

aws-emr-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
└── terraform.tfvars
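
The structure above includes a terraform.tfvars file for variable values; a minimal version might look like the following (all values are placeholders for your own environment):

# terraform.tfvars
aws_region    = "us-west-2"
project_name  = "my-emr-project"
environment   = "dev"
vpc_id        = "vpc-0123456789abcdef0"
subnet_id     = "subnet-0123456789abcdef0"
sns_topic_arn = "arn:aws:sns:us-west-2:123456789012:emr-alerts"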

Basic EMR Configuration

# main.tf
provider "aws" {
  region = var.aws_region
}

# EMR Cluster
resource "aws_emr_cluster" "main" {
  name          = "${var.project_name}-cluster"
  release_label = "emr-6.10.0"
  applications  = ["Spark", "Hive", "Hadoop"]

  service_role = aws_iam_role.emr_service_role.arn

  termination_protection            = false
  keep_job_flow_alive_when_no_steps = true

  ec2_attributes {
    subnet_id                         = var.subnet_id
    emr_managed_master_security_group = aws_security_group.master.id
    emr_managed_slave_security_group  = aws_security_group.slave.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
  }

  master_instance_group {
    instance_type = "m5.xlarge"
  }

  core_instance_group {
    instance_type  = "m5.xlarge"
    instance_count = 2

    ebs_config {
      size                 = 40
      type                 = "gp2"
      volumes_per_instance = 1
    }
  }

  tags = {
    Environment = var.environment
  }

  bootstrap_action {
    path = "s3://${aws_s3_bucket.scripts.id}/bootstrap.sh"
    name = "Custom Bootstrap Action"
  }

  configurations_json = jsonencode([
    {
      Classification = "spark-defaults"
      Properties = {
        "spark.driver.memory"      = "5g"
        "spark.executor.memory"    = "5g"
        "spark.executor.instances" = "2"
      }
    }
  ])
}

# S3 Bucket for Scripts and Logs
resource "aws_s3_bucket" "scripts" {
  bucket = "${var.project_name}-emr-scripts"

  tags = {
    Environment = var.environment
  }
}

# Security Groups
resource "aws_security_group" "master" {
  name        = "${var.project_name}-emr-master"
  description = "Security group for EMR master node"
  vpc_id      = var.vpc_id

  # SSH access for administration; restrict this CIDR range in production
  ingress {
    from_port   = 22
    to_port     = 22
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}

resource "aws_security_group" "slave" {
  name        = "${var.project_name}-emr-slave"
  description = "Security group for EMR slave nodes"
  vpc_id      = var.vpc_id

  ingress {
    from_port       = 0
    to_port         = 0
    protocol        = "-1"
    security_groups = [aws_security_group.master.id]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}
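
The cluster's bootstrap_action above points at bootstrap.sh in the scripts bucket, but nothing uploads it yet. One way to manage the script together with the infrastructure is an aws_s3_object resource; the local scripts/bootstrap.sh path below is an assumption for illustration:

# Upload the bootstrap script referenced by the cluster's bootstrap_action
resource "aws_s3_object" "bootstrap" {
  bucket = aws_s3_bucket.scripts.id
  key    = "bootstrap.sh"
  source = "${path.module}/scripts/bootstrap.sh" # assumed local path
  etag   = filemd5("${path.module}/scripts/bootstrap.sh")
}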

IAM Configuration

# EMR Service Role
resource "aws_iam_role" "emr_service_role" {
  name = "${var.project_name}-emr-service-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "elasticmapreduce.amazonaws.com"
        }
      }
    ]
  })
}

resource "aws_iam_role_policy_attachment" "emr_service_role" {
  role       = aws_iam_role.emr_service_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole"
}

# EMR Instance Profile
resource "aws_iam_role" "emr_profile_role" {
  name = "${var.project_name}-emr-profile-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "ec2.amazonaws.com"
        }
      }
    ]
  })
}

resource "aws_iam_instance_profile" "emr_profile" {
  name = "${var.project_name}-emr-profile"
  role = aws_iam_role.emr_profile_role.name
}

resource "aws_iam_role_policy_attachment" "emr_profile" {
  role       = aws_iam_role.emr_profile_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
}
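
The custom automatic scaling policy shown later in this guide requires an EMR autoscaling role, referenced through the cluster's autoscaling_role argument. A role equivalent to the default EMR_AutoScaling_DefaultRole can be managed in Terraform as follows:

# EMR Autoscaling Role (needed for custom automatic scaling policies)
resource "aws_iam_role" "emr_autoscaling_role" {
  name = "${var.project_name}-emr-autoscaling-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = [
            "elasticmapreduce.amazonaws.com",
            "application-autoscaling.amazonaws.com"
          ]
        }
      }
    ]
  })
}

resource "aws_iam_role_policy_attachment" "emr_autoscaling_role" {
  role       = aws_iam_role.emr_autoscaling_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforAutoScalingRole"
}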

Variables Configuration

# variables.tf
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-west-2"
}

variable "project_name" {
  description = "Project name"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "dev"
}

variable "vpc_id" {
  description = "VPC ID"
  type        = string
}

variable "subnet_id" {
  description = "Subnet ID"
  type        = string
}
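
The project structure also lists an outputs.tf that has not been shown; a minimal version might expose the cluster ID and the master node's DNS name:

# outputs.tf
output "cluster_id" {
  description = "ID of the EMR cluster"
  value       = aws_emr_cluster.main.id
}

output "master_public_dns" {
  description = "Public DNS name of the EMR master node"
  value       = aws_emr_cluster.main.master_public_dns
}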

Best Practices

  1. Cluster Management

    • Use appropriate instance types
    • Configure proper scaling
    • Implement proper monitoring
    • Regular maintenance
  2. Security

    • Implement proper IAM roles
    • Use security groups effectively
    • Enable encryption
    • Regular security reviews
  3. Cost Optimization

    • Use spot instances when possible
    • Implement auto-termination (see the sketch after this list)
    • Monitor cluster usage
    • Clean up unused resources
  4. Performance

    • Optimize cluster configuration
    • Use appropriate storage
    • Monitor job performance
    • Regular performance reviews
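
As referenced in the cost optimization item above, EMR can shut down idle clusters automatically on recent release labels such as the emr-6.10.0 used in this guide. A minimal sketch, with a purely illustrative one-hour timeout:

resource "aws_emr_cluster" "main" {
  # ... configuration as shown earlier ...

  # Terminate the cluster after one hour of inactivity
  auto_termination_policy {
    idle_timeout = 3600
  }
}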

Instance Groups Configuration

# EMR Cluster with Multiple Instance Groups
resource "aws_emr_cluster" "advanced" {
  name          = "${var.project_name}-advanced-cluster"
  release_label = "emr-6.10.0"
  applications  = ["Spark", "Hive", "Hadoop"]

  service_role     = aws_iam_role.emr_service_role.arn
  autoscaling_role = aws_iam_role.emr_autoscaling_role.arn # required for the custom scaling policy below

  # Same EC2 attributes as the basic cluster
  ec2_attributes {
    subnet_id                         = var.subnet_id
    emr_managed_master_security_group = aws_security_group.master.id
    emr_managed_slave_security_group  = aws_security_group.slave.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
  }

  master_instance_group {
    instance_type = "m5.xlarge"
    
    ebs_config {
      size                 = 100
      type                 = "gp2"
      volumes_per_instance = 1
    }
  }

  core_instance_group {
    instance_type  = "m5.2xlarge"
    instance_count = 3

    ebs_config {
      size                 = 200
      type                 = "gp2"
      volumes_per_instance = 2
    }

    bid_price = "0.30"  # For Spot instances
  }

}

# Task nodes cannot be declared inside aws_emr_cluster; the AWS provider
# manages them with the separate aws_emr_instance_group resource.
resource "aws_emr_instance_group" "task" {
  cluster_id     = aws_emr_cluster.advanced.id
  name           = "${var.project_name}-task-group"
  instance_type  = "m5.xlarge"
  instance_count = 2

  bid_price = "0.20"

  ebs_config {
    size                 = 100
    type                 = "gp2"
    volumes_per_instance = 1
  }

  # Custom scaling rules require autoscaling_role to be set on the cluster,
  # as configured above.
  autoscaling_policy = jsonencode({
    Constraints = {
      MinCapacity = 1
      MaxCapacity = 5
    }
    Rules = [
      {
        Name        = "ScaleOutMemoryPercentage"
        Description = "Scale out if YARNMemoryAvailablePercentage is less than 15"
        Action = {
          SimpleScalingPolicyConfiguration = {
            AdjustmentType    = "CHANGE_IN_CAPACITY"
            ScalingAdjustment = 1
            CoolDown          = 300
          }
        }
        Trigger = {
          CloudWatchAlarmDefinition = {
            ComparisonOperator = "LESS_THAN"
            EvaluationPeriods  = 1
            MetricName         = "YARNMemoryAvailablePercentage"
            Namespace          = "AWS/ElasticMapReduce"
            Period             = 300
            Threshold          = 15
            Statistic          = "AVERAGE"
          }
        }
      }
    ]
  })
}
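
As a simpler alternative to the custom scaling rules above, EMR managed scaling lets the service adjust capacity between limits you define. It is configured with a separate resource; the limits below are illustrative:

# EMR managed scaling as an alternative to custom autoscaling rules
resource "aws_emr_managed_scaling_policy" "advanced" {
  cluster_id = aws_emr_cluster.advanced.id

  compute_limits {
    unit_type                       = "Instances"
    minimum_capacity_units          = 2
    maximum_capacity_units          = 10
    maximum_ondemand_capacity_units = 5
    maximum_core_capacity_units     = 10
  }
}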

Step Configuration

# EMR Steps
# The AWS provider does not expose a standalone step resource; steps are
# declared with step blocks on the aws_emr_cluster resource itself.
resource "aws_emr_cluster" "main" {
  # ... cluster configuration shown earlier ...

  step {
    name              = "Spark Application Step"
    action_on_failure = "CONTINUE"

    hadoop_jar_step {
      jar = "command-runner.jar"
      args = [
        "spark-submit",
        "--class", "com.example.SparkApp",
        "--master", "yarn",
        "s3://${aws_s3_bucket.scripts.id}/app.jar",
        "arg1",
        "arg2"
      ]
    }
  }

  step {
    name              = "Hive Script Step"
    action_on_failure = "CONTINUE"

    hadoop_jar_step {
      jar = "command-runner.jar"
      args = [
        "hive-script",
        "--run-hive-script",
        "--args",
        "-f",
        "s3://${aws_s3_bucket.scripts.id}/query.hql"
      ]
    }
  }
}
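
If steps are also submitted outside of Terraform (for example, by an orchestration tool calling the EMR API), they will show up as drift on the next plan. A common pattern, sketched here rather than taken from the original configuration, is to ignore changes to the step list:

resource "aws_emr_cluster" "main" {
  # ... configuration as shown earlier ...

  lifecycle {
    ignore_changes = [step]
  }
}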

Deployment Steps

  1. Initialize Terraform:
terraform init
  2. Plan the deployment:
terraform plan
  3. Apply the configuration:
terraform apply

Clean Up

Remove all resources when done:

terraform destroy

Common Use Cases

  1. Data Processing Pipeline
resource "aws_emr_cluster" "pipeline" {
  name          = "${var.project_name}-pipeline"
  release_label = "emr-6.10.0"
  applications  = ["Spark", "Hive"]

  # ... other configuration ...

  step {
    action_on_failure = "CONTINUE"
    name              = "Setup Hadoop Debugging"

    hadoop_jar_step {
      jar  = "command-runner.jar"
      args = ["state-pusher-script"]
    }
  }

  step {
    action_on_failure = "CANCEL_AND_WAIT"
    name              = "Data Processing Step"

    hadoop_jar_step {
      jar  = "command-runner.jar"
      args = [
        "spark-submit",
        "--class", "com.example.DataProcessor",
        "s3://${aws_s3_bucket.scripts.id}/processor.jar"
      ]
    }
  }
}
  2. Scheduled Processing
resource "aws_cloudwatch_event_rule" "emr_schedule" {
  name                = "${var.project_name}-emr-schedule"
  description         = "Schedule for EMR cluster creation"
  schedule_expression = "cron(0 0 * * ? *)" # Daily at 00:00 UTC
}

resource "aws_cloudwatch_event_target" "emr_target" {
  rule      = aws_cloudwatch_event_rule.emr_schedule.name
  target_id = "EMRClusterCreation"
  arn       = aws_lambda_function.create_emr.arn

  input = jsonencode({
    cluster_name = "${var.project_name}-scheduled"
    instance_count = 3
    instance_type = "m5.xlarge"
  })
}
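
For the schedule to work end to end, EventBridge must also be allowed to invoke the (assumed) create_emr Lambda function:

# Allow the EventBridge rule to invoke the cluster-creation Lambda function
resource "aws_lambda_permission" "allow_eventbridge" {
  statement_id  = "AllowExecutionFromEventBridge"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.create_emr.function_name
  principal     = "events.amazonaws.com"
  source_arn    = aws_cloudwatch_event_rule.emr_schedule.arn
}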

Monitoring Configuration

# CloudWatch Dashboard
resource "aws_cloudwatch_dashboard" "emr" {
  dashboard_name = "${var.project_name}-emr-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["AWS/ElasticMapReduce", "IsIdle", "JobFlowId", aws_emr_cluster.main.id],
            ["AWS/ElasticMapReduce", "AppsRunning", "JobFlowId", aws_emr_cluster.main.id]
          ]
          period = 300
          stat   = "Average"
          region = var.aws_region
          title  = "EMR Cluster Metrics"
        }
      }
    ]
  })
}

# CloudWatch Alarms
resource "aws_cloudwatch_metric_alarm" "cluster_error" {
  alarm_name          = "${var.project_name}-cluster-error"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"
  metric_name         = "MRUnhealthyNodes"
  namespace           = "AWS/ElasticMapReduce"
  period             = "300"
  statistic          = "Average"
  threshold          = "0"
  alarm_description  = "This metric monitors unhealthy EMR nodes"
  alarm_actions      = [var.sns_topic_arn]

  dimensions = {
    JobFlowId = aws_emr_cluster.main.id
  }
}
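
CloudWatch metrics cover cluster health, but step and application logs stay on the cluster nodes unless a log destination is configured. Setting log_uri on the cluster persists logs to S3; reusing the scripts bucket here is just one option:

resource "aws_emr_cluster" "main" {
  # ... configuration as shown earlier ...

  # Persist bootstrap, step, and node logs to S3 for debugging
  log_uri = "s3://${aws_s3_bucket.scripts.id}/logs/"
}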

Conclusion

This setup provides a comprehensive foundation for deploying EMR using Terraform. Remember to:

  • Plan your cluster architecture carefully
  • Implement proper security measures
  • Monitor cluster performance
  • Keep your configurations versioned
  • Test thoroughly before production deployment

The complete code can be customized based on your specific requirements and use cases.