Setting up AWS Athena with Terraform

A comprehensive guide to configuring Amazon Athena for serverless SQL queries on S3 using Terraform infrastructure as code

Amazon Athena is an interactive, serverless query service that makes it easy to analyze data in Amazon S3 using standard SQL; you pay only for the data each query scans. This guide shows how to provision Athena with Terraform, from the query-results bucket and workgroup through tables, cost controls, and monitoring.

Prerequisites

  • AWS CLI configured
  • Terraform installed (see the version-pinning sketch after this list)
  • S3 bucket with data to query
  • Basic understanding of SQL and data analytics
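
To keep runs reproducible, it also helps to pin the Terraform and AWS provider versions you tested against. A minimal sketch; the version constraints below are examples, not requirements:

# versions.tf
terraform {
  required_version = ">= 1.5"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"  # example constraint; pin to the version you actually test with
    }
  }
}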

Project Structure

aws-athena-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
└── terraform.tfvars

Basic Athena Configuration

# main.tf
provider "aws" {
  region = var.aws_region
}

# S3 Bucket for Query Results
resource "aws_s3_bucket" "athena_results" {
  bucket = "${var.project_name}-athena-results"

  tags = {
    Environment = var.environment
  }
}

# S3 Bucket Policy
resource "aws_s3_bucket_policy" "athena_results" {
  bucket = aws_s3_bucket.athena_results.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "AllowAthenaAccess"
        Effect = "Allow"
        Principal = {
          AWS = "*"
        }
        Action = [
          "s3:GetBucketLocation",
          "s3:GetObject",
          "s3:ListBucket",
          "s3:PutObject"
        ]
        Resource = [
          aws_s3_bucket.athena_results.arn,
          "${aws_s3_bucket.athena_results.arn}/*"
        ]
        Condition = {
          StringEquals = {
            "aws:PrincipalAccount": data.aws_caller_identity.current.account_id
          }
        }
      }
    ]
  })
}

# Athena Workgroup
resource "aws_athena_workgroup" "main" {
  name = "${var.project_name}-workgroup"

  configuration {
    enforce_workgroup_configuration    = true
    publish_cloudwatch_metrics_enabled = true

    result_configuration {
      output_location = "s3://${aws_s3_bucket.athena_results.bucket}/output/"

      encryption_configuration {
        encryption_option = "SSE_S3"
      }
    }
  }

  tags = {
    Environment = var.environment
  }
}

# Athena Database
resource "aws_athena_database" "main" {
  name   = replace("${var.project_name}_db", "-", "_")
  bucket = aws_s3_bucket.athena_results.id
}

# Athena Named Query
resource "aws_athena_named_query" "example" {
  name        = "${var.project_name}-example-query"
  workgroup   = aws_athena_workgroup.main.id
  database    = aws_athena_database.main.name
  description = "Example query"
  query       = "SELECT * FROM example_table LIMIT 10;"
}

# Data source for current account
data "aws_caller_identity" "current" {}

Table Configuration

# Create Athena Table
resource "aws_glue_catalog_table" "example" {
  name          = "example_table"
  database_name = aws_athena_database.main.name

  table_type = "EXTERNAL_TABLE"

  parameters = {
    EXTERNAL          = "TRUE"
    "classification"  = "json"
    "compressionType" = "none"
    "typeOfData"      = "file"
  }

  storage_descriptor {
    location      = "s3://${var.data_bucket}/data/"
    input_format  = "org.apache.hadoop.mapred.TextInputFormat"
    output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

    ser_de_info {
      name                  = "JsonSerDe"
      serialization_library = "org.openx.data.jsonserde.JsonSerDe"

      parameters = {
        "serialization.format" = 1
      }
    }

    columns {
      name = "id"
      type = "string"
    }

    columns {
      name = "timestamp"
      type = "timestamp"
    }

    columns {
      name = "data"
      type = "string"
    }
  }
}
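
With the OpenX JSON SerDe, Athena expects one JSON object per line under the table location (s3://<data_bucket>/data/). For the columns declared above, a matching record would look like this (illustrative values only; timestamps are typically written in yyyy-MM-dd HH:mm:ss form):

{"id": "evt-0001", "timestamp": "2024-01-15 08:30:00", "data": "example payload"}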

Variables Configuration

# variables.tf
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-west-2"
}

variable "project_name" {
  description = "Project name"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "dev"
}

variable "data_bucket" {
  description = "S3 bucket containing source data"
  type        = string
}
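
The terraform.tfvars file from the project structure supplies values for these variables. All of the values below are placeholders; substitute your own region and bucket names:

# terraform.tfvars (example values only)
aws_region       = "us-west-2"
project_name     = "analytics-demo"
environment      = "dev"
data_bucket      = "my-source-data-bucket"
logs_bucket      = "my-application-logs-bucket"
data_lake_bucket = "my-data-lake-bucket"
sns_topic_arn    = "arn:aws:sns:us-west-2:123456789012:athena-alerts"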

Best Practices

  1. Query Optimization

    • Partition data on the columns you filter by most often
    • Use columnar, splittable file formats such as Parquet or ORC
    • Select only the columns a query needs (column pruning)
    • Review slow or expensive recurring queries regularly
  2. Cost Management

    • Use workgroup controls such as bytes_scanned_cutoff_per_query
    • Monitor query costs and bytes scanned in CloudWatch
    • Apply data lifecycle policies to results and raw data (see the lifecycle sketch after this list)
    • Review spend regularly
  3. Security

    • Grant least-privilege IAM access to Athena and the underlying S3 data
    • Enable encryption for query results and source data
    • Enforce workgroup settings so users cannot override them
    • Review access policies regularly
  4. Performance

    • Compress data and avoid large numbers of small files
    • Optimize table and partition design as data grows
    • Monitor query execution time in CloudWatch
    • Revisit table layout when query patterns change
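
As a concrete example of the lifecycle point above, query results rarely need long retention. The sketch below expires objects under the output/ prefix of the results bucket after 30 days; the retention period is an assumption, so adjust it to your needs:

# Expire old Athena query results to control storage costs
resource "aws_s3_bucket_lifecycle_configuration" "athena_results" {
  bucket = aws_s3_bucket.athena_results.id

  rule {
    id     = "expire-query-results"
    status = "Enabled"

    filter {
      prefix = "output/"
    }

    expiration {
      days = 30
    }
  }
}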

Workgroup Configuration with Cost Controls

# Workgroup with Cost Controls
resource "aws_athena_workgroup" "controlled" {
  name = "${var.project_name}-controlled"

  configuration {
    enforce_workgroup_configuration    = true
    publish_cloudwatch_metrics_enabled = true
    bytes_scanned_cutoff_per_query    = 10737418240  # 10GB

    result_configuration {
      output_location = "s3://${aws_s3_bucket.athena_results.bucket}/controlled/"

      encryption_configuration {
        encryption_option = "SSE_KMS"
        kms_key_arn      = aws_kms_key.athena.arn
      }
    }

    engine_version {
      selected_engine_version = "Athena engine version 3"
    }
  }

  force_destroy = true

  tags = {
    Environment = var.environment
  }
}

# KMS Key for Encryption
resource "aws_kms_key" "athena" {
  description             = "KMS key for Athena encryption"
  deletion_window_in_days = 7
  enable_key_rotation     = true

  tags = {
    Environment = var.environment
  }
}
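
A KMS alias is optional but makes the key easier to identify in the console and in IAM policies; a minimal sketch:

# Friendly alias for the Athena encryption key
resource "aws_kms_alias" "athena" {
  name          = "alias/${var.project_name}-athena"
  target_key_id = aws_kms_key.athena.key_id
}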

Query Monitoring

# CloudWatch Dashboard
resource "aws_cloudwatch_dashboard" "athena" {
  dashboard_name = "${var.project_name}-athena-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["AWS/Athena", "TotalExecutionTime", "WorkGroup", aws_athena_workgroup.main.name],
            ["AWS/Athena", "QueryScanBytes", "WorkGroup", aws_athena_workgroup.main.name]
          ]
          period = 300
          stat   = "Average"
          region = var.aws_region
          title  = "Athena Query Metrics"
        }
      }
    ]
  })
}

# CloudWatch Alarms
resource "aws_cloudwatch_metric_alarm" "query_scan_bytes" {
  alarm_name          = "${var.project_name}-query-scan-bytes"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"
  metric_name         = "QueryScanBytes"
  namespace           = "AWS/Athena"
  period             = "300"
  statistic          = "Average"
  threshold          = "5000000000"  # 5GB
  alarm_description  = "This metric monitors the amount of data scanned by Athena queries"
  alarm_actions      = [var.sns_topic_arn]

  dimensions = {
    WorkGroup = aws_athena_workgroup.main.name
  }
}
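
The alarm publishes to var.sns_topic_arn. If you do not already have a topic, one could be created alongside this configuration and its ARN passed in instead; the topic name and email endpoint below are illustrative:

# Optional: notification topic for Athena alarms
resource "aws_sns_topic" "athena_alerts" {
  name = "${var.project_name}-athena-alerts"
}

resource "aws_sns_topic_subscription" "athena_alerts_email" {
  topic_arn = aws_sns_topic.athena_alerts.arn
  protocol  = "email"
  endpoint  = "alerts@example.com"  # placeholder address
}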

Deployment Steps

  1. Initialize Terraform:
terraform init
  2. Plan the deployment:
terraform plan
  3. Apply the configuration:
terraform apply

Clean Up

Remove all resources when done:

terraform destroy
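
Note that terraform destroy cannot delete an S3 bucket that still contains query results. If you want the results bucket removed automatically, add force_destroy to the aws_s3_bucket.athena_results resource defined earlier (this deletes every object in the bucket, so use it with care):

# main.tf: allow Terraform to delete the results bucket even when it holds objects
resource "aws_s3_bucket" "athena_results" {
  bucket        = "${var.project_name}-athena-results"
  force_destroy = true

  tags = {
    Environment = var.environment
  }
}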

Common Use Cases

  1. Log Analysis
resource "aws_glue_catalog_table" "logs" {
  name          = "application_logs"
  database_name = aws_athena_database.main.name

  table_type = "EXTERNAL_TABLE"

  parameters = {
    EXTERNAL              = "TRUE"
    "classification"      = "json"
    "compressionType"     = "gzip"
  }

  partition_keys {
    name = "year"
    type = "string"
  }

  partition_keys {
    name = "month"
    type = "string"
  }

  partition_keys {
    name = "day"
    type = "string"
  }

  storage_descriptor {
    location      = "s3://${var.logs_bucket}/logs/"
    input_format  = "org.apache.hadoop.mapred.TextInputFormat"
    output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

    ser_de_info {
      name                  = "JsonSerDe"
      serialization_library = "org.openx.data.jsonserde.JsonSerDe"
    }

    columns {
      name = "timestamp"
      type = "timestamp"
    }

    columns {
      name = "level"
      type = "string"
    }

    columns {
      name = "message"
      type = "string"
    }
  }
}

resource "aws_athena_named_query" "error_logs" {
  name      = "find_error_logs"
  workgroup = aws_athena_workgroup.main.id
  database  = aws_athena_database.main.name
  query     = <<EOF
SELECT timestamp, message
FROM application_logs
WHERE level = 'ERROR'
  AND year = '2024'
  AND month = '01'
ORDER BY timestamp DESC
LIMIT 100;
EOF
}
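
Because the logs table is partitioned by year, month, and day, new partitions must be registered before the query above returns any rows. One simple option is an MSCK REPAIR named query to run after new prefixes land in S3 (partition projection or ALTER TABLE ADD PARTITION are cheaper alternatives for large tables):

# Register newly added year/month/day prefixes as partitions
resource "aws_athena_named_query" "repair_log_partitions" {
  name      = "repair_log_partitions"
  workgroup = aws_athena_workgroup.main.id
  database  = aws_athena_database.main.name
  query     = "MSCK REPAIR TABLE application_logs;"
}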
  2. Data Lake Queries
resource "aws_glue_catalog_table" "data_lake" {
  name          = "data_lake"
  database_name = aws_athena_database.main.name

  table_type = "EXTERNAL_TABLE"

  parameters = {
    EXTERNAL              = "TRUE"
    "classification"      = "parquet"
  }

  storage_descriptor {
    location      = "s3://${var.data_lake_bucket}/data/"
    input_format  = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
    output_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"

    ser_de_info {
      name                  = "ParquetSerDe"
      serialization_library = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
    }
  }
}
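
Note that the table above declares no columns. In practice you would either list them in the storage_descriptor, as in the earlier examples, or let a Glue crawler infer the schema from the Parquet files. A crawler sketch, assuming the same data lake bucket (the IAM role for the crawler is hypothetical and not defined in this guide):

# Optional: infer the Parquet schema with a Glue crawler instead of hand-maintaining columns
resource "aws_glue_crawler" "data_lake" {
  name          = "${var.project_name}-data-lake-crawler"
  database_name = aws_athena_database.main.name
  role          = aws_iam_role.glue_crawler.arn  # hypothetical role, create separately

  s3_target {
    path = "s3://${var.data_lake_bucket}/data/"
  }
}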

Conclusion

This setup provides a comprehensive foundation for deploying Athena using Terraform. Remember to:

  • Plan your data structure carefully
  • Implement proper partitioning
  • Monitor query performance and costs
  • Keep your configurations versioned
  • Test thoroughly before production deployment

The complete code can be customized based on your specific requirements and use cases.