Setting up AWS Athena with Terraform

A comprehensive guide to configuring Amazon Athena for serverless SQL queries on S3 using Terraform infrastructure as code

Amazon Athena is an interactive, serverless query service that makes it easy to analyze data in Amazon S3 using standard SQL; you pay only for the data each query scans. This guide shows how to provision Athena with Terraform, from the query-results bucket and workgroup through tables, cost controls, and monitoring.

Prerequisites

  • AWS CLI configured
  • Terraform installed (see the version-pinning sketch after this list)
  • S3 bucket with data to query
  • Basic understanding of SQL and data analytics
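
To keep runs reproducible, it also helps to pin the Terraform and AWS provider versions you tested against. A minimal sketch; the version constraints below are examples, not requirements:

# versions.tf
terraform {
  required_version = ">= 1.5"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"  # example constraint; pin to the version you actually test with
    }
  }
}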

Project Structure

aws-athena-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
└── terraform.tfvars

Basic Athena Configuration

# main.tf
provider "aws" {
  region = var.aws_region
}

# S3 Bucket for Query Results
resource "aws_s3_bucket" "athena_results" {
  bucket = "${var.project_name}-athena-results"

  tags = {
    Environment = var.environment
  }
}

# S3 Bucket Policy
resource "aws_s3_bucket_policy" "athena_results" {
  bucket = aws_s3_bucket.athena_results.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "AllowAthenaAccess"
        Effect = "Allow"
        Principal = {
          AWS = "*"
        }
        Action = [
          "s3:GetBucketLocation",
          "s3:GetObject",
          "s3:ListBucket",
          "s3:PutObject"
        ]
        Resource = [
          aws_s3_bucket.athena_results.arn,
          "${aws_s3_bucket.athena_results.arn}/*"
        ]
        Condition = {
          StringEquals = {
            "aws:PrincipalAccount": data.aws_caller_identity.current.account_id
          }
        }
      }
    ]
  })
}

# Athena Workgroup
resource "aws_athena_workgroup" "main" {
  name = "${var.project_name}-workgroup"

  configuration {
    enforce_workgroup_configuration    = true
    publish_cloudwatch_metrics_enabled = true

    result_configuration {
      output_location = "s3://${aws_s3_bucket.athena_results.bucket}/output/"

      encryption_configuration {
        encryption_option = "SSE_S3"
      }
    }
  }

  tags = {
    Environment = var.environment
  }
}

# Athena Database
resource "aws_athena_database" "main" {
  name   = replace("${var.project_name}_db", "-", "_")
  bucket = aws_s3_bucket.athena_results.id
}

# Athena Named Query
resource "aws_athena_named_query" "example" {
  name        = "${var.project_name}-example-query"
  workgroup   = aws_athena_workgroup.main.id
  database    = aws_athena_database.main.name
  description = "Example query"
  query       = "SELECT * FROM example_table LIMIT 10;"
}

# Data source for current account
data "aws_caller_identity" "current" {}

Table Configuration

# Create Athena Table
resource "aws_glue_catalog_table" "example" {
  name          = "example_table"
  database_name = aws_athena_database.main.name

  table_type = "EXTERNAL_TABLE"

  parameters = {
    EXTERNAL          = "TRUE"
    "classification"  = "json"
    "compressionType" = "none"
    "typeOfData"      = "file"
  }

  storage_descriptor {
    location      = "s3://${var.data_bucket}/data/"
    input_format  = "org.apache.hadoop.mapred.TextInputFormat"
    output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

    ser_de_info {
      name                  = "JsonSerDe"
      serialization_library = "org.openx.data.jsonserde.JsonSerDe"

      parameters = {
        "serialization.format" = 1
      }
    }

    columns {
      name = "id"
      type = "string"
    }

    columns {
      name = "timestamp"
      type = "timestamp"
    }

    columns {
      name = "data"
      type = "string"
    }
  }
}
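
With the OpenX JSON SerDe, Athena expects one JSON object per line under the table location (s3://<data_bucket>/data/). For the columns declared above, a matching record would look like this (illustrative values only; timestamps are typically written in yyyy-MM-dd HH:mm:ss form):

{"id": "evt-0001", "timestamp": "2024-01-15 08:30:00", "data": "example payload"}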

Variables Configuration

# variables.tf
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-west-2"
}

variable "project_name" {
  description = "Project name"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "dev"
}

variable "data_bucket" {
  description = "S3 bucket containing source data"
  type        = string
}
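
The terraform.tfvars file from the project structure supplies values for these variables. All of the values below are placeholders; substitute your own region and bucket names:

# terraform.tfvars (example values only)
aws_region       = "us-west-2"
project_name     = "analytics-demo"
environment      = "dev"
data_bucket      = "my-source-data-bucket"
logs_bucket      = "my-application-logs-bucket"
data_lake_bucket = "my-data-lake-bucket"
sns_topic_arn    = "arn:aws:sns:us-west-2:123456789012:athena-alerts"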

Best Practices

  1. Query Optimization

    • Partition data on the columns you filter by most often
    • Use columnar, splittable file formats such as Parquet or ORC
    • Select only the columns a query needs (column pruning)
    • Review slow or expensive recurring queries regularly
  2. Cost Management

    • Use workgroup controls such as bytes_scanned_cutoff_per_query
    • Monitor query costs and bytes scanned in CloudWatch
    • Apply data lifecycle policies to results and raw data (see the lifecycle sketch after this list)
    • Review spend regularly
  3. Security

    • Grant least-privilege IAM access to Athena and the underlying S3 data
    • Enable encryption for query results and source data
    • Enforce workgroup settings so users cannot override them
    • Review access policies regularly
  4. Performance

    • Compress data and avoid large numbers of small files
    • Optimize table and partition design as data grows
    • Monitor query execution time in CloudWatch
    • Revisit table layout when query patterns change
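
As a concrete example of the lifecycle point above, query results rarely need long retention. The sketch below expires objects under the output/ prefix of the results bucket after 30 days; the retention period is an assumption, so adjust it to your needs:

# Expire old Athena query results to control storage costs
resource "aws_s3_bucket_lifecycle_configuration" "athena_results" {
  bucket = aws_s3_bucket.athena_results.id

  rule {
    id     = "expire-query-results"
    status = "Enabled"

    filter {
      prefix = "output/"
    }

    expiration {
      days = 30
    }
  }
}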

Workgroup Configuration with Cost Controls

# Workgroup with Cost Controls
resource "aws_athena_workgroup" "controlled" {
  name = "${var.project_name}-controlled"

  configuration {
    enforce_workgroup_configuration    = true
    publish_cloudwatch_metrics_enabled = true
    bytes_scanned_cutoff_per_query    = 10737418240  # 10GB

    result_configuration {
      output_location = "s3://${aws_s3_bucket.athena_results.bucket}/controlled/"

      encryption_configuration {
        encryption_option = "SSE_KMS"
        kms_key_arn      = aws_kms_key.athena.arn
      }
    }

    engine_version {
      selected_engine_version = "Athena engine version 3"
    }
  }

  force_destroy = true

  tags = {
    Environment = var.environment
  }
}

# KMS Key for Encryption
resource "aws_kms_key" "athena" {
  description             = "KMS key for Athena encryption"
  deletion_window_in_days = 7
  enable_key_rotation     = true

  tags = {
    Environment = var.environment
  }
}
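
A KMS alias is optional but makes the key easier to identify in the console and in IAM policies; a minimal sketch:

# Friendly alias for the Athena encryption key
resource "aws_kms_alias" "athena" {
  name          = "alias/${var.project_name}-athena"
  target_key_id = aws_kms_key.athena.key_id
}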

Query Monitoring

# CloudWatch Dashboard
resource "aws_cloudwatch_dashboard" "athena" {
  dashboard_name = "${var.project_name}-athena-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["AWS/Athena", "TotalExecutionTime", "WorkGroup", aws_athena_workgroup.main.name],
            ["AWS/Athena", "QueryScanBytes", "WorkGroup", aws_athena_workgroup.main.name]
          ]
          period = 300
          stat   = "Average"
          region = var.aws_region
          title  = "Athena Query Metrics"
        }
      }
    ]
  })
}

# CloudWatch Alarms
resource "aws_cloudwatch_metric_alarm" "query_scan_bytes" {
  alarm_name          = "${var.project_name}-query-scan-bytes"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"
  metric_name         = "QueryScanBytes"
  namespace           = "AWS/Athena"
  period             = "300"
  statistic          = "Average"
  threshold          = "5000000000"  # 5GB
  alarm_description  = "This metric monitors the amount of data scanned by Athena queries"
  alarm_actions      = [var.sns_topic_arn]

  dimensions = {
    WorkGroup = aws_athena_workgroup.main.name
  }
}
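
The alarm publishes to var.sns_topic_arn. If you do not already have a topic, one could be created alongside this configuration and its ARN passed in instead; the topic name and email endpoint below are illustrative:

# Optional: notification topic for Athena alarms
resource "aws_sns_topic" "athena_alerts" {
  name = "${var.project_name}-athena-alerts"
}

resource "aws_sns_topic_subscription" "athena_alerts_email" {
  topic_arn = aws_sns_topic.athena_alerts.arn
  protocol  = "email"
  endpoint  = "alerts@example.com"  # placeholder address
}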

Deployment Steps

  1. Initialize Terraform:
terraform init
  2. Plan the deployment:
terraform plan
  3. Apply the configuration:
terraform apply

Clean Up

Remove all resources when done:

terraform destroy
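
Note that terraform destroy cannot delete an S3 bucket that still contains query results. If you want the results bucket removed automatically, add force_destroy to the aws_s3_bucket.athena_results resource defined earlier (this deletes every object in the bucket, so use it with care):

# main.tf: allow Terraform to delete the results bucket even when it holds objects
resource "aws_s3_bucket" "athena_results" {
  bucket        = "${var.project_name}-athena-results"
  force_destroy = true

  tags = {
    Environment = var.environment
  }
}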

Common Use Cases

  1. Log Analysis
resource "aws_glue_catalog_table" "logs" {
  name          = "application_logs"
  database_name = aws_athena_database.main.name

  table_type = "EXTERNAL_TABLE"

  parameters = {
    EXTERNAL              = "TRUE"
    "classification"      = "json"
    "compressionType"     = "gzip"
  }

  partition_keys {
    name = "year"
    type = "string"
  }

  partition_keys {
    name = "month"
    type = "string"
  }

  partition_keys {
    name = "day"
    type = "string"
  }

  storage_descriptor {
    location      = "s3://${var.logs_bucket}/logs/"
    input_format  = "org.apache.hadoop.mapred.TextInputFormat"
    output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

    ser_de_info {
      name                  = "JsonSerDe"
      serialization_library = "org.openx.data.jsonserde.JsonSerDe"
    }

    columns {
      name = "timestamp"
      type = "timestamp"
    }

    columns {
      name = "level"
      type = "string"
    }

    columns {
      name = "message"
      type = "string"
    }
  }
}

resource "aws_athena_named_query" "error_logs" {
  name      = "find_error_logs"
  workgroup = aws_athena_workgroup.main.id
  database  = aws_athena_database.main.name
  query     = <<EOF
SELECT timestamp, message
FROM application_logs
WHERE level = 'ERROR'
  AND year = '2024'
  AND month = '01'
ORDER BY timestamp DESC
LIMIT 100;
EOF
}
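
Because the logs table is partitioned by year, month, and day, new partitions must be registered before the query above returns any rows. One simple option is an MSCK REPAIR named query to run after new prefixes land in S3 (partition projection or ALTER TABLE ADD PARTITION are cheaper alternatives for large tables):

# Register newly added year/month/day prefixes as partitions
resource "aws_athena_named_query" "repair_log_partitions" {
  name      = "repair_log_partitions"
  workgroup = aws_athena_workgroup.main.id
  database  = aws_athena_database.main.name
  query     = "MSCK REPAIR TABLE application_logs;"
}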
  2. Data Lake Queries
resource "aws_glue_catalog_table" "data_lake" {
  name          = "data_lake"
  database_name = aws_athena_database.main.name

  table_type = "EXTERNAL_TABLE"

  parameters = {
    EXTERNAL              = "TRUE"
    "classification"      = "parquet"
  }

  storage_descriptor {
    location      = "s3://${var.data_lake_bucket}/data/"
    input_format  = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
    output_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"

    ser_de_info {
      name                  = "ParquetSerDe"
      serialization_library = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
    }
  }
}
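
Note that the table above declares no columns. In practice you would either list them in the storage_descriptor, as in the earlier examples, or let a Glue crawler infer the schema from the Parquet files. A crawler sketch, assuming the same data lake bucket (the IAM role for the crawler is hypothetical and not defined in this guide):

# Optional: infer the Parquet schema with a Glue crawler instead of hand-maintaining columns
resource "aws_glue_crawler" "data_lake" {
  name          = "${var.project_name}-data-lake-crawler"
  database_name = aws_athena_database.main.name
  role          = aws_iam_role.glue_crawler.arn  # hypothetical role, create separately

  s3_target {
    path = "s3://${var.data_lake_bucket}/data/"
  }
}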

Conclusion

This setup provides a comprehensive foundation for deploying Athena using Terraform. Remember to:

  • Plan your data structure carefully
  • Implement proper partitioning
  • Monitor query performance and costs
  • Keep your configurations versioned
  • Test thoroughly before production deployment

The complete code can be customized based on your specific requirements and use cases.