Setting up AWS Athena with Terraform
A comprehensive guide to configuring Amazon Athena for serverless queries using Terraform Infrastructure as Code
Amazon Athena is an interactive query service that makes it easy to analyze data in Amazon S3 using standard SQL. This guide shows how to set up Athena using Terraform.
Prerequisites
- AWS CLI configured
- Terraform installed
- S3 bucket with data to query
- Basic understanding of SQL and data analytics
Project Structure
aws-athena-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
└── terraform.tfvars
Basic Athena Configuration
# main.tf
provider "aws" {
region = var.aws_region
}
# S3 Bucket for Query Results
resource "aws_s3_bucket" "athena_results" {
bucket = "${var.project_name}-athena-results"
tags = {
Environment = var.environment
}
}
# S3 Bucket Policy
resource "aws_s3_bucket_policy" "athena_results" {
bucket = aws_s3_bucket.athena_results.id
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Sid = "AllowAthenaAccess"
Effect = "Allow"
Principal = {
AWS = "*"
}
Action = [
"s3:GetBucketLocation",
"s3:GetObject",
"s3:ListBucket",
"s3:PutObject"
]
Resource = [
aws_s3_bucket.athena_results.arn,
"${aws_s3_bucket.athena_results.arn}/*"
]
Condition = {
StringEquals = {
"aws:PrincipalAccount": data.aws_caller_identity.current.account_id
}
}
}
]
})
}
# Athena Workgroup
resource "aws_athena_workgroup" "main" {
name = "${var.project_name}-workgroup"
configuration {
enforce_workgroup_configuration = true
publish_cloudwatch_metrics_enabled = true
result_configuration {
output_location = "s3://${aws_s3_bucket.athena_results.bucket}/output/"
encryption_configuration {
encryption_option = "SSE_S3"
}
}
}
tags = {
Environment = var.environment
}
}
# Athena Database
resource "aws_athena_database" "main" {
name = replace("${var.project_name}_db", "-", "_")
bucket = aws_s3_bucket.athena_results.id
}
# Athena Named Query
resource "aws_athena_named_query" "example" {
name = "${var.project_name}-example-query"
workgroup = aws_athena_workgroup.main.id
database = aws_athena_database.main.name
description = "Example query"
query = "SELECT * FROM example_table LIMIT 10;"
}
# Data source for current account
data "aws_caller_identity" "current" {}
Table Configuration
# Create Athena Table
resource "aws_glue_catalog_table" "example" {
name = "example_table"
database_name = aws_athena_database.main.name
table_type = "EXTERNAL_TABLE"
parameters = {
EXTERNAL = "TRUE"
"classification" = "json"
"compressionType" = "none"
"typeOfData" = "file"
}
storage_descriptor {
location = "s3://${var.data_bucket}/data/"
input_format = "org.apache.hadoop.mapred.TextInputFormat"
output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
ser_de_info {
name = "JsonSerDe"
serialization_library = "org.openx.data.jsonserde.JsonSerDe"
parameters = {
"serialization.format" = 1
}
}
columns {
name = "id"
type = "string"
}
columns {
name = "timestamp"
type = "timestamp"
}
columns {
name = "data"
type = "string"
}
}
}
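One caveat: timestamp is a reserved word in Athena DML, so queries against this table must double-quote that column. A small illustrative named query (hypothetical, not part of the original setup):

resource "aws_athena_named_query" "recent_rows" {
  name      = "${var.project_name}-recent-rows"
  workgroup = aws_athena_workgroup.main.id
  database  = aws_athena_database.main.name
  # Reserved words like timestamp need double quotes in Athena SQL
  query     = "SELECT id, \"timestamp\", data FROM example_table ORDER BY \"timestamp\" DESC LIMIT 10;"
}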
Variables Configuration
# variables.tf
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-west-2"
}

variable "project_name" {
  description = "Project name"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "dev"
}

variable "data_bucket" {
  description = "S3 bucket containing source data"
  type        = string
}

# Declared here because later examples reference them
variable "sns_topic_arn" {
  description = "SNS topic ARN for CloudWatch alarm notifications"
  type        = string
}
variable "logs_bucket" {
  description = "S3 bucket containing application logs"
  type        = string
}
variable "data_lake_bucket" {
  description = "S3 bucket backing the data lake table"
  type        = string
}
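The outputs.tf file from the project structure can expose the values other tooling typically needs; a minimal sketch:

# outputs.tf
output "workgroup_name" {
  description = "Name of the primary Athena workgroup"
  value       = aws_athena_workgroup.main.name
}

output "database_name" {
  description = "Name of the Athena database"
  value       = aws_athena_database.main.name
}

output "results_bucket" {
  description = "S3 bucket receiving query results"
  value       = aws_s3_bucket.athena_results.bucket
}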
Best Practices
- Query Optimization
  - Partition data effectively (see the partition projection sketch after this list)
  - Use columnar file formats such as Parquet or ORC
  - Select only the columns a query needs so Athena can prune the rest
  - Revisit slow or expensive queries regularly
- Cost Management
  - Use workgroup controls such as per-query scan limits
  - Monitor query costs with CloudWatch metrics
  - Implement data lifecycle policies on result and data buckets
  - Review spending regularly
- Security
  - Implement least-privilege IAM roles
  - Enable encryption for query results and source data
  - Use workgroup restrictions to enforce settings
  - Review access policies regularly
- Performance
  - Use appropriate file formats and compression
  - Optimize table design and partitioning
  - Monitor query performance metrics
  - Tune long-running queries regularly
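Partitioning deserves special mention. With partition projection, Athena computes partition locations from table properties instead of requiring every new S3 prefix to be registered in the catalog. A hedged sketch of the relevant parameters (the date range, prefix layout, and column names are assumptions to adapt to your data):

resource "aws_glue_catalog_table" "projected_logs" {
  name          = "projected_logs"
  database_name = aws_athena_database.main.name
  table_type    = "EXTERNAL_TABLE"

  parameters = {
    EXTERNAL                    = "TRUE"
    "classification"            = "json"
    # Partition projection: Athena derives dt values from the range below
    "projection.enabled"        = "true"
    "projection.dt.type"        = "date"
    "projection.dt.format"      = "yyyy/MM/dd"
    "projection.dt.range"       = "2024/01/01,NOW"
    # $${dt} escapes Terraform interpolation; Athena substitutes the value
    "storage.location.template" = "s3://${var.logs_bucket}/logs/$${dt}/"
  }

  partition_keys {
    name = "dt"
    type = "string"
  }

  storage_descriptor {
    location      = "s3://${var.logs_bucket}/logs/"
    input_format  = "org.apache.hadoop.mapred.TextInputFormat"
    output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

    ser_de_info {
      serialization_library = "org.openx.data.jsonserde.JsonSerDe"
    }

    columns {
      name = "message"
      type = "string"
    }
  }
}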
Workgroup Configuration with Cost Controls
# Workgroup with Cost Controls
resource "aws_athena_workgroup" "controlled" {
name = "${var.project_name}-controlled"
configuration {
enforce_workgroup_configuration = true
publish_cloudwatch_metrics_enabled = true
bytes_scanned_cutoff_per_query = 10737418240 # 10GB
result_configuration {
output_location = "s3://${aws_s3_bucket.athena_results.bucket}/controlled/"
encryption_configuration {
encryption_option = "SSE_KMS"
kms_key_arn = aws_kms_key.athena.arn
}
}
engine_version {
selected_engine_version = "Athena engine version 3"
}
}
force_destroy = true
tags = {
Environment = var.environment
}
}
# KMS Key for Encryption
resource "aws_kms_key" "athena" {
description = "KMS key for Athena encryption"
deletion_window_in_days = 7
enable_key_rotation = true
tags = {
Environment = var.environment
}
}
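An alias makes the key easier to find in the console and to reference elsewhere; a small optional addition:

# Optional friendly alias for the Athena KMS key
resource "aws_kms_alias" "athena" {
  name          = "alias/${var.project_name}-athena"
  target_key_id = aws_kms_key.athena.key_id
}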
Query Monitoring
# CloudWatch Dashboard
resource "aws_cloudwatch_dashboard" "athena" {
dashboard_name = "${var.project_name}-athena-dashboard"
dashboard_body = jsonencode({
widgets = [
{
type = "metric"
x = 0
y = 0
width = 12
height = 6
properties = {
metrics = [
["AWS/Athena", "TotalExecutionTime", "WorkGroup", aws_athena_workgroup.main.name],
["AWS/Athena", "QueryScanBytes", "WorkGroup", aws_athena_workgroup.main.name]
]
period = 300
stat = "Average"
region = var.aws_region
title = "Athena Query Metrics"
}
}
]
})
}
# CloudWatch Alarms
resource "aws_cloudwatch_metric_alarm" "query_scan_bytes" {
alarm_name = "${var.project_name}-query-scan-bytes"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
metric_name = "QueryScanBytes"
namespace = "AWS/Athena"
period = "300"
statistic = "Average"
threshold = "5000000000" # 5GB
alarm_description = "This metric monitors the amount of data scanned by Athena queries"
alarm_actions = [var.sns_topic_arn]
dimensions = {
WorkGroup = aws_athena_workgroup.main.name
}
}
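The alarm expects an existing SNS topic via var.sns_topic_arn. If you do not have one, a minimal sketch that creates a topic and an email subscription (the address is a placeholder); you could then reference aws_sns_topic.athena_alerts.arn instead of the variable:

resource "aws_sns_topic" "athena_alerts" {
  name = "${var.project_name}-athena-alerts"
}

resource "aws_sns_topic_subscription" "email" {
  topic_arn = aws_sns_topic.athena_alerts.arn
  protocol  = "email"
  endpoint  = "ops@example.com" # placeholder address
}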
Deployment Steps
- Initialize Terraform:

  terraform init

- Plan the deployment:

  terraform plan

- Apply the configuration:

  terraform apply
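The variables without defaults must be supplied, typically through the terraform.tfvars file from the project structure; an example with placeholder values:

# terraform.tfvars (all values are placeholders)
project_name     = "analytics"
environment      = "dev"
data_bucket      = "my-source-data-bucket"
logs_bucket      = "my-application-logs-bucket"
data_lake_bucket = "my-data-lake-bucket"
sns_topic_arn    = "arn:aws:sns:us-west-2:123456789012:alerts"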
Clean Up
Remove all resources when done:

  terraform destroy
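Note that terraform destroy fails if the results bucket still contains query output. If losing stored results is acceptable, one option is force_destroy on the bucket so Terraform empties it first:

resource "aws_s3_bucket" "athena_results" {
  bucket        = "${var.project_name}-athena-results"
  force_destroy = true # allow destroy even when the bucket holds query results

  tags = {
    Environment = var.environment
  }
}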
Common Use Cases
- Log Analysis
resource "aws_glue_catalog_table" "logs" {
name = "application_logs"
database_name = aws_athena_database.main.name
table_type = "EXTERNAL_TABLE"
parameters = {
EXTERNAL = "TRUE"
"classification" = "json"
"compressionType" = "gzip"
}
partition_keys {
name = "year"
type = "string"
}
partition_keys {
name = "month"
type = "string"
}
partition_keys {
name = "day"
type = "string"
}
storage_descriptor {
location = "s3://${var.logs_bucket}/logs/"
input_format = "org.apache.hadoop.mapred.TextInputFormat"
output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
ser_de_info {
name = "JsonSerDe"
serialization_library = "org.openx.data.jsonserde.JsonSerDe"
}
columns {
name = "timestamp"
type = "timestamp"
}
columns {
name = "level"
type = "string"
}
columns {
name = "message"
type = "string"
}
}
}
resource "aws_athena_named_query" "error_logs" {
name = "find_error_logs"
workgroup = aws_athena_workgroup.main.id
database = aws_athena_database.main.name
query = <<EOF
SELECT timestamp, message
FROM application_logs
WHERE level = 'ERROR'
AND year = '2024'
AND month = '01'
ORDER BY timestamp DESC
LIMIT 100;
EOF
}
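Athena only sees partitions that are registered in the Glue catalog, so new year/month/day prefixes must be loaded before the query above returns data. Assuming Hive-style key=value prefixes (e.g. year=2024/month=01/day=15), a named query can register them on demand; for other layouts, ALTER TABLE ADD PARTITION or partition projection is needed instead:

resource "aws_athena_named_query" "load_log_partitions" {
  name      = "load_log_partitions"
  workgroup = aws_athena_workgroup.main.id
  database  = aws_athena_database.main.name
  # Scans S3 for Hive-style prefixes and registers missing partitions
  query     = "MSCK REPAIR TABLE application_logs;"
}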
- Data Lake Queries
resource "aws_glue_catalog_table" "data_lake" {
name = "data_lake"
database_name = aws_athena_database.main.name
table_type = "EXTERNAL_TABLE"
parameters = {
EXTERNAL = "TRUE"
"classification" = "parquet"
}
storage_descriptor {
location = "s3://${var.data_lake_bucket}/data/"
input_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
output_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"
ser_de_info {
name = "ParquetSerDe"
serialization_library = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
}
}
}
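Because Parquet is columnar, selecting specific columns rather than SELECT * limits the bytes Athena scans, and therefore the cost. A named query using the placeholder columns above to illustrate:

resource "aws_athena_named_query" "data_lake_sample" {
  name      = "data_lake_sample"
  workgroup = aws_athena_workgroup.main.id
  database  = aws_athena_database.main.name
  # Naming columns explicitly lets Athena skip the rest of the Parquet file
  query     = "SELECT id, value FROM data_lake LIMIT 10;"
}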
Conclusion
This setup provides a comprehensive foundation for deploying Athena using Terraform. Remember to:
- Plan your data structure carefully
- Implement proper partitioning
- Monitor query performance and costs
- Keep your configurations versioned
- Test thoroughly before production deployment
The complete code can be customized based on your specific requirements and use cases.