Managing BigQuery with Terraform
Learn how to set up and manage Google BigQuery using Terraform
In this guide, we'll explore how to manage Google BigQuery using Terraform, covering datasets, tables, views, IAM, scheduled queries, and slot reservations.
Prerequisites
- Google Cloud SDK installed and configured
- Terraform installed (version 1.0.0 or later)
- A GCP project with billing enabled
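Before starting, you can confirm both tools are installed and recent enough:

# Both commands print the installed version
terraform version
gcloud --version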
Project Structure
.
├── main.tf              # Main Terraform configuration file
├── variables.tf         # Variable definitions
├── outputs.tf           # Output definitions
├── terraform.tfvars     # Variable values
└── modules/
    └── bigquery/
        ├── main.tf      # BigQuery specific configurations
        ├── variables.tf # Module variables
        ├── datasets.tf  # Dataset configurations
        └── outputs.tf   # Module outputs
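The root main.tf instantiates the module and passes the shared variables through. A minimal sketch (the input names are illustrative, since this guide doesn't pin down the module's interface):

module "bigquery" {
  source = "./modules/bigquery"

  # Illustrative inputs; match these to the variables your module declares.
  project_id = var.project_id
  region     = var.region
  dataset_id = var.dataset_id
}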
Provider Configuration
terraform {
  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "~> 4.0"
    }
  }
}

provider "google" {
  project = var.project_id
  region  = var.region
}
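Terraform can only manage these resources once the corresponding APIs are enabled on the project. These are the standard service names for the features this guide touches:

gcloud services enable \
  bigquery.googleapis.com \
  bigquerydatatransfer.googleapis.com \
  bigqueryreservation.googleapis.com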
Variables
variable "project_id" {
description = "The ID of the GCP project"
type = string
}
variable "region" {
description = "The region to deploy resources to"
type = string
default = "us-central1"
}
variable "dataset_id" {
description = "ID of the BigQuery dataset"
type = string
}
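terraform.tfvars supplies concrete values for these variables, for example (both names below are placeholders):

project_id = "my-analytics-project" # placeholder
dataset_id = "analytics"            # placeholder
region     = "us-central1"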
Dataset Configuration
resource "google_bigquery_dataset" "dataset" {
dataset_id = var.dataset_id
friendly_name = "Analytics Dataset"
description = "Dataset for analytics data"
location = var.region
default_table_expiration_ms = 3600000 # 1 hour
labels = {
environment = "production"
}
access {
role = "OWNER"
user_by_email = google_service_account.bq_admin.email
}
access {
role = "READER"
user_by_email = google_service_account.bq_reader.email
}
}
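BigQuery encrypts data at rest by default; if your requirements call for customer-managed keys (as the best practices later in this guide suggest), the dataset resource accepts a default_encryption_configuration block. A sketch, assuming a Cloud KMS key already exists (the key path is a placeholder):

resource "google_bigquery_dataset" "encrypted" {
  dataset_id = "sensitive_data" # placeholder
  location   = var.region

  default_encryption_configuration {
    # Placeholder key path. The BigQuery service account needs the
    # Cloud KMS CryptoKey Encrypter/Decrypter role on this key.
    kms_key_name = "projects/my-project/locations/us-central1/keyRings/my-ring/cryptoKeys/my-key"
  }
}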
Table Configuration
resource "google_bigquery_table" "table" {
dataset_id = google_bigquery_dataset.dataset.dataset_id
table_id = "events"
time_partitioning {
type = "DAY"
field = "timestamp"
}
clustering = ["user_id", "event_type"]
schema = jsonencode([
{
name = "timestamp",
type = "TIMESTAMP",
mode = "REQUIRED",
description = "Event timestamp"
},
{
name = "user_id",
type = "STRING",
mode = "REQUIRED",
description = "User ID"
},
{
name = "event_type",
type = "STRING",
mode = "REQUIRED",
description = "Type of event"
},
{
name = "properties",
type = "RECORD",
mode = "REPEATED",
fields = [
{
name = "key",
type = "STRING",
mode = "REQUIRED"
},
{
name = "value",
type = "STRING",
mode = "REQUIRED"
}
]
}
])
}
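A related safeguard: forcing queries to filter on the partition column prevents accidental full-table scans. In 4.x-era providers the flag lives inside the time_partitioning block (newer provider versions also expose it at the resource's top level):

time_partitioning {
  type                     = "DAY"
  field                    = "timestamp"
  require_partition_filter = true # reject queries that don't filter on `timestamp`
}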
View Configuration
resource "google_bigquery_table" "view" {
dataset_id = google_bigquery_dataset.dataset.dataset_id
table_id = "daily_events"
view {
query = <<EOF
SELECT
DATE(timestamp) as date,
event_type,
COUNT(*) as event_count
FROM
`${var.project_id}.${var.dataset_id}.events`
GROUP BY
date,
event_type
ORDER BY
date DESC
EOF
use_legacy_sql = false
}
}
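A logical view like this re-runs its query on every read. If the aggregation is queried heavily, a materialized view precomputes and incrementally refreshes it instead. A sketch using the same resource type (the refresh settings are illustrative):

resource "google_bigquery_table" "daily_events_mv" {
  dataset_id = google_bigquery_dataset.dataset.dataset_id
  table_id   = "daily_events_mv"

  materialized_view {
    enable_refresh      = true
    refresh_interval_ms = 1800000 # 30 minutes, illustrative
    query               = <<-EOF
      SELECT
        DATE(timestamp) AS date,
        event_type,
        COUNT(*) AS event_count
      FROM
        `${var.project_id}.${var.dataset_id}.events`
      GROUP BY
        date,
        event_type
    EOF
  }
}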
IAM Configuration
resource "google_service_account" "bq_admin" {
account_id = "bq-admin"
display_name = "BigQuery Admin"
}
resource "google_service_account" "bq_reader" {
account_id = "bq-reader"
display_name = "BigQuery Reader"
}
resource "google_project_iam_member" "bq_admin_role" {
project = var.project_id
role = "roles/bigquery.admin"
member = "serviceAccount:${google_service_account.bq_admin.email}"
}
resource "google_project_iam_member" "bq_reader_role" {
project = var.project_id
role = "roles/bigquery.dataViewer"
member = "serviceAccount:${google_service_account.bq_reader.email}"
}
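The roles above are project-wide and apply to every dataset in the project. For grants scoped to a single dataset, the provider also offers google_bigquery_dataset_iam_member. Note that it manages the same underlying ACL as the access blocks on the dataset resource, so use one mechanism per dataset to keep the two from fighting each other. A sketch:

resource "google_bigquery_dataset_iam_member" "reader" {
  dataset_id = google_bigquery_dataset.dataset.dataset_id
  role       = "roles/bigquery.dataViewer"
  member     = "serviceAccount:${google_service_account.bq_reader.email}"
}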
Data Transfer Configuration
resource "google_bigquery_data_transfer_config" "scheduled_query" {
display_name = "Daily Analytics Query"
data_source_id = "scheduled_query"
destination_dataset_id = google_bigquery_dataset.dataset.dataset_id
location = var.region
schedule = "every 24 hours"
params = {
query = <<EOF
SELECT
DATE(timestamp) as date,
event_type,
COUNT(*) as event_count
FROM
`${var.project_id}.${var.dataset_id}.events`
WHERE
DATE(timestamp) = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
GROUP BY
date,
event_type
EOF
}
}
Reservation Configuration
resource "google_bigquery_reservation" "reservation" {
name = "analytics-reservation"
location = var.region
slot_capacity = 100
ignore_idle_slots = true
}
resource "google_bigquery_reservation_assignment" "assignment" {
name = "analytics-assignment"
reservation = google_bigquery_reservation.reservation.name
assignee = "projects/${var.project_id}"
job_type = "QUERY"
}
Outputs
output "dataset_id" {
value = google_bigquery_dataset.dataset.dataset_id
description = "The ID of the BigQuery dataset"
}
output "table_id" {
value = google_bigquery_table.table.table_id
description = "The ID of the events table"
}
output "admin_service_account" {
value = google_service_account.bq_admin.email
description = "The email of the admin service account"
}
Best Practices
- Dataset Management:
  - Plan partitioning
  - Use clustering
  - Set expiration
  - Monitor size
- Security:
  - Use IAM roles
  - Enable encryption
  - Regular audits
  - Monitor access
- Performance:
  - Optimize queries
  - Use partitioning
  - Monitor costs
  - Regular maintenance
- Cost Optimization:
  - Monitor usage
  - Set quotas
  - Clean up data
  - Regular review
Common Operations
Creating Resources
terraform init   # download providers and initialize the working directory
terraform plan   # preview the changes before applying
terraform apply  # create or update the resources
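After an apply, you can read back the declared outputs, and tear everything down when you're done experimenting:

terraform output dataset_id  # print a single output value
terraform destroy            # remove every resource tracked in the state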
Query Examples
-- Sample query: daily event counts by type
SELECT
  DATE(timestamp) AS date,
  event_type,
  COUNT(*) AS count
FROM
  `project.dataset.events`
GROUP BY
  date,
  event_type
ORDER BY
  date DESC
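The same query can be run from the command line with the bq tool that ships with the Cloud SDK:

bq query --use_legacy_sql=false \
  'SELECT DATE(timestamp) AS date, event_type, COUNT(*) AS count
   FROM `project.dataset.events`
   GROUP BY date, event_type
   ORDER BY date DESC'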
Best Practices and Tips
- Query Optimization:
  - Use partitioning
  - Implement clustering
  - Regular cleanup
  - Monitor costs
- Data Management:
  - Plan schema
  - Set retention
  - Monitor usage
  - Regular backups
- Operations:
  - Monitor performance
  - Track costs
  - Set up alerts
  - Regular maintenance
Conclusion
You’ve learned how to set up and manage Google BigQuery using Terraform. This setup provides:
- Scalable analytics
- Cost optimization
- Performance tuning
- Best practices implementation