Skip to content

Commit 7e9f292

Browse files
committed
Add BigQuery dataset setup and script to load Titanic data
1 parent eb44bf1 commit 7e9f292

File tree

5 files changed

+114
-0
lines changed

5 files changed

+114
-0
lines changed

.github/workflows/terraform.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,12 @@ jobs:
5050
working-directory: ./terraform
5151
# Only run on pushes to the main branch (e.g., after a PR is merged)
5252
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
53+
54+
- name: 'Check BigQuery Dataset and Load Titanic Data'
55+
id: bigquery_check
56+
run: |
57+
chmod +x ../scripts/check_and_load_titanic_data.sh
58+
../scripts/check_and_load_titanic_data.sh ${{ secrets.GCP_PROJECT_ID }}
59+
working-directory: ./terraform
60+
# Only run on pushes to the main branch (after terraform apply)
61+
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/bin/bash
#
# Check whether the 'test_dataset' BigQuery dataset and its 'titanic' table
# exist in the given GCP project; if either is missing, download the Titanic
# CSV and load it into BigQuery via a temporary GCS staging bucket.
#
# Usage: ./check_and_load_titanic_data.sh <project_id>
#
# Requires: gcloud (alpha components), gsutil, curl, and credentials with
# BigQuery + GCS permissions on the target project.

set -euo pipefail  # exit on error, on unset variables, and on pipeline failure

PROJECT_ID="${1:-}"

if [[ -z "$PROJECT_ID" ]]; then
  echo "Error: Project ID is required as first argument" >&2
  exit 1
fi

# Clean up the local CSV and the temporary staging bucket on EVERY exit path,
# so a failure between 'gsutil mb' and the final removal does not leak a
# bucket (the original script only cleaned up on full success).
BUCKET_NAME=""
cleanup() {
  rm -f titanic.csv
  if [[ -n "$BUCKET_NAME" ]]; then
    gsutil rm -r "gs://${BUCKET_NAME}" >/dev/null 2>&1 || true
  fi
}
trap cleanup EXIT

echo "Checking BigQuery dataset and Titanic data for project: $PROJECT_ID"

# Default so the variable is always defined under 'set -u'.
NEED_DATA=false

# Check if dataset exists
if ! gcloud alpha bq datasets describe "test_dataset" --project="$PROJECT_ID" >/dev/null 2>&1; then
  echo "Dataset 'test_dataset' does not exist. Creating dataset and loading Titanic data..."

  # Create dataset
  gcloud alpha bq datasets create "test_dataset" \
    --project="$PROJECT_ID" \
    --description="Test dataset for Titanic data"

  NEED_DATA=true
else
  echo "Dataset 'test_dataset' exists. Checking for 'titanic' table..."

  # Check if table exists
  if ! gcloud alpha bq tables describe "test_dataset.titanic" --project="$PROJECT_ID" >/dev/null 2>&1; then
    echo "Table 'titanic' does not exist in dataset 'test_dataset'."
    NEED_DATA=true
  else
    echo "Table 'titanic' already exists in dataset 'test_dataset'."
  fi
fi

# Download and upload data if needed
if [[ "$NEED_DATA" == "true" ]]; then
  echo "Downloading Titanic dataset..."
  # -f: fail on HTTP errors instead of saving the server's error page as CSV
  # (without it, a 404 would exit 0 and garbage would be loaded into BQ);
  # -sS: quiet but still print errors; -L: follow redirects.
  curl -fsSL -o titanic.csv \
    "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

  # Temporary bucket name carries a timestamp to ensure uniqueness.
  TIMESTAMP=$(date +%s)
  BUCKET_NAME="temp-titanic-data-${TIMESTAMP}"

  echo "Creating temporary bucket: gs://$BUCKET_NAME"
  gsutil mb "gs://${BUCKET_NAME}"

  echo "Uploading Titanic dataset to temporary bucket..."
  gsutil cp titanic.csv "gs://${BUCKET_NAME}/"

  echo "Loading data from bucket to BigQuery..."
  gcloud alpha bq load \
    --project="$PROJECT_ID" \
    --source_format=CSV \
    --skip_leading_rows=1 \
    --autodetect \
    "test_dataset.titanic" \
    "gs://${BUCKET_NAME}/titanic.csv"

  echo "Cleaning up temporary bucket..."
  # The EXIT trap removes the bucket and local CSV; doing it eagerly here as
  # well keeps the success path tidy and the trap becomes a no-op.
  cleanup
  BUCKET_NAME=""

  echo "Titanic data successfully loaded to BigQuery table 'test_dataset.titanic'"
else
  echo "Titanic data already exists in BigQuery. No action needed."
fi

echo "Script completed successfully."
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# BigQuery dataset setup (kept commented out — superseded by the
# google_bigquery_dataset.test_dataset resource in terraform/main.tf)
2+
# resource "google_bigquery_dataset" "dataset" {
3+
# dataset_id = var.dataset_id
4+
# location = var.region
5+
# description = "Dataset for ${var.project_id}"
6+
# labels = {
7+
# environment = var.environment
8+
# project = var.project_id
9+
# }
10+
# }

terraform/main.tf

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,15 @@
22
provider "google" {
33
project = var.project_id
44
region = var.region
5+
}
6+
7+
# BigQuery dataset targeted by scripts/check_and_load_titanic_data.sh.
# The dataset_id "test_dataset" is hard-coded in that script as well —
# keep the two in sync if it is ever renamed.
resource "google_bigquery_dataset" "test_dataset" {
  dataset_id    = "test_dataset"
  friendly_name = "test_dataset"
  location      = var.region
  description   = "Dataset for ${var.project_id}"

  labels = {
    environment = var.environment
    project     = var.project_id
  }
}

terraform/variables.tf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,12 @@ variable "region" {
88
type = string
99
default = "us-east1"
1010
}
11+
12+
# Deployment environment label; no default, so callers must set it
# explicitly (e.g. via terraform.tfvars or -var on the CLI).
variable "environment" {
  description = "The environment name"
  type        = string

  # Restrict to the three supported environments; any other value fails
  # at plan time with the message below.
  validation {
    condition     = contains(["dev", "staging", "prod"], var.environment)
    error_message = "Environment must be one of: dev, staging, prod."
  }
}

0 commit comments

Comments
 (0)