Skip to content

Commit 7e9f292

Browse files
committed
Add BigQuery dataset setup and script to load Titanic data
1 parent eb44bf1 commit 7e9f292

File tree

5 files changed

+114
-0
lines changed

5 files changed

+114
-0
lines changed

.github/workflows/terraform.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,12 @@ jobs:
5050
working-directory: ./terraform
5151
# Only run on pushes to the main branch (e.g., after a PR is merged)
5252
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
53+
54+
- name: 'Check BigQuery Dataset and Load Titanic Data'
55+
id: bigquery_check
56+
run: |
57+
chmod +x ../scripts/check_and_load_titanic_data.sh
58+
../scripts/check_and_load_titanic_data.sh ${{ secrets.GCP_PROJECT_ID }}
59+
working-directory: ./terraform
60+
# Only run on pushes to the main branch (after terraform apply)
61+
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/bin/bash
#
# Check whether the 'test_dataset' BigQuery dataset and its 'titanic' table
# exist in the given GCP project; if either is missing, download the Titanic
# CSV and load it into BigQuery via a temporary GCS staging bucket.
#
# Usage: ./check_and_load_titanic_data.sh <project_id>
#
# Requires: gcloud (alpha components), gsutil, curl, and credentials with
# BigQuery + GCS permissions on the target project.

set -euo pipefail  # exit on error, on unset variables, and on pipeline failure

PROJECT_ID="${1:-}"

if [[ -z "$PROJECT_ID" ]]; then
  echo "Error: Project ID is required as first argument" >&2
  exit 1
fi

# Clean up the local CSV and the temporary staging bucket on EVERY exit path,
# so a failure between 'gsutil mb' and the final removal does not leak a
# bucket (the original script only cleaned up on full success).
BUCKET_NAME=""
cleanup() {
  rm -f titanic.csv
  if [[ -n "$BUCKET_NAME" ]]; then
    gsutil rm -r "gs://${BUCKET_NAME}" >/dev/null 2>&1 || true
  fi
}
trap cleanup EXIT

echo "Checking BigQuery dataset and Titanic data for project: $PROJECT_ID"

# Default so the variable is always defined under 'set -u'.
NEED_DATA=false

# Check if dataset exists
if ! gcloud alpha bq datasets describe "test_dataset" --project="$PROJECT_ID" >/dev/null 2>&1; then
  echo "Dataset 'test_dataset' does not exist. Creating dataset and loading Titanic data..."

  # Create dataset
  gcloud alpha bq datasets create "test_dataset" \
    --project="$PROJECT_ID" \
    --description="Test dataset for Titanic data"

  NEED_DATA=true
else
  echo "Dataset 'test_dataset' exists. Checking for 'titanic' table..."

  # Check if table exists
  if ! gcloud alpha bq tables describe "test_dataset.titanic" --project="$PROJECT_ID" >/dev/null 2>&1; then
    echo "Table 'titanic' does not exist in dataset 'test_dataset'."
    NEED_DATA=true
  else
    echo "Table 'titanic' already exists in dataset 'test_dataset'."
  fi
fi

# Download and upload data if needed
if [[ "$NEED_DATA" == "true" ]]; then
  echo "Downloading Titanic dataset..."
  # -f: fail on HTTP errors instead of saving the server's error page as CSV
  # (without it, a 404 would exit 0 and garbage would be loaded into BQ);
  # -sS: quiet but still print errors; -L: follow redirects.
  curl -fsSL -o titanic.csv \
    "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

  # Temporary bucket name carries a timestamp to ensure uniqueness.
  TIMESTAMP=$(date +%s)
  BUCKET_NAME="temp-titanic-data-${TIMESTAMP}"

  echo "Creating temporary bucket: gs://$BUCKET_NAME"
  gsutil mb "gs://${BUCKET_NAME}"

  echo "Uploading Titanic dataset to temporary bucket..."
  gsutil cp titanic.csv "gs://${BUCKET_NAME}/"

  echo "Loading data from bucket to BigQuery..."
  gcloud alpha bq load \
    --project="$PROJECT_ID" \
    --source_format=CSV \
    --skip_leading_rows=1 \
    --autodetect \
    "test_dataset.titanic" \
    "gs://${BUCKET_NAME}/titanic.csv"

  echo "Cleaning up temporary bucket..."
  # The EXIT trap removes the bucket and local CSV; doing it eagerly here as
  # well keeps the success path tidy and the trap becomes a no-op.
  cleanup
  BUCKET_NAME=""

  echo "Titanic data successfully loaded to BigQuery table 'test_dataset.titanic'"
else
  echo "Titanic data already exists in BigQuery. No action needed."
fi

echo "Script completed successfully."
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# BigQuery dataset setup (kept commented out — superseded by the
# google_bigquery_dataset.test_dataset resource in terraform/main.tf)
2+
# resource "google_bigquery_dataset" "dataset" {
3+
# dataset_id = var.dataset_id
4+
# location = var.region
5+
# description = "Dataset for ${var.project_id}"
6+
# labels = {
7+
# environment = var.environment
8+
# project = var.project_id
9+
# }
10+
# }

terraform/main.tf

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,15 @@
22
provider "google" {
33
project = var.project_id
44
region = var.region
5+
}
6+
7+
# BigQuery dataset targeted by scripts/check_and_load_titanic_data.sh.
# The dataset_id "test_dataset" is hard-coded in that script as well —
# keep the two in sync if it is ever renamed.
resource "google_bigquery_dataset" "test_dataset" {
  dataset_id    = "test_dataset"
  friendly_name = "test_dataset"
  location      = var.region
  description   = "Dataset for ${var.project_id}"

  labels = {
    environment = var.environment
    project     = var.project_id
  }
}

terraform/variables.tf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,12 @@ variable "region" {
88
type = string
99
default = "us-east1"
1010
}
11+
12+
# Deployment environment label; no default, so callers must set it
# explicitly (e.g. via terraform.tfvars or -var on the CLI).
variable "environment" {
  description = "The environment name"
  type        = string

  # Restrict to the three supported environments; any other value fails
  # at plan time with the message below.
  validation {
    condition     = contains(["dev", "staging", "prod"], var.environment)
    error_message = "Environment must be one of: dev, staging, prod."
  }
}

0 commit comments

Comments
 (0)