#!/bin/bash
#
# Checks whether BigQuery dataset 'test_dataset' and table 'titanic' exist in
# the given project; if either is missing, downloads the public Titanic CSV
# and loads it into BigQuery via a temporary GCS bucket.
#
# Usage: ./check_and_load_titanic_data.sh <project_id>
#
# Requires: gcloud (alpha component), gsutil, curl; caller must already be
# authenticated (gcloud auth) with permissions on the target project.

set -euo pipefail

readonly DATASET="test_dataset"
readonly TABLE="titanic"
readonly CSV_URL="https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# ${1-} (default-if-unset) keeps the friendly error below reachable under set -u.
PROJECT_ID=${1-}

if [ -z "$PROJECT_ID" ]; then
  echo "Error: Project ID is required as first argument" >&2
  exit 1
fi

echo "Checking BigQuery dataset and Titanic data for project: $PROJECT_ID"

NEED_DATA=false

# Check if dataset exists (describe is quiet; non-zero status means "missing").
if ! gcloud alpha bq datasets describe "$DATASET" --project="$PROJECT_ID" >/dev/null 2>&1; then
  echo "Dataset '$DATASET' does not exist. Creating dataset and loading Titanic data..."

  # Create dataset
  gcloud alpha bq datasets create "$DATASET" \
    --project="$PROJECT_ID" \
    --description="Test dataset for Titanic data"

  NEED_DATA=true
else
  echo "Dataset '$DATASET' exists. Checking for '$TABLE' table..."

  # Check if table exists
  if ! gcloud alpha bq tables describe "$DATASET.$TABLE" --project="$PROJECT_ID" >/dev/null 2>&1; then
    echo "Table '$TABLE' does not exist in dataset '$DATASET'."
    NEED_DATA=true
  else
    echo "Table '$TABLE' already exists in dataset '$DATASET'."
  fi
fi

# Download and upload data if needed
if [ "$NEED_DATA" = "true" ]; then
  echo "Downloading Titanic dataset..."
  # -f: fail on HTTP errors so we never feed an error page into BigQuery;
  # -sS: quiet progress but still print real errors; -L: follow redirects.
  curl -fsSL -o titanic.csv "$CSV_URL"

  # Bucket names are globally unique; timestamp + PID reduces collisions
  # between concurrent runs on the same or different machines.
  BUCKET_NAME="temp-titanic-data-$(date +%s)-$$"

  # Always remove the temp bucket and local CSV, even if a later step fails.
  cleanup() {
    gsutil -q rm -r "gs://$BUCKET_NAME" 2>/dev/null || true
    rm -f titanic.csv
  }
  trap cleanup EXIT

  echo "Creating temporary bucket: gs://$BUCKET_NAME"
  # -p pins the bucket to the target project instead of the gcloud default.
  gsutil mb -p "$PROJECT_ID" "gs://$BUCKET_NAME"

  echo "Uploading Titanic dataset to temporary bucket..."
  gsutil cp titanic.csv "gs://$BUCKET_NAME/"

  echo "Loading data from bucket to BigQuery..."
  # NOTE(review): current SDKs may not expose a 'load' verb under
  # 'gcloud alpha bq' — the canonical tool is 'bq load'. Confirm against the
  # installed SDK before relying on this in CI.
  gcloud alpha bq load \
    --project="$PROJECT_ID" \
    --source_format=CSV \
    --skip_leading_rows=1 \
    --autodetect \
    "$DATASET.$TABLE" \
    "gs://$BUCKET_NAME/titanic.csv"

  echo "Cleaning up temporary bucket..."
  # Explicit cleanup on the happy path; the EXIT trap makes it idempotent.
  cleanup
  trap - EXIT

  echo "Titanic data successfully loaded to BigQuery table '$DATASET.$TABLE'"
else
  echo "Titanic data already exists in BigQuery. No action needed."
fi

echo "Script completed successfully."