diff --git a/dagster_fly.yaml b/dagster_fly.yaml
index c69001a1..eb820626 100644
--- a/dagster_fly.yaml
+++ b/dagster_fly.yaml
@@ -6,11 +6,11 @@ run_coordinator:
   module: dagster.core.run_coordinator
   class: QueuedRunCoordinator
   config:
-    max_concurrent_runs: 12  # Increased for 8GB RAM + 4 performance CPUs (prevents queue backup)
+    max_concurrent_runs: 8  # Reduced for better stability in single container
     tag_concurrency_limits:
       - key: "dagster/concurrency_key"
         value: "database"
-        limit: 2
+        limit: 1  # Reduced to prevent resource contention
       - key: "dagster/concurrency_key"
         value: "ml_training"
         limit: 1
@@ -27,7 +27,7 @@ storage:
 
 run_retries:
   enabled: true
-  max_retries: 1
+  max_retries: 2  # Increased for better reliability
 
 # Aggressive retention policies optimized for Fly.io disk usage
 retention:
@@ -39,25 +39,26 @@ retention:
       failure: 2
       success: 1
 
-# Run monitoring for Fly.io environment
+# Enhanced run monitoring for Fly.io environment
 run_monitoring:
   enabled: true
-  start_timeout_seconds: 180  # 3 minutes to start
-  cancel_timeout_seconds: 120  # 2 minutes to cancel
-  max_runtime_seconds: 2700  # 45 minutes max runtime per run (increased with better resources)
-  poll_interval_seconds: 60  # Check every minute
+  start_timeout_seconds: 300  # 5 minutes to start (increased for cold starts)
+  cancel_timeout_seconds: 180  # 3 minutes to cancel (increased)
+  max_runtime_seconds: 3600  # 1 hour max runtime per run
+  poll_interval_seconds: 30  # Check every 30 seconds (more frequent)
 
 # Disable telemetry
 telemetry:
   enabled: false
 
+# Optimized for single container environment
 schedules:
   use_threads: true
-  num_workers: 4  # Conservative for Fly.io
+  num_workers: 2  # Reduced for single container
 
 sensors:
   use_threads: true
-  num_workers: 2  # Conservative for Fly.io
+  num_workers: 1  # Reduced for single container
 
 compute_logs:
   module: dagster.core.storage.local_compute_log_manager
@@ -71,3 +72,7 @@ local_artifact_storage:
   class: LocalArtifactStorage
   config:
     base_dir: "/data/artifacts"
+
+# Code server timeouts
+code_servers:
+  reload_timeout: 60  # Give code servers more time to reload
diff --git a/dagster_home/workspace.yaml b/dagster_home/workspace.yaml
index 3f6682d3..597f4c93 100644
--- a/dagster_home/workspace.yaml
+++ b/dagster_home/workspace.yaml
@@ -7,3 +7,7 @@ load_from:
         env: DAGSTER_CODE_SERVER_HOST
       port: 4000
       location_name: "anomstack_code"
+      # A grpc_server entry accepts only host, port, socket, location_name and
+      # ssl. Heartbeat/startup timeouts, message-size limits and gRPC keepalive
+      # options are not valid keys here and make workspace validation fail, so
+      # they must be configured elsewhere (verify against the Dagster version).
diff --git a/docker/Dockerfile.fly b/docker/Dockerfile.fly
index b4ed98e4..8c418980 100644
--- a/docker/Dockerfile.fly
+++ b/docker/Dockerfile.fly
@@ -8,6 +8,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     cmake \
     nginx \
     apache2-utils \
+    net-tools \
+    netcat-openbsd \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /opt/dagster/app
@@ -26,13 +28,15 @@ COPY metrics ./metrics
 COPY dagster_fly.yaml ./dagster_home/dagster.yaml
 COPY dagster_home/workspace.yaml ./dagster_home/workspace.yaml
 COPY scripts/deployment/start.sh /opt/dagster/start.sh
+COPY scripts/deployment/debug_grpc.sh /opt/dagster/debug_grpc.sh
 
 # Verify files were copied and create any missing directories
 RUN ls -la ./dagster_home/ && \
     mkdir -p /opt/dagster/dagster_home && \
     cp -r ./dagster_home/* /opt/dagster/dagster_home/ && \
     ls -la /opt/dagster/dagster_home/ && \
-    chmod +x /opt/dagster/start.sh
+    chmod +x /opt/dagster/start.sh && \
+    chmod +x /opt/dagster/debug_grpc.sh
 
 # Create necessary directories for SQLite storage and artifacts
 RUN mkdir -p /data/models /data/dagster_storage /data/artifacts /tmp/dagster/compute_logs
diff --git a/scripts/deployment/debug_grpc.sh b/scripts/deployment/debug_grpc.sh
new file mode 100644
index 00000000..a968aa5c
--- /dev/null
+++ b/scripts/deployment/debug_grpc.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Debug script for Dagster gRPC connectivity issues in Fly.io deployment
+echo "🔍 Dagster gRPC Connectivity Debugger"
+echo "======================================="
+
+# Check environment
+echo "📋 Environment Information:"
+echo "DAGSTER_HOME: ${DAGSTER_HOME:-not set}"
+echo "DAGSTER_CODE_SERVER_HOST: ${DAGSTER_CODE_SERVER_HOST:-not set}"
+echo "PYTHONPATH: ${PYTHONPATH:-not set}"
+echo ""
+
+# Check if processes are running
+echo "🔄 Process Status:"
+echo "Code server (port 4000):"
+if pgrep -f "dagster code-server" > /dev/null; then
+    echo " ✅ Code server process is running (PID: $(pgrep -f 'dagster code-server'))"
+else
+    echo " ❌ Code server process is not running"
+fi
+
+echo "Webserver (port 3000):"
+if pgrep -f "dagster-webserver" > /dev/null; then
+    echo " ✅ Webserver process is running (PID: $(pgrep -f 'dagster-webserver'))"
+else
+    echo " ❌ Webserver process is not running"
+fi
+
+echo "Daemon:"
+if pgrep -f "dagster-daemon" > /dev/null; then
+    echo " ✅ Daemon process is running (PID: $(pgrep -f 'dagster-daemon'))"
+else
+    echo " ❌ Daemon process is not running"
+fi
+echo ""
+
+# Check ports
+echo "🌐 Port Status:"
+echo "Port 4000 (code server):"
+if netstat -tuln 2>/dev/null | grep ":4000" > /dev/null; then
+    echo " ✅ Port 4000 is listening"
+    netstat -tuln | grep ":4000"
+else
+    echo " ❌ Port 4000 is not listening"
+fi
+
+echo "Port 3000 (webserver):"
+if netstat -tuln 2>/dev/null | grep ":3000" > /dev/null; then
+    echo " ✅ Port 3000 is listening"
+    netstat -tuln | grep ":3000"
+else
+    echo " ❌ Port 3000 is not listening"
+fi
+echo ""
+
+# Test gRPC health check
+echo "💓 gRPC Health Check:"
+if dagster api grpc-health-check -p 4000 2>/dev/null; then
+    echo " ✅ gRPC health check passed"
+else
+    echo " ❌ gRPC health check failed"
+    echo " Detailed error:"
+    dagster api grpc-health-check -p 4000 2>&1 | head -5
+fi
+echo ""
+
+# Test workspace loading
+echo "📚 Workspace Configuration:"
+if [ -f "$DAGSTER_HOME/workspace.yaml" ]; then
+    echo " ✅ Workspace file exists: $DAGSTER_HOME/workspace.yaml"
+    echo " Content preview:"
+    head -15 "$DAGSTER_HOME/workspace.yaml" | sed 's/^/ /'
+else
+    echo " ❌ Workspace file not found: $DAGSTER_HOME/workspace.yaml"
+fi
+echo ""
+
+# Check log files for errors
+echo "📄 Recent Log Entries:"
+for logfile in /tmp/code_server.log /tmp/webserver.log /tmp/daemon.log; do
+    if [ -f "$logfile" ]; then
+        echo " 📄 $logfile (last 10 lines):"
+        tail -10 "$logfile" 2>/dev/null | sed 's/^/ /' || echo " Could not read log file"
+        echo ""
+    fi
+done
+
+# Test direct connection
+echo "🔌 Direct Connection Test:"
+if command -v telnet >/dev/null 2>&1; then
+    echo " Testing localhost:4000..."
+    timeout 5 telnet localhost 4000 2>/dev/null && echo " ✅ Connection successful" || echo " ❌ Connection failed"
+elif command -v nc >/dev/null 2>&1; then
+    echo " Testing localhost:4000..."
+    timeout 5 nc -z localhost 4000 2>/dev/null && echo " ✅ Connection successful" || echo " ❌ Connection failed"
+else
+    echo " ⚠️ No telnet or nc available for connection testing"
+fi
+echo ""
+
+# System resources
+echo "💾 System Resources:"
+echo " Memory usage:"
+free -h 2>/dev/null | head -2 | sed 's/^/ /' || echo " Memory info not available"
+echo " Disk usage for /data:"
+df -h /data 2>/dev/null | sed 's/^/ /' || echo " Disk info not available"
+echo ""
+
+echo "🏁 Debug complete!"
+echo ""
+echo "💡 Common fixes:"
+echo " 1. Restart the deployment: fly deploy"
+echo " 2. Check resource limits in fly.toml"
+echo " 3. Review logs: fly logs"
+echo " 4. Scale up if memory/CPU constrained: fly scale memory 8192"
diff --git a/scripts/deployment/start.sh b/scripts/deployment/start.sh
index 7b3f3522..0116c43d 100644
--- a/scripts/deployment/start.sh
+++ b/scripts/deployment/start.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Anomstack Startup Script for Fly.io
+# Anomstack Startup Script for Fly.io with improved gRPC connectivity
 set -e
 
 echo "🚀 Starting Anomstack services..."
@@ -24,35 +24,144 @@ ADMIN_PASSWORD="${ANOMSTACK_ADMIN_PASSWORD:-anomstack2024}"
 htpasswd -bc /etc/nginx/.htpasswd "$ADMIN_USERNAME" "$ADMIN_PASSWORD"
 echo "✅ Authentication configured for user: $ADMIN_USERNAME"
 
+# Function to check if code server is healthy
+check_code_server_health() {
+    local retries=0
+    local max_retries=30
+    while [ $retries -lt $max_retries ]; do
+        if dagster api grpc-health-check -p 4000 >/dev/null 2>&1; then
+            echo "✅ Code server is healthy"
+            return 0
+        fi
+        echo "⏳ Waiting for code server to be ready... (attempt $((retries + 1))/$max_retries)"
+        sleep 2
+        retries=$((retries + 1))
+    done
+    echo "❌ Code server failed to start after $max_retries attempts"
+    return 1
+}
+
+# Start a service in the background, retrying if it exits right away.
+# Progress messages go to stdout, so callers must invoke this directly (not
+# inside $(...)) and read the PID from STARTED_PID after a successful return.
+start_process_with_retry() {
+    local name=$1
+    local command=$2
+    local logfile=$3
+    local max_retries=3
+    local retry=0
+    local pid
+
+    STARTED_PID=""
+    while [ $retry -lt $max_retries ]; do
+        echo "🔧 Starting $name (attempt $((retry + 1))/$max_retries)..."
+        # $command is deliberately unquoted so it splits into program + arguments
+        nohup $command > "$logfile" 2>&1 &
+        pid=$!
+        echo "$name PID: $pid"
+
+        # Give it a moment to crash if it's going to
+        sleep 3
+
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "✅ $name started successfully"
+            STARTED_PID=$pid
+            return 0
+        fi
+        echo "⚠️ $name failed to start, retrying..."
+        retry=$((retry + 1))
+    done
+
+    echo "❌ Failed to start $name after $max_retries attempts"
+    return 1
+}
+
 echo "🔧 Starting code server..."
-nohup dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py > /tmp/code_server.log 2>&1 &
-CODE_SERVER_PID=$!
+if ! start_process_with_retry "Code Server" "dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py" "/tmp/code_server.log"; then
+    echo "❌ Failed to start code server, exiting"
+    exit 1
+fi
+CODE_SERVER_PID=$STARTED_PID
 
-echo "⏳ Waiting for code server to start..."
-sleep 10
+echo "⏳ Waiting for code server to be ready..."
+if ! check_code_server_health; then
+    echo "❌ Code server health check failed, exiting"
+    exit 1
+fi
 
 echo "🌐 Starting webserver..."
-nohup dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml > /tmp/webserver.log 2>&1 &
-WEBSERVER_PID=$!
+if ! start_process_with_retry "Webserver" "dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/webserver.log"; then
+    echo "❌ Failed to start webserver, exiting"
+    exit 1
+fi
+WEBSERVER_PID=$STARTED_PID
 
 echo "⚙️ Starting daemon..."
-nohup dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml > /tmp/daemon.log 2>&1 &
-DAEMON_PID=$!
+if ! start_process_with_retry "Daemon" "dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/daemon.log"; then
+    echo "❌ Failed to start daemon, exiting"
+    exit 1
+fi
+DAEMON_PID=$STARTED_PID
 
 echo "📊 Starting dashboard..."
-nohup uvicorn dashboard.app:app --host 0.0.0.0 --port 8080 > /tmp/dashboard.log 2>&1 &
-DASHBOARD_PID=$!
+if ! start_process_with_retry "Dashboard" "uvicorn dashboard.app:app --host 0.0.0.0 --port 8080" "/tmp/dashboard.log"; then
+    echo "❌ Failed to start dashboard, exiting"
+    exit 1
+fi
+DASHBOARD_PID=$STARTED_PID
 
 echo "🌐 Starting nginx reverse proxy..."
-nginx -t && nginx -g "daemon off;" > /tmp/nginx.log 2>&1 &
+nginx -t && nginx -g "daemon off;" &
 NGINX_PID=$!
 
-echo "✅ All services started!"
+echo "✅ All services started successfully!"
 echo "Code Server PID: $CODE_SERVER_PID"
 echo "Webserver PID: $WEBSERVER_PID"
 echo "Daemon PID: $DAEMON_PID"
 echo "Dashboard PID: $DASHBOARD_PID"
 echo "Nginx PID: $NGINX_PID"
 
-# Keep the script running
-wait
+# Function to handle shutdown gracefully
+cleanup() {
+    echo "🛑 Shutting down services..."
+    kill $NGINX_PID $DASHBOARD_PID $DAEMON_PID $WEBSERVER_PID $CODE_SERVER_PID 2>/dev/null || true
+    wait
+    exit 0
+}
+
+# Trap signals for graceful shutdown
+trap cleanup SIGTERM SIGINT
+
+# Monitor processes and restart if they crash
+while true; do
+    # A failed restart keeps the stale PID so the next pass tries again
+    if ! kill -0 "$CODE_SERVER_PID" 2>/dev/null; then
+        echo "❌ Code server crashed, restarting..."
+        if start_process_with_retry "Code Server" "dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py" "/tmp/code_server.log"; then
+            CODE_SERVER_PID=$STARTED_PID
+        fi
+    fi
+
+    if ! kill -0 "$WEBSERVER_PID" 2>/dev/null; then
+        echo "❌ Webserver crashed, restarting..."
+        if start_process_with_retry "Webserver" "dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/webserver.log"; then
+            WEBSERVER_PID=$STARTED_PID
+        fi
+    fi
+
+    if ! kill -0 "$DAEMON_PID" 2>/dev/null; then
+        echo "❌ Daemon crashed, restarting..."
+        if start_process_with_retry "Daemon" "dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/daemon.log"; then
+            DAEMON_PID=$STARTED_PID
+        fi
+    fi
+
+    if ! kill -0 "$DASHBOARD_PID" 2>/dev/null; then
+        echo "❌ Dashboard crashed, restarting..."
+        if start_process_with_retry "Dashboard" "uvicorn dashboard.app:app --host 0.0.0.0 --port 8080" "/tmp/dashboard.log"; then
+            DASHBOARD_PID=$STARTED_PID
+        fi
+    fi
+
+    sleep 30
+done