andrewm4894 · andrewm4894 · Jul 23, 2025 · Jul 23, 2025 · Jul 23, 2025 · Copilot
diff --git a/dagster_fly.yaml b/dagster_fly.yaml
@@ -6,11 +6,11 @@ run_coordinator:
   module: dagster.core.run_coordinator
   class: QueuedRunCoordinator
   config:
-    max_concurrent_runs: 12  # Increased for 8GB RAM + 4 performance CPUs (prevents queue backup)
+    max_concurrent_runs: 8  # Reduced for better stability in single container
     tag_concurrency_limits:
       - key: "dagster/concurrency_key"
         value: "database"
-        limit: 2
+        limit: 1  # Reduced to prevent resource contention
       - key: "dagster/concurrency_key"
         value: "ml_training"
         limit: 1
@@ -27,7 +27,7 @@ storage:
 
 run_retries:
   enabled: true
-  max_retries: 1
+  max_retries: 2  # Increased for better reliability
 
 # Aggressive retention policies optimized for Fly.io disk usage
 retention:
@@ -39,25 +39,26 @@ retention:
       failure: 2
       success: 1
 
-# Run monitoring for Fly.io environment
+# Enhanced run monitoring for Fly.io environment
 run_monitoring:
   enabled: true
-  start_timeout_seconds: 180   # 3 minutes to start
-  cancel_timeout_seconds: 120  # 2 minutes to cancel
-  max_runtime_seconds: 2700    # 45 minutes max runtime per run (increased with better resources)
-  poll_interval_seconds: 60    # Check every minute
+  start_timeout_seconds: 300   # 5 minutes to start (increased for cold starts)
+  cancel_timeout_seconds: 180  # 3 minutes to cancel (increased)
+  max_runtime_seconds: 3600    # 1 hour max runtime per run
+  poll_interval_seconds: 30    # Check every 30 seconds (more frequent)
 
 # Disable telemetry
 telemetry:
   enabled: false
 
+# Optimized for single container environment
 schedules:
   use_threads: true
-  num_workers: 4  # Conservative for Fly.io
+  num_workers: 2  # Reduced for single container
 
 sensors:
   use_threads: true
-  num_workers: 2  # Conservative for Fly.io
+  num_workers: 1  # Reduced for single container
 
 compute_logs:
   module: dagster.core.storage.local_compute_log_manager
@@ -71,3 +72,8 @@ local_artifact_storage:
   class: LocalArtifactStorage
   config:
     base_dir: "/data/artifacts"
+
+# Code server timeouts (this block sets timeouts only; it does not configure logging)
+# NOTE(review): confirm that `heartbeat_timeout` is an accepted key under
+# `code_servers` in Dagster's instance config - TODO verify against the schema
+code_servers:
+  reload_timeout: 60  # Give code servers more time to reload
+  heartbeat_timeout: 60  # Longer heartbeat timeout for reliability
diff --git a/dagster_home/workspace.yaml b/dagster_home/workspace.yaml
@@ -7,3 +7,14 @@ load_from:
         env: DAGSTER_CODE_SERVER_HOST
       port: 4000
       location_name: "anomstack_code"
+      # Enhanced connection settings for single container deployment
+      # NOTE(review): confirm Dagster's workspace `grpc_server` schema accepts the
+      # timeout, message-size and keepalive keys below - TODO verify
+      heartbeat_timeout: 60      # Allow 60 seconds for heartbeat
+      startup_timeout: 180       # Allow 3 minutes for startup
+      max_send_message_length: 52428800  # 50MB max message size
+      max_receive_message_length: 52428800  # 50MB max message size
+      # gRPC keepalive settings for better connection stability
+      grpc_keepalive_time_ms: 30000        # Send keepalive every 30 seconds
+      grpc_keepalive_timeout_ms: 10000     # Wait 10 seconds for keepalive response
+      grpc_keepalive_permit_without_calls: true  # Allow keepalive without active calls
+      grpc_http2_max_pings_without_data: 0      # No limit on pings without data
+      # Declared once (a repeated key is invalid YAML) and kept equal to grpc_keepalive_time_ms
+      grpc_http2_min_ping_interval_without_data_ms: 30000  # 30 seconds between pings
diff --git a/docker/Dockerfile.fly b/docker/Dockerfile.fly
@@ -8,6 +8,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     cmake \
     nginx \
     apache2-utils \
+    net-tools \
+    netcat-openbsd \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /opt/dagster/app
@@ -26,13 +28,15 @@ COPY metrics ./metrics
 COPY dagster_fly.yaml ./dagster_home/dagster.yaml
 COPY dagster_home/workspace.yaml ./dagster_home/workspace.yaml
 COPY scripts/deployment/start.sh /opt/dagster/start.sh
+COPY scripts/deployment/debug_grpc.sh /opt/dagster/debug_grpc.sh
 
 # Verify files were copied and create any missing directories
 RUN ls -la ./dagster_home/ && \
     mkdir -p /opt/dagster/dagster_home && \
     cp -r ./dagster_home/* /opt/dagster/dagster_home/ && \
     ls -la /opt/dagster/dagster_home/ && \
-    chmod +x /opt/dagster/start.sh
+    chmod +x /opt/dagster/start.sh && \
+    chmod +x /opt/dagster/debug_grpc.sh
 
 # Create necessary directories for SQLite storage and artifacts
 RUN mkdir -p /data/models /data/dagster_storage /data/artifacts /tmp/dagster/compute_logs

diff --git a/scripts/deployment/debug_grpc.sh b/scripts/deployment/debug_grpc.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Debug script for Dagster gRPC connectivity issues in Fly.io deployment
+echo "🔍 Dagster gRPC Connectivity Debugger"
+echo "======================================="
+
+# Check environment
+echo "📋 Environment Information:"
+echo "DAGSTER_HOME: ${DAGSTER_HOME:-not set}"
+echo "DAGSTER_CODE_SERVER_HOST: ${DAGSTER_CODE_SERVER_HOST:-not set}"
+echo "PYTHONPATH: ${PYTHONPATH:-not set}"
+echo ""
+
+# Check if processes are running
+# report_process <heading> <label> <pattern>
+# Prints the heading, then whether pgrep finds any process whose full command
+# line matches the pattern (listing the matching PIDs when it does).
+report_process() {
+    local heading=$1
+    local label=$2
+    local pattern=$3
+    echo "$heading"
+    if pgrep -f "$pattern" > /dev/null; then
+        echo "  ✅ $label process is running (PID: $(pgrep -f "$pattern"))"
+    else
+        echo "  ❌ $label process is not running"
+    fi
+}
+
+echo "🔄 Process Status:"
+report_process "Code server (port 4000):" "Code server" "dagster code-server"
+
+report_process "Webserver (port 3000):" "Webserver" "dagster-webserver"
+
+report_process "Daemon:" "Daemon" "dagster-daemon"
+echo ""
+
+# Check ports
+# report_port <port> <label>
+# Reports whether netstat shows a listening socket bound to exactly <port>,
+# and prints the matching netstat lines when it does.
+report_port() {
+    local port=$1
+    local label=$2
+    local listeners
+    echo "Port $port ($label):"
+    # Require whitespace after the port number so ":4000" cannot match ":40001".
+    # netstat runs once; its output is reused for both the test and the listing.
+    listeners=$(netstat -tuln 2>/dev/null | grep -E ":${port}[[:space:]]")
+    if [ -n "$listeners" ]; then
+        echo "  ✅ Port $port is listening"
+        echo "$listeners"
+    else
+        echo "  ❌ Port $port is not listening"
+    fi
+}
+
+echo "🌐 Port Status:"
+report_port 4000 "code server"
+
+report_port 3000 "webserver"
+echo ""
+
+# Test gRPC health check
+echo "💓 gRPC Health Check:"
+if dagster api grpc-health-check -p 4000 2>/dev/null; then
+    echo "  ✅ gRPC health check passed"
+else
+    echo "  ❌ gRPC health check failed"
+    echo "  Detailed error:"
+    dagster api grpc-health-check -p 4000 2>&1 | head -5
+fi
+echo ""
+
+# Test workspace loading
+echo "📚 Workspace Configuration:"
+if [ -f "$DAGSTER_HOME/workspace.yaml" ]; then
+    echo "  ✅ Workspace file exists: $DAGSTER_HOME/workspace.yaml"
+    echo "  Content preview:"
+    head -15 "$DAGSTER_HOME/workspace.yaml" | sed 's/^/    /'
+else
+    echo "  ❌ Workspace file not found: $DAGSTER_HOME/workspace.yaml"
+fi
+echo ""
+
+# Check log files for errors
+# Shows the last 10 lines of each service log that exists.
+echo "📄 Recent Log Entries:"
+for logfile in /tmp/code_server.log /tmp/webserver.log /tmp/daemon.log /tmp/dashboard.log; do
+    if [ -f "$logfile" ]; then
+        echo "  📄 $logfile (last 10 lines):"
+        # Test tail's own exit status: in `tail | sed || echo` the fallback is keyed
+        # to sed (the last pipeline stage), so an unreadable file was never reported.
+        if recent_lines=$(tail -10 "$logfile" 2>/dev/null); then
+            # Skip the indent step for an empty log so no blank padded line is printed
+            if [ -n "$recent_lines" ]; then
+                printf '%s\n' "$recent_lines" | sed 's/^/    /'
+            fi
+        else
+            echo "    Could not read log file"
+        fi
+        echo ""
+    fi
+done
+
+# Test direct connection
+# Probes TCP port 4000 on localhost and reports whether a connection can be opened.
+# telnet is not used: once connected it keeps the session open until `timeout`
+# kills it, and that non-zero exit would be reported as a failed connection.
+echo "🔌 Direct Connection Test:"
+echo "  Testing localhost:4000..."
+if command -v nc >/dev/null 2>&1; then
+    # nc -z only checks that the port accepts a connection, then exits
+    if timeout 5 nc -z localhost 4000 2>/dev/null; then
+        echo "  ✅ Connection successful"
+    else
+        echo "  ❌ Connection failed"
+    fi
+else
+    # No nc available: fall back to bash's built-in /dev/tcp redirection
+    if timeout 5 bash -c 'exec 3<>/dev/tcp/localhost/4000' 2>/dev/null; then
+        echo "  ✅ Connection successful"
+    else
+        echo "  ❌ Connection failed"
+    fi
+fi
+echo ""
+
+# System resources
+# Prints memory and /data disk usage, or a fallback line when a tool fails.
+# Each tool's output is captured first so the fallback depends on that tool's
+# exit status; chained after `... | sed`, `|| echo` only ever saw sed's status.
+echo "💾 System Resources:"
+echo "  Memory usage:"
+if memory_info=$(free -h 2>/dev/null); then
+    printf '%s\n' "$memory_info" | head -2 | sed 's/^/    /'
+else
+    echo "    Memory info not available"
+fi
+echo "  Disk usage for /data:"
+if disk_info=$(df -h /data 2>/dev/null); then
+    printf '%s\n' "$disk_info" | sed 's/^/    /'
+else
+    echo "    Disk info not available"
+fi
+echo ""
+
+echo "🏁 Debug complete!"
+echo ""
+echo "💡 Common fixes:"
+echo "  1. Restart the deployment: fly deploy"
+echo "  2. Check resource limits in fly.toml"
+echo "  3. Review logs: fly logs"
+echo "  4. Scale up if memory/CPU constrained: fly scale memory 8192" 
diff --git a/scripts/deployment/start.sh b/scripts/deployment/start.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Anomstack Startup Script for Fly.io
+# Anomstack Startup Script for Fly.io with improved gRPC connectivity
 set -e
 
 echo "🚀 Starting Anomstack services..."
@@ -24,35 +24,132 @@ ADMIN_PASSWORD="${ANOMSTACK_ADMIN_PASSWORD:-anomstack2024}"
 htpasswd -bc /etc/nginx/.htpasswd "$ADMIN_USERNAME" "$ADMIN_PASSWORD"
 echo "✅ Authentication configured for user: $ADMIN_USERNAME"
 
+# Wait for the code server to answer `dagster api grpc-health-check` on port 4000.
+# Makes up to 30 attempts, pausing 2 seconds after each failed one.
+# Returns 0 as soon as a check succeeds, 1 if every attempt fails.
+check_code_server_health() {
+    local max_retries=30
+    local attempt
+    for ((attempt = 1; attempt <= max_retries; attempt++)); do
+        if dagster api grpc-health-check -p 4000 >/dev/null 2>&1; then
+            echo "✅ Code server is healthy"
+            return 0
+        fi
+        echo "⏳ Waiting for code server to be ready... (attempt $attempt/$max_retries)"
+        sleep 2
+    done
+    echo "❌ Code server failed to start after $max_retries attempts"
+    return 1
+}
+
+# Start a long-running service in the background, retrying if it dies right away.
+#   $1 - display name used in progress messages
+#   $2 - command line; expanded unquoted, so it is split on whitespace only
+#        (shell operators such as `&&` and nested quotes are NOT interpreted)
+#   $3 - file that receives the service's stdout and stderr
+# Prints ONLY the PID of the started process on stdout, so callers can capture it
+# with $(...); every progress message goes to stderr instead.
+# Returns 0 on success, 1 if the process was gone after 3 seconds on all 3 attempts.
+start_process_with_retry() {
+    local name=$1
+    local command=$2
+    local logfile=$3
+    local max_retries=3
+    local retry=0
+    local pid
+
+    while [ "$retry" -lt "$max_retries" ]; do
+        echo "🔧 Starting $name (attempt $((retry + 1))/$max_retries)..." >&2
+        # Intentionally unquoted: $command must word-split into program + arguments
+        nohup $command > "$logfile" 2>&1 &
+        pid=$!
+        echo "$name PID: $pid" >&2
+
+        # Give it a moment to crash if it's going to
+        sleep 3
+
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "✅ $name started successfully" >&2
+            # The only line written to stdout: the value the caller captures
+            echo "$pid"
+            return 0
+        fi
+
+        echo "⚠️ $name failed to start, retrying..." >&2
+        retry=$((retry + 1))
+    done
+
+    echo "❌ Failed to start $name after $max_retries attempts" >&2
+    return 1
+}
+
 echo "🔧 Starting code server..."
-nohup dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py > /tmp/code_server.log 2>&1 &
-CODE_SERVER_PID=$!
+# Each start below tests the assignment directly: under `set -e` a failing
+# command substitution would abort the script before a separate `$?` check ran.
+# Afterwards only the last output line is kept, since that line is the PID.
+if ! CODE_SERVER_PID=$(start_process_with_retry "Code Server" "dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py" "/tmp/code_server.log"); then
+    echo "❌ Failed to start code server, exiting"
+    exit 1
+fi
+CODE_SERVER_PID=${CODE_SERVER_PID##*$'\n'}
 
-echo "⏳ Waiting for code server to start..."
-sleep 10
+echo "⏳ Waiting for code server to be ready..."
+if ! check_code_server_health; then
+    echo "❌ Code server health check failed, exiting"
+    exit 1
+fi
 
 echo "🌐 Starting webserver..."
-nohup dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml > /tmp/webserver.log 2>&1 &
-WEBSERVER_PID=$!
+if ! WEBSERVER_PID=$(start_process_with_retry "Webserver" "dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/webserver.log"); then
+    echo "❌ Failed to start webserver, exiting"
+    exit 1
+fi
+WEBSERVER_PID=${WEBSERVER_PID##*$'\n'}
 
 echo "⚙️ Starting daemon..."
-nohup dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml > /tmp/daemon.log 2>&1 &
-DAEMON_PID=$!
+if ! DAEMON_PID=$(start_process_with_retry "Daemon" "dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/daemon.log"); then
+    echo "❌ Failed to start daemon, exiting"
+    exit 1
+fi
+DAEMON_PID=${DAEMON_PID##*$'\n'}
 
 echo "📊 Starting dashboard..."
-nohup uvicorn dashboard.app:app --host 0.0.0.0 --port 8080 > /tmp/dashboard.log 2>&1 &
-DASHBOARD_PID=$!
+if ! DASHBOARD_PID=$(start_process_with_retry "Dashboard" "uvicorn dashboard.app:app --host 0.0.0.0 --port 8080" "/tmp/dashboard.log"); then
+    echo "❌ Failed to start dashboard, exiting"
+    exit 1
+fi
+DASHBOARD_PID=${DASHBOARD_PID##*$'\n'}
 
 echo "🌐 Starting nginx reverse proxy..."
-nginx -t && nginx -g "daemon off;" > /tmp/nginx.log 2>&1 &
+# Validate the configuration in the foreground first; with `set -e` a bad
+# config stops startup here instead of failing unnoticed in a background job.
+nginx -t
+# Launched directly rather than through start_process_with_retry: that helper
+# expands its command string unquoted, so `&&` and the quoted "daemon off;"
+# argument would not be interpreted. This also makes $! the PID of nginx itself.
+nginx -g "daemon off;" &
 NGINX_PID=$!
 
-echo "✅ All services started!"
+echo "✅ All services started successfully!"
 echo "Code Server PID: $CODE_SERVER_PID"
-echo "Webserver PID: $WEBSERVER_PID"
+echo "Webserver PID: $WEBSERVER_PID" 
 echo "Daemon PID: $DAEMON_PID"
 echo "Dashboard PID: $DASHBOARD_PID"
 echo "Nginx PID: $NGINX_PID"
 
-# Keep the script running
-wait
+# Function to handle shutdown gracefully
+# Signal handler: stops every service whose PID was recorded, waits for the
+# background children to exit, then ends the script with status 0.
+cleanup() {
+    echo "🛑 Shutting down services..."
+    # kill sends its default signal (SIGTERM). PIDs are left unquoted so an empty
+    # variable contributes no argument; `|| true` keeps `set -e` from aborting the
+    # handler when a process is already gone.
+    kill $NGINX_PID $DASHBOARD_PID $DAEMON_PID $WEBSERVER_PID $CODE_SERVER_PID 2>/dev/null || true
+    # Block until all background children of this shell have exited
+    wait
+    exit 0
+}
+
+# Trap signals for graceful shutdown
+trap cleanup SIGTERM SIGINT
+
+# Monitor processes and restart if they crash
+
+# restart_service <name> <command> <logfile>
+# Wraps start_process_with_retry for the monitor loop. Prints the new PID on
+# stdout, or nothing (plus an error on stderr) when no valid PID came back.
+# Always returns 0 so a failed restart does not trip `set -e` and take the whole
+# script down; the caller is left with an empty PID and retries on its next pass.
+restart_service() {
+    local name=$1
+    local new_pid
+    new_pid=$(start_process_with_retry "$name" "$2" "$3") || true
+    # Only the last output line can be the PID
+    new_pid=${new_pid##*$'\n'}
+    if [[ $new_pid =~ ^[0-9]+$ ]]; then
+        echo "$new_pid"
+    else
+        echo "❌ Failed to restart $name: Invalid PID returned" >&2
+    fi
+}
+
+# Every service is checked on every pass and the 30-second pause always runs, so a
+# service that cannot be restarted is retried at that interval, not in a tight loop.
+# PIDs are quoted: an empty PID makes `kill -0` fail, which triggers another attempt.
+while true; do
+    if ! kill -0 "$CODE_SERVER_PID" 2>/dev/null; then
+        echo "❌ Code server crashed, restarting..."
+        CODE_SERVER_PID=$(restart_service "Code Server" "dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py" "/tmp/code_server.log")
+    fi
+
+    if ! kill -0 "$WEBSERVER_PID" 2>/dev/null; then
+        echo "❌ Webserver crashed, restarting..."
+        WEBSERVER_PID=$(restart_service "Webserver" "dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/webserver.log")
+    fi
+
+    if ! kill -0 "$DAEMON_PID" 2>/dev/null; then
+        echo "❌ Daemon crashed, restarting..."
+        DAEMON_PID=$(restart_service "Daemon" "dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/daemon.log")
+    fi
+
+    if ! kill -0 "$DASHBOARD_PID" 2>/dev/null; then
+        echo "❌ Dashboard crashed, restarting..."
+        DASHBOARD_PID=$(restart_service "Dashboard" "uvicorn dashboard.app:app --host 0.0.0.0 --port 8080" "/tmp/dashboard.log")
+    fi
+
+    sleep 30
+done