#!/bin/bash
#
# Debug script for Dagster gRPC connectivity issues in Fly.io deployment.
#
# Prints a read-only diagnostic report to stdout: environment variables,
# process status, listening ports, gRPC health check, workspace file preview,
# recent log lines, a TCP connection test and system resources.
#
# Environment read: DAGSTER_HOME, DAGSTER_CODE_SERVER_HOST, PYTHONPATH.
#
# `set -e` is deliberately NOT used: every check is best-effort and the report
# must keep going after an individual probe fails.
set -u

readonly CODE_SERVER_PORT=4000
readonly WEBSERVER_PORT=3000

# Prefix every line read from stdin with one space.
indent() {
  sed 's/^/ /'
}

# Keep only the lines of a socket table (stdin) whose address ends in :PORT.
# The trailing whitespace/end-of-line anchor stops 4000 from matching 40001.
# Arguments: $1 - port number
# Returns:   0 if at least one line matched, 1 otherwise (grep semantics)
filter_port() {
  grep -E ":${1}([[:space:]]|\$)"
}

# Print the listening TCP/UDP socket table with whichever tool is installed.
# Returns: 127 if neither netstat nor ss exists, else the tool's exit status.
listener_table() {
  if command -v netstat > /dev/null 2>&1; then
    netstat -tuln 2> /dev/null
  elif command -v ss > /dev/null 2>&1; then
    ss -tuln 2> /dev/null
  else
    return 127
  fi
}

# Report whether a process whose full command line matches a pattern exists.
# Arguments: $1 - label used in the message, $2 - pgrep -f pattern
report_process() {
  local label=$1 pattern=$2
  local pids
  if pids=$(pgrep -f "$pattern"); then
    # pgrep prints one PID per line; join them so the message stays one line.
    echo " ✅ $label process is running (PID: ${pids//$'\n'/ } )"
  else
    echo " ❌ $label process is not running"
  fi
}

# Report whether something is listening on a port, and show the matching rows.
# Arguments: $1 - port number
report_port() {
  local port=$1
  local table matches rc=0
  table=$(listener_table) || rc=$?
  if ((rc == 127)); then
    echo " ⚠️ Neither netstat nor ss is available; cannot check port $port"
    return 0
  fi
  if matches=$(printf '%s\n' "$table" | filter_port "$port"); then
    echo " ✅ Port $port is listening"
    printf '%s\n' "$matches"
  else
    echo " ❌ Port $port is not listening"
  fi
}

# Run the Dagster gRPC health check once and, on failure, show the first
# lines of the output captured from that same run.
report_grpc_health() {
  local output
  if output=$(dagster api grpc-health-check -p "$CODE_SERVER_PORT" 2>&1); then
    echo " ✅ gRPC health check passed"
  else
    echo " ❌ gRPC health check failed"
    echo " Detailed error:"
    printf '%s\n' "$output" | head -n 5
  fi
}

# Show whether $DAGSTER_HOME/workspace.yaml exists and preview its first lines.
report_workspace() {
  if [[ -z "${DAGSTER_HOME:-}" ]]; then
    # Without this guard the check would silently probe /workspace.yaml.
    echo " ❌ DAGSTER_HOME is not set; cannot locate workspace.yaml"
    return 0
  fi
  local workspace_file="${DAGSTER_HOME}/workspace.yaml"
  if [[ -f "$workspace_file" ]]; then
    echo " ✅ Workspace file exists: $workspace_file"
    echo " Content preview:"
    head -n 15 -- "$workspace_file" | indent
  else
    echo " ❌ Workspace file not found: $workspace_file"
  fi
}

# Print the last 10 lines of each known log file that exists.
report_logs() {
  local logfile recent
  for logfile in /tmp/code_server.log /tmp/webserver.log /tmp/daemon.log; do
    [[ -f "$logfile" ]] || continue
    echo " 📄 $logfile (last 10 lines):"
    # Capture first: the status of `tail | sed` would be sed's, which would
    # hide a read failure and make the fallback message unreachable.
    if recent=$(tail -n 10 -- "$logfile" 2> /dev/null); then
      if [[ -n "$recent" ]]; then
        printf '%s\n' "$recent" | indent
      fi
    else
      echo " Could not read log file"
    fi
    echo " "
  done
}

# Try to open a TCP connection, giving up after 5 seconds.
# telnet is intentionally not used: after connecting it stays open until
# `timeout` kills it (status 124) or exits non-zero on EOF, so a successful
# connection would be reported as a failure.
# Arguments: $1 - host, $2 - port
# Returns:   0 if the connection could be opened, non-zero otherwise
can_connect() {
  local host=$1 port=$2
  if command -v nc > /dev/null 2>&1; then
    timeout 5 nc -z "$host" "$port" 2> /dev/null
  else
    # bash's built-in /dev/tcp redirection, run in a child so timeout applies.
    timeout 5 bash -c 'exec 3<>"/dev/tcp/$1/$2"' _ "$host" "$port" 2> /dev/null
  fi
}

# Print memory usage and disk usage for /data, with a fallback message when
# the underlying tool is missing or fails.
report_resources() {
  local mem disk
  echo " Memory usage:"
  if mem=$(free -h 2> /dev/null); then
    printf '%s\n' "$mem" | head -n 2 | indent
  else
    echo " Memory info not available"
  fi
  echo " Disk usage for /data:"
  if disk=$(df -h /data 2> /dev/null); then
    printf '%s\n' "$disk" | indent
  else
    echo " Disk info not available"
  fi
}

# Produce the full report, section by section.
main() {
  echo " 🔍 Dagster gRPC Connectivity Debugger"
  echo " ======================================="

  # Check environment
  echo " 📋 Environment Information:"
  echo " DAGSTER_HOME: ${DAGSTER_HOME:- not set} "
  echo " DAGSTER_CODE_SERVER_HOST: ${DAGSTER_CODE_SERVER_HOST:- not set} "
  echo " PYTHONPATH: ${PYTHONPATH:- not set} "
  echo " "

  # Check if processes are running
  echo " 🔄 Process Status:"
  echo " Code server (port ${CODE_SERVER_PORT}):"
  report_process "Code server" "dagster code-server"
  echo " Webserver (port ${WEBSERVER_PORT}):"
  report_process "Webserver" "dagster-webserver"
  echo " Daemon:"
  report_process "Daemon" "dagster-daemon"
  echo " "

  # Check ports
  echo " 🌐 Port Status:"
  echo " Port ${CODE_SERVER_PORT} (code server):"
  report_port "$CODE_SERVER_PORT"
  echo " Port ${WEBSERVER_PORT} (webserver):"
  report_port "$WEBSERVER_PORT"
  echo " "

  # Test gRPC health check
  echo " 💓 gRPC Health Check:"
  report_grpc_health
  echo " "

  # Test workspace loading
  echo " 📚 Workspace Configuration:"
  report_workspace
  echo " "

  # Check log files for errors
  echo " 📄 Recent Log Entries:"
  report_logs

  # Test direct connection
  echo " 🔌 Direct Connection Test:"
  echo " Testing localhost:${CODE_SERVER_PORT}..."
  if can_connect localhost "$CODE_SERVER_PORT"; then
    echo " ✅ Connection successful"
  else
    echo " ❌ Connection failed"
  fi
  echo " "

  # System resources
  echo " 💾 System Resources:"
  report_resources
  echo " "

  echo " 🏁 Debug complete!"
  echo " "
  echo " 💡 Common fixes:"
  echo " 1. Restart the deployment: fly deploy"
  echo " 2. Check resource limits in fly.toml"
  echo " 3. Review logs: fly logs"
  echo " 4. Scale up if memory/CPU constrained: fly scale memory 8192"
}

main "$@"