Skip to content

Commit b9109ac

Browse files
committed
feat: add gRPC debugging tools for Fly deployment
- Added comprehensive gRPC connectivity debugger script
- Enhanced Dockerfile with network debugging tools (net-tools, netcat)
- Added process status, port status, and log analysis
- Includes direct connection testing and system resource monitoring
- Provides actionable troubleshooting guidance

Facilitates faster diagnosis of deployment issues
1 parent 328a015 commit b9109ac

File tree

2 files changed

+121
-1
lines changed

2 files changed

+121
-1
lines changed

docker/Dockerfile.fly

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
88
cmake \
99
nginx \
1010
apache2-utils \
11+
net-tools \
12+
netcat-openbsd \
1113
&& apt-get clean && rm -rf /var/lib/apt/lists/*
1214

1315
WORKDIR /opt/dagster/app
@@ -26,13 +28,15 @@ COPY metrics ./metrics
2628
COPY dagster_fly.yaml ./dagster_home/dagster.yaml
2729
COPY dagster_home/workspace.yaml ./dagster_home/workspace.yaml
2830
COPY scripts/deployment/start.sh /opt/dagster/start.sh
31+
COPY scripts/deployment/debug_grpc.sh /opt/dagster/debug_grpc.sh
2932

3033
# Verify files were copied and create any missing directories
3134
RUN ls -la ./dagster_home/ && \
3235
mkdir -p /opt/dagster/dagster_home && \
3336
cp -r ./dagster_home/* /opt/dagster/dagster_home/ && \
3437
ls -la /opt/dagster/dagster_home/ && \
35-
chmod +x /opt/dagster/start.sh
38+
chmod +x /opt/dagster/start.sh && \
39+
chmod +x /opt/dagster/debug_grpc.sh
3640

3741
# Create necessary directories for SQLite storage and artifacts
3842
RUN mkdir -p /data/models /data/dagster_storage /data/artifacts /tmp/dagster/compute_logs

scripts/deployment/debug_grpc.sh

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
#!/bin/bash
#
# Debug script for Dagster gRPC connectivity issues in Fly.io deployment.
# This opening section prints the report banner and the environment summary.

# Report banner.
printf '%s\n' \
  "🔍 Dagster gRPC Connectivity Debugger" \
  "======================================="

# Environment overview: each variable falls back to the text "not set" when
# it is unset or empty.
printf '%s\n' \
  "📋 Environment Information:" \
  "DAGSTER_HOME: ${DAGSTER_HOME:-not set}" \
  "DAGSTER_CODE_SERVER_HOST: ${DAGSTER_CODE_SERVER_HOST:-not set}" \
  "PYTHONPATH: ${PYTHONPATH:-not set}" \
  ""

# Check if processes are running

#######################################
# Report whether a process matching a command-line pattern is running.
# pgrep is invoked exactly once, so the running/not-running verdict and the
# PID list come from the same snapshot. Multiple PIDs are joined onto a
# single line. A pgrep failure other than "no match" (for example pgrep not
# being installed) is reported as "could not check" rather than "not running".
# Arguments: $1 - human-readable process name
#            $2 - pattern matched against the full command line (pgrep -f)
# Outputs:   one status line on stdout
#######################################
report_process() {
  local name=$1 pattern=$2
  local pids status=0

  pids=$(pgrep -f "$pattern") || status=$?
  case "$status" in
    0)
      # pgrep prints one PID per line; fold them into "a, b, c".
      pids=${pids//$'\n'/, }
      echo " ✅ ${name} process is running (PID: ${pids})"
      ;;
    1)
      # Exit status 1 is pgrep's "no processes matched".
      echo " ❌ ${name} process is not running"
      ;;
    *)
      echo " ⚠️ Could not check ${name} process (pgrep exit status ${status})"
      ;;
  esac
}

echo "🔄 Process Status:"
echo "Code server (port 4000):"
report_process "Code server" "dagster code-server"

echo "Webserver (port 3000):"
report_process "Webserver" "dagster-webserver"

echo "Daemon:"
report_process "Daemon" "dagster-daemon"
echo ""

# Check ports

#######################################
# Report whether anything is listening on a TCP/UDP port, per netstat -tuln.
# netstat is run once and its matching lines are reused for the detail
# output. The match is anchored on the whitespace that follows the port so
# that ":4000" does not also match ":40000".
# Arguments: $1 - port number
# Outputs:   status line on stdout, followed by the matching netstat lines
#######################################
report_port() {
  local port=$1
  local listeners

  # netstat errors (including netstat being absent) are hidden; the port is
  # then reported as not listening.
  listeners=$(netstat -tuln 2>/dev/null | grep -E ":${port}[[:space:]]")
  if [[ -n "$listeners" ]]; then
    echo " ✅ Port ${port} is listening"
    printf '%s\n' "$listeners"
  else
    echo " ❌ Port ${port} is not listening"
  fi
}

echo "🌐 Port Status:"
echo "Port 4000 (code server):"
report_port 4000

echo "Port 3000 (webserver):"
report_port 3000
echo ""

# Test gRPC health check

#######################################
# Run the Dagster gRPC health check once and report the result.
# The command's combined stdout/stderr is captured on the single run, so a
# failure can show its error text without invoking the check a second time.
# Arguments: $1 - gRPC port to check
# Outputs:   pass/fail line on stdout; on failure, the first 5 lines of the
#            command's output
#######################################
check_grpc_health() {
  local port=$1
  local output

  if output=$(dagster api grpc-health-check -p "$port" 2>&1); then
    echo " ✅ gRPC health check passed"
  else
    echo " ❌ gRPC health check failed"
    echo " Detailed error:"
    printf '%s\n' "$output" | head -n 5
  fi
}

echo "💓 gRPC Health Check:"
check_grpc_health 4000
echo ""

# Test workspace loading

#######################################
# Report whether workspace.yaml exists under the given Dagster home and
# preview its first 15 lines.
# An empty home directory is reported explicitly; otherwise the path would
# collapse to "/workspace.yaml" and the result would be misleading.
# Arguments: $1 - Dagster home directory (may be empty)
# Outputs:   status lines on stdout, plus an indented content preview
#######################################
report_workspace() {
  local home_dir=${1-}
  local workspace_file

  if [[ -z "$home_dir" ]]; then
    echo " ❌ DAGSTER_HOME is not set; cannot locate workspace.yaml"
    return 0
  fi

  workspace_file="${home_dir}/workspace.yaml"
  if [[ -f "$workspace_file" ]]; then
    echo " ✅ Workspace file exists: ${workspace_file}"
    echo " Content preview:"
    head -n 15 -- "$workspace_file" | sed 's/^/ /'
  else
    echo " ❌ Workspace file not found: ${workspace_file}"
  fi
}

echo "📚 Workspace Configuration:"
report_workspace "${DAGSTER_HOME:-}"
echo ""

# Check log files for errors

#######################################
# Print the last 10 lines of a log file, indented by one space.
# Does nothing when the path is not a regular file. The tail | sed pipeline
# runs in a subshell with pipefail enabled so that a tail failure actually
# triggers the "Could not read log file" fallback; without pipefail only
# sed's exit status would be tested and the fallback could never fire.
# Arguments: $1 - path to the log file
# Outputs:   header line, indented log lines (or the fallback), blank line
#######################################
show_log_tail() {
  local logfile=$1

  [[ -f "$logfile" ]] || return 0

  echo " 📄 $logfile (last 10 lines):"
  (
    set -o pipefail
    tail -n 10 -- "$logfile" 2>/dev/null | sed 's/^/ /'
  ) || echo " Could not read log file"
  echo ""
}

echo "📄 Recent Log Entries:"
for logfile in /tmp/code_server.log /tmp/webserver.log /tmp/daemon.log; do
  show_log_tail "$logfile"
done

# Test direct connection

#######################################
# Probe a TCP port with nc in zero-I/O mode, bounded by a 5 second timeout.
# Arguments: $1 - host, $2 - port
# Returns:   0 when the connection succeeds, non-zero otherwise
#######################################
probe_with_nc() {
  timeout 5 nc -z "$1" "$2" 2>/dev/null
}

#######################################
# Probe a TCP port with telnet, bounded by a 5 second timeout.
# telnet's own exit status is not used: a connected session that is later
# ended by timeout exits non-zero even though the connection worked. Success
# is detected from the "Connected to" banner instead, and stdin is
# /dev/null so telnet does not wait on the terminal.
# NOTE(review): relies on the telnet client printing "Connected to ..." on
# stdout — confirm for the client installed in the image.
# Arguments: $1 - host, $2 - port
# Returns:   0 when the banner is seen, non-zero otherwise
#######################################
probe_with_telnet() {
  timeout 5 telnet "$1" "$2" </dev/null 2>/dev/null | grep -q 'Connected to'
}

#######################################
# Test a direct TCP connection and print the outcome.
# Prefers nc and falls back to telnet; prints a warning when neither tool
# is available.
# Arguments: $1 - host, $2 - port
# Outputs:   progress and result lines on stdout
#######################################
test_direct_connection() {
  local host=$1 port=$2
  local probe

  if command -v nc >/dev/null 2>&1; then
    probe=probe_with_nc
  elif command -v telnet >/dev/null 2>&1; then
    probe=probe_with_telnet
  else
    echo " ⚠️ No telnet or nc available for connection testing"
    return 0
  fi

  echo " Testing ${host}:${port}..."
  if "$probe" "$host" "$port"; then
    echo " ✅ Connection successful"
  else
    echo " ❌ Connection failed"
  fi
}

echo "🔌 Direct Connection Test:"
test_direct_connection localhost 4000
echo ""

# System resources

#######################################
# Run a command and print its stdout indented by one space, or print a
# fallback line when the command fails or produces no output.
# The command's exit status is checked directly; testing the status of a
# "cmd | sed" pipeline would only reflect sed and the fallback would never
# be reached.
# Arguments: $1 - fallback line to print on failure
#            $2.. - command and its arguments
# Outputs:   indented command output, or the fallback line, on stdout
#######################################
indent_output_or() {
  local fallback=$1
  shift
  local output

  if output=$("$@" 2>/dev/null) && [[ -n "$output" ]]; then
    printf '%s\n' "$output" | sed 's/^/ /'
  else
    echo "$fallback"
  fi
}

echo "💾 System Resources:"
echo " Memory usage:"
# Only the header and the first data row of free's output are shown.
indent_output_or " Memory info not available" free -h | head -n 2
echo " Disk usage for /data:"
indent_output_or " Disk info not available" df -h /data
echo ""

# Closing banner followed by a short list of common remediation steps.
printf '%s\n' \
  "🏁 Debug complete!" \
  "" \
  "💡 Common fixes:" \
  " 1. Restart the deployment: fly deploy" \
  " 2. Check resource limits in fly.toml" \
  " 3. Review logs: fly logs" \
  " 4. Scale up if memory/CPU constrained: fly scale memory 8192"

0 commit comments

Comments
 (0)