Skip to content

Commit e63d9de

Browse files
authored
Merge pull request #188 from andrewm4894/improve-grpc-reliability
Improve grpc reliability
2 parents bbd74b2 + b9109ac commit e63d9de

File tree

5 files changed

+261
-27
lines changed

5 files changed

+261
-27
lines changed

dagster_fly.yaml

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ run_coordinator:
66
module: dagster.core.run_coordinator
77
class: QueuedRunCoordinator
88
config:
9-
max_concurrent_runs: 12 # Increased for 8GB RAM + 4 performance CPUs (prevents queue backup)
9+
max_concurrent_runs: 8 # Reduced for better stability in single container
1010
tag_concurrency_limits:
1111
- key: "dagster/concurrency_key"
1212
value: "database"
13-
limit: 2
13+
limit: 1 # Reduced to prevent resource contention
1414
- key: "dagster/concurrency_key"
1515
value: "ml_training"
1616
limit: 1
@@ -27,7 +27,7 @@ storage:
2727

2828
run_retries:
2929
enabled: true
30-
max_retries: 1
30+
max_retries: 2 # Increased for better reliability
3131

3232
# Aggressive retention policies optimized for Fly.io disk usage
3333
retention:
@@ -39,25 +39,26 @@ retention:
3939
failure: 2
4040
success: 1
4141

42-
# Run monitoring for Fly.io environment
42+
# Enhanced run monitoring for Fly.io environment
4343
run_monitoring:
4444
enabled: true
45-
start_timeout_seconds: 180 # 3 minutes to start
46-
cancel_timeout_seconds: 120 # 2 minutes to cancel
47-
max_runtime_seconds: 2700 # 45 minutes max runtime per run (increased with better resources)
48-
poll_interval_seconds: 60 # Check every minute
45+
start_timeout_seconds: 300 # 5 minutes to start (increased for cold starts)
46+
cancel_timeout_seconds: 180 # 3 minutes to cancel (increased)
47+
max_runtime_seconds: 3600 # 1 hour max runtime per run
48+
poll_interval_seconds: 30 # Check every 30 seconds (more frequent)
4949

5050
# Disable telemetry
5151
telemetry:
5252
enabled: false
5353

54+
# Optimized for single container environment
5455
schedules:
5556
use_threads: true
56-
num_workers: 4 # Conservative for Fly.io
57+
num_workers: 2 # Reduced for single container
5758

5859
sensors:
5960
use_threads: true
60-
num_workers: 2 # Conservative for Fly.io
61+
num_workers: 1 # Reduced for single container
6162

6263
compute_logs:
6364
module: dagster.core.storage.local_compute_log_manager
@@ -71,3 +72,8 @@ local_artifact_storage:
7172
class: LocalArtifactStorage
7273
config:
7374
base_dir: "/data/artifacts"
75+
76+
# Code server timeouts (more lenient, for reliability)
77+
code_servers:
78+
reload_timeout: 60 # Give code servers more time to reload
79+
heartbeat_timeout: 60 # Longer heartbeat timeout for reliability

dagster_home/workspace.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,14 @@ load_from:
77
env: DAGSTER_CODE_SERVER_HOST
88
port: 4000
99
location_name: "anomstack_code"
10+
# Enhanced connection settings for single container deployment
11+
heartbeat_timeout: 60 # Allow 60 seconds for heartbeat
12+
startup_timeout: 180 # Allow 3 minutes for startup
13+
max_send_message_length: 52428800 # 50MB max message size
14+
max_receive_message_length: 52428800 # 50MB max message size
15+
# gRPC keepalive settings for better connection stability
16+
grpc_keepalive_time_ms: 30000 # Send keepalive every 30 seconds
17+
grpc_keepalive_timeout_ms: 10000 # Wait 10 seconds for keepalive response
18+
grpc_keepalive_permit_without_calls: true # Allow keepalive without active calls
19+
grpc_http2_max_pings_without_data: 0 # No limit on pings without data
20+
grpc_http2_min_ping_interval_without_data_ms: 300000 # 5 minutes between pings

docker/Dockerfile.fly

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
88
cmake \
99
nginx \
1010
apache2-utils \
11+
net-tools \
12+
netcat-openbsd \
1113
&& apt-get clean && rm -rf /var/lib/apt/lists/*
1214

1315
WORKDIR /opt/dagster/app
@@ -26,13 +28,15 @@ COPY metrics ./metrics
2628
COPY dagster_fly.yaml ./dagster_home/dagster.yaml
2729
COPY dagster_home/workspace.yaml ./dagster_home/workspace.yaml
2830
COPY scripts/deployment/start.sh /opt/dagster/start.sh
31+
COPY scripts/deployment/debug_grpc.sh /opt/dagster/debug_grpc.sh
2932

3033
# Verify files were copied and create any missing directories
3134
RUN ls -la ./dagster_home/ && \
3235
mkdir -p /opt/dagster/dagster_home && \
3336
cp -r ./dagster_home/* /opt/dagster/dagster_home/ && \
3437
ls -la /opt/dagster/dagster_home/ && \
35-
chmod +x /opt/dagster/start.sh
38+
chmod +x /opt/dagster/start.sh && \
39+
chmod +x /opt/dagster/debug_grpc.sh
3640

3741
# Create necessary directories for SQLite storage and artifacts
3842
RUN mkdir -p /data/models /data/dagster_storage /data/artifacts /tmp/dagster/compute_logs

scripts/deployment/debug_grpc.sh

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/bin/bash
2+
3+
# Debug script for Dagster gRPC connectivity issues in Fly.io deployment
4+
echo "🔍 Dagster gRPC Connectivity Debugger"
5+
echo "======================================="
6+
7+
# Check environment
8+
echo "📋 Environment Information:"
9+
echo "DAGSTER_HOME: ${DAGSTER_HOME:-not set}"
10+
echo "DAGSTER_CODE_SERVER_HOST: ${DAGSTER_CODE_SERVER_HOST:-not set}"
11+
echo "PYTHONPATH: ${PYTHONPATH:-not set}"
12+
echo ""
13+
14+
# Check if processes are running
15+
echo "🔄 Process Status:"
16+
echo "Code server (port 4000):"
17+
if pgrep -f "dagster code-server" > /dev/null; then
18+
echo " ✅ Code server process is running (PID: $(pgrep -f 'dagster code-server'))"
19+
else
20+
echo " ❌ Code server process is not running"
21+
fi
22+
23+
echo "Webserver (port 3000):"
24+
if pgrep -f "dagster-webserver" > /dev/null; then
25+
echo " ✅ Webserver process is running (PID: $(pgrep -f 'dagster-webserver'))"
26+
else
27+
echo " ❌ Webserver process is not running"
28+
fi
29+
30+
echo "Daemon:"
31+
if pgrep -f "dagster-daemon" > /dev/null; then
32+
echo " ✅ Daemon process is running (PID: $(pgrep -f 'dagster-daemon'))"
33+
else
34+
echo " ❌ Daemon process is not running"
35+
fi
36+
echo ""
37+
38+
# Check ports
39+
echo "🌐 Port Status:"
40+
echo "Port 4000 (code server):"
41+
if netstat -tuln 2>/dev/null | grep ":4000" > /dev/null; then
42+
echo " ✅ Port 4000 is listening"
43+
netstat -tuln | grep ":4000"
44+
else
45+
echo " ❌ Port 4000 is not listening"
46+
fi
47+
48+
echo "Port 3000 (webserver):"
49+
if netstat -tuln 2>/dev/null | grep ":3000" > /dev/null; then
50+
echo " ✅ Port 3000 is listening"
51+
netstat -tuln | grep ":3000"
52+
else
53+
echo " ❌ Port 3000 is not listening"
54+
fi
55+
echo ""
56+
57+
# Test gRPC health check
58+
echo "💓 gRPC Health Check:"
59+
if dagster api grpc-health-check -p 4000 2>/dev/null; then
60+
echo " ✅ gRPC health check passed"
61+
else
62+
echo " ❌ gRPC health check failed"
63+
echo " Detailed error:"
64+
dagster api grpc-health-check -p 4000 2>&1 | head -5
65+
fi
66+
echo ""
67+
68+
# Test workspace loading
69+
echo "📚 Workspace Configuration:"
70+
if [ -f "$DAGSTER_HOME/workspace.yaml" ]; then
71+
echo " ✅ Workspace file exists: $DAGSTER_HOME/workspace.yaml"
72+
echo " Content preview:"
73+
head -15 "$DAGSTER_HOME/workspace.yaml" | sed 's/^/ /'
74+
else
75+
echo " ❌ Workspace file not found: $DAGSTER_HOME/workspace.yaml"
76+
fi
77+
echo ""
78+
79+
# Check log files for errors
80+
echo "📄 Recent Log Entries:"
81+
for logfile in /tmp/code_server.log /tmp/webserver.log /tmp/daemon.log; do
82+
if [ -f "$logfile" ]; then
83+
echo " 📄 $logfile (last 10 lines):"
84+
tail -10 "$logfile" 2>/dev/null | sed 's/^/ /' || echo " Could not read log file"
85+
echo ""
86+
fi
87+
done
88+
89+
# Test direct connection
90+
echo "🔌 Direct Connection Test:"
91+
if command -v telnet >/dev/null 2>&1; then
92+
echo " Testing localhost:4000..."
93+
timeout 5 telnet localhost 4000 2>/dev/null && echo " ✅ Connection successful" || echo " ❌ Connection failed"
94+
elif command -v nc >/dev/null 2>&1; then
95+
echo " Testing localhost:4000..."
96+
timeout 5 nc -z localhost 4000 2>/dev/null && echo " ✅ Connection successful" || echo " ❌ Connection failed"
97+
else
98+
echo " ⚠️ No telnet or nc available for connection testing"
99+
fi
100+
echo ""
101+
102+
# System resources
103+
echo "💾 System Resources:"
104+
echo " Memory usage:"
105+
free -h 2>/dev/null | head -2 | sed 's/^/ /' || echo " Memory info not available"
106+
echo " Disk usage for /data:"
107+
df -h /data 2>/dev/null | sed 's/^/ /' || echo " Disk info not available"
108+
echo ""
109+
110+
echo "🏁 Debug complete!"
111+
echo ""
112+
echo "💡 Common fixes:"
113+
echo " 1. Restart the deployment: fly deploy"
114+
echo " 2. Check resource limits in fly.toml"
115+
echo " 3. Review logs: fly logs"
116+
echo " 4. Scale up if memory/CPU constrained: fly scale memory 8192"

scripts/deployment/start.sh

Lines changed: 113 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22

3-
# Anomstack Startup Script for Fly.io
3+
# Anomstack Startup Script for Fly.io with improved gRPC connectivity
44
set -e
55

66
echo "🚀 Starting Anomstack services..."
@@ -24,35 +24,132 @@ ADMIN_PASSWORD="${ANOMSTACK_ADMIN_PASSWORD:-anomstack2024}"
2424
htpasswd -bc /etc/nginx/.htpasswd "$ADMIN_USERNAME" "$ADMIN_PASSWORD"
2525
echo "✅ Authentication configured for user: $ADMIN_USERNAME"
2626

27+
# Function to check if code server is healthy
28+
check_code_server_health() {
  # Probe the code server's gRPC health endpoint on port 4000 until it
  # answers, making at most 30 probes with a 2-second pause after each
  # failed one.
  #
  # Outputs: progress messages on stdout.
  # Returns: 0 as soon as a probe succeeds, 1 when every probe has failed.
  local max_retries=30
  local attempt

  for ((attempt = 1; attempt <= max_retries; attempt++)); do
    if dagster api grpc-health-check -p 4000 >/dev/null 2>&1; then
      echo "✅ Code server is healthy"
      return 0
    fi
    echo "⏳ Waiting for code server to be ready... (attempt $attempt/$max_retries)"
    sleep 2
  done

  echo "❌ Code server failed to start after $max_retries attempts"
  return 1
}
43+
44+
# Function to start process with retry logic
45+
start_process_with_retry() {
  # Launch a long-running service in the background, retrying when it dies
  # right after launch.
  #
  # Arguments:
  #   $1 - human-readable service name used in log messages
  #   $2 - command line as a single string; it is split on whitespace only
  #        (no quoting or glob characters are interpreted)
  #   $3 - log file that receives the service's stdout and stderr
  #        (truncated on every attempt)
  # Outputs:
  #   stdout - ONLY the PID of the running service, so that callers can do
  #            pid=$(start_process_with_retry ...) and get a usable value
  #   stderr - every progress / diagnostic message
  # Returns:
  #   0 once a launched process is still alive after the 3-second settle
  #   delay, 1 after 3 failed attempts.
  local name=$1
  local command=$2
  local logfile=$3
  local max_retries=3
  local retry=0
  local pid
  local -a argv

  # Split the command string explicitly instead of relying on an unquoted
  # expansion, which would also perform pathname expansion.
  read -r -a argv <<< "$command"

  while [ "$retry" -lt "$max_retries" ]; do
    echo "🔧 Starting $name (attempt $((retry + 1))/$max_retries)..." >&2
    nohup "${argv[@]}" > "$logfile" 2>&1 &
    pid=$!
    echo "$name PID: $pid" >&2

    # Give it a moment to crash if it's going to
    sleep 3

    if kill -0 "$pid" 2>/dev/null; then
      echo "$name started successfully" >&2
      # The PID is the function's only stdout output; callers capture it.
      echo "$pid"
      return 0
    fi

    echo "⚠️ $name failed to start, retrying..." >&2
    retry=$((retry + 1))
  done

  echo "❌ Failed to start $name after $max_retries attempts" >&2
  return 1
}
74+
2775
echo "🔧 Starting code server..."
28-
nohup dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py > /tmp/code_server.log 2>&1 &
29-
CODE_SERVER_PID=$!
76+
CODE_SERVER_PID=$(start_process_with_retry "Code Server" "dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py" "/tmp/code_server.log")
77+
if [ $? -ne 0 ]; then
78+
echo "❌ Failed to start code server, exiting"
79+
exit 1
80+
fi
3081

31-
echo "⏳ Waiting for code server to start..."
32-
sleep 10
82+
echo "⏳ Waiting for code server to be ready..."
83+
if ! check_code_server_health; then
84+
echo "❌ Code server health check failed, exiting"
85+
exit 1
86+
fi
3387

3488
echo "🌐 Starting webserver..."
35-
nohup dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml > /tmp/webserver.log 2>&1 &
36-
WEBSERVER_PID=$!
89+
WEBSERVER_PID=$(start_process_with_retry "Webserver" "dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/webserver.log")
90+
if [ $? -ne 0 ]; then
91+
echo "❌ Failed to start webserver, exiting"
92+
exit 1
93+
fi
3794

3895
echo "⚙️ Starting daemon..."
39-
nohup dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml > /tmp/daemon.log 2>&1 &
40-
DAEMON_PID=$!
96+
DAEMON_PID=$(start_process_with_retry "Daemon" "dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/daemon.log")
97+
if [ $? -ne 0 ]; then
98+
echo "❌ Failed to start daemon, exiting"
99+
exit 1
100+
fi
41101

42102
echo "📊 Starting dashboard..."
43-
nohup uvicorn dashboard.app:app --host 0.0.0.0 --port 8080 > /tmp/dashboard.log 2>&1 &
44-
DASHBOARD_PID=$!
103+
DASHBOARD_PID=$(start_process_with_retry "Dashboard" "uvicorn dashboard.app:app --host 0.0.0.0 --port 8080" "/tmp/dashboard.log")
104+
if [ $? -ne 0 ]; then
105+
echo "❌ Failed to start dashboard, exiting"
106+
exit 1
107+
fi
45108

46109
echo "🌐 Starting nginx reverse proxy..."
47-
nginx -t && nginx -g "daemon off;" > /tmp/nginx.log 2>&1 &
110+
nginx -t && nginx -g "daemon off;" &
48111
NGINX_PID=$!
49112

50-
echo "✅ All services started!"
113+
echo "✅ All services started successfully!"
51114
echo "Code Server PID: $CODE_SERVER_PID"
52-
echo "Webserver PID: $WEBSERVER_PID"
115+
echo "Webserver PID: $WEBSERVER_PID"
53116
echo "Daemon PID: $DAEMON_PID"
54117
echo "Dashboard PID: $DASHBOARD_PID"
55118
echo "Nginx PID: $NGINX_PID"
56119

57-
# Keep the script running
58-
wait
120+
# Function to handle shutdown gracefully
121+
cleanup() {
  # Shutdown handler: announce the shutdown, send the default kill signal
  # to each tracked service PID (failures are ignored), wait for this
  # shell's remaining child processes, then exit with status 0.
  echo "🛑 Shutting down services..."

  local svc_pid
  # Intentionally unquoted: unset/empty PID variables simply drop out.
  for svc_pid in $NGINX_PID $DASHBOARD_PID $DAEMON_PID $WEBSERVER_PID $CODE_SERVER_PID; do
    kill "$svc_pid" 2>/dev/null || true
  done

  wait
  exit 0
}
127+
128+
# Trap signals for graceful shutdown
129+
trap cleanup SIGTERM SIGINT
130+
131+
# Monitor processes and restart if they crash
132+
while true; do
133+
# Check if critical processes are still running
134+
if ! kill -0 $CODE_SERVER_PID 2>/dev/null; then
135+
echo "❌ Code server crashed, restarting..."
136+
CODE_SERVER_PID=$(start_process_with_retry "Code Server" "dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py" "/tmp/code_server.log")
137+
fi
138+
139+
if ! kill -0 $WEBSERVER_PID 2>/dev/null; then
140+
echo "❌ Webserver crashed, restarting..."
141+
WEBSERVER_PID=$(start_process_with_retry "Webserver" "dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/webserver.log")
142+
fi
143+
144+
if ! kill -0 $DAEMON_PID 2>/dev/null; then
145+
echo "❌ Daemon crashed, restarting..."
146+
DAEMON_PID=$(start_process_with_retry "Daemon" "dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml" "/tmp/daemon.log")
147+
fi
148+
149+
if ! kill -0 $DASHBOARD_PID 2>/dev/null; then
150+
echo "❌ Dashboard crashed, restarting..."
151+
DASHBOARD_PID=$(start_process_with_retry "Dashboard" "uvicorn dashboard.app:app --host 0.0.0.0 --port 8080" "/tmp/dashboard.log")
152+
fi
153+
154+
sleep 30
155+
done

0 commit comments

Comments
 (0)