11#! /bin/bash
22
3- # Anomstack Startup Script for Fly.io
3+ # Anomstack Startup Script for Fly.io with improved gRPC connectivity
44set -e
55
66echo " 🚀 Starting Anomstack services..."
@@ -24,35 +24,132 @@ ADMIN_PASSWORD="${ANOMSTACK_ADMIN_PASSWORD:-anomstack2024}"
2424htpasswd -bc /etc/nginx/.htpasswd " $ADMIN_USERNAME " " $ADMIN_PASSWORD "
2525echo " ✅ Authentication configured for user: $ADMIN_USERNAME "
2626
#######################################
# Poll the code server's gRPC health check on port 4000 until it passes.
# Outputs:
#   Progress lines on stdout.
# Returns:
#   0 as soon as one probe succeeds, 1 after 30 failed probes
#   (a 2 second pause follows every failed probe).
#######################################
check_code_server_health() {
  local limit=30
  local attempt

  for (( attempt = 1; attempt <= limit; attempt++ )); do
    if dagster api grpc-health-check -p 4000 > /dev/null 2>&1; then
      echo " ✅ Code server is healthy"
      return 0
    fi
    echo " ⏳ Waiting for code server to be ready... (attempt ${attempt} /${limit} )"
    sleep 2
  done

  echo " ❌ Code server failed to start after ${limit} attempts"
  return 1
}
43+
#######################################
# Launch a long-running command in the background, retrying when it dies
# shortly after launch.
# Arguments:
#   $1 - display name used in log messages
#   $2 - command line as a single string; it is split into words by the
#        shell (no quote processing is applied to it)
#   $3 - file that receives the command's stdout and stderr
#   $4 - optional: seconds to wait before probing the process (default 3)
# Outputs:
#   stdout - only the PID of the started process, so that callers using
#            VAR=$(start_process_with_retry ...) capture a bare PID
#   stderr - progress and failure messages
# Returns:
#   0 when the process is still alive after the settle delay,
#   1 when arguments are missing or 3 consecutive attempts failed.
#######################################
start_process_with_retry() {
  local name=$1
  local command=$2
  local logfile=$3
  local settle_seconds=${4:-3}
  local max_retries=3
  local retry=0
  local pid

  # Strip surrounding whitespace from the log path so it can be quoted in
  # the redirection below without changing which file existing callers hit.
  logfile="${logfile#"${logfile%%[![:space:]]*}"}"
  logfile="${logfile%"${logfile##*[![:space:]]}"}"

  if [[ -z "${command//[[:space:]]/}" || -z "$logfile" ]]; then
    echo " ❌ Failed to start $name: command and logfile are required" >&2
    return 1
  fi

  while (( retry < max_retries )); do
    echo " 🔧 Starting $name (attempt $(( retry + 1 )) /$max_retries )..." >&2
    # shellcheck disable=SC2086  # splitting $command into words is intended
    nohup $command > "$logfile" 2>&1 &
    pid=$!
    echo " $name PID: $pid " >&2

    # Give it a moment to crash if it's going to
    sleep "$settle_seconds"

    if kill -0 "$pid" 2> /dev/null; then
      echo " ✅ $name started successfully" >&2
      # The PID is the only thing written to stdout.
      printf '%s\n' "$pid"
      return 0
    fi

    echo " ⚠️ $name failed to start, retrying..." >&2
    retry=$(( retry + 1 ))
  done

  echo " ❌ Failed to start $name after $max_retries attempts" >&2
  return 1
}
74+
# --- Service startup ---------------------------------------------------------
# Each launch is tested directly in the `if` condition. The script runs under
# `set -e`, so a bare `VAR=$(...)` that fails would abort the script before a
# separate `$?` check could print its error message.

echo " 🔧 Starting code server..."
if ! CODE_SERVER_PID=$(start_process_with_retry " Code Server" " dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py" " /tmp/code_server.log"); then
  echo " ❌ Failed to start code server, exiting"
  exit 1
fi

# The webserver and daemon are started only after the code server passes its
# health check.
echo " ⏳ Waiting for code server to be ready..."
if ! check_code_server_health; then
  echo " ❌ Code server health check failed, exiting"
  exit 1
fi

echo " 🌐 Starting webserver..."
if ! WEBSERVER_PID=$(start_process_with_retry " Webserver" " dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml" " /tmp/webserver.log"); then
  echo " ❌ Failed to start webserver, exiting"
  exit 1
fi

echo " ⚙️ Starting daemon..."
if ! DAEMON_PID=$(start_process_with_retry " Daemon" " dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml" " /tmp/daemon.log"); then
  echo " ❌ Failed to start daemon, exiting"
  exit 1
fi

echo " 📊 Starting dashboard..."
if ! DASHBOARD_PID=$(start_process_with_retry " Dashboard" " uvicorn dashboard.app:app --host 0.0.0.0 --port 8080" " /tmp/dashboard.log"); then
  echo " ❌ Failed to start dashboard, exiting"
  exit 1
fi

echo " 🌐 Starting nginx reverse proxy..."
# Validate the configuration in the foreground, then background nginx itself.
# Backgrounding the whole `nginx -t && nginx ...` list would make $! the PID
# of a wrapper subshell instead of the nginx process.
if ! nginx -t; then
  echo " ❌ nginx configuration test failed, exiting"
  exit 1
fi
nginx -g " daemon off;" &
NGINX_PID=$!

echo " ✅ All services started successfully !"
echo " Code Server PID: $CODE_SERVER_PID "
echo " Webserver PID: $WEBSERVER_PID "
echo " Daemon PID: $DAEMON_PID "
echo " Dashboard PID: $DASHBOARD_PID "
echo " Nginx PID: $NGINX_PID "
#######################################
# Shutdown handler: signal the tracked service PIDs, wait for child
# processes, then exit with status 0.
# Globals:
#   NGINX_PID, DASHBOARD_PID, DAEMON_PID, WEBSERVER_PID, CODE_SERVER_PID (read)
# Outputs:
#   One status line on stdout.
#######################################
cleanup() {
  local -a service_pids

  echo " 🛑 Shutting down services..."

  # Expanded unquoted on purpose: an unset or empty PID variable then
  # contributes no argument to kill.
  # shellcheck disable=SC2206
  service_pids=($NGINX_PID $DASHBOARD_PID $DAEMON_PID $WEBSERVER_PID $CODE_SERVER_PID)

  # Best effort: kill errors (for example, a process that is already gone)
  # are ignored.
  kill "${service_pids[@]}" 2> /dev/null || true

  wait
  exit 0
}
127+
# Trap signals for graceful shutdown.
# Bash runs a trap handler only after the current foreground command has
# finished, so a foreground `sleep 30` would delay shutdown by up to 30
# seconds. The loop below therefore sleeps in the background and blocks in
# `wait`, which returns as soon as a trapped signal arrives.
MONITOR_SLEEP_PID=""

handle_shutdown_signal() {
  # End the pending sleep first; cleanup's bare `wait` would otherwise block
  # until that sleep expires.
  if [ -n "$MONITOR_SLEEP_PID" ]; then
    kill "$MONITOR_SLEEP_PID" 2> /dev/null || true
  fi
  cleanup
}

trap handle_shutdown_signal SIGTERM SIGINT

# Monitor processes and restart if they crash.
# Only the four PIDs below are probed; nginx is not checked by this loop.
while true; do
  # Check if critical processes are still running
  if ! kill -0 $CODE_SERVER_PID 2> /dev/null; then
    echo " ❌ Code server crashed, restarting..."
    CODE_SERVER_PID=$(start_process_with_retry " Code Server" " dagster code-server start -h 0.0.0.0 -p 4000 -f anomstack/main.py" " /tmp/code_server.log")
  fi

  if ! kill -0 $WEBSERVER_PID 2> /dev/null; then
    echo " ❌ Webserver crashed, restarting..."
    WEBSERVER_PID=$(start_process_with_retry " Webserver" " dagster-webserver -h 0.0.0.0 -p 3000 -w /opt/dagster/dagster_home/workspace.yaml" " /tmp/webserver.log")
  fi

  if ! kill -0 $DAEMON_PID 2> /dev/null; then
    echo " ❌ Daemon crashed, restarting..."
    DAEMON_PID=$(start_process_with_retry " Daemon" " dagster-daemon run -w /opt/dagster/dagster_home/workspace.yaml" " /tmp/daemon.log")
  fi

  if ! kill -0 $DASHBOARD_PID 2> /dev/null; then
    echo " ❌ Dashboard crashed, restarting..."
    DASHBOARD_PID=$(start_process_with_retry " Dashboard" " uvicorn dashboard.app:app --host 0.0.0.0 --port 8080" " /tmp/dashboard.log")
  fi

  # Interruptible 30 second pause between checks.
  sleep 30 &
  MONITOR_SLEEP_PID=$!
  # `|| true`: a non-zero wait status must not trip `set -e`.
  wait "$MONITOR_SLEEP_PID" || true
  MONITOR_SLEEP_PID=""
done
0 commit comments