Skip to content

Commit 5718065

Browse files
authored
Merge pull request #191 from andrewm4894/disk-cleanup
Disk cleanup
2 parents d3d7a6d + b16e786 commit 5718065

File tree

11 files changed

+596
-18
lines changed

11 files changed

+596
-18
lines changed

Makefile

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,18 +216,24 @@ fly-deploy-development:
216216
fly-deploy-demo-fresh:
	@echo "🧹 Cleaning local Docker cache to ensure fresh build..."
	docker system prune -f --filter "until=1h"
	@echo "🧹 Cleaning Docker builder cache..."
	docker builder prune -f 2>/dev/null || true
	./scripts/deployment/deploy_fly.sh --profile demo --force-rebuild

# deploy with fresh build (clears local Docker cache first) - production profile
fly-deploy-production-fresh:
	@echo "🧹 Cleaning local Docker cache to ensure fresh build..."
	docker system prune -f --filter "until=1h"
	@echo "🧹 Cleaning Docker builder cache..."
	docker builder prune -f 2>/dev/null || true
	./scripts/deployment/deploy_fly.sh --profile production --force-rebuild

# deploy with fresh build (clears local Docker cache first) - development profile
fly-deploy-development-fresh:
	@echo "🧹 Cleaning local Docker cache to ensure fresh build..."
	docker system prune -f --filter "until=1h"
	@echo "🧹 Cleaning Docker builder cache..."
	docker builder prune -f 2>/dev/null || true
	./scripts/deployment/deploy_fly.sh --profile development --force-rebuild
232238

233239
# test fly.io build locally before deploying (helps catch issues early)
@@ -416,6 +422,26 @@ posthog-example:
416422
kill-long-runs:
417423
python scripts/maintenance/kill_long_running_tasks.py
418424

425+
# clean up disk space on fly instance (requires SSH access)
# NOTE(review): `fly ssh console -C` does not pass the command through a shell,
# so a compound "cd X && python Y" needs an explicit `sh -c` wrapper — confirm
# against the flyctl version in use.
fly-cleanup:
	@echo "🧹 Running disk cleanup on Fly instance..."
	@echo "This will SSH into your Fly instance and run cleanup"
	@if [ -z "$$FLY_APP" ]; then echo "Set FLY_APP environment variable"; exit 1; fi
	fly ssh console -a "$$FLY_APP" -C "sh -c 'cd /opt/dagster/app && python scripts/maintenance/cleanup_disk_space.py'"

# preview cleanup on fly instance (dry run)
fly-cleanup-preview:
	@echo "🔍 Previewing disk cleanup on Fly instance..."
	@if [ -z "$$FLY_APP" ]; then echo "Set FLY_APP environment variable"; exit 1; fi
	fly ssh console -a "$$FLY_APP" -C "sh -c 'cd /opt/dagster/app && python scripts/maintenance/cleanup_disk_space.py --dry-run'"

# aggressive cleanup for emergency situations
fly-cleanup-aggressive:
	@echo "⚡ Running AGGRESSIVE disk cleanup on Fly instance..."
	@echo "This will remove more files - use only if disk is critically full"
	@if [ -z "$$FLY_APP" ]; then echo "Set FLY_APP environment variable"; exit 1; fi
	fly ssh console -a "$$FLY_APP" -C "sh -c 'cd /opt/dagster/app && python scripts/maintenance/cleanup_disk_space.py --aggressive'"
444+
419445
# run docker in dev mode with correct environment
420446
docker-dev-env:
421447
docker compose -f docker-compose.yaml -f docker-compose.dev.yaml up -d

Makefile.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,43 @@ make posthog-example
562562
make kill-long-runs
563563
```
564564

565+
### Fly.io Disk Space Management
566+
567+
#### `make fly-cleanup-preview`
568+
**Preview disk cleanup on Fly instance (dry run)**
569+
- Shows what files would be removed
570+
- Safe way to check cleanup impact
571+
- Requires `FLY_APP` environment variable
572+
573+
```bash
574+
export FLY_APP=anomstack-demo
575+
make fly-cleanup-preview
576+
```
577+
578+
#### `make fly-cleanup`
579+
**Clean up disk space on Fly instance**
580+
- Removes old artifacts (6+ hours)
581+
- Removes old logs (24+ hours)
582+
- Cleans database and runs VACUUM
583+
- Reports disk usage before/after
584+
585+
```bash
586+
export FLY_APP=anomstack-demo
587+
make fly-cleanup
588+
```
589+
590+
#### `make fly-cleanup-aggressive`
591+
**Emergency disk cleanup (aggressive mode)**
592+
- Removes artifacts older than 1 hour
593+
- Removes ALL log files
594+
- Use only when disk is critically full
595+
- More thorough than normal cleanup
596+
597+
```bash
598+
export FLY_APP=anomstack-demo
599+
make fly-cleanup-aggressive
600+
```
601+
565602
### Legacy Targets
566603

567604
#### `make docker-dev-env`

anomstack/jobs/cleanup.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
"""
2+
Cleanup job for managing disk space and removing old artifacts.
3+
"""
4+
5+
import os
6+
import shutil
7+
import sqlite3
8+
from datetime import datetime, timedelta
9+
from pathlib import Path
10+
11+
from dagster import DefaultScheduleStatus, JobDefinition, ScheduleDefinition, job, op, get_dagster_logger
12+
13+
14+
@op
def cleanup_old_artifacts():
    """Clean up old Dagster artifacts to free disk space.

    Scans the top level of /data/artifacts/storage and removes any
    subdirectory whose mtime is more than 6 hours old, logging the
    number of directories removed and the bytes freed.
    """
    logger = get_dagster_logger()

    artifacts_path = "/data/artifacts/storage"
    if not os.path.exists(artifacts_path):
        logger.info("Artifacts directory does not exist, skipping cleanup")
        return

    # Anything untouched for 6+ hours (by directory mtime) is treated as stale.
    cutoff_time = datetime.now() - timedelta(hours=6)
    removed_count = 0
    freed_bytes = 0

    try:
        for entry in os.listdir(artifacts_path):
            entry_path = os.path.join(artifacts_path, entry)
            if not os.path.isdir(entry_path):
                continue
            if datetime.fromtimestamp(os.path.getmtime(entry_path)) >= cutoff_time:
                continue
            try:
                # Tally the directory's size before deleting it so we can
                # report how much space was reclaimed.
                dir_size = 0
                for dirpath, _dirnames, filenames in os.walk(entry_path):
                    for filename in filenames:
                        dir_size += os.path.getsize(os.path.join(dirpath, filename))
                shutil.rmtree(entry_path)
                removed_count += 1
                freed_bytes += dir_size
                logger.info(f"Removed old artifact directory: {entry}")
            except Exception as e:
                # Best-effort: a single failed removal should not stop the sweep.
                logger.warning(f"Failed to remove {entry_path}: {e}")

        freed_mb = freed_bytes / (1024 * 1024)
        logger.info(f"Cleanup complete: removed {removed_count} directories, freed {freed_mb:.1f}MB")

    except Exception as e:
        logger.error(f"Error during artifact cleanup: {e}")
55+
56+
57+
@op
def cleanup_old_logs():
    """Clean up old log files.

    Walks /tmp/dagster and /data/dagster_storage and deletes any
    .log/.out/.err file older than 24 hours, logging a summary of
    files removed and bytes freed.
    """
    logger = get_dagster_logger()

    log_dirs = ["/tmp/dagster", "/data/dagster_storage"]
    removed_count = 0
    freed_bytes = 0

    for log_dir in log_dirs:
        if not os.path.exists(log_dir):
            continue

        try:
            for root, _dirs, files in os.walk(log_dir):
                for name in files:
                    if not name.endswith(('.log', '.out', '.err')):
                        continue
                    file_path = os.path.join(root, name)
                    # Only remove log files that are more than 24 hours old.
                    if os.path.getmtime(file_path) < (datetime.now() - timedelta(hours=24)).timestamp():
                        try:
                            size = os.path.getsize(file_path)
                            os.remove(file_path)
                            removed_count += 1
                            freed_bytes += size
                        except Exception as e:
                            # Best-effort: skip files we cannot delete.
                            logger.warning(f"Failed to remove log file {file_path}: {e}")
        except Exception as e:
            logger.warning(f"Error cleaning logs in {log_dir}: {e}")

    freed_mb = freed_bytes / (1024 * 1024)
    logger.info(f"Log cleanup complete: removed {removed_count} files, freed {freed_mb:.1f}MB")
89+
90+
91+
@op
def cleanup_old_metrics():
    """Clean up old metric data from the SQLite database.

    Deletes rows from the `metrics` table whose `metric_timestamp` is
    older than 90 days, commits the deletion, then runs VACUUM so the
    freed pages are actually returned to the filesystem.
    """
    logger = get_dagster_logger()

    db_path = "/data/anomstack.db"
    if not os.path.exists(db_path):
        logger.info("Database does not exist, skipping metric cleanup")
        return

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Remove metrics older than 90 days (string compare works because
        # timestamps are stored in lexicographically sortable form).
        cutoff_date = (datetime.now() - timedelta(days=90)).strftime('%Y-%m-%d')

        # Get count before deletion so we can report what was removed.
        cursor.execute("SELECT COUNT(*) FROM metrics WHERE metric_timestamp < ?", (cutoff_date,))
        old_count = cursor.fetchone()[0]

        # Delete old metrics
        cursor.execute("DELETE FROM metrics WHERE metric_timestamp < ?", (cutoff_date,))

        # BUGFIX: commit BEFORE vacuuming. The DELETE opens an implicit
        # transaction, and SQLite refuses to VACUUM inside a transaction.
        # The original order (VACUUM, then commit) raised OperationalError,
        # which the except clause swallowed — so the commit never ran and
        # the DELETE was rolled back when the connection closed, making
        # this op a silent no-op.
        conn.commit()

        # Vacuum to reclaim space (runs in its own implicit transaction).
        cursor.execute("VACUUM")

        logger.info(f"Database cleanup complete: removed {old_count} old metric records")

    except Exception as e:
        logger.error(f"Error during database cleanup: {e}")
    finally:
        # Always release the connection, even when cleanup fails.
        if conn is not None:
            conn.close()
125+
126+
127+
@op
def report_disk_usage():
    """Report current disk usage.

    Logs total/used/free space for the /data filesystem plus the total
    size of a few well-known data directories. Purely informational —
    performs no cleanup.
    """
    logger = get_dagster_logger()

    try:
        # Filesystem-level stats for the /data mount.
        statvfs = os.statvfs('/data')
        total_bytes = statvfs.f_frsize * statvfs.f_blocks
        free_bytes = statvfs.f_frsize * statvfs.f_bavail
        used_bytes = total_bytes - free_bytes

        total_gb = total_bytes / (1024 ** 3)
        used_gb = used_bytes / (1024 ** 3)
        free_gb = free_bytes / (1024 ** 3)
        usage_percent = (used_bytes / total_bytes) * 100

        logger.info(f"Disk usage - Total: {total_gb:.1f}GB, Used: {used_gb:.1f}GB ({usage_percent:.1f}%), Free: {free_gb:.1f}GB")

        # Per-directory totals for the main data consumers.
        data_dirs = ['/data/artifacts', '/data/dagster_storage', '/data/models']
        for dir_path in data_dirs:
            if not os.path.exists(dir_path):
                continue
            try:
                total_size = 0
                for dirpath, _dirnames, filenames in os.walk(dir_path):
                    for filename in filenames:
                        total_size += os.path.getsize(os.path.join(dirpath, filename))
                size_gb = total_size / (1024 ** 3)
                logger.info(f"{dir_path}: {size_gb:.2f}GB")
            except Exception as e:
                logger.warning(f"Could not calculate size for {dir_path}: {e}")

    except Exception as e:
        logger.error(f"Error reporting disk usage: {e}")
163+
164+
165+
@job(
    name="cleanup_disk_space",
    description="Clean up old artifacts, logs, and metrics to free disk space"
)
def cleanup_job():
    """Job to clean up disk space."""
    # NOTE(review): with no data dependencies between these op invocations,
    # Dagster does not guarantee execution order — the second
    # report_disk_usage is not necessarily scheduled after the cleanup ops.
    # Confirm whether explicit ordering (e.g. passing outputs) is needed.
    report_disk_usage()
    cleanup_old_artifacts()
    cleanup_old_logs()
    cleanup_old_metrics()
    report_disk_usage()  # Report again after cleanup


# Run the cleanup job every 2 hours; the schedule starts enabled.
cleanup_schedule = ScheduleDefinition(
    job=cleanup_job,
    cron_schedule="0 */2 * * *",  # Every 2 hours
    default_status=DefaultScheduleStatus.RUNNING,
)

# Exported collections consumed by main.py when assembling Definitions.
cleanup_jobs = [cleanup_job]
cleanup_schedules = [cleanup_schedule]

anomstack/main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from anomstack.jobs.alert import alert_jobs, alert_schedules
88
from anomstack.jobs.change import change_jobs, change_schedules
9+
# from anomstack.jobs.cleanup import cleanup_jobs, cleanup_schedules # Temporarily disabled
910
from anomstack.jobs.delete import delete_jobs, delete_schedules
1011
from anomstack.jobs.ingest import ingest_jobs, ingest_schedules
1112
from anomstack.jobs.llmalert import llmalert_jobs, llmalert_schedules
@@ -29,6 +30,7 @@
2930
+ summary_jobs
3031
+ delete_jobs
3132
+ reload_jobs
33+
# + cleanup_jobs # Temporarily disabled
3234
)
3335
sensors = [email_on_run_failure, kill_long_running_runs, config_file_watcher]
3436
schedules = (
@@ -42,6 +44,7 @@
4244
+ summary_schedules
4345
+ delete_schedules
4446
+ reload_schedules
47+
# + cleanup_schedules # Temporarily disabled
4548
)
4649

4750
defs = Definitions(

dagster_fly.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ run_retries:
3232
# Aggressive retention policies optimized for Fly.io disk usage
retention:
  schedule:
    purge_after_days: 1 # Keep for 1 day only
  sensor:
    purge_after_days:
      skipped: 1 # 1 day for skipped (minimum allowed by Dagster)
      failure: 1 # 1 day for failures
      success: 1 # 1 day for successful runs (minimum allowed by Dagster)
4141

4242
# Enhanced run monitoring for Fly.io environment
4343
run_monitoring:

docker/Dockerfile.fly

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ FROM python:3.12-slim
33
# Cache busting argument (set during build to force fresh layers)
ARG CACHEBUST=1

# Consuming CACHEBUST in a RUN layer invalidates the Docker cache from this
# point on whenever the build arg changes.
RUN echo "Cache bust: $CACHEBUST" > /tmp/cachebust
8+
69
# Install system dependencies including nginx
710
RUN apt-get update && apt-get install -y --no-install-recommends \
811
git \

scripts/deployment/deploy_fly.sh

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,10 +235,23 @@ rm fly.toml.bak
235235
echo "🚀 Deploying application..."
236236

237237
if [[ "$FORCE_REBUILD" == "true" ]]; then
    # Unique cache-bust token: unix timestamp plus a random suffix
    # (openssl when available, $RANDOM as the fallback).
    CACHEBUST_VALUE="$(date +%s)-$(openssl rand -hex 4 2>/dev/null || echo "$RANDOM")"
    echo "🔄 Force rebuild enabled - using aggressive cache busting..."
    echo "🎯 Cache bust value: $CACHEBUST_VALUE"

    # Three cache-busting strategies combined:
    #   --no-cache           skip the Docker layer cache entirely
    #   CACHEBUST build arg  invalidate layers that consume it
    #   --dockerfile         explicit dockerfile path to avoid ambiguity
    fly deploy \
        --no-cache \
        --build-arg CACHEBUST="$CACHEBUST_VALUE" \
        --dockerfile docker/Dockerfile.fly \
        -a "$APP_NAME"
else
    echo "⚡ Standard deployment (with caching)..."
    fly deploy --dockerfile docker/Dockerfile.fly -a "$APP_NAME"
fi
243256

244257
# Show the status

0 commit comments

Comments
 (0)