Commit 322af3b

feat: support resuming wandb run from training checkpoint
- Add wandb_run_id and wandb_resume config options
- Save wandb run id when saving checkpoint
- Load trainer from checkpoint when from_pretrained_path is set
1 parent a0e7838 · commit 322af3b

4 files changed: +35 −14

pyproject.toml

Lines changed: 2 additions & 0 deletions

@@ -89,6 +89,8 @@ dev = [
     "qwen-vl-utils>=0.0.10",
     "tabulate>=0.9.0",
     "gradio>=5.34.0",
+    "sqlalchemy>=2.0.44",
+    "apscheduler>=3.11.1",
 ]
 docs = [
     "mkdocs-gen-files>=0.5.0",

src/lm_saes/config.py

Lines changed: 2 additions & 0 deletions

@@ -782,6 +782,8 @@ class WandbConfig(BaseConfig):
     wandb_project: str = "gpt2-sae-training"
     exp_name: str | None = None
     wandb_entity: str | None = None
+    wandb_run_id: str | None = None
+    wandb_resume: Literal["allow", "must", "never", "auto"] = "never"
 
 
 class MongoDBConfig(BaseConfig):
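
The two new fields map directly onto wandb.init's resume and id arguments (wired up in the runner below). A minimal sketch of a resume-oriented config, assuming WandbConfig is a pydantic-style model that accepts its fields as keyword arguments:

from lm_saes.config import WandbConfig

# "must" makes wandb.init fail if the given run id does not already exist,
# guarding against silently starting a fresh run; the id itself is hypothetical.
wandb_cfg = WandbConfig(
    wandb_project="gpt2-sae-training",
    wandb_run_id="abc123xy",
    wandb_resume="must",
)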

src/lm_saes/runners/train.py

Lines changed: 26 additions & 10 deletions

@@ -168,14 +168,23 @@ def train_sae(settings: TrainSAESettings) -> None:
             entity=settings.wandb.wandb_entity,
             settings=wandb.Settings(x_disable_stats=True),
             mode=os.getenv("WANDB_MODE", "online"),  # type: ignore
+            resume=settings.wandb.wandb_resume,
+            id=settings.wandb.wandb_run_id,
         )
         if settings.wandb is not None and (device_mesh is None or mesh_rank(device_mesh) == 0)
         else None
     )
-
     sae = initializer.initialize_sae_from_config(
         settings.sae, activation_stream=activations_stream, device_mesh=device_mesh, wandb_logger=wandb_logger
     )
+    if settings.trainer.from_pretrained_path is not None:
+        trainer = Trainer.from_checkpoint(
+            sae,
+            settings.trainer.from_pretrained_path,
+        )
+        trainer.wandb_logger = wandb_logger
+    else:
+        trainer = Trainer(settings.trainer)
 
     logger.info(f"SAE initialized: {type(sae).__name__}")
@@ -186,17 +195,24 @@ def train_sae(settings: TrainSAESettings) -> None:
     eval_fn = (lambda x: None) if settings.eval else None
 
     logger.info("Starting training")
-    trainer = Trainer(settings.trainer)
-    sae.cfg.save_hyperparameters(settings.trainer.exp_result_path)
-    trainer.fit(sae=sae, activation_stream=activations_stream, eval_fn=eval_fn, wandb_logger=wandb_logger)
 
-    logger.info("Training completed, saving model")
-    sae.save_pretrained(
-        save_path=settings.trainer.exp_result_path,
-        sae_name=settings.sae_name,
-        sae_series=settings.sae_series,
-        mongo_client=mongo_client,
+    sae.cfg.save_hyperparameters(settings.trainer.exp_result_path)
+    end_of_stream = trainer.fit(
+        sae=sae, activation_stream=activations_stream, eval_fn=eval_fn, wandb_logger=wandb_logger
     )
+    logger.info("Training completed, saving model")
+    if end_of_stream:
+        trainer.save_checkpoint(
+            sae=sae,
+            checkpoint_path=settings.trainer.exp_result_path,
+        )
+    else:
+        sae.save_pretrained(
+            save_path=settings.trainer.exp_result_path,
+            sae_name=settings.sae_name,
+            sae_series=settings.sae_series,
+            mongo_client=mongo_client,
+        )
 
     if wandb_logger is not None:
         wandb_logger.finish()
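
The runner change has two moving parts: the trainer is now constructed before training starts, restored via Trainer.from_checkpoint when from_pretrained_path is set, and trainer.fit now returns an end_of_stream flag that decides between writing a resumable checkpoint and exporting the final model. A sketch of a resumed run; every name below except from_pretrained_path, exp_result_path, wandb_run_id, and wandb_resume is an assumption about the surrounding settings classes:

# Hypothetical resume invocation; TrainSAESettings / TrainerConfig constructors
# and the paths are assumed, not taken from this commit.
settings = TrainSAESettings(
    trainer=TrainerConfig(
        from_pretrained_path="results/sae-l6/ckpt",  # hypothetical checkpoint dir
        exp_result_path="results/sae-l6",
    ),
    wandb=WandbConfig(wandb_run_id="abc123xy", wandb_resume="must"),
)
train_sae(settings)  # restores trainer state, then wandb.init resumes run abc123xy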

src/lm_saes/trainer.py

Lines changed: 5 additions & 4 deletions

@@ -1,3 +1,4 @@
+import json
 import math
 import os
 from pathlib import Path
@@ -88,11 +89,12 @@ def save_checkpoint(self, sae: AbstractSparseAutoEncoder, checkpoint_path: Path
             "checkpoint_thresholds": self.checkpoint_thresholds,
             "cfg": self.cfg,
         }
-
         # Save trainer state
         trainer_path = checkpoint_dir / "trainer.pt"
         torch.save(trainer_state, trainer_path)
-
+        if self.wandb_logger is not None:
+            with open(checkpoint_dir / "wandb_run_id.json", "w") as f:
+                json.dump({"wandb_run_id": self.wandb_logger.id}, f)
         # Save optimizer state - handle distributed tensors
         if self.optimizer is not None:
             if sae.device_mesh is None:
@@ -479,14 +481,13 @@ def fit(
                     with timer.time("evaluation"):
                         eval_fn(sae)
 
-                self._maybe_save_sae_checkpoint(sae)
                 with timer.time("scheduler_step"):
                     self.scheduler.step()
-
                 self.cur_step += 1
                 self.cur_tokens += (
                     batch["tokens"].numel() if batch.get("mask") is None else int(item(batch["mask"].sum()))
                 )
+                self._maybe_save_sae_checkpoint(sae)
                 if self.cur_tokens >= self.cfg.total_training_tokens:
                     break
             except StopIteration:
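
save_checkpoint now writes wandb_run_id.json alongside trainer.pt, but nothing in this commit reads the file back; feeding the saved id into WandbConfig is left to whoever launches the resumed run. One way to close that loop, as a sketch (load_wandb_run_id and the checkpoint path are hypothetical, not part of the codebase):

import json
from pathlib import Path

def load_wandb_run_id(checkpoint_dir: Path) -> str | None:
    # Older checkpoints predate this feature, so tolerate a missing file.
    path = checkpoint_dir / "wandb_run_id.json"
    if not path.exists():
        return None
    with open(path) as f:
        return json.load(f)["wandb_run_id"]

run_id = load_wandb_run_id(Path("results/sae-l6/ckpt"))  # hypothetical path
# e.g. settings.wandb.wandb_run_id = run_id; settings.wandb.wandb_resume = "allow"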
