Merge pull request #120 from OpenMOSS/llada

dest1n1s · web-flow · commit 13576f8b8a6f · 2025-07-07T19:03:04.000+08:00
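Fix batch size validation for data parallelism (compare global batch sizes across DTensor and non-DTensor inputs) and divide the progress-bar total by the device mesh size for activation processing in CachedActivationLoader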
diff --git a/src/lm_saes/activation/processors/activation.py b/src/lm_saes/activation/processors/activation.py
@@ -349,11 +349,15 @@ def process(self, data: Iterable[dict[str, Any]], **kwargs) -> Iterable[dict[str
         """
         buffer = ActivationBuffer(generator=self.perm_generator, device_mesh=self.device_mesh)
         pbar = tqdm(total=self.buffer_size, desc="Buffer monitor", miniters=1, disable=True)
-
+        dp_size = get_mesh_dim_size(self.device_mesh, "data")
         for d in data:
+
+            def get_batch_size(x):
+                return len(x) if isinstance(x, DTensor) else len(x) * dp_size
+
             # Validate input: ensure all tensors and lists have consistent shapes
-            assert all(len(d[k]) == len(d[next(iter(d.keys()))]) for k in d.keys()), (
-                "All tensors and lists must have the same batch size"
+            assert all(get_batch_size(d[k]) == get_batch_size(d[next(iter(d.keys()))]) for k in d.keys()), (
+                f"All tensors and lists must have the same batch size, {[(k, len(d[k])) for k in d.keys()]}"
             )
 
             # Add new data to buffer
diff --git a/src/lm_saes/activation/processors/cached_activation.py b/src/lm_saes/activation/processors/cached_activation.py
@@ -240,7 +240,7 @@ def _process_chunks(self, hook_chunks: dict[str, list[ChunkInfo]], num_chunks: i
         else:
             for data in tqdm(
                 dataloader,
-                total=len(cached_activation_dataset),
+                total=len(cached_activation_dataset) // self.device_mesh.size(),
                 desc="Processing activation chunks",
                 disable=not is_master(),
             ):