Commit bcd3599

Fix FSDP v2 defaulting to version 1 in TrainingArguments
1 parent 57eeb9c commit bcd3599
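
For context, a minimal standalone sketch (not the library code path) of the bug this commit fixes: TrainingArguments strips the `fsdp_` prefix from keys in `fsdp_config`, so a user-supplied `fsdp_version: 2` was renamed to `version` before the later lookup `fsdp_config.get("fsdp_version", 1)`, which then silently fell back to FSDP v1. The config dict below is illustrative.

# Illustrative config; mirrors what a user might pass as TrainingArguments(fsdp_config=...).
cfg = {"fsdp_version": 2, "fsdp_transformer_layer_cls_to_wrap": ["LlamaDecoderLayer"]}

# Old behaviour: every `fsdp_`-prefixed key gets renamed, including `fsdp_version`.
old = dict(cfg)
for k in list(old.keys()):
    if k.startswith("fsdp_"):
        old[k[5:]] = old.pop(k)
print(old.get("fsdp_version", 1))  # 1 -- the requested FSDP v2 is silently downgraded

# Fixed behaviour: read the version before stripping and exempt the key from renaming.
new = dict(cfg)
fsdp_version = new.get("fsdp_version", 1)
for k in list(new.keys()):
    if k.startswith("fsdp_") and k != "fsdp_version":
        new[k[5:]] = new.pop(k)
print(fsdp_version)  # 2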

2 files changed: +64 -20 lines changed


src/transformers/training_args.py

Lines changed: 45 additions & 20 deletions
@@ -2676,9 +2676,10 @@ def _process_fsdp_args(self):
             with open(self.fsdp_config, encoding="utf-8") as f:
                 self.fsdp_config = json.load(f)
 
+        fsdp_version = self.fsdp_config.get("fsdp_version", 1)
         if self.fsdp_config is not None and isinstance(self.fsdp_config, dict):
             for k in list(self.fsdp_config.keys()):
-                if k.startswith("fsdp_"):
+                if k.startswith("fsdp_") and k != "fsdp_version":
                     v = self.fsdp_config.pop(k)
                     self.fsdp_config[k[5:]] = v
 
@@ -2722,15 +2723,20 @@ def _process_fsdp_args(self):
         # accelerate integration for FSDP
         fsdp_plugin_args = None
         if len(self.fsdp) > 0 and not self.fsdp_config["xla"]:
+            from accelerate.utils import FullyShardedDataParallelPlugin
             from accelerate.utils.constants import (
                 FSDP_AUTO_WRAP_POLICY,
                 FSDP_SHARDING_STRATEGY,
             )
 
             fsdp_plugin_args = {}
+            # Handle basic FSDP options from command-line flags
             for fsdp_option in self.fsdp:
                 if fsdp_option.upper() in FSDP_SHARDING_STRATEGY:
-                    fsdp_plugin_args["sharding_strategy"] = fsdp_option
+                    # Set deprecated sharding_strategy from CLI (plugin maps to reshard_after_forward)
+                    # Skip if config has explicit reshard_after_forward (prioritize config)
+                    if "reshard_after_forward" not in self.fsdp_config:
+                        fsdp_plugin_args["sharding_strategy"] = fsdp_option
                 elif fsdp_option == FSDPOption.OFFLOAD:
                     fsdp_plugin_args["cpu_offload"] = True
                 elif fsdp_option == FSDPOption.AUTO_WRAP:
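
A small sketch (illustrative names, not the Trainer code) of the precedence rule encoded in the hunk above: an explicit `reshard_after_forward` in `fsdp_config` takes priority, and the deprecated CLI sharding strategy is only forwarded when the config does not set it.

from typing import Optional

def resolve_resharding(cli_sharding: Optional[str], fsdp_config: dict) -> dict:
    # Hypothetical helper mirroring the branch above, for illustration only.
    plugin_args = {}
    if cli_sharding is not None and "reshard_after_forward" not in fsdp_config:
        # Legacy CLI value; the Accelerate plugin maps it onto reshard_after_forward.
        plugin_args["sharding_strategy"] = cli_sharding
    if "reshard_after_forward" in fsdp_config:
        plugin_args["reshard_after_forward"] = fsdp_config["reshard_after_forward"]
    return plugin_args

print(resolve_resharding("FULL_SHARD", {}))                               # {'sharding_strategy': 'FULL_SHARD'}
print(resolve_resharding("FULL_SHARD", {"reshard_after_forward": True}))  # {'reshard_after_forward': True}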
@@ -2742,24 +2748,43 @@ def _process_fsdp_args(self):
                         fsdp_plugin_args["transformer_cls_names_to_wrap"] = ",".join(
                             self.fsdp_config["transformer_layer_cls_to_wrap"]
                         )
-            fsdp_plugin_args["fsdp_version"] = self.fsdp_config.get("fsdp_version", 1)
-            prefetch_policy = self.fsdp_config.get("backward_prefetch", "NO_PREFETCH")
-            fsdp_plugin_args["backward_prefetch"] = prefetch_policy.upper()
-            fsdp_plugin_args["forward_prefetch"] = str(self.fsdp_config.get("forward_prefetch", "false")).lower()
-
-            sync_module_states = str(self.fsdp_config.get("sync_module_states", "true")).lower()
-            cpu_ram_efficient_loading = str(self.fsdp_config.get("cpu_ram_efficient_loading", "false")).lower()
-            if sync_module_states == "false" and cpu_ram_efficient_loading == "true":
-                # In this case, all the processes except the main process would have random weights leading
-                # to unexpected behaviour during training, thus throwing error here to prevent it.
-                raise ValueError('`sync_module_states` must be `"True"` if `cpu_ram_efficient_loading` is `"True"`')
-
-            # we need to set the env here as otherwise we get a warning in accelerate + we need to set it for transformers
-            fsdp_plugin_args["cpu_ram_efficient_loading"] = cpu_ram_efficient_loading
-            os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = cpu_ram_efficient_loading
-
-            fsdp_plugin_args["sync_module_states"] = sync_module_states
-            fsdp_plugin_args["use_orig_params"] = str(self.fsdp_config.get("use_orig_params", "true")).lower()
+
+            # Pull allowed parameters from fsdp_config
+            ALLOWED_FSDP_PARAMS = {f.name for f in fields(FullyShardedDataParallelPlugin)}
+            for key in ALLOWED_FSDP_PARAMS:
+                if key in self.fsdp_config and key not in fsdp_plugin_args:
+                    fsdp_plugin_args[key] = self.fsdp_config[key]
+            fsdp_plugin_args["fsdp_version"] = fsdp_version
+
+            # Special HF-to-plugin map: transformer_layer_cls_to_wrap → joined cls_names
+            if (
+                "transformer_layer_cls_to_wrap" in self.fsdp_config
+                and "transformer_cls_names_to_wrap" not in fsdp_plugin_args
+            ):
+                fsdp_plugin_args["transformer_cls_names_to_wrap"] = ",".join(
+                    self.fsdp_config["transformer_layer_cls_to_wrap"]
+                )
+
+            # Validation: sync_module_states vs cpu_ram_efficient_loading
+            sync_states = fsdp_plugin_args.get("sync_module_states", "true")
+            cpu_loading = fsdp_plugin_args.get("cpu_ram_efficient_loading", "false")
+
+            if isinstance(sync_states, str):
+                sync_states = sync_states.lower()
+            if isinstance(cpu_loading, str):
+                cpu_loading = cpu_loading.lower()
+            if sync_states == "false" and cpu_loading == "true":
+                raise ValueError('`sync_module_states` must be `"true"` if `cpu_ram_efficient_loading` is `"true"`.')
+
+            # CRITICAL: Set environment variable for cpu_ram_efficient_loading
+            if "cpu_ram_efficient_loading" in fsdp_plugin_args:
+                cpu_ram_value = fsdp_plugin_args["cpu_ram_efficient_loading"]
+                # Handle both bool and string values
+                if isinstance(cpu_ram_value, bool):
+                    cpu_ram_value = str(cpu_ram_value).lower()
+                elif isinstance(cpu_ram_value, str):
+                    cpu_ram_value = cpu_ram_value.lower()
+                os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = cpu_ram_value
 
         return fsdp_plugin_args
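
The rewritten block above replaces the hand-copied options with a whitelist derived from the plugin dataclass's own field names. A minimal sketch of that pattern with a toy dataclass (a stand-in, not Accelerate's actual FullyShardedDataParallelPlugin):

from dataclasses import dataclass, fields

@dataclass
class ToyPlugin:  # stand-in for accelerate.utils.FullyShardedDataParallelPlugin
    fsdp_version: int = 1
    reshard_after_forward: bool = True
    cpu_ram_efficient_loading: bool = False

user_config = {"fsdp_version": 2, "reshard_after_forward": True, "unknown_key": "ignored"}

# Forward only the keys the plugin actually declares as fields.
allowed = {f.name for f in fields(ToyPlugin)}
plugin_args = {k: v for k, v in user_config.items() if k in allowed}
print(plugin_args)  # {'fsdp_version': 2, 'reshard_after_forward': True}
plugin = ToyPlugin(**plugin_args)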
tests/fsdp/test_fsdp.py

Lines changed: 19 additions & 0 deletions
@@ -211,6 +211,25 @@ def test_fsdp_config(self, sharding_strategy, dtype):
         for k, v in trainer.args.fsdp_config.items():
             self.assertEqual(v, self.fsdp_config[k])
 
+    def test_fsdp_version_2_config(self):
+        output_dir = self.get_auto_remove_tmp_dir()
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 128,
+            "save_steps": 5,
+            "learning_rate": 0.1,
+            "fsdp": True,
+            "fsdp_config": {
+                "fsdp_version": 2,
+                "reshard_after_forward": True,
+            },
+        }
+        with mockenv_context(**self.dist_env_1_gpu):
+            trainer = get_regression_trainer(**kwargs)
+            plugin_args = trainer.args._process_fsdp_args()
+            self.assertEqual(plugin_args["fsdp_version"], 2)
+            self.assertTrue(plugin_args["reshard_after_forward"])
+
     @parameterized.expand(params, name_func=_parameterized_custom_name_func)
     @require_torch_multi_accelerator
     @run_first
