
Commit a431b9a

fix safetensors
1 parent dd8f231 commit a431b9a

File tree

4 files changed: +63 -55 lines changed

src/transformers/modeling_utils.py
src/transformers/quantizers/base.py
src/transformers/quantizers/quantizer_torchao.py
tests/quantization/torchao_integration/test_torchao.py


src/transformers/modeling_utils.py

Lines changed: 12 additions & 1 deletion
@@ -112,6 +112,7 @@
     is_torch_mlu_available,
     is_torch_npu_available,
     is_torch_xla_available,
+    is_torchao_available,
     logging,
 )
 from .utils.generic import _CAN_RECORD_REGISTRY, GeneralInterface, OutputRecorder
@@ -148,6 +149,9 @@
 else:
     IS_SAGEMAKER_MP_POST_1_10 = False
 
+if is_torchao_available():
+    if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.14.0"):
+        from torchao.prototype.safetensors.safetensors_utils import is_metadata_torchao
 
 logger = logging.get_logger(__name__)
 
@@ -545,6 +549,7 @@ def _infer_parameter_dtype(
         QuantizationMethod.QUARK,
         QuantizationMethod.MXFP4,
         QuantizationMethod.BITS_AND_BYTES,
+        QuantizationMethod.TORCHAO,
     }:
         return True, None
     else:
@@ -659,7 +664,10 @@ def _load_state_dict_into_meta_model(
         if param_device == "disk":
             if not is_safetensors:
                 disk_offload_index = offload_weight(param, param_name, disk_offload_folder, disk_offload_index)
-            elif not is_quantized or not hf_quantizer.param_needs_quantization(model, param_name):
+            elif not is_quantized or (
+                not is_metadata_torchao(hf_quantizer.metadata)
+                and not hf_quantizer.param_needs_quantization(model, param_name)
+            ):
                 if is_fsdp_enabled():
                     param_device = "cpu" if is_local_dist_rank_0() else "meta"
 
@@ -4808,6 +4816,9 @@ def _load_pretrained_model(
             _error_msgs, disk_offload_index = load_shard_file(args)
             error_msgs += _error_msgs
 
+        if hf_quantizer:
+            hf_quantizer.update_model_with_metadata(model, hf_quantizer.metadata)
+
         # Save offloaded index if needed
         if disk_offload_index is not None and len(disk_offload_index) > 0 and not is_offloaded_safetensors:
             save_offload_index(disk_offload_index, disk_offload_folder)
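The notable change in this file is the extra metadata check in the disk-offload branch: when the checkpoint's safetensors header carries torchao metadata, every parameter must flow through the quantizer, even though the flattened tensor names would not match `param_needs_quantization`. A condensed sketch of the new predicate (a simplification of the hunk above, not the actual `_load_state_dict_into_meta_model` body):

from torchao.prototype.safetensors.safetensors_utils import is_metadata_torchao

def loads_as_plain_param(is_quantized, hf_quantizer, model, param_name) -> bool:
    # Sketch: mirrors the boolean logic of the new elif condition above.
    if not is_quantized:
        return True
    # New in this commit: torchao-flattened checkpoints always go through the
    # quantizer so the flattened pieces can be reassembled later.
    if is_metadata_torchao(hf_quantizer.metadata):
        return False
    return not hf_quantizer.param_needs_quantization(model, param_name)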

src/transformers/quantizers/base.py

Lines changed: 3 additions & 2 deletions
@@ -67,6 +67,7 @@ class HfQuantizer(ABC):
 
     def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         self.quantization_config = quantization_config
+        self.metadata = {}
 
         # -- Handle extra kwargs below --
         self.modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
@@ -344,9 +345,9 @@ def get_state_dict_and_metadata(self, model, safe_serialization=False):
         """Get state dict and metadata. Useful when we need to modify a bit the state dict due to quantization"""
         return None, {}
 
-    def update_state_dict_with_metadata(self, state_dict, metadata):
+    def update_model_with_metadata(self, model, metadata):
         """Update state dict with metadata. Default behaviour returns state_dict"""
-        return state_dict
+        pass
 
     @abstractmethod
     def is_serializable(self, safe_serialization=None): ...
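Note the contract change: the old `update_state_dict_with_metadata` returned a (possibly modified) state dict before loading, while the renamed `update_model_with_metadata` runs once after all shards are loaded (see the `_load_pretrained_model` hunk above) and mutates the model in place, returning nothing. A standalone sketch of the new hook shape, with hypothetical names (this is not the transformers implementation):

class QuantizerSketch:
    def __init__(self, quantization_config, **kwargs):
        self.quantization_config = quantization_config
        self.metadata = {}  # filled from the checkpoint's safetensors header at load time

    def update_model_with_metadata(self, model, metadata):
        # Default behaviour: leave the model untouched.
        pass


class MyQuantizer(QuantizerSketch):
    def update_model_with_metadata(self, model, metadata):
        if not metadata:  # nothing to reconstruct for this checkpoint
            return super().update_model_with_metadata(model, metadata)
        # Rebuild quantized tensors on `model` in place here.
        pass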

src/transformers/quantizers/quantizer_torchao.py

Lines changed: 36 additions & 50 deletions
@@ -14,7 +14,6 @@
 import importlib
 import re
 import types
-from collections import defaultdict
 from typing import TYPE_CHECKING, Optional, Union
 
 from packaging import version
@@ -87,6 +86,9 @@ def _linear_extra_repr(self):
 SUPPORTED_SAFE_SERIALIZATION_CONFIGS = [
     torchao.quantization.Float8WeightOnlyConfig,
     torchao.quantization.Float8DynamicActivationFloat8WeightConfig,
+    torchao.quantization.Int4WeightOnlyConfig,
+    torchao.quantization.IntxWeightOnlyConfig,
+    torchao.quantization.Int8DynamicActivationIntxWeightConfig,
 ]
 
 TORCHAO_VERSION = version.parse(importlib.metadata.version("torchao"))
@@ -104,20 +106,6 @@ class TorchAoHfQuantizer(HfQuantizer):
     def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)
 
-        if isinstance(self.quantization_config.quant_type, str):
-            is_int_4 = "int4" in self.quantization_config.quant_type
-        else:
-            config_name = self.quantization_config.quant_type.__class__.__name__
-            is_int_4 = fuzzy_match_size(config_name) == "4"
-
-        # TODO: better way to get the serialized key names? Hard to read from torchao codebase
-        if is_int_4:
-            self.weight_ao_keys = ["qdata", "scale", "zero_point"]
-        else:
-            self.weight_ao_keys = ["qdata", "scale"]
-        # Instead of serializing the simple torch.Tensor like usual, torchao adds a `:_data` suffix so we need this
-        self.full_ao_keys = self.weight_ao_keys + ["_data"]
-
     def validate_environment(self, *args, **kwargs):
         if not is_torchao_available():
             raise ImportError("Loading an torchao quantized model requires torchao library (`pip install torchao`)")
@@ -234,7 +222,7 @@ def _process_model_before_weight_loading(
         return
 
     def update_unexpected_keys(self, model, unexpected_keys: list[str]) -> list[str]:
-        return [k for k in unexpected_keys if not any(k.endswith(x) for x in self.full_ao_keys)]
+        return [k for k in unexpected_keys if "_" not in k]
 
     def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
         if self.quantization_config.quant_type == "autoquant":
@@ -243,7 +231,7 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **
         # check if the param_name is not in self.modules_to_not_convert
         if any(key + "." in param_name or key == param_name for key in self.modules_to_not_convert):
             return False
-        elif any(param_name.endswith(f":{x}") for x in self.full_ao_keys):
+        elif "_" in param_name:
            return True
         else:
             # we only quantize the weight of nn.Linear and nn.Embedding
@@ -253,6 +241,34 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **
             _QUANTIZABLE.append(torch.nn.Embedding)
         return isinstance(module, tuple(_QUANTIZABLE)) and tensor_name == "weight"
 
+    def update_model_with_metadata(self, model, metadata):
+        if TORCHAO_VERSION >= version.parse("0.14.0") and is_metadata_torchao(self.metadata):
+            updated_state_dict = unflatten_tensor_state_dict(model.state_dict(), metadata)
+
+            weights_to_register = set(updated_state_dict.keys())
+
+            for name, param in list(model.named_parameters()):
+                module_fqn, weight_name = name.rsplit(".", 1)
+                module = model.get_submodule(module_fqn)
+                weight = getattr(module, weight_name)
+
+                device = weight.device
+                requires_grad = weight.requires_grad
+
+                if "_weight_" in weight_name:
+                    delattr(module, weight_name)
+
+                if name in weights_to_register:
+                    new_param_value = updated_state_dict[name]
+                    new_param = torch.nn.Parameter(new_param_value.to(device), requires_grad=requires_grad)
+                    module.register_parameter(weight_name, new_param)
+
+                    weights_to_register.remove(name)
+
+            model.load_state_dict(updated_state_dict, strict=False)
+        else:
+            return super().update_model_with_metadata(model, metadata)
+
     def create_quantized_param(
         self,
         model: "PreTrainedModel",
@@ -267,42 +283,12 @@
         """
         from torchao.quantization import quantize_
 
-        full_name = param_name
-        # Those are the pre quantized weights
-        if ":" in param_name:
-            param_name = param_name.rsplit(":", 1)[0]
         module, tensor_name = get_module_from_name(model, param_name)
-
         if self.pre_quantized:
-            # If it's a bias, no need to do anything special (except removing the ":_data" part of the key, but was
-            # already done) - if it's unsafe-serialized (i.e. not safetensors), not need for anything either
-            is_unsafe_serialization = ":" not in full_name
-            if tensor_name == "bias" or is_unsafe_serialization:
-                module._parameters[tensor_name] = torch.nn.Parameter(
-                    param_value.to(target_device), requires_grad=param_value.requires_grad
-                )
-                return
-            # Sanity check for the new serialization format
-            elif not (TORCHAO_VERSION >= version.parse("0.14.0") and is_metadata_torchao(self.metadata)):
-                raise ValueError("To use `safetensors` serialization, you should have `torchao>=0.14.0` installed")
-
-            # Save the states for later quantization when they are all gathered
-            if not hasattr(self, "ao_params"):
-                self.ao_params = defaultdict(dict)
-            self.ao_params[param_name].update({full_name: param_value})
-
-            # We are ready for quantization in this case (we retrieved all the needed keys)
-            if len(self.ao_params[param_name]) == len(self.weight_ao_keys):
-                new_param = unflatten_tensor_state_dict(self.ao_params[param_name], self.metadata)[param_name]
-                # Set it
-                module._parameters[tensor_name] = torch.nn.Parameter(
-                    new_param.to(target_device), requires_grad=new_param.requires_grad
-                )
-
-                # Free memory
-                del self.ao_params[param_name]
+            module._parameters[tensor_name] = torch.nn.Parameter(
+                param_value.to(target_device), requires_grad=param_value.requires_grad
+            )
 
-            # Add repr to the module
             if isinstance(module, nn.Linear):
                 module.extra_repr = types.MethodType(_linear_extra_repr, module)
             else:
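The rewritten `create_quantized_param` now simply materializes whatever plain tensor arrives on the target device; reassembling torchao tensor subclasses happens once, in `update_model_with_metadata`, through `unflatten_tensor_state_dict`. A rough sketch of the flatten/unflatten round trip this relies on, assuming torchao >= 0.14.0; only `unflatten_tensor_state_dict` and `is_metadata_torchao` appear in the diff, so the flatten-side helper and its import path are assumptions:

import torch
from torchao.quantization import Float8WeightOnlyConfig, quantize_
# Assumed import path for the prototype helpers (torchao >= 0.14.0):
from torchao.prototype.safetensors.safetensors_support import (
    flatten_tensor_state_dict,
    unflatten_tensor_state_dict,
)

model = torch.nn.Sequential(torch.nn.Linear(64, 64))
quantize_(model, Float8WeightOnlyConfig())  # weights become torchao tensor subclasses

# Save side: tensor subclasses -> plain tensors (qdata, scale, ...) plus a
# metadata blob small enough for the safetensors header.
flat_state_dict, metadata = flatten_tensor_state_dict(model.state_dict())

# Load side: what update_model_with_metadata does once every shard is loaded.
restored_state_dict = unflatten_tensor_state_dict(flat_state_dict, metadata)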

tests/quantization/torchao_integration/test_torchao.py

Lines changed: 12 additions & 2 deletions
@@ -536,7 +536,7 @@ def setUpClass(cls):
 
     def setUp(self):
         self.quant_config = TorchAoConfig(self.quant_scheme)
-        dtype = torch.bfloat16 if isinstance(self.quant_scheme, Int4WeightOnlyConfig) else "auto"
+        dtype = torch.bfloat16 if self.quant_scheme == "int4_weight_only" else "auto"
         self.quantized_model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
             dtype=dtype,
@@ -552,7 +552,6 @@ def tearDown(self):
     def test_original_model_expected_output(self):
         input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device)
         output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
-
         self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
 
     def check_serialization_expected_output(self, device, expected_output, safe_serialization=False):
@@ -578,6 +577,7 @@ class TorchAoSafeSerializationTest(TorchAoSerializationTest):
     # called only once for all test in this class
     @classmethod
     def setUpClass(cls):
+        super().setUpClass()
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
         cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"
 
@@ -596,6 +596,16 @@ def tearDown(self):
             "What are we having for dinner?\n\nJess: (smiling) I",
         ),
         (torchao.quantization.Float8WeightOnlyConfig(), "What are we having for dinner?\n\nJessica: (smiling)"),
+        (Int4WeightOnlyConfig(), "What are we having for dinner?"),
+        (
+            Int4WeightOnlyConfig(int4_packing_format="tile_packed_to_4d"),
+            "What are we having for dinner?\nRed, white, and green beans,",
+        ),
+        (
+            torchao.quantization.Int8DynamicActivationIntxWeightConfig(),
+            "What are we having for dinner?\n\nJessica: (smiling)",
+        ),
+        (torchao.quantization.IntxWeightOnlyConfig(), "What are we having for dinner?\n\nJessica: (smiling)"),
     ]
     if is_torchao_available()
     else []
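With `Int4WeightOnlyConfig`, `IntxWeightOnlyConfig`, and `Int8DynamicActivationIntxWeightConfig` now whitelisted for safe serialization, these schemes can round-trip through safetensors. A minimal usage sketch (checkpoint name and output path are placeholders):

import torch
from torchao.quantization import Int4WeightOnlyConfig
from transformers import AutoModelForCausalLM, TorchAoConfig

model_name = "facebook/opt-125m"  # placeholder checkpoint
quant_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig())

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quant_config,
)

# Newly supported: the flattened torchao tensors plus their metadata are
# written into the safetensors file.
model.save_pretrained("opt-125m-int4", safe_serialization=True)

# Reloading exercises the unflatten path added in this commit.
reloaded = AutoModelForCausalLM.from_pretrained("opt-125m-int4", device_map="auto")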
