
Commit 661e342

committed
update torchao safetensors
1 parent 688d59c commit 661e342

File tree

5 files changed: +29 −125 lines changed


src/transformers/core_model_loading.py

Lines changed: 8 additions & 26 deletions
@@ -48,24 +48,16 @@
 logger = logging.get_logger(__name__)


-def extract_concrete_key(key: str, pattern: str, pattern_regex: re.Pattern) -> str:
+def extract_concrete_key_from_regex_pattern(key: str, pattern: str, pattern_regex: re.Pattern) -> str:
     match = pattern_regex.match(key)
     if not match:
         return pattern

     groups = match.groups()
-    wildcard_count = pattern.count("*")
-
-    if wildcard_count == 0:
-        return pattern
-    elif wildcard_count == 1:
-        return pattern.replace("*", groups[0])
-    else:
-        parts = pattern.split("*")
-        result = "*".join(parts[1:])
-        for i, captured in enumerate(groups[1:], start=0):
-            result = result.replace("*", str(captured), 1)
-        return result
+    parts = pattern.split("*")
+    result = "*".join(parts[1:])
+    result = result.replace("*", groups[1], 1)
+    return result


 def build_glob_alternation(
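Side note on the first hunk: the rewritten helper always drops everything in the glob before its first wildcard and fills the next wildcard from the second capture group. Below is a minimal, self-contained sketch of that behaviour; the regex and key are made-up stand-ins (the real glob-to-regex compilation lives elsewhere in core_model_loading.py), and the function body is copied from the hunk above.

import re

def extract_concrete_key_from_regex_pattern(key, pattern, pattern_regex):
    # Body copied from the hunk above (type hints omitted for brevity).
    match = pattern_regex.match(key)
    if not match:
        return pattern
    groups = match.groups()
    parts = pattern.split("*")
    result = "*".join(parts[1:])
    result = result.replace("*", groups[1], 1)
    return result

# Hypothetical inputs mirroring the torchao source glob "*_weight_*":
pattern = "*_weight_*"
pattern_regex = re.compile(r"(.*)_weight_(.*)")  # assumed stand-in for the compiled glob
key = "model.layers.0.self_attn.q_proj._weight_qdata"
print(extract_concrete_key_from_regex_pattern(key, pattern, pattern_regex))  # -> _weight_qdata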
@@ -469,7 +461,6 @@ def convert(
         )

         collected_tensors = self.collected_tensors
-
         for op in self.operations:
             with log_to_misc(layer_name, misc, (collected_tensors, layer_name), op):
                 collected_tensors = op.convert(

@@ -552,7 +543,7 @@ def log_to_misc(
     try:
         yield
     except Exception as e:
-        print(f"error: {e}")
+
         def _format_op_name(curr_op: Union[list[ConversionOps], ConversionOps, None]) -> Optional[str]:
             if curr_op is None:
                 return None

@@ -567,7 +558,6 @@ def _format_op_name(curr_op: Union[list[ConversionOps], ConversionOps, None]) ->
         if isinstance(extras, tuple) and len(extras) == 2:
             values, target_keys = extras
             descriptor = f"{op_name} " if op_name else ""
-            # print(values)
             misc[first_target_key] = (
                 f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {len(values)}"
             )

@@ -616,7 +606,7 @@ def set_param_for_module(
             param_value = param_value.to_local()
         if param_name not in module_obj._buffers:
             param_value = torch.nn.Parameter(param_value, requires_grad=param_value.is_floating_point())
-        print(f"removing {target_name} from missing keys")
+
         # Remove from missing keys (it's either mismatched, or all good)
         missing_keys.discard(target_name)
         if ref is not None and ref.shape != param_value.shape and hf_quantizer is None:

@@ -789,9 +779,6 @@ def convert_and_load_state_dict_in_model(
    ```

    """
-    print('in convert and load state dict')
-    print(f"model state_dict keys: {model.state_dict().keys()}")
-    print(f"state_dict keys: {state_dict}")
    prefix = model.base_model_prefix
    tp_plan = tp_plan or {}
    device_map = device_map or {"": "cpu"}

@@ -833,7 +820,6 @@

            # 2. finally, collect the tensor into the proper converter
            if renamed_key in missing_keys:
-                print(f"orignal key in state_dict: {original_key}, renamed_key: {renamed_key}, matched_pattern: {matched_pattern}")
                empty_param = meta_model_state_dict.get(renamed_key)
                # If we enter here, we have a WeightConverter operation to perform
                if source_pattern is not None:

@@ -863,7 +849,6 @@
                if matched_dtype_pattern is not None:
                    _dtype = dtype_plan[matched_dtype_pattern.group()]
                elif empty_param is not None and empty_param.dtype != _dtype:
-                    print("using empty param")
                    _dtype = empty_param.dtype  # usually correct when initializing

                # 4. Handle TP sharding or device_map placement -> scheduled materialization

@@ -891,7 +876,7 @@
                    # If disk, we need to materialize on cpu first
                    param_device = "cpu" if param_device == "disk" else param_device
                    future = spawn_materialize(thread_pool, tensor, param_device, _dtype)
-                    print("adding tensor")
+
                    mapping.add_tensor(renamed_key, original_key, source_pattern, future)
            elif source_pattern is not None:  # add all target keys as unexpected
                mapping = pattern_to_converter[source_pattern]

@@ -903,7 +888,6 @@
    total_entries = len(param_name_to_load)
    with logging.tqdm(total=total_entries, desc="Loading weights") as pbar:
        for first_param_name, mapping in param_name_to_load.items():
-            print(f"first_param_name: {first_param_name}")
            pbar.update(1)
            pbar.set_postfix({"Materializing param": first_param_name})
            pbar.refresh()

@@ -917,7 +901,6 @@
                    misc=misc,
                )
                for target_name, param in realized_value.items():
-                    print(f"target_name: {target_name}")
                    param = param[0] if isinstance(param, list) else param
                    device_match = device_map_regex.match(target_name)
                    param_device = device_map[device_match.group()] if device_match else device_map.get("", "cpu")

@@ -942,7 +925,6 @@
                # Cleanup the tensors
                mapping.reset()
            except SkipLayer:
-                print(f"skipping layer {first_param_name}")
                continue

    # Keep the current weight conversion mapping for later saving (in case it was coming directly from the user)

src/transformers/integrations/torchao.py

Lines changed: 2 additions & 48 deletions
@@ -215,29 +215,10 @@ def convert(
         missing_keys=None,
         **kwargs,
     ) -> dict[str, torch.Tensor]:
-        print(f"in deserialize: {input_dict.keys(), full_layer_name}")
-        if isinstance(self.hf_quantizer.quantization_config.quant_type, str):
-            is_int_4 = "int4" in self.hf_quantizer.quantization_config.quant_type
-        else:
-            config_name = self.hf_quantizer.quantization_config.quant_type.__class__.__name__
-            is_int_4 = fuzzy_match_size(config_name) == "4"
-
-        # Simple case if we gather layermsnorm weights, we can just return the value since they are not quantized
-        # if "._weight__data" in input_dict.keys():
-        #     value = (
-        #         input_dict["_weight__data"][0]
-        #         if isinstance(input_dict["._weight__data"], list)
-        #         else input_dict["_weight__data"]
-        #     )
-        #     return {full_layer_name: value}
-
-        print(list(input_dict.keys())[0])
         is_unsafe_serialization = "_weight_" not in list(input_dict.keys())[0]

         param_data = {}
         layer_name = '.'.join(full_layer_name.split(".")[:-1])
-        print(f"layer_name: {layer_name}")
-        print(is_unsafe_serialization)
         if is_unsafe_serialization:
             if isinstance(input_dict["weight"], list):
                 weight = input_dict["weight"][0]
@@ -250,41 +231,14 @@ def convert(
             else:
                 param_data[f"{layer_name}.{suffix}"] = input_dict[suffix]

-        # print("processing qdata")
-        # if isinstance(input_dict["_weight_qdata"], list):
-        #     param_data[f"{layer_name}._weight_qdata"] = input_dict["_weight_qdata"][0]
-        # else:
-        #     param_data[f"{layer_name}._weight_qdata"] = input_dict["_weight_qdata"]
-
-        # print("processing scale")
-        # if isinstance(input_dict["_weight_scale"], list):
-        #     param_data[f"{layer_name}._weight_scale"] = input_dict["_weight_scale"][0]
-        # else:
-        #     param_data[f"{layer_name}._weight_scale"] = input_dict["_weight_scale"]
-
-        # if is_int_4:
-        #     if isinstance(input_dict["weight:zero_point"], list):
-        #         param_data[f"{layer_name}:zero_point"] = input_dict["weight:zero_point"][0]
-        #     else:
-        #         param_data[f"{layer_name}:zero_point"] = input_dict["weight:zero_point"]
-
-        # If it's a bias, no need to do anything special (except removing the ":_data" part of the key, but was
-        # already done) - if it's unsafe-serialized (i.e. not safetensors), not need for anything either
+        # If it's unsafe-serialized (i.e. not safetensors), no need for anything
         if is_unsafe_serialization:
-            print("returning")
             return {full_layer_name: weight}
         # Sanity check for the new serialization format
-        elif not (TORCHAO_VERSION >= version.parse("0.14.0") and is_metadata_torchao(self.hf_quantizer.metadata)):
-            # print("metadata", self.hf_quantizer.metadata)
-            print("here")
-            print(is_metadata_torchao(self.hf_quantizer.metadata))
+        elif not (TORCHAO_VERSION >= version.parse("0.15.0") and is_metadata_torchao(self.hf_quantizer.metadata)):
             raise ValueError("To use `safetensors` serialization, you should have `torchao>=0.14.0` installed")

-        print("calling unflatten")
-        print(param_data)
-        print(self.hf_quantizer.metadata)
         unflattened_state_dict, _ = unflatten_tensor_state_dict(param_data, self.hf_quantizer.metadata)
-        print(f"unflattened_state_dict: {unflattened_state_dict}")
         new_param = unflattened_state_dict[full_layer_name]

         module, _ = get_module_from_name(model, full_layer_name)
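To make the two code paths above easier to follow, here is a hedged, runnable illustration of the serialization check. The dictionary keys are hypothetical examples of the two on-disk formats (flattened torchao safetensors versus plain torch.save weights), not keys taken from a real checkpoint.

import torch

# New safetensors format: one logical parameter arrives as several flattened tensors
# sharing the "_weight_" marker; these are later regrouped by unflatten_tensor_state_dict.
safe_dict = {"_weight_qdata": torch.zeros(4, dtype=torch.int8), "_weight_scale": torch.ones(1)}
# Unsafe (torch.save) format: the parameter arrives under its plain name.
unsafe_dict = {"weight": torch.zeros(4)}

print("_weight_" not in list(safe_dict.keys())[0])    # False -> metadata + unflatten path
print("_weight_" not in list(unsafe_dict.keys())[0])  # True  -> return the raw weight directly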

src/transformers/modeling_utils.py

Lines changed: 14 additions & 20 deletions
@@ -2104,7 +2104,6 @@ def set_decoder(self, decoder):
         possible_module_names = ["language_model", "text_model", "decoder"]
         for name in possible_module_names:
             if hasattr(self, name):
-                print(name)
                 setattr(self, name, decoder)
                 return

@@ -3111,8 +3110,6 @@ def save_pretrained(
         metadata = {}
         if hf_quantizer is not None:
             state_dict, metadata = hf_quantizer.get_state_dict_and_metadata(self, safe_serialization)
-            print("saving")
-            print(state_dict)
             metadata["format"] = "pt"

         # Only save the model itself if we are using distributed training
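For orientation only, a hedged sketch of the save path this hunk touches; the checkpoint id and output directory are placeholders. When an hf_quantizer is attached, it supplies both the serializable state dict and the safetensors metadata, and save_pretrained then just stamps the "pt" format entry.

from transformers import AutoModelForCausalLM

# Placeholder repo id: any model quantized with a torchao config and saved in this format.
model = AutoModelForCausalLM.from_pretrained("org/some-torchao-quantized-model")
model.save_pretrained("quantized-out", safe_serialization=True)  # metadata comes from the quantizer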
@@ -3949,6 +3946,19 @@ def from_pretrained(

         is_quantized = hf_quantizer is not None

+        weight_conversions: Optional[list[WeightConverter | WeightRenaming]] = None
+        model_type = getattr(config, "model_type", None)
+        if model_type is not None:
+            weight_conversions = get_checkpoint_conversion_mapping(model_type)
+        if weight_conversions is None:
+            weight_conversions = get_checkpoint_conversion_mapping("legacy")
+        if key_mapping is not None:
+            weight_conversions.extend(
+                [WeightRenaming(source_keys=k, target_keys=v) for k, v in key_mapping.items()]
+            )
+        if hf_quantizer is not None:
+            weight_conversions.extend(hf_quantizer.get_weight_conversions())
+
         if gguf_file:
             from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
@@ -3988,19 +3998,6 @@ def from_pretrained(
39883998
use_kernels=use_kernels,
39893999
)
39904000

3991-
weight_conversions: Optional[list[WeightConverter | WeightRenaming]] = None
3992-
model_type = getattr(config, "model_type", None)
3993-
if model_type is not None:
3994-
weight_conversions = get_checkpoint_conversion_mapping(model_type)
3995-
if weight_conversions is None:
3996-
weight_conversions = get_checkpoint_conversion_mapping("legacy")
3997-
if key_mapping is not None:
3998-
weight_conversions.extend(
3999-
[WeightRenaming(source_keys=k, target_keys=v) for k, v in key_mapping.items()]
4000-
)
4001-
if hf_quantizer is not None:
4002-
weight_conversions.extend(hf_quantizer.get_weight_conversions())
4003-
40044001
if _torch_distributed_available and device_mesh is not None: # add hooks to nn.Modules: no weights
40054002
model = distribute_model(model, tp_plan, distributed_config, device_mesh, tp_size)
40064003

@@ -4136,13 +4133,11 @@ def _load_pretrained_model(
         # Checkpoints are safetensors
         if checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors"):
             merged_state_dict = {}
-            i = 0
             for file in checkpoint_files:
                 file_pointer = safe_open(file, framework="pt", device="cpu")
                 all_pointer.add(file_pointer)
                 for k in file_pointer.keys():
                     merged_state_dict[k] = file_pointer.get_slice(k)  # don't materialize yet
-                i += 1
         # User passed an explicit state_dict
         elif state_dict is not None:
             merged_state_dict = state_dict
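A minimal sketch of the lazy-slice idea the loop above relies on, assuming a placeholder shard name and tensor key: safe_open hands back cheap slice handles, and a tensor is only materialized when its handle is actually indexed.

from safetensors import safe_open

with safe_open("model-00001-of-00002.safetensors", framework="pt", device="cpu") as f:
    lazy = {k: f.get_slice(k) for k in f.keys()}   # handles only, nothing read yet
    tensor = lazy["model.embed_tokens.weight"][:]  # indexing materializes just this tensor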
@@ -4466,14 +4461,13 @@ def _initialize_missing_keys(self, is_quantized: bool) -> None:
         self.initialize_weights()

     def _adjust_missing_and_unexpected_keys(
-        self, missing_keys: set[str], unexpected_keys: set[str],
+        self, missing_keys: set[str], unexpected_keys: set[str]
     ) -> tuple[set[str], set[str]]:
         """Adjust the `missing_keys` and `unexpected_keys` based on current model's exception rules, to avoid
         raising unneeded warnings/errors.
         Also, set the `_is_hf_initialized` on tied weight keys, to avoid initializing them as they are going to
         be tied anyway.
         """
-
         # Old checkpoints may have keys for rotary_emb.inv_freq forach layer, however we moved this buffer to the main model
         # (so the buffer name has changed). Remove them in such a case. This is another exception that was not added to
         # `_keys_to_ignore_on_load_unexpected` as it touches many models -> we add it manually to the existing patterns
src/transformers/quantizers/quantizer_torchao.py

Lines changed: 1 addition & 10 deletions
@@ -297,7 +297,7 @@ def create_quantized_param(
         First, we set the value the weight tensor, then we move it to the target device. Finally, we quantize the module.
         """
         from torchao.quantization import quantize_
-        print("in create quantized param")
+
         full_name = param_name
         # Those are the pre quantized weights
         if ":" in param_name:
@@ -554,20 +554,11 @@ def get_weight_conversions(self):
         from ..integrations.torchao import TorchAoDeserialize

         if self.pre_quantized:
-            print("pre_quantized")
-            print(self.metadata)
             return [
                 WeightConverter(
-                    # source_keys=["_weight_qdata", "_weight_scale", "_weight_zero_point"],
                     source_keys=["*_weight_*"],
                     target_keys="*weight",
                     operations=[TorchAoDeserialize(self)],
                 ),
-                # WeightConverter(
-                #     source_keys=["._weight__data"],
-                #     target_keys=".weight",
-                #     operations=[TorchAoDeserialize(self)],
-                # ),
-                # used for unsafe serialization
             ]
         return []
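For readers new to the converter globs, a hedged illustration of what the single remaining WeightConverter is meant to gather; the layer name is made up, and the exact flattened suffixes depend on the torchao config.

# Flattened torchao safetensors keys for one quantized Linear (illustrative):
checkpoint_keys = [
    "model.layers.0.self_attn.q_proj._weight_qdata",
    "model.layers.0.self_attn.q_proj._weight_scale",
    "model.layers.0.self_attn.q_proj._weight_zero_point",  # int4-style configs only
]
# source_keys=["*_weight_*"] collects all of them into one input_dict, and
# TorchAoDeserialize is expected to rebuild the single parameter matching "*weight":
target_key = "model.layers.0.self_attn.q_proj.weight"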

tests/quantization/torchao_integration/test_torchao.py

Lines changed: 4 additions & 21 deletions
@@ -712,11 +712,11 @@ def tearDown(self):
         backend_empty_cache(torch_device)
         gc.collect()

-    # def test_original_model_expected_output(self):
-    #     input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device)
-    #     output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
+    def test_original_model_expected_output(self):
+        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device)
+        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)

-    #     self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

     def check_serialization_expected_output(self, device, expected_output, safe_serialization=False):
         """
@@ -725,26 +725,9 @@ def check_serialization_expected_output(self, device, expected_output, safe_seri
         dtype = torch.bfloat16 if isinstance(self.quant_scheme, Int4WeightOnlyConfig) else "auto"
         with tempfile.TemporaryDirectory() as tmpdirname:
             self.quantized_model.save_pretrained(tmpdirname, safe_serialization=safe_serialization)
-
-            original_state_dict = self.quantized_model.state_dict()
-            print(original_state_dict)
-
             loaded_quantized_model = AutoModelForCausalLM.from_pretrained(
                 tmpdirname, dtype=dtype, device_map=device, torch_dtype=dtype, use_safetensors=safe_serialization
             )
-
-            loaded_state_dict = loaded_quantized_model.state_dict()
-            for key in original_state_dict:
-                if not hasattr(original_state_dict[key], "qdata"):
-                    print(torch.equal(original_state_dict[key], loaded_state_dict[key]))
-                    continue
-                print(original_state_dict[key].qdata)
-                print(loaded_state_dict[key].qdata)
-                if not torch.equal(original_state_dict[key].qdata, loaded_state_dict[key].qdata):
-                    print("not equal")
-                    print(f"key: {key}, {original_state_dict[key]}, {loaded_state_dict[key]}")
-                print("equal")
-
             input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(device)

             output = loaded_quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
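For context, the helper above is presumably driven by per-backend serialization tests along these lines; this is a hedged sketch only, since the actual callers are outside this hunk and their names may differ.

def test_serialization_safetensors(self):
    # Hypothetical caller: round-trip through save_pretrained / from_pretrained with
    # safetensors enabled, then compare generation against the expected output.
    self.check_serialization_expected_output(torch_device, self.EXPECTED_OUTPUT, safe_serialization=True)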
