
Commit dbf10e5

no new parameters
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 684f254 commit dbf10e5

File tree

2 files changed: +33 −48 lines

  vllm/model_executor/layers/quantization/fp8.py
  vllm/model_executor/layers/quantization/kv_cache.py

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 26 additions & 48 deletions
@@ -7,7 +7,6 @@
 
 import torch
 from torch.nn import Module
-from torch.nn.parameter import Parameter
 
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
@@ -525,13 +524,10 @@ def process_weights_after_loading(self, layer: Module) -> None:
             weight = weight.t()
 
         # Update layer with new values.
-        layer.weight = Parameter(weight.data, requires_grad=False)
-        layer.weight_scale = Parameter(weight_scale.data, requires_grad=False)
-        layer.input_scale = (
-            Parameter(input_scale, requires_grad=False)
-            if input_scale is not None
-            else None
-        )
+        layer.weight.copy_(weight.data)
+        layer.weight_scale.copy_(weight_scale.data)
+        if input_scale is not None:
+            layer.input_scale.copy_(input_scale)
 
         if self.use_marlin:
             prepare_fp8_layer_for_marlin(layer, size_k_first)
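For readers unfamiliar with the distinction, here is a minimal stand-alone PyTorch sketch (a toy nn.Linear, not vLLM code) of why the copy_ form leaves the module with no new parameter objects, while the old Parameter(...) rebinding creates fresh ones:

import torch
from torch import nn
from torch.nn.parameter import Parameter

layer = nn.Linear(4, 4, bias=False)
original = layer.weight  # external reference, e.g. held by some wrapper

new_values = torch.randn(4, 4)

# Rebinding: layer.weight now points at a brand-new Parameter; `original`
# still points at the old storage and silently goes stale.
layer.weight = Parameter(new_values.clone(), requires_grad=False)
print(layer.weight is original)  # False

# In-place copy: the existing Parameter keeps its identity and storage,
# only its values change, so prior references stay valid.
layer = nn.Linear(4, 4, bias=False)
original = layer.weight
with torch.no_grad():
    layer.weight.copy_(new_values)
print(layer.weight is original)  # True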
@@ -827,22 +823,18 @@ def process_weights_after_loading(self, layer: Module) -> None:
                 w2_weight_scale_inv = layer.w2_weight_scale_inv
 
             # torch.compile() cannot use Parameter subclasses.
-            layer.w13_weight = Parameter(w13_weight, requires_grad=False)
-            layer.w13_weight_scale_inv = Parameter(
-                w13_weight_scale_inv, requires_grad=False
-            )
-            layer.w2_weight = Parameter(w2_weight, requires_grad=False)
-            layer.w2_weight_scale_inv = Parameter(
-                w2_weight_scale_inv, requires_grad=False
-            )
+            layer.w13_weight.copy_(w13_weight)
+            layer.w13_weight_scale_inv.copy_(w13_weight_scale_inv)
+            layer.w2_weight.copy_(w2_weight)
+            layer.w2_weight_scale_inv.copy_(w2_weight_scale_inv)
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
                 shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                     layer.w13_weight.data, layer.w2_weight.data
                 )
 
-                layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
+                layer.w13_weight.copy_(shuffled_w13)
+                layer.w2_weight.copy_(shuffled_w2)
 
             # DeepGemm scales need to be transposed and aligned. We try to do
             # it ahead of time for performance reasons.
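One constraint this pattern relies on (an observation about torch.Tensor.copy_, not something stated in the diff): the destination parameter must already exist with a shape the source can broadcast into, so steps like the aiter weight shuffle have to stay shape-preserving for the result to be written back in place. A small hypothetical sketch:

import torch

# Hypothetical stand-in for a layout-preparation step such as a kernel-specific
# weight shuffle; the key assumption is that it keeps the tensor's shape, so the
# result can be written back into the same buffer with copy_.
def shuffle_experts(w: torch.Tensor) -> torch.Tensor:
    perm = torch.randperm(w.shape[0])
    return w[perm].contiguous()

w13 = torch.nn.Parameter(torch.randn(8, 64, 128), requires_grad=False)
shuffled = shuffle_experts(w13.data)

assert shuffled.shape == w13.shape  # copy_ needs a broadcast-compatible source
with torch.no_grad():
    w13.copy_(shuffled)  # same Parameter object, new values in place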
@@ -864,7 +856,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
 
             # Re-initialize w13_scale because we directly quantize
             # merged w13 weights and generate a single scaling factor.
-            layer.w13_weight_scale = torch.nn.Parameter(
+            layer.w13_weight_scale.copy_(
                 torch.ones(
                     layer.local_num_experts,
                     dtype=torch.float32,
@@ -879,16 +871,16 @@ def process_weights_after_loading(self, layer: Module) -> None:
                 w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
                     ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
                 )
-            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            layer.w13_weight.copy_(w13_weight)
+            layer.w2_weight.copy_(w2_weight)
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
                 shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                     layer.w13_weight, layer.w2_weight
                 )
 
-                layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
+                layer.w13_weight.copy_(shuffled_w13)
+                layer.w2_weight.copy_(shuffled_w2)
         # If checkpoint is fp8, we need to handle that the
         # MoE kernels require single activation scale and single weight
         # scale for w13 per expert.
@@ -909,12 +901,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     "fp8 MoE layer. Using the maximum across experts "
                     "for each layer."
                 )
-                layer.w13_input_scale = torch.nn.Parameter(
-                    layer.w13_input_scale.max(), requires_grad=False
-                )
-                layer.w2_input_scale = torch.nn.Parameter(
-                    layer.w2_input_scale.max(), requires_grad=False
-                )
+                layer.w13_input_scale.copy_(layer.w13_input_scale.max())
+                layer.w2_input_scale.copy_(layer.w2_input_scale.max())
             if current_platform.is_fp8_fnuz():
                 # Normalize the weights and scales
                 w13_weight, w13_weight_scale, w13_input_scale = (
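Note the behavioral nuance in the input-scale hunk: the old code replaced the per-expert scale tensor with a 0-dim Parameter holding the max, while copy_ broadcasts that 0-dim max into the existing buffer, so the parameter keeps its per-expert shape but every entry now holds the shared value. A tiny illustration with hypothetical values, not vLLM code:

import torch

# Pretend per-expert input scales, one value per expert.
w13_input_scale = torch.nn.Parameter(
    torch.tensor([0.5, 0.7, 0.9]), requires_grad=False
)

# max() materializes a 0-dim tensor first, then copy_ broadcasts it into the
# existing buffer: same shape as before, every entry now holds the max.
with torch.no_grad():
    w13_input_scale.copy_(w13_input_scale.max())

print(w13_input_scale)  # all three entries are now 0.9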
@@ -928,22 +916,14 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     )
                 )
                 # Reset the parameter
-                layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
-                layer.w13_weight_scale = torch.nn.Parameter(
-                    w13_weight_scale, requires_grad=False
-                )
+                layer.w13_weight.copy_(w13_weight)
+                layer.w13_weight_scale.copy_(w13_weight_scale)
                 if w13_input_scale is not None:
-                    layer.w13_input_scale = torch.nn.Parameter(
-                        w13_input_scale, requires_grad=False
-                    )
-                layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
-                layer.w2_weight_scale = torch.nn.Parameter(
-                    w2_weight_scale, requires_grad=False
-                )
+                    layer.w13_input_scale.copy_(w13_input_scale)
+                layer.w2_weight.copy_(w2_weight)
+                layer.w2_weight_scale.copy_(w2_weight_scale)
                 if w2_input_scale is not None:
-                    layer.w2_input_scale = torch.nn.Parameter(
-                        w2_input_scale, requires_grad=False
-                    )
+                    layer.w2_input_scale.copy_(w2_input_scale)
 
             # Fp8 moe kernel needs single weight scale for w13 per expert.
             # We take the max then dequant and requant each expert.
@@ -967,12 +947,10 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     layer.w13_weight, layer.w2_weight
                 )
 
-                layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
+                layer.w13_weight.copy_(shuffled_w13)
+                layer.w2_weight.copy_(shuffled_w2)
 
-            layer.w13_weight_scale = torch.nn.Parameter(
-                max_w13_scales, requires_grad=False
-            )
+            layer.w13_weight_scale.copy_(max_w13_scales)
 
         if self.flashinfer_moe_backend is not None:
             # NOTE: weights have to be swapped since the activation is

vllm/model_executor/layers/quantization/kv_cache.py

Lines changed: 7 additions & 0 deletions
@@ -45,6 +45,13 @@ def apply(self, layer: torch.nn.Module) -> torch.Tensor:
         raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.")
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # skip if there are no weights to process (for example, weight reloading)
+        if not hasattr(layer, "q_scale"):
+            assert not hasattr(layer, "k_scale")
+            assert not hasattr(layer, "v_scale")
+            assert not hasattr(layer, "prob_scale")
+            return
+
         # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0
         # regardless whether the kv-scale is available in the checkpoint.
         # No need to process kv scales after loading if we are going to
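A minimal sketch of the guard this hunk adds, using a stand-in module rather than vLLM's attention layer (the reload flow shown is assumed for illustration only):

import torch

class FakeAttention(torch.nn.Module):
    """Stand-in for an attention layer; may or may not carry kv-cache scales."""

def process_weights_after_loading(layer: torch.nn.Module) -> None:
    # skip if there are no weights to process (for example, weight reloading)
    if not hasattr(layer, "q_scale"):
        return
    # ... normal k/v/prob scale processing would follow here ...

fresh = FakeAttention()
fresh.q_scale = torch.tensor(1.0)        # first load: scale present, gets processed
process_weights_after_loading(fresh)

reloaded = FakeAttention()               # reload pass: scales already consumed
process_weights_after_loading(reloaded)  # returns early instead of raising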
