@@ -483,6 +483,18 @@ def create_weights(
             else:
                 layer.register_parameter("input_scale", None)
 
+        # create per-tensor qparams populated by process_weights_after_loading
+        else:
+            scale = create_fp8_scale_parameter(
+                PerTensorScaleParameter,
+                output_partition_sizes,
+                input_size_per_partition,
+                None,
+                weight_loader,
+            )
+            set_weight_attrs(scale, {"scale_type": "weight_scale"})
+            layer.register_parameter("weight_scale", scale)
+
     def process_weights_after_loading(self, layer: Module) -> None:
         size_k_first = True
         input_scale = None
@@ -494,8 +506,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
             weight, weight_scale = process_fp8_weight_block_strategy(
                 layer.weight, layer.weight_scale_inv
             )
-            # Delete the weight_scale_inv parameter to avoid confusion
-            # with the weight_scale parameter
+            # Rename weight_scale_inv parameter for consistency
+            layer.weight_scale = layer.weight_scale_inv
             del layer.weight_scale_inv
 
         # If checkpoint not serialized fp8, quantize the weights.
@@ -755,12 +767,10 @@ def create_weights(
             if self.block_quant
             else {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
         )
-        # If loading fp8 checkpoint, pass the weight loaders.
-        # If loading an fp16 checkpoint, do not (we will quantize in
-        # process_weights_after_loading()
-        if self.quant_config.is_checkpoint_fp8_serialized:
-            set_weight_attrs(w13_weight_scale, extra_weight_attrs)
-            set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # add weight loaders to support loading (and reloading)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
         # INPUT_SCALES
         if self.quant_config.activation_scheme == "static":
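The block-quant hunk above keeps the loaded scale reachable under the canonical `weight_scale` name instead of only deleting `weight_scale_inv`. Below is a minimal sketch of that rename pattern on a bare `nn.Module`; the `DummyLayer` class and `rename_scale` helper are illustrative stand-ins, not part of this change or of vLLM.

```python
import torch
from torch import nn


class DummyLayer(nn.Module):
    """Stand-in for a quantized linear layer holding a block-quant scale."""

    def __init__(self) -> None:
        super().__init__()
        # Checkpoint-style name as produced by block-quantized fp8 checkpoints.
        self.weight_scale_inv = nn.Parameter(
            torch.ones(4, 4), requires_grad=False
        )


def rename_scale(layer: nn.Module) -> None:
    # Re-register the tensor under the canonical name, then drop the old one,
    # mirroring the `layer.weight_scale = layer.weight_scale_inv` /
    # `del layer.weight_scale_inv` pair in the hunk above.
    layer.weight_scale = layer.weight_scale_inv
    del layer.weight_scale_inv


layer = DummyLayer()
rename_scale(layer)
assert hasattr(layer, "weight_scale")
assert not hasattr(layer, "weight_scale_inv")
```

Both names point at the same storage until the `del`, so no copy is made and anything a weight loader already wrote into `weight_scale_inv` remains visible through `weight_scale`.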