Skip to content

Commit ec23e21

Browse files
authored
[GPU] Dyn quan bugfix for cache (#32582)
### Description of the issue (symptom, root cause, how it was resolved) - New attributes in dynamic quantization are missing from load/store in caching #### Reproduction steps and snapshot (if applicable; do not attach for customer models) - Two consecutive executions of WWB show the issue on BMG - $rm -rf minicpm-1b-sft/pytorch/ov/OV_FP16-INT8_ASYM//model_cache/ ; python wwb.py --target-model minicpm-1b-sft/pytorch/ov/OV_FP16-INT8_ASYM/ --device gpu.1 --gt-data reference1.csv --num-sample 1 ; python wwb.py --target-model minicpm-1b-sft/pytorch/ov/OV_FP16-INT8_ASYM/ --device gpu.1 --gt-data reference1.csv --num-sample 1 #### Checklist - [x] Is it a proper fix? (not a workaround) - [x] Did you include a test case for this fix, if necessary? - [x] Did you review existing tests that can be extended to cover this scenario? Which test did you review?
1 parent 4b3d6a7 commit ec23e21

File tree

2 files changed

+22
-0
lines changed

2 files changed

+22
-0
lines changed

src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
5252
seed = hash_combine(seed, attrs.scale_dt.hash());
5353
seed = hash_combine(seed, attrs.zp_dt.hash());
5454
seed = hash_combine(seed, attrs.output_storage_type);
55+
seed = hash_combine(seed, attrs.precomputed_reduction);
56+
seed = hash_combine(seed, attrs.precomputed_reduction_dt.hash());
5557
seed = hash_combine(seed, input_size);
5658

5759
return seed;
@@ -70,6 +72,8 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
7072
attrs.scale_dt == rhs_casted.attrs.scale_dt &&
7173
attrs.zp_dt == rhs_casted.attrs.zp_dt &&
7274
attrs.quantization_type == rhs_casted.attrs.quantization_type &&
75+
attrs.precomputed_reduction == rhs_casted.attrs.precomputed_reduction &&
76+
attrs.precomputed_reduction_dt == rhs_casted.attrs.precomputed_reduction_dt &&
7377
input_size == rhs_casted.input_size;
7478
}
7579

@@ -81,8 +85,10 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
8185
ob << make_data(&attrs.scale_dt, sizeof(attrs.scale_dt));
8286
ob << make_data(&attrs.zp_dt, sizeof(attrs.zp_dt));
8387
ob << make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type));
88+
ob << make_data(&attrs.precomputed_reduction_dt, sizeof(attrs.precomputed_reduction_dt));
8489
ob << attrs.scales_zp_output_order;
8590
ob << attrs.group_sizes;
91+
ob << attrs.precomputed_reduction;
8692
ob << input_size;
8793
}
8894

@@ -94,8 +100,10 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
94100
ib >> make_data(&attrs.scale_dt, sizeof(attrs.scale_dt));
95101
ib >> make_data(&attrs.zp_dt, sizeof(attrs.zp_dt));
96102
ib >> make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type));
103+
ib >> make_data(&attrs.precomputed_reduction_dt, sizeof(attrs.precomputed_reduction_dt));
97104
ib >> attrs.scales_zp_output_order;
98105
ib >> attrs.group_sizes;
106+
ib >> attrs.precomputed_reduction;
99107
ib >> input_size;
100108
}
101109
};

src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,20 @@ TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size_precompute_g
254254
PrecomputeSum::Enabled);
255255
}
256256

257+
TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size_precompute_gs128_cache) {
258+
this->test_dynamic_quantization(true, {-1, 1, 512},
259+
{32, 1, 512},
260+
QuantizationType::Symmetric,
261+
128,
262+
data_types::i8,
263+
data_types::i8,
264+
OutputStorageType::Planar,
265+
"",
266+
SetInnerMostDimValuesZero::No,
267+
PrecomputeSum::Enabled);
268+
}
269+
270+
257271
TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size_precompute_gs128_small_values) {
258272
this->test_dynamic_quantization(false, {1, 1, 512},
259273
{32, 1, 512},

0 commit comments

Comments
 (0)