Skip to content

Commit ec23e21

Browse files
authored
[GPU] Dyn quan bugfix for cache (#32582)
### Description of the issue (symptom, root cause, how it was resolved) - New attributes in dynamic quantization are missing from load/store in caching #### Reproduction steps and snapshot (if applicable; do not attach for customer models) - Two consecutive executions of WWB show the issue on BMG - $rm -rf minicpm-1b-sft/pytorch/ov/OV_FP16-INT8_ASYM//model_cache/ ; python wwb.py --target-model minicpm-1b-sft/pytorch/ov/OV_FP16-INT8_ASYM/ --device gpu.1 --gt-data reference1.csv --num-sample 1 ; python wwb.py --target-model minicpm-1b-sft/pytorch/ov/OV_FP16-INT8_ASYM/ --device gpu.1 --gt-data reference1.csv --num-sample 1 #### Checklist - [x] Is it a proper fix? (not a workaround) - [x] Did you include a test case for this fix, if necessary? - [x] Did you review existing tests that can be extended to cover this scenario? Which test did you review?
1 parent 4b3d6a7 commit ec23e21

File tree

2 files changed

+22
-0
lines changed

2 files changed

+22
-0
lines changed

src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
5252
seed = hash_combine(seed, attrs.scale_dt.hash());
5353
seed = hash_combine(seed, attrs.zp_dt.hash());
5454
seed = hash_combine(seed, attrs.output_storage_type);
55+
seed = hash_combine(seed, attrs.precomputed_reduction);
56+
seed = hash_combine(seed, attrs.precomputed_reduction_dt.hash());
5557
seed = hash_combine(seed, input_size);
5658

5759
return seed;
@@ -70,6 +72,8 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
7072
attrs.scale_dt == rhs_casted.attrs.scale_dt &&
7173
attrs.zp_dt == rhs_casted.attrs.zp_dt &&
7274
attrs.quantization_type == rhs_casted.attrs.quantization_type &&
75+
attrs.precomputed_reduction == rhs_casted.attrs.precomputed_reduction &&
76+
attrs.precomputed_reduction_dt == rhs_casted.attrs.precomputed_reduction_dt &&
7377
input_size == rhs_casted.input_size;
7478
}
7579

@@ -81,8 +85,10 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
8185
ob << make_data(&attrs.scale_dt, sizeof(attrs.scale_dt));
8286
ob << make_data(&attrs.zp_dt, sizeof(attrs.zp_dt));
8387
ob << make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type));
88+
ob << make_data(&attrs.precomputed_reduction_dt, sizeof(attrs.precomputed_reduction_dt));
8489
ob << attrs.scales_zp_output_order;
8590
ob << attrs.group_sizes;
91+
ob << attrs.precomputed_reduction;
8692
ob << input_size;
8793
}
8894

@@ -94,8 +100,10 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
94100
ib >> make_data(&attrs.scale_dt, sizeof(attrs.scale_dt));
95101
ib >> make_data(&attrs.zp_dt, sizeof(attrs.zp_dt));
96102
ib >> make_data(&attrs.output_storage_type, sizeof(attrs.output_storage_type));
103+
ib >> make_data(&attrs.precomputed_reduction_dt, sizeof(attrs.precomputed_reduction_dt));
97104
ib >> attrs.scales_zp_output_order;
98105
ib >> attrs.group_sizes;
106+
ib >> attrs.precomputed_reduction;
99107
ib >> input_size;
100108
}
101109
};

src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,20 @@ TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size_precompute_g
254254
PrecomputeSum::Enabled);
255255
}
256256

257+
TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size_precompute_gs128_cache) {
258+
this->test_dynamic_quantization(true, {-1, 1, 512},
259+
{32, 1, 512},
260+
QuantizationType::Symmetric,
261+
128,
262+
data_types::i8,
263+
data_types::i8,
264+
OutputStorageType::Planar,
265+
"",
266+
SetInnerMostDimValuesZero::No,
267+
PrecomputeSum::Enabled);
268+
}
269+
270+
257271
TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_small_size_precompute_gs128_small_values) {
258272
this->test_dynamic_quantization(false, {1, 1, 512},
259273
{32, 1, 512},

0 commit comments

Comments
 (0)