Update embedding_forward_quantized_cpu_template.cpp to use initialized output memory instead of uninitialized (#5054)

Hang Qu · facebook-github-bot · commit e449ca522873 · 2025-10-27T13:09:17.000-07:00
Summary: X-link: facebookresearch/FBGEMM#2064 We observed that when the output tensor's memory is left uninitialized, the output may contain garbage, because some regions of the output are never written. The proposed fix zero-fills the whole output tensor when the output type is neither INT4 nor INT8. This is a quick workaround; it would be more efficient to fill only the untouched memory with zeros. Reviewed By: sryap Differential Revision: D85447298
diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp
@@ -210,6 +210,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
       total_adjusted_D += T * kINT8QparamsBytes;
     }
     output = at::empty({B, total_adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));
+    if (!output_is_int4 && !output_is_int8) {
+      output.fill_(0);
+    }
     {% else %}
     constexpr int kINT8QparamsBytes = 4; // no bag int8 output aligns with fbgemm weights storage size and layout
     constexpr int kINT4QparamsElems = 8; // scale + bias takes 4 bytes which are 8 int4 elements
@@ -220,7 +223,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
       adjusted_D += kINT4QparamsElems;
     }
     output = at::empty({total_L, adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));
-
+    if (!output_is_int4 && !output_is_int8) {
+      output.fill_(0);
+    }
     {% endif %}