Update embedding_forward_quantized_cpu_template.cpp to use initialized output memory instead of uninitialized (#5054)

Hang Qu · facebook-github-bot · commit 7dd7de363618 · 2025-10-30T12:22:47.000-07:00
Summary: X-link: facebookresearch/FBGEMM#2064 We observed that if the output tensor's memory is left uninitialized, the output may contain garbage values, because some regions of the output memory are never written. The proposed fix (zero-filling the entire output right after allocation) is a quick workaround; it would be more efficient to zero-fill only the untouched memory. Reviewed By: sryap Differential Revision: D85447298
diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp
@@ -210,6 +210,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
       total_adjusted_D += T * kINT8QparamsBytes;
     }
     output = at::empty({B, total_adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));
+    output.fill_(0);
     {% else %}
     constexpr int kINT8QparamsBytes = 4; // no bag int8 output aligns with fbgemm weights storage size and layout
     constexpr int kINT4QparamsElems = 8; // scale + bias takes 4 bytes which are 8 int4 elements
@@ -220,6 +221,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
       adjusted_D += kINT4QparamsElems;
     }
     output = at::empty({total_L, adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));
+    output.fill_(0);
 
     {% endif %}
 

Original file line number	Diff line number	Diff line change
`@@ -210,6 +210,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{`
`210`	`210`	`total_adjusted_D += T * kINT8QparamsBytes;`
`211`	`211`	`}`
`212`	`212`	`output = at::empty({B, total_adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));`
	`213`	`+ output.fill_(0);`
`213`	`214`	`{% else %}`
`214`	`215`	`constexpr int kINT8QparamsBytes = 4; // no bag int8 output aligns with fbgemm weights storage size and layout`
`215`	`216`	`constexpr int kINT4QparamsElems = 8; // scale + bias takes 4 bytes which are 8 int4 elements`
`@@ -220,6 +221,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{`
`220`	`221`	`adjusted_D += kINT4QparamsElems;`
`221`	`222`	`}`
`222`	`223`	`output = at::empty({total_L, adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));`
	`224`	`+ output.fill_(0);`
`223`	`225`
`224`	`226`	`{% endif %}`
`225`	`227`