Add NEON-based FloatOrHalfToFused8BitRowwiseQuantizedSBFloat (#5089)

Nicoshev · meta-codesync[bot] · commit 16aa87b3dc6b · 2025-11-05T09:58:34.000-08:00
Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/2098

Pull Request resolved: #5089

Adds a NEON translation of FloatOrHalfToFused8BitRowwiseQuantizedSBFloat, which is used by Ads.

Performance improves by an order of magnitude.

Before:
bit_rate, rows, cols, elems_per_usec, GB/Sec
8, 100, 16, 378.68, 1.51
8, 100, 64, 286.91, 1.15
8, 100, 128, 262.06, 1.05
8, 100, 256, 251.34, 1.01
8, 100, 512, 244.92, 0.98
8, 100, 1024, 237.35, 0.95
8, 100, 2048, 230.83, 0.92
8, 120, 16, 378.70, 1.51
8, 120, 64, 286.72, 1.15
8, 120, 128, 263.40, 1.05
8, 120, 256, 251.58, 1.01
8, 120, 512, 245.30, 0.98
8, 120, 1024, 238.17, 0.95
8, 120, 2048, 230.69, 0.92
8, 1000, 16, 392.85, 1.57
8, 1000, 64, 294.35, 1.18
8, 1000, 128, 264.35, 1.06
8, 1000, 256, 252.13, 1.01
8, 1000, 512, 245.50, 0.98
8, 1000, 1024, 241.61, 0.97
8, 1000, 2048, 231.39, 0.93

After:
bit_rate, rows, cols, elems_per_usec, GB/Sec
8, 100, 16, 1855.59, 7.42
8, 100, 64, 2615.43, 10.46
8, 100, 128, 3134.34, 12.54
8, 100, 256, 2610.72, 10.44
8, 100, 512, 3065.20, 12.26
8, 100, 1024, 3535.29, 14.14
8, 100, 2048, 3757.66, 15.03
8, 120, 16, 1991.94, 7.97
8, 120, 64, 2971.25, 11.89
8, 120, 128, 3403.37, 13.61
8, 120, 256, 2750.87, 11.00
8, 120, 512, 3272.63, 13.09
8, 120, 1024, 3618.98, 14.48
8, 120, 2048, 3848.59, 15.39
8, 1000, 16, 2329.11, 9.32
8, 1000, 64, 3068.76, 12.28
8, 1000, 128, 3678.86, 14.72
8, 1000, 256, 4440.37, 17.76
8, 1000, 512, 4558.70, 18.23
8, 1000, 1024, 4620.94, 18.48
8, 1000, 2048, 3898.84, 15.60

Reviewed By: mcfi

Differential Revision: D86236406

fbshipit-source-id: 12c20cbdbbc9b0674ccca8e1aa598b7de144dea9
diff --git a/include/fbgemm/QuantUtilsNeon.h b/include/fbgemm/QuantUtilsNeon.h
@@ -22,6 +22,13 @@ namespace fbgemm {
 // Utility functions
 ////////////////////////////////////////////////////////////////////////////////
 
+template <typename InputType> // float; any other type is read as float16_t
+void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatNeon(
+    const InputType* input, // dense rows of input_columns elements each
+    size_t input_rows,
+    int input_columns,
+    uint8_t* output); // per row: input_columns bytes, then float scale, bias
+
 template <typename OutputType>
 void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfNeon(
     const std::uint8_t* input,
diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc
@@ -714,6 +714,10 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloat(
     int input_columns,
     std::uint8_t* output,
     const InputType* rowwise_min_max) {
+#if HAVE_SVE
+  FloatOrHalfToFused8BitRowwiseQuantizedSBFloatNeon<InputType>(
+      input, input_rows, input_columns, output);
+#else
   if (cpuinfo_initialize() && fbgemmHasAvx2Support()) {
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
     FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2<InputType>(
@@ -723,6 +727,7 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloat(
     FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef<InputType>(
         input, input_rows, input_columns, output);
   }
+#endif
 }
 
 template <typename OutputType, bool is_uint16_t_of_type_bf16>
diff --git a/src/QuantUtilsNeon.cc b/src/QuantUtilsNeon.cc
@@ -11,12 +11,13 @@
 #include "fbgemm/Utils.h"
 
 #define FBGEMM_EXPORTS
+#include <arm_fp16.h> // @manual
 #include <arm_neon.h> // @manual
 #if HAVE_SVE
+#include <arm_neon_sve_bridge.h> // @manual
 #include <arm_sve.h> // @manual
 #endif
 
-#include <arm_neon_sve_bridge.h> // @manual
 #include <algorithm> //for std::min/std::max
 #include <cassert> //for assert
 #include <cfloat> // for FLT_MAX
@@ -32,41 +33,48 @@ namespace fbgemm {
 using namespace std;
 ////////////////////////////////////////////////////////////////////////////////
 // Utility functions
-
-void FindMinMax(const float* m, float* min, float* max, int64_t len) {
-  if (__builtin_expect(len <= 0, 0)) {
-    *min = 0.0f;
-    *max = 0.0f;
-    return;
-  }
-
+static inline void
+FindMinMaxImpl_f32(const float* m, float* min, float* max, uint64_t count) {
   float first = *m;
 
+  float tmp_min_s = first;
+  float tmp_max_s = first;
+
   float32x4_t temp_min_0 = vdupq_n_f32(first);
   float32x4_t temp_min_1 = vdupq_n_f32(first);
   float32x4_t temp_max_0 = vdupq_n_f32(first);
   float32x4_t temp_max_1 = vdupq_n_f32(first);
-  uint64_t i = 0;
-  uint64_t count = static_cast<uint64_t>(len);
-  uint64_t loopBound = count - (count % 8);
-
-  for (; i < loopBound; i += 8) {
-    float32x4_t v0 = vld1q_f32(m + i);
-    float32x4_t v1 = vld1q_f32(m + i + 4);
-    temp_min_0 = vminq_f32(temp_min_0, v0);
-    temp_min_1 = vminq_f32(temp_min_1, v1);
-    temp_max_0 = vmaxq_f32(temp_max_0, v0);
-    temp_max_1 = vmaxq_f32(temp_max_1, v1);
+  constexpr uint64_t kItemsPerIter = 8;
+  uint64_t loopIters = count / kItemsPerIter;
+  uint64_t loopRemainder = count % kItemsPerIter;
+
+  if (__builtin_expect(loopIters > 0, 1)) {
+    do {
+      float32x4_t v0 = vld1q_f32(m);
+      float32x4_t v1 = vld1q_f32(m + 4);
+      m += kItemsPerIter;
+      loopIters -= 1;
+      temp_min_0 = vminq_f32(temp_min_0, v0);
+      temp_min_1 = vminq_f32(temp_min_1, v1);
+      temp_max_0 = vmaxq_f32(temp_max_0, v0);
+      temp_max_1 = vmaxq_f32(temp_max_1, v1);
+    } while (loopIters > 0);
+
+    temp_min_0 = vminq_f32(temp_min_0, temp_min_1);
+    temp_max_0 = vmaxq_f32(temp_max_0, temp_max_1);
+
+    tmp_min_s = vminvq_f32(temp_min_0);
+    tmp_max_s = vmaxvq_f32(temp_max_0);
   }
 
-  temp_min_0 = vminq_f32(temp_min_0, temp_min_1);
-  temp_max_0 = vmaxq_f32(temp_max_0, temp_max_1);
-
-  float tmp_min_s = vminvq_f32(temp_min_0);
-  float tmp_max_s = vmaxvq_f32(temp_max_0);
-
-  for (; i < count; i++) {
-    float tmp = *m;
+#ifdef __clang__
+#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
+#elif defined(__GNUC__)
+#pragma GCC novector unroll 0
+#endif
+  while (loopRemainder > 0) {
+    float tmp = *m++;
+    loopRemainder -= 1;
     tmp_min_s = std::min(tmp_min_s, tmp);
     tmp_max_s = std::max(tmp_max_s, tmp);
   }
@@ -75,8 +83,180 @@ void FindMinMax(const float* m, float* min, float* max, int64_t len) {
   *max = tmp_max_s;
 }
 
+void FindMinMax(const float* m, float* min, float* max, int64_t len) {
+  if (__builtin_expect(len > 0, 1)) {
+    FindMinMaxImpl_f32(m, min, max, static_cast<uint64_t>(len));
+    return;
+  }
+  // len <= 0: `m` is never read and both outputs are reported as zero.
+  *min = 0.0f;
+  *max = 0.0f;
+}
+
 #if HAVE_SVE
 
+// Scans m[0..count) and writes its minimum and maximum, widened to float.
+// Precondition: count >= 1, because m[0] is read unconditionally to seed the
+// accumulators.
+static inline void
+FindMinMaxImpl_f16(const float16_t* m, float* min, float* max, uint64_t count) {
+  const float16_t seed = *m;
+  float16_t lo = seed;
+  float16_t hi = seed;
+
+  // Bulk phase: 16 halves per step through two independent accumulator pairs.
+  constexpr uint64_t kHalvesPerStep = 16;
+  uint64_t steps = count / kHalvesPerStep;
+  uint64_t tail = count % kHalvesPerStep;
+  if (__builtin_expect(steps > 0, 1)) {
+    float16x8_t lo_a = vdupq_n_f16(seed);
+    float16x8_t lo_b = lo_a;
+    float16x8_t hi_a = lo_a;
+    float16x8_t hi_b = lo_a;
+    do {
+      const float16x8_t front = vld1q_f16(m);
+      const float16x8_t back = vld1q_f16(m + 8);
+      m += kHalvesPerStep;
+      --steps;
+      lo_a = vminq_f16(lo_a, front);
+      lo_b = vminq_f16(lo_b, back);
+      hi_a = vmaxq_f16(hi_a, front);
+      hi_b = vmaxq_f16(hi_b, back);
+    } while (steps > 0);
+
+    // Merge the two accumulators, then reduce across lanes to a scalar.
+    lo = vminvq_f16(vminq_f16(lo_a, lo_b));
+    hi = vmaxvq_f16(vmaxq_f16(hi_a, hi_b));
+  }
+
+  // Scalar tail of fewer than 16 items; the pragmas disable vectorization.
+#ifdef __clang__
+#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
+#elif defined(__GNUC__)
+#pragma GCC novector unroll 0
+#endif
+  for (; tail > 0; --tail) {
+    const float16_t item = *m++;
+    lo = vminh_f16(lo, item);
+    hi = vmaxh_f16(hi, item);
+  }
+
+  *min = static_cast<float>(lo);
+  *max = static_cast<float>(hi);
+}
+
+// Row-wise 8-bit quantization: each output row is input_columns uint8 codes
+// followed by two floats, scale = (max - min) / 255 and bias = min.
+// InputType is float; any other type is read as float16_t.
+template <typename InputType>
+void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatNeon(
+    const InputType* input,
+    size_t input_rows,
+    int input_columns,
+    uint8_t* output) {
+  constexpr float kEpsilon = 1e-8f;
+
+  if (input_rows == 0 || input_columns <= 0) {
+    return;
+  }
+
+  uint64_t column_count = static_cast<uint64_t>(input_columns);
+  const uint64_t output_columns = column_count + 2 * sizeof(float);
+
+  for (size_t row = 0; __builtin_expect(row < input_rows, 1); ++row) {
+    const InputType* input_row = input + row * column_count;
+    uint8_t* output_row = output + row * output_columns;
+    float* output_row_scale_bias =
+        reinterpret_cast<float*>(output_row + column_count);
+
+    float minimum_element;
+    float maximum_element;
+    if constexpr (std::is_same<InputType, float>()) {
+      FindMinMaxImpl_f32(
+          input_row, &minimum_element, &maximum_element, column_count);
+    } else {
+      FindMinMaxImpl_f16(
+          reinterpret_cast<const float16_t*>(input_row),
+          &minimum_element,
+          &maximum_element,
+          column_count);
+    }
+    float range = maximum_element - minimum_element;
+    // kEpsilon keeps the division finite when the row is constant (range 0).
+    const auto inverse_scale = 255.0f / (range + kEpsilon);
+    float32x4_t inverse_scale_v = vdupq_n_f32(inverse_scale);
+    float32x4_t min_v = vdupq_n_f32(minimum_element);
+
+    constexpr uint64_t kItemsPerIter = 8;
+    uint64_t loopIters = column_count / kItemsPerIter;
+    uint64_t loopRemainder = column_count % kItemsPerIter;
+
+    output_row_scale_bias[0] = range / 255.0f;
+    output_row_scale_bias[1] = minimum_element;
+
+    while (__builtin_expect(loopIters > 0, 1)) {
+      float32x4_t v0;
+      float32x4_t v1;
+      if constexpr (std::is_same<InputType, float>()) {
+        v0 = vld1q_f32(input_row);
+        v1 = vld1q_f32(input_row + 4);
+      } else {
+        float16x8_t h0 =
+            vld1q_f16(reinterpret_cast<const float16_t*>(input_row));
+        v0 = vcvt_f32_f16(vget_low_f16(h0));
+        v1 = vcvt_high_f32_f16(h0);
+      }
+      input_row += kItemsPerIter;
+      loopIters -= 1;
+
+      // code = round((x - min) * inverse_scale); vcvtn rounds ties to even.
+      v0 = vsubq_f32(v0, min_v);
+      v1 = vsubq_f32(v1, min_v);
+      v0 = vmulq_f32(v0, inverse_scale_v);
+      v1 = vmulq_f32(v1, inverse_scale_v);
+      int32x4_t i0 = vcvtnq_s32_f32(v0);
+      int32x4_t i1 = vcvtnq_s32_f32(v1);
+
+      // Narrowing stores: one byte per 32-bit lane. Only the first four lanes
+      // are enabled; the lanes above the NEON register are undefined, and an
+      // all-true predicate would also store them on SVE vectors wider than 128
+      // bits, clobbering this row's scale/bias or bytes past the buffer.
+      svst1b_s32(
+          svptrue_pat_b32(SV_VL4),
+          reinterpret_cast<int8_t*>(output_row),
+          svset_neonq_s32(svundef_s32(), i0));
+      svst1b_s32(
+          svptrue_pat_b32(SV_VL4),
+          reinterpret_cast<int8_t*>(output_row + 4),
+          svset_neonq_s32(svundef_s32(), i1));
+      output_row += kItemsPerIter;
+    }
+
+#ifdef __clang__
+#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
+#elif defined(__GNUC__)
+#pragma GCC novector unroll 0
+#endif
+    while (loopRemainder > 0) {
+      // The scalar is broadcast so that no vector lane is left uninitialized.
+      float value;
+      if constexpr (std::is_same<InputType, float>()) {
+        value = *input_row;
+      } else {
+        value =
+            static_cast<float>(*reinterpret_cast<const float16_t*>(input_row));
+      }
+      input_row += 1;
+      loopRemainder -= 1;
+      float32x4_t v0 = vsubq_f32(vdupq_n_f32(value), min_v);
+      v0 = vmulq_f32(v0, inverse_scale_v);
+      int32x4_t i0 = vcvtnq_s32_f32(v0);
+      *output_row = static_cast<uint8_t>(vgetq_lane_s32(i0, 0));
+      output_row += 1;
+    }
+  } // for each row
+}
+
 template <typename OutputType>
 void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfNeon(
     const std::uint8_t* input,
@@ -179,7 +359,12 @@ void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfNeon(
       const std::uint8_t* input,                                         \
       size_t input_rows,                                                 \
       int input_columns,                                                 \
-      type* output);
+      type* output);                                                     \
+  template void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatNeon<type>( \
+      const type* input,                                                 \
+      size_t input_rows,                                                 \
+      int input_columns,                                                 \
+      uint8_t* output);
 
 // clang-format off
 INSTANTIATE_QuantizationNeonFunctions8Bits(float)