Add MlasIsDynamicQGemmAvailable() helper and use that in place of platform-specific checks (#26668)

edgchen1 · web-flow · commit 61ff403b45e7 · 2025-12-04T09:38:05.000-08:00
### Description
<!-- Describe your changes. -->

Add `MlasIsDynamicQGemmAvailable()` helper function and use that in
place of platform-specific checks.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

Try to reduce platform-specific code.
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@@ -1,7 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/common/cpuid_info.h"  // for CPUIDInfo::GetCPUIDInfo().HasArm_SME2()
 #include "core/common/narrow.h"
 #include "core/common/safeint.h"
 #include "core/mlas/inc/mlas.h"
@@ -213,9 +212,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
         }
       }
 
-      // Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops.
-      // We check that here too before attempting to use them.
-      if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME2()) {
+      if (!MlasIsDynamicQGemmAvailable()) {
         can_use_dynamic_quant_mlas_ = false;
       }
 
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
@@ -634,6 +634,7 @@ MlasGemm(
 {
     MlasGemmBatch(Shape, &DataParams, 1, ThreadPool);
 }
+
 /**
  * @brief Parameters that define the shape of a dynamically quantized GEMM operation.
  *
@@ -646,6 +647,7 @@ struct MLAS_GEMM_DYN_QUANT_SHAPE_PARAMS {
     size_t N = 0;                  /**< Column size of matrix B */
     size_t K = 0;                  /**< Column size of matrix A and Row size of matrix B */
 };
+
 /**
  * @brief Parameters that define the data buffers and layout for a dynamic quant GEMM.
  *
@@ -680,6 +682,14 @@ MlasDynamicQGemm (
     MlasDynamicQGemmBatch(Shape, DataParams, 1, ThreadPool);
 }
 
+/**
+ * @brief Determines whether a dynamic quantized GEMM implementation is available on the current platform.
+ *
+ * MlasDynamicQGemm() and MlasDynamicQGemmBatch() should only be called if this function returns true.
+ */
+bool
+MLASCALL
+MlasIsDynamicQGemmAvailable();
 
 //
 // Symmetric QGEMM has limited buffer overrun.
diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp
@@ -201,6 +201,17 @@ MlasGemmBatch(
     });
 }
 
+bool
+MLASCALL
+MlasIsDynamicQGemmAvailable()
+{
+#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+  return ArmKleidiAI::UseSME2;
+#else
+  return false;
+#endif
+}
+
 void
 MLASCALL
 MlasDynamicQGemmBatch (
@@ -209,11 +220,11 @@ MlasDynamicQGemmBatch (
     const size_t BatchN,
     MLAS_THREADPOOL* ThreadPool
 ) {
+    assert(MlasIsDynamicQGemmAvailable());
+
 #if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
-    //No fallback and putting in guards. This implementation is SME2 specific.
-    if(ArmKleidiAI::UseSME2){
-        ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool);
-    }
+    //No fallback
+    ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool);
 #endif
 
     MLAS_UNREFERENCED_PARAMETER(Shape);
@@ -332,13 +343,13 @@ MlasDynamicQgemmPackBSize(
     size_t K
 )
 {
+    assert(MlasIsDynamicQGemmAvailable());
+
     size_t bytes = 0;
 #if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
     //No fallback available
     //TODO: Insert Override
-    if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){//Still require this since no override
-        bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K);
-    }
+    bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K);
 #endif
 
     MLAS_UNREFERENCED_PARAMETER(N);
@@ -405,11 +416,15 @@ Return Value:
     const size_t BufferAlignment = MlasGetPreferredBufferAlignment();
     const size_t AlignedBytesRequired = (BytesRequired + BufferAlignment - 1) &
         ~(BufferAlignment - 1);
-    // If this gemm B argument is used in a dynamically quantization gemm operation we can optimize for
+    // If this gemm B argument is used in a dynamically quantized gemm operation we can optimize for
     // this use case. Concat both packed representations for later decision. This allows for cases later
-    // where we still have the prepack at the cost of some memory otherwise we can use the qgemm quantization 
+    // where we still have the prepack at the cost of some memory otherwise we can use the qgemm quantization
     // for better performance
-    return AlignedBytesRequired + MlasDynamicQgemmPackBSize(N, K);
+    if (MlasIsDynamicQGemmAvailable()) {
+        return AlignedBytesRequired + MlasDynamicQgemmPackBSize(N, K);
+    } else {
+        return AlignedBytesRequired;
+    }
 }
 
 void
@@ -423,11 +438,11 @@ MlasDynamicQgemmPackB(
     void* PackedB
 )
 {
+    assert(MlasIsDynamicQGemmAvailable());
+
 #if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
     //No fallback
-    if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){//Still require this since no override
-        ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB);
-    }
+    ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB);
 #endif
 
     MLAS_UNREFERENCED_PARAMETER(N);
diff --git a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp
@@ -4,11 +4,8 @@
 // SPDX-License-Identifier: MIT
 //
 
-// Currently this test only applies to KleidiAI Guard against it running in any other situation
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
-
+#include "mlas.h"
 #include "test_util.h"
-#include "core/mlas/lib/mlasi.h"  // for MLAS_CPUIDINFO
 
 class MlasDynamicQgemmTest {
  private:
@@ -20,11 +17,6 @@ class MlasDynamicQgemmTest {
 
  public:
   void Test(size_t M, size_t N, size_t K, size_t BatchSize) {
-    // Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops.
-    if (!MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) {
-      GTEST_SKIP() << "MlasDynamicQGemmBatch() requires ARM64 SME2 but it was not detected. Skipping test.";
-    }
-
     // Setup buffers for holding various data
 
     float* A = buffer_a.GetBuffer(M * K * BatchSize);
@@ -167,6 +159,10 @@ class DynamicQgemmExecuteTest : public MlasTestFixture<MlasDynamicQgemmTest> {
 };
 
 static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) {
+  // Only register tests if MlasDynamicQGemmBatch() has an implementation available.
+  if (!MlasIsDynamicQGemmAvailable()) {
+    return size_t{0};
+  }
+
   return DynamicQgemmExecuteTest::RegisterAll(is_short_execute);
 });
-#endif

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,6 @@`
`1`	`1`	`// Copyright (c) Microsoft Corporation. All rights reserved.`
`2`	`2`	`// Licensed under the MIT License.`
`3`	`3`
`4`		`-#include "core/common/cpuid_info.h" // for CPUIDInfo::GetCPUIDInfo().HasArm_SME2()`
`5`	`4`	`#include "core/common/narrow.h"`
`6`	`5`	`#include "core/common/safeint.h"`
`7`	`6`	`#include "core/mlas/inc/mlas.h"`
`@@ -213,9 +212,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {`
`213`	`212`	`}`
`214`	`213`	`}`
`215`	`214`
`216`		`- // Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops.`
`217`		`- // We check that here too before attempting to use them.`
`218`		`- if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME2()) {`
	`215`	`+ if (!MlasIsDynamicQGemmAvailable()) {`
`219`	`216`	`can_use_dynamic_quant_mlas_ = false;`
`220`	`217`	`}`
`221`	`218`
Original file line number	Diff line number	Diff line change
`@@ -634,6 +634,7 @@ MlasGemm(`
`634`	`634`	`{`
`635`	`635`	`MlasGemmBatch(Shape, &DataParams, 1, ThreadPool);`
`636`	`636`	`}`
	`637`	`+`
`637`	`638`	`/**`
`638`	`639`	`* @brief Parameters that define the shape of a dynamically quantized GEMM operation.`
`639`	`640`	`*`
`@@ -646,6 +647,7 @@ struct MLAS_GEMM_DYN_QUANT_SHAPE_PARAMS {`
`646`	`647`	`size_t N = 0; /**< Column size of matrix B */`
`647`	`648`	`size_t K = 0; /**< Column size of matrix A and Row size of matrix B */`
`648`	`649`	`};`
	`650`	`+`
`649`	`651`	`/**`
`650`	`652`	`* @brief Parameters that define the data buffers and layout for a dynamic quant GEMM.`
`651`	`653`	`*`
`@@ -680,6 +682,14 @@ MlasDynamicQGemm (`
`680`	`682`	`MlasDynamicQGemmBatch(Shape, DataParams, 1, ThreadPool);`
`681`	`683`	`}`
`682`	`684`
	`685`	`+/**`
	`686`	`+ * @brief Determines whether a dynamic quantized GEMM implementation is available on the current platform.`
	`687`	`+ *`
	`688`	`+ * MlasDynamicQGemm() and MlasDynamicQGemmBatch() should only be called if this function returns true.`
	`689`	`+ */`
	`690`	`+bool`
	`691`	`+MLASCALL`
	`692`	`+MlasIsDynamicQGemmAvailable();`
`683`	`693`
`684`	`694`	`//`
`685`	`695`	`// Symmetric QGEMM has limited buffer overrun.`