Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .github/workflows/vllm_ascend_test_nightly_a3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,21 @@ jobs:
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
tests: ${{ matrix.test_config.tests }}
name: ${{ matrix.test_config.name }}
custom-ops-tests:
name: test ops
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
needs: multi-node-tests
strategy:
fail-fast: false
matrix:
test_config:
- name: custom-op-dispatch_gmm_combine_decode
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file is for model tests only; there is no need to add the ops test here. @Potabk

Copy link
Collaborator

@Potabk Potabk Dec 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can add a new job for the ops at the end of https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_test_nightly_a3.yaml — that's enough for your PR; I'll do a more detailed restructuring later.

  custom-ops-tests:
    name: test ops
    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
    needs: multi-node-tests
    strategy:
      fail-fast: false
      matrix:
        test_config:
          - name: custom-op-dispatch_gmm_combine_decode
            os: linux-aarch64-a3-16
            tests: tests/e2e/nightly/multicard_ops/test_dispatch_gmm_combine_decode.py
    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
    with:
      runner: ${{ matrix.test_config.os }}
      vllm: 0.12.0
      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
      tests: ${{ matrix.test_config.tests }}
      name: ${{ matrix.test_config.name }}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

os: linux-aarch64-a3-16
tests: tests/e2e/nightly/multicard_ops/test_dispatch_gmm_combine_decode.py
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with:
runner: ${{ matrix.test_config.os }}
vllm: v0.12.0
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
tests: ${{ matrix.test_config.tests }}
name: ${{ matrix.test_config.name }}
59 changes: 32 additions & 27 deletions csrc/build_aclnn.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
ROOT_DIR=$1
SOC_VERSION=$2

git config --global --add safe.directory "$ROOT_DIR"

if [[ "$SOC_VERSION" =~ ^ascend310 ]]; then
# ASCEND310P series
# currently, no custom aclnn ops for ASCEND310 series
Expand All @@ -13,41 +11,48 @@ if [[ "$SOC_VERSION" =~ ^ascend310 ]]; then
exit 0
elif [[ "$SOC_VERSION" =~ ^ascend910b ]]; then
# ASCEND910B (A2) series
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;dispatch_ffn_combine"
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention"
SOC_ARG="ascend910b"
elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
# ASCEND910C (A3) series
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;dispatch_ffn_combine"
# dependency: catlass
git config --global --add safe.directory "$ROOT_DIR"
CATLASS_PATH=${ROOT_DIR}/csrc/third_party/catlass/include
if [[ ! -d "${CATLASS_PATH}" ]]; then
echo "depdendency catlass is missing, try to fetch it..."
if ! git submodule update --init --recursive; then
echo "fetch failed"
exit 1
fi
fi
# dependency: cann-toolkit file moe_distribute_base.h
HCCL_STRUCT_FILE_PATH=$(find -L "${ASCEND_TOOLKIT_HOME}" -name "moe_distribute_base.h" 2>/dev/null | head -n1)
if [ -z "$HCCL_STRUCT_FILE_PATH" ]; then
echo "cannot find moe_distribute_base.h file in CANN env"
exit 1
fi
# for dispatch_gmm_combine_decode
yes | cp "${HCCL_STRUCT_FILE_PATH}" "${ROOT_DIR}/csrc/dispatch_gmm_combine_decode/op_kernel"
# for dispatch_ffn_combine
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
TARGET_DIR="$SCRIPT_DIR/dispatch_ffn_combine/op_kernel/utils/"
TARGET_FILE="$TARGET_DIR/$(basename "$HCCL_STRUCT_FILE_PATH")"

echo "*************************************"
echo $HCCL_STRUCT_FILE_PATH
echo "$TARGET_DIR"
cp "$HCCL_STRUCT_FILE_PATH" "$TARGET_DIR"

sed -i 's/struct HcclOpResParam {/struct HcclOpResParamCustom {/g' "$TARGET_FILE"
sed -i 's/struct HcclRankRelationResV2 {/struct HcclRankRelationResV2Custom {/g' "$TARGET_FILE"
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;dispatch_ffn_combine;dispatch_gmm_combine_decode;"
SOC_ARG="ascend910_93"
else
# others
# currently, no custom aclnn ops for other series
exit 0
fi

git submodule init
git submodule update


# For the compatibility of CANN8.5 and CANN8.3: copy and modify moe_distribute_base.h
file_path=$(find /usr/local/Ascend/ascend-toolkit -name "moe_distribute_base.h" 2>/dev/null | head -n1)
if [ -z "$file_path" ]; then
echo "cannot find moe_distribute_base.h file in CANN env"
exit 1
fi

SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
TARGET_DIR="$SCRIPT_DIR/dispatch_ffn_combine/op_kernel/utils/"
TARGET_FILE="$TARGET_DIR/$(basename "$file_path")"

echo "*************************************"
echo $file_path
echo "$TARGET_DIR"
cp "$file_path" "$TARGET_DIR"

sed -i 's/struct HcclOpResParam {/struct HcclOpResParamCustom {/g' "$TARGET_FILE"
sed -i 's/struct HcclRankRelationResV2 {/struct HcclRankRelationResV2Custom {/g' "$TARGET_FILE"


# build custom ops
cd csrc
Expand Down
59 changes: 59 additions & 0 deletions csrc/dispatch_gmm_combine_decode/op_host/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd.
# This file is a part of the CANN Open Software.
# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# ======================================================================================================================

# Host-side build rules for the DispatchGmmCombineDecode custom aclnn op.
# Registers the op's definition, aclnn entry points, tiling, and proto sources
# with the shared targets provided by the enclosing ops build system.

# The op depends on catlass headers vendored as a git submodule; fail early
# with an actionable message if the submodule has not been fetched.
set(_DISPATCH_GMM_INC_OPTS)
if (EXISTS ${CMAKE_SOURCE_DIR}/third_party/catlass/include)
    list(APPEND _DISPATCH_GMM_INC_OPTS -I${CMAKE_SOURCE_DIR}/third_party/catlass/include)
else()
    message(FATAL_ERROR "dependency catlass is missing, you can fetch it by running 'git submodule update --init --recursive'")
endif()

# Per-op compile options for the device kernel compiler.
# --cce-auto-sync=off: disable automatic pipe synchronization (kernel manages
# synchronization itself — TODO confirm against the kernel sources).
add_ops_compile_options(
    OP_NAME DispatchGmmCombineDecode
    OPTIONS --cce-auto-sync=off
            -Wno-deprecated-declarations
            -Werror
            ${_DISPATCH_GMM_INC_OPTS}
)

# Op definition (shape/dtype registration) for the inner aclnn layer.
target_sources(op_host_aclnnInner PRIVATE
    dispatch_gmm_combine_decode_def.cpp
)

# Public aclnn wrapper entry points.
target_sources(opapi PRIVATE
    aclnn_dispatch_gmm_combine_decode.cpp
)

# In the closed-source build the wrapper is also linked into the train/infer
# aclnn libraries; the open project only builds the opapi target.
if (NOT BUILD_OPEN_PROJECT)
    target_sources(aclnn_ops_train PRIVATE
        aclnn_dispatch_gmm_combine_decode.cpp
    )

    target_sources(aclnn_ops_infer PRIVATE
        aclnn_dispatch_gmm_combine_decode.cpp
    )
endif ()

# Tiling implementation consumed by the op framework at launch time.
target_sources(optiling PRIVATE
    dispatch_gmm_combine_decode_tiling.cpp
)

target_include_directories(optiling PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}
)

# Graph-mode prototype (infer shape/dtype) registration.
target_sources(opsproto PRIVATE
    dispatch_gmm_combine_decode_proto.cpp
)

# Install the public aclnn header so downstream users can call the op.
file(GLOB _GMM_Aclnn_header "${CMAKE_CURRENT_SOURCE_DIR}/aclnn_dispatch_gmm_combine_decode.h")

install(FILES ${_GMM_Aclnn_header}
    DESTINATION ${ACLNN_INC_INSTALL_DIR} OPTIONAL
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
// Public aclnn wrapper for the DispatchGmmCombineDecode custom op.
// Forwards the two-phase aclnn entry points (GetWorkspaceSize / launch) to the
// generated aclnnInner* implementation, and selects the HCCL server transport
// based on the SoC the process is running on.
#include <string.h>
#include "graph/types.h"
#include "aclnn/opdev/platform.h"
#include "aclnn_dispatch_gmm_combine_decode.h"

// Transport used by nnopbase to drive HCCL communication for this op.
enum NnopbaseHcclServerType {
    NNOPBASE_HCCL_SERVER_TYPE_AICPU = 0,  // communication serviced by AICPU
    NNOPBASE_HCCL_SERVER_TYPE_MTE,        // communication serviced by MTE
    NNOPBASE_HCCL_SERVER_TYPE_END
};
// Declared weak so this wrapper still links against runtimes that do not
// export the symbol; availability is checked at call time before use.
extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType);

#ifdef __cplusplus
extern "C" {
#endif

// Inner (generated) entry points this wrapper delegates to.
extern aclnnStatus aclnnInnerDispatchGmmCombineDecodeGetWorkspaceSize(
    const aclTensor *x,
    const aclTensor *expertIds,
    const aclTensor *gmm1PermutedWeight,
    const aclTensor *gmm1PermutedWeightScale,
    const aclTensor *gmm2Weight,
    const aclTensor *gmm2WeightScale,
    const aclTensor *expertSmoothScalesOptional,
    const aclTensor *expertScalesOptional,
    char *groupEp,
    int64_t epRankSize,
    int64_t epRankId,
    int64_t moeExpertNum,
    int64_t shareExpertNum,
    int64_t shareExpertRankNum,
    int64_t quantMode,
    int64_t globalBs,
    const aclTensor *output,
    const aclTensor *epRecvCount,
    uint64_t *workspaceSize,
    aclOpExecutor **executor);
extern aclnnStatus aclnnInnerDispatchGmmCombineDecode(
    void *workspace,
    uint64_t workspaceSize,
    aclOpExecutor *executor,
    aclrtStream stream);

// First phase of the two-phase aclnn call: computes the required workspace
// size and prepares the executor. Thin pass-through to the inner op; all
// argument validation happens in the generated inner layer.
aclnnStatus aclnnDispatchGmmCombineDecodeGetWorkspaceSize(
    const aclTensor *x,
    const aclTensor *expertIds,
    const aclTensor *gmm1PermutedWeight,
    const aclTensor *gmm1PermutedWeightScale,
    const aclTensor *gmm2Weight,
    const aclTensor *gmm2WeightScale,
    const aclTensor *expertSmoothScalesOptional,
    const aclTensor *expertScalesOptional,
    char *groupEp,
    int64_t epRankSize,
    int64_t epRankId,
    int64_t moeExpertNum,
    int64_t shareExpertNum,
    int64_t shareExpertRankNum,
    int64_t quantMode,
    int64_t globalBs,
    const aclTensor *output,
    const aclTensor *epRecvCount,
    uint64_t *workspaceSize,
    aclOpExecutor **executor)
{
    return aclnnInnerDispatchGmmCombineDecodeGetWorkspaceSize(x, expertIds, gmm1PermutedWeight, gmm1PermutedWeightScale,
        gmm2Weight, gmm2WeightScale, expertSmoothScalesOptional, expertScalesOptional, groupEp, epRankSize,
        epRankId, moeExpertNum, shareExpertNum, shareExpertRankNum, quantMode, globalBs,
        output, epRecvCount, workspaceSize, executor);
}

// Second phase: launches the op on the given stream with the workspace sized
// by the first phase. Before launching, picks the HCCL server transport:
// AICPU on ASCEND910B (A2), MTE on everything else — skipped entirely when
// the weak NnopbaseSetHcclServerType symbol is absent from the runtime.
aclnnStatus aclnnDispatchGmmCombineDecode(
    void *workspace,
    uint64_t workspaceSize,
    aclOpExecutor *executor,
    aclrtStream stream)
{
    if (NnopbaseSetHcclServerType) {  // weak symbol: may be null at link time
        if (op::GetCurrentPlatformInfo().GetSocVersion() == op::SocVersion::ASCEND910B) {
            NnopbaseSetHcclServerType(executor, NNOPBASE_HCCL_SERVER_TYPE_AICPU);
        } else {
            NnopbaseSetHcclServerType(executor, NNOPBASE_HCCL_SERVER_TYPE_MTE);
        }
    }
    return aclnnInnerDispatchGmmCombineDecode(workspace, workspaceSize, executor, stream);
}

#ifdef __cplusplus
}
#endif


Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
#ifndef DISPATCH_GMM_COMBINE_DECODE
#define DISPATCH_GMM_COMBINE_DECODE

#include "aclnn/acl_meta.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief First phase of the two-phase aclnn call for DispatchGmmCombineDecode:
 *        computes the workspace size and builds the op executor.
 *
 * @param x input activations to dispatch (shape/dtype per op definition — see op docs).
 * @param expertIds per-token expert ids selected for routing.
 * @param gmm1PermutedWeight / gmm1PermutedWeightScale first grouped-matmul weights and quant scales.
 * @param gmm2Weight / gmm2WeightScale second grouped-matmul weights and quant scales.
 * @param expertSmoothScalesOptional optional smoothing scales (may be nullptr).
 * @param expertScalesOptional optional per-expert scales (may be nullptr).
 * @param groupEp name of the expert-parallel HCCL communication group.
 * @param epRankSize / epRankId size of the EP group and this rank's id within it.
 * @param moeExpertNum / shareExpertNum / shareExpertRankNum MoE expert layout parameters.
 * @param quantMode quantization mode selector (values defined by the op).
 * @param globalBs global batch size across the EP group.
 * @param output combined output tensor.
 * @param epRecvCount per-rank receive-count tensor produced by dispatch.
 * @param[out] workspaceSize required device workspace size in bytes.
 * @param[out] executor op executor to pass to aclnnDispatchGmmCombineDecode.
 * @return ACLNN_SUCCESS on success, an aclnn error code otherwise.
 */
__attribute__((visibility("default"))) aclnnStatus aclnnDispatchGmmCombineDecodeGetWorkspaceSize(
    const aclTensor *x,
    const aclTensor *expertIds,
    const aclTensor *gmm1PermutedWeight,
    const aclTensor *gmm1PermutedWeightScale,
    const aclTensor *gmm2Weight,
    const aclTensor *gmm2WeightScale,
    const aclTensor *expertSmoothScalesOptional,
    const aclTensor *expertScalesOptional,
    char *groupEp,
    int64_t epRankSize,
    int64_t epRankId,
    int64_t moeExpertNum,
    int64_t shareExpertNum,
    int64_t shareExpertRankNum,
    int64_t quantMode,
    int64_t globalBs,
    const aclTensor *output,
    const aclTensor *epRecvCount,
    uint64_t *workspaceSize,
    aclOpExecutor **executor);

/**
 * @brief Second phase: launches the op on the given stream.
 *
 * @param workspace device workspace of at least the size reported by the first phase.
 * @param workspaceSize size of @p workspace in bytes.
 * @param executor executor produced by aclnnDispatchGmmCombineDecodeGetWorkspaceSize.
 * @param stream ACL runtime stream to enqueue the launch on.
 * @return ACLNN_SUCCESS on success, an aclnn error code otherwise.
 */
__attribute__((visibility("default"))) aclnnStatus aclnnDispatchGmmCombineDecode(
    void *workspace,
    uint64_t workspaceSize,
    aclOpExecutor *executor,
    aclrtStream stream);

#ifdef __cplusplus
}
#endif

#endif
Loading
Loading