Commit 818c742

[ascend] feat: support moe in graph mode (#98)
1 parent 966455d commit 818c742

32 files changed (+929, -24 lines)

dlinfer/graph/dicp/vendor/AtbGraph/atb_op.py

Lines changed: 53 additions & 0 deletions
```diff
@@ -37,6 +37,14 @@ def infer_result(self, x, weight, bias):
         return out


+class AllReduce(Operator):
+    def __init__(self):
+        super().__init__("AllReduce")
+
+    def infer_result(self, x, reduce_type):
+        return torch.ops._c10d_functional.all_reduce.default(x, reduce_type, "0")
+
+
 class Add(Operator):
     def __init__(self):
         super().__init__("Add")
```
```diff
@@ -353,3 +361,48 @@ def __init__(self):

     def infer_result(self, x1, x2, axis):
         return torch.ops.aten.embedding.default(x1, x2, axis)
+
+
+class Softmax(Operator):
+    def __init__(self):
+        super().__init__("Softmax")
+
+    def infer_result(self, x, dim):
+        return torch.softmax(x, dim=dim)
+
+
+class Sort(Operator):
+    def __init__(self):
+        super().__init__("Sort")
+
+    def infer_result(self, x, topk):
+        value, index = torch.topk(x, topk)
+        return value, index
+
+
+class Slice(Operator):
+    def __init__(self):
+        super().__init__("Slice")
+
+    def infer_result(self, x, dim, offsets, size):
+        return torch.ops.aten.slice.Tensor(
+            x, dim, offsets[dim], offsets[dim] + size[dim], 1
+        )
+
+
+class AclNnSlice(Operator):
+    def __init__(self):
+        super().__init__("AclNnSlice")
+
+    def infer_result(self, x, dim, start, end, step):
+        return torch.ops.aten.slice.Tensor(x, dim, start, end, step)
+
+
+class IndexSelect(Operator):
+    def __init__(self):
+        super().__init__("IndexSelect")
+
+    def infer_result(self, x, dim, index):
+        indices = [None] * len(x.shape)
+        indices[dim] = index
+        return torch.ops.aten.index.Tensor(x, indices)
```
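These infer_result methods run at trace time to tell DICP each op's output shape and dtype. Below is a quick standalone sanity check, not part of the commit, of the shape arithmetic behind IndexSelect and Sort, using eager PyTorch on small tensors:

```python
import torch

x = torch.randn(4, 8)

# IndexSelect.infer_result places the index tensor at position `dim` in an
# aten.index call, so that axis's length becomes the number of indices.
index = torch.tensor([0, 2, 5])
out = torch.ops.aten.index.Tensor(x, [None, index])  # dim = 1
assert out.shape == (4, 3)

# Sort.infer_result maps onto torch.topk: a (values, indices) pair whose
# last dimension is k.
values, idx = torch.topk(x, 2)
assert values.shape == idx.shape == (4, 2)
```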

dlinfer/graph/dicp/vendor/AtbGraph/codegen/atb_infer_param.py

Lines changed: 53 additions & 1 deletion
```diff
@@ -435,7 +435,7 @@ class LinearParallelParam:
     rankSize: int = 0
     rankRoot: int = 0
     hasResidual: bool = False
-    backend: str = "hccl"
+    backend: str = "lccl"
     commMode: CommMode = CommMode.COMM_MULTI_PROCESS
     rankTableFile: str = ""
     parallelType: ParallelType = ParallelType.LINEAR_ALL_REDUCE
```
```diff
@@ -446,6 +446,58 @@ class LinearParallelParam:
     commDomain: str = ""


+class AllReducQuantType(IntEnum):
+    QUANT_TYPE_UNDEFINED = 0
+    QUANT_TYPE_PER_TENSOR = 1
+    QUANT_TYPE_PER_CHANNEL = 2
+    QUANT_TYPE_MAX = 3
+
+
+@dataclass
+class AllReduceParam:
+    rank: int = 0
+    rankSize: int = 0
+    rankRoot: int = 0
+    allReduceType: str = "sum"
+    backend: str = "lccl"
+    quantType: AllReducQuantType = AllReducQuantType.QUANT_TYPE_UNDEFINED
+    rankTableFile: str = ""
+    outDataType: AclDataType = AclDataType.ACL_DT_UNDEFINED
+    commMode: CommMode = CommMode.COMM_MULTI_PROCESS
+    commDomain: str = ""
+
+
+@dataclass
+class SortParam:
+    num: int = 0
+
+
+@dataclass
+class SoftmaxParam:
+    axes: list[int] = field(default_factory=list)
+
+
+@dataclass
+class SliceParam:
+    offsets: list[int] = field(default_factory=list)
+    size: list[int] = field(default_factory=list)
+
+
+@dataclass
+class AclNnSliceParam:
+    name: str = ""
+    dim: int = 0
+    start: int = 0
+    end: int = 0
+    step: int = 0
+
+
+@dataclass
+class IndexSelectParam:
+    name: str = ""
+    dim: int = 0
+
+
 def custom_asdict_factory(data):
     def convert_value(obj):
         if isinstance(obj, IntEnum):
```
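These dataclasses are serialized to JSON and handed to the C++ runtime. A standalone sketch of that round trip follows; the diff only shows the IntEnum branch of custom_asdict_factory, so the stand-in body here is an assumption:

```python
import json
from dataclasses import dataclass, asdict
from enum import IntEnum

class AllReducQuantType(IntEnum):
    QUANT_TYPE_UNDEFINED = 0

@dataclass
class AllReduceParam:  # trimmed copy of the commit's dataclass
    rank: int = 0
    rankSize: int = 0
    allReduceType: str = "sum"
    backend: str = "lccl"
    quantType: AllReducQuantType = AllReducQuantType.QUANT_TYPE_UNDEFINED

def custom_asdict_factory(data):
    # Stand-in for the real factory: flatten IntEnum members to plain ints
    # so json.dumps emits numbers the C++ runtime can parse.
    def convert_value(obj):
        return int(obj) if isinstance(obj, IntEnum) else obj
    return {k: convert_value(v) for k, v in data}

param = AllReduceParam(rank=0, rankSize=2)
print(json.dumps(asdict(param, dict_factory=custom_asdict_factory)))
# {"rank": 0, "rankSize": 2, "allReduceType": "sum", "backend": "lccl", "quantType": 0}
```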

dlinfer/graph/dicp/vendor/AtbGraph/codegen/atb_op.py

Lines changed: 73 additions & 1 deletion
```diff
@@ -54,6 +54,7 @@ def LinearAllReduce(name, x, weight, bias):
         param.rankRoot = 0
         param.hasResidual = False
         param.parallelType = infer_param.ParallelType.LINEAR_ALL_REDUCE
+        param.backend = "lccl"

         if bias:
             op.set_input([x, weight, bias])
@@ -63,7 +64,19 @@ def LinearAllReduce(name, x, weight, bias):
         op.set_output([name])
         return op

-    @staticmethod
+    def AllReduce(name, x, reduce_type):
+        op = Operation(name, "AllReduceOperation")
+        param = infer_param.AllReduceParam()
+        param.rank = dist.get_rank()
+        param.rankSize = dist.get_world_size()
+        param.rankRoot = 0
+        param.allReduceType = reduce_type
+        param.backend = "lccl"
+        op.set_input([x])
+        op.set_param(param)
+        op.set_output([name])
+        return op
+
     def Add(name, x, y):
         op = Operation(name, "ElewiseOperation")
         param = infer_param.ElewiseParam()
@@ -537,3 +550,62 @@ def Gather(name, x1, x2, axis):
         op.set_param(param)
         op.set_output([name])
         return op
+
+    def Softmax(name, x, dim):
+        op = Operation(name, "AclNnSoftmaxOperation")
+        param = infer_param.SoftmaxParam()
+        param.name = name
+        if not isinstance(dim, list):
+            dim = [dim]
+        param.axes = dim
+
+        op.set_input([x])
+        op.set_param(param)
+        op.set_output([name])
+        return op
+
+    def Sort(name, x, topk):
+        op = Operation(name, "AclNnTopkOperation")
+        param = infer_param.SortParam()
+        param.num = topk
+
+        op.set_input([x])
+        op.set_param(param)
+        op.set_output([f"{name}__0", f"{name}__1"])
+        return op
+
+    def Slice(name, x, dim, offsets, size):
+        op = Operation(name, "SliceOperation")
+        param = infer_param.SliceParam()
+        param.offsets = offsets
+        param.size = size
+
+        op.set_input([x])
+        op.set_param(param)
+        op.set_output([name])
+        return op
+
+    def AclNnSlice(name, x, dim, start, end, step):
+        op = Operation(name, "AclNnSliceOperation")
+        param = infer_param.AclNnSliceParam()
+        param.name = name
+        param.dim = dim
+        param.start = start
+        param.end = end
+        param.step = step
+
+        op.set_input([x])
+        op.set_param(param)
+        op.set_output([name])
+        return op
+
+    def IndexSelect(name, x, dim, index):
+        op = Operation(name, "AclNnIndexSelectOperation")
+        param = infer_param.IndexSelectParam()
+        param.name = name
+        param.dim = dim
+
+        op.set_input([x, index])
+        op.set_param(param)
+        op.set_output([name])
+        return op
```
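Each builder here assembles one ATB graph node: pick an operation type, fill the matching infer_param dataclass, and wire input/output tensor names. A minimal sketch with a stand-in Operation class (the real one lives in the codegen package) showing why Sort registers two outputs:

```python
class Operation:  # stand-in; the real class lives in the codegen runtime glue
    def __init__(self, name, op_type):
        self.op_name, self.op_type = name, op_type
        self.inputs, self.outputs, self.param = [], [], None

    def set_input(self, names): self.inputs = names
    def set_param(self, param): self.param = param
    def set_output(self, names): self.outputs = names

name = "sort_0"
op = Operation(name, "AclNnTopkOperation")
op.set_input(["router_logits"])  # hypothetical input tensor name
op.set_output([f"{name}__0", f"{name}__1"])  # topk values and indices
assert op.outputs == ["sort_0__0", "sort_0__1"]
```

ATB's TopK kernel yields two tensors, so the node registers two output names that downstream ops reference individually.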

dlinfer/graph/dicp/vendor/AtbGraph/codegen/runtime/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -32,6 +32,7 @@ target_include_directories(
     ${CMAKE_CURRENT_SOURCE_DIR}
     ${TORCH_NPU_INCLUDE_DIRS}
     ${CANN_INCLUDE_DIRS}
+    ${CANN_INCLUDE_DIRS}/aclnn
     ${ATB_INCLUDE_DIRS}
 )
```

dlinfer/graph/dicp/vendor/AtbGraph/codegen/runtime/model.cpp

Lines changed: 5 additions & 5 deletions
```diff
@@ -109,7 +109,6 @@ atb::Tensor Model::CreateInternalTensorFromDesc(const atb::TensorDesc& tensorDes
 Model::Model(const std::string& modelId, const std::string& modelPath) : modelId_(modelId), modelPath_(modelPath) {
     auto st = BuildGraph();
     DICP_LOG_IF(st != atb::NO_ERROR, ERROR) << modelId_ << " init graph:\n" << graph_.ToString();
-
     graph_.Init();
     DICP_LOG(INFO) << modelId_ << " init graph:\n" << graph_.ToString();
 }
@@ -249,7 +248,6 @@ void Model::BuildNodeVariantPack(int nodeId) {
         if (needReshape) {
             node.inTensorReshapeFuncs.at(i)(node.inTensors.at(i)->desc.shape, inTensorDescs.at(i).shape);
             node.variantPack.inTensors.at(i).desc.shape = inTensorDescs.at(i).shape;
-            node.inTensors.at(i)->desc.shape = inTensorDescs.at(i).shape;
         }
         DICP_LOG(INFO) << modelId_ << " nodes[" << nodeId << "] inTensors[" << i << "]:" << tensor_utils::TensorToString(node.variantPack.inTensors.at(i));
     }
@@ -265,7 +263,7 @@ void Model::BuildNodeVariantPack(int nodeId) {
     for (size_t i = 0; i < node.outTensors.size(); ++i) {
         if (hasInplaceOutputs && node.inplaceIndices.count(i) > 0) {
             auto inputIdx = node.inplaceIndices[i];
-            node.variantPack.outTensors.at(i) = *node.inTensors.at(inputIdx);
+            node.variantPack.outTensors.at(i) = node.variantPack.inTensors.at(inputIdx);
             *node.outTensors.at(i) = node.variantPack.outTensors.at(i);
             continue;
         }
@@ -494,7 +492,8 @@ void Model::SetupUnsqueezeReshape(const nlohmann::json& reshapeInput, atb::Reshape
     func = [=](const atb::Dims& oldShape, atb::Dims& newShape) {
         std::vector<int64_t> dimValues(oldShape.dims, oldShape.dims + oldShape.dimNum);
         for (const auto& d : dims) {
-            dimValues.insert(dimValues.begin() + d, 1);
+            int offset = d < 0 ? d + oldShape.dimNum + 1 : d;
+            dimValues.insert(dimValues.begin() + offset, 1);
         }
         newShape.dimNum = dimValues.size();
         std::copy(dimValues.begin(), dimValues.end(), newShape.dims);
@@ -506,7 +505,8 @@ void Model::SetupSqueezeReshape(const nlohmann::json& reshapeInput, atb::Reshape
     func = [=](const atb::Dims& oldShape, atb::Dims& newShape) {
         std::vector<int64_t> dimValues(oldShape.dims, oldShape.dims + oldShape.dimNum);
         for (const auto& d : dims) {
-            dimValues.erase(dimValues.begin() + d);
+            int offset = d < 0 ? d + oldShape.dimNum : d;
+            dimValues.erase(dimValues.begin() + offset);
         }
         newShape.dimNum = dimValues.size();
         std::copy(dimValues.begin(), dimValues.end(), newShape.dims);
```
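The reshape fixes above normalize negative dims before inserting or erasing: unsqueeze indexes into a list that ends up one slot longer than the old shape, hence the extra + 1. A worked check in plain Python mirroring the two lambdas:

```python
def unsqueeze(shape, d):
    out = list(shape)
    offset = d + len(shape) + 1 if d < 0 else d  # new list is one slot longer
    out.insert(offset, 1)
    return out

def squeeze(shape, d):
    out = list(shape)
    offset = d + len(shape) if d < 0 else d
    del out[offset]
    return out

assert unsqueeze([4, 8], -1) == [4, 8, 1]  # matches torch.unsqueeze(x, -1)
assert squeeze([4, 1, 8], -2) == [4, 8]    # matches torch.squeeze(x, -2)
```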

dlinfer/graph/dicp/vendor/AtbGraph/codegen/runtime/ops/aclnn_ops/gt_scalar_operation.cpp

Lines changed: 4 additions & 2 deletions
```diff
@@ -22,9 +22,11 @@ AclNnGtScalarOperation::~AclNnGtScalarOperation() {
 atb::Status AclNnGtScalarOperation::InferShape(const atb::SVector<atb::TensorDesc>& inTensorDescs, atb::SVector<atb::TensorDesc>& outTensorDescs) const {
     DICP_LOG(INFO) << opName_ << " infer shape start";
     outTensorDescs.at(0).format = inTensorDescs.at(0).format;
-    outTensorDescs.at(0).shape.dimNum = NUM1;
-    outTensorDescs.at(0).shape.dims[0] = 1;
+    outTensorDescs.at(0).shape.dimNum = inTensorDescs.at(0).shape.dimNum;
     outTensorDescs.at(0).dtype = aclDataType::ACL_BOOL;
+    for (size_t i = 0; i < outTensorDescs.at(0).shape.dimNum; ++i) {
+        outTensorDescs.at(0).shape.dims[i] = inTensorDescs.at(0).shape.dims[i];
+    }
     DICP_LOG(INFO) << opName_ << " infer shape end";
     return 0;
 }
```
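The corrected InferShape makes GtScalar elementwise: the output keeps the input's full shape and only the dtype changes to bool, matching eager PyTorch:

```python
import torch

x = torch.randn(2, 3, 4)
out = torch.gt(x, 0.5)  # elementwise greater-than against a scalar
assert out.shape == x.shape and out.dtype == torch.bool
```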
dlinfer/graph/dicp/vendor/AtbGraph/codegen/runtime/ops/aclnn_ops/index_select_operation.cpp (new file)

Lines changed: 51 additions & 0 deletions
```cpp
#include "index_select_operation.h"

#include "aclnnop/aclnn_index_select.h"
#include "utils/log.h"

namespace dicp {

const int NUM1 = 1;
const int NUM2 = 2;

AclNnIndexSelectOperation::AclNnIndexSelectOperation(const std::string& name, int64_t dim) : AclNnOperation(name), dim_(dim) {}

AclNnIndexSelectOperation::~AclNnIndexSelectOperation() {}

atb::Status AclNnIndexSelectOperation::InferShape(const atb::SVector<atb::TensorDesc>& inTensorDescs, atb::SVector<atb::TensorDesc>& outTensorDescs) const {
    DICP_LOG(INFO) << opName_ << " infer shape start";
    outTensorDescs.at(0).format = inTensorDescs.at(0).format;
    outTensorDescs.at(0).shape.dimNum = inTensorDescs.at(0).shape.dimNum;
    outTensorDescs.at(0).dtype = inTensorDescs.at(0).dtype;

    for (size_t i = 0; i < outTensorDescs.at(0).shape.dimNum; ++i) {
        outTensorDescs.at(0).shape.dims[i] = inTensorDescs.at(0).shape.dims[i];
    }
    outTensorDescs.at(0).shape.dims[dim_] = inTensorDescs.at(1).shape.dims[0];
    DICP_LOG(INFO) << opName_ << " infer shape end";
    return 0;
}

uint32_t AclNnIndexSelectOperation::GetInputNum() const { return NUM2; }

uint32_t AclNnIndexSelectOperation::GetOutputNum() const { return NUM1; }

int AclNnIndexSelectOperation::SetAclNnWorkspaceExecutor(uint64_t& workspaceSize) {
    DICP_LOG(INFO) << opName_ << " AclNnIndexSelectGetWorkspaceSize start";

    int ret = aclnnIndexSelectGetWorkspaceSize(
        aclInTensors_.at(0).tensor, dim_, aclInTensors_.at(1).tensor, aclOutTensors_.at(0).tensor, &workspaceSize, &aclExecutor_);
    DICP_LOG(INFO) << opName_ << " AclNnIndexSelectGetWorkspaceSize end, ret:" << ret << ", workspaceSize:" << workspaceSize
                   << ", aclExecutor:" << aclExecutor_;

    return ret;
}

int AclNnIndexSelectOperation::CallAclExecute(uint8_t* workspace, uint64_t workspaceSize, aclOpExecutor* aclExecutor, aclrtStream stream) {
    DICP_LOG(INFO) << opName_ << " AclNnIndexSelect start";
    int ret = aclnnIndexSelect(workspace, workspaceSize, aclExecutor, stream);
    DICP_LOG(INFO) << opName_ << " AclNnIndexSelect end, ret:" << ret;
    return ret;
}

}  // namespace dicp
```
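A quick PyTorch cross-check of the InferShape logic above: index_select preserves every input dimension except dim, which takes the length of the 1-D index tensor.

```python
import torch

x = torch.randn(4, 8, 16)
index = torch.tensor([1, 3, 5, 7, 2])
out = torch.index_select(x, 1, index)
assert out.shape == (4, 5, 16)  # dim 1 becomes len(index); others unchanged
```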
dlinfer/graph/dicp/vendor/AtbGraph/codegen/runtime/ops/aclnn_ops/index_select_operation.h (new file)

Lines changed: 36 additions & 0 deletions
```cpp
#pragma once

#include "acl_nn_operation.h"

namespace dicp {

class AclNnIndexSelectOperation : public AclNnOperation {
public:
    explicit AclNnIndexSelectOperation(const std::string& name, int64_t dim);
    ~AclNnIndexSelectOperation() override;
    atb::Status InferShape(const atb::SVector<atb::TensorDesc>& inTensorDescs, atb::SVector<atb::TensorDesc>& outTensorDescs) const override;
    uint32_t GetInputNum() const override;
    uint32_t GetOutputNum() const override;

private:
    int64_t dim_;
    int SetAclNnWorkspaceExecutor(uint64_t& workspaceSize) override;
    int CallAclExecute(uint8_t* workspace, uint64_t workspaceSize, aclOpExecutor* aclExecutor, aclrtStream stream) override;
};

inline atb::Operation* AclNnIndexSelectOperationCreate(const nlohmann::json& paramJson) {
    std::string opName;
    int64_t dim = 0;
    if (paramJson.contains("name")) {
        opName = paramJson["name"].get<std::string>();
    }
    if (paramJson.contains("dim")) {
        dim = paramJson["dim"].get<int64_t>();
    }
    DICP_LOG(INFO) << "AclNnIndexSelectOperation: name: " << opName << " dim:" << dim;
    atb::Operation* op = new AclNnIndexSelectOperation(opName, dim);
    return op;
}

}  // namespace dicp
```
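For reference, a sketch of the per-operation param JSON this creator consumes; the field names come from IndexSelectParam in this commit, while the surrounding node-level layout is an assumption.

```python
import json

# Hypothetical node payload; field names follow IndexSelectParam above.
param_json = json.loads('{"name": "index_select_0", "dim": 1}')
name = param_json.get("name", "")
dim = param_json.get("dim", 0)
assert (name, dim) == ("index_select_0", 1)
```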
