CANN: Support Opt CONV_TRANSPOSE_1D and ELU (#12786)

* [CANN] Support ELU and CONV_TRANSPOSE_1D * [CANN]Modification review comments * [CANN]Modification review comments * [CANN]name adjustment * [CANN]remove lambda used in template * [CANN]Use std::func instead of template * [CANN]Modify the code according to the review comments --------- Signed-off-by: noemotiovon <noemotiovon@gmail.com>
2025-04-14 10:36:07 +00:00 · 2025-04-09 14:04:14 +08:00 · 2025-04-09 14:04:14 +08:00 · 6e1c4cebdb
commit 6e1c4cebdb
parent 0090950f67
5 changed files with 204 additions and 78 deletions
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@ -1,4 +1,4 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10

 FROM ascendai/cann:$ASCEND_VERSION AS build

@ -6,7 +6,7 @@ WORKDIR /app

 COPY . .

-RUN yum install -y gcc g++ cmake make
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -1771,7 +1771,7 @@ jobs:
    strategy:
      matrix:
        cann:
-          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+          - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
        device:
          - 'ascend910b3'
        build:
@ -1784,7 +1784,7 @@ jobs:
      - name: Dependencies
        run: |
          yum update -y
-          yum install -y git gcc gcc-c++ make cmake
+          yum install -y git gcc gcc-c++ make cmake libcurl-devel

      - name: Build
        run: |
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@ -57,6 +57,8 @@
 #include <aclnnop/aclnn_sub.h>
 #include <aclnnop/aclnn_mul.h>
 #include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_convolution.h>
+#include <aclnnop/aclnn_elu.h>
 #include <float.h>

 #include <cmath>
@ -86,6 +88,20 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
    }
 }

+/**
+ * Runs `unary_op` with `dst->src[0]` as the source and `dst` as the
+ * destination. ACL tensor handles for both are created before the call and
+ * destroyed once it returns.
+ */
+void ggml_cann_unary_op(
+    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    // Source handle first, then destination; they are released in that order too.
+    aclTensor* acl_in  = ggml_cann_create_tensor(dst->src[0]);
+    aclTensor* acl_out = ggml_cann_create_tensor(dst);
+
+    unary_op(ctx, acl_in, acl_out);
+
+    ACL_CHECK(aclDestroyTensor(acl_in));
+    ACL_CHECK(aclDestroyTensor(acl_out));
+}
+
 /**
 * @brief Repeats elements of a tensor along each dimension according to the
 * specified repeat array.
@ -2585,3 +2601,49 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
 }
+
+/**
+ * Computes GGML_OP_CONV_TRANSPOSE_1D through aclnnConvolution with
+ * `transposed = true`.
+ *
+ * `dst->src[1]` is passed as the convolution input and `dst->src[0]` as the
+ * weight; no bias is used. The stride is read from `dst->op_params[0]`;
+ * padding is fixed to 0, dilation to 1 and groups to 1.
+ *
+ * Every ACL handle created here (three tensors and three int arrays) is
+ * released before returning.
+ */
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];
+
+    // stride
+    int64_t s0 = ((const int32_t*)(dst->op_params))[0];
+
+    // All three tensors are exposed to ACL as 3-D tensors in NCL format.
+    aclTensor* acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
+    aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
+
+    int64_t strideVal[1];
+    strideVal[0] = s0;
+    aclIntArray *stride = aclCreateIntArray(strideVal, 1);
+    int64_t paddingVal[] = {0};
+    aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
+    int64_t dilationVal[] = {1};
+    aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
+    bool transposed = true;
+    int64_t groups = 1;
+    int8_t cubeMathType = 0;
+
+    // `padding` is passed twice: once as the padding and once as the output
+    // padding argument, both of which are {0} here.
+    GGML_CANN_CALL_ACLNN_OP(Convolution, acl_input, acl_weight, nullptr, stride,
+        padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
+
+    // Release everything created above; previously acl_input and the three
+    // int arrays were leaked on every call.
+    ACL_CHECK(aclDestroyIntArray(stride));
+    ACL_CHECK(aclDestroyIntArray(padding));
+    ACL_CHECK(aclDestroyIntArray(dilation));
+    ACL_CHECK(aclDestroyTensor(acl_input));
+    ACL_CHECK(aclDestroyTensor(acl_weight));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+/**
+ * Computes GGML_UNARY_OP_ELU through aclnnElu, reading `dst->src[0]` and
+ * writing the result to `dst`.
+ *
+ * A single scalar with value 1.0 is created and passed for all three scalar
+ * arguments of the Elu operator (presumably alpha, scale and input scale, in
+ * that order — confirm against the aclnnElu reference).
+ *
+ * The scalar and both ACL tensor handles are released before returning.
+ */
+void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+
+    aclTensor* acl_input = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    float alphaValue = 1.0f;
+    aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+    GGML_CANN_CALL_ACLNN_OP(Elu, acl_input, alpha, alpha, alpha,
+        acl_dst);
+
+    // The scalar was previously never destroyed and leaked on every call.
+    ACL_CHECK(aclDestroyScalar(alpha));
+    ACL_CHECK(aclDestroyTensor(acl_input));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@ -1,15 +1,4 @@
-#ifndef CANN_ACLNN_OPS
-#define CANN_ACLNN_OPS
-
 /**
- * @file    acl_tensor
- * @brief   This file contains related functions of ggml_tensor and acl_tensor.
- *          Contains conversion from ggml_tensor to acl_tensor, broadcast and other
- *          functions.
- * @author  hipudding <huafengchun@gmail.com>
- * @author  wangshuai09 <391746016@qq.com>
- * @date    July 15, 2024
- *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@ -31,6 +20,9 @@
 * IN THE SOFTWARE.
 */

+#ifndef CANN_ACLNN_OPS
+#define CANN_ACLNN_OPS
+
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_neg.h>
 #include <aclnnop/aclnn_exp.h>
@ -483,8 +475,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *          operation is executed using the CANN backend for optimized performance.
 *
 * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the indices of the maximum values will be stored.
- *            dst->op is `GGML_OP_ARGMAX`.
+ * @param dst The destination tensor where the indices of the maximum values will
+ *            be stored. dst->op is `GGML_OP_ARGMAX`.
 */
 void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);

@ -599,6 +591,99 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
    aclTensor* acl_dst);

+/**
+ * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
+ * output tensor.
+ *
+ * This function checks whether broadcasting is needed between `src0` and `src1`.
+ * If broadcasting is required, it calculates the proper shapes and creates
+ * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
+ * based on the original tensor shapes.
+ *
+ * @param src0     The first input tensor (reference shape).
+ * @param src1     The second input tensor (possibly broadcasted).
+ * @param dst      The destination/output tensor.
+ * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
+ * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
+ * @param acl_dst  Output pointer to the created ACL tensor corresponding to dst.
+ */
+void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
+    aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
+
+/**
+ * @brief   Computes the 1D transposed convolution (deconvolution) of a ggml
+ * tensor using the CANN backend.
+ *
+ * @details This function performs a 1D transposed convolution (also known as
+ * deconvolution) operation on the input tensor. The computed result is stored
+ * in the destination tensor `dst`. The operation is optimized using the CANN
+ * backend for improved performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the transposed convolution result
+ * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
+ */
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
+ * using the CANN backend.
+ *
+ * @details This function performs an element-wise ELU activation on the input
+ *          tensor.
+ *          The result is written to the destination tensor `dst`.
+ *          The ELU function is defined as:
+ *
+ *          \text{ELU}(x) =
+ *          \begin{cases}
+ *          x, & \text{if } x > 0 \\
+ *          \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
+ *          \end{cases}
+ *
+ *          where α (alpha) is a hyperparameter, typically set to 1.0.
+ *          This operation is optimized using the CANN backend for high-performance
+ *          inference or training.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the ELU-activated result will be stored.
+ *            dst->op is expected to be `GGML_OP_ELU`.
+ */
+void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Applies an element-wise binary operation to the two sources of a
+ * tensor using the CANN backend.
+ *
+ * The operands are `dst->src[0]` and `dst->src[1]`. ACL tensors for both
+ * operands and for `dst` are obtained through bcast_shape(), which handles
+ * broadcasting as needed; they are handed to `binary_op` and destroyed
+ * afterwards.
+ *
+ * @tparam binary_op A callable object representing the binary operation to be
+ *         performed. It is invoked with four arguments:
+ *         (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*),
+ *         i.e. the context, the two source tensors and the destination tensor.
+ *
+ * @param ctx The CANN backend context used to manage execution and resources.
+ * @param dst The destination tensor.
+ */
+template <auto binary_op>
+void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    aclTensor* acl_lhs;
+    aclTensor* acl_rhs;
+    aclTensor* acl_out;
+
+    // bcast_shape fills in all three handles.
+    bcast_shape(dst->src[0], dst->src[1], dst, &acl_lhs, &acl_rhs, &acl_out);
+    binary_op(ctx, acl_lhs, acl_rhs, acl_out);
+
+    ACL_CHECK(aclDestroyTensor(acl_lhs));
+    ACL_CHECK(aclDestroyTensor(acl_rhs));
+    ACL_CHECK(aclDestroyTensor(acl_out));
+}
+
 /**
 * @brief Launches an asynchronous task using the memory allocator.
 *
@ -631,56 +716,6 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream()));     \
    } while (0)

-
-/**
- * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one output tensor.
- *
- * This function checks whether broadcasting is needed between `src0` and `src1`.
- * If broadcasting is required, it calculates the proper shapes and creates
- * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
- * based on the original tensor shapes.
- *
- * @param src0     The first input tensor (reference shape).
- * @param src1     The second input tensor (possibly broadcasted).
- * @param dst      The destination/output tensor.
- * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
- * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
- * @param acl_dst  Output pointer to the created ACL tensor corresponding to dst.
- */
-void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
-                        aclTensor ** acl_src1, aclTensor ** acl_dst);
-
-/**
- * @brief Applies a element-wise operation to two input tensors using the CANN backend.
- *
- * This templated function takes a binary operator and applies it to two source tensors
- * associated with the destination tensor. The function handles broadcasting as needed.
- *
- * @tparam binary_op A callable object (e.g., lambda or function pointer) representing
- *         the binary operation to be performed. It must take three arguments:
- *         (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*).
- *
- * @param ctx The CANN backend context used to manage execution and resources.
- * @param dst The destination tensor.
- */
-template <auto binary_op>
-void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
-
-    aclTensor* acl_src0;
-    aclTensor* acl_src1;
-    aclTensor* acl_dst;
-
-    // Need bcast
-    bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
-    binary_op(ctx, acl_src0, acl_src1, acl_dst);
-
-    ACL_CHECK(aclDestroyTensor(acl_src0));
-    ACL_CHECK(aclDestroyTensor(acl_src1));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
-}
-
 /**
 * @brief Applies a unary operation to an input tensor using the CANN backend.
 *
@ -690,7 +725,6 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 * @tparam unary_op A callable with the signature:
 *         void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
 *         where the first aclTensor is the source and the second is the destination.
- *
 * @param ctx The CANN backend context for managing resources and execution.
 * @param dst The destination tensor. Its src[0] is treated as the input tensor.
 */
@ -702,10 +736,30 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

    unary_op(ctx, acl_src, acl_dst);
+
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
 }

+/**
+ * @brief   Applies a unary operation to a ggml tensor using the CANN backend.
+ *
+ * @details This function performs a unary operation on the input tensor using
+ * a user-provided lambda or callable object `unary_op`, which accepts the CANN
+ * context and two ACL tensors (source and destination). Internally, this function
+ * creates ACL representations of the ggml tensors and invokes the unary operation.
+ * The result is stored in the destination tensor `dst`. This utility abstracts the
+ * common boilerplate of tensor conversion and cleanup when implementing unary ops.
+ *
+ * @param unary_op A callable that performs the unary operation using CANN APIs.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored.
+ *            The source tensor is retrieved from `dst->src[0]`.
+ */
+void ggml_cann_unary_op(
+    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
 * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
 *
@ -725,11 +779,12 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
 */
 #define GGML_CANN_CALL_UNARY_OP(OP_NAME)                         \
    do {                                                         \
-        auto lambda = [](auto ctx, auto acl_src, auto acl_dst) { \
+        auto lambda = [](ggml_backend_cann_context& ctx,         \
+            aclTensor* acl_src,                                  \
+            aclTensor* acl_dst) {                                \
            GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);  \
        };                                                       \
-        ggml_cann_unary_op<lambda>(ctx, dst);                    \
+        ggml_cann_unary_op(lambda, ctx, dst);                    \
    }                                                            \
    while (0)
-
 #endif  // CANN_ACLNN_OPS
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@ -1330,12 +1330,13 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                    GGML_CANN_CALL_UNARY_OP(Silu);
                    break;
                case GGML_UNARY_OP_GELU_QUICK: {
-                        auto lambda = [](auto ctx, auto acl_src, auto acl_dst) {
-                            GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
-                        };
-                        ggml_cann_unary_op<lambda>(ctx, dst);
-                    }
-                    break;
+                    auto lambda = [](ggml_backend_cann_context& ctx,
+                        aclTensor* acl_src,
+                        aclTensor* acl_dst) {
+                        GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
+                    };
+                    ggml_cann_unary_op(lambda, ctx, dst);
+                } break;
                case GGML_UNARY_OP_TANH:
                    GGML_CANN_CALL_UNARY_OP(Tanh);
                    break;
@ -1354,6 +1355,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                case GGML_UNARY_OP_EXP:
                    GGML_CANN_CALL_UNARY_OP(Exp);
                    break;
+                case GGML_UNARY_OP_ELU:
+                    ggml_cann_elu(ctx, dst);
+                    break;
                default:
                    return false;
            }
@ -1448,7 +1452,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
            break;
        case GGML_OP_SIN:
            ggml_cann_unary_op<aclnn_sin>(ctx, dst);
-        break;
+            break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            ggml_cann_conv_transpose_1d(ctx, dst);
+            break;
        default:
            return false;
    }
@ -1710,6 +1717,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_ELU:
                    return true;
                default:
                    return false;
@ -1842,6 +1850,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_ARGMAX:
        case GGML_OP_COS:
        case GGML_OP_SIN:
+        case GGML_OP_CONV_TRANSPOSE_1D:
            return true;
        default:
            return false;