Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-04-26 09:06:07 +00:00)
CANN: Support Opt CONV_TRANSPOSE_1D and ELU (#12786)
* [CANN] Support ELU and CONV_TRANSPOSE_1D
* [CANN] Modification review comments
* [CANN] Modification review comments
* [CANN] name adjustment
* [CANN] remove lambda used in template
* [CANN] Use std::func instead of template
* [CANN] Modify the code according to the review comments

---------

Signed-off-by: noemotiovon <noemotiovon@gmail.com>
This commit is contained in:
parent 0090950f67
commit 6e1c4cebdb

Changed paths:
.devops
.github/workflows
ggml/src/ggml-cann
.devops (CANN Dockerfile):

@@ -1,4 +1,4 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
 
 FROM ascendai/cann:$ASCEND_VERSION AS build
 
@@ -6,7 +6,7 @@ WORKDIR /app
 
 COPY . .
 
-RUN yum install -y gcc g++ cmake make
+RUN yum install -y gcc g++ cmake make libcurl-devel
 
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
.github/workflows/build.yml (vendored), 4 changes:
@@ -1771,7 +1771,7 @@ jobs:
       strategy:
         matrix:
           cann:
-            - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+            - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
           device:
             - 'ascend910b3'
           build:
@@ -1784,7 +1784,7 @@ jobs:
       - name: Dependencies
         run: |
           yum update -y
-          yum install -y git gcc gcc-c++ make cmake
+          yum install -y git gcc gcc-c++ make cmake libcurl-devel
 
       - name: Build
         run: |
ggml/src/ggml-cann (op kernels):

@@ -57,6 +57,8 @@
 #include <aclnnop/aclnn_sub.h>
 #include <aclnnop/aclnn_mul.h>
 #include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_convolution.h>
+#include <aclnnop/aclnn_elu.h>
 #include <float.h>
 
 #include <cmath>
@@ -86,6 +88,20 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
     }
 }
 
+void ggml_cann_unary_op(
+    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    unary_op(ctx, acl_src, acl_dst);
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
 /**
  * @brief Repeats elements of a tensor along each dimension according to the
  * specified repeat array.
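The hunk above adds a ggml_cann_unary_op overload that receives the operation as a std::function value rather than as a template parameter (the "Use std::func instead of template" item in the commit message), so a plain lambda can be passed at the call site. Below is a minimal standalone sketch of the same dispatch pattern, using toy stand-in types; Ctx, Tensor, and apply_unary are illustrative names, not ggml or CANN symbols.

```cpp
// Toy illustration only: Ctx and Tensor stand in for ggml_backend_cann_context
// and aclTensor; apply_unary mirrors the shape of the new std::function-based
// helper but is not the actual CANN implementation.
#include <cstdio>
#include <functional>

struct Ctx {};                // stand-in for the backend context
struct Tensor { float v; };   // stand-in for a device tensor handle

// The operation is passed as a std::function value, so any callable
// (named function, lambda, functor) works without being a template argument.
static void apply_unary(std::function<void(Ctx&, Tensor*, Tensor*)> op,
                        Ctx& ctx, Tensor* src, Tensor* dst) {
    op(ctx, src, dst);  // tensor creation/destruction omitted in this toy
}

int main() {
    Ctx ctx;
    Tensor src{-2.0f}, dst{0.0f};
    // A lambda works directly as an argument, which is the point of the refactor:
    apply_unary([](Ctx&, Tensor* s, Tensor* d) { d->v = -s->v; }, ctx, &src, &dst);
    std::printf("%.1f\n", dst.v);  // prints 2.0
    return 0;
}
```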
@@ -2585,3 +2601,49 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
+
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];
+
+    // stride
+    int64_t s0 = ((const int32_t*)(dst->op_params))[0];
+
+    aclTensor* acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
+    aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
+
+    int64_t strideVal[1];
+    strideVal[0] = s0;
+    aclIntArray *stride = aclCreateIntArray(strideVal, 1);
+    int64_t paddingVal[] = {0};
+    aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
+    int64_t dilationVal[] = {1};
+    aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
+    bool transposed = true;
+    int64_t groups = 1;
+    int8_t cubeMathType = 0;
+
+    GGML_CANN_CALL_ACLNN_OP(Convolution, acl_input, acl_weight, nullptr, stride,
+        padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
+
+    ACL_CHECK(aclDestroyTensor(acl_weight));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+
+    aclTensor* acl_input = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    float alphaValue = 1.0f;
+    aclScalar* alpha = nullptr;
+    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+    GGML_CANN_CALL_ACLNN_OP(Elu, acl_input, alpha, alpha, alpha,
+        acl_dst);
+
+    ACL_CHECK(aclDestroyTensor(acl_input));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
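For reference, with padding 0 and dilation 1 as set above, a 1D transposed convolution with stride s0 maps an input of length L and a kernel of length K to an output of length (L - 1) * s0 + K. The sketch below is a hedged plain-C++ single-channel reference of that math; conv_transpose_1d_ref is an illustrative helper, not part of ggml or CANN, and it ignores the multi-channel NCL layout used by the kernel above.

```cpp
// Single-channel conv_transpose_1d reference (assumption: this mirrors the
// math of the CANN op for padding 0, dilation 1; it is not the backend path).
#include <cstdio>
#include <vector>

static std::vector<float> conv_transpose_1d_ref(const std::vector<float>& x,
                                                const std::vector<float>& k,
                                                int s0) {
    const int L = (int) x.size();
    const int K = (int) k.size();
    std::vector<float> y((L - 1) * s0 + K, 0.0f);
    // Each input sample scatters a scaled copy of the kernel into the output.
    for (int i = 0; i < L; ++i) {
        for (int j = 0; j < K; ++j) {
            y[i * s0 + j] += x[i] * k[j];
        }
    }
    return y;
}

int main() {
    // Input length 3, kernel length 2, stride 2 -> output length (3-1)*2 + 2 = 6.
    auto y = conv_transpose_1d_ref({1, 2, 3}, {1, 1}, 2);
    for (float v : y) std::printf("%g ", v);  // prints: 1 1 2 2 3 3
    std::printf("\n");
    return 0;
}
```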
ggml/src/ggml-cann (op declarations header):

@@ -1,15 +1,4 @@
-#ifndef CANN_ACLNN_OPS
-#define CANN_ACLNN_OPS
-
 /**
- * @file acl_tensor
- * @brief This file contains related functions of ggml_tensor and acl_tensor.
- * Contains conversion from ggml_tensor to acl_tensor, broadcast and other
- * functions.
- * @author hipudding <huafengchun@gmail.com>
- * @author wangshuai09 <391746016@qq.com>
- * @date July 15, 2024
- *
  * Copyright (c) 2023-2024 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -31,6 +20,9 @@
  * IN THE SOFTWARE.
  */
 
+#ifndef CANN_ACLNN_OPS
+#define CANN_ACLNN_OPS
+
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_neg.h>
 #include <aclnnop/aclnn_exp.h>
@@ -483,8 +475,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * operation is executed using the CANN backend for optimized performance.
  *
  * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the indices of the maximum values will be stored.
- * dst->op is `GGML_OP_ARGMAX`.
+ * @param dst The destination tensor where the indices of the maximum values will
+ * be stored. dst->op is `GGML_OP_ARGMAX`.
  */
 void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
@@ -599,6 +591,99 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                aclTensor* acl_dst);
 
+/**
+ * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
+ * output tensor.
+ *
+ * This function checks whether broadcasting is needed between `src0` and `src1`.
+ * If broadcasting is required, it calculates the proper shapes and creates
+ * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
+ * based on the original tensor shapes.
+ *
+ * @param src0 The first input tensor (reference shape).
+ * @param src1 The second input tensor (possibly broadcasted).
+ * @param dst The destination/output tensor.
+ * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
+ * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
+ * @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
+ */
+void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
+                 aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
+
+/**
+ * @brief Computes the 1D transposed convolution (deconvolution) of a ggml
+ * tensor using the CANN backend.
+ *
+ * @details This function performs a 1D transposed convolution (also known as
+ * deconvolution) operation on the input tensor. The computed result is stored
+ * in the destination tensor `dst`. The operation is optimized using the CANN
+ * backend for improved performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the transposed convolution result
+ * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
+ */
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
+ * using the CANN backend.
+ *
+ * @details This function performs an element-wise ELU activation on the input
+ * tensor.
+ * The result is written to the destination tensor `dst` in-place.
+ * The ELU function is defined as:
+ *
+ *   \text{ELU}(x) =
+ *   \begin{cases}
+ *   x, & \text{if } x > 0 \\
+ *   \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
+ *   \end{cases}
+ *
+ * where α (alpha) is a hyperparameter, typically set to 1.0.
+ * This operation is optimized using the CANN backend for high-performance
+ * inference or training.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the ELU-activated result will be stored.
+ * dst->op is expected to be `GGML_OP_ELU`.
+ */
+void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Applies a element-wise operation to two input tensors using the CANN
+ * backend.
+ *
+ * This templated function takes a binary operator and applies it to two source
+ * tensors
+ * associated with the destination tensor. The function handles broadcasting as
+ * needed.
+ *
+ * @tparam binary_op A callable object (e.g., lambda or function pointer) representing
+ * the binary operation to be performed. It must take three arguments:
+ * (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*).
+ *
+ * @param ctx The CANN backend context used to manage execution and resources.
+ * @param dst The destination tensor.
+ */
+template <auto binary_op>
+void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];
+
+    aclTensor* acl_src0;
+    aclTensor* acl_src1;
+    aclTensor* acl_dst;
+
+    // Need bcast
+    bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
+    binary_op(ctx, acl_src0, acl_src1, acl_dst);
+
+    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_src1));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
 /**
  * @brief Launches an asynchronous task using the memory allocator.
  *
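The ELU documentation added above gives the piecewise definition. The following is a small hedged reference snippet (plain C++, alpha fixed to 1.0 as in the new ggml_cann_elu; elu_ref is an illustrative helper, not a ggml or CANN symbol) that can be used to spot-check values produced by the backend:

```cpp
// Scalar ELU reference: x for x > 0, alpha * (exp(x) - 1) otherwise.
#include <cmath>
#include <cstdio>

static float elu_ref(float x, float alpha = 1.0f) {
    return x > 0.0f ? x : alpha * (std::exp(x) - 1.0f);
}

int main() {
    for (float x : {-2.0f, -0.5f, 0.0f, 1.5f}) {
        std::printf("elu(%.1f) = %.4f\n", x, elu_ref(x));
    }
    // elu(-2.0) = -0.8647, elu(-0.5) = -0.3935, elu(0.0) = 0.0000, elu(1.5) = 1.5000
    return 0;
}
```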
@@ -631,56 +716,6 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
     } while (0)
 
-
-/**
- * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one output tensor.
- *
- * This function checks whether broadcasting is needed between `src0` and `src1`.
- * If broadcasting is required, it calculates the proper shapes and creates
- * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
- * based on the original tensor shapes.
- *
- * @param src0 The first input tensor (reference shape).
- * @param src1 The second input tensor (possibly broadcasted).
- * @param dst The destination/output tensor.
- * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
- * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
- * @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
- */
-void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
-                 aclTensor ** acl_src1, aclTensor ** acl_dst);
-
-/**
- * @brief Applies a element-wise operation to two input tensors using the CANN backend.
- *
- * This templated function takes a binary operator and applies it to two source tensors
- * associated with the destination tensor. The function handles broadcasting as needed.
- *
- * @tparam binary_op A callable object (e.g., lambda or function pointer) representing
- * the binary operation to be performed. It must take three arguments:
- * (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*).
- *
- * @param ctx The CANN backend context used to manage execution and resources.
- * @param dst The destination tensor.
- */
-template <auto binary_op>
-void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
-
-    aclTensor* acl_src0;
-    aclTensor* acl_src1;
-    aclTensor* acl_dst;
-
-    // Need bcast
-    bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
-    binary_op(ctx, acl_src0, acl_src1, acl_dst);
-
-    ACL_CHECK(aclDestroyTensor(acl_src0));
-    ACL_CHECK(aclDestroyTensor(acl_src1));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
-}
-
 /**
  * @brief Applies a unary operation to an input tensor using the CANN backend.
  *
@@ -690,7 +725,6 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  * @tparam unary_op A callable with the signature:
  *         void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
  *         where the first aclTensor is the source and the second is the destination.
- *
  * @param ctx The CANN backend context for managing resources and execution.
  * @param dst The destination tensor. Its src[0] is treated as the input tensor.
  */
@@ -702,10 +736,30 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
     unary_op(ctx, acl_src, acl_dst);
 
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 
+/**
+ * @brief Applies a unary operation to a ggml tensor using the CANN backend.
+ *
+ * @details This function performs a unary operation on the input tensor using
+ * a user-provided lambda or callable object `unary_op`, which accepts the CANN
+ * context and two ACL tensors (source and destination). Internally, this function
+ * creates ACL representations of the ggml tensors and invokes the unary operation.
+ * The result is stored in the destination tensor `dst`. This utility abstracts the
+ * common boilerplate of tensor conversion and cleanup when implementing unary ops.
+ *
+ * @param unary_op A callable that performs the unary operation using CANN APIs.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored.
+ *            The source tensor is retrieved from `dst->src[0]`.
+ */
+void ggml_cann_unary_op(
+    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
  * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
  *
@@ -725,11 +779,12 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
  */
 #define GGML_CANN_CALL_UNARY_OP(OP_NAME)                              \
     do {                                                              \
-        auto lambda = [](auto ctx, auto acl_src, auto acl_dst) {      \
+        auto lambda = [](ggml_backend_cann_context& ctx,              \
+            aclTensor* acl_src,                                       \
+            aclTensor* acl_dst) {                                     \
             GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);       \
         };                                                            \
-        ggml_cann_unary_op<lambda>(ctx, dst);                         \
+        ggml_cann_unary_op(lambda, ctx, dst);                         \
     }                                                                 \
     while (0)
 
 #endif  // CANN_ACLNN_OPS
ggml/src/ggml-cann (backend dispatch):

@@ -1330,12 +1330,13 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                 GGML_CANN_CALL_UNARY_OP(Silu);
                 break;
             case GGML_UNARY_OP_GELU_QUICK: {
-                auto lambda = [](auto ctx, auto acl_src, auto acl_dst) {
-                    GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
-                };
-                ggml_cann_unary_op<lambda>(ctx, dst);
-            }
-                break;
+                auto lambda = [](ggml_backend_cann_context& ctx,
+                    aclTensor* acl_src,
+                    aclTensor* acl_dst) {
+                    GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
+                };
+                ggml_cann_unary_op(lambda, ctx, dst);
+            } break;
             case GGML_UNARY_OP_TANH:
                 GGML_CANN_CALL_UNARY_OP(Tanh);
                 break;
@@ -1354,6 +1355,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             case GGML_UNARY_OP_EXP:
                 GGML_CANN_CALL_UNARY_OP(Exp);
                 break;
+            case GGML_UNARY_OP_ELU:
+                ggml_cann_elu(ctx, dst);
+                break;
             default:
                 return false;
         }
@@ -1448,7 +1452,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             break;
         case GGML_OP_SIN:
             ggml_cann_unary_op<aclnn_sin>(ctx, dst);
             break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            ggml_cann_conv_transpose_1d(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -1710,6 +1717,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
             case GGML_UNARY_OP_EXP:
+            case GGML_UNARY_OP_ELU:
                 return true;
             default:
                 return false;
@@ -1842,6 +1850,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_ARGMAX:
         case GGML_OP_COS:
         case GGML_OP_SIN:
+        case GGML_OP_CONV_TRANSPOSE_1D:
             return true;
         default:
             return false;