From 6e1c4cebdb697f925c523d3a969128d945161bdd Mon Sep 17 00:00:00 2001
From: Chenguang Li <757486878@qq.com>
Date: Wed, 9 Apr 2025 14:04:14 +0800
Subject: [PATCH] CANN: Support Opt CONV_TRANSPOSE_1D and ELU (#12786)

* [CANN] Support ELU and CONV_TRANSPOSE_1D

* [CANN]Modification review comments

* [CANN]Modification review comments

* [CANN]name adjustment

* [CANN]remove lambda used in template

* [CANN]Use std::func instead of template

* [CANN]Modify the code according to the review comments

---------

Signed-off-by: noemotiovon
---
 .devops/llama-cli-cann.Dockerfile |   4 +-
 .github/workflows/build.yml       |   4 +-
 ggml/src/ggml-cann/aclnn_ops.cpp  |  62 ++++++++++
 ggml/src/ggml-cann/aclnn_ops.h    | 189 +++++++++++++++++++-----------
 ggml/src/ggml-cann/ggml-cann.cpp  |  23 ++--
 5 files changed, 204 insertions(+), 78 deletions(-)

diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile
index 02dce501c..0eb1af87c 100644
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,4 +1,4 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
 
 FROM ascendai/cann:$ASCEND_VERSION AS build
 
@@ -6,7 +6,7 @@ WORKDIR /app
 
 COPY . .
 
-RUN yum install -y gcc g++ cmake make
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 33f6a4fb4..bcfcf08ac 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1771,7 +1771,7 @@
     strategy:
       matrix:
         cann:
-          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+          - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
         device:
           - 'ascend910b3'
         build:
@@ -1784,7 +1784,7 @@
       - name: Dependencies
        run: |
          yum update -y
-          yum install -y git gcc gcc-c++ make cmake
+          yum install -y git gcc gcc-c++ make cmake libcurl-devel
 
      - name: Build
        run: |
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index fee66aea9..25b2599c7 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -57,6 +57,8 @@
 #include 
 #include 
 #include 
+#include <aclnnop/aclnn_convolution.h>
+#include <aclnnop/aclnn_elu.h>
 #include 
 #include 
 #include 
@@ -86,6 +88,20 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
     }
 }
 
+void ggml_cann_unary_op(
+    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    unary_op(ctx, acl_src, acl_dst);
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
 /**
  * @brief Repeats elements of a tensor along each dimension according to the
  * specified repeat array.
@@ -2585,3 +2601,49 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
+
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];
+
+    // stride
+    int64_t s0 = ((const int32_t*)(dst->op_params))[0];
+
+    aclTensor* acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
+    aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
+
+    int64_t strideVal[1];
+    strideVal[0] = s0;
+    aclIntArray *stride = aclCreateIntArray(strideVal, 1);
+    int64_t paddingVal[] = {0};
+    aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
+    int64_t dilationVal[] = {1};
+    aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
+    bool transposed = true;
+    int64_t groups = 1;
+    int8_t cubeMathType = 0;
+
+    GGML_CANN_CALL_ACLNN_OP(Convolution, acl_input, acl_weight, nullptr, stride,
+        padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
+
+    ACL_CHECK(aclDestroyTensor(acl_weight));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+
+    aclTensor* acl_input = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    float alphaValue = 1.0f;
+    aclScalar* alpha = nullptr;
+    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+    GGML_CANN_CALL_ACLNN_OP(Elu, acl_input, alpha, alpha, alpha,
+        acl_dst);
+
+    ACL_CHECK(aclDestroyTensor(acl_input));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index 116ddf0fb..aadf013de 100644
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -1,15 +1,4 @@
-#ifndef CANN_ACLNN_OPS
-#define CANN_ACLNN_OPS
-
 /**
- * @file acl_tensor
- * @brief This file contains related functions of ggml_tensor and acl_tensor.
- * Contains conversion from ggml_tensor to acl_tensor, broadcast and other
- * functions.
- * @author hipudding
- * @author wangshuai09 <391746016@qq.com>
- * @date July 15, 2024
- *
  * Copyright (c) 2023-2024 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -31,6 +20,9 @@
  * IN THE SOFTWARE.
  */
 
+#ifndef CANN_ACLNN_OPS
+#define CANN_ACLNN_OPS
+
 #include 
 #include 
 #include 
@@ -483,8 +475,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * operation is executed using the CANN backend for optimized performance.
  *
  * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the indices of the maximum values will be stored.
- * dst->op is `GGML_OP_ARGMAX`.
+ * @param dst The destination tensor where the indices of the maximum values will
+ * be stored. dst->op is `GGML_OP_ARGMAX`.
  */
 void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
@@ -599,6 +591,99 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                aclTensor* acl_dst);
 
+/**
+ * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
+ * output tensor.
+ *
+ * This function checks whether broadcasting is needed between `src0` and `src1`.
+ * If broadcasting is required, it calculates the proper shapes and creates
+ * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
+ * based on the original tensor shapes.
+ *
+ * @param src0 The first input tensor (reference shape).
+ * @param src1 The second input tensor (possibly broadcasted).
+ * @param dst The destination/output tensor.
+ * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
+ * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
+ * @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
+ */
+void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
+    aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
+
+/**
+ * @brief Computes the 1D transposed convolution (deconvolution) of a ggml
+ * tensor using the CANN backend.
+ *
+ * @details This function performs a 1D transposed convolution (also known as
+ * deconvolution) operation on the input tensor. The computed result is stored
+ * in the destination tensor `dst`. The operation is optimized using the CANN
+ * backend for improved performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the transposed convolution result
+ * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
+ */
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
+ * using the CANN backend.
+ *
+ * @details This function performs an element-wise ELU activation on the input
+ * tensor.
+ * The result is written to the destination tensor `dst` in-place.
+ * The ELU function is defined as:
+ *
+ *     \text{ELU}(x) =
+ *     \begin{cases}
+ *     x, & \text{if } x > 0 \\
+ *     \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
+ *     \end{cases}
+ *
+ * where α (alpha) is a hyperparameter, typically set to 1.0.
+ * This operation is optimized using the CANN backend for high-performance
+ * inference or training.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the ELU-activated result will be stored.
+ *            dst->op is expected to be `GGML_OP_ELU`.
+ */
+void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Applies an element-wise operation to two input tensors using the CANN
+ * backend.
+ *
+ * This templated function takes a binary operator and applies it to two source
+ * tensors
+ * associated with the destination tensor. The function handles broadcasting as
+ * needed.
+ *
+ * @tparam binary_op A callable object (e.g., lambda or function pointer) representing
+ * the binary operation to be performed. It must take four arguments:
+ * (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*).
+ *
+ * @param ctx The CANN backend context used to manage execution and resources.
+ * @param dst The destination tensor.
+ */ +template +void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + + aclTensor* acl_src0; + aclTensor* acl_src1; + aclTensor* acl_dst; + + // Need bcast + bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst); + binary_op(ctx, acl_src0, acl_src1, acl_dst); + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + /** * @brief Launches an asynchronous task using the memory allocator. * @@ -631,56 +716,6 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \ } while (0) - -/** - * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one output tensor. - * - * This function checks whether broadcasting is needed between `src0` and `src1`. - * If broadcasting is required, it calculates the proper shapes and creates - * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors - * based on the original tensor shapes. - * - * @param src0 The first input tensor (reference shape). - * @param src1 The second input tensor (possibly broadcasted). - * @param dst The destination/output tensor. - * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0. - * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1. - * @param acl_dst Output pointer to the created ACL tensor corresponding to dst. - */ -void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0, - aclTensor ** acl_src1, aclTensor ** acl_dst); - -/** - * @brief Applies a element-wise operation to two input tensors using the CANN backend. - * - * This templated function takes a binary operator and applies it to two source tensors - * associated with the destination tensor. The function handles broadcasting as needed. - * - * @tparam binary_op A callable object (e.g., lambda or function pointer) representing - * the binary operation to be performed. It must take three arguments: - * (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*). - * - * @param ctx The CANN backend context used to manage execution and resources. - * @param dst The destination tensor. - */ -template -void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; - ggml_tensor* src1 = dst->src[1]; - - aclTensor* acl_src0; - aclTensor* acl_src1; - aclTensor* acl_dst; - - // Need bcast - bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst); - binary_op(ctx, acl_src0, acl_src1, acl_dst); - - ACL_CHECK(aclDestroyTensor(acl_src0)); - ACL_CHECK(aclDestroyTensor(acl_src1)); - ACL_CHECK(aclDestroyTensor(acl_dst)); -} - /** * @brief Applies a unary operation to an input tensor using the CANN backend. * @@ -690,7 +725,6 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) { * @tparam unary_op A callable with the signature: * void(ggml_backend_cann_context&, aclTensor*, aclTensor*) * where the first aclTensor is the source and the second is the destination. - * * @param ctx The CANN backend context for managing resources and execution. * @param dst The destination tensor. Its src[0] is treated as the input tensor. 
  */
@@ -702,10 +736,30 @@ template
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
     unary_op(ctx, acl_src, acl_dst);
+
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 
+/**
+ * @brief Applies a unary operation to a ggml tensor using the CANN backend.
+ *
+ * @details This function performs a unary operation on the input tensor using
+ * a user-provided lambda or callable object `unary_op`, which accepts the CANN
+ * context and two ACL tensors (source and destination). Internally, this function
+ * creates ACL representations of the ggml tensors and invokes the unary operation.
+ * The result is stored in the destination tensor `dst`. This utility abstracts the
+ * common boilerplate of tensor conversion and cleanup when implementing unary ops.
+ *
+ * @param unary_op A callable that performs the unary operation using CANN APIs.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored.
+ *            The source tensor is retrieved from `dst->src[0]`.
+ */
+void ggml_cann_unary_op(
+    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
  * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
  *
@@ -725,11 +779,12 @@
  */
 #define GGML_CANN_CALL_UNARY_OP(OP_NAME)                          \
     do {                                                          \
-        auto lambda = [](auto ctx, auto acl_src, auto acl_dst) {  \
+        auto lambda = [](ggml_backend_cann_context& ctx,          \
+            aclTensor* acl_src,                                   \
+            aclTensor* acl_dst) {                                 \
             GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);   \
         };                                                        \
-        ggml_cann_unary_op(ctx, dst);                             \
+        ggml_cann_unary_op(lambda, ctx, dst);                     \
     }                                                             \
     while (0)
-
 #endif // CANN_ACLNN_OPS
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 326f9d298..f9187ba81 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1330,12 +1330,13 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             GGML_CANN_CALL_UNARY_OP(Silu);
             break;
         case GGML_UNARY_OP_GELU_QUICK: {
-                auto lambda = [](auto ctx, auto acl_src, auto acl_dst) {
-                    GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
-                };
-                ggml_cann_unary_op(ctx, dst);
-            }
-            break;
+                auto lambda = [](ggml_backend_cann_context& ctx,
+                    aclTensor* acl_src,
+                    aclTensor* acl_dst) {
+                    GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
+                };
+                ggml_cann_unary_op(lambda, ctx, dst);
+            } break;
         case GGML_UNARY_OP_TANH:
             GGML_CANN_CALL_UNARY_OP(Tanh);
             break;
@@ -1354,6 +1355,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_UNARY_OP_EXP:
             GGML_CANN_CALL_UNARY_OP(Exp);
             break;
+        case GGML_UNARY_OP_ELU:
+            ggml_cann_elu(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -1448,7 +1452,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             break;
         case GGML_OP_SIN:
             ggml_cann_unary_op<aclnn_sin>(ctx, dst);
-            break;
+            break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            ggml_cann_conv_transpose_1d(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -1710,6 +1717,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_ELU:
                     return true;
                 default:
                     return false;
@@ -1842,6 +1850,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_ARGMAX:
         case GGML_OP_COS:
         case GGML_OP_SIN:
+        case GGML_OP_CONV_TRANSPOSE_1D:
             return true;
         default:
             return false;
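
For reference, the sketch below is not part of the commit; it only illustrates how the two operators enabled above are reached from ggml's public C API. When a CANN device is selected, ggml_cann_compute_forward routes the two graph nodes to ggml_cann_conv_transpose_1d and ggml_cann_elu. The tensor shapes, the 16 MiB context size, and the stride value are illustrative assumptions, using the kernel [K, C_out, C_in] / input [L, C_in, 1] layout that ggml_conv_transpose_1d expects.

#include "ggml.h"

int main(void) {
    // small scratch context; 16 MiB is an arbitrary illustrative size
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // input: length 10 with 4 channels; kernel: width 3, 2 output channels, 4 input channels
    struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 10, 4, 1);
    struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3, 2, 4);

    // stride 2, padding 0, dilation 1: the CANN kernel reads only the stride from
    // op_params and hard-codes padding 0 / dilation 1, matching ggml's constraints
    struct ggml_tensor * y = ggml_conv_transpose_1d(ctx, k, x, 2, 0, 1);

    // GGML_UNARY_OP_ELU, handled by ggml_cann_elu after this patch
    y = ggml_elu(ctx, y);

    // build the forward graph; execution on an Ascend device would be done
    // through the usual ggml-backend scheduling and is omitted here
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);

    ggml_free(ctx);
    return 0;
}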