mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-04-16 11:36:08 +00:00
[CANN]MUL_MAT optimization (#12382)
This commit is contained in:
parent
9f2250ba72
commit
92a391327e
@@ -2790,10 +2790,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
|||||||
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
||||||
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
|
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
|
||||||
output_ne_offset);
|
output_ne_offset);
|
||||||
|
int64_t antiquantGroupSize = 0;
|
||||||
|
if (src0->ne[0] > QK8_0) {
|
||||||
|
antiquantGroupSize = QK8_0;
|
||||||
|
}
|
||||||
|
|
||||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
|
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
|
||||||
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
|
nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
|
||||||
&workspaceSize, &executor));
|
&workspaceSize, &executor));
|
||||||
if (workspaceAddr == nullptr) {
|
if (workspaceAddr == nullptr) {
|
||||||
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
||||||
@@ -2833,7 +2837,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
|||||||
|
|
||||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||||
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
|
||||||
acl_output_tensor, &workspaceSize, &executor));
|
acl_output_tensor, &workspaceSize, &executor));
|
||||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
||||||
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||||
|
@@ -1689,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|||||||
case GGML_OP_MUL_MAT: {
|
case GGML_OP_MUL_MAT: {
|
||||||
switch (op->src[0]->type) {
|
switch (op->src[0]->type) {
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
// Current groupsize should not be greater than k-1 in
|
|
||||||
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
|
|
||||||
if (op->src[0]->ne[0] <= QK8_0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user