From fe92821ea9ae53f3088cf2699a9e102448295fa0 Mon Sep 17 00:00:00 2001
From: Diego Devesa
Date: Wed, 9 Apr 2025 12:32:13 +0200
Subject: [PATCH] ggml : add bilinear upscale support (ggml/1185)

---
Reviewer notes (ignored by `git am`, not part of the commit message):

 * What the patch does: adds `enum ggml_scale_mode` (NEAREST = 0,
   BILINEAR = 1) and a trailing `mode` parameter to ggml_upscale() and
   ggml_upscale_ext(); the mode is stored in op_params[0] of the result
   tensor. The CPU kernel gains a bilinear branch; the CANN, CUDA, Metal,
   SYCL and Vulkan supports_op checks accept GGML_OP_UPSCALE only when
   op_params[0] is GGML_SCALE_MODE_NEAREST.
 * Fixed relative to the original patch text: the ggml_upscale_ext() call
   in ggml_vk_check_results_0 passed two stray op_params arguments
   (9 arguments for the 7-parameter signature declared in ggml.h below).
   Because of this one-line change the post-image blob id on the
   ggml-vulkan.cpp `index` line no longer matches the resulting file.
 * This copy was rebuilt from a whitespace-collapsed extraction. Hunk line
   counts match the headers, but leading whitespace and the template
   arguments in tests/test-backend-ops.cpp (std::array<int64_t, 4>,
   std::array<int32_t, 4>, std::vector<std::unique_ptr<test_case>>) were
   reconstructed - TODO confirm against the target tree; use
   `git apply --ignore-whitespace` if context lines do not match.

 ggml/include/ggml.h                  | 19 +++++---
 ggml/src/ggml-cann/ggml-cann.cpp     |  3 ++
 ggml/src/ggml-cpu/ops.cpp            | 72 +++++++++++++++++++++++-----
 ggml/src/ggml-cuda/ggml-cuda.cu      |  1 +
 ggml/src/ggml-metal/ggml-metal.m     |  3 +-
 ggml/src/ggml-sycl/ggml-sycl.cpp     |  3 +-
 ggml/src/ggml-vulkan/ggml-vulkan.cpp |  7 +--
 ggml/src/ggml.c                      | 15 ++++--
 tests/test-backend-ops.cpp           | 36 +++++++++-----
 9 files changed, 119 insertions(+), 40 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index a5447ecdf..8fcc16df9 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1717,24 +1717,29 @@ extern "C" {
             float                 p0,
             float                 p1);
 
-    // nearest interpolate
+    enum ggml_scale_mode {
+        GGML_SCALE_MODE_NEAREST  = 0,
+        GGML_SCALE_MODE_BILINEAR = 1,
+    };
+
+    // interpolate
     // multiplies ne0 and ne1 by scale factor
-    // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_upscale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   scale_factor);
+            int                   scale_factor,
+            enum ggml_scale_mode  mode);
 
-    // nearest interpolate
-    // nearest interpolate to specified dimensions
-    // used in tortoise.cpp
+    // interpolate
+    // interpolate scale to specified dimensions
     GGML_API struct ggml_tensor * ggml_upscale_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   ne0,
             int                   ne1,
             int                   ne2,
-            int                   ne3);
+            int                   ne3,
+            enum ggml_scale_mode  mode);
 
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index b513270c6..cec36b36e 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1824,6 +1824,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
                 return false;
             }
+            if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
+                return false;
+            }
             return true;
         }
         case GGML_OP_POOL_2D: {
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 36b98152e..6050147be 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6351,24 +6351,72 @@ static void ggml_compute_forward_upscale_f32(
     const float sf2 = (float)ne2/src0->ne[2];
     const float sf3 = (float)ne3/src0->ne[3];
 
-    // TODO: optimize
+    const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
 
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3 / sf3;
-        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2 / sf2;
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / sf1;
-                for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / sf0;
+    if (mode == GGML_SCALE_MODE_NEAREST) {
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const int64_t i01 = i1 / sf1;
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const int64_t i00 = i0 / sf0;
 
-                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
+                        const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
 
-                    *y = *x;
+                        *y = *x;
+                    }
                 }
             }
         }
+    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
+        // setting a pixel offset of 0 would replicate the behavior of pytorch interpolate with align_corners=True
+        const float pixel_offset = 0.5f;
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
+                    int64_t y0 = (int64_t)floorf(y);
+                    int64_t y1 = y0 + 1;
+
+                    y0 = std::max(int64_t(0), std::min(y0, ne01 - 1));
+                    y1 = std::max(int64_t(0), std::min(y1, ne01 - 1));
+
+                    float dy = y - (float)y0;
+                    dy = std::max(0.0f, std::min(dy, 1.0f));
+
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
+                        int64_t x0 = (int64_t)floorf(x);
+                        int64_t x1 = x0 + 1;
+
+                        x0 = std::max(int64_t(0), std::min(x0, ne00 - 1));
+                        x1 = std::max(int64_t(0), std::min(x1, ne00 - 1));
+
+                        float dx = x - (float)x0;
+                        dx = std::max(0.0f, std::min(dx, 1.0f));
+
+                        // fetch the four surrounding pixel values and interpolate
+                        const float a = *(const float *)((const char *)src0->data + x0*nb00 + y0*nb01 + i02*nb02 + i03*nb03);
+                        const float b = *(const float *)((const char *)src0->data + x1*nb00 + y0*nb01 + i02*nb02 + i03*nb03);
+                        const float c = *(const float *)((const char *)src0->data + x0*nb00 + y1*nb01 + i02*nb02 + i03*nb03);
+                        const float d = *(const float *)((const char *)src0->data + x1*nb00 + y1*nb01 + i02*nb02 + i03*nb03);
+
+                        const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
+
+                        float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                        *y_dst = val;
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("unsupported upscale mode");
     }
 }
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 633456a92..fafe9633e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3216,6 +3216,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_GROUP_NORM:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_UPSCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index f22682602..9f1c6c6cc 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -1334,8 +1334,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_1D:
             return false;
-        case GGML_OP_POOL_2D:
         case GGML_OP_UPSCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
+        case GGML_OP_POOL_2D:
         case GGML_OP_PAD:
         case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_TIMESTEP_EMBEDDING:
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 89715eaea..e6f1603d8 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4055,12 +4055,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_IM2COL:
             // TODO: add support for the new F32 operations
             return op->src[0]->type == GGML_TYPE_F16;
+        case GGML_OP_UPSCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
-        case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_LEAKY_RELU:
         case GGML_OP_TIMESTEP_EMBEDDING:
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index e69d00ad5..783a0ff86 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5749,7 +5749,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
         return nullptr;
     case GGML_OP_UPSCALE:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->op_params[0] == GGML_SCALE_MODE_NEAREST) {
             return ctx->device->pipeline_upscale_f32;
         }
         return nullptr;
@@ -9404,9 +9404,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
             return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_UPSCALE:
+            return op->op_params[0] == GGML_SCALE_MODE_NEAREST;
         case GGML_OP_ACC:
         case GGML_OP_CONCAT:
-        case GGML_OP_UPSCALE:
         case GGML_OP_SCALE:
         case GGML_OP_PAD:
         case GGML_OP_DIAG_MASK_INF:
@@ -9774,7 +9775,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
     } else if (tensor->op == GGML_OP_CONCAT) {
         tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params);
     } else if (tensor->op == GGML_OP_UPSCALE) {
-        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
     } else if (tensor->op == GGML_OP_SCALE) {
         const float * params = (const float *)tensor->op_params;
         tensor_clone = ggml_scale(ggml_ctx, src_clone[0], params[0]);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 98a0f6164..950772c75 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4174,7 +4174,8 @@ static struct ggml_tensor * ggml_upscale_impl(
         int                   ne0,
         int                   ne1,
         int                   ne2,
-        int                   ne3) {
+        int                   ne3,
+        enum ggml_scale_mode  mode) {
     GGML_ASSERT(a->ne[0] <= ne0);
     GGML_ASSERT(a->ne[1] <= ne1);
     GGML_ASSERT(a->ne[2] <= ne2);
@@ -4182,6 +4183,8 @@ static struct ggml_tensor * ggml_upscale_impl(
 
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
 
+    ggml_set_op_params_i32(result, 0, mode);
+
     result->op     = GGML_OP_UPSCALE;
     result->src[0] = a;
 
@@ -4191,8 +4194,9 @@ static struct ggml_tensor * ggml_upscale_impl(
 struct ggml_tensor * ggml_upscale(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   scale_factor) {
-    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
+        int                   scale_factor,
+        enum ggml_scale_mode  mode) {
+    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode);
 }
 
 struct ggml_tensor * ggml_upscale_ext(
@@ -4201,8 +4205,9 @@ struct ggml_tensor * ggml_upscale_ext(
         int                   ne0,
         int                   ne1,
         int                   ne2,
-        int                   ne3) {
-    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
+        int                   ne3,
+        enum ggml_scale_mode  mode) {
+    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
 }
 
 // ggml_pad
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index e61a126cf..9dc9f015a 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -271,6 +271,14 @@ static std::string var_to_str(ggml_op_pool pool) {
     }
 }
 
+static std::string var_to_str(ggml_scale_mode mode) {
+    switch (mode) {
+        case GGML_SCALE_MODE_NEAREST:  return "nearest";
+        case GGML_SCALE_MODE_BILINEAR: return "bilinear";
+        default:                       return std::to_string(mode);
+    }
+}
+
 #define VAR_TO_STR(x) (#x "=" + var_to_str(x))
 
 #define VARS_TO_STR1(a) VAR_TO_STR(a)
@@ -2948,15 +2956,16 @@ struct test_upscale : public test_case {
     const std::array<int64_t, 4> ne;
     const int32_t scale_factor;
     const bool transpose;
+    const ggml_scale_mode mode;
 
     std::string vars() override {
-        return VARS_TO_STR4(type, ne, scale_factor, transpose);
+        return VARS_TO_STR5(type, ne, scale_factor, mode, transpose);
     }
 
     test_upscale(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {512, 512, 3, 1},
-            int32_t scale_factor = 2, bool transpose = false)
-        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
+            int32_t scale_factor = 2, ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST, bool transpose = false)
+        : type(type), ne(ne), scale_factor(scale_factor), mode(mode), transpose(transpose) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
@@ -2967,7 +2976,7 @@ struct test_upscale : public test_case {
             ggml_set_name(a, "a_transposed");
         }
 
-        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
+        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor, mode);
         ggml_set_name(out, "out");
 
         return out;
@@ -2979,21 +2988,23 @@ struct test_upscale_ext : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const std::array<int32_t, 4> ne_tgt;
+    const ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST;
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, ne_tgt);
+        return VARS_TO_STR4(type, ne, ne_tgt, mode);
     }
 
     test_upscale_ext(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne     = {2, 5,  7, 11},
-            std::array<int32_t, 4> ne_tgt = {5, 7, 11, 13})
-        : type(type), ne(ne), ne_tgt(ne_tgt) {}
+            std::array<int32_t, 4> ne_tgt = {5, 7, 11, 13},
+            ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST)
+        : type(type), ne(ne), ne_tgt(ne_tgt), mode(mode) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_set_name(a, "a");
 
-        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
+        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3], mode);
         ggml_set_name(out, "out");
 
         return out;
@@ -4399,12 +4410,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
     }
 
+    for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) {
+        test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode));
+        test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true));
+        test_cases.emplace_back(new test_upscale_ext(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode));
+    }
+
     test_cases.emplace_back(new test_sum());
     test_cases.emplace_back(new test_sum_rows());
     test_cases.emplace_back(new test_mean());
-    test_cases.emplace_back(new test_upscale());
-    test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
-    test_cases.emplace_back(new test_upscale_ext());
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_acc());