From 5bbe6a9fe9a8796a9389c85accec89dbc4d91e39 Mon Sep 17 00:00:00 2001 From: mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> Date: Tue, 4 Mar 2025 17:53:26 +0100 Subject: [PATCH] ggml : portability fixes for VS 2017 (#12150) * Add include files for std::min/max and std::toupper/tolower * win32: move _USE_MATH_DEFINES before includes to ensure M_PI is defined * Use GGML_RESTRICT instead of "restrict" keyword everywhere, and use "__restrict" in MSVC plain C mode * win32: only use __restrict in MSVC if C11/C17 support is not enabled --------- Co-authored-by: Marcus Groeber --- common/ngram-cache.cpp | 1 + common/sampling.cpp | 1 + common/speculative.cpp | 1 + examples/embedding/embedding.cpp | 1 + examples/lookahead/lookahead.cpp | 1 + examples/parallel/parallel.cpp | 1 + examples/passkey/passkey.cpp | 1 + examples/quantize/quantize.cpp | 1 + examples/tts/tts.cpp | 4 +- ggml/include/ggml.h | 6 +- ggml/src/ggml-backend-reg.cpp | 1 + ggml/src/ggml-backend.cpp | 1 + ggml/src/ggml-cpu/ggml-cpu-quants.c | 792 ++++++++++++++-------------- ggml/src/ggml-cpu/ggml-cpu.c | 26 +- ggml/src/ggml-quants.c | 228 ++++---- ggml/src/ggml.c | 6 +- src/llama-chat.cpp | 1 + src/llama-kv-cache.h | 1 + src/llama-mmap.cpp | 1 + src/llama-vocab.cpp | 1 + 20 files changed, 547 insertions(+), 529 deletions(-) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index a057ae45f..d1a4d84c4 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -7,6 +7,7 @@ #include #include #include +#include void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { diff --git a/common/sampling.cpp b/common/sampling.cpp index 37a0d9c85..972fbe9da 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -4,6 +4,7 @@ #include #include +#include // the ring buffer works similarly to std::deque, but with a fixed capacity // TODO: deduplicate with llama-impl.h diff --git a/common/speculative.cpp b/common/speculative.cpp index b1fff27a5..1bac3a1ce 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -5,6 +5,7 @@ #include "sampling.h" #include +#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38d22c90f..3dd9f2b07 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -4,6 +4,7 @@ #include "llama.h" #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 2f0898e62..b9e8de694 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -7,6 +7,7 @@ #include #include #include +#include struct ngram_data { bool active = false; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7ef43d5e1..be18909ed 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -12,6 +12,7 @@ #include #include #include +#include // trim whitespace from the beginning and end of a string static std::string trim(const std::string & str) { diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5953928d4..fa8519051 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -7,6 +7,7 @@ #include #include #include +#include static void print_usage(int, char ** argv) { LOG("\nexample usage:\n"); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 8d47b17b6..a4468b169 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -8,6 +8,7 @@ #include #include #include +#include struct quant_option { std::string name; diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 8108bac8c..c658f3182 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -1,3 +1,5 @@ +#define _USE_MATH_DEFINES // For M_PI on MSVC + #include "arg.h" #include "common.h" #include "sampling.h" @@ -5,8 +7,6 @@ #include "llama.h" #include "json.hpp" -#define _USE_MATH_DEFINES // For M_PI on MSVC - #include #include #include diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index dd0c6a96e..2e5076d36 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2140,7 +2140,11 @@ extern "C" { # define GGML_RESTRICT # endif #else -# define GGML_RESTRICT restrict +# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT restrict +# endif #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 23ddf41d4..d0d68becd 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef _WIN32 # define WIN32_LEAN_AND_MEAN diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 184f99af5..273075f4e 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #ifdef __APPLE__ #include diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index 2679b71ff..c30ac0318 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -719,28 +719,28 @@ static inline __m128i packNibbles( __m256i bytes ) { } #endif //__loongarch_asx -void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q4_0_ref(x, y, k); } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q4_1_ref(x, y, k); } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q5_0_ref(x, y, k); } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q5_1_ref(x, y, k); } -void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0 * restrict y = vy; + block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1050,11 +1050,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) #endif } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; - block_q8_1 * restrict y = vy; + block_q8_1 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1428,8 +1428,8 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type, - const float * restrict qw) { +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type, + const float * GGML_RESTRICT qw) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1497,7 +1497,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1556,7 +1556,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -1599,8 +1599,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -1680,7 +1680,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f return scale; } -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -1691,51 +1691,51 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { quantize_row_q2_K_ref(x, vy, k); } //========================= 3-bit (de)-quantization -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { quantize_row_q3_K_ref(x, vy, k); } // ====================== 4-bit (de)-quantization -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; + block_q4_K * GGML_RESTRICT y = vy; quantize_row_q4_K_ref(x, y, k); } // ====================== 5-bit (de)-quantization -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; + block_q5_K * GGML_RESTRICT y = vy; quantize_row_q5_K_ref(x, y, k); } // ====================== 6-bit (de)-quantization -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; + block_q6_K * GGML_RESTRICT y = vy; quantize_row_q6_K_ref(x, y, k); } // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) -void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_tq1_0 * restrict y = vy; + block_tq1_0 * GGML_RESTRICT y = vy; quantize_row_tq1_0_ref(x, y, k); } -void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_tq2_0 * restrict y = vy; + block_tq2_0 * GGML_RESTRICT y = vy; quantize_row_tq2_0_ref(x, y, k); } @@ -1743,11 +1743,11 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1 //===================================== Q8_K ============================================== -void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { #ifdef __wasm_simd128__ assert(k % QK_K == 0); const int64_t nb = k / QK_K; - block_q8_K * restrict yc = y; // Cast to proper type + block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type for (int i = 0; i < nb; i++) { const float * x_block = x + i * QK_K; @@ -1909,7 +1909,7 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; @@ -1924,23 +1924,23 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { - const block_q4_0 * restrict vx0 = vx; - const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); - const block_q8_0 * restrict vy0 = vy; - const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); + const block_q4_0 * GGML_RESTRICT vx0 = vx; + const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); + const block_q8_0 * GGML_RESTRICT vy0 = vy; + const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); float32x4_t sumv0 = vdupq_n_f32(0.0f); for (int i = 0; i < nb; i++) { - const block_q4_0 * restrict b_x0 = &vx0[i]; - const block_q4_0 * restrict b_x1 = &vx1[i]; - const block_q8_0 * restrict b_y0 = &vy0[i]; - const block_q8_0 * restrict b_y1 = &vy1[i]; + const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; + const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2017,10 +2017,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const svbool_t ph4 = svptrue_pat_b32(SV_VL4); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); @@ -2063,10 +2063,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); @@ -2104,10 +2104,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const svbool_t pl16 = svnot_b_z(ph32, ph16); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); @@ -2144,10 +2144,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float32x4_t sumv1 = vdupq_n_f32(0.0f); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2189,10 +2189,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const v128_t s8b = wasm_i8x16_splat(0x8); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // Load and process x0 v128_t v0_0 = wasm_v128_load(x0->qs); @@ -2609,7 +2609,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; @@ -2624,24 +2624,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; #if defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { - const block_q4_1 * restrict vx0 = vx; - const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); - const block_q8_1 * restrict vy0 = vy; - const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); + const block_q4_1 * GGML_RESTRICT vx0 = vx; + const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); + const block_q8_1 * GGML_RESTRICT vy0 = vy; + const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t summs0 = vdupq_n_f32(0.0f); for (int i = 0; i < nb; i++) { - const block_q4_1 * restrict b_x0 = &vx0[i]; - const block_q4_1 * restrict b_x1 = &vx1[i]; - const block_q8_1 * restrict b_y0 = &vy0[i]; - const block_q8_1 * restrict b_y1 = &vy1[i]; + const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i]; + const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i]; float32_t summs_t[4] = { GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), @@ -2715,10 +2715,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r float summs = 0; for (; ib + 1 < nb; ib += 2) { - const block_q4_1 * restrict x0 = &x[ib + 0]; - const block_q4_1 * restrict x1 = &x[ib + 1]; - const block_q8_1 * restrict y0 = &y[ib + 0]; - const block_q8_1 * restrict y1 = &y[ib + 1]; + const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); @@ -2931,7 +2931,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; @@ -2946,8 +2946,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2960,10 +2960,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r uint64_t tmp1[4]; for (; ib + 1 < nb; ib += 2) { - const block_q5_0 * restrict x0 = &x[ib]; - const block_q5_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3024,8 +3024,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r // TODO: check if unrolling this is better for (; ib < nb; ++ib) { - const block_q5_0 * restrict x0 = &x[ib]; - const block_q8_0 * restrict y0 = &y[ib]; + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3286,7 +3286,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; @@ -3301,8 +3301,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3318,10 +3318,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r uint64_t tmp1[4]; for (; ib + 1 < nb; ib += 2) { - const block_q5_1 * restrict x0 = &x[ib]; - const block_q5_1 * restrict x1 = &x[ib + 1]; - const block_q8_1 * restrict y0 = &y[ib]; - const block_q8_1 * restrict y1 = &y[ib + 1]; + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3387,8 +3387,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r // TODO: check if unrolling this is better for (; ib < nb; ++ib) { - const block_q5_1 * restrict x0 = &x[ib]; - const block_q8_1 * restrict y0 = &y[ib]; + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); @@ -3660,7 +3660,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; @@ -3675,24 +3675,24 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { - const block_q8_0 * restrict vx0 = vx; - const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); - const block_q8_0 * restrict vy0 = vy; - const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); + const block_q8_0 * GGML_RESTRICT vx0 = vx; + const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); + const block_q8_0 * GGML_RESTRICT vy0 = vy; + const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); float32x4_t sumv0 = vdupq_n_f32(0.0f); for (int i = 0; i < nb; i++) { - const block_q8_0 * restrict b_x0 = &vx0[i]; - const block_q8_0 * restrict b_y0 = &vy0[i]; + const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; - const block_q8_0 * restrict b_x1 = &vx1[i]; - const block_q8_0 * restrict b_y1 = &vy1[i]; + const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; const int8x16_t x0_l = vld1q_s8(b_x0->qs); const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); @@ -3757,10 +3757,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r const svbool_t pl16 = svptrue_pat_b32(SV_VL4); for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); @@ -3788,10 +3788,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r { //printf("sve256"); for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); @@ -3824,10 +3824,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r svfloat32_t sumv00 = svdup_n_f32(0.0f); for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits // and add them to make one 64 element vector @@ -3867,10 +3867,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r float32x4_t sumv1 = vdupq_n_f32(0.0f); for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; const int8x16_t x0_0 = vld1q_s8(x0->qs); const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); @@ -3897,8 +3897,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r v128_t sumv = wasm_f32x4_splat(0.0f); for (; ib < nb; ++ib) { - const block_q8_0 * restrict x0 = &x[ib]; - const block_q8_0 * restrict y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; const v128_t x0_0 = wasm_v128_load(x0->qs); const v128_t x0_1 = wasm_v128_load(x0->qs + 16); @@ -4080,15 +4080,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); - const block_tq1_0 * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_tq1_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4403,15 +4403,15 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * #endif } -void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); - const block_tq2_0 * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_tq2_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4575,15 +4575,15 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * #endif } -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4603,9 +4603,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8_sv = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc); const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); @@ -4748,9 +4748,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8_sv = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8; const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s)); @@ -4847,9 +4847,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; const uint8x16_t mins_and_scales = vld1q_u8(sc); const uint8x16_t scales = vandq_u8(mins_and_scales, m4); @@ -4912,8 +4912,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); @@ -4979,8 +4979,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // load mins and scales from block_q2_K.scales[QK_K/16] const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -5306,8 +5306,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed int vsumi6 = v0; vector signed int vsumi7 = v0; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/128; ++j) { __builtin_prefetch(q2, 0, 1); @@ -5398,8 +5398,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf); @@ -5492,7 +5492,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -5503,8 +5503,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -5529,9 +5529,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3_sv = x[i].qs; - const uint8_t * restrict qh_sv = x[i].hmask; - const int8_t * restrict q8_sv = y[i].qs; + const uint8_t * GGML_RESTRICT q3_sv = x[i].qs; + const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -5705,9 +5705,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -5791,8 +5791,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -5896,8 +5896,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales aux = (const uint32_t *)x[i].scales; @@ -6030,9 +6030,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Process blocks with SIMD int8_t * a = aux8; @@ -6119,9 +6119,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(aux, x[i].scales, 12); utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); @@ -6261,8 +6261,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed int vsumi6 = v0; vector signed int vsumi7 = v0; - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/128; ++j) { __builtin_prefetch(q3, 0, 1); @@ -6375,8 +6375,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); __m128i scales128 = lsx_set_w( @@ -6461,11 +6461,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; @@ -6508,7 +6508,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r } -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -6516,8 +6516,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -6552,8 +6552,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const int vector_length = ggml_cpu_get_sve_cnt()*8; const svuint8_t m4b = svdup_n_u8(0xf); @@ -6640,8 +6640,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -6679,8 +6679,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Process scales and mins memcpy(utmp, x[i].scales, 12); @@ -6692,7 +6692,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r // Sum mins * q8sums int32_t sumi = 0; - const int16_t * restrict q8sums = y[i].bsums; + const int16_t * GGML_RESTRICT q8sums = y[i].bsums; const uint8_t * m = (const uint8_t *)&utmp[2]; for (int j = 0; j < 16; j += 2) { sumi += (q8sums[j] + q8sums[j+1]) * m[j/2]; @@ -6791,8 +6791,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -6850,8 +6850,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -6951,8 +6951,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vl = 32; @@ -7053,8 +7053,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/64; j+=2) { __builtin_prefetch(q4, 0, 1); @@ -7145,8 +7145,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); @@ -7228,8 +7228,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]); const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict x0 = x[i].qs; - const int8_t * restrict y0 = y[i].qs; + const uint8_t * GGML_RESTRICT x0 = x[i].qs; + const int8_t * GGML_RESTRICT y0 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -7277,10 +7277,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); a += 32; @@ -7323,7 +7323,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -7331,8 +7331,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -7374,9 +7374,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -7421,8 +7421,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float summs = 0.f; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); @@ -7505,8 +7505,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -7597,9 +7597,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Process scales and mins memcpy(utmp, x[i].scales, 12); @@ -7611,7 +7611,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r // Sum mins * q8sums int32_t sumi_mins = 0; - const int16_t * restrict q8sums = y[i].bsums; + const int16_t * GGML_RESTRICT q8sums = y[i].bsums; const uint8_t * m = (const uint8_t *)&utmp[2]; for (int j = 0; j < 16; j += 2) { sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2]; @@ -7715,9 +7715,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vl = 8; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; @@ -7856,8 +7856,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/64; ++j) { __builtin_prefetch(q5, 0, 1); @@ -7929,8 +7929,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); @@ -8039,9 +8039,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]; const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict x0l = x[i].qs; - const uint8_t * restrict x0h = x[i].qh; - const int8_t * restrict y0 = y[i].qs; + const uint8_t * GGML_RESTRICT x0l = x[i].qs; + const uint8_t * GGML_RESTRICT x0h = x[i].qh; + const int8_t * GGML_RESTRICT y0 = y[i].qs; v_xh[0] = vec_xl(0 , x0h); v_xh[1] = vec_xl(16, x0h); @@ -8094,11 +8094,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -8145,7 +8145,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -8153,8 +8153,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -8174,11 +8174,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); @@ -8265,9 +8265,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -8343,9 +8343,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // handle the q6_k -32 offset separately using bsums const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums); @@ -8444,8 +8444,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { // Unpack 6-bit quantized data into aux8 (unchanged) - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; int8_t * a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -8459,8 +8459,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r qh += 32; } - const int8_t * restrict a_ptr = aux8; - const int8_t * restrict q8 = y[i].qs; + const int8_t * GGML_RESTRICT a_ptr = aux8; + const int8_t * GGML_RESTRICT q8 = y[i].qs; v128_t acc0 = wasm_i32x4_splat(0); v128_t acc1 = wasm_i32x4_splat(0); @@ -8523,11 +8523,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; size_t vl; @@ -8629,10 +8629,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed int vsumi6 = v0; vector signed int vsumi7 = v0; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict qs = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT qs = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/128; ++j) { __builtin_prefetch(q6, 0, 0); @@ -8748,9 +8748,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; @@ -8816,11 +8816,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict x0l = x[i].ql; - const uint8_t * restrict x0h = x[i].qh; - const int8_t * restrict y0 = y[i].qs; + const uint8_t * GGML_RESTRICT x0l = x[i].ql; + const uint8_t * GGML_RESTRICT x0h = x[i].qh; + const int8_t * GGML_RESTRICT y0 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); @@ -8931,11 +8931,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; @@ -9003,7 +9003,7 @@ static const int8_t keven_signs_q2xs[1024] = { }; #endif -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -9011,8 +9011,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void UNUSED(by); UNUSED(bs); - const block_iq2_xxs * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -9030,8 +9030,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void float sumf = 0; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; float sumf1 = 0, sumf2 = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { q8b = ggml_vld1q_s8_x4(q8); q8 += 64; @@ -9067,8 +9067,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -9108,8 +9108,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); @@ -9173,8 +9173,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/32; j += 2) { __builtin_prefetch(q2, 0, 1); @@ -9250,8 +9250,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = __lasx_xvldi(0); __m256i sumi2 = __lasx_xvldi(0); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -9291,8 +9291,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void // // for (int i = 0; i < nb; ++i) { // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; -// const uint16_t * restrict q2 = x[i].qs; -// const int8_t * restrict q8 = y[i].qs; +// const uint16_t * GGML_RESTRICT q2 = x[i].qs; +// const int8_t * GGML_RESTRICT q8 = y[i].qs; // // float sumf1 = 0, sumf2 = 0; // @@ -9340,8 +9340,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void float sumf = 0.f; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { memcpy(aux32, q2, 2*sizeof(uint32_t)); @@ -9364,7 +9364,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -9372,8 +9372,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * UNUSED(by); UNUSED(bs); - const block_iq2_xs * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -9390,8 +9390,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint8x8_t scales8 = vld1_u8(x[i].scales); const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); const uint8x8_t scales_h = vshr_n_u8(scales8, 4); @@ -9468,8 +9468,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); __m128i stmp = _mm_set1_epi64x(aux64); @@ -9589,8 +9589,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); __m128i stmp = _mm_set1_epi64x(aux64); @@ -9744,8 +9744,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); __m128i stmp = __lsx_vreplgr2vr_d(aux64); @@ -9842,9 +9842,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint16_t * restrict q2 = x[i].qs; - const uint8_t * restrict sc = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/64; ++j) { __builtin_prefetch(q2, 0, 1); @@ -9914,9 +9914,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * float sumf = 0.f; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const uint8_t * restrict sc = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; @@ -9949,7 +9949,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * #endif } -void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -9957,8 +9957,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * UNUSED(by); UNUSED(bs); - const block_iq2_s * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -9984,10 +9984,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; int sumi1 = 0, sumi2 = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -10058,10 +10058,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); @@ -10131,10 +10131,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); @@ -10229,11 +10229,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint8_t * restrict q2 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const uint8_t * restrict sc = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/32; j += 2) { __builtin_prefetch(q2, 0, 1); @@ -10330,10 +10330,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i tmp1; memcpy(&aux64, x[i].scales, 8); @@ -10427,7 +10427,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * } -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -10435,8 +10435,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void UNUSED(by); UNUSED(bs); - const block_iq3_xxs * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -10452,9 +10452,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void float sumf = 0; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; float sumf1 = 0, sumf2 = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { q8b = ggml_vld1q_s8_x4(q8); q8 += 64; @@ -10490,9 +10490,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -10535,9 +10535,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); @@ -10604,9 +10604,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint8_t * restrict q3 = x[i].qs; - const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4); + const int8_t * GGML_RESTRICT q8 = y[i].qs; #pragma GCC unroll 1 for (int j = 0; j < QK_K/32; j += 2) { @@ -10678,9 +10678,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = __lasx_xvldi(0); __m256i sumi2 = __lasx_xvldi(0); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -10723,9 +10723,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void float sumf = 0.f; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); @@ -10750,7 +10750,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -10758,8 +10758,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * UNUSED(by); UNUSED(bs); - const block_iq3_s * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -10796,10 +10796,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(scales32, x[i].scales, 4); scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; @@ -10878,10 +10878,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -10963,10 +10963,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); @@ -11064,11 +11064,11 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].signs); - const uint8_t * restrict sc = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs); + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vector signed int vsumi0 = v0; vector signed int vsumi1 = v0; @@ -11175,10 +11175,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = __lasx_xvldi(0); __m256i sumi2 = __lasx_xvldi(0); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -11236,10 +11236,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * float sumf = 0.f; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint8_t * restrict signs = x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; @@ -11291,7 +11291,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { } #endif -void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -11299,8 +11299,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void UNUSED(by); UNUSED(bs); - const block_iq1_s * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -11458,10 +11458,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void vector signed int vsumi3 = vec_splats((int32_t)0); vector signed int vsumi8 = vec_splats((int32_t)0); - const uint8_t * restrict q1 = x[i].qs; - const uint16_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - const int16_t * restrict qs = y[i].bsums; + const uint8_t * GGML_RESTRICT q1 = x[i].qs; + const uint16_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const int16_t * GGML_RESTRICT qs = y[i].bsums; for (int j = 0; j < QK_K/32; j += 2) { __builtin_prefetch(q1, 0, 1); @@ -11622,7 +11622,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -11630,8 +11630,8 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void UNUSED(by); UNUSED(bs); - const block_iq1_m * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq1_m * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -11912,7 +11912,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -11921,8 +11921,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * assert(n % QK4_NL == 0); static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); - const block_iq4_nl * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; const int nb = n / QK4_NL; @@ -12097,8 +12097,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * const uint8x16_t v_m = vec_splat_u8(0x0F); for (; ib < nb; ++ib) { - const block_iq4_nl * restrict x0 = &x[ib]; - const block_q8_0 * restrict y0 = &y[ib]; + const block_iq4_nl * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; const uint8x16_t v_x = vec_xl(0, x0->qs); int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); @@ -12126,7 +12126,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * *s = sumf; } -void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -12134,8 +12134,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * UNUSED(bs); assert(n % QK_K == 0); - const block_iq4_xs * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -12292,9 +12292,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * uint16_t h = x[ibl].scales_h; - const uint8_t * restrict q4 = x[ibl].qs; - const uint8_t * restrict sc = x[ibl].scales_l; - const int8_t * restrict q8 = y[ibl].qs; + const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; + const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l; + const int8_t * GGML_RESTRICT q8 = y[ibl].qs; for (int ib = 0; ib < QK_K/64; ib ++ ) { __builtin_prefetch(q4, 0, 1); @@ -12398,8 +12398,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * restrict q4 = x[ibl].qs; - const int8_t * restrict q8 = y[ibl].qs; + const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; + const int8_t * GGML_RESTRICT q8 = y[ibl].qs; uint16_t h = x[ibl].scales_h; @@ -12479,12 +12479,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * // ============================ 4-bit non-linear quants -void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { assert(k % QK4_NL == 0); quantize_row_iq4_nl_ref(x, y, k); } -void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_iq4_xs(x, y, 1, k, NULL); } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 9ab24522c..2a5463fdf 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -247,9 +247,9 @@ typedef pthread_t ggml_thread_t; static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); -static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); -static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = { @@ -1451,7 +1451,7 @@ inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp } } -static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1494,7 +1494,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * *s = sumf; } -static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) { +static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1562,7 +1562,7 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t *s = sumf; } -static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { +static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1606,10 +1606,10 @@ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL]; for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); @@ -1659,7 +1659,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re } } -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1690,7 +1690,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float #endif } -inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) { +inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) const int np = (n & ~(GGML_F16_STEP - 1)); @@ -1722,10 +1722,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const } // xs and vs are byte strides of x and v -inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) { - const float * restrict x[GGML_VEC_MAD_UNROLL]; - const float * restrict v[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL]; for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { x[i] = (const float *) ((const char *) xv + i*xs); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7918388ae..ac918a60d 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -28,7 +28,7 @@ #define UNUSED GGML_UNUSED // reference implementation for deterministic creation of model files -void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) { +void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -65,7 +65,7 @@ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, in } } -void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) { +void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) { const int qk = QK4_1; assert(k % qk == 0); @@ -102,7 +102,7 @@ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, in } } -void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) { +void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -146,7 +146,7 @@ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, in } } -void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) { +void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k) { const int qk = QK5_1; assert(k % qk == 0); @@ -191,7 +191,7 @@ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, in } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) { +void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -217,7 +217,7 @@ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, in } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) { +void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -252,7 +252,7 @@ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, in } } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -272,7 +272,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int6 } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -293,7 +293,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int6 } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -319,7 +319,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int6 } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -346,7 +346,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6 } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -376,8 +376,8 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type, - const float * restrict qw) { +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type, + const float * GGML_RESTRICT qw) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -445,7 +445,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -504,7 +504,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -547,8 +547,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -628,7 +628,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f return scale; } -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -639,7 +639,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) { +void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -709,7 +709,7 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -741,8 +741,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6 } } -static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx3_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -824,7 +824,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f return scale; } -static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) { +static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, const float * quant_weights) { float max = 0; for (int i = 0; i < n; ++i) { max = MAX(max, x[i]); @@ -897,7 +897,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * return sumlx/suml2; } -static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) { +static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) { GGML_ASSERT(quant_weights); assert(k % QK_K == 0); const int nb = k / QK_K; @@ -917,7 +917,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j]; float sigma2 = sumx2/QK_K; for (int j = 0; j < QK_K/16; ++j) { - const float * restrict qw = quant_weights + QK_K * i + 16*j; + const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j; for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l]; scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); @@ -959,7 +959,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } } -size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row); if (!quant_weights) { quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -977,7 +977,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr //========================= 3-bit (de)-quantization -void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) { +void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1053,7 +1053,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in } } -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1067,8 +1067,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6 const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; uint8_t m = 1; memcpy(aux, x[i].scales, 12); @@ -1103,7 +1103,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6 } } -static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) { +static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t n_per_row, const float * GGML_RESTRICT quant_weights) { assert(n_per_row % QK_K == 0); const int nb = n_per_row / QK_K; @@ -1187,7 +1187,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri } } -size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row); if (!quant_weights) { quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -1205,7 +1205,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) { +void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1277,7 +1277,7 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1301,7 +1301,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6 } } -static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -1374,7 +1374,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri } } -size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row); if (!quant_weights) { quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -1392,7 +1392,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) { +void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -1454,8 +1454,8 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -1479,7 +1479,7 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -1506,7 +1506,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6 } } -static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -1573,8 +1573,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -1599,7 +1599,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri } } -size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row); if (!quant_weights) { quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -1617,7 +1617,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) { +void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -1667,8 +1667,8 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { const uint8_t q1 = L[j + l + 0] & 0xF; @@ -1687,16 +1687,16 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT ql = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT sc = x[i].scales; for (int n = 0; n < QK_K; n += 128) { for (int l = 0; l < 32; ++l) { @@ -1718,7 +1718,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6 } } -static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q6_K_impl(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -1781,8 +1781,8 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { const uint8_t q1 = L[j + l + 0] & 0xF; @@ -1802,7 +1802,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri } } -size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row); if (!quant_weights) { quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -1818,7 +1818,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { static_assert(QK4_0 == 32, "QK4_0 must be 32"); if (!quant_weights) { @@ -1846,7 +1846,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri } } -size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); @@ -1861,7 +1861,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q4_1_impl(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { static_assert(QK4_1 == 32, "QK4_1 must be 32"); if (!quant_weights) { @@ -1891,7 +1891,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri } } -size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row); @@ -1906,7 +1906,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q5_0_impl(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { static_assert(QK5_0 == 32, "QK5_0 must be 32"); if (!quant_weights) { @@ -1945,7 +1945,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri } } -size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row); @@ -1960,7 +1960,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q5_1_impl(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { static_assert(QK5_1 == 32, "QK5_1 must be 32"); if (!quant_weights) { @@ -1998,7 +1998,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri } } -size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row); @@ -2013,7 +2013,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { (void)quant_weights; // not used const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row); quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row); @@ -2022,7 +2022,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) -void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) { +void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2088,7 +2088,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, } } -void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) { +void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2120,21 +2120,21 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, } } -size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { (void)quant_weights; // not used const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row); quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * row_size; } -size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { (void)quant_weights; // not used const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row); quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * row_size; } -void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2173,7 +2173,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in } } -void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2194,7 +2194,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in // ====================== "True" 2-bit (de)-quantization -void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2222,7 +2222,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y // ====================== 2.3125 bpw (de)-quantization -void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2249,7 +2249,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, // ====================== 2.5625 bpw (de)-quantization -void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2281,7 +2281,7 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in // ====================== 3.0625 bpw (de)-quantization -void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2313,7 +2313,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y // ====================== 3.3125 bpw (de)-quantization -void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2356,7 +2356,7 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in // ====================== 1.5625 bpw (de)-quantization -void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2381,7 +2381,7 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in } } -void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2433,7 +2433,7 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; -void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK4_NL == 0); const int64_t nb = k / QK4_NL; @@ -2451,7 +2451,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, } } -void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2476,7 +2476,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, //===================================== Q8_K ============================================== -void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) { +void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2515,7 +2515,7 @@ void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, in } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2927,8 +2927,8 @@ void iq2xs_free_impl(enum ggml_type type) { } } -static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid, - const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) { +static int iq2_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid, + const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) { int num_neighbors = neighbours[0]; GGML_ASSERT(num_neighbors > 0); float best_d2 = FLT_MAX; @@ -2951,7 +2951,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { +static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS); @@ -3124,7 +3124,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict } } -static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { +static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS); @@ -3304,7 +3304,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v } } -size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -3316,7 +3316,7 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t return nrow * nblock * sizeof(block_iq2_xxs); } -size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq2_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -3521,8 +3521,8 @@ void iq3xs_free_impl(int grid_size) { } } -static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid, - const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) { +static int iq3_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint32_t * GGML_RESTRICT grid, + const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) { int num_neighbors = neighbours[0]; GGML_ASSERT(num_neighbors > 0); float best_d2 = FLT_MAX; @@ -3545,8 +3545,8 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n, - const float * restrict quant_weights) { +static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, + const float * GGML_RESTRICT quant_weights) { const int gindex = iq3_data_index(grid_size); @@ -3758,7 +3758,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v } } -size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -3770,13 +3770,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t return nrow * nblock * sizeof(block_iq3_xxs); } -void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) { +void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_row_iq3_xxs_impl(256, x, y, k, NULL); } -static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n, - const float * restrict quant_weights, +static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int n, + const float * GGML_RESTRICT quant_weights, float * scales, float * weight, float * xval, @@ -3958,7 +3958,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo } #define IQ3S_BLOCK_SIZE 32 -size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq3_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; float scales[QK_K/IQ3S_BLOCK_SIZE]; @@ -3980,7 +3980,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n return nrow * nblock * sizeof(block_iq3_s); } -void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) { +void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_iq3_s(x, y, 1, k, NULL); } @@ -3988,8 +3988,8 @@ void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, // =================================== 1.5 bpw =================================================== -static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid, - const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) { +static int iq1_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid, + const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float * scale, int8_t * GGML_RESTRICT L, int ngrid) { int num_neighbors = neighbours[0]; GGML_ASSERT(num_neighbors > 0); float best_score = -FLT_MAX; @@ -4048,8 +4048,8 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid, - const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) { +static int iq1_find_best_neighbour2(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid, + const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, const float * GGML_RESTRICT xg, int8_t * GGML_RESTRICT L, int ngrid) { int num_neighbors = neighbours[0]; GGML_ASSERT(num_neighbors > 0); float best_score = FLT_MAX; @@ -4113,7 +4113,7 @@ static int iq1_sort_helper(const void * left, const void * right) { #define IQ1S_BLOCK_SIZE 32 #define IQ1M_BLOCK_SIZE 16 -static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, +static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights, float * scales, float * weight, float * sumx, @@ -4271,7 +4271,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq1_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); float scales[QK_K/IQ1S_BLOCK_SIZE]; float weight[IQ1S_BLOCK_SIZE]; @@ -4291,7 +4291,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n return nrow * nblock * sizeof(block_iq1_s); } -static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, +static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights, float * scales, float * weight, float * pairs, @@ -4539,7 +4539,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); float scales[QK_K/IQ1M_BLOCK_SIZE]; float weight[IQ1M_BLOCK_SIZE]; @@ -4570,7 +4570,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) { return x - val[mu-1] < val[mu] - x ? mu-1 : mu; } -static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x, +static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x, ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, float * scales, float * weight, uint8_t * L, const int8_t * values, @@ -4681,7 +4681,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } -size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq4_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK4_NL == 0); int64_t nblock = n_per_row/QK4_NL; char * qrow = (char *)dst; @@ -4703,8 +4703,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t return nrow * nblock * sizeof(block_iq4_nl); } -//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) { -void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) { +//void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { +void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k) { GGML_ASSERT(k%QK4_NL == 0); int64_t nblock = k/QK4_NL; uint8_t L[QK4_NL]; @@ -4719,7 +4719,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y } } -size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq4_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -4739,14 +4739,14 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t return nrow * nblock * sizeof(block_iq4_xs); } -void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) { +void quantize_row_iq4_xs_ref(const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_iq4_xs(x, y, 1, k, NULL); } // =============================== 2.5625 bpw -static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { +static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ2_S); @@ -4914,7 +4914,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -4926,7 +4926,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n return nrow * nblock * sizeof(block_iq2_s); } -void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) { +void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_iq2_s(x, y, 1, k, NULL); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7fc06724e..084240331 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -565,9 +565,9 @@ FILE * ggml_fopen(const char * fname, const char * mode) { #endif } -static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); -static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); -static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 028a64794..de4458696 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -4,6 +4,7 @@ #include #include +#include #if __cplusplus >= 202000L #define LU8(x) (const char*)(u8##x) diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 1ed688e3b..1ce0850ec 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -6,6 +6,7 @@ #include #include +#include struct llama_kv_cell { llama_pos pos = -1; diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index b716630a8..c13b3755a 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef __has_include #if __has_include() diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 163ff64f7..a708d8b88 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -16,6 +16,7 @@ #include #include #include +#include // // helpers