|
|
|
@ -28,7 +28,7 @@
|
|
|
|
|
#define UNUSED GGML_UNUSED
|
|
|
|
|
|
|
|
|
|
// reference implementation for deterministic creation of model files
|
|
|
|
|
void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
static const int qk = QK4_0;
|
|
|
|
|
|
|
|
|
|
assert(k % qk == 0);
|
|
|
|
@ -65,7 +65,7 @@ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
const int qk = QK4_1;
|
|
|
|
|
|
|
|
|
|
assert(k % qk == 0);
|
|
|
|
@ -102,7 +102,7 @@ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
static const int qk = QK5_0;
|
|
|
|
|
|
|
|
|
|
assert(k % qk == 0);
|
|
|
|
@ -146,7 +146,7 @@ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
const int qk = QK5_1;
|
|
|
|
|
|
|
|
|
|
assert(k % qk == 0);
|
|
|
|
@ -191,7 +191,7 @@ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// reference implementation for deterministic creation of model files
|
|
|
|
|
void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK8_0 == 0);
|
|
|
|
|
const int nb = k / QK8_0;
|
|
|
|
|
|
|
|
|
@ -217,7 +217,7 @@ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// reference implementation for deterministic creation of model files
|
|
|
|
|
void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(QK8_1 == 32);
|
|
|
|
|
assert(k % QK8_1 == 0);
|
|
|
|
|
const int nb = k / QK8_1;
|
|
|
|
@ -252,7 +252,7 @@ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
static const int qk = QK4_0;
|
|
|
|
|
|
|
|
|
|
assert(k % qk == 0);
|
|
|
|
@ -272,7 +272,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int6
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
static const int qk = QK4_1;
|
|
|
|
|
|
|
|
|
|
assert(k % qk == 0);
|
|
|
|
@ -293,7 +293,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int6
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
static const int qk = QK5_0;
|
|
|
|
|
|
|
|
|
|
assert(k % qk == 0);
|
|
|
|
@ -319,7 +319,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int6
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
static const int qk = QK5_1;
|
|
|
|
|
|
|
|
|
|
assert(k % qk == 0);
|
|
|
|
@ -346,7 +346,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
static const int qk = QK8_0;
|
|
|
|
|
|
|
|
|
|
assert(k % qk == 0);
|
|
|
|
@ -376,8 +376,8 @@ static inline int nearest_int(float fval) {
|
|
|
|
|
return (i & 0x007fffff) - 0x00400000;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
|
|
|
|
|
const float * restrict qw) {
|
|
|
|
|
static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
|
|
|
|
|
const float * GGML_RESTRICT qw) {
|
|
|
|
|
float max = 0;
|
|
|
|
|
float amax = 0;
|
|
|
|
|
for (int i = 0; i < n; ++i) {
|
|
|
|
@ -445,7 +445,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
|
|
|
return scale;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
|
|
|
|
|
static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
|
|
|
|
|
float max = 0;
|
|
|
|
|
float amax = 0;
|
|
|
|
|
for (int i = 0; i < n; ++i) {
|
|
|
|
@ -504,7 +504,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
|
|
|
return 1/iscale;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
|
|
|
|
|
static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
|
|
|
|
|
int ntry, float alpha) {
|
|
|
|
|
float min = x[0];
|
|
|
|
|
float max = x[0];
|
|
|
|
@ -547,8 +547,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
|
|
|
|
return scale;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
|
|
|
|
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
|
|
|
|
static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
|
|
|
|
|
uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
|
|
|
|
|
float rmin, float rdelta, int nstep, bool use_mad) {
|
|
|
|
|
float min = x[0];
|
|
|
|
|
float max = x[0];
|
|
|
|
@ -628,7 +628,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
|
|
|
|
|
return scale;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
|
|
|
|
|
static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
|
|
|
|
|
if (j < 4) {
|
|
|
|
|
*d = q[j] & 63; *m = q[j + 4] & 63;
|
|
|
|
|
} else {
|
|
|
|
@ -639,7 +639,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
|
|
|
|
|
|
|
|
|
//========================- 2-bit (de)-quantization
|
|
|
|
|
|
|
|
|
|
void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -709,7 +709,7 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -741,8 +741,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
|
|
|
|
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
|
|
|
|
static float make_qkx3_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
|
|
|
|
|
uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
|
|
|
|
|
float rmin, float rdelta, int nstep, bool use_mad) {
|
|
|
|
|
float min = x[0];
|
|
|
|
|
float max = x[0];
|
|
|
|
@ -824,7 +824,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
|
|
|
|
|
return scale;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
|
|
|
|
|
static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, const float * quant_weights) {
|
|
|
|
|
float max = 0;
|
|
|
|
|
for (int i = 0; i < n; ++i) {
|
|
|
|
|
max = MAX(max, x[i]);
|
|
|
|
@ -897,7 +897,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
|
|
|
|
|
return sumlx/suml2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
|
|
|
|
|
static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) {
|
|
|
|
|
GGML_ASSERT(quant_weights);
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int nb = k / QK_K;
|
|
|
|
@ -917,7 +917,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
|
|
|
for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
|
|
|
|
|
float sigma2 = sumx2/QK_K;
|
|
|
|
|
for (int j = 0; j < QK_K/16; ++j) {
|
|
|
|
|
const float * restrict qw = quant_weights + QK_K * i + 16*j;
|
|
|
|
|
const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j;
|
|
|
|
|
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
|
|
|
|
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
|
|
|
|
|
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
|
|
|
@ -959,7 +959,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
|
quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
@ -977,7 +977,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
|
|
|
|
|
//========================= 3-bit (de)-quantization
|
|
|
|
|
|
|
|
|
|
void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -1053,7 +1053,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -1067,8 +1067,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
|
|
|
|
|
|
|
|
|
|
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
|
|
|
|
|
|
|
|
|
const uint8_t * restrict q = x[i].qs;
|
|
|
|
|
const uint8_t * restrict hm = x[i].hmask;
|
|
|
|
|
const uint8_t * GGML_RESTRICT q = x[i].qs;
|
|
|
|
|
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
|
|
|
|
uint8_t m = 1;
|
|
|
|
|
|
|
|
|
|
memcpy(aux, x[i].scales, 12);
|
|
|
|
@ -1103,7 +1103,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
|
|
|
|
|
static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t n_per_row, const float * GGML_RESTRICT quant_weights) {
|
|
|
|
|
assert(n_per_row % QK_K == 0);
|
|
|
|
|
const int nb = n_per_row / QK_K;
|
|
|
|
|
|
|
|
|
@ -1187,7 +1187,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
|
quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
@ -1205,7 +1205,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
|
|
|
|
|
// ====================== 4-bit (de)-quantization
|
|
|
|
|
|
|
|
|
|
void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -1277,7 +1277,7 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -1301,7 +1301,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
assert(n_per_row % QK_K == 0);
|
|
|
|
|
const int64_t nb = n_per_row / QK_K;
|
|
|
|
|
|
|
|
|
@ -1374,7 +1374,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
|
quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
@ -1392,7 +1392,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
|
|
|
|
|
// ====================== 5-bit (de)-quantization
|
|
|
|
|
|
|
|
|
|
void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -1454,8 +1454,8 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint8_t * restrict qh = y[i].qh;
|
|
|
|
|
uint8_t * restrict ql = y[i].qs;
|
|
|
|
|
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
|
|
|
|
uint8_t * GGML_RESTRICT ql = y[i].qs;
|
|
|
|
|
memset(qh, 0, QK_K/8);
|
|
|
|
|
|
|
|
|
|
uint8_t m1 = 1, m2 = 2;
|
|
|
|
@ -1479,7 +1479,7 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -1506,7 +1506,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
assert(n_per_row % QK_K == 0);
|
|
|
|
|
const int64_t nb = n_per_row / QK_K;
|
|
|
|
|
|
|
|
|
@ -1573,8 +1573,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint8_t * restrict qh = y[i].qh;
|
|
|
|
|
uint8_t * restrict ql = y[i].qs;
|
|
|
|
|
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
|
|
|
|
uint8_t * GGML_RESTRICT ql = y[i].qs;
|
|
|
|
|
memset(qh, 0, QK_K/8);
|
|
|
|
|
|
|
|
|
|
uint8_t m1 = 1, m2 = 2;
|
|
|
|
@ -1599,7 +1599,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
|
quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
@ -1617,7 +1617,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
|
|
|
|
|
// ====================== 6-bit (de)-quantization
|
|
|
|
|
|
|
|
|
|
void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -1667,8 +1667,8 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint8_t * restrict ql = y[i].ql;
|
|
|
|
|
uint8_t * restrict qh = y[i].qh;
|
|
|
|
|
uint8_t * GGML_RESTRICT ql = y[i].ql;
|
|
|
|
|
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
|
|
|
|
for (int j = 0; j < QK_K; j += 128) {
|
|
|
|
|
for (int l = 0; l < 32; ++l) {
|
|
|
|
|
const uint8_t q1 = L[j + l + 0] & 0xF;
|
|
|
|
@ -1687,16 +1687,16 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < nb; i++) {
|
|
|
|
|
const float d = GGML_FP16_TO_FP32(x[i].d);
|
|
|
|
|
|
|
|
|
|
const uint8_t * restrict ql = x[i].ql;
|
|
|
|
|
const uint8_t * restrict qh = x[i].qh;
|
|
|
|
|
const int8_t * restrict sc = x[i].scales;
|
|
|
|
|
const uint8_t * GGML_RESTRICT ql = x[i].ql;
|
|
|
|
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
|
|
|
const int8_t * GGML_RESTRICT sc = x[i].scales;
|
|
|
|
|
|
|
|
|
|
for (int n = 0; n < QK_K; n += 128) {
|
|
|
|
|
for (int l = 0; l < 32; ++l) {
|
|
|
|
@ -1718,7 +1718,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static void quantize_row_q6_K_impl(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
assert(n_per_row % QK_K == 0);
|
|
|
|
|
const int64_t nb = n_per_row / QK_K;
|
|
|
|
|
|
|
|
|
@ -1781,8 +1781,8 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint8_t * restrict ql = y[i].ql;
|
|
|
|
|
uint8_t * restrict qh = y[i].qh;
|
|
|
|
|
uint8_t * GGML_RESTRICT ql = y[i].ql;
|
|
|
|
|
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
|
|
|
|
for (int j = 0; j < QK_K; j += 128) {
|
|
|
|
|
for (int l = 0; l < 32; ++l) {
|
|
|
|
|
const uint8_t q1 = L[j + l + 0] & 0xF;
|
|
|
|
@ -1802,7 +1802,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
|
quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
@ -1818,7 +1818,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
return nrow * row_size;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
|
|
|
|
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
@ -1846,7 +1846,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
|
quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
|
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
|
|
|
@ -1861,7 +1861,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
return nrow * row_size;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static void quantize_row_q4_1_impl(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
|
|
|
|
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
@ -1891,7 +1891,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
|
quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
|
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
|
|
|
@ -1906,7 +1906,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
return nrow * row_size;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static void quantize_row_q5_0_impl(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
|
|
|
|
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
@ -1945,7 +1945,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
|
quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
|
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
|
|
|
@ -1960,7 +1960,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
return nrow * row_size;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static void quantize_row_q5_1_impl(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
|
|
|
|
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
@ -1998,7 +1998,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
if (!quant_weights) {
|
|
|
|
|
quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
|
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
|
|
|
@ -2013,7 +2013,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
return nrow * row_size;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
(void)quant_weights; // not used
|
|
|
|
|
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
|
|
|
|
quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
@ -2022,7 +2022,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
|
|
|
|
|
|
|
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
|
|
|
|
|
|
|
|
|
|
void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2088,7 +2088,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2120,21 +2120,21 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
(void)quant_weights; // not used
|
|
|
|
|
const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
|
|
|
|
|
quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
|
return nrow * row_size;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
(void)quant_weights; // not used
|
|
|
|
|
const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
|
|
|
|
|
quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
|
|
|
return nrow * row_size;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2173,7 +2173,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2194,7 +2194,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in
|
|
|
|
|
|
|
|
|
|
// ====================== "True" 2-bit (de)-quantization
|
|
|
|
|
|
|
|
|
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2222,7 +2222,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
|
|
|
|
|
|
|
|
|
|
// ====================== 2.3125 bpw (de)-quantization
|
|
|
|
|
|
|
|
|
|
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2249,7 +2249,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
|
|
|
|
|
|
|
|
// ====================== 2.5625 bpw (de)-quantization
|
|
|
|
|
|
|
|
|
|
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2281,7 +2281,7 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
|
|
|
|
|
|
|
|
|
|
// ====================== 3.0625 bpw (de)-quantization
|
|
|
|
|
|
|
|
|
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2313,7 +2313,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
|
|
|
|
|
|
|
|
|
// ====================== 3.3125 bpw (de)-quantization
|
|
|
|
|
|
|
|
|
|
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2356,7 +2356,7 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
|
|
|
|
|
|
|
|
|
// ====================== 1.5625 bpw (de)-quantization
|
|
|
|
|
|
|
|
|
|
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2381,7 +2381,7 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2433,7 +2433,7 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
|
|
|
|
|
|
|
|
|
|
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
|
|
|
|
|
|
|
|
|
void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK4_NL == 0);
|
|
|
|
|
const int64_t nb = k / QK4_NL;
|
|
|
|
|
|
|
|
|
@ -2451,7 +2451,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2476,7 +2476,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
|
|
|
|
|
|
|
|
|
//===================================== Q8_K ==============================================
|
|
|
|
|
|
|
|
|
|
void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2515,7 +2515,7 @@ void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, in
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
|
|
|
|
|
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
const int64_t nb = k / QK_K;
|
|
|
|
|
|
|
|
|
@ -2927,8 +2927,8 @@ void iq2xs_free_impl(enum ggml_type type) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
|
|
|
|
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
|
|
|
|
static int iq2_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
|
|
|
|
|
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
|
|
|
|
|
int num_neighbors = neighbours[0];
|
|
|
|
|
GGML_ASSERT(num_neighbors > 0);
|
|
|
|
|
float best_d2 = FLT_MAX;
|
|
|
|
@ -2951,7 +2951,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
|
|
|
return grid_index;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
|
|
|
|
static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
|
|
|
|
|
|
|
|
|
|
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
|
|
|
|
|
|
|
|
|
@ -3124,7 +3124,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
|
|
|
|
static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
|
|
|
|
|
|
|
|
|
|
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
|
|
|
|
|
|
|
|
|
@ -3304,7 +3304,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
|
|
|
int64_t nblock = n_per_row/QK_K;
|
|
|
|
|
char * qrow = (char *)dst;
|
|
|
|
@ -3316,7 +3316,7 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t
|
|
|
|
|
return nrow * nblock * sizeof(block_iq2_xxs);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_iq2_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
|
|
|
int64_t nblock = n_per_row/QK_K;
|
|
|
|
|
char * qrow = (char *)dst;
|
|
|
|
@ -3521,8 +3521,8 @@ void iq3xs_free_impl(int grid_size) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
|
|
|
|
|
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
|
|
|
|
static int iq3_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint32_t * GGML_RESTRICT grid,
|
|
|
|
|
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
|
|
|
|
|
int num_neighbors = neighbours[0];
|
|
|
|
|
GGML_ASSERT(num_neighbors > 0);
|
|
|
|
|
float best_d2 = FLT_MAX;
|
|
|
|
@ -3545,8 +3545,8 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
|
|
|
return grid_index;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
|
|
|
|
|
const float * restrict quant_weights) {
|
|
|
|
|
static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n,
|
|
|
|
|
const float * GGML_RESTRICT quant_weights) {
|
|
|
|
|
|
|
|
|
|
const int gindex = iq3_data_index(grid_size);
|
|
|
|
|
|
|
|
|
@ -3758,7 +3758,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
|
|
|
int64_t nblock = n_per_row/QK_K;
|
|
|
|
|
char * qrow = (char *)dst;
|
|
|
|
@ -3770,13 +3770,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
|
|
|
|
|
return nrow * nblock * sizeof(block_iq3_xxs);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
|
|
|
|
|
const float * restrict quant_weights,
|
|
|
|
|
static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int n,
|
|
|
|
|
const float * GGML_RESTRICT quant_weights,
|
|
|
|
|
float * scales,
|
|
|
|
|
float * weight,
|
|
|
|
|
float * xval,
|
|
|
|
@ -3958,7 +3958,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#define IQ3S_BLOCK_SIZE 32
|
|
|
|
|
size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_iq3_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
|
|
|
int64_t nblock = n_per_row/QK_K;
|
|
|
|
|
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
|
|
|
@ -3980,7 +3980,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
|
|
|
|
|
return nrow * nblock * sizeof(block_iq3_s);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
quantize_iq3_s(x, y, 1, k, NULL);
|
|
|
|
|
}
|
|
|
|
@ -3988,8 +3988,8 @@ void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y,
|
|
|
|
|
|
|
|
|
|
// =================================== 1.5 bpw ===================================================
|
|
|
|
|
|
|
|
|
|
static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
|
|
|
|
const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
|
|
|
|
|
static int iq1_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
|
|
|
|
|
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float * scale, int8_t * GGML_RESTRICT L, int ngrid) {
|
|
|
|
|
int num_neighbors = neighbours[0];
|
|
|
|
|
GGML_ASSERT(num_neighbors > 0);
|
|
|
|
|
float best_score = -FLT_MAX;
|
|
|
|
@ -4048,8 +4048,8 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
|
|
|
return grid_index;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
|
|
|
|
const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
|
|
|
|
|
static int iq1_find_best_neighbour2(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
|
|
|
|
|
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, const float * GGML_RESTRICT xg, int8_t * GGML_RESTRICT L, int ngrid) {
|
|
|
|
|
int num_neighbors = neighbours[0];
|
|
|
|
|
GGML_ASSERT(num_neighbors > 0);
|
|
|
|
|
float best_score = FLT_MAX;
|
|
|
|
@ -4113,7 +4113,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
|
|
|
|
|
|
|
|
|
|
#define IQ1S_BLOCK_SIZE 32
|
|
|
|
|
#define IQ1M_BLOCK_SIZE 16
|
|
|
|
|
static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
|
|
|
|
|
static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
|
|
|
|
|
float * scales,
|
|
|
|
|
float * weight,
|
|
|
|
|
float * sumx,
|
|
|
|
@ -4271,7 +4271,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_iq1_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
|
|
|
float scales[QK_K/IQ1S_BLOCK_SIZE];
|
|
|
|
|
float weight[IQ1S_BLOCK_SIZE];
|
|
|
|
@ -4291,7 +4291,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n
|
|
|
|
|
return nrow * nblock * sizeof(block_iq1_s);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
|
|
|
|
|
static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
|
|
|
|
|
float * scales,
|
|
|
|
|
float * weight,
|
|
|
|
|
float * pairs,
|
|
|
|
@ -4539,7 +4539,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
|
|
|
float scales[QK_K/IQ1M_BLOCK_SIZE];
|
|
|
|
|
float weight[IQ1M_BLOCK_SIZE];
|
|
|
|
@ -4570,7 +4570,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|
|
|
|
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
|
|
|
|
|
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
|
|
|
|
|
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
|
|
|
|
float * scales, float * weight, uint8_t * L,
|
|
|
|
|
const int8_t * values,
|
|
|
|
@ -4681,7 +4681,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_iq4_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
|
|
|
|
int64_t nblock = n_per_row/QK4_NL;
|
|
|
|
|
char * qrow = (char *)dst;
|
|
|
|
@ -4703,8 +4703,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
|
|
|
|
|
return nrow * nblock * sizeof(block_iq4_nl);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) {
|
|
|
|
|
void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
|
|
|
|
|
//void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
|
|
|
void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
GGML_ASSERT(k%QK4_NL == 0);
|
|
|
|
|
int64_t nblock = k/QK4_NL;
|
|
|
|
|
uint8_t L[QK4_NL];
|
|
|
|
@ -4719,7 +4719,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_iq4_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
|
|
|
int64_t nblock = n_per_row/QK_K;
|
|
|
|
|
char * qrow = (char *)dst;
|
|
|
|
@ -4739,14 +4739,14 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
|
|
|
|
|
return nrow * nblock * sizeof(block_iq4_xs);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_iq4_xs_ref(const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
quantize_iq4_xs(x, y, 1, k, NULL);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// =============================== 2.5625 bpw
|
|
|
|
|
|
|
|
|
|
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
|
|
|
|
static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
|
|
|
|
|
|
|
|
|
|
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
|
|
|
|
|
|
|
|
@ -4914,7 +4914,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
size_t quantize_iq2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
|
|
|
int64_t nblock = n_per_row/QK_K;
|
|
|
|
|
char * qrow = (char *)dst;
|
|
|
|
@ -4926,7 +4926,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
|
|
|
|
|
return nrow * nblock * sizeof(block_iq2_s);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
|
|
|
|
|
void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k) {
|
|
|
|
|
assert(k % QK_K == 0);
|
|
|
|
|
quantize_iq2_s(x, y, 1, k, NULL);
|
|
|
|
|
}
|
|
|
|
|