From 6eec06081b71f6ef55c3deb2114df39ec52064b0 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 19 Apr 2023 17:10:58 +0200 Subject: [PATCH 1/4] Q4_2 quantization with rmse-optimized scale and quants For quantize-stats we get q4_2: rmse 0.00159301, maxerr 0.17480469, 95pct<0.0030, median<0.0012 For 7B perplexity with BLAS enabled we get 6.2038 after 655 chunks. Quantization is slow (~90 seconds on my Mac for 7B) as not multi-threaded as in PR #896. --- ggml.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index 13c1548fee895..8792dd638e895 100644 --- a/ggml.c +++ b/ggml.c @@ -19,6 +19,7 @@ #include #include #include +#include // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 @@ -1123,12 +1124,94 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r } } +inline int nearestInt(float fval) { + assert(fval <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +static float kQuantizeWithBounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates, + const float * restrict candidates, int8_t * restrict L) { + assert (nmin >= INT8_MIN); + assert (nmax <= INT8_MAX); + float amax = 0; + for (int i=0; i sumlxM2*suml2P) { + if (sumlxP2 > best*suml2P) { + best = sumlxP2/suml2P; bestScale = iscale; + } + } else { + if (sumlxM2 > best*suml2M) { + best = sumlxM2/suml2M; bestScale = -iscale; + } + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i Date: Wed, 19 Apr 2023 19:18:28 +0300 Subject: [PATCH 2/4] ggml : satisfy the sanitizer builds Not sure why this makes them fail --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 8792dd638e895..5177baeb42b31 100644 --- a/ggml.c +++ b/ggml.c @@ -1124,7 +1124,7 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r } } -inline int nearestInt(float fval) { +static inline int nearestInt(float fval) { assert(fval <= 4194303.f); float val = fval + 12582912.f; int i; memcpy(&i, &val, sizeof(int)); From 49beb2cdb8af54202ec122e3e07789a46d519dae Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 19 Apr 2023 18:46:44 +0200 Subject: [PATCH 3/4] Better follow ggml conventions for function names --- ggml.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 5177baeb42b31..dd150718e1b03 100644 --- a/ggml.c +++ b/ggml.c @@ -1124,14 +1124,14 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r } } -static inline int nearestInt(float fval) { +static inline int nearest_int(float fval) { assert(fval <= 4194303.f); float val = fval + 12582912.f; int i; memcpy(&i, &val, sizeof(int)); return (i & 0x007fffff) - 0x00400000; } -static float kQuantizeWithBounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates, +static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates, const float * restrict candidates, int8_t * restrict L) { assert (nmin >= INT8_MIN); assert (nmax <= INT8_MAX); @@ -1147,7 +1147,7 @@ static float kQuantizeWithBounds(int n, int nmin, int nmax, const float * restri float sumlxP = 0; int suml2P = 0; float sumlxM = 0; int suml2M = 0; for (int i=0; i Date: Wed, 19 Apr 2023 18:57:07 +0200 Subject: [PATCH 4/4] Fixed type as per reviewer comment --- ggml.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index dd150718e1b03..5f424ec96ab4a 100644 --- a/ggml.c +++ b/ggml.c @@ -1177,8 +1177,8 @@ static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * r } static void quantize_row_q4_2_rmse(const float * restrict x, block_q4_2 * restrict y, int k) { -#define kCandiateCount 8 - static const float candidates[kCandiateCount] = { +8.7f, +8.3f, +8.1f, +7.8f, +7.3f, +7.0f, +6.3f, +5.7f }; +#define CANDIDATE_COUNT 8 + static const float candidates[CANDIDATE_COUNT] = { +8.7f, +8.3f, +8.1f, +7.8f, +7.3f, +7.0f, +6.3f, +5.7f }; assert(k % QK4_2 == 0); int8_t L[QK4_2]; @@ -1187,7 +1187,7 @@ static void quantize_row_q4_2_rmse(const float * restrict x, block_q4_2 * restri for (int i = 0; i < nb; i++) { - float scale = kquantize_q4_with_bounds(QK4_2, -8, 7, x, kCandiateCount, candidates, L); + float scale = kquantize_q4_with_bounds(QK4_2, -8, 7, x, CANDIDATE_COUNT, candidates, L); y[i].d = GGML_FP32_TO_FP16(scale); for (int l = 0; l < QK4_2; l += 2) {