
Commit 469e75d

Authored by ikawrakow (Iwan Kawrakow), with Georgi Gerganov
llama : restore intended k-quants mixes for MoE models (#4872)
* Restore intended k-quants quantization mixes for MoE models

* Update Q2_K_S values in the quantize tool

Still using LLaMA-v1 PPL values in the quant descriptions today does not make much sense, but let's leave that update for another PR.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 49662cb commit 469e75d
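
Why the "ffn_down.weight" → "ffn_down" change below matters: in MoE models the per-expert down-projection tensors do not end in exactly "ffn_down.weight" (at the time of this commit they carried per-expert suffixes, e.g. "blk.0.ffn_down.0.weight" — that naming is assumed here for illustration, not taken from the diff), so the exact-suffix search skipped them and the intended quant mix was never applied. A minimal standalone sketch of the before/after matching:

#include <iostream>
#include <string>
#include <vector>

int main() {
    // Tensor names for illustration: a dense FFN tensor plus two MoE expert
    // tensors (the per-expert naming scheme is an assumption, not part of
    // this commit).
    const std::vector<std::string> names = {
        "blk.0.ffn_down.weight",   // dense model
        "blk.0.ffn_down.0.weight", // expert 0 of a MoE layer
        "blk.0.ffn_down.1.weight", // expert 1 of a MoE layer
    };
    for (const std::string & name : names) {
        bool before = name.find("ffn_down.weight") != std::string::npos; // old pattern
        bool after  = name.find("ffn_down")        != std::string::npos; // new pattern
        std::cout << name << "  old match: " << before << "  new match: " << after << "\n";
    }
    // Only the dense tensor matches the old pattern; all three match the new
    // one, so expert tensors are both counted (n_feed_forward_w2) and given
    // the intended per-layer quant mix.
    return 0;
}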

3 files changed: +17 -9 lines

Diff for: examples/quantize/quantize.cpp (+1)
@@ -18,6 +18,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
+    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
     { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },

Diff for: llama.cpp (+15 -9)
@@ -2586,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";

         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -8955,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down.weight") != std::string::npos) {
+    } else if (name.find("ffn_down") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8967,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
                 if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
@@ -8992,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
-    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    }
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // This can be used to reduce the size of the Q5_K_S model.
     // The associated PPL increase is fully in line with the size reduction
     //else {
@@ -9043,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9101,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down.weight") != std::string::npos) {
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
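
Aside from the name matching, the recurring edit in this file replaces hard-coded layer counts (i_feed_forward_w2 < 2 or < 4) with fractions of the total (n_feed_forward_w2/16 or /8). Since every expert tensor in a MoE model increments the counter, a fixed cutoff no longer meant "the first couple of layers". A standalone sketch of the effect (the helper function and the 8-expert figure are illustrative assumptions, not the library's code):

#include <iostream>
#include <string>

// Illustrative stand-in for the new fraction-based checks like
// "qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16": up-quantize the first
// slice of ffn_down tensors instead of a fixed count.
static std::string pick_ffn_down_type(int i, int n) {
    return i < n/16 ? "Q5_K" : "Q4_K";
}

int main() {
    // n counts ffn_down tensors seen during quantization. In a MoE model each
    // expert's ffn_down is counted separately, so n scales with the expert
    // count (32 layers x 8 experts below is an assumed example).
    for (int n : {32, 256}) {
        int upgraded = 0;
        for (int i = 0; i < n; ++i) {
            if (pick_ffn_down_type(i, n) == "Q5_K") ++upgraded;
        }
        std::cout << "n_feed_forward_w2 = " << n << ": " << upgraded
                  << " tensors get Q5_K (the old hard-coded '< 2' always gave 2)\n";
    }
    return 0;
}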

Diff for: llama.h (+1)
@@ -105,6 +105,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q6_K    = 18, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_XS  = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S  = 21, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
