@@ -2586,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -8955,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down.weight ") != std::string::npos) {
+    } else if (name.find("ffn_down") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8967,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
                 if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
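Note on the new thresholds (assuming one ffn_down tensor per transformer block, as in non-MoE models): a 32-block 7B LLaMA has n_feed_forward_w2 = 32, so n_feed_forward_w2/16 = 2 and n_feed_forward_w2/8 = 4 reproduce the old hard-coded thresholds of 2 and 4 on that model, while scaling proportionally with depth on larger ones.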
@@ -8992,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
-    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    }
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // This can be used to reduce the size of the Q5_K_S model.
     // The associated PPL increase is fully in line with the size reduction
     //else {
@@ -9043,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
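Both Q2_K and the new Q2_K_S map to the same base tensor type, GGML_TYPE_Q2_K; the two ftypes differ only in the per-tensor overrides chosen in get_k_quant_type() above (Q2_K upgrades every ffn_down tensor to Q3_K, Q2_K_S upgrades only the first eighth of them to Q4_K).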
@@ -9101,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down.weight ") != std::string::npos) {
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
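For context, a minimal sketch (not part of this diff) of how the new ftype would be exercised through the public llama.cpp quantization API; the driver and file names are hypothetical:

// quantize_q2ks.cpp -- hypothetical driver, assumes llama.h from this tree
#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q2_K_S; // the ftype added by this change
    params.nthread = 0;                         // <= 0: llama.cpp picks hardware_concurrency()
    // llama_model_quantize_internal() maps Q2_K_S to GGML_TYPE_Q2_K, and
    // get_k_quant_type() then bumps the first 1/8 of ffn_down tensors to Q4_K.
    return llama_model_quantize("model-f16.gguf", "model-Q2_K_S.gguf", &params) == 0 ? 0 : 1;
}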