@@ -4653,6 +4653,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
+    llama_model model;
+    llm_load_arch(*ml, model);
+    llm_load_hparams(*ml, model, 0, 0, 0);
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4678,6 +4682,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++n_feed_forward_w2;
         }
     }
+    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+    }
 
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
@@ -4769,6 +4777,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
             else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                     (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+            if (model.type == MODEL_70B) {
+                // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+                // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+                // nearly negligible increase in model size by quantizing this tensor with more bits:
+                if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            }
             ++i_attention_wv;
         } else if (name.find("ffn_down.weight") != std::string::npos) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
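
To see why the 70B special case in the last hunk costs almost nothing in file size, the standalone sketch below works through the arithmetic. It is not part of the patch: the dimensions are assumed from the LLaMA-2 70B configuration (n_embd = 8192, n_head = 64, n_head_kv = 8, n_layer = 80), and the bits-per-weight values are approximate k-quant block sizes.

```cpp
// Rough size impact of bumping attn_v.weight from Q3_K to Q5_K on a GQA model.
// Assumed LLaMA-2 70B shape; bits-per-weight figures are approximate.
#include <cstdio>
#include <cstdint>

int main() {
    const int64_t n_embd    = 8192;
    const int64_t n_head    = 64;
    const int64_t n_head_kv = 8;    // 8 query heads share one K/V head (GQA)
    const int64_t n_layer   = 80;

    const int64_t n_embd_gqa = n_embd / (n_head / n_head_kv);   // 1024

    const int64_t n_attn_q = n_embd * n_embd;      // elements in attn_q.weight (per layer)
    const int64_t n_attn_v = n_embd * n_embd_gqa;  // elements in attn_v.weight (per layer)

    const double bpw_q3k = 3.4375;  // approx. bits per weight for Q3_K
    const double bpw_q5k = 5.5;     // approx. bits per weight for Q5_K

    // Extra bytes per layer if attn_v.weight goes Q3_K -> Q5_K
    const double extra_per_layer = n_attn_v * (bpw_q5k - bpw_q3k) / 8.0;

    printf("attn_v.weight is %lldx smaller than attn_q.weight\n",
           (long long)(n_attn_q / n_attn_v));
    printf("Q3_K -> Q5_K on attn_v adds ~%.1f MiB per layer, ~%.2f GiB over %lld layers\n",
           extra_per_layer / (1024.0 * 1024.0),
           extra_per_layer * n_layer / (1024.0 * 1024.0 * 1024.0),
           (long long)n_layer);
    return 0;
}
```

Under these assumptions the upgrade adds on the order of 0.16 GiB to a quantized file whose size is measured in tens of GiB, well under 1%, which is consistent with the "nearly negligible increase in model size" noted in the added comment.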