@@ -2020,8 +2020,9 @@ struct llama_layer {
struct ggml_tensor * ffn_up_shexp;

// ff bias
- struct ggml_tensor * ffn_down_b; // b2
- struct ggml_tensor * ffn_up_b; // b3
+ struct ggml_tensor * ffn_gate_b = nullptr;
+ struct ggml_tensor * ffn_down_b = nullptr; // b2
+ struct ggml_tensor * ffn_up_b = nullptr; // b3
struct ggml_tensor * ffn_act;

// mamba proj
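
The `= nullptr` default initializers are what keep the new biases optional: downstream code can branch on the pointer itself, and models whose GGUF files carry no FFN biases keep behaving as before. A minimal, self-contained sketch of that pattern (the `tensor`/`layer_sketch` types here are stand-ins, not the real llama.cpp structs):

```cpp
#include <cstdio>

struct tensor {};                        // stand-in for ggml_tensor

struct layer_sketch {
    tensor * ffn_gate_b = nullptr;       // optional: stays null unless the loader fills it
    tensor * ffn_down_b = nullptr;
    tensor * ffn_up_b   = nullptr;
};

static void report(const layer_sketch & l) {
    std::printf("ffn_up bias: %s\n", l.ffn_up_b ? "present" : "absent");
}

int main() {
    layer_sketch plain;                  // no biases loaded -> all pointers stay null
    report(plain);                       // prints "ffn_up bias: absent"

    tensor b;
    layer_sketch granite;
    granite.ffn_up_b = &b;               // loader found a bias tensor for this layer
    report(granite);                     // prints "ffn_up bias: present"
}
```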
@@ -4051,7 +4052,9 @@ static void llm_load_hparams(
switch (hparams.n_layer) {
case 22: model.type = e_model::MODEL_1B; break;
case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+ // granite uses a vocab with len 49152
+ case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+ case 36: model.type = e_model::MODEL_8B; break; // granite
case 40: model.type = e_model::MODEL_13B; break;
case 48: model.type = e_model::MODEL_34B; break;
case 60: model.type = e_model::MODEL_30B; break;
@@ -4321,6 +4324,8 @@ static void llm_load_hparams(
case 30: model.type = e_model::MODEL_3B; break;
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_15B; break;
+ case 52: model.type = e_model::MODEL_20B; break; // granite
+ case 88: model.type = e_model::MODEL_34B; break; // granite
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -4583,6 +4588,11 @@ static void llm_load_vocab(
} else {
if (tokenizer_model == "gpt2") {
vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+ if (add_space_prefix_keyidx != -1) {
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+ }
} else {
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -5213,6 +5223,11 @@ static bool llm_load_tensors(
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+ // optional MLP bias
+ layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
} else {
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
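
The `TENSOR_NOT_REQUIRED` flag is what lets older LLaMA GGUF files (which have no FFN biases) keep loading: an optional tensor that is absent comes back as `nullptr` instead of aborting the load, so the struct defaults from the first hunk are left untouched. A self-contained sketch of that required-vs-optional loader pattern, with stand-in names and illustrative tensor names rather than `llama_model_loader` itself:

```cpp
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

struct tensor {};

enum tensor_flags { TENSOR_REQUIRED = 0, TENSOR_NOT_REQUIRED = 1 };

static tensor * create_tensor(std::map<std::string, tensor> & file,
                              const std::string & name, int flags) {
    auto it = file.find(name);
    if (it != file.end()) {
        return &it->second;                         // tensor present in the model file
    }
    if (flags & TENSOR_NOT_REQUIRED) {
        return nullptr;                             // optional: absence is not an error
    }
    throw std::runtime_error("missing tensor: " + name);  // required: abort the load
}

int main() {
    std::map<std::string, tensor> file = { { "blk.0.ffn_up.weight", {} } };  // illustrative names
    tensor * up   = create_tensor(file, "blk.0.ffn_up.weight", TENSOR_REQUIRED);
    tensor * up_b = create_tensor(file, "blk.0.ffn_up.bias",   TENSOR_NOT_REQUIRED);
    std::printf("up=%p up_b=%p\n", (void *) up, (void *) up_b);  // up_b is null here
}
```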
@@ -7485,9 +7500,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il);

cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);