Skip to content

Commit 28d4a7f

Browse files
authored
Merge pull request #8 from OpenBMB/master
sync master
2 parents 8bd47ce + 5442939 commit 28d4a7f

File tree

2 files changed

+34
-8
lines changed

2 files changed

+34
-8
lines changed

convert-hf-to-gguf.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,17 @@ def set_gguf_parameters(self):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1293,9 +1304,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
-        if name.endswith("q_proj.weight"):
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately

llama.cpp

+21-6
Original file line numberDiff line numberDiff line change
@@ -2020,8 +2020,9 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_shexp;
 
     // ff bias
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b;   // b3
+    struct ggml_tensor * ffn_gate_b = nullptr;
+    struct ggml_tensor * ffn_down_b = nullptr; // b2
+    struct ggml_tensor * ffn_up_b   = nullptr; // b3
     struct ggml_tensor * ffn_act;
 
     // mamba proj
@@ -4051,7 +4052,9 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+                    // granite uses a vocab with len 49152
+                    case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+                    case 36: model.type = e_model::MODEL_8B; break; // granite
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -4321,6 +4324,8 @@ static void llm_load_hparams(
                     case 30: model.type = e_model::MODEL_3B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_15B; break;
+                    case 52: model.type = e_model::MODEL_20B; break; // granite
+                    case 88: model.type = e_model::MODEL_34B; break; // granite
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -4583,6 +4588,11 @@ static void llm_load_vocab(
     } else {
         if (tokenizer_model == "gpt2") {
             vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+            if (add_space_prefix_keyidx != -1) {
+                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            }
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -5213,6 +5223,11 @@ static bool llm_load_tensors(
                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+
+                        // optional MLP bias
+                        layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     } else {
                         layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

@@ -7485,9 +7500,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);

0 commit comments

Comments
 (0)