
Commit 38a4de3

Do QK norm stacking in model conversion step
1 parent 04a0ece commit 38a4de3
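
The change teaches the HF-to-GGUF converter to collect the per-head `q_layernorm.norms.{xid}.weight` / `k_layernorm.norms.{xid}.weight` tensors that StableLM-style checkpoints store separately, and to stack them into a single 2-D tensor per layer before writing the GGUF file. A minimal sketch of that stacking step, with `n_head`, `head_dim`, and `bid` chosen purely for illustration:

import numpy as np

# Hypothetical sizes, for illustration only.
n_head, head_dim, bid = 4, 8, 0

# Per-head norm weights as they appear in the HF checkpoint:
# model.layers.{bid}.self_attn.q_layernorm.norms.{xid}.weight, each of shape (head_dim,)
per_head = {
    f"model.layers.{bid}.self_attn.q_layernorm.norms.{xid}.weight": np.full(head_dim, xid, dtype=np.float32)
    for xid in range(n_head)
}

# Stack the heads in order into one (n_head, head_dim) tensor,
# mirroring np.stack(datas, axis=0) in the diff below.
stacked = np.stack(
    [per_head[f"model.layers.{bid}.self_attn.q_layernorm.norms.{xid}.weight"] for xid in range(n_head)],
    axis=0,
)
assert stacked.shape == (n_head, head_dim)
# The converter then writes this once, under the merged name
# model.layers.{bid}.self_attn.q_layernorm.weight, resolved through gguf's tensor-name map.
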

File tree

1 file changed (+101 -1 lines changed)

convert-hf-to-gguf.py (+101 -1)
@@ -1208,7 +1208,6 @@ def set_vocab(self):
         self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
-        super().set_gguf_parameters()
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
 
@@ -1224,6 +1223,107 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
 
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        q_norms = dict()
+        k_norms = dict()
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+            n_dims = len(data.shape)
+            if name.find("q_layernorm.norms") != -1:
+                q_norms[name] = data
+                if len(q_norms) >= (block_count * n_head):
+                    for bid in range(block_count):
+                        datas = []
+                        for xid in range(n_head):
+                            ename = f"model.layers.{bid}.self_attn.q_layernorm.norms.{xid}.weight"
+                            datas.append(q_norms[ename])
+                            del q_norms[ename]
+                        data = np.stack(datas, axis=0)
+                        data_dtype = data.dtype
+                        merged_name = f"model.layers.{bid}.self_attn.q_layernorm.weight"
+                        new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+                        if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                            data = data.astype(np.float32)
+
+                        # if f16 desired, convert any float32 2-dim weight tensors to float16
+                        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                            data = data.astype(np.float16)
+                        if new_name is None:
+                            print(f"Can not map tensor {name!r}")
+                            sys.exit()
+
+                        print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+                        self.gguf_writer.add_tensor(new_name, data)
+                continue
+            if name.find("k_layernorm.norms") != -1:
+                k_norms[name] = data
+                if len(k_norms) >= (block_count * n_kv_head):
+                    for bid in range(block_count):
+                        full = True
+                        datas = []
+                        for xid in range(n_kv_head):
+                            ename = f"model.layers.{bid}.self_attn.k_layernorm.norms.{xid}.weight"
+                            datas.append(k_norms[ename])
+                            del k_norms[ename]
+                        data = np.stack(datas, axis=0)
+                        data_dtype = data.dtype
+                        merged_name = f"model.layers.{bid}.self_attn.k_layernorm.weight"
+                        new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+                        if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                            data = data.astype(np.float32)
+
+                        # if f16 desired, convert any float32 2-dim weight tensors to float16
+                        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                            data = data.astype(np.float16)
+                        if new_name is None:
+                            print(f"Can not map tensor {name!r}")
+                            sys.exit()
+
+                        print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+                        self.gguf_writer.add_tensor(new_name, data)
+                continue
+
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
 
 @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
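
A quick way to sanity-check the converted file is to list the norm tensors with the reader from llama.cpp's gguf-py package; the output file name below is hypothetical, and the `attn_q_norm` / `attn_k_norm` substrings assume the usual llama.cpp GGUF tensor naming:

from gguf import GGUFReader  # reader class shipped with llama.cpp's gguf-py package

reader = GGUFReader("stablelm-converted.gguf")  # hypothetical output path
for tensor in reader.tensors:
    # After this commit the per-head Q/K norms should appear as one stacked tensor
    # per layer (e.g. shape (n_head, head_dim)) instead of n_head separate 1-D tensors.
    if "attn_q_norm" in tensor.name or "attn_k_norm" in tensor.name:
        print(tensor.name, tensor.shape)
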
