@@ -1208,7 +1208,6 @@ def set_vocab(self):
         self._set_vocab_qwen()

     def set_gguf_parameters(self):
-        super().set_gguf_parameters()
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

@@ -1224,6 +1223,107 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))

+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        q_norms = dict()
+        k_norms = dict()
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+            n_dims = len(data.shape)
+            if name.find("q_layernorm.norms") != -1:
+                q_norms[name] = data
+                if len(q_norms) >= (block_count * n_head):
+                    for bid in range(block_count):
+                        datas = []
+                        for xid in range(n_head):
+                            ename = f"model.layers.{bid}.self_attn.q_layernorm.norms.{xid}.weight"
+                            datas.append(q_norms[ename])
+                            del q_norms[ename]
+                        data = np.stack(datas, axis=0)
+                        data_dtype = data.dtype
+                        merged_name = f"model.layers.{bid}.self_attn.q_layernorm.weight"
+                        new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+                        if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                            data = data.astype(np.float32)
+
+                        # if f16 desired, convert any float32 2-dim weight tensors to float16
+                        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                            data = data.astype(np.float16)
+                        if new_name is None:
+                            print(f"Can not map tensor {name!r}")
+                            sys.exit()
+
+                        print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+                        self.gguf_writer.add_tensor(new_name, data)
+                continue
+            if name.find("k_layernorm.norms") != -1:
+                k_norms[name] = data
+                if len(k_norms) >= (block_count * n_kv_head):
+                    for bid in range(block_count):
+                        full = True
+                        datas = []
+                        for xid in range(n_kv_head):
+                            ename = f"model.layers.{bid}.self_attn.k_layernorm.norms.{xid}.weight"
+                            datas.append(k_norms[ename])
+                            del k_norms[ename]
+                        data = np.stack(datas, axis=0)
+                        data_dtype = data.dtype
+                        merged_name = f"model.layers.{bid}.self_attn.k_layernorm.weight"
+                        new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+                        if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                            data = data.astype(np.float32)
+
+                        # if f16 desired, convert any float32 2-dim weight tensors to float16
+                        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                            data = data.astype(np.float16)
+                        if new_name is None:
+                            print(f"Can not map tensor {name!r}")
+                            sys.exit()
+
+                        print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+                        self.gguf_writer.add_tensor(new_name, data)
+                continue
+
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)

 @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
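For reference, the new `write_tensors` override buffers the per-head `q_layernorm.norms.{xid}.weight` / `k_layernorm.norms.{xid}.weight` vectors and, once every head of a layer has been seen, stacks them into one 2-D tensor per layer before writing it. Below is a minimal standalone sketch of that stacking step; the sizes (`n_head`, `head_dim`) and the dummy weights are assumptions for illustration only, and the real converter writes the merged tensor through `gguf_writer.add_tensor` instead of printing it.

```python
import numpy as np

# Hypothetical sizes for the sketch (not taken from a real checkpoint).
n_head = 4
head_dim = 8
bid = 0  # layer (block) index

# Stand-ins for the per-head norm weights that get_tensors() would yield.
q_norms = {
    f"model.layers.{bid}.self_attn.q_layernorm.norms.{xid}.weight": np.ones(head_dim, dtype=np.float32)
    for xid in range(n_head)
}

# Same merge the patch performs: collect the heads in order and stack them.
datas = [q_norms[f"model.layers.{bid}.self_attn.q_layernorm.norms.{xid}.weight"] for xid in range(n_head)]
merged = np.stack(datas, axis=0)  # shape (n_head, head_dim)
merged_name = f"model.layers.{bid}.self_attn.q_layernorm.weight"

print(merged_name, merged.shape)  # -> model.layers.0.self_attn.q_layernorm.weight (4, 8)
```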
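The dtype branches repeated three times in the diff all encode the same output-type policy (f32 vs f16 output): 1-D tensors and `*_norm.weight` tensors stay in float32, while 2-D `.weight` matrices are cast to float16 when f16 output is requested. A hedged restatement of that policy as a standalone helper; the function name and signature are mine, not part of the patch:

```python
import numpy as np

def convert_dtype(data: np.ndarray, name: str, new_name: str, ftype: int) -> np.ndarray:
    # ftype == 0 -> f32 output, ftype == 1 -> f16 output (as in the converter).
    n_dims = data.ndim

    # f32 requested: promote any float16 tensor back to float32.
    if ftype == 0 and data.dtype == np.float16:
        return data.astype(np.float32)

    # f16 requested: 1-D tensors and norm weights are kept in float32 ...
    if ftype == 1 and data.dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
        return data.astype(np.float32)

    # ... while 2-D weight matrices are demoted to float16.
    if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
        return data.astype(np.float16)

    return data
```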