@@ -133,19 +133,20 @@ def make_tensors_list() -> List[str]:

 def find_n_mult(n_ff: int, n_embd: int) -> int:
     # hardcoded magic range
-    for n_mult in range(256, 1, -1):
+    for n_mult in range(8192, 1, -1):
         calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
         if calc_ff == n_ff:
             return n_mult
     raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")

 @dataclass
 class Params:
-    n_vocab: int
-    n_embd:  int
-    n_mult:  int
-    n_head:  int
-    n_layer: int
+    n_vocab:   int
+    n_embd:    int
+    n_mult:    int
+    n_head:    int
+    n_layer:   int
+    n_kv_head: Optional[int]  # This parameter is only used for Llama 2

     @staticmethod
     def guessed(model: 'LazyModel') -> 'Params':
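The widened search range is what lets conversion handle the new 70B model. A minimal standalone sketch of the arithmetic, assuming Llama-2-70B's commonly reported dimensions (n_embd = 8192, n_ff = 28672), which this diff itself does not state:

    def calc_ff(n_embd: int, n_mult: int) -> int:
        # The rounding-up rule that find_n_mult() inverts by brute force:
        # round 8*n_embd/3 up to the nearest multiple of n_mult.
        return (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult

    assert calc_ff(8192, 7168) == 28672                               # n_mult = 7168 matches
    assert all(calc_ff(8192, m) != 28672 for m in range(256, 1, -1))  # the old 256 cap never matches

Only a multiple above the old cap of 256 can reach 28672 here, hence the new 8192 ceiling.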
@@ -167,11 +168,12 @@ def guessed(model: 'LazyModel') -> 'Params':
         n_head = n_embd // 128 # guessed

         return Params(
-            n_vocab = n_vocab,
-            n_embd  = n_embd,
-            n_mult  = 256,
-            n_head  = n_head,
-            n_layer = n_layer,
+            n_vocab   = n_vocab,
+            n_embd    = n_embd,
+            n_mult    = 256,
+            n_head    = n_head,
+            n_layer   = n_layer,
+            n_kv_head = None,
         )

     @staticmethod
@@ -183,15 +185,17 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         n_head  = config["num_attention_heads"];
         n_layer = config["num_hidden_layers"];
         n_ff    = config["intermediate_size"];
+        n_kv_head = config.get("num_key_value_heads")

         n_mult = find_n_mult(n_ff, n_embd);

         return Params(
-            n_vocab = n_vocab,
-            n_embd  = n_embd,
-            n_mult  = n_mult,
-            n_head  = n_head,
-            n_layer = n_layer,
+            n_vocab   = n_vocab,
+            n_embd    = n_embd,
+            n_mult    = n_mult,
+            n_head    = n_head,
+            n_layer   = n_layer,
+            n_kv_head = n_kv_head,
         )

     # LLaMA v2 70B params.json
@@ -200,21 +204,22 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
     def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         config = json.load(open(config_path))

-        n_vocab = config["vocab_size"];
-        n_embd = config["dim"];
-        n_head = config["n_heads"];
-        n_layer = config["n_layers"];
-        n_mult = config["multiple_of"];
+        n_vocab = config["vocab_size"];
+        n_embd  = config["dim"];
+        n_head  = config["n_heads"];
+        n_layer = config["n_layers"];
+        n_mult  = config["multiple_of"];

         if n_vocab == -1:
             n_vocab = model["tok_embeddings.weight"].shape[0]

         return Params(
-            n_vocab = n_vocab,
-            n_embd  = n_embd,
-            n_mult  = n_mult,
-            n_head  = n_head,
-            n_layer = n_layer,
+            n_vocab   = n_vocab,
+            n_embd    = n_embd,
+            n_mult    = n_mult,
+            n_head    = n_head,
+            n_layer   = n_layer,
+            n_kv_head = None,
         )

     @staticmethod
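Note that the HF config is read with .get(): GQA models such as Llama-2-70B carry a num_key_value_heads entry (8 for 70B, as commonly reported), while configs for models without GQA omit the key, leaving n_kv_head as None. A small illustration with hypothetical config dicts:

    gqa_config    = {"num_attention_heads": 64, "num_key_value_heads": 8}
    no_gqa_config = {"num_attention_heads": 32}  # key absent on non-GQA models

    assert gqa_config.get("num_key_value_heads") == 8
    assert no_gqa_config.get("num_key_value_heads") is None  # Params.n_kv_head stays None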
@@ -234,14 +239,21 @@ def load(model_plus: 'ModelPlus') -> 'Params':


 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+        self.vocabtype = vocabtype
+        if self.vocabtype == "bpe":
+          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
+        else:
+          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
             added_tokens = json.load(open(fname_added_tokens))
         else:
             added_tokens = {}
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        if self.vocabtype == "bpe":
+          vocab_size: int = len(self.sentencepiece_tokenizer)
+        else:
+          vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids   = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -255,22 +267,32 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:

     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
+        if self.vocabtype == "bpe":
+          from transformers.models.gpt2 import tokenization_gpt2
+          byte_encoder = tokenization_gpt2.bytes_to_unicode()
+          byte_decoder = {v: k for k, v in byte_encoder.items()}
+          for i, item in enumerate(tokenizer):
             text: bytes
-            if tokenizer.is_unknown(i):
-                text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-                text = b""
-            elif tokenizer.is_byte(i):
-                piece = tokenizer.id_to_piece(i)
-                if len(piece) != 6:
-                    raise Exception(f"Invalid token: {piece}")
-                byte_value = int(piece[3:-1], 16)
-                text = struct.pack("B", byte_value)
-            else:
-                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+            score: float = -i
             yield text, score
+        else:
+          for i in range(tokenizer.vocab_size()):
+            text: bytes
+            if tokenizer.is_unknown(i):
+                text = " \u2047 ".encode("utf-8")
+            elif tokenizer.is_control(i):
+                text = b""
+            elif tokenizer.is_byte(i):
+                piece = tokenizer.id_to_piece(i)
+                if len(piece) != 6:
+                    raise Exception(f"Invalid token: {piece}")
+                byte_value = int(piece[3:-1], 16)
+                text = struct.pack("B", byte_value)
+            else:
+                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score

     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
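For the "bpe" path, the vocab is a plain vocab.json mapping in which each token is a string of printable characters standing in for raw bytes, GPT-2 style; inverting transformers' bytes_to_unicode() table recovers the bytes, which is all the b''.join(...) line above does. A minimal roundtrip sketch (assumes the transformers package is installed):

    from transformers.models.gpt2 import tokenization_gpt2

    byte_encoder = tokenization_gpt2.bytes_to_unicode()     # {byte value: unicode char}
    byte_decoder = {v: k for k, v in byte_encoder.items()}  # {unicode char: byte value}

    token = "Ġhello"                               # 'Ġ' is the printable stand-in for the space byte
    raw = bytes(byte_decoder[ch] for ch in token)
    print(raw)                                     # b' hello'

The score of -i simply preserves vocab order, since a BPE vocab.json carries no SentencePiece-style scores.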
@@ -300,10 +322,12 @@ def __repr__(self) -> str:
 Vocab = Union[SentencePieceVocab, GGMLVocab]


-def permute(weights: NDArray, n_head: int) -> NDArray:
+def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                   .swapaxes(1, 2)
-                   .reshape(weights.shape))
+            .swapaxes(1, 2)
+            .reshape(weights.shape))


 def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
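This permutation reorders rotary-embedding weight rows; the new n_kv_head argument makes the reshape run with n_head // n_kv_head head groups instead of n_head. For Llama-2-70B's wk that is 64 // 8 = 8, matching the 8 KV heads actually stored in the tensor. A standalone NumPy sketch with toy sizes (values hypothetical, not from the diff):

    import numpy as np

    def permute(weights, n_head, n_kv_head=None):
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))

    w = np.arange(16 * 4, dtype=np.float32).reshape(16, 4)  # toy 16x4 weight matrix
    out = permute(w, n_head=4, n_kv_head=2)                 # reshape runs with 4 // 2 = 2 groups
    assert out.shape == w.shape                             # same shape, rows interleaved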
@@ -351,7 +375,7 @@ class Tensor(metaclass=ABCMeta):
     @abstractmethod
     def astype(self, data_type: DataType) -> 'Tensor': ...
     @abstractmethod
-    def permute(self, n_head: int) -> 'Tensor': ...
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
     @abstractmethod
     def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
     @abstractmethod
@@ -389,8 +413,8 @@ def part(self, n_part: int) -> 'UnquantizedTensor':
         r = self.ndarray.shape[0] // 3
         return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])

-    def permute(self, n_head: int) -> 'UnquantizedTensor':
-        return UnquantizedTensor(permute(self.ndarray, n_head))
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))


 def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@@ -438,26 +462,27 @@ def astype(self, data_type: DataType) -> Tensor:
     def to_ggml(self) -> 'GGMLQuantizedTensor':
         return self

-    def permute(self, n_head: int) -> 'GGMLQuantizedTensor':
-        return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type)
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
+        return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)


 GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]


 class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int) -> None:
+    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
         self.base = base
         self.n_head = n_head
+        self.n_kv_head = n_kv_head
         self.data_type = self.base.data_type

     def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head)
+        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)

     def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head)
+        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)

-    def permute(self, n_head: int) -> Tensor:
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
         raise Exception("shouldn't permute twice")

@@ -549,8 +574,8 @@ def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor':
         ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
         return ret

-    def permute(self, n_head: int) -> Tensor:
-        return DeferredPermutedTensor(self, n_head)
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
+        return DeferredPermutedTensor(self, n_head, n_kv_head)

     def to_ggml(self) -> GGMLQuantizedTensor:
         # The output format looks like this:
@@ -681,10 +706,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
     return ModelPlus(model, paths, format, vocab)


-def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
     def load() -> Tensor:
-        return lazy_tensor.load().permute(n_head)
-    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+        return lazy_tensor.load().permute(n_head, n_kv_head)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)

 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
     def load() -> Tensor:
@@ -709,7 +734,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
             out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
@@ -1196,14 +1221,18 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
     return {name: model[name] for name in TENSORS_LIST if name in model}


-def load_vocab(path: Path) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+    print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory.  Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
     # be in the parent of that.
     if path.is_dir():
-        path2 = path / "tokenizer.model"
+        vocab_file = "tokenizer.model"
+        if vocabtype == 'bpe':
+          vocab_file = "vocab.json"
+        path2 = path / vocab_file
         # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / "tokenizer.model"
+        path3 = path.parent / vocab_file
         if path2.exists():
             path = path2
         elif path3.exists():
@@ -1214,7 +1243,8 @@ def load_vocab(path: Path) -> SentencePieceVocab:
                         "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
+                              vocabtype)


 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
@@ -1252,14 +1282,15 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path,
                         help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
     args = parser.parse_args(args_in)

     vocab: Vocab
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)
     elif args.vocab_only:
-        vocab = load_vocab(args.vocab_dir or args.model)
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
         assert args.outfile, "need --outfile if using --vocab-only"
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, vocab)
@@ -1273,7 +1304,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab = model_plus.vocab
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = load_vocab(vocab_dir)
+        vocab = load_vocab(vocab_dir, args.vocabtype)
     params = Params.load(model_plus)
     model = model_plus.model
     model = do_necessary_conversions(model, params)
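A hypothetical usage sketch of the new flag (the script's file name and the model paths are assumptions, not stated in the diff). Since main() accepts args_in, it can also be driven programmatically:

    from convert import main  # assuming the script is saved as convert.py

    main(["models/llama-2-70b/"])                           # default --vocabtype spm
    main(["models/some-bpe-model/", "--vocab-only",
          "--outfile", "vocab.bin", "--vocabtype", "bpe"])  # reads vocab.json instead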