
Commit f013c39

Introduce GGML migration tool for new file format
If you deleted your old Meta LLaMA .pth files, then the migrate-ggml-2023-03-30-pr613.py script will allow you to convert your old ggml files into the new mmap()'able format. See #613
1 parent a45e843 commit f013c39
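The only externally visible difference between the old and new formats is the 4-byte magic at the start of the model file: old versioned files start with the 'ggmf' magic, new mmap()'able files with 'ggjt'. Below is a minimal sketch of checking whether a file still needs migrating; it reuses only the magic constants defined in the new script further down, and the helper name is hypothetical:

import struct
import sys

GGJT_MAGIC = 0x67676a74  # new single-file, mmap()'able format
GGMF_MAGIC = 0x67676d66  # old versioned format that this commit migrates from

def needs_migration(path):  # hypothetical helper, not defined in the commit
    # the magic is the first int32 of the model file, as written by the scripts
    with open(path, "rb") as f:
        (magic,) = struct.unpack("i", f.read(4))
    if magic == GGJT_MAGIC:
        return False  # already converted, nothing to do
    if magic == GGMF_MAGIC:
        return True   # run migrate-ggml-2023-03-30-pr613.py
    # anything else is an older unversioned ggml file (or not a model at all);
    # regenerate with convert-pth-to-ggml.py if the original .pth files exist
    return True

if __name__ == "__main__":
    print(needs_migration(sys.argv[1]))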

File tree

3 files changed: +325 −7 lines changed

convert-pth-to-ggml.py

+3 −3
@@ -1,4 +1,4 @@
-# Convert a LLaMA model checkpoint to a ggml compatible file
+# Convert a LLaMA model checkpoint to a ggjt compatible file
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.
@@ -52,8 +52,8 @@
 }
 
 GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4 + QK/2,
-    GGML_TYPE_Q4_1: 4*2 + QK/2,
+    GGML_TYPE_Q4_0: 4 + QK//2,
+    GGML_TYPE_Q4_1: 4*2 + QK//2,
     GGML_TYPE_I8: 1,
     GGML_TYPE_I16: 2,
     GGML_TYPE_I32: 4,
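The `/` to `//` change matters because these per-block sizes feed integer byte arithmetic: in Python 3, `QK/2` yields a float, so every size derived from it would also become a float. A small sketch of the arithmetic, reusing QK and the Q4_0 block layout from the scripts (the tensor shape is only an illustrative assumption):

QK = 32  # quantization block size, as in both scripts

# a Q4_0 block of QK weights stores one 4-byte scale plus QK 4-bit quants
size_true_div  = 4 + QK / 2    # 20.0 -> float under Python 3 true division
size_floor_div = 4 + QK // 2   # 20   -> int, which byte offsets require

# byte size of a hypothetical 4096 x 4096 Q4_0 tensor, mirroring ggml_nbytes()
nelements = 4096 * 4096
nbytes = nelements * size_floor_div // QK
print(size_true_div, size_floor_div, nbytes)  # 20.0 20 10485760

struct.pack and file offsets both require ints, which is why the same // fix also appears in the GGML_TYPE_SIZE table of the new migration script below.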

llama.cpp

+5 −4
@@ -350,10 +350,11 @@ static void munmap_file(void * addr, size_t length) {
 static bool report_bad_magic(const char *path) {
     fprintf(stderr,
             "%s: invalid model file (bad magic)\n"
-            "you most likely need to regenerate your ggml files\n"
-            "the benefit is you'll get 10-100x faster load times\n"
-            "see https://github.com/ggerganov/llama.cpp/issues/91\n"
-            "use convert-pth-to-ggml.py on your llama model files\n",
+            "\tyou most likely need to regenerate your ggml files\n"
+            "\tthe benefit is you'll get 10-100x faster load times\n"
+            "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
+            "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
+            "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
             path);
     return false;
 }

migrate-ggml-2023-03-30-pr613.py

+317 −0
@@ -0,0 +1,317 @@
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
#     file now; you can just run `convert-pth-to-ggml.py` again to
#     migrate to the new format. The tool is easier to use too. It
#     isn't necessary anymore to manage split output files because
#     the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files due to save on disk
#     space, then this tool is intended to help you. Please check
#     out the instructions below.
#
# USAGE
#
#   python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
#   pip install numpy
#   cd llama.cpp
#   make -j4
#
# EXAMPLE (7B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#   # you can delete the old files
#   rm -f models/7B/ggml-model-f16.bin
#   mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#   # you can delete the old files
#   rm -f models/13B/ggml-model-f16.bin*
#   mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
#

import argparse
import os
import sys
import json
import struct
import numpy as np

QK = 32

GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6

WTYPE_NAMES = {
    0: "F32",
    1: "F16",
    2: "Q4_0",
    3: "Q4_1",
}

WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
}

GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0: QK,
    GGML_TYPE_Q4_1: QK,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 1,
    GGML_TYPE_I32: 1,
    GGML_TYPE_F16: 1,
    GGML_TYPE_F32: 1,
}

GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4 + QK//2,
    GGML_TYPE_Q4_1: 4*2 + QK//2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
}

HPARAMS = [
    'magic',    # int32
    'version',  # int32
    'n_vocab',  # int32
    'n_embd',   # int32
    'n_mult',   # int32
    'n_head',   # int32
    'n_layer',  # int32
    'n_rot',    # int32
    'f16',      # int32
]

def read_hparams(fin):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    buf = fin.read(struct_size)
    ints = struct.unpack(struct_fmt, buf)
    hparams = dict(zip(HPARAMS, ints))
    return hparams

def write_hparams(fout, hparams):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    ints = [hparams[h] for h in HPARAMS]
    fout.write(struct.pack(struct_fmt, *ints))

def read_tokens(fin, hparams):
    tokens = []
    for i in range(hparams['n_vocab']):
        len_b = fin.read(4)
        (length,) = struct.unpack("i", len_b)
        word = fin.read(length)
        score_b = fin.read(4)
        (score,) = struct.unpack("f", score_b)
        tokens.append((word, score))
    return tokens

def write_tokens(fout, tokens):
    for word, score in tokens:
        fout.write(struct.pack("i", len(word)))
        fout.write(word)
        fout.write(struct.pack("f", score))

def ggml_nelements(shape):
    r = 1
    for i in shape:
        r *= i
    return r

def ggml_nbytes(shape, ftype):
    x = ggml_nelements(shape)
    t = WTYPES[ftype]
    x *= GGML_TYPE_SIZE[t]
    x //= GGML_BLCK_SIZE[t]
    return x

def get_n_parts(dim):
    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    n_parts = mappings.get(dim)
    if n_parts is None:
        print(f"Invalid dim: {dim}")
        sys.exit(1)
    print(f"n_parts = {n_parts}\n")
    return n_parts

def copy_tensors(fin, fout, part_id, n_parts):
    while True:

        b = fin.read(4)
        if not b: break
        (n_dims,) = struct.unpack("i", b)
        b = fin.read(4)
        (length,) = struct.unpack("i", b)
        b = fin.read(4)
        (ftype,) = struct.unpack("i", b)

        assert n_dims in (1, 2)

        nelements = 1
        partshape = list(range(n_dims))
        for i in range(n_dims):
            b = fin.read(4)
            partshape[i] = struct.unpack("i", b)[0]
            nelements *= partshape[i]
        partshape = list(reversed(partshape))

        name = fin.read(length)
        data = fin.read(ggml_nbytes(partshape, ftype))

        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]

        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")

        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        if n_dims > 1:
            split_dim = 1
            if b"tok_embeddings" in name:
                split_dim = 1
            elif b"layers" in name:
                if b"attention.wo.weight" in name:
                    split_dim = 1
                elif b"feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif b"output" in name:
                split_dim = 0

        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        fout.write(struct.pack("iii", n_dims, len(name), ftype))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(name)

        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                fout.write(data)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            fout.write(data)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bpr = partshape[1] // blck_size * type_size
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                fout.write(data[row * bpr:row * bpr + bpr])

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))

def parse_args():
    parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
    parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
    parser.add_argument('fout_path', help='your new ggjt file name')
    return parser.parse_args()

def main():
    args = parse_args()
    assert args.fin_path
    assert args.fout_path
    assert args.fin_path != args.fout_path

    with open(args.fin_path, "rb") as fin:
        hparams = read_hparams(fin)
        tokens = read_tokens(fin, hparams)

    if hparams['magic'] == 0x67676a74:  # ggjt
        print("%s: input ggml has already been converted to 'ggjt' magic\n" %
              (args.fin_path))
        sys.exit(1)

    if hparams['magic'] != 0x67676d66:  # ggmf
        print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" %
              (args.fin_path, hparams['magic']))
        sys.exit(1)

    hparams['magic'] = 0x67676a74  # ggjt
    n_parts = get_n_parts(hparams['n_embd'])

    # we output a single file for ggml
    with open(args.fout_path, "wb") as fout:
        write_hparams(fout, hparams)
        write_tokens(fout, tokens)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
                fin_path += ".%d" % (part_id)
            with open(fin_path, "rb") as fin:
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)

    print(f"Done. Output file: {args.fout_path}\n")

if __name__ == "__main__":
    main()
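The heart of the new script is copy_tensors(), which writes each tensor's header once and then splices every part file's payload into the right byte range of a single mmap()'able blob. Below is a small worked sketch of the column-split (split_dim == 1) offset arithmetic, using a deliberately tiny hypothetical shape in place of a real model tensor; the variable names mirror the script:

# a hypothetical 2-part F16 tensor: full shape (3, 8), each part holds 4 columns
n_parts   = 2
partshape = (3, 4)   # shape stored in each part file (rows, cols)
fullshape = (3, 8)   # cols scaled by n_parts, as copy_tensors() does for split_dim == 1
blck_size = 1        # GGML_BLCK_SIZE[GGML_TYPE_F16]
type_size = 2        # GGML_TYPE_SIZE[GGML_TYPE_F16]

bpr           = partshape[1] // blck_size * type_size   # bytes of one row inside a part
bytes_per_row = fullshape[1] // blck_size * type_size   # bytes of one row in the output

for part_id in range(n_parts):
    current_col        = part_id * partshape[1]
    offset_current_col = current_col // blck_size * type_size
    for row in range(partshape[0]):
        offset = row * bytes_per_row + offset_current_col
        # the script fout.seek()s to tensor_data_offset + offset and writes bpr bytes
        print(f"part {part_id}, row {row} -> byte offset {offset}, length {bpr}")

With these numbers, part 0's rows land at offsets 0, 16 and 32 while part 1's land at 8, 24 and 40, interleaving the two halves of each row exactly as the script does with fout.seek() before each write.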
