# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
#     file now; you can just run `convert-pth-to-ggml.py` again to
#     migrate to the new format. The tool is easier to use too. It
#     isn't necessary anymore to manage split output files because
#     the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files to save disk space,
#     then this tool is intended to help you. Please check out the
#     instructions below.
#
# USAGE
#
#   python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
#   pip install numpy
#   cd llama.cpp
#   make -j4
#
# EXAMPLE (7B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#   # you can delete the old files
#   rm -f models/7B/ggml-model-f16.bin
#   mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#   # you can delete the old files
#   rm -f models/13B/ggml-model-f16.bin*
#   mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
#

import argparse
import os
import sys
import json
import struct
import numpy as np

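# QK is the quantization block size used by the Q4_0/Q4_1 formats (32 weights
# per block); it is also the byte alignment that copy_tensors pads tensor data
# to in the ggjt output.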
QK = 32

GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6

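# The ftype id stored in the file (0..3) is mapped by the tables below to a
# human-readable name, to the corresponding GGML_TYPE_* id, and from there to
# each type's block size and bytes per block.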
WTYPE_NAMES = {
    0: "F32",
    1: "F16",
    2: "Q4_0",
    3: "Q4_1",
}

WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
}

GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0: QK,
    GGML_TYPE_Q4_1: QK,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 1,
    GGML_TYPE_I32: 1,
    GGML_TYPE_F16: 1,
    GGML_TYPE_F32: 1,
}

GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4 + QK//2,
    GGML_TYPE_Q4_1: 4*2 + QK//2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
}

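# The ggmf/ggjt header is nine little-endian int32 fields, read and written in
# exactly this order by read_hparams/write_hparams below.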
HPARAMS = [
    'magic',   # int32
    'version', # int32
    'n_vocab', # int32
    'n_embd',  # int32
    'n_mult',  # int32
    'n_head',  # int32
    'n_layer', # int32
    'n_rot',   # int32
    'f16',     # int32
]

def read_hparams(fin):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    buf = fin.read(struct_size)
    ints = struct.unpack(struct_fmt, buf)
    hparams = dict(zip(HPARAMS, ints))
    return hparams

def write_hparams(fout, hparams):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    ints = [hparams[h] for h in HPARAMS]
    fout.write(struct.pack(struct_fmt, *ints))

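# Each vocabulary entry is stored as an int32 byte length, the raw token bytes,
# and a float32 score; n_vocab entries follow the header.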
def read_tokens(fin, hparams):
    tokens = []
    for i in range(hparams['n_vocab']):
        len_b = fin.read(4)
        (length,) = struct.unpack("i", len_b)
        word = fin.read(length)
        score_b = fin.read(4)
        (score,) = struct.unpack("f", score_b)
        tokens.append((word, score))
    return tokens

def write_tokens(fout, tokens):
    for word, score in tokens:
        fout.write(struct.pack("i", len(word)))
        fout.write(word)
        fout.write(struct.pack("f", score))

def ggml_nelements(shape):
    r = 1
    for i in shape:
        r *= i
    return r

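# ggml_nbytes mirrors ggml's size calculation: elements divided by the type's
# block size, times the bytes per block. Worked example (illustrative): a
# 4096 x 4096 Q4_0 tensor has 16,777,216 elements = 524,288 blocks of 32
# weights, at 4 + 32//2 = 20 bytes per block, i.e. 10,485,760 bytes.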
def ggml_nbytes(shape, ftype):
    x = ggml_nelements(shape)
    t = WTYPES[ftype]
    x *= GGML_TYPE_SIZE[t]
    x //= GGML_BLCK_SIZE[t]
    return x

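# The embedding size identifies which LLaMA model the file came from and thus
# how many shards Meta distributed it in: n_embd 4096/5120/6656/8192 are the
# 7B/13B/30B/65B models, split across 1/2/4/8 part files respectively.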
def get_n_parts(dim):
    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    n_parts = mappings.get(dim)
    if n_parts is None:
        print(f"Invalid dim: {dim}")
        sys.exit(1)
    print(f"n_parts = {n_parts}\n")
    return n_parts

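# Each tensor record consists of three int32 values (n_dims, name length,
# ftype), n_dims int32 dimensions (stored innermost-first), the raw name bytes,
# and the tensor data. copy_tensors re-emits the header into the output, pads
# to a QK-byte boundary so the data can be memory-mapped, and writes this
# part's slice of the data at its offset inside the full (un-split) tensor.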
def copy_tensors(fin, fout, part_id, n_parts):
    while True:

        b = fin.read(4)
        if not b: break
        (n_dims,) = struct.unpack("i", b)
        b = fin.read(4)
        (length,) = struct.unpack("i", b)
        b = fin.read(4)
        (ftype,) = struct.unpack("i", b)

        assert n_dims in (1, 2)

        nelements = 1
        partshape = list(range(n_dims))
        for i in range(n_dims):
            b = fin.read(4)
            partshape[i] = struct.unpack("i", b)[0]
            nelements *= partshape[i]
        partshape = list(reversed(partshape))

        name = fin.read(length)
        data = fin.read(ggml_nbytes(partshape, ftype))

        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]

        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")

        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        if n_dims > 1:
            split_dim = 1
            if b"tok_embeddings" in name:
                split_dim = 1
            elif b"layers" in name:
                if b"attention.wo.weight" in name:
                    split_dim = 1
                elif b"feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif b"output" in name:
                split_dim = 0

        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        fout.write(struct.pack("iii", n_dims, len(name), ftype))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(name)

        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                fout.write(data)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            fout.write(data)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bpr = partshape[1] // blck_size * type_size
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                fout.write(data[row * bpr:row * bpr + bpr])

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))

def parse_args():
    parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
    parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
    parser.add_argument('fout_path', help='your new ggjt file name')
    return parser.parse_args()

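# main reads the header and vocabulary from the first input file, refuses files
# that already carry the 'ggjt' magic (or lack the expected 'ggmf' magic),
# rewrites the magic, and then streams the tensors from every part file into a
# single aligned output file.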
def main():
    args = parse_args()
    assert args.fin_path
    assert args.fout_path
    assert args.fin_path != args.fout_path

    with open(args.fin_path, "rb") as fin:
        hparams = read_hparams(fin)
        tokens = read_tokens(fin, hparams)

    if hparams['magic'] == 0x67676a74: # ggjt
        print("%s: input ggml has already been converted to 'ggjt' magic\n" %
              (args.fin_path))
        sys.exit(1)

    if hparams['magic'] != 0x67676d66: # ggmf
        print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" %
              (args.fin_path, hparams['magic']))
        sys.exit(1)

    hparams['magic'] = 0x67676a74 # ggjt
    n_parts = get_n_parts(hparams['n_embd'])

    # we output a single file for ggml
    with open(args.fout_path, "wb") as fout:
        write_hparams(fout, hparams)
        write_tokens(fout, tokens)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
                fin_path += ".%d" % (part_id)
            with open(fin_path, "rb") as fin:
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)

    print(f"Done. Output file: {args.fout_path}\n")

if __name__ == "__main__":
    main()