
Commit d2ade63

Merge 'origin/master' into hipblas
2 parents: cde52d6 + 8a88e58

30 files changed: +3022 −1429 lines

.github/workflows/build.yml

+2

@@ -197,6 +197,8 @@ jobs:
     strategy:
       matrix:
         include:
+          - build: 'noavx'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
           - build: 'avx2'
             defines: '-DLLAMA_BUILD_SERVER=ON'
           - build: 'avx'

CMakeLists.txt

+1

@@ -392,6 +392,7 @@ if (LLAMA_ALL_WARNINGS)
            -Wshadow
            -Wstrict-prototypes
            -Wpointer-arith
+           -Wmissing-prototypes
        )
        set(cxx_flags
            -Wall

Makefile

+3 −2

@@ -63,7 +63,8 @@ ifdef LLAMA_SERVER_VERBOSE
 endif
 
 # warnings
-CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
+CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
+          -Wmissing-prototypes
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
 
 # OS specific
@@ -381,7 +382,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)

README.md

+14

@@ -77,6 +77,7 @@ as the main playground for developing new features for the [ggml](https://github
 **Supported models:**
 
 - [X] LLaMA 🦙
+- [x] LLaMA 2 🦙🦙
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
@@ -650,6 +651,19 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
 - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
 
+### Obtaining and using the Facebook LLaMA 2 model
+
+- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
+- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
+  - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML)
+  - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML)
+  - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML)
+  - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
+  - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
+  - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
+- Specify `-eps 1e-5` for best generation quality
+- Specify `-gqa 8` for 70B models to work
+
 ### Verifying the model files
 
 Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
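Note on the two flags added above: `-eps 1e-5` adjusts the norm epsilon to the value that suits LLaMA 2, and `-gqa 8` enables the grouped-query-attention layout the 70B model requires at load time. A purely illustrative invocation (the model path is hypothetical; only the two flags come from the README change): `./main -m ./models/llama-2-70b-chat.ggmlv3.q4_0.bin -gqa 8 -eps 1e-5 -p "Hello"`.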

convert.py

mode change 100755 → 100644
+98 −67
@@ -133,19 +133,20 @@ def make_tensors_list() -> List[str]:
 
 def find_n_mult(n_ff: int, n_embd: int) -> int:
     # hardcoded magic range
-    for n_mult in range(256, 1, -1):
+    for n_mult in range(8192, 1, -1):
         calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
         if calc_ff == n_ff:
             return n_mult
     raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
 
 @dataclass
 class Params:
-    n_vocab: int
-    n_embd: int
-    n_mult: int
-    n_head: int
-    n_layer: int
+    n_vocab:   int
+    n_embd:    int
+    n_mult:    int
+    n_head:    int
+    n_layer:   int
+    n_kv_head: Optional[int]  # This parameter is only used for Llama 2
 
     @staticmethod
     def guessed(model: 'LazyModel') -> 'Params':
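The search range for n_mult is widened from 256 to 8192, presumably so that models whose feed-forward size rounds up to a larger multiple (as with the LLaMA 2 configurations handled elsewhere in this commit) can still be matched instead of raising. A self-contained sketch of the same arithmetic; the input dimensions below are the original 7B LLaMA sizes used only as an example, not values taken from this diff:

import math  # not required; shown only to emphasise the ceiling-division below

def find_n_mult(n_ff: int, n_embd: int) -> int:
    # identical logic to the updated convert.py function: find the largest
    # n_mult that rounds (8*n_embd)//3 up to exactly n_ff
    for n_mult in range(8192, 1, -1):
        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
        if calc_ff == n_ff:
            return n_mult
    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")

# (8*4096)//3 = 10922; the first (largest) multiple hit is 2*5504 = 11008
print(find_n_mult(11008, 4096))  # 5504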
@@ -167,11 +168,12 @@ def guessed(model: 'LazyModel') -> 'Params':
             n_head=n_embd // 128 # guessed
 
         return Params(
-            n_vocab = n_vocab,
-            n_embd = n_embd,
-            n_mult = 256,
-            n_head = n_head,
-            n_layer = n_layer,
+            n_vocab   = n_vocab,
+            n_embd    = n_embd,
+            n_mult    = 256,
+            n_head    = n_head,
+            n_layer   = n_layer,
+            n_kv_head = None,
         )
 
     @staticmethod
@@ -183,15 +185,17 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         n_head = config["num_attention_heads"];
         n_layer = config["num_hidden_layers"];
         n_ff = config["intermediate_size"];
+        n_kv_head = config.get("num_key_value_heads")
 
         n_mult = find_n_mult(n_ff, n_embd);
 
         return Params(
-            n_vocab = n_vocab,
-            n_embd = n_embd,
-            n_mult = n_mult,
-            n_head = n_head,
-            n_layer = n_layer,
+            n_vocab   = n_vocab,
+            n_embd    = n_embd,
+            n_mult    = n_mult,
+            n_head    = n_head,
+            n_layer   = n_layer,
+            n_kv_head = n_kv_head,
         )
 
     # LLaMA v2 70B params.json
@@ -200,21 +204,22 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
     def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         config = json.load(open(config_path))
 
-        n_vocab = config["vocab_size"];
-        n_embd = config["dim"];
-        n_head = config["n_heads"];
-        n_layer = config["n_layers"];
-        n_mult = config["multiple_of"];
+        n_vocab   = config["vocab_size"];
+        n_embd    = config["dim"];
+        n_head    = config["n_heads"];
+        n_layer   = config["n_layers"];
+        n_mult    = config["multiple_of"];
 
         if n_vocab == -1:
             n_vocab = model["tok_embeddings.weight"].shape[0]
 
         return Params(
-            n_vocab = n_vocab,
-            n_embd = n_embd,
-            n_mult = n_mult,
-            n_head = n_head,
-            n_layer = n_layer,
+            n_vocab   = n_vocab,
+            n_embd    = n_embd,
+            n_mult    = n_mult,
+            n_head    = n_head,
+            n_layer   = n_layer,
+            n_kv_head = None,
         )
 
     @staticmethod
@@ -234,14 +239,21 @@ def load(model_plus: 'ModelPlus') -> 'Params':
 
 
 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+        self.vocabtype = vocabtype
+        if self.vocabtype == "bpe":
+          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
+        else:
+          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
             added_tokens = json.load(open(fname_added_tokens))
         else:
             added_tokens = {}
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        if self.vocabtype == "bpe":
+          vocab_size: int = len(self.sentencepiece_tokenizer)
+        else:
+          vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -255,22 +267,32 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
 
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
+        if self.vocabtype == "bpe":
+          from transformers.models.gpt2 import tokenization_gpt2
+          byte_encoder = tokenization_gpt2.bytes_to_unicode()
+          byte_decoder = {v: k for k, v in byte_encoder.items()}
+          for i, item in enumerate(tokenizer):
            text: bytes
-            if tokenizer.is_unknown(i):
-                text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-                text = b""
-            elif tokenizer.is_byte(i):
-                piece = tokenizer.id_to_piece(i)
-                if len(piece) != 6:
-                    raise Exception(f"Invalid token: {piece}")
-                byte_value = int(piece[3:-1], 16)
-                text = struct.pack("B", byte_value)
-            else:
-                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+            score: float = -i
            yield text, score
+        else:
+          for i in range(tokenizer.vocab_size()):
+            text: bytes
+            if tokenizer.is_unknown(i):
+                text = " \u2047 ".encode("utf-8")
+            elif tokenizer.is_control(i):
+                text = b""
+            elif tokenizer.is_byte(i):
+                piece = tokenizer.id_to_piece(i)
+                if len(piece) != 6:
+                    raise Exception(f"Invalid token: {piece}")
+                byte_value = int(piece[3:-1], 16)
+                text = struct.pack("B", byte_value)
+            else:
+                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
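For reference, a minimal sketch of what the new "bpe" branch does with a single vocab.json entry: every character of the stored token string is mapped back to a raw byte through the GPT-2 byte decoder. It assumes the transformers package is installed (as the bpe path above already requires), and the token shown is made up for illustration:

# Minimal sketch of the bpe branch: decode one byte-level BPE token string
# back into raw bytes (assumes `transformers` is installed).
from transformers.models.gpt2 import tokenization_gpt2

byte_encoder = tokenization_gpt2.bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

item = "ĠHello"  # illustrative vocab.json entry; 'Ġ' is the byte-level encoding of a space
text = b''.join(byte_decoder[ch].to_bytes(1, byteorder='big') for ch in item)
print(text)  # b' Hello'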
@@ -300,10 +322,12 @@ def __repr__(self) -> str:
 Vocab = Union[SentencePieceVocab, GGMLVocab]
 
 
-def permute(weights: NDArray, n_head: int) -> NDArray:
+def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
 
 
 def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
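The extra n_kv_head argument lets the same permutation handle grouped-query attention, where the key projection has fewer rows than the query projection. A toy numpy check of the new signature, with made-up head counts that are not real LLaMA 2 dimensions:

import numpy as np

def permute(weights, n_head, n_kv_head=None):
    # same reshape logic as the updated convert.py permute() above
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

n_embd, n_head, n_kv_head = 64, 8, 2                 # toy sizes
head_dim = n_embd // n_head                          # 8
wq = np.zeros((n_embd, n_embd), dtype=np.float32)    # full query projection
wk = np.zeros((n_kv_head * head_dim, n_embd), dtype=np.float32)  # smaller GQA key projection

print(permute(wq, n_head).shape)             # (64, 64)
print(permute(wk, n_head, n_kv_head).shape)  # (16, 64) — works because n_head is divided by n_kv_head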
@@ -351,7 +375,7 @@ class Tensor(metaclass=ABCMeta):
     @abstractmethod
     def astype(self, data_type: DataType) -> 'Tensor': ...
     @abstractmethod
-    def permute(self, n_head: int) -> 'Tensor': ...
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
     @abstractmethod
     def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
     @abstractmethod
@@ -389,8 +413,8 @@ def part(self, n_part: int) -> 'UnquantizedTensor':
         r = self.ndarray.shape[0] // 3
         return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
 
-    def permute(self, n_head: int) -> 'UnquantizedTensor':
-        return UnquantizedTensor(permute(self.ndarray, n_head))
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
 
 
 def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@@ -438,26 +462,27 @@ def astype(self, data_type: DataType) -> Tensor:
     def to_ggml(self) -> 'GGMLQuantizedTensor':
         return self
 
-    def permute(self, n_head: int) -> 'GGMLQuantizedTensor':
-        return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type)
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
+        return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
 
 
 GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
 
 
 class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int) -> None:
+    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
         self.base = base
         self.n_head = n_head
+        self.n_kv_head = n_kv_head
         self.data_type = self.base.data_type
 
     def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head)
+        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
 
     def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head)
+        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
 
-    def permute(self, n_head: int) -> Tensor:
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
         raise Exception("shouldn't permute twice")
 
 
@@ -549,8 +574,8 @@ def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor':
         ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
         return ret
 
-    def permute(self, n_head: int) -> Tensor:
-        return DeferredPermutedTensor(self, n_head)
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
+        return DeferredPermutedTensor(self, n_head, n_kv_head)
 
     def to_ggml(self) -> GGMLQuantizedTensor:
         # The output format looks like this:
@@ -681,10 +706,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
     return ModelPlus(model, paths, format, vocab)
 
 
-def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
     def load() -> Tensor:
-        return lazy_tensor.load().permute(n_head)
-    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+        return lazy_tensor.load().permute(n_head, n_kv_head)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
 
 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
     def load() -> Tensor:
@@ -709,7 +734,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
             out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
@@ -1196,14 +1221,18 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
     return {name: model[name] for name in TENSORS_LIST if name in model}
 
 
-def load_vocab(path: Path) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+    print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory. Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
     # be in the parent of that.
     if path.is_dir():
-        path2 = path / "tokenizer.model"
+        vocab_file = "tokenizer.model"
+        if vocabtype == 'bpe':
+          vocab_file = "vocab.json"
+        path2 = path / vocab_file
         # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / "tokenizer.model"
+        path3 = path.parent / vocab_file
         if path2.exists():
             path = path2
         elif path3.exists():
@@ -1214,7 +1243,8 @@ def load_vocab(path: Path) -> SentencePieceVocab:
                 "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
+                              vocabtype)
 
 
 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
@@ -1252,14 +1282,15 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path,
                         help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
     args = parser.parse_args(args_in)
 
     vocab: Vocab
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)
     elif args.vocab_only:
-        vocab = load_vocab(args.vocab_dir or args.model)
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
         assert args.outfile, "need --outfile if using --vocab-only"
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, vocab)
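With the new flag, converting a model whose tokenizer ships as a GPT-2-style vocab.json would be invoked along the lines of `python3 convert.py path/to/model/ --vocabtype bpe` (the path is illustrative); the default `--vocabtype spm` keeps the previous SentencePiece behaviour.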
@@ -1273,7 +1304,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab = model_plus.vocab
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = load_vocab(vocab_dir)
+        vocab = load_vocab(vocab_dir, args.vocabtype)
     params = Params.load(model_plus)
     model = model_plus.model
     model = do_necessary_conversions(model, params)
