launch.py
import os
import sys
import requests
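
# Overview: this script downloads one of the pre-converted Distributed Llama models
# and its tokenizer from Hugging Face, writes a run_<model>.sh helper script, and can
# optionally build and launch ./dllama with the downloaded files.
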
# ['model-url', 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type']
MODELS = {
    'tinyllama_1_1b_3t_q40': [
        'https://huggingface.co/b4rtaz/TinyLlama-1.1B-3T-Distributed-Llama/resolve/main/dllama_model_tinylama_1.1b_3t_q40.m?download=true',
        'https://huggingface.co/b4rtaz/TinyLlama-1.1B-3T-Distributed-Llama/resolve/main/dllama_tokenizer_tinylama_1.1b_3t.t?download=true',
        'q40', 'q80', 'base'
    ],
    'llama3_8b_q40': [
        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Distributed-Llama/resolve/main/dllama_model_meta-llama-3-8b_q40.m?download=true',
        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_llama3.t?download=true',
        'q40', 'q80', 'base'
    ],
    'llama3_8b_instruct_q40': [
        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_lama3_instruct_q40.m?download=true',
        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3.t?download=true',
        'q40', 'q80', 'chat'
    ]
}
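
# Illustrative invocation (the model name may use '-' or '_'; see the entry point below):
#   python launch.py llama3_8b_instruct_q40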

def downloadFile(url: str, path: str):
    # If the file already exists, ask before re-downloading it.
    if (os.path.isfile(path)):
        fileName = os.path.basename(path)
        result = input(f'❓ {fileName} already exists, do you want to download again? ("Y" if yes): ')
        if (result.upper() != 'Y'):
            return
    response = requests.get(url, stream=True)
    response.raise_for_status()
    print(f'📄 {url}')
    lastSize = 0
    with open(path, 'wb') as file:
        # Stream the download in 4 KB chunks and report progress roughly every 8 MB.
        for chunk in response.iter_content(chunk_size=4096):
            file.write(chunk)
            size = file.tell() // 1024
            if (size - lastSize >= 8192):
                sys.stdout.write("\rDownloaded %i kB" % size)
                lastSize = size
    sys.stdout.write(' ✅\n')

def download(modelName: str, model: list):
    dirPath = os.path.join('models', modelName)
    print(f'📀 Downloading {modelName} to {dirPath}...')
    os.makedirs(dirPath, exist_ok=True)
    modelUrl = model[0]
    tokenizerUrl = model[1]
    modelPath = os.path.join(dirPath, f'dllama_model_{modelName}.m')
    tokenizerPath = os.path.join(dirPath, f'dllama_tokenizer_{modelName}.t')
    downloadFile(modelUrl, modelPath)
    downloadFile(tokenizerUrl, tokenizerPath)
    print('📀 All files are downloaded')
    return (modelPath, tokenizerPath)
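
# Layout produced by download() for e.g. 'llama3_8b_q40' (illustrative):
#   models/llama3_8b_q40/dllama_model_llama3_8b_q40.m
#   models/llama3_8b_q40/dllama_tokenizer_llama3_8b_q40.t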

def writeRunFile(modelName: str, command: str):
    filePath = f'run_{modelName}.sh'
    with open(filePath, 'w') as file:
        file.write('#!/bin/sh\n')
        file.write('\n')
        file.write(f'{command}\n')
    return filePath
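
# Example run_<modelName>.sh produced above (the command itself is assembled in the
# entry point below; shown here only as an illustration):
#   #!/bin/sh
#
#   ./dllama chat --model models/llama3_8b_instruct_q40/dllama_model_llama3_8b_instruct_q40.m --tokenizer models/llama3_8b_instruct_q40/dllama_tokenizer_llama3_8b_instruct_q40.t --buffer-float-type q80 --nthreads 4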

def printUsage():
    print('Usage: python launch.py <model>')
    print('Available models:')
    for model in MODELS:
        print(f' {model}')

if __name__ == '__main__':
    if (len(sys.argv) < 2):
        printUsage()
        exit(1)
    # Work relative to the directory that contains this script.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    modelName = sys.argv[1].replace('-', '_')
    if modelName not in MODELS:
        print(f'Model is not supported: {modelName}')
        exit(1)
    model = MODELS[modelName]
    (modelPath, tokenizerPath) = download(modelName, model)
    # model[4] is the model type: 'chat' models get the interactive chat command,
    # everything else gets a short inference run.
    if (model[4] == 'chat'):
        command = './dllama chat'
    else:
        command = './dllama inference --steps 64 --prompt "Hello world"'
    command += f' --model {modelPath} --tokenizer {tokenizerPath} --buffer-float-type {model[3]} --nthreads 4'
    print('To run Distributed Llama, you need to execute:')
    print('--- copy start ---')
    print()
    print(command)
    print()
    print('--- copy end -----')
    runFilePath = writeRunFile(modelName, command)
    print(f'🌻 Created {runFilePath} script for easy running')
    result = input('❓ Do you want to run Distributed Llama? ("Y" if yes): ')
    if (result.upper() == 'Y'):
        if (not os.path.isfile('dllama')):
            os.system('make dllama')
        os.system(command)