Skip to content

Commit

Permalink
fix: converters.
Browse files Browse the repository at this point in the history
  • Loading branch information
b4rtaz committed Feb 25, 2025
1 parent f8113c1 commit 47f3ac1
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 12 deletions.
2 changes: 1 addition & 1 deletion converter/convert-hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def __preparePlan(self):
p.append([FloatType.F32,
'model.norm.weight'])
p.append([wt,
'lm_head.weight'])
'lm_head.weight', 'model.embed_tokens.weight'])

def write(self, outputFile: str):
self.__preparePlan()
Expand Down
24 changes: 15 additions & 9 deletions converter/convert-tokenizer-hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, dirPath, tokenizerConfig):
self.dirPath = dirPath
self.tokenizerConfig = tokenizerConfig
self.bosId = None
self.eosId = None
self.eosIds = None
self.tokens = []
self.scores = []

Expand All @@ -47,13 +47,18 @@ def resolvePreTrainedTokenizerFast(self):
self.scores.append(-float(i))

self.bosId = tokenizer.bos_token_id
self.eosId = tokenizer.eos_token_id
if (tokenizer.eos_token_id):
self.eosIds = [tokenizer.eos_token_id]
if (self.bosId is None or self.eosId is None):
config = openJson(os.path.join(self.dirPath, 'config.json'))
if (self.bosId is None):
self.bosId = config['bos_token_id']
if (self.eosId is None):
self.eosId = config['eos_token_id']
if (self.eosIds is None):
self.eosIds = config['eos_token_id']
if isinstance(self.eosIds, list):
self.eosIds = self.eosIds[:2] # TODO: add support more than 2 eos ids
else:
self.eosIds = [self.eosIds]

def resolveLlamaTokenizer(self):
modelPath = os.path.join(self.dirPath, 'tokenizer.model')
Expand Down Expand Up @@ -100,10 +105,11 @@ def printUsage():
resolver = TokensResolver(dirPath, tokenizerConfig)
resolver.resolve()

if (resolver.bosId is None or resolver.eosId is None):
raise Exception('Cannot resolve bosId or eosId')
if (resolver.bosId is None or resolver.eosIds is None):
raise Exception('Cannot resolve bosId or eosIds')
print(f'bosId: {resolver.bosId} ({resolver.tokens[resolver.bosId]})')
print(f'eosId: {resolver.eosId} ({resolver.tokens[resolver.eosId]})')
for eosId in resolver.eosIds:
print(f'eosId: {eosId} ({resolver.tokens[eosId]})')

chatTemplate = None
chatExtraStop = None
Expand All @@ -117,7 +123,7 @@ def printUsage():
with open(outputFileName, 'wb') as outputFile:
writer.writeTokenizer(outputFile, {
'bos_id': resolver.bosId,
'eos_id': resolver.eosId,
'chat_eos_id': resolver.eosId,
'eos_id': resolver.eosIds[0],
'chat_eos_id': resolver.eosIds[1 if len(resolver.eosIds) > 1 else 0],
}, resolver.tokens, resolver.scores, chatTemplate, chatExtraStop)
print(f'✅ Created {outputFileName}')
6 changes: 4 additions & 2 deletions converter/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,11 @@ def writeHeader(file, params):
if key in headerKeys:
data += struct.pack('ii', headerKeys[key], params[key])
else:
print(f'Unknown header key: {key}')
print(f'Warning: Unknown header key: {key}')

header += struct.pack('i', len(header) * 2 + len(data))
file.write(header)
file.write(data)
print(params)
for key in params:
print(f'🎓 {key}: {params[key]}')
print()

0 comments on commit 47f3ac1

Please sign in to comment.