-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathbpe_vocab_fuse.py
47 lines (33 loc) · 922 Bytes
/
bpe_vocab_fuse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# coding: utf-8
import sys
import codecs
from collections import OrderedDict
def fuse_vocab(code_lines, vocab_lines):
    """Merge BPE merge rules into an existing vocabulary.

    Args:
        code_lines: iterable of lines from the BPE codes file. The first
            line is always skipped (presumably the ``#version`` header
            written by the BPE learner -- confirm for header-less files).
            Every other non-blank line must contain exactly two
            whitespace-separated fields.
        vocab_lines: iterable of lines from the vocabulary file; each
            stripped line is taken as one token.

    Returns:
        A list of tokens in output order: the vocabulary tokens first,
        then the merged pairs that were not yet present, then the single
        characters seen in the merge rules (each followed by its ``@@``
        form) that were not yet present.

    Raises:
        ValueError: if a non-blank code line does not have exactly two
            fields.
    """
    # Used as an ordered set; the value is the index at insertion time.
    tokens = OrderedDict()

    def add(token):
        if token not in tokens:
            tokens[token] = len(tokens)

    for line in vocab_lines:
        add(line.strip())

    # Characters are kept in first-seen order. Iterating a plain set here
    # would make the output order depend on string-hash randomization and
    # therefore differ from run to run.
    seen_chars = OrderedDict()

    for index, line in enumerate(code_lines):
        if index == 0:
            continue
        fields = line.strip().split()
        if not fields:
            # A blank (e.g. trailing) line carries no merge rule.
            continue
        left, right = fields
        right_plain = right.replace('</w>', '')
        merged = left + right_plain
        if '</w>' not in right:
            # No '</w>' marker on the right part: emit the '@@'-suffixed form.
            merged += '@@'
        add(merged)
        for char in left + right_plain:
            seen_chars[char] = True

    for char in seen_chars:
        # NOTE(review): the '@@' form is only added together with a newly
        # added bare character; a character that is already in the
        # vocabulary does not get its '@@' form added here -- confirm this
        # is intended.
        if char not in tokens:
            add(char)
            add(char + '@@')

    return list(tokens)


def main(argv):
    """Read the codes and vocabulary files and write the fused vocabulary.

    Args:
        argv: command-line arguments; ``argv[1]`` is the codes file,
            ``argv[2]`` the vocabulary file and ``argv[3]`` the output file.
            All files are read and written as UTF-8.
    """
    if len(argv) < 4:
        sys.exit('usage: %s CODES VOCAB OUTPUT' % argv[0])

    # Context managers make sure every handle is closed, even on error.
    with codecs.open(argv[1], 'r', encoding='utf-8') as codes_file:
        code_lines = codes_file.readlines()
    with codecs.open(argv[2], 'r', encoding='utf-8') as vocab_file:
        vocab_lines = vocab_file.readlines()

    fused = fuse_vocab(code_lines, vocab_lines)

    with codecs.open(argv[3], 'w', encoding='utf-8') as writer:
        for token in fused:
            writer.write(token + '\n')


if __name__ == '__main__':
    main(sys.argv)