tokenize_sents.py (forked from artetxem/undreamt)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Tokenize and normalize sentences (preprocessing for BLEU scoring).
"""
import argparse
import re
import string
from unicodedata import normalize

from euskalToken import EuskalToken
from nltk.tokenize.treebank import TreebankWordTokenizer


def main():
    parser = argparse.ArgumentParser(description="Tokenize sentences")
    parser.add_argument("file", help="Ground truth filename")
    parser.add_argument("lang", choices=['en', 'eu'])
    args = parser.parse_args()

    # Basque uses the project's EuskalToken tokenizer; English falls back
    # to NLTK's Treebank tokenizer.
    word_tokenizer = EuskalToken() if args.lang == 'eu' else TreebankWordTokenizer()

    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))

    with open(args.file, 'r', encoding='utf8') as file:
        for line in file:
            line = line.rstrip()
            # normalize unicode characters and drop non-ASCII ones
            line = normalize('NFD', line).encode('ascii', 'ignore').decode('ascii')
            # tokenize
            tokens = word_tokenizer.tokenize(line)
            # convert to lower case
            tokens = [word.lower() for word in tokens]
            # remove non-printable chars from each token
            tokens = [re_print.sub('', w) for w in tokens]
            print(" ".join(tokens))


if __name__ == '__main__':
    main()
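
Example usage (the corpus file names below are hypothetical):

    python tokenize_sents.py corpus.en en > corpus.tok.en
    python tokenize_sents.py corpus.eu eu > corpus.tok.eu

As a minimal sketch of what the per-line cleanup does, the snippet below applies the English (Treebank) path to a made-up sentence; EuskalToken is not needed for this illustration, and the sample sentence and expected output are only illustrative:

    import re
    import string
    from unicodedata import normalize
    from nltk.tokenize.treebank import TreebankWordTokenizer

    re_print = re.compile('[^%s]' % re.escape(string.printable))
    line = "Donostia's café is open."
    # NFD-decompose, then drop non-ASCII bytes (the accent on "café" disappears)
    line = normalize('NFD', line).encode('ascii', 'ignore').decode('ascii')
    # tokenize, lowercase, and strip non-printable characters from each token
    tokens = [re_print.sub('', w.lower()) for w in TreebankWordTokenizer().tokenize(line)]
    print(" ".join(tokens))  # expected (roughly): donostia 's cafe is open .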