-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_create_corpus.py
executable file
·37 lines (31 loc) · 1.07 KB
/
run_create_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/env python3
import argparse
from utils.files import read_file
from text.dataset import create_corpus
def build_parser():
    """Build the command-line parser for the corpus-creation script.

    Returns:
        An ``argparse.ArgumentParser`` accepting one or more positional
        input ``files``, a mandatory ``--lang`` (one of ``pt``, ``en``,
        ``es``) and an optional ``--output`` path (default ``corpus.tok``).
    """
    parser = argparse.ArgumentParser(description="Process datasets to create a corpus")
    parser.add_argument("files", nargs="+", help="List of text files to read")
    parser.add_argument(
        "--lang",
        type=str,
        choices=["pt", "en", "es"],
        # Must be a real boolean: the string "True" only worked because any
        # non-empty string is truthy.
        required=True,
        help="Language of the text files",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="corpus.tok",
        help="Name of the output file",
    )
    return parser


def main(argv=None):
    """Read the input files, clean their sentences and write the corpus.

    Args:
        argv: Argument list to parse; ``None`` makes argparse fall back to
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # read_file() yields the sentences of one file; they are pooled so that
    # create_corpus() sees the whole dataset in a single call.
    sentences = []
    print(f"Reading {len(args.files)} files")
    for path in args.files:
        sentences.extend(read_file(path))

    print(f"Cleaning {len(sentences)} sentences")
    cleaned_sentences = create_corpus(sentences, args.lang)

    # Explicit UTF-8 so accented pt/es text is written identically on every
    # platform instead of depending on the locale's default encoding.
    # NOTE(review): assumes the corpus is meant to be UTF-8 - confirm against
    # whatever consumes the output file.
    with open(args.output, "w", encoding="utf-8") as out:
        print(f"Writing {len(cleaned_sentences)} cleaned sentences to {args.output}")
        # One sentence per line.
        out.writelines(sentence + "\n" for sentence in cleaned_sentences)


if __name__ == "__main__":
    main()