#!/bin/bash -v
SRC=en
TRG=de
# number of merge operations
bpe_operations=8000
# path to moses decoder: https://github.com/moses-smt/mosesdecoder
mosesdecoder=
# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
subword_nmt=
# path to PHOENIX-2014-T
phoenix=PHOENIX-2014-T-release-v3/PHOENIX-2014-T/
# path to MuST-C en-de
mustc=en-de/data/train/txt
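# optional guard (not in the original script): mosesdecoder and subword_nmt
# above are left empty as placeholders; fail early if they were not filled in
[ -n "${mosesdecoder}" ] && [ -n "${subword_nmt}" ] \
    || { echo "please set mosesdecoder and subword_nmt above"; exit 1; }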
# extract train/dev/test data from PHOENIX-2014-T
for prefix in train dev test
do
python3 extract_phoenix.py \
${phoenix}/annotations/manual/PHOENIX-2014-T.${prefix}.corpus.csv \
${phoenix}/features/fullFrame-210x260px/ \
${prefix}
cat ${prefix}.src | tr '[:upper:]' '[:lower:]' > ${prefix}.${SRC}.raw
ln -s ${prefix}.tgt ${prefix}.${TRG}
done
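# optional sanity check (an assumption about extract_phoenix.py's outputs,
# see the outputs list at the bottom): each split should yield parallel
# .src/.tgt/.img files with identical line counts
for prefix in train dev test
do
wc -l ${prefix}.src ${prefix}.tgt ${prefix}.img
done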
# link the MuST-C en->de machine translation data
ln -s ${mustc}/train.${SRC} mustc.${SRC}
ln -s ${mustc}/train.${TRG} mustc.${TRG}
# prepend a sentence index to each gloss line; for final evaluation,
# sacreBLEU should be used on detokenized output
awk 'BEGIN{i=0} /.*/{printf "%d %s\n",i,$0; i++}' < train.${SRC}.raw > train.${SRC}
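# each line now carries its index as the first field, e.g. "17 <gloss tokens ...>"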
# tokenize MuST-C: Moses punctuation normalization and tokenization,
# then strip punctuation and lowercase (matching the gloss normalization above)
for prefix in mustc
do
cat ${prefix}.${SRC} \
| ${mosesdecoder}/scripts/tokenizer/normalize-punctuation.perl -l ${SRC} \
| ${mosesdecoder}/scripts/tokenizer/tokenizer.perl -a -l ${SRC} \
| tr -d '[:punct:]' | tr '[:upper:]' '[:lower:]' > ${prefix}.tok.${SRC}
cat ${prefix}.${TRG} \
| ${mosesdecoder}/scripts/tokenizer/normalize-punctuation.perl -l ${TRG} \
| ${mosesdecoder}/scripts/tokenizer/tokenizer.perl -a -l ${TRG} \
| tr -d '[:punct:]' | tr '[:upper:]' '[:lower:]' > ${prefix}.tok.${TRG}
done
# prepend "-1 <aug> " to mark machine translation data
sed -i 's/^/-1 <aug> /' mustc.tok.${SRC}
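# each MuST-C source line now starts with "-1 <aug> ", mirroring the
# "<index> <tokens>" format of the indexed gloss data above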
# combine MuST-C and PHOENIX
for Lang in ${SRC} ${TRG}
do
# oversample the PHOENIX training data 10 times
for i in {1..10}; do cat train.${Lang} >> ${Lang}.txt; done
cat mustc.tok.${Lang} ${Lang}.txt > train.tok.${Lang}
done
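# optional sanity check (not in the original script): the combined corpus
# must stay parallel across both sides
test "$(wc -l < train.tok.${SRC})" = "$(wc -l < train.tok.${TRG})" \
    || echo "WARNING: train.tok.${SRC} and train.tok.${TRG} differ in length"
# split off the first field (the sentence index, or -1 on <aug> lines)
# so that it is not touched by BPE segmentation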
cut -d ' ' -f 2- < train.tok.${SRC} > train.tok.${SRC}.no-first
cut -d ' ' -f 1 < train.tok.${SRC} > train.tok.${SRC}.first
# train BPE
cat train.tok.${SRC}.no-first train.tok.${TRG} | ${subword_nmt}/learn_bpe.py -s ${bpe_operations} > ${SRC}${TRG}.bpe
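# a single joint BPE model is learned over the index-free source side and the
# target side; to inspect the first merges, e.g.: head -n 5 ${SRC}${TRG}.bpe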
# apply BPE
for prefix in dev test
do
${subword_nmt}/apply_bpe.py -c ${SRC}${TRG}.bpe < ${prefix}.${SRC}.raw > ${prefix}.bpe.${SRC}.raw
awk 'BEGIN{i=0} /.*/{printf "%d %s\n",i,$0; i++}' < ${prefix}.bpe.${SRC}.raw > ${prefix}.bpe.${SRC}
${subword_nmt}/apply_bpe.py -c ${SRC}${TRG}.bpe < ${prefix}.${TRG} > ${prefix}.bpe.${TRG}
done
# apply BPE to the training data
prefix=train
${subword_nmt}/apply_bpe.py -c ${SRC}${TRG}.bpe < ${prefix}.tok.${SRC}.no-first > ${prefix}.bpe.${SRC}.no-first
${subword_nmt}/apply_bpe.py -c ${SRC}${TRG}.bpe < ${prefix}.tok.${TRG} > ${prefix}.bpe.${TRG}
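# glue the index/tag field back in front of the BPE-segmented source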
paste -d ' ' ${prefix}.tok.${SRC}.first ${prefix}.bpe.${SRC}.no-first > train.bpe.${SRC}
# shuffle training data
python shuffle_corpus.py --corpus train.bpe.${SRC} train.bpe.${TRG}
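# shuffle_corpus.py is assumed to write train.bpe.${SRC}.shuf and
# train.bpe.${TRG}.shuf with the two sides kept in sync (see outputs below)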
cat ${prefix}.bpe.${SRC}.no-first train.bpe.${TRG} > tmp
# extract vocabulary
python vocab.py tmp vocab.zero
python bpe_vocab_fuse.py ${SRC}${TRG}.bpe vocab.zero vocab.zero.drop
rm tmp
# outputs:
# - train/dev/test.src <= gloss files
# - train/dev/test.tgt <= translation files
# - train/dev/test.img <= sign video paths
# - vocab.zero.drop <= vocabulary considering BPE-dropout
# - train/dev/test.bpe.en/de <= BPE-segmented files
# - train.bpe.en/de.shuf <= randomly shuffled training data, used for training