run_opendomain_ft_100k_xx-to-English.sh
#!/bin/bash
# Environment setup: uncomment the lines below to install dependencies (add your conda path first)
# source <<conda_path_need_to_add>>/etc/profile.d/conda.sh
# conda create -n lrl_nmt_mbart50_ft python=3.7.11
# conda activate lrl_nmt_mbart50_ft
# echo $CONDA_PREFIX
# pip install -r requirements.txt
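# Map each language name to the mBART-50 language code used for fine-tuning. Kannada, Yoruba
# and Irish have no mBART-50 code of their own, so they appear to be mapped to surrogate tags
# of supported languages (te_IN, sw_KE and fr_XX respectively).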
declare -A language_list=(
    ["Afrikaans"]="af_ZA" ["French"]="fr_XX"  ["Hindi"]="hi_IN"
    ["Kannada"]="te_IN"   ["Sinhala"]="si_LK" ["Tamil"]="ta_IN"
    ["Xhosa"]="xh_ZA"     ["Yoruba"]="sw_KE"  ["Irish"]="fr_XX"
)
function train () {
    lang_tag=${language_list[$1]}
    printf "Fine-tuning for language: $1, mbart_notation: ${lang_tag}\n"
    train_set_path="Datasets/$1/train/cc_aligned/100k/train.json"
    printf "Train directory: ${train_set_path}\n"
    dev_set_path="Datasets/$1/dev/flores/dev.json"
    printf "Dev directory: ${dev_set_path}\n"
    test_set_path="Datasets/$1/test/flores/test.json"
    printf "Test directory: ${test_set_path}\n"
    model_save_path_xx_to_en="100k_${lang_tag}-en_ccaligned_model"
    printf "******************** Start training ${lang_tag} to en_XX direction ********************\n"
    python run_translation.py \
        --model_name_or_path facebook/mbart-large-50 \
        --do_train \
        --do_eval \
        --train_file ${train_set_path} \
        --validation_file ${dev_set_path} \
        --test_file ${test_set_path} \
        --source_lang ${lang_tag} \
        --target_lang en_XX \
        --output_dir ${model_save_path_xx_to_en} \
        --per_device_train_batch_size=10 \
        --per_device_eval_batch_size=10 \
        --overwrite_output_dir \
        --predict_with_generate \
        --forced_bos_token en_XX \
        --save_steps 50000 \
        --num_beams 10 \
        --do_predict \
        --max_source_length 200 \
        --max_target_length 200
printf "******************** Finished training ${lang_tag} to en_XX direction ********************\n"
printf "******************** Testing on other domain-speific test sets for ${lang_tag} to en_XX direction ********************\n"
for test_file_path in Datasets/$1/test/*/test.json
do
printf "Reading directory: ${test_file_path}\n"
        IFS='/' read -ra path_array <<< "${test_file_path}"
        printf "******************** Start TESTING on: ${path_array[3]} domain for ${lang_tag} to en_XX direction ********************\n"
        dev_set_path="Datasets/$1/dev/${path_array[3]}/dev.json"
        printf "Dev directory: ${dev_set_path}\n"
        printf "Test directory: ${test_file_path}\n"
        model_test_save_path_xx_to_en="100k_${lang_tag}-en_ccaligned_model_test_on_${path_array[3]}"
        python run_translation.py \
            --model_name_or_path ${model_save_path_xx_to_en} \
            --do_eval \
            --validation_file ${dev_set_path} \
            --test_file ${test_file_path} \
            --source_lang ${lang_tag} \
            --target_lang en_XX \
            --output_dir ${model_test_save_path_xx_to_en} \
            --per_device_train_batch_size=10 \
            --per_device_eval_batch_size=10 \
            --overwrite_output_dir \
            --predict_with_generate \
            --forced_bos_token en_XX \
            --save_steps 50000 \
            --num_beams 10 \
            --do_predict \
            --max_source_length 200 \
            --max_target_length 200
printf "******************** Finished TESTING on: ${path_array[3]} domain for ${lang_tag} to en_XX direction ********************\n"
done
printf "******************** END of ${lang_tag} to en_XX direction ********************\n"
}
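
# Example invocations (illustrative; the -l flag is parsed below):
#   bash run_opendomain_ft_100k_xx-to-English.sh -l Sinhala   # fine-tune a single language
#   bash run_opendomain_ft_100k_xx-to-English.sh -l "*"       # fine-tune every language with a 100k CCAligned train set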
# start training
while getopts l: flag
do
    case "${flag}" in
        l) language=${OPTARG};;
    esac
done
if [ -z "$language" ]
then
    echo "Language is not provided (use -l <Language> or -l \"*\")"
elif [ "$language" == "*" ]
then
    echo "Language is the wildcard (*), so training all supported languages"
    for data_file_path in Datasets/*/train/cc_aligned/100k/train.json
    do
        printf "Reading directory: ${data_file_path}\n"
        IFS='/' read -ra path_array <<< "${data_file_path}"
        train "${path_array[1]}"
    done
else
    echo "Provided input language: $language"
    if [ -z "${language_list[$language]}" ]
    then
        echo "Unsupported input language: $language"
    else
        train "$language"
    fi
fi