import os
### Inserts a new language code into an existing, alphabetically sorted language code list, keeping the order intact
def add_new_lang(lang_list, new_lang):
    new_list = []
    new_lang_added = False
    for lang in lang_list:
        if lang < new_lang:
            new_list.append(lang)
        elif lang == new_lang:
            # The code is already present, so keep the list as it is
            new_list.append(lang)
            new_lang_added = True
        elif not new_lang_added:
            new_list.append(new_lang)
            new_list.append(lang)
            new_lang_added = True
        else:
            new_list.append(lang)
    if not new_lang_added:
        # The new code sorts after every existing entry
        new_list.append(new_lang)
    return new_list
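### Illustrative usage of add_new_lang (the codes below are just an example, not project data):
###     add_new_lang(['bn', 'hi', 'ta'], 'gu')  ->  ['bn', 'gu', 'hi', 'ta']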
### The path of the scratch directory. All the other paths are relative to this
path_scratch = '/scratch/cse/btech/cs1170339'
### All 14 languages initially considered by us
langs = ['as', 'bn', 'en', 'gu', 'hi', 'kn', 'ml', 'mni', 'mr', 'or', 'pa', 'ta', 'te', 'ur']
### The paths for the major datasources obtained from OPUS
path_opus = os.path.join(path_scratch, 'opus')
path_jw300 = os.path.join(path_opus, 'jw300')
path_subtitles = os.path.join(path_opus, 'subtitles')
path_opus100 = os.path.join(path_opus, 'opus100')
path_gnome = os.path.join(path_opus, 'gnome')
path_ubuntu = os.path.join(path_opus, 'ubuntu')
path_wikimatrix = os.path.join(path_opus, 'wikimatrix')
path_kde4 = os.path.join(path_opus, 'kde4')
path_tanzil = os.path.join(path_opus, 'tanzil')
path_ted2020 = os.path.join(path_opus, 'ted2020')
### The paths for the Modi and related datasources
path_pib = os.path.join(path_scratch, 'pib-v1.3')
path_pmi = os.path.join(path_scratch, 'pmi-v1')
path_mkb = os.path.join(path_scratch, 'mkb-v0')
### Path for the Indo WordNet corpus
path_iwn = os.path.join(path_scratch, 'indo-wordnet-v0.2')
### The paths for the miscellaneous resources
path_misc = os.path.join(path_scratch, 'misc')
path_wikititles_misc = os.path.join(path_misc, 'wikititles-temp')
path_alt = os.path.join(path_misc, 'alt-corpus')
path_ufal = os.path.join(path_misc, 'ufal-en-ta')
path_odi_en = os.path.join(path_misc, 'odi-en-v2.0')
path_en_urdu = os.path.join(path_misc, 'en-ur-charles')
path_wiki_turk = os.path.join(path_misc, 'wiki-turk')
path_iitb = os.path.join(path_misc, 'iitb')
path_bible_uedin = os.path.join(path_misc, 'bible-uedin')
### A list of all the dataset paths
datapath_list = [path_jw300, path_subtitles, path_opus100, path_gnome, path_ubuntu, path_wikimatrix, path_kde4, path_tanzil, path_ted2020, path_pib, path_pmi, path_mkb, path_iwn, \
path_wikititles_misc, path_alt, path_ufal, path_odi_en, path_en_urdu, path_wiki_turk, path_iitb, path_bible_uedin]
### Path of the final dataset
path_indic_parallel = os.path.join(path_scratch, 'indic_parallel')
path_indic_dataset = os.path.join(path_indic_parallel, 'train_data')
path_indic_test = os.path.join(path_indic_parallel, 'test_data')
path_indic_dev = os.path.join(path_indic_parallel, 'dev_data')
### Additional paths that would be needed for getting the test data
path_test_cluster = os.path.join(path_indic_test, 'test_cluster')
### Different tasks to be carried out
### Note: A base dataset means that the files in the dataset are taken as they are, i.e. without any cleaning
get_base_stats = 'get_base_stats' # Get the stats of all the base datasets present
clean_base_data = 'clean_base_data' # Clean the training data present in the base datasets
get_clean_stats = 'get_clean_stats' # Get the stats of the files after having the dataset cleaned
merge_clean_data = 'merge_cleaned_data' # Merge the cleaned data for the different pairs into the final dataset
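### A driver script is expected to compare its requested task against the constants above, roughly as
### sketched below (illustrative only; run_task_cleaning is a hypothetical helper, not defined in this project):
###     if task == clean_base_data:
###         run_task_cleaning(datapath_list)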
### The different file names to be considered
base_train_name = 'overall' # The name of the training data of the base datasets (e.g. overall.hi in the en-hi corpus)
clean_train_name = 'overall_clean' # The name of the cleaned training data corresponding to the datasets
merge_train_name = 'train' # The name of the training data of the overall dataset
wat_test_name = 'test' # The name of the test data of the overall dataset obtained from the WAT test set
wat_dev_name = 'dev' # The name of the dev data for the overall dataset obtained from the WAT dev set
saved_base_stats = 'base_stats.json' # The name of the file where the stats for the base datasets will be stored
saved_clean_stats = 'clean_stats.json' # The name of the file where the stats for the cleaned datasets will be stored
saved_merge_stats = 'merge_stats.json' # The name of the file where the stats for the overall merged data will be stored
repeated_lines_check = 'repeated_lines.txt' # The name of the file that contains the indices of occurrence for every pair in the requested dataset (see check_rep.py)
saved_test_cluster = 'test_cluster.txt' # The name of the file where the tab-separated test cluster is saved
### Some miscellaneous parameters introduced (Not present in the other configuration files)
pmi_data_name = 'overall_pmi' # The name of the data downloaded from the PMI source, which contains the WAT test and dev sets as well
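### Under these naming conventions, a concrete training file would look roughly as follows
### (the en-hi pair subdirectory is assumed here, based on the note next to base_train_name):
###     os.path.join(path_jw300, 'en-hi', base_train_name + '.hi')   # -> .../opus/jw300/en-hi/overall.hi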
########################################################################################################################################################################################################################################################
##### The following lines correspond to the addition of a new dataset/language to the system. Add any new datasets you want in the same way as is done in this section
##### Here, we add resources for the Sanskrit language, making it the 15th language of our dataset
### Some important initialisations for this process
path_addition = os.path.join(path_scratch, 'addition')
datapath_list_added = []
### Adding Sanskrit to the list of languages
langs = add_new_lang(langs, 'sa')
### Different paths of the datasets that were added for Sanskrit
path_iwn_sans = os.path.join(path_addition, 'indo-wordnet-sans-v0.2')
path_ramayana_sans = os.path.join(path_addition, 'ramayana-sans-iitk')
path_ramcharitmanas_sans = os.path.join(path_addition, 'ramcharitmanas-sans')
path_sanskrit_bible = os.path.join(path_addition, 'sanskrit-bible')
### Appending these entries to the datapath list
datapath_list.append(path_iwn_sans)
datapath_list.append(path_ramayana_sans)
datapath_list.append(path_ramcharitmanas_sans)
datapath_list.append(path_sanskrit_bible)
### Adding these entries to a separate datapath list for the new additions
datapath_list_added = [path_iwn_sans, path_ramayana_sans, path_ramcharitmanas_sans, path_sanskrit_bible]
### The list of all the different books that were present in Sanskrit-Bible
bible_books = ['1_corinthians', '1_john', '1_peter', '1_thessalonians', '1_timothy', '2_corinthians', '2_john', '2_peter', '2_thessalonians', '2_timothy', \
'3_john', 'acts', 'colossians', 'ephesians', 'galatians', 'hebrews', 'james', 'john', 'jude', 'luke', 'mark', 'matthew', 'philemon', \
'philippians', 'revelation', 'romans', 'titus']
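### Each book is expected to have its own files under path_sanskrit_bible; an illustrative way to
### enumerate them (the exact directory layout is an assumption):
###     [os.path.join(path_sanskrit_bible, book) for book in bible_books]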
### The different extra file names that would be extracted corresponding to this new addition
word_translation_name = 'overall_word' # The name of the training data extracted for word-word mappings
shloka_translation_name = 'overall_shloka' # The name of the training data extracted for shloka translations
language_bible_name = 'overall_bible' # The name of the files where the Sanskrit-Bible files are parsed and extracted
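### Optional sanity check (not part of the original pipeline): running this file directly prints a few
### derived values so that the configuration can be eyeballed quickly
if __name__ == '__main__':
    print('Languages (%d): %s' % (len(langs), langs))
    print('Total dataset paths: %d' % len(datapath_list))
    print('Newly added dataset paths: %s' % datapath_list_added)
    print('Training data directory: %s' % path_indic_dataset)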