conversion_conll.py
# -*- coding: utf-8 -*-
"""Conversion_CONLL.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1pP9ciu96kMf1cuJ5RY4k_9YwkoiXNNYH
This script converts the URS and XML formats of the original DEMOCRAT corpus
into the more common CoNLL format used for coreference annotation.
"""
import glob, os, re, sys
import pandas as pd
import spacy
import lxml.etree
from bs4 import BeautifulSoup as bs
"""
Description of the final conll file
Column Type Description </br>
1. Document ID
> This is a variation on the document filename</br>
2. Part number
> Some files are divided into multiple parts numbered as 000, 001, 002, ... etc.</br>
3. Word number
4. Word itself
> This is the token as segmented/tokenized in the Treebank. Initially the *_skel file contain the placeholder [WORD] which gets replaced by the actual token from the Treebank which is part of the OntoNotes release.
5. Part-of-Speech
6. Parse bit
> This is the bracketed structure broken before the first open parenthesis in the parse, and the word/part-of-speech leaf replaced with a *. The full parse can be created by substituting the asterix with the "([pos] [word])" string (or leaf) and concatenating the items in the rows of that column. When the parse information is missing, the first word of a sentence is tagged as "(TOP*" and the last word is tagged as "*)" and all intermediate words are tagged with a "*".
7. Predicate lemma
> The predicate lemma is mentioned for the rows for which we have semantic role information. All other rows are marked with a "-"
8. Predicate Frameset ID
> This is the PropBank frameset ID of the predicate in Column 7.
9. Word sense
> This is the word sense of the word in Column 3.
10. Speaker/Author
>This is the speaker or author name where available. Mostly in Broadcast Conversation and Web Log data.
11. Named Entities
>These columns identifies the spans representing various named entities.
12. :N Predicate Arguments
> There is one column each of predicate argument structure information for the predicate mentioned in Column 7.
* N Coreference Coreference
> chain information encoded in a parenthesis structure.
Transformation
"""
class DocumentConllConverter():
'''
Converter object that takes two tree objects (URS and XML) and extracts information for the columns described above
'''
def __init__(self,xml_tree,urs_tree,document_name,convert_pos):
self.xml_tree = xml_tree
self.urs_tree = urs_tree
self.part_number , self.word_number = 0,0
self.speaker = '_'
self.sentence_starts ,self.sentence_ends = True, False
self.sentence_number = 1
self.unclosed_corefs , self.mentions_spans = set(), []
self.convert_pos = convert_pos
self.mention_unique = re.compile(r"#u-MENTION-\d+$")
genres ={'non narratif':"nn", 'mixte (narratif et non narratif)':"mi", 'narratif':"na"}
genre = DF_METADATA[DF_METADATA.fichier==document_name].iat[0,5]
self.document_id = genres.get(genre)+'_'+document_name
self.output_string = ''
self.filter_coreference_spans()
def filter_coreference_spans(self):
'''
Keep a single mention annotation per (from, to) span: the spans are traversed in reverse document order, so the last duplicate wins.
'''
self.unique_mentions = {}
path_mentions = './standOff/annotations[@type="coreference"]/annotationGrp[@subtype="MENTION"]/span'
all_mentions = self.urs_tree.xpath(path_mentions)
for mention in all_mentions[::-1]:
span = (mention.get("from"), mention.get('to'))
if span not in self.unique_mentions:
self.unique_mentions[span] = mention.get('id')
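# Example of the deduplication above (ids and offsets are hypothetical): if two
# MENTION spans both have from="text:w_42" and to="text:w_45", only the id of the
# last one in document order is kept, because the spans are traversed in reverse
# and the first id seen for a given (from, to) pair wins.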
def from_xml_to_conll(self,output_file,print_frequency=10,document_split_frequency=15,seps={".","?","!",";"}):
'''
write the final conll file to the given output_file
Arguments :
print_frequency: progress is printed print_frequency times over the document (every 1/print_frequency of the tokens)
document_split_frequency: number of sentences of the original document that are grouped into one document of the new conll file (see the worked example below).
Necessary to prevent overly long coreference chains that would require too many resources for later computation
seps: sentence separators, i.e. characters at which a sentence can end and a new sentence / new document can start
'''
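# e.g. with document_split_frequency=15, a 40-sentence source document is written
# as three conll documents: part 0 (sentences 1-15), part 1 (16-30), part 2 (31-40)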
self.sentence_seps = seps
self.output_string+=f"#begin document ({self.document_id}); part {self.part_number}\n"
tokens = self.xml_tree.findall('/{*}text/{*}s/{*}w')
token_number = len(tokens)
print_period = max(1, token_number//print_frequency)  # at least 1 to avoid a modulo by zero on very short documents
for i, token in enumerate(tokens):
if (i+1) % (print_period) == 0:
print(round((i+1)/token_number*100,1), "%")
word_id = token.get('id')
if self.sentence_starts:
self.word_number = 0
word = token.find('./{*}form').text.replace("’","'")
if word.strip('-'):
word = word.strip('-')
#print(word)
part_of_speech = token.xpath('./*[contains(@type,"pos")]')[0].text
# pos, fropos, frpos, type
#part_of_speech =token.find('./{*}ana').text
coreference = self.get_coreference(word_id)
parse_bit = self.make_parse_bit(word,part_of_speech)
predicate_lemma,predicate_frameset_id,part_of_speech= self.get_predicate_lemma(token,part_of_speech)
part_of_speech = self.from_frpos_to_upos(part_of_speech)
word_sense = '_'
named_entities = '_'
predicate_arguments = '_'
speaker = '_'
line = (' '*10).join([self.document_id,str(self.part_number),str(self.word_number),
word,part_of_speech,parse_bit,
predicate_lemma,str(predicate_frameset_id),
word_sense,speaker,named_entities,
predicate_arguments,coreference])
self.output_string += line+'\n'
self.word_number +=1
if self.sentence_ends:
self.sentence_starts = True
self.sentence_ends = False
self.mentions_spans = []
if i< len(tokens) -1 and self.sentence_number % document_split_frequency == 0 :
self.part_number+=1
self.output_string +=f"#end document\n\n#begin document ({self.document_id}); part {self.part_number}\n"
self.sentence_number=1
else:
self.output_string +='\n'
self.sentence_number+=1
else:
self.sentence_starts = False
self.output_string+="#end document\n"
with open(output_file,'w',encoding='utf8') as output:
    output.write(self.output_string)
def from_frpos_to_upos(self,pos):
'''
maps french part of speech to their universal counterparts
'''
matchpos = re.match("[A-Z]+",pos)
if pos.startswith("CONcoo"):
upos = 'CCONJ'
elif pos.startswith("CONsub"):
upos = 'SCONJ'
elif matchpos:
upos = matchpos.group(0)
else:
upos = pos
if upos in self.convert_pos:
upos= self.convert_pos[upos]
return upos
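# A few illustrative conversions (CONcoo and CONsub come from the checks above;
# the other tag strings are assumptions about the corpus tag inventory):
#   "CONcoo" -> "CCONJ"
#   "CONsub" -> "SCONJ"
#   "VERinf" -> "VER" -> "VERB"  (leading capitals, then remapped via convert_pos)
#   "NOMcom" -> "NOM" -> "NOUN"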
def get_predicate_lemma(self,token,pos):
predicate_frameset_id = 0
if pos == 'ABR':
doc = NLP(token.find('./{*}form').text)  # let spaCy guess the part of speech of the abbreviated form
pos = doc[0].pos_
if pos in ['VERB','AUX']:
lemma = token.xpath('./*[contains(@type,"lemma")]')[0].text
else:
lemma = '_'
return lemma,predicate_frameset_id,pos
def get_coreference(self,word_id):
'''
Get the coreference chains of the token if there are any, otherwise return '_'.
Returns a string with all the chain ids the token belongs to, following the conll format.
'''
path_begin = './standOff/annotations[@type="coreference"]/annotationGrp[@subtype="MENTION"]/'
path_from_to = f'span[{ends_with("from",word_id)} or {ends_with("to",word_id,)}]'
spans = self.urs_tree.xpath(path_begin+path_from_to)
coreferences = []
#print(spans)
if spans :
for span in spans:
if span.get('id') not in self.unique_mentions.values():
continue
chains = self.urs_tree.xpath(path_begin.replace("MENTION","CHAINE")+f'link[{ends_with("target",span.get("id"))} or contains(@target,"{span.get("id")} ")]')
#print("SPAN",word_id,"from"+span.get("from"),"to"+span.get("to"), chains)
for chain in chains:
if self.mention_unique.match(chain.get("target")):
continue
coreference = ''
chain_id = chain.get('id')
chain_number = re.search(r'\d+',chain_id).group()
if span.get('from')==f"text:{word_id}":
coreference += '('
coreference += chain_number
if span.get('to')==f"text:{word_id}":
coreference +=')'
if re.match(r"\d+\)", coreference) and chain_number in self.unclosed_corefs:
self.unclosed_corefs.remove(chain_number)
elif re.match(r"\(\d+$", coreference):
self.unclosed_corefs.add(chain_number)
coreferences.append(coreference)
if not coreferences:
coreferences = ["_"]
return '|'.join(coreferences)
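# The returned value follows the usual conll coreference column conventions,
# for instance (chain numbers are hypothetical):
#   "(12"     the token opens a mention of chain 12
#   "12)"     the token closes a mention of chain 12 opened earlier
#   "(12)"    the token is a single-token mention of chain 12
#   "(12|(7"  the token opens mentions of two different chains
#   "_"       the token belongs to no mention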
def make_parse_bit(self,token,part_of_speech):
'''
returns the parse bit
updates self.sentence_ends, which signals when a sentence can end and a new one start
'''
if (part_of_speech == 'SENT' or token in self.sentence_seps ) and not self.unclosed_corefs and self.word_number > 3:
self.sentence_ends = True
parse_bit = '*)'
elif self.sentence_starts:
parse_bit = '(TOP*'
else:
parse_bit = '*'
return parse_bit
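# Minimal usage sketch for a single file pair (paths and the document name are
# hypothetical; the globals DF_METADATA, NLP and CONVERT_POS must already be
# defined, as in the __main__ block below):
#   xml_tree = get_tree("DEMOCRAT/democrat/5/data/xml/doc1.xml")
#   urs_tree = get_tree("DEMOCRAT/democrat/5/data/xml-urs/doc1-urs.xml")
#   converter = DocumentConllConverter(xml_tree, urs_tree, "doc1", CONVERT_POS)
#   converter.from_xml_to_conll("doc1.conll")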
def convert_corpus(file_path,output_dir,print_frequency=10,limit_century=16):
'''
browses the DEMOCRAT corpus and from each xml/urs file pair creates a new conll file
'''
file_list = glob.glob(file_path)
file_list = [f for f in file_list if (DF_METADATA[DF_METADATA.fichier==os.path.basename(f).replace(".xml","")].iat[0,4]) >= limit_century]
print(file_list)
file_number = len(file_list)
for i,file_xml in enumerate(file_list):
file_xml_urs = '/xml-urs/'.join(file_xml.split('/xml/')).replace(".xml",'-urs.xml')
xml_tree , urs_tree = get_tree(file_xml), get_tree(file_xml_urs)
dir, base = os.path.split(file_xml)
output_file = output_dir+'/'+base.replace(".xml",'.conll')
document_name = base.replace(".xml","")
print(base, document_name,"file",i+1, "/", file_number)
converter = DocumentConllConverter(xml_tree,urs_tree,document_name,CONVERT_POS)
converter.from_xml_to_conll(output_file,print_frequency=print_frequency,document_split_frequency=15)
print()
def get_tree(file_name):
return lxml.etree.parse(file_name)
def ends_with(attribute,substring):
return f'substring(@{attribute} , string-length(@{attribute}) - string-length("{substring}")+1) = "{substring}"'
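# XPath 1.0 has no ends-with() function, so the predicate is emulated with
# substring(). For example, with a hypothetical word id:
#   ends_with("from", "w_12")
#   -> 'substring(@from , string-length(@from) - string-length("w_12")+1) = "w_12"'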
def convert_century_to_int(century):
'''
applied to the metadata dataframe to convert century strings to int
'''
return int(re.search(r"\d+",century).group())
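# e.g. convert_century_to_int("16e siècle") -> 16 (the exact formatting of the
# century strings in the metadata file is an assumption; only the first run of
# digits is used)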
"""Assemble the multiple conll files into three large files, each for training, development and testing"""
def open_w(f):
return open(f,"w",encoding="utf8")
def splits_conll_file(f,train_corpus_size,test_corpus_size):
'''
splits the content of the conll file into three strings of the given proportions: train, dev and test
'''
dev_corpus_size = 1 - ( train_corpus_size+test_corpus_size)
train = dev = test = ''
with open(f) as input_file:
text = input_file.read()
docs_number = text.count("#begin document")
train_end = docs_number*train_corpus_size
dev_end = train_end + docs_number*dev_corpus_size
part_number=0
for line in text.splitlines(True):
document_starts = re.match(r"#begin document.*?(\d+)\n",line)
if document_starts:
part_number = int(document_starts.group(1))
if part_number < train_end:
train += line
elif part_number < dev_end:
dev += line
else:
test+= line
return train, dev, test
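# Worked example: with 20 "#begin document" parts in a file, train_corpus_size=0.6
# and test_corpus_size=0.1 (hence dev_corpus_size=0.3), train_end=12 and dev_end=18,
# so parts 0-11 go to train, parts 12-17 to dev and parts 18-19 to test.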
def make_large_conll_files(conll_files_path,conll_large_files_directory,train_corpus_size=0.6,test_corpus_size=0.1):
'''
splits each conll file in the directory into three parts
and combines them into three large conll files
'''
assert all([0<s<1 for s in (train_corpus_size,test_corpus_size) ])
if train_corpus_size+test_corpus_size > 0.9:
print("development corpus too small: it must be at least 10 % of the whole corpus")
for dir in ["train","dev","test"]:
os.makedirs(conll_large_files_directory+"/"+dir, exist_ok=True)
with open_w(conll_large_files_directory+"/train/train.french.v4_gold_conll") as train_output,open_w(conll_large_files_directory+"/dev/dev.french.v4_gold_conll") as dev_output, open_w(conll_large_files_directory+"/test/test.french.v4_gold_conll") as test_output:
for f in glob.glob(conll_files_path):
train,dev,test = splits_conll_file(f,train_corpus_size,test_corpus_size)
train_output.write(train)
dev_output.write(dev)
test_output.write(test)
if __name__ == '__main__':
NLP = spacy.load('fr_core_news_sm')
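# the corpus root directory is expected as the first command-line argument;
# it must contain the DEMOCRAT/ folder referenced in the paths below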
path_corpus = sys.argv[1]
CONVERT_POS = {"VER":"VERB","PRO":"PRON","PUN":"PUNCT",'PON':'PUNCT',
"NOM":'NOUN','NAM':'PROPN','SENT':'PUNCT','PRE':'ADP',
'PRP':'ADP','INJ':'INTJ','KON':'CCONJ','INT':'INTJ',
'ETRE':'NUM','OUT':'X',}
DF_METADATA = pd.read_csv(path_corpus+"DEMOCRAT/democrat_metadata.csv",sep="\t")
DF_METADATA["siècle_composition"] = DF_METADATA["siècle_composition"].apply(convert_century_to_int)
DF_METADATA.fichier = DF_METADATA.fichier.apply(lambda x:x.lower())
convert_corpus(path_corpus+'/DEMOCRAT/democrat/5/data/xml/*.xml',path_corpus+'/DEMOCRAT/democrat/5/data/conll2',print_frequency=10,limit_century=19)
make_large_conll_files(path_corpus+'/DEMOCRAT/democrat/5/data/conll2/*.conll',path_corpus+'/DEMOCRAT/democrat/5/data/conll2/conll_large',train_corpus_size=0.65,test_corpus_size=0.1)