-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdata.py
153 lines (116 loc) · 4.78 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
''' Load data from the SSC xml file
'''
from lxml import etree
import sys
import codecs
import itertools
from collections import Counter
import re
''' Annotation container class. Supply kwargs to initialize class fields by name
'''
class Annotation(object):
    """Container for one annotated span inside a unit of text.

    An instance is initialised in one of two ways:

    * from keyword arguments naming the slots (``len``, ``offset``, ``grp``,
      ``text``, ``unit_text``, ``e``); any slot not supplied is set to None
      and the positional arguments are ignored;
    * from an XML element (``e_element``) and the text of the enclosing unit
      (``unittext``) when no keyword arguments are given.  In that case
      ``len``, ``offset`` and ``grp`` are stored exactly as found in the
      element's attributes and ``text`` is the lower-cased element text.

    Two annotations compare equal (and hash alike) when their ``len``,
    ``offset`` and ``text`` agree; ``grp`` and ``unit_text`` are deliberately
    ignored so that the same span tagged several times can be detected.
    """

    __slots__ = ('len', 'offset', 'grp', 'text', 'unit_text', 'e')

    GROUP_NAMES = [
        'ANAT', 'CHEM', 'DEVI',
        'DISO', 'GEOG', 'LIVB',
        'PHYS', 'PROC', 'PHEN',
        'OBJC'
    ]
    # group name -> position of that name in GROUP_NAMES
    GROUP_MAPPING = dict((name, index) for (index, name) in enumerate(GROUP_NAMES))

    def __init__(self, e_element=None, unittext=None, **kwargs):
        if kwargs:
            # initialization from kwargs
            for attrname in self.__slots__:
                setattr(self, attrname, kwargs.get(attrname, None))
        else:
            # initialization from an xml node
            self.len = e_element.attrib['len']
            self.offset = e_element.attrib['offset']
            self.grp = e_element.attrib['grp']  # if multiple groups, separated by |
            self.text = e_element.text.lower()
            self.e = e_element
            self.unit_text = unittext

    def _span_key(self):
        """Return the (len, offset, text) tuple used for hashing and equality."""
        return (self.len, self.offset, self.text)

    # for testing ambiguity
    def __hash__(self):
        return hash(self._span_key())

    def __eq__(self, other):
        # Without __eq__ the default identity comparison would keep
        # same-span annotations apart in sets, dicts and Counters even
        # though they share a hash.
        if not isinstance(other, Annotation):
            return NotImplemented
        return self._span_key() == other._span_key()

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return outcome
        return not outcome

    def __unicode__(self):
        """Return all slots, in ``__slots__`` order, joined by tabs.

        Slots holding None are rendered as empty strings.
        """
        # u'%s' keeps non-ASCII text intact on Python 2, where str() would
        # raise UnicodeEncodeError.
        to_string = lambda x: u'%s' % (x,) if x is not None else u''
        return u'\t'.join(to_string(getattr(self, field)) for field in self.__slots__)

    def get_group_number(self):
        """Return the group index, or a list of indices if there are several.

        Raises KeyError for a group name that is not in GROUP_NAMES.
        """
        ambig_groups = self.get_ambiguous_groups()
        if ambig_groups:
            return [self.GROUP_MAPPING[grp] for grp in ambig_groups]
        return self.GROUP_MAPPING[self.grp]

    def get_slices(self):
        """Split ``unit_text`` into (before, annotated span, after)."""
        begin_highlight = int(self.offset)
        end_highlight = begin_highlight + int(self.len)
        return (
            self.unit_text[:begin_highlight],
            self.unit_text[begin_highlight:end_highlight],
            self.unit_text[end_highlight:],
        )

    def get_highlighted_repr(self):
        """Return ``unit_text`` with the annotated span wrapped in [[ ]]."""
        return "%s[[%s]]%s" % self.get_slices()

    def get_context_string(self):
        """Return ``unit_text`` with the annotated span removed."""
        context_before, _, context_after = self.get_slices()
        return context_before + context_after

    def get_ambiguous_groups(self):
        """Return the '|'-separated groups as a list, or None if only one."""
        groups = self.grp.split('|')
        return groups if len(groups) != 1 else None
''' Load unambiguous annotations from Silver Standard Corpus
'''
def load_unambiguous_annotations(ssc_file_name, unit_ids_to_ignore=None):
    """Load the unambiguous annotations from a Silver Standard Corpus XML file.

    Every ``unit`` under every ``document`` element is visited, except units
    whose ``id`` attribute is in ``unit_ids_to_ignore``.  Each ``e`` element
    with non-empty text becomes an Annotation.  Within a unit, an annotation
    is kept only if no other annotation of that unit has the same
    (len, offset, text); spans annotated more than once are dropped entirely.

    ssc_file_name      -- path of the XML file, parsed as UTF-8
    unit_ids_to_ignore -- optional container of unit ids to skip

    Returns a list of Annotation objects in document order.
    """
    ignored_ids = unit_ids_to_ignore if unit_ids_to_ignore is not None else ()

    # loading XMLs
    parser = etree.XMLParser(encoding='utf-8')
    ssc = etree.parse(ssc_file_name, parser).getroot()

    global_annotations = []
    for document in ssc.iter("document"):
        for unit in document.iter("unit"):
            if unit.attrib["id"] in ignored_ids:
                continue
            unit_text = unit.find("text").text
            annotations = [Annotation(e, unit_text)
                           for e in unit.iter("e") if e.text is not None]
            # filter out ambiguous annotations: count by an explicit span key
            # so the result does not depend on how Annotation defines equality
            span_counts = Counter((a.len, a.offset, a.text) for a in annotations)
            global_annotations.extend(
                a for a in annotations
                if span_counts[(a.len, a.offset, a.text)] == 1)
    return global_annotations
''' Load ambiguous annotations from a csv file produced by
https://kitt.cl.uzh.ch/kitt/mantracrowd/disambig/vote_results.csv?AgreementThr=0.6
annotations are labeled with answers: e.g. from MTurk or expert
first line of file should be an excel-like separator instruction, e.g. "sep=\t"
'''
def load_ambiguous_annotations_labeled(csv_file_name):
    """Load labeled ambiguous annotations from a UTF-8 separated-values file.

    The first line must be an excel-like separator instruction such as
    ``sep=<char>``.  Every following non-empty line must hold exactly seven
    fields: length, offset, groups, text, unit text, vote and one trailing
    field that is ignored.

    Returns a tuple ``(annotations, labels)`` where ``annotations`` is a list
    of Annotation objects and ``labels`` the list of corresponding votes.

    Raises ValueError if the separator line is missing or a line does not
    have seven fields.
    """
    annotations = []
    labels = []
    with codecs.open(csv_file_name, 'r', 'utf-8') as f:
        separator_match = re.match("sep=(.)", f.readline())
        if separator_match is None:
            raise ValueError(
                "%s: first line must be a separator instruction like 'sep=;'"
                % csv_file_name)
        sep = separator_match.group(1)
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no newline.
            line = line.rstrip('\r\n')
            if not line:
                continue
            length, offset, groups, text, unit_text, vote, _ = line.split(sep)
            annotations.append(Annotation(len=int(length), offset=int(offset),
                                          grp=groups, text=text,
                                          unit_text=unit_text))
            labels.append(vote)
    return (annotations, labels)
def _collect_labeled_rows(lines, sep, field_count):
    """Parse separated lines into parallel lists of Annotations and votes.

    Each non-empty line must split into exactly ``field_count`` fields, the
    first six being length, offset, groups, text, unit text and vote.
    """
    annotations = []
    labels = []
    for line in lines:
        # Strip only the line terminator; a plain [:-1] would leave '\r' in
        # the last field of CRLF files and truncate an unterminated last line.
        line = line.rstrip('\r\n')
        if not line:
            continue
        fields = line.split(sep)
        if len(fields) != field_count:
            raise ValueError("expected %d fields but found %d in line %r"
                             % (field_count, len(fields), line))
        length, offset, groups, text, unit_text, vote = fields[:6]
        annotations.append(Annotation(len=int(length), offset=int(offset),
                                      grp=groups, text=text,
                                      unit_text=unit_text))
        labels.append(vote)
    return (annotations, labels)


def load_ambiguous_annotations_labeled_generic(file_name):
    """Load labeled ambiguous annotations from a ``.csv`` or ``.tsv`` file.

    ``.csv``: the first line must be a separator instruction ``sep=<char>``;
    each following line holds seven fields (length, offset, groups, text,
    unit text, vote and one ignored trailing field).

    ``.tsv``: fields are tab-separated, the first two lines are skipped and
    each following line holds six fields (length, offset, groups, text,
    unit text, vote).

    Any other file name yields two empty lists.

    Returns a tuple ``(annotations, labels)``.  Raises ValueError if a csv
    file lacks the separator line or a line has the wrong number of fields.
    """
    with codecs.open(file_name, 'r', 'utf-8') as f:
        first_line = f.readline()
        # Test the real extension: an unanchored pattern would also accept
        # names such as "x.csv.bak" or a ".csv" directory earlier in the path.
        if file_name.endswith('.csv'):
            separator_match = re.match("sep=(.)", first_line)
            if separator_match is None:
                raise ValueError(
                    "%s: first line must be a separator instruction like 'sep=;'"
                    % file_name)
            return _collect_labeled_rows(f, separator_match.group(1), 7)
        if file_name.endswith('.tsv'):
            # NOTE(review): together with the readline() above this discards
            # the first TWO lines of a tsv file, as the original code did --
            # confirm the second line is a header and not data.
            next(f, None)
            return _collect_labeled_rows(f, '\t', 6)
    return ([], [])