-
Notifications
You must be signed in to change notification settings - Fork 2.6k
/
Copy pathlabel_config.py
399 lines (335 loc) · 14.8 KB
/
label_config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license.
"""
import logging
import json
import pandas as pd
import numpy as np
import os
import xmljson
import jsonschema
import re
from urllib.parse import urlencode
from lxml import etree
from collections import defaultdict
from django.conf import settings
from label_studio.core.utils.io import find_file
from label_studio.core.utils.exceptions import (
LabelStudioValidationErrorSentryIgnored, LabelStudioXMLSyntaxErrorSentryIgnored
)
logger = logging.getLogger(__name__)

# Cache for the contents of data_examples.json; filled on first call of data_examples()
_DATA_EXAMPLES = None
# Tags whose "alias"/"value" attributes are collected as labels by parse_config()
_LABEL_TAGS = {'Label', 'Choice'}
# Tags that carry "name"/"toName" but are not treated as output (control) tags by parse_config()
_NOT_CONTROL_TAGS = {'Filter',}

# TODO: move configs in right place
_LABEL_CONFIG_SCHEMA = find_file('label_config_schema.json')
# Read the schema with an explicit encoding (same as data_examples.json) so that
# loading does not depend on the platform's default locale encoding.
with open(_LABEL_CONFIG_SCHEMA, encoding='utf-8') as f:
    _LABEL_CONFIG_SCHEMA_DATA = json.load(f)
def parse_config(config_string):
    """Parse a labeling config into a dict describing every output (control) tag.

    :param config_string: Label config string
    :return: structured config of the form:
    {
        "<ControlTag>.name": {
            "type": "ControlTag",
            "to_name": ["<ObjectTag1>.name", "<ObjectTag2>.name"],
            "inputs": [
                {"type": "ObjectTag1", "value": "<ObjectTag1>.value"},
                {"type": "ObjectTag2", "value": "<ObjectTag2>.value"}
            ],
            "labels": ["Label1", "Label2", "Label3"],  // taken from "alias" if exists or "value"
            "labels_attrs": {"Label1": {<all attributes of the label tag>}, ...},
            "conditionals": {"type": "tag" | "label" | "choice", "name": "..."}  // only for perRegion="true"
        }
    }
    An empty / falsy config yields an empty dict.
    :raises LabelStudioXMLSyntaxErrorSentryIgnored: when the config is not well-formed XML
    """
    if not config_string:
        return {}

    # (attribute that holds the condition, conditional type stored in the result);
    # the first attribute present wins, in this order
    conditional_attrs = (
        ('whenTagName', 'tag'),
        ('whenLabelValue', 'label'),
        ('whenChoiceValue', 'choice'),
    )

    def _is_input_tag(node):
        # object tags have both a name and a value
        return node.attrib.get('name') and node.attrib.get('value')

    def _is_output_tag(node):
        # control tags have a name and point to an object tag through toName
        return node.attrib.get('name') and node.attrib.get('toName') and node.tag not in _NOT_CONTROL_TAGS

    def _get_parent_output_tag_name(node, known_outputs):
        # Find parental <Choices> tag for nested tags like <Choices><View><View><Choice>...
        ancestor = node.getparent()
        while ancestor is not None:
            ancestor_name = ancestor.attrib.get('name')
            if ancestor_name in known_outputs:
                return ancestor_name
            ancestor = ancestor.getparent()
        return None

    try:
        xml_tree = etree.fromstring(config_string)
    except etree.XMLSyntaxError as e:
        raise LabelStudioXMLSyntaxErrorSentryIgnored(str(e))

    inputs = {}
    outputs = {}
    labels = defaultdict(dict)

    for node in xml_tree.iter():
        attrs = node.attrib
        if _is_output_tag(node):
            info = {'type': node.tag, 'to_name': attrs['toName'].split(',')}
            # Grab conditionals if any
            if attrs.get('perRegion') == 'true':
                for attr_name, conditional_type in conditional_attrs:
                    if attrs.get(attr_name):
                        info['conditionals'] = {'type': conditional_type, 'name': attrs[attr_name]}
                        break
            outputs[attrs['name']] = info
        elif _is_input_tag(node):
            inputs[attrs['name']] = {'type': node.tag, 'value': attrs['value'].lstrip('$')}

        # label collection applies only to <Label> / <Choice> tags
        if node.tag not in _LABEL_TAGS:
            continue

        owner_name = _get_parent_output_tag_name(node, outputs)
        if owner_name is None:
            continue

        label_value = attrs.get('alias') or attrs.get('value')
        if label_value:
            labels[owner_name][label_value] = dict(attrs)
        else:
            logger.debug(
                'Inspecting tag {tag_name}... found no "value" or "alias" attributes.'.format(
                    tag_name=etree.tostring(node, encoding='unicode').strip()[:50]))

    # second pass: attach the resolved object tags and the collected labels to every control tag
    for control_name, info in outputs.items():
        resolved_inputs = info['inputs'] = []
        for target_name in info['to_name']:
            if target_name in inputs:
                resolved_inputs.append(inputs[target_name])
            else:
                logger.warning(
                    f'to_name={target_name} is specified for output tag name={control_name}, '
                    'but we can\'t find it among input tags')
        control_labels = labels[control_name]
        info['labels'] = list(control_labels)
        info['labels_attrs'] = control_labels

    return outputs
def parse_config_to_json(config_string):
    """Convert a labeling config XML string to its BadgerFish JSON-like representation.

    The XML is parsed with a non-recovering parser.

    :param config_string: Label config string
    :return: result of xmljson.badgerfish.data() for the parsed root element
    :raises etree.XMLSchemaParseError: when parsing produced no root element
    """
    strict_parser = etree.XMLParser(recover=False)
    root = etree.fromstring(config_string, strict_parser)
    if root is None:
        raise etree.XMLSchemaParseError('xml is empty or incorrect')
    return xmljson.badgerfish.data(root)
def validate_label_config(config_string):
    """Validate a labeling config: XML syntax, JSON schema, unique names, resolvable toName.

    :param config_string: Label config string
    :return: None when the config is valid
    :raises LabelStudioValidationErrorSentryIgnored: when the XML cannot be parsed, the schema
        validation fails, two tags share a name, or a toName points to an unknown name
    """
    # xml and schema
    try:
        config = parse_config_to_json(config_string)
        jsonschema.validate(config, _LABEL_CONFIG_SCHEMA_DATA)
    except (etree.XMLSyntaxError, etree.XMLSchemaParseError, ValueError) as exc:
        raise LabelStudioValidationErrorSentryIgnored(str(exc)) from exc
    except jsonschema.exceptions.ValidationError as exc:
        # prefer the message of the last sub-error when the schema error carries a context
        error_message = exc.context[-1].message if exc.context else exc.message
        # the error path may contain integer array indices, so stringify every part before joining
        location = '/'.join(str(part) for part in exc.path)
        error_message = 'Validation failed on {}: {}'.format(location, error_message.replace('@', ''))
        raise LabelStudioValidationErrorSentryIgnored(error_message) from exc

    # unique names in config # FIXME: 'name =' (with spaces) won't work
    all_names = re.findall(r'name="([^"]*)"', config_string)
    names = set(all_names)
    if len(names) != len(all_names):
        raise LabelStudioValidationErrorSentryIgnored('Label config contains non-unique names')

    # toName points to existent name
    toNames = re.findall(r'toName="([^"]*)"', config_string)
    for toName_ in toNames:
        # a single toName attribute may list several comma-separated targets
        for toName in toName_.split(','):
            if toName not in names:
                raise LabelStudioValidationErrorSentryIgnored(f'toName="{toName}" not found in names: {sorted(names)}')
def extract_data_types(label_config):
    """Map every task data key referenced by the config to the tag that consumes it.

    :param label_config: Label config string
    :return: dict {data key: tag name} built from all named tags that have a "value" attribute;
        a leading "$" is removed from values longer than one character
    :raises etree.XMLSchemaParseError: when parsing produced no root element
    """
    # load config
    root = etree.fromstring(label_config, etree.XMLParser())
    if root is None:
        raise etree.XMLSchemaParseError('Project config is empty or incorrect')

    # take all tags with values attribute and fit them to tag types
    data_type = {}
    for element in root.findall('.//*[@value]'):
        if not element.get('name'):
            # unnamed tags (e.g. labels inside a control tag) are not data holders
            continue
        key = element.get('value')
        if len(key) > 1 and key[0] == '$':
            key = key[1:]
        data_type[key] = element.tag
    return data_type
def get_all_labels(label_config):
    """Collect the labels of every control tag of the config.

    :param label_config: Label config string
    :return: defaultdict(list) {control tag name: [labels]}; control tags without labels get no entry
    """
    labels = defaultdict(list)
    for control_name, info in parse_config(label_config).items():
        control_labels = info.get('labels', [])
        # only touch the defaultdict when there is something to add, so no empty entries appear
        if control_labels:
            labels[control_name].extend(control_labels)
    return labels
def get_annotation_tuple(from_name, to_name, type):
    """Build the "from_name|to_name|type" identifier of a control tag.

    :param from_name: control tag name
    :param to_name: object tag name, or a list of names (joined with commas)
    :param type: control tag type; lower-cased in the result
    :return: the three parts joined with "|"
    """
    target = ','.join(to_name) if isinstance(to_name, list) else to_name
    parts = (from_name, target, type.lower())
    return '|'.join(parts)
def get_all_control_tag_tuples(label_config):
    """Return the "from_name|to_name|type" identifiers of all control tags of the config.

    :param label_config: Label config string
    :return: list of strings produced by get_annotation_tuple(), one per control tag
    """
    return [
        get_annotation_tuple(control_name, info['to_name'], info['type'])
        for control_name, info in parse_config(label_config).items()
    ]
def get_all_object_tag_names(label_config):
    """Return the set of keys of the mapping produced by extract_data_types() for the config."""
    data_types = extract_data_types(label_config)
    return set(data_types)
def config_line_stipped(c):
    """Serialize the config without XML comments and without line breaks.

    :param c: Label config string
    :return: the config serialized with method='html', with every "\\n" and "\\r" removed
    """
    root = etree.fromstring(c)
    for comment in root.xpath('//comment()'):
        holder = comment.getparent()
        # comments that have no parent element cannot be detached this way; leave them alone
        if holder is None:
            continue
        # NOTE(review): lxml's remove() is understood to drop the node's tail text as well,
        # so text directly following a comment would disappear too - confirm this is intended
        holder.remove(comment)
    serialized = etree.tostring(root, method='html').decode("utf-8")
    return serialized.replace('\n', '').replace('\r', '')
def get_task_from_labeling_config(config):
    """ Get task, annotations and predictions from labeling config comment,
    it must start from "<!-- {" and end as "} -->"

    :param config: Label config string
    :return: tuple (task_data, annotations, predictions); ({}, None, None) when there is no
        such comment or its body is not valid JSON
    """
    # try to get task data, annotations & predictions from config comment
    task_data, annotations, predictions = {}, None, None

    opening = config.find('<!-- {')
    if opening < 0:
        opening = config.find('<!--{')
    if opening < 0:
        return task_data, annotations, predictions

    # skip the 4 characters of "<!--" to get to the JSON body
    body_start = opening + 4
    body_length = config[body_start:].find('-->')
    if body_length <= 0:
        # the comment is never closed
        return task_data, annotations, predictions

    raw_body = config[body_start:body_start + body_length]
    try:
        logger.debug('Parse ' + raw_body)
        body = json.loads(raw_body)
    except Exception as exc:
        # a broken comment is not fatal: report it and fall back to the defaults
        logger.error(exc, exc_info=True)
    else:
        logger.debug(json.dumps(body, indent=2))
        # when annotations/predictions are given, the root object is not the task data itself
        dont_use_root = 'predictions' in body or 'annotations' in body
        if 'data' in body:
            task_data = body['data']
        elif dont_use_root:
            task_data = None
        else:
            task_data = body
        predictions = body['predictions'] if 'predictions' in body else None
        annotations = body['annotations'] if 'annotations' in body else None

    return task_data, annotations, predictions
def data_examples(mode):
    """ Data examples for editor preview and task upload examples

    The examples are read from data_examples.json on the first call and cached in the
    module-level _DATA_EXAMPLES; "<HOSTNAME>" placeholders in string values are replaced
    with settings.HOSTNAME.

    :param mode: key of the examples section to return
    :return: the examples dict stored under that key
    """
    global _DATA_EXAMPLES

    if _DATA_EXAMPLES is None:
        with open(find_file('data_examples.json'), encoding='utf-8') as f:
            _DATA_EXAMPLES = json.load(f)

        for root in ('editor_preview', 'upload'):
            section = _DATA_EXAMPLES[root]
            # values are replaced in place; the set of keys does not change while iterating
            for key, value in section.items():
                if isinstance(value, str):
                    section[key] = value.replace('<HOSTNAME>', settings.HOSTNAME)

    return _DATA_EXAMPLES[mode]
def generate_sample_task_without_check(label_config, mode='upload', secure_mode=False):
    """ Generate sample task only

    :param label_config: Label config string
    :param mode: examples section passed to data_examples()
    :param secure_mode: when true, object data is generated as URLs (same effect as
        valueType="url" on an individual tag)
    :return: dict {data key: example value} for every tag whose value starts with "$"
    :raises etree.XMLSchemaParseError: when parsing produced no root element
    """
    # load config
    parser = etree.XMLParser()
    xml = etree.fromstring(label_config, parser)
    if xml is None:
        raise etree.XMLSchemaParseError('Project config is empty or incorrect')

    # make examples pretty
    # Work on a shallow copy: the "Text" patch below is call-specific and must not leak
    # into the module-level cache that data_examples() hands out to every caller.
    examples = dict(data_examples(mode=mode))

    # iterate over xml tree and find values with '$'
    task = {}
    parent = xml.findall('.//*[@value]')  # take all tags with value attribute
    for p in parent:
        # Make sure it is a real object tag, extract data placeholder key
        value = p.get('value')
        if not value or not value.startswith('$'):
            continue
        value = value[1:]

        # detect secured mode - objects served as URLs
        value_type = p.get('valueType') or p.get('valuetype')
        only_urls = secure_mode or value_type == 'url'

        example_from_field_name = examples.get('$' + value)
        if example_from_field_name:
            # try get example by variable name
            task[value] = example_from_field_name

        elif value == 'video' and p.tag == 'HyperText':
            task[value] = examples.get('$videoHack')

        elif p.tag == 'Paragraphs':
            # Paragraphs special case - replace nameKey/textKey if presented
            name_key = p.get('nameKey') or p.get('namekey') or 'author'
            text_key = p.get('textKey') or p.get('textkey') or 'text'
            if only_urls:
                params = {'nameKey': name_key, 'textKey': text_key}
                task[value] = examples['ParagraphsUrl'] + urlencode(params)
            else:
                task[value] = []
                for item in examples[p.tag]:
                    task[value].append({name_key: item['author'], text_key: item['text']})

        elif p.tag == 'TimeSeries':
            # TimeSeries special case - generate signals on-the-fly
            time_column = p.get('timeColumn')
            value_columns = []
            for ts_child in p:
                if ts_child.tag != 'Channel':
                    continue
                value_columns.append(ts_child.get('column'))
            sep = p.get('sep')
            time_format = p.get('timeFormat')

            if only_urls:
                # data is URL
                params = {'time': time_column, 'values': ','.join(value_columns)}
                if sep:
                    params['sep'] = sep
                if time_format:
                    params['tf'] = time_format
                task[value] = '/samples/time-series.csv?' + urlencode(params)
            else:
                # data is JSON
                task[value] = generate_time_series_json(time_column, value_columns, time_format)

        elif p.tag == 'HyperText':
            if only_urls:
                task[value] = examples['HyperTextUrl']
            else:
                task[value] = examples['HyperText']

        else:
            # patch for valueType="url" (applied to the local copy only)
            examples['Text'] = examples['TextUrl'] if only_urls else examples['TextRaw']
            # not found by name, try get example by type
            task[value] = examples.get(p.tag, 'Something')

    return task
def _is_strftime_string(s):
    """Tell whether *s* looks like a strftime format, i.e. contains a "%" character."""
    # simple way to detect strftime format
    contains_percent = '%' in s
    return contains_percent
def generate_time_series_json(time_column, value_columns, time_format=None, n=100):
    """ Generate sample for time series

    :param time_column: key under which the time axis is stored
    :param value_columns: iterable of keys; each one gets n normally distributed random values
    :param time_format: strftime format (anything containing "%") or a known alias
        ("yyyy-MM-dd"); None or an unknown alias produces the integer time axis 0..n-1
    :param n: number of generated points, 100 by default
    :return: dict {column name: list of n values}
    """
    if time_format is not None and not _is_strftime_string(time_format):
        # translate known non-strftime aliases; unknown ones fall back to the integer axis
        time_fmt_map = {
            'yyyy-MM-dd': '%Y-%m-%d'
        }
        time_format = time_fmt_map.get(time_format)

    if time_format is None:
        times = np.arange(n).tolist()
    else:
        times = pd.date_range('2020-01-01', periods=n, freq='D').strftime(time_format).tolist()

    ts = {time_column: times}
    for value_col in value_columns:
        ts[value_col] = np.random.randn(n).tolist()
    return ts
def get_sample_task(label_config, secure_mode=False):
    """ Get sample task from labeling config and combine it with generated sample task

    :param label_config: Label config string
    :param secure_mode: forwarded to generate_sample_task_without_check()
    :return: tuple (task data, annotations, predictions); values found in the config comment
        override the generated ones
    """
    embedded = get_task_from_labeling_config(label_config)
    task_overrides, annotations, predictions = embedded

    sample = generate_sample_task_without_check(label_config, mode='editor_preview', secure_mode=secure_mode)
    if task_overrides is not None:
        sample.update(task_overrides)
    return sample, annotations, predictions
def config_essential_data_has_changed(new_config_str, old_config_str):
    """ Detect essential changes of the labeling config

    A change is essential when a control tag of the new config is absent from the old one,
    changed its type or its inputs, or lost at least one of the old labels.
    Only the control tags of the new config are inspected.

    :param new_config_str: new Label config string
    :param old_config_str: previous Label config string
    :return: True when an essential change is found, False otherwise
    """
    new_config = parse_config(new_config_str)
    old_config = parse_config(old_config_str)

    for tag, new_info in new_config.items():
        if tag not in old_config:
            return True
        old_info = old_config[tag]
        if new_info['type'] != old_info['type']:
            return True
        if new_info['inputs'] != old_info['inputs']:
            return True
        if not set(old_info['labels']).issubset(new_info['labels']):
            return True

    # explicit negative answer instead of an implicit None
    return False
def replace_task_data_undefined_with_config_field(data, project, first_key=None):
    """ Use first key is passed (for speed up) or project.data.types.keys()[0]

    Moves the value stored under settings.DATA_UNDEFINED_NAME to the chosen key, in place.

    :param data: task data dict, modified in place
    :param project: object with a data_types mapping; only consulted when first_key is falsy
    :param first_key: key to use instead of the first key of project.data_types
    :return: None
    """
    # assign undefined key name from data to the first key from config, e.g. for txt loading
    undefined_name = settings.DATA_UNDEFINED_NAME
    if undefined_name not in data:
        return
    if not (first_key or project.data_types.keys()):
        # nothing to rename to
        return

    target_key = first_key or list(project.data_types.keys())[0]
    data[target_key] = data[undefined_name]
    del data[undefined_name]