-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathannotater.py
70 lines (49 loc) · 2.06 KB
/
annotater.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import re
from txtmarker.factory import Factory
class Annotate:
    """
    Highlights search results inside a PDF using txtmarker.

    The regular expressions used by ``formatter`` are compiled once at class
    creation time, since the formatter is handed to the highlighter as a
    callback and may be invoked many times per document.
    """

    # Noise patterns that are replaced with a single space before matching.
    _NOISE = re.compile(
        "|".join(
            f"({p})"
            for p in (
                # Emails
                r"\w+@\w+(\.[a-z]{2,})+",
                # Urls
                r"http(s)?\:\/\/\S+",
                # Single characters repeated at least 3 times (ex. j o u r n a l)
                r"(^|\s)(\w\s+){3,}",
                # Citation references (ex. [3] [4] [5])
                r"(\[\d+\]\,?\s?){3,}(\.|\,)?",
                # Citation references (ex. [3, 4, 5])
                r"\[[\d\,\s]+\]",
                # Citation references (ex. (NUM1) repeated at least 3 times with whitespace)
                r"(\(\d+\)\s){3,}",
            )
        )
    )

    # Runs of spaces or dots, either caused by replacements or already in text.
    _SPACING = re.compile(r" {2,}|\.{2,}")

    # Anything that is not an ASCII letter or digit.
    _NON_ALNUM = re.compile(r"[^A-Za-z0-9]")

    def annotate(self, search_result_raw, granularized_corpus_raw, input_path, output_path):
        """
        Writes an annotated copy of a PDF with the search results highlighted.

        Args:
            search_result_raw: iterable of mappings, each with a numeric 'score'
                and a 'corpus_id' entry
            granularized_corpus_raw: collection indexed by 'corpus_id' that yields
                the text to highlight
            input_path: path of the source file, passed through to the highlighter
            output_path: path of the annotated file, passed through to the highlighter
        """

        # Each highlight is a (label, text) pair; the label is the score
        # rendered with four decimal places.
        highlights = [
            (f"{hit['score']:.4f}", granularized_corpus_raw[hit["corpus_id"]])
            for hit in search_result_raw
        ]

        # Create annotated file
        # NOTE(review): the trailing 4 is presumably txtmarker's chunk count - confirm
        highlighter = Factory.create("pdf", self.formatter, 4)
        highlighter.highlight(input_path, output_path, highlights)

    def formatter(self, text):
        """
        Custom formatter that is passed to PDF Annotation method. This logic maps data cleansing logic in paperetl.

        Reference: https://github.com/neuml/paperetl/blob/master/src/python/paperetl/text.py

        The returned string contains only ASCII letters and digits: the last step
        drops whitespace and punctuation as well.

        Args:
            text: input text

        Returns:
            clean text
        """

        # Clean/transform text
        text = self._NOISE.sub(" ", text)

        # Remove extra spacing either caused by replacements or already in text
        text = self._SPACING.sub(" ", text)

        # Limit to alphanumeric characters
        return self._NON_ALNUM.sub("", text)