# scrape_refs.py
import re
import requests
import random
from bs4 import BeautifulSoup
# number of attempts for the random-word fallback search
iterations = 20
# output file; the name can be changed here
output_file = 'bibliography.txt'
# input file with one reference per line; the name is asked for at runtime
ref_file = input("reference file name (e.g. references.txt):\n")
# code starts below
# define some variables
# pretend to be a regular desktop Chrome browser
SESSION = requests.Session()
SESSION.headers.update(
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
URL_SEARCH = 'https://pubmed.ncbi.nlm.nih.gov/?sort=date&term={q}'
URL_CITE = 'https://pubmed.ncbi.nlm.nih.gov/{ident}/?format=pubmed'
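# e.g. URL_SEARCH.format(q='smith 2005 gene expression') yields
# https://pubmed.ncbi.nlm.nih.gov/?sort=date&term=smith 2005 gene expression
# (the query here is made up; requests percent-encodes the spaces on fetch)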
skipped = 0
skipped_refs = ''
r = ''
# define functions
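# fetch a URL with the shared session and return the parsed HTML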
def http_get(url):
r = SESSION.get(url)
return BeautifulSoup(r.text, features="html.parser")
# split reference into authors, year and title
def split_reference(reference):
r = reference
left = ''
right = ''
year = ''
# find (YEAR)
regex = r'\(\d\d\d\d\)'
match = re.search(regex, str(r))
if match:
year = str(match.group(0))
else:
        # if not, look for a bare YEAR without parentheses
regex = r'\d\d\d\d'
match = re.search(regex, str(r))
if match:
year = str(match.group(0))
        else:
            # give up: without a year the reference cannot be split
            return None
# where is YEAR in reference? and how long is reference?
pos_y = r.find(year)
le = len(r)
    # if YEAR is somewhere in the middle of the reference, split into left-part YEAR right-part
    if (le - pos_y >= 15):
        left, right = r[:pos_y], r[pos_y+len(year):]
# else split on 'et al' into left-part 'et al' right-part
elif ((le - pos_y <= 15) and (r.find('et al')) != -1):
rs = r.split('et al')
left = rs[0]
right = rs[1]
    # else split at the last two full stops (after stripping a trailing one)
    else:
        rs = r.strip('.').rsplit('.', 2)
        left = rs[0]
        right = rs[1] if len(rs) > 1 else ''
    # clean up: drop the year, collapse double spaces, trim
    right = right.replace(year, '')
    right = right.replace('  ', ' ')
    right = right.strip()
    left = left.replace('  ', ' ')
    left = left.strip()
    year = year.replace('(', '')
    year = year.replace(')', '')
split_ref = [left, year, right]
return split_ref
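# Example (illustrative reference, not from any real input file):
#   split_reference('smith j brown k (2005) a study of gene expression in mice. j some journal.')
# returns ['smith j brown k', '2005', 'a study of gene expression in mice. j some journal.']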
# last fallback if no search queries find a reference: use random word combinations from the title
def choose_random(reference, leng=3, words=5):
    # keep distinct words longer than `leng` characters as candidates
    candidates = [w for w in set(reference.split()) if len(w) > leng]
    # sample up to `words` of them in random order
    return ' '.join(random.sample(candidates, min(words, len(candidates))))
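# Example: choose_random('a study of gene expression in mice') could return
# 'expression mice study gene' -- word choice and order are random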
# search the results page for a PubMed ID
def get_articles(query):
    url = URL_SEARCH.format(q=query)
    soup = http_get(url)
    # return the first PubMed ID on the page (results are sorted by date);
    # an empty string signals "no result" to the caller
    for tag in soup.findAll(title="PubMed ID"):
        match = re.search(r'\d+', str(tag))
        if match:
            return match.group(0)
    return ''
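# e.g. get_articles('smith j gene expression') returns a PMID string such as
# '12345678' (value here is made up), or '' when nothing matched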
# use the PubMed ID to build a URL and copy the entry in PubMed (NLM) format
def get_citations(ident):
    url = URL_CITE.format(ident=ident)
    soup = http_get(url)
    citations = ''
    for tag in soup.findAll(id="article-details"):
        # get_text() also works when the element contains child tags
        citations = tag.get_text().strip()
    return citations
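# the ?format=pubmed page is assumed to hold the record as plain text, with
# field lines such as 'PMID- ...', 'TI  - ...', 'AU  - ...'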
# main code
# open input and output files
if(ref_file == ''):
print('Please provide a file')
quit()
references = open(ref_file, 'r', encoding='utf-8')
myFile = open(output_file, 'w', encoding='utf-8', errors='replace')
# loop through references line by line
for reference in references:
reference = reference.strip()
    # convert to lower case and remove some special chars
reference = reference.lower()
reference = reference.replace('-', ' ')
reference = reference.replace('/', ' ')
reference = reference.replace(',', ' ')
reference = reference.replace('%', ' ')
reference = reference.replace('&', '')
# skip empty lines
if (reference == ''):
continue
print("\n---------------------------------------------------------")
print("Doing reference:", reference)
    # split the reference into author, year, title; skip it if no year was found
    query = split_reference(reference)
    if query is None:
        print("No year found -- skipping")
        skipped += 1
        skipped_refs = skipped_refs + '\n' + reference
        continue
    # drop full stops from each part before searching
    q = [part.replace('.', '') for part in query]
# find article by author and title
r = get_articles(q[0] + ' ' + q[2])
print("Query: " + q[0] + ' ' + q[2])
# find article by author and year
if len(r) == 0:
r = get_articles(q[0] + ' ' + q[1])
print("No results -- trying: " + q[0] + ' ' + q[1])
# find article by year and title
if len(r) == 0:
r = get_articles(q[1] + ' ' + q[2])
print("Still no results -- trying: " + q[1] + ' ' + q[2])
# find article by author year and title
if len(r) == 0:
r = get_articles(q[0] + ' ' + q[1] + ' ' + q[2])
print("Still no results -- trying: " + q[0] + ' ' + q[1] + ' ' + q[2])
# find article by author year and random words from title
if len(r) == 0:
its = 0
while its < iterations:
q2 = choose_random(reference)
print("Still no results -- trying again with random words: ", q[0] + q[1] + q2)
r = get_articles(q[0] + ' ' + q[1] + ' ' + q2)
if len(r) != 0:
break
its += 1
if len(r) == 0:
print("Still no results -- skipping")
skipped += 1
skipped_refs = skipped_refs + '\n' + reference
continue
print("Result written")
myFile.write(get_citations(r) + '\n\n')
if skipped > 0:
print("\n---------------------------------------------------------")
print("Total number of results skipped: ", skipped)
print("Please check the following references:\n", skipped_refs)
else:
    print("\n---------------------------------------------------------")
    print('Done')
references.close()
myFile.close()
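# Usage sketch: put one reference per line in e.g. references.txt, run
#   python scrape_refs.py
# and type the file name at the prompt; matched entries are written to
# bibliography.txt and any unmatched references are listed at the end.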