-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_Pauling.py
52 lines (50 loc) · 2.83 KB
/
parse_Pauling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pymongo
from bs4 import BeautifulSoup
# Shared MongoDB handles for this script: a client created with pymongo's
# default connection settings (no arguments are passed), and the 'springer'
# database selected from it.
client = pymongo.MongoClient()
db = client['springer']
def collapse_whitespace(text):
    """Return *text* with blank lines removed and each phrase on its own line.

    Every line is stripped, then split on double spaces so that phrases
    separated by runs of whitespace end up on separate lines while multi-word
    phrases (e.g. 'General Information') stay intact. Empty chunks are dropped.
    """
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(chunk for chunk in chunks if chunk)


def pair_labels_with_values(lines):
    """Map each 'Label:' line to the line that follows it.

    Lines containing 'General Information' or 'Substance Summary' are section
    headings and are never used as labels. A trailing label with no following
    line is ignored instead of raising IndexError.
    """
    info = {}
    for label, value in zip(lines, lines[1:]):
        if 'General Information' in label or 'Substance Summary' in label:
            continue
        if label.endswith(':'):
            info[label[:-1]] = value
    return info


def parse_reference(soup):
    """Return the 'globalReference' accordion body as {'html': ..., 'text': ...}."""
    refsoup = soup.find('div', {'id': 'globalReference'}).find('div', 'accordion__bd')
    # NOTE(review): str(item.encode('utf-8')) is a Python 2 idiom (it yields the
    # UTF-8 byte string); on Python 3 it would yield "b'...'" text -- revisit
    # before porting.
    return {'html': refsoup.prettify(),
            'text': ''.join([(str(item.encode('utf-8'))).strip() for item in refsoup.contents])}


def parse_general_info(soup):
    """Return the label/value pairs of the 'general_information' div.

    The parsed reference section is attached under the 'ref' key.
    """
    geninfo = soup.find('div', {'id': 'general_information'})
    geninfo_text = collapse_whitespace(geninfo.get_text())
    geninfo_dict = pair_labels_with_values(geninfo_text.split('\n'))
    geninfo_dict['ref'] = parse_reference(soup)
    return geninfo_dict


def parse_experimental_details(soup):
    """Return {field name: first list item} from every table row of the
    'experimentalDetails' accordion body."""
    expdetails = soup.find('div', {'id': 'experimentalDetails'}).find('div', 'accordion__bd')
    fields = {}
    for table in expdetails.find_all('table'):
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            fields[cells[0].string.strip()] = cells[1].find('ul').find('li').text.strip()
    return fields


def parse_title(soup):
    """Return the whitespace-normalised text of the 'document__title' <h1>."""
    header = soup.find('h1', 'document__title')
    return collapse_whitespace(header.get_text())


def main(start=0, store_sections=False, skip_keys=()):
    """Parse every stored web page and write the extracted metadata back.

    Documents of 'pauling_file_unique_Parse' are visited in ascending '_id'
    order. For each one the page title is stored under
    'metadata._Springer.title'.

    Args:
        start: number of documents to skip; lets an interrupted run resume
            from the last record number that was printed.
        store_sections: when true, also parse and store the general
            information (with reference) and experimental details sections
            under 'metadata._Springer.geninfo' / 'metadata._Springer.expdetails'.
            Off by default so a page lacking those sections does not abort
            the title update.
        skip_keys: document keys to leave untouched.
    """
    collection = db['pauling_file_unique_Parse']
    cursor = collection.find().sort('_id', pymongo.ASCENDING).batch_size(75).skip(start)
    # Number records from start + 1 so the printed counter doubles as the
    # offset to pass as `start` when resuming.
    for record_number, doc in enumerate(cursor, start + 1):
        if doc['key'] in skip_keys:
            continue
        print('#######')
        print('On record # {} and key: {}'.format(record_number, doc['key']))
        soup = BeautifulSoup(doc['webpage_str'], 'lxml')
        fields = {'metadata._Springer.title': parse_title(soup)}
        if store_sections:
            fields['metadata._Springer.geninfo'] = parse_general_info(soup)
            fields['metadata._Springer.expdetails'] = parse_experimental_details(soup)
        # update_one replaces the deprecated Collection.update; it modifies at
        # most one matching document, and upsert=False never creates one.
        collection.update_one({'key': doc['key']}, {'$set': fields}, upsert=False)


if __name__ == '__main__':
    main()