-
Notifications
You must be signed in to change notification settings - Fork 1
/
metadata.py
164 lines (142 loc) · 7.58 KB
/
metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import datetime, os, pandas as pd, numpy as np, requests, time, urllib3
from bs4 import BeautifulSoup
class Metadata(object):
    '''
    CSV-backed store of arXiv astro-ph metadata harvested from the arXiv
    OAI2 endpoint.

    Creating an instance optionally refreshes the CSV file (update=True) and
    then loads the 'filename_parsed' column into self.identifiers.
    '''

    # Series holding the 'filename_parsed' column; filled by get_identifiers()
    identifiers = None
    # Path of the CSV file that stores the metadata; assigned in __init__
    metadata_filepath = None

    # Fallback CSV path used when metadata_filepath has not been assigned
    _DEFAULT_METADATA_FILEPATH = 'arxiv_metadata_astroph.csv'
    # Base ListRecords URL shared by the first request and the follow-ups
    _LIST_RECORDS_URL = 'http://export.arxiv.org/oai2?verb=ListRecords'
    # Pause between consecutive pages, and fallback wait after a 503 reply
    _PAGE_DELAY_SECONDS = 20
    # Columns that must be read back as strings
    _CSV_DTYPES = {
        'filename': str,
        'filename_parsed': str,
        'identifier': str,
        'updated': str,
        'doi': str,
    }

    def __init__(self, update=False):
        '''
        update: when True, refresh the CSV file before loading identifiers.
        '''
        self.metadata_filepath = self._DEFAULT_METADATA_FILEPATH
        # Automatically check for any updates
        if update:
            self.update()
        self.get_identifiers()

    @staticmethod
    def _retry_delay(header_value, default=20):
        '''
        Turn a Retry-After header value into a number of seconds to wait.
        Falls back to `default` when the header is missing or not an integer.
        '''
        try:
            return max(0, int(header_value))
        except (TypeError, ValueError):
            return default

    @classmethod
    def _fetch_page(cls, url):
        '''
        Download one page and return its body as text.

        A 503 reply is retried after the wait requested by the server.
        Any other error status raises requests.exceptions.HTTPError instead
        of letting an error body be parsed as if it were metadata.
        '''
        while True:
            print('Requesting: ' + url)
            response = requests.get(url)
            if response.status_code != 503:
                response.raise_for_status()
                return response.text
            wait = cls._retry_delay(response.headers.get('Retry-After'),
                                    cls._PAGE_DELAY_SECONDS)
            print('HTTPError: Waiting ' + str(wait) + 's to retry requesting metadata...')
            time.sleep(wait)

    @staticmethod
    def _parse_record(record):
        '''
        Convert one parsed <record> element into a row dict.
        Elements that are absent from the record are stored as None.
        '''
        def text_of(tag_name):
            element = record.find(tag_name)
            return None if element is None else element.text

        # NOTE(review): as before, an author is only kept when both the
        # forenames and keyname elements are present.
        authors = []
        for author in record.find_all('author'):
            forenames = author.forenames
            keyname = author.keyname
            if forenames is not None and keyname is not None:
                authors.append(forenames.text.strip() + ' ' + keyname.text.strip())

        return {
            # Header data
            'identifier': text_of('identifier'),
            'filename': text_of('id'),
            'spec': text_of('setSpec'),
            'title': text_of('title'),
            'datestamp': text_of('datestamp'),
            'created': text_of('created'),
            # Only the first <updated> element is kept
            'updated': text_of('updated'),
            'authors': ', '.join(authors),
            'categories': text_of('categories'),
            'journal': text_of('journal-ref'),
            'doi': text_of('doi'),
            'abstract': text_of('abstract'),
            'comments': text_of('comment'),
        }

    def request_bulk_metadata(self, date_of_last_request):
        '''
        Requests bulk metadata from OAI2.

        date_of_last_request: object with a strftime method used as the
            'from' filter of the request, or None/falsy to apply no filter.

        Returns a list of dicts, one per record, with the keys identifier,
        filename, spec, title, datestamp, created, updated, authors,
        categories, journal, doi, abstract and comments.

        Raises requests.exceptions.HTTPError on a non-503 error status.
        '''
        rows = []
        url = self._LIST_RECORDS_URL + '&set=physics:astro-ph&metadataPrefix=arXiv'
        # If we have specified the date of the last request, add it to the URL
        if date_of_last_request:
            url += '&from=' + date_of_last_request.strftime('%Y-%m-%d')

        # Continue requesting until we are not given any more resumption tokens
        while url is not None:
            # A fresh download per iteration; reusing the previous body would
            # parse the same page again and never terminate.
            soup = BeautifulSoup(self._fetch_page(url), 'xml')
            for record in soup.find_all('record'):
                rows.append(self._parse_record(record))

            # Get resumption token if provided. An element that is present
            # but has no text carries no token to continue with.
            token = soup.find('resumptionToken')
            token_value = '' if token is None else token.text.strip()
            if not token_value:
                url = None
                continue

            cursor = token.get('cursor')
            total = token.get('completeListSize')
            if cursor is not None and total is not None:
                print('Status: ' + str(int(cursor) + 1) + '—' + str(len(rows)) + '/' + str(total) + '...')
            url = self._LIST_RECORDS_URL + '&resumptionToken=' + token_value
            time.sleep(self._PAGE_DELAY_SECONDS)  # avoid 503 status
        return rows

    @classmethod
    def _read_metadata_csv(cls, filepath):
        '''Load the metadata CSV with the column types both readers rely on.'''
        return pd.read_csv(filepath,
                           dtype=cls._CSV_DTYPES,
                           parse_dates=['date_retrieved'])

    @staticmethod
    def _records_to_frame(records):
        '''
        Build a data frame from row dicts and add the bookkeeping columns
        'date_retrieved' (current local time) and 'filename_parsed'
        (the row's own filename with '/' removed).
        '''
        frame = pd.DataFrame(records)
        frame['date_retrieved'] = datetime.datetime.now()
        frame['filename_parsed'] = frame['filename'].str.replace('/', '', regex=False)
        return frame

    def update(self):
        '''
        Creates the metadata CSV if it does not exist; otherwise requests the
        records added since the last retrieval and appends them to the file.
        '''
        metadata_filepath = self.metadata_filepath or self._DEFAULT_METADATA_FILEPATH

        if not os.path.exists(metadata_filepath):
            # If the metadata file doesn't exist, request all metadata
            print(metadata_filepath + ' is being created...')
            records = self.request_bulk_metadata(None)
            metadata_df = self._records_to_frame(records)
            metadata_df.to_csv(metadata_filepath, index=False)
            print('Metadata has been saved.')
            return

        # If the metadata file exists, load it into a data frame
        existing_metadata_df = self._read_metadata_csv(metadata_filepath)

        # Get the date of the last request. The stored column is kept
        # timezone-naive so that old and new rows are written in one format.
        retrieved = pd.to_datetime(existing_metadata_df['date_retrieved'], utc=True)
        existing_metadata_df['date_retrieved'] = retrieved.dt.tz_localize(None)
        date_of_last_request = retrieved.max()
        print(metadata_filepath + ' last updated on ' + date_of_last_request.strftime('%Y-%m-%d'))
        print('Updating...')

        # Send a request to access metadata since that date
        # NOTE(review): starting one day after the last retrieval looks like
        # it can skip records changed later on the retrieval day - confirm.
        records = self.request_bulk_metadata(date_of_last_request + datetime.timedelta(days=1))
        if not records:
            print('No additional records found. Metadata is up to date.')
            return

        print('Number of new records found: ' + str(len(records)))
        records_df = self._records_to_frame(records)

        # Update metadata file
        metadata_df = pd.concat([existing_metadata_df, records_df], axis=0, sort=True, ignore_index=True)
        # Keep a single datetime dtype whatever the two inputs carried
        metadata_df['date_retrieved'] = pd.to_datetime(metadata_df['date_retrieved'])
        metadata_df.to_csv(metadata_filepath, index=False)
        print('Metadata has been updated.')

    def get_identifiers(self):
        '''
        Loads the 'filename_parsed' column of the metadata CSV into
        self.identifiers.
        '''
        metadata_df = self._read_metadata_csv(self.metadata_filepath)
        self.identifiers = metadata_df['filename_parsed']