-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix-only-name.py
183 lines (145 loc) · 5.85 KB
/
fix-only-name.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/local/bin/python3
# Fix a ReadCube-generated BibTeX file: re-resolve each entry via its DOI
# (or a Crossref title search) and rewrite the entry's metadata.
import argparse
import json
import bibtexparser
import requests
import sys
import os
import time
from bibtexparser.bwriter import BibTexWriter
from crossref.restful import Works
# Shared Crossref REST client used by find_by_title().
works = Works()
# On-disk JSON caches that persist between runs:
#   BIB_CACHE_FILE_PATH   maps DOI -> serialized BibTeX string
#   TITLE_CACHE_FILE_PATH maps title -> DOI (or null when no match was accepted)
BIB_CACHE_FILE_PATH = "/tmp/fix-bibtex.bib.cache"
TITLE_CACHE_FILE_PATH = "/tmp/fix-bibtex.title.cache"
def doi2bib(doi, cache):
    """
    Resolve a DOI to a parsed BibTeX entry dict, with caching.

    Queries dx.doi.org content negotiation twice: once for BibTeX metadata
    and once for citeproc JSON (used to recover subtitles, which the BibTeX
    response drops).  Reference: https://gist.github.com/jrsmith3/5513926

    NOTE(review): this function reads the module-level globals ``bib_entry``,
    ``i`` and ``j`` that the __main__ loop maintains — it only works when
    driven by that loop.

    :param doi: the DOI to resolve (without any URL prefix)
    :param cache: dict mapping DOI -> serialized BibTeX string
    :return: the entry dict on success, or None when the BibTeX response
             could not be parsed (so safe_doi2bib can retry).
    """
    if doi in cache:
        return bibtexparser.loads(cache[doi]).entries[0]
    url = "http://dx.doi.org/" + doi
    bib_headers = {"accept": "application/x-bibtex"}
    r = requests.get(url, headers=bib_headers)
    new_bibtex = r.text
    json_headers = {"accept": "application/citeproc+json"}
    r = requests.get(url, headers=json_headers)
    r_json = json.loads(r.text)
    parsed_bibtex = bibtexparser.loads(new_bibtex)
    if not parsed_bibtex.entries:
        # BUG FIX: the original cached the (empty) database even on a parse
        # failure, poisoning the cache — the cache-hit path above would then
        # raise IndexError on .entries[0].  Report and return None without
        # caching so the caller can retry.
        print("parse failed", file=sys.stderr)
        print(doi, file=sys.stderr)
        print(new_bibtex, file=sys.stderr)
        return None
    new_bib_entry_local = parsed_bibtex.entries[0]
    # Preserve the citation key of the entry currently being fixed.
    new_bib_entry_local['ID'] = bib_entry['ID']
    if "subtitle" in r_json and len(r_json["subtitle"]) > 0:
        # The BibTeX response omits subtitles; append the first one.
        new_bib_entry_local["title"] = new_bib_entry_local["title"] + ": " + r_json["subtitle"][0]
    if "subtitle" in r_json and len(r_json["subtitle"]) > 1:
        print("Multiple subtitles:", file=sys.stderr)
        print(r_json["subtitle"], file=sys.stderr)
    # Crossref still lists some of Amy Ko's papers under her deadname.
    if "author" in r_json and "Andrew J. Ko" in new_bib_entry_local["author"]:
        new_bib_entry_local["author"] = new_bib_entry_local["author"].replace("Andrew J. Ko", "Amy J. Ko")
        print("Update Amy's name!", file=sys.stderr)
    parsed_bibtex.entries[0] = new_bib_entry_local
    print('%d / %d BibTex entries fixed!' % (i, j), file=sys.stderr)
    # Only successful resolutions reach this point, so the cache stays clean.
    cache[doi] = bibtexparser.dumps(parsed_bibtex)
    return new_bib_entry_local
# Maximum DOI-resolution attempts per entry before giving up.
MAX_RETRY = 10
# Set to True once the user answers 's' (stop) in find_by_title();
# suppresses all further interactive prompts for this run.
STOP_ASKING = False
def safe_doi2bib(doi, cache):
    """
    Retry wrapper around doi2bib: try up to MAX_RETRY times with a 1-second
    pause between attempts, returning the first successful entry dict.

    :param doi: the DOI to resolve
    :param cache: dict mapping DOI -> serialized BibTeX string
    :return: the resolved entry dict, or None if every attempt failed.
    """
    for _attempt in range(MAX_RETRY):
        result = doi2bib(doi, cache)
        # BUG FIX: the original tested `result != ''`, which accepted the
        # empty dict doi2bib used to return on failure, so the loop never
        # actually retried.  Truthiness rejects both {} and None.
        if result:
            return result
        time.sleep(1)
    print("timeout", file=sys.stderr)
    return None
#%%
def find_by_title(title, cache):
    """
    Look up a paper title on Crossref and return its DOI, or None.

    The top hit is accepted automatically when the normalized title (lowercase
    alphanumerics only, length > 10) matches exactly — with or without the
    subtitle appended.  Otherwise the user is prompted, unless STOP_ASKING was
    set by a previous 's' answer.  Decisions (including rejections) are
    memoized in ``cache``.

    :param title: the title to search for
    :param cache: dict mapping title -> DOI (or None for rejected titles)
    :return: the DOI string, or None when no match was accepted.
    """
    global STOP_ASKING
    if title in cache:
        return cache[title]
    res = works.query(title).sort('score')
    # NOTE(review): the original also called res.sample(1) and discarded the
    # result — removed as dead code.
    first_result = None
    for candidate in res:
        first_result = candidate
        break
    if first_result is None:
        # Robustness fix: the original crashed (TypeError) when Crossref
        # returned no hits at all.  Do not cache: the miss may be transient.
        print("no result found")
        return None
    result_title = first_result['title'][0]
    print("original: " + title)
    print("found: " + result_title)
    if "subtitle" in first_result:
        print("subtitle: " + str(first_result["subtitle"]))
    print("doi: " + first_result["DOI"])
    # Normalize both titles to lowercase alphanumerics for comparison.
    raw_title = ''.join(e for e in title if e.isalnum()).lower()
    raw_result_title = ''.join(e for e in result_title if e.isalnum()).lower()
    raw_result_title_and_subtitle = raw_result_title
    if "subtitle" in first_result:
        raw_result_title_and_subtitle += ''.join(e for e in first_result["subtitle"][0] if e.isalnum()).lower()
    doi_result = None
    if len(raw_title) > 10 and (raw_title == raw_result_title or raw_title == raw_result_title_and_subtitle):
        print("automatically matched")
        doi_result = first_result["DOI"]
    elif not STOP_ASKING:
        user_input = 'a'
        # BUG FIX: the original tested `user_input not in 'nys'`, a SUBSTRING
        # test — an empty answer ('' in 'nys' is True) or 'ny' escaped the
        # loop with no decision recorded.  Compare against a tuple instead.
        while user_input not in ('n', 'y', 's'):
            user_input = input("y/n/complete doi?/s")
            print(user_input)
            if user_input.startswith('https://doi.org/'):
                doi_result = user_input.replace('https://doi.org/', '')
                break
        if user_input == 'y':
            doi_result = first_result["DOI"]
        if user_input == 's':
            STOP_ASKING = True
    # Don't cache a None produced merely because prompting was suppressed.
    if not STOP_ASKING or doi_result is not None:
        cache[title] = doi_result
    return doi_result
#%%
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fix BibTex File generated by ReadCube")
    parser.add_argument('bibtex_file', metavar='FILE', type=str, help='BibTex file generated by ReadCube for fixing')
    parser.add_argument('--out', dest='output_bibtex_file', metavar='OUTPUT', type=str, help='output file location',
                        required=True)
    # BUG FIX: the original called parser.parse_args() twice; once suffices.
    args = parser.parse_args()
    # Load the persistent caches if present (DOI -> BibTeX, title -> DOI).
    if os.path.exists(BIB_CACHE_FILE_PATH):
        with open(BIB_CACHE_FILE_PATH) as cache_file:
            bib_cache = json.load(cache_file)
    else:
        bib_cache = dict()
    if os.path.exists(TITLE_CACHE_FILE_PATH):
        with open(TITLE_CACHE_FILE_PATH) as cache_file:
            title_cache = json.load(cache_file)
    else:
        title_cache = dict()
    with open(args.bibtex_file) as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    print('%d BibTex entries loaded!' % len(bib_database.entries), file=sys.stderr)
    # NOTE: doi2bib reads the globals `bib_entry`, `i` and `j` defined by this
    # loop — keep these names unchanged.
    i = 1  # number of entries fixed so far (shown in doi2bib's progress line)
    j = 1  # number of entries processed so far
    for bib_entry in bib_database.entries:
        doi = None
        if 'doi' in bib_entry:
            doi = bib_entry['doi']
        elif 'title' in bib_entry:
            doi = find_by_title(bib_entry['title'], title_cache)
        if doi is not None:
            new_bib_entry = safe_doi2bib(doi, bib_cache)
            # Robustness fix: safe_doi2bib may fail; the original crashed
            # looking up 'ID' on its empty/None return value.
            if new_bib_entry:
                bib_database.entries_dict[new_bib_entry['ID']] = new_bib_entry
                i += 1
        j += 1
    # Persist both caches for the next run.
    with open(BIB_CACHE_FILE_PATH, 'w') as cache_file:
        json.dump(bib_cache, cache_file)
    with open(TITLE_CACHE_FILE_PATH, 'w') as cache_file:
        json.dump(title_cache, cache_file)
    # Rebuild the entries list from the dict so replaced entries win.
    bib_database.entries = list(bib_database.entries_dict.values())
    with open(args.output_bibtex_file, 'w') as bibtex_file:
        bibtex_file.write(BibTexWriter().write(bib_database))
#%%
#print(find_by_title('Your location has been shared 5,398 times!: A field study on mobile app privacy nudging'))
#%%