-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix-only-name.py
183 lines (145 loc) · 5.85 KB
/
fix-only-name.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/local/bin/python3
# Fix a ReadCube-generated BibTeX file: re-resolve each entry via its DOI
# (or a Crossref title search) and rewrite the entry's metadata.
import argparse
import json
import bibtexparser
import requests
import sys
import os
import time
from bibtexparser.bwriter import BibTexWriter
from crossref.restful import Works
# Shared Crossref REST client used by find_by_title().
works = Works()
# On-disk JSON caches that persist between runs:
#   BIB_CACHE_FILE_PATH   maps DOI -> serialized BibTeX string
#   TITLE_CACHE_FILE_PATH maps title -> DOI (or null when no match was accepted)
BIB_CACHE_FILE_PATH = "/tmp/fix-bibtex.bib.cache"
TITLE_CACHE_FILE_PATH = "/tmp/fix-bibtex.title.cache"
def doi2bib(doi, cache):
    """
    Resolve a DOI to a parsed BibTeX entry dict, with caching.

    Queries dx.doi.org content negotiation twice: once for BibTeX metadata
    and once for citeproc JSON (used to recover subtitles, which the BibTeX
    response drops).  Reference: https://gist.github.com/jrsmith3/5513926

    NOTE(review): this function reads the module-level globals ``bib_entry``,
    ``i`` and ``j`` that the __main__ loop maintains — it only works when
    driven by that loop.

    :param doi: the DOI to resolve (without any URL prefix)
    :param cache: dict mapping DOI -> serialized BibTeX string
    :return: the entry dict on success, or None when the BibTeX response
             could not be parsed (so safe_doi2bib can retry).
    """
    if doi in cache:
        return bibtexparser.loads(cache[doi]).entries[0]
    url = "http://dx.doi.org/" + doi
    bib_headers = {"accept": "application/x-bibtex"}
    r = requests.get(url, headers=bib_headers)
    new_bibtex = r.text
    json_headers = {"accept": "application/citeproc+json"}
    r = requests.get(url, headers=json_headers)
    r_json = json.loads(r.text)
    parsed_bibtex = bibtexparser.loads(new_bibtex)
    if not parsed_bibtex.entries:
        # BUG FIX: the original cached the (empty) database even on a parse
        # failure, poisoning the cache — the cache-hit path above would then
        # raise IndexError on .entries[0].  Report and return None without
        # caching so the caller can retry.
        print("parse failed", file=sys.stderr)
        print(doi, file=sys.stderr)
        print(new_bibtex, file=sys.stderr)
        return None
    new_bib_entry_local = parsed_bibtex.entries[0]
    # Preserve the citation key of the entry currently being fixed.
    new_bib_entry_local['ID'] = bib_entry['ID']
    if "subtitle" in r_json and len(r_json["subtitle"]) > 0:
        # The BibTeX response omits subtitles; append the first one.
        new_bib_entry_local["title"] = new_bib_entry_local["title"] + ": " + r_json["subtitle"][0]
    if "subtitle" in r_json and len(r_json["subtitle"]) > 1:
        print("Multiple subtitles:", file=sys.stderr)
        print(r_json["subtitle"], file=sys.stderr)
    # Crossref still lists some of Amy Ko's papers under her deadname.
    if "author" in r_json and "Andrew J. Ko" in new_bib_entry_local["author"]:
        new_bib_entry_local["author"] = new_bib_entry_local["author"].replace("Andrew J. Ko", "Amy J. Ko")
        print("Update Amy's name!", file=sys.stderr)
    parsed_bibtex.entries[0] = new_bib_entry_local
    print('%d / %d BibTex entries fixed!' % (i, j), file=sys.stderr)
    # Only successful resolutions reach this point, so the cache stays clean.
    cache[doi] = bibtexparser.dumps(parsed_bibtex)
    return new_bib_entry_local
# Maximum DOI-resolution attempts per entry before giving up.
MAX_RETRY = 10
# Set to True once the user answers 's' (stop) in find_by_title();
# suppresses all further interactive prompts for this run.
STOP_ASKING = False
def safe_doi2bib(doi, cache):
    """
    Retry wrapper around doi2bib: try up to MAX_RETRY times with a 1-second
    pause between attempts, returning the first successful entry dict.

    :param doi: the DOI to resolve
    :param cache: dict mapping DOI -> serialized BibTeX string
    :return: the resolved entry dict, or None if every attempt failed.
    """
    for _attempt in range(MAX_RETRY):
        result = doi2bib(doi, cache)
        # BUG FIX: the original tested `result != ''`, which accepted the
        # empty dict doi2bib used to return on failure, so the loop never
        # actually retried.  Truthiness rejects both {} and None.
        if result:
            return result
        time.sleep(1)
    print("timeout", file=sys.stderr)
    return None
#%%
def find_by_title(title, cache):
    """
    Look up a paper title on Crossref and return its DOI, or None.

    The top hit is accepted automatically when the normalized title (lowercase
    alphanumerics only, length > 10) matches exactly — with or without the
    subtitle appended.  Otherwise the user is prompted, unless STOP_ASKING was
    set by a previous 's' answer.  Decisions (including rejections) are
    memoized in ``cache``.

    :param title: the title to search for
    :param cache: dict mapping title -> DOI (or None for rejected titles)
    :return: the DOI string, or None when no match was accepted.
    """
    global STOP_ASKING
    if title in cache:
        return cache[title]
    res = works.query(title).sort('score')
    # NOTE(review): the original also called res.sample(1) and discarded the
    # result — removed as dead code.
    first_result = None
    for candidate in res:
        first_result = candidate
        break
    if first_result is None:
        # Robustness fix: the original crashed (TypeError) when Crossref
        # returned no hits at all.  Do not cache: the miss may be transient.
        print("no result found")
        return None
    result_title = first_result['title'][0]
    print("original: " + title)
    print("found: " + result_title)
    if "subtitle" in first_result:
        print("subtitle: " + str(first_result["subtitle"]))
    print("doi: " + first_result["DOI"])
    # Normalize both titles to lowercase alphanumerics for comparison.
    raw_title = ''.join(e for e in title if e.isalnum()).lower()
    raw_result_title = ''.join(e for e in result_title if e.isalnum()).lower()
    raw_result_title_and_subtitle = raw_result_title
    if "subtitle" in first_result:
        raw_result_title_and_subtitle += ''.join(e for e in first_result["subtitle"][0] if e.isalnum()).lower()
    doi_result = None
    if len(raw_title) > 10 and (raw_title == raw_result_title or raw_title == raw_result_title_and_subtitle):
        print("automatically matched")
        doi_result = first_result["DOI"]
    elif not STOP_ASKING:
        user_input = 'a'
        # BUG FIX: the original tested `user_input not in 'nys'`, a SUBSTRING
        # test — an empty answer ('' in 'nys' is True) or 'ny' escaped the
        # loop with no decision recorded.  Compare against a tuple instead.
        while user_input not in ('n', 'y', 's'):
            user_input = input("y/n/complete doi?/s")
            print(user_input)
            if user_input.startswith('https://doi.org/'):
                doi_result = user_input.replace('https://doi.org/', '')
                break
        if user_input == 'y':
            doi_result = first_result["DOI"]
        if user_input == 's':
            STOP_ASKING = True
    # Don't cache a None produced merely because prompting was suppressed.
    if not STOP_ASKING or doi_result is not None:
        cache[title] = doi_result
    return doi_result
#%%
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fix BibTex File generated by ReadCube")
    parser.add_argument('bibtex_file', metavar='FILE', type=str, help='BibTex file generated by ReadCube for fixing')
    parser.add_argument('--out', dest='output_bibtex_file', metavar='OUTPUT', type=str, help='output file location',
                        required=True)
    # BUG FIX: the original called parser.parse_args() twice; once suffices.
    args = parser.parse_args()
    # Load the persistent caches if present (DOI -> BibTeX, title -> DOI).
    if os.path.exists(BIB_CACHE_FILE_PATH):
        with open(BIB_CACHE_FILE_PATH) as cache_file:
            bib_cache = json.load(cache_file)
    else:
        bib_cache = dict()
    if os.path.exists(TITLE_CACHE_FILE_PATH):
        with open(TITLE_CACHE_FILE_PATH) as cache_file:
            title_cache = json.load(cache_file)
    else:
        title_cache = dict()
    with open(args.bibtex_file) as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    print('%d BibTex entries loaded!' % len(bib_database.entries), file=sys.stderr)
    # NOTE: doi2bib reads the globals `bib_entry`, `i` and `j` defined by this
    # loop — keep these names unchanged.
    i = 1  # number of entries fixed so far (shown in doi2bib's progress line)
    j = 1  # number of entries processed so far
    for bib_entry in bib_database.entries:
        doi = None
        if 'doi' in bib_entry:
            doi = bib_entry['doi']
        elif 'title' in bib_entry:
            doi = find_by_title(bib_entry['title'], title_cache)
        if doi is not None:
            new_bib_entry = safe_doi2bib(doi, bib_cache)
            # Robustness fix: safe_doi2bib may fail; the original crashed
            # looking up 'ID' on its empty/None return value.
            if new_bib_entry:
                bib_database.entries_dict[new_bib_entry['ID']] = new_bib_entry
                i += 1
        j += 1
    # Persist both caches for the next run.
    with open(BIB_CACHE_FILE_PATH, 'w') as cache_file:
        json.dump(bib_cache, cache_file)
    with open(TITLE_CACHE_FILE_PATH, 'w') as cache_file:
        json.dump(title_cache, cache_file)
    # Rebuild the entries list from the dict so replaced entries win.
    bib_database.entries = list(bib_database.entries_dict.values())
    with open(args.output_bibtex_file, 'w') as bibtex_file:
        bibtex_file.write(BibTexWriter().write(bib_database))
#%%
#print(find_by_title('Your location has been shared 5,398 times!: A field study on mobile app privacy nudging'))
#%%