This repository has been archived by the owner on Dec 9, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 8
/
fetch_bytes.py
88 lines (78 loc) · 2.6 KB
/
fetch_bytes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
""" Loop through the ISSNs, gather abstracts and write them to abstracts-<MONTH>/ as serialized spaCy docs. """
import json
import os
import requests
import spacy
from time import sleep
from bs4 import BeautifulSoup
# Month tag used to build the name of the input ISSN list, the output
# directory and the output ISSN list for this run.
MONTH = "2021-12"

# Shared spaCy pipeline. Every component named in `disable` is switched off at
# load time; presumably only tokenisation and word vectors are needed
# downstream -- TODO confirm.
nlp = spacy.load(
    "en_core_web_md",
    disable=[
        "tagger",
        "parser",
        "ner",
        "lemmatizer",
        "attribute_ruler",
    ],
)
def fetch(issn, max_retries=5, retry_delay=10, timeout=60):
    """Fetch up to 100 article records for one ISSN from the DOAJ search API.

    The request asks for a page of 100 articles sorted by year, newest first.
    A 429 (rate-limited) response is retried after a pause, at most
    ``max_retries`` times, instead of recursing without bound.

    Args:
        issn: ISSN string, appended verbatim to the search URL.
        max_retries: How many times to retry after a 429 response.
        retry_delay: Seconds to sleep before each retry.
        timeout: Seconds before ``requests`` gives up on the HTTP call.

    Returns:
        The value of the ``"results"`` field of the JSON response, or an
        empty list when the response is not a JSON object, has no results,
        or is still rate-limited after all retries.

    Raises:
        requests.exceptions.RequestException: On network failures, including
            timeouts; these are deliberately not swallowed.
    """
    url = (
        "https://doaj.org/api/v1/search/articles/issn%3A"
        + issn
        + "?pageSize=100&sort=year%3Adesc"
    )

    # The progress counters are globals set by the script's __main__ loop.
    # Guard the lookup so the function also works when imported and called
    # on its own.
    try:
        progress = str(idx + 1) + "/" + str(len(issns))
    except NameError:
        progress = "?/?"

    for attempt in range(max_retries + 1):
        data = requests.get(url, timeout=timeout)
        print(
            "fetching data for "
            + issn
            + ". "
            + progress
            + ". status: "
            + str(data.status_code)
        )
        if data.status_code != 429:
            break
        if attempt < max_retries:
            sleep(retry_delay)
            print("forbidden")
    else:
        # Every attempt was answered with 429: give up on this ISSN.
        return []

    try:
        payload = data.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        return []

    if not isinstance(payload, dict):
        return []
    # A missing or null "results" field becomes an empty list so callers can
    # always take len() of / iterate over the return value.
    return payload.get("results") or []
def parse(articles):
    """Combine the abstracts of ``articles`` into a single spaCy document.

    Each article is expected to be a mapping with the abstract stored at
    ``article["bibjson"]["abstract"]``. Markup is stripped from every
    abstract with BeautifulSoup before the texts are concatenated (each one
    preceded by a single space) and run through the module-level ``nlp``
    pipeline.

    Args:
        articles: Iterable of article records; ``None`` and ``""`` are
            treated as "no articles".

    Returns:
        The spaCy ``Doc`` for the concatenated abstracts, or ``None`` when
        fewer than 10 usable abstracts were found.
    """
    articles = articles or []
    print("Number of articles: " + str(len(articles)))

    texts = []
    for article in articles:
        try:
            abstract = article["bibjson"]["abstract"]
        except (KeyError, TypeError):
            # No abstract, or the record / its "bibjson" entry is not a mapping.
            continue
        if not isinstance(abstract, str):
            # e.g. an explicit null abstract; nothing to extract text from.
            continue
        texts.append(BeautifulSoup(abstract, "lxml").text)

    counter = len(texts)
    if counter < 10:
        print("fail! " + str(counter))
        return None

    # Single join instead of repeated string concatenation; the leading space
    # before every abstract is kept so the resulting text is unchanged.
    return nlp("".join(" " + text for text in texts))
if __name__ == "__main__":
    # The ISSN list file holds a JSON array despite its .txt extension.
    with open("issnlist-" + MONTH + ".txt") as issnfile:
        issns = json.loads(issnfile.read())

    out_dir = "abstracts-" + MONTH
    # Create the output directory up front: on a fresh run the first
    # open(..., "wb") below would otherwise fail with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)

    issns_output = []
    # NOTE: `idx` and `issns` are read as globals by fetch() for its progress
    # message, so these names must not be changed. Only the first 200 ISSNs
    # are processed.
    for idx, issn in enumerate(issns[:200]):
        doc_path = out_dir + "/" + issn

        if os.path.exists(doc_path):
            # Already processed in an earlier run.
            issns_output.append(issn)
            continue

        doc = parse(fetch(issn))
        if not doc:
            # No file yet, and not enough data to create one.
            continue

        # Write the vocab first and the doc file last: the doc file is the
        # "done" marker tested above, so an interrupted run never leaves an
        # ISSN marked complete without its vocab.
        vocab_path = doc_path + "-vocab"
        os.makedirs(vocab_path, exist_ok=True)
        doc.vocab.to_disk(vocab_path)
        with open(doc_path, "wb") as abstractfile:
            abstractfile.write(doc.to_bytes())
        issns_output.append(issn)

    nlp.config.to_disk(out_dir + "/config.cfg")
    with open("issns-" + MONTH + ".txt", "w") as issnfile:
        issnfile.write(json.dumps(issns_output))