-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
114 lines (100 loc) · 4.34 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import hashlib
import json
import os
import time

import requests
from lxml import html
from tqdm import tqdm
def get_num_pages(tree):
    """Return the highest page number in an AO3 listing's pagination bar.

    Parameters
    ----------
    tree : lxml.html.HtmlElement
        Parsed HTML of a listing page (only ``tree.xpath`` is used).

    Returns
    -------
    int
        Largest numeric pagination entry, or 0 when the page has no
        pagination list (single-page listings).
    """
    pages = tree.xpath("//ol[@title='pagination']/li//text()")
    max_page = 0
    for page in pages:
        try:
            max_page = max(max_page, int(page))
        except ValueError:
            # Non-numeric entries such as "Next →" / "← Previous" are expected;
            # only int-conversion failures are skipped (a bare except here would
            # also swallow KeyboardInterrupt etc.).
            pass
    return max_page
def request_till_200(url):
    """GET *url* repeatedly until the server answers 200 or 404.

    A fixed 1-second pause after every request keeps the crawl polite;
    any other status (rate limiting, 5xx, ...) or a connection problem
    triggers a 10-second back-off before retrying.

    Parameters
    ----------
    url : str
        Fully-qualified URL to fetch.

    Returns
    -------
    requests.Response
        The first response whose status code is 200 or 404 (404 is
        returned rather than retried — the resource simply isn't there).
    """
    while True:
        try:
            # Without a timeout a stalled connection would hang the crawl forever.
            r = requests.get(url, timeout=30)
        except requests.RequestException:
            time.sleep(10)
            continue
        time.sleep(1)
        if r.status_code in (200, 404):
            return r
        time.sleep(10)
# Root directory for all scraped works; one sub-directory per relationship tag.
os.makedirs("fanfictions", exist_ok=True)

# AO3 relationship tags to crawl.  AO3 encodes "/" as "*s*" in tag URLs, and
# the same replacement is used below for the on-disk directory names.
relationships = [
    "Sherlock Holmes/John Watson",
    "Draco Malfoy/Harry Potter",
    "Steve Rogers/Tony Stark",
    "Castiel/Dean Winchester"
]
def unpack_if_single(l):
    """Collapse a one-element xpath result to its scalar; pass longer lists through."""
    if len(l) == 1:
        return l[0]
    return l

for fandom in relationships:
    fandom_dir = os.path.join("fanfictions", fandom.replace("/", "*s*"))
    os.makedirs(fandom_dir, exist_ok=True)

    # Collect author usernames from the first 24 listing pages of the tag.
    authors = []
    for page in range(1, 25):
        r = request_till_200("https://archiveofourown.org/tags/" + fandom.replace("/", "*s*") + "/works?page=" + str(page))
        tree = html.fromstring(r.text)
        page_authors = tree.xpath("//a[@rel='author']/@href")
        # hrefs look like "/users/<name>/pseuds/<pseud>" — keep the last segment.
        authors += [p.split("/")[-1] for p in page_authors]

    # Get list of works for each user.
    for username in tqdm(authors, desc=fandom, position=0):
        page = 1
        max_page = 1  # refined after the first fetch via get_num_pages()
        works = []
        # "<=" so the final pagination page is fetched too (the original
        # "< max_page" silently dropped every author's last page of works).
        while page <= max_page:
            r = request_till_200("https://archiveofourown.org/users/" + username + "/works?page=" + str(page))
            tree = html.fromstring(r.text)
            max_page = get_num_pages(tree)
            page_works = tree.xpath("//h4[@class='heading']/a/@href")
            works += [w.split("/")[-1] for w in page_works if w.startswith("/works/")]
            page += 1

        # Get work text and metadata.
        for fic_id in tqdm(works, desc=username, position=1, leave=False):
            url = 'http://archiveofourown.org/works/' + str(fic_id) + '?view_adult=true'
            url += '&view_full_work=true'
            r = request_till_200(url)
            tree = html.fromstring(r.text)

            text_ps = [p.strip() for p in tree.xpath("//div[@id='chapters']//p/text()")]
            text = "\n".join(text_ps).strip()

            summary_ps = [p.strip() for p in tree.xpath("//div[@class='summary module']//blockquote//text()")]
            summary = "\n".join(summary_ps).strip()

            # request_till_200 deliberately passes 404 responses through, and
            # deleted/restricted works have none of the metadata nodes below —
            # skip instead of crashing on an empty xpath result.
            title_nodes = tree.xpath("//h2[@class='title heading']//text()")
            language_nodes = tree.xpath("//dd[@class='language']//text()")
            if not title_nodes or not language_nodes:
                continue

            # NOTE: the original selectors used "///dd", which is not valid
            # XPath; "//dd" is the intended descendant search.
            metadata = {
                "author": username,
                "summary": summary,
                "url": url,
                "title": title_nodes[0].strip(),
                "fandoms": unpack_if_single(tree.xpath("//dd[@class='fandom tags']//a//text()")),
                "category": unpack_if_single(tree.xpath("//dd[@class='category tags']//a//text()")),
                "relationship": unpack_if_single(tree.xpath("//dd[@class='relationship tags']//a//text()")),
                "rating": unpack_if_single(tree.xpath("//dd[@class='rating tags']//a//text()")),
                "warning": unpack_if_single(tree.xpath("//dd[@class='warning tags']//a//text()")),
                "characters": unpack_if_single(tree.xpath("//dd[@class='character tags']//a//text()")),
                "language": language_nodes[0].strip(),
            }

            # Keep only English works whose sole relationship tag is the one
            # being crawled (a str value means exactly one tag was present).
            if isinstance(metadata["relationship"], str) and metadata["relationship"] == fandom and metadata["language"] == "English":
                # Content-addressed filename: identical texts dedupe naturally.
                doc_id = hashlib.sha1(text.encode("utf-8")).hexdigest()
                base_dir = os.path.join(fandom_dir, username)
                os.makedirs(base_dir, exist_ok=True)
                with open(os.path.join(base_dir, doc_id + ".txt"), 'w', encoding='utf-8') as f:
                    f.write(text)
                with open(os.path.join(base_dir, doc_id + ".json"), 'w', encoding='utf-8') as f:
                    json.dump(metadata, f, indent=2)