-
Notifications
You must be signed in to change notification settings - Fork 1
/
wiki.py
105 lines (86 loc) · 2.46 KB
/
wiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import requests
import json
import aiohttp
import sys
# URL passed to session.get() by the request helpers in this file; the query
# parameters themselves are supplied separately through `params=`.
BASE_URL = 'https://en.wikipedia.org/w/api.php?'
async def wiki_request(session, topic, is_source):
    """
    Collect all link titles for a topic, following API continuation.

    When ``is_source`` is truthy the outgoing links are requested
    (``_get_links`` / 'plcontinue'); otherwise the pages linking to the
    topic are requested (``_get_linkshere`` / 'lhcontinue').

    Each request asks for at most 500 links, so requests are repeated,
    feeding back the continuation value of the previous response, until a
    response carries no continuation value for the chosen key.

    Returns the list of titles gathered over all responses.
    """
    # The direction never changes between iterations, so pick the fetcher
    # and its continuation key once up front.
    if is_source:
        fetch_batch, cont_type = _get_links, 'plcontinue'
    else:
        fetch_batch, cont_type = _get_linkshere, 'lhcontinue'

    collected = []
    token = None
    while token != 'DONE':
        reply = await fetch_batch(session, topic, token)
        _get_titles(reply, collected, cont_type)
        try:
            token = reply['continue'][cont_type]
        except KeyError:
            # No continuation entry: this was the last batch.
            token = 'DONE'
    return collected
async def _get_links(session, topic, cont):
    """
    Send a single 'prop=links' query for a topic and return the JSON body.

    Args:
        session: object whose ``get(url, params=...)`` returns an async
            context manager yielding the response (used here the way an
            aiohttp client session is used).
        topic: value sent as the 'titles' query parameter.
        cont: value for the 'plcontinue' parameter taken from a previous
            response; a falsy value omits the parameter.

    Returns:
        The decoded JSON body when the HTTP status is 2xx.

    Raises:
        SystemExit: with exit code 1 when the HTTP status is not 2xx.
    """
    payload = {
        'action': 'query',
        'titles': topic,
        'prop': 'links',
        'format': 'json',
        'pllimit': '500',
    }
    if cont:
        payload['plcontinue'] = cont

    # The context manager wraps this one request/response only; this
    # function never closes the session it was given.
    async with session.get(BASE_URL, params=payload) as resp:
        if 200 <= resp.status < 300:
            return await resp.json()
        # Report the failure on stderr, with context, so it does not get
        # mixed into the program's regular output.
        print('links request failed with HTTP status', resp.status,
              file=sys.stderr)
        sys.exit(1)
async def _get_linkshere(session, topic, cont):
    """
    Send a single 'prop=linkshere' query for a topic and return the JSON body.

    Args:
        session: object whose ``get(url, params=...)`` returns an async
            context manager yielding the response (used here the way an
            aiohttp client session is used).
        topic: value sent as the 'titles' query parameter.
        cont: value for the 'lhcontinue' parameter taken from a previous
            response; a falsy value omits the parameter.

    Returns:
        The decoded JSON body when the HTTP status is 2xx.

    Raises:
        SystemExit: with exit code 1 when the HTTP status is not 2xx.
    """
    payload = {
        'action': 'query',
        'titles': topic,
        'prop': 'linkshere',
        'format': 'json',
        'lhlimit': '500',
    }
    if cont:
        payload['lhcontinue'] = cont

    # The context manager wraps this one request/response only; this
    # function never closes the session it was given.
    async with session.get(BASE_URL, params=payload) as resp:
        if 200 <= resp.status < 300:
            return await resp.json()
        # Report the failure on stderr, with context, so it does not get
        # mixed into the program's regular output.
        print('linkshere request failed with HTTP status', resp.status,
              file=sys.stderr)
        sys.exit(1)
def _get_titles(body, titles, cont_type):
    """
    Append the link titles found in a query response to ``titles``.

    Args:
        body: decoded response; titles are read from
            ``body['query']['pages'][<page>][<link_type>][i]['title']``.
        titles: list that is extended in place.
        cont_type: 'plcontinue' selects the 'links' entries of each page;
            any other value selects the 'linkshere' entries.

    Returns:
        None; the result is delivered through ``titles``.

    Responses typically hold a single page, but every page present is
    processed. Pages without the selected entry are skipped.
    """
    link_type = 'links' if cont_type == 'plcontinue' else 'linkshere'

    # A reply without a 'query' or 'pages' section contributes no titles
    # instead of raising KeyError (presumably such replies occur for API
    # error payloads or an empty title -- not verified here).
    pages = body.get('query', {}).get('pages', {})

    for page in pages.values():
        titles.extend(link['title'] for link in page.get(link_type, ()))