-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikipedia_philosophy.py
155 lines (132 loc) · 5.26 KB
/
wikipedia_philosophy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import json
import requests
from bs4 import BeautifulSoup
import wikipedia
from colorama import Fore, Back, Style
# save the links path to philosophy
def save_object(filename, _object):
json_object = json.dumps(_object, indent=3, ensure_ascii=False)
with open(filename, "a") as outfile:
outfile.write(json_object + ',\n')
# remove all links from the last run
def earase_output_file(filename):
with open(filename, 'r+') as outfile:
outfile.truncate(0)
# return a random link
def get_random_link() :
response = requests.get('https://en.wikipedia.org/wiki/Special:Random')
soup = BeautifulSoup(response.content, 'html.parser')
title = soup.find('h1', attrs ={'id':'firstHeading'})
return title.text, wikipedia.page(title.text).url
# detect text between start_char and end_char
def detect_bad_indexes(text, start_char, end_char):
text_between = ''
counter = 0
indexes = []
start_index = 0
end_index = 0
for each in range(text.__len__()):
if counter > 0:
text_between += text[each]
if counter > 0:
text_between += text[each]
elif counter < 0 :
counter = 0
if text[each] == start_char :
if counter == 0 :
start_index = each
counter += 1
if text[each] == end_char:
counter -= 1
if counter == 0 :
end_index = each
indexes.append((start_index, end_index))
elif counter < 0 :
start_index = end_index = 0
return indexes
# check if a is inside of bad indexes
def check_a_isbad(a_tag_text, text, bad_indexes):
a_tag_text_index = text.index(a_tag_text)
res = [x for x in bad_indexes if a_tag_text_index > x[0] and a_tag_text_index < x[1]]
if res :
return True
else:
return False
# opens the link
def open_link(link) :
response = requests.get(link)
soup = BeautifulSoup(response.content, 'html.parser')
div_tag = soup.find('div', attrs={'class':'mw-parser-output'})
p_tags = div_tag.find_all("p", recursive = False)
counter = 1
for each_p in p_tags:
if each_p.has_attr('class') and each_p['class'][0] == 'mw-empty-elt' :
continue
bad_par_indexes = detect_bad_indexes(each_p.text, '(', ')')
bad_curly_indexes = detect_bad_indexes(each_p.text, '{', '}')
a_tags = each_p.find_all('a', recursive = False)
for each_a in a_tags :
if each_a != None :
a_text = each_a.text
a_href = each_a['href']
# detect text in parantheses and curly brackets
if check_a_isbad(a_text, each_p.text, bad_par_indexes) :
continue
if check_a_isbad(a_text, each_p.text, bad_curly_indexes) :
continue
# Detect red links
if each_a.has_attr('class') and each_a['class'][0] in ['new', 'mw-disambig'] :
continue
# Detect links to the out side of wikipedia
if 'https://' in a_href or 'http://' in a_href:
continue
ret_val ={
'a_text' : each_a.text,
'a_href' : each_a['href'],
'p_number' : counter,
}
return ret_val, True
counter += 1
return {}, False
if __name__ == "__main__":
wiki_english = 'https://en.wikipedia.org/wiki/Main_Page'
wiki_link = 'https://en.wikipedia.org'
wiki_random = ''
wiki_philosophy = '/wiki/Philosophy'
output_file = 'links_path.json'
print(Style.RESET_ALL + Style.DIM + Fore.GREEN + 'Starting game ...' + Style.RESET_ALL)
wiki_random = get_random_link()
earase_output_file(output_file)
counter = 1
flag = False
link = dict()
link['a_text'] = wiki_random[0]
link['a_href'] = wiki_random[1].replace(wiki_link,'')
link['p_number'] = 0
check_loop = set()
while counter < 120:
if link['a_href'] in check_loop:
print(Style.RESET_ALL + Style.DIM + Fore.RED + 'Oops! It is a loop!' + Style.RESET_ALL)
flag = False
break
else :
check_loop.add(link['a_href'])
temp_link = wiki_link + link['a_href']
print(Style.RESET_ALL + Style.DIM + str(counter) , link['a_text']
, Fore.BLUE + f'({temp_link})'+ Style.RESET_ALL)
save_object(output_file,link)
link, flag = open_link(temp_link)
if flag == False :
break
if link['a_href'] == wiki_philosophy :
save_object(output_file, link)
counter += 1
print(Style.RESET_ALL + Style.DIM + str(counter) , link['a_text'] + Style.RESET_ALL)
flag = True
break
else:
counter += 1
if flag == True :
print(Style.RESET_ALL + Fore.WHITE + Back.GREEN + f'Congratulation! Philosophy has been found on {counter}th time.' + Style.RESET_ALL)
else :
print(Style.RESET_ALL + Fore.WHITE + Back.RED + f'Sorry! Philosophy was not reached.' + Style.RESET_ALL)