# wikipedia_fr_parse.py
import requests
from bs4 import BeautifulSoup
from keras.utils import get_file
from timeit import default_timer as timer
import os
import pandas as pd
import matplotlib.pyplot as plt
import bz2
import subprocess
import xml.sax
from WikiXml import WikiXmlHandler
import wiki_dump_parser
import wikiextractor
# Fetch the French Wikipedia dump index and locate the 20201120 dump directory
base_url = 'https://dumps.wikimedia.org/frwiki/'
index = requests.get(base_url).text
soup_index = BeautifulSoup(index, 'html.parser')
dumps = [a['href'] for a in soup_index.find_all('a') if a.text == '20201120/']
print(dumps)
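# Hedged sketch (defined but never called): rather than hard-coding '20201120/'
# above, the most recent dated dump directory could be picked from the index.
# This assumes the index links look like 'YYYYMMDD/', as on dumps.wikimedia.org;
# the helper name is illustrative only.
def latest_dump_dir(soup):
    """Return the most recent 'YYYYMMDD/' link found in the dump index soup."""
    dated = [a['href'] for a in soup.find_all('a', href=True)
             if a['href'].endswith('/') and len(a['href']) == 9 and a['href'][:-1].isdigit()]
    return max(dated) if dated else None

# Example usage (commented out so the script's behaviour is unchanged):
# print(latest_dump_dir(soup_index))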
dumps_url = base_url + dumps[0]
dump_html = requests.get(dumps_url).text
soup_dump = BeautifulSoup(dump_html, 'html.parser')
files = []
# Keep only the pages-articles partitions listed on the dump page
for file in soup_dump.find_all('li', {'class': 'file'}):
    text = file.text
    if 'pages-articles' in text:
        files.append((text.split()[0], text.split()[1:]))
files_to_download = [file[0] for file in files if '.xml-p' in file[0]]
print(f'There are {len(files_to_download)} files to download.')
data_paths = []
file_info = []
keras_home = '/home/shinjini/.keras/datasets/'
start = timer()
# Download any missing partitions with keras' get_file (cached under keras_home)
# and record (filename, size in MB, number of article ids) for each partition.
for file in files_to_download:
    path = keras_home + file
    if not os.path.exists(path):
        print('Downloading')
        # get_file expects the full URL of the file to download as its origin
        data_paths.append(get_file(file, dumps_url + file))
        file_size = os.stat(path).st_size / 1e6
        # The trailing p<start>p<end> in the filename gives the article id range
        file_articles = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
        file_info.append((file, file_size, file_articles))
    else:
        data_paths.append(path)
        file_size = os.stat(path).st_size / 1e6
        file_number = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
        file_info.append((file.split('-')[-1], file_size, file_number))
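# Hedged worked example of the article-count arithmetic above, assuming a
# partition named like 'frwiki-20201101-pages-articles1.xml-p1p306134.bz2':
#   name.split('p')      -> [..., '1', '306134.bz2']
#   [-1].split('.')[-2]  -> '306134'  (end of the article id range)
#   [-2]                 -> '1'       (start of the range)
# so the partition is counted as 306134 - 1 = 306133 article ids.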
# inspect file sizes
print("The largest files are:")
print("Filename, file size, number of articles")
for thing in sorted(file_info, key=lambda x: x[1], reverse=True)[:5]:
    print(thing)
print(f'There are {len(file_info)} partitions.')
file_df = pd.DataFrame(file_info, columns = ['file', 'size (MB)', 'articles']).set_index('file')
file_df['size (MB)'].plot.bar(color = 'red', figsize = (12, 6))
# plt.show()
print(f"The total size of files on disk is {file_df['size (MB)'].sum() / 1e3} GB")
# print(data_paths)
data_path = data_paths[-1]
# inspect data
# lines = []
#
# for i, line in enumerate(subprocess.Popen(['bzcat'],
#                                           stdin=open(data_path),
#                                           stdout=subprocess.PIPE).stdout):
#     lines.append(line)
#     if i > 5e5:
#         break
#
# print(lines[-165:-100])
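# Hedged sketch (defined but never called): the same peek at the compressed dump
# can be done with the bz2 module imported above instead of an external bzcat
# process. The function name and default line limit are illustrative assumptions.
def peek_dump(path, max_lines=100):
    """Yield the first max_lines decompressed lines of a .bz2 dump for inspection."""
    with bz2.open(path, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= max_lines:
                break
            yield line

# Example usage (commented out):
# for line in peek_dump(data_path, max_lines=20):
#     print(line, end='')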
# Content handler for Wiki XML
# handler = WikiXmlHandler()
# Parsing object
# parser = xml.sax.make_parser()
# parser.setContentHandler(handler)
# for l in lines[-165:-109]:
#     parser.feed(l)
#
# print(handler._pages)
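# Hedged sketch of the handler imported from WikiXml at the top of this file.
# The real WikiXmlHandler is not shown here; this assumed version collects
# (title, text) pairs from <page> elements, matching how handler._pages is used
# in the commented-out code above. It is defined under a different name so it
# does not shadow the actual import.
class WikiXmlHandlerSketch(xml.sax.ContentHandler):
    def __init__(self):
        super().__init__()
        self._buffer = []
        self._current_tag = None
        self._values = {}
        self._pages = []

    def startElement(self, name, attrs):
        # Start buffering character data for <title> and <text> elements
        if name in ('title', 'text'):
            self._current_tag = name
            self._buffer = []

    def characters(self, content):
        if self._current_tag:
            self._buffer.append(content)

    def endElement(self, name):
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)
            self._current_tag = None
        if name == 'page':
            self._pages.append((self._values.get('title'), self._values.get('text')))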
# ALTERNATIVE
xml_path = data_path[:-4]  # strip '.bz2' to get the decompressed XML path; overridden by the hard-coded path below
base_path = "/home/shinjini/.keras/datasets/"
# xml_path = base_path + "frwiki-20201120-pages-articles-multistream6-p13574284p13718495.xml"
# xml_path = base_path + "frwiki-20201120-pages-articles1-p1p306134.xml"
# xml_path = base_path + "frwiki-20201120-pages-articles-multistream6-p9074284p10574283.xml"
# xml_path = base_path + "frwiki-20201101-pages-meta-current1-p1p306134.xml"
xml_path = base_path + "frwiki-20201101-pages-articles1-p1p306134.xml"
print("XML to use is", xml_path)
# wiki_dump_parser.xml_to_csv(xml_path)
# ALTERNATIVE
# python3 -m wikiextractor.WikiExtractor "/home/shinjini/.keras/datasets/frwiki-20201101-pages-articles1-p1p306134.xml"
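# Hedged sketch (defined but never called): the shell command in the comment
# above could also be launched from this script with the already-imported
# subprocess module. By default wikiextractor should write its output under a
# 'text/' directory; treat this as a sketch, not the script's actual extraction step.
def run_wikiextractor(xml_file):
    """Run 'python3 -m wikiextractor.WikiExtractor <xml_file>' as a subprocess."""
    subprocess.run(['python3', '-m', 'wikiextractor.WikiExtractor', xml_file], check=True)

# Example usage (commented out):
# run_wikiextractor(xml_path)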