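# pics.py: scrape Wikipedia's "Picture of the day" archive pages, collect the
# original-resolution URLs of every image that is at least min_w x min_h pixels,
# and download them into the dl_location directory.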
import shutil
import re
from tqdm import tqdm
from urllib.parse import urlparse
from os.path import basename
import os.path
import requests
import requests_cache
from lxml import html
from lxml.etree import tostring
min_w = 4096
min_h = 2160
big_urls = []
dl_size = 0
dl_location = "pics"
os.makedirs(dl_location, exist_ok=True)  # make sure the download directory exists
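
# Fetch a "File:" description page, read the image dimensions from its fileInfo
# span, and record the original-file URL (plus a rough size estimate in bytes)
# when the image is at least min_w x min_h pixels.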
def get_image(url):
    global dl_size
    url = "https://en.wikipedia.org" + url
    image_page = requests.get(url)
    image_tree = html.fromstring(image_page.content)
    search = image_tree.xpath("//span[@class=\"fileInfo\"]")
    if len(search) > 0:
        info_span = search[0]
    else:
        return
    info_text = info_span.text
    # The fileInfo span reads like "(4,000 × 3,000 pixels, file size: 2.5 MB, MIME type: ...)".
    img_size = info_text[info_text.find("(") + 1 : info_text.find("pixels")]
    size_split = img_size.split("×")
    try:
        img_size = [int(i.replace(",", "")) for i in size_split]
    except ValueError:
        return
    if len(img_size) != 2:
        return
    if img_size[0] >= min_w and img_size[1] >= min_h:
        orig_url = "https:" + image_tree.xpath("//a[@class='internal']/@href")[0]
        big_urls.append(orig_url)
        size_match = re.search(r"file size:\s*(.*)\s*MIME", info_text)
        if size_match is None:
            return
        file_size = size_match.group(1).split()
        if len(file_size) == 2:
            num_units = float(file_size[0].replace(",", ""))
            if file_size[1].lower().startswith("mb"):
                dl_size += num_units * 1000000
            else:
                dl_size += num_units * 1000
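
# Scan one month's POTD archive page and pass every non-SVG "File:" link to get_image().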
def get_images_for_month(url):
    url = "https://en.wikipedia.org" + url
    month_page = requests.get(url)
    month_tree = html.fromstring(month_page.content)
    image_urls = [u for u in month_tree.xpath('//a/@href') if "File:" in u and not u.endswith(".svg")]
    for iurl in image_urls:
        get_image(iurl)
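
# Download a single image URL into dl_location, skipping files that already
# exist on disk, and advance the shared progress bar either way.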
def dl(url, pbar):
    pbar.set_description("Downloading file %s" % url)
    url_parts = urlparse(url)
    file_name = basename(url_parts.path)
    if len(file_name) > 180:
        # Truncate very long file names but keep the extension.
        dot_loc = file_name.rfind(".")
        file_name = file_name[:180] + file_name[dot_loc:]
    file_path = dl_location + "/" + file_name
    if not os.path.isfile(file_path):
        r = requests.get(url, stream=True)
        if r.status_code == 200:
            with open(file_path, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
    pbar.update(1)
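
# Main script: cache page requests, pull the list of per-month POTD archive pages
# from the archive header template, scan each month for large images, then
# download everything that qualified.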
requests_cache.install_cache("wikicache")
archive_header_url = "https://en.wikipedia.org/wiki/Template:POTDArchiveHeader"
r = requests.get(archive_header_url)
tree = html.fromstring(r.content)
center = tree.xpath("//center//div[@class = \"hlist\"]//table")[0]
sc = tostring(center, encoding="utf-8", method="xml")  # currently unused
month_urls = [url for url in center.xpath('.//a/@href') if "Picture_of_the_day/" in url and "redlink" not in url]
print(month_urls)
for month_url in month_urls:
    get_images_for_month(month_url)
# done grabbing pages, time to disable the cache
requests_cache.uninstall_cache()
print("Images matching size threshold:", len(big_urls))
print("Estimated download size (bytes):", dl_size)
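# Download every collected URL, tracking progress with tqdm.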
pbar = tqdm(total=len(big_urls))
for img_url in big_urls:
    dl(img_url, pbar)
pbar.close()