-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: jackkruse.py
89 lines (75 loc) · 2.49 KB
/
jackkruse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import re
import time
import random
import typing
import hashlib
import pdfkit
import requests
from bs4 import BeautifulSoup
# Maximum number of retries when exporting a single page to PDF.
ATTEMPTS: int = 100
# Paginated blog index URL template; `{page}` is filled with a 1-based page number.
ROOT: str = "https://jackkruse.com/blogindex/page/{page}/"
def save(filename: str, data: str) -> None:
    """Save `data` into a cache file in /tmp called `filename`.

    Args:
        filename: Bare file name (no directory components) of the cache entry.
        data: Text content to persist.
    """
    path: str = os.path.join(os.sep, "tmp", filename)
    # Explicit encoding: the platform-default codec could mangle non-ASCII HTML.
    with open(path, "w", encoding="utf-8") as file_handler:
        file_handler.write(data)
def get(filename: str) -> str:
    """Load data from a cache file in /tmp called `filename`, if it exists.

    Args:
        filename: Bare file name of the cache entry.

    Returns:
        The file's text content, or "" when the cache file does not exist.
    """
    path: str = os.path.join(os.sep, "tmp", filename)
    if not os.path.isfile(path):
        return ""
    # Explicit encoding to match `save`; avoids platform-default codec issues.
    with open(path, "r", encoding="utf-8") as file_handler:
        return file_handler.read()
def load(url: str) -> str:
    """Download the HTML at `url`, caching the result in /tmp.

    Returns the cached copy when present; otherwise fetches the page,
    stores it under an MD5-derived cache file name, sleeps a random
    0-5 s politeness delay, and returns the response body.

    Raises:
        requests.HTTPError: if the server responds with a non-2xx status.
    """
    # MD5 of the URL gives a stable, filesystem-safe cache key.
    filename: str = hashlib.md5(url.encode("utf-8")).hexdigest() + ".html"
    cached: str = get(filename)
    if cached:
        return cached
    # Fixed: previous f-string had no placeholder and never showed the URL.
    print(f"URL: {url}")
    # Fixed annotation (a Response, not str) and added a timeout so a
    # stalled server cannot hang the crawler forever.
    response: requests.Response = requests.get(url, timeout=60)
    # `assert` is stripped under -O; raise explicitly on HTTP errors instead.
    response.raise_for_status()
    save(filename, response.text)
    # Random delay between requests so we don't hammer the server.
    time.sleep(random.uniform(0, 5))
    return response.text
def pdf(url: str) -> None:
    """Export `url` as a PDF into ./pdf/ unless it is already cached.

    Appends "?print=print" for a print-friendly rendering, names the
    output after the MD5 of that URL, and retries up to ATTEMPTS times
    with a linearly growing back-off. Empty URLs are ignored.

    Args:
        url: Page URL to export; may be "" / falsy, in which case this
            is a no-op.
    """
    print(f"CRAWL: {url}")
    if not url:
        return
    # Print-friendly version of the page renders better as a PDF.
    # (Fixed: original re-annotated and shadowed the `url` parameter.)
    target: str = url.strip() + "?print=print"
    filename: str = hashlib.md5(target.encode("utf-8")).hexdigest() + ".pdf"
    if not os.path.isdir("pdf"):
        os.mkdir("pdf")
    path: str = os.path.join("pdf", filename)
    if os.path.isfile(path):
        return  # already exported; nothing to do
    for attempt in range(ATTEMPTS):
        print(f"PDF: {path}")
        try:
            pdfkit.from_url(target, path)
            return
        except Exception as error:
            # Best-effort retry: wkhtmltopdf is flaky, so back off linearly
            # (0 s, 2 s, 4 s, ...) and try again. Gives up silently after
            # ATTEMPTS failures, matching the original best-effort contract.
            print(f"ERROR: {error}")
            time.sleep(2 * attempt)
def links(html: str) -> typing.Generator[str, None, None]:
    """Yield the absolute URLs of all "Read More" anchors in `html`."""
    parsed = BeautifulSoup(html, "html.parser")
    read_more_anchors = parsed.find_all("a", text=re.compile("Read More"))
    for anchor in read_more_anchors:
        href = anchor.get("href")
        # Skip anchors without an href and relative/non-http targets.
        if not href:
            continue
        if href.startswith("http"):
            yield href
if __name__ == "__main__":
    # Walk index pages in order, exporting every linked post, until a
    # page yields no "Read More" links (i.e. we ran past the last page).
    for page_number in range(1, 100):
        page_url = ROOT.format(page=page_number)
        found_any = False
        for article_url in links(load(page_url)):
            found_any = True
            pdf(article_url)
        if not found_any:
            break
    print("Crawled successfully!")