scraper.py
import os
import sys
import urllib.parse as urp

import requests
from bs4 import BeautifulSoup

# Link texts that point back up the directory tree and must not be followed.
banned = ["Parent Directory", "../", ".."]

def get_domain(link):
    """Return the scheme and host of a URL, e.g. http://example.com."""
    return '{uri.scheme}://{uri.netloc}'.format(uri=urp.urlparse(link))


def assure_path_exists(path):
    """Create the directory (and any missing parents) if it does not exist yet."""
    if not os.path.exists(path):
        os.makedirs(path)

def recur(url, currentdir):
    """Recursively walk an open directory listing, mirroring its directory
    structure under currentdir and writing the file URLs found in each
    listing to a links.txt inside the matching local directory."""
    root = get_domain(url)
    visited = set()
    r = requests.get(url)
    print("Get request to", url)
    visited.add(url)
    soup = BeautifulSoup(r.content, "lxml")
    ite = soup.find_all("a")
    try:
        # Drop the first anchor, which is usually the "Parent Directory" link.
        ite.pop(0)
    except IndexError:
        # No links on the page, typically a 404 or 403 page on nginx.
        pass
    assure_path_exists(urp.unquote(currentdir))
    file = open(urp.unquote(currentdir) + "links.txt", "w")
    for link in ite:
        href = link.get("href")
        if not href:
            # Skip anchors without an href attribute.
            continue
        # Absolute paths are joined to the domain root, relative ones to the current URL.
        if href[0] == '/':
            temp_url = root + href
        else:
            temp_url = url + href
        if temp_url not in visited and href != '/':
            if href[-1] != '/':
                # A file link: record its full URL in links.txt.
                if href[0] == '/':
                    file.write(root + href + "\n")
                else:
                    file.write(url + href + "\n")
                print(href + " <----- Scraped")
            elif link.text not in banned:
                # A subdirectory: recurse into it unless it is an "up" link.
                if href[0] == '/':
                    po = root + href
                    temp = currentdir + href.split("/")[-2] + "/"
                else:
                    po = url + href
                    temp = currentdir + href
                recur(po, temp)
    file.close()

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("\n\nEnter URL to SCRAPE as COMMAND LINE ARGUMENT\n\n")
    else:
        recur(sys.argv[1], os.getcwd() + "/")
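
# Usage sketch (the URL below is hypothetical, shown only for illustration):
#     python scraper.py http://example.com/files/
# The script mirrors the listing's directory tree under the current working
# directory and writes a links.txt in each created directory containing the
# file URLs found in the corresponding remote listing.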