pool.py
import random

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent, FakeUserAgentError
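
# The imports above need third-party packages; assuming a standard setup,
# they install with:
#   pip install requests beautifulsoup4 fake-useragent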

class Pool:
    def __init__(self, logger):
        self.logger = logger
        self.proxy_pool()
        self.accepts_pool()
    # Get the proxy pool.
    # This source is free and painfully slow, so you should definitely invest in a better one if you can afford it.
    # (It was so slow I haven't finished running the script, by the way.)
    # (And I'm trying to scrape 3k+ pages in one go without setting up proper precautions first.)
    def proxy_pool(self):
        try:
            url = 'https://www.sslproxies.org/'
            with requests.Session() as session:  # the session closes as soon as we are done
                proxies_page = session.get(url)
            soup = BeautifulSoup(proxies_page.text, 'html.parser')
            proxies_table = soup.find(lambda tag: tag.name == 'table' and 'IP Address' in tag.text)  # the table whose header mentions 'IP Address'
            proxies_table_body = proxies_table.find('tbody')  # the body of that table
            proxies = []
            for row in proxies_table_body.find_all('tr'):
                cells = row.find_all('td')
                proxies.append('{}:{}'.format(cells[0].string.replace(' ', ''), cells[1].string))  # IP address and port
        # If the connection fails, fall back to pre-defined proxies gathered
        # from different sources, so hopefully they don't all fail at once.
        # (setting up flags: moments before disaster (hasn't happened yet))
        except Exception as error:
            self.logger.error(f"proxy_pool() encountered an error; reverting to the pre-defined IPs and ports. {type(error).__name__}: {error}")  # log the error to file
            proxies = [
                '145.239.85.58:9300',   # from vpnoverview
                '8.219.74.58:8080',     # from free-proxy-list
                '103.152.112.145:80',   # from freeproxylist
                '195.201.61.51:8000'    # from javatpoint
            ]
        self.proxies = proxies

    # Get the pool of Accept header values.
    def accepts_pool(self):
        try:
            url = 'https://developer.mozilla.org/en-US/docs/Web/HTTP/Content_negotiation/List_of_default_Accept_values'
            with requests.Session() as session:  # the session closes as soon as we are done
                accepts_page = session.get(url)
            soup = BeautifulSoup(accepts_page.text, 'html.parser')
            accepts_table = soup.find('h2', id='default_values').parent  # the section containing the heading with id 'default_values'
            accepts_table_body = accepts_table.find('tbody')  # the body of the table in that section
            accepts = {}  # maps a browser name ('Firefox' or 'Chrome') to its default Accept string
            accepts['Firefox'] = accepts_table_body.find(lambda tag: tag.name == 'td' and 'Firefox' in tag.text).next_sibling.next_sibling.text  # for Firefox
            accepts['Chrome'] = accepts_table_body.find(lambda tag: tag.name == 'td' and 'Chrome' in tag.text).next_sibling.next_sibling.text  # for Chrome
        # If the connection fails, fall back to pre-defined Accept values.
        except Exception as error:
            self.logger.error(f"accepts_pool() encountered an error; reverting to the pre-defined Accept values. {type(error).__name__}: {error}")  # log the error to file
            accepts = {
                'Firefox': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                'Chrome': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
            }
        self.accepts = accepts

    # Pick a random user-agent value.
    def user_agent_pool(self):
        try:
            ua = UserAgent()  # create a UserAgent instance
            if random.random() > 0.5:  # randomise the browser to make blocking harder
                random_user_agent = ua.chrome  # a Chrome user agent
            else:
                random_user_agent = ua.firefox  # a Firefox user agent
        # If fake-useragent fails, fall back to pre-defined, known-valid user agents.
        except FakeUserAgentError as error:
            self.logger.error(f"FakeUserAgent didn't work; choosing from the pre-defined set of user agents. {type(error).__name__}: {error}")  # log the error to file
            user_agents = [
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
                "Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"
            ]
            random_user_agent = random.choice(user_agents)
        self.random_user_agent = random_user_agent

    # Build one set of headers: a user agent plus the matching Accept value.
    def header_pool(self):
        self.user_agent_pool()
        valid_accept = self.accepts['Firefox'] if 'Firefox' in self.random_user_agent else self.accepts['Chrome']
        headers = {
            'User-Agent': self.random_user_agent,
            'Accept': valid_accept
        }
        return headers

    # Create the proxy and header pools so they can be rotated together.
    def create_pools(self):
        self.headers = [self.header_pool() for _ in range(len(self.proxies))]
        return len(self.proxies), self.proxies, self.headers
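
# A minimal usage sketch: one way to rotate through the pools returned by
# create_pools(). The log file name and target URL below are placeholder
# assumptions, not values from the original project.
if __name__ == '__main__':
    import logging

    logging.basicConfig(filename='pool_errors.log', level=logging.ERROR)
    logger = logging.getLogger(__name__)

    pool = Pool(logger)
    size, proxies, headers = pool.create_pools()
    for i in range(size):
        try:
            # rotate proxies and headers in lockstep: index i picks one of each
            response = requests.get(
                'https://example.com',  # placeholder target URL
                proxies={'http': f'http://{proxies[i]}', 'https': f'http://{proxies[i]}'},
                headers=headers[i],
                timeout=10,
            )
            print(i, response.status_code)
        except requests.RequestException as error:
            logger.error(f"request {i} failed with {type(error).__name__}: {error}")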