# Made by Mihajlo Jankovic - 2020 <jankovic.mihajlo99@gmail.com>
#
# This program is free software: you can redistribute it and/or modify it.
# Since I am currently a first-year student, I have been working on this
# project for practice and also for fun.
#
# Thank you for visiting and using!
#
# Enjoy!
import requests
import bs4
import re
import sys
import webbrowser
import time
def is_num(num):  # Check whether a character is a decimal digit
    return ord('0') <= ord(num) <= ord('9')
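# For example: is_num('7') -> True, is_num('x') -> False.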
def convert_to_num(s):  # Convert a price string to an int
    i = 0
    num = 0
    if not is_num(s[i]):
        return (num, 'Kontakt')  # No leading digit: the ad lists "Kontakt" (contact seller) instead of a price
    while i < len(s) and s[i] != ' ' and ord(s[i]) != 160:  # 160 = Unicode NO-BREAK SPACE
        if s[i] == '.':  # Thousands separator, skip it
            i += 1
            continue
        elif s[i] == ',':  # A decimal comma only appears in euro prices
            return (num, '€')
        digit = ord(s[i]) - ord('0')
        num = num * 10 + digit
        i += 1
    return (num, 'din')
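# A few illustrative calls (a sketch based on the price formats the parser
# expects; the exact strings are assumptions, not scraped output):
#   convert_to_num('1.500 din')  -> (1500, 'din')
#   convert_to_num('450,00')     -> (450, '€')
#   convert_to_num('Kontakt')    -> (0, 'Kontakt')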
def get_proxy():  # Scrape IP/port pairs from https://www.sslproxies.org/ (free proxy list)
    url = 'https://www.sslproxies.org/'
    req = requests.get(url)
    soup = bs4.BeautifulSoup(req.content, features='html.parser')
    ip_list = soup.find_all('td')[0::8]    # Every 8th table cell holds an IP address
    port_list = soup.find_all('td')[1::8]  # The cell right after it holds the port
    regex_port = re.compile(r'^[0-9]')
    regex_ip = re.compile(r'^[0-9]*[.]')
    proxy_list = []
    for i in range(len(ip_list)):
        if regex_ip.match(ip_list[i].text) and regex_port.match(port_list[i].text):
            proxy = ip_list[i].text + ':' + port_list[i].text
            proxy_list.append({'https': proxy})
        else:
            break  # The proxy table has ended once a cell stops looking like an IP/port
    return proxy_list
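# Shape of one returned entry (illustrative values, not a real proxy):
#   {'https': '203.0.113.10:8080'}
# which is the mapping that requests expects for its proxies= argument.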
def proxy_request(request_type, url, **kwargs):  # Send a request to the targeted site through the first working proxy
    print('Proxy IP/Port - trying to connect...')
    for proxy in proxy_list:
        try:
            print(proxy)
            req = requests.request(request_type, url, proxies=proxy, timeout=1, **kwargs)
            return req, proxy
        except requests.RequestException:
            pass  # Dead proxy, try the next one
    return None, None  # No proxy worked; return a 2-tuple so callers can still unpack it
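# Usage sketch (hedged; 'https://example.com' is a placeholder, not a real target):
#   req, proxy = proxy_request('get', 'https://example.com')
#   if req is None: the whole proxy list failed and there is nothing to parse.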
proxy_list = get_proxy()
search = input('search (use + instead of SPACEBAR): ')
link = 'https://www.kupujemprodajem.com/search.php?action=list&data%5Bpage%5D=1&data%5Bprev_keywords%5D=tastatura&data%5Border%5D=relevance&submit%5Bsearch%5D=Tra%C5%BEi&dummy=name&data%5Bkeywords%5D=' + search
req, valid_proxy = proxy_request('get', link)
if req is None:  # No proxy could reach the site, so stop here instead of crashing later
    print('Failed to access...\n')
    sys.exit(1)
print('\nIP successfully connected.\n')
print('Valid proxy = ', end='')
print(valid_proxy)
soup = bs4.BeautifulSoup(req.content, features='html.parser')
h1_list = soup.select('h1')
flag = 1
for h1 in h1_list:
    h1 = h1.text.strip()
    print(h1)
    if h1 == 'Vama je blokiran pristup portalu!':  # The site's "your access to the portal is blocked" banner
        print('\nConnection failed (access blocked)...')
        flag = 0
        break
if flag:  # Main scraper
    ads = []
    kp_link = 'https://www.kupujemprodajem.com'
    ad_list_container = soup.find_all('div', {'id': 'adListContainer'})
    ad_list = ad_list_container[0].find_all('div', {'class': 'clearfix'})
    for i in range(1, len(ad_list), 2):  # Every other div in the container is an ad row
        name = ad_list[i].find('img')
        name = name.get('alt', '')
        name = re.sub(' +', ' ', name)
        price = ad_list[i].find('span', {'class': 'adPrice'})
        price = price.text.strip()
        price_tuple = convert_to_num(price)
        price = price_tuple[0]
        if price != 0:
            price_str = str(price_tuple[0]) + ' ' + price_tuple[1]
        else:
            price_str = price_tuple[1]
        link = ad_list[i].find('a')
        link = link.get('href', '')
        link = kp_link + link
        req = requests.request('get', link, proxies=valid_proxy, timeout=5)
        # Parse the ad page into its own soup so the search-results soup is not overwritten
        ad_soup = bs4.BeautifulSoup(req.content, features='html.parser', from_encoding='iso-8859-1')
        if ad_soup.find('div', {'class': 'thumb-up'}):
            like_num = int(ad_soup.find('div', {'class': 'thumb-up'}).find('span').text)
            dislike_num = int(ad_soup.find('div', {'class': 'thumb-down'}).find('span').text)
        else:
            like_num = 0
            dislike_num = 0
        ads.append((name, price_str, price, like_num, dislike_num, link))
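    # Each ads entry is (name, price_str, price, likes, dislikes, link), i.e.
    # indices 0-5; the sort key below maps menu option 1/2/3 to index 2/3/4.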
    # Sorting
    print('Sort ads:')
    print('1) Sort by price')
    print('2) Sort by number of likes')
    print('3) Sort by number of dislikes')
    j = int(input('Enter the number you want to sort by: ')) + 1  # Menu option -> tuple index
    print('\n')
    sorted_list = []
    while len(ads):  # Selection sort: repeatedly pop the current maximum into sorted_list
        max_index = 0
        for i in range(1, len(ads)):
            if ads[i][j] > ads[max_index][j]:
                max_index = i
        ad_tuple = ads.pop(max_index)
        sorted_list.append(ad_tuple)
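    # The loop above is a hand-rolled selection sort (kept as practice); an
    # equivalent one-liner would be:
    #   sorted_list = sorted(ads, key=lambda ad: ad[j], reverse=True)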
    for i in range(len(sorted_list)):
        print('------------ Ad ' + str(i + 1) + ' ------------')
        print('Ad name: ' + str(sorted_list[i][0]))
        print('Product price: ' + str(sorted_list[i][1]))
        print('Likes/Dislikes: ' + str(sorted_list[i][3]) + '/' + str(sorted_list[i][4]))
    flag = int(input('\nEnter Ad number to open Ad in browser or 0 to exit: ')) - 1
    while flag != -1:
        webbrowser.open(sorted_list[flag][5], new=2)  # new=2 opens the ad in a new browser tab
        time.sleep(0.5)
        flag = int(input('\nEnter Ad number to open Ad in browser or 0 to exit: ')) - 1
print('Program closed...')