scrape_goole_image_thumbnails.py
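"""Download image thumbnails from a Google Images results page.

A Chromium-based browser (Brave, via ChromeDriver) is driven with Selenium,
the results page is scrolled so the thumbnails load, their URLs are collected
with BeautifulSoup, and each thumbnail is saved into the 'pictures' directory.
"""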
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import os
import requests
import urllib3
from urllib3.exceptions import InsecureRequestWarning
import time
urllib3.disable_warnings(InsecureRequestWarning)
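
# The search keyword is read from standard input and used to build a
# Google Images (tbm=isch) search URL.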
searchword = input()
searchurl = ('https://www.google.com/search?q=' + searchword + '&source=lnms&tbm=isch')
dirs = 'pictures'
maxcount = 100
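# chrome_driver_path and browser_path point at the local ChromeDriver binary and
# the Brave browser executable; adjust both for your own machine.
# (maxcount is defined above but not referenced elsewhere in the script.)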
chrome_driver_path = "chromedriver.exe"
browser_path = "C:\\Users\\Devesh sangwan\\AppData\\Local\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
if not os.path.exists(dirs):
    os.mkdir(dirs)
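
# download_google_staticimages() drives the browser to the search results page,
# scrolls repeatedly so the lazily loaded thumbnails appear, collects their URLs,
# and downloads each one. Note: the Selenium calls below (executable_path,
# chrome_options, find_element_by_* helpers) follow the Selenium 3.x API.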
def download_google_staticimages():
    options = webdriver.ChromeOptions()
    options.binary_location = browser_path
    options.add_argument('--no-sandbox')
    # options.add_argument('--headless')
    try:
        driver = webdriver.Chrome(executable_path=chrome_driver_path, chrome_options=options)
    except Exception as e:
        print('Chrome driver not found')
        print(f'exception: {e}')
        return 0
    driver.set_window_size(1280, 1024)
    driver.get(searchurl)
    time.sleep(1)
    print('Downloading images')
    print('This may take a few moments...')

    element = driver.find_element_by_tag_name('body')

    # Scroll down so more thumbnails are loaded
    for i in range(50):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)
    # If the 'smb' control is present, click it and keep scrolling;
    # otherwise just scroll a little further.
    try:
        driver.find_element_by_id('smb').click()
        for i in range(50):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)
    except Exception:
        for i in range(10):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)

    print('Reached end of page.')
    time.sleep(0.5)
    print('Retry')
    time.sleep(0.5)
    # Click the "Show more results" button to load additional thumbnails
    driver.find_element_by_xpath('//input[@value="Show more results"]').click()

    # Scroll down through the newly loaded results
    for i in range(50):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)
    try:
        driver.find_element_by_id('smb').click()
        for i in range(50):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)
    except Exception:
        for i in range(10):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)
    # Alternative (unused): parse only the inner results container
    # elements = driver.find_elements_by_xpath('//div[@id="islrg"]')
    # page_source = elements[0].get_attribute('innerHTML')
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    images = soup.find_all('img')

    urls = []
    for image in images:
        try:
            # Prefer the lazy-loading 'data-src' attribute, fall back to 'src'
            url = image['data-src']
            if url.startswith('https://'):
                urls.append(url)
        except KeyError:
            try:
                url = image['src']
                if url.startswith('https://'):
                    urls.append(url)
            except Exception as e:
                print('Image sources not found')
                print(e)
    count = 0
    if urls:
        for url in urls:
            try:
                # Stream the raw image bytes and write them out as img_<count>.jpg
                res = requests.get(url, verify=False, stream=True)
                rawdata = res.raw.read()
                with open(os.path.join(dirs, 'img_' + str(count) + '.jpg'), 'wb') as f:
                    f.write(rawdata)
                count += 1
            except Exception as e:
                print('Failed to write')
                print(e)

    driver.close()
    return count
def main():
    t0 = time.time()
    count = download_google_staticimages()
    t1 = time.time()
    total_time = t1 - t0
    print('\n')
    print(f'Download completed. [Successful count = {count}].')
    print(f'Total time is {total_time} seconds.')


if __name__ == '__main__':
    main()
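
# Usage (assumed, based on the code above): run the script, type a search keyword
# when the bare input() call waits on stdin, and the downloaded thumbnails are
# written as pictures/img_0.jpg, pictures/img_1.jpg, and so on.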