-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.py
123 lines (99 loc) · 4.07 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Given the profile urls scrape skills and other info
from selenium import webdriver
from parsel import Selector
import time
import pathlib
import csv
from selenium.common.exceptions import NoSuchElementException
#------------------------------------- MAKING CONNECTION-------------------------------------
# chromedriver.exe is expected in the current working directory, because
# macOS and Windows have different system paths.
chromedriver = f'{pathlib.Path().resolve()}/chromedriver.exe'
_DRIVER_CHROME = webdriver.Chrome(chromedriver)
_DRIVER_CHROME.get('https://www.linkedin.com/uas/#')

# Fill in the login form field by field.
for field_id, credential in (
    ('username', ''),  # Enter your email between quotes
    ('password', ''),  # Enter your password between quotes
):
    elementID = _DRIVER_CHROME.find_element_by_id(field_id)
    elementID.send_keys(credential)
# elementID is now the password field; submit() is called on it to log in.
elementID.submit()

# CHANGE - if captcha occurs
time.sleep(30)  # to solve CAPTCHA
#------------------------------------- GET LIST OF URLS-------------------------------------
def readUrls(DATA_FILE):
    """Read profile URLs from a text file, one URL per line.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing newline at the end of the file no longer produces a bogus '/'
    entry. Every returned URL ends with exactly one '/'.

    Args:
        DATA_FILE: Path of the file to read.

    Returns:
        List of URLs in file order (empty when the file has no URLs).
        Positions count non-blank lines only.
    """
    # The context manager closes the handle even if reading fails.
    with open(DATA_FILE) as f:
        lines = f.read().splitlines()

    # rstrip('/') before appending avoids turning '.../name/' into '.../name//'.
    url_list = [
        line.strip().rstrip('/') + '/'
        for line in lines
        if line.strip()
    ]

    print(f'Total urls - {len(url_list)}')
    # Indexing an empty list would raise IndexError, so only report the
    # first/last entries when there is at least one URL.
    if url_list:
        print(f'First url is {url_list[0]}')
        print(f'Last url is {url_list[-1]}')
    return url_list
# Input file holding one profile URL per line (it is split on newlines only,
# despite the .csv extension).
url_file = 'profile_urls.csv'
# Load all URLs up front; readUrls also prints how many were found.
urls = readUrls(url_file)
#----------------------------------- SCRAPE PROFILES-------------------------------------
#------ BOOKKEEPING for next iteration ------- #CHANGE
# Hand-maintained notes on the URL ranges per person (presumably the ranges
# already scraped / still to scrape - confirm with the authors).
# ATHARVA
# Start - 127 End - 50
# ------------------------
# MAYANK
# Start - 0 End - ?
# ------------------------
# SIDHESH
# Start - 0 End - ?
# ------------------------
# SIDHANT | INITIAL | CURR - CURR | END
# Start - 742 | 839 End - 838 | 989
#---------------------------------------------
#CHANGE
#Enter Start and End ids to scrape
# linkedin_scrape reads these two module-level values as 1-based, inclusive
# positions in the URL list.
start = 800 #1 based indexing
end = 840 #1 based indexing
# Candidate output file stems; the index below selects which '<stem>.csv'
# this run appends its rows to.
filenames = ['atharva','mayank','sidhant','siddesh']
filename = filenames[2] + '.csv' #CHANGE
def _scroll_to_bottom(driver, pause_seconds):
    """Scroll the page currently loaded in *driver* down to its bottom.

    The page is walked down in 120-pixel steps. After each pass the document
    height is read again; if it changed, scrolling resumes from the previous
    bottom, otherwise the function returns.
    """
    # int() raises TypeError if the script returns None; the original author
    # flagged this lookup as "MAY THROW ERROR WHEN LOGGED OUT".
    last_height = int(driver.execute_script("return document.body.scrollHeight"))
    curr_height = 1
    while True:
        for offset in range(curr_height, last_height, 120):
            driver.execute_script("window.scrollTo(0, {});".format(offset))
            # NOTE(review): the one-second pause is assumed to be per scroll
            # step; confirm against the original file's indentation.
            time.sleep(1)
        # Wait to load page
        time.sleep(pause_seconds)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        curr_height = last_height
        last_height = new_height


def _read_skills(driver):
    """Expand the skills section and return the skill names as a list of str.

    Returns None when the page has no expand button, so the caller can tell
    "button missing" apart from "button present but no skills found".
    """
    # Only the button lookup can raise NoSuchElementException; find_elements_*
    # returns an empty list when nothing matches, so it stays outside the try.
    try:
        show_more_skills_button = driver.find_element_by_class_name(
            "pv-skills-section__chevron-icon"
        )
    except NoSuchElementException:
        return None
    driver.execute_script("arguments[0].click();", show_more_skills_button)
    skill_elements = driver.find_elements_by_xpath(
        "//*[starts-with(@class,'pv-skill-category-entity__name-text')]"
    )
    return [element.text for element in skill_elements]


def linkedin_scrape(linkedin_urls, filename):
    """Scrape name and skills for a slice of profile URLs and append them to a CSV.

    The slice is taken from the module-level ``start`` and ``end`` values
    (1-based, inclusive) and the pages are loaded with the module-level
    ``_DRIVER_CHROME`` browser. Each scraped profile is appended to *filename*
    as one row ``[name, skill_list]``. Profiles whose page has no skills
    expand button are reported and skipped.

    Args:
        linkedin_urls: Sequence of profile URLs.
        filename: Path of the CSV file to append rows to.

    Returns:
        0 once the slice has been processed.

    Raises:
        Exceptions raised by the browser driver or by file I/O propagate; the
        browser window is closed before they do.
    """
    SCROLL_PAUSE_TIME = 4

    # Clamp the requested slice to the list. Without this, start = 0 would
    # index position -1 (the last URL) and an `end` beyond the list length
    # would raise IndexError part-way through the run.
    first = max(start - 1, 0)
    stop = min(end, len(linkedin_urls))

    try:
        for position in range(first, stop):
            url = linkedin_urls[position]
            _DRIVER_CHROME.get(url)
            _scroll_to_bottom(_DRIVER_CHROME, SCROLL_PAUSE_TIME)

            selector = Selector(text=_DRIVER_CHROME.page_source)
            # Use xpath to extract the exact class containing the profile name
            name = selector.xpath(
                '//*[starts-with(@class, "text-heading-xlarge")]/text()'
            ).extract_first()
            if name:
                name = name.strip()

            skill_set = _read_skills(_DRIVER_CHROME)
            if skill_set is None:
                # Previously skipped silently; say so, so gaps in the output
                # file can be traced back to a URL.
                print(f'Skipping {url} - no expandable skills section found')
                continue

            profiles = [name, skill_set]
            print(profiles)
            with open(filename, 'a+', encoding='UTF8', newline='') as f:
                # write the data
                csv.writer(f).writerow(profiles)
            # Pause between profiles before loading the next page.
            time.sleep(10)
    finally:
        # Close the browser window even when a page fails mid-run.
        _DRIVER_CHROME.close()
    return 0
# Run the scrape over the configured start..end range, appending rows to the
# selected output CSV.
linkedin_scrape(urls,filename)