# scraplatostadora.py
# Scrape the product images from the latostadora storefront below and pair each
# one with its product page URL.
# https://www.latostadora.com/afaces
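# Pipeline:
#   1. Download the storefront page and parse it with BeautifulSoup.
#   2. Collect every <img> tag whose parent element is not blacklisted.
#   3. Pull the lazy-loaded image URLs out of the "data-original" attributes.
#   4. Collect each product link from the "m-product-card" divs.
#   5. Pair image URL and product URL, writing them to store.md as Markdown image links.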
import requests
from bs4 import BeautifulSoup

# Storefront pages to scrape (only the first one is used below)
playlists = ["https://www.latostadora.com/afaces"]
url = playlists[0]

# Download the page and parse it
res = requests.get(url)
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
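# Some sites reject requests' default User-Agent or hang without a timeout.
# If the fetch above fails, something along these lines may help (the header
# value is only illustrative, not part of the original script):
# res = requests.get(url,
#                    headers={"User-Agent": "Mozilla/5.0"},
#                    timeout=30)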
# Collect every <img> tag whose parent element is not in the blacklist,
# concatenating the serialized tags into one string.
images = soup.find_all('img')
output = ''
blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    # there may be more elements you don't want, such as "style", etc.
]
for t in images:
    if t.parent.name not in blacklist:
        output += '{} '.format(t)
# The lazy-loaded image URLs sit in "data-original" attributes next to the
# "zoomable-images" class marker. Drop the attribute name, split on the class
# marker, and take the quoted token that follows each occurrence.
text = output.replace('data-original=', '')
ola = text.split("zoomable-images")
images_url = []
for chunk in ola[1:]:
    images_url.append(chunk.split(" ")[1].strip("\""))
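# A sturdier alternative (sketch only, not called anywhere in this script):
# instead of splitting the serialized tags, read the "data-original" attribute
# straight off each tag with BeautifulSoup. The attribute name is the same
# marker the string-splitting above relies on.
def extract_image_urls(img_tags):
    urls = []
    for img in img_tags:
        src = img.get('data-original')  # lazy-load attribute used by the storefront
        if src:
            urls.append(src)
    return urls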
# Get the URL of each product card and turn the relative hrefs into absolute URLs
project_href = [i.a['href'] for i in soup.find_all('div', attrs={'class': 'm-product-card'})]
project_href = ["https://www.latostadora.com" + href for href in project_href]
# Pair each image URL with its product URL (zip stops at the shorter list,
# so a count mismatch will not raise an IndexError)
urlIDList = [[image_src, product_url] for image_src, product_url in zip(images_url, project_href)]
# Write one Markdown line per product: the image, linked to its product page
with open("store.md", "w") as o:
    for image_src, product_url in urlIDList:
        o.write("[![Alt text](" + image_src + ")](" + product_url + ")\n")
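# Usage (assuming requests and beautifulsoup4 are installed):
#   pip install requests beautifulsoup4
#   python scraplatostadora.py
# The resulting store.md contains lines of the form (placeholders, not real URLs):
#   [![Alt text](IMAGE_URL)](PRODUCT_URL)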