-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScraper.py
64 lines (55 loc) · 2.11 KB
/
Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Python script to scrape an article given the url of the article and store the extracted text in a file
# Url: https://medium.com/@subashgandyer/papa-what-is-a-neural-network-c5e5cc427c7
import os
import requests
import re
from bs4 import BeautifulSoup
# function to get the html source text of the medium article
def get_page():
    """Prompt for a Medium article URL, download it and return the parsed page.

    The URL typed by the user is stored in the module-level ``url`` variable,
    because ``collect_text`` and ``save_file`` read it from there.

    Returns:
        BeautifulSoup: the downloaded page parsed with ``html.parser``.

    Raises:
        SystemExit: with exit code 1 when the URL does not start with
            ``http://medium.com/`` or ``https://medium.com/``.
        requests.HTTPError: raised by ``raise_for_status()`` when the server
            answers with an error status code.
    """
    global url
    # Strip stray whitespace so a pasted URL with a trailing space still validates.
    url = input('Enter url of a medium article:').strip()

    # The dot is escaped so only the literal host "medium.com" is accepted;
    # an unescaped "." would also match e.g. "mediumXcom".
    if not re.match(r'https?://medium\.com/', url):
        print('Please enter a valid website, or make sure it is a medium article')
        # SystemExit is exactly what sys.exit() raises; raising it directly
        # avoids relying on `sys`, which this file does not import.
        raise SystemExit(1)

    # A timeout keeps the script from hanging forever on an unresponsive server.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    return BeautifulSoup(res.text, 'html.parser')
# function to remove all the html tags and replace some with specific strings
def clean(text):
    """Convert an HTML fragment to plain text.

    ``<br>``, ``<br/>``, ``<br />`` and ``<li>`` are replaced by a newline;
    every other tag is removed, including tags whose attributes span
    several lines.

    Args:
        text: the HTML string to clean.

    Returns:
        The input with line-break tags turned into newlines and all
        remaining tags stripped.
    """
    # Line-break-like tags first, so they become "\n" instead of vanishing
    # in the generic tag removal below.
    text = re.sub(r"<(?:br\s*/?|li)>", "\n", text)
    # "[^>]*" (rather than ".*?") also matches newlines inside a tag, and the
    # raw string avoids the invalid "\<" / "\>" escape sequences.
    return re.sub(r"<[^>]*>", "", text)
def collect_text(soup):
    """Return the article text: a ``url:`` header followed by every paragraph.

    Reads the module-level ``url`` for the header line, looks up all ``<p>``
    elements of ``soup`` via ``find_all('p')``, prints them for inspection
    and appends the ``.text`` of each one followed by a blank line.
    """
    header = f'url: {url}\n\n'
    paragraphs = soup.find_all('p')
    print(f"paragraphs text = \n {paragraphs}")
    body = "".join(f"{node.text}\n\n" for node in paragraphs)
    return header + body
# function to save the text to a file inside ./scraped_articles (the directory is created under the current directory if missing)
def save_file(text):
    """Write ``text`` to ``scraped_articles/<name>.txt`` under the current directory.

    ``<name>`` is the last path segment of the module-level ``url``. Any query
    string, fragment or trailing slash is ignored, so it cannot leak into the
    file name. If the URL has no usable path segment, ``article`` is used.

    Args:
        text: the string to store in the file.

    Raises:
        OSError: if the directory cannot be created or the file cannot be
            written.
    """
    # Function-scope import so this block does not depend on the file's import list.
    from urllib.parse import urlsplit

    out_dir = 'scraped_articles'
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(out_dir, exist_ok=True)

    # Take the name from the URL *path* only: "?source=..." and "#..." are
    # not part of the article slug and may contain characters that are
    # invalid in file names.
    slug = urlsplit(url).path.rstrip('/').rsplit('/', 1)[-1]
    name = slug or 'article'
    print(name)

    fname = os.path.join(out_dir, f'{name}.txt')
    # `with` closes the file even if the write fails; explicit UTF-8 keeps
    # non-ASCII article text from failing on a non-UTF-8 default locale.
    with open(fname, 'w', encoding='utf-8') as out:
        out.write(text)
    print(f'File saved in directory {fname}')
# Script entry point: fetch the page, extract its paragraph text, save it.
if __name__ == '__main__':
    soup = get_page()
    text = collect_text(soup)
    save_file(text)
# Instructions to Run this python code
# Give url as https://medium.com/@subashgandyer/papa-what-is-a-neural-network-c5e5cc427c7