-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
46 lines (37 loc) · 1.65 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from chromedriver_py import binary_path
from bs4 import BeautifulSoup
from lxml.html.clean import clean_html
def fetch_and_clean_html(url):
try:
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Set the user agent
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36")
service_object = Service(binary_path)
# Initialize Selenium webdriver with options
driver = webdriver.Chrome(service=service_object, options=options)
# Navigate to url
driver.get(url)
# Get the raw HTML content
soup = BeautifulSoup(driver.page_source, 'lxml')
# Extract text and alt from images
body_text = ' '.join(soup.get_text().split())
# Extract text and alt from images
img_alts = [' '.join(img.get('alt').split()) for img in soup.select('img') if img.get('alt') is not None]
# Close the driver
driver.close()
# Sanitize the text using lxml's clean_html()
clean_body_text = clean_html(body_text)
# clean_img_alts = [clean_html(alt) for alt in img_alts]
# return {
# 'text': clean_body_text,
# 'imgAlts': clean_img_alts,
# }
return clean_body_text
except Exception as e:
print('Error while fetching and cleaning html:', str(e))