From 6104c801221296ceec113f080ccbe66fb89f3157 Mon Sep 17 00:00:00 2001
From: SilentJMA
Date: Fri, 22 Sep 2023 12:21:31 +0200
Subject: [PATCH 1/2] Added error handling, combined the URL and file name
 pairs, used a list comprehension to simplify the creation, removed the
 duplicate import of the requests module, adjusted readability

---
 wiki_toc.py | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/wiki_toc.py b/wiki_toc.py
index 6df7b35..89e65bf 100644
--- a/wiki_toc.py
+++ b/wiki_toc.py
@@ -1,45 +1,36 @@
 import csv
 import requests
 from bs4 import BeautifulSoup
-import requests
-
 
 def get_data(url):
     response = requests.get(url)
+    response.raise_for_status()  # Add error handling for request
     soup = BeautifulSoup(response.text, 'lxml')
-    table_of_contents = soup.find("div", id="toc")
-    headings = table_of_contents.find_all("li")
-    data = []
-    for heading in headings:
-        heading_text = heading.find("span", class_="toctext").text
-        heading_number = heading.find("span", class_="tocnumber").text
-        data.append({
-            'heading_number': heading_number,
-            'heading_text': heading_text,
-        })
+    headings = soup.find("div", id="toc").find_all("li")
+
+    data = [{'heading_number': heading.find("span", class_="tocnumber").text,
+             'heading_text': heading.find("span", class_="toctext").text}
+            for heading in headings]
+
     return data
 
-
 def export_data(data, file_name):
     with open(file_name, "w", newline="") as file:
         writer = csv.DictWriter(file, fieldnames=['heading_number', 'heading_text'])
         writer.writeheader()
         writer.writerows(data)
 
-
 def main():
-    url_to_parse = "https://en.wikipedia.org/wiki/Python_(programming_language)"
-    file_name = "python_toc.csv"
-    data = get_data(url_to_parse)
-    export_data(data, file_name)
-
-    url_to_parse = "https://en.wikipedia.org/wiki/Web_scraping"
-    file_name = "web_scraping_toc.csv"
-    data = get_data(url_to_parse)
-    export_data(data, file_name)
+    urls = [
+        ("https://en.wikipedia.org/wiki/Python_(programming_language)", "python_toc.csv"),
+        ("https://en.wikipedia.org/wiki/Web_scraping", "web_scraping_toc.csv")
+    ]
+
+    for url, file_name in urls:
+        data = get_data(url)
+        export_data(data, file_name)
 
     print('Done')
 
-
 if __name__ == '__main__':
     main()

From dd524945f4f3ba0bf946567d57a7e85b2b065978 Mon Sep 17 00:00:00 2001
From: SilentJMA
Date: Fri, 22 Sep 2023 12:32:04 +0200
Subject: [PATCH 2/2] Added error handling, combined the URL and file name
 pairs, used a list comprehension to simplify the creation, removed the
 duplicate import of the requests module, adjusted readability

---
 wiki_toc.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/wiki_toc.py b/wiki_toc.py
index 89e65bf..f566353 100644
--- a/wiki_toc.py
+++ b/wiki_toc.py
@@ -4,13 +4,24 @@
 
 def get_data(url):
     response = requests.get(url)
-    response.raise_for_status()  # Add error handling for request
-    soup = BeautifulSoup(response.text, 'lxml')
-    headings = soup.find("div", id="toc").find_all("li")
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, 'html.parser')
 
-    data = [{'heading_number': heading.find("span", class_="tocnumber").text,
-             'heading_text': heading.find("span", class_="toctext").text}
-            for heading in headings]
+    data = []
+
+    toc = soup.find("div", {"id": "toc"})
+
+    if toc:
+        headings = toc.find_all("li")
+        for heading in headings:
+            heading_number = heading.find("span", {"class": "tocnumber"})
+            heading_text = heading.find("span", {"class": "toctext"})
+
+            if heading_number and heading_text:
+                data.append({
+                    'heading_number': heading_number.text.strip(),
+                    'heading_text': heading_text.text.strip(),
+                })
 
     return data
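
Note (a usage sketch, not part of the patch series): assuming the final wiki_toc.py from PATCH 2/2 is on the import path, the refactored functions can be exercised directly. After PATCH 2/2, get_data() returns an empty list when a page lacks a <div id="toc">, so export_data() would then write a header-only CSV instead of raising. The URL and file name below are the same pair main() already uses.

    # Usage sketch, assuming the patched wiki_toc.py is importable.
    from wiki_toc import get_data, export_data

    url = "https://en.wikipedia.org/wiki/Web_scraping"
    data = get_data(url)  # [] if the page has no <div id="toc">
    export_data(data, "web_scraping_toc.csv")
    print(f"Wrote {len(data)} headings")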