We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
## Scrape book information from a Douban page by ISBN:
## title, author, intro, publisher, publish_date.
## Example page: http://douban.com/isbn/9787111637172
## python3 -m pip install beautifulsoup4
## python3 -m pip install lxml
from bs4 import BeautifulSoup
import time
import random
import pandas as pd
import http.client
import urllib.error
import urllib.request
import sys
import re


class Douban:
    """Fetch a book's metadata from its Douban ISBN page.

    The only public entry point is :meth:`get_book`; every other method is
    a private helper that extracts one field from the parsed page using a
    fixed CSS selector (or a regex over the serialized markup).
    """

    def __init__(self):
        # Regexes applied to serialized HTML; the captured group is the text
        # between the field label's closing </span> and the next <br/>.
        # NOTE(review): __r_publisher is not used by any method of this
        # class; it is kept in case external code relies on the attribute.
        self.__r_publisher = r'出版社:</span>(.*?)<br/>'
        self.__r_publish_date = r'出版年:</span>(.*?)<br/>'
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/101.0.4951.54 Safari/537.36'}

    def get_book(self, isbn=""):
        """Return the metadata of the book identified by *isbn*.

        Args:
            isbn: ISBN string inserted into the Douban URL.

        Returns:
            A dict with the keys ``isbn``, ``title``, ``author``, ``intro``,
            ``publisher`` and ``publish_date`` (a field that cannot be
            located on the page is an empty string), or ``None`` when the
            page could not be fetched or was empty.
        """
        html = self.__get_html(isbn=isbn)
        if not html:
            # Not found / request failed.
            return None
        soup = self.__get_soup(html=html)
        return {
            "isbn": isbn,
            "title": self.__getTitle(soup=soup),
            "author": self.__getAuthor(soup=soup),
            "intro": self.__get_intro(soup=soup),
            "publisher": self.__getpublisher(soup=soup),
            "publish_date": self.__getpublish_date(soup=soup),
        }

    def __get_html(self, isbn=""):
        """Download the ISBN page and return it as text, or ``None`` on failure.

        Network, HTTP and URL errors are treated as "not found". The body is
        decoded as UTF-8; a decoding error is not caught and propagates.
        """
        url = f"http://douban.com/isbn/{isbn}/"
        request = urllib.request.Request(url, headers=self.headers)
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request) as response:
                payload = response.read()
        except (OSError, ValueError, http.client.HTTPException):
            # OSError covers URLError/HTTPError and socket failures,
            # ValueError covers malformed URLs, HTTPException covers
            # protocol-level failures. KeyboardInterrupt and SystemExit
            # are deliberately NOT swallowed.
            return None
        return payload.decode('utf-8')

    def __get_soup(self, html=""):
        """Parse *html* (already-decoded text) with the lxml builder."""
        # No encoding hints are passed: the markup is a str, so there is
        # no byte encoding left to detect or exclude.
        return BeautifulSoup(html, 'lxml')

    @staticmethod
    def __text_of(elements):
        """Return the stripped text of *elements*.

        Multiple matches are joined with ``", "``; an empty selection
        yields ``""``.
        """
        return ", ".join(el.get_text() for el in elements).strip()

    def __getTitle(self, soup):
        """Return the book title, or ``""`` if the heading is missing."""
        return self.__text_of(soup.select("body>div>h1>span"))

    def __getAuthor(self, soup):
        """Return the text of the first author link, or ``""`` if none."""
        links = soup.select("body>div>div>div>div>div>div>div>div>span>a")
        return self.__text_of(links[:1])

    def __getpublisher(self, soup):
        """Return the text of the second link in the info box, or ``""``.

        The second match (index 1) of the selector is used as the publisher
        link; if the page has fewer than two matches the field is empty
        instead of raising ``IndexError``.
        """
        links = soup.select("body>div>div>div>div>div>div>div>div>a")
        return self.__text_of(links[1:2])

    def __getpublish_date(self, soup):
        """Return the publication date, or ``""`` if the label is absent.

        The date is plain text following its label rather than an element
        of its own, so it is pulled out of the serialized markup by regex.
        """
        markup = str(soup.select("body>div>div>div>div>div>div>div>div"))
        found = re.search(self.__r_publish_date, markup)
        if found is None:
            return ""
        return found.group(1).strip("[] \n\t")

    def __get_intro(self, soup):
        """Return the book introduction text, or ``""`` if it is missing."""
        return self.__text_of(
            soup.select("body>div>div>div>div>div>div>span>div>div"))


if __name__ == "__main__":
    base = Douban()
    print(base.get_book("9787111637172"))
The text was updated successfully, but these errors were encountered:
No branches or pull requests
The text was updated successfully, but these errors were encountered: