Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

爬取豆瓣 ISBN 图书数据 #21

Open
davideuler opened this issue Jun 20, 2022 · 0 comments
Open

爬取豆瓣 ISBN 图书数据 #21

davideuler opened this issue Jun 20, 2022 · 0 comments

Comments

@davideuler
Copy link
Owner

## Scrape book information (title, author, intro, publisher, publish_date) from a Douban page by ISBN.
## Example page: http://douban.com/isbn/9787111637172
## Dependencies:
## python3 -m pip install beautifulsoup4
## python3 -m pip install lxml

from bs4 import BeautifulSoup
import time
import random
import pandas as pd
import urllib.request
import sys
import re


class Douban:
    """Fetch book metadata from Douban's ISBN lookup page.

    ``get_book`` downloads ``http://douban.com/isbn/<isbn>/`` and extracts the
    title, author, introduction, publisher and publication date. All fields
    are located with positional CSS selectors, so the results depend on the
    page layout staying as the selectors expect.

    Args:
        timeout: Seconds to wait on the HTTP request before giving up.
    """

    # Both patterns are applied to the serialized HTML of the selected
    # <div> elements, not to their plain text.
    _PUBLISHER_RE = re.compile(r'出版社:</span>(.*?)<br/>')
    _PUBLISH_DATE_RE = re.compile(r'出版年:</span>(.*?)<br/>')

    _TITLE_SELECTOR = "body>div>h1>span"
    _AUTHOR_SELECTOR = "body>div>div>div>div>div>div>div>div>span>a"
    _PUBLISHER_SELECTOR = "body>div>div>div>div>div>div>div>div>a"
    _INFO_SELECTOR = "body>div>div>div>div>div>div>div>div"
    _INTRO_SELECTOR = "body>div>div>div>div>div>div>span>div>div"

    def __init__(self, timeout=10):
        self.timeout = timeout
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'}

    def get_book(self, isbn=""):
        """Return a dict describing the book with the given ISBN.

        Args:
            isbn: ISBN string appended to the Douban lookup URL.

        Returns:
            A dict with the keys ``isbn``, ``title``, ``author``, ``intro``,
            ``publisher`` and ``publish_date``; fields that cannot be located
            on the page are empty strings. Returns ``None`` when ``isbn`` is
            empty or the page could not be fetched.
        """
        if not isbn:
            # Nothing to look up; avoid a pointless request to ".../isbn//".
            return None
        html = self.__get_html(isbn=isbn)
        if not html:  # not found / fetch failed
            return None
        soup = self.__get_soup(html=html)
        return {
            "isbn": isbn,
            "title": self.__get_title(soup=soup),
            "author": self.__get_author(soup=soup),
            "intro": self.__get_intro(soup=soup),
            "publisher": self.__get_publisher(soup=soup),
            "publish_date": self.__get_publish_date(soup=soup),
        }

    def __get_html(self, isbn=""):
        """Download the ISBN page and return its text, or None on failure."""
        # Local import so this class does not need a new file-level import.
        from http.client import HTTPException

        url = f"http://douban.com/isbn/{isbn}/"
        request = urllib.request.Request(url, headers=self.headers)
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(
                    request, timeout=self.timeout) as response:
                raw = response.read()
        except (OSError, HTTPException):
            # URLError/HTTPError and socket timeouts are OSError subclasses.
            # Unlike a bare ``except``, this lets KeyboardInterrupt through.
            return None
        return raw.decode('utf-8', errors='replace')

    def __get_soup(self, html=""):
        """Parse the page text with the lxml parser."""
        # ``exclude_encodings`` was dropped: the markup is already a decoded
        # str, so there is no byte-encoding detection for it to influence.
        return BeautifulSoup(html, 'lxml')

    @staticmethod
    def __joined_text(soup, selector):
        """Return the text of every element matching selector, joined by ', '.

        Reads the text straight from the matched elements rather than
        re-parsing ``str(list)``, so brackets that belong to the content are
        no longer stripped along with the list's own ``[`` and ``]``.
        """
        pieces = (element.get_text() for element in soup.select(selector))
        return ", ".join(pieces).strip()

    def __get_title(self, soup):
        """Return the book title, or '' when the heading is not found."""
        return self.__joined_text(soup, self._TITLE_SELECTOR)

    def __get_author(self, soup):
        """Return the text of the first author link, or '' when absent."""
        anchors = soup.select(self._AUTHOR_SELECTOR)
        if not anchors:
            return ""
        return anchors[0].get_text().strip()

    def __get_publisher(self, soup):
        """Return the publisher name, or '' when it cannot be located."""
        anchors = soup.select(self._PUBLISHER_SELECTOR)
        if len(anchors) > 1:
            # The second matching link is the one the page layout is
            # expected to use for the publisher.
            return anchors[1].get_text().strip()
        # Fallback for pages without that link (presumably ones that list the
        # publisher as plain text -- verify against real pages): pull the
        # value out of the serialized HTML and drop any leftover markup.
        found = self._PUBLISHER_RE.search(
            str(soup.select(self._INFO_SELECTOR)))
        if found is None:
            return ""
        return BeautifulSoup(found.group(1), 'lxml').get_text().strip()

    def __get_publish_date(self, soup):
        """Return the publication date text, or '' when it is not present."""
        found = self._PUBLISH_DATE_RE.search(
            str(soup.select(self._INFO_SELECTOR)))
        if found is None:
            return ""
        return found.group(1).strip("[] \n\t")

    def __get_intro(self, soup):
        """Return the introduction text, or '' when it is not found."""
        return self.__joined_text(soup, self._INTRO_SELECTOR)


if __name__ == "__main__":
    # Demo run: look up one sample ISBN and print the resulting record.
    sample_isbn = "9787111637172"
    scraper = Douban()
    record = scraper.get_book(sample_isbn)
    print(record)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant