Douban Movie TOP250 and Book TOP250 Crawlers
I recently started playing with Python, and while learning about web scraping I crawled the Douban Movie TOP250 and Book TOP250 lists on a whim. This post records the process.
Movie TOP250 Crawler
import requests
from bs4 import BeautifulSoup
import time


def getlist(list_url):
    # Be polite: wait two seconds before each page request.
    time.sleep(2)
    res = requests.get(list_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    movie_list = soup.select('.grid_view li')
    for m in movie_list:
        rank = m.select('em')[0].text
        score = m.select('.rating_num')[0].text
        title = m.select('.title')[0].text
        direct = m.select('.info .bd p')[0].text.strip()
        actor = '\n主演:'.join(direct.split(' 主演:'))
        director = '年代:'.join(actor.split(' '))
        # Not every movie has a one-line quote (.inq).
        if m.select('.inq'):
            comments = m.select('.inq')[0].text.strip()
        else:
            comments = 'None'
        movie.append(
            '排名: ' + rank + '\n'
            + '评分: ' + score + '\n'
            + '片名: ' + title + '\n'
            + director + '\n'
            + '评论: ' + comments + '\n'
            + '\n')
    # Follow the "next page" link recursively until the last page.
    if soup.select('.next a'):
        asoup = soup.select('.next a')[0]['href']
        next_page = seed_url + asoup
        getlist(next_page)
    else:
        print('结束')
    return movie


def write(movies):
    with open('movie.txt', 'w', encoding='utf8') as m:
        for a in movies:
            m.write(a)


def main():
    write(getlist(seed_url))


if __name__ == '__main__':
    seed_url = 'https://movie.douban.com/top250'
    movie = []
    main()
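The selectors above follow the structure of the list page: each movie is an li under .grid_view, with the rank in an em tag, the score in .rating_num, the title in .title, and the optional one-line quote in .inq. As a quick sanity check of that logic, here is a minimal sketch run against a made-up HTML fragment (the fragment only imitates the class structure getlist() relies on; it is not Douban's real markup):

from bs4 import BeautifulSoup

# Made-up fragment that only mimics the classes getlist() expects.
html = '''
<ol class="grid_view">
  <li>
    <em>1</em>
    <div class="info">
      <span class="title">肖申克的救赎</span>
      <div class="bd"><p>导演: Frank Darabont 主演: Tim Robbins</p></div>
      <span class="rating_num">9.7</span>
      <span class="inq">希望让人自由。</span>
    </div>
  </li>
</ol>
'''

soup = BeautifulSoup(html, 'html.parser')
m = soup.select('.grid_view li')[0]
print(m.select('em')[0].text)                   # rank
print(m.select('.rating_num')[0].text)          # score
print(m.select('.title')[0].text)               # title
print(m.select('.info .bd p')[0].text.strip())  # director / cast line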
Book TOP250 Crawler
import bs4
import requests
import re
from bs4 import BeautifulSoup
from operator import itemgetter


def getHtmlText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def parserText(text, book_list):
    soup = BeautifulSoup(text, 'html.parser')
    # Each book on the list page sits in its own <table width="100%">.
    for table in soup('table', {'width': '100%'}):
        if isinstance(table, bs4.element.Tag):
            tds = table.find('tr')('td')
            divs = tds[1]('div')
            content = {}
            for div in divs:
                if isinstance(div, bs4.element.Tag):
                    if div.find('a'):
                        name = div.find('a').attrs['title']
                        content.update({"书名": name})
                    if div.select('.rating_nums'):
                        score = div.select('.rating_nums')[0].text
                        content.update({"评分": score})
                    if div.select('.pl'):
                        people_num = div.select('.pl')[0].text
                        regex = re.compile(r'[\d]{1,10}')
                        content.update({"评价人数": regex.findall(people_num)[0]})
            ps = tds[1]('p')
            for p in ps:
                if isinstance(p, bs4.element.Tag):
                    if p.attrs['class'][0] == 'quote':
                        description = p.find('span').string
                        content.update({"介绍": description})
                    if p.attrs['class'][0] == 'pl':
                        author = p.string
                        content.update({"作者信息": author})
            book_list.append(content)
    # Recurse into the next page if there is one.
    next_books = soup.find('span', {'class': 'next'})
    if next_books.find('a'):
        a = next_books.find('a').attrs['href']
        text = getHtmlText(a)
        parserText(text, book_list)
    return book_list


def sortedBookTop250(book_list):
    tmp = sorted(book_list, key=itemgetter('评分'), reverse=True)
    for i in range(len(tmp)):
        tmp[i].update({"排名": i + 1})
    return tmp


def writeToFile(book_list):
    with open('good_books.txt', 'w', encoding='utf8') as book_file:
        for book in book_list:
            for key, value in book.items():
                book_file.write(f'{key}:{value}\n')
            book_file.write('\n')


def main():
    text = getHtmlText(seed_url)
    book_list = parserText(text, books)
    writeToFile(sortedBookTop250(book_list))


if __name__ == '__main__':
    seed_url = "https://book.douban.com/top250"
    books = []
    main()
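One detail worth noting in sortedBookTop250: the scores scraped from the page are strings like '9.6', and because the TOP250 scores all share the same single-digit 'x.y' format, sorting them as strings happens to give the same order as sorting numerically. A tiny sketch with two invented entries shows the sort-then-rank step:

from operator import itemgetter

# Two invented entries, just to illustrate the sort-then-rank behavior.
sample = [{"书名": "书A", "评分": "8.9"}, {"书名": "书B", "评分": "9.4"}]
ranked = sorted(sample, key=itemgetter('评分'), reverse=True)
for i in range(len(ranked)):
    ranked[i].update({"排名": i + 1})
print(ranked)  # 书B (9.4) gets 排名 1, 书A (8.9) gets 排名 2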
Summary
- Click to view my GitHub
- Click to view my personal blog
- Make a little progress every day; don't expect overnight success.
The code above is pasted as-is. These are two very simple scripts built mainly on the requests and BeautifulSoup libraries; feel free to take them, or grab movies.txt and good_books.txt straight from my GitHub.
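If you have never used these two libraries, both scripts reduce to the same pattern: fetch a page with requests, hand the HTML to BeautifulSoup, and pull out elements with CSS selectors or tag lookups. A minimal, self-contained sketch of that pattern (the URL and the h1 selector here are placeholders, not part of the crawlers above):

import requests
from bs4 import BeautifulSoup

# Placeholder URL and selector; swap in whatever page and elements you care about.
resp = requests.get('https://example.com')
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')
for h1 in soup.select('h1'):
    print(h1.text.strip())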