liduo0413

功能描述V1.0:

爬取豆瓣电影排行top250

功能分析:

使用的库

1、time

2、json

3、requests

4、BeautifulSoup

5、RequestException

上机实验室:

"""
    作者:李舵
    日期:2019-4-27
    功能:抓取豆瓣电影top250
    版本:V1.0
"""

import time
import json
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


def get_one_page(url):
    """Download one Douban listing page.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text on HTTP 200, otherwise None
        (including on any network failure).
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
        # A timeout keeps the crawler from hanging forever on a stalled
        # connection; on expiry requests raises a RequestException
        # subclass, which the handler below already covers.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Parse one top-250 listing page and yield one dict per movie.

    Args:
        html: Raw HTML of a Douban top-250 page.

    Yields:
        dict with str values for keys 'index', 'title', 'actor', 'score'.
    """
    soup = BeautifulSoup(html, 'lxml')
    ol_list = soup.find('ol', {'class': 'grid_view'})
    # Iterate over however many entries the page actually contains;
    # the previous hard-coded range(25) raised IndexError on a short
    # page and silently dropped entries on a long one.
    for movie in ol_list.find_all('li'):
        yield {
            'index': movie.find('em', {'class': ''}).text.strip(),
            'title': movie.find('span', {'class': 'title'}).text.strip(),
            'actor': movie.find('p', {'class': ''}).text.strip(),
            'score': movie.find('span', {'class': 'rating_num'}).text.strip()
        }


def write_to_file(content):
    """Append one record to result.txt as a single JSON line.

    Args:
        content: JSON-serializable object (one movie dict).
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese titles human-readable in the file.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start):
    """Fetch the listing page at offset ``start`` and persist each movie."""
    page_url = 'https://movie.douban.com/top250?start=' + str(start)
    page_html = get_one_page(page_url)
    for record in parse_one_page(page_html):
        print(record)
        write_to_file(record)


if __name__ == '__main__':
    # 10 pages of 25 movies each; pause between requests to be polite.
    for offset in range(0, 250, 25):
        main(start=offset)
        time.sleep(1)

 

功能描述V2.0:

爬取豆瓣电影排行top250

功能分析:

使用的库

1、time

2、requests

3、RequestException

上机实验室:

"""
作者:李舵
日期:2019 - 4 - 8
功能:抓取豆瓣电影top250
版本:V2.0
"""

import re
import time
import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Download one Douban listing page.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text on HTTP 200, otherwise None
        (including on any network failure).
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        # A timeout keeps the crawler from hanging forever on a stalled
        # connection; on expiry requests raises a RequestException
        # subclass, which the handler below already covers.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Extract every movie entry from one top-250 page with a regex.

    Args:
        html: Raw HTML of a Douban top-250 page.

    Returns:
        List of 10-element str lists: [rank, title, alias, director,
        year, country/region, genres, score, vote count, short quote].
    """
    # Raw strings so \s stays a regex escape instead of an invalid
    # Python string escape (DeprecationWarning today, SyntaxError later).
    pattern = re.compile(r'<div.*?class="item">.*?'
                         r'<div.*?class="pic">.*?'
                         r'<em.*?class="">(.*?)</em>.*?'
                         r'<div.*?class="info">.*?'
                         r'<span.*?class="title">(.*?)</span>.*?'
                         r'<span.*?class="other">(.*?)</span>.*?'
                         r'<div.*?class="bd">.*?'
                         r'<p.*?class="">.*?'
                         r'导演:\s(.*?)\s.*?<br>'
                         r'(.*?) / '
                         r'(.*?) / (.*?)</p>.*?'
                         r'<div.*?class="star">.*?'
                         r'<span.*?class="rating_num".*?property="v:average">'
                         r'(.*?)</span>.*?'
                         r'<span>(.*?)人评价</span>.*?'
                         r'<span.*?class="inq">(.*?)</span>', re.S)
    movie_list = []
    for movie in pattern.findall(html):
        movie_list.append([movie[0],
                           movie[1],
                           movie[2].lstrip(' / '),  # drop leading " / " from the alias
                           movie[3],
                           movie[4].lstrip(),
                           movie[5],
                           movie[6].strip(),
                           movie[7],
                           movie[8],
                           movie[9]])
    return movie_list


def write_to_file(movie_list):
    """Append one page of movie records to top_250.txt.

    Args:
        movie_list: List of 10-element str lists as produced by
            parse_one_page.
    """
    # Append mode: main() calls this once per page; the previous 'w'
    # mode truncated the file on every call, so only the final 25
    # records survived the full crawl.
    with open('top_250.txt', 'a', encoding='utf-8') as f:
        for movie in movie_list:
            f.write('电影排名:' + movie[0] + '\n')
            f.write('电影名称:' + movie[1] + '\n')
            f.write('电影别名:' + movie[2] + '\n')
            f.write('导演:' + movie[3] + '\n')
            f.write('上映年份:' + movie[4] + '\n')
            f.write('制作国家/地区:' + movie[5] + '\n')
            f.write('电影类别:' + movie[6] + '\n')
            f.write('评分:' + movie[7] + '\n')
            f.write('参评人数:' + movie[8] + '\n')
            f.write('简短影评:' + movie[9] + '\n')
            f.write('\n')
        print('成功写入文件,共有%d条记录……' % len(movie_list))
        # no explicit f.close(): the with-statement closes the file


def main(start):
    """Crawl the listing page at offset ``start`` and save its movies."""
    page_url = 'https://movie.douban.com/top250?start=' + str(start)
    page_html = get_one_page(page_url)
    write_to_file(parse_one_page(page_html))


if __name__ == '__main__':
    # Walk the 10 pages (offsets 0, 25, ..., 225), pausing between requests.
    for offset in range(0, 250, 25):
        main(start=offset)
        time.sleep(1)

  

补充说明:

1、

分类:

技术点:

相关文章: