1.爬取豆瓣top250电影信息
- 第一页: https://movie.douban.com/top250?start=0&filter=
- 第二页: https://movie.douban.com/top250?start=25&filter=
- 第三页: https://movie.douban.com/top250?start=50&filter=
- 第十页: https://movie.douban.com/top250?start=225&filter=
2.爬取步骤:
- 1) 获取所有电影的主页url
- 2) 往每一个主页发送请求,获取响应数据
- 3) 解析并提取想要的数据(获取每一部电影的class为item的div)
- 4) 根据每一部电影的div提取电影的: 详情页url、电影名字、电影评分、评价人数
3.解析html数据
"""
re.findall('正则匹配规则', '匹配文本', '匹配模式')
re.findall(
'<div class="item">.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
response.text, re.S)
- html:
<div class="item">.*?<a href="https://movie.douban.com/subject/1293908/">
.*?
<span class="title">城市之光</span>.*?<span class="rating_num" property="v:average">(.*?)</span>
.*?
<span>(.*?)人评价
"""
import requests
import re
# Request headers passed to requests.get() by get_html(); the User-Agent
# is a desktop Chrome browser string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/77.0.3865.120 Safari/537.36 chrome-extension'
    ),
}
# 1. Send the request
def get_html(url, timeout=10):
    """Download ``url`` with the module-level ``headers`` and return the response.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server (connect and read) before
            giving up. Without a timeout ``requests`` waits forever, so a
            single stalled connection would hang the whole crawl.

    Returns:
        The ``requests.Response`` object for the GET request. The status
        code is not checked here; callers receive error pages unchanged.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    return requests.get(url, headers=headers, timeout=timeout)
# 2. Parse the data
# Compiled once at import time instead of on every call.  re.S lets ".*?"
# run across line breaks, so the lazy gaps already absorb any indentation
# between tags; the pattern therefore does not demand literal spaces in
# front of the <span> elements and also matches compact markup.
_MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?<span class="rating_num".*?>(.*?)</span>'
    r'.*?<span>(.*?)人评价',
    re.S,
)


def parse_html(response):
    """Extract the movie entries from one downloaded list page.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A list with one ``(detail_url, title, rating, vote_count)`` tuple
        of strings per ``<div class="item">`` block that matches, in page
        order. The list is empty when nothing matches.
    """
    return _MOVIE_PATTERN.findall(response.text)
# 3. Save the data
def save_data(movie_data, num):
    """Print one movie record and append it to ``douban.txt``.

    Args:
        movie_data: Four-item sequence unpacked as detail URL, title,
            rating and number of ratings.
        num: Rank shown on the first line of the record.
    """
    detail_url, title, rating, votes = movie_data
    lines = (
        f'电影排名: {num}',
        f'电影详情: {detail_url}',
        f'电影名字: {title}',
        f'电影评分: {rating}',
        f'评分人数: {votes}',
    )
    # Each record starts and ends with a newline so that consecutive
    # records are separated by a blank line in the output file.
    record = '\n' + '\n'.join(lines) + '\n'
    print(record)
    # Append mode: every call adds to the file rather than replacing it.
    with open('douban.txt', mode='a', encoding='utf-8') as out_file:
        out_file.write(record)
if __name__ == '__main__':
    # The rank keeps counting across pages, starting at 1.
    rank = 1
    # The list is paginated through the "start" query parameter:
    # ten pages at offsets 0, 25, ..., 225.
    for offset in range(0, 250, 25):
        page_url = f'https://movie.douban.com/top250?start={offset}&filter='
        page = get_html(page_url)
        for entry in parse_html(page):
            save_data(entry, rank)
            rank += 1