功能描述V1.0:
爬取豆瓣电影排行top250
功能分析:
使用的库
1、time
2、json
3、requests
4、BeautifulSoup
5、RequestException
上机实验室:
"""
作者:李舵
日期:2019-4-27
功能:抓取豆瓣电影top250
版本:V1.0
"""
import time
import json
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
def get_one_page(url):
    """Fetch the HTML body of *url*, or None when the request fails.

    A desktop-browser User-Agent is sent, presumably so Douban does not
    reject scripted clients. Any network-level error is swallowed and
    reported as None.
    """
    ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
          'AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/74.0.3729.108 Safari/537.36')
    try:
        response = requests.get(url, headers={'User-Agent': ua})
    except RequestException:
        return None
    return response.text if response.status_code == 200 else None
def parse_one_page(html):
    """Yield one dict per movie entry on a Douban top-250 result page.

    Args:
        html: full HTML text of one https://movie.douban.com/top250 page.

    Yields:
        dict with keys 'index' (rank), 'title', 'actor' (staff/summary
        paragraph) and 'score', each stripped of surrounding whitespace.
    """
    soup = BeautifulSoup(html, 'lxml')
    ol_list = soup.find('ol', {'class': 'grid_view'})
    if ol_list is None:
        # Error page or changed layout: yield nothing instead of crashing.
        return
    # Iterate whatever entries are actually present instead of assuming
    # exactly 25, so a short page cannot raise IndexError.
    for item in ol_list.find_all('li'):
        yield {
            'index': item.find('em', {'class': ''}).text.strip(),
            'title': item.find('span', {'class': 'title'}).text.strip(),
            'actor': item.find('p', {'class': ''}).text.strip(),
            'score': item.find('span', {'class': 'rating_num'}).text.strip(),
        }
def write_to_file(content):
    """Append *content* to result.txt as one JSON line (UTF-8).

    ensure_ascii=False keeps Chinese titles human-readable in the file.
    The leftover debug print of the dumped type was removed, and the
    JSON is serialized only once.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(start):
    """Download the page at offset *start*, then print and persist each movie."""
    page_url = 'https://movie.douban.com/top250?start={}'.format(start)
    page_html = get_one_page(page_url)
    for movie in parse_one_page(page_html):
        print(movie)
        write_to_file(movie)
if __name__ == '__main__':
    # Ten pages of 25 entries each (offsets 0..225), throttled to
    # roughly one request per second.
    for page_start in range(0, 250, 25):
        main(start=page_start)
        time.sleep(1)
功能描述V2.0:
爬取豆瓣电影排行top250
功能分析:
使用的库
1、time
2、requests
3、RequestException
上机实验室:
"""
作者:李舵
日期:2019-4-8
功能:抓取豆瓣电影top250
版本:V2.0
"""
import re
import time
import requests
from requests.exceptions import RequestException
def get_one_page(url):
    """Return the response body for *url*, or None on any failure.

    Sends a browser-like User-Agent header; without it Douban may
    reject scripted requests (assumption — confirm against the site).
    """
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/73.0.3683.103 Safari/537.36'),
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return None
        return response.text
    except RequestException:
        return None
def parse_one_page(html):
    """Extract every movie entry from one Douban top-250 page via regex.

    Args:
        html: full HTML text of one https://movie.douban.com/top250 page.

    Returns:
        A list of 10-element lists of strings:
        [rank, title, alias, director, year, country, genre,
         score, vote_count, quote].
    """
    # Raw strings: '\s' inside a plain literal is an invalid escape
    # sequence (SyntaxWarning on modern Python); r'...' keeps the regex
    # source intact. re.S lets '.' span the newlines in the markup.
    pattern = re.compile(
        r'<div.*?class="item">.*?'
        r'<div.*?class="pic">.*?'
        r'<em.*?class="">(.*?)</em>.*?'
        r'<div.*?class="info">.*?'
        r'<span.*?class="title">(.*?)</span>.*?'
        r'<span.*?class="other">(.*?)</span>.*?'
        r'<div.*?class="bd">.*?'
        r'<p.*?class="">.*?'
        r'导演:\s(.*?)\s.*?<br>'
        r'(.*?) / '
        r'(.*?) / (.*?)</p>.*?'
        r'<div.*?class="star">.*?'
        r'<span.*?class="rating_num".*?property="v:average">'
        r'(.*?)</span>.*?'
        r'<span>(.*?)人评价</span>.*?'
        r'<span.*?class="inq">(.*?)</span>', re.S)
    movies = pattern.findall(html)
    # Normalize the raw captures: drop the leading " / " the alias span
    # carries and strip the whitespace the page layout leaves around the
    # year and genre fields.
    return [[rank,
             title,
             alias.lstrip(' / '),
             director,
             year.lstrip(),
             country,
             genre.strip(),
             score,
             votes,
             quote]
            for (rank, title, alias, director, year,
                 country, genre, score, votes, quote) in movies]
def write_to_file(movie_list):
    """Append the parsed movies to top_250.txt, one labelled line per field.

    Opens in append mode: main() calls this once per 25-entry page, and
    the original mode 'w' truncated the file on every call, so only the
    final page's 25 records survived the full 10-page run.
    """
    labels = ('电影排名:', '电影名称:', '电影别名:', '导演:', '上映年份:',
              '制作国家/地区:', '电影类别:', '评分:', '参评人数:', '简短影评:')
    with open('top_250.txt', 'a', encoding='utf-8') as f:
        for movie in movie_list:
            for label, value in zip(labels, movie):
                f.write(label + value + '\n')
            f.write('\n')  # blank separator line between records
        print('成功写入文件,共有%d条记录……' % len(movie_list))
        # No explicit f.close(): the with-statement closes the file.
def main(start):
    """Fetch, parse and persist the 25-entry page at offset *start*."""
    page = get_one_page('https://movie.douban.com/top250?start=' + str(start))
    write_to_file(parse_one_page(page))
if __name__ == '__main__':
    # Offsets 0, 25, ..., 225 cover the full top 250; the sleep keeps
    # the request rate polite.
    offset = 0
    while offset < 250:
        main(start=offset)
        time.sleep(1)
        offset += 25
补充说明:
1、