def __init__(self, url, start_page, pages, page_size):
    """
    Store the crawl configuration and prepare empty result storage.

    @param url: base URL of the listing to crawl
    @param start_page: zero-based index of the first listing page to fetch
    @param pages: page index at which crawling stops (pages below it are fetched)
    @param page_size: number of entries on each listing page
    """
    self.url = url
    self.start_page = start_page
    self.pages = pages
    self.page_size = page_size
    # One record per scraped movie is appended here.
    self.data_info = []
    # Headers sent with every HTTP request made by this object.
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
def get_one_page(self):
    """
    Crawl the listing pages from ``self.start_page`` up to ``self.pages``.

    Every page is downloaded and passed to ``self.get_per_movie``, and
    ``self.start_page`` is advanced as pages are consumed. Once the loop
    ends, ``self.data_to_mysql`` is called to persist the collected records.

    :return: always an empty string
    """
    # Nothing to do for a negative start page or one past the stop page.
    if self.start_page < 0 or self.start_page > self.pages:
        return ""
    # Compare against the instance attribute; a bare ``pages`` would only
    # resolve if a module-level global of that name happened to exist.
    while self.start_page < self.pages:
        # Offset of the first entry on the current page.
        start_number = self.start_page * self.page_size
        new_url = self.url + '?start=' + str(start_number) + '&filter='
        print('正在爬取第 {0} 页数据'.format(self.start_page + 1))
        # A timeout keeps an unresponsive server from blocking the crawl forever.
        response = requests.get(url=new_url, headers=self.headers, timeout=10)
        self.get_per_movie(response.text)
        self.start_page = self.start_page + 1
    # Persist everything gathered so far in one write.
    self.data_to_mysql()
    return ""
def get_per_movie(self, one_page_data):
    """
    Visit the detail page of every movie linked from one listing page.

    :param one_page_data: HTML text of a listing page
    :return: always an empty string
    """
    soup = BeautifulSoup(one_page_data, 'html.parser')
    # Each movie entry is located through its element with class "pic".
    for pic_div in soup.find_all(class_="pic"):
        # The first anchor inside the entry carries the detail-page link.
        anchor = pic_div.find('a')
        movie_href = anchor.get('href') if anchor is not None else None
        if not movie_href:
            # Skip entries without a usable link instead of crashing the crawl.
            continue
        print(movie_href)
        self.get_movie_content(movie_href)
    return ""
def get_movie_content(self, movie_detail_href):
    """
    Download one movie detail page and record its parsed fields.

    The parsed fields are collected in an ordered mapping, printed, and
    appended to ``self.data_info``.

    :param movie_detail_href: URL of the movie detail page
    :return: None
    :raises AttributeError: if the country or language label is missing
        from the info section
    """
    # A timeout keeps an unresponsive server from blocking the crawl forever.
    response = requests.get(url=movie_detail_href, headers=self.headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Ordered mapping so the fields keep a fixed order in the stored records.
    movie_info = OrderedDict()

    # Rank and title.
    movie_info['movie_rank'] = soup.find_all('span', class_='top250-no')[0].string
    movie_info['movie_name'] = soup.find_all('span', property='v:itemreviewed')[0].string

    # Release information lives in the element with id "info".
    info_div = soup.find(id='info')
    info_spans = info_div.find_all('span')

    def text_after(label):
        # Stripped text of the node that directly follows the given label;
        # raises AttributeError when the label is not present.
        return info_div.find(text=label).next_element.strip()

    # Director, writer and cast are read from fixed span positions (0, 3, 6).
    movie_info['movie_director'] = self.get_mul_tag_info(info_spans[0].find_all('a'))
    movie_info['movie_writer'] = self.get_mul_tag_info(info_spans[3].find_all('a'))
    movie_info['movie_starring'] = self.get_mul_tag_info(info_spans[6].find_all('a'))
    movie_info['movie_type'] = self.get_mul_tag_info(info_div.find_all('span', property='v:genre'))
    movie_info['movie_country'] = text_after('制片国家/地区:')
    movie_info['movie_language'] = text_after('语言:')
    movie_info['movie_release_date'] = self.get_mul_tag_info(
        info_div.find_all('span', property='v:initialReleaseDate'))
    movie_info['movie_run_time'] = self.get_mul_tag_info(
        info_div.find_all('span', property='v:runtime'))
    movie_info['movie_imdb_href'] = info_div.find('a', target='_blank')['href']

    # The alternative title is optional; fall back to an empty string.
    movie_second_name = ''
    try:
        movie_second_name = text_after('又名:')
    except AttributeError:
        print('{0} 没有又名'.format(movie_info['movie_name']))
    movie_info['movie_second_name'] = movie_second_name

    # Overall rating and number of voters.
    movie_info['movie_rating'] = soup.find_all('strong', property='v:average')[0].string
    movie_info['movie_comments_user'] = soup.find_all('span', property='v:votes')[0].string

    # Per-star percentages are read from every second div (indexes 0, 2, ..., 8)
    # of the "ratings-on-weight" section, five stars first.
    # NOTE(review): presumably each star row nests one inner div - confirm
    # against the page markup.
    rating_divs = soup.find('div', class_="ratings-on-weight").find_all('div')
    star_keys = (
        'movie_five_star_ratio',
        'movie_four_star_ratio',
        'movie_three_star_ratio',
        'movie_two_star_ratio',
        'movie_one_star_ratio',
    )
    for position, key in enumerate(star_keys):
        movie_info[key] = rating_divs[position * 2].find(class_='rating_per').string

    movie_info['movie_note'] = ''
    print('movie_info:', movie_info)
    # Keep the record for the later export.
    self.data_info.append(movie_info)
def data_to_mysql(self, csv_path='E:/pythonob/data/movie_data/data_movie.csv'):
    """
    Export the collected movie records to a CSV file.

    Despite the method name, nothing is written to MySQL here: the
    database import below is disabled.

    @param csv_path: destination CSV file; defaults to the location that
        was previously hard-coded
    @return: None
    """
    from pathlib import Path

    target = Path(csv_path)
    # to_csv does not create missing directories, so make sure the parent exists.
    target.parent.mkdir(parents=True, exist_ok=True)
    # Build a DataFrame from the collected records.
    df_data = pd.DataFrame(self.data_info)
    df_data.to_csv(str(target), encoding='utf-8', index=False)
    # NOTE: database import is disabled; it expects an engine in
    # self.pymysql_engine, which the visible constructor does not set.
    # df_data.to_sql('t_douban_movie_top_250', self.pymysql_engine, index=False, if_exists='append')
def get_mul_tag_info(self, soup_span):
    """
    Join the text of several tags into one string separated by ``/``.

    :param soup_span: iterable of tag-like objects exposing ``.string``
        (and ``get_text()`` as a fallback)
    :return: the joined text, or an empty string when there is nothing to join
    :rtype: str
    """
    parts = []
    for tag in soup_span:
        text = tag.string
        if text is None:
            # ``.string`` is None when the tag has no single text child;
            # fall back to the full text instead of failing in the join.
            text = tag.get_text()
        if text:
            parts.append(text)
    return '/'.join(parts)
if __name__ == '__main__':
    # Script entry point: crawl the Douban Top 250 listing, 10 pages of
    # 25 entries each, starting from the first page (index 0).
    url = 'https://movie.douban.com/top250'
    start_page = 0
    pages = 10
    page_size = 25
    douban_movie = DouBanMovie(url, start_page, pages, page_size)
    douban_movie.get_one_page()
# 原文链接: https://zhuanlan.zhihu.com/p/97705654
# 相关文章: