猫眼电影网站页面布局整体采用静态页面,其TOP100榜单按照评分和评分人数从高到低综合排序取前100名,如下:
那么,如何爬取电影库中的经典电影呢?为此,本人特意对经典电影库进行了爬取,具体遇到的困难及解决的办法如下:
1、爬取内容:本次爬取维度有电影名称、电影类型、电影制片国家、电影时长/分钟、上映时间、上映地点、评分、评论/万条、票房/亿元、导演以及主演前三名,其中评分和评分人数要有数值,所遇到的问题:
1>票房数据单位混杂,需转换;
2>部分爬取维度缺失信息,为此采用try/except异常捕获,避免程序中断,并将不符合解析规则的页面网址保存到《坏IP.csv》文件中
2、页面请求及解析技术:采用requests库中get请求方式,解析采用lxml和xpath
3、避免反爬措施:
1>使用fake_useragent库中的UserAgent类随机生成User-Agent,避免网站检测;
2>因页面关键信息(如评分、票房等)被网站采用动态自定义字体隐藏,为了破解获取关键信息,使用FontTransform对自定义字体进行破解
3>使用随机时间,延迟访问网站的频率
4、代码性能优化:
1>使用多线程和队列,提高爬取速度
2>存储方面:因数据量较小,采用csv存储,同时,为了减少磁盘IO,程序采用一次性存储所有爬取数据,提升效率;
程序主体如下:
"""Multithreaded spider for the Maoyan film library.

Flow: collect every film detail-page URL into a queue, then ten worker
threads fetch each page, decode the site's obfuscated web font via
FontTransform, extract 13 fields per film, and finally dump everything
to maoyan.csv in one pass. Pages that fail to parse are appended to
坏IP.csv for later inspection.
"""
import requests
from lxml import etree
from threading import Thread
from queue import Queue, Empty
from fake_useragent import UserAgent
from font_transform import FontTransform
import re
import csv
import os
import time
import random

# Rough USD -> CNY exchange rate used to normalise box-office figures
# quoted in 万美元 into 亿元.  TODO(review): update when the rate drifts.
USD_TO_CNY = 6.88


class MaoyanSpider:
    """Scrape film metadata from https://maoyan.com/films?showType=3."""

    def __init__(self):
        # List-page URL template; offset paginates 30 films at a time.
        self.url = 'https://maoyan.com/films?showType=3&offset={}'
        self.q = Queue()  # detail-page URLs shared by the worker threads
        self.headers = {
            "Referer": "https://maoyan.com/films?showType=3"
        }
        self.i = 0  # kept from the original; currently unused
        self.j = 0  # count of pages that failed to parse (bad URLs)
        self.info_list = []  # all extracted rows, written once at the end

    def get_url(self):
        """Walk every list page and enqueue each film's detail-page URL."""
        for offset in range(0, 1981, 30):
            url = self.url.format(offset)
            # Fresh random User-Agent per request to avoid detection.
            self.headers['User-Agent'] = UserAgent().random
            html = requests.get(url, headers=self.headers).content.decode('utf-8')
            parse_html = etree.HTML(html)
            film_link_list = parse_html.xpath('//div[@class="movie-item"]/a/@href')
            for link in film_link_list:
                self.q.put('https://maoyan.com' + link)
            print(self.q.qsize())

    def get_page(self):
        """Worker: drain the URL queue, fetching and parsing each page.

        BUG FIX: the original did `if not self.q.empty(): url = self.q.get()`.
        With 10 threads that check/get pair races — a thread can observe a
        non-empty queue, lose the last item to a sibling, then block forever
        in the default blocking get(), hanging join().  A non-blocking get
        with Empty as the termination signal is race-free.
        """
        while True:
            try:
                url = self.q.get(block=False)
            except Empty:
                break
            self.headers['User-Agent'] = UserAgent().random
            html = requests.get(url, headers=self.headers).content.decode('utf-8')
            # Random sub-second delay to throttle the request rate.
            time.sleep(random.random())
            self.transform_page(html, url)

    def transform_page(self, html, url):
        """Decode the page's dynamic custom font, then hand off to get_data."""
        # The woff filename is embedded in the page's @font-face CSS.
        font_file = re.findall(r"url\('//vfile.meituan.net/colorstone/(.*?\.woff)'", html)[0].strip()
        # One transformer instance serves both steps (the original built two).
        transformer = FontTransform()
        font = transformer.get_font(font_file)
        html = transformer.modify_data(font, html)
        self.get_data(html, url)

    @staticmethod
    def _first(nodes, default='无此信息'):
        """Return the stripped first item of an xpath result, or default."""
        return nodes[0].strip() if len(nodes) != 0 else default

    def get_data(self, html, url):
        """Extract the 13 CSV fields from one decoded detail page.

        Skips films without a real user score.  Any parse failure is caught,
        counted, and the offending URL is appended to 坏IP.csv.
        """
        parse_html = etree.HTML(html)
        term_base = parse_html.xpath('//div[@class="movie-stats-container"]/div[1]//text()')
        # Only keep films that actually expose a user score.
        if '暂无' in term_base or '用户评分' not in term_base:
            return
        try:
            # IndexError here (no info panel) is handled by the except below.
            base = parse_html.xpath('//div[@class="celeInfo-right clearfix"]')[0]

            film_name = self._first(base.xpath('./div[1]/h3/text()'))
            film_type = self._first(base.xpath('./div[1]/ul/li[1]/text()'))

            # "<country> / <length>分钟" — split and drop the trailing unit.
            country_nodes = base.xpath('./div[1]/ul/li[2]/text()')
            if len(country_nodes) != 0:
                raw = country_nodes[0].strip()
                film_country = raw.split('/')[0].strip()
                film_length = raw.split('/')[1].strip()[:-2]
            else:
                film_country = '无此信息'
                film_length = '无此信息'

            # Release line mixes a datetime prefix with a place suffix.
            release_nodes = base.xpath('./div[1]/ul/li[3]/text()')
            if len(release_nodes) != 0:
                release = release_nodes[0].strip()
                film_release_time = re.findall(r'[0-9|\-| |:]+', release)[0]
                film_release_place = re.findall(r'[^0-9|\-| |:]+', release)[0]
            else:
                film_release_time = '无此信息'
                film_release_place = '无此信息'

            film_score = self._first(base.xpath('./div[3]/div[1]/div/span/span/text()'))

            # Normalise comment counts to units of 万 (ten-thousands).
            film_coment_counts = base.xpath('./div[3]/div[1]/div/div/span/span/text()')
            if len(film_coment_counts) != 0:
                film_coment_counts = film_coment_counts[0].strip()
                if '万' in film_coment_counts:
                    film_coment_counts = film_coment_counts[:-1]
                else:
                    film_coment_counts = str(float(film_coment_counts) / 10000)
            else:
                film_coment_counts = '无此信息'

            # Normalise box office to 亿元; USD figures are converted first.
            film_money = base.xpath('./div[3]/div[2]/div/span[1]/text()')
            film_money_str = base.xpath('./div[3]/div[2]/div/span[2]/text()')
            if len(film_money) != 0 and len(film_money_str) != 0:
                film_money = film_money[0].strip()
                unit = film_money_str[0].strip()
                if '暂无' not in unit and '万美元' in unit:
                    film_money = str((float(film_money) * USD_TO_CNY) / 10000)
                elif '暂无' not in unit and '万' in unit:
                    film_money = str(float(film_money) / 10000)
            else:
                film_money = '无此信息'

            info_list = [
                film_name, film_type, film_country, film_length,
                film_release_time, film_release_place, film_score,
                film_coment_counts, film_money]

            # BUG FIX: the original tested `if film_editor != 0:` — comparing
            # the xpath result LIST to 0, which is always true, so a film
            # with no listed director crashed on film_editor[0] and was
            # wrongly recorded as a bad URL.  Test emptiness instead.
            film_editor = parse_html.xpath('//div[@class="celebrity-container"]/div[1]/ul/li[1]/div/a/text()')
            info_list.append(self._first(film_editor))

            # Up to three leading actors; pad the row out to 13 columns.
            film_stars_list = parse_html.xpath('//div[@class="celebrity-container"]/div[2]/ul/li/div/a/text()')
            if len(film_stars_list) != 0:
                for star in film_stars_list[:3]:
                    info_list.append(star.strip())
                while len(info_list) < 13:
                    info_list.append('无此信息')
            else:
                info_list.extend(['无此信息'] * 3)

            self.info_list.append(info_list)
        except Exception as e:
            # Best-effort: log and remember the page rather than kill the worker.
            print(e)
            self.j += 1
            print(self.j)
            print(url)
            with open('./坏IP.csv', 'a', encoding='utf-8', newline='') as f:
                f.write(url + '\n')

    def save_data(self):
        """Append every collected row to maoyan.csv in a single write pass."""
        with open('./maoyan.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(self.info_list)

    def main(self):
        """Entry point: write the CSV header, crawl, then persist the rows."""
        # Start from a fresh output file on every run.
        if os.path.exists('./maoyan.csv'):
            os.remove('./maoyan.csv')
        with open('./maoyan.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['电影名称', '电影类型', '电影制片国家', '电影时长/分钟', '上映时间',
                 '上映地点', '评分', '评论/万条', '票房/亿元', '导演',
                 '主演1', '主演2', '主演3'])
        self.get_url()
        t_list = []
        # Ten worker threads drain the shared queue concurrently (I/O-bound).
        for _ in range(10):
            t = Thread(target=self.get_page)
            t_list.append(t)
            t.start()
        for t in t_list:
            t.join()
        self.save_data()


if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('共用时{:.2f}秒'.format(end - start))
自定义字体转换程序如下:
"""Decoder for Maoyan's dynamically generated anti-scraping web fonts."""
import requests
import os
from fontTools.ttLib import TTFont
from fake_useragent import UserAgent
import random


class FontTransform(object):
    """Map obfuscated font code points back to real digits.

    Maoyan rotates the code points of its digit font on every page load,
    but the glyph outlines themselves do not change.  A previously saved
    font (./fonts/base.woff) therefore serves as a reference: any glyph in
    a freshly downloaded font whose outline equals a reference glyph must
    represent the same digit.
    """

    def __init__(self):
        self.headers = {
            "Referer": "https://maoyan.com/films?showType=3"
        }
        # Reference font downloaded once up front; its glyph outlines act
        # as the ground truth for outline comparison.
        self.base_font = TTFont('./fonts/base.woff')
        # Hand-established mapping: reference glyph name -> real digit.
        self.font_dict = {'uniED99': '4', 'uniE417': '1', 'uniF623': '3',
                          'uniEE70': '5', 'uniF7E6': '2', 'uniF7E7': '0',
                          'uniEFA3': '7', 'uniF77D': '6', 'uniEEDF': '9',
                          'uniF44E': '8'}

    def get_html(self, url):
        """Fetch url with a freshly randomised User-Agent; return raw bytes."""
        self.headers['User-Agent'] = UserAgent().random
        return requests.get(url, headers=self.headers).content

    def get_font(self, font_file):
        """Return a TTFont for font_file, downloading it on first use.

        Already-downloaded fonts under ./fonts act as a cache so each
        distinct font is fetched from the CDN at most once.
        """
        path = './fonts/' + font_file
        if font_file not in os.listdir('./fonts'):
            payload = self.get_html('http://vfile.meituan.net/colorstone/' + font_file)
            with open(path, 'wb') as f:
                f.write(payload)
        return TTFont(path)

    def modify_data(self, font, data):
        """Replace every obfuscated &#x....; entity in data with its digit.

        For each glyph of the page's font, find the reference glyph with an
        identical outline and substitute the corresponding digit into the
        HTML string.  Returns the rewritten string.
        """
        # The first two glyphs are skipped — presumably non-digit glyphs
        # such as .notdef (matches the original's slicing).
        for glyph_name in font.getGlyphOrder()[2:]:
            outline = font['glyf'][glyph_name]
            for ref_name, digit in self.font_dict.items():
                # Identical outline => same digit as the reference glyph.
                if outline == self.base_font['glyf'][ref_name]:
                    entity = glyph_name.replace('uni', '&#x').lower() + ';'
                    if entity in data:
                        data = data.replace(entity, digit)
        return data
具体程序和爬取结果见文件