Excel (CSV) version
from urllib import request
from bs4 import BeautifulSoup
import csv


class CatEye():
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # e.g. https://maoyan.com/board/4?offset=10
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
        self.filename = 'CatEye_Board.csv'

    # Build the URL for a given page number
    def get_url(self, page_n):
        params = page_n * 10
        full_url = self.url.format(params)
        return full_url

    # Fetch the HTML
    def get_html(self, url):
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        html = response.read().decode()
        return html

    # Parse the page
    def get_info(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        movie_list_soup = soup.find("dl", class_='board-wrapper')
        movie_name_list = []
        for dd in movie_list_soup.find_all('dd'):
            movie_name_tmp = dd.find("p", attrs={'class': 'name'})
            movie_name = movie_name_tmp.find("a").getText().strip()
            actors = dd.find("p", attrs={'class': 'star'}).getText().strip()[3:]
            time = dd.find("p", attrs={'class': 'releasetime'}).getText().strip()[5:]
            movie_info = movie_name + ' ' + actors + ' ' + time
            movie_name_list.append(movie_info)
            # Append one row per movie to the CSV file
            with open(self.filename, 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerows([(movie_name, actors, time)])
        return movie_name_list

    # Storage (unused in this version; rows are written directly in get_info)
    def save_info(self, filename, info):
        pass

    def runforever(self):
        info = ''
        for i in range(10):
            url = self.get_url(i)
            html = self.get_html(url)
            movie_list = self.get_info(html)
            info += '\r\n'.join(movie_list)
            info += '\r\n'
        # self.save_info(filename, info)


if __name__ == '__main__':
    spider = CatEye()
    spider.runforever()
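If you want to confirm what actually landed in the file, here is a minimal read-back sketch using only the standard library (it assumes the CatEye_Board.csv filename from above):

import csv

# Print the first few rows written by get_info
with open('CatEye_Board.csv', newline='', encoding='utf-8') as f:
    for i, row in enumerate(csv.reader(f)):
        if i >= 3:
            break
        print(row)  # [name, actors, release time]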
MongoDB version
import random
import time
from urllib import request
from bs4 import BeautifulSoup
import pymongo


class CatEye():
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # e.g. https://maoyan.com/board/4?offset=10
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
        self.filename = 'CatEye_Board.csv'
        self.info_list = []
        self.conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.conn['CatEye']
        self.collection = self.db['filmlist']

    # Build the URL for a given page number
    def get_url(self, page_n):
        params = page_n * 10
        full_url = self.url.format(params)
        return full_url

    # Fetch the HTML
    def get_html(self, url):
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        html = response.read().decode()
        return html

    # Parse the page
    def get_info(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        movie_list_soup = soup.find("dl", class_='board-wrapper')
        for dd in movie_list_soup.find_all('dd'):
            movie_name_tmp = dd.find("p", attrs={'class': 'name'})
            movie_name = movie_name_tmp.find("a").getText().strip()
            actors = dd.find("p", attrs={'class': 'star'}).getText().strip()[3:]
            release_time = dd.find("p", attrs={'class': 'releasetime'}).getText().strip()[5:15]
            self.save_info({'name': movie_name, 'actors': actors, 'time': release_time})

    # Store one document in MongoDB
    def save_info(self, item):
        self.collection.insert_one(item)

    def runforever(self):
        for i in range(10):
            url = self.get_url(i)
            time.sleep(random.randint(1, 2))  # random pause between requests
            html = self.get_html(url)
            self.get_info(html)
            self.info_list = []


if __name__ == '__main__':
    spider = CatEye()
    spider.runforever()
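To check what was stored, a quick query sketch against the same database and collection as above (the field names match what save_info inserts):

import pymongo

conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
collection = conn['CatEye']['filmlist']

print(collection.count_documents({}))  # total number of movies stored
for doc in collection.find().limit(3):
    print(doc['name'], doc['actors'], doc['time'])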
Note that when using MongoDB you may hit a Duplicate Key Error. The reason is that insert_one checks whether the item already contains an '_id' key and, if not, generates one and adds it to the dict you passed in. So if you reuse the same item dict and only overwrite its other keys, the '_id' left over from the previous insert is sent again and the next insert fails; you need to clear the item dict entirely (or build a fresh one each time), or alternatively generate your own custom key in __init__ to guarantee uniqueness.
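A minimal sketch of the difference, assuming a local MongoDB instance and the same CatEye/filmlist names as above (pymongo mutates the dict passed to insert_one by adding the generated '_id'):

import pymongo

collection = pymongo.MongoClient(host='127.0.0.1', port=27017)['CatEye']['filmlist']

# Problematic pattern: the same dict object is reused for every insert.
item = {}
for name in ['霸王别姬', '肖申克的救赎']:
    item['name'] = name          # only the other keys are overwritten
    collection.insert_one(item)  # first call adds '_id' to item; the second
                                 # call reuses that '_id' -> DuplicateKeyError

# Safe pattern: build a fresh dict (or call item.clear()) for each document,
# so insert_one generates a new '_id' every time.
for name in ['霸王别姬', '肖申克的救赎']:
    collection.insert_one({'name': name})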
Update 2020.1.9: (1) switched the User-Agent header; (2) changed the storage to save into Excel (CSV); (3) added a MongoDB version.