csv文件作用
将爬取的数据存放到本地的csv文件中
使用流程
# Minimal csv-writing workflow:
#   1. Import the csv module.
#   2. Open the CSV file.
#   3. Create the writer object.
#   4. Write the data (writerow() takes a list).
import csv

# newline='' hands line-ending control to the csv module; without it an extra
# blank line appears between rows on Windows. The explicit encoding keeps the
# output identical across platforms once real (non-ASCII) data is written.
with open('film.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # Placeholder row: replace the empty list with the values to store.
    writer.writerow([])
示例代码
import csv

# Single-row writes: each writerow() call takes one list and emits one line.
# encoding='utf-8' is explicit because the rows contain non-ASCII text and the
# platform default encoding differs between systems.
with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['步惊云', '36'])
    writer.writerow(['超哥哥', '25'])

# Multi-row write: writerows() takes a list of rows, e.g. [(), (), ()].
# Mode 'w' truncates the file, so this replaces the two rows written above.
with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows([('聂风', '36'), ('秦霜', '25'), ('孔慈', '30')])
爬虫代码
import csv
import random
import re
import time
from urllib import request

from useragents import ua_list


class MaoyanSpider:
    """Fetch Maoyan board pages and append the extracted films to film.csv.

    The pipeline is chained: ``main`` builds each page URL and calls
    ``get_html``, which downloads the page and calls ``parse_html``, which
    extracts the film tuples and calls ``write_html``.
    """

    # Compiled once for all pages. re.S lets '.' match newlines so a single
    # match can span the multi-line HTML of one film entry. The three groups
    # capture the title attribute, the contents of the class="star" paragraph
    # and the contents of the releasetime paragraph.
    _FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)"'
        r'.*?class="star">(.*?)</p>'
        r'.*?releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        # URL template; the placeholder receives the paging offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Running count of film rows written.
        self.num = 0

    def get_html(self, url: str) -> None:
        """Download ``url`` and pass the decoded HTML to ``parse_html``.

        A User-Agent is picked at random from ``ua_list`` for every request.
        The body is decoded as UTF-8.
        """
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response even if read/decode fails.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        # Hand the page straight to the parser.
        self.parse_html(html)

    def parse_html(self, html: str) -> None:
        """Extract the film tuples from ``html`` and pass them to ``write_html``.

        ``findall`` yields one 3-tuple of captured strings per film entry,
        e.g. [('霸王别姬', '张国荣', '1993'), ...].
        """
        film_list = self._FILM_PATTERN.findall(html)
        # Hand the extracted tuples straight to the writer.
        self.write_html(film_list)

    def write_html(self, film_list) -> None:
        """Append the films in ``film_list`` to film.csv and update ``self.num``.

        Args:
            film_list: iterable of 3-item sequences of strings, as produced
                by ``parse_html``.

        The rows are written in one batch with ``writerows()``; calling
        ``writer.writerow(row)`` once per film would be equivalent.
        """
        # NOTE(review): the [5:15] slice presumably drops a 5-character label
        # prefix and keeps a 10-character date — confirm against the page markup.
        rows = [
            (film[0].strip(), film[1].strip(), film[2].strip()[5:15])
            for film in film_list
        ]
        # newline='' avoids blank lines between rows on Windows; the explicit
        # encoding keeps the Chinese text identical on every platform.
        with open('film.csv', 'a', newline='', encoding='utf-8') as f:
            # The writer must be given the open file object.
            writer = csv.writer(f)
            # writerows() takes a list of rows.
            writer.writerows(rows)
        self.num += len(rows)

    def main(self) -> None:
        """Fetch the pages at offsets 0, 10, 20 and 30, then print the total."""
        for offset in range(0, 31, 10):
            url = self.url.format(offset)
            self.get_html(url)
            # Random 1-2 second pause between page requests.
            time.sleep(random.randint(1, 2))
        print('共抓取数据:', self.num)


if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))