一、数据持久化存储-csv文件
1.作用
将爬取的数据存放到本地的csv文件中
2.使用流程
# Usage steps:
#   1. import the module
#   2. open the CSV file
#   3. create the writer object
#   4. write the data (each row is passed as a list / sequence)
import csv

# newline='' hands line-ending control to the csv module; without it an
# extra '\r' is emitted on platforms that translate '\n' to '\r\n', which
# shows up as a blank line after every row.
with open('film.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # writerow() writes one row.
    writer.writerow([])
    # writerows() writes several rows at once, one per inner sequence.
    writer.writerows([(), (), ()])
3.示例代码
创建test.csv文件,在文件中写入数据
import csv

# Single-row writes: writerow([...]) emits one row per call.
with open('test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['步惊云', '36'])
    writer.writerow(['聂风', '36'])

# Multi-row write: writerows([(...), (...), (...)]) emits every row in one
# call. The file is opened with newline='' here as well, so both examples
# produce the same line endings. Mode 'w' truncates the file, so this
# example replaces the rows written above.
with open('test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([('聂风', '36'), ('秦霜', '25'), ('孔慈', '30')])
练习:猫眼电影数据存入本地 maoyanfilm.csv 文件 - 使用writerow方法实现
思考:使用 writerows()方法实现?
import csv
from urllib import request, parse
import re
import time
import random

from useragents import ua_list


class MaoyanSpider(object):
    """Fetch Maoyan board pages and append the extracted films to a CSV file."""

    # Captures (title, star line, release-time line) for each film entry.
    # re.S lets '.' span newlines, since one entry covers several HTML lines.
    # The pattern ends exactly at '</p>': a trailing space there would demand
    # a literal space after the tag and break matching on other whitespace.
    _FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)"'
        r'.*?<p class="star">(.*?)</p>'
        r'.*?<p class="releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        # URL template; the placeholder receives the page offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Running count of films written to the CSV file.
        self.num = 0

    def get_html(self, url):
        """Download *url* with a random User-Agent and pass the HTML on.

        The response body is decoded as UTF-8 and handed straight to
        ``parse_html``.
        """
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the response even if read/decode raises.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_html(html)

    def parse_html(self, html):
        """Extract the film tuples from *html* and pass them to ``write_html``.

        ``film_list`` is a list of 3-tuples of raw captured strings, e.g.
        ``[('霸王别姬', '张国荣', '1993')]``.
        """
        film_list = self._FILM_PATTERN.findall(html)
        self.write_html(film_list)

    def write_html(self, film_list):
        """Append the films to ``maoyanfilm.csv`` using ``writerows()``.

        Each field is stripped of surrounding whitespace. The release-time
        field is additionally sliced with ``[5:15]``, presumably to drop a
        leading label and keep the date (verify against the page markup).
        ``self.num`` grows by the number of rows written.
        """
        rows = [
            (title.strip(), star.strip(), release.strip()[5:15])
            for title, star, release in film_list
        ]
        with open('maoyanfilm.csv', 'a', newline='') as f:
            # The file object must be passed to csv.writer().
            writer = csv.writer(f)
            # writerows() takes a sequence of rows and writes them all.
            writer.writerows(rows)
        self.num += len(rows)
        print(rows)

    # Alternative implementation using writerow(), one call per film:
    #
    # def write_html(self, film_list):
    #     with open('maoyanfilm.csv', 'a') as f:
    #         # The file object must be passed to csv.writer().
    #         writer = csv.writer(f)
    #         for film in film_list:
    #             L = [
    #                 film[0].strip(),
    #                 film[1].strip(),
    #                 film[2].strip()[5:15]
    #             ]
    #             self.num += 1
    #             # writerow() takes a single row as a list.
    #             writer.writerow(L)

    def main(self):
        """Crawl offsets 0, 10, ..., 90 and report how many films were saved."""
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url)
            # Pause 1-2 seconds between page requests.
            time.sleep(random.randint(1, 2))
        print('共抓取数据', self.num, "部")


if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))