hooo-1102

csv文件作用

将爬取的数据存放到本地的csv文件中

使用流程

# 1、导入模块
# 2、打开csv文件
# 3、初始化写入对象
# 4、写入数据(参数为列表)

import csv

# Open with newline='' as the csv module docs require, so the writer's
# own \r\n row terminators are not translated by the text layer.
with open('film.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # writerow() takes a list of field values (empty here: template usage).
    writer.writerow([])

 

示例代码

创建 test.csv 文件,在文件中写入数据

# Single-row writes: writerow() takes one list per call.
import csv

# newline='' per the csv docs; explicit utf-8 so the output does not
# depend on the platform's locale encoding.
with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['步惊云', '36'])
    writer.writerow(['超哥哥', '25'])

# Multi-row write: writerows([(),(),()]) takes an iterable of row tuples.
import csv

# newline='' per the csv docs; explicit utf-8 so the output does not
# depend on the platform's locale encoding.
with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows([('聂风', '36'), ('秦霜', '25'), ('孔慈', '30')])

 

爬虫代码

from urllib import request
import re
import time
import random
from useragents import ua_list
import csv

class MaoyanSpider(object):
  """Scrape the Maoyan Top-100 board and append one CSV row per film
  (title, stars, release date) to film.csv."""

  def __init__(self):
    # Page URL template; offset selects a 10-film page (0, 10, 20, ...).
    self.url = 'https://maoyan.com/board/4?offset={}'
    # Running count of films written so far.
    self.num = 0

  def get_html(self, url):
    """Fetch one board page and hand the decoded HTML to the parser."""
    headers = {
      # Rotate User-Agent strings to look less like an automated client.
      'User-Agent': random.choice(ua_list)
    }
    req = request.Request(url=url, headers=headers)
    res = request.urlopen(req)
    html = res.read().decode('utf-8')
    # Parse immediately rather than returning the HTML to the caller.
    self.parse_html(html)

  def parse_html(self, html):
    """Extract (title, stars, release-time) tuples and write them to CSV."""
    re_bds = r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>'
    # re.S lets '.' span newlines in the page markup.
    pattern = re.compile(re_bds, re.S)
    # film_list: [('霸王别姬', '主演:张国荣...', '上映时间:1993-01-01'), ...]
    film_list = pattern.findall(html)
    self.write_html(film_list)

  def write_html(self, film_list):
    """Append the scraped films to film.csv with writerows().

    newline='' is required by the csv docs (avoids blank rows on
    Windows); utf-8 is made explicit so output is platform-independent.
    """
    rows = []
    with open('film.csv', 'a', newline='', encoding='utf-8') as f:
      writer = csv.writer(f)
      for film in film_list:
        rows.append((
          film[0].strip(),
          film[1].strip(),
          # Drop the 5-char '上映时间:' prefix, keep the YYYY-MM-DD part.
          film[2].strip()[5:15]
        ))
        self.num += 1
      # writerows() takes an iterable of row sequences; a per-row
      # writer.writerow(list) loop would be equivalent.
      writer.writerows(rows)

  def main(self):
    """Crawl pages at offsets 0, 10, 20, 30 with a polite random delay."""
    for offset in range(0, 31, 10):
      url = self.url.format(offset)
      self.get_html(url)
      # Random pause between requests to avoid hammering the site.
      time.sleep(random.randint(1, 2))
    print('共抓取数据:', self.num)

if __name__ == '__main__':
  # Time the full crawl run.
  start = time.time()
  spider = MaoyanSpider()
  spider.main()
  end = time.time()
  print('执行时间:%.2f' % (end - start))

分类:

技术点:

相关文章: