步骤01: 创建项目
scrapy startproject csvfeedspider
步骤02: 进入项目目录(cd csvfeedspider), 使用csvfeed模板生成爬虫
scrapy genspider -t csvfeed csvdata gzdata.gov.cn
步骤03: 编写items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class CsvspiderItem(scrapy.Item):
    """Scraped record with one field per CSV column that the spider keeps."""

    name = scrapy.Field()  # person's name
    SearchField = scrapy.Field()  # research field
    Service = scrapy.Field()  # service category
    Specialty = scrapy.Field()  # professional specialty
步骤04: 编写爬虫文件csvdata.py
# -*- coding: utf-8 -*-
from scrapy.spiders import CSVFeedSpider
from csvfeedspider.items import CsvspiderItem
class CsvparseSpider(CSVFeedSpider):
    """CSV feed spider that turns each row of the feed into a CsvspiderItem.

    The body is decoded as GB18030 in ``adapt_response`` and each row is
    copied into an item in ``parse_row``.
    """

    name = 'csvdata'
    allowed_domains = ['gzdata.gov.cn']
    start_urls = ['http://gzopen.oss-cn-guizhou-a.aliyuncs.com/科技特派员.csv']
    # Column names applied to every row; they match the CsvspiderItem fields.
    headers = ['name', 'SearchField', 'Service', 'Specialty']
    delimiter = ','
    # NOTE(review): a newline as quotechar presumably is meant to switch off
    # quote handling for this feed -- confirm against the actual data and the
    # csv module's rules for the Python version in use.
    quotechar = "\n"

    def adapt_response(self, response):
        """Return the response body decoded from GB18030 as text.

        Args:
            response: The downloaded response whose ``body`` holds raw bytes.

        Returns:
            The body as a ``str``.
        """
        return response.body.decode('gb18030')

    def parse_row(self, response, row):
        """Copy the configured columns of one CSV row into an item.

        Args:
            response: The response the row came from (unused here).
            row: Mapping from header name to the cell value of this row.

        Yields:
            A CsvspiderItem. If a column is missing, a warning is logged and
            the item is yielded with only the fields copied so far.
        """
        item = CsvspiderItem()
        try:
            for column in self.headers:
                item[column] = row[column]
        except KeyError as exc:
            # Best effort: keep the partial item, but do not hide the problem.
            self.logger.warning('Row is missing column %s: %r', exc, row)
        yield item
步骤05: 运行爬虫文件
scrapy crawl csvdata