items.py
class CoserItem(scrapy.Item):
    """Container for the data scraped from one work page on bcy.net."""

    url = scrapy.Field()  # URL of the work page the item was scraped from
    name = scrapy.Field()  # work title text
    info = scrapy.Field()  # post type / info text from the page header
    image_urls = scrapy.Field()  # full-size image URLs collected by the spider
    images = scrapy.Field()  # local file paths, filled in by ImageDownloadPipeline
spiders/coser.py
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
import scrapy
from scrapy.contrib.loader import ItemLoader
from Cosplay.items import CoserItem
class CoserSpider(scrapy.Spider):
    """Crawl coser profile pages on bcy.net and yield one CoserItem per work.

    NOTE(review): this file imports ItemLoader from ``scrapy.contrib.loader``,
    which is removed in modern Scrapy — the current path is
    ``scrapy.loader.ItemLoader``. Confirm the installed Scrapy version.
    """

    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173',
    )

    def parse(self, response):
        """Extract links to individual work pages and schedule a request for each."""
        sel = Selector(response)
        hrefs = sel.xpath(
            "//ul[@class='js-articles l-works']"
            "/li[@class='l-work--big']"
            "/article[@class='work work--second-created']"
            "/h2[@class='work__title']/a/@href"
        ).extract()
        for href in hrefs:
            # hrefs on the profile page are site-relative; make them absolute.
            yield scrapy.Request('http://bcy.net%s' % href, callback=self.parse_item)

    def parse_item(self, response):
        """Populate and return a CoserItem from a single work page."""
        # Renamed from ``l``: single-letter ``l`` is easily confused with ``1`` (PEP 8 / E741).
        loader = ItemLoader(item=CoserItem(), response=response)
        loader.add_xpath('name', "//h1[@class='js-post-title']/text()")
        loader.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = loader.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        # '/w650' is a thumbnail-size suffix; stripping it yields the full-size URL.
        urls = [url.replace('/w650', '') for url in urls]
        loader.add_value('image_urls', urls)
        loader.add_value('url', response.url)
        return loader.load_item()
pipelines.py
import requests
from Cosplay import settings
import os
class ImageDownloadPipeline(object):
    """Download every URL in item['image_urls'] into <IMAGES_STORE>/<spider.name>/.

    Stores the resulting local file paths back into item['images'].
    Items without an 'image_urls' key pass through unchanged.
    """

    # Seconds to wait for connect/read before giving up on one image
    # (the original call had no timeout and could hang the crawl forever).
    DOWNLOAD_TIMEOUT = 30

    def process_item(self, item, spider):
        """Fetch each image once, skipping files that already exist on disk."""
        if 'image_urls' not in item:
            return item
        images = []
        dir_path = os.path.join(settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for image_url in item['image_urls']:
            # Flatten the URL path (everything after the host) into one file name.
            file_name = '_'.join(image_url.split('/')[3:])
            file_path = os.path.join(dir_path, file_name)
            images.append(file_path)
            if os.path.exists(file_path):
                continue  # already downloaded on a previous run
            try:
                response = requests.get(image_url, stream=True,
                                        timeout=self.DOWNLOAD_TIMEOUT)
                # Without this check an HTTP error page would be saved as an "image".
                response.raise_for_status()
                with open(file_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        handle.write(block)
            except Exception:
                # Delete a partially written file; otherwise the exists() guard
                # above would skip it forever and keep the corrupt download.
                if os.path.exists(file_path):
                    os.remove(file_path)
                raise
        item['images'] = images
        return item
settings.py
# Item pipeline chain: run ImageDownloadPipeline (priority 1, i.e. first).
ITEM_PIPELINES = {
    'Cosplay.pipelines.ImageDownloadPipeline': 1,
}

# Root directory (relative to the working directory) for downloaded images.
IMAGES_STORE = '../Images'

# Throttle: wait 250 ms between consecutive downloads.
DOWNLOAD_DELAY = 0.25
在项目根目录下新建 main.py 文件,用于调试:
from scrapy import cmdline

# Debug entry point: equivalent to running `scrapy crawl coser` in a shell.
cmdline.execute(['scrapy', 'crawl', 'coser'])
执行程序