lpdeboke

“python爬虫系列”目录:

scrapy爬取二级页面的内容

1.定义数据结构items.py文件

# -*- coding: utf-8 -*-
'''
file: items.py
'''
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TupianprojectItem(scrapy.Item):
    """Fields collected for a single scraped image."""

    title = scrapy.Field()         # image title
    publish_time = scrapy.Field()  # publication time
    look = scrapy.Field()          # view count
    collect = scrapy.Field()       # favorites count
    download = scrapy.Field()      # download count
    image_url = scrapy.Field()     # link to the image file


2.爬虫文件

# -*- coding: utf-8 -*-
import scrapy

from tupianproject.items import TupianprojectItem


class ImageSpider(scrapy.Spider):
    """Crawl the 699pic.com "people" listing and scrape every image detail page.

    ``parse`` handles listing (first-level) pages: it schedules one request per
    detail link found on the page, then the next listing page.
    ``parse_detail`` handles a detail (second-level) page and yields one
    ``TupianprojectItem``.
    """

    name = 'image'
    allowed_domains = ['699pic.com']
    start_urls = ['http://699pic.com/people-1-0-0-0-0-0-0.html']

    # Template for listing pages; ``{}`` is replaced by the page number.
    url = 'http://699pic.com/people-{}-0-0-0-0-0-0.html'
    # Number of the listing page most recently scheduled.
    page = 1

    def parse(self, response):
        """Schedule a request for every detail page linked from a listing page.

        After the detail requests, the next listing page is scheduled as long
        as ``self.page`` is at most 3, so listing pages 2 through 4 are
        followed in addition to the start URL.
        """
        detail_links = response.xpath('//div[@class="list"]/a/@href').extract()
        for link in detail_links:
            # Links may be relative or protocol-relative; resolve them against
            # the page URL so Request always receives an absolute URL.
            yield scrapy.Request(url=response.urljoin(link),
                                 callback=self.parse_detail)

        if self.page <= 3:
            self.page += 1
            next_url = self.url.format(self.page)
            yield scrapy.Request(url=next_url, callback=self.parse)

    @staticmethod
    def _first_node_text(response, query):
        """Return the full text of the first node matching *query*, or None.

        Avoids the IndexError that indexing an empty selector list with
        ``[0]`` would raise when the page layout differs from what is expected.
        """
        nodes = response.xpath(query)
        if not nodes:
            return None
        return nodes[0].xpath('string(.)').extract_first()

    def parse_detail(self, response):
        """Extract one item from an image detail page.

        Fields whose node is missing from the page are set to ``None`` instead
        of aborting the whole item.
        """
        item = TupianprojectItem()
        info = '//div[@class="photo-view"]/div/span[@class="{}"]'

        item['title'] = response.xpath(
            '//div[@class="photo-view"]/h1/text()').extract_first()
        item['publish_time'] = self._first_node_text(
            response, info.format('publicityt'))
        item['look'] = response.xpath(
            '//div[@class="photo-view"]/div/span[@class="look"]/read/text()'
        ).extract_first()
        item['collect'] = self._first_node_text(
            response, info.format('collect'))

        download_text = self._first_node_text(
            response, info.format('download'))
        if download_text is not None:
            # The download counter is surrounded by newlines and tabs.
            download_text = download_text.strip('\n\t')
        item['download'] = download_text

        image_src = response.xpath(
            '//div[@class="huabu"]//img/@src').extract_first()
        # Make the image link absolute so it can be downloaded directly.
        item['image_url'] = response.urljoin(image_src) if image_src else None

        yield item


3.管道文件

# -*- coding: utf-8 -*-
'''
file: pipelines.py
'''

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import urllib.request
import os

class TupianprojectPipeline(object):
    def open_spider(self, spider):
        self.fp = open(\'tupian.json\', \'w\', encoding=\'utf8\')
        
    def process_item(self, item, spider):
        d = dict(item)
        string = json.dumps(d, ensure_ascii=False)
        self.fp.write(string + \'\n\')
        
        # 下载图片
        self.download(item)
        return item
        
    def download(self, item):
        dirname = \'./people\'
        suffix = item[\'image_url\'].split(\'.\')[-1]
        filename = item[\'title\'] + \'.\' + suffix
        filepath = os.path.join(dirname, filename)
        urllib.request.urlretrieve(item[\'image_url\'], filepath)
    
    def close_spider(self, spider):
        self.fp.close()

分类:

技术点:

相关文章: