阳光热线问政平台
http://wz.sun0769.com/index.php/question/questionType?type=4
爬取投诉帖子的编号、帖子的url、帖子的标题,和帖子里的内容。
items.py
import scrapy
class SunwzItem(scrapy.Item):
    """Container for one complaint post scraped from wz.sun0769.com."""

    number = scrapy.Field()   # post number, parsed from the post title
    url = scrapy.Field()      # absolute URL of the post page
    title = scrapy.Field()    # post title text
    content = scrapy.Field()  # body text of the complaint
spiders/sunwz.py
# -*- coding: utf-8 -*-
# Spider for the "Sunshine Hotline" complaint board: follows pagination and
# post links, then extracts number/url/title/content from each post page.
from scrapy.spiders import CrawlSpider, Rule  # scrapy.contrib is deprecated/removed
from scrapy.linkextractors import LinkExtractor
from Sunwz.items import SunwzItem


class SunwzSpider(CrawlSpider):
    name = 'sunwz'
    # Scrapy's attribute is `allowed_domains` (the original `allow_domain`
    # was silently ignored) and it must hold bare domains, not URLs.
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4']
    # `rules` should be an ordered tuple, not a set; raw strings keep the
    # regex escapes (\d, \., \?) intact.
    rules = (
        Rule(LinkExtractor(allow=r'page')),
        Rule(LinkExtractor(allow=r'/index\.php/question/questionType\?type=4$')),
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml$'),
             follow=True, callback='parse_content'),
    )
    # XPath expressions for the fields scraped from a post page.
    xpathDict = {
        'title': '//div[contains(@class, "pagecenter p3")]/div/div/div[contains(@class,"cleft")]/strong/text()',
        'content': '//div[contains(@class, "c1 text14_2")]/text()',
        'content_first': '//div[contains(@class, "contentext")]/text()'
    }

    def parse_content(self, response):
        """Parse one complaint post page and yield a populated SunwzItem."""
        item = SunwzItem()
        # Posts with an expanded body use the "contentext" div; fall back to
        # the short "c1 text14_2" div when it is absent.
        content = response.xpath(self.xpathDict['content_first']).extract()
        if not content:
            content = response.xpath(self.xpathDict['content']).extract()
        # Guard against pages where neither div matched, instead of raising
        # IndexError on extract()[0].
        content = content[0] if content else ''
        title_parts = response.xpath(self.xpathDict['title']).extract()
        title = title_parts[0] if title_parts else ''
        # The post number is the last whitespace-separated token of the
        # title, after its trailing colon (e.g. "... 编号:200918").
        number = title.split(' ')[-1].split(':')[-1]
        item['url'] = response.url
        item['number'] = number
        item['title'] = title
        item['content'] = content
        yield item
pipelines.py
import json
import codecs


class JsonWriterPipeline(object):
    """Item pipeline that appends each item as one JSON line to sunwz.json."""

    def __init__(self):
        # codecs.open keeps UTF-8 text output working under Python 2.
        self.file = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text human-readable on disk.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy invokes close_spider() on pipelines when the spider
        # finishes; the original name `spider_closed` was never called,
        # leaking the file handle (and possibly buffered output).
        self.file.close()

    # Backward-compatible alias for anything wired to the old method name.
    spider_closed = close_spider
settings.py
# Register the JSON-writer pipeline; the value (0-1000) orders pipelines,
# lower numbers run first.
ITEM_PIPELINES = {
    'Sunwz.pipelines.JsonWriterPipeline': 300,
}
在项目根目录下新建main.py文件,用于调试
# Debug entry point: runs `scrapy crawl sunwz` programmatically so the
# spider can be launched from an IDE/debugger.
from scrapy import cmdline

if __name__ == '__main__':
    cmdline.execute('scrapy crawl sunwz'.split())
执行程序
python2 main.py