阳光热线问政平台
http://wz.sun0769.com/index.php/question/questionType?type=4
爬取投诉帖子的编号、帖子的url、帖子的标题,和帖子里的内容。
items.py
import scrapy
class SunwzItem(scrapy.Item):
    """Container for one complaint post scraped from wz.sun0769.com."""

    number = scrapy.Field()   # post number, parsed from the post title
    url = scrapy.Field()      # absolute URL of the post page
    title = scrapy.Field()    # post title text
    content = scrapy.Field()  # body text of the complaint
spiders/sunwz.py
# -*- coding: utf-8 -*-
# Spider for the "Sunshine Hotline" complaint board: follows pagination and
# post links, then extracts number/url/title/content from each post page.
from scrapy.spiders import CrawlSpider, Rule  # scrapy.contrib is deprecated/removed
from scrapy.linkextractors import LinkExtractor
from Sunwz.items import SunwzItem


class SunwzSpider(CrawlSpider):
    name = 'sunwz'
    # Scrapy's attribute is `allowed_domains` (the original `allow_domain`
    # was silently ignored) and it must hold bare domains, not URLs.
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4']
    # `rules` should be an ordered tuple, not a set; raw strings keep the
    # regex escapes (\d, \., \?) intact.
    rules = (
        Rule(LinkExtractor(allow=r'page')),
        Rule(LinkExtractor(allow=r'/index\.php/question/questionType\?type=4$')),
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml$'),
             follow=True, callback='parse_content'),
    )
    # XPath expressions for the fields scraped from a post page.
    xpathDict = {
        'title': '//div[contains(@class, "pagecenter p3")]/div/div/div[contains(@class,"cleft")]/strong/text()',
        'content': '//div[contains(@class, "c1 text14_2")]/text()',
        'content_first': '//div[contains(@class, "contentext")]/text()'
    }

    def parse_content(self, response):
        """Parse one complaint post page and yield a populated SunwzItem."""
        item = SunwzItem()
        # Posts with an expanded body use the "contentext" div; fall back to
        # the short "c1 text14_2" div when it is absent.
        content = response.xpath(self.xpathDict['content_first']).extract()
        if not content:
            content = response.xpath(self.xpathDict['content']).extract()
        # Guard against pages where neither div matched, instead of raising
        # IndexError on extract()[0].
        content = content[0] if content else ''
        title_parts = response.xpath(self.xpathDict['title']).extract()
        title = title_parts[0] if title_parts else ''
        # The post number is the last whitespace-separated token of the
        # title, after its trailing colon (e.g. "... 编号:200918").
        number = title.split(' ')[-1].split(':')[-1]
        item['url'] = response.url
        item['number'] = number
        item['title'] = title
        item['content'] = content
        yield item
pipelines.py
import json
import codecs


class JsonWriterPipeline(object):
    """Item pipeline that appends each item as one JSON line to sunwz.json."""

    def __init__(self):
        # codecs.open keeps UTF-8 text output working under Python 2.
        self.file = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text human-readable on disk.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy invokes close_spider() on pipelines when the spider
        # finishes; the original name `spider_closed` was never called,
        # leaking the file handle (and possibly buffered output).
        self.file.close()

    # Backward-compatible alias for anything wired to the old method name.
    spider_closed = close_spider
settings.py
# Register the JSON-writer pipeline; the value (0-1000) orders pipelines,
# lower numbers run first.
ITEM_PIPELINES = {
    'Sunwz.pipelines.JsonWriterPipeline': 300,
}
在项目根目录下新建main.py文件,用于调试
# Debug entry point: runs `scrapy crawl sunwz` programmatically so the
# spider can be launched from an IDE/debugger.
from scrapy import cmdline

if __name__ == '__main__':
    cmdline.execute('scrapy crawl sunwz'.split())
执行程序
python2 main.py