crawlspider抓取微信小程序社区案例
本文只用于学习,一切非法用途与本人无关!
spider
# CrawlSpider configuration for the WeChat mini-program community portal.
name = 'wxapp_spider'
allowed_domains = ['wxapp-union.com']
start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=1&page=1']
rules = (
    # Pagination links on list pages: keep following them, no parsing needed.
    Rule(LinkExtractor(allow=r'.+mod=list&catid=1&page=\d'), follow=True),
    # Article detail pages: hand off to parse_detail and stop crawling deeper.
    Rule(LinkExtractor(allow=r'.+article-.+\.html'), callback='parse_detail', follow=False),
)
def parse_detail(self, response):
    """Extract title, byline and body text from one article page and yield it as an item."""
    title = response.xpath('//h1/text()').get()
    # The byline <p class="authors"> holds both the author link and the date span.
    byline = response.xpath('//p[@class="authors"]')
    author = byline.xpath('./a/text()').get()
    pub_time = byline.xpath('./span/text()').get()
    # Collect every text node inside the article body and fuse them into one string.
    fragments = response.xpath('//td[@id="article_content"]//text()').getall()
    content = ''.join(fragments).strip()
    yield Demo12WxappItem(
        title=title,
        author=author,
        pub_time=pub_time,
        article_content=content,
    )
item
# Item fields for one scraped article (spider fills these in parse_detail).
title = scrapy.Field()            # article headline (page <h1> text)
author = scrapy.Field()           # author name from the byline link
pub_time = scrapy.Field()         # publish-time string from the byline span
article_content = scrapy.Field()  # full article body text, joined and stripped
pipelines
def __init__(self):
    """Open the output file and build the line-per-item JSON exporter."""
    # The handle stays open for the whole crawl; close_spider() closes it.
    output_file = open('wxjc.json', 'wb')
    self.fp = output_file
    # ensure_ascii=False keeps non-ASCII (Chinese) text readable in the output.
    self.exporter = JsonLinesItemExporter(
        output_file, ensure_ascii=False, encoding='utf-8'
    )
def process_item(self, item, spider):
    """Write *item* as one JSON line, then pass it on unchanged."""
    exporter = self.exporter
    exporter.export_item(item)
    # Returning the item lets any later pipelines keep processing it.
    return item
def close_spider(self, spider):
    """Close the output file when the crawl finishes, flushing pending data."""
    self.fp.close()