crawlspider抓取微信小程序社区案例
本文只用于学习,一切非法用途与本人无关!
spider
# CrawlSpider configuration for the WeChat mini-program community portal.
name = 'wxapp_spider'
allowed_domains = ['wxapp-union.com']
start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=1&page=1']
rules = (
    # Pagination links on list pages: keep following them, no parsing needed.
    Rule(LinkExtractor(allow=r'.+mod=list&catid=1&page=\d'), follow=True),
    # Article detail pages: hand off to parse_detail and stop crawling deeper.
    Rule(LinkExtractor(allow=r'.+article-.+\.html'), callback='parse_detail', follow=False),
)
def parse_detail(self, response):
    """Extract title, byline and body text from one article page and yield it as an item."""
    title = response.xpath('//h1/text()').get()
    # The byline <p class="authors"> holds both the author link and the date span.
    byline = response.xpath('//p[@class="authors"]')
    author = byline.xpath('./a/text()').get()
    pub_time = byline.xpath('./span/text()').get()
    # Collect every text node inside the article body and fuse them into one string.
    fragments = response.xpath('//td[@id="article_content"]//text()').getall()
    content = ''.join(fragments).strip()
    yield Demo12WxappItem(
        title=title,
        author=author,
        pub_time=pub_time,
        article_content=content,
    )
item
# Item fields for one scraped article (spider fills these in parse_detail).
title = scrapy.Field()            # article headline (page <h1> text)
author = scrapy.Field()           # author name from the byline link
pub_time = scrapy.Field()         # publish-time string from the byline span
article_content = scrapy.Field()  # full article body text, joined and stripped
pipelines
def __init__(self):
    """Open the output file and build the line-per-item JSON exporter."""
    # The handle stays open for the whole crawl; close_spider() closes it.
    output_file = open('wxjc.json', 'wb')
    self.fp = output_file
    # ensure_ascii=False keeps non-ASCII (Chinese) text readable in the output.
    self.exporter = JsonLinesItemExporter(
        output_file, ensure_ascii=False, encoding='utf-8'
    )
def process_item(self, item, spider):
    """Write *item* as one JSON line, then pass it on unchanged."""
    exporter = self.exporter
    exporter.export_item(item)
    # Returning the item lets any later pipelines keep processing it.
    return item
def close_spider(self, spider):
    """Close the output file when the crawl finishes, flushing pending data."""
    self.fp.close()