#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-11-08 22:33:55
# Project: qsbk
from pyspider.libs.base_handler import *
from lxml import html
from urlparse import urljoin
import datetime
class Handler(BaseHandler):
    """pyspider handler that scrapes the posts listed on the qiushibaike front page.

    ``on_start`` schedules a fetch of ``start_url`` and ``index_page`` turns the
    fetched HTML into a list of post records.
    """

    crawl_config = {
    }

    def __init__(self):
        # Chain to the base class so any initialisation it performs still runs.
        super(Handler, self).__init__()
        self.start_url = 'https://www.qiushibaike.com/'

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the front page for crawling; scheduled every 24 hours."""
        self.crawl(self.start_url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Extract one record per post from the fetched front page.

        Args:
            response: the fetched page; ``response.content`` is decoded as
                UTF-8 and parsed with lxml.

        Returns:
            ``{'data': [item, ...]}`` where every item is a dict with the keys
            ``name``, ``info``, ``crawldate`` and ``url``.  The list is empty
            when the page has no ``content-left`` container.
        """
        root = html.fromstring(response.content.decode('utf-8'))
        containers = root.xpath("//div[@id='content-left']")
        if not containers:
            # Layout changed or an error page was served: nothing to extract,
            # but do not fail the task with an IndexError.
            return {'data': []}

        # One timestamp per page so all records of a single fetch agree.
        crawldate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        tasks = []
        for div_node in containers[0].xpath("./div"):
            item = self._parse_post(div_node, crawldate)
            if item is not None:
                tasks.append(item)
        return {'data': tasks}

    def _parse_post(self, div_node, crawldate):
        """Build the record for one post ``div``; return None if it has no link.

        A ``div`` without a ``contentHerf`` anchor cannot be tied to a post URL
        and is skipped.  A missing author name or content span yields an empty
        string instead of aborting the whole page.
        """
        hrefs = div_node.xpath("./a[@class='contentHerf']/@href")
        if not hrefs:
            return None

        names = div_node.xpath(
            ".//div[@class='author clearfix']"
            "/a[contains(@onclick,'web-list-author-text')]/h2/text()")
        content_nodes = div_node.xpath(".//div[@class='content']/span[1]")

        name = names[0] if names else ''
        # string(.) flattens the span and all of its descendants into one string.
        info = content_nodes[0].xpath('string(.)') if content_nodes else ''

        return {
            'name': name.strip() if name else name,
            'info': info.strip() if info else info,
            'crawldate': crawldate,
            'url': urljoin(self.start_url, hrefs[0]),
        }