c-x-a
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-11-08 22:33:55
# Project: qsbk

import datetime

from lxml import html
from pyspider.libs.base_handler import *

try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3
class Handler(BaseHandler):
    """pyspider handler that scrapes the qiushibaike.com index page.

    ``on_start`` is scheduled once a day; ``index_page`` extracts, for every
    post on the index page: the author name, the post text, an absolute
    detail-page URL, and the crawl timestamp.
    """

    crawl_config = {
    }

    def __init__(self):
        # Entry URL; also used as the base for resolving relative links.
        self.start_url = 'https://www.qiushibaike.com/'

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule a crawl of the index page (runs once every 24 hours)."""
        self.crawl(self.start_url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Parse the index page and return ``{'data': [item, ...]}``.

        Each item is a dict with keys ``'name'`` (author), ``'info'``
        (post text), ``'crawldate'`` (local timestamp string) and
        ``'url'`` (absolute detail-page URL).
        """
        root = html.fromstring(response.content.decode('utf-8'))
        content_left = root.xpath("//div[@id='content-left']")
        if not content_left:
            # Layout changed or an empty/blocked response: no posts found.
            return {'data': []}
        # Timestamp is loop-invariant; compute it once for the whole batch.
        crawldate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        tasks = []
        for div_node in content_left[0].xpath("./div"):
            href = div_node.xpath("./a[@class='contentHerf']/@href")
            if not href:
                # Skip divs that are not posts (ads, pager widgets, etc.).
                continue
            title_node = div_node.xpath(
                ".//div[@class='author clearfix']/a[contains(@onclick,'web-list-author-text')]/h2/text()")
            content_node = div_node.xpath(".//div[@class='content']/span[1]")
            # Guard every lookup: a post may legitimately lack a title or
            # body node; the original code raised IndexError in that case.
            name = title_node[0].strip() if title_node else ''
            info = content_node[0].xpath('string(.)').strip() if content_node else ''
            tasks.append({
                'name': name,
                'info': info,
                'crawldate': crawldate,
                'url': urljoin(self.start_url, href[0]),
            })
        return {'data': tasks}
         

  

分类:

技术点:

相关文章:

  • 2022-12-23
  • 2021-07-30
  • 2021-09-15
  • 2022-12-23
  • 2021-08-07
  • 2022-12-23
  • 2022-12-23
猜你喜欢
  • 2021-07-05
  • 2022-12-23
  • 2021-10-04
  • 2021-10-27
  • 2021-06-07
  • 2022-02-15
相关资源
相似解决方案