Scrapy学习-3-Request回调巧用

基于 Twisted 的异步回调

使得页面爬取有阶段性和连续性

from scrapy.http import Request
from urllib import parse

def parse(self, response):
    """Schedule a detail request for every link on the page, then follow pagination.

    Every ``href`` found on an ``<a>`` element is resolved against the URL of
    the current response and yielded as a ``Request`` whose callback is
    ``self.parse_detail``. Afterwards, if an element matching ``.next_page``
    carries an ``href``, one more ``Request`` is yielded for it with this
    method as the callback, so the following listing page is processed the
    same way.

    Args:
        response: The downloaded page; must provide ``css()`` and ``url``.

    Yields:
        ``Request`` objects for the linked pages and, when present, for the
        next listing page.
    """
    # Imported locally under its own name: this callback is itself called
    # ``parse``, which hides a module-level ``from urllib import parse``.
    from urllib.parse import urljoin

    # extract() turns the selector list into plain href strings.
    post_urls = response.css("a::attr(href)").extract()

    for url in post_urls:
        # Sites often write relative hrefs, so resolve each one against the
        # URL of the page it was found on.
        yield Request(url=urljoin(response.url, url), meta={}, callback=self.parse_detail)

    # Crawling rarely stops at a single page: keep walking the listing by
    # handing the next page back to this same callback.
    next_url = response.css(".next_page::attr(href)").extract_first()

    if next_url:
        yield Request(url=urljoin(response.url, next_url), meta={}, callback=self.parse)

def parse_detail(self, response):
    """Callback for a detail page; reads the value stored under ``'xxx'`` in the request meta.

    The value is looked up in ``response.meta`` with an empty-string fallback.
    Nothing is done with it yet, and the method returns ``None``.

    Args:
        response: The downloaded detail page; must expose a ``meta`` mapping.
    """
    # Placeholder key: shows how a value attached to the originating Request
    # can be read back inside its callback.
    carried_value = response.meta.get('xxx', '')