hankleo

步骤01: 创建项目

scrapy startproject cnblogs

步骤02: 编写items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class CnblogsItem(scrapy.Item):
    """Item holding the fields collected for one scraped article."""

    # Article title
    title = scrapy.Field()
    # Article URL
    url = scrapy.Field()
    # Article author
    author = scrapy.Field()

步骤03: 在spiders文件夹内创建articles.py

from scrapy.spiders import SitemapSpider
from cnblogs.items import CnblogsItem

class MySpider(SitemapSpider):
    """Sitemap-driven spider that yields one ``CnblogsItem`` per listed article.

    The spider starts from the sitemap in ``sitemap_urls`` and dispatches the
    URLs it finds there according to ``sitemap_rules``.
    """

    name = 'articles'
    # Sitemap location
    sitemap_urls = ['http://www.cnblogs.com/sitemap.xml']
    # Rules for picking URLs out of the sitemap; each entry pairs a URL
    # pattern with the name of the callback method that handles matches.
    sitemap_rules = [
        # URLs containing /cate/python/ are handled by parse_python
        ('/cate/python/', 'parse_python'),
    ]

    def parse_python(self, response):
        """Yield a ``CnblogsItem`` for every ``.post_item`` element in the page.

        Args:
            response: The downloaded page for a URL matched by
                ``sitemap_rules``.

        Yields:
            CnblogsItem: ``title`` and ``url`` are taken from the element's
            ``.titlelnk`` node and ``author`` from its ``.lightblue`` node.
            A field is ``None`` when its selector matches nothing, since
            ``extract_first()`` returns ``None`` in that case.
        """
        for article in response.css('.post_item'):
            yield CnblogsItem(
                # Article title
                title=article.css('.titlelnk::text').extract_first(),
                # Article URL
                url=article.css('.titlelnk::attr(href)').extract_first(),
                # Article author
                author=article.css('.lightblue::text').extract_first(),
            )

步骤04: 运行爬虫

scrapy crawl articles

分类:

技术点:

相关文章:

  • 2021-06-29
  • 2021-10-06
  • 2021-09-05
  • 2022-12-23
猜你喜欢
  • 2022-01-18
  • 2021-12-09
  • 2022-01-15
  • 2021-12-19
  • 2021-06-02
  • 2021-11-28
  • 2021-12-10
相关资源
相似解决方案