实验：爬取网站 http://books.toscrape.com/ 的数据
# -*- coding: utf-8 -*-
import scrapy
class Book(scrapy.Spider):
    """Spider that scrapes book titles and prices from books.toscrape.com,
    following the pagination links until the last page is reached."""

    name = 'book'  # unique spider identifier used to launch the crawl
    # allowed_domains = ['example.com']
    start_urls = ['http://books.toscrape.com/']  # crawl entry point

    def parse(self, response):
        """Yield a {'name': ..., 'price': ...} item for every book on this
        page, then schedule a request for the next page if one exists."""
        for book in response.xpath("//article[@class='product_pod']"):
            book_name = book.xpath("./h3/a/@title").extract()
            book_price = book.xpath(
                "./div[@class='product_price']/p[@class='price_color']/text()"
            ).extract()
            yield {
                'name': book_name,
                'price': book_price,
            }
        # Follow the "next" pagination link so every page gets crawled.
        url = response.xpath("//li[@class='next']/a/@href").extract_first()
        # BUG FIX: on the last page there is no "next" link, so
        # extract_first() returns None and response.urljoin(None) would
        # raise a TypeError — guard before building the follow-up request.
        if url is not None:
            # href is relative (e.g. "catalogue/page-2.html"); join it
            # against the current response URL to get an absolute URL.
            url2 = response.urljoin(url)
            yield scrapy.Request(url2, callback=self.parse)
提取到的地址是相对路径而不是完整 URL，所以要用 url2 = response.urljoin(url) 拼接成绝对地址
最后通过yield获取全部路径
启动文件内容
运行后显示的结果
到此，对 book 网页书名与书价静态数据的爬取，以及通过分页爬取所有页面内容的功能均已完成