案例1 :(增量式爬取)京东全部图书,自己可以扩展 爬取每一本电子书的评论
1.spider:
# -*- coding: utf-8 -*-
import json
# FIX: `import urllib` alone does not guarantee the `parse` submodule is
# loaded; import it explicitly since we call urllib.parse.urljoin below.
import urllib.parse
from copy import deepcopy

import scrapy


class JdSpider(scrapy.Spider):
    """Incrementally crawl all jd.com book categories, list pages and prices."""

    name = 'jd'
    # p.3.cn serves the price JSON API, so it must be whitelisted too.
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        """Extract the big/small category tree and follow each small category."""
        dt_list = response.xpath("//div[@class='mc']/dl/dt")  # big category list
        for dt in dt_list:
            item = {}
            item["b_cate"] = dt.xpath("./a/text()").extract_first()
            # the <dd> immediately after the <dt> holds the sub-categories
            em_list = dt.xpath("./following-sibling::dd[1]/em")  # small category list
            for em in em_list:
                item["s_href"] = em.xpath("./a/@href").extract_first()
                item["s_cate"] = em.xpath("./a/text()").extract_first()
                if item["s_href"] is not None:
                    item["s_href"] = "https:" + item["s_href"]
                    yield scrapy.Request(
                        item["s_href"],
                        callback=self.parse_book_list,
                        # deepcopy: the same dict is mutated across iterations,
                        # so each request needs its own snapshot
                        meta={"item": deepcopy(item)},
                    )

    def parse_book_list(self, response):
        """Parse one book-list page, then request the price API for each book."""
        item = response.meta["item"]
        li_list = response.xpath("//div[@id='plist']/ul/li")
        for li in li_list:
            item["book_img"] = li.xpath(".//div[@class='p-img']//img/@src").extract_first()
            if item["book_img"] is None:
                # lazily loaded images keep the real URL in data-lazy-img
                item["book_img"] = li.xpath(".//div[@class='p-img']//img/@data-lazy-img").extract_first()
            item["book_img"] = "https:" + item["book_img"] if item["book_img"] is not None else None
            # FIX: guard against a missing node before calling .strip() —
            # extract_first() returns None when the xpath matches nothing
            book_name = li.xpath(".//div[@class='p-name']/a/em/text()").extract_first()
            item["book_name"] = book_name.strip() if book_name is not None else None
            item["book_author"] = li.xpath(".//span[@class='author_type_1']/a/text()").extract()
            item["book_press"] = li.xpath(".//span[@class='p-bi-store']/a/@title").extract_first()
            publish_date = li.xpath(".//span[@class='p-bi-date']/text()").extract_first()
            item["book_publish_date"] = publish_date.strip() if publish_date is not None else None
            item["book_sku"] = li.xpath("./div/@data-sku").extract_first()
            yield scrapy.Request(
                "https://p.3.cn/prices/mgets?skuIds=J_{}".format(item["book_sku"]),
                callback=self.parse_book_price,
                meta={"item": deepcopy(item)},
            )
        # list-page pagination
        next_url = response.xpath("//a[@class='pn-next']/@href").extract_first()
        if next_url is not None:
            next_url = urllib.parse.urljoin(response.url, next_url)
            yield scrapy.Request(
                next_url,
                callback=self.parse_book_list,
                meta={"item": item},
            )

    def parse_book_price(self, response):
        """Attach the price returned by the JSON price API and emit the item."""
        item = response.meta["item"]
        item["book_price"] = json.loads(response.body.decode())[0]["op"]
        print(item)
2. 配置文件
# 增加配置
# Fingerprint-based request dedup filter backed by Redis (shared across workers)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Redis-backed scheduler so the request queue is shared between workers
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the request queue / dedup set in Redis after the crawl finishes
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
案例2:分布式爬取当当图书
1.scrapy 配置settings.py中
# Fingerprint-based request dedup filter backed by Redis (shared across workers)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Redis-backed scheduler so the request queue is shared between workers
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the request queue / dedup set in Redis after the crawl finishes
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
2. 当当爬虫文件 spider.py 文件中:
# -*- coding: utf-8 -*-
# FIX: import the submodule explicitly — `import urllib` alone does not
# guarantee urllib.parse is available.
import urllib.parse
from copy import deepcopy

import scrapy
from scrapy_redis.spiders import RedisSpider


class DangdangSpider(RedisSpider):
    """Distributed spider for dangdang.com books.

    No start_urls here — hard-coding it would make every worker re-crawl
    the same page. Seed Redis once instead:
        lpush dangdang http://book.dangdang.com/
    """

    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    redis_key = "dangdang"

    def parse(self, response):
        """Walk the big / middle / small category tree."""
        # big category groups
        div_list = response.xpath("//div[@class='con flq_body']/div")
        for div in div_list:
            item = {}
            item["b_cate"] = div.xpath("./dl/dt//text()").extract()
            item["b_cate"] = [i.strip() for i in item["b_cate"] if len(i.strip()) > 0]
            # middle category groups
            dl_list = div.xpath("./div//dl[@class='inner_dl']")
            for dl in dl_list:
                item["m_cate"] = dl.xpath("./dt//text()").extract()
                item["m_cate"] = [i.strip() for i in item["m_cate"] if len(i.strip()) > 0][0]
                # small category groups
                a_list = dl.xpath("./dd/a")
                for a in a_list:
                    item["s_href"] = a.xpath("./@href").extract_first()
                    item["s_cate"] = a.xpath("./text()").extract_first()
                    if item["s_href"] is not None:
                        yield scrapy.Request(
                            item["s_href"],
                            callback=self.parse_book_list,
                            # deepcopy: item is reused across loop iterations
                            meta={"item": deepcopy(item)},
                        )

    def parse_book_list(self, response):
        """Parse one book-list page and follow pagination."""
        item = response.meta["item"]
        li_list = response.xpath("//ul[@class='bigimg']/li")
        for li in li_list:
            item["book_img"] = li.xpath("./a[@class='pic']/img/@src").extract_first()
            # placeholder image means the real URL lives in data-original
            if item["book_img"] == "images/model/guan/url_none.png":
                item["book_img"] = li.xpath("./a[@class='pic']/img/@data-original").extract_first()
            item["book_name"] = li.xpath("./p[@class='name']/a/@title").extract_first()
            item["book_desc"] = li.xpath("./p[@class='detail']/text()").extract_first()
            item["book_price"] = li.xpath(".//span[@class='search_now_price']/text()").extract_first()
            item["book_author"] = li.xpath("./p[@class='search_book_author']/span[1]/a/text()").extract()
            item["book_publish_date"] = li.xpath("./p[@class='search_book_author']/span[2]/text()").extract_first()
            item["book_press"] = li.xpath("./p[@class='search_book_author']/span[3]/a/text()").extract_first()
            print(item)
        # next page
        next_url = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_url is not None:
            next_url = urllib.parse.urljoin(response.url, next_url)
            yield scrapy.Request(
                next_url,
                callback=self.parse_book_list,
                meta={"item": item},
            )
redis数据库操作
案例3 : 使用RedisCrawlSpider 自动提取url地址,并可以实现分布式
amazon 爬取亚马逊电子书
1. 创建amazon 爬虫命令:
scrapy genspider -t crawl amazon amazon.com  # 创建 CrawlSpider 爬虫
2. 将 CrawlSpider 爬虫修改为 RedisCrawlSpider
2. settings.py 中:
# Fingerprint-based request dedup filter backed by Redis (shared across workers)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Redis-backed scheduler so the request queue is shared between workers
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the request queue / dedup set in Redis after the crawl finishes
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
3. amazon.py 最后的爬虫文件
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider


class AmazonSpider(RedisCrawlSpider):
    """Distributed CrawlSpider for amazon.cn e-books.

    Seed Redis with the start URL:
        lpush amazon https://www.amazon.cn/...books category url...
    """

    name = 'amazon'
    allowed_domains = ['amazon.cn']
    # start_urls intentionally omitted — the first URL comes from Redis.
    redis_key = "amazon"

    rules = (
        # match big-category and small-category URLs
        Rule(LinkExtractor(restrict_xpaths=("//div[@class='categoryRefinementsSection']/ul/li",)), follow=True),
        # match book detail URLs (the parent <a> of each result's <h2>)
        Rule(LinkExtractor(restrict_xpaths=("//div[@id='mainResults']/ul/li//h2/..",)), callback="parse_book_detail"),
        # list-page pagination
        Rule(LinkExtractor(restrict_xpaths=("//div[@id='pagn']",)), follow=True),
    )

    def parse_book_detail(self, response):
        """Extract the fields of a single book detail page."""
        item = {}
        item["book_title"] = response.xpath("//span[@id='productTitle']/text()").extract_first()
        item["book_publish_date"] = response.xpath("//h1[@id='title']/span[last()]/text()").extract_first()
        item["book_author"] = response.xpath("//div[@id='byline']/span/a/text()").extract()
        item["book_price"] = response.xpath("//div[@id='soldByThirdParty']/span[2]/text()").extract_first()
        # li[not(@class)] skips the separator <li> elements of the breadcrumb
        item["book_cate"] = response.xpath("//div[@id='wayfinding-breadcrumbs_feature_div']/ul/li[not(@class)]/span/a/text()").extract()
        item["book_cate"] = [i.strip() for i in item["book_cate"]]
        item["book_url"] = response.url
        item["book_press"] = response.xpath("//b[text()='出版社:']/../text()").extract_first()
        # FIX: the description extraction was commented out while the line
        # indexing item["book_desc"][0] stayed active, which raised KeyError.
        # Restore the chain so the final assignment has data to work on.
        item["book_desc"] = response.xpath("//noscript/div/text()").extract()
        item["book_desc"] = [i.strip() for i in item["book_desc"] if len(i.strip()) > 0 and i != '海报:']
        item["book_desc"] = item["book_desc"][0].split("<br>", 1)[0] if len(item["book_desc"]) > 0 else None
        print(item)
4. 爬虫程序写完之后,先启动 redis 服务(redis-server),再用客户端连接:
redis-cli
5. 在redis中存入 start_url ,在redis中执行以下命令
lpush amazon https://www.amazon.cn/%E5%9B%BE%E4%B9%A6/b/ref=sd_allcat_books_l1?ie=UTF8&node=658390051
类似 爬取dangdang 图书
爬虫中涉及到的小知识点:
1. url 的补全
# FIX: import urllib.parse explicitly — a bare `import urllib` does not
# guarantee the `parse` submodule is loaded.
import urllib.parse
# complete a relative next-page href against the current page URL
next_url = response.xpath("//a[@class='pn-next']/@href").extract_first()
next_url = urllib.parse.urljoin(response.url, next_url)
2. scrapy 中获取文本内容
# response.body is bytes; decode() yields the page text (UTF-8 by default)
response.body.decode()
3. 获取scrapy xpath 中获取兄弟节点标签
# XPath following-sibling axis: the first <dd> after the current <dt>
em_list = dt.xpath("./following-sibling::dd[1]/em")
4. 通过子节点获取父节点
先定位 a 标签中的 h2 标签,再通过 h2 标签获取其父 a 标签的 href 属性(h2/..)
5. 获取h1 标签中的最后一个span 标签 h1/span[last()]/text()
6. 网页中base64 转图片,以后自己也可以将图片转化为base64 格式的字符串用来存图片。
7. 找出没有某个class 属性的li li[not(@class)]
8. 用 CrawlSpider 写 rule 规则时,提取下一页的 url 地址:
rules = (
    # match big-category and small-category URLs
    Rule(LinkExtractor(restrict_xpaths=("//div[@class='categoryRefinementsSection']/ul/li",)), follow=True),
    # match book detail URLs
    Rule(LinkExtractor(restrict_xpaths=("//div[@id='mainResults']/ul/li//h2/..",)), callback="parse_book_detail"),
    # list-page pagination
    Rule(LinkExtractor(restrict_xpaths=("//div[@id='pagn']",)), follow=True),
)
9. 提取下面内容