# -*- coding: utf-8 -*-
import csv
import json

import scrapy

from milk.items import MilkItem


class MilkspiderSpider(scrapy.Spider):
    """Scrape JD.com search results for imported milk and export them to CSV.

    Crawl flow:

    1. ``parse`` reads every product tile on the search-result page and
       builds a partially filled ``MilkItem``.
    2. ``parse_detail`` receives the product page and schedules the request
       for the comment-summary JSON endpoint.
    3. ``parse_comment`` copies the comment counters into the item, remembers
       it in ``data_list`` and yields it.
    4. ``closed`` writes all collected items to ``csv_path`` once the crawl
       has finished.
    """

    name = 'milkspider'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://search.jd.com/Search?keyword=%E8%BF%9B%E5%8F%A3%E7%89%9B%E5%A5%B6&enc=utf-8&suggest=3.def.0.V09--12s0,20s0,38s0&wq=%E8%BF%9B%E5%8F%A3&pvid=96ab0296e9ce494fb251b716911d93ec']

    # Destination file of the CSV export.
    csv_path = './京东进口牛奶.csv'
    # Column order of the CSV export.
    csv_fields = [
        'good_id',
        'good_name',
        'shop_name',
        'good_url',
        'total_comment',
        'good_comment',
        'video_count',
        'general_count',
        'poor_count',
        'good_price',
    ]

    # Base URL of the comment-summary endpoint; the product id is appended.
    comment_summary_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds='

    # (item field, key in the comment-summary JSON) pairs.
    _COMMENT_FIELDS = (
        ('total_comment', 'CommentCountStr'),
        ('good_comment', 'GoodCountStr'),
        ('video_count', 'VideoCountStr'),
        ('general_count', 'GeneralCountStr'),
        ('poor_count', 'PoorCountStr'),
    )

    def __init__(self, *args, **kwargs):
        """Create the spider with its own, initially empty, ``data_list``."""
        super().__init__(*args, **kwargs)
        # Per-instance list: a class-level ``[]`` would be shared by every
        # spider instance created in the same process.
        self.data_list = []

    @staticmethod
    def _join_text(parts):
        """Concatenate extracted text nodes and drop every ASCII comma."""
        return "".join(parts).replace(",", "")

    @staticmethod
    def _absolute_url(response, url):
        """Return *url* as an absolute URL.

        Scheme-relative links (``//host/path``) are forced to ``https``; any
        other form is resolved against the URL of *response*.
        """
        if url.startswith('//'):
            return 'https:' + url
        return response.urljoin(url)

    @staticmethod
    def _decode_body(body):
        """Decode the raw bytes of the comment-summary response.

        Valid UTF-8 is returned unchanged. Otherwise the body is decoded as
        GBK.
        NOTE(review): the previous implementation decoded as UTF-8 with
        ``errors='replace'`` and then turned the two resulting U+FFFD
        characters into ``万``, which suggests the endpoint answers in GBK.
        Confirm against a live response.
        """
        try:
            return body.decode('utf-8')
        except UnicodeDecodeError:
            return body.decode('gbk', 'replace')

    def parse(self, response):
        """Extract one item per product tile and request its detail page.

        Yields:
            scrapy.Request: request for the product page, carrying the
            partially filled item in ``meta['item']``.
        """
        for li in response.xpath('//li[@class="gl-item"]'):
            # XPath expressions are relative to the current <li>.
            good_id = li.xpath('./@data-sku').get()
            good_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href').get()
            if not good_id or not good_url:
                # Without an id or a link the follow-up requests cannot be
                # built, so the tile is skipped instead of crashing later.
                self.logger.debug('Skipping tile without id or url on %s', response.url)
                continue

            name_parts = li.xpath('.//div[@class="p-name p-name-type-2"]/a/em/text()').getall()
            price_parts = li.xpath('.//div[@class="p-price"]/strong//text()').getall()

            # Comment counts are not part of the search page source; they
            # are fetched in the follow-up requests.
            item = MilkItem()
            item["shop_name"] = li.xpath('.//a[@class="curr-shop"]/text()').get()
            item["good_name"] = self._join_text(name_parts).strip().replace("\n\t", "")
            item["good_price"] = self._join_text(price_parts)
            item["good_id"] = good_id
            item['good_url'] = self._absolute_url(response, good_url)
            yield scrapy.Request(
                url=item['good_url'],
                meta={"item": item},
                callback=self.parse_detail,
            )

    def parse_detail(self, response):
        """Schedule the comment-summary request for the item in ``meta``.

        The comments on the product page are loaded dynamically, so the
        counters are requested from the summary endpoint instead.

        Yields:
            scrapy.Request: request for the comment-summary JSON, still
            carrying the item in ``meta['item']``.
        """
        item = response.meta['item']
        # Build the comment URL for this product from its id.
        comment_info_url = self.comment_summary_url + item['good_id']
        yield scrapy.Request(
            url=comment_info_url,
            meta={"item": item},
            callback=self.parse_comment,
        )

    def parse_comment(self, response):
        """Copy the comment counters from the summary JSON into the item.

        The finished item is appended to ``data_list`` (exported by
        ``closed``) and yielded. A response that is not valid JSON or lacks
        the expected keys is logged and the item is dropped.

        Yields:
            MilkItem: the completed item.
        """
        item = response.meta['item']
        try:
            # response.body is bytes; it must be decoded before parsing.
            payload = json.loads(self._decode_body(response.body))
            summary = payload['CommentsCount'][0]
            counters = {field: summary[key] for field, key in self._COMMENT_FIELDS}
        except (ValueError, KeyError, IndexError, TypeError) as err:
            self.logger.warning(
                'Unusable comment summary for %s (%s): %r',
                item['good_id'], response.url, err,
            )
            return

        for field, value in counters.items():
            item[field] = value

        self.data_list.append(item)
        # Yield only this item: returning the whole list would hand every
        # previously collected item to Scrapy again on each callback.
        yield item

    def closed(self, reason):
        """Write every collected item to ``csv_path`` when the spider closes.

        Nothing is written when no item was collected. The file is written
        once here, not rewritten in full after every item.
        """
        if not self.data_list:
            return
        with open(self.csv_path, 'w', encoding='utf-8', errors='ignore', newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.csv_fields)
            writer.writeheader()
            writer.writerows(self.data_list)
# ---- items.py ----
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MilkItem(scrapy.Item):
    """Container for one scraped product and its comment counters.

    Every attribute is a plain ``scrapy.Field()`` with no metadata.
    """

    # Product listing data.
    good_id = scrapy.Field()
    good_name = scrapy.Field()
    shop_name = scrapy.Field()
    good_url = scrapy.Field()

    # Comment counters.
    total_comment = scrapy.Field()
    good_comment = scrapy.Field()
    video_count = scrapy.Field()
    general_count = scrapy.Field()
    poor_count = scrapy.Field()

    # Product price.
    good_price = scrapy.Field()
# ---- start (launcher script) ----
from scrapy import cmdline

# Hand Scrapy's command-line entry point the argument vector for
# "scrapy crawl milkspider".
cmdline.execute(["scrapy", "crawl", "milkspider"])