milkspider
# -*- coding: utf-8 -*-
import scrapy
import json
import csv
from milk.items import MilkItem

class MilkspiderSpider(scrapy.Spider):
    """Crawl the JD search results for imported milk (进口牛奶).

    Flow:
      1. ``parse``          — extract per-product fields from the search page.
      2. ``parse_detail``   — visited only to build the comment-summary URL
                              (the comment counts are loaded dynamically and
                              are absent from the detail page source).
      3. ``parse_comment``  — parse the JSON comment summary, complete the
                              item, and dump the accumulated list to CSV.
    """

    name = 'milkspider'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://search.jd.com/Search?keyword=%E8%BF%9B%E5%8F%A3%E7%89%9B%E5%A5%B6&enc=utf-8&suggest=3.def.0.V09--12s0,20s0,38s0&wq=%E8%BF%9B%E5%8F%A3&pvid=96ab0296e9ce494fb251b716911d93ec']
    # Class-level accumulator of finished items; a single spider instance
    # runs per crawl, so sharing it this way is safe here.
    data_list = []

    def parse(self, response):
        """Extract one MilkItem per <li class="gl-item"> on the search page."""
        li_list = response.xpath('//li[@class="gl-item"]')
        for li in li_list:
            good_id = li.xpath('./@data-sku').get()  # search relative to the <li> itself
            shop_name = li.xpath('.//a[@class="curr-shop"]/text()').get()

            # The title is split across several <em> text nodes; join, then
            # strip the separator commas and embedded "\n\t" whitespace runs.
            name_parts = li.xpath('.//div[@class="p-name p-name-type-2"]/a/em/text()').getall()
            good_name = ','.join(name_parts).strip().replace(",", "").replace("\n\t", "")

            good_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href').get()
            if good_url is None:
                # No link for this entry (e.g. an ad placeholder) — skip it
                # instead of crashing on None.startswith below.
                continue
            # hrefs are usually protocol-relative ("//item.jd.com/...").
            if not good_url.startswith('https:'):
                good_url = 'https:' + good_url

            price_parts = li.xpath('.//div[@class="p-price"]/strong//text()').getall()
            good_price = ','.join(price_parts).replace(",", "")

            # The comment counts are loaded dynamically and are not present
            # in the search-page source; fetch them via the detail page.
            item = MilkItem()
            item["shop_name"] = shop_name
            item["good_name"] = good_name
            item["good_price"] = good_price
            item["good_id"] = good_id
            item['good_url'] = good_url
            yield scrapy.Request(url=good_url, meta={"item": item}, callback=self.parse_detail)

    def parse_detail(self, response):
        """Build the comment-summary API URL for the product and request it."""
        item = response.meta['item']

        # The comment counts come from a separate JSON endpoint keyed by SKU id.
        comment_info_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + item['good_id']
        yield scrapy.Request(url=comment_info_url, meta={"item": item}, callback=self.parse_comment)

    def parse_comment(self, response):
        """Parse the JSON comment summary, finish the item, and write the CSV."""
        item = response.meta['item']

        # response.body is bytes; decode leniently, then drop any U+FFFD
        # replacement characters so json.loads gets clean text.
        # (renamed from `str`/`dict` — those names shadow builtins)
        body_text = response.body.decode('utf-8', 'replace')
        json_str = body_text.replace('\ufffd', '')
        payload = json.loads(json_str)

        summary = payload['CommentsCount'][0]
        item['total_comment'] = summary['CommentCountStr']
        item['good_comment'] = summary['GoodCountStr']
        item['video_count'] = summary['VideoCountStr']
        item['general_count'] = summary['GeneralCountStr']
        item['poor_count'] = summary['PoorCountStr']

        self.data_list.append(item)

        # Rewrite the whole CSV on every item so the file is always complete,
        # even if the crawl is interrupted.  (A Scrapy item pipeline would be
        # the idiomatic home for this export.)
        with open('./京东进口牛奶.csv', 'w', encoding='utf-8', errors='ignore', newline="") as csvfile:
            fieldnames = ['good_id', 'good_name', 'shop_name', 'good_url', 'total_comment', 'good_comment',
                          'video_count', 'general_count', 'poor_count', 'good_price']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.data_list)

        # Yield only the current item: the original `return self.data_list`
        # re-emitted the whole accumulated list on every callback, duplicating
        # items in any feed export.
        yield item

 

items

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MilkItem(scrapy.Item):
    """One JD imported-milk product, filled in two stages by MilkspiderSpider:
    listing fields from the search page, comment counts from the JSON
    productCommentSummaries endpoint."""
    # Listing data scraped from the search-result page.
    good_id = scrapy.Field()
    good_name = scrapy.Field()
    shop_name = scrapy.Field()
    good_url = scrapy.Field()

    # Comment statistics (string counts as returned by the JD API).
    total_comment = scrapy.Field()
    good_comment = scrapy.Field()
    video_count = scrapy.Field()
    general_count = scrapy.Field()
    poor_count = scrapy.Field()

    # Listed price text from the search page.
    good_price = scrapy.Field()

 

start

from scrapy import cmdline

if __name__ == "__main__":
    # Launch the spider programmatically — equivalent to running
    # `scrapy crawl milkspider` from the project directory.  The guard keeps
    # the crawl from starting as a side effect of merely importing this module.
    cmdline.execute(["scrapy", "crawl", "milkspider"])

 

分类:

技术点:

相关文章: