# Author: xiesongyou
# -*- coding: utf-8 -*-
import scrapy
import urllib.request
import re
import random
from jdgoods.items import JdgoodsItem
from lxml import etree
from scrapy.http import Request


class GoodsSpider(scrapy.Spider):
    """Spider that crawls JD.com book category listing pages.

    ``start_requests`` discovers book-channel URLs from https://book.jd.com/
    with plain ``urllib`` requests, collects ``cat=...`` category ids from
    each channel page, reads the page count for a couple of categories, and
    finally yields a single Scrapy ``Request`` for one fixed category.
    ``parse`` extracts book name / publisher / seller from the listing page
    and fetches price and comment counts per SKU from JD's JSON endpoints.
    """

    name = 'goods'
    allowed_domains = ['jd.com']
    # NOTE(review): class-level mutable containers are shared by every
    # instance; acceptable here because Scrapy creates one spider per crawl.
    url_lst = []    # book-channel page URLs scraped from the portal
    pd_lst = []     # category ids (the "cat=..." query values)
    pd_pages = {}   # category id -> page count (string scraped from HTML)
    #start_urls = ['http://jd.com/']

    # Pool of desktop User-Agent strings; one is chosen at random per
    # request to reduce the chance of being blocked.
    ua = ['Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
          'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
          'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'
          ]

    def start_requests(self):
        """Discover category listing URLs, then yield the first crawl request."""
        # Fetch the book portal page (GBK-encoded) with a random User-Agent.
        req = urllib.request.Request("https://book.jd.com/")
        req.add_header("User-Agent", random.choice(self.ua))
        all_data = urllib.request.urlopen(req).read().decode('gbk', 'ignore')
        # Channel URLs are embedded as JSON fragments of the form
        # "URL":"\/\/channel...","ANCHOR": -- capture the URL part.
        for fragment in re.findall(r'"URL":"(.*?)","ANCHOR":', all_data):
            parts = fragment.split(',')
            # The last comma-separated piece holds the URL; strip the JSON
            # key prefix and unescape the JSON-escaped slashes ("\/").
            url = parts[-1].replace('"URL":"', '').replace('\\/', '/')
            self.url_lst.append("http:" + url)
        # De-duplicate (order is not preserved, matching original behavior).
        self.url_lst = list(set(self.url_lst))

        # Visit each channel page and harvest its "cat=..." category ids.
        cat_pat = re.compile(r'href="//list.*cat=(.*?)[&|"]')
        for channel_url in self.url_lst:
            try:
                req = urllib.request.Request(channel_url)
                req.add_header("User-Agent", random.choice(self.ua))
                sub_data = urllib.request.urlopen(req).read().decode('gbk', 'ignore')
                self.pd_lst.extend(cat_pat.findall(sub_data))
            except Exception:
                # Best-effort: skip channels that fail to download or decode.
                pass

        # For the first two categories, scrape the total page count, which
        # appears as a number in <b>...</b> between two CJK characters
        # (e.g. 共<b>N</b>页).
        pages_pat = re.compile("[\u4e00-\u9fa5]<b>(.*?)</b>[\u4e00-\u9fa5]")
        fetched = 0
        for cat in self.pd_lst:
            this_url = 'https://list.jd.com/list.html?cat=' + cat
            req = urllib.request.Request(this_url)
            req.add_header("User-Agent", random.choice(self.ua))
            html_data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
            self.pd_pages[cat] = "".join(pages_pat.findall(html_data))
            fetched += 1
            if fetched > 1:
                break

        # Build per-page listing URLs (scheduling is still disabled; only
        # the first few are even constructed before breaking out).
        built = 0
        for cat in self.pd_pages:
            #print(cat + ":" + str(self.pd_pages[cat]))
            for page in range(1, int(str(self.pd_pages[cat]))):
                thispage = ('https://list.jd.com/list.html?cat=' + cat
                            + '&page=' + str(page))
                #print(thispage)
                built += 1
                if built > 2:
                    break
                #yield Request(thispage, callback=self.parse)

        # Only a single fixed category is actually crawled for now.
        yield Request("https://list.jd.com/list.html?cat=1713,3260,3339",
                      callback=self.parse)

    def parse(self, response):
        """Extract book data from one listing page plus the price/comment APIs."""
        item = JdgoodsItem()
        try:
            # Current category breadcrumb, joined for display.
            content_lst = response.xpath('//span[@class="curr"]/text()').extract()
            print("---".join(content_lst))

            book_name_lst = response.xpath('//div[@class="p-name"]/a/em/text()').extract()
            book_price_html = response.xpath('//div[@class="p-img"]/a/@href').extract()
            book_pub_lst = response.xpath('//span[@class="p-bi-store"]/a/text()').extract()
            book_seller = response.xpath('//span[@class="curr-shop"]/text()').extract()

            # price endpoint: https://p.3.cn/prices/mgets?&skuIds=J_11481255
            print("书名--出版社----销售商---已下载")
            # SKU ids are the numeric tail of each product href
            # (e.g. //item.jd.com/11481255.html).  '.' before 'html' is now
            # escaped so only a literal ".html" suffix matches.
            sku_pat = re.compile(r'//.*/([0-9].*?)\.html')
            skuIds = ["".join(sku_pat.findall(href)) for href in book_price_html]

            # Fetch the price for each listed book.
            price = []
            price_pat = re.compile(r'"p":"(.*?)"')
            for i in range(len(book_name_lst)):
                req = urllib.request.Request(
                    'https://p.3.cn/prices/mgets?&skuIds=J_' + str(skuIds[i]))
                req.add_header("User-Agent", random.choice(self.ua))
                body = urllib.request.urlopen(req).read().decode()
                price.append("".join(price_pat.findall(body)))
            print("书价格---已下载")

            # comment endpoint:
            # https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=11338771
            comment = []
            comment_pat = re.compile(r'"CommentCount":(.*?),')
            for i in range(len(book_name_lst)):
                req = urllib.request.Request(
                    'https://club.jd.com/comment/productCommentSummaries.action'
                    '?my=pinglun&referenceIds=' + str(skuIds[i]))
                req.add_header("User-Agent", random.choice(self.ua))
                body = urllib.request.urlopen(req).read().decode("utf-8", 'ignore')
                comment.append("".join(comment_pat.findall(body)))
            print("书评论---已下载")

            for n in range(len(book_name_lst)):
                print(book_name_lst[n] + ':' + str(price[n]) + ':' + book_seller[n]
                      + ':' + book_pub_lst[n] + ':' + str(comment[n]))
        except Exception:
            # Previously errors were silently swallowed (`pass`); log them so
            # failures are visible without aborting the crawl.
            self.logger.exception("failed to parse listing page %s", response.url)

        # NOTE(review): item is never populated before being yielded --
        # confirm whether the pipeline expects fields to be filled here.
        yield item

分类:

技术点:

相关文章:

  • 2022-01-12
  • 2021-11-11
  • 2021-12-07
  • 2021-10-16
  • 2021-11-02
  • 2021-12-11
  • 2021-12-05
猜你喜欢
  • 2021-12-19
  • 2021-12-19
  • 2021-12-05
  • 2021-12-05
  • 2020-07-06
  • 2021-12-05
  • 2021-12-25
相关资源
相似解决方案