# Author: xiesongyou
# -*- coding: utf-8 -*-
import scrapy
import urllib.request
import re
import random
from jdgoods.items import JdgoodsItem
from lxml import etree
from scrapy.http import Request


class GoodsSpider(scrapy.Spider):
    """Spider that crawls JD.com book category listing pages.

    ``start_requests`` discovers book-channel URLs from https://book.jd.com/
    with plain ``urllib`` requests, collects ``cat=...`` category ids from
    each channel page, reads the page count for a couple of categories, and
    finally yields a single Scrapy ``Request`` for one fixed category.
    ``parse`` extracts book name / publisher / seller from the listing page
    and fetches price and comment counts per SKU from JD's JSON endpoints.
    """

    name = 'goods'
    allowed_domains = ['jd.com']
    # NOTE(review): class-level mutable containers are shared by every
    # instance; acceptable here because Scrapy creates one spider per crawl.
    url_lst = []    # book-channel page URLs scraped from the portal
    pd_lst = []     # category ids (the "cat=..." query values)
    pd_pages = {}   # category id -> page count (string scraped from HTML)
    #start_urls = ['http://jd.com/']

    # Pool of desktop User-Agent strings; one is chosen at random per
    # request to reduce the chance of being blocked.
    ua = ['Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
          'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
          'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'
          ]

    def start_requests(self):
        """Discover category listing URLs, then yield the first crawl request."""
        # Fetch the book portal page (GBK-encoded) with a random User-Agent.
        req = urllib.request.Request("https://book.jd.com/")
        req.add_header("User-Agent", random.choice(self.ua))
        all_data = urllib.request.urlopen(req).read().decode('gbk', 'ignore')
        # Channel URLs are embedded as JSON fragments of the form
        # "URL":"\/\/channel...","ANCHOR": -- capture the URL part.
        for fragment in re.findall(r'"URL":"(.*?)","ANCHOR":', all_data):
            parts = fragment.split(',')
            # The last comma-separated piece holds the URL; strip the JSON
            # key prefix and unescape the JSON-escaped slashes ("\/").
            url = parts[-1].replace('"URL":"', '').replace('\\/', '/')
            self.url_lst.append("http:" + url)
        # De-duplicate (order is not preserved, matching original behavior).
        self.url_lst = list(set(self.url_lst))

        # Visit each channel page and harvest its "cat=..." category ids.
        cat_pat = re.compile(r'href="//list.*cat=(.*?)[&|"]')
        for channel_url in self.url_lst:
            try:
                req = urllib.request.Request(channel_url)
                req.add_header("User-Agent", random.choice(self.ua))
                sub_data = urllib.request.urlopen(req).read().decode('gbk', 'ignore')
                self.pd_lst.extend(cat_pat.findall(sub_data))
            except Exception:
                # Best-effort: skip channels that fail to download or decode.
                pass

        # For the first two categories, scrape the total page count, which
        # appears as a number in <b>...</b> between two CJK characters
        # (e.g. 共<b>N</b>页).
        pages_pat = re.compile("[\u4e00-\u9fa5]<b>(.*?)</b>[\u4e00-\u9fa5]")
        fetched = 0
        for cat in self.pd_lst:
            this_url = 'https://list.jd.com/list.html?cat=' + cat
            req = urllib.request.Request(this_url)
            req.add_header("User-Agent", random.choice(self.ua))
            html_data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
            self.pd_pages[cat] = "".join(pages_pat.findall(html_data))
            fetched += 1
            if fetched > 1:
                break

        # Build per-page listing URLs (scheduling is still disabled; only
        # the first few are even constructed before breaking out).
        built = 0
        for cat in self.pd_pages:
            #print(cat + ":" + str(self.pd_pages[cat]))
            for page in range(1, int(str(self.pd_pages[cat]))):
                thispage = ('https://list.jd.com/list.html?cat=' + cat
                            + '&page=' + str(page))
                #print(thispage)
                built += 1
                if built > 2:
                    break
                #yield Request(thispage, callback=self.parse)

        # Only a single fixed category is actually crawled for now.
        yield Request("https://list.jd.com/list.html?cat=1713,3260,3339",
                      callback=self.parse)

    def parse(self, response):
        """Extract book data from one listing page plus the price/comment APIs."""
        item = JdgoodsItem()
        try:
            # Current category breadcrumb, joined for display.
            content_lst = response.xpath('//span[@class="curr"]/text()').extract()
            print("---".join(content_lst))

            book_name_lst = response.xpath('//div[@class="p-name"]/a/em/text()').extract()
            book_price_html = response.xpath('//div[@class="p-img"]/a/@href').extract()
            book_pub_lst = response.xpath('//span[@class="p-bi-store"]/a/text()').extract()
            book_seller = response.xpath('//span[@class="curr-shop"]/text()').extract()

            # price endpoint: https://p.3.cn/prices/mgets?&skuIds=J_11481255
            print("书名--出版社----销售商---已下载")
            # SKU ids are the numeric tail of each product href
            # (e.g. //item.jd.com/11481255.html).  '.' before 'html' is now
            # escaped so only a literal ".html" suffix matches.
            sku_pat = re.compile(r'//.*/([0-9].*?)\.html')
            skuIds = ["".join(sku_pat.findall(href)) for href in book_price_html]

            # Fetch the price for each listed book.
            price = []
            price_pat = re.compile(r'"p":"(.*?)"')
            for i in range(len(book_name_lst)):
                req = urllib.request.Request(
                    'https://p.3.cn/prices/mgets?&skuIds=J_' + str(skuIds[i]))
                req.add_header("User-Agent", random.choice(self.ua))
                body = urllib.request.urlopen(req).read().decode()
                price.append("".join(price_pat.findall(body)))
            print("书价格---已下载")

            # comment endpoint:
            # https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=11338771
            comment = []
            comment_pat = re.compile(r'"CommentCount":(.*?),')
            for i in range(len(book_name_lst)):
                req = urllib.request.Request(
                    'https://club.jd.com/comment/productCommentSummaries.action'
                    '?my=pinglun&referenceIds=' + str(skuIds[i]))
                req.add_header("User-Agent", random.choice(self.ua))
                body = urllib.request.urlopen(req).read().decode("utf-8", 'ignore')
                comment.append("".join(comment_pat.findall(body)))
            print("书评论---已下载")

            for n in range(len(book_name_lst)):
                print(book_name_lst[n] + ':' + str(price[n]) + ':' + book_seller[n]
                      + ':' + book_pub_lst[n] + ':' + str(comment[n]))
        except Exception:
            # Previously errors were silently swallowed (`pass`); log them so
            # failures are visible without aborting the crawl.
            self.logger.exception("failed to parse listing page %s", response.url)

        # NOTE(review): item is never populated before being yielded --
        # confirm whether the pipeline expects fields to be filled here.
        yield item

分类:

技术点:

相关文章:

  • 2022-01-12
  • 2021-11-11
  • 2021-12-07
  • 2021-10-16
  • 2021-11-02
  • 2021-12-11
  • 2021-12-05
猜你喜欢
  • 2021-12-19
  • 2021-12-19
  • 2021-12-05
  • 2021-12-05
  • 2020-07-06
  • 2021-12-05
  • 2021-12-25
相关资源
相似解决方案