# Requires Python 2.7
# -*- coding:utf-8 -*-
# Import modules
import urllib2,re,urllib
from bs4 import BeautifulSoup
import json,time
import sys
reload(sys)
sys.setdefaultencoding(\'utf8\')
fout = open(r\'res.txt\', "wb")
tot = 0
# Scraper class definition
class JD:
    """Scraper for JD.com listing pages.

    Walks every page of a listing URL, extracts the product ids, and for
    each product writes its name, prices and basic parameters to the
    module-level file `fout`. Depends on the module globals `basePd`
    (product-page base URL) and `fout` (open output file).
    """

    # Running count of scraped products (class-level, shared by all instances).
    prodNum = 1

    def __init__(self, baseurl, page):
        """Remember the listing base URL and page, and build the page URL.

        :param baseurl: listing URL without the page parameter
        :param page: page number to start from
        """
        self.baseurl = baseurl
        self.page = page
        # Assemble the full listing URL for the requested page.
        self.url = self.baseurl + '&' + 'page=' + str(self.page)

    def getHtml(self, url):
        """Fetch `url` and return the raw response body, retrying on failure.

        BUG FIX: the original retried via unbounded recursion inside a bare
        `except:`, which could exhaust the stack during a long outage; this
        version retries iteratively and only catches Exception.
        """
        while True:
            try:
                request = urllib2.Request(url)
                reponse = urllib2.urlopen(request)
                return reponse.read()
            except Exception:
                # Brief back-off before the next attempt.
                time.sleep(0.1)

    def getNum(self, html):
        """Return the total page count parsed from a listing page.

        The count lives in <span class="p-skip"><em><b>N</b></em></span>.
        Returns None when no such span exists (the original raised
        NameError in that case).
        """
        soup = BeautifulSoup(html)
        items = soup.find_all('span', class_='p-skip')
        pagenum = None
        for item in items:
            pagenum = item.find('em').find('b').string
        return pagenum

    def getIds(self, html):
        """Return the list of product ids linked from a listing page."""
        pattern = re.compile('<a target="_blank" href="//item.jd.com/(.*?).html".*?>')
        return re.findall(pattern, html)

    def getIdByItems(self, id):
        """Return the ids of variant products listed on product `id`'s page."""
        # Build the product-page URL and fetch it.
        url = basePd + str(id) + '.html'
        html = self.getHtml(url)
        soup = BeautifulSoup(html)
        # The variant list lives in a <div class="dd clearfix"> container.
        items = soup.find('div', class_='dd clearfix')
        result = []
        for item in items:
            pattern = re.compile('href="//item.jd.com/(.*?).html".*?>')
            subids = re.findall(pattern, str(item))
            if subids:
                result += subids
        return result

    def getPrice(self, id):
        """Fetch JD price and market price for product `id`, write to fout."""
        url = 'http://p.3.cn/prices/mgets?skuIds=J_' + str(id)
        jsonString = self.getHtml(url)
        jsonObject = json.loads(jsonString.decode())
        # 'p' is the JD sale price, 'm' the (crossed-out) market price.
        price_jd = jsonObject[0]['p']
        price_mk = jsonObject[0]['m']
        fout.write('jd price:' + str(price_jd) + '\n')
        fout.write('market price:' + str(price_mk) + '\n')

    def getImg(self, html, subid):
        """Download the product image for `subid` (currently disabled)."""
        # Disabled in the original source; kept for reference:
        # pattern = re.compile(r'<img id=.*?data-origin="(.*?)" alt=.*?', re.S)
        # items = re.findall(pattern, html)
        # for item in items:
        #     imgurl = 'http:%s' % (item)
        #     urllib.urlretrieve(imgurl, 'd:/temp/jdimg/%s.jpg' % (str(subid)))
        pass

    def getContent(self, html, subid):
        """Write name, prices and basic parameters of product `subid` to fout."""
        soup = BeautifulSoup(html)
        title = soup.find('div', class_='sku-name')
        fout.write('\n-----------------' + str(JD.prodNum) + '--------------------\n')
        try:
            for t in title:
                fout.write('name:' + t.string + '\n')
        except Exception:
            # Pages without a usable title block are skipped entirely.
            return
        time.sleep(1)
        # Prices
        self.getPrice(subid)
        # Basic info block 1: blank out every <p> string (side effect on soup
        # only; nothing from this block is written out).
        items1 = soup.find_all('ul', class_='parameter1 p-parameter-list')
        for item in items1:
            paragraphs = item.findAll('p')
            for paragraph in paragraphs:
                paragraph.string = ""
        # Basic info block 2: write the first <li>'s text of each list.
        items2 = soup.find_all('ul', class_='parameter2 p-parameter-list')
        for item in items2:
            entries = item.findAll('li')
            if len(str(entries[0].string)) > 0:
                fout.write(str(entries[0].string))
                fout.write('\n')
        # Disabled in the original source (write every <li>):
        # for i in entries:
        #     if len(str(i.string)) > 0:
        #         fout.write(str(i.string))
        #         fout.write('\n')
        # Disabled in the original source (specs & packaging section):
        # items3 = soup.find_all('div', class_='Ptable-item')
        # for item in items3:
        #     contents1 = item.findAll('dt')
        #     contents2 = item.findAll('dd')
        #     for i in range(len(contents1)):
        #         if len(str(contents1[i].string)) > 0 and len(str(contents2[i].string)) > 0:
        #             fout.write(contents1[i].string)
        #         if len(str(contents2[i].string)) > 0:
        #             fout.write(str(contents2[i].string))
        #             fout.write('\n')
        JD.prodNum += 1
        print(JD.prodNum)

    def start(self):
        """Crawl every listing page and scrape each product found."""
        # BUG FIX: the original called the global `spider.getHtml` here,
        # coupling the method to one specific instance; use self instead.
        html = self.getHtml(self.url)
        pageNum = self.getNum(html)
        print('doing............')
        print('finish. all %s pages' % pageNum)
        print('doing.........')
        # Loop 1: listing pages.
        for page in range(1, int(pageNum) + 1):
            url = self.baseurl + '&' + 'page=' + str(page)
            html = self.getHtml(url)
            ids = self.getIds(html)
            # Loop 2: products on this page.
            for id in ids:
                urlprod = basePd + str(id) + '.html'
                htmlprod = self.getHtml(urlprod)
                self.getContent(htmlprod, id)
                self.getImg(htmlprod, id)
            # Disabled in the original source (loop 3: variant products):
            # subids = self.getIdByItems(id)
            # for subid in subids:
            #     urlsubprod = basePd + str(subid) + '.html'
            #     subhtml = self.getHtml(urlsubprod)
            #     time.sleep(1)
            #     self.getContent(subhtml, subid)
            #     self.getImg(subhtml, subid)
# Base URL for individual product pages (read by JD.getIdByItems/start).
basePd = 'http://item.jd.com/'
# Entry listing URL (category 9987,653,655 = mobile phones).
baseURL = 'http://list.jd.com/list.html?cat=9987,653,655'
# Build the crawler and start scraping (runs at import time).
spider = JD(baseURL, 1)
spider.start()