# JD.com product-list scraper (blog post by qscqesze).
# Requires Python 2.7.

# -*- coding:utf-8 -*-  

#导入模块
import urllib2,re,urllib
from bs4 import BeautifulSoup
import json,time
import sys  
reload(sys)  
sys.setdefaultencoding(\'utf8\') 

fout = open(r\'res.txt\', "wb")
tot = 0

#定义抓取类
class JD:
    #记录抓取产品个数
    prodNum = 1
    #初始化参数
    def __init__(self,baseurl,page):
        self.baseurl = baseurl
        self.page = page
        #拼装成url
        self.url = self.baseurl+\'&\'+\'page=\'+str(self.page)

    def getHtml(self,url):
        try:
            #请求抓取对象
            request = urllib2.Request(url)
            #响应对象
            reponse = urllib2.urlopen(request)
            #读取源代码
            html = reponse.read()
            #返回源代码
        except:
            time.sleep(0.1)
            return self.getHtml(url)
        return html


    #获取总页数
    def getNum(self,html):
        #封装成BeautifulSoup对象
        soup = BeautifulSoup(html)
        #定位到总页数节点
        items = soup.find_all(\'span\',class_=\'p-skip\')
        #获取总页数
        for item in items:
            pagenum = item.find(\'em\').find(\'b\').string
        return pagenum

    #获取所有产品id列表
    def getIds(self,html):
        #生成匹配规则
        pattern =  re.compile(\'<a target="_blank" href="//item.jd.com/(.*?).html".*?>\')
        #查询匹配对象
        items = re.findall(pattern,html)
        return items

    #根据产品id获取同款产品列表
    def getIdByItems(self,id):
        #拼装成url
        url = basePd+str(id)+\'.html\'
        #调用抓取函数返回源代码
        html = self.getHtml(url)
        # 封装成BeautifulSoup对象
        soup = BeautifulSoup(html)
        #查询匹配对象
        items = []
        items = soup.find(\'div\',class_=\'dd clearfix\')
        l = []
        #生成列表

        for item in items:
            pattern = re.compile(\'href="//item.jd.com/(.*?).html".*?>\')
            id = re.findall(pattern,str(item))
            if id:
                l += id
        return l

    #获取产品价格
    def getPrice(self,id):
        url = \'http://p.3.cn/prices/mgets?skuIds=J_\'+str(id)
        jsonString = self.getHtml(url)
        jsonObject = json.loads(jsonString.decode())
        price_jd = jsonObject[0][\'p\']
        price_mk = jsonObject[0][\'m\']
        fout.write(\'jd price:\'+str(price_jd)+\'\n\')
        fout.write(\'market price:\'+str(price_mk)+\'\n\')

    #获取产品图片
    def getImg(self,html,subid):
        \'\'\'
        pattern = re.compile(r\'<img id=.*?data-origin="(.*?)" alt=.*?\', re.S)
        items = re.findall(pattern, html)
        for item in items:
            imgurl = \'http:%s\' % (item)
            urllib.urlretrieve(imgurl, \'d:/temp/jdimg/%s.jpg\' % (str(subid)))
            \'\'\'

    #获取内容
    def getContent(self,html,subid):
        soup = BeautifulSoup(html)
        title = soup.find(\'div\',class_=\'sku-name\')
        fout.write(\'\n-----------------\'+ str(JD.prodNum) +\'--------------------\n\')
        try:
            for t in title:
                fout.write(\'name:\'+t.string+\'\n\')
        except:
            return
        time.sleep(1)
        #价格
        self.getPrice(subid)
        #编码
        items1 = soup.find_all(\'ul\',class_=\'parameter1 p-parameter-list\')
        #商品基本信息
        for item in items1:
            p = item.findAll(\'p\')
            for i in p:
                i.string=""
        # 商品基本信息
        items2 = soup.find_all(\'ul\', class_=\'parameter2 p-parameter-list\')
        for item in items2:
            p = item.findAll(\'li\')
            if len(str(p[0].string))>0:
                fout.write(str(p[0].string))
            fout.write(\'\n\')
            \'\'\'
            for i in p:
                if len(str(i.string))>0:
                    fout.write(str(i.string))
                fout.write(\'\n\')
            \'\'\'
        #规格与包装
        \'\'\'
        items3 = soup.find_all(\'div\',class_=\'Ptable-item\')
        for item in items3:
            contents1 = item.findAll(\'dt\')
            contents2 = item.findAll(\'dd\')
            for i in range(len(contents1)):
                if len(str(contents1[i].string))>0 and len(str(contents2[i].string))>0:
                    fout.write(contents1[i].string)
                    if len(str(contents2[i].string))>0:
                        fout.write(str(contents2[i].string))
                    fout.write(\'\n\')
        \'\'\'
        JD.prodNum += 1
        print JD.prodNum

    #启动抓取程序
    def start(self):
        html = spider.getHtml(self.url)
        pageNum = self.getNum(html)
        print \'doing............\'
        #time.sleep(3)
        print \'finish. all\',pageNum,\'pages\'
        #time.sleep(1)
        print \'doing.........\'
        #循环1--页数
        for page in range(1,int(pageNum)+1):
            url = self.baseurl+\'&\'+\'page=\'+str(page)
            html = self.getHtml(url)
            ids = self.getIds(html)
            #循环2--产品列表
            for id in ids:
                urlprod = basePd+str(id)+\'.html\'
                htmlprod = self.getHtml(urlprod)
                \'\'\'
                subids = self.getIdByItems(id)
                \'\'\'
                self.getContent(htmlprod,id)
                self.getImg(htmlprod,id)
                \'\'\'
                #循环3--产品组列表
                for subid in subids:
                    urlsubprod = basePd+str(subid)+\'.html\'
                    subhtml = self.getHtml(urlsubprod)
                    time.sleep(1)
                    self.getContent(subhtml,subid)
                    self.getImg(subhtml,subid)
                \'\'\'


#产品列表base页
basePd  = \'http://item.jd.com/\'
#抓取入口URL
baseURL = \'http://list.jd.com/list.html?cat=9987,653,655\'
#生成爬虫抓取对象
spider = JD(baseURL,1)

#开始抓取
spider.start()

# Categories:

# Key techniques:

# Related articles: