爬取淘宝模特信息并自动保存图片

环境：Ubuntu 16.04
工具：Python 3.5+，Scrapy 1.1，PyCharm
import scrapy, re, os, lxml, urllib.request
from scrapy.http import Request
from bs4 import BeautifulSoup


class TaobaoMMSpider(scrapy.Spider):
    """Crawl Taobao MM model profiles and save their info and album photos.

    The crawl is a chain of callbacks; state is handed from one callback to
    the next through ``Request.meta``:

    ``parse`` -> ``everypage`` -> ``infocard`` -> ``infoprocess`` ->
    ``album`` -> ``allimage`` -> ``image`` -> ``imageprocess`` -> ``saveimage``

    On disk this produces one directory per model under ``mainposition``
    containing a ``.txt`` info file and one sub-directory per album holding
    the downloaded ``.jpg`` files.
    """

    name = 'TaobaoMM'
    start_urls = ['https://mm.taobao.com/json/request_top_list.htm?page=1']
    # Set this to the directory where the downloaded data should be saved.
    mainposition = '/media/liuyu/0009F608000B7B40/TaobaoMM/'
    # Number of ranking-list pages to crawl. The default of 1 keeps the
    # original behaviour; set to None to crawl every page the site reports.
    max_pages = 1
    # Socket timeout (seconds) for each blocking image download, so a stalled
    # connection cannot hang the crawl forever.
    image_timeout = 30

    # Patterns are compiled once here instead of on every callback invocation.
    _ALBUM_IDS_RE = re.compile('.*?user_id=(.*?)&album_id=(.*?)&album_flag')
    _TOTAL_PAGE_RE = re.compile('.*?"totalPage":"(.*?)"')
    _PIC_URL_RE = re.compile('.*?"picUrl":"(.*?)"')
    _IMAGE_NAME_RE = re.compile('.*?imgextra/.*?/(.*?)/')

    @staticmethod
    def _text(node):
        """Return ``node.string`` as a plain ``str``, or ``''`` when unavailable.

        Converting to ``str`` also avoids storing BeautifulSoup string objects
        (which reference their parse tree) in ``Request.meta``.
        """
        text = node.string if node is not None else None
        return str(text) if text is not None else ''

    def parse(self, response):
        """Handle the first list page: read the total page count and request list pages.

        At most ``max_pages`` pages are requested (all of them when
        ``max_pages`` is None). Yields one request per list page, handled by
        ``everypage``.
        """
        content = BeautifulSoup(response.text, "lxml")
        total_input = content.find('input', id="J_Totalpage")
        raw_total = total_input.get('value') if total_input is not None else None
        try:
            total_pages = int(raw_total)
        except (TypeError, ValueError):
            self.logger.warning(
                'Could not read the total page count from %s; crawling one page',
                response.url)
            total_pages = 1
        if self.max_pages is not None:
            total_pages = min(total_pages, self.max_pages)

        url = 'https://mm.taobao.com/json/request_top_list.htm?page='
        for page in range(1, total_pages + 1):
            yield Request(url + str(page), callback=self.everypage)

    def everypage(self, response):
        """Handle one list page: create a directory per model and request her page.

        Yields a request for each model's page (handled by ``infocard``) with
        ``age`` and ``seconddir`` (the model's directory) in ``meta``.
        """
        content = BeautifulSoup(response.text, "lxml")
        # Joining with '' guarantees a trailing separator even if the user
        # configured ``mainposition`` without one.
        base_dir = os.path.join(self.mainposition, '')
        for entry in content.find_all('div', class_="personal-info"):
            link = entry.find('a', class_="lady-name")
            name = self._text(link)
            href = link.get('href') if link is not None else None
            if not name or not href:
                self.logger.warning(
                    'Skipping a model entry without name/link on %s', response.url)
                continue
            seconddir = base_dir + name
            # exist_ok: re-running the spider must not fail on existing folders.
            os.makedirs(seconddir, exist_ok=True)
            age = self._text(entry.find('strong'))
            yield Request('https:' + href, callback=self.infocard,
                          meta={'age': age, 'seconddir': seconddir})

    def infocard(self, response):
        """Handle the model card page: extract the model id and album link.

        Builds the model-info URL from the id and yields a request for it
        (handled by ``infoprocess``), forwarding ``seconddir``, ``age`` and the
        album URL in ``meta``.
        """
        content = BeautifulSoup(response.text, "lxml")
        id_input = content.find('input', id="J_MmuserId")
        modelid = id_input.get('value') if id_input is not None else None
        menu = content.find('ul', class_="mm-p-menu")
        album_link = menu.find('a') if menu is not None else None
        album_href = album_link.get('href') if album_link is not None else None
        if not modelid or not album_href:
            self.logger.warning('Unexpected model card layout at %s', response.url)
            return

        infourl = 'https://mm.taobao.com/self/info/model_info_show.htm?user_id=' + modelid
        albumurl = 'https:' + album_href
        yield Request(infourl, callback=self.infoprocess,
                      meta={'seconddir': response.meta['seconddir'],
                            'albumurl': albumurl,
                            'age': response.meta['age']})

    def infoprocess(self, response):
        """Handle the model-info page: write the info file, then go to the album page.

        The info file is skipped (with a warning) when the page does not
        contain the expected list items; the album request is yielded either
        way and handled by ``album``.
        """
        seconddir = response.meta['seconddir']
        albumurl = response.meta['albumurl']
        age = response.meta['age']
        content = BeautifulSoup(response.text, "lxml")
        modelinfo = content.find('ul', class_="mm-p-info-cell clearfix")
        info = modelinfo.find_all('li') if modelinfo is not None else []
        # Items 0-5 and 7-11 are read below, so at least 12 are required.
        if len(info) >= 12:
            self._write_info(seconddir, age, info)
        else:
            self.logger.warning('Unexpected model info layout at %s', response.url)
        yield Request(albumurl, callback=self.album, meta={'seconddir': seconddir})

    def _write_info(self, seconddir, age, info):
        """Write the model's text fields to ``<seconddir>/<name>.txt``."""
        name = self._text(info[0].find('span'))
        lines = ['age' + age]
        lines.extend(
            self._text(info[i].find('span')).replace("\xa0", "") for i in range(6))
        # Item 6 is intentionally not written, matching the original layout.
        lines.extend(self._text(info[i].find('p')) for i in (7, 8))
        lines.append('BWH:  ' + self._text(info[9].find('p')))
        lines.append('cup_size:  ' + self._text(info[10].find('p')))
        lines.append('shoe_size:  ' + self._text(info[11].find('p')))
        # Explicit UTF-8: the content is Chinese text and must not depend on
        # the platform's default locale encoding.
        with open(seconddir + '/' + name + '.txt', 'w', encoding='utf-8') as file:
            file.write('\n'.join(lines) + '\n')

    def album(self, response):
        """Handle the album frame page: extract the model id and request the album list.

        Yields a request for the album list (handled by ``allimage``) with the
        list URL and ``seconddir`` in ``meta``.
        """
        content = BeautifulSoup(response.text, "lxml")
        id_input = content.find('input', id="J_userID")
        modelid = id_input.get('value') if id_input is not None else None
        if not modelid:
            self.logger.warning('No model id found on album page %s', response.url)
            return
        # NOTE(review): the query parameter is spelled "user_id%20" (with an
        # encoded trailing space). It is kept verbatim from the original code;
        # confirm against the live endpoint before changing it.
        url = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=' + modelid
        yield Request(url, callback=self.allimage,
                      meta={'url': url, 'seconddir': response.meta['seconddir']})

    def allimage(self, response):
        """Handle the album list page: read the page count and request every page.

        The count is taken from the ``value`` of the first ``input`` element.
        Yields one request per album-list page, handled by ``image``.
        """
        url = response.meta['url']
        content = BeautifulSoup(response.text, "lxml")
        first_input = content.find('input')
        raw_count = first_input.get('value') if first_input is not None else None
        try:
            page_count = int(raw_count)
        except (TypeError, ValueError):
            self.logger.warning('No album page count found at %s', response.url)
            return
        for page in range(1, page_count + 1):
            yield Request(url + '&page=' + str(page), callback=self.image,
                          meta={'seconddir': response.meta['seconddir']})

    def image(self, response):
        """Handle one album-list page: create a directory per album and request it.

        Yields one request per album (handled by ``imageprocess``) with the
        page-less photo-list URL and ``thirddir`` (the album directory) in
        ``meta``.
        """
        seconddir = response.meta['seconddir']
        content = BeautifulSoup(response.text, "lxml")
        for cell in content.find_all('div', class_="mm-photo-cell-middle"):
            heading = cell.find('h4')
            link = heading.a if heading is not None else None
            if link is None or link.string is None:
                self.logger.warning('Skipping an album without title on %s', response.url)
                continue
            albumname = link.string.replace(" ", "").replace("\n", "")
            matches = self._ALBUM_IDS_RE.findall(link.get('href') or '')
            if not matches:
                # Without this guard the ids of the previous album would be
                # reused, downloading the wrong photos into this directory.
                self.logger.warning('No album ids in link for album %r', albumname)
                continue
            # The last match wins, as in the original loop over all matches.
            modelid, albumid = matches[-1]
            thirddir = seconddir + '/' + albumname
            os.makedirs(thirddir, exist_ok=True)
            imageurl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=' + modelid + '&album_id=' + albumid + '&page='
            yield Request(imageurl, callback=self.imageprocess,
                          meta={'url': imageurl, 'thirddir': thirddir})

    def imageprocess(self, response):
        """Handle an album's photo list: read ``totalPage`` and request every page.

        Yields one request per photo-list page, handled by ``saveimage``.
        """
        url = response.meta['url']
        found = self._TOTAL_PAGE_RE.findall(response.text)
        try:
            page_count = int(found[0])
        except (IndexError, ValueError):
            self.logger.warning('No totalPage value found at %s', response.url)
            return
        for page in range(1, page_count + 1):
            yield Request(url + str(page), callback=self.saveimage,
                          meta={'thirddir': response.meta['thirddir']})

    def saveimage(self, response):
        """Handle one photo-list page: download every picture into the album directory.

        Each ``picUrl`` found in the response is fetched with ``urllib`` and
        written to ``<thirddir>/<image name>.jpg``. A failed download is logged
        and skipped so it does not abort the remaining pictures.

        NOTE(review): ``urlopen`` is a blocking call made from inside a Scrapy
        callback; it is kept as in the original design.
        """
        thirddir = response.meta['thirddir']
        for imageurl in self._PIC_URL_RE.findall(response.text):
            name_matches = self._IMAGE_NAME_RE.findall(imageurl)
            if not name_matches:
                self.logger.warning('Cannot derive a file name from %s', imageurl)
                continue
            url = 'https:' + imageurl
            print(url)
            try:
                # The context manager closes the HTTP response (the original
                # leaked it).
                with urllib.request.urlopen(url, timeout=self.image_timeout) as remote:
                    data = remote.read()
            except OSError as err:  # URLError and socket timeouts are OSErrors
                self.logger.warning('Failed to download %s: %s', url, err)
                continue
            with open(thirddir + '/' + name_matches[0] + '.jpg', 'wb') as file:
                file.write(data)
运行结果：