# 淘宝产品抓取实战 (Taobao product scraping in practice)

#!coding=utf-8
import json
import os
import re
import time

import pandas as pd
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  # Suppress the InsecureRequestWarning that urllib3 emits for unverified HTTPS requests
 
class tb(object):
    """Scraper for the Taobao mobile search endpoint (``s.m.taobao.com``).

    Requests search-result pages for one keyword and writes selected fields
    of every returned item to ``out.csv`` inside the configured directory.
    """

    # Search endpoint; the two ``{}`` placeholders receive the keyword and
    # the page index, in that order.
    SEARCH_URL = (
        'https://s.m.taobao.com/search'
        '?event_submit_do_new_search_auction=1&_input_charset=utf-8&topSearch=1'
        '&atype=b&searchfrom=1&action=home%3Aredirect_app_action&from=1&q={}'
        '&sst=1&n=20&buying=buyitnow&m=api4h5&abtest=18&wlsort=18&style=list'
        '&closeModues=nav%2Cselecthot%2Conesearch&page={}'
    )

    # (CSV column name, key in the item JSON, required?) in output order.
    # Optional keys fall back to '' when an item does not carry them.
    _FIELDS = (
        ('title_名称', 'title', True),
        ('sold_月销量', 'sold', True),
        ('commentCount_评论量', 'commentCount', False),
        ('item_id_商品ID', 'item_id', True),
        ('userId_商家ID', 'userId', True),
        ('nick_商家名称', 'nick', True),
        ('location_商家地址', 'location', True),
        ('pic_path_图片', 'pic_path', True),
        ('itemNumId_商品NID', 'itemNumId', True),
        ('originalPrice_原价', 'originalPrice', True),
        ('price_售价', 'price', True),
        ('category_类别ID', 'category', False),
        ('itemurl_商品链接', 'url', True),
    )

    def __init__(self, path, seach):
        """Create a session preloaded with mobile-browser request headers.

        Args:
            path: Directory in which ``out.csv`` is written.
            seach: Search keyword inserted into the request URL.
        """
        self.path = path    # directory that receives out.csv
        self.seach = seach  # search keyword
        self.s = requests.session()
        headers = {
            'Host': 's.m.taobao.com',
            'Accept-Encoding': 'br, gzip, deflate',
            'Connection': 'keep-alive',
            'Accept': 'application/json',
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/16A366 Safari/605.1.15',
            'Accept-Language': 'zh-cn',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.s.headers.update(headers)  # applied to every request of this session

    def seachdata(self, max_pages=100, delay=1.25):
        """Fetch result pages for the keyword and store their items as CSV.

        Pages are requested in order starting at index 0. The first page that
        yields items creates/overwrites ``out.csv`` with a header row; later
        pages are appended without a header. The loop ends early at the first
        response that contains no items.

        Args:
            max_pages: Maximum number of pages to request.
            delay: Seconds to sleep before each request.

        Raises:
            KeyError: If an item lacks one of the required fields.
        """
        wrote_header = False
        for i in range(max_pages):
            time.sleep(delay)
            url = self.SEARCH_URL.format(self.seach, i)
            print(i)
            # verify=False: TLS certificate verification is disabled here.
            req = self.s.get(url=url, verify=False).text
            try:
                js = json.loads(req)
            except ValueError:
                # Not JSON: skip this page instead of re-using the data of
                # the previous page (or hitting an undefined name).
                print('err')
                continue
            print(js)

            list_item = js.get('listItem') if isinstance(js, dict) else None
            if not list_item:
                break

            # One write per page; writing inside the per-item loop would
            # store the already collected rows again for every item.
            self._save_page(self._extract_columns(list_item),
                            first_page=not wrote_header)
            wrote_header = True

    @classmethod
    def _extract_columns(cls, list_item):
        """Return a column-name -> list-of-values mapping for one page of items."""
        columns = {name: [] for name, _, _ in cls._FIELDS}
        for item in list_item:
            for name, key, required in cls._FIELDS:
                value = item[key] if required else item.get(key, '')
                columns[name].append(value)
        return columns

    def _save_page(self, columns, first_page):
        """Write one page of columns to out.csv (overwrite with header, or append)."""
        out_file = os.path.join(self.path, 'out.csv')
        df = pd.DataFrame(columns)
        if first_page:
            df.to_csv(out_file, index=False, header=True, encoding="GB18030")
        else:
            df.to_csv(out_file, index=False, header=False, mode='a', encoding="GB18030")
 
 
if __name__ == '__main__':
    # Script entry point: scrape results for the keyword "手机" and store
    # them in the E:\taobao directory.
    t = tb(r'E:\taobao', '手机')
    t.seachdata()