Taobao search crawler
1. re 实现 (regex-based implementation)
  1 import requests
  2 from requests.exceptions import RequestException
  3 import re,json
  4 import xlwt,xlrd
  5 
  6 # 数据
  7 DATA = []
  8 KEYWORD = \'python\'
  9 HEADERS = {\'user-agent\':\'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome\'\
 10                         \'/63.0.3239.132 Safari/537.36\'}
 11 MAX_PAGE = 10
 12 
 13 
 14 def get_target(data_list):
 15     for item in data_list:
 16          temp = {
 17         \'title\': item[\'title\'],
 18         \'price\': item[\'view_price\'],
 19         \'sales\': item[\'view_sales\'],
 20         \'isTmall\': \'\' if float(item[\'view_fee\']) else \'\',
 21         \'area\': item[\'item_loc\'],
 22         \'name\': item[\'nick\'],
 23         \'url\': item[\'detail_url\']
 24          }
 25          DATA.append(temp)
 26     return True
 27 
 28 
 29 # 发送http请求,获取网页源码
 30 def get_html(url,*args):
 31     try:
 32         if not args:
 33             response = requests.get(url,headers=HEADERS)
 34             global COOKIES
 35             COOKIES = response.cookies  # 获取cookie
 36         else:
 37             response = requests.get(url,headers=HEADERS,cookies=COOKIES)
 38 
 39         response.encoding = response.apparent_encoding
 40         return response.text
 41     except RequestException:
 42         print(\'请求源码出错!\')
 43 
 44 # 解析源码,得到目标信息
 45 def parse_html(html,*args):
 46     if not args:
 47         pattern = re.compile(r\'g_page_config = (.*?)g_srp_loadCss\',re.S)
 48         # 去掉末尾的\';\'
 49         result = re.findall(pattern, html)[0].strip()[:-1]
 50         # 格式化json,可以用json在线解析工具查看结构
 51         content = json.loads(result)
 52         data_list = content[\'mods\'][\'itemlist\'][\'data\'][\'auctions\']
 53     else:
 54         pattern = re.compile(r\'{.*}\',re.S)
 55         result = re.findall(pattern,html)[0]
 56         content = json.loads(result)
 57         data_list = content[\'API.CustomizedApi\'][\'itemlist\'][\'auctions\']
 58 
 59     get_target(data_list)
 60 
 61 
 62 def save_to_excel():
 63     f_name = \'淘宝%s数据\'%KEYWORD
 64     book = xlwt.Workbook(encoding=\'utf-8\',style_compression=0)
 65     sheet = book.add_sheet(f_name)
 66     sheet.write(0, 0, \'title\')
 67     sheet.write(0, 1, \'price\')
 68     sheet.write(0, 2, \'sales\')
 69     sheet.write(0, 3, \'isTmall\')
 70     sheet.write(0, 4, \'area\')
 71     sheet.write(0, 5, \'name\')
 72     sheet.write(0, 6, \'url\')
 73     for i in range(len(DATA)):
 74         sheet.write(i+1, 0, DATA[i][\'title\'])
 75         sheet.write(i+1, 1, DATA[i][\'price\'])
 76         sheet.write(i+1, 2, DATA[i][\'sales\'])
 77         sheet.write(i+1, 3, DATA[i][\'isTmall\'])
 78         sheet.write(i+1, 4, DATA[i][\'area\'])
 79         sheet.write(i+1, 5, DATA[i][\'name\'])
 80         sheet.write(i+1, 6, DATA[i][\'url\'])
 81     book.save(\'淘宝%s数据.xls\'%KEYWORD)
 82 
 83 
 84 
 85 def main():
 86     for offset in range(MAX_PAGE):
 87         #  首页有12条异步加载的数据 api?
 88         if offset == 0:
 89             url1 = \'https://s.taobao.com/search?q={}&s={}\'.format(KEYWORD,offset*44)
 90             html = get_html(url1)
 91             contents = parse_html(html)
 92 
 93             url2 = \'https://s.taobao.com/api?_ksTS=1532524504679_226&callback=jsonp227&ajax=true&m=customized&\' \
 94                    \'stats_click=search_radio_all:1&q={}\'.format(KEYWORD)
 95             html = get_html(url2,2)
 96             contents = parse_html(html,2)
 97         else:
 98             url = \'https://s.taobao.com/search?q={}&s={}\'.format(KEYWORD,offset*44)
 99             html = get_html(url)
100             contents = parse_html(html)
101 
102     save_to_excel()
103     print(len(DATA))
104 
105 if __name__ == \'__main__\':
106     main()
View Code

 

分类:

技术点:

相关文章: