1 import requests
2 from requests.exceptions import RequestException
3 import re,json
4 import xlwt,xlrd
5
# Accumulator for scraped records: one dict per auction item, filled by
# get_target() and exported by save_to_excel().
DATA = []
# Search keyword to crawl.
KEYWORD = 'python'
# Desktop Chrome user-agent so Taobao serves the normal search page.
HEADERS = {'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome'\
        '/63.0.3239.132 Safari/537.36'}
# Number of result pages to fetch (main() pages with offset*44 items/page).
MAX_PAGE = 10
12
13
def get_target(data_list):
    """Pick the fields of interest out of each auction dict and append
    them to the module-level DATA list.

    Returns True once every item has been processed.
    """
    for auction in data_list:
        DATA.append({
            'title': auction['title'],
            'price': auction['view_price'],
            'sales': auction['view_sales'],
            # A non-zero view_fee marks the listing as non-Tmall.
            'isTmall': '否' if float(auction['view_fee']) else '是',
            'area': auction['item_loc'],
            'name': auction['nick'],
            'url': auction['detail_url'],
        })
    return True
27
28
# Send an HTTP request and return the page source (None on failure).
def get_html(url, *args):
    """Fetch *url* and return the decoded response body.

    On the first call (no extra args) the response cookies are stored in
    the module-level COOKIES so that the later API request (any extra
    positional arg) can replay them.

    Returns None when the request fails or the server answers with an
    HTTP error status.
    """
    try:
        if not args:
            # timeout prevents the crawler from hanging indefinitely.
            response = requests.get(url, headers=HEADERS, timeout=10)
            global COOKIES
            COOKIES = response.cookies  # keep session cookies for the API call
        else:
            response = requests.get(url, headers=HEADERS, cookies=COOKIES,
                                    timeout=10)
        # A 4xx/5xx body is not parseable search data — treat it as a
        # request failure (HTTPError is a RequestException subclass).
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except RequestException:
        print('请求源码出错!')
        return None
43
# Parse the page source and collect the target records.
def parse_html(html, *args):
    """Extract the auction list embedded in *html* and feed it to get_target().

    Without extra args *html* is a search-result page whose data lives in
    the inline ``g_page_config`` JS object; with extra args it is a jsonp
    response from the customized API.

    Silently returns when *html* is None (failed request) or when the
    expected data cannot be located — previously this raised IndexError.
    """
    if html is None:
        # get_html() returns None on request failure; nothing to parse.
        return
    if not args:
        pattern = re.compile(r'g_page_config = (.*?)g_srp_loadCss', re.S)
        matches = re.findall(pattern, html)
        if not matches:
            return
        # Strip the trailing ';' left over from the JS assignment.
        result = matches[0].strip()[:-1]
        # The payload is plain JSON; inspect it with an online viewer if needed.
        content = json.loads(result)
        data_list = content['mods']['itemlist']['data']['auctions']
    else:
        pattern = re.compile(r'{.*}', re.S)
        matches = re.findall(pattern, html)
        if not matches:
            return
        content = json.loads(matches[0])
        data_list = content['API.CustomizedApi']['itemlist']['auctions']

    get_target(data_list)
60
61
def save_to_excel():
    """Write every record in DATA to an .xls workbook named after KEYWORD."""
    f_name = '淘宝%s数据' % KEYWORD
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet(f_name)

    columns = ('title', 'price', 'sales', 'isTmall', 'area', 'name', 'url')
    # Header row.
    for col, field in enumerate(columns):
        sheet.write(0, col, field)
    # One spreadsheet row per scraped record, below the header.
    for row, record in enumerate(DATA, start=1):
        for col, field in enumerate(columns):
            sheet.write(row, col, record[field])

    book.save('淘宝%s数据.xls' % KEYWORD)
82
83
84
def main():
    """Crawl MAX_PAGE search-result pages for KEYWORD and export to Excel."""
    for offset in range(MAX_PAGE):
        # Every page has the regular (inline) search results; the URL was
        # previously built twice — once per branch — with identical code.
        search_url = 'https://s.taobao.com/search?q={}&s={}'.format(
            KEYWORD, offset * 44)
        parse_html(get_html(search_url))

        # The first page additionally loads 12 items asynchronously through
        # the customized API; replay the stored cookies for that request.
        if offset == 0:
            api_url = ('https://s.taobao.com/api?_ksTS=1532524504679_226&callback=jsonp227&ajax=true&m=customized&'
                       'stats_click=search_radio_all:1&q={}').format(KEYWORD)
            parse_html(get_html(api_url, 2), 2)

    save_to_excel()
    print(len(DATA))
104
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()