Knight02

代码如下:

 1 """
 2 Created on Thu Jun 10 21:42:17 2021
 3 
 4 @author: 泥烟
 5 
 6 本爬虫可实现指定商品,指定页数的信息爬取,仅用来学习
 7 具有时效性(cookie会过期,用的时候手动改一下即可)
 8 """
 9 
10 
11 import requests
12 import re
13 import csv
14 import time
15 
# Running serial number shared across pages so items are numbered continuously.
count = 1

# Step 1: submit the product-search request and fetch each result page.
def getHTMLText(url):
    """Fetch *url* and return the decoded page body, or "" on any failure.

    The Taobao search endpoint needs a logged-in session cookie; paste a
    fresh value into the 'cookie' header below before running (it expires).
    """
    headers = {'cookie': '',
               'User-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        # Guess the real encoding from the body rather than trusting the header.
        r.encoding = r.apparent_encoding
        return r.text
    # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
    except requests.RequestException:
        # Network/HTTP errors degrade to an empty page; parsePage finds nothing.
        return ""
28 
# Step 2: for each page, extract serial number, title and price of every item.
def parsePage(ilt, html, page):
    """Append ``[serial, price, title]`` rows parsed from *html* onto *ilt*.

    Prices and titles are pulled from the JSON blob embedded in a Taobao
    search page.  *page* is kept for interface compatibility but unused.
    Increments the module-level ``count`` so numbering spans pages.
    """
    global count
    try:
        # Capture groups replace the old eval() on scraped text (a code-
        # injection risk) and the split(':') that truncated titles with colons.
        prices = re.findall(r'"view_price":"([\d.]*)"', html)
        titles = re.findall(r'"raw_title":"(.*?)"', html)
        # zip() tolerates unequal list lengths instead of raising IndexError.
        for price, title in zip(prices, titles):
            ilt.append([count, price, title])
            count += 1
    except TypeError:
        # A non-string html (e.g. None from a failed fetch) yields no rows.
        pass
44 
def main():
    """Prompt for a product name and page count, scrape, and save to CSV.

    Writes the collected rows to ``<product>.csv`` in the working directory.
    """
    print('输入要爬取的商品名字:')
    goods = input()
    print("输入要爬取的页数(仅做练习,请尽量少于10页):")
    depth = int(input())
    basic_url = 'https://s.taobao.com/search?q=' + goods
    uList = []
    header = ["序号", "价格", "商品名称"]

    for i in range(depth):
        try:
            # Taobao paginates 44 items per page via the 's' offset parameter.
            url = basic_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(uList, html, i)
            # Restored the leading 第 that was lost from the progress message.
            print("第" + str(i + 1) + "页爬取成功")
            # Be polite to the server between page requests.
            time.sleep(0.5)
        except Exception:
            # Skip a broken page and keep scraping the remaining ones.
            continue

    # Step 3: save the results; the file is named after the product.
    filename = goods + ".csv"
    # 'w' (was 'a') so reruns don't accumulate duplicate headers and stale
    # rows; utf-8-sig keeps Chinese titles readable when opened in Excel.
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(uList)
70 
# Script entry point: run the scraper, then wait for Enter so a console
# window opened by double-click does not vanish before output is read.
if __name__ == '__main__':
    main()
    print("输入回车退出...")
    input()

 

分类:

技术点:

相关文章: