代码如下:
"""
Created on Thu Jun 10 21:42:17 2021

@author: 泥烟

Scrape Taobao search results for a given product keyword and a given number
of result pages, then save index, price and title of every item to a CSV
file named after the keyword. For learning purposes only.

The request cookie is time-sensitive: it expires, so replace the placeholder
in ``getHTMLText`` with a fresh one before running.
"""

import csv
import json
import os
import re
import time
from urllib.parse import urlencode

# Running 1-based index of the next item to be recorded. It is shared by all
# ``parsePage`` calls so numbering continues across pages.
count = 1

# Offset step passed as the ``s`` query parameter for consecutive pages
# (presumably the number of results per page -- verify against the site).
PAGE_SIZE = 44

# Pause between page requests, in seconds.
REQUEST_DELAY = 0.5

# Capture only the value part of the embedded ``"key":"value"`` pairs.
# The title pattern accepts backslash escapes so an escaped quote inside a
# title does not end the match early.
_PRICE_RE = re.compile(r'"view_price":"([\d.]*)"')
_TITLE_RE = re.compile(r'"raw_title":"((?:[^"\\\n]|\\.)*)"')

# Characters that are not allowed in file names on common platforms.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


# Step 1: submit the search request and fetch one result page.
def getHTMLText(url: str) -> str:
    """Download *url* and return the response body as text.

    Returns an empty string when the request fails (connection error,
    timeout or non-2xx status), so callers can treat "" as "nothing fetched".
    """
    # Imported here so the parsing helpers stay importable (and testable)
    # on machines where the third-party ``requests`` package is missing.
    import requests

    headers = {'cookie': '略',  # placeholder: paste a valid cookie here
               'User-agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        # Let requests guess the encoding from the body instead of trusting
        # the (possibly missing) charset header.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return ""


def _decode_json_string(raw: str) -> str:
    """Decode the inside of a JSON string literal; return *raw* if invalid."""
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw


# Step 2: extract index, price and title of every item on a page.
def parsePage(ilt: list, html: str, page: int) -> None:
    """Append ``[index, price, title]`` rows found in *html* to *ilt*.

    ilt:  list that receives the rows; it is modified in place.
    html: page source returned by ``getHTMLText`` (may be empty).
    page: zero-based page number; currently unused, kept for callers.

    Price is kept as the string found in the page. The index comes from the
    module-level ``count`` and keeps increasing across calls.
    """
    global count
    if not html:
        return

    prices = _PRICE_RE.findall(html)
    titles = _TITLE_RE.findall(html)
    # zip() pairs prices with titles and stops at the shorter list, so a
    # page with mismatched counts cannot raise IndexError.
    for price, raw_title in zip(prices, titles):
        ilt.append([count, price, _decode_json_string(raw_title)])
        count += 1


def _safe_filename(name: str) -> str:
    """Replace characters that are illegal in file names with underscores."""
    return _UNSAFE_FILENAME_CHARS.sub('_', name)


def main():
    """Ask for keyword and page count, scrape the pages, write the CSV."""
    print('输入要爬取的商品名字:')
    goods = input()
    print("输入要爬取的页数(仅做练习,请尽量少于10页):")
    try:
        depth = int(input())
    except ValueError:
        print("页数必须是整数")
        return

    uList = []
    header = ["序号", "价格", "商品名称"]

    for i in range(depth):
        # urlencode escapes the keyword so spaces, '&', '#' etc. cannot
        # corrupt the query string.
        query = urlencode({'q': goods, 's': PAGE_SIZE * i})
        html = getHTMLText('https://s.taobao.com/search?' + query)

        rows_before = len(uList)
        parsePage(uList, html, i)
        # Report success only when the page actually yielded items.
        if len(uList) > rows_before:
            print("第" + str(i + 1) + "页爬取成功")
        else:
            print("第" + str(i + 1) + "页爬取失败")
        time.sleep(REQUEST_DELAY)

    # Step 3: save the rows to a file named after the keyword.
    filename = _safe_filename(goods) + ".csv"
    # The file is opened in append mode, so write the header row only when
    # the file is new or empty; otherwise every run would add another one.
    needs_header = (not os.path.exists(filename)
                    or os.path.getsize(filename) == 0)
    # utf-8-sig keeps Chinese titles readable in spreadsheet programs and
    # avoids encode errors under non-UTF-8 platform default encodings.
    with open(filename, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerows(uList)


if __name__ == '__main__':
    main()
    print("输入回车退出...")
    input()