Python实例之抓取淘宝商品数据(json型数据)并保存为TXT
本实例实现了抓取淘宝网中以‘python’为关键字的搜索结果。经详细查看,商品数据并不在HTML标签中,而是存储于html文档内嵌的js脚本里(赋值给变量 g_page_config),数据类型为JSON。
具体实现代码如下:
"""Scrape Taobao search results for the keyword 'python' and save them as TXT.

The item data is not part of the HTML markup: every result page embeds it as
a JSON object assigned to ``g_page_config`` inside an inline script. This
script fetches the result pages one by one, extracts that JSON, and writes
title / price / seller / location of every item to a UTF-8 text file.
"""
import requests
import re
import json
from urllib.parse import urlencode
from bs4 import BeautifulSoup  # imported by the original script; not used below

SEARCH_URL = 'https://s.taobao.com/search?'
PAGE_COUNT = 100      # number of result pages to request
ITEMS_PER_PAGE = 44   # the 's' offset parameter grows by 44 per page
REQUEST_TIMEOUT = 10  # seconds; requests never times out unless told to

# The JSON literal sits between "g_page_config = " and the following
# "g_srp_loadCss()" call. The parentheses are escaped so they match literally
# (unescaped they form an empty regex group), and \s* tolerates whatever
# newline/indentation separates the two statements.
PAGE_CONFIG_RE = re.compile(r'g_page_config = (.*?);\s*g_srp_loadCss\(\)')


def _build_url(offset):
    """Return the search URL of the page whose first item has index *offset*."""
    # GET parameters copied from the browser address bar; only 's' changes
    # from page to page.
    params = {
        'q': 'python',
        'imgfile': '',
        'js': '1',
        'stats_click': 'search_radio_all:1',
        'initiative_id': 'staobaoz_20170529',
        'ie': 'utf8',
        'bcoffset': '4',
        'ntoffest': '4',
        'p4ppushleft': '1,48',
        's': offset,
    }
    return SEARCH_URL + urlencode(params)


def _extract_auctions(html):
    """Return the list of item dicts embedded in *html*, or [] if there is none.

    An empty list is returned both when the page carries no ``g_page_config``
    assignment and when the decoded JSON lacks the
    mods -> itemlist -> data -> auctions path.
    """
    match = PAGE_CONFIG_RE.search(html)
    if match is None:
        return []
    config = json.loads(match.group(1))
    try:
        # The JSON is deeply nested; the items live four levels down.
        return config['mods']['itemlist']['data']['auctions']
    except (KeyError, TypeError):
        return []


def _format_item(item):
    """Return the four-line text record (plus blank line) for one item dict."""
    return '标题:{}\n价格:{}\n卖家:{}\n地址:{}\n\n'.format(
        item['raw_title'], item['view_price'], item['nick'], item['item_loc'])


def main():
    """Fetch every result page and write all collected records to a TXT file.

    A page that cannot be fetched or contains no item data is reported and
    skipped, so the records gathered from the other pages are still saved.
    """
    records = []
    for page in range(PAGE_COUNT):
        url = _build_url(page * ITEMS_PER_PAGE)
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('第{}页请求失败,已跳过: {}'.format(page + 1, exc))
            continue

        auctions = _extract_auctions(response.text)
        if not auctions:
            print('第{}页未找到商品数据,已跳过'.format(page + 1))
            continue

        for index, item in enumerate(auctions, start=1):
            records.append(_format_item(item))
            print('当前正在读取第{}页的第{}条数据...'.format(page + 1, index))

    total = len(records)
    filename = '淘宝搜索python时的商品数据,共{}条.txt'.format(total)
    # The context manager closes the file even if the write fails.
    with open(filename, 'w', encoding='UTF-8') as out:
        out.write(''.join(records))
        print('正在保存。。。')
    print('保存完毕!共{}条数据'.format(total))


if __name__ == '__main__':
    main()