股票数据爬虫
老虎社区
'https://www.laohu8.com/stock/'
百度股票不行了
import requests
import re
from bs4 import BeautifulSoup
import collections
import traceback


def getHtmlText(url):
    """Fetch *url* and return its decoded HTML text, or "" on any request failure."""
    try:
        headers = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # Hard-code utf-8: computing r.apparent_encoding is very slow, and the
        # GB2312 guess it produces fails to decode these pages correctly.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "page unavailable, skip it".
        return ""


def getstocklist(stock_codes, stock_url):
    """Append every 6-digit stock code found in <a href> links on *stock_url*
    to the *stock_codes* list (mutated in place), then print how many we have."""
    html = getHtmlText(stock_url)
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        try:
            href = anchor.attrs['href']
            # First 6-digit run in the link is taken to be the stock code.
            stock_codes.append(re.findall(r"\d{6}", href)[0])
        except (KeyError, IndexError):
            # <a> without an href, or an href with no 6-digit code — skip it.
            continue
    print(len(stock_codes))


def getstockinfo(stock_codes, stock_url, path):
    """For each code in *stock_codes*, scrape name + detail table from
    *stock_url*+code and append one OrderedDict (as str) per stock to *path*.

    Prints an in-place progress percentage; a failed page is logged and
    skipped so one bad stock cannot stop the whole crawl.
    """
    cnt = 0
    total = len(stock_codes)
    # Open the output file once in append mode instead of re-opening it
    # for every single stock as the loop runs.
    with open(path, 'a', encoding='utf-8') as f:
        for stock in stock_codes:
            cnt += 1
            try:
                html = getHtmlText(stock_url + stock)
                if html == '':
                    continue
                # OrderedDict preserves the page's field order in the output.
                infodict = collections.OrderedDict()
                soup = BeautifulSoup(html, 'html.parser')
                stock_name = soup.find_all('h1', attrs={'class': 'name'})[0]
                infodict['股票名称'] = stock_name.text.split()[0]
                stockinfo = soup.find('div', attrs={'class': 'detail-data'})
                key_list = stockinfo.find_all('dt')
                value_list = stockinfo.find_all('dd')
                # <dt> labels pair with <dd> values positionally.
                for key, value in zip(key_list, value_list):
                    infodict[key.text] = value.text
                f.write(str(infodict) + '\n')
            except Exception:
                # Best-effort scrape: log the failure and move on.
                traceback.print_exc()
                continue
            finally:
                # \r rewinds the cursor so the percentage updates in place.
                print('\r当前进度:{:.2f}%'.format(cnt * 100 / total), end='')


def main():
    """Crawl the code list from EastMoney, then each stock's laohu8 detail page."""
    stock_list_url = 'http://quote.eastmoney.com/stock_list.html'
    stock_info_url = 'https://www.laohu8.com/stock/'
    output_file = 'laohu_stock.txt'
    stock_codes = []
    getstocklist(stock_codes, stock_list_url)
    getstockinfo(stock_codes, stock_info_url, output_file)


if __name__ == "__main__":
    main()
getstockinfo():
getstockinfo
laohu_stock.txt 部分截图
注意:
可以看到爬取到的信息里没有最高最低信息,这是因为我直接爬取的网页源代码,源代码里没有显示出最高最低信息,但是正常网页显示了最高最低,具体原因还在了解。