给大家分享一个早前爬取东方财富网股票信息的爬虫程序,回头来看做了好多改进,特别是数据处理部分使用了heapq模块,方便快捷一步到位...
# _*_ coding:utf-8 _*_
"""Crawl per-stock financial indicators from eastmoney.com and rank them.

The stock list page is scraped for links, every matching stock's detail page
is fetched, and its row of financial indicators is stored.  Results are
cached in a per-day record file so a second run on the same day reads the
file instead of crawling again.
"""

import heapq
import json
import os
import re
import time

# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 30

# Position of the net profit margin inside a record's 'data' list, laid out as
# [name, market cap, net assets, net profit, P/E, P/B, gross margin,
#  net margin, ROE].
_NET_MARGIN_INDEX = 7


def _split_name_and_code(text):
    """Split link text such as ``name(600000)`` into ``(name, code)``.

    Returns None when the text contains no ``(``, i.e. it is not a stock link.
    """
    name, sep, rest = text.partition('(')
    if not sep:
        return None
    return name, rest.strip(')')


def _net_margin(record):
    """Return the record's net profit margin as a float, or None if unusable."""
    try:
        return float(record['data'][_NET_MARGIN_INDEX].strip('%'))
    except (KeyError, IndexError, ValueError, AttributeError, TypeError):
        # Missing 'data', a short row, or a non-numeric field.
        return None


class GPINFO(object):
    """Financial indicators for stocks whose code starts with '00' or '60'.

    Attributes:
        Url: address of the stock list page.
        BaseData: list of record dicts with keys 'num', 'name',
            'detail_url' and 'data' (the indicator row as a list of strings).
        Date: today's date formatted as YYYYMMDD.
        Record: name of the per-day cache file ('basedata' + Date), relative
            to the current working directory.
    """

    def __init__(self):
        """Load today's records from the cache file, or crawl if it is absent."""
        self.Url = 'http://quote.eastmoney.com/stocklist.html'
        self.BaseData = []
        self.Date = time.strftime('%Y%m%d')
        self.Record = 'basedata' + self.Date
        if os.path.exists(self.Record):
            print('record exist...')
            self.BaseData = self.get_base_data_from_record()
        else:
            print('fuck-get data again...')
            self.get_data()

    def write_record(self, text):
        """Append *text* plus a newline to the record file, UTF-8 encoded."""
        with open(self.Record, 'ab') as f:
            f.write((text + '\n').encode('utf-8'))

    def get_base_data_from_record(self):
        """Return the list of records stored in the record file.

        The file holds one JSON document per line.  Blank lines are ignored,
        and a line that is not valid JSON (for example a line cut short when
        a crawl was interrupted) is reported and skipped rather than
        aborting the whole load.
        """
        records = []
        with open(self.Record, 'rb') as f:
            for raw in f:
                line = raw.decode('utf-8').strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except ValueError as e:
                    print('error:', e)
        return records

    def get_data(self):
        """Crawl the stock list and every matching detail page.

        Each usable record is appended to ``self.BaseData`` and written to
        the record file immediately, so progress survives an interruption.
        Failures on a single stock are printed and that stock is skipped.
        """
        # Third-party imports are deferred so that a run served entirely from
        # the record file does not require requests/bs4 to be installed.
        import requests
        from bs4 import BeautifulSoup

        # Fetch the list page; the timeout keeps a stalled server from
        # hanging the program forever.
        orihtml = requests.get(self.Url, timeout=_REQUEST_TIMEOUT).content
        soup = BeautifulSoup(orihtml, 'lxml')

        # Collect the information of every stock linked from the list.
        for a in soup.find('div', class_='quotebody').find_all('a', {'target': '_blank'}):
            parsed = _split_name_and_code(a.get_text())
            if parsed is None:
                continue  # link text without "(code)" is not a stock entry
            name, num = parsed
            if not num.startswith(('00', '60')):
                continue  # only codes 6*/0* are wanted
            detail_url = a.get('href')
            if not detail_url:
                continue
            record_d = {'num': num, 'name': name, 'detail_url': detail_url}

            # Deliberately broad: one bad page must not end the whole crawl.
            try:
                cwzbhtml = requests.get(detail_url, timeout=_REQUEST_TIMEOUT).content
            except Exception as e:
                print('perhaps timeout:', e)
                continue
            cwzbsoup = BeautifulSoup(cwzbhtml, 'lxml')

            # Indicator row: [name, market cap, net assets, net profit, P/E,
            # P/B, gross margin, net margin, ROE].
            try:
                cwzb_list = cwzbsoup.find('div', class_='cwzb').tbody.tr.get_text().split()
            except AttributeError as e:
                # find()/tbody/tr yielded None: the page has no such table.
                print('error:', e)
                continue

            # A bare '-' cell marks a row without figures; the original
            # author uses this to drop delisted stocks.
            if '-' not in cwzb_list:
                record_d['data'] = cwzb_list
                self.BaseData.append(record_d)
                self.write_record(json.dumps(record_d))
        print(len(self.BaseData))


def main():
    """Print the indicator rows of the 10 stocks with the highest net margin."""
    test = GPINFO()
    # Score once per record and drop records whose margin cannot be parsed,
    # so a single malformed row cannot crash the ranking.
    scored = []
    for record in test.BaseData:
        margin = _net_margin(record)
        if margin is not None:
            scored.append((margin, record))
    for _, record in heapq.nlargest(10, scored, key=lambda pair: pair[0]):
        print(record['data'])


if __name__ == '__main__':
    main()
程序主函数部分是为了获取净利率前10名的股票信息,打印结果如下:
['绵石投资', '52.2亿', '14.0亿', '1.25亿', '30.90', '3.73', '42.25%', '2047.04%', '9.27%']
['国投安信', '556亿', '270亿', '21.1亿', '19.80', '2.12', '5.90%', '487.53%', '7.79%']
['川投能源', '379亿', '202亿', '28.0亿', '10.16', '1.91', '37.01%', '402.64%', '14.58%']
['ST明科', '47.6亿', '9.25亿', '5.11千万', '68.00', '5.14', '2.38%', '345.11%', '5.68%']
['华联控股', '93.6亿', '31.5亿', '4.76亿', '14.54', '3.74', '46.25%', '328.53%', '20.88%']
['上海九百', '68.2亿', '12.3亿', '1.61亿', '31.67', '5.56', '54.00%', '297.99%', '13.21%']
['凯瑞德', '46.7亿', '1.14亿', '3.27千万', '107.10', '40.94', '16.07%', '294.19%', '33.41%']
['鲁信创投', '172亿', '38.6亿', '3.32亿', '38.48', '4.64', '28.67%', '244.43%', '9.26%']
['博闻科技', '35.0亿', '6.56亿', '2.23千万', '117.65', '5.36', '-16.07%', '215.27%', '3.41%']
['万泽股份', '71.8亿', '13.7亿', '6.87千万', '78.38', '5.29', '22.57%', '203.15%', '5.13%']