一个爬取股票信息的爬虫程序

　　给大家分享一个早前爬取东方财富网股票信息的爬虫程序，回头来看做了好多改进，特别是数据处理部分使用了 heapq 模块，方便快捷一步到位...

 1 # _*_ coding:utf-8 _*_
 2 
 3 import requests,re,json,time,os
 4 import heapq
 5 from bs4 import BeautifulSoup
 6 
 7 class GPINFO(object):
 8     """docstring for GPINFO"""
 9     def __init__(self):
10         self.Url = \'http://quote.eastmoney.com/stocklist.html\'
11         self.BaseData = []
12         self.Date = time.strftime(\'%Y%m%d\')
13         self.Record = \'basedata\'+self.Date
14         if os.path.exists(self.Record):
15             print (\'record exist...\')
16             self.BaseData = self.get_base_data_from_record()
17         else:
18             print (\'fuck-get data again...\')
19             self.get_data()
20 
21     def write_record(self,text):
22         with open(self.Record,\'ab\') as f:
23             f.write((text+\'\n\').encode(\'utf-8\'))
24 
25     def get_base_data_from_record(self):
26         ll = []
27         with open(self.Record,\'rb\') as f:
28             json_l = f.readlines()
29             for j in json_l:
30                 ll.append(json.loads(j.decode(\'utf-8\')))
31         return ll
32 
33     def get_data(self):
34         #请求数据
35         orihtml = requests.get(self.Url).content
36         #创建 beautifulsoup 对象
37         soup = BeautifulSoup(orihtml,\'lxml\')
38         #采集每一个股票的信息
39         count = 0
40         for a in soup.find(\'div\',class_=\'quotebody\').find_all(\'a\',{\'target\':\'_blank\'}):
41             record_d = {}
42             #代号
43             num = a.get_text().split(\'(\')[1].strip(\')\')
44             if not (num.startswith(\'00\') or num.startswith(\'60\')):continue #只需要6*/0*
45             record_d[\'num\']=num
46             #名称
47             name = a.get_text().split(\'(\')[0]
48             record_d[\'name\']=name
49             #详情页
50             detail_url = a[\'href\']
51             record_d[\'detail_url\']=detail_url
52 
53             cwzburl = detail_url
54             #发送请求
55             try:
56                 cwzbhtml = requests.get(cwzburl,timeout=30).content
57             except Exception as e:
58                 print (\'perhaps timeout:\',e)
59                 continue
60             #创建soup对象
61             cwzbsoup = BeautifulSoup(cwzbhtml,\'lxml\')
62 
63             #财务指标列表 [浦发银行，总市值    净资产    净利润    市盈率    市净率    毛利率    净利率    ROE] roe:净资产收益率
64             try:
65                 cwzb_list = cwzbsoup.find(\'div\',class_=\'cwzb\').tbody.tr.get_text().split()
66             except Exception as e:
67                 print (\'error:\',e)
68                 continue
69             #去除退市股票
70             if \'-\' not in cwzb_list:
71                 record_d[\'data\']=cwzb_list
72                 self.BaseData.append(record_d)
73                 self.write_record(json.dumps(record_d))
74                 count=count+1
75                 print (len(self.BaseData))
76 
def main():
    """Print the indicator rows of the ten stocks with the highest net margin."""
    def net_margin(record):
        # Row layout: [name, total market cap, net assets, net profit, P/E,
        # P/B, gross margin, net margin, ROE]; index 7 is the net margin,
        # stored as a percent string such as '12.34%'.
        return float(record['data'][7].strip('%'))

    crawler = GPINFO()
    for record in heapq.nlargest(10, crawler.BaseData, key=net_margin):
        print(record['data'])


if __name__ == '__main__':
    main()

　　程序主函数部分是为了获取净利率前10名的股票信息,打印结果如下:

['绵石投资', '52.2亿', '14.0亿', '1.25亿', '30.90', '3.73', '42.25%', '2047.04%', '9.27%']
['国投安信', '556亿', '270亿', '21.1亿', '19.80', '2.12', '5.90%', '487.53%', '7.79%']
['川投能源', '379亿', '202亿', '28.0亿', '10.16', '1.91', '37.01%', '402.64%', '14.58%']
['ST明科', '47.6亿', '9.25亿', '5.11千万', '68.00', '5.14', '2.38%', '345.11%', '5.68%']
['华联控股', '93.6亿', '31.5亿', '4.76亿', '14.54', '3.74', '46.25%', '328.53%', '20.88%']
['上海九百', '68.2亿', '12.3亿', '1.61亿', '31.67', '5.56', '54.00%', '297.99%', '13.21%']
['凯瑞德', '46.7亿', '1.14亿', '3.27千万', '107.10', '40.94', '16.07%', '294.19%', '33.41%']
['鲁信创投', '172亿', '38.6亿', '3.32亿', '38.48', '4.64', '28.67%', '244.43%', '9.26%']
['博闻科技', '35.0亿', '6.56亿', '2.23千万', '117.65', '5.36', '-16.07%', '215.27%', '3.41%']
['万泽股份', '71.8亿', '13.7亿', '6.87千万', '78.38', '5.29', '22.57%', '203.15%', '5.13%']