跟小白学python网络爬虫实例3

实例3--股票数据定向爬虫

程序结构如下：

　　1.先从网站中获取股票代号列表（requests库、bs4库、re库）

　　2.遍历每一只股票，从股票信息网站中获得详细信息

　　3.使用字典的数据结构，写入文本文件中

以下为代码：

 1 # 股票数据定向爬虫
 2 """
 3 Created on Thu Oct 12 16:12:48 2017
 4 
 5 @author: DONG LONG RUI
 6 """
 7 import requests
 8 from bs4 import BeautifulSoup
 9 import re
10 #import traceback
11 
def getHTMLText(url, code='utf-8'):
    """Download ``url`` and return the response body decoded as ``code``.

    Args:
        url: Address of the page to fetch.
        code: Encoding assigned to the response before reading its text
            (defaults to 'utf-8'). It is set explicitly instead of being
            guessed from the content via ``r.apparent_encoding``.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout after 30 seconds, or an HTTP error
        status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Only request-level failures mean "no page". A bare ``except``
        # would also swallow KeyboardInterrupt and genuine programming
        # errors, making the crawler impossible to stop or debug.
        return ''
21     
def getStockList(lst, stockURL):
    """Collect stock codes from the links on the listing page into ``lst``.

    The page at ``stockURL`` is fetched as GB2312 text and every ``<a>``
    tag is inspected. For each link whose ``href`` contains a code of the
    form ``sh``/``sz`` followed by six digits, the first such code is
    appended to ``lst``. Duplicates are kept, in page order.

    Args:
        lst: List that receives the codes; it is modified in place.
        stockURL: URL of the page listing all stocks.

    Returns:
        None. If the page cannot be fetched, ``lst`` is left unchanged.
    """
    html = getHTMLText(stockURL, 'GB2312')
    soup = BeautifulSoup(html, 'html.parser')
    # Compiled once: the same pattern is applied to every link on the page.
    codePattern = re.compile(r'[s][hz]\d{6}')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Anchors without a usable href are skipped explicitly rather than
        # by catching whatever exception the lookup happens to raise.
        if not isinstance(href, str):
            continue
        match = codePattern.search(href)
        if match:
            lst.append(match.group(0))
32     
def _parseStockInfo(html):
    """Extract one stock's name and quote fields from its detail page.

    Returns:
        A dict holding the stock name under '股票名称' plus one entry per
        ``<dt>``/``<dd>`` pair inside the 'stock-bets' block, or None when
        the page lacks that block or a non-empty 'bets-name' element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
    if stockInfo is None:
        return None

    names = stockInfo.find_all(attrs={'class': 'bets-name'})
    if not names:
        return None
    # The element text is split on whitespace; the first token is the name.
    nameParts = names[0].text.split()
    if not nameParts:
        return None

    infoDict = {'股票名称': nameParts[0]}
    # zip pairs labels with values and stops at the shorter list, so a
    # missing <dd> no longer discards the fields that were present.
    for key, val in zip(stockInfo.find_all('dt'), stockInfo.find_all('dd')):
        infoDict[key.text] = val.text
    return infoDict


def _showProgress(done, total):
    """Redraw the percentage indicator; '\\r' keeps it on a single line."""
    print('\r当前进度：{:.2f}%'.format(done * 100 / total), end='')


def getStockInfo(lst, stockURL, fpath):
    """Fetch each stock's detail page and append its data to ``fpath``.

    For every code in ``lst`` the page ``stockURL + code + '.html'`` is
    downloaded and parsed. Each stock that yields data is written as one
    line (the ``str()`` of its dict) appended to ``fpath`` in UTF-8.
    Stocks whose page cannot be fetched or lacks the expected markup are
    skipped. Progress is printed after every stock, including skipped
    ones, so the indicator always ends at 100%.

    Args:
        lst: Stock codes to process.
        stockURL: Base URL that a code plus '.html' is appended to.
        fpath: Path of the output text file (opened in append mode).

    Raises:
        OSError: If ``fpath`` cannot be opened or written. This is no
            longer silently ignored, because it would otherwise fail in
            the same way for every stock while appearing to make progress.
    """
    total = len(lst)
    for count, stock in enumerate(lst, 1):
        url = stockURL + stock + '.html'
        html = getHTMLText(url)
        # An empty string is getHTMLText's signal that the download failed.
        infoDict = _parseStockInfo(html) if html else None
        if infoDict is not None:
            with open(fpath, 'a', encoding='UTF-8') as f:
                f.write(str(infoDict) + '\n')
        _showProgress(count, total)
64 
65     
def main():
    """Run the crawl: gather the stock codes, then save each stock's data.

    The listing page, the base URL of the detail pages and the output
    file path are fixed below; the output file is appended to, not
    overwritten.
    """
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'C:/Users/DONG LONG RUI/.spyder-py3/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


# Guarded so that importing this module (e.g. to reuse its functions)
# does not start a full crawl as a side effect.
if __name__ == '__main__':
    main()

由于 requests 库发出的是同步请求，程序只能逐只股票依次抓取页面，因此运行速度比较慢，后续可尝试改用 Scrapy 框架。

另外，bs4 中的 BeautifulSoup 和 re 库都可以用于在 HTML 中查找目标信息，不过两者通常结合起来使用：

　　先用BeautifulSoup找到目标信息所在的特定标签，然后在这些标签内容中使用正则表达式去匹配。