python爬虫笔记（六）网络爬虫之实战（2）——股票数据定向爬虫

1. 股票数据定向爬虫

https://gupiao.baidu.com/stock

http://quote.eastmoney.com/stock_list.html

2. 实例编写

2.1 获取HTML页面

def getHTMLText(url):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        print("url:", r.request.url)
        return r.text
    except:
        return ""

2.2 获取股票列表信息（bs4+正则）

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, \'html.parser\')
    # 个股链接在 <a> 标签中
    a = soup.find_all(\'a\')
    for i in a:
        try:
            # 个股链接在 <a> 标签的 href 属性中
            # 我们需要获得 sh/sz + 6 个数字，利用正则表达式
            href = i.attrs[\'href\']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue
    print(lst)

2.3 获取股票信息主体

def getStockInfo(lst, stockURL, fpath):
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            # 使用键值对记录个股信息
            infoDict = {}
            soup = BeautifulSoup(html, \'html.parser\')
            stockInfo = soup.find(\'div\', attrs={\'class\':\'stock-bets\'})
            
            # 获取股票名称
            name = stockInfo.find_all(attrs={\'class\':\'bets-name\'})[0]
            # 将信息添加到字典中
            # split()以空格分隔
            infoDict.update({\'股票名称\' : name.text.split()[0]})
            
            # 在 <dt><dd> 标签中获取其他信息，用键值对维护
            keyList = stockInfo.find_all(\'dt\')
            valueList = stockInfo.find_all(\'dd\')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
                
            # 将获得额信息写入相应文件中
            with open(fpath, \'a\', encoding=\'utf-8\') as f:
                f.write(str(infoDict) + \'\n\')
                
        # 利用 traceback 跟踪并输出异常信息
        except:
            traceback.print_exc()
            continue

3. 完整代码

# -*- coding: utf-8 -*-
"""
Created on Sat Feb  1 00:40:47 2020

@author: douzi
"""

import requests
from bs4 import BeautifulSoup
# traceback模块被用来跟踪异常返回信息
import traceback
import re


def getHTMLText(url, code = \'utf-8\'):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
#        r.encoding = r.apparent_encoding   # 定向爬虫可以直接固定
        r.encoding = code
        
        print("url:", r.request.url)
        return r.text
    except:
        return ""


def getStockList(lst, stockURL):
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, \'html.parser\')
    # 个股链接在 <a> 标签中
    a = soup.find_all(\'a\')
    for i in a:
        try:
            # 个股链接在 <a> 标签的 href 属性中
            # 我们需要获得 sh/sz + 6 个数字，利用正则表达式
            href = i.attrs[\'href\']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue
    print(lst)


def getStockInfo(lst, stockURL, fpath):
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            # 使用键值对记录个股信息
            infoDict = {}
            soup = BeautifulSoup(html, \'html.parser\')
            stockInfo = soup.find(\'div\', attrs={\'class\':\'stock-bets\'})
            
            # 获取股票名称
            name = stockInfo.find_all(attrs={\'class\':\'bets-name\'})[0]
            # 将信息添加到字典中
            # split()以空格分隔
            infoDict.update({\'股票名称\' : name.text.split()[0]})
            
            # 在 <dt><dd> 标签中获取其他信息，用键值对维护
            keyList = stockInfo.find_all(\'dt\')
            valueList = stockInfo.find_all(\'dd\')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
                
            # 将获得额信息写入相应文件中
            with open(fpath, \'a\', encoding=\'utf-8\') as f:
                f.write(str(infoDict) + \'\n\')
                count = count + 1
                # \r
                print(\'\r当前速度: {:.2f}%\'.format(count * 100 / len(lst)), end=\'\')
        # 利用 traceback 跟踪并输出异常信息
        except:
            count = count + 1
            print(\'\r当前速度: {:.2f}%\'.format(count * 100 / len(lst)), end=\'\')
            traceback.print_exc()
            continue


def main():
    # 股票列表信息
    stock_list_url = "http://quote.eastmoney.com/stock_list.html"
    # 股票信息主体
    stock_info_url = "https://gupiao.baidu.com/stock/"
    output_file = ".//Result_stock.txt"
    slist = []
    # 获取股票列表信息
    getStockList(slist, stock_list_url)
    # 获得股票信息主体
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == "__main__":
    main()