1. 相关概念
1. selenium模块
是一个基于浏览器自动化的模块
2. 与爬虫之间的关联
便捷的捕获到动态加载到的数据(可见即可得)
实现模拟登陆
3.环境安装
pip3 install selenium
简单演示
# Simple demo: automate Baidu's search settings, then run a search.
from selenium import webdriver
from time import sleep

# Path of the browser driver; an r'' prefix would prevent escape interpretation.
driver = webdriver.Chrome(r'chromedriver.exe')

# Open the Baidu home page with get().
driver.get("http://www.baidu.com")

# Find the "设置" (settings) link on the page and click it.
driver.find_elements_by_link_text('设置')[0].click()
sleep(2)

# Inside settings, open "搜索设置" (search settings) to set 50 results per page.
driver.find_elements_by_link_text('搜索设置')[0].click()
sleep(2)

# Select "50 results per page": the 3rd <option> of the #nr <select>.
m = driver.find_element_by_id('nr')
sleep(2)
# BUG FIX: the original clicked option[3] twice (once through an absolute
# XPath, once through a relative one); a single relative click is enough.
m.find_element_by_xpath('.//option[3]').click()
sleep(2)

# Click the save button.
driver.find_elements_by_class_name("prefpanelgo")[0].click()
sleep(2)

# Handle the confirmation alert: accept() confirms, dismiss() cancels.
# BUG FIX: switch_to_alert() is deprecated; switch_to.alert is the current API.
driver.switch_to.alert.accept()
sleep(2)

# Find the search input box and type the keyword 美女.
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)

# Click the search button.
driver.find_element_by_id('su').click()
sleep(2)

# In the results page, open the "美女_百度图片" link.
driver.find_elements_by_link_text('美女_百度图片')[0].click()
sleep(3)

# Close the browser.
driver.quit()
2.基本使用
# Basic usage: open JD, search a keyword, scroll to the bottom, grab the HTML.
from time import sleep
from selenium import webdriver

bro = webdriver.Chrome(executable_path="chromedriver.exe")

# BUG FIX: the original URL was "https://www:jd.com/" — a colon instead of a
# dot makes the host invalid and the navigation fails.
bro.get("https://www.jd.com/")
sleep(2)

# Locate the search input box and type the keyword.
search_input = bro.find_element_by_id("key")
search_input.send_keys("苹果")

# Locate and click the search button.
btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
btn.click()
sleep(2)

# Execute JS to scroll to the page bottom (triggers lazy-loaded content).
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)

# Grab the rendered page source, including dynamically loaded data.
page_text = bro.page_source
print(page_text)

# Quit the browser session.
bro.quit()
# Crawl four pages of http://125.35.6.84:81/xk/ by clicking "next page" in the
# browser, then parse every collected page source with lxml.
from time import sleep
from selenium import webdriver
from lxml import etree

bro = webdriver.Chrome(executable_path="chromedriver.exe")
bro.get("http://125.35.6.84:81/xk/")
sleep(2)

# Keep the first page, then three more pages via the next-page button.
page_text_list = [bro.page_source]
for _ in range(3):
    bro.find_element_by_id("pageIto_next").click()  # click "next page"
    sleep(2)
    page_text_list.append(bro.page_source)

# Parse every collected page: each company is an <li> under #gzlist.
for page_text in page_text_list:
    tree = etree.HTML(page_text)
    for lis in tree.xpath('//ul[@id="gzlist"]/li'):
        title = lis.xpath('./dl/@title')[0]
        num = lis.xpath('./ol/@title')[0]

sleep(2)
bro.quit()
3.动作链
一系列连续的动作
在实现标签定位时,如果发现定位的标签是存在于iframe标签中的,则在定位时必须执行一个固定的操作:bro.switch_to.frame('id')
# Action chains demo: drag a <div> that lives inside an iframe.
# NOTE: when the target element is inside an <iframe>, you must first call
# bro.switch_to.frame('<iframe id>') before locating it.
from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

# The draggable element sits inside the iframe with id "iframeResult".
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_id('draggable')

# Dragging = click-and-hold + a series of small moves.
action = ActionChains(bro)
action.click_and_hold(div_tag)
for i in range(5):
    # perform() executes the queued actions immediately.
    action.move_by_offset(17, 5).perform()
    sleep(0.5)

# BUG FIX: release() only queues the action; without perform() the mouse
# button was never actually released in the original.
action.release().perform()
sleep(3)
bro.quit()
4.模拟12306登录
# Cjy.py
import requests
from hashlib import md5


class Chaojiying_Client(object):
    """Minimal HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """Store the credentials; the password is transmitted as an MD5 digest."""
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Parameters common to every API call.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Submit an image for recognition and return the decoded JSON reply.

        im: raw image bytes.
        codetype: captcha category code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognized captcha; im_id is the image ID to flag."""
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers)
        return r.json()
模拟登陆
# Simulated 12306 login: screenshot the page, crop out the captcha image,
# send it to Chaojiying for recognition, then click the returned coordinates.
# BUG FIX: the original imported ActionChains twice; one import removed.
from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from Cjy import Chaojiying_Client


def get_text(imgPath, imgType):
    """Send the image at imgPath to Chaojiying and return the 'pic_str' answer."""
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    im = open(imgPath, 'rb').read()
    return chaojiying.PostPic(im, imgType)['pic_str']


bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(5)

# Screenshot the whole page, then crop out just the captcha <img> element.
bro.save_screenshot('main.png')
code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_tag.location
size = code_img_tag.size
# Crop box is (left, upper, right, lower) in page-pixel coordinates.
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']), int(location['y'] + size['height']))
i = Image.open('./main.png')
frame = i.crop(rangle)
frame.save('code.png')

# Type 9004 returns "x1,y1|x2,y2|..." for multi-click captchas, or a single
# "x,y" pair. Splitting on '|' handles both cases, so the original's separate
# if/else branches were redundant and have been merged into one loop.
result = get_text('./code.png', 9004)
all_list = []
for pair in result.split('|'):
    x, y = pair.split(',')
    all_list.append([int(x), int(y)])
print(all_list)

# Click each answer coordinate, offset from the captcha image element.
for x, y in all_list:
    ActionChains(bro).move_to_element_with_offset(code_img_tag, x, y).click().perform()
    sleep(1)

bro.find_element_by_id('username').send_keys('123456')
sleep(1)
bro.find_element_by_id('password').send_keys('67890000000')
sleep(1)
bro.find_element_by_id('loginSub').click()
sleep(5)
bro.quit()
# Scrape videos from pearvideo.com category 1: for each list item, open its
# detail page, regex out the real mp4 URL, then download and save the file.
import requests
from lxml import etree
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)

for li in tree.xpath('//*[@id="listvideoListUl"]/li'):
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'

    # The real video URL is embedded in an inline script on the detail page.
    detail_page_text = requests.get(detail_url, headers=headers).text
    video_url = re.findall('srcUrl="(.*?)",vdoUrl', detail_page_text, re.S)[0]

    # Download the binary video data and write it to disk under the title.
    video_data = requests.get(video_url, headers=headers).content
    with open(title, 'wb') as fp:
        fp.write(video_data)
5. 移动端数据的爬取
1. fiddler是一款抓包工具,代理服务器
- 青花瓷
- mitmproxy
- 配置:让其可以抓取https协议的请求
- tools -> options -> https -> 安装证书
- 将手机和fiddler所在的电脑处在同一个网段下(pc开启wifi,手机连接)
- 在手机中访问fiddler的ip+port:192.168.14.110:50816,在当前页面中点击对应的连接下载证书
- 在手机中安装且信任证书
- 设置手机网络的代理:开启代理==》fiddler对应pc端的ip地址和fiddler自己端口号