zangyue

1. 相关概念

1. selenium模块

  是一个基于浏览器自动化的模块

2. 与爬虫之间的关联

  便捷的捕获到动态加载到的数据(可见即可得)

  实现模拟登录

3.环境安装

pip3 install selenium

简单演示

from selenium import webdriver
from time import sleep

# Pass the path to your browser driver executable. The r'' raw-string prefix
# keeps backslashes in a Windows path from being read as escape sequences.
driver = webdriver.Chrome(r'chromedriver.exe')

# Open the Baidu home page.
driver.get("http://www.baidu.com")

# Find the "设置" (Settings) link on the page and click it.
driver.find_elements_by_link_text('设置')[0].click()
sleep(2)

# With the settings menu open, click "搜索设置" (Search settings).
driver.find_elements_by_link_text('搜索设置')[0].click()
sleep(2)

# Pick the third option of the results-per-page <select id="nr">
# (50 results per page, according to the original tutorial).
# The XPath is relative ('.//') so it is resolved against the element found
# just above; a single click is enough to select the option.
m = driver.find_element_by_id('nr')
sleep(2)
m.find_element_by_xpath('.//option[3]').click()
sleep(2)

# Click the "save settings" button.
driver.find_elements_by_class_name("prefpanelgo")[0].click()
sleep(2)

# Handle the alert that pops up: accept() confirms, dismiss() cancels.
# `switch_to.alert` is the supported spelling of the deprecated
# `switch_to_alert()` method.
driver.switch_to.alert.accept()
sleep(2)

# Locate Baidu's search box and type the keyword.
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)

# Click the search button.
driver.find_element_by_id('su').click()
sleep(2)

# On the results page, find the "美女_百度图片" link and open it.
driver.find_elements_by_link_text('美女_百度图片')[0].click()
sleep(3)

# Close the browser and end the driver session.
driver.quit()

2.基本使用

  准备好某一款浏览器的驱动程序:http://chromedriver.storage.googleapis.com/index.html

  版本的映射关系:https://blog.csdn.net/huilan_same/article/details/51896672

1. 访问京东网站,并搜索“苹果”

from time import sleep
from selenium import webdriver

bro = webdriver.Chrome(executable_path="chromedriver.exe")

# Navigate to the JD home page (the host is "www.jd.com"; the original
# "www:jd.com" is not a valid URL).
bro.get("https://www.jd.com/")
sleep(2)

# Locate the search input box by its id.
search_input = bro.find_element_by_id("key")

# Type the keyword into the search box.
search_input.send_keys("苹果")

# Locate the search button.
btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')

# Click the search button.
btn.click()
sleep(2)

# Run JavaScript in the page to scroll to the bottom.
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)

# Grab the current page source as seen by the browser and print it.
page_text = bro.page_source
print(page_text)

# Quit the browser.
bro.quit()

2.selenium爬取动态加载的数据

from time import sleep
from selenium import webdriver
from lxml import etree

bro = webdriver.Chrome(executable_path="chromedriver.exe")

bro.get("http://125.35.6.84:81/xk/")
sleep(2)

# Collect the source of the first page, then of the next three pages.
page_text = bro.page_source
page_text_list = [page_text]

for _ in range(3):
    bro.find_element_by_id("pageIto_next").click()  # go to the next page
    sleep(2)
    page_text_list.append(bro.page_source)

# Parse every collected page and print the two title attributes of each
# list item (previously they were extracted and then discarded).
for page_text in page_text_list:
    tree = etree.HTML(page_text)
    tree_list = tree.xpath('//ul[@id="gzlist"]/li')
    for lis in tree_list:
        title = lis.xpath('./dl/@title')[0]
        num = lis.xpath('./ol/@title')[0]
        print(title, num)

sleep(2)
bro.quit()

3.动作链

  一系列连续的动作

  在实现标签定位时,如果发现要定位的标签位于iframe标签中,则在定位之前必须先切换到该iframe(参数为iframe的id):bro.switch_to.frame('id')

from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

# The draggable element lives inside an iframe, so switch into it first.
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_id('draggable')

# Drag = press and hold + move.
action = ActionChains(bro)
action.click_and_hold(div_tag)
for i in range(5):
    # perform() makes the action chain execute immediately.
    action.move_by_offset(17, 5).perform()
    sleep(0.5)

# release() only queues the action; perform() is needed for the mouse
# button to actually be released.
action.release().perform()
sleep(3)
bro.quit()

4.模拟12306登录

超级鹰识别代码

# Cjy.py

import requests
from hashlib import md5

class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    The account fields are prepared once in ``__init__`` and merged into
    every request sent by ``PostPic`` and ``ReportError``.
    """

    def __init__(self, username, password, soft_id):
        """Store the account data and build the shared request parameters.

        username: account name, sent as the 'user' form field.
        password: plain-text password (str); only its MD5 hex digest is
            kept and sent, as the 'pass2' form field.
        soft_id: software id, sent as the 'softid' form field.
        """
        self.username = username
        # md5() needs bytes, so encode the password before hashing it.
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        im: image content as bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        # The image is sent as a multipart file field named 'userfile'.
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON response.

        im_id: id of the image whose result is being reported as wrong.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

模拟登录

from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from Cjy import Chaojiying_Client
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(5)

# Save a screenshot of the page; the captcha is cut out of it below.
bro.save_screenshot('main.png')

# Locate the captcha <img> and read its position and size on the page.
code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_tag.location
size = code_img_tag.size

# Crop box as (left, upper, right, lower).
# NOTE(review): this assumes screenshot pixels match page coordinates 1:1
# (no display scaling) - confirm on the target machine.
crop_box = (
    int(location['x']),
    int(location['y']),
    int(location['x'] + size['width']),
    int(location['y'] + size['height']),
)

# Cut the captcha out of the screenshot and save it on its own.
screenshot = Image.open('./main.png')
frame = screenshot.crop(crop_box)
frame.save('code.png')


def get_text(imgPath, imgType):
    """Send an image file to Chaojiying and return the recognised text.

    imgPath: path of the image file to upload.
    imgType: captcha type code passed through to ``PostPic``.

    Returns the 'pic_str' field of the service response.
    """
    # NOTE(review): the account credentials are hard-coded here; consider
    # loading them from configuration instead of committing them.
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    # Read the image inside a with-block so the file handle is closed.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']


# The recognised text is parsed as click coordinates: points are separated
# by '|' and x/y by ',', e.g. '55,70|267,133' -> [[55, 70], [267, 133]].
# A single point has no '|'; split('|') then yields one item, so the same
# code path covers both cases.
result = get_text('./code.png', 9004)
all_list = [
    [int(coords[0]), int(coords[1])]
    for coords in (point.split(',') for point in result.split('|'))
]
print(all_list)

for x, y in all_list:
    # A new ActionChains per point, so each chain holds exactly one
    # move-and-click.
    # NOTE(review): the reference point of the offset (top-left corner vs.
    # centre of the element) differs between Selenium releases - confirm
    # against the installed version.
    ActionChains(bro).move_to_element_with_offset(code_img_tag, x, y).click().perform()
    sleep(1)

# Fill in the login form and submit it.
bro.find_element_by_id('username').send_keys('123456')
sleep(1)
bro.find_element_by_id('password').send_keys('67890000000')
sleep(1)
bro.find_element_by_id('loginSub').click()

sleep(5)
bro.quit()

爬取梨视频

import requests
from lxml import etree
import re
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_1'

# Fetch the category page and pick out one <li> per video.
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

for li in li_list:
    # Build the detail-page URL and the output file name from the list item.
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'

    # The video address is embedded in the detail page text as srcUrl="...".
    detail_page_text = requests.get(detail_url, headers=headers).text
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_url = re.findall(ex, detail_page_text, re.S)[0]

    # Download the video bytes and write them to "<title>.mp4".
    video_data = requests.get(video_url, headers=headers).content
    with open(title, 'wb') as fp:
        fp.write(video_data)

5. 移动端数据的爬取

1. fiddler是一款抓包工具,代理服务器

  - 青花瓷

  - mitmproxy

  - 配置:让其可以抓取https协议的请求

  - tools -> options -> https -> 安装证书

2. http:客户端和服务端进行数据交互的某种形式

  - https:安全的http协议

  - https的加密方式采用的是证书密钥加密。

3.步骤

  1. 配置下fiddler的端口

  2. 将手机和fiddler所在的电脑处在同一个网段下(pc开启wifi,手机连接)

  3. 在手机中访问fiddler的ip+port:192.168.14.110:50816,在当前页面中点击对应的连接下载证书

  4. 在手机中安装且信任证书

  5. 设置手机网络的代理:开启代理==》fiddler对应pc端的ip地址和fiddler自己端口号

待续

分类:

技术点:

相关文章:

  • 2021-11-24
  • 2022-12-23
  • 2021-12-22
  • 2021-12-22
  • 2022-12-23
  • 2022-03-09
猜你喜欢
  • 2021-07-05
  • 2022-12-23
  • 2019-01-26
  • 2021-11-21
  • 2021-09-12
  • 2022-01-27
相关资源
相似解决方案