1 selenium的使用
1.0 基本使用
from selenium import webdriver
import time

# Launch Chrome driven by the local chromedriver binary.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
# Implicit wait: every element lookup retries for up to 10 s before failing.
bro.implicitly_wait(10)
bro.get('https://www.baidu.com/')

# sub_button = bro.find_element_by_css_selector('#s-top-loginbtn')
# Prefer the id locator whenever the element has an id.
sub_button = bro.find_element_by_id('s-top-loginbtn')
sub_button.click()

# Switch to the username/password login form.
user_btn = bro.find_element_by_xpath('//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
# user_btn = bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
user_btn.click()

username = bro.find_element_by_id('TANGRAM__PSP_11__userName')
password = bro.find_element_by_id('TANGRAM__PSP_11__password')
# Type into the input boxes.
username.send_keys('6666666@qq.com')
password.send_keys('lqz12345')

submit_btn = bro.find_element_by_id('TANGRAM__PSP_11__submit')
time.sleep(3)
submit_btn.click()
time.sleep(3)
bro.close()
1.1 无头浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')  # browser resolution
chrome_options.add_argument('--disable-gpu')  # recommended by Chrome docs to work around rendering bugs
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars for special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading for speed
chrome_options.add_argument('--headless')  # no visible UI; required on Linux without a display server

driver = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_options)
driver.get('https://www.baidu.com')
print(driver.page_source)
driver.close()
1.2 获取元素位置,属性,大小
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://kyfw.12306.cn/otn/resources/login.html')
driver.implicitly_wait(10)

user_login = driver.find_element_by_css_selector('.login-hd-account>a')
user_login.click()
time.sleep(2)

img = driver.find_element_by_id('J-loginImg')
print(img)
print(img.id)  # selenium's internal element id (not the HTML id attribute)
print(img.tag_name)  # tag name
print('-----')
print(img.location)  # position of the img element on the page
print(img.size)  # size of the img element
# Read HTML attributes.
# print(img.get_attribute('src'))
print(img.get_attribute('class'))
driver.close()
1.3 等待元素被加载
from selenium import webdriver

# Two waiting strategies:
#  - explicit wait: wait for one specific element/condition
#  - implicit wait: a single statement that applies to every element lookup
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
'''
# 两种等待方式
# 显示等待(忽略掉)
wait=WebDriverWait(driver,10)
wait.until(EC.presence_of_element_located((By.ID,'content_left')))
contents=browser.find_element(By.CSS_SELECTOR,'#content_left')
# 隐式等待:
-driver.implicitly_wait(10)
-driver.find_element_by_css_selector()
-只需要写一句话,等待所有要获取的标签
'''
driver.implicitly_wait(10)
print(driver.page_source)
# Subsequent lookups retry until the element appears, up to 10 s.
driver.close()
1.4 元素操作
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
driver.implicitly_wait(10)

## click, clear and type operations
input_search = driver.find_element_by_id('kw')
input_search.send_keys('美女')  # type into the box
time.sleep(3)
input_search.clear()  # clear the box
time.sleep(2)
input_search.send_keys('性感美女')
time.sleep(2)
btn = driver.find_element_by_id('su')
btn.click()  # click the search button
time.sleep(10)
driver.close()
1.5 执行js
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('http://127.0.0.1:8000/')
driver.implicitly_wait(10)
# execute_script runs arbitrary JavaScript in the page context.
driver.execute_script("name='egon';")
driver.execute_script("alert(name)")
time.sleep(5)
# driver.close()
1.6 切换选项卡
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')  # open a new tab
print(browser.window_handles)  # handles of all open tabs
# browser.switch_to_window(...) is the deprecated spelling; use switch_to.window
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
1.7 模拟前进后退
import time
from selenium import webdriver

browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')
browser.back()  # like the browser's Back button
time.sleep(3)
browser.forward()  # and Forward again
browser.close()
1.8 异常处理
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
except Exception as e:
    print(e)
finally:
    # Always release the browser, even when navigation fails.
    browser.close()
1.9 selenium登录cnblogs获取cookie
# Log in to cnblogs with selenium and persist the session cookies to disk.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException
import time
import json

browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.implicitly_wait(10)

#### Login flow (run once to capture cookies)
# try:
#     browser.get('http://www.cnblogs.com')
#     submit_btn = browser.find_element_by_link_text('登录')  # link text of the <a> tag
#     submit_btn.click()
#
#     username = browser.find_element_by_id('mat-input-0')
#     password = browser.find_element_by_id('mat-input-1')
#     username.send_keys('616564099@qq.com')
#     password.send_keys('1111')
#     input('等会')
#     sub_btn = browser.find_element_by_css_selector('body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button > span.mat-button-wrapper')
#     sub_btn.click()
#
#     # Human-in-the-loop: solve the slider captcha manually.
#     input('等会')
#
#     # Grab the cookies of the logged-in session.
#     print(browser.get_cookies())
#
#     with open('cookie.json', 'w') as f:
#         json.dump(browser.get_cookies(), f)
#
# except Exception as e:
#     print(e)
# finally:
#     browser.close()

### Skip logging in: load the saved cookies into the browser instead
# browser.get('http://www.cnblogs.com')
# with open('cookie.json', 'r') as f:
#     cookie = json.load(f)
# time.sleep(5)
# # add_cookie() takes one dict per cookie; the json file holds a list, so loop.
# for item in cookie:
#     browser.add_cookie(item)
#
# browser.refresh()  # reload so the cookies take effect
#
# time.sleep(5)
# browser.close()
1.10 抽屉半自动点赞
from selenium import webdriver
import json
import time

#### Login flow (run once to capture cookies)
# bro = webdriver.Chrome(executable_path='chromedriver.exe')
# bro.implicitly_wait(10)
# bro.get('https://dig.chouti.com/')
# try:
#     sub_btn = bro.find_element_by_id('login_btn')
#     print(sub_btn)
#
#     # sub_btn.click() raises an error here; trigger the click via JS instead.
#     bro.execute_script('arguments[0].click();', sub_btn)
#
#     username = bro.find_element_by_css_selector('div.input-item>input.login-phone')
#     username.send_keys('18953675221')
#     password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
#     password.send_keys('lqz123')
#
#     time.sleep(3)
#     btn = bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#     btn.click()
#
#     input('等')
#
#     with open('chouti.json', 'w') as f:
#         json.dump(bro.get_cookies(), f)
#
# finally:
#     bro.close()

import requests

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')
# Scroll to the bottom of the page so lazily-loaded items appear.
bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# bro.find_elements_by_css_selector('.link-item')

# Rebuild a name -> value cookie dict in the shape requests expects.
cookie = {}
with open('chouti.json', 'r') as f:
    res = json.load(f)
for item in res:
    cookie[item['name']] = item['value']
print(cookie)

div = bro.find_element_by_class_name('link-con')
time.sleep(2)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
# Use a distinct loop variable so the container element `div` is not shadowed.
article_list = div.find_elements_by_class_name('link-item')
for article in article_list:
    article_id = article.get_attribute('data-id')
    print(article_id)
    # Upvote through the HTTP API, reusing the browser's login cookies.
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': article_id}, cookies=cookie, headers=header)
    print(res.text)
bro.close()
2 打码平台使用
# 人工破解
# 图像识别模块---》数字,字母组合
# 验证码破解平台---》云打码,超级鹰
-给它一张图片---》结果返回 (收费的)
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Client for the Chaojiying captcha-recognition web API."""

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        # The API expects the MD5 hex digest of the password, not the plain text.
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        # Common POST fields sent with every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Submit a captcha image for recognition and return the JSON response.

        im: raw image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly-solved captcha (for a refund) and return the JSON response.

        im_id: image ID of the captcha whose answer was wrong
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()
if __name__ == '__main__':
    # Replace 903641 with the software ID generated in your user center.
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    # Local image path; on Windows double slashes may be required.
    im = open('a.jpg', 'rb').read()
    # 1902 is the captcha type; see the price list on the official site.
    print(chaojiying.PostPic(im, 1902))
3 xpath使用
1 一门在html中查找数据的语言
2 记住的语法:
/ 取当前路径下的xx
// 取所有路径下的xx
. 当前路径
.. 上一层
@ 取属性
4 lxml解析模块提供的xpath
# Sample HTML document used by the xpath examples below.
doc = '''
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html' name='sss'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html' name='lqz'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
<a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
<a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
<a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
</div>
</body>
</html>
'''
from lxml import etree

# Parse the sample document defined above.
html = etree.HTML(doc)
# res = html.xpath('//body')
# print(res)
# 1 all nodes
# a = html.xpath('//*')
# 2 a specific tag (result is a list)
# a = html.xpath('//head')
# 3 children and descendants
# a = html.xpath('//div/a')
# a = html.xpath('//body//a')
# 4 parent node
# a = html.xpath('//body//a[@href="image1.html"]/..')
# a = html.xpath('//body//a')
# a = html.xpath('//body//a[@href="image1.html"]')
# a = html.xpath('//body//a[1]/..')
# equivalently, via the parent axis:
# a = html.xpath('//body//a[1]/parent::*')
# a = html.xpath('//body//a[1]/parent::p')
# 5 attribute match
# a = html.xpath('//a[@href="image1.html"]')
# a = html.xpath('//a[@name="sss"]')
# 6 text content: text()
# a = html.xpath('//a[@href="image1.html"]/text()')
# a = html.xpath('//a/text()')
# 7 attribute values
# a = html.xpath('//a/@href')
# a = html.xpath('//a[1]/@name')
# NOTE: xpath positions start at 1, not 0
# a = html.xpath('//body//a[2]/@href')
# 8 multi-valued class attribute
# a plain @class match fails when the tag has several classes; use contains()
# a = html.xpath('//a[@class="li"]')
# a = html.xpath('//a[contains(@class,"li")]')
# a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 multiple attribute conditions
# a = html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a = html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 positional selection
# a = html.xpath('//a[2]/text()')
# a = html.xpath('//a[2]/@href')
# a = html.xpath('//a[2]/@name')
# the last one:
# a = html.xpath('//a[last()]/@href')
# position less than 3:
# a = html.xpath('//a[position()<3]/@href')
# third from the end (last()-1 would be the second from the end):
# a = html.xpath('//a[last()-2]/@href')
# 11 axes
# ancestor:: all ancestor nodes
# a = html.xpath('//a/ancestor::*')
# only the div among the ancestors:
# a = html.xpath('//a/ancestor::div')
# attribute:: attribute values
# a = html.xpath('//a[1]/attribute::*')
# child:: direct children
# a = html.xpath('//a[1]/child::*')
# a = html.xpath('//a[1]/child::img/@src')
# descendant:: all descendants
# a = html.xpath('//a[6]/descendant::*')
# following:: every node after the current one
# a = html.xpath('//a[1]/following::*')
# a = html.xpath('//a[1]/following::*[1]/@href')
# following-sibling:: later siblings only
# a = html.xpath('//a[1]/following-sibling::*')
# a = html.xpath('//a[1]/following-sibling::a')
# a = html.xpath('//a[1]/following-sibling::*[2]/text()')
# Keep one example live so the print below has a defined name to show.
a = html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)