1 selenium的使用
1.0 基本使用
from selenium import webdriver
import time

# Launch Chrome driven by the local chromedriver binary.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
# Implicit wait: every element lookup retries for up to 10 s before failing.
bro.implicitly_wait(10)
bro.get('https://www.baidu.com/')

# sub_button = bro.find_element_by_css_selector('#s-top-loginbtn')
# Prefer the id locator whenever the element has an id.
sub_button = bro.find_element_by_id('s-top-loginbtn')
sub_button.click()

# Switch to the username/password login form.
user_btn = bro.find_element_by_xpath('//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
# user_btn = bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
user_btn.click()

username = bro.find_element_by_id('TANGRAM__PSP_11__userName')
password = bro.find_element_by_id('TANGRAM__PSP_11__password')
# Type into the input boxes.
username.send_keys('6666666@qq.com')
password.send_keys('lqz12345')

submit_btn = bro.find_element_by_id('TANGRAM__PSP_11__submit')
time.sleep(3)
submit_btn.click()
time.sleep(3)
bro.close()
1.1 无头浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')  # browser resolution
chrome_options.add_argument('--disable-gpu')  # recommended by Chrome docs to work around rendering bugs
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars for special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading for speed
chrome_options.add_argument('--headless')  # no visible UI; required on Linux without a display server

driver = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_options)
driver.get('https://www.baidu.com')
print(driver.page_source)
driver.close()
1.2 获取元素位置,属性,大小
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://kyfw.12306.cn/otn/resources/login.html')
driver.implicitly_wait(10)

user_login = driver.find_element_by_css_selector('.login-hd-account>a')
user_login.click()
time.sleep(2)

img = driver.find_element_by_id('J-loginImg')
print(img)
print(img.id)  # selenium's internal element id (not the HTML id attribute)
print(img.tag_name)  # tag name
print('-----')
print(img.location)  # position of the img element on the page
print(img.size)  # size of the img element
# Read HTML attributes.
# print(img.get_attribute('src'))
print(img.get_attribute('class'))
driver.close()
1.3 等待元素被加载
from selenium import webdriver

# Two waiting strategies:
#  - explicit wait: wait for one specific element/condition
#  - implicit wait: a single statement that applies to every element lookup
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
'''
# 两种等待方式
# 显示等待(忽略掉)
wait=WebDriverWait(driver,10)
wait.until(EC.presence_of_element_located((By.ID,'content_left')))
contents=browser.find_element(By.CSS_SELECTOR,'#content_left')
# 隐式等待:
-driver.implicitly_wait(10)
-driver.find_element_by_css_selector()
-只需要写一句话,等待所有要获取的标签
'''
driver.implicitly_wait(10)
print(driver.page_source)
# Subsequent lookups retry until the element appears, up to 10 s.
driver.close()
1.4 元素操作
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
driver.implicitly_wait(10)

## click, clear and type operations
input_search = driver.find_element_by_id('kw')
input_search.send_keys('美女')  # type into the box
time.sleep(3)
input_search.clear()  # clear the box
time.sleep(2)
input_search.send_keys('性感美女')
time.sleep(2)
btn = driver.find_element_by_id('su')
btn.click()  # click the search button
time.sleep(10)
driver.close()
1.5 执行js
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('http://127.0.0.1:8000/')
driver.implicitly_wait(10)
# execute_script runs arbitrary JavaScript in the page context.
driver.execute_script("name='egon';")
driver.execute_script("alert(name)")
time.sleep(5)
# driver.close()
1.6 切换选项卡
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')  # open a new tab
print(browser.window_handles)  # handles of all open tabs
# browser.switch_to_window(...) is the deprecated spelling; use switch_to.window
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
1.7 模拟前进后退
import time
from selenium import webdriver

browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')
browser.back()  # like the browser's Back button
time.sleep(3)
browser.forward()  # and Forward again
browser.close()
1.8 异常处理
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
except Exception as e:
    print(e)
finally:
    # Always release the browser, even when navigation fails.
    browser.close()
1.9 selenium登录cnblogs获取cookie
# Log in to cnblogs with selenium and persist the session cookies to disk.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException
import time
import json

browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.implicitly_wait(10)

#### Login flow (run once to capture cookies)
# try:
#     browser.get('http://www.cnblogs.com')
#     submit_btn = browser.find_element_by_link_text('登录')  # link text of the <a> tag
#     submit_btn.click()
#
#     username = browser.find_element_by_id('mat-input-0')
#     password = browser.find_element_by_id('mat-input-1')
#     username.send_keys('616564099@qq.com')
#     password.send_keys('1111')
#     input('等会')
#     sub_btn = browser.find_element_by_css_selector('body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button > span.mat-button-wrapper')
#     sub_btn.click()
#
#     # Human-in-the-loop: solve the slider captcha manually.
#     input('等会')
#
#     # Grab the cookies of the logged-in session.
#     print(browser.get_cookies())
#
#     with open('cookie.json', 'w') as f:
#         json.dump(browser.get_cookies(), f)
#
# except Exception as e:
#     print(e)
# finally:
#     browser.close()

### Skip logging in: load the saved cookies into the browser instead
# browser.get('http://www.cnblogs.com')
# with open('cookie.json', 'r') as f:
#     cookie = json.load(f)
# time.sleep(5)
# # add_cookie() takes one dict per cookie; the json file holds a list, so loop.
# for item in cookie:
#     browser.add_cookie(item)
#
# browser.refresh()  # reload so the cookies take effect
#
# time.sleep(5)
# browser.close()
1.10 抽屉半自动点赞
from selenium import webdriver
import json
import time

#### Login flow (run once to capture cookies)
# bro = webdriver.Chrome(executable_path='chromedriver.exe')
# bro.implicitly_wait(10)
# bro.get('https://dig.chouti.com/')
# try:
#     sub_btn = bro.find_element_by_id('login_btn')
#     print(sub_btn)
#
#     # sub_btn.click() raises an error here; trigger the click via JS instead.
#     bro.execute_script('arguments[0].click();', sub_btn)
#
#     username = bro.find_element_by_css_selector('div.input-item>input.login-phone')
#     username.send_keys('18953675221')
#     password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
#     password.send_keys('lqz123')
#
#     time.sleep(3)
#     btn = bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#     btn.click()
#
#     input('等')
#
#     with open('chouti.json', 'w') as f:
#         json.dump(bro.get_cookies(), f)
#
# finally:
#     bro.close()

import requests

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')
# Scroll to the bottom of the page so lazily-loaded items appear.
bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# bro.find_elements_by_css_selector('.link-item')

# Rebuild a name -> value cookie dict in the shape requests expects.
cookie = {}
with open('chouti.json', 'r') as f:
    res = json.load(f)
for item in res:
    cookie[item['name']] = item['value']
print(cookie)

div = bro.find_element_by_class_name('link-con')
time.sleep(2)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
# Use a distinct loop variable so the container element `div` is not shadowed.
article_list = div.find_elements_by_class_name('link-item')
for article in article_list:
    article_id = article.get_attribute('data-id')
    print(article_id)
    # Upvote through the HTTP API, reusing the browser's login cookies.
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': article_id}, cookies=cookie, headers=header)
    print(res.text)
bro.close()
2 打码平台使用
# 人工破解
# 图像识别模块---》数字,字母组合
# 验证码破解平台---》云打码,超级鹰
-给它一张图片---》结果返回 (收费的)
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Client for the Chaojiying captcha-recognition web API."""

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        # The API expects the MD5 hex digest of the password, not the plain text.
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        # Common POST fields sent with every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Submit a captcha image for recognition and return the JSON response.

        im: raw image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly-solved captcha (for a refund) and return the JSON response.

        im_id: image ID of the captcha whose answer was wrong
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()
if __name__ == '__main__':
    # Replace 903641 with the software ID generated in your user center.
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    # Local image path; on Windows double slashes may be required.
    im = open('a.jpg', 'rb').read()
    # 1902 is the captcha type; see the price list on the official site.
    print(chaojiying.PostPic(im, 1902))
3 xpath使用
1 一门在html中查找数据的语言
2 记住的语法:
/ 取当前路径下的xx
// 取所有路径下的xx
. 当前路径
.. 上一层
@ 取属性
4 lxml解析模块提供的xpath
# Sample HTML document used by the xpath examples below.
doc = '''
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html' name='sss'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html' name='lqz'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
<a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
<a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
<a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
</div>
</body>
</html>
'''
from lxml import etree

# Parse the sample document defined above.
html = etree.HTML(doc)
# res = html.xpath('//body')
# print(res)
# 1 all nodes
# a = html.xpath('//*')
# 2 a specific tag (result is a list)
# a = html.xpath('//head')
# 3 children and descendants
# a = html.xpath('//div/a')
# a = html.xpath('//body//a')
# 4 parent node
# a = html.xpath('//body//a[@href="image1.html"]/..')
# a = html.xpath('//body//a')
# a = html.xpath('//body//a[@href="image1.html"]')
# a = html.xpath('//body//a[1]/..')
# equivalently, via the parent axis:
# a = html.xpath('//body//a[1]/parent::*')
# a = html.xpath('//body//a[1]/parent::p')
# 5 attribute match
# a = html.xpath('//a[@href="image1.html"]')
# a = html.xpath('//a[@name="sss"]')
# 6 text content: text()
# a = html.xpath('//a[@href="image1.html"]/text()')
# a = html.xpath('//a/text()')
# 7 attribute values
# a = html.xpath('//a/@href')
# a = html.xpath('//a[1]/@name')
# NOTE: xpath positions start at 1, not 0
# a = html.xpath('//body//a[2]/@href')
# 8 multi-valued class attribute
# a plain @class match fails when the tag has several classes; use contains()
# a = html.xpath('//a[@class="li"]')
# a = html.xpath('//a[contains(@class,"li")]')
# a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 multiple attribute conditions
# a = html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a = html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a = html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 positional selection
# a = html.xpath('//a[2]/text()')
# a = html.xpath('//a[2]/@href')
# a = html.xpath('//a[2]/@name')
# the last one:
# a = html.xpath('//a[last()]/@href')
# position less than 3:
# a = html.xpath('//a[position()<3]/@href')
# third from the end (last()-1 would be the second from the end):
# a = html.xpath('//a[last()-2]/@href')
# 11 axes
# ancestor:: all ancestor nodes
# a = html.xpath('//a/ancestor::*')
# only the div among the ancestors:
# a = html.xpath('//a/ancestor::div')
# attribute:: attribute values
# a = html.xpath('//a[1]/attribute::*')
# child:: direct children
# a = html.xpath('//a[1]/child::*')
# a = html.xpath('//a[1]/child::img/@src')
# descendant:: all descendants
# a = html.xpath('//a[6]/descendant::*')
# following:: every node after the current one
# a = html.xpath('//a[1]/following::*')
# a = html.xpath('//a[1]/following::*[1]/@href')
# following-sibling:: later siblings only
# a = html.xpath('//a[1]/following-sibling::*')
# a = html.xpath('//a[1]/following-sibling::a')
# a = html.xpath('//a[1]/following-sibling::*[2]/text()')
# Keep one example live so the print below has a defined name to show.
a = html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)