1、验证码的识别
验证码的处理
- 基于线上的打码平台来实现
- 超级鹰的使用流程:
- 基于用户中心的身份进行注册
- 用户中心的身份进行登录:
- 充值
- 创建一个软件:软件ID->生成软件ID
- 下载示例代码:开发文档->Python->下载
示例1:使用打码平台识别古诗文网中的验证码图片
# 超级鹰代码
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    The constructor stores the account data that is sent with every request;
    `PostPic` uploads an image for recognition and `ReportError` reports a
    wrongly recognized image by its ID.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        """Store account data and the request defaults.

        username: account name, sent as the `user` form field.
        password: plain-text password; only its MD5 hex digest is kept and
            sent (as the `pass2` form field).
        soft_id: software ID, sent as the `softid` form field.
        timeout: seconds to wait for the service before `requests` raises a
            timeout error; pass None to wait indefinitely.
        """
        self.username = username
        # Keep only the MD5 hex digest of the UTF-8 encoded password.
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Without a timeout a stalled connection would block the caller forever.
        self.timeout = timeout
        # Form fields shared by every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the service's decoded JSON reply.

        im: image bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        # The image is sent as a multipart file field named `userfile`.
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognized image and return the decoded JSON reply.

        im_id: ID of the image whose recognition result was wrong.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()
# Wrap the captcha-recognition call in a reusable function.
def transformCode(imgPath, imgType):
    """Recognize the captcha image stored at `imgPath` and return its text.

    imgPath: path of the captcha image file.
    imgType: captcha type code forwarded to `Chaojiying_Client.PostPic`.

    Returns the `pic_str` field of the service's JSON reply; a reply without
    that field raises KeyError.
    """
    # NOTE(review): the account credentials and software ID are hard-coded
    # here; consider loading them from configuration instead.
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    # Read through a context manager so the file handle is always closed.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']
# Recognize the captcha shown on the gushiwen.org login page.
# NOTE(review): `headers` and `etree` are not defined in the visible part of
# this file - presumably a request-header dict and `lxml.etree`; confirm they
# are defined before this code runs.
url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
# Build the captcha image address: the site origin is prefixed to the `src`
# attribute of the element with id "imgCode".
code_img_src = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
img_data = requests.get(code_img_src, headers=headers).content
# Save the image to disk because transformCode reads it from a file path.
with open('./code.jpg', 'wb') as fp:
    fp.write(img_data)
# Print the recognized text; a bare expression would be silently discarded
# when this runs as a script.
print(transformCode('./code.jpg', 1902))
2、模拟登录
模拟登录的处理
- 对点击登录按钮对应的url进行请求发送
- 动态变化的请求参数:
- 一般都会被隐藏在前台页面中
- 基于抓包工具对请求参数的名称进行全局搜索
示例1:对古诗文网进行模拟登录
# Simulated login to gushiwen.org.
# One Session object is used for every request so that cookies are handled
# across the whole flow.
# NOTE(review): `headers` and `etree` are not defined in the visible part of
# this file - presumably a request-header dict and `lxml.etree`; confirm.
session = requests.Session()

# Step 1: fetch and parse the login page.
url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
page_text = session.get(url, headers=headers).text
tree = etree.HTML(page_text)

# Step 2: read the two dynamically changing request parameters from the page.
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

# Step 3: download the captcha image through the same session, so the cookie
# issued with the image is captured, then store it on disk.
code_img_src = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
img_data = session.get(code_img_src, headers=headers).content
with open('./code.jpg', 'wb') as fp:
    fp.write(img_data)

# Step 4: turn the captcha image into text.
code_text = transformCode('./code.jpg', 1902)
print(code_text)

# Step 5: send the request that clicking the login button would send.
login_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
data = {
    '__VIEWSTATE': __VIEWSTATE,
    '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
    'from': 'http://so.gushiwen.org/user/collect.aspx',
    'email': 'www.zhangbowudi@qq.com',
    'pwd': 'bobo328410948',
    'code': code_text,
    'denglu': '登录',
}
login_page_text = session.post(login_url, data=data, headers=headers).text

# Step 6: save the returned page so the login result can be inspected.
with open('./gushiwen.html', 'w', encoding='utf-8') as fp:
    fp.write(login_page_text)