1. 相关概念
1. selenium模块
是一个基于浏览器自动化的模块
2. 与爬虫之间的关联
便捷的捕获到动态加载到的数据(可见即可得)
实现模拟登陆
3.环境安装
pip3 install selenium
简单演示
# Simple demo: automate Baidu's search settings, then run a search.
from selenium import webdriver
from time import sleep

# Path of the browser driver; an r'' prefix would prevent escape interpretation.
driver = webdriver.Chrome(r'chromedriver.exe')

# Open the Baidu home page with get().
driver.get("http://www.baidu.com")

# Find the "设置" (settings) link on the page and click it.
driver.find_elements_by_link_text('设置')[0].click()
sleep(2)

# Inside settings, open "搜索设置" (search settings) to set 50 results per page.
driver.find_elements_by_link_text('搜索设置')[0].click()
sleep(2)

# Select "50 results per page": the 3rd <option> of the #nr <select>.
m = driver.find_element_by_id('nr')
sleep(2)
# BUG FIX: the original clicked option[3] twice (once through an absolute
# XPath, once through a relative one); a single relative click is enough.
m.find_element_by_xpath('.//option[3]').click()
sleep(2)

# Click the save button.
driver.find_elements_by_class_name("prefpanelgo")[0].click()
sleep(2)

# Handle the confirmation alert: accept() confirms, dismiss() cancels.
# BUG FIX: switch_to_alert() is deprecated; switch_to.alert is the current API.
driver.switch_to.alert.accept()
sleep(2)

# Find the search input box and type the keyword 美女.
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)

# Click the search button.
driver.find_element_by_id('su').click()
sleep(2)

# In the results page, open the "美女_百度图片" link.
driver.find_elements_by_link_text('美女_百度图片')[0].click()
sleep(3)

# Close the browser.
driver.quit()
2.基本使用
# Basic usage: open JD, search a keyword, scroll to the bottom, grab the HTML.
from time import sleep
from selenium import webdriver

bro = webdriver.Chrome(executable_path="chromedriver.exe")

# BUG FIX: the original URL was "https://www:jd.com/" — a colon instead of a
# dot makes the host invalid and the navigation fails.
bro.get("https://www.jd.com/")
sleep(2)

# Locate the search input box and type the keyword.
search_input = bro.find_element_by_id("key")
search_input.send_keys("苹果")

# Locate and click the search button.
btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
btn.click()
sleep(2)

# Execute JS to scroll to the page bottom (triggers lazy-loaded content).
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)

# Grab the rendered page source, including dynamically loaded data.
page_text = bro.page_source
print(page_text)

# Quit the browser session.
bro.quit()
# Crawl four pages of http://125.35.6.84:81/xk/ by clicking "next page" in the
# browser, then parse every collected page source with lxml.
from time import sleep
from selenium import webdriver
from lxml import etree

bro = webdriver.Chrome(executable_path="chromedriver.exe")
bro.get("http://125.35.6.84:81/xk/")
sleep(2)

# Keep the first page, then three more pages via the next-page button.
page_text_list = [bro.page_source]
for _ in range(3):
    bro.find_element_by_id("pageIto_next").click()  # click "next page"
    sleep(2)
    page_text_list.append(bro.page_source)

# Parse every collected page: each company is an <li> under #gzlist.
for page_text in page_text_list:
    tree = etree.HTML(page_text)
    for lis in tree.xpath('//ul[@id="gzlist"]/li'):
        title = lis.xpath('./dl/@title')[0]
        num = lis.xpath('./ol/@title')[0]

sleep(2)
bro.quit()
3.动作链
一系列连续的动作
在实现标签定位时,如果发现定位的标签是存在于iframe标签中的,则在定位时必须执行一个固定的操作:bro.switch_to.frame('id')
# Action chains demo: drag a <div> that lives inside an iframe.
# NOTE: when the target element is inside an <iframe>, you must first call
# bro.switch_to.frame('<iframe id>') before locating it.
from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

# The draggable element sits inside the iframe with id "iframeResult".
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_id('draggable')

# Dragging = click-and-hold + a series of small moves.
action = ActionChains(bro)
action.click_and_hold(div_tag)
for i in range(5):
    # perform() executes the queued actions immediately.
    action.move_by_offset(17, 5).perform()
    sleep(0.5)

# BUG FIX: release() only queues the action; without perform() the mouse
# button was never actually released in the original.
action.release().perform()
sleep(3)
bro.quit()
4.模拟12306登录
# Cjy.py
import requests
from hashlib import md5


class Chaojiying_Client(object):
    """Minimal HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """Store the credentials; the password is transmitted as an MD5 digest."""
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Parameters common to every API call.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Submit an image for recognition and return the decoded JSON reply.

        im: raw image bytes.
        codetype: captcha category code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognized captcha; im_id is the image ID to flag."""
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers)
        return r.json()
模拟登陆
# Simulated 12306 login: screenshot the page, crop out the captcha image,
# send it to Chaojiying for recognition, then click the returned coordinates.
# BUG FIX: the original imported ActionChains twice; one import removed.
from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from Cjy import Chaojiying_Client


def get_text(imgPath, imgType):
    """Send the image at imgPath to Chaojiying and return the 'pic_str' answer."""
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    im = open(imgPath, 'rb').read()
    return chaojiying.PostPic(im, imgType)['pic_str']


bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(5)

# Screenshot the whole page, then crop out just the captcha <img> element.
bro.save_screenshot('main.png')
code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_tag.location
size = code_img_tag.size
# Crop box is (left, upper, right, lower) in page-pixel coordinates.
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']), int(location['y'] + size['height']))
i = Image.open('./main.png')
frame = i.crop(rangle)
frame.save('code.png')

# Type 9004 returns "x1,y1|x2,y2|..." for multi-click captchas, or a single
# "x,y" pair. Splitting on '|' handles both cases, so the original's separate
# if/else branches were redundant and have been merged into one loop.
result = get_text('./code.png', 9004)
all_list = []
for pair in result.split('|'):
    x, y = pair.split(',')
    all_list.append([int(x), int(y)])
print(all_list)

# Click each answer coordinate, offset from the captcha image element.
for x, y in all_list:
    ActionChains(bro).move_to_element_with_offset(code_img_tag, x, y).click().perform()
    sleep(1)

bro.find_element_by_id('username').send_keys('123456')
sleep(1)
bro.find_element_by_id('password').send_keys('67890000000')
sleep(1)
bro.find_element_by_id('loginSub').click()
sleep(5)
bro.quit()
# Scrape videos from pearvideo.com category 1: for each list item, open its
# detail page, regex out the real mp4 URL, then download and save the file.
import requests
from lxml import etree
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)

for li in tree.xpath('//*[@id="listvideoListUl"]/li'):
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'

    # The real video URL is embedded in an inline script on the detail page.
    detail_page_text = requests.get(detail_url, headers=headers).text
    video_url = re.findall('srcUrl="(.*?)",vdoUrl', detail_page_text, re.S)[0]

    # Download the binary video data and write it to disk under the title.
    video_data = requests.get(video_url, headers=headers).content
    with open(title, 'wb') as fp:
        fp.write(video_data)
5. 移动端数据的爬取
1. fiddler是一款抓包工具,代理服务器
- 青花瓷
- mitmproxy
- 配置:让其可以抓取https协议的请求
- tools -> options -> https -> 安装证书
- 将手机和fiddler所在的电脑处在同一个网段下(pc开启wifi,手机连接)
- 在手机中访问fiddler的ip+port:192.168.14.110:50816,在当前页面中点击对应的连接下载证书
- 在手机中安装且信任证书
- 设置手机网络的代理:开启代理==》fiddler对应pc端的ip地址和fiddler自己端口号