# Scrape python job listings from lagou.com's Ajax endpoint.
# Example of the endpoint URL with a city filter:
# https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false
import requests

# The Ajax URL that is actually scraped.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

# Form fields sent with the POST; 'kd' carries the search keyword.
# NOTE(review): 'first' and 'pn' look like "first request" / page number - confirm.
payload = {
    'first': 'true',
    'pn': '1',
    'kd': 'python',
}

# Browser-like headers sent with both requests below.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Accept': 'application/json, text/javascript, */*; q=0.01'
}

# The ordinary search page; it is requested first to obtain cookies.
urls = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='

# Use the session as a context manager so its pooled connections are
# released when the two requests are done (the original never closed it).
with requests.Session() as s:
    # Fetch the search page so the session collects its cookies.
    s.get(urls, headers=header, timeout=3)
    # Cookies obtained by the request above.
    cookie = s.cookies
    # POST the search form with those cookies and keep the response body.
    response = s.post(url, data=payload, headers=header, cookies=cookie, timeout=5).text
print(response)
爬取红楼梦小说
2 爬红楼梦小说
# Download every chapter listed on the table-of-contents page below and
# write chapter titles and texts, in order, into hlm.txt.
# http://www.shicimingju.com/book/hongloumeng.html
import requests
from bs4 import BeautifulSoup

# Without a timeout, requests waits forever on a stalled server; bound every call.
ret = requests.get('https://www.shicimingju.com/book/hongloumeng.html', timeout=10)
# print(ret.text)
soup = BeautifulSoup(ret.text, 'lxml')
# Each <li> under the 'book-mulu' element holds one chapter link.
li_list = soup.find(class_='book-mulu').find('ul').find_all('li')
with open('hlm.txt', 'w', encoding='utf-8') as f:
    for li in li_list:
        # Look the anchor up once; it supplies both the title and the href.
        link = li.find('a')
        content = link.text
        # The href is appended to the site root to form the chapter URL.
        url = 'https://www.shicimingju.com' + link.get('href')
        f.write(content)
        f.write('\n')
        res_content = requests.get(url, timeout=10)
        soup2 = BeautifulSoup(res_content.text, 'lxml')
        # The chapter body is the text of the 'chapter_content' element.
        content_detail = soup2.find(class_='chapter_content').text
        f.write(content_detail)
        f.write('\n')
        print(content, '写入了')
爬取肯德基门店
3 爬肯德基门店
# Query the KFC store-list endpoint by keyword and print the JSON reply.
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
import requests

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

# Form fields for the search.
# NOTE(review): 'pid' meaning is not visible here; 'pageIndex'/'pageSize'
# presumably select the result page and its size - confirm against the site.
data = {
    'cname': '',
    'pid': 20,
    'keyword': '浦东',
    'pageIndex': 1,
    'pageSize': 10
}

# Bound the wait so a stalled server cannot hang the script indefinitely.
ret = requests.post(
    'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
    data=data,
    headers=header,
    timeout=10,
)
print(ret.json())
爬取糗事百科
4 爬糗事百科段子
# Print the text of every joke on one listing page of qiushibaike.com.
# https://www.qiushibaike.com/text/page/2/
import requests
from bs4 import BeautifulSoup

# Bound the wait so a stalled server cannot hang the script indefinitely.
ret = requests.get('https://www.qiushibaike.com/text/page/2/', timeout=10)
# print(ret.text)
soup = BeautifulSoup(ret.text, 'html.parser')
# Every element with class 'article' is treated as one joke entry.
article_list = soup.find_all(class_='article')
# print(article_list)
for article in article_list:
    # The joke text is taken from the entry's 'content' element.
    content = article.find(class_='content').text
    print(content)
    print('-------')
1 打码平台使用
1 免费破解验证码:针对数字、字母组合的验证码,识别率一般
2 付费破解验证码:复杂的验证码交给打码平台识别,例如:
- 云打码
- 超级鹰(下文以它为例,破解12306的验证码)
2 selenium登录获得cookie,使用requests发送请求
# Step 1 (kept commented out; run it once to create cookie.json):
# log in to dig.chouti.com through Selenium and dump the browser cookies
# to a file. The credentials are redacted here - supply your own and never
# commit real ones.
# from selenium import webdriver
# import time
import json
# bro=webdriver.Chrome(executable_path='chromedriver.exe')
#
# bro.get('https://dig.chouti.com/')
# bro.maximize_window()  # maximize the window
# bro.implicitly_wait(10)
# login_btn=bro.find_element_by_link_text('登录')
# login_btn.click()
#
# name_input=bro.find_element_by_name('phone')
# password_input=bro.find_element_by_name('password')
# name_input.send_keys('<phone>')
# password_input.send_keys('<password>')
#
# login_real_btn=bro.find_element_by_css_selector('button.login-btn')
# login_real_btn.click()
#
# # a captcha may appear
# time.sleep(10)
#
# cookie=bro.get_cookies()
#
# print(cookie)
# with open('cookie.json','w') as f:
#     json.dump(cookie,f)
#
#
# print('cookie写入到文件中了')

# Step 2: automatic upvoting. Fetch the hot list, load the cookies saved
# in cookie.json, and post one vote per listed link.
import requests
from requests.cookies import RequestsCookieJar

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    'Referer': 'https://dig.chouti.com/',
    # 'Cookie': '<raw Cookie header copied from a logged-in browser>'  (redacted: it held a live session token)
}

# Bound every request so a stalled server cannot hang the script.
res = requests.get('https://dig.chouti.com/link/hot', headers=header, timeout=10)
# Parse the body once and reuse it for both the printout and the loop.
hot_links = res.json()
print(hot_links)

jar = RequestsCookieJar()
# Read the saved cookies from the file and copy them into the jar.
with open('cookie.json', 'r') as f:
    cookie_l = json.load(f)
for cookie in cookie_l:
    jar.set(cookie['name'], cookie['value'])
# cookie={}

for item in hot_links['data']:
    # Named link_id so the builtin id() is not shadowed.
    link_id = item['id']
    print(link_id)
    # Upvote; this request needs the login cookies.
    data = {
        'linkId': link_id
    }
    # A separate name keeps `res` (the hot-list response being iterated) intact.
    vote_res = requests.post('https://dig.chouti.com/link/vote', headers=header, cookies=jar, data=data, timeout=10)
    # vote_res = requests.post('https://dig.chouti.com/link/vote', headers=header, data=data)
    print(vote_res.text)
3 自动登录12306
# Vote for one fixed link using the cookies stored in cookie.json.
from selenium import webdriver
import time
# Commented-out helper that produces cookie.json from a Selenium browser:
# import json
# bro=webdriver.Chrome(executable_path='./chromedriver')
# bro.get('https://dig.chouti.com/')
# # time.sleep(20)
# cookie=bro.get_cookies()
# print(cookie)
# with open('cookie.json','w') as f:
#     json.dump(cookie,f)
import requests
import json
from requests.cookies import RequestsCookieJar

# A cookie-jar object carries the saved cookies into the request.
jar = RequestsCookieJar()
with open("cookie.json", "r") as fp:
    cookies = json.load(fp)
for cookie in cookies:
    jar.set(cookie['name'], cookie['value'])

# NOTE(review): the commented helper above collects cookies from
# https://dig.chouti.com/ while the requests below target
# http://dig.chouti.cc/ - confirm which host is intended.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
    'Referer': 'http://dig.chouti.cc/',
}
# The single link to vote for.
data = {
    'linkId': 30268402
}
# Bound the wait so a stalled server cannot hang the script indefinitely.
res = requests.post('http://dig.chouti.cc/link/vote', cookies=jar, headers=header, data=data, timeout=10)
print(res.text)
from selenium import webdriver
import time
#pillow
from PIL import Image
# 引入超级鹰
from chaojiying import Chaojiying_Client
from selenium.webdriver import ActionChains
bro=webdriver.Chrome(executable_path=\'./chromedriver.exe\')
bro.implicitly_wait(10)
try:
bro.get(\'https://kyfw.12306.cn/otn/resources/login.html\')
bro.maximize_window() # 窗口最大化,全屏
button_z=bro.find_element_by_css_selector(\'.login-hd-account a\')
button_z.click()
time.sleep(2)
# 截取整个屏幕
bro.save_screenshot(\'./main.png\')
# 验证码的位置和大小
img_t=bro.find_element_by_id(\'J-loginImg\')
print(img_t.size)
print(img_t.location)
size=img_t.size
location=img_t.location
img_tu = (int(location[\'x\']), int(location[\'y\']), int(location[\'x\'] + size[\'width\']), int(location[\'y\'] + size[\'height\']))
# # 抠出验证码
# #打开
img = Image.open(\'./main.png\')
# 抠图
fram = img.crop(img_tu)
# 截出来的小图
fram.save(\'code.png\')
# 调用超级鹰破解
chaojiying = Chaojiying_Client(\'306334678\', \'lqz12345\', \'903641\') #用户中心>>软件ID 生成一个替换 96001
im = open(\'code.png\', \'rb\').read() #本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
# print(chaojiying.PostPic(im, 9004))
## 返回结果如果有多个 260,133|123,233,处理这种格式[[260,133],[123,233]]
res=chaojiying.PostPic(im, 9004)
print(res)
result=res[\'pic_str\']
all_list = []
if \'|\' in result:
list_1 = result.split(\'|\')
count_1 = len(list_1)
for i in range(count_1):
xy_list = []
x = int(list_1[i].split(\',\')[0])
y = int(list_1[i].split(\',\')[1])
xy_list.append(x)
xy_list.append(