代理操作
"""Fetch a page through a randomly chosen proxy and write the response to disk."""
import random
import requests

# Candidate proxy pools, keyed by the scheme they serve.
https = [
    {"https": "120.234.63.196:3128"},
    {"https": "163.204.246.45:9999"},
    {"https": "122.193.245.38:9999"},
    {"https": "112.85.129.7:9999"},
    {"https": "112.87.68.21:9999"},
]
http = [
    {"http": "47.92.233.84:8080"},
    {"http": "163.204.246.221:9999"},
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
url = 'http://httpbin.org/get'

# Pick the proxy pool matching the target URL's scheme.
scheme = url.split(":")[0]
if scheme == "https":
    proxy = random.choice(https)
else:
    proxy = random.choice(http)
page_text = requests.get(url=url, headers=headers, proxies=proxy).text

with open('./ip.html', "w", encoding="utf-8") as fp:
    fp.write(page_text)
boss招聘
"""Scrape Boss Zhipin search results for python-crawler jobs and print each listing."""
import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
url = 'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&city=101010100&industry=&position='

# Parse the listing page and walk every job entry in the result list.
tree = etree.HTML(requests.get(url=url, headers=headers).text)
for li in tree.xpath('//div[@class="job-list"]/ul/li'):
    title = li.xpath('.//div[@class="job-title"]/text()')[0]
    salary = li.xpath('.//span[@class="red"]/text()')[0]
    company = li.xpath('.//div[@class="company-text"]/h3/a/text()')[0]
    print(title, salary, company)
糗事百科下载图片
"""Download every thumbnail image from a user-chosen page range of qiushibaike.com/pic."""
import requests
import os
import re


def csbk():
    """Prompt for a page range and save all images found on those pages to ./images/."""
    url = "https://www.qiushibaike.com/pic/page/%s"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }
    page_start = int(input("enter start page:"))
    page_end = int(input('enter end page:'))
    if not os.path.exists("images"):
        os.makedirs("images")
    # FIX: hoist the regex compile out of the page loop (it was rebuilt each page),
    # and drop the redundant format() wrapper — format() on a str is a no-op.
    pa = re.compile('<div class="thumb">.*?<img src="(.*?)".*?>.*?</div>', re.S)
    for page in range(page_start, page_end + 1):
        print("正在下载第%s页图片" % page)
        new_url = url % page
        response = requests.get(url=new_url, headers=headers)
        for image_url in pa.findall(response.text):
            # Image src values are protocol-relative; prepend the scheme.
            image_url = "https:" + image_url
            image_name = image_url.split('/')[-1]
            image_path = "images/" + image_name
            image_data = requests.get(url=image_url, headers=headers).content
            with open(image_path, "wb") as fp:
                fp.write(image_data)


if __name__ == '__main__':
    csbk()
4k壁纸下载
"""Crawl pages of 4K wallpapers from pic.netbian.com and save each image locally."""
import requests
import os
from lxml import etree
from urllib import request

start_page = int(input("start page num:"))
end_page = int(input("end page num:"))
if not os.path.exists("./meinvs"):
    os.mkdir("./meinvs")

url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}

for page in range(start_page, end_page + 1):
    # Page 1 has its own URL; later pages use the index_N pattern.
    new_url = 'http://pic.netbian.com/4kmeinv/' if page == 1 else url % page
    tree = etree.HTML(requests.get(url=new_url, headers=headers).text)
    for li in tree.xpath('//div[@class="slist"]/ul/li'):
        # The server mislabels its charset, so re-decode the alt text as GBK.
        raw_name = li.xpath('./a/img/@alt')[0]
        img_name = raw_name.encode("iso-8859-1").decode("gbk") + ".jpg"
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        request.urlretrieve(img_src, './meinvs/' + img_name)
        print(img_name, "下载成功!!")
爬取企业详情数据
"""Collect company detail records from the NMPA licensing portal via its POST JSON API."""
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
first_url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"

# Gather company IDs from the first 10 listing pages.
ids = []
for page in range(1, 11):
    data = {
        "on": "true",
        "page": str(page),
        "pageSize": "15",
        "productName": "",
        "conditionType": "1",
        "applyname": "",
        "applysn": "",
    }
    response = requests.post(url=first_url, headers=headers, data=data)
    # Only JSON responses carry the listing; anything else is skipped.
    if response.headers["Content-Type"] == "application/json;charset=UTF-8":
        for dic in response.json()["list"]:
            ids.append(dic["ID"])

# Fetch and print the detail record for each collected ID.
detail_url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById"
for _id in ids:
    print(requests.post(detail_url, data={"id": _id}, headers=headers).text)
豆瓣登录
"""Log in to Douban with form credentials and save the response page to disk."""
import requests
import os

url = 'https://accounts.douban.com/passport/login'
# Form payload for the login endpoint.
data = {
    "source": "movie",
    "redir": "https://movie.douban.com/",
    "form_email": "15027900535",
    "form_password": "bobo@15027900535",
    "login": "登录",
}
# Custom UA so the request is not rejected as an obvious bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
# BUG FIX: headers was defined but never passed to the request, so the
# custom User-Agent above was silently not sent.
response = requests.post(url=url, data=data, headers=headers)
page_text = response.text
with open('./douban111.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
爬取简历模板
"""Download free resume templates from sc.chinaz.com, picking a random mirror per item."""
import requests
from lxml import etree
from urllib import request
import os
import random

# Listing URL pattern:
#   page 1:  http://sc.chinaz.com/jianli/free.html
#   page N:  http://sc.chinaz.com/jianli/free_N.html
start_page = int(input("start page num:"))
end_page = int(input("end page num:"))
if not os.path.exists("./sucai1"):
    os.mkdir("./sucai1")

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Connection": "close",
}
url = 'http://sc.chinaz.com/jianli/free_%d.html'

for page in range(start_page, end_page + 1):
    new_url = 'http://sc.chinaz.com/jianli/free.html' if page == 1 else url % page
    # Decode explicitly as UTF-8 (the response charset header is unreliable).
    listing_html = requests.get(url=new_url, headers=headers).content.decode('utf-8')
    listing = etree.HTML(listing_html)
    for div in listing.xpath('//div[@id="main"]/div[@id="container"]/div'):
        detail_href = div.xpath('./a/@href')[0]
        detail_html = requests.get(url=detail_href, headers=headers).content.decode('utf-8')
        detail = etree.HTML(detail_html)
        title = detail.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')[0]
        # Each template page offers several download mirrors; collect them all.
        mirrors = [
            li.xpath("./a/@href")[0]
            for li in detail.xpath('//div[@class="clearfix mt20 downlist"]/ul/li')
        ]
        save_path = './sucai1/' + title + ".rar"
        request.urlretrieve(random.choice(mirrors), save_path)
        print(title, "下载完成")
雪球网爬取
"""Fetch Xueqiu's public timeline JSON; the session first visits the homepage for cookies."""
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}

# Prime the session with the cookies Xueqiu sets on its landing page;
# the API endpoint rejects cookie-less requests.
session = requests.session()
session.get(url="https://xueqiu.com/", headers=headers)

url = "https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1"
print(session.get(url=url, headers=headers).json())
爬取诗词名句网三国演义
"""Scrape the full text of Romance of the Three Kingdoms from shicimingju.com."""
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
# Each <li> in the table of contents links to one chapter.
li_list = soup.select('.book-mulu > ul > li')

# FIX: use a context manager so the output file is closed even if a
# request or parse fails mid-loop (the original leaked the handle).
with open('./sangou.txt', 'w', encoding="utf-8") as fp:
    for li in li_list:
        title = li.a.string
        detail_url = 'http://www.shicimingju.com' + li.a['href']
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        detail_soup = BeautifulSoup(detail_page_text, "lxml")
        content = detail_soup.find('div', class_="chapter_content").text
        fp.write(title + '\n' + content + '\n')
        print(title, "下载成功")
梨视频下载
"""Download videos from Pearvideo's category page using a thread pool.

BUG FIX: the original saveVideo looped over ALL titles and wrote the same
payload under every title, so each pool.map pass clobbered every file with
one video's bytes. Titles are now zipped with their video URLs and each
worker downloads/saves exactly one (title, data) pair.
"""
import requests
import re
from lxml import etree
import random
from multiprocessing.dummy import Pool

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}


def getVideoDate(item):
    """Download one video; item is (title, src_url) -> returns (title, bytes)."""
    title, src = item
    return title, requests.get(url=src, headers=headers).content


def saveVideo(item):
    """Write one downloaded video to <title>.mp4."""
    title, data = item
    name = title + ".mp4"
    with open(name, "wb") as fp:
        fp.write(data)
    print(name, "下载成功")


url = "https://www.pearvideo.com/category_1"
response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
titles = tree.xpath('//div/a/div[@class="vervideo-title"]/text()')

# Resolve each list entry's real mp4 address from inline JS on its detail page:
#   srcUrl="https://video.pearvideo.com/...mp4",vdoUrl=srcUrl,...
video_urls = []
for li in li_list:
    detail_url = "https://www.pearvideo.com/" + li.xpath('./div/a/@href')[0]
    detail_url_text = requests.get(url=detail_url, headers=headers).text
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_urls.append(re.findall(ex, detail_url_text, re.S)[0])

pool = Pool(4)
downloaded = pool.map(getVideoDate, list(zip(titles, video_urls)))
pool.map(saveVideo, downloaded)
爬取美团自助餐
"""Scrape Meituan buffet ("自助餐") search results and print each shop plus its comments.

Fixes over the original:
  * the inner loop variable shadowed the request-payload name ``data``
  * GET requests carried a redundant ``data=`` body even though every
    parameter is already encoded in the URL query string — dropped
  * the blanket ``except ... pass`` now reports the failure instead of
    silently hiding it (still best-effort: the crawl continues)
"""
import requests

s = requests.Session()
page = int(input("你想查看前几页的数据"))
if page <= 0:
    print("输入有误")

for pg in range(0, page):
    url = "https://apimobile.meituan.com/group/v4/poi/pcsearch/1?uuid=d2309f0d880345e3bd54.1559122435.1.0.0&userid=-1&limit=32&offset={}&cateId=-1&q=%E8%87%AA%E5%8A%A9%E9%A4%90".format(pg * 32)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Referer": "https://bj.meituan.com/s/%E8%87%AA%E5%8A%A9%E9%A4%90/",
    }
    response = s.get(url=url, headers=headers).json()
    for shop in response["data"]["searchResult"]:
        shop_id = shop["id"]
        title = shop["title"]
        comments = shop["comments"]
        print("------------------" + title + "-----------------")
        print("店铺名称:", title)
        print("店铺地址:", shop["address"])
        print("店铺评论数:", comments)
        print("店铺价格:", shop["lowestprice"])
        print("店铺平均价格:", shop["avgprice"])
        # Page through this shop's comments, 10 per request.
        for offset in range(0, int(comments), 10):
            try:
                url_conter = "https://www.meituan.com/meishi/api/poi/getMerchantComment?uuid=d2309f0d880345e3bd54.1559122435.1.0.0&platform=1&partner=126&originUrl=https%3A%2F%2Fwww.meituan.com%2Fmeishi%2F164776375%2F&riskLevel=1&optimusCode=1&id={}&userId=&offset={}&pageSize=10&sortType=1".format(
                    shop_id, offset)
                comment_headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
                    "Referer": "https://www.meituan.com/meishi/{}/".format(shop_id),
                }
                comment_json = s.get(url=url_conter, headers=comment_headers).json()
                for com in comment_json["data"]["comments"]:
                    if com["comment"] != "":
                        print(com["userName"] + ":", com["comment"])
            except Exception as e:
                # Best effort: a failed comment page should not abort the crawl,
                # but the error is no longer swallowed silently.
                print("评论抓取失败:", e)
古诗文识别验证码登录
"""Log in to so.gushiwen.org, solving the captcha via the Yundama recognition service.

BUG FIX: ``__VIEWSTATEGENERATOR`` was extracted with an XPath missing the
trailing ``/@value``, so an lxml Element object (not the hidden field's
value) was posted with the login form, breaking the submission.
"""
import http.client, mimetypes, urllib, json, time, requests


class YDMHttp:
    """Minimal client for the Yundama (yundama.com) captcha-solving HTTP API.

    Negative return values from the API methods are Yundama error codes;
    -9001 is used locally for an empty/invalid response.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=None):
        """POST ``fields`` (and optional ``files``) to the API; return parsed JSON."""
        # FIX: mutable default argument [] replaced with None.
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the remaining account balance, or a negative error code."""
        data = {'method': 'balance', 'username': self.username,
                'password': self.password, 'appid': self.appid,
                'appkey': self.appkey}
        response = self.request(data)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            return response['balance']
        return -9001

    def login(self):
        """Authenticate; return the user id on success, or a negative error code."""
        data = {'method': 'login', 'username': self.username,
                'password': self.password, 'appid': self.appid,
                'appkey': self.appkey}
        response = self.request(data)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            return response['uid']
        return -9001

    def upload(self, filename, codetype, timeout):
        """Upload a captcha image; return its job id (cid) or a negative error code."""
        data = {'method': 'upload', 'username': self.username,
                'password': self.password, 'appid': self.appid,
                'appkey': self.appkey, 'codetype': str(codetype),
                'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            return response['cid']
        return -9001

    def result(self, cid):
        """Return the recognized text for job ``cid``, or '' when not ready."""
        data = {'method': 'result', 'username': self.username,
                'password': self.password, 'appid': self.appid,
                'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        """Upload then poll up to ``timeout`` seconds; return (cid, text).

        Returns (-3003, '') on poll timeout, or (error_code, '') if the
        upload itself failed.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid > 0:
            for _ in range(0, timeout):
                result = self.result(cid)
                if result != '':
                    return cid, result
                time.sleep(1)
            return -3003, ''
        return cid, ''

    def report(self, cid):
        """Flag job ``cid`` as wrongly recognized (refund request)."""
        data = {'method': 'report', 'username': self.username,
                'password': self.password, 'appid': self.appid,
                'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """Low-level POST helper; ``files`` maps field name -> local file path."""
        # FIX: no mutable default; open handles are closed after the request
        # (the original leaked every opened file object).
        opened = {key: open(path, 'rb') for key, path in (files or {}).items()}
        try:
            res = requests.post(url, files=opened, data=fields)
        finally:
            for fh in opened.values():
                fh.close()
        return res.text


def transformCodeImg(imgPath, imgType):
    """Send the image at ``imgPath`` to Yundama and return the recognized text (or None)."""
    # Regular account credentials.
    username = 'bobo328410948'
    password = 'bobo328410948'
    # Developer app id / key from the Yundama developer console.
    appid = 6003
    appkey = '1f4b564483ae5c907a1d34f8e2f2776c'
    filename = imgPath
    # Captcha type code, e.g. 1004 = 4 alphanumeric chars; see
    # http://www.yundama.com/price.html for the full list.
    codetype = imgType
    # Poll timeout in seconds.
    timeout = 30
    result = None
    if username == 'username':
        print('请设置好相关参数再测试')
    else:
        yundama = YDMHttp(username, password, appid, appkey)
        uid = yundama.login()
        print('uid: %s' % uid)
        balance = yundama.balance()
        print('balance: %s' % balance)
        cid, result = yundama.decode(filename, codetype, timeout)
    return result


import requests
from lxml import etree

s = requests.Session()
url = "https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
page_text = s.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
img_src = "https://so.gushiwen.org" + tree.xpath('//*[@id="imgCode"]/@src')[0]
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
# BUG FIX: the original XPath lacked /@value and returned an Element object.
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
# Fetch the captcha through the SAME session so its cookie matches the form post.
img_data = s.get(url=img_src, headers=headers).content
with open("./gushiwen.jpg", 'wb') as fp:
    fp.write(img_data)

result = transformCodeImg('./gushiwen.jpg', 1004)
print(result)

post_url = "https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx"
data = {
    "__VIEWSTATE": __VIEWSTATE,
    "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR,
    "from": "http://so.gushiwen.org/user/collect.aspx",
    "email": "www.zhangbowudi@qq.com",
    "pwd": "bobo328410948",
    "code": result,
    "denglu": "登录"
}
response = s.post(url=post_url, headers=headers, data=data)
print(response.status_code)
page_text = response.text
with open('./gushi.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)