zhaco

1、urllib代理设置

 1 from urllib.error import URLError
 2 from urllib.request import ProxyHandler
 3 from urllib.request import build_opener
 4 
 5 # 通过ProxyHandler来设置代理服务器,参数为字典类型,键名为协议,键值为代理
 6 proxy_handler = ProxyHandler({"http": "http://113.120.33.75:9999",
 7                               "https":"https://120.83.99.72:9999"})
 8 
 9 # 创建一个opener对象,比openurl多了一个header
10 opener = build_opener(proxy_handler)
11 try:
12     # 通过opener对象打开url
13     response = opener.open("http://httpbin.org/get")
14     print(response.read().decode("utf-8"))
15 except URLError as e:
16     print(e.reason)

如果代理不可用,运行时可能会遇到以下两种错误:

 [WinError 10061] 由于目标计算机积极拒绝,无法连接。 

解决方法:主要是代理不可用,更换代理就行

 [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。 

解决方法:将浏览器的代理设置中的局域网设置,里面的自动配置脚本选项改为自动检测设置即可。

运行结果如下,可以看到origin已经改成了代理IP

{
  "args": {},
  "headers": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Host": "httpbin.org",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
  },
  "origin": "113.120.33.75, 113.120.33.75",
  "url": "https://httpbin.org/get"
}

2、requests代理设置

 1 import requests
 2 
 3 # 设置代理
 4 proxies = {"http":"http://182.92.113.183:8118",
 5            "https":"https://120.83.99.72:9999"}
 6 try:
 7     # 请求url链接
 8     response = requests.get("http://httpbin.org/get",proxies=proxies)
 9     
10     # 输出文本内容
11     print(response.text)
12 except requests.exceptions.ConnectionError as e:
13     print(e.args)

运行结果如下,origin已经更改为代理IP了,显然比urllib简单多了,且不用创建opener对象

{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "origin": "182.92.113.183, 182.92.113.183", 
  "url": "https://httpbin.org/get"
}

3、selenium代理设置

 1 import time
 2 from selenium import webdriver
 3 
 4 # 代理IP地址
 5 proxy = "182.92.113.183:8118"
 6 
 7 # url链接
 8 url = "http://httpbin.org/get"
 9 
10 # 启动谷歌控制选项,以便添加代理
11 chrom_options = webdriver.ChromeOptions()
12 
13 # 设置代理,注意“=”号两边不能有空格
14 chrom_options.add_argument("--proxy-server=http://" + proxy)
15 
16 # 模拟谷歌浏览器,并通过chrome_options参数传递代理
17 browser = webdriver.Chrome(executable_path="D:\chromedriver.exe",chrome_options=chrom_options)
18 
19 # 打开url链接
20 browser.get(url=url)
21 time.sleep(10)
22 
23 # 退出并清除浏览器缓存
24 browser.quit()

运行结果如下,可以看到origin已经更改为代理IP了

{
  "args": {}, 
  "headers": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 
    "Accept-Encoding": "gzip, deflate", 
    "Accept-Language": "zh-CN,zh;q=0.9", 
    "Host": "httpbin.org", 
    "Upgrade-Insecure-Requests": "1", 
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
  }, 
  "origin": "182.92.113.183, 182.92.113.183", 
  "url": "https://httpbin.org/get"
}

4、PhantomJS代理设置(新版本selenium已经弃用,使用谷歌无界面浏览器)

from selenium import webdriver

# Commonly used PhantomJS service_args, for reference:
# service_args = [
#     "--proxy=%s" % ip_html,         # proxy as IP:port (e.g. 192.168.0.28:808)
#     "--proxy-type=http",            # proxy type: http/https
#     "--load-images=no",             # do not load images (optional)
#     "--disk-cache=yes",             # enable the disk cache (optional)
#     "--ignore-ssl-errors=true",     # ignore https errors (optional)
# ]

# Target URL
url = "http://httpbin.org/get"
service_args = ["--proxy=121.233.206.44:9999",      # proxy IP:port
                "--proxy-type=http"]                # proxy protocol type: http/https

# Start the headless PhantomJS browser, passing the proxy through service_args
browser = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1-windows\bin\phantomjs.exe", service_args=service_args)
try:
    browser.get(url=url)
    print(browser.page_source)
finally:
    # End the session so the phantomjs process does not stay behind
    browser.quit()

运行结果出乎意料:最新版本的selenium已经不再支持PhantomJS了,提示我们改用谷歌或者火狐的无头浏览器

UserWarning: Selenium support for PhantomJS has been deprecated, 
please use headless versions of Chrome or Firefox instead
warnings.warn('Selenium support for PhantomJS has been deprecated, please use headless '

由于我的浏览器是谷歌浏览器,所以我就使用谷歌的无头浏览器来设置IP代理,代码如下:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Options object that collects the Chrome launch switches
chrome_options = Options()
proxy = "182.92.113.183:8118"       # proxy as host:port
url = "http://httpbin.org/get"      # target URL

# Run Chrome headless, i.e. without showing a visible browser window
chrome_options.add_argument("--headless")

# Disable GPU acceleration
chrome_options.add_argument("--disable-gpu")

# Browser language
chrome_options.add_argument("-lang=zh-cn")      # Chinese
# chrome_options.add_argument("-lang=en-GB")    # English

# Route Chrome's traffic through the proxy
chrome_options.add_argument("--proxy-server=http://" + proxy)

# Window size; "width,height" is the documented form of this switch
chrome_options.add_argument("--window-size=1920,3000")

# Start Chrome through chromedriver, handing over the options that carry the
# proxy. Raw string so the backslash in the Windows path is taken literally.
browser = webdriver.Chrome(chrome_options=chrome_options, executable_path=r"D:\chromedriver.exe")
try:
    browser.get(url=url)
    # httpbin's JSON answer is rendered inside a <pre> element
    print(browser.find_element_by_xpath("/html/body/pre").text)
finally:
    # End the session so the headless Chrome process does not stay behind
    browser.quit()

运行结果如下,origin的IP地址已经更改为代理IP

{
  "args": {}, 
  "headers": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 
    "Accept-Encoding": "gzip, deflate", 
    "Accept-Language": "zh-cn", 
    "Host": "httpbin.org", 
    "Upgrade-Insecure-Requests": "1", 
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/74.0.3729.169 Safari/537.36"
  }, 
  "origin": "182.92.113.183, 182.92.113.183", 
  "url": "https://httpbin.org/get"
}

最后附上selenium模块add_argument常用参数

chrome_options.add_argument('--user-agent=""')                  # set the User-Agent request header
chrome_options.add_argument('--window-size=1280,1024')          # set the browser window size (width,height)
chrome_options.add_argument('--start-maximized')                # start maximized; without it, locating elements may fail
chrome_options.add_argument('--disable-infobars')               # hide the "controlled by automated test software" notice
chrome_options.add_argument('--incognito')                      # incognito (private) mode
chrome_options.add_argument('--hide-scrollbars')                # hide scrollbars, useful for some special pages
chrome_options.add_argument('--disable-javascript')             # disable javascript
chrome_options.add_argument('--blink-settings=imagesEnabled=false')  # do not load images, for speed
chrome_options.add_argument('--headless')                       # run without a visible browser window
chrome_options.add_argument('--ignore-certificate-errors')      # ignore certificate errors
chrome_options.add_argument('--disable-gpu')                    # disable GPU acceleration
chrome_options.add_argument('--disable-software-rasterizer')
chrome_options.add_argument('--disable-extensions')             # disable extensions
chrome_options.add_argument('--start-maximized')                # start maximized (same switch as listed above)
chrome_options.add_argument("--proxy-server=http://xxxxxxx")    # set the proxy

另外还有其他参数,请参考https://blog.csdn.net/liaojianqiu0115/article/details/78353267

分类:

技术点:

相关文章: