albireo

使用爬虫抓取数据时,经常要用到多个ip代理,防止单个ip访问太过频繁被封禁。
ip代理可以从这个网站获取:http://www.xicidaili.com/nn/。
因此写一个python程序来获取ip代理,保存到本地。
python版本:3.6.3

 1 #grab ip proxies from xicidaili
 2 import sys, time, re, requests
 3 from multiprocessing.dummy import Pool as ThreadPool
 4 from lxml import etree
 5 
 6 IP_POOL = \'ip_pool.py\'
 7 URL = \'http://www.xicidaili.com/nn/\' #IP代理 高匿
 8 #URL = \'http://www.xicidaili.com/wt/\' #IP代理 http
 9 RUN_TIME = time.strftime("%Y-%m-%d %H:%M", time.localtime()) #执行时间
10 
11 #用字典存放有效ip代理
12 alive_ip = {\'http\': [], \'https\': []}
13 #多线程
14 pool = ThreadPool(20)
15 
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body, decoded as UTF-8.

    Raises:
        requests.RequestException: If the connection fails, times out, or
            the server answers with an HTTP error status.
    """
    # Browser-like request headers; presumably so the site serves the
    # listing as it would to a normal browser visit.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "https://www.xicidaili.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    r.raise_for_status()
    r.encoding = 'utf-8'
    return r.text
30 
def test_alive(proxy):
    """Probe a proxy and record it in ``alive_ip`` if it works.

    A GET request to https://www.baidu.com is sent through the proxy with a
    3-second timeout. If the response status is 200, the proxy is appended
    to ``alive_ip['https']`` when its URL starts with ``https``, otherwise
    to ``alive_ip['http']``.

    Args:
        proxy: Proxy URL of the form ``scheme://ip:port``.
    """
    # requests chooses the proxy by the scheme of the *target* URL. The probe
    # URL is https, so the proxy must be registered under 'https' as well;
    # with only an 'http' entry the request would bypass the proxy entirely.
    proxies = {'http': proxy, 'https': proxy}
    try:
        r = requests.get('https://www.baidu.com', proxies=proxies, timeout=3)
    except requests.RequestException:
        print("%s无效!"%proxy)
        return
    if r.status_code == 200:
        scheme = 'https' if proxy.startswith('https') else 'http'
        alive_ip[scheme].append(proxy)
44 
def get_alive_ip_address():
    """Scrape the proxy listing at ``URL`` and probe every candidate.

    Each data row of the ``ip_list`` table is parsed into a
    ``scheme://ip:port`` string. Rows whose speed or connect time exceeds 1,
    or whose anonymity column is not '高匿' (high anonymity), are dropped.
    The remaining candidates are passed to ``test_alive`` on the thread
    pool, which fills ``alive_ip``.

    Raises:
        IndexError: If the page contains no ``ip_list`` table.
    """
    iplist = []
    html = get_html(URL)
    selector = etree.HTML(html)
    table = selector.xpath('//table[@id="ip_list"]')[0]
    lines = table.xpath('./tr')[1:]  # skip the first row (presumably the header)
    for line in lines:
        try:
            # The two div titles hold speed and connect time; the last
            # character is dropped before conversion (presumably a unit suffix).
            speed, connect_time = line.xpath('.//div/@title')
            data = line.xpath('./td')
            ip = data[1].xpath('./text()')[0]
            port = data[2].xpath('./text()')[0]
            anonymous = data[4].xpath('./text()')[0]
            ip_type = data[5].xpath('./text()')[0]
            too_slow = float(speed[:-1]) > 1 or float(connect_time[:-1]) > 1
        except (ValueError, IndexError):
            # Malformed row (missing cell, unexpected number of titles,
            # non-numeric timing): skip it rather than abort the whole scrape.
            continue
        # Filter out slow proxies and those that are not high-anonymity.
        if too_slow or anonymous != '高匿':
            continue
        iplist.append(ip_type.lower() + '://' + ip + ':' + port)
    pool.map(test_alive, iplist)
64 
def write_txt(output_file):
    """Write the verified proxies to a local file as Python source.

    The file starts with a ``#create time`` comment holding ``RUN_TIME``,
    followed by two list assignments, ``http_ip_pool`` and
    ``https_ip_pool``, with one proxy per line, so that the file can later
    be imported as a module.

    Args:
        output_file: Path of the file to create or overwrite.
    """
    def one_per_line(items):
        # Break the list's repr after every comma so each entry gets its own line.
        return str(items).replace(',', ',\n')

    # A single open is enough; there is no need to reopen in append mode.
    with open(output_file, 'w') as f:
        f.write('#create time: %s\n\n' % RUN_TIME)
        f.write('http_ip_pool = \\\n')
        f.write(one_per_line(alive_ip['http']))
        f.write('\n\n')
        f.write('https_ip_pool = \\\n')
        f.write(one_per_line(alive_ip['https']))
    print('write successful: %s' % output_file)
76 
def main(output_file=None):
    """Scrape the proxy listing, probe the proxies and save the live ones.

    Args:
        output_file: Path of the file to write. When omitted, the first
            command-line argument is used, falling back to ``IP_POOL``.
            Resolving it here keeps ``main()`` callable after an import,
            where no module-level ``output_file`` has been set.
    """
    if output_file is None:
        output_file = sys.argv[1] if len(sys.argv) > 1 else IP_POOL
    get_alive_ip_address()
    write_txt(output_file)
80 
if __name__ == '__main__':
    # The first command-line argument, if present, names the output file;
    # otherwise the default IP_POOL is used.
    output_file = sys.argv[1] if len(sys.argv) > 1 else IP_POOL
    main()

运行程序:

root@c:test$ python get_ip_proxies.py
write successful: ip_pool.py

查看文件:

root@c:test$ vim ip_pool.py
 1 #create time: 2019-03-14 19:53
 2 
 3 http_ip_pool = \
 4 [\'http://183.148.152.1:9999\',
 5  \'http://112.85.165.234:9999\',
 6  \'http://112.87.69.162:9999\',
 7  \'http://111.77.197.10:9999\',
 8  \'http://113.64.94.80:8118\',
 9  \'http://61.184.109.33:61320\',
10  \'http://125.126.204.82:9999\',
11  \'http://125.126.218.8:9999\',
12  \'http://36.26.224.56:9999\',
13  \'http://123.162.168.192:40274\',
14  \'http://116.209.54.125:9999\',
15  \'http://183.148.148.211:9999\',
16  \'http://111.177.161.111:9999\',
17  \'http://116.209.58.245:9999\',
18  \'http://183.148.143.38:9999\',
19  \'http://116.209.55.218:9999\',
20  \'http://114.239.250.15:9999\',
21  \'http://116.209.54.109:9999\',
22  \'http://125.123.143.98:9999\',
23  \'http://183.6.130.6:8118\',
24  \'http://183.148.143.166:9999\',
25  \'http://125.126.203.228:9999\',
26  \'http://111.79.198.74:9999\',
27  \'http://116.209.53.215:9999\',
28  \'http://112.87.69.124:9999\',
29  \'http://112.80.198.13:8123\',
30  \'http://182.88.160.16:8123\',
31  \'http://116.209.56.24:9999\',
32  \'http://112.85.131.25:9999\',
33  \'http://116.209.52.234:9999\',
34  \'http://175.165.128.223:1133\',
35  \'http://122.4.47.199:8010\',
36  \'http://112.85.170.204:9999\',
37  \'http://49.86.178.206:9999\',
38  \'http://125.126.215.187:9999\']
39 
40 https_ip_pool = \
41 [\'https://183.148.156.98:9999\',
42  \'https://111.79.199.167:808\',
43  \'https://61.142.72.150:39894\',
44  \'https://119.254.94.71:42788\',
45  \'https://221.218.102.146:33323\',
46  \'https://122.193.246.29:9999\',
47  \'https://183.148.139.173:9999\',
48  \'https://60.184.194.157:3128\',
49  \'https://118.89.138.129:52699\',
50  \'https://112.87.71.67:9999\',
51  \'https://58.56.108.226:43296\',
52  \'https://182.207.232.135:50465\',
53  \'https://111.177.186.32:9999\',
54  \'https://58.210.133.98:32741\',
55  \'https://115.221.116.71:9999\',
56  \'https://183.148.140.191:9999\',
57  \'https://183.148.130.143:9999\',
58  \'https://116.209.54.84:9999\',
59  \'https://125.126.219.125:9999\',
60  \'https://112.85.167.158:9999\',
61  \'https://112.85.173.76:9999\',
62  \'https://60.173.244.133:41306\',
63  \'https://183.148.147.223:9999\',
64  \'https://116.209.53.68:9999\',
65  \'https://111.79.198.102:9999\',
66  \'https://123.188.5.11:1133\',
67  \'https://60.190.66.131:56882\',
68  \'https://112.85.168.140:9999\',
69  \'https://110.250.65.108:8118\',
70  \'https://221.208.39.160:8118\',
71  \'https://116.209.53.77:9999\',
72  \'https://116.209.58.29:9999\',
73  \'https://183.148.141.129:9999\',
74  \'https://124.89.33.59:53281\',
75  \'https://116.209.57.149:9999\',
76  \'https://58.62.238.150:32431\',
77  \'https://218.76.253.201:61408\']

之后就可以直接使用了

from ip_pool import http_ip_pool, https_ip_pool

 

分类:

技术点:

相关文章:

  • 2021-11-12
  • 2021-07-22
  • 2022-12-23
  • 2022-02-26
  • 2022-12-23
  • 2022-12-23
  • 2022-01-12
  • 2021-06-16
猜你喜欢
  • 2021-11-26
  • 2022-12-23
  • 2021-12-18
  • 2021-08-15
  • 2022-02-07
  • 2021-12-19
相关资源
相似解决方案