The usual anti-scraping tactic of the big free-proxy sites is to ban any IP that visits too frequently within a short window, so you need to leave an interval between requests while crawling. To be honest, though, free proxies are mostly unusable: out of a thousand scraped you might get a dozen that work, and even those will probably be dead a few minutes later. Paid bulk-proxy services exist too, but they are not necessarily better; I have tested some, and the share of high-quality IPs was small. So far the best experience has been with private (dedicated) proxies, among other options. You get what you pay for. The script below crawls the free list on kuaidaili.com, tests each proxy against Baidu, and appends the working ones to a CSV file.
import csv
import random
import time

import requests
import urllib3
from fake_useragent import UserAgent
from lxml import etree
from requests.exceptions import RequestException

# Silence the InsecureRequestWarning triggered by verify=False below.
urllib3.disable_warnings()


class IPSpider(object):
    def __init__(self):
        self.url = 'https://www.kuaidaili.com/free/inha/'
        self.url_test = 'http://www.baidu.com/'  # use Baidu to test whether a proxy works

    def get_headers(self):
        """Generate a random request header."""
        ua = UserAgent()
        headers = {
            'User-Agent': ua.random
        }
        return headers

    def get_page(self, url):
        """Fetch the page source, retrying until it succeeds."""
        while True:
            try:
                headers = self.get_headers()
                response = requests.get(url, headers=headers, verify=False)
                if response.status_code == 200:
                    return response.text
                print(response.status_code)
                raise ValueError("error opening page")
            # Catch ValueError too, so a non-200 status retries instead of crashing.
            except (RequestException, ValueError) as err:
                print(err)

    def parse_ip(self, text):
        """Extract the IPs and ports from the page."""
        html = etree.HTML(text)
        ip = html.xpath("//tr/td[1]/text()")
        print(ip)
        port = html.xpath("//tr/td[2]/text()")
        print(port)
        return zip(ip, port)

    def test_ip(self, ip, port):
        """Test whether a proxy is usable."""
        try:
            proxies = {
                'http': 'http://{}:{}'.format(ip, port),
                'https': 'https://{}:{}'.format(ip, port),
            }
            headers = self.get_headers()
            # The test URL is plain HTTP, so only the 'http' entry is exercised here.
            response = requests.get(url=self.url_test, headers=headers,
                                    proxies=proxies, timeout=8)
            if response.status_code == 200:
                print("%s works" % ip)
                return ip, port
            return None
        except RequestException:
            print('%s is dead' % ip)

    def save_ip(self, result):
        """Append the usable proxies to a CSV file."""
        with open("kuaidailiip.csv", "a", newline='') as f:
            writer = csv.writer(f)
            writer.writerows(result)

    def run(self):
        """Main loop: crawl each list page, test every proxy, save the good ones."""
        for i in range(1, 1001):
            url = self.url + str(i) + '/'
            text = self.get_page(url)
            ip = self.parse_ip(text)
            result = []
            for j in ip:
                ok_ip = self.test_ip(j[0], j[1])
                if ok_ip is None:
                    continue
                result.append(ok_ip)
            self.save_ip(result)
            # Pause between list pages so our own IP does not get banned.
            time.sleep(random.randint(5, 7))
if __name__ == '__main__':
spider = IPSpider()
spider.run()
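To actually use the harvested proxies, you can read the CSV back and pass a random entry to requests. Here is a minimal sketch, assuming kuaidailiip.csv contains the ip,port rows written by save_ip() above; the target URL is just a placeholder.

import csv
import random

import requests

# Load the (ip, port) rows written by save_ip() above.
with open("kuaidailiip.csv") as f:
    proxy_pool = [row for row in csv.reader(f) if row]

# Pick a random proxy for this request; free proxies die fast, so be ready to retry.
ip, port = random.choice(proxy_pool)
proxies = {
    'http': 'http://{}:{}'.format(ip, port),
    'https': 'https://{}:{}'.format(ip, port),
}
response = requests.get('http://www.baidu.com/', proxies=proxies, timeout=8)
print(response.status_code)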