import csv
import random
import time

import requests
from lxml import etree


def test_ip(ip_address):
    """Probe each proxy against icanhazip.com and save the working ones.

    :param ip_address: list of proxy dicts as accepted by ``requests``,
        e.g. ``[{'http': 'http://1.2.3.4:80'}]``
    """
    url = 'http://icanhazip.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    ip_pool = []
    for ip_test in ip_address:
        try:
            response = requests.get(url=url, headers=headers, proxies=ip_test, timeout=5)
            # BUG FIX: status_code is an int; the original compared it to the
            # string '200', which is always False, so no proxy was ever kept.
            if response.status_code == 200:
                ip_pool.append(ip_test)
            # Random delay to avoid hammering the test endpoint.
            time.sleep(random.randint(2, 8))
        except requests.RequestException:
            # Dead/slow proxies are expected here; skip them and move on.
            pass
    print(ip_pool)
    files_save(ip_pool)


def files_save(ip_list):
    """Append the verified proxies to a CSV file.

    :param ip_list: list of working proxy dicts
    """
    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open('./代理ip.csv', 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(ip_list)


def get_page_data(nums):
    """Scrape proxy listings from xicidaili.com and hand them to test_ip.

    :param nums: number of listing pages to scrape (1-based, inclusive)
    """
    ip_list = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    for i in range(1, nums + 1):
        url = "https://www.xicidaili.com/nn/{}".format(i)
        response = requests.request('get', url=url, headers=headers)
        page_data = etree.HTML(response.text)
        # class='odd' rows hold https entries, class='' rows hold http entries.
        page_infos = page_data.xpath(".//tr[@class='odd']|.//tr[@class='']")
        for info in page_infos:
            ip_dict = {}
            ip_address = info.xpath(".//td[2]/text()")[0]
            ip_port = info.xpath(".//td[3]/text()")[0]
            ip_type = info.xpath(".//td[6]/text()")[0].lower()
            ip_dict[ip_type] = ip_type + '://' + ip_address + ':' + ip_port
            ip_list.append(ip_dict)
    # BUG FIX: test the accumulated list once, after all pages are scraped;
    # calling test_ip inside the loop re-tested earlier proxies on every page
    # and wrote duplicate rows to the CSV.
    test_ip(ip_list)


if __name__ == '__main__':
    """
    Notes on scraping proxy IPs:
      - each scraped proxy must be verified before use
      - keep the crawl rate low

    URL pattern:
        page   url
        1      https://www.xicidaili.com/nn/
        2      https://www.xicidaili.com/nn/2
        3      https://www.xicidaili.com/nn/3
    """
    # nums = int(input("请输入爬取页数>>"))
    nums = 2
    get_page_data(nums)