from bs4 import BeautifulSoup
import requests,os,sys,time,random,redis
from lxml import etree
# Shared Redis connection; ceshi_ip() stores working proxies in the 'proxy' set.
# NOTE(review): created at import time — assumes a Redis server is reachable on
# 127.0.0.1:6379 (redis-py connects lazily, so this line itself does no I/O).
# Fix: the source had broken \'-escaped quotes (invalid syntax outside a string).
conn = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
def get_ip(page_url, headers, cookies, sui_ji_time):
    """Scrape one listing page of proxies and test each entry via ceshi_ip().

    Extracts IP (td[1]), port (td[2]) and type (td[4], 'HTTP'/'HTTPS') from
    the #list table, builds 'http://ip:port' and forwards it for probing.

    :param page_url: URL of the listing page to scrape.
    :param headers: request headers (User-Agent / Referer).
    :param cookies: request cookies for the listing site.
    :param sui_ji_time: caller-chosen sleep interval (used here for logging only).
    :return: None
    """
    try:
        print('{}--{}--{}--{}>>{}'.format('此程序睡眠时间', sui_ji_time, '正在爬取第', page_url, '的数据'))
        response = requests.get(page_url, headers=headers, cookies=cookies).text
        json_lxml = etree.HTML(response)
        table = json_lxml.xpath('//*[@id="list"]/table/tbody/tr')
        for row in table:
            html_ip = row.xpath('.//td[1]/text()')[0]
            html_ip_port = row.xpath('.//td[2]/text()')[0]
            html_ip_lei = row.xpath('.//td[4]/text()')[0]
            daili_ip = '{}{}:{}'.format('http://', html_ip, html_ip_port)
            # The original if/else on html_ip_lei == 'HTTP' had two identical
            # branches, so a single unconditional call is equivalent.
            ceshi_ip(headers, cookies, sui_ji_time, daili_ip, html_ip_lei)
    except Exception:
        # Narrowed from a bare `except:` (which would also swallow
        # KeyboardInterrupt/SystemExit); failure is logged, not raised.
        print('{}--{}--{}--{}>>{}'.format('此程序睡眠时间', sui_ji_time, '正在爬取第', page_url, '的数据=========失败'))
def ceshi_ip(headers, cookies, sui_ji_time, daili_ip, html_ip_lei):
    """Probe a scraped proxy; persist working ones to Redis and a backup file.

    A proxy is considered usable if a GET through it does not raise; usable
    proxies are stored as 'TYPE+http://ip:port' in the Redis set 'proxy' and
    appended to D:\\web_xiangmu\\biquge_tushu\\代理\\daili.text.

    :param headers: unused here; kept for interface compatibility with callers.
    :param cookies: unused here; kept for interface compatibility with callers.
    :param sui_ji_time: unused here; kept for interface compatibility.
    :param daili_ip: proxy URL, e.g. 'http://1.2.3.4:8080'.
    :param html_ip_lei: proxy type as scraped ('HTTP' or 'HTTPS').
    :return: None
    """
    print(daili_ip, '@@@@@@@@@@@@')
    try:
        # Fix: requests matches `proxies` keys against the lowercase URL
        # scheme, so the scraped 'HTTP'/'HTTPS' must be lower-cased or the
        # proxy is silently ignored. Also add a timeout so a dead proxy
        # cannot hang the whole crawl.
        requests.get('http://wenshu.court.gov.cn/', proxies={html_ip_lei.lower(): daili_ip}, timeout=10)
    except Exception:
        print('{}>>{}'.format(daili_ip, '不可用'))
        return
    print('{}>>{}'.format(daili_ip, '可用'))
    # Store in the Redis set (sadd deduplicates automatically).
    try:
        conn.sadd('proxy', '{}+{}'.format(html_ip_lei, daili_ip))
        print('{}'.format('存储redis成功'))
    except Exception:
        print('{}'.format('存储redis失败'))
    # Raw string: '\w' and '\b' in the original were invalid escape sequences.
    root_dir = r'D:\web_xiangmu\biquge_tushu\代理'
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)
        print('{}'.format('创建成功'))
    # Backup to a text file in case Redis data is lost.
    try:
        with open(os.path.join(root_dir, 'daili.text'), "a+") as mon:
            mon.write('{}+{}\n'.format(html_ip_lei, daili_ip))
        print('{}>>>{}'.format(daili_ip, '写入成功'))
    except OSError:
        print('{}'.format('写入失败'))
def main():
    """Crawl every listing page of the kuaidaili free-proxy index.

    Reads the page count from the pagination link (li[9]), then visits
    https://www.kuaidaili.com/free/inha/<page> for each page, sleeping a
    randomly chosen interval between requests.

    Relies on the module-level `list_time_sleep` defined in the
    `__main__` guard. :return: None
    """
    url = 'https://www.kuaidaili.com/free/inha/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Referer': 'https://www.kuaidaili.com/free/inha/',
    }
    cookies = {
        'Cookie': 'channelid=0; sid=1575640807483263; _ga=GA1.2.757045199.1575642271; _gid=GA1.2.1903168241.1575642271; _gat=1; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1575642272,1575686420; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1575686420',
    }
    try:
        response = requests.get(url, headers=headers, cookies=cookies).text
        json_lxml = etree.HTML(response)
        ip_page = json_lxml.xpath('//ul/li[9]/a/text()')[0]
        ip_page_href = json_lxml.xpath('//ul/li[9]/a/@href')[0]
        sui_ji_time = random.choice(list_time_sleep)
        # Hoisted out of the loop: the href split is loop-invariant and was
        # recomputed twice per iteration in the original.
        href_parts = ip_page_href.split('/')
        for page in range(1, int(ip_page) + 1):
            page_url = '{}/{}/{}/{}'.format('https://www.kuaidaili.com', href_parts[1], href_parts[2], page)
            time.sleep(sui_ji_time)
            get_ip(page_url, headers, cookies, sui_ji_time)
    except Exception:
        # Narrowed from a bare `except:`; top-level boundary, log and continue.
        print('程序崩溃')
if __name__ == '__main__':
    # Random startup delay (5/10/15 s) before crawling, presumably to look
    # less bot-like. `list_time_sleep` is also read by main() as a global.
    list_time_sleep = [5, 10, 15]
    zhu_sui_ji_time = random.choice(list_time_sleep)
    print('{}<<{}>>{}'.format('主程序随机睡眠时间', zhu_sui_ji_time, '秒'))
    time.sleep(zhu_sui_ji_time)
    main()
"""
import redis,requests
conn = redis.Redis(host=\'127.0.0.1\',port=6379,db=0,decode_responses=True)
ip = conn.srandmember(\'proxy\')
ip_add = \'\'.join(ip).split(\'+\')
zhen_ip = ip_add
dict1 = {}
# # 使用IP代理访问百度,测试代理地址是否有效
try:
requests.get(\'http://wenshu.court.gov.cn/\', proxies={zhen_ip[0]: zhen_ip[1]})
print(\'{}---{}>>>{}\'.format(zhen_ip[0],zhen_ip[1],\'可用\'))
except:
#删除没用的ip
conn.srem(\'proxy\',zhen_ip[1] )
print(\'{}---{}>>>{}\'.format(zhen_ip[0], zhen_ip[1], \'不可用\'))
dict1 = {zhen_ip[0]:zhen_ip[1]}}
print(dict1)
#<<<proxies=dict1>>>在请求头部添加这个参数就可以正常使用了
"""
# 相关文章: (trailing "related articles" header — scraping artifact from the source web page, commented out so the module parses)