import requests
from lxml import etree
import pymysql
'''
Scrape proxy IPs and ports from the Xici proxy site (xicidaili),
then verify whether each harvested proxy is actually usable.
'''
# Browser-like request headers so the target site does not reject us as an obvious bot.
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
def parse_data(url):
    """Scrape proxy ip/port pairs from one listing page, validate each proxy,
    and persist the working ones to the XCIP table.

    Each proxy is validated by issuing a GET to a test page through it with a
    5-second timeout; only proxies answering HTTP 200 are kept.

    :param url: listing-page URL to scrape
    :return: list of working proxies as 'ip:port' strings
    """
    # Open the database connection up front; the try/finally below guarantees
    # the cursor and connection are closed even if scraping/validation fails.
    conn = pymysql.Connect(
        host='127.0.0.1', port=3306, db='pyproject', user='root', passwd='root', charset='utf8'
    )
    curr = conn.cursor()
    all_ip = []  # working 'ip:port' strings
    try:
        response = requests.get(url, headers=header)
        data = response.content.decode('utf-8')
        html = etree.HTML(data)
        all_list = html.xpath('//table[@id="ip_list"]//tr')
        for t in all_list[2:]:  # skip the table header rows
            ip = t.xpath('./td[2]/text()')[0]
            port = t.xpath('./td[3]/text()')[0]
            list_data = ip + ':' + port
            # Validate the proxy by fetching a well-known page through it.
            tarurl = 'http://www.baidu.com'
            proxies = {'http': 'http://' + list_data, 'https': 'https://' + list_data}
            try:
                res = requests.get(url=tarurl, proxies=proxies, headers=header, timeout=5).status_code
            except requests.RequestException:
                # Only network/proxy failures mean "unusable"; other errors
                # (e.g. DB problems) must surface instead of being swallowed.
                print('该ip不可用')
                continue
            if res == 200:
                print(list_data)
                # Parameterized query — never build SQL by string concatenation.
                curr.execute("insert into XCIP(ip, port) values(%s, %s)", (ip, port))
                conn.commit()
                all_ip.append(list_data)
    finally:
        curr.close()
        conn.close()
    return all_ip
def main():
    """Walk listing pages 1-4 of the proxy site and validate every proxy found."""
    page_template = 'https://www.xicidaili.com/nn/{}'
    for page in range(1, 5):
        parse_data(page_template.format(page))


if __name__ == '__main__':
    main()
# Scraped the first five pages and only 7 proxies were usable — this site is quite unreliable.