xingxingnbsp
import time
import random
from selenium import webdriver
from bs4 import BeautifulSoup
# 创建浏览器对象
def get_request(url):
    """Crawl the paginated proxy table and append every row to ``data.txt``.

    Opens ``url`` in Chrome to read the page count from the pagination bar,
    then loads each listing page, parses the rendered HTML with
    BeautifulSoup, prints each row and appends it to ``data.txt`` as one
    ``str(dict)`` line.

    Args:
        url: Landing page opened first; it is only used to read the
            pagination link. The listing pages themselves are fetched from
            the hard-coded ``https://www.xicidaili.com/nn/{page}`` pattern.

    Raises:
        ValueError: If the text of the pagination link is not an integer.
    """
    # Local import: the version-independent locator API needs ``By``, and
    # importing it here keeps the block self-contained.
    from selenium.webdriver.common.by import By

    # Output key -> 1-based <td> position inside a table row.
    columns = (
        ("ip", 2),
        ("host", 3),
        ("http", 6),
        ("class", 5),
        ("time", 10),
    )

    browser = webdriver.Chrome()
    try:
        browser.get(url=url)
        browser.maximize_window()
        page_num = browser.find_element(
            By.XPATH, '//*[@id="body"]/div[3]/a[10]'
        ).text
        # Assumes the a[10] link shows the number of the last page (TODO
        # confirm against the site); +1 so that page is crawled as well.
        last_page = int(page_num)

        for page in range(1, last_page + 1):
            print("正在爬取第{}页...".format(page))
            # Random pause between page loads.
            time.sleep(random.randint(1, 5))
            browser.get(url="https://www.xicidaili.com/nn/{}".format(page))
            soup = BeautifulSoup(browser.page_source, "lxml")
            tr_list = soup.select("#ip_list > tbody > tr")

            lines = []
            # The first <tr> is skipped (presumably the header row).
            for tr in tr_list[1:]:
                data = {}
                for key, position in columns:
                    selector = "td:nth-of-type({})".format(position)
                    data[key] = tr.select(selector)[0].get_text()
                data["show"] = data["ip"] + ":" + data["host"]
                print(data)
                lines.append(str(data) + "\n")

            # One open/append per page instead of one per row; nothing is
            # created when the page yielded no rows.
            if lines:
                with open("data.txt", "a", encoding="utf8") as f:
                    f.writelines(lines)

        # Keeps the browser window open after a successful crawl.
        time.sleep(1000)
    finally:
        # quit() ends the whole driver session, even when a step above failed.
        browser.quit()
# Script entry point: crawl starting from the proxy list landing page.
if __name__ == '__main__':
    url = "https://www.xicidaili.com/nn"
    get_request(url)

方式二(推荐):直接使用 Selenium 定位元素,无需 BeautifulSoup

import time
import random
from selenium import webdriver

# 创建浏览器对象
def get_request(url):
    """Crawl the paginated proxy table with Selenium and print every row.

    Opens ``url`` in Chrome to read the page count from the pagination bar,
    then loads each listing page and reads the table cells directly through
    WebDriver element lookups. Each row is printed as a dict.

    Args:
        url: Landing page opened first; it is only used to read the
            pagination link. The listing pages themselves are fetched from
            the hard-coded ``https://www.xicidaili.com/nn/{page}`` pattern.

    Raises:
        ValueError: If the text of the pagination link is not an integer.
        selenium.common.exceptions.NoSuchElementException: If a required
            element (pagination link or a mandatory cell) is missing.
    """
    # Local imports keep the block self-contained; both names come from the
    # selenium package the file already depends on.
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    def cell(row, selector):
        """Return the first element inside ``row`` matching ``selector``."""
        return row.find_element(By.CSS_SELECTOR, selector)

    browser = webdriver.Chrome()
    try:
        browser.get(url=url)
        browser.maximize_window()
        page_num = browser.find_element(
            By.XPATH, '//*[@id="body"]/div[3]/a[10]'
        ).text
        # Assumes the a[10] link shows the number of the last page (TODO
        # confirm against the site); +1 so that page is crawled as well.
        last_page = int(page_num)

        for page in range(1, last_page + 1):
            print("正在爬取第{}页...".format(page))
            # Random pause between page loads.
            time.sleep(random.randint(1, 5))
            browser.get(url="https://www.xicidaili.com/nn/{}".format(page))
            tr_list = browser.find_elements(
                By.CSS_SELECTOR, '#ip_list > tbody > tr'
            )
            # The first <tr> is skipped (presumably the header row).
            for tr in tr_list[1:]:
                data = {}
                data["host"] = cell(tr, "td:nth-of-type(2)").text
                data["port"] = cell(tr, "td:nth-of-type(3)").text
                data["http"] = cell(tr, "td:nth-of-type(6)").text
                data["class"] = cell(tr, "td:nth-of-type(5)").text
                # This value is read from the div's title attribute, not
                # from the cell text.
                data["speed"] = cell(
                    tr, "td:nth-of-type(7) div"
                ).get_attribute("title")
                data["test_time"] = cell(tr, "td:nth-of-type(10)").text
                data["survival_time"] = cell(tr, "td:nth-of-type(9)").text
                try:
                    data["address"] = cell(tr, "td:nth-of-type(4) a").text
                except NoSuchElementException:
                    # Rows without a link in column 4 get an empty address.
                    data["address"] = ''
                data["show"] = data["host"] + ":" + data["port"]
                print(data)

        # Keeps the browser window open after a successful crawl.
        time.sleep(1000)
    finally:
        # quit() ends the whole driver session, even when a step above failed.
        browser.quit()

# Script entry point: crawl starting from the proxy list landing page.
if __name__ == '__main__':
    url = "https://www.xicidaili.com/nn"
    get_request(url)

 

分类:

技术点:

相关文章: