import time
import random

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup


def _cell_text(row, position):
    """Return the text of the ``position``-th (1-based) ``<td>`` of ``row``.

    Raises ``IndexError`` when the row has no such cell.
    """
    return row.select("td:nth-of-type({})".format(position))[0].get_text()


def _parse_row(row):
    """Build the record dict for one ``<tr>`` of the ``#ip_list`` table."""
    data = {}
    data["ip"] = _cell_text(row, 2)
    # NOTE(review): this value is appended after ":" in "show", so it is
    # presumably the port; the key is kept as-is so the output format
    # does not change.
    data["host"] = _cell_text(row, 3)
    data["http"] = _cell_text(row, 6)
    data["class"] = _cell_text(row, 5)
    data["time"] = _cell_text(row, 10)
    data["show"] = data["ip"] + ":" + data["host"]
    return data


def get_request(url, output_path="data.txt", linger_seconds=1000):
    """Crawl the proxy listing pages and append each row to a text file.

    Opens Chrome, loads ``url`` to read the page count from the pagination
    bar, then visits ``https://www.xicidaili.com/nn/<page>`` for every page,
    parses the ``#ip_list`` table with BeautifulSoup, prints each record and
    appends ``str(record)`` as one line to ``output_path``.

    Args:
        url: Landing page from which the page count is read.
        output_path: File the records are appended to (UTF-8).
        linger_seconds: Seconds to keep the browser open after a successful
            crawl before shutting it down; pass ``0`` to exit immediately.

    Raises:
        ValueError: If the pagination text is not an integer.
        IndexError: If a table row lacks one of the expected cells.
    """
    # Create the browser object.
    browser = webdriver.Chrome()
    try:
        browser.get(url=url)
        browser.maximize_window()
        # assumes the 10th pagination link holds the last page number
        # -- TODO confirm against the live page layout.
        page_num = browser.find_element(
            By.XPATH, '//*[@id="body"]/div[3]/a[10]'
        ).text
        last_page = int(page_num)

        # range() excludes its stop value, so +1 is required for the final
        # page to be crawled as well.
        for page in range(1, last_page + 1):
            print("正在爬取第{}页...".format(page))
            # Random pause between requests to avoid hammering the site.
            time.sleep(random.randint(1, 5))
            browser.get(url="https://www.xicidaili.com/nn/{}".format(page))
            soup = BeautifulSoup(browser.page_source, "lxml")
            tr_list = soup.select("#ip_list > tbody > tr")

            # Open the file once per page instead of once per row.
            with open(output_path, "a+", encoding="utf8") as f:
                # The first <tr> is skipped (header row).
                for tr in tr_list[1:]:
                    data = _parse_row(tr)
                    print(data)
                    f.write(str(data) + "\n")

        if linger_seconds:
            time.sleep(linger_seconds)
    finally:
        # quit() ends the whole driver session, not just the current
        # window, and runs even when the crawl fails midway.
        browser.quit()


if __name__ == '__main__':
    url = "https://www.xicidaili.com/nn"
    get_request(url)
方式二:(推荐)
import time
import random

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def _cell(row, selector):
    """Return the element matching CSS ``selector`` inside ``row``."""
    return row.find_element(By.CSS_SELECTOR, selector)


def _parse_row(row):
    """Build the record dict for one ``<tr>`` of the ``#ip_list`` table."""
    data = {}
    data["host"] = _cell(row, "td:nth-of-type(2)").text
    data["port"] = _cell(row, "td:nth-of-type(3)").text
    data["http"] = _cell(row, "td:nth-of-type(6)").text
    data["class"] = _cell(row, "td:nth-of-type(5)").text
    # The speed value is read from the "title" attribute of the inner <div>.
    data["speed"] = _cell(row, "td:nth-of-type(7) div").get_attribute("title")
    data["test_time"] = _cell(row, "td:nth-of-type(10)").text
    data["survival_time"] = _cell(row, "td:nth-of-type(9)").text
    try:
        data["address"] = _cell(row, "td:nth-of-type(4) a").text
    except NoSuchElementException:
        # Rows without an address link get an empty address; any other
        # error still propagates.
        data["address"] = ''
    data["show"] = data["host"] + ":" + data["port"]
    return data


def get_request(url, linger_seconds=1000):
    """Crawl the proxy listing pages with Selenium and print each row.

    Opens Chrome, loads ``url`` to read the page count from the pagination
    bar, then visits ``https://www.xicidaili.com/nn/<page>`` for every page,
    locates the rows of the ``#ip_list`` table through the driver itself and
    prints one record dict per row.

    Args:
        url: Landing page from which the page count is read.
        linger_seconds: Seconds to keep the browser open after a successful
            crawl before shutting it down; pass ``0`` to exit immediately.

    Raises:
        ValueError: If the pagination text is not an integer.
        selenium.common.exceptions.NoSuchElementException: If the pagination
            link or a required table cell is missing.
    """
    # Create the browser object.
    browser = webdriver.Chrome()
    try:
        browser.get(url=url)
        browser.maximize_window()
        # assumes the 10th pagination link holds the last page number
        # -- TODO confirm against the live page layout.
        page_num = browser.find_element(
            By.XPATH, '//*[@id="body"]/div[3]/a[10]'
        ).text
        last_page = int(page_num)

        # range() excludes its stop value, so +1 is required for the final
        # page to be crawled as well.
        for page in range(1, last_page + 1):
            print("正在爬取第{}页...".format(page))
            # Random pause between requests to avoid hammering the site.
            time.sleep(random.randint(1, 5))
            browser.get(url="https://www.xicidaili.com/nn/{}".format(page))
            tr_list = browser.find_elements(
                By.CSS_SELECTOR, '#ip_list > tbody > tr'
            )
            # The first <tr> is skipped (header row).
            for tr in tr_list[1:]:
                print(_parse_row(tr))

        if linger_seconds:
            time.sleep(linger_seconds)
    finally:
        # quit() ends the whole driver session, not just the current
        # window, and runs even when the crawl fails midway.
        browser.quit()


if __name__ == '__main__':
    url = "https://www.xicidaili.com/nn"
    get_request(url)