# 斗鱼直播主播信息采集 (Douyu live-stream anchor information scraper)
from selenium import webdriver
import time
from lxml import etree
from excel_utils.excel_utils import write_to_excel,append_to_excel
import os
# 浏览器生成并解析
def get_page_content_by_selenium(url):
    """Load *url* in the shared browser and return the page as an lxml tree.

    Relies on the module-level ``driver`` that the ``__main__`` guard creates.

    Args:
        url: Address of the page to open.

    Returns:
        The result of ``etree.HTML`` applied to the browser's current
        page source.
    """
    driver.get(url)
    # Fixed pause before touching the window and reading the source.
    time.sleep(2)
    driver.maximize_window()
    return etree.HTML(driver.page_source)
def main():
    """Crawl the Douyu LOL listing page by page and store the rooms in Excel.

    For every page the room title, anchor name and popularity text are read
    from the parsed page source and written to ``anchor.xls``: the file is
    created with ``write_to_excel`` when it does not exist yet, otherwise the
    rows are added with ``append_to_excel``. The crawl moves on by clicking
    the last item of the pagination footer and stops once that item is
    missing or its ``aria-disabled`` attribute is no longer ``"false"``.

    Relies on the module-level ``driver`` created in the ``__main__`` guard.
    """
    start_url = 'https://www.douyu.com/g_LOL'
    file_name = 'anchor.xls'
    # Common prefix of the three per-room XPath queries below.
    list_root = '//section[@id="listAll"]//ul[@class="layout-Cover-list"]'
    next_btn_xpath = '//div[@class="ListFooter"]/ul/li[last()]'

    page_content = get_page_content_by_selenium(start_url)
    n = 1
    while True:
        print(f'爬取第{n}页')
        titles = page_content.xpath(f'{list_root}//h3/text()')
        anchor = page_content.xpath(
            f'{list_root}//h2/div[@class="DyListCover-userName"]/text()'
        )
        focus = page_content.xpath(
            f'{list_root}//span[@class="DyListCover-hot"]/text()'
        )

        # zip() stops at the shortest list, so a page where one field is
        # missing for some room no longer aborts the crawl with IndexError.
        # NOTE(review): rows after such a room may be misaligned; extracting
        # per room card would be stricter but needs the real DOM to confirm.
        anchor_list = [
            {'title': title, 'anchor': name, 'focus': hot}
            for title, name, hot in zip(titles, anchor, focus)
        ]

        if not os.path.exists(file_name):
            write_to_excel(anchor_list, file_name)
        else:
            append_to_excel(anchor_list, file_name)

        # Look the button up again on every page: a reference taken before
        # the list re-rendered can go stale. find_element(s)(by, value) is
        # used because find_element_by_xpath was removed in Selenium 4.3,
        # while this form is available in both 3.x and 4.x.
        next_buttons = driver.find_elements('xpath', next_btn_xpath)
        if not next_buttons:
            # No pagination footer found: nothing more to crawl.
            break
        next_btn = next_buttons[0]
        if next_btn.get_attribute('aria-disabled') != 'false':
            break

        next_btn.click()
        time.sleep(0.5)
        page_content = etree.HTML(driver.page_source)
        n += 1
if __name__ == '__main__':
    # ``driver`` stays a module-level name on purpose: main() and
    # get_page_content_by_selenium() read it as a global.
    driver = webdriver.Chrome()
    try:
        main()
    finally:
        # Close the browser and end the driver session even when the crawl
        # raises, so no browser process is left running.
        driver.quit()