childheart

斗鱼直播主播信息采集

from selenium import webdriver
import time
from lxml import etree
from excel_utils.excel_utils import write_to_excel,append_to_excel
import os


# Load a page in the browser and parse the rendered HTML.
def get_page_content_by_selenium(url):
    """Open *url* in the shared browser and return its parsed HTML tree.

    Uses the module-level ``driver``: navigates to *url*, pauses two
    seconds, maximizes the window, and then hands the driver's current
    page source to ``etree.HTML``.

    Args:
        url: Address of the page to load.

    Returns:
        The value returned by ``etree.HTML`` for the page source.
    """
    driver.get(url)
    # Fixed pause before the source is read; presumably gives the page's
    # scripts time to render the listing.
    time.sleep(2)
    driver.maximize_window()
    return etree.HTML(driver.page_source)


def _extract_anchor_rows(page_content):
    """Return one ``{'title', 'anchor', 'focus'}`` dict per listed stream.

    Args:
        page_content: Parsed HTML tree exposing ``xpath()``.

    Returns:
        A list of dicts built from the title, streamer-name and hot-value
        text nodes found under the ``listAll`` section.
    """
    list_root = '//section[@id="listAll"]//ul[@class="layout-Cover-list"]'
    titles = page_content.xpath(f'{list_root}//h3/text()')
    anchors = page_content.xpath(
        f'{list_root}//h2/div[@class="DyListCover-userName"]/text()'
    )
    focuses = page_content.xpath(
        f'{list_root}//span[@class="DyListCover-hot"]/text()'
    )
    # zip() stops at the shortest list, so a card that lacks one of the
    # fields can no longer raise IndexError part-way through a page.
    # NOTE(review): if the lists differ in length the columns may be
    # misaligned for that page -- confirm against the live markup.
    return [
        {'title': title, 'anchor': anchor, 'focus': focus}
        for title, anchor, focus in zip(titles, anchors, focuses)
    ]


def main():
    """Walk the listing pages and save every page's rows to ``anchor.xls``.

    Starts at the hard-coded listing URL, extracts the rows of the current
    page, writes them with ``write_to_excel`` when the workbook does not
    exist yet (``append_to_excel`` otherwise), then clicks the last item
    of the footer pager until its ``aria-disabled`` attribute is no
    longer ``'false'``.

    Relies on the module-level ``driver`` being created before the call.
    """
    start_url = 'https://www.douyu.com/g_LOL'
    next_btn_xpath = '//div[@class="ListFooter"]/ul/li[last()]'
    file_name = 'anchor.xls'

    page_content = get_page_content_by_selenium(start_url)
    # find_element('xpath', ...) works on Selenium 3 and 4; the
    # find_element_by_xpath shortcut was removed in Selenium 4.3.
    next_btn = driver.find_element('xpath', next_btn_xpath)
    print(next_btn.tag_name)

    n = 1
    while True:
        print(f'爬取第{n}页')
        anchor_list = _extract_anchor_rows(page_content)
        if not os.path.exists(file_name):
            write_to_excel(anchor_list, file_name)
        else:
            append_to_excel(anchor_list, file_name)

        # Look the button up again on every page: the pager may be
        # re-rendered after a click, which would leave a previously
        # located element stale.
        next_btn = driver.find_element('xpath', next_btn_xpath)
        if next_btn.get_attribute('aria-disabled') != 'false':
            break

        next_btn.click()
        # Short pause before reading the source of the next page.
        time.sleep(0.5)
        page_content = etree.HTML(driver.page_source)
        n += 1


if __name__ == '__main__':
    # `driver` is intentionally a module-level name: main() and
    # get_page_content_by_selenium() both read it as a global.
    driver = webdriver.Chrome()
    try:
        main()
    finally:
        # Always close the browser and end the driver session, even when
        # the crawl raises, so no Chrome process is left behind.
        driver.quit()

分类:

技术点:

相关文章: