【问题标题】:Get YouTube Playlist urls with python(使用 Python 获取 YouTube 播放列表网址)
【发布时间】:2020-07-31 13:14:18
【问题描述】:

如何使用 bs4 获取像下面这个页面中那样存储的播放列表 url?

页面:https://www.youtube.com/watch?v=VpTRlS7EO6E&list=RDOIhVs0FQ8xc&index=5

使用

from bs4 import BeautifulSoup as bs
import requests

# Fetch the watch page with a plain HTTP GET and keep the response body as text.
r = requests.get('https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1')
page = r.text
# Parse that HTML text with the built-in 'html.parser' backend.
soup=bs(page,'html.parser')
#print(soup)
# Look up every <ytd-playlist-panel-video-renderer> element in the parsed markup.
# NOTE(review): only the markup present in the HTTP response is searched here;
# anything the page adds later via scripts would not be in `page` — presumably
# why this list comes back empty.
res=soup.find_all('ytd-playlist-panel-video-renderer')
print(res)

没有返回任何内容。即使直接打印 soup 本身,其中也不包含我要找的链接(如 href="/watch?v=puNOG62lf-Y&list=RDOIhVs0FQ8xc&index=2")

【问题讨论】:

标签: python url beautifulsoup youtube


【解决方案1】:

这是一个由 JavaScript 渲染的页面,因此必须使用 Selenium。

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time

# Watch page whose playlist panel we want to read.
url = 'https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1'
# The chromedriver path is whatever ChromeDriverManager().install() returns.
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    driver.get(url)
    # Pause so the page's scripts can build the playlist panel before the
    # DOM is read.
    time.sleep(2)

    # Parse the browser's current DOM rather than the raw HTTP response.
    soup = bs(driver.page_source, 'html.parser')
    res = soup.find_all('ytd-playlist-panel-video-renderer')
finally:
    # Always close the browser, even when loading or parsing fails;
    # otherwise the Chrome/chromedriver processes are left running.
    driver.quit()
print(res)

使用pip install webdriver-manager安装所需的包

【讨论】:

    【解决方案2】:

    谢谢!下面是一段对我有用的、比较粗糙的代码:

    #--------------------------------- 
    # import modules
    from bs4 import BeautifulSoup as bs
    from selenium import webdriver
    import time
    import re
    
    #---------------------------------
    # create the Firefox driver; the geckodriver path passed as
    # executable_path is whatever GeckoDriverManager().install() returns
    from webdriver_manager.firefox import GeckoDriverManager
    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
    
    #---------------------------------
    # get links from url    
    def get_links(driver, sleep_time,
                  url='https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1'):
        """Open a YouTube watch page and collect the links of its playlist panel.

        Args:
            driver: Selenium WebDriver used to load and render the page.
            sleep_time: Seconds to wait after loading before reading the DOM.
            url: Watch-page URL to open. Previously this was read from an
                undefined global; it is now an explicit parameter that defaults
                to the playlist URL used elsewhere in this thread, so existing
                two-argument calls keep working.

        Returns:
            A list of 'https://www.youtube.com/watch?v=<id>' strings, or False
            when no playlist entries were found in the rendered page.
        """
        # open driver window
        driver.set_window_size(1024, 600)
        driver.maximize_window()
        driver.get(url)

        # wait some seconds so the page's scripts can fill in the playlist panel
        time.sleep(sleep_time)

        # get information from url
        soup = bs(driver.page_source, 'html.parser')
        res = soup.find_all('ytd-playlist-panel-video-renderer')

        # if there is no information return False
        if not res:
            return False

        main_url = 'https://www.youtube.com/watch?v='
        # Capture the id between 'watch?v=' and '&amp;list' directly instead of
        # slicing a greedy match with hard-coded offsets; the non-greedy group
        # stops at the first '&amp;list' even if 'list' occurs again on the line.
        video_ids = re.findall(r'watch\?v=(.*?)&amp;list', str(res))
        # Keep every second match, as the original did (it treats the matches
        # as coming in pairs per playlist entry).
        return [main_url + video_id for video_id in video_ids[::2]]
    
    #---------------------------------
    # set sleep timer (seconds get_links waits after loading the page)
    sleep_time = 10
    # call function to get links; close the browser afterwards either way,
    # otherwise the Firefox/geckodriver processes are left running
    try:
        links = get_links(driver, sleep_time)
    finally:
        driver.quit()
    
    

    【讨论】:

      【解决方案3】:

      这对我有用:

      from selenium import webdriver  # pip install selenium
      import time
      
      # make sure you download chrome driver from https://chromedriver.chromium.org/downloads and put it in folder 'driver'
      # Raw string: the backslash is a Windows path separator, not an escape
      # sequence ('\c' in a normal string literal is an invalid escape).
      driver = webdriver.Chrome(r'driver\chromedriver.exe')
      try:
          driver.get('https://www.youtube.com/playlist?list=PLxvodScTx2RtAOoajGSu6ad4p8P8uXKQk')  # put here your link

          # JS returning the current vertical scroll offset. The fallback branch
          # must read .scrollTop; without it an element object was returned
          # instead of a number.
          position_script = """return (window.pageYOffset !== undefined) ?
                window.pageYOffset : (document.documentElement ||
                document.body.parentNode || document.body).scrollTop;"""
          # JS jumping to the bottom of the scrolling element.
          scroll_script = """var scrollingElement = (document.scrollingElement ||
               document.body);scrollingElement.scrollTop =
               scrollingElement.scrollHeight;"""

          # scroll page down until the position stops changing between rounds
          old_position = 0
          new_position = None
          while new_position != old_position:
              old_position = driver.execute_script(position_script)
              time.sleep(1)
              driver.execute_script(scroll_script)
              new_position = driver.execute_script(position_script)
          source_page = driver.page_source
      finally:
          # Close the browser even if loading or scrolling raised.
          driver.quit()
      
      # extract the url's and name's
      import html  # local import so this snippet stays self-contained

      counter = 1
      # Marker that ends the href of the playlist entry with the given index.
      element_to_find = 'amp;index={}" ar'
      video_index = source_page.find(element_to_find.format(counter))  # 'amp;index=1" ar'
      while video_index != -1:
          # The title attribute follows the href marker; the video id sits right
          # after the closest preceding 'watch?v='.
          title_marker = source_page.find('title="', video_index)
          id_marker = source_page.rfind('watch?v=', 0, video_index)
          if title_marker == -1 or id_marker == -1:
              # Markup is not shaped as expected: stop instead of scanning
              # past the end of the page forever.
              break
          start_title_position = title_marker + len('title="')
          tag_end = source_page.find('>', start_title_position)
          if tag_end == -1:
              break
          # The title runs up to the quote just before the tag's closing '>'.
          name = source_page[start_title_position:tag_end - 1]  # extract the name of the video
          # Decode HTML entities (&quot;, &amp;, ...) left in the attribute value;
          # the previous replace('"', '"') was a no-op.
          name = html.unescape(name)
          # Take the 11 characters after 'watch?v=' rather than counting back a
          # fixed offset, which only worked for one playlist-id length.
          id_start = id_marker + len('watch?v=')
          video_id = source_page[id_start:id_start + 11]  # extract video id
          print(str(counter)
                + '. link: ' + 'https://www.youtube.com/watch?v=' + video_id +
                ', name: ' + name)
          counter += 1
          video_index = source_page.find(element_to_find.format(counter))  # continue the next video
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 2017-11-01
        • 1970-01-01
        • 2013-04-30
        • 1970-01-01
        • 2021-02-15
        • 1970-01-01
        • 2017-08-10
        • 2014-03-25
        相关资源
        最近更新 更多