Python 网易云音乐抓取
# Crawler for music.163.com: starting from seed playlists it visits every
# linked song and playlist and records the popular ones in text files.
#
# NOTE: the element lookups use the Selenium 3 style API
# (find_element_by_*, webdriver.Chrome(<driver path>)).
import threading
from collections import deque

from selenium import webdriver

# Location of the chromedriver executable used for every browser instance.
CHROME_DRIVER_PATH = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'

# A playlist / song is recorded only when its counter exceeds this value.
MIN_COUNT = 10000

# Seed playlists; thread_1 crawls the first one, thread_2 the second.
url_list = [
    'http://music.163.com/playlist?id=598057191',
    'http://music.163.com/#/playlist?id=144236857',
]

# URLs already visited. Shared by all crawler threads, hence the lock.
songList = set()
playList = set()
_visited_lock = threading.Lock()


def _new_browser():
    """Start a Chrome instance driven by the configured chromedriver."""
    return webdriver.Chrome(CHROME_DRIVER_PATH)


def _first_visit(url, seen):
    """Add *url* to *seen*; return True only the first time it is offered."""
    with _visited_lock:
        if url in seen:
            return False
        seen.add(url)
        return True


def _collect_links(browser, selector, label):
    """Return a deque with the href of every anchor matching *selector*.

    Each link found is echoed to stdout, prefixed with *label*.
    """
    links = deque()
    try:
        for anchor in browser.find_elements_by_css_selector(selector):
            try:
                href = anchor.get_property('href')
                print("%s: %s 地址 %s" % (label, anchor.text, href))
            except Exception:
                # Best effort: one unreadable anchor must not abort the scan.
                continue
            links.append(href)
    except Exception as exc:
        print('someerror: %s' % exc)
    return links


def chrome_browser_songList(url, browser):
    """Process one playlist page, then crawl the songs and playlists it links.

    Loads *url* in *browser*, appends a record to D:\\songList.txt when the
    text of the ``play-count`` element exceeds MIN_COUNT, visits every linked
    song, and finally follows every linked playlist.

    The same *browser* is reused for the nested playlists and is left open;
    the caller owns it and is responsible for quitting it. (Previously this
    function closed the caller's browser, so the caller's next page load
    failed.)

    NOTE: this function and play_list_queue() call each other, so a long
    chain of playlists deepens the call stack.

    Args:
        url: Address of the playlist page.
        browser: Selenium Chrome driver used to load playlist pages.

    Raises:
        ValueError: if the play counter text is not an integer.
        Exception: whatever Selenium raises when the page or its elements
            cannot be read.
    """
    browser.get(url)
    play_count = browser.find_element_by_id('play-count').text
    if int(play_count) > MIN_COUNT:
        title = browser.find_element_by_class_name('f-ff2').text
        # The value is the play counter, so it is labelled as such
        # (the label used to read "评论数", i.e. comment count).
        data = '\n' + title + ' 播放数:' + str(play_count) + ' 地址:' + url
        save_file(data, 'D:\\songList.txt')

    # Songs are opened in browsers of their own, so this page stays loaded
    # and can still be scanned for playlist links afterwards.
    song_queue(_collect_links(browser, r'a[href^=\/song]', '歌曲名字'), browser)

    playlist_links = _collect_links(browser, r'a[href^=\/playlist]', '歌单')
    play_list_queue(playlist_links, browser)


def chrome_browser_song(url):
    """Open a song page in a fresh browser and record it if it is popular.

    A record is appended to D:\\song.txt when the text of the
    ``cnt_comment_count`` element inside the ``g_iframe`` frame exceeds
    MIN_COUNT. The browser is always shut down, even on failure.

    Args:
        url: Address of the song page.

    Raises:
        ValueError: if the comment counter text is not an integer.
        Exception: whatever Selenium raises when the page or its elements
            cannot be read.
    """
    browser = _new_browser()
    try:
        browser.get(url)
        browser.switch_to.frame('g_iframe')
        comment_count = browser.find_element_by_id('cnt_comment_count').text
        if int(comment_count) > MIN_COUNT:
            name = browser.find_element_by_class_name('f-ff2').text
            artist = browser.find_element_by_css_selector(r'a[href^=\/artist]').text
            data = ('\n歌曲名字:' + name + ' 歌手:' + artist
                    + ' 评论数:' + comment_count + ' 歌曲地址:' + url)
            save_file(data, 'D:\\song.txt')
    finally:
        # quit() also ends the chromedriver process; close() only closes
        # the window.
        browser.quit()


def save_file(data, file):
    """Append *data* to the file at path *file*.

    The file is written as UTF-8 so that titles are stored regardless of
    the platform's default encoding.
    """
    with open(file, 'a', encoding='utf-8') as f_obj:
        f_obj.write(data)


def song_queue(songQueue, browser):
    """Visit every not-yet-seen song URL in *songQueue*.

    A failure on one song is reported and the remaining songs are still
    processed.

    Args:
        songQueue: deque of song URLs; it is emptied by this call.
        browser: Unused; kept so existing callers keep working.
    """
    while songQueue:
        current_url = songQueue.popleft()
        if not _first_visit(current_url, songList):
            continue
        try:
            chrome_browser_song(current_url)
        except Exception as exc:
            print('song failed %s: %s' % (current_url, exc))


def play_list_queue(listQueue, browser):
    """Visit every not-yet-seen playlist URL in *listQueue*.

    A failure on one playlist is reported and the remaining playlists are
    still processed.

    Args:
        listQueue: deque of playlist URLs; it is emptied by this call.
        browser: Selenium Chrome driver used to load each playlist page.
    """
    while listQueue:
        current_url = listQueue.popleft()
        if not _first_visit(current_url, playList):
            continue
        try:
            chrome_browser_songList(current_url, browser)
        except Exception as exc:
            print('playlist failed %s: %s' % (current_url, exc))


def _crawl_from(url):
    """Crawl starting at the playlist *url* with a dedicated browser."""
    browser = _new_browser()
    try:
        chrome_browser_songList(url, browser)
    finally:
        browser.quit()


def thread_1():
    """Crawl starting from the first seed playlist."""
    _crawl_from(url_list[0])


def thread_2():
    """Crawl starting from the second seed playlist."""
    _crawl_from(url_list[1])


def thread_song():
    """Return the (not yet started) crawler threads, one per seed playlist."""
    return [
        threading.Thread(target=thread_1),
        threading.Thread(target=thread_2),
    ]


if __name__ == '__main__':
    threads = thread_song()
    for t in threads:
        t.daemon = True
        t.start()
    # Wait for every crawler; joining only one would let the process exit
    # (and kill the other daemon thread) while it is still working.
    for t in threads:
        t.join()
因为没有解决登录问题，所以采用了一种比较笨的方法~~