V1.0
功能:从比较知名的几个电影下载网站爬取下载链接,并自动打印出来。
代码:
# -*- coding: utf8 -*-
"""Interactive scraper that prints movie download links from three sites.

For every movie name typed by the user, the script queries the search pages
of ygdy (s.dydytt.net / www.ygdy8.com), bttiantang and dytt, follows each
search hit to its detail page and prints the download links found there.
"""
import re
from urllib.parse import quote, unquote, urljoin

from bs4 import BeautifulSoup
import lxml  # noqa: F401  -- not used by name; backs BeautifulSoup(..., 'lxml')
import requests

# Encoding used to percent-encode the keyword for the ygdy and dytt searches.
SEARCH_ENCODING = 'gb2312'
# Seconds to wait for a server before giving up; requests waits forever
# when no timeout is given.
REQUEST_TIMEOUT = 10
# Results whose title contains this word are skipped unless the user's own
# query contains it as well.
GAME_KEYWORD = '游戏'
NOT_FOUND_MESSAGE = '查无此电影,请检查后重试'

YGDY_SEARCH_URL = 'http://s.dydytt.net/plus/search.php?kwtype=0&keyword='
YGDY_ROOT = 'http://www.ygdy8.com/'
BTTIANTANG_ROOT = 'http://www.bttiantang.com/'
BTTIANTANG_SEARCH_URL = 'http://www.bttiantang.com/s.php?q='
DYTT_ROOT = 'http://www.dytt.com/'
DYTT_SEARCH_URL = 'http://www.dytt.com/search.asp?searchword='

# Tried in order on a dytt detail page; the first pattern that matches wins.
_DYTT_LINK_PATTERNS = (
    re.compile(r'ed2k:.*?\|/'),
    re.compile(r'http:.*?torrent'),
    re.compile(r'ftp:.*?mkv'),
)


def _fetch_text(url, encoding):
    """Download *url* and decode the body with *encoding*, ignoring bad bytes."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return response.content.decode(encoding, 'ignore')


def _fetch_soup(url, encoding):
    """Download *url* and parse it with the lxml-backed BeautifulSoup parser."""
    return BeautifulSoup(_fetch_text(url, encoding), 'lxml')


def _parse_page_count(matches):
    """Return ``matches[0]`` as a non-negative int, or 0 if it is unusable."""
    if not matches:
        return 0
    try:
        return max(int(matches[0]), 0)
    except ValueError:
        return 0


def _is_unwanted_game(title, query):
    """Return True when *title* is a game entry the user did not search for.

    *query* may be the raw name or its percent-encoded GB2312 form; it is
    decoded first, because the keyword can never occur in the encoded text.
    """
    if GAME_KEYWORD not in title:
        return False
    readable_query = unquote(str(query), encoding=SEARCH_ENCODING,
                             errors='replace')
    return GAME_KEYWORD not in readable_query


def _find_dytt_links(page_text):
    """Return the download links of a dytt detail page (ed2k, torrent, ftp)."""
    for pattern in _DYTT_LINK_PATTERNS:
        links = pattern.findall(page_text)
        if links:
            return links
    return []


def _ygdy_page_count(pager_cells):
    """Read the total page count from the ygdy pager cells, 0 if unknown."""
    # The original code reads the page number from the link in the second
    # pager cell; presumably that is the "last page" link -- TODO confirm.
    if len(pager_cells) < 2:
        return 0
    anchor = pager_cells[1].find('a')
    if anchor is None:
        return 0
    return _parse_page_count(
        re.findall(r'PageNo=(\d+)', anchor.get('href') or ''))


def get_name():
    """Prompt for movie names in a loop and search every site for each one.

    The loop ends when input is closed (EOF) or interrupted (Ctrl-C). A
    network failure on one site is reported and the remaining sites are
    still searched.
    """
    while True:
        try:
            moviename = input('请输入要查找的电影名\n->')
        except (EOFError, KeyboardInterrupt):
            return
        if not moviename.strip():
            continue
        try:
            moviename_quote = quote(moviename.encode(SEARCH_ENCODING))
        except UnicodeEncodeError:
            print('电影名包含无法用 GB2312 编码的字符,请重新输入')
            continue
        searches = (
            (get_url_from_ygdy, moviename_quote),
            (get_url_from_bttiantang, moviename),
            (get_url_from_dytt, moviename_quote),
        )
        for search, keyword in searches:
            try:
                search(keyword)
            except requests.RequestException as err:
                print('请求失败:', err)


def get_url_from_ygdy(moviename):
    """Search ygdy for *moviename* and print the links of every hit.

    Args:
        moviename: keyword already percent-encoded from GB2312 bytes.
    """
    url = YGDY_SEARCH_URL + str(moviename)
    soup = _fetch_soup(url, 'gb2312')
    movie_infos = soup.find_all('td', width='55%')
    if not movie_infos:
        print(NOT_FOUND_MESSAGE)
        return
    print('阳光电影搜索结果:')
    page_count = _ygdy_page_count(soup.find_all('td', width='30'))
    if not page_count:
        # No usable pager: everything is on the page already fetched.
        for movie_info in movie_infos:
            get_info(movie_info, moviename)
        return
    for page_no in range(1, page_count + 1):
        print('第', page_no, '页:')
        page_soup = _fetch_soup(url + '&PageNo=' + str(page_no), 'gb2312')
        for movie_info in page_soup.find_all('td', width='55%'):
            get_info(movie_info, moviename)


def get_info(movie_info, name):
    """Print the title and download links of one ygdy search hit.

    Args:
        movie_info: the result cell (a bs4 tag) holding the detail-page link.
        name: the search keyword, raw or percent-encoded.
    """
    anchor = movie_info.find('a')
    if anchor is None or not anchor.get('href'):
        return
    moviename = movie_info.text
    if _is_unwanted_game(moviename, name):
        return
    print('电影名:', moviename)
    detail = _fetch_soup(urljoin(YGDY_ROOT, anchor.get('href')), 'gbk')
    cells = detail.find_all('td', style='WORD-WRAP: break-word')
    link_anchors = [cell.find('a') for cell in cells]
    links = [link.string for link in link_anchors if link is not None]
    print('下载链接:')
    if len(links) == 1:
        print(links[0])
    else:
        for index, link in enumerate(links, start=1):
            print('链接', index, ':', link)
    print('\n')


def get_url_from_bttiantang(moviename):
    """Search bttiantang for *moviename* and print the links of every hit.

    Args:
        moviename: the raw (unencoded) movie name.
    """
    # Quote the keyword so characters such as '&' or '#' cannot break the URL.
    baseurl = BTTIANTANG_SEARCH_URL + quote(str(moviename))
    text = _fetch_text(baseurl, 'utf8')
    page_count = _parse_page_count(re.findall('</b>条<b>(.*?)</b>', text))
    content = BeautifulSoup(text, 'lxml').find_all('p', class_='tt cl')
    if not content:
        print(NOT_FOUND_MESSAGE)
        return
    print('BT天堂搜索结果:')
    if not page_count:
        for each in content:
            get_movieinfo(each, moviename)
        return
    for page_no in range(1, page_count + 1):
        print('第', page_no, '页:')
        page_soup = _fetch_soup(baseurl + '&PageNo=' + str(page_no), 'utf8')
        for each in page_soup.find_all('p', class_='tt cl'):
            get_movieinfo(each, moviename)


def get_movieinfo(movie_content, name):
    """Print the title and download links of one bttiantang search hit.

    Args:
        movie_content: the result element (a bs4 tag) holding the detail link.
        name: the search keyword, used for the game filter.
    """
    anchor = movie_content.find('a')
    if anchor is None or not anchor.get('href'):
        return
    moviename = movie_content.text
    if _is_unwanted_game(moviename, name):
        return
    print('电影名:', moviename)
    detail = _fetch_soup(urljoin(BTTIANTANG_ROOT, anchor.get('href')), 'utf8')
    print('下载链接:')
    index = 0
    for block in detail.find_all('div', class_='tinfo'):
        link = block.find('a')
        if link is None or not link.get('href'):
            continue
        index += 1
        print('链接' + str(index) + ':')
        print(urljoin(BTTIANTANG_ROOT, link.get('href')))


def get_url_from_dytt(moviename):
    """Search dytt for *moviename* and print the links of every hit.

    Args:
        moviename: keyword already percent-encoded from GB2312 bytes.
    """
    baseurl = DYTT_SEARCH_URL + str(moviename)
    text = _fetch_text(baseurl, 'gbk')
    page_count = _parse_page_count(
        re.findall('下一页.*?href.*?page=(.*?)&', text))
    items = BeautifulSoup(text, 'lxml').find_all('p', class_='s1')
    # The first 'p.s1' element is always skipped and a lone one means "no
    # results"; presumably it is a header row -- TODO confirm.
    if len(items) <= 1:
        print(NOT_FOUND_MESSAGE)
        return
    print('电影淘淘搜索结果:')
    if not page_count:
        for item in items[1:]:
            get_movieinfo_from_dytt(item, moviename)
        return
    for page_no in range(1, page_count + 1):
        print('第', page_no, '页:')
        page_soup = _fetch_soup(baseurl + '&page=' + str(page_no), 'gbk')
        for item in page_soup.find_all('p', class_='s1')[1:]:
            get_movieinfo_from_dytt(item, moviename)


def get_movieinfo_from_dytt(item, name):
    """Print the title and download links of one dytt search hit.

    Args:
        item: the result element (a bs4 tag) holding the detail-page link.
        name: the search keyword, raw or percent-encoded.
    """
    anchor = item.find('a')
    if anchor is None or not anchor.get('href'):
        return
    moviename = anchor.text
    if _is_unwanted_game(moviename, name):
        return
    print('电影名:', moviename)
    pagecontent = _fetch_text(urljoin(DYTT_ROOT, anchor.get('href')), 'gbk')
    print('下载链接:')
    # The matched text is printed as-is: the old code re-added 'ed2k://|file|'
    # in front of a capture that already began with '//|file|'.
    for index, link in enumerate(_find_dytt_links(pagecontent), start=1):
        print('链接' + str(index) + ':', link)


if __name__ == '__main__':
    get_name()
运行结果: