下载视频:
from bs4 import BeautifulSoup
import requests
import re
import urllib.request  # BUG FIX: bare `import urllib` does not guarantee urllib.request is bound


def callbackfunc(blocknum, blocksize, totalsize):
    """Progress hook for urllib.request.urlretrieve.

    Args:
        blocknum: number of blocks transferred so far.
        blocksize: size of one block in bytes.
        totalsize: total size of the remote file in bytes; urlretrieve
            passes -1 when the server sends no Content-Length.

    Returns:
        The percentage printed (float, capped at 100.0), or None when
        the total size is unknown.
    """
    if totalsize <= 0:
        # BUG FIX: original divided by totalsize unconditionally and
        # crashed (ZeroDivisionError) or printed garbage for unknown sizes.
        return None
    percent = min(100.0, 100.0 * blocknum * blocksize / totalsize)
    print("%.2f%%" % percent)
    return percent


ur = 'http://www.budejie.com/video/'


def get_htmls(url):
    """Fetch the video index page at *url*, locate the first video entry,
    and download its mp4 to the e: drive while printing progress.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'html.parser')
    # The title sits on the container element, the mp4 URL on the player element.
    result1 = soup.find(attrs={'class': 'j-video-c', 'data-title': True})
    result2 = soup.find(attrs={'class': 'j-video', 'data-mp4': True})
    nam = result1.get('data-title')
    url = result2.get('data-mp4')
    # BUG FIX: the original `'e:\'` escaped its own closing quote and did
    # not even parse; a doubled backslash yields the intended e:\ prefix.
    local = 'e:\\' + str(nam) + '.mp4'
    urllib.request.urlretrieve(url, local, callbackfunc)


if __name__ == '__main__':
    get_htmls(ur)
下载小说:
from bs4 import BeautifulSoup
import requests
import re
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

# NOTE: an earlier (disabled) version stored chapter links in a spreadsheet:
#   addr = "1.xlsx"; wb = load_workbook(addr); ws = wb.create_sheet()
#   ws['A1'] = '章节'; ws['B1'] = '链接'
# It is kept here only as a comment; the live code uses the `links` list.

# Relative hrefs of every chapter in the wanted volume, filled by get_parsing().
links = []
ur = 'https://www.qb5200.tw/xiaoshuo/2/2155/'


def get_one_page(url, ok):
    """Download *url* (the site serves GBK-encoded pages).

    When *ok* is truthy the response is fed straight into get_parsing()
    and nothing is returned; otherwise the requests.Response object is
    returned to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'}
    html = requests.get(url, headers=headers)
    html.encoding = 'gbk'
    if ok:
        get_parsing(html)
    else:
        return html


def get_parsing(html):
    """Collect the chapter links of the volume '《龙王传说》正文卷' into *links*.

    The index page lists volumes as <dt> headers followed by their <dd>
    chapter entries; every <dd> before the wanted <dt> header is skipped.
    """
    soup = BeautifulSoup(html.content, 'html.parser')
    # BUG FIX: original passed ['dt', ['dd']] (a nested list) to findAll;
    # BeautifulSoup expects a flat list of tag names.
    entries = soup.findAll(['dt', 'dd'])
    in_volume = False
    for one in entries:
        if one.string == '《龙王传说》正文卷':
            in_volume = True
        if in_volume and one.name == 'dd':
            links.append(one.a.get('href'))


def get_htmls(start=1000, end=1300, path='龙3.txt'):
    """Download chapters links[start:end] and append their text to *path*.

    The original hard-coded the 1000..1300 window and the output file;
    they are now keyword parameters with the same defaults, so existing
    callers (get_htmls()) behave identically.
    """
    i = start
    for link in links[start:]:
        i += 1
        url = ur + link
        html = get_one_page(url, False)
        soup = BeautifulSoup(html.content, 'html.parser')
        # The chapter header block carries the title in its <h1>.
        name = soup.find(attrs={'class': 'content', 'id': False})
        title = name.h1.string
        div = soup.find('div', attrs={'class': 'showtxt'})
        with open(path, 'a', encoding='utf8') as f:
            f.write(title + '\n')
            for string in div.stripped_strings:
                f.write(string + '\n')
        # Progress marker every 10 chapters.
        if i % 10 == 0:
            print(i)
        if i == end:
            break


if __name__ == '__main__':
    get_one_page('https://www.qb5200.tw/xiaoshuo/2/2155/', True)
    get_htmls()