下载视频:
from bs4 import BeautifulSoup
import requests
import re
import urllib.request  # BUG FIX: bare `import urllib` does not guarantee urllib.request is bound


def callbackfunc(blocknum, blocksize, totalsize):
    """Progress hook for urllib.request.urlretrieve.

    Args:
        blocknum: number of blocks transferred so far.
        blocksize: size of one block in bytes.
        totalsize: total size of the remote file in bytes; urlretrieve
            passes -1 when the server sends no Content-Length.

    Returns:
        The percentage printed (float, capped at 100.0), or None when
        the total size is unknown.
    """
    if totalsize <= 0:
        # BUG FIX: original divided by totalsize unconditionally and
        # crashed (ZeroDivisionError) or printed garbage for unknown sizes.
        return None
    percent = min(100.0, 100.0 * blocknum * blocksize / totalsize)
    print("%.2f%%" % percent)
    return percent


ur = 'http://www.budejie.com/video/'


def get_htmls(url):
    """Fetch the video index page at *url*, locate the first video entry,
    and download its mp4 to the e: drive while printing progress.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'html.parser')
    # The title sits on the container element, the mp4 URL on the player element.
    result1 = soup.find(attrs={'class': 'j-video-c', 'data-title': True})
    result2 = soup.find(attrs={'class': 'j-video', 'data-mp4': True})
    nam = result1.get('data-title')
    url = result2.get('data-mp4')
    # BUG FIX: the original `'e:\'` escaped its own closing quote and did
    # not even parse; a doubled backslash yields the intended e:\ prefix.
    local = 'e:\\' + str(nam) + '.mp4'
    urllib.request.urlretrieve(url, local, callbackfunc)


if __name__ == '__main__':
    get_htmls(ur)
下载小说:
from bs4 import BeautifulSoup
import requests
import re
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

# NOTE: an earlier (disabled) version stored chapter links in a spreadsheet:
#   addr = "1.xlsx"; wb = load_workbook(addr); ws = wb.create_sheet()
#   ws['A1'] = '章节'; ws['B1'] = '链接'
# It is kept here only as a comment; the live code uses the `links` list.

# Relative hrefs of every chapter in the wanted volume, filled by get_parsing().
links = []
ur = 'https://www.qb5200.tw/xiaoshuo/2/2155/'


def get_one_page(url, ok):
    """Download *url* (the site serves GBK-encoded pages).

    When *ok* is truthy the response is fed straight into get_parsing()
    and nothing is returned; otherwise the requests.Response object is
    returned to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'}
    html = requests.get(url, headers=headers)
    html.encoding = 'gbk'
    if ok:
        get_parsing(html)
    else:
        return html


def get_parsing(html):
    """Collect the chapter links of the volume '《龙王传说》正文卷' into *links*.

    The index page lists volumes as <dt> headers followed by their <dd>
    chapter entries; every <dd> before the wanted <dt> header is skipped.
    """
    soup = BeautifulSoup(html.content, 'html.parser')
    # BUG FIX: original passed ['dt', ['dd']] (a nested list) to findAll;
    # BeautifulSoup expects a flat list of tag names.
    entries = soup.findAll(['dt', 'dd'])
    in_volume = False
    for one in entries:
        if one.string == '《龙王传说》正文卷':
            in_volume = True
        if in_volume and one.name == 'dd':
            links.append(one.a.get('href'))


def get_htmls(start=1000, end=1300, path='龙3.txt'):
    """Download chapters links[start:end] and append their text to *path*.

    The original hard-coded the 1000..1300 window and the output file;
    they are now keyword parameters with the same defaults, so existing
    callers (get_htmls()) behave identically.
    """
    i = start
    for link in links[start:]:
        i += 1
        url = ur + link
        html = get_one_page(url, False)
        soup = BeautifulSoup(html.content, 'html.parser')
        # The chapter header block carries the title in its <h1>.
        name = soup.find(attrs={'class': 'content', 'id': False})
        title = name.h1.string
        div = soup.find('div', attrs={'class': 'showtxt'})
        with open(path, 'a', encoding='utf8') as f:
            f.write(title + '\n')
            for string in div.stripped_strings:
                f.write(string + '\n')
        # Progress marker every 10 chapters.
        if i % 10 == 0:
            print(i)
        if i == end:
            break


if __name__ == '__main__':
    get_one_page('https://www.qb5200.tw/xiaoshuo/2/2155/', True)
    get_htmls()