"""Scrape a document preview from book118 and save it as a PDF.

The script reads the document page to obtain the preview parameters,
downloads each preview page as a PNG image, and finally merges the images
into a single PDF stored next to them.
"""

import glob
import json
import os
import re
import time
from urllib.parse import urljoin
from urllib.request import urlretrieve

import fitz
import requests

# The start-page stepping below mirrors the original ``j * 6 + 1`` logic,
# i.e. it assumes each preview response covers six pages.
PAGES_PER_REQUEST = 6

# Seconds to wait for an HTTP response before giving up instead of hanging.
REQUEST_TIMEOUT = 30


def _first_match(pattern, text, field):
    """Return the first capture of ``pattern`` in ``text``.

    Raises:
        IndexError: If the pattern does not occur; the message names ``field``.
    """
    found = re.findall(pattern, text)
    if not found:
        raise IndexError(f"could not find {field!r} in the page source")
    return found[0]


def _page_order(path):
    """Sort key placing ``<number>.png`` files in numeric order, others last."""
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.isdigit():
        return (0, int(stem), "")
    return (1, 0, stem)


def _safe_filename(name):
    """Replace characters that are not allowed in file names with ``_``."""
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name).strip()
    return cleaned or "document"


def _call_first_available(obj, names, *args):
    """Call the first method of ``obj`` found in ``names``.

    PyMuPDF renamed its camelCase methods to snake_case; trying both keeps the
    script working with old and new releases.
    """
    for name in names:
        method = getattr(obj, name, None)
        if method is not None:
            return method(*args)
    raise AttributeError(f"{type(obj).__name__} has none of {names}")


def get_params(url, headers):
    """Fetch the document page and extract the preview parameters.

    Args:
        url: Address of the document's HTML page.
        headers: HTTP headers sent with the request.

    Returns:
        A dict with the keys ``title``, ``aid``, ``view_token`` and ``page``,
        each holding the raw string captured from the page source.

    Raises:
        IndexError: If one of the fields cannot be found in the page source.
    """
    res = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    text = res.text
    # NOTE(review): values are returned exactly as captured, including any
    # quoting used in the page source -- confirm the preview API expects that.
    return {
        "title": _first_match("<title>(.*)</title>", text, "title"),
        "aid": _first_match("aid: (.*), //解密后的id", text, "aid"),
        "view_token": _first_match(
            "view_token: (.*) //预览的token", text, "view_token"
        ),
        "page": _first_match("actual_page: (.*), //真实页数", text, "page"),
    }


def get_imgs(headers, title, aid, view_token, page, img_path):
    """Download the preview images into ``img_path`` as ``1.png``, ``2.png``...

    Args:
        headers: HTTP headers sent with every preview request.
        title: Document title (unused here; kept for interface compatibility).
        aid: Document id taken from the document page.
        view_token: Preview token taken from the document page.
        page: Total number of pages (string or int).
        img_path: Existing directory that receives the PNG files.
    """
    api_url = "https://openapi.book118.com/getPreview.html"
    query = {
        "project_id": 1,
        "aid": aid,
        "view_token": view_token,
        "page": page,
    }
    image_no = 1
    # Step over the first page of each batch. This never asks for a batch
    # beyond the last page, which the previous ``int(page) / 6`` arithmetic
    # did whenever the page count was a multiple of six greater than six.
    for first_page in range(1, int(page) + 1, PAGES_PER_REQUEST):
        query["page"] = first_page
        res = requests.get(
            url=api_url,
            headers=headers,
            params=query,
            timeout=REQUEST_TIMEOUT,
        )
        time.sleep(2)  # pause between preview requests
        # The JSON object is cut out of the response text before parsing.
        payload = json.loads(re.findall("{.*}", res.text)[0])
        for src in payload["data"].values():
            img_url = urljoin("https:", src)
            urlretrieve(img_url, os.path.join(img_path, f"{image_no}.png"))
            image_no += 1


def img_pdf(img_path, pdf_name):
    """Merge every PNG in ``img_path`` into ``<pdf_name>.pdf`` in that folder.

    Images are appended in numeric page order (1, 2, ..., 10), not in the
    lexicographic order a plain ``sorted`` would give (1, 10, 11, 2, ...).

    Args:
        img_path: Directory containing the page images.
        pdf_name: Base name of the PDF; characters that are invalid in file
            names are replaced with ``_``.
    """
    images = sorted(glob.glob(os.path.join(img_path, "*.png")), key=_page_order)
    # Start from an empty document and append one converted image at a time.
    doc = fitz.open()
    try:
        for img in images:
            imgdoc = fitz.open(img)
            try:
                pdfbytes = _call_first_available(
                    imgdoc, ("convert_to_pdf", "convertToPDF")
                )
            finally:
                imgdoc.close()
            # Re-open the converted bytes as a PDF and append it to the output.
            imgpdf = fitz.open("pdf", pdfbytes)
            try:
                _call_first_available(doc, ("insert_pdf", "insertPDF"), imgpdf)
            finally:
                imgpdf.close()
        doc.save(os.path.join(img_path, f"{_safe_filename(pdf_name)}.pdf"))
    finally:
        doc.close()


def main(url, img_path):
    """Download the document at ``url`` and build its PDF inside ``img_path``.

    Args:
        url: Address of the document's HTML page.
        img_path: Working directory for the images and the resulting PDF;
            created (including parents) when it does not exist.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/92.0.4515.159 Safari/537.36"
        ),
    }
    params = get_params(url, headers)
    os.makedirs(img_path, exist_ok=True)
    get_imgs(
        headers,
        params["title"],
        params["aid"],
        params["view_token"],
        params["page"],
        img_path,
    )
    img_pdf(img_path, params["title"])


if __name__ == "__main__":
    url = "https://max.book118.com/html/2018/1027/5041323214001323.shtm"
    # Alternative sample document:
    # url = "https://max.book118.com/html/2018/0706/6225232151001204.shtm"
    # Each run gets its own timestamped working directory under "statics".
    img_path = os.path.join("statics", str(int(time.time())))
    main(url, img_path)