python转换html到pdf文件
1.安装wkhtmltopdf
Windows 平台直接在 http://wkhtmltopdf.org/downloads.html 下载稳定版的 wkhtmltopdf 并安装。安装完成之后,需要把 wkhtmltopdf 可执行文件所在的目录加入到系统环境变量 PATH 中,否则 pdfkit 找不到 wkhtmltopdf,运行时会报错 “No wkhtmltopdf executable found”。
2.安装pdfkit
直接pip install pdfkit
pdfkit 是 wkhtmltopdf 的Python封装包
import pdfkit

# There are three ways to generate a PDF; each call below writes out.pdf,
# so in practice pick the one that matches your input.

# 1. From a URL.
pdfkit.from_url('http://google.com', 'out.pdf')

# 2. From a local HTML file.
pdfkit.from_file('test.html', 'out.pdf')

# 3. From an HTML string.
pdfkit.from_string('Hello!', 'out.pdf')
3.合并pdf,使用PyPDF2
直接pip install PyPDF2
from PyPDF2 import PdfFileMerger

merger = PdfFileMerger()

# Keep both inputs open until write() has finished, in case the merger still
# reads from them at write time; the with-blocks then close every handle.
with open("1.pdf", "rb") as input1, open("2.pdf", "rb") as input2:
    merger.append(input1)
    merger.append(input2)

    # Write the merged document to the output PDF.
    with open("hql_all.pdf", "wb") as output:
        merger.write(output)

merger.close()
4.综合示例:
# coding=utf-8
import os
import re
import time
import logging
import pdfkit
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>

"""

# Site prefix used to turn relative links into absolute ones.
_BASE_URL = "http://www.liaoxuefeng.com"

# Captures: (1) everything up to and including src=", (2) the URL itself,
# (3) the closing quote.
_IMG_SRC_PATTERN = re.compile(r'(<img .*?src=")(.*?)(")')


def _absolutize_img_src(match):
    """Return the matched <img ... src="..."> text with an absolute src URL."""
    prefix, src, closing_quote = match.groups()
    # The URL is group 2; the original code tested group 3 (the closing
    # quote), so already-absolute URLs were wrongly prefixed as well.
    if src.startswith("http"):
        return match.group(0)
    return prefix + _BASE_URL + src + closing_quote


def parse_url_to_html(url, name):
    """
    Fetch a page, extract its article body and save it as a standalone
    HTML file.

    The page title (the first <h4>) is inserted, centered, near the top of
    the body, and relative <img> src paths are rewritten to absolute URLs.

    :param url: URL of the page to fetch
    :param name: file name the generated HTML is written to
    :return: ``name`` on success, ``None`` if anything failed (the error is
        logged, not raised)
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Article body.
        body = soup.find_all(class_="x-wiki-content")[0]
        # Title.
        title = soup.find('h4').get_text()

        # Put the title near the top of the body, centered.
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)

        # Rewrite relative <img> src paths in the body to absolute URLs.
        html = _IMG_SRC_PATTERN.sub(_absolutize_img_src, str(body))
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name

    except Exception:
        # Best effort: one broken page must not abort the whole run.
        logging.error("解析错误", exc_info=True)
        return None


def get_url_list():
    """
    Collect the URLs of all pages listed in the tutorial's side menu.

    :return: list of absolute page URLs, in menu order
    """
    response = requests.get("http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000")
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    urls = []
    for li in menu_tag.find_all("li"):
        # Skip menu items that carry no link instead of crashing on them.
        href = li.a.get('href') if li.a is not None else None
        if not href:
            continue
        urls.append(_BASE_URL + href)
    return urls


def save_pdf(htmls, file_name):
    """
    Convert HTML file(s) to a PDF file with pdfkit.

    :param htmls: an HTML file name, or a list of HTML file names
    :param file_name: name of the PDF file to write
    :return: None
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)


def main():
    """
    Download every tutorial page, convert each one to a PDF, merge the PDFs
    into a single file and delete the temporary files.
    """
    start = time.time()
    file_name = u"liaoxuefeng_Python3_tutorial"
    urls = get_url_list()

    htmls = []
    pdfs = []
    # Iterate over the pages actually found rather than a hard-coded count,
    # and only convert pages whose HTML was saved successfully.
    for index, url in enumerate(urls):
        html = parse_url_to_html(url, str(index) + ".html")
        if html is None:
            continue  # failure already logged by parse_url_to_html
        pdf = file_name + str(index) + '.pdf'
        save_pdf(html, pdf)
        htmls.append(html)
        pdfs.append(pdf)
        print(u"转换完成第" + str(index) + u'个html')

    merger = PdfFileMerger()
    opened = []
    try:
        for index, pdf in enumerate(pdfs):
            handle = open(pdf, 'rb')
            opened.append(handle)
            merger.append(handle)
            print(u"合并完成第" + str(index) + u'个pdf' + pdf)

        with open(u"廖雪峰Python_all.pdf", "wb") as output:
            merger.write(output)
    finally:
        # Close every handle before the temporary files are removed below.
        merger.close()
        for handle in opened:
            handle.close()

    print(u"输出PDF成功!")

    for html in htmls:
        os.remove(html)
        print(u"删除临时文件" + html)

    for pdf in pdfs:
        os.remove(pdf)
        print(u"删除临时文件" + pdf)

    total_time = time.time() - start
    print(u"总共耗时:%f 秒" % total_time)


if __name__ == '__main__':
    main()