hushaojun

python转换html到pdf文件

1.安装wkhtmltopdf 

Windows 平台直接在 http://wkhtmltopdf.org/downloads.html 下载稳定版的 wkhtmltopdf 进行安装。安装完成之后,把该程序可执行文件所在的目录加入到系统环境变量 PATH 中,否则 pdfkit 找不到 wkhtmltopdf,就会出现错误 “No wkhtmltopdf executable found”。

2.安装pdfkit

直接pip install pdfkit

pdfkit 是 wkhtmltopdf 的Python封装包

import pdfkit

# pdfkit offers three ways to produce a PDF; each call below writes out.pdf.

# 1. From a web page addressed by URL.
pdfkit.from_url('http://google.com', 'out.pdf')

# 2. From an HTML file on disk.
pdfkit.from_file('test.html', 'out.pdf')

# 3. From an HTML string held in memory.
pdfkit.from_string('Hello!', 'out.pdf')

3.合并pdf,使用PyPDF2

直接pip install PyPDF2

from PyPDF2 import PdfFileMerger

merger = PdfFileMerger()
# Keep both inputs open until the merged document has been written, then let
# the with-blocks close every handle (the original listing never closed them).
with open("1.pdf", "rb") as input1, open("2.pdf", "rb") as input2:
    merger.append(input1)
    merger.append(input2)
    # Write the merged result to the output PDF document.
    with open("hql_all.pdf", "wb") as output:
        merger.write(output)

4.综合示例:

  1 # coding=utf-8  
  2 import os  
  3 import re  
  4 import time  
  5 import logging  
  6 import pdfkit  
  7 import requests  
  8 from bs4 import BeautifulSoup  
  9 from PyPDF2 import PdfFileMerger  
 10 
 11 html_template = """ 
 12 <!DOCTYPE html> 
 13 <html lang="en"> 
 14 <head> 
 15     <meta charset="UTF-8"> 
 16 </head> 
 17 <body> 
 18 {content} 
 19 </body> 
 20 </html> 
 21 
 22 """  
 23 
 24 
 25 def parse_url_to_html(url, name):  
 26     """ 
 27     解析URL,返回HTML内容 
 28     :param url:解析的url 
 29     :param name: 保存的html文件名 
 30     :return: html 
 31     """  
 32     try:  
 33         response = requests.get(url)  
 34         soup = BeautifulSoup(response.content, \'html.parser\')  
 35         # 正文  
 36         body = soup.find_all(class_="x-wiki-content")[0]  
 37         # 标题  
 38         title = soup.find(\'h4\').get_text()  
 39 
 40         # 标题加入到正文的最前面,居中显示  
 41         center_tag = soup.new_tag("center")  
 42         title_tag = soup.new_tag(\'h1\')  
 43         title_tag.string = title  
 44         center_tag.insert(1, title_tag)  
 45         body.insert(1, center_tag)  
 46         html = str(body)  
 47         # body中的img标签的src相对路径的改成绝对路径  
 48         pattern = "(<img .*?src=\")(.*?)(\")"  
 49 
 50         def func(m):  
 51             if not m.group(3).startswith("http"):  
 52                 rtn = m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)  
 53                 return rtn  
 54             else:  
 55                 return m.group(1)+m.group(2)+m.group(3)  
 56         html = re.compile(pattern).sub(func, html)  
 57         html = html_template.format(content=html)  
 58         html = html.encode("utf-8")  
 59         with open(name, \'wb\') as f:  
 60             f.write(html)  
 61         return name  
 62 
 63     except Exception as e:  
 64 
 65         logging.error("解析错误", exc_info=True)  
 66 
 67 
 68 def get_url_list():  
 69     """ 
 70     获取所有URL目录列表 
 71     :return: 
 72     """  
 73     response = requests.get("http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000")  
 74     soup = BeautifulSoup(response.content, "html.parser")  
 75     menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]  
 76     urls = []  
 77     for li in menu_tag.find_all("li"):  
 78         url = "http://www.liaoxuefeng.com" + li.a.get(\'href\')  
 79         urls.append(url)  
 80     return urls  
 81 
 82 
 83 def save_pdf(htmls, file_name):  
 84     """ 
 85     把所有html文件保存到pdf文件 
 86     :param htmls:  html文件列表 
 87     :param file_name: pdf文件名 
 88     :return: 
 89     """  
 90     options = {  
 91         \'page-size\': \'Letter\',  
 92         \'margin-top\': \'0.75in\',  
 93         \'margin-right\': \'0.75in\',  
 94         \'margin-bottom\': \'0.75in\',  
 95         \'margin-left\': \'0.75in\',  
 96         \'encoding\': "UTF-8",  
 97         \'custom-header\': [  
 98             (\'Accept-Encoding\', \'gzip\')  
 99         ],  
100         \'cookie\': [  
101             (\'cookie-name1\', \'cookie-value1\'),  
102             (\'cookie-name2\', \'cookie-value2\'),  
103         ],  
104         \'outline-depth\': 10,  
105     }  
106     pdfkit.from_file(htmls, file_name, options=options)  
107 
108 
109 def main():  
110     start = time.time()  
111     file_name = u"liaoxuefeng_Python3_tutorial"  
112     urls = get_url_list()  
113     for index, url in enumerate(urls):  
114       parse_url_to_html(url, str(index) + ".html")  
115     htmls =[]  
116     pdfs =[]  
117     for i in range(0,124):  
118         htmls.append(str(i)+\'.html\')  
119         pdfs.append(file_name+str(i)+\'.pdf\')  
120 
121         save_pdf(str(i)+\'.html\', file_name+str(i)+\'.pdf\')  
122 
123         print u"转换完成第"+str(i)+\'个html\'  
124 
125     merger = PdfFileMerger()  
126     for pdf in pdfs:  
127        merger.append(open(pdf,\'rb\'))  
128        print u"合并完成第"+str(i)+\'个pdf\'+pdf  
129 
130     output = open(u"廖雪峰Python_all.pdf", "wb")  
131     merger.write(output)  
132 
133     print u"输出PDF成功!"  
134 
135     for html in htmls:  
136         os.remove(html)  
137         print u"删除临时文件"+html  
138 
139     for pdf in pdfs:  
140         os.remove(pdf)  
141         print u"删除临时文件"+pdf  
142 
143     total_time = time.time() - start  
144     print(u"总共耗时:%f 秒" % total_time)  
145 
146 
147 if __name__ == \'__main__\':  
148     main()  

 

分类:

技术点:

相关文章: