首先,下载所需要的库
1: pdfminer 安装库命令:pip install pdfminer3k
2: docx 安装库命令:pip install python-docx
开始正餐:
(注意:只有 PDF 中非图片形式的内容(即真正的文本)才能被成功转换)
1
2
3
4
import sys
import importlib

# NOTE(review): reloading ``sys`` looks like a leftover from the Python 2
# ``setdefaultencoding`` workaround - confirm it is still needed before removing.
importlib.reload(sys)

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
import os


# Switch to the project folder so the bare file names used by this script
# (input PDF, output text file) resolve relative to it.
os.chdir(r'c:/users/dicey/desktop/codes/pdf-docx')
18
def parse(pdf_path: str, output_path: str = r'test2.doc') -> None:
    """Parse a PDF file and append the text of its horizontal text boxes to a file.

    Each page is laid out with pdfminer. The top-level layout objects of every
    page are tallied by type, and the text of each ``LTTextBoxHorizontal`` is
    appended to ``output_path`` followed by a newline. A summary of the counts
    is printed once all pages have been processed.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: File the extracted text is appended to, encoded as UTF-8.
            Defaults to ``test2.doc``.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # The parser is built on this file object, so the file stays open until
    # every page has been processed and is closed even if an error occurs.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()

        # Parser and document are linked to each other before initialization.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # The aggregator collects the layout of the most recently processed page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # num_figure is tallied but not included in the printed summary.
        num_page = num_image = num_curve = num_figure = 0
        num_text_box = 0

        # Open the output once (append mode) instead of once per text box.
        with open(output_path, 'a', encoding='utf-8') as out:
            for page in doc.get_pages():
                num_page += 1
                interpreter.process_page(page)

                layout = device.get_result()
                for item in layout:
                    # Independent checks: one object may match several types.
                    if isinstance(item, LTImage):
                        num_image += 1
                    if isinstance(item, LTCurve):
                        num_curve += 1
                    if isinstance(item, LTFigure):
                        num_figure += 1
                    if isinstance(item, LTTextBoxHorizontal):
                        num_text_box += 1
                        out.write(item.get_text())
                        out.write('\n')

    print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
          '曲线数:%s\n' % num_curve, '水平文本框:%s\n' % num_text_box)
74
75
if __name__ == '__main__':
    # Script entry point: convert the sample PDF found in the working directory.
    pdf_path = r'diya.pdf'
    parse(pdf_path)