from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import threading, os

class PdfThread(threading.Thread):
    """Worker thread that converts queued PDF files into text files.

    Every item taken from ``in_queue`` is treated as the path of a PDF file.
    Its text is extracted with pdfminer (``codec='utf-8'``) and written to
    ``<doc_txt_dir>/<PDF file name without its extension>.txt``.
    """

    def __init__(self, in_queue, doc_txt_dir):
        """Remember the work queue and the output directory.

        Args:
            in_queue: Queue-like object providing ``get()`` and
                ``task_done()``; each item is the path of a PDF file.
            doc_txt_dir: Directory in which the ``.txt`` files are created.
        """
        threading.Thread.__init__(self)
        self.in_queue = in_queue
        self.doc_txt_dir = doc_txt_dir

    def _txt_path(self, in_fname):
        """Return the path of the ``.txt`` file produced for *in_fname*."""
        # splitext() leaves a name that contains no "." untouched; slicing up
        # to rfind(".") would chop off its last character (rfind gives -1).
        stem = os.path.splitext(os.path.basename(in_fname))[0]
        return os.path.join(self.doc_txt_dir, stem + ".txt")

    def _convert(self, in_fname, out_file):
        """Extract the text of the PDF *in_fname* into the file *out_file*.

        Both files and the converter device are closed even when pdfminer
        raises part-way through the document.
        """
        rsrc = PDFResourceManager(caching=True)
        with open(in_fname, 'rb') as fp:
            with open(out_file, 'w') as outfp:
                # Passing LAParams keeps the spaces between characters of
                # the original PDF.
                device = TextConverter(rsrc, outfp, codec='utf-8',
                                       laparams=LAParams())
                try:
                    # An empty page set with maxpages=0 is passed through
                    # unchanged from the original call.
                    process_pdf(rsrc, device, fp, set(), maxpages=0,
                                password='', caching=True,
                                check_extractable=True)
                finally:
                    device.close()

    def run(self):
        """Convert PDFs from the queue forever, one item at a time.

        ``task_done()`` is called exactly once for every item obtained from
        the queue, whether or not its conversion succeeded.  A failed
        conversion is reported on stderr and the loop moves on to the next
        item, so one bad PDF does not stop the worker and leave the
        remaining items unacknowledged.
        """
        import traceback  # local import: only needed for error reporting

        while True:
            # Outside the try block so that task_done() is only ever paired
            # with an item that was actually retrieved.
            in_fname = self.in_queue.get()
            try:
                out_file = self._txt_path(in_fname)
                self._convert(in_fname, out_file)
                print("have convert pdf file %s to file %s" % (in_fname, out_file))
            except Exception:
                traceback.print_exc()
            finally:
                self.in_queue.task_done()
#TagExtractor

 

相关文章:

  • 2022-12-23
  • 2021-10-30
  • 2021-07-16
  • 2021-10-29
  • 2021-12-09
  • 2022-01-25
  • 2022-12-23
猜你喜欢
  • 2021-09-12
  • 2022-12-23
  • 2021-12-30
  • 2022-02-18
  • 2022-12-23
  • 2021-08-14
  • 2022-12-23
相关资源
相似解决方案