使用 PDFBox 获取 PDF 文本对象答案

【问题标题】：Getting PDF TextObjects with PDFBox使用 PDFBox 获取 PDF 文本对象
【发布时间】：2014-10-13 10:24:25
【问题描述】：

我有一个 PDF，我使用 PDFBox 从中提取了一个页面：

(...)
// Open the sample PDF from disk.
File input = new File("C:\\temp\\sample.pdf");
document = PDDocument.load(input);
// Fetch the page list from the document catalog.
List allPages = document.getDocumentCatalog().getAllPages();
// List indices are zero-based, so index 2 selects the third entry of the list.
PDPage page = (PDPage) allPages.get(2);
PDStream contents = page.getContents();
// Only print when the page actually has a content stream.
if (contents != null) {
// Dump the page's content stream as a string.
System.out.println(contents.getInputStreamAsString());
(...)

这会根据PDF spec 给出以下结果，看起来就像您所期望的那样。

q
/GS0 gs
/Fm0 Do
Q
/Span <</Lang (en-US)/MCID 88 >>BDC 
BT
/CS0 cs 0 0 0  scn
/GS1 gs
/T1_0 1 Tf
8.5 0 0 8.5 70.8661 576 Tm
(This page has been intentionally left blank.)Tj
ET
EMC 
1 1 1  scn
/GS0 gs
22.677 761.102 28.346 32.599 re
f
/Span <</Lang (en-US)/MCID 89 >>BDC 
BT
0.531 0.53 0.528  scn
/T1_1 1 Tf
9 0 0 9 45.7136 761.1024 Tm
(2)Tj
ET
EMC 
q
0 g
/Fm1 Do
Q

我正在寻找的是将页面上的 PDF 文本对象（TextObject，如 PDF 规范第 5.3 节所述）提取为 Java 对象，也就是 BT 和 ET 之间的部分（本页上有两个）。它们至少应包含“Tj”之前括号内的全部内容（作为字符串），以及根据“Tm”（或“Td”等运算符）得到的 x 和 y 坐标。其他属性算是锦上添花，但并非必需。

PDFTextStripper 似乎要么把每个字符连同其属性作为 TextPosition 返回（对我的用途来说过于琐碎），要么把全部文本作为一个长字符串返回。

PDFBox 是否具有解析页面并提供我错过的此类 TextObjects 的功能？或者，如果我要扩展 PDFBox 以获得我需要的东西，我应该从哪里开始？欢迎任何帮助。

编辑：发现另一个问题here，这为我如何构建我需要的东西提供了灵感。如果我成功了，我会回来检查的。不过，仍然期待您的任何帮助。

谢谢，

菲尔

【问题讨论】：

使用 PDFBox 获得的最佳效果是 PDFStreamParser 返回的令牌。不完全是文本对象，而是可以从中隔离文本对象的操作集合。

标签： java pdfbox

【解决方案1】：

根据链接的问题和昨天mkl 的提示（谢谢！），我决定构建一些东西来解析令牌。需要考虑的是，在 PDF 文本对象中，属性位于运算符之前，因此我收集集合中的所有属性，直到遇到运算符。然后，当我知道属性属于哪个运算符时，我将它们移动到适当的位置。这是我想出的：

import java.io.File;
import java.util.List;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFOperator;

/**
 * Proof of concept that walks the content-stream tokens of one PDF page and
 * prints every text object (the operators between BT and ET) together with
 * the coordinates taken from its Tm operator.
 *
 * Only the BT, ET, Tj and Tm operators are interpreted; all other operators
 * are ignored.
 */
public class TextExtractor {
    /**
     * Loads a hard-coded PDF file, parses the content stream of one page and
     * prints one line per text object in the form {@code Text: <text>@<x>,<y>}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PDDocument document = null;
        try {
            File input = new File("C:\\some\\file.pdf");
            document = PDDocument.load(input);
            List<?> allPages = document.getDocumentCatalog().getAllPages();
            // Index 2 is the third page (zero-based); a single page is parsed because this is only a sample.
            PDPage page = (PDPage) allPages.get(2);
            PDStream contents = page.getContents();
            if (contents == null) {
                // No content stream means there are no tokens to parse.
                System.err.println("Page has no content stream");
                return;
            }
            PDFStreamParser parser = new PDFStreamParser(contents.getStream());
            parser.parse();
            List<?> tokens = parser.getTokens();
            // True while the tokens being read lie between a BT and its matching ET.
            boolean parsingTextObject = false;
            PDFTextObject textobj = new PDFTextObject();
            for (Object next : tokens) {
                if (next instanceof PDFOperator) {
                    // Operands precede their operator, so by the time the operator is
                    // seen, textobj already holds the operands that belong to it.
                    PDFOperator op = (PDFOperator) next;
                    switch (op.getOperation()) {
                        case "BT":
                            // BT: Begin Text.
                            parsingTextObject = true;
                            textobj = new PDFTextObject();
                            break;
                        case "ET":
                            // ET: End Text. Report what was collected for this text object.
                            parsingTextObject = false;
                            System.out.println("Text: " + textobj.getText() + "@" + textobj.getX() + "," + textobj.getY());
                            break;
                        case "Tj":
                            textobj.setText();
                            break;
                        case "Tm":
                            textobj.setMatrix();
                            break;
                        default:
                            // Any other operator is not interpreted.
                            break;
                    }
                    // The pending operands belonged to this operator; discard them.
                    textobj.clearAllAttributes();
                } else if (parsingTextObject) {
                    textobj.addAttribute(next);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the document, even when parsing failed.
            if (document != null) {
                try {
                    document.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

结合：

import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSString;

/**
 * Collects the data of a single PDF text object.
 *
 * Operands are buffered with {@link #addAttribute(Object)} until the caller
 * sees the operator they belong to; {@link #setText()} and
 * {@link #setMatrix()} then interpret the buffered operands, and
 * {@link #clearAllAttributes()} empties the buffer for the next operator.
 */
class PDFTextObject{
    /** Operands collected since the last operator. */
    private final List<Object> attributes = new ArrayList<Object>();
    /** Concatenation of every string consumed by {@link #setText()}. */
    private String text = "";
    /** X coordinate from the most recent matrix; -1 until one was read. */
    private float x = -1;
    /** Y coordinate from the most recent matrix; -1 until one was read. */
    private float y = -1;

    /** Discards all buffered operands. */
    public void clearAllAttributes(){
        attributes.clear();
    }

    /**
     * Buffers one operand.
     *
     * @param anAttribute the operand token to remember
     */
    public void addAttribute(Object anAttribute){
        attributes.add(anAttribute);
    }

    /**
     * Appends every buffered {@code COSString} operand to the text.
     * Operands of any other type are reported on standard output and skipped.
     */
    public void setText(){
        StringBuilder appended = new StringBuilder(text);
        for (Object attribute : attributes){
            if (attribute instanceof COSString){
                appended.append(((COSString) attribute).getString());
            }
            else {
                System.out.println("Whoops! Wrong type of property...");
            }
        }
        text = appended.toString();
    }

    /**
     * @return the text collected so far; empty if none
     */
    public String getText(){
        return text;
    }

    /**
     * Reads x and y from the buffered matrix operands.
     * A matrix has 6 operands, the last two of which are x and y, so the
     * operands at index 4 and 5 are used. A coordinate whose operand is
     * missing is left unchanged; one that is not numeric becomes -1.
     */
    public void setMatrix(){
        if (attributes.size() > 4){
            x = toFloat(attributes.get(4));
        }
        if (attributes.size() > 5){
            y = toFloat(attributes.get(5));
        }
    }

    /** Converts a numeric operand to float; returns -1 for any other type. */
    private static float toFloat(Object operand){
        if (operand instanceof COSFloat){
            return ((COSFloat) operand).floatValue();
        }
        if (operand instanceof COSInteger){
            return ((COSInteger) operand).floatValue();
        }
        return -1;
    }

    /**
     * @return the x coordinate, or -1 if none was read
     */
    public float getX(){
        return x;
    }

    /**
     * @return the y coordinate, or -1 if none was read
     */
    public float getY(){
        return y;
    }
}

它给出了输出：

Text: This page has been intentionally left blank.@70.8661,576.0
Text: 2@45.7136,761.1024

虽然它可以解决问题，但我确信我已经打破了一些约定，并且并不总是写出最优雅的代码。欢迎改进和替代解决方案。

【讨论】：

不过，只考虑 Tj 和 Tm 会忽略很多同样可能出现的情况。例如，有时根本没有 Tm，或者 Tm 设置的信息被其他操作更改。
没错，我主要是把它作为一个可运行的概念验证发布出来，可以根据需要进行扩展。我曾考虑自己完成这些扩展，但我仍在学习 Java 和 PDF。此外，这样做会让这里发布的代码变得更长，而依我看它已经够长了。第三，这会偏离我提出的问题：如何获取 TextObjects。我仍然期待其他方法。
PDFOperator 在 2.0.7 版本的 util 中不存在

【解决方案2】：

我添加了 Phil 回答的一个适用于 pdfbox-2.0.1 的版本：

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSString;

/**
 * PDFBox 2.0 variant of the text-object extractor: walks the content-stream
 * tokens of one PDF page and prints every text object (the operators between
 * BT and ET) together with the coordinates taken from its Tm operator.
 *
 * Only the BT, ET, Tj and Tm operators are interpreted; every other operator
 * and every token outside a text object is reported on standard output.
 */
public class TextExtractor {
  /**
   * Loads a hard-coded PDF file, parses the content stream of its first page
   * and prints one line per text object in the form
   * {@code Text: <text>@<x>,<y>}.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    File input = new File("src\\test\\resources\\files\\file1.pdf");
    // try-with-resources guarantees the document is closed even when parsing fails.
    try (PDDocument document = PDDocument.load(input)) {
      PDPageTree allPages = document.getDocumentCatalog().getPages();
      // Index 0 is the first page; a single page is parsed because this is only a sample.
      PDPage page = allPages.get(0);
      PDFStreamParser parser = new PDFStreamParser(page);
      parser.parse();
      List<?> tokens = parser.getTokens();
      // True while the tokens being read lie between a BT and its matching ET.
      boolean parsingTextObject = false;
      PDFTextObject textobj = new PDFTextObject();
      for (Object next : tokens) {
        if (next instanceof Operator) {
          // Operands precede their operator, so by the time the operator is
          // seen, textobj already holds the operands that belong to it.
          Operator op = (Operator) next;
          switch (op.getName()) {
          case "BT":
            // BT: Begin Text.
            parsingTextObject = true;
            textobj = new PDFTextObject();
            break;
          case "ET":
            // ET: End Text. Report what was collected for this text object.
            parsingTextObject = false;
            System.out.println("Text: " + textobj.getText() + "@" + textobj.getX() + "," + textobj.getY());
            break;
          case "Tj":
            textobj.setText();
            break;
          case "Tm":
            textobj.setMatrix();
            break;
          default:
            System.out.println("unsupported operation " + op);
            break;
          }
          // The pending operands belonged to this operator; discard them.
          textobj.clearAllAttributes();
        } else if (parsingTextObject) {
          textobj.addAttribute(next);
        } else {
          System.out.println("ignore "+next.getClass()+" -> "+next);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }


  /**
   * Collects the data of a single PDF text object.
   *
   * Operands are buffered with {@link #addAttribute(Object)} until the caller
   * sees the operator they belong to; {@link #setText()} and
   * {@link #setMatrix()} then interpret the buffered operands, and
   * {@link #clearAllAttributes()} empties the buffer for the next operator.
   */
  static class PDFTextObject{
      /** Operands collected since the last operator. */
      private final List<Object> attributes = new ArrayList<Object>();
      /** Concatenation of every string consumed by {@link #setText()}. */
      private String text = "";
      /** X coordinate from the most recent matrix; -1 until one was read. */
      private float x = -1;
      /** Y coordinate from the most recent matrix; -1 until one was read. */
      private float y = -1;

      /** Discards all buffered operands. */
      public void clearAllAttributes(){
          attributes.clear();
      }

      /**
       * Buffers one operand.
       *
       * @param anAttribute the operand token to remember
       */
      public void addAttribute(Object anAttribute){
          attributes.add(anAttribute);
      }

      /**
       * Appends every buffered {@code COSString} operand to the text.
       * Operands of any other type are reported on standard output and skipped.
       */
      public void setText(){
          StringBuilder appended = new StringBuilder(text);
          for (Object attribute : attributes){
              if (attribute instanceof COSString){
                  appended.append(((COSString) attribute).getString());
              }
              else {
                  System.out.println("Whoops! Wrong type of property...");
              }
          }
          text = appended.toString();
      }

      /**
       * @return the text collected so far; empty if none
       */
      public String getText(){
          return text;
      }

      /**
       * Reads x and y from the buffered matrix operands.
       * A matrix has 6 operands, the last two of which are x and y, so the
       * operands at index 4 and 5 are used. A coordinate whose operand is
       * missing is left unchanged; one that is not numeric becomes -1.
       */
      public void setMatrix(){
          if (attributes.size() > 4){
              x = toFloat(attributes.get(4));
          }
          if (attributes.size() > 5){
              y = toFloat(attributes.get(5));
          }
      }

      /** Converts a numeric operand to float; returns -1 for any other type. */
      private static float toFloat(Object operand){
          if (operand instanceof COSFloat){
              return ((COSFloat) operand).floatValue();
          }
          if (operand instanceof COSInteger){
              return ((COSInteger) operand).floatValue();
          }
          return -1;
      }

      /**
       * @return the x coordinate, or -1 if none was read
       */
      public float getX(){
          return x;
      }

      /**
       * @return the y coordinate, or -1 if none was read
       */
      public float getY(){
          return y;
      }
  }
}

【讨论】：