C# Pdf to Text with image placeholder
https://stackoverflow.com/a/28087521/
https://stackoverflow.com/a/33697745/
虽然这没有我的问题中提到的确切布局(因为这是我真正想要的简化版本),但它确实具有第二个注释列出的起始部分(从 iText Java 翻译)。 .. 从第三个注释中提取了额外的信息(Java 中使用的一些反射似乎在 C# 中不起作用,因此该信息来自 #3)。
据此,我可以得到一个字符串列表,表示 PDF 中的行(所有页面,而不仅仅是第 1 页)......在图像应该出现的位置添加了文本(Huzzah!)。为风味添加了 ByteArrayToFile 扩展方法(尽管我没有包含可能破坏此代码的复制/粘贴用法的其他部分/扩展)。
我还能够大大简化我的流程的其他部分,并消除我之前工作的一半垃圾。嘘!!!谢谢@Mkl
internal class Program
{
public static void Main(string[] args)
{
var dir = Settings.TestDirectory;
var file = Settings.TestFile;
Log.Info($"File to Process: {file.FullName}");
using (var reader = new PdfReader(file.FullName))
{
var parser = new PdfReaderContentParser(reader);
var listener = new SimpleMixedExtractionStrategy(file, dir);
parser.ProcessContent(1, listener);
var x = listener.GetResultantText().Split('\n');
}
}
}
public class SimpleMixedExtractionStrategy : LocationTextExtractionStrategy
{
public static readonly ILog Log = LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);
public DirectoryInfo OutputPath { get; }
public FileInfo OutputFile { get; }
private static readonly LineSegment UNIT_LINE = new LineSegment(new Vector(0, 0, 1), new Vector(1, 0, 1));
private int _counter;
public SimpleMixedExtractionStrategy(FileInfo outputFile, DirectoryInfo outputPath)
{
OutputPath = outputPath;
OutputFile = outputFile;
}
public override void RenderImage(ImageRenderInfo renderInfo)
{
try
{
var image = renderInfo.GetImage();
if (image == null) return;
var number = _counter++;
var imageFile = new FileInfo($"{OutputFile.FullName}-{number}.{image.GetFileType()}");
imageFile.ByteArrayToFile(image.GetImageAsBytes());
var segment = UNIT_LINE.TransformBy(renderInfo.GetImageCTM());
var location = new TextChunk("[" + imageFile + "]", segment.GetStartPoint(), segment.GetEndPoint(), 0f);
var locationalResultField = typeof(LocationTextExtractionStrategy).GetField("locationalResult", BindingFlags.NonPublic | BindingFlags.Instance);
var LocationalResults = (List<TextChunk>)locationalResultField.GetValue(this);
LocationalResults.Add(location);
}
catch (Exception ex)
{
Log.Debug($"{ex.Message}");
Log.Verbose($"{ex.StackTrace}");
}
}
}
public static class ByteArrayExtensions
{
public static bool ByteArrayToFile(this FileInfo fileName, byte[] byteArray)
{
try
{
// Open file for reading
var fileStream = new FileStream(fileName.FullName, FileMode.Create, FileAccess.Write);
// Writes a block of bytes to this stream using data from a byte array.
fileStream.Write(byteArray, 0, byteArray.Length);
// close file stream
fileStream.Close();
return true;
}
catch (Exception exception)
{
// Error
Log.Error($"Exception caught in process: {exception.Message}", exception);
}
// error occured, return false
return false;
}
}