PDF.js可以实现在html下直接浏览pdf文档,是一款开源的pdf文档读取解析插件,非常强大,能将PDF文件渲染成Canvas。PDF.js主要包含两个库文件,一个pdf.js和一个pdf.worker.js,一个负责API解析,一个负责核心解析。
首先引入pdf.js文件:<script type="text/javascript" src="pdf.js"></script>
PDF.js大部分用法都是基于Promise的,PDFJS.getDocument(url)方法返回的就是一个Promise:
// getDocument() returns a Promise; the callback receives the loaded document.
PDFJS.getDocument('helloworld.pdf').then(function(pdf) {
  // Work with the loaded `pdf` object here.
});
PDF的解析工作需要通过pdf.getPage(page)去执行,这个方法返回的也是一个Promise,因此可以去逐页解析PDF:
// getPage() also returns a Promise; the callback receives the requested page (page 1 here).
pdf.getPage(1).then(function(page) {
  // Work with the resolved `page` object here.
});
官网地址:http://mozilla.github.io/pdf.js/
渲染页面
每个PDF页面都有自己的视口(viewport),它定义了以像素为单位的页面尺寸(按72 DPI计算)和初始旋转角度。默认情况下,视口会缩放到PDF的原始大小,但可以通过修改视口来改变这一点。创建视口时,还会同时创建一个初始变换矩阵,它会考虑所需的缩放比例和旋转角度,并转换坐标系:PDF文档的(0,0)点位于左下角,而canvas的(0,0)点位于左上角。
// Render `page` into the canvas element with id "the-canvas" at 1.5x scale.
var scale = 1.5;
var viewport = page.getViewport(scale);

var canvas = document.getElementById('the-canvas');
var context = canvas.getContext('2d');
// Size the canvas to the viewport so the rendered page fits exactly.
canvas.height = viewport.height;
canvas.width = viewport.width;

var renderContext = {
  canvasContext: context,
  viewport: viewport
};
page.render(renderContext);
还可以自定义canvas大小:
// Target width for the rendered page.
var desiredWidth = 100;
// Get the viewport at scale 1 to read the page's unscaled width.
var viewport = page.getViewport(1);
// Scale factor that makes the viewport exactly `desiredWidth` wide.
var scale = desiredWidth / viewport.width;
var scaledViewport = page.getViewport(scale);
官方给出的示例:
// Official "hello world" example: load a PDF by URL and render its first page.
var url = '//cdn.mozilla.net/pdfjs/helloworld.pdf';

// Location of the worker script that pdf.js loads.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

var loadingTask = PDFJS.getDocument(url);
loadingTask.promise.then(function(pdf) {
  console.log('PDF loaded');

  var pageNumber = 1;
  pdf.getPage(pageNumber).then(function(page) {
    console.log('Page loaded');

    var scale = 1.5;
    var viewport = page.getViewport(scale);

    // Size the canvas to the viewport before rendering.
    var canvas = document.getElementById('the-canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);
    // render() returns an object that contains the promise, so wait on `.promise`.
    renderTask.promise.then(function () {
      console.log('Page rendered');
    });
  });
}, function (reason) {
  // Loading the document failed.
  console.error(reason);
});
另外较大的PDF文件可以用base 64编码方式加载,例如:
// atob() decodes the Base64-encoded PDF into a binary string.
// The chunks are concatenated in order; each one must stay unchanged.
var pdfData = atob(
  'JVBERi0xLjcKCjEgMCBvYmogICUgZW50cnkgcG9pbnQKPDwKICAvVHlwZSAvQ2F0YWxvZwog' +
  'IC9QYWdlcyAyIDAgUgo+PgplbmRvYmoKCjIgMCBvYmoKPDwKICAvVHlwZSAvUGFnZXMKICAv' +
  'TWVkaWFCb3ggWyAwIDAgMjAwIDIwMCBdCiAgL0NvdW50IDEKICAvS2lkcyBbIDMgMCBSIF0K' +
  'Pj4KZW5kb2JqCgozIDAgb2JqCjw8CiAgL1R5cGUgL1BhZ2UKICAvUGFyZW50IDIgMCBSCiAg' +
  'L1Jlc291cmNlcyA8PAogICAgL0ZvbnQgPDwKICAgICAgL0YxIDQgMCBSIAogICAgPj4KICA+' +
  'PgogIC9Db250ZW50cyA1IDAgUgo+PgplbmRvYmoKCjQgMCBvYmoKPDwKICAvVHlwZSAvRm9u' +
  'dAogIC9TdWJ0eXBlIC9UeXBlMQogIC9CYXNlRm9udCAvVGltZXMtUm9tYW4KPj4KZW5kb2Jq' +
  'Cgo1IDAgb2JqICAlIHBhZ2UgY29udGVudAo8PAogIC9MZW5ndGggNDQKPj4Kc3RyZWFtCkJU' +
  'CjcwIDUwIFRECi9GMSAxMiBUZgooSGVsbG8sIHdvcmxkISkgVGoKRVQKZW5kc3RyZWFtCmVu' +
  'ZG9iagoKeHJlZgowIDYKMDAwMDAwMDAwMCA2NTUzNSBmIAowMDAwMDAwMDEwIDAwMDAwIG4g' +
  'CjAwMDAwMDAwNzkgMDAwMDAgbiAKMDAwMDAwMDE3MyAwMDAwMCBuIAowMDAwMDAwMzAxIDAw' +
  'MDAwIG4gCjAwMDAwMDAzODAgMDAwMDAgbiAKdHJhaWxlcgo8PAogIC9TaXplIDYKICAvUm9v' +
  'dCAxIDAgUgo+PgpzdGFydHhyZWYKNDkyCiUlRU9G');
// Location of the worker script that pdf.js loads.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

// Pass the decoded binary string as `data` instead of a URL.
var loadingTask = PDFJS.getDocument({data: pdfData});
loadingTask.promise.then(function(pdf) {
  console.log('PDF loaded');

  var pageNumber = 1;
  pdf.getPage(pageNumber).then(function(page) {
    console.log('Page loaded');

    var scale = 1.5;
    var viewport = page.getViewport(scale);

    // Size the canvas to the viewport before rendering.
    var canvas = document.getElementById('the-canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);
    // render() returns an object that contains the promise, so wait on `.promise`.
    renderTask.promise.then(function () {
      console.log('Page rendered');
    });
  });
}, function (reason) {
  // Loading the document failed.
  console.error(reason);
});
pdf翻页处理:
// If absolute URL from the remote server is provided, configure the CORS
// header on that server.
var url = '//cdn.mozilla.net/pdfjs/tracemonkey.pdf';

// The workerSrc property shall be specified.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

// Shared paging state, read and written by the functions of this example.
var pdfDoc = null,          // loaded document; null until loading completes
    pageNum = 1,            // page currently shown
    pageRendering = false,  // true while a render is in progress
    pageNumPending = null,  // page requested while another render was running
    scale = 0.8,
    canvas = document.getElementById('the-canvas'),
    ctx = canvas.getContext('2d');

/**
 * Get page info from document, resize canvas accordingly, and render page.
 * @param num Page number.
 */
function renderPage(num) {
  pageRendering = true;

  pdfDoc.getPage(num).then(function(page) {
    var viewport = page.getViewport(scale);
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    var renderContext = {
      canvasContext: ctx,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);

    renderTask.promise.then(function() {
      pageRendering = false;
      // A page was requested while this one was rendering: render it now.
      if (pageNumPending !== null) {
        renderPage(pageNumPending);
        pageNumPending = null;
      }
    });
  });

  // Runs right away, without waiting for the render to finish.
  document.getElementById('page_num').textContent = num;
}
/**
 * Render the requested page immediately when no render is in progress;
 * otherwise remember it so it can be rendered once the current one finishes.
 * @param num Page number.
 */
function queueRenderPage(num) {
  if (!pageRendering) {
    renderPage(num);
    return;
  }
  pageNumPending = num;
}
/**
 * Show the previous page; does nothing when already on page 1.
 */
function onPrevPage() {
  if (pageNum <= 1) {
    return;
  }
  pageNum--;
  queueRenderPage(pageNum);
}
document.getElementById('prev').addEventListener('click', onPrevPage);
/**
 * Show the next page; does nothing on the last page or while the
 * document has not been loaded yet.
 */
function onNextPage() {
  // pdfDoc stays null until the document has loaded; a click before then
  // would otherwise throw when reading numPages.
  if (!pdfDoc || pageNum >= pdfDoc.numPages) {
    return;
  }
  pageNum++;
  queueRenderPage(pageNum);
}
document.getElementById('next').addEventListener('click', onNextPage);
// Load the document, show the page count, and render the initial page.
PDFJS.getDocument(url).then(function(pdfDoc_) {
  pdfDoc = pdfDoc_;
  document.getElementById('page_count').textContent = pdfDoc.numPages;

  renderPage(pageNum);
}, function (reason) {
  // Loading the document failed.
  console.error(reason);
});
关于page对象的使用:
解析结果,我们可以看下这个对象提供的方法:
| 方法 | 返回 |
|---|---|
| getAnnotations | A promise that is resolved with an {Array} of the annotation objects. |
| getTextContent | A promise that is resolved with a TextContent object that represents the page text content. |
| getViewport | Contains ‘width’ and ‘height’ properties along with transforms required for rendering. |
| render | An object that contains the promise, which is resolved when the page finishes rendering. |
我们可以试试调用getTextContent方法,并将其结果打印出来:
// Fetch page 1, request its text content, and print the resolved result.
pdf.getPage(1).then(function(page) {
  // getTextContent() returns a promise; return it so the next then() receives the result.
  return page.getTextContent();
}).then(function(textContent) {
  console.log(textContent);
});
输出格式大致如下:
{ "items": [ { "str": "xxx", "dir": "xxx", "width": xxx, "height": xxx, "transform": [ 48, 0, 0, 48, 45.32495, 679.04 ], "fontName": "g_d0_f1" }, { "str": " ", "dir": "ltr", "width": 9.600000000000001, "height": 2304, "transform": [ 48, 0, 0, 48, 285.325, 679.04 ], "fontName": "g_d0_f2" } ], "styles": { "g_d0_f1": { "fontFamily": "monospace", "ascent": 1.05810546875, "descent": -0.26171875, "vertical": false }, "g_d0_f2": { "fontFamily": "sans-serif", "ascent": 0.74365234375, "descent": -0.25634765625 } } }
PDF.js能将每页文本的字符串、位置、字体都解析出来。
官网用的viewer.js:http://mozilla.github.io/pdf.js/web/viewer.html 。首先底图是一个Canvas,内容和PDF一样(通过上面介绍的page.render方法可以得到),底图之上是一个textLayer,这一层通过page.getTextContent()得到文字的位置和样式,再覆盖在Canvas上。
我们可以直接使用官网viewer.html的demo,然后修改样式、去掉用不到的功能,简单粗暴。只需要在跳转链接后面加上file参数就行,例:http://xxxx/viewer.html?file=xxxx.pdf