From 58277c8b76af68b466be8b28f26abaaf29da05a2 Mon Sep 17 00:00:00 2001 From: Pdf Bot Date: Wed, 25 Nov 2015 17:15:54 +0000 Subject: [PATCH] PDF.js version 1.3.34 - See mozilla/pdf.js@0819d71a027020d44b37a31808bf778fc2c0ea14 --- bower.json | 2 +- build/pdf.combined.js | 58 +++++++++++++++++++++++++++++++++---------- build/pdf.js | 20 ++++++++++++--- build/pdf.worker.js | 42 +++++++++++++++++++++++-------- package.json | 2 +- web/pdf_viewer.js | 6 ++--- 6 files changed, 97 insertions(+), 33 deletions(-) diff --git a/bower.json b/bower.json index 54c72577f..d5387ccb1 100644 --- a/bower.json +++ b/bower.json @@ -1,6 +1,6 @@ { "name": "pdfjs-dist", - "version": "1.3.32", + "version": "1.3.34", "main": [ "build/pdf.js", "build/pdf.worker.js" diff --git a/build/pdf.combined.js b/build/pdf.combined.js index 0903675b5..1c4de3c3e 100644 --- a/build/pdf.combined.js +++ b/build/pdf.combined.js @@ -20,8 +20,8 @@ if (typeof PDFJS === 'undefined') { (typeof window !== 'undefined' ? window : this).PDFJS = {}; } -PDFJS.version = '1.3.32'; -PDFJS.build = 'c2dfe9e'; +PDFJS.version = '1.3.34'; +PDFJS.build = '0819d71'; (function pdfjsWrapper() { // Use strict in our context only - users might not want it @@ -2354,6 +2354,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() { return PDFDocumentProxy; })(); +/** + * Page getTextContent parameters. + * + * @typedef {Object} getTextContentParameters + * @param {boolean} normalizeWhitespace - replaces all occurrences of + * whitespace with standard spaces (0x20). The default value is `false`. + */ + /** * Page text content. * @@ -2632,12 +2640,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() { }, /** + * @param {getTextContentParameters} params - getTextContent parameters. * @return {Promise} That is resolved a {@link TextContent} * object that represent the page text content. */ - getTextContent: function PDFPageProxy_getTextContent() { + getTextContent: function PDFPageProxy_getTextContent(params) { + var normalizeWhitespace = (params && params.normalizeWhitespace) || false; + return this.transport.messageHandler.sendWithPromise('GetTextContent', { - pageIndex: this.pageNumber - 1 + pageIndex: this.pageNumber - 1, + normalizeWhitespace: normalizeWhitespace, }); }, @@ -9909,7 +9921,8 @@ var Page = (function PageClosure() { }); }, - extractTextContent: function Page_extractTextContent(task) { + extractTextContent: function Page_extractTextContent(task, + normalizeWhitespace) { var handler = { on: function nullHandlerOn() {}, send: function nullHandlerSend() {} @@ -9939,7 +9952,9 @@ var Page = (function PageClosure() { return partialEvaluator.getTextContent(contentStream, task, - self.resources); + self.resources, + /* stateManager = */ null, + normalizeWhitespace); }); }, @@ -18842,12 +18857,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { }); }, - getTextContent: function PartialEvaluator_getTextContent(stream, task, - resources, - stateManager) { + getTextContent: + function PartialEvaluator_getTextContent(stream, task, resources, + stateManager, + normalizeWhitespace) { stateManager = (stateManager || new StateManager(new TextState())); + var WhitespaceRegexp = /\s/g; + var textContent = { items: [], styles: Object.create(null) @@ -18961,11 +18979,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { return textContentItem; } + function replaceWhitespace(str) { + // Replaces all whitespaces with standard spaces (0x20), to avoid + // alignment issues between the textLayer and the canvas if the text + // contains e.g. tabs (fixes issue6612.pdf). + var i = 0, ii = str.length, code; + while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) { + i++; + } + return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str); + } + function runBidiTransform(textChunk) { var str = textChunk.str.join(''); var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical); return { - str: bidiResult.str, + str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) : + bidiResult.str), dir: bidiResult.dir, width: textChunk.width, height: textChunk.height, @@ -19286,8 +19316,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } return self.getTextContent(xobj, task, - xobj.dict.get('Resources') || resources, stateManager). - then(function (formTextContent) { + xobj.dict.get('Resources') || resources, stateManager, + normalizeWhitespace).then(function (formTextContent) { Util.appendToArray(textContent.items, formTextContent.items); Util.extendObj(textContent.styles, formTextContent.styles); stateManager.restore(); @@ -41996,12 +42026,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = { handler.on('GetTextContent', function wphExtractText(data) { var pageIndex = data.pageIndex; + var normalizeWhitespace = data.normalizeWhitespace; return pdfManager.getPage(pageIndex).then(function(page) { var task = new WorkerTask('GetTextContent: page ' + pageIndex); startWorkerTask(task); var pageNum = pageIndex + 1; var start = Date.now(); - return page.extractTextContent(task).then(function(textContent) { + return page.extractTextContent(task, normalizeWhitespace).then( + function(textContent) { finishWorkerTask(task); info('text indexing: page=' + pageNum + ' - time=' + (Date.now() - start) + 'ms'); diff --git a/build/pdf.js b/build/pdf.js index cc50e4c26..db8a394b2 100644 --- a/build/pdf.js +++ b/build/pdf.js @@ -20,8 +20,8 @@ if (typeof PDFJS === 'undefined') { (typeof window !== 'undefined' ? window : this).PDFJS = {}; } -PDFJS.version = '1.3.32'; -PDFJS.build = 'c2dfe9e'; +PDFJS.version = '1.3.34'; +PDFJS.build = '0819d71'; (function pdfjsWrapper() { // Use strict in our context only - users might not want it @@ -2354,6 +2354,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() { return PDFDocumentProxy; })(); +/** + * Page getTextContent parameters. + * + * @typedef {Object} getTextContentParameters + * @param {boolean} normalizeWhitespace - replaces all occurrences of + * whitespace with standard spaces (0x20). The default value is `false`. + */ + /** * Page text content. * @@ -2632,12 +2640,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() { }, /** + * @param {getTextContentParameters} params - getTextContent parameters. * @return {Promise} That is resolved a {@link TextContent} * object that represent the page text content. */ - getTextContent: function PDFPageProxy_getTextContent() { + getTextContent: function PDFPageProxy_getTextContent(params) { + var normalizeWhitespace = (params && params.normalizeWhitespace) || false; + return this.transport.messageHandler.sendWithPromise('GetTextContent', { - pageIndex: this.pageNumber - 1 + pageIndex: this.pageNumber - 1, + normalizeWhitespace: normalizeWhitespace, }); }, diff --git a/build/pdf.worker.js b/build/pdf.worker.js index f8c4527d3..02816fb62 100644 --- a/build/pdf.worker.js +++ b/build/pdf.worker.js @@ -20,8 +20,8 @@ if (typeof PDFJS === 'undefined') { (typeof window !== 'undefined' ? window : this).PDFJS = {}; } -PDFJS.version = '1.3.32'; -PDFJS.build = 'c2dfe9e'; +PDFJS.version = '1.3.34'; +PDFJS.build = '0819d71'; (function pdfjsWrapper() { // Use strict in our context only - users might not want it @@ -2849,7 +2849,8 @@ var Page = (function PageClosure() { }); }, - extractTextContent: function Page_extractTextContent(task) { + extractTextContent: function Page_extractTextContent(task, + normalizeWhitespace) { var handler = { on: function nullHandlerOn() {}, send: function nullHandlerSend() {} @@ -2879,7 +2880,9 @@ var Page = (function PageClosure() { return partialEvaluator.getTextContent(contentStream, task, - self.resources); + self.resources, + /* stateManager = */ null, + normalizeWhitespace); }); }, @@ -11782,12 +11785,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { }); }, - getTextContent: function PartialEvaluator_getTextContent(stream, task, - resources, - stateManager) { + getTextContent: + function PartialEvaluator_getTextContent(stream, task, resources, + stateManager, + normalizeWhitespace) { stateManager = (stateManager || new StateManager(new TextState())); + var WhitespaceRegexp = /\s/g; + var textContent = { items: [], styles: Object.create(null) @@ -11901,11 +11907,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { return textContentItem; } + function replaceWhitespace(str) { + // Replaces all whitespaces with standard spaces (0x20), to avoid + // alignment issues between the textLayer and the canvas if the text + // contains e.g. tabs (fixes issue6612.pdf). + var i = 0, ii = str.length, code; + while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) { + i++; + } + return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str); + } + function runBidiTransform(textChunk) { var str = textChunk.str.join(''); var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical); return { - str: bidiResult.str, + str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) : + bidiResult.str), dir: bidiResult.dir, width: textChunk.width, height: textChunk.height, @@ -12226,8 +12244,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } return self.getTextContent(xobj, task, - xobj.dict.get('Resources') || resources, stateManager). - then(function (formTextContent) { + xobj.dict.get('Resources') || resources, stateManager, + normalizeWhitespace).then(function (formTextContent) { Util.appendToArray(textContent.items, formTextContent.items); Util.extendObj(textContent.styles, formTextContent.styles); stateManager.restore(); @@ -34936,12 +34954,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = { handler.on('GetTextContent', function wphExtractText(data) { var pageIndex = data.pageIndex; + var normalizeWhitespace = data.normalizeWhitespace; return pdfManager.getPage(pageIndex).then(function(page) { var task = new WorkerTask('GetTextContent: page ' + pageIndex); startWorkerTask(task); var pageNum = pageIndex + 1; var start = Date.now(); - return page.extractTextContent(task).then(function(textContent) { + return page.extractTextContent(task, normalizeWhitespace).then( + function(textContent) { finishWorkerTask(task); info('text indexing: page=' + pageNum + ' - time=' + (Date.now() - start) + 'ms'); diff --git a/package.json b/package.json index 5c0ec6e65..e65005c1a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfjs-dist", - "version": "1.3.32", + "version": "1.3.34", "description": "Generic build of Mozilla's PDF.js library.", "keywords": [ "Mozilla", diff --git a/web/pdf_viewer.js b/web/pdf_viewer.js index 4c0cb6094..c1c264a48 100644 --- a/web/pdf_viewer.js +++ b/web/pdf_viewer.js @@ -1345,7 +1345,7 @@ var PDFPageView = (function PDFPageViewClosure() { function pdfPageRenderCallback() { pageViewDrawCallback(null); if (textLayer) { - self.pdfPage.getTextContent().then( + self.pdfPage.getTextContent({ normalizeWhitespace: true }).then( function textContentResolved(textContent) { textLayer.setTextContent(textContent); textLayer.render(TEXT_LAYER_RENDER_DELAY); @@ -2368,7 +2368,7 @@ var PDFViewer = (function pdfViewer() { if (!this.pdfDocument) { return; } - + var pageView = this._pages[pageNumber - 1]; if (this.isInPresentationMode) { @@ -2626,7 +2626,7 @@ var PDFViewer = (function pdfViewer() { getPageTextContent: function (pageIndex) { return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { - return page.getTextContent(); + return page.getTextContent({ normalizeWhitespace: true }); }); },