diff --git a/src/core/document.js b/src/core/document.js index 6b9a7bced..a23c9fd89 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -265,7 +265,8 @@ var Page = (function PageClosure() { }, extractTextContent: function Page_extractTextContent(task, - normalizeWhitespace) { + normalizeWhitespace, + combineTextItems) { var handler = { on: function nullHandlerOn() {}, send: function nullHandlerSend() {} @@ -298,7 +299,8 @@ var Page = (function PageClosure() { task, self.resources, /* stateManager = */ null, - normalizeWhitespace); + normalizeWhitespace, + combineTextItems); }); }, diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 47f5816b1..bce4ce6a5 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -1110,7 +1110,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { getTextContent: function PartialEvaluator_getTextContent(stream, task, resources, stateManager, - normalizeWhitespace) { + normalizeWhitespace, + combineTextItems) { stateManager = (stateManager || new StateManager(new TextState())); @@ -1421,7 +1422,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { var isSameTextLine = !textState.font ? false : ((textState.font.vertical ? args[0] : args[1]) === 0); advance = args[0] - args[1]; - if (isSameTextLine && textContentItem.initialized && + if (combineTextItems && + isSameTextLine && textContentItem.initialized && advance > 0 && advance <= textContentItem.fakeMultiSpaceMax) { textState.translateTextLineMatrix(args[0], args[1]); @@ -1453,7 +1455,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { // Optimization to treat same line movement as advance. advance = textState.calcTextLineMatrixAdvance( args[0], args[1], args[2], args[3], args[4], args[5]); - if (advance !== null && textContentItem.initialized && + if (combineTextItems && + advance !== null && textContentItem.initialized && advance.value > 0 && advance.value <= textContentItem.fakeMultiSpaceMax) { textState.translateTextLineMatrix(advance.width, @@ -1594,7 +1597,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { next(self.getTextContent(xobj, task, xobj.dict.get('Resources') || resources, stateManager, - normalizeWhitespace).then(function (formTextContent) { + normalizeWhitespace, combineTextItems).then( + function (formTextContent) { Util.appendToArray(textContent.items, formTextContent.items); Util.extendObj(textContent.styles, formTextContent.styles); stateManager.restore(); diff --git a/src/core/worker.js b/src/core/worker.js index 76556518c..40e5a5a0f 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -891,12 +891,14 @@ var WorkerMessageHandler = { handler.on('GetTextContent', function wphExtractText(data) { var pageIndex = data.pageIndex; var normalizeWhitespace = data.normalizeWhitespace; + var combineTextItems = data.combineTextItems; return pdfManager.getPage(pageIndex).then(function(page) { var task = new WorkerTask('GetTextContent: page ' + pageIndex); startWorkerTask(task); var pageNum = pageIndex + 1; var start = Date.now(); - return page.extractTextContent(task, normalizeWhitespace).then( + return page.extractTextContent(task, normalizeWhitespace, + combineTextItems).then( function(textContent) { finishWorkerTask(task); info('text indexing: page=' + pageNum + ' - time=' + diff --git a/src/display/api.js b/src/display/api.js index cfdfb1a9f..06fcc18be 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -600,6 +600,8 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() { * @typedef {Object} getTextContentParameters * @param {boolean} normalizeWhitespace - replaces all occurrences of * whitespace with standard spaces (0x20). The default value is `false`. + * @param {boolean} disableCombineTextItems - do not attempt to combine + * same line {@link TextItem}'s. The default value is `false`. */ /** @@ -891,11 +893,12 @@ var PDFPageProxy = (function PDFPageProxyClosure() { * object that represent the page text content. */ getTextContent: function PDFPageProxy_getTextContent(params) { - var normalizeWhitespace = (params && params.normalizeWhitespace) || false; - return this.transport.messageHandler.sendWithPromise('GetTextContent', { pageIndex: this.pageNumber - 1, - normalizeWhitespace: normalizeWhitespace, + normalizeWhitespace: (params && params.normalizeWhitespace === true ? + true : /* Default */ false), + combineTextItems: (params && params.disableCombineTextItems === true ? + false : /* Default */ true), }); }, diff --git a/test/driver.js b/test/driver.js index f5508241d..3fc21e513 100644 --- a/test/driver.js +++ b/test/driver.js @@ -332,7 +332,7 @@ var Driver = (function DriverClosure() { this._log('Loading file "' + task.file + '"\n'); - var absoluteUrl = new URL(task.file, window.location).href; + var absoluteUrl = new URL(task.file, window.location).href; PDFJS.disableRange = task.disableRange; PDFJS.disableAutoFetch = !task.enableAutoFetch; try { @@ -469,12 +469,12 @@ var Driver = (function DriverClosure() { textLayerContext.clearRect(0, 0, textLayerCanvas.width, textLayerCanvas.height); // The text builder will draw its content on the test canvas - initPromise = - page.getTextContent({ normalizeWhitespace: true }).then( - function(textContent) { - return rasterizeTextLayer(textLayerContext, viewport, - textContent); - }); + initPromise = page.getTextContent({ + normalizeWhitespace: true, + }).then(function(textContent) { + return rasterizeTextLayer(textLayerContext, viewport, + textContent); + }); } else { textLayerCanvas = null; diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 6205d3c14..531af34de 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -771,12 +771,14 @@ describe('api', function() { }); it('gets text content', function (done) { var defaultPromise = page.getTextContent(); - var normalizeWhitespacePromise = page.getTextContent({ - normalizeWhitespace: true }); + var parametersPromise = page.getTextContent({ + normalizeWhitespace: true, + disableCombineTextItems: true, + }); var promises = [ defaultPromise, - normalizeWhitespacePromise + parametersPromise, ]; Promise.all(promises).then(function (data) { expect(!!data[0].items).toEqual(true); diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js index cef2a4193..467b8c029 100644 --- a/web/pdf_page_view.js +++ b/web/pdf_page_view.js @@ -503,12 +503,12 @@ var PDFPageView = (function PDFPageViewClosure() { function pdfPageRenderCallback() { pageViewDrawCallback(null); if (textLayer) { - self.pdfPage.getTextContent({ normalizeWhitespace: true }).then( - function textContentResolved(textContent) { - textLayer.setTextContent(textContent); - textLayer.render(TEXT_LAYER_RENDER_DELAY); - } - ); + self.pdfPage.getTextContent({ + normalizeWhitespace: true, + }).then(function textContentResolved(textContent) { + textLayer.setTextContent(textContent); + textLayer.render(TEXT_LAYER_RENDER_DELAY); + }); } }, function pdfPageRenderError(error) { diff --git a/web/pdf_viewer.js b/web/pdf_viewer.js index 0207c6f78..29197e7bd 100644 --- a/web/pdf_viewer.js +++ b/web/pdf_viewer.js @@ -784,7 +784,9 @@ var PDFViewer = (function pdfViewer() { getPageTextContent: function (pageIndex) { return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { - return page.getTextContent({ normalizeWhitespace: true }); + return page.getTextContent({ + normalizeWhitespace: true, + }); }); },