Browse Source

[api-minor] Add a parameter to `PDFPageProxy_getTextContent` that controls whether `PartialEvaluator_getTextContent` will attempt to combine same-line text items

From the discussion in issue 7445, it seems that there may be cases where an API consumer would want to get the text content as-is, without combined text items.
Jonas Jenwald 9 years ago
parent
commit
f297e4d17c
  1. 6
      src/core/document.js
  2. 12
      src/core/evaluator.js
  3. 4
      src/core/worker.js
  4. 9
      src/display/api.js
  5. 14
      test/driver.js
  6. 8
      test/unit/api_spec.js
  7. 12
      web/pdf_page_view.js
  8. 4
      web/pdf_viewer.js

6
src/core/document.js

@ -265,7 +265,8 @@ var Page = (function PageClosure() {
}, },
extractTextContent: function Page_extractTextContent(task, extractTextContent: function Page_extractTextContent(task,
normalizeWhitespace) { normalizeWhitespace,
combineTextItems) {
var handler = { var handler = {
on: function nullHandlerOn() {}, on: function nullHandlerOn() {},
send: function nullHandlerSend() {} send: function nullHandlerSend() {}
@ -298,7 +299,8 @@ var Page = (function PageClosure() {
task, task,
self.resources, self.resources,
/* stateManager = */ null, /* stateManager = */ null,
normalizeWhitespace); normalizeWhitespace,
combineTextItems);
}); });
}, },

12
src/core/evaluator.js

@ -1110,7 +1110,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
getTextContent: getTextContent:
function PartialEvaluator_getTextContent(stream, task, resources, function PartialEvaluator_getTextContent(stream, task, resources,
stateManager, stateManager,
normalizeWhitespace) { normalizeWhitespace,
combineTextItems) {
stateManager = (stateManager || new StateManager(new TextState())); stateManager = (stateManager || new StateManager(new TextState()));
@ -1421,7 +1422,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
var isSameTextLine = !textState.font ? false : var isSameTextLine = !textState.font ? false :
((textState.font.vertical ? args[0] : args[1]) === 0); ((textState.font.vertical ? args[0] : args[1]) === 0);
advance = args[0] - args[1]; advance = args[0] - args[1];
if (isSameTextLine && textContentItem.initialized && if (combineTextItems &&
isSameTextLine && textContentItem.initialized &&
advance > 0 && advance > 0 &&
advance <= textContentItem.fakeMultiSpaceMax) { advance <= textContentItem.fakeMultiSpaceMax) {
textState.translateTextLineMatrix(args[0], args[1]); textState.translateTextLineMatrix(args[0], args[1]);
@ -1453,7 +1455,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
// Optimization to treat same line movement as advance. // Optimization to treat same line movement as advance.
advance = textState.calcTextLineMatrixAdvance( advance = textState.calcTextLineMatrixAdvance(
args[0], args[1], args[2], args[3], args[4], args[5]); args[0], args[1], args[2], args[3], args[4], args[5]);
if (advance !== null && textContentItem.initialized && if (combineTextItems &&
advance !== null && textContentItem.initialized &&
advance.value > 0 && advance.value > 0 &&
advance.value <= textContentItem.fakeMultiSpaceMax) { advance.value <= textContentItem.fakeMultiSpaceMax) {
textState.translateTextLineMatrix(advance.width, textState.translateTextLineMatrix(advance.width,
@ -1594,7 +1597,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
next(self.getTextContent(xobj, task, next(self.getTextContent(xobj, task,
xobj.dict.get('Resources') || resources, stateManager, xobj.dict.get('Resources') || resources, stateManager,
normalizeWhitespace).then(function (formTextContent) { normalizeWhitespace, combineTextItems).then(
function (formTextContent) {
Util.appendToArray(textContent.items, formTextContent.items); Util.appendToArray(textContent.items, formTextContent.items);
Util.extendObj(textContent.styles, formTextContent.styles); Util.extendObj(textContent.styles, formTextContent.styles);
stateManager.restore(); stateManager.restore();

4
src/core/worker.js

@ -891,12 +891,14 @@ var WorkerMessageHandler = {
handler.on('GetTextContent', function wphExtractText(data) { handler.on('GetTextContent', function wphExtractText(data) {
var pageIndex = data.pageIndex; var pageIndex = data.pageIndex;
var normalizeWhitespace = data.normalizeWhitespace; var normalizeWhitespace = data.normalizeWhitespace;
var combineTextItems = data.combineTextItems;
return pdfManager.getPage(pageIndex).then(function(page) { return pdfManager.getPage(pageIndex).then(function(page) {
var task = new WorkerTask('GetTextContent: page ' + pageIndex); var task = new WorkerTask('GetTextContent: page ' + pageIndex);
startWorkerTask(task); startWorkerTask(task);
var pageNum = pageIndex + 1; var pageNum = pageIndex + 1;
var start = Date.now(); var start = Date.now();
return page.extractTextContent(task, normalizeWhitespace).then( return page.extractTextContent(task, normalizeWhitespace,
combineTextItems).then(
function(textContent) { function(textContent) {
finishWorkerTask(task); finishWorkerTask(task);
info('text indexing: page=' + pageNum + ' - time=' + info('text indexing: page=' + pageNum + ' - time=' +

9
src/display/api.js

@ -600,6 +600,8 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
* @typedef {Object} getTextContentParameters * @typedef {Object} getTextContentParameters
* @param {boolean} normalizeWhitespace - replaces all occurrences of * @param {boolean} normalizeWhitespace - replaces all occurrences of
* whitespace with standard spaces (0x20). The default value is `false`. * whitespace with standard spaces (0x20). The default value is `false`.
* @param {boolean} disableCombineTextItems - do not attempt to combine
* same line {@link TextItem}'s. The default value is `false`.
*/ */
/** /**
@ -891,11 +893,12 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
* object that represent the page text content. * object that represent the page text content.
*/ */
getTextContent: function PDFPageProxy_getTextContent(params) { getTextContent: function PDFPageProxy_getTextContent(params) {
var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
return this.transport.messageHandler.sendWithPromise('GetTextContent', { return this.transport.messageHandler.sendWithPromise('GetTextContent', {
pageIndex: this.pageNumber - 1, pageIndex: this.pageNumber - 1,
normalizeWhitespace: normalizeWhitespace, normalizeWhitespace: (params && params.normalizeWhitespace === true ?
true : /* Default */ false),
combineTextItems: (params && params.disableCombineTextItems === true ?
false : /* Default */ true),
}); });
}, },

14
test/driver.js

@ -332,7 +332,7 @@ var Driver = (function DriverClosure() {
this._log('Loading file "' + task.file + '"\n'); this._log('Loading file "' + task.file + '"\n');
var absoluteUrl = new URL(task.file, window.location).href; var absoluteUrl = new URL(task.file, window.location).href;
PDFJS.disableRange = task.disableRange; PDFJS.disableRange = task.disableRange;
PDFJS.disableAutoFetch = !task.enableAutoFetch; PDFJS.disableAutoFetch = !task.enableAutoFetch;
try { try {
@ -469,12 +469,12 @@ var Driver = (function DriverClosure() {
textLayerContext.clearRect(0, 0, textLayerContext.clearRect(0, 0,
textLayerCanvas.width, textLayerCanvas.height); textLayerCanvas.width, textLayerCanvas.height);
// The text builder will draw its content on the test canvas // The text builder will draw its content on the test canvas
initPromise = initPromise = page.getTextContent({
page.getTextContent({ normalizeWhitespace: true }).then( normalizeWhitespace: true,
function(textContent) { }).then(function(textContent) {
return rasterizeTextLayer(textLayerContext, viewport, return rasterizeTextLayer(textLayerContext, viewport,
textContent); textContent);
}); });
} else { } else {
textLayerCanvas = null; textLayerCanvas = null;

8
test/unit/api_spec.js

@ -771,12 +771,14 @@ describe('api', function() {
}); });
it('gets text content', function (done) { it('gets text content', function (done) {
var defaultPromise = page.getTextContent(); var defaultPromise = page.getTextContent();
var normalizeWhitespacePromise = page.getTextContent({ var parametersPromise = page.getTextContent({
normalizeWhitespace: true }); normalizeWhitespace: true,
disableCombineTextItems: true,
});
var promises = [ var promises = [
defaultPromise, defaultPromise,
normalizeWhitespacePromise parametersPromise,
]; ];
Promise.all(promises).then(function (data) { Promise.all(promises).then(function (data) {
expect(!!data[0].items).toEqual(true); expect(!!data[0].items).toEqual(true);

12
web/pdf_page_view.js

@ -503,12 +503,12 @@ var PDFPageView = (function PDFPageViewClosure() {
function pdfPageRenderCallback() { function pdfPageRenderCallback() {
pageViewDrawCallback(null); pageViewDrawCallback(null);
if (textLayer) { if (textLayer) {
self.pdfPage.getTextContent({ normalizeWhitespace: true }).then( self.pdfPage.getTextContent({
function textContentResolved(textContent) { normalizeWhitespace: true,
textLayer.setTextContent(textContent); }).then(function textContentResolved(textContent) {
textLayer.render(TEXT_LAYER_RENDER_DELAY); textLayer.setTextContent(textContent);
} textLayer.render(TEXT_LAYER_RENDER_DELAY);
); });
} }
}, },
function pdfPageRenderError(error) { function pdfPageRenderError(error) {

4
web/pdf_viewer.js

@ -784,7 +784,9 @@ var PDFViewer = (function pdfViewer() {
getPageTextContent: function (pageIndex) { getPageTextContent: function (pageIndex) {
return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { return this.pdfDocument.getPage(pageIndex + 1).then(function (page) {
return page.getTextContent({ normalizeWhitespace: true }); return page.getTextContent({
normalizeWhitespace: true,
});
}); });
}, },

Loading…
Cancel
Save