Browse Source

Merge pull request #7475 from Snuffleupagus/api-getTextContent-combineTextItems

[api-minor] Add a parameter to `PDFPageProxy_getTextContent` that controls whether `PartialEvaluator_getTextContent` will attempt to combine same-line text items
Yury Delendik 9 years ago committed by GitHub
parent
commit
a02e2686b9
  1. 6
      src/core/document.js
  2. 12
      src/core/evaluator.js
  3. 4
      src/core/worker.js
  4. 9
      src/display/api.js
  5. 6
      test/driver.js
  6. 8
      test/unit/api_spec.js
  7. 8
      web/pdf_page_view.js
  8. 4
      web/pdf_viewer.js

6
src/core/document.js

@ -265,7 +265,8 @@ var Page = (function PageClosure() {
}, },
extractTextContent: function Page_extractTextContent(task, extractTextContent: function Page_extractTextContent(task,
normalizeWhitespace) { normalizeWhitespace,
combineTextItems) {
var handler = { var handler = {
on: function nullHandlerOn() {}, on: function nullHandlerOn() {},
send: function nullHandlerSend() {} send: function nullHandlerSend() {}
@ -298,7 +299,8 @@ var Page = (function PageClosure() {
task, task,
self.resources, self.resources,
/* stateManager = */ null, /* stateManager = */ null,
normalizeWhitespace); normalizeWhitespace,
combineTextItems);
}); });
}, },

12
src/core/evaluator.js

@ -1132,7 +1132,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
getTextContent: getTextContent:
function PartialEvaluator_getTextContent(stream, task, resources, function PartialEvaluator_getTextContent(stream, task, resources,
stateManager, stateManager,
normalizeWhitespace) { normalizeWhitespace,
combineTextItems) {
stateManager = (stateManager || new StateManager(new TextState())); stateManager = (stateManager || new StateManager(new TextState()));
@ -1443,7 +1444,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
var isSameTextLine = !textState.font ? false : var isSameTextLine = !textState.font ? false :
((textState.font.vertical ? args[0] : args[1]) === 0); ((textState.font.vertical ? args[0] : args[1]) === 0);
advance = args[0] - args[1]; advance = args[0] - args[1];
if (isSameTextLine && textContentItem.initialized && if (combineTextItems &&
isSameTextLine && textContentItem.initialized &&
advance > 0 && advance > 0 &&
advance <= textContentItem.fakeMultiSpaceMax) { advance <= textContentItem.fakeMultiSpaceMax) {
textState.translateTextLineMatrix(args[0], args[1]); textState.translateTextLineMatrix(args[0], args[1]);
@ -1475,7 +1477,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
// Optimization to treat same line movement as advance. // Optimization to treat same line movement as advance.
advance = textState.calcTextLineMatrixAdvance( advance = textState.calcTextLineMatrixAdvance(
args[0], args[1], args[2], args[3], args[4], args[5]); args[0], args[1], args[2], args[3], args[4], args[5]);
if (advance !== null && textContentItem.initialized && if (combineTextItems &&
advance !== null && textContentItem.initialized &&
advance.value > 0 && advance.value > 0 &&
advance.value <= textContentItem.fakeMultiSpaceMax) { advance.value <= textContentItem.fakeMultiSpaceMax) {
textState.translateTextLineMatrix(advance.width, textState.translateTextLineMatrix(advance.width,
@ -1616,7 +1619,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
next(self.getTextContent(xobj, task, next(self.getTextContent(xobj, task,
xobj.dict.get('Resources') || resources, stateManager, xobj.dict.get('Resources') || resources, stateManager,
normalizeWhitespace).then(function (formTextContent) { normalizeWhitespace, combineTextItems).then(
function (formTextContent) {
Util.appendToArray(textContent.items, formTextContent.items); Util.appendToArray(textContent.items, formTextContent.items);
Util.extendObj(textContent.styles, formTextContent.styles); Util.extendObj(textContent.styles, formTextContent.styles);
stateManager.restore(); stateManager.restore();

4
src/core/worker.js

@ -891,12 +891,14 @@ var WorkerMessageHandler = {
handler.on('GetTextContent', function wphExtractText(data) { handler.on('GetTextContent', function wphExtractText(data) {
var pageIndex = data.pageIndex; var pageIndex = data.pageIndex;
var normalizeWhitespace = data.normalizeWhitespace; var normalizeWhitespace = data.normalizeWhitespace;
var combineTextItems = data.combineTextItems;
return pdfManager.getPage(pageIndex).then(function(page) { return pdfManager.getPage(pageIndex).then(function(page) {
var task = new WorkerTask('GetTextContent: page ' + pageIndex); var task = new WorkerTask('GetTextContent: page ' + pageIndex);
startWorkerTask(task); startWorkerTask(task);
var pageNum = pageIndex + 1; var pageNum = pageIndex + 1;
var start = Date.now(); var start = Date.now();
return page.extractTextContent(task, normalizeWhitespace).then( return page.extractTextContent(task, normalizeWhitespace,
combineTextItems).then(
function(textContent) { function(textContent) {
finishWorkerTask(task); finishWorkerTask(task);
info('text indexing: page=' + pageNum + ' - time=' + info('text indexing: page=' + pageNum + ' - time=' +

9
src/display/api.js

@ -600,6 +600,8 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
* @typedef {Object} getTextContentParameters * @typedef {Object} getTextContentParameters
* @param {boolean} normalizeWhitespace - replaces all occurrences of * @param {boolean} normalizeWhitespace - replaces all occurrences of
* whitespace with standard spaces (0x20). The default value is `false`. * whitespace with standard spaces (0x20). The default value is `false`.
* @param {boolean} disableCombineTextItems - do not attempt to combine
* same-line {@link TextItem}s. The default value is `false`.
*/ */
/** /**
@ -891,11 +893,12 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
* object that represent the page text content. * object that represent the page text content.
*/ */
getTextContent: function PDFPageProxy_getTextContent(params) { getTextContent: function PDFPageProxy_getTextContent(params) {
var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
return this.transport.messageHandler.sendWithPromise('GetTextContent', { return this.transport.messageHandler.sendWithPromise('GetTextContent', {
pageIndex: this.pageNumber - 1, pageIndex: this.pageNumber - 1,
normalizeWhitespace: normalizeWhitespace, normalizeWhitespace: (params && params.normalizeWhitespace === true ?
true : /* Default */ false),
combineTextItems: (params && params.disableCombineTextItems === true ?
false : /* Default */ true),
}); });
}, },

6
test/driver.js

@ -469,9 +469,9 @@ var Driver = (function DriverClosure() {
textLayerContext.clearRect(0, 0, textLayerContext.clearRect(0, 0,
textLayerCanvas.width, textLayerCanvas.height); textLayerCanvas.width, textLayerCanvas.height);
// The text builder will draw its content on the test canvas // The text builder will draw its content on the test canvas
initPromise = initPromise = page.getTextContent({
page.getTextContent({ normalizeWhitespace: true }).then( normalizeWhitespace: true,
function(textContent) { }).then(function(textContent) {
return rasterizeTextLayer(textLayerContext, viewport, return rasterizeTextLayer(textLayerContext, viewport,
textContent); textContent);
}); });

8
test/unit/api_spec.js

@ -771,12 +771,14 @@ describe('api', function() {
}); });
it('gets text content', function (done) { it('gets text content', function (done) {
var defaultPromise = page.getTextContent(); var defaultPromise = page.getTextContent();
var normalizeWhitespacePromise = page.getTextContent({ var parametersPromise = page.getTextContent({
normalizeWhitespace: true }); normalizeWhitespace: true,
disableCombineTextItems: true,
});
var promises = [ var promises = [
defaultPromise, defaultPromise,
normalizeWhitespacePromise parametersPromise,
]; ];
Promise.all(promises).then(function (data) { Promise.all(promises).then(function (data) {
expect(!!data[0].items).toEqual(true); expect(!!data[0].items).toEqual(true);

8
web/pdf_page_view.js

@ -503,12 +503,12 @@ var PDFPageView = (function PDFPageViewClosure() {
function pdfPageRenderCallback() { function pdfPageRenderCallback() {
pageViewDrawCallback(null); pageViewDrawCallback(null);
if (textLayer) { if (textLayer) {
self.pdfPage.getTextContent({ normalizeWhitespace: true }).then( self.pdfPage.getTextContent({
function textContentResolved(textContent) { normalizeWhitespace: true,
}).then(function textContentResolved(textContent) {
textLayer.setTextContent(textContent); textLayer.setTextContent(textContent);
textLayer.render(TEXT_LAYER_RENDER_DELAY); textLayer.render(TEXT_LAYER_RENDER_DELAY);
} });
);
} }
}, },
function pdfPageRenderError(error) { function pdfPageRenderError(error) {

4
web/pdf_viewer.js

@ -784,7 +784,9 @@ var PDFViewer = (function pdfViewer() {
getPageTextContent: function (pageIndex) { getPageTextContent: function (pageIndex) {
return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { return this.pdfDocument.getPage(pageIndex + 1).then(function (page) {
return page.getTextContent({ normalizeWhitespace: true }); return page.getTextContent({
normalizeWhitespace: true,
});
}); });
}, },

Loading…
Cancel
Save