From c9fb5637c3bbb07a250ea4ecb19d79881b275ca2 Mon Sep 17 00:00:00 2001 From: Julian Viereck Date: Sun, 8 Apr 2012 16:18:43 -0700 Subject: [PATCH] Extract one page after the other and not all pages at once --- src/core.js | 29 +++++++++++++++++++++++++---- src/worker.js | 38 ++++++++++---------------------------- web/viewer.js | 31 ++++++++++++++----------------- 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/core.js b/src/core.js index 4bd2cb234..38f264bf5 100644 --- a/src/core.js +++ b/src/core.js @@ -698,6 +698,9 @@ var PDFDoc = (function PDFDocClosure() { this.fontsLoading = {}; this.workerReadyPromise = new Promise('workerReady'); + this.pageText = []; + this.startedTextExtraction = false; + // If worker support isn't disabled explicit and the browser has worker // support, create a new web worker and test if it/the browser fullfills // all requirements to run parts of pdf.js in a web worker. @@ -769,7 +772,6 @@ var PDFDoc = (function PDFDocClosure() { WorkerMessageHandler.setup(messageHandler); }, - setupMessageHandler: function PDFDoc_setupMessageHandler(messageHandler) { this.messageHandler = messageHandler; @@ -825,9 +827,18 @@ var PDFDoc = (function PDFDocClosure() { }, this); messageHandler.on('text_extracted', function pdfTextExtracted(data) { - var index = data[0]; + var pageNum = data[0]; + var content = data[1]; + if (pageNum !== this.pageText.length + 1) + error('pdfTextExtracted: pageIdx and pageText length got to fit'); + + this.pageText.push(content); + if (this.textExtracted) - this.textExtracted(index); + this.textExtracted(pageNum, content); + + if (pageNum < this.numPages) + this.extractTextPage(pageNum + 1); }, this); messageHandler.on('jpeg_decode', function(data, promise) { @@ -895,9 +906,19 @@ var PDFDoc = (function PDFDocClosure() { return (this.pageCache[n] = page); }, + extractTextPage: function PDFDoc_extractTextPage(pageNum) { + this.messageHandler.send('extract_text', pageNum); + }, + extractText: function PDFDoc_extractText() { + if (this.startedTextExtraction) + return; + + this.startedTextExtraction = true; + this.workerReadyPromise.then(function pdfDocStartRenderingThen() { - this.messageHandler.send('extract_text'); + // Start the text extraction process. + this.extractTextPage(1); }.bind(this)); }, diff --git a/src/worker.js b/src/worker.js index b75fc66e8..b7679bdbe 100644 --- a/src/worker.js +++ b/src/worker.js @@ -94,7 +94,6 @@ var WorkerMessageHandler = { handler.on('page_request', function wphSetupPageRequest(pageNum) { pageNum = parseInt(pageNum); - // The following code does quite the same as // Page.prototype.startRendering, but stops at one point and sends the // result back to the main thread. @@ -156,37 +155,20 @@ var WorkerMessageHandler = { }); }, this); - handler.on('extract_text', function wphExtractText() { - var numPages = pdfModel.numPages; - var index = []; + handler.on('extract_text', function wphExtractText(pageNum) { var start = Date.now(); - function indexPage(pageNum) { - if (pageNum > numPages) { - console.log('text indexing: time=%dms', Date.now() - start); - - handler.send('text_extracted', [index]); - return; - } - - var textContent = ''; - // try { - var page = pdfModel.getPage(pageNum); - textContent = page.extractTextContent(); - // } catch (e) { - // // Skip errored pages - // } - - index.push(textContent); - - // processing one page, interrupting thread to process - // other requests - setTimeout(function extractTextNextPage() { - indexPage(pageNum + 1); - }, 0); + var textContent = ''; + try { + var page = pdfModel.getPage(pageNum); + textContent = page.extractTextContent(); + } catch (e) { + // Skip errored pages } - indexPage(1); + console.log('text indexing: page=%d - time=%dms', + pageNum, Date.now() - start); + handler.send('text_extracted', [pageNum, textContent]); }); } }; diff --git a/web/viewer.js b/web/viewer.js index 91639d9ee..c827b5bce 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -491,7 +491,7 @@ var PDFView = { var pdf; try { - pdf = new PDFJS.PDFDoc(data); + this.pdfDoc = pdf = new PDFJS.PDFDoc(data); } catch (e) { this.error('An error occurred while reading the PDF.', e); } @@ -576,22 +576,18 @@ var PDFView = { if (pdfTitle) document.title = pdfTitle + ' - ' + document.title; - - // loosing pdf reference here, starting text indexing in 500ms - setTimeout((function loadStartTextExtraction() { - this.startTextExtraction(pdf); - }).bind(this), 500); - delete PDFView.extractedText; }, startTextExtraction: function pdfViewStartTextExtraction(pdf) { var searchResults = document.getElementById('searchResults'); searchResults.textContent = ''; - pdf.textExtracted = function pdfTextExtracted(index) { - PDFView.extractedText = index; - }; + pdf.textExtracted = (function pdfTextExtracted(pageIdx, content) { + this.search(); + }).bind(this); pdf.extractText(); + + this.pdfDoc = pdf; }, search: function pdfViewStartSearch() { @@ -604,21 +600,19 @@ var PDFView = { } var searchResults = document.getElementById('searchResults'); - if (!('extractedText' in PDFView)) { - // not indexed yet, repeat in 1 second - searchResults.textContent = 'Searching...'; - setTimeout(this.search.bind(this), 1000); - return; - } var searchTermsInput = document.getElementById('searchTermsInput'); searchResults.removeAttribute('hidden'); searchResults.textContent = ''; var terms = searchTermsInput.value; + + if (!terms) + return; + // simple search: removing spaces and hyphens, then scanning every terms = terms.replace(/\s-/g, '').toLowerCase(); - var index = PDFView.extractedText; + var index = PDFView.pdfDoc.pageText; var pageFound = false; for (var i = 0, ii = index.length; i < ii; i++) { var pageText = index[i].replace(/\s-/g, '').toLowerCase(); @@ -708,6 +702,9 @@ var PDFView = { var searchTermsInput = document.getElementById('searchTermsInput'); searchTermsInput.focus(); + + // Start text extraction as soon as the search gets displayed. + this.pdfDoc.extractText(); } else { searchScrollView.setAttribute('hidden', 'true'); searchSwitchButton.removeAttribute('data-selected');