From 97fa5459365a73931c52dfe46c2da623d1cb1dbc Mon Sep 17 00:00:00 2001 From: Jerome Wu Date: Mon, 30 Sep 2019 17:33:41 +0800 Subject: [PATCH] Update folder structure and refactor --- examples/node/recognize.js | 20 +- package-lock.json | 6 +- package.json | 2 +- scripts/webpack.config.dev.js | 2 +- src/common/TesseractJob.js | 163 --------------- src/common/TesseractWorker.js | 196 ------------------ src/common/circularize.js | 86 -------- src/common/createWorker.js | 129 ------------ src/common/env.js | 1 - src/common/options.js | 40 ---- src/common/pdf-ttf.js | 1 - src/common/types.js | 34 --- src/common/utils.js | 87 -------- src/constants/OEM.js | 12 ++ src/constants/PSM.js | 18 ++ src/constants/config.js | 5 + src/constants/defaultOptions.js | 13 ++ src/{common => }/createJob.js | 4 +- src/{common => }/createScheduler.js | 0 src/{common => }/createTesseract.js | 0 src/createWorker.js | 100 +++++++++ src/index.js | 19 +- src/node/b64toU8Array.js | 1 - src/node/index.js | 111 ---------- src/node/worker.js | 44 ---- src/utils/circularize.js | 57 +++++ src/utils/getEnvironment.js | 10 + src/utils/resolvePaths.js | 12 ++ src/worker-script/browser/index.js | 0 src/worker-script/constants/defaultParams.js | 24 +++ .../workerUtils.js => worker-script/index.js} | 110 +++++----- src/worker-script/node/exportFile.js | 7 + src/worker-script/node/index.js | 39 ++++ src/{common => worker-script/utils}/dump.js | 62 +++--- src/worker-script/utils/getFiles.js | 39 ++++ src/worker-script/utils/getLangStr.js | 5 + src/worker-script/utils/setImage.js | 41 ++++ src/{ => worker}/browser/b64toU8Array.js | 0 src/{ => worker}/browser/index.js | 0 src/{ => worker}/browser/worker.js | 8 +- src/worker/node/defaultOptions.js | 10 + src/worker/node/index.js | 22 ++ src/worker/node/onMessage.js | 3 + src/worker/node/send.js | 61 ++++++ src/worker/node/spawnWorker.js | 15 ++ src/worker/node/terminateWorker.js | 11 + 46 files changed, 632 insertions(+), 998 deletions(-) delete mode 100644 
src/common/TesseractJob.js delete mode 100644 src/common/TesseractWorker.js delete mode 100644 src/common/circularize.js delete mode 100644 src/common/createWorker.js delete mode 100644 src/common/env.js delete mode 100644 src/common/options.js delete mode 100644 src/common/pdf-ttf.js delete mode 100644 src/common/types.js delete mode 100644 src/common/utils.js create mode 100644 src/constants/OEM.js create mode 100644 src/constants/PSM.js create mode 100644 src/constants/config.js create mode 100644 src/constants/defaultOptions.js rename src/{common => }/createJob.js (85%) rename src/{common => }/createScheduler.js (100%) rename src/{common => }/createTesseract.js (100%) create mode 100644 src/createWorker.js delete mode 100644 src/node/b64toU8Array.js delete mode 100644 src/node/index.js delete mode 100644 src/node/worker.js create mode 100644 src/utils/circularize.js create mode 100644 src/utils/getEnvironment.js create mode 100644 src/utils/resolvePaths.js create mode 100644 src/worker-script/browser/index.js create mode 100644 src/worker-script/constants/defaultParams.js rename src/{common/workerUtils.js => worker-script/index.js} (71%) create mode 100644 src/worker-script/node/exportFile.js create mode 100644 src/worker-script/node/index.js rename src/{common => worker-script/utils}/dump.js (75%) create mode 100644 src/worker-script/utils/getFiles.js create mode 100644 src/worker-script/utils/getLangStr.js create mode 100644 src/worker-script/utils/setImage.js rename src/{ => worker}/browser/b64toU8Array.js (100%) rename src/{ => worker}/browser/index.js (100%) rename src/{ => worker}/browser/worker.js (88%) create mode 100644 src/worker/node/defaultOptions.js create mode 100644 src/worker/node/index.js create mode 100644 src/worker/node/onMessage.js create mode 100644 src/worker/node/send.js create mode 100644 src/worker/node/spawnWorker.js create mode 100644 src/worker/node/terminateWorker.js diff --git a/examples/node/recognize.js 
b/examples/node/recognize.js index 804820a..36bb365 100755 --- a/examples/node/recognize.js +++ b/examples/node/recognize.js @@ -1,6 +1,8 @@ #!/usr/bin/env node const path = require('path'); -const { createScheduler, createWorker, createJob, OEM } = require('../../'); +const { + createScheduler, createWorker, createJob, PSM, +} = require('../../'); const [,, imagePath] = process.argv; const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png')); @@ -11,12 +13,18 @@ console.log(`Recognizing ${image}`); const scheduler = createScheduler(); const worker = createWorker(); await worker.load(); - await worker.loadLanguage('osd'); - await worker.initialize('osd', { - tessedit_ocr_engine_mode: OEM.OSD_ONLY, + await worker.loadLanguage('eng'); + await worker.initialize('eng'); + await worker.setParameters({ + tessedit_char_whitelist: 'ABCDEFGH', }); scheduler.addWorker(worker); - const data = await scheduler.addJob(createJob('detect', { image })); - console.log(data); + const { text: t1 } = await scheduler.addJob(createJob('recognize', { image })); + console.log(t1); + await worker.setParameters({ + tessedit_char_whitelist: 'abcdefg', + }); + const { text: t2 } = await scheduler.addJob(createJob('recognize', { image })); + console.log(t2); scheduler.terminate(); })(); diff --git a/package-lock.json b/package-lock.json index a6bc2ab..284d618 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8548,9 +8548,9 @@ } }, "tesseract.js-core": { - "version": "2.0.0-beta.12", - "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-2.0.0-beta.12.tgz", - "integrity": "sha512-/CJhrDO82u1Nix4BQXYdL98+ctPZK4ZBYIiPlVu9uu9DHH65HsTEUV86Kd/3hdg67wCPmO1FzRGOaskvLv5O5A==" + "version": "2.0.0-beta.13", + "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-2.0.0-beta.13.tgz", + "integrity": "sha512-GboWV/aV5h+Whito6L6Q3WCFZ2+lgxZGgjY84wSpWbTLEkkZgHsU+dz1or+3rWSABH/nuzHDco1bZRk5+f94mw==" }, 
"tesseract.js-utils": { "version": "1.0.0-beta.8", diff --git a/package.json b/package.json index 5208ded..424cdf9 100644 --- a/package.json +++ b/package.json @@ -57,7 +57,7 @@ "is-url": "1.2.2", "opencollective-postinstall": "^2.0.2", "resolve-url": "^0.2.1", - "tesseract.js-core": "^2.0.0-beta.12", + "tesseract.js-core": "^2.0.0-beta.13", "tesseract.js-utils": "^1.0.0-beta.8" }, "repository": { diff --git a/scripts/webpack.config.dev.js b/scripts/webpack.config.dev.js index 33215a0..dad4c9e 100644 --- a/scripts/webpack.config.dev.js +++ b/scripts/webpack.config.dev.js @@ -21,7 +21,7 @@ const genConfig = ({ }), ], devServer: { - allowedHosts: ['localhost', '.gitpod.io'], + allowedHosts: ['localhost', '.gitpod.io'], }, }); diff --git a/src/common/TesseractJob.js b/src/common/TesseractJob.js deleted file mode 100644 index 90652bf..0000000 --- a/src/common/TesseractJob.js +++ /dev/null @@ -1,163 +0,0 @@ -/** - * - * The job exectued by worker, each job is basically a recognition of an image. - * - * @fileoverview Job excuted by Worker - * @author Kevin Kwok - * @author Guillermo Webster - * @author Jerome Wu - */ -const adapter = require('../node/'); - -/** A global job counter as part of job id */ -let jobCounter = 0; - -class TesseractJob { - /** - * constructor - * - * @name constructor - * @function initial a TesseractJob - * @access public - * @param {object} worker - An instance of TesseractWorker - */ - constructor(worker) { - jobCounter += 1; - this.id = `Job-${jobCounter}-${Math.random().toString(16).slice(3, 8)}`; - - this._worker = worker; - - /** - * As all the callback functions are saved in an array. - * Basically you can register more than callback function - * for then, catch, progress and finally. 
- */ - this._resolve = []; - this._reject = []; - this._progress = []; - this._finally = []; - } - - /** - * then - * - * @name then - * @function A function to chain like Promise - * @access public - * @param {function} resolve - called when the job succeeds - * @param {function} reject - called when the job fails - */ - then(resolve, reject) { - return new Promise((res, rej) => { - if (!this._resolve.push) { - res(this._result); - } else { - this._resolve.push(res); - } - this.catch(rej); - }).then(resolve, reject); - } - - /** - * catch - * - * @name catch - * @function register a function to call when there is an error - * @access public - * @param {function} reject - callback function for error - */ - catch(reject) { - if (this._reject.push) { - this._reject.push(reject); - } else { - reject(this._reject); - } - return this; - } - - /** - * progress - * - * @name progress - * @function register a function to show progress of the recognition, - * use res.progress to print the message - * @access public - * @param {function} fn - callback function for progress information - */ - progress(fn) { - this._progress.push(fn); - return this; - } - - /** - * finally - * - * @name finally - * @function registry a callback function for final - * @access public - * @param {function} fn - callback function for final - */ - finally(fn) { - this._finally.push(fn); - return this; - } - - /** - * send - * - * @name send - * @function send specific action with payload a worker - * @access public - * @param {string} action - action to trigger, should be "recognize" or "detect" - * @param {object} payload - data to be consumed - */ - send(action, payload) { - adapter.sendPacket(this._worker, { - jobId: this.id, - action, - payload, - }); - } - - /** - * handle - * - * @name handle - * @function execute packet action - * @access public - * @param {object} packet action and payload to handle - */ - handle(packet) { - const { data } = packet; - let runFinallyCbs = false; - - if 
(packet.status === 'resolve') { - if (this._resolve.length === 0) console.log(data); - this._resolve.forEach((fn) => { - const ret = fn(data); - if (ret && typeof ret.then === 'function') { - console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.'); - } - }); - this._resolve = data; - this._worker.dequeue(); - runFinallyCbs = true; - } else if (packet.status === 'reject') { - if (this._reject.length === 0) console.error(data); - this._reject.forEach(fn => fn(data)); - this._reject = data; - this._worker.dequeue(); - runFinallyCbs = true; - } else if (packet.status === 'progress') { - this._progress.forEach(fn => fn(data)); - } else { - console.warn('Message type unknown', packet.status); - } - - if (runFinallyCbs) { - this._finally.forEach(fn => fn(data)); - } - } -} - -module.exports = TesseractJob; diff --git a/src/common/TesseractWorker.js b/src/common/TesseractWorker.js deleted file mode 100644 index 5b66287..0000000 --- a/src/common/TesseractWorker.js +++ /dev/null @@ -1,196 +0,0 @@ -/** - * - * The core part of tesseract.js to execute the OCR jobs. - * - * @fileoverview Worker for OCR jobs - * @author Kevin Kwok - * @author Guillermo Webster - * @author Jerome Wu - */ -const check = require('check-types'); -const resolveURL = (typeof window !== 'undefined' && typeof window.document !== 'undefined') ? require('resolve-url') : s => s; -const adapter = require('../node'); -const circularize = require('./circularize'); -const TesseractJob = require('./TesseractJob'); - -/** - * TesseractWorker - * @name TesseractWorker - * @function execute TesseractJob with a queue mechanism - * @access public - */ -class TesseractWorker { - /** - * constructor - * - * @name constructor - * @function initialize the worker - * @access public - * @param {object} options - worker configurations - * @param {string} options.workerPath - - * A remote path to load worker script. 
- * In browser-like environment, it is downloaded from a CDN service. - * Please update this option if you self-host the worker script. - * In Node.js environment, this option is not used as the worker script is in local. - * @param {boolean} [options.workerBlobURL=true] - Use a blob: URL for the worker script - * @param {string} options.corePath - - * A remote path to load tesseract.js-core script. - * In browser-like environment, it is downloaded from a CDN service. - * Please update this option if you self-host the core script. - * In Node.js environment, this option is not used as the core script is in local. - * @param {string} options.langPath - - * A remote path to load *.traineddata.gz, it is download from a CDN service. - * Please update this option if you self-host the worker script. - * @param {string} [options.cachePath=.] - @see {@link https://github.com/jeromewu/tesseract.js-utils/blob/master/src/loadLang.js} - * @param {string} [options.cacheMethod=write] - @see {@link https://github.com/jeromewu/tesseract.js-utils/blob/master/src/loadLang.js} - * @param {string} [options.dataPath=.] 
- @see {@link https://github.com/jeromewu/tesseract.js-utils/blob/master/src/loadLang.js} - * - */ - constructor(options = {}) { - this.worker = null; - this.options = { - ...adapter.defaultOptions, - ...options, - }; - ['corePath', 'workerPath', 'langPath'].forEach((key) => { - if (check.not.undefined(options[key])) { - this.options = { ...this.options, [key]: resolveURL(options[key]) }; - } - }); - this._currentJob = null; - this._queue = []; - } - - /** - * recognize - * - * @name recognize - * @function recognize text in given image - * @access public - * @param {Buffer, string} image - image to be recognized - * @param {string, array} [langs=eng] - languages to recognize - * @param {object} params - tesseract parameters - * - */ - recognize(image, langs = 'eng', params = {}) { - return this._sendJob('recognize', image, langs, params); - } - - /** - * detect - * - * @name detect - * @function detect language of the text in the image - * @access public - * @param {Buffer, string} image - image to be recognized - * @param {object} params - tesseract parameters - * - */ - detect(image, params = {}) { - return this._sendJob('detect', image, 'osd', params); - } - - /** - * recv - * - * @name recv - * @function handle completed job - * @access public - * @param {object} packet job data - */ - recv(packet) { - if (this._currentJob.id === packet.jobId) { - this._currentJob.handle({ - ...packet, - data: packet.status === 'resolve' && packet.action === 'recognize' - ? 
circularize(packet.data) - : packet.data, - }); - } else { - console.warn(`Job ID ${packet.jobId} not known.`); - } - } - - /** - * dequeue - * - * @name dequeue - * @function dequeue and execute the rear job - * @access public - */ - dequeue() { - this._currentJob = null; - if (this._queue.length) { - this._queue[0](); - } - } - - /** - * terminate - * - * @name terminate - * @function terminate the worker - * @access public - * - */ - terminate() { - if (this.worker) { - adapter.terminateWorker(this); - } - this.worker = null; - this._currentJob = null; - this._queue = []; - } - - /** - * _sendJob - * - * @name _sendJob - * @function append a new job to the job queue - * @access private - * @param {string} type job type, should be recognize or detect - * @param {Buffer, string} image image to recognize - * @param {string} lang language to recognize - * @param {object} params tesseract parameters - */ - _sendJob(type, image, langs, params) { - return this._delay((job) => { - job.send( - type, - { - image, - langs, - params, - options: this.options, - }, - ); - }); - } - - /** - * _delay - * - * @name _delay - * @function delays the fn to execute until it is on the rear of the queue - * @access private - * @param {function} fn A handler function for the job - */ - _delay(fn) { - if (check.null(this.worker)) { - this.worker = adapter.spawnWorker(this, this.options); - } - - const job = new TesseractJob(this); - this._queue.push(() => { - this._queue.shift(); - this._currentJob = job; - fn(job); - }); - if (check.null(this._currentJob)) { - this.dequeue(); - } - return job; - } -} - -module.exports = TesseractWorker; diff --git a/src/common/circularize.js b/src/common/circularize.js deleted file mode 100644 index 9448e70..0000000 --- a/src/common/circularize.js +++ /dev/null @@ -1,86 +0,0 @@ -/** - * The result of dump.js is a big JSON tree - * which can be easily serialized (for instance - * to be sent from a webworker to the main app - * or through Node's IPC), but 
we want - * a (circular) DOM-like interface for walking - * through the data. - * - * @fileoverview DOM-like interface for walking through data - * @author Kevin Kwok - * @author Guillermo Webster - * @author Jerome Wu - */ - -module.exports = (iPage) => { - const page = { - ...iPage, - paragraphs: [], - lines: [], - words: [], - symbols: [], - }; - - page.blocks.forEach((iBlock) => { - const block = { - ...iBlock, - page, - lines: [], - words: [], - symbols: [], - }; - - block.paragraphs.forEach((iPara) => { - const para = { - ...iPara, - block, - page, - words: [], - symbols: [], - }; - - para.lines.forEach((iLine) => { - const line = { - ...iLine, - paragraph: para, - block, - page, - symbols: [], - }; - - line.words.forEach((iWord) => { - const word = { - ...iWord, - line, - paragraph: para, - block, - page, - }; - - word.symbols.forEach((iSym) => { - const sym = { - ...iSym, - word, - line, - paragraph: para, - block, - page, - }; - - sym.line.symbols.push(sym); - sym.paragraph.symbols.push(sym); - sym.block.symbols.push(sym); - sym.page.symbols.push(sym); - }); - word.paragraph.words.push(word); - word.block.words.push(word); - word.page.words.push(word); - }); - line.block.lines.push(line); - line.page.lines.push(line); - }); - para.page.paragraphs.push(para); - }); - }); - return page; -}; diff --git a/src/common/createWorker.js b/src/common/createWorker.js deleted file mode 100644 index 0c56e02..0000000 --- a/src/common/createWorker.js +++ /dev/null @@ -1,129 +0,0 @@ -const { isBrowser } = require('./env'); -const resolveURL = isBrowser ? 
require('resolve-url') : s => s; // eslint-disable-line -const circularize = require('./circularize'); -const createJob = require('./createJob'); -const { defaultParams } = require('./options'); -const { - defaultOptions, - spawnWorker, - terminateWorker, - setOnMessage, -} = require('../node'); - -let workerCounter = 0; - -const resolvePaths = (options) => { - const opts = { ...options }; - ['corePath', 'workerPath', 'langPath'].forEach((key) => { - if (typeof options[key] !== 'undefined') { - opts[key] = resolveURL(opts[key]); - } - }); - return opts; -}; - -module.exports = (options = {}) => { - workerCounter += 1; - const id = `Worker-${workerCounter}-${Math.random().toString(16).slice(3, 8)}`; - const opts = resolvePaths({ - ...defaultOptions, - ...options, - }); - const { logger } = opts; - const resolves = {}; - const rejects = {}; - let worker = spawnWorker(opts); - - const setResolve = (action, res) => { - resolves[action] = res; - }; - - const setReject = (action, rej) => { - rejects[action] = rej; - }; - - const load = () => ( - new Promise((resolve, reject) => { - const job = createJob( - 'load', - opts, - ); - setResolve('load', resolve); - setReject('load', reject); - job.start({ worker, id }); - }) - ); - - const loadLanguage = (langs = 'eng') => ( - new Promise((resolve, reject) => { - const job = createJob( - 'load-language', - { - langs, - options: opts, - }, - ); - setResolve('load-language', resolve); - setReject('load-language', reject); - job.start({ worker, id }); - }) - ); - - const initialize = (langs = 'eng', params = {}) => ( - new Promise((resolve, reject) => { - const job = createJob( - 'initialize', - { - langs, - params: { - ...defaultParams, - ...params, - }, - }, - ); - setResolve('initialize', resolve); - setReject('initialize', reject); - job.start({ worker, id }); - }) - ); - - const terminate = () => { - if (worker !== null) { - terminateWorker({ worker }); - worker = null; - } - }; - - setOnMessage(worker, (packet) => { - const 
{ status, action, data } = packet; - if (status === 'resolve') { - if (action === 'load') { - resolves.load(data); - } else if (action === 'initialize') { - resolves.initialize({ id }); - } else if (action === 'load-language') { - resolves['load-language'](data); - } else if (action === 'recognize') { - resolves.recognize(circularize(data)); - } else if (action === 'detect') { - resolves.detect(data); - } - } else if (status === 'reject') { - rejects[action](data); - throw Error(data); - } else if (status === 'progress') { - logger(data); - } - }); - - return { - id, - worker, - setResolve, - setReject, - load, - loadLanguage, - initialize, - terminate, - }; -}; diff --git a/src/common/env.js b/src/common/env.js deleted file mode 100644 index f8e49ad..0000000 --- a/src/common/env.js +++ /dev/null @@ -1 +0,0 @@ -exports.isBrowser = (typeof window !== 'undefined') && (typeof window.document !== 'undefined'); diff --git a/src/common/options.js b/src/common/options.js deleted file mode 100644 index 65e7936..0000000 --- a/src/common/options.js +++ /dev/null @@ -1,40 +0,0 @@ -const { OEM, PSM } = require('./types'); - -module.exports = { - defaultOptions: { - /* - * default path for downloading *.traineddata, this URL basically - * points to a github page, not using jsDelivr as there is is limitation - * of 20 MB. 
- */ - langPath: 'https://tessdata.projectnaptha.com/4.0.0', - /* - * Use BlobURL for worker script by default - */ - workerBlobURL: true, - logger: () => {}, - }, - /* - * default params for recognize() - */ - defaultParams: { - tessedit_ocr_engine_mode: OEM.LSTM_ONLY, - tessedit_pageseg_mode: PSM.SINGLE_BLOCK, - tessedit_char_whiltelist: '', - tessjs_create_pdf: '0', - tessjs_create_hocr: '1', - tessjs_create_tsv: '1', - tessjs_create_box: '0', - tessjs_create_unlv: '0', - tessjs_create_osd: '0', - tessjs_textonly_pdf: '0', - tessjs_pdf_name: 'tesseract.js-ocr-result', - tessjs_pdf_title: 'Tesseract.js OCR Result', - tessjs_pdf_auto_download: true, - tessjs_pdf_bin: false, - tessjs_image_rectangle_left: 0, - tessjs_image_rectangle_top: 0, - tessjs_image_rectangle_width: -1, - tessjs_image_rectangle_height: -1, - }, -}; diff --git a/src/common/pdf-ttf.js b/src/common/pdf-ttf.js deleted file mode 100644 index 8e24c21..0000000 --- a/src/common/pdf-ttf.js +++ /dev/null @@ -1 +0,0 @@ -module.exports = 'AAEAAAAKAIAAAwAgT1MvMlbeyJQAAAEoAAAAYGNtYXAACgA0AAABkAAAAB5nbHlmFSJBJAAAAbgAAAAYaGVhZAt48WUAAACsAAAANmhoZWEMAgQCAAAA5AAAACRobXR4BAAAAAAAAYgAAAAIbG9jYQAMAAAAAAGwAAAABm1heHAABAAFAAABCAAAACBuYW1l8usW2gAAAdAAAABLcG9zdAABAAEAAAIcAAAAIAABAAAAAQAAsJRxEF8PPPUEBwgAAAAAAM+a/G4AAAAA1MOn8gAAAAAEAAgAAAAAEAACAAAAAAAAAAEAAAgA//8AAAQAAAAAAAQAAAEAAAAAAAAAAAAAAAAAAAACAAEAAAACAAQAAQAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAwAAAZAABQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAUAAQABAAAAAAAAAAAAAAAAAAAAAAAAAAAAR09PRwBAAAAAAAAB//8AAAABAAGAAAAAAAAAAAAAAAAAAAABAAAAAAAABAAAAAAAAAIAAQAAAAAAFAADAAAAAAAUAAYACgAAAAAAAAAAAAAAAAAMAAAAAQAAAAAEAAgAAAMAADEhESEEAPwACAAAAAADACoAAAADAAAABQAWAAAAAQAAAAAABQALABYAAwABBAkABQAWAAAAVgBlAHIAcwBpAG8AbgAgADEALgAwVmVyc2lvbiAxLjAAAAEAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAA='; diff --git a/src/common/types.js b/src/common/types.js deleted file mode 100644 index 63d46a8..0000000 --- a/src/common/types.js +++ /dev/null @@ -1,34 +0,0 @@ -module.exports = { - /* - * OEM = OCR Engine Mode, and 
there are 5 possible modes. - * - * By default tesseract.js uses TESSERACT_LSTM_COMBINED mode, which uses LSTM when possible. - * If you need to use some tesseract v3 features (like tessedit_char_whitelist), - * you need to use TESSERACT_ONLY mode. - * - */ - OEM: { - TESSERACT_ONLY: 0, - LSTM_ONLY: 1, - TESSERACT_LSTM_COMBINED: 2, - DEFAULT: 3, - }, - /* - * PSM = Page Segmentation Mode - */ - PSM: { - OSD_ONLY: '0', - AUTO_OSD: '1', - AUTO_ONLY: '2', - AUTO: '3', - SINGLE_COLUMN: '4', - SINGLE_BLOCK_VERT_TEXT: '5', - SINGLE_BLOCK: '6', - SINGLE_LINE: '7', - SINGLE_WORD: '8', - SINGLE_CHAR: '9', - SPARSE_TEXT: '10', - SPARSE_TEXT_OSD: '11', - RAW_LINE: '12', - }, -}; diff --git a/src/common/utils.js b/src/common/utils.js deleted file mode 100644 index 51edf83..0000000 --- a/src/common/utils.js +++ /dev/null @@ -1,87 +0,0 @@ -const { readImage } = require('tesseract.js-utils'); - -/** - * setImage - * - * @name setImage - * @function set image in tesseract for recognition - * @access public - * @param {array} image - binary array in array format - * @returns {number} - an emscripten pointer of the image - */ -exports.setImage = (TessModule, api, image, params) => { - const { - tessjs_image_rectangle_left: left, - tessjs_image_rectangle_top: top, - tessjs_image_rectangle_width: width, - tessjs_image_rectangle_height: height, - } = params; - const { - w, h, bytesPerPixel, data, pix, - } = readImage(TessModule, Array.from(image)); - - /* - * As some image format (ex. bmp) is not supported natiely by tesseract, - * sometimes it will not return pix directly, but data and bytesPerPixel - * for another SetImage usage. - * - */ - if (data === null) { - api.SetImage(pix); - } else { - api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel); - } - api.SetRectangle( - (left < 0) ? 0 : left, - (top < 0) ? 0 : top, - (width < 0) ? w : width, - (height < 0) ? h : height, - ); - return data === null ? 
pix : data; -}; - -exports.getLangsStr = langs => ( - typeof langs === 'string' - ? langs - : langs.map(lang => (typeof lang === 'string' ? lang : lang.data)).join('+') -); - -/** - * handleOutput - * - * @name handleOutput - * @function handle file output - * @access private - * @param {object} customParams - an object of params - */ -exports.getFiles = (TessModule, api, adapter, params) => { - let files = {}; - const { - tessjs_create_pdf, - tessjs_textonly_pdf, - tessjs_pdf_name, - tessjs_pdf_title, - tessjs_pdf_auto_download, - tessjs_pdf_bin, - } = params; - - if (tessjs_create_pdf === '1') { - const pdfRenderer = new TessModule.TessPDFRenderer(tessjs_pdf_name, '/', tessjs_textonly_pdf === '1'); - pdfRenderer.BeginDocument(tessjs_pdf_title); - pdfRenderer.AddImage(api); - pdfRenderer.EndDocument(); - TessModule._free(pdfRenderer); - - const data = TessModule.FS.readFile(`/${tessjs_pdf_name}.pdf`); - - if (tessjs_pdf_bin) { - files = { pdf: data, ...files }; - } - - if (tessjs_pdf_auto_download) { - adapter.writeFile(`${tessjs_pdf_name}.pdf`, data, 'application/pdf'); - } - } - - return files; -}; diff --git a/src/constants/OEM.js b/src/constants/OEM.js new file mode 100644 index 0000000..b66aabe --- /dev/null +++ b/src/constants/OEM.js @@ -0,0 +1,12 @@ +/* + * OEM = OCR Engine Mode, and there are 4 possible modes. + * + * By default tesseract.js uses LSTM_ONLY mode. 
+ * + */ +module.exports = { + TESSERACT_ONLY: 0, + LSTM_ONLY: 1, + TESSERACT_LSTM_COMBINED: 2, + DEFAULT: 3, +}; diff --git a/src/constants/PSM.js b/src/constants/PSM.js new file mode 100644 index 0000000..8d25442 --- /dev/null +++ b/src/constants/PSM.js @@ -0,0 +1,18 @@ +/* + * PSM = Page Segmentation Mode + */ +module.exports = { + OSD_ONLY: '0', + AUTO_OSD: '1', + AUTO_ONLY: '2', + AUTO: '3', + SINGLE_COLUMN: '4', + SINGLE_BLOCK_VERT_TEXT: '5', + SINGLE_BLOCK: '6', + SINGLE_LINE: '7', + SINGLE_WORD: '8', + CIRCLE_WORD: '9', + SINGLE_CHAR: '10', + SPARSE_TEXT: '11', + SPARSE_TEXT_OSD: '12', +}; diff --git a/src/constants/config.js b/src/constants/config.js new file mode 100644 index 0000000..f61b062 --- /dev/null +++ b/src/constants/config.js @@ -0,0 +1,5 @@ +const OEM = require('./OEM'); + +module.exports = { + defaultOEM: OEM.DEFAULT, +}; diff --git a/src/constants/defaultOptions.js b/src/constants/defaultOptions.js new file mode 100644 index 0000000..e425420 --- /dev/null +++ b/src/constants/defaultOptions.js @@ -0,0 +1,13 @@ +module.exports = { + /* + * default path for downloading *.traineddata + */ + langPath: 'https://tessdata.projectnaptha.com/4.0.0', + /* + * Use BlobURL for worker script by default + * TODO: remove this option + * + */ + workerBlobURL: true, + logger: () => {}, +}; diff --git a/src/common/createJob.js b/src/createJob.js similarity index 85% rename from src/common/createJob.js rename to src/createJob.js index a7e3b48..fc799ae 100644 --- a/src/common/createJob.js +++ b/src/createJob.js @@ -1,4 +1,4 @@ -const { sendPacket } = require('../node'); +const { send } = require('./worker/node'); let jobCounter = 0; @@ -11,7 +11,7 @@ module.exports = ( const start = (worker) => { console.log(`[${worker.id}]: Start ${id}, action=${action}`); - sendPacket(worker, { + send(worker, { workerId: worker.id, jobId: id, action, diff --git a/src/common/createScheduler.js b/src/createScheduler.js similarity index 100% rename from 
src/common/createScheduler.js rename to src/createScheduler.js diff --git a/src/common/createTesseract.js b/src/createTesseract.js similarity index 100% rename from src/common/createTesseract.js rename to src/createTesseract.js diff --git a/src/createWorker.js b/src/createWorker.js new file mode 100644 index 0000000..23782b7 --- /dev/null +++ b/src/createWorker.js @@ -0,0 +1,100 @@ +const resolvePaths = require('./utils/resolvePaths'); +const circularize = require('./utils/circularize'); +const createJob = require('./createJob'); +const { defaultOEM } = require('./constants/config'); +const { + defaultOptions, + spawnWorker, + terminateWorker, + onMessage, +} = require('./worker/node'); + +let workerCounter = 0; + +module.exports = (_options = {}) => { + workerCounter += 1; + const id = `Worker-${workerCounter}-${Math.random().toString(16).slice(3, 8)}`; + const options = resolvePaths({ + ...defaultOptions, + ..._options, + }); + const { logger } = options; + const resolves = {}; + const rejects = {}; + let worker = spawnWorker(options); + + const setResolve = (action, res) => { + resolves[action] = res; + }; + + const setReject = (action, rej) => { + rejects[action] = rej; + }; + + const doJob = (action, payload) => ( + new Promise((resolve, reject) => { + setResolve(action, resolve); + setReject(action, reject); + createJob(action, payload).start({ worker, id }); + }) + ); + + const load = () => ( + doJob('load', { options }) + ); + + const loadLanguage = (langs = 'eng') => ( + doJob('load-language', { langs, options }) + ); + + const initialize = (langs = 'eng', oem = defaultOEM) => ( + doJob('initialize', { langs, oem }) + ); + + const setParameters = (params = {}) => ( + doJob('set-parameters', { params }) + ); + + const terminate = () => { + if (worker !== null) { + terminateWorker(worker); + worker = null; + } + }; + + onMessage(worker, (packet) => { + const { status, action, data } = packet; + if (status === 'resolve') { + if (action === 'load') { + 
resolves.load(data); + } else if (action === 'initialize') { + resolves.initialize({ id }); + } else if (action === 'set-parameters') { + resolves['set-parameters'](data); + } else if (action === 'load-language') { + resolves['load-language'](data); + } else if (action === 'recognize') { + resolves.recognize(circularize(data)); + } else if (action === 'detect') { + resolves.detect(data); + } + } else if (status === 'reject') { + rejects[action](data); + throw Error(data); + } else if (status === 'progress') { + logger(data); + } + }); + + return { + id, + worker, + setResolve, + setReject, + load, + loadLanguage, + initialize, + setParameters, + terminate, + }; +}; diff --git a/src/index.js b/src/index.js index ffda1db..e0cab5f 100644 --- a/src/index.js +++ b/src/index.js @@ -7,20 +7,15 @@ * @author Guillermo Webster * @author Jerome Wu */ -const utils = require('tesseract.js-utils'); -const TesseractWorker = require('./common/TesseractWorker'); -const createScheduler = require('./common/createScheduler'); -const createWorker = require('./common/createWorker'); -const createJob = require('./common/createJob'); -const types = require('./common/types'); +const createScheduler = require('./createScheduler'); +const createWorker = require('./createWorker'); +const createJob = require('./createJob'); +const OEM = require('./constants/OEM'); +const PSM = require('./constants/PSM'); module.exports = { - /** Worker for OCR, @see common/TesseractWorker.js */ - TesseractWorker, - /** Utilities for tesseract.js, @see {@link https://www.npmjs.com/package/tesseract.js-utils} */ - utils, - /** Check ./common/types for more details */ - ...types, + OEM, + PSM, createScheduler, createWorker, createJob, diff --git a/src/node/b64toU8Array.js b/src/node/b64toU8Array.js deleted file mode 100644 index 349587f..0000000 --- a/src/node/b64toU8Array.js +++ /dev/null @@ -1 +0,0 @@ -module.exports = s => Buffer.from(s, 'base64'); diff --git a/src/node/index.js b/src/node/index.js deleted 
file mode 100644 index 8103691..0000000 --- a/src/node/index.js +++ /dev/null @@ -1,111 +0,0 @@ -/** - * - * Tesseract Worker adapter for node - * - * @fileoverview Tesseract Worker adapter for node - * @author Kevin Kwok - * @author Guillermo Webster - * @author Jerome Wu - */ -const util = require('util'); -const fs = require('fs'); -const axios = require('axios'); -const isURL = require('is-url'); -const { fork } = require('child_process'); -const path = require('path'); -const b64toU8Array = require('./b64toU8Array'); -const { defaultOptions } = require('../common/options'); - -const readFile = util.promisify(fs.readFile); - -/** - * loadImage - * - * @name loadImage - * @function load image from different source - * @access public - * @param {string} image - image source, supported formats: - * string: URL string or file path - * string: base64 image - * buffer: image buffer - * @returns {array} binary image in array format - */ -const loadImage = (image) => { - if (isURL(image)) { - return axios.get(image, { - responseType: 'arraybuffer', - }) - .then(resp => resp.data); - } - - if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) { - return Promise.resolve(b64toU8Array(image.split(',')[1])); - } - - if (Buffer.isBuffer(image)) { - return Promise.resolve(image); - } - - return readFile(image); -}; - -/* - * Default options for node worker - */ -exports.defaultOptions = { - ...defaultOptions, - workerPath: path.join(__dirname, 'worker.js'), -}; - -/** - * spawnWorker - * - * @name spawnWorker - * @function fork a new process in node - * @access public - * @param {object} instance - TesseractWorker instance - * @param {object} options - * @param {string} options.workerPath - worker script path - */ -exports.spawnWorker = ({ workerPath }) => ( - fork(workerPath) -); - -exports.setOnMessage = (worker, handler) => { - worker.on('message', handler); -}; - -/** - * terminateWorker - * - * @name terminateWorker - * @function kill worker - * @access public - * 
@param {object} instance TesseractWorker instance - */ -exports.terminateWorker = ({ worker }) => { - worker.kill(); -}; - -/** - * sendPacket - * - * @name sendPacket - * @function send packet to worker and create a job - * @access public - * @param {object} instance TesseractWorker instance - * @param {object} iPacket data for worker - */ -exports.sendPacket = ({ worker }, packet) => { - const p = { ...packet }; - if (['recognize', 'detect'].includes(p.action)) { - loadImage(p.payload.image) - .then(buf => new Uint8Array(buf)) - .then((img) => { - p.payload.image = Array.from(img); - worker.send(p); - }); - } else { - worker.send(p); - } -}; diff --git a/src/node/worker.js b/src/node/worker.js deleted file mode 100644 index 1dafb2b..0000000 --- a/src/node/worker.js +++ /dev/null @@ -1,44 +0,0 @@ -/** - * - * Node worker implementation - * - * @fileoverview Node worker implementation - * @author Kevin Kwok - * @author Guillermo Webster - * @author Jerome Wu - */ - -const check = require('check-types'); -const workerUtils = require('../common/workerUtils'); -const b64toU8Array = require('./b64toU8Array'); - -let TesseractCore = null; - -/* - * register message handler - */ -process.on('message', (packet) => { - workerUtils.dispatchHandlers(packet, obj => process.send(obj)); -}); - -/* - * getCore is a sync function to load and return - * TesseractCore. 
- */ -workerUtils.setAdapter({ - getCore: (corePath, res) => { - if (check.null(TesseractCore)) { - res.progress({ status: 'loading tesseract core', progress: 0 }); - TesseractCore = require('tesseract.js-core'); - res.progress({ status: 'loaded tesseract core', progress: 1 }); - } - return TesseractCore; - }, - b64toU8Array, - writeFile: (path, data) => { - const fs = require('fs'); - fs.writeFile(path, data, (err) => { - if (err) throw err; - }); - }, -}); diff --git a/src/utils/circularize.js b/src/utils/circularize.js new file mode 100644 index 0000000..6e682a1 --- /dev/null +++ b/src/utils/circularize.js @@ -0,0 +1,57 @@ +/** + * In the recognition result of tesseract, there + * is a deep JSON object holding the details. + * + * The result of dump.js is a big JSON tree + * which can be easily serialized (for instance + * to be sent from a webworker to the main app + * or through Node's IPC), but we want + * a (circular) DOM-like interface for walking + * through the data. + * + * A (circular) DOM-like interface here means that + * each child element also holds references to its ancestors. + * + * @fileoverview DOM-like interface for walking through data + * @author Kevin Kwok + * @author Guillermo Webster + * @author Jerome Wu + */ + +module.exports = (page) => { + const blocks = []; + const paragraphs = []; + const lines = []; + const words = []; + const symbols = []; + + page.blocks.forEach((block) => { + block.paragraphs.forEach((paragraph) => { + paragraph.lines.forEach((line) => { + line.words.forEach((word) => { + word.symbols.forEach((sym) => { + symbols.push({ + ...sym, page, block, paragraph, line, word, + }); + }); + words.push({ + ...word, page, block, paragraph, line, + }); + }); + lines.push({ + ...line, page, block, paragraph, + }); + }); + paragraphs.push({ + ...paragraph, page, block, + }); + }); + blocks.push({ + ...block, page, + }); + }); + + return { + ...page, blocks, paragraphs, lines, words, symbols, + }; +}; diff --git a/src/utils/getEnvironment.js 
b/src/utils/getEnvironment.js new file mode 100644 index 0000000..71b1c01 --- /dev/null +++ b/src/utils/getEnvironment.js @@ -0,0 +1,10 @@ +module.exports = (key) => { + const env = { + type: (typeof window !== 'undefined') && (typeof window.document !== 'undefined') ? 'browser' : 'node', + }; + + if (typeof key === 'undefined') { + return env; + } + return env[key]; +}; diff --git a/src/utils/resolvePaths.js b/src/utils/resolvePaths.js new file mode 100644 index 0000000..5aa2b74 --- /dev/null +++ b/src/utils/resolvePaths.js @@ -0,0 +1,12 @@ +const isBrowser = require('./getEnvironment')('type') === 'browser'; +const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disable-line + +module.exports = (options) => { + const opts = { ...options }; + ['corePath', 'workerPath', 'langPath'].forEach((key) => { + if (typeof options[key] !== 'undefined') { + opts[key] = resolveURL(opts[key]); + } + }); + return opts; +}; diff --git a/src/worker-script/browser/index.js b/src/worker-script/browser/index.js new file mode 100644 index 0000000..e69de29 diff --git a/src/worker-script/constants/defaultParams.js b/src/worker-script/constants/defaultParams.js new file mode 100644 index 0000000..00bbc25 --- /dev/null +++ b/src/worker-script/constants/defaultParams.js @@ -0,0 +1,24 @@ +/* + * default params for tesseract.js + */ +const PSM = require('../../constants/PSM'); + +module.exports = { + tessedit_pageseg_mode: PSM.SINGLE_BLOCK, + tessedit_char_whitelist: '', + tessjs_create_pdf: '0', + tessjs_create_hocr: '1', + tessjs_create_tsv: '1', + tessjs_create_box: '0', + tessjs_create_unlv: '0', + tessjs_create_osd: '0', + tessjs_textonly_pdf: '0', + tessjs_pdf_name: 'tesseract.js-ocr-result', + tessjs_pdf_title: 'Tesseract.js OCR Result', + tessjs_pdf_auto_download: true, + tessjs_pdf_bin: false, + tessjs_image_rectangle_left: 0, + tessjs_image_rectangle_top: 0, + tessjs_image_rectangle_width: -1, + tessjs_image_rectangle_height: -1, +}; diff --git 
a/src/common/workerUtils.js b/src/worker-script/index.js similarity index 71% rename from src/common/workerUtils.js rename to src/worker-script/index.js index 8a12660..0ee1523 100644 --- a/src/common/workerUtils.js +++ b/src/worker-script/index.js @@ -1,18 +1,18 @@ /** * - * Worker utilities for browser and node + * Worker script for browser and node * - * @fileoverview Worker utilities for browser and node + * @fileoverview Worker script for browser and node * @author Kevin Kwok * @author Guillermo Webster * @author Jerome Wu */ const { loadLang } = require('tesseract.js-utils'); -const pdfTTF = require('./pdf-ttf'); -const dump = require('./dump'); -const { OEM, PSM } = require('./types'); -const { isBrowser } = require('./env'); -const { setImage, getLangsStr, getFiles } = require('./utils'); +const dump = require('./utils/dump'); +const isBrowser = require('../utils/getEnvironment')('type') === 'browser'; +const setImage = require('./utils/setImage'); +const getFiles = require('./utils/getFiles'); +const defaultParams = require('./constants/defaultParams'); /* * Tesseract Module returned by TesseractCore. 
@@ -24,7 +24,7 @@ let TessModule; let api; let latestJob; let adapter = {}; -let curParams = {}; +let params = defaultParams; /** @@ -38,7 +38,7 @@ let curParams = {}; * @param {object} res - job instance * @returns {Promise} A Promise for callback */ -const load = ({ workerId, jobId, payload: { corePath } }, res) => { +const load = ({ workerId, jobId, payload: { options: { corePath } } }, res) => { if (!TessModule) { const Core = adapter.getCore(corePath, res); @@ -56,8 +56,6 @@ const load = ({ workerId, jobId, payload: { corePath } }, res) => { }) .then((tessModule) => { TessModule = tessModule; - TessModule.FS.writeFile('/pdf.ttf', adapter.b64toU8Array(pdfTTF)); - api = new TessModule.TessBaseAPI(); res.progress({ workerId, status: 'initialized tesseract', progress: 1 }); res.resolve({ loaded: true }); }); @@ -83,52 +81,55 @@ const loadLanguage = ({ workerId, payload: { langs, options } }, res) => { loadLang({ langs, TessModule, ...options }).then(() => { res.progress({ workerId, status: 'loaded language traineddata', progress: 1 }); res.resolve(langs); - }).catch((e) => { - if (isBrowser && e instanceof DOMException) { + }).catch((err) => { + if (isBrowser && err instanceof DOMException) { /* * For some reason google chrome throw DOMException in loadLang, * while other browser is OK, for now we ignore this exception * and hopefully to find the root cause one day. 
*/ } else { - res.reject(e.toString()); + res.reject(err.toString()); } }); }; +const setParameters = ({ payload: { params: _params } }, res) => { + Object.keys(_params) + .filter(k => !k.startsWith('tessjs_')) + .forEach((key) => { + api.SetVariable(key, _params[key]); + }); + params = { ...params, ..._params }; + + if (typeof res !== 'undefined') { + res.resolve(params); + } +}; + const initialize = ({ workerId, jobId, - payload: { langs, params }, + payload: { langs: _langs, oem }, }, res) => { - let { tessedit_ocr_engine_mode: oem } = params; - let l = langs; + const langs = (typeof _langs === 'string') + ? _langs + : _langs.map(l => ((typeof l === 'string') ? l : l.data)).join('+'); - res.progress({ - workerId, jobId, status: 'initializing api', progress: 0, - }); - if ([ - PSM.OSD_ONLY, - PSM.AUTO_OSD, - PSM.RAW_LINE, - ].includes(params.tessedit_pageseg_mode)) { - l = (typeof l === 'string') ? `${l}+osd` : [...l, 'osd']; - // oem = OEM.TESSERACT_ONLY; + try { + res.progress({ + workerId, jobId, status: 'initializing api', progress: 0, + }); + api = new TessModule.TessBaseAPI(); + api.Init(null, langs, oem); + setParameters({ payload: { params } }); + res.progress({ + workerId, jobId, status: 'initialized api', progress: 1, + }); + res.resolve(); + } catch (err) { + res.reject(err.toString()); } - api.Init(null, getLangsStr(l), oem); - Object.keys(params).forEach((key) => { - if (!key.startsWith('tessjs')) { - api.SetVariable(key, params[key]); - } - }); - curParams = { - tessedit_ocr_engine_mode: oem, - ...params, - }; - res.progress({ - workerId, jobId, status: 'initialized api', progress: 1, - }); - res.resolve(); }; /** @@ -146,11 +147,11 @@ const initialize = ({ */ const recognize = ({ payload: { image } }, res) => { try { - const ptr = setImage(TessModule, api, image, curParams); + const ptr = setImage(TessModule, api, image, params); api.Recognize(null); res.resolve({ - files: getFiles(TessModule, api, adapter, curParams), - ...dump(TessModule, api, 
curParams), + files: getFiles(TessModule, api, adapter, params), + ...dump(TessModule, api, params), }); TessModule._free(ptr); } catch (err) { @@ -172,7 +173,7 @@ const recognize = ({ payload: { image } }, res) => { */ const detect = ({ payload: { image } }, res) => { try { - const ptr = setImage(TessModule, api, image, curParams); + const ptr = setImage(TessModule, api, image, params); const results = new TessModule.OSResults(); if (!api.DetectOS(results)) { @@ -199,6 +200,15 @@ const detect = ({ payload: { image } }, res) => { } }; +const terminate = (_, res) => { + try { + api.End(); + res.resolve({ terminated: true }); + } catch (err) { + res.reject(err.toString()); + } +}; + /** * dispatchHandlers * @@ -233,10 +243,14 @@ exports.dispatchHandlers = (packet, send) => { loadLanguage(packet, res); } else if (action === 'initialize') { initialize(packet, res); + } else if (action === 'set-parameters') { + setParameters(packet, res); } else if (action === 'recognize') { recognize(packet, res); } else if (action === 'detect') { detect(packet, res); + } else if (action === 'terminate') { + terminate(packet, res); } } catch (err) { /** Prepare exception to travel through postMessage */ @@ -250,8 +264,8 @@ exports.dispatchHandlers = (packet, send) => { * @name setAdapter * @function * @access public - * @param {object} impl - implementation of the worker, different in browser and node environment + * @param {object} adapter - implementation of the worker, different in browser and node environment */ -exports.setAdapter = (impl) => { - adapter = impl; +exports.setAdapter = (_adapter) => { + adapter = _adapter; }; diff --git a/src/worker-script/node/exportFile.js b/src/worker-script/node/exportFile.js new file mode 100644 index 0000000..45efd01 --- /dev/null +++ b/src/worker-script/node/exportFile.js @@ -0,0 +1,7 @@ +const fs = require('fs'); + +module.exports = (path, data) => { + fs.writeFile(path, data, (err) => { + if (err) throw err; + }); +}; diff --git 
a/src/worker-script/node/index.js b/src/worker-script/node/index.js new file mode 100644 index 0000000..a5bb76b --- /dev/null +++ b/src/worker-script/node/index.js @@ -0,0 +1,39 @@ +/** + * + * Tesseract Worker Script for Node + * + * @fileoverview Node worker implementation + * @author Kevin Kwok + * @author Guillermo Webster + * @author Jerome Wu + */ + +const worker = require('../'); +const exportFile = require('./exportFile'); + +let TesseractCore = null; + +/* + * register message handler + */ +process.on('message', (packet) => { + worker.dispatchHandlers(packet, obj => process.send(obj)); +}); + +/* + * getCore is a sync function to load and return + * TesseractCore. + */ +const getCore = (_, res) => { + if (TesseractCore === null) { + res.progress({ status: 'loading tesseract core', progress: 0 }); + TesseractCore = require('tesseract.js-core'); + res.progress({ status: 'loaded tesseract core', progress: 1 }); + } + return TesseractCore; +}; + +worker.setAdapter({ + getCore, + exportFile, +}); diff --git a/src/common/dump.js b/src/worker-script/utils/dump.js similarity index 75% rename from src/common/dump.js rename to src/worker-script/utils/dump.js index 9ec104b..abaf541 100644 --- a/src/common/dump.js +++ b/src/worker-script/utils/dump.js @@ -50,6 +50,13 @@ module.exports = (TessModule, api, { tessjs_create_osd, }) => { const ri = api.GetIterator(); + const { + RIL_BLOCK, + RIL_PARA, + RIL_TEXTLINE, + RIL_WORD, + RIL_SYMBOL, + } = TessModule; const blocks = []; let block; let para; @@ -59,14 +66,13 @@ module.exports = (TessModule, api, { const enumToString = (value, prefix) => ( Object.keys(TessModule) - .filter(e => (e.substr(0, prefix.length + 1) === `${prefix}_`)) - .filter(e => TessModule[e] === value) + .filter(e => (e.startsWith(`${prefix}_`) && TessModule[e] === value)) .map(e => e.slice(prefix.length + 1))[0] ); ri.Begin(); do { - if (ri.IsAtBeginningOf(TessModule.RIL_BLOCK)) { + if (ri.IsAtBeginningOf(RIL_BLOCK)) { const poly = ri.BlockPolygon(); 
let polygon = null; // BlockPolygon() returns null when automatic page segmentation is off @@ -86,47 +92,47 @@ module.exports = (TessModule, api, { block = { paragraphs: [], - text: ri.GetUTF8Text(TessModule.RIL_BLOCK), - confidence: ri.Confidence(TessModule.RIL_BLOCK), - baseline: ri.getBaseline(TessModule.RIL_BLOCK), - bbox: ri.getBoundingBox(TessModule.RIL_BLOCK), + text: ri.GetUTF8Text(RIL_BLOCK), + confidence: ri.Confidence(RIL_BLOCK), + baseline: ri.getBaseline(RIL_BLOCK), + bbox: ri.getBoundingBox(RIL_BLOCK), blocktype: enumToString(ri.BlockType(), 'PT'), polygon, }; blocks.push(block); } - if (ri.IsAtBeginningOf(TessModule.RIL_PARA)) { + if (ri.IsAtBeginningOf(RIL_PARA)) { para = { lines: [], - text: ri.GetUTF8Text(TessModule.RIL_PARA), - confidence: ri.Confidence(TessModule.RIL_PARA), - baseline: ri.getBaseline(TessModule.RIL_PARA), - bbox: ri.getBoundingBox(TessModule.RIL_PARA), + text: ri.GetUTF8Text(RIL_PARA), + confidence: ri.Confidence(RIL_PARA), + baseline: ri.getBaseline(RIL_PARA), + bbox: ri.getBoundingBox(RIL_PARA), is_ltr: !!ri.ParagraphIsLtr(), }; block.paragraphs.push(para); } - if (ri.IsAtBeginningOf(TessModule.RIL_TEXTLINE)) { + if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { textline = { words: [], - text: ri.GetUTF8Text(TessModule.RIL_TEXTLINE), - confidence: ri.Confidence(TessModule.RIL_TEXTLINE), - baseline: ri.getBaseline(TessModule.RIL_TEXTLINE), - bbox: ri.getBoundingBox(TessModule.RIL_TEXTLINE), + text: ri.GetUTF8Text(RIL_TEXTLINE), + confidence: ri.Confidence(RIL_TEXTLINE), + baseline: ri.getBaseline(RIL_TEXTLINE), + bbox: ri.getBoundingBox(RIL_TEXTLINE), }; para.lines.push(textline); } - if (ri.IsAtBeginningOf(TessModule.RIL_WORD)) { + if (ri.IsAtBeginningOf(RIL_WORD)) { const fontInfo = ri.getWordFontAttributes(); const wordDir = ri.WordDirection(); word = { symbols: [], choices: [], - text: ri.GetUTF8Text(TessModule.RIL_WORD), - confidence: ri.Confidence(TessModule.RIL_WORD), - baseline: ri.getBaseline(TessModule.RIL_WORD), - bbox: 
ri.getBoundingBox(TessModule.RIL_WORD), + text: ri.GetUTF8Text(RIL_WORD), + confidence: ri.Confidence(RIL_WORD), + baseline: ri.getBaseline(RIL_WORD), + bbox: ri.getBoundingBox(RIL_WORD), is_numeric: !!ri.WordIsNumeric(), in_dictionary: !!ri.WordIsFromDictionary(), @@ -159,14 +165,14 @@ module.exports = (TessModule, api, { // var image = pix2array(pix); // // for some reason it seems that things stop working if you destroy pics // TessModule._pixDestroy(TessModule.getPointer(pix)); - if (ri.IsAtBeginningOf(TessModule.RIL_SYMBOL)) { + if (ri.IsAtBeginningOf(RIL_SYMBOL)) { symbol = { choices: [], image: null, - text: ri.GetUTF8Text(TessModule.RIL_SYMBOL), - confidence: ri.Confidence(TessModule.RIL_SYMBOL), - baseline: ri.getBaseline(TessModule.RIL_SYMBOL), - bbox: ri.getBoundingBox(TessModule.RIL_SYMBOL), + text: ri.GetUTF8Text(RIL_SYMBOL), + confidence: ri.Confidence(RIL_SYMBOL), + baseline: ri.getBaseline(RIL_SYMBOL), + bbox: ri.getBoundingBox(RIL_SYMBOL), is_superscript: !!ri.SymbolIsSuperscript(), is_subscript: !!ri.SymbolIsSubscript(), is_dropcap: !!ri.SymbolIsDropcap(), @@ -181,7 +187,7 @@ module.exports = (TessModule, api, { } while (ci.Next()); // TessModule.destroy(i); } - } while (ri.Next(TessModule.RIL_SYMBOL)); + } while (ri.Next(RIL_SYMBOL)); TessModule.destroy(ri); return { diff --git a/src/worker-script/utils/getFiles.js b/src/worker-script/utils/getFiles.js new file mode 100644 index 0000000..c1e8cb1 --- /dev/null +++ b/src/worker-script/utils/getFiles.js @@ -0,0 +1,39 @@ +/** + * handleOutput + * + * @name handleOutput + * @function handle file output + * @access private + * @param {object} customParams - an object of params + */ +module.exports = (TessModule, api, adapter, params) => { + let files = {}; + const { + tessjs_create_pdf, + tessjs_textonly_pdf, + tessjs_pdf_name, + tessjs_pdf_title, + tessjs_pdf_auto_download, + tessjs_pdf_bin, + } = params; + + if (tessjs_create_pdf === '1') { + const pdfRenderer = new 
TessModule.TessPDFRenderer(tessjs_pdf_name, '/', tessjs_textonly_pdf === '1'); + pdfRenderer.BeginDocument(tessjs_pdf_title); + pdfRenderer.AddImage(api); + pdfRenderer.EndDocument(); + TessModule._free(pdfRenderer); + + const data = TessModule.FS.readFile(`/${tessjs_pdf_name}.pdf`); + + if (tessjs_pdf_bin) { + files = { pdf: data, ...files }; + } + + if (tessjs_pdf_auto_download) { + adapter.exportFile(`${tessjs_pdf_name}.pdf`, data, 'application/pdf'); + } + } + + return files; +}; diff --git a/src/worker-script/utils/getLangStr.js b/src/worker-script/utils/getLangStr.js new file mode 100644 index 0000000..da07f70 --- /dev/null +++ b/src/worker-script/utils/getLangStr.js @@ -0,0 +1,5 @@ +module.exports = langs => ( + typeof langs === 'string' + ? langs + : langs.map(lang => (typeof lang === 'string' ? lang : lang.data)).join('+') +); diff --git a/src/worker-script/utils/setImage.js b/src/worker-script/utils/setImage.js new file mode 100644 index 0000000..89c3117 --- /dev/null +++ b/src/worker-script/utils/setImage.js @@ -0,0 +1,41 @@ +const { readImage } = require('tesseract.js-utils'); + +/** + * setImage + * + * @name setImage + * @function set image in tesseract for recognition + * @access public + * @param {array} image - binary array in array format + * @returns {number} - an emscripten pointer of the image + */ +module.exports = (TessModule, api, image, params) => { + const { + tessjs_image_rectangle_left: left, + tessjs_image_rectangle_top: top, + tessjs_image_rectangle_width: width, + tessjs_image_rectangle_height: height, + } = params; + const { + w, h, bytesPerPixel, data, pix, + } = readImage(TessModule, Array.from(image)); + + /* + * As some image formats (ex. bmp) are not supported natively by tesseract, + * sometimes it will not return pix directly, but data and bytesPerPixel + * for another SetImage usage. 
+ * + */ + if (data === null) { + api.SetImage(pix); + } else { + api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel); + } + api.SetRectangle( + (left < 0) ? 0 : left, + (top < 0) ? 0 : top, + (width < 0) ? w : width, + (height < 0) ? h : height, + ); + return data === null ? pix : data; +}; diff --git a/src/browser/b64toU8Array.js b/src/worker/browser/b64toU8Array.js similarity index 100% rename from src/browser/b64toU8Array.js rename to src/worker/browser/b64toU8Array.js diff --git a/src/browser/index.js b/src/worker/browser/index.js similarity index 100% rename from src/browser/index.js rename to src/worker/browser/index.js diff --git a/src/browser/worker.js b/src/worker/browser/worker.js similarity index 88% rename from src/browser/worker.js rename to src/worker/browser/worker.js index 4eabe74..84a0316 100644 --- a/src/browser/worker.js +++ b/src/worker/browser/worker.js @@ -1,6 +1,6 @@ /** * - * Browser worker implementation + * Browser worker scripts * * @fileoverview Browser worker implementation * @author Kevin Kwok @@ -9,21 +9,21 @@ */ const check = require('check-types'); -const workerUtils = require('../common/workerUtils'); +const workerWrapper = require('../../workerWrapper'); const b64toU8Array = require('./b64toU8Array'); /* * register message handler */ global.addEventListener('message', ({ data }) => { - workerUtils.dispatchHandlers(data, obj => postMessage(obj)); + workerWrapper.dispatchHandlers(data, obj => postMessage(obj)); }); /* * getCore is a sync function to load and return * TesseractCore. 
*/ -workerUtils.setAdapter({ +workerWrapper.setAdapter({ getCore: (corePath, res) => { if (check.undefined(global.TesseractCore)) { res.progress({ status: 'loading tesseract core', progress: 0 }); diff --git a/src/worker/node/defaultOptions.js b/src/worker/node/defaultOptions.js new file mode 100644 index 0000000..053c1e3 --- /dev/null +++ b/src/worker/node/defaultOptions.js @@ -0,0 +1,10 @@ +const path = require('path'); +const defaultOptions = require('../../constants/defaultOptions'); + +/* + * Default options for node worker + */ +module.exports = { + ...defaultOptions, + workerPath: path.join(__dirname, '..', '..', 'worker-script', 'node', 'index.js'), +}; diff --git a/src/worker/node/index.js b/src/worker/node/index.js new file mode 100644 index 0000000..cedf1f5 --- /dev/null +++ b/src/worker/node/index.js @@ -0,0 +1,22 @@ +/** + * + * Tesseract Worker impl. for node (using child_process) + * + * @fileoverview Tesseract Worker impl. for node + * @author Kevin Kwok + * @author Guillermo Webster + * @author Jerome Wu + */ +const defaultOptions = require('./defaultOptions'); +const spawnWorker = require('./spawnWorker'); +const terminateWorker = require('./terminateWorker'); +const onMessage = require('./onMessage'); +const send = require('./send'); + +module.exports = { + defaultOptions, + spawnWorker, + terminateWorker, + onMessage, + send, +}; diff --git a/src/worker/node/onMessage.js b/src/worker/node/onMessage.js new file mode 100644 index 0000000..7100e71 --- /dev/null +++ b/src/worker/node/onMessage.js @@ -0,0 +1,3 @@ +module.exports = (worker, handler) => { + worker.on('message', handler); +}; diff --git a/src/worker/node/send.js b/src/worker/node/send.js new file mode 100644 index 0000000..b3ee04b --- /dev/null +++ b/src/worker/node/send.js @@ -0,0 +1,61 @@ +const util = require('util'); +const fs = require('fs'); +const axios = require('axios'); +const isURL = require('is-url'); + +const readFile = util.promisify(fs.readFile); + +/** + * loadImage + * 
+ * @name loadImage + * @function load image from different source + * @access public + * @param {string} image - image source, supported formats: + * string: URL string or file path + * string: base64 image + * buffer: image buffer + * @returns {array} binary image in array format + */ +const loadImage = (image) => { + if (isURL(image)) { + return axios.get(image, { + responseType: 'arraybuffer', + }) + .then(resp => resp.data); + } + + if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) { + return Promise.resolve(Buffer.from(image.split(',')[1], 'base64')); + } + + if (Buffer.isBuffer(image)) { + return Promise.resolve(image); + } + + return readFile(image); +}; + + +/** + * send + * + * @name send + * @function send packet to worker and create a job + * @access public + * @param {object} instance TesseractWorker instance + * @param {object} iPacket data for worker + */ +module.exports = ({ worker }, packet) => { + const p = { ...packet }; + if (['recognize', 'detect'].includes(p.action)) { + loadImage(p.payload.image) + .then(buf => new Uint8Array(buf)) + .then((img) => { + p.payload.image = Array.from(img); + worker.send(p); + }); + } else { + worker.send(p); + } +}; diff --git a/src/worker/node/spawnWorker.js b/src/worker/node/spawnWorker.js new file mode 100644 index 0000000..1dabcd7 --- /dev/null +++ b/src/worker/node/spawnWorker.js @@ -0,0 +1,15 @@ +const { fork } = require('child_process'); + +/** + * spawnWorker + * + * @name spawnWorker + * @function fork a new process in node + * @access public + * @param {object} instance - TesseractWorker instance + * @param {object} options + * @param {string} options.workerPath - worker script path + */ +module.exports = ({ workerPath }) => ( + fork(workerPath) +); diff --git a/src/worker/node/terminateWorker.js b/src/worker/node/terminateWorker.js new file mode 100644 index 0000000..2ee0a78 --- /dev/null +++ b/src/worker/node/terminateWorker.js @@ -0,0 +1,11 @@ +/** + * terminateWorker + * + * @name 
terminateWorker + * @function kill worker + * @access public + * @param {object} instance TesseractWorker instance + */ +module.exports = (worker) => { + worker.kill(); +};