From de4b98ae23202929471ae8483939c009e5f421b0 Mon Sep 17 00:00:00 2001 From: Jerome Wu Date: Fri, 24 May 2019 12:51:14 +0800 Subject: [PATCH] Upgrade to latest tesseract.js-utils --- package-lock.json | 6 +++--- package.json | 4 ++-- scripts/webpack.config.common.js | 1 + scripts/webpack.config.dev.js | 20 +++++++++++------- scripts/webpack.config.prod.js | 22 ++++++++++++------- src/common/TesseractWorker.js | 10 ++++----- src/common/workerUtils.js | 36 +++++++++++++++++++------------- tests/detect.test.js | 4 +--- tests/recognize.test.js | 16 +++++++------- 9 files changed, 68 insertions(+), 51 deletions(-) diff --git a/package-lock.json b/package-lock.json index b344a7b..e39e07c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8537,9 +8537,9 @@ "integrity": "sha512-QmNgMA9m5ES5uMTqpOAPysrUA80vUx/6WKQlfkK3zhOeAgqv8DjwwcDv9tQv2TgRzOQ+LFKrJn94Y2rw5b2IGw==" }, "tesseract.js-utils": { - "version": "1.0.0-beta.6", - "resolved": "https://registry.npmjs.org/tesseract.js-utils/-/tesseract.js-utils-1.0.0-beta.6.tgz", - "integrity": "sha512-AENYhkqafwysayWmKtyApV0gR4abLJ426plLNHs/++8oHt+ekooyp77ew/q4+QLE7cbUDyxiNGawcraOWE/RuQ==", + "version": "1.0.0-beta.7", + "resolved": "https://registry.npmjs.org/tesseract.js-utils/-/tesseract.js-utils-1.0.0-beta.7.tgz", + "integrity": "sha512-MvBQNxVoueDg/8iN8jb26Tnj0lBRfmHVMcnaNdab+JeQKg/SuNXWMCpTD4D17+S6zeE9AAlVQymDmXH2/3vaMg==", "requires": { "axios": "^0.18.0", "bmp-js": "^0.1.0", diff --git a/package.json b/package.json index 5474e33..1df7515 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,7 @@ "main": "src/index.js", "scripts": { "start": "node scripts/server.js", - "build": "webpack --config scripts/webpack.config.prod.js", + "build": "webpack --progress --config scripts/webpack.config.prod.js", "prepublishOnly": "npm run build", "wait": "wait-on http://localhost:3000/package.json", "test": "npm-run-all -p -r start test:all", @@ -54,7 +54,7 @@ "node-fetch": "^2.3.0", "resolve-url": "^0.2.1", "tesseract.js-core": "^2.0.0-beta.10", - "tesseract.js-utils": "^1.0.0-beta.6" + "tesseract.js-utils": "^1.0.0-beta.7" }, "repository": { "type": "git", diff --git a/scripts/webpack.config.common.js b/scripts/webpack.config.common.js index a6dd7d5..fabc7f6 100644 --- a/scripts/webpack.config.common.js +++ b/scripts/webpack.config.common.js @@ -4,6 +4,7 @@ module.exports = { { test: /\.m?js$/, // exclude: /(node_modules|bower_components)/, + exclude: /(tesseract.js-core)/, use: { loader: 'babel-loader', options: { diff --git a/scripts/webpack.config.dev.js b/scripts/webpack.config.dev.js index 9efa913..92893b6 100644 --- a/scripts/webpack.config.dev.js +++ b/scripts/webpack.config.dev.js @@ -3,9 +3,10 @@ const webpack = require('webpack'); const common = require('./webpack.config.common'); const genConfig = ({ - entry, filename, library, libraryTarget, + entry, filename, library, libraryTarget, ...config }) => ({ ...common, + ...config, mode: 'development', entry, output: { @@ -29,10 +30,15 @@ module.exports = [ library: 'Tesseract', libraryTarget: 'umd', }), - /* - *genConfig({ - * entry: path.resolve(__dirname, '..', 'src', 'browser', 'worker.js'), - * filename: 'worker.dev.js', - *}), - */ + genConfig({ + entry: path.resolve(__dirname, '..', 'src', 'index.js'), + filename: 'tesseract.asm.dev.js', + library: 'Tesseract', + libraryTarget: 'umd', + resolve: { + alias: { + 'tesseract.js-core/tesseract-core.wasm.js': 'tesseract.js-core/tesseract-core.asm.js', + }, + }, + }), ]; diff --git a/scripts/webpack.config.prod.js b/scripts/webpack.config.prod.js index b088f3f..9f127c7 100644 --- a/scripts/webpack.config.prod.js +++ b/scripts/webpack.config.prod.js @@ -2,11 +2,12 @@ const path = require('path'); const common = require('./webpack.config.common'); const genConfig = ({ - entry, filename, library, libraryTarget, + entry, filename, library, libraryTarget, ...config }) => ({ ...common, + ...config, mode: 'production', - devtool: 'source-map', + // devtool: 'source-map', entry, output: { path: path.resolve(__dirname, '..', 'dist'), @@ -23,10 +24,15 @@ module.exports = [ library: 'Tesseract', libraryTarget: 'umd', }), - /* - *genConfig({ - * entry: path.resolve(__dirname, '..', 'src', 'browser', 'worker.js'), - * filename: 'worker.min.js', - *}), - */ + genConfig({ + entry: path.resolve(__dirname, '..', 'src', 'index.js'), + filename: 'tesseract.asm.min.js', + library: 'Tesseract', + libraryTarget: 'umd', + resolve: { + alias: { + 'tesseract.js-core/tesseract-core.wasm.js': 'tesseract.js-core/tesseract-core.asm.js', + }, + }, + }), ]; diff --git a/src/common/TesseractWorker.js b/src/common/TesseractWorker.js index 9c92afc..e911c62 100644 --- a/src/common/TesseractWorker.js +++ b/src/common/TesseractWorker.js @@ -67,12 +67,12 @@ class TesseractWorker { * @function recognize text in given image * @access public * @param {Buffer, string} image - image to be recognized - * @param {string} [lang=eng] - language to recognize + * @param {string, array} [langs='eng'] - language to recognize * @param {object} params - tesseract parameters * */ - recognize(image, lang = 'eng', params = {}) { - return this._sendJob('recognize', image, lang, params); + recognize(image, langs = 'eng', params = {}) { + return this._sendJob('recognize', image, langs, params); } /** @@ -152,13 +152,13 @@ class TesseractWorker { * @param {string} lang language to recognize * @param {object} params tesseract parameters */ - _sendJob(type, image, lang, params) { + _sendJob(type, image, langs, params) { return this._delay((job) => { job.send( type, { image, - lang, + langs, params, options: this.options, }, diff --git a/src/common/workerUtils.js b/src/common/workerUtils.js index 93f9bce..80e051a 100644 --- a/src/common/workerUtils.js +++ b/src/common/workerUtils.js @@ -52,16 +52,24 @@ const setImage = (image) => { return data === null ? pix : data; }; +const getLangsStr = (langs) => { + if (typeof langs === 'string') { + return langs; + } + + return langs.map(lang => (typeof lang === 'string' ? lang : lang.code)).join('+'); +}; + /** * handleParams * * @name handleParams * @function hanlde params from users * @access private - * @param {string} lang - lang string for Init() + * @param {string} langs - lang string for Init() * @param {object} customParams - an object of params */ -const handleParams = (lang, customParams) => { +const handleParams = (langs, customParams) => { const { tessedit_ocr_engine_mode, ...params @@ -69,7 +77,7 @@ const handleParams = (lang, customParams) => { ...defaultParams, ...customParams, }; - api.Init(null, lang, tessedit_ocr_engine_mode); + api.Init(null, getLangsStr(langs), tessedit_ocr_engine_mode); Object.keys(params).forEach((key) => { api.SetVariable(key, params[key]); }); @@ -158,14 +166,14 @@ const handleInit = ({ corePath }, res) => { * @function load language from remote or local cache * @access public * @param {object} req - job payload - * @param {string} req.lang - languages to load, ex: eng, eng+chi_tra + * @param {string} req.langs - languages to load, ex: eng, eng+chi_tra * @param {object} req.options - other options for loadLang function * @param {object} res - job instance * @returns {Promise} A Promise for callback */ -const loadLanguage = ({ lang, options }, res) => { +const loadLanguage = ({ langs, options }, res) => { res.progress({ status: 'loading language traineddata', progress: 0 }); - return loadLang({ lang, TessModule, ...options }).then((...args) => { + return loadLang({ langs, TessModule, ...options }).then((...args) => { res.progress({ status: 'loaded language traineddata', progress: 1 }); return args; }); @@ -179,17 +187,17 @@ const loadLanguage = ({ lang, options }, res) => { * @access public * @param {object} req - job payload * @param {array} req.image - binary image in array format - * @param {string} req.lang - languages to load, ex: eng, eng+chi_tra + * @param {string} req.langs - languages to load, ex: eng, eng+chi_tra * @param {object} req.options - other options for loadLang function * @param {object} req.params - parameters for tesseract * @param {object} res - job instance */ const handleRecognize = ({ - image, lang, options, params, + image, langs, options, params, }, res) => ( handleInit(options, res) .then(() => ( - loadLanguage({ lang, options }, res) + loadLanguage({ langs, options }, res) .catch((e) => { if (e instanceof DOMException) { /* @@ -206,7 +214,7 @@ const handleRecognize = ({ res.progress({ status: 'initializing api', progress }); }; progressUpdate(0); - handleParams(lang, params); + handleParams(langs, params); progressUpdate(0.5); const ptr = setImage(image); progressUpdate(1); @@ -228,18 +236,18 @@ const handleRecognize = ({ * @access public * @param {object} req - job payload * @param {array} req.image - binary image in array format - * @param {string} req.lang - languages to load, ex: eng, eng+chi_tra + * @param {string} req.langs - languages to load, ex: eng, eng+chi_tra * @param {object} req.options - other options for loadLang function * @param {object} res - job instance */ const handleDetect = ({ - image, lang, options, + image, langs, options, }, res) => ( handleInit(options, res) .then(() => ( - loadLanguage({ lang, options }, res) + loadLanguage({ langs, options }, res) .then(() => { - api.Init(null, lang); + api.Init(null, getLangsStr(langs)); api.SetPageSegMode(TessModule.PSM_OSD_ONLY); const ptr = setImage(image); diff --git a/tests/detect.test.js b/tests/detect.test.js index d739505..c003106 100644 --- a/tests/detect.test.js +++ b/tests/detect.test.js @@ -1,5 +1,4 @@ -const { TesseractWorker, utils: { loadLang } } = Tesseract; -const isBrowser = typeof window !== 'undefined' && typeof window.document !== 'undefined'; +const { TesseractWorker } = Tesseract; const IMAGE_PATH = 'http://localhost:3000/tests/assets/images'; const loadLangOptions = { langPath: 'http://localhost:3000/tests/assets/traineddata', @@ -9,7 +8,6 @@ const loadLangOptions = { const getWorker = options => ( new TesseractWorker({ cacheMethod: 'readOnly', - ...(isBrowser ? { workerPath: 'http://localhost:3000/dist/worker.dev.js' } : {}), ...loadLangOptions, ...options, }) diff --git a/tests/recognize.test.js b/tests/recognize.test.js index 012532b..575572c 100644 --- a/tests/recognize.test.js +++ b/tests/recognize.test.js @@ -1,6 +1,5 @@ -const { TesseractWorker, utils: { loadLang } } = Tesseract; +const { TesseractWorker } = Tesseract; -const isBrowser = typeof window !== 'undefined' && typeof window.document !== 'undefined'; const IMAGE_PATH = 'http://localhost:3000/tests/assets/images'; const SIMPLE_TEXT = 'Tesseract.js\n'; const COMSIC_TEXT = 'HellO World\nfrom beyond\nthe Cosmic Void\n'; @@ -16,7 +15,6 @@ const loadLangOptions = { const getWorker = options => ( new TesseractWorker({ cacheMethod: 'readOnly', - ...(isBrowser ? { workerPath: 'http://localhost:3000/dist/worker.dev.js' } : {}), ...loadLangOptions, ...options, }) @@ -133,7 +131,7 @@ describe('recognize()', () => { }).timeout(10000) )); }); - + (isBrowser ? describe : describe.skip)('should read image from video DOM element (browser only)', () => { FORMATS.forEach(format => ( it(`support ${format} format`, (done) => { @@ -160,22 +158,23 @@ describe('recognize()', () => { let canvasDOM = null; let imageDOM = null; let idx = 0; - beforeEach(function cb(done) { + beforeEach((done) => { canvasDOM = document.createElement('canvas'); imageDOM = document.createElement('img'); imageDOM.setAttribute('crossOrigin', 'Anonymous'); imageDOM.onload = () => { canvasDOM.getContext('2d').drawImage(imageDOM, 0, 0); done(); - } - imageDOM.setAttribute('src', `${IMAGE_PATH}/simple.${formats[idx++]}`); + }; + imageDOM.setAttribute('src', `${IMAGE_PATH}/simple.${formats[idx]}`); + idx += 1; }); afterEach(() => { canvasDOM.remove(); imageDOM.remove(); }); - + formats.forEach(format => ( it(`support ${format} format`, (done) => { const worker = getWorker(); @@ -189,5 +188,4 @@ describe('recognize()', () => { }).timeout(10000) )); }); - });