From 30d3421d15d8052f61b83a3abebaa055d62ac268 Mon Sep 17 00:00:00 2001 From: Jerome Wu Date: Mon, 19 Nov 2018 23:48:56 +0800 Subject: [PATCH] Rewrite src to use tesseract.js-utils and fit lint --- package-lock.json | 33 +++++-- package.json | 3 +- src/common/worker.js | 202 ++++++++++++++++++++----------------------- src/node/lang.js | 47 ---------- src/node/worker.js | 1 - 5 files changed, 124 insertions(+), 162 deletions(-) delete mode 100644 src/node/lang.js diff --git a/package-lock.json b/package-lock.json index e791c84..fc66b3c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2959,6 +2959,11 @@ "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.18.tgz", "integrity": "sha512-sr1ZQph3UwHTR0XftSbK85OvBbxe/abLGzEnPENCQwmHf7sck8Oyu4ob3LgBxWWxRoM+QszeUyl7jbqapu2TqA==" }, + "idb-keyval": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-3.1.0.tgz", + "integrity": "sha512-iFwFN5n00KNNnVxlOOK280SJJfXWY7pbMUOQXdIXehvvc/mGCV/6T2Ae+Pk2KwAkkATDTwfMavOiDH5lrJKWXQ==" + }, "idb-wrapper": { "version": "1.7.1", "resolved": "https://registry.npmjs.org/idb-wrapper/-/idb-wrapper-1.7.1.tgz", @@ -4822,17 +4827,30 @@ } }, "tesseract.js-core": { - "version": "2.0.0-beta.2", - "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-2.0.0-beta.2.tgz", - "integrity": "sha512-aExaof0ZxYc2q0T7HN26wxcARN/N+0q9z9gjkyaEFsuUj82VtUZr0A6yDQ0rpQyIdk22K15fASUX5XcwFIb0rw==", - "requires": { - "node-fetch": "^2.3.0" + "version": "2.0.0-beta.4", + "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-2.0.0-beta.4.tgz", + "integrity": "sha512-s9uw+s3Rgsw4DfWzUN7LKuLUdhzOxGMRZ+NIKLFGajTfczMTOkYE7PKngz3ob0ze3JLOnimnM7A6mJ/U2xMn5w==" + }, + "tesseract.js-utils": { + "version": "1.0.0-beta.1", + "resolved": "https://registry.npmjs.org/tesseract.js-utils/-/tesseract.js-utils-1.0.0-beta.1.tgz", + "integrity": "sha512-jQrP7umpOAdaeBoAE8aFl+6HUPpGpLdMozTQINVt+4allstur5dho7u8YWNWu4WXFk9XHSUl6isWKIpmBGbGcA==", + "requires": { + "idb-keyval": "^3.1.0", + "node-fetch": "^2.3.0", + "whatwg-fetch": "^3.0.0", + "zlibjs": "^0.3.1" }, "dependencies": { "node-fetch": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.3.0.tgz", "integrity": "sha512-MOd8pV3fxENbryESLgVIeaGKrdl+uaYhCSSVkjeOb/31/njTpcis5aWfdqgNlHIrKOLRbMnfPINPOML2CIFeXA==" + }, + "whatwg-fetch": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/whatwg-fetch/-/whatwg-fetch-3.0.0.tgz", + "integrity": "sha512-9GSJUgz1D4MfyKU7KRqwOjXCXTqWdFNvEr7eUBYchQiVc744mqK/MzXPNR2WsPkmkOa4ywfg8C2n8h+13Bey1Q==" } } }, @@ -5207,6 +5225,11 @@ "requires": { "object-keys": "~0.4.0" } + }, + "zlibjs": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz", + "integrity": "sha1-UBl+2yihxCymWcyLTmqd3W1ERVQ=" } } } diff --git a/package.json b/package.json index 7f4b85b..be19a46 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,8 @@ "node-fetch": "^1.6.3", "object-assign": "^4.1.0", "png.js": "^0.2.1", - "tesseract.js-core": "^2.0.0-beta.2" + "tesseract.js-core": "^2.0.0-beta.4", + "tesseract.js-utils": "^1.0.0-beta.1" }, "repository": { "type": "git", diff --git a/src/common/worker.js b/src/common/worker.js index 779d215..2e2b495 100644 --- a/src/common/worker.js +++ b/src/common/worker.js @@ -1,138 +1,93 @@ -const readImage = require('tesseract.js-core/src/utils/readImage'); -var latestJob, - Module, - base, - adapter = {}, - dump = require('./dump.js'), - desaturate = require('./desaturate.js'); - -function dispatchHandlers(packet, send){ - function respond(status, data){ - send({ - jobId: packet.jobId, - status, - action: packet.action, - data - }); - } - respond.resolve = respond.bind(this, 'resolve'); - respond.reject = respond.bind(this, 'reject'); - respond.progress = respond.bind(this, 'progress'); - - latestJob = respond; - - try { - if(packet.action === 'recognize'){ - handleRecognize(packet.payload, respond); - } else if (packet.action === 'detect'){ - handleDetect(packet.payload, respond); - } - } catch (err) { - // Prepare exception to travel through postMessage - err = err.toString(); +const { readImage, loadLang } = require('tesseract.js-utils'); +const dump = require('./dump'); - respond.reject(err) - } -} -exports.dispatchHandlers = dispatchHandlers; +let Module; +let base; +let latestJob; +let adapter = {}; -exports.setAdapter = function setAdapter(impl){ - adapter = impl; -}; +const handleInit = (req, res) => { + let MIN_MEMORY = 100663296; + if (['chi_sim', 'chi_tra', 'jpn'].includes(req.options.lang)) { + MIN_MEMORY = 167772160; + } -function handleInit(req, res){ - var MIN_MEMORY = 100663296; + if (!Module || Module.TOTAL_MEMORY < MIN_MEMORY) { + const Core = adapter.getCore(req, res); - if(['chi_sim', 'chi_tra', 'jpn'].includes(req.options.lang)){ - MIN_MEMORY = 167772160; - } + res.progress({ status: 'initializing tesseract', progress: 0 }); - if(!Module || Module.TOTAL_MEMORY < MIN_MEMORY){ - var Core = adapter.getCore(req, res); - - res.progress({ status: 'initializing tesseract', progress: 0 }) - - return Core({ - // TOTAL_MEMORY: MIN_MEMORY, - TesseractProgress(percent){ - latestJob.progress({ status: 'recognizing text', progress: Math.max(0, (percent-30)/70) }); - }, - }) - .then((TessModule) => { - Module = TessModule; - base = new Module.TessBaseAPI(); - res.progress({ status: 'initializing tesseract', progress: 1 }); - }); - } + return Core({ + // TOTAL_MEMORY: MIN_MEMORY, + TesseractProgress(percent) { + latestJob.progress({ status: 'recognizing text', progress: Math.max(0, (percent - 30) / 70) }); + }, + }) + .then((TessModule) => { + Module = TessModule; + base = new Module.TessBaseAPI(); + res.progress({ status: 'initialized tesseract', progress: 1 }); + }); + } return new Promise(); -} +}; -function setImage(Module, base, image) { +const setImage = (image) => { const { w, h, data } = readImage(Module, Array.from(image)); base.SetImage(data); base.SetRectangle(0, 0, w, h); return data; -} - -function loadLanguage(req, res, cb){ - var lang = req.options.lang, - langFile = lang + '.traineddata'; - - if(!Module._loadedLanguages) Module._loadedLanguages = {}; - if(lang in Module._loadedLanguages) return cb(); - - adapter.getLanguageData(req, res, function(data){ - res.progress({ status: 'loading ' + langFile, progress: 0 }); - Module.FS.writeFile(langFile, data); - Module._loadedLanguages[lang] = true; - res.progress({ status: 'loading ' + langFile, progress: 1 }); - cb(); - }) -} - +}; +const loadLanguage = (req, res, cb) => { + const { options: { lang }, workerOptions: { langPath } } = req; + return loadLang({ + langs: lang, + tessModule: Module, + langURI: langPath, + cache: true, + }).then(cb); +}; -function handleRecognize(req, res){ +const handleRecognize = (req, res) => { handleInit(req, res) .then(() => { loadLanguage(req, res, () => { - var options = req.options; + const { options } = req; - function progressUpdate(progress){ - res.progress({ status: 'initializing api', progress: progress }); - } + const progressUpdate = (progress) => { + res.progress({ status: 'initializing api', progress }); + }; progressUpdate(0); - base.Init(null, req.options.lang); - progressUpdate(.3); + base.Init(null, options.lang); + progressUpdate(0.3); - for (var option in options) { - if (options.hasOwnProperty(option)) { - base.SetVariable(option, options[option]); - } - } + Object.keys(options).forEach((key) => { + base.SetVariable(key, options[key]); + }); - progressUpdate(.6); - var ptr = setImage(Module, base, req.image); + progressUpdate(0.6); + const ptr = setImage(req.image); progressUpdate(1); base.Recognize(null); - var result = dump(Module, base); + const result = dump(Module, base); base.End(); Module._free(ptr); res.resolve(result); - }) + }); }); -} +}; -function handleDetect(req, res){ +const handleDetect = (req, res) => { handleInit(req, res) .then(() => { req.options.lang = 'osd'; @@ -140,17 +95,17 @@ function handleDetect(req, res){ base.Init(null, 'osd'); base.SetPageSegMode(Module.PSM_OSD_ONLY); - var ptr = setImage(Module, base, req.image), - results = new Module.OSResults(); + const ptr = setImage(req.image); + const results = new Module.OSResults(); - if(!base.DetectOS(results)){ + if (!base.DetectOS(results)) { base.End(); Module._free(ptr); - res.reject("Failed to detect OS"); + res.reject('Failed to detect OS'); } else { - var best = results.get_best_result(), - oid = best.get_orientation_id(), - sid = best.get_script_id(); + const best = results.get_best_result(); + const oid = best.get_orientation_id(); + const sid = best.get_script_id(); base.End(); Module._free(ptr); @@ -160,9 +115,40 @@ function handleDetect(req, res){ script: results.get_unicharset().get_script_from_script_id(sid), script_confidence: best.get_sconfidence(), orientation_degrees: [0, 270, 180, 90][oid], - orientation_confidence: best.get_oconfidence() + orientation_confidence: best.get_oconfidence(), }); } }); }); -} +}; + +exports.dispatchHandlers = (packet, send) => { + const respond = (status, data) => { + send({ + jobId: packet.jobId, + status, + action: packet.action, + data, + }); + }; + respond.resolve = respond.bind(this, 'resolve'); + respond.reject = respond.bind(this, 'reject'); + respond.progress = respond.bind(this, 'progress'); + + latestJob = respond; + + try { + if (packet.action === 'recognize') { + handleRecognize(packet.payload, respond); + } else if (packet.action === 'detect') { + handleDetect(packet.payload, respond); + } + } catch (err) { + // Prepare exception to travel through postMessage + respond.reject(err.toString()); + } +}; + +exports.setAdapter = (impl) => { + adapter = impl; +}; diff --git a/src/node/lang.js b/src/node/lang.js deleted file mode 100644 index e06cc20..0000000 --- a/src/node/lang.js +++ /dev/null @@ -1,47 +0,0 @@ -const https = require("https"), - http = require("http"), - zlib = require("zlib"), - fs = require("fs"), - path = require("path"), - isURL = require("is-url"); - -var langdata = require('../common/langdata.json') - -function getLanguageData(req, res, cb){ - var lang = req.options.lang, - langfile = lang + '.traineddata.gz'; - - // langPath defaults to a URL where languages can be downloaded. If a custom path is specified - // and it is a local path, use that instead - var localPath = isURL(req.workerOptions.langPath) ? - lang + '.traineddata' : - path.join(req.workerOptions.langPath, lang + '.traineddata'); - - var fetchProtocol = req.workerOptions.langPath.startsWith('http://') ? http : https; - - fs.readFile(localPath, function (err, data) { - if(!err) return cb(new Uint8Array(data)); - - fetchProtocol.get(req.workerOptions.langPath + langfile, stream => { - var received_bytes = 0; - stream.on('data', function(chunk) { - received_bytes += chunk.length; - res.progress({ - status: 'downloading ' + langfile, - loaded: received_bytes, - progress: Math.min(1, received_bytes / langdata[lang]) - }); - - }); - - var gunzip = zlib.createGunzip(); - stream.pipe(gunzip).pipe(fs.createWriteStream(lang + '.traineddata')) - gunzip.on('end',() => { - getLanguageData(req, stream, cb) - }); - }); - }); -} - - -module.exports = getLanguageData; diff --git a/src/node/worker.js b/src/node/worker.js index 1531d34..d5aaab2 100644 --- a/src/node/worker.js +++ b/src/node/worker.js @@ -16,5 +16,4 @@ workerUtils.setAdapter({ } return TesseractCore; }, - getLanguageData: require('./lang'), });