diff --git a/examples/node/recognize.js b/examples/node/recognize.js index 36bb365..0e021f2 100755 --- a/examples/node/recognize.js +++ b/examples/node/recognize.js @@ -1,7 +1,8 @@ #!/usr/bin/env node const path = require('path'); +const fs = require('fs'); const { - createScheduler, createWorker, createJob, PSM, + Tesseract, createScheduler, createWorker, } = require('../../'); const [,, imagePath] = process.argv; @@ -15,16 +16,19 @@ console.log(`Recognizing ${image}`); await worker.load(); await worker.loadLanguage('eng'); await worker.initialize('eng'); - await worker.setParameters({ - tessedit_char_whitelist: 'ABCDEFGH', - }); scheduler.addWorker(worker); - const { text: t1 } = await scheduler.addJob(createJob('recognize', { image })); - console.log(t1); - await worker.setParameters({ - tessedit_char_whitelist: 'abcdefg', - }); - const { text: t2 } = await scheduler.addJob(createJob('recognize', { image })); - console.log(t2); - scheduler.terminate(); + console.log((await scheduler.addJob('recognize', image)).text); + const data = await worker.getPDF('ocr', 'Tesseract OCR'); + fs.writeFileSync('test.pdf', Buffer.from(data)); + await scheduler.terminate(); })(); + +//Tesseract.recognize(image, 'eng', { logger: m => console.log(m) }) +// .then(({ text }) => { +// console.log(text); +// }); + +//Tesseract.detect(image, { logger: m => console.log(m) }) +// .then((data) => { +// console.log(data); +// }); diff --git a/package-lock.json b/package-lock.json index 284d618..7615f01 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3558,9 +3558,9 @@ } }, "file-type": { - "version": "10.11.0", - "resolved": "https://registry.npmjs.org/file-type/-/file-type-10.11.0.tgz", - "integrity": "sha512-uzk64HRpUZyTGZtVuvrjP0FYxzQrBf4rojot6J65YMEbwBLB0CWm0CLojVpwpmFmxcE/lkvYICgfcGozbBq6rw==" + "version": "12.3.0", + "resolved": "https://registry.npmjs.org/file-type/-/file-type-12.3.0.tgz", + "integrity": "sha512-4E4Esq9KLwjYCY32E7qSmd0h7LefcniZHX+XcdJ4Wfx1uGJX7QCigiqw/U0yT7WOslm28yhxl87DJ0wHYv0RAA==" }, "finalhandler": { "version": "1.1.1", @@ -8565,6 +8565,11 @@ "zlibjs": "^0.3.1" }, "dependencies": { + "file-type": { + "version": "10.11.0", + "resolved": "https://registry.npmjs.org/file-type/-/file-type-10.11.0.tgz", + "integrity": "sha512-uzk64HRpUZyTGZtVuvrjP0FYxzQrBf4rojot6J65YMEbwBLB0CWm0CLojVpwpmFmxcE/lkvYICgfcGozbBq6rw==" + }, "is-url": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz", diff --git a/package.json b/package.json index 424cdf9..fb9b6d4 100644 --- a/package.json +++ b/package.json @@ -53,12 +53,16 @@ }, "dependencies": { "axios": "^0.18.0", + "bmp-js": "^0.1.0", "check-types": "^7.4.0", + "file-type": "^12.3.0", + "idb-keyval": "^3.2.0", "is-url": "1.2.2", "opencollective-postinstall": "^2.0.2", "resolve-url": "^0.2.1", "tesseract.js-core": "^2.0.0-beta.13", - "tesseract.js-utils": "^1.0.0-beta.8" + "tesseract.js-utils": "^1.0.0-beta.8", + "zlibjs": "^0.3.1" }, "repository": { "type": "git", diff --git a/src/Tesseract.js b/src/Tesseract.js new file mode 100644 index 0000000..9492aa8 --- /dev/null +++ b/src/Tesseract.js @@ -0,0 +1,33 @@ +const createScheduler = require('./createScheduler'); +const createWorker = require('./createWorker'); + +const recognize = async (image, langs, options) => { + const scheduler = createScheduler(); + const worker = createWorker(options); + await worker.load(); + await worker.loadLanguage(langs); + await worker.initialize(langs); + scheduler.addWorker(worker); + return scheduler.addJob('recognize', image) + .finally(() => { + scheduler.terminate(); + }); +}; + +const detect = async (image, options) => { + const scheduler = createScheduler(); + const worker = createWorker(options); + await worker.load(); + await worker.loadLanguage('osd'); + await worker.initialize('osd'); + scheduler.addWorker(worker); + return scheduler.addJob('detect', image) + .finally(() => { + scheduler.terminate(); + }); +}; + +module.exports = { + recognize, + detect, +}; diff --git a/src/createJob.js b/src/createJob.js index fc799ae..dec906f 100644 --- a/src/createJob.js +++ b/src/createJob.js @@ -9,10 +9,10 @@ module.exports = ( jobCounter += 1; const id = `Job-${jobCounter}-${Math.random().toString(16).slice(3, 8)}`; - const start = (worker) => { - console.log(`[${worker.id}]: Start ${id}, action=${action}`); - send(worker, { - workerId: worker.id, + const start = (w) => { + console.log(`[${w.id}]: Start ${id}, action=${action}`); + send(w.worker, { + workerId: w.id, jobId: id, action, payload, diff --git a/src/createScheduler.js b/src/createScheduler.js index 23b3435..dcc1569 100644 --- a/src/createScheduler.js +++ b/src/createScheduler.js @@ -1,13 +1,13 @@ module.exports = () => { const workers = {}; - const runningJobs = {}; + const runningWorkers = {}; let jobQueue = []; const dequeue = () => { if (jobQueue.length !== 0) { const wIds = Object.keys(workers); for (let i = 0; i < wIds.length; i += 1) { - if (typeof runningJobs[wIds[i]] === 'undefined') { + if (typeof runningWorkers[wIds[i]] === 'undefined') { jobQueue[0](workers[wIds[i]]); break; } @@ -15,19 +15,19 @@ module.exports = () => { } }; - const queue = job => ( + const queue = (action, payload) => ( new Promise((resolve, reject) => { - jobQueue.push((w) => { - const { action } = job; + jobQueue.push(async (w) => { jobQueue.shift(); - w.setResolve(action, (data) => { - delete runningJobs[w.id]; + runningWorkers[w.id] = true; + try { + resolve(await w[action].apply(this, payload)); + } catch (err) { + reject(err); + } finally { + delete runningWorkers[w.id]; dequeue(); - resolve(data); - }); - w.setReject(action, reject); - runningJobs[w.id] = job; - job.start(w); + } }); dequeue(); }) @@ -38,13 +38,13 @@ module.exports = () => { return w.id; }; - const addJob = job => ( - queue(job) + const addJob = (action, ...payload) => ( + queue(action, payload) ); - const terminate = () => { - Object.keys(workers).forEach((id) => { - workers[id].terminate(); + const terminate = async () => { + Object.keys(workers).forEach(async (id) => { + await workers[id].terminate(); }); jobQueue = []; }; diff --git a/src/createTesseract.js b/src/createTesseract.js deleted file mode 100644 index fe0c1d1..0000000 --- a/src/createTesseract.js +++ /dev/null @@ -1,7 +0,0 @@ -module.exports = (options = {}, nWorkers = 1) => { - return { - init: () => {}, - loadLanguauge: () => {}, - recognize: () => {}, - }; -}; diff --git a/src/createWorker.js b/src/createWorker.js index 23782b7..95aa188 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -44,7 +44,7 @@ module.exports = (_options = {}) => { ); const loadLanguage = (langs = 'eng') => ( - doJob('load-language', { langs, options }) + doJob('loadLanguage', { langs, options }) ); const initialize = (langs = 'eng', oem = defaultOEM) => ( @@ -52,32 +52,39 @@ module.exports = (_options = {}) => { ); const setParameters = (params = {}) => ( - doJob('set-parameters', { params }) + doJob('setParameters', { params }) ); - const terminate = () => { + const recognize = (image, opts = {}) => ( + doJob('recognize', { image, options: opts }) + ); + + const getPDF = (title = 'Tesseract OCR Result', textonly = false) => ( + doJob('getPDF', { title, textonly }) + ); + + const detect = image => ( + doJob('detect', { image }) + ); + + const terminate = async () => { if (worker !== null) { + await doJob('terminate'); terminateWorker(worker); worker = null; } + return Promise.resolve(); }; - onMessage(worker, (packet) => { - const { status, action, data } = packet; + onMessage(worker, ({ status, action, data }) => { if (status === 'resolve') { - if (action === 'load') { - resolves.load(data); - } else if (action === 'initialize') { - resolves.initialize({ id }); - } else if (action === 'set-parameters') { - resolves['set-parameters'](data); - } else if (action === 'load-language') { - resolves['load-language'](data); - } else if (action === 'recognize') { - resolves.recognize(circularize(data)); - } else if (action === 'detect') { - resolves.detect(data); + let d = data; + if (action === 'recognize') { + d = circularize(data); + } else if (action === 'getPDF') { + d = Array.from({ ...data, length: Object.keys(data).length }); } + resolves[action](d); } else if (status === 'reject') { rejects[action](data); throw Error(data); @@ -95,6 +102,9 @@ module.exports = (_options = {}) => { loadLanguage, initialize, setParameters, + recognize, + getPDF, + detect, terminate, }; }; diff --git a/src/index.js b/src/index.js index e0cab5f..c3d9f09 100644 --- a/src/index.js +++ b/src/index.js @@ -10,6 +10,7 @@ const createScheduler = require('./createScheduler'); const createWorker = require('./createWorker'); const createJob = require('./createJob'); +const Tesseract = require('./Tesseract'); const OEM = require('./constants/OEM'); const PSM = require('./constants/PSM'); @@ -19,4 +20,5 @@ module.exports = { createScheduler, createWorker, createJob, + Tesseract, }; diff --git a/src/worker-script/browser/resolveURL.js b/src/worker-script/browser/resolveURL.js new file mode 100644 index 0000000..1a0e777 --- /dev/null +++ b/src/worker-script/browser/resolveURL.js @@ -0,0 +1 @@ +module.exports = require('resolve-url'); diff --git a/src/worker-script/constants/defaultParams.js b/src/worker-script/constants/defaultParams.js index 00bbc25..3238f03 100644 --- a/src/worker-script/constants/defaultParams.js +++ b/src/worker-script/constants/defaultParams.js @@ -6,6 +6,7 @@ const PSM = require('../../constants/PSM'); module.exports = { tessedit_pageseg_mode: PSM.SINGLE_BLOCK, tessedit_char_whiltelist: '', + user_defined_dpi: '300', tessjs_create_pdf: '0', tessjs_create_hocr: '1', tessjs_create_tsv: '1', @@ -17,8 +18,4 @@ module.exports = { tessjs_pdf_title: 'Tesseract.js OCR Result', tessjs_pdf_auto_download: true, tessjs_pdf_bin: false, - tessjs_image_rectangle_left: 0, - tessjs_image_rectangle_top: 0, - tessjs_image_rectangle_width: -1, - tessjs_image_rectangle_height: -1, }; diff --git a/src/worker-script/index.js b/src/worker-script/index.js index 0ee1523..445d094 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -7,11 +7,12 @@ * @author Guillermo Webster * @author Jerome Wu */ -const { loadLang } = require('tesseract.js-utils'); +const fileType = require('file-type'); +const axios = require('axios'); +const isURL = require('is-url'); const dump = require('./utils/dump'); const isBrowser = require('../utils/getEnvironment')('type') === 'browser'; const setImage = require('./utils/setImage'); -const getFiles = require('./utils/getFiles'); const defaultParams = require('./constants/defaultParams'); /* @@ -26,18 +27,6 @@ let latestJob; let adapter = {}; let params = defaultParams; - -/** - * handleInit - * - * @name handleInit - * @function handle initialization of TessModule - * @access public - * @param {object} req - job payload - * @param {string} req.corePath - path to the tesseract-core.js - * @param {object} res - job instance - * @returns {Promise} A Promise for callback - */ const load = ({ workerId, jobId, payload: { options: { corePath } } }, res) => { if (!TessModule) { const Core = adapter.getCore(corePath, res); @@ -53,35 +42,99 @@ const load = ({ workerId, jobId, payload: { options: { corePath } } }, res) => { progress: Math.max(0, (percent - 30) / 70), }); }, - }) - .then((tessModule) => { - TessModule = tessModule; - res.progress({ workerId, status: 'initialized tesseract', progress: 1 }); - res.resolve({ loaded: true }); - }); + }).then((tessModule) => { + TessModule = tessModule; + res.progress({ workerId, status: 'initialized tesseract', progress: 1 }); + res.resolve({ loaded: true }); + }); } else { res.resolve({ loaded: true }); } }; -/** - * loadLanguage - * - * @name loadLanguage - * @function load language from remote or local cache - * @access public - * @param {object} req - job payload - * @param {string} req.langs - languages to load, ex: eng, eng+chi_tra - * @param {object} req.options - other options for loadLang function - * @param {object} res - job instance - * @returns {Promise} A Promise for callback - */ -const loadLanguage = ({ workerId, payload: { langs, options } }, res) => { +const loadLanguage = async ({ + workerId, + payload: { + langs, + options: { + langPath, + dataPath, + cachePath, + cacheMethod, + gzip = true, + }, + }, +}, + res) => { + const loadAndGunzipFile = async (_lang) => { + const lang = typeof _lang === 'string' ? _lang : _lang.code; + const readCache = ['refresh', 'none'].includes(cacheMethod) + ? () => Promise.resolve() + : adapter.readCache; + let data = null; + + try { + const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`); + if (typeof _data !== 'undefined') { + data = _data; + } else { + throw Error('Not found in cache'); + } + } catch (e) { + if (typeof _lang === 'string') { + let path = null; + + if (isURL(langPath)) { /** When langPath is an URL */ + path = langPath; + } else if (process.browser) { /** When langPath is not an URL in browser */ + path = adapter.resolveURL(langPath); + } + + if (path !== null) { + const { data: _data } = await axios.get( + `${path}/${lang}.traineddata${gzip ? '.gz' : ''}`, + { responseType: 'arraybuffer' }, + ); + data = _data; + } else { + data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`); + } + } else { + data = _lang.data; // eslint-disable-line + } + } + + data = new Uint8Array(data); + + const type = fileType(data); + if (typeof type !== 'undefined' && type.mime === 'application/gzip') { + data = adapter.gunzip(data); + } + + if (TessModule) { + if (dataPath) { + try { + TessModule.FS.mkdir(dataPath); + } catch (err) { + res.reject(err.toString()); + } + } + TessModule.FS.writeFile(`${dataPath || '.'}/${lang}.traineddata`, data); + } + + if (['write', 'refresh', undefined].includes(cacheMethod)) { + await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data); + } + + return Promise.resolve(data); + }; + res.progress({ workerId, status: 'loading language traineddata', progress: 0 }); - loadLang({ langs, TessModule, ...options }).then(() => { + try { + await Promise.all((typeof langs === 'string' ? langs.split('+') : langs).map(loadAndGunzipFile)); res.progress({ workerId, status: 'loaded language traineddata', progress: 1 }); res.resolve(langs); - }).catch((err) => { + } catch (err) { if (isBrowser && err instanceof DOMException) { /* * For some reason google chrome throw DOMException in loadLang, @@ -91,7 +144,7 @@ const loadLanguage = ({ workerId, payload: { langs, options } }, res) => { } else { res.reject(err.toString()); } - }); + } }; const setParameters = ({ payload: { params: _params } }, res) => { @@ -109,7 +162,6 @@ const setParameters = ({ payload: { params: _params } }, res) => { const initialize = ({ workerId, - jobId, payload: { langs: _langs, oem }, }, res) => { const langs = (typeof _langs === 'string') @@ -118,13 +170,13 @@ const initialize = ({ try { res.progress({ - workerId, jobId, status: 'initializing api', progress: 0, + workerId, status: 'initializing api', progress: 0, }); api = new TessModule.TessBaseAPI(); api.Init(null, langs, oem); setParameters({ payload: { params } }); res.progress({ - workerId, jobId, status: 'initialized api', progress: 1, + workerId, status: 'initialized api', progress: 1, }); res.resolve(); } catch (err) { @@ -132,48 +184,35 @@ const initialize = ({ } }; -/** - * handleRecognize - * - * @name handleRecognize - * @function handle recognition job - * @access public - * @param {object} req - job payload - * @param {array} req.image - binary image in array format - * @param {string} req.langs - languages to load, ex: eng, eng+chi_tra - * @param {object} req.options - other options for loadLang function - * @param {object} req.params - parameters for tesseract - * @param {object} res - job instance - */ -const recognize = ({ payload: { image } }, res) => { +const recognize = ({ payload: { image, options: { rectangles = [] } } }, res) => { try { - const ptr = setImage(TessModule, api, image, params); - api.Recognize(null); - res.resolve({ - files: getFiles(TessModule, api, adapter, params), - ...dump(TessModule, api, params), + const ptr = setImage(TessModule, api, image); + rectangles.forEach(({ + left, top, width, height, + }) => { + api.SetRectangle(left, top, width, height); }); + api.Recognize(null); + res.resolve(dump(TessModule, api, params)); TessModule._free(ptr); } catch (err) { res.reject(err.toString()); } }; -/** - * handleDetect - * - * @name handleDetect - * @function handle detect (Orientation and Script Detection / OSD) job - * @access public - * @param {object} req - job payload - * @param {array} req.image - binary image in array format - * @param {string} req.langs - languages to load, ex: eng, eng+chi_tra - * @param {object} req.options - other options for loadLang function - * @param {object} res - job instance - */ +const getPDF = ({ payload: { title, textonly } }, res) => { + const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly); + pdfRenderer.BeginDocument(title); + pdfRenderer.AddImage(api); + pdfRenderer.EndDocument(); + TessModule._free(pdfRenderer); + + res.resolve(TessModule.FS.readFile('/tesseract-ocr.pdf')); +}; + const detect = ({ payload: { image } }, res) => { try { - const ptr = setImage(TessModule, api, image, params); + const ptr = setImage(TessModule, api, image); const results = new TessModule.OSResults(); if (!api.DetectOS(results)) { @@ -236,22 +275,16 @@ exports.dispatchHandlers = (packet, send) => { latestJob = res; try { - const { action } = packet; - if (action === 'load') { - load(packet, res); - } else if (action === 'load-language') { - loadLanguage(packet, res); - } else if (action === 'initialize') { - initialize(packet, res); - } else if (action === 'set-parameters') { - setParameters(packet, res); - } else if (action === 'recognize') { - recognize(packet, res); - } else if (action === 'detect') { - detect(packet, res); - } else if (action === 'terminate') { - terminate(packet, res); - } + ({ + load, + loadLanguage, + initialize, + setParameters, + recognize, + getPDF, + detect, + terminate, + })[packet.action](packet, res); } catch (err) { /** Prepare exception to travel through postMessage */ res.reject(err.toString()); diff --git a/src/worker-script/node/cache.js b/src/worker-script/node/cache.js new file mode 100644 index 0000000..73b21da --- /dev/null +++ b/src/worker-script/node/cache.js @@ -0,0 +1,16 @@ +const util = require('util'); +const fs = require('fs'); + +module.exports = { + readCache: util.promisify(fs.readFile), + writeCache: util.promisify(fs.writeFile), + deleteCache: path => ( + util.promisify(fs.unlink)(path) + .catch(() => {}) + ), + checkCache: path => ( + util.promisify(fs.access)(path, fs.F_OK) + .then(err => (err === null)) + .catch(() => false) + ), +}; diff --git a/src/worker-script/node/exportFile.js b/src/worker-script/node/exportFile.js deleted file mode 100644 index 45efd01..0000000 --- a/src/worker-script/node/exportFile.js +++ /dev/null @@ -1,7 +0,0 @@ -const fs = require('fs'); - -module.exports = (path, data) => { - fs.writeFile(path, data, (err) => { - if (err) throw err; - }); -}; diff --git a/src/worker-script/node/getCore.js b/src/worker-script/node/getCore.js new file mode 100644 index 0000000..f3783b5 --- /dev/null +++ b/src/worker-script/node/getCore.js @@ -0,0 +1,13 @@ +let TesseractCore = null; +/* + * getCore is a sync function to load and return + * TesseractCore. + */ +module.exports = (_, res) => { + if (TesseractCore === null) { + res.progress({ status: 'loading tesseract core', progress: 0 }); + TesseractCore = require('tesseract.js-core'); + res.progress({ status: 'loaded tesseract core', progress: 1 }); + } + return TesseractCore; +}; diff --git a/src/worker-script/node/gunzip.js b/src/worker-script/node/gunzip.js new file mode 100644 index 0000000..4f2e0e0 --- /dev/null +++ b/src/worker-script/node/gunzip.js @@ -0,0 +1 @@ +module.exports = require('zlib').gunzipSync; diff --git a/src/worker-script/node/index.js b/src/worker-script/node/index.js index a5bb76b..8cb2c65 100644 --- a/src/worker-script/node/index.js +++ b/src/worker-script/node/index.js @@ -9,9 +9,10 @@ */ const worker = require('../'); -const exportFile = require('./exportFile'); - -let TesseractCore = null; +const getCore = require('./getCore'); +const resolveURL = require('./resolveURL'); +const gunzip = require('./gunzip'); +const cache = require('./cache'); /* * register message handler @@ -20,20 +21,9 @@ process.on('message', (packet) => { worker.dispatchHandlers(packet, obj => process.send(obj)); }); -/* - * getCore is a sync function to load and return - * TesseractCore. - */ -const getCore = (_, res) => { - if (TesseractCore === null) { - res.progress({ status: 'loading tesseract core', progress: 0 }); - TesseractCore = require('tesseract.js-core'); - res.progress({ status: 'loaded tesseract core', progress: 1 }); - } - return TesseractCore; -}; - worker.setAdapter({ getCore, - exportFile, + gunzip, + resolveURL, + ...cache, }); diff --git a/src/worker-script/node/resolveURL.js b/src/worker-script/node/resolveURL.js new file mode 100644 index 0000000..63c03e8 --- /dev/null +++ b/src/worker-script/node/resolveURL.js @@ -0,0 +1 @@ +module.exports = s => s; diff --git a/src/worker-script/utils/getFiles.js b/src/worker-script/utils/getFiles.js deleted file mode 100644 index c1e8cb1..0000000 --- a/src/worker-script/utils/getFiles.js +++ /dev/null @@ -1,39 +0,0 @@ -/** - * handleOutput - * - * @name handleOutput - * @function handle file output - * @access private - * @param {object} customParams - an object of params - */ -module.exports = (TessModule, api, adapter, params) => { - let files = {}; - const { - tessjs_create_pdf, - tessjs_textonly_pdf, - tessjs_pdf_name, - tessjs_pdf_title, - tessjs_pdf_auto_download, - tessjs_pdf_bin, - } = params; - - if (tessjs_create_pdf === '1') { - const pdfRenderer = new TessModule.TessPDFRenderer(tessjs_pdf_name, '/', tessjs_textonly_pdf === '1'); - pdfRenderer.BeginDocument(tessjs_pdf_title); - pdfRenderer.AddImage(api); - pdfRenderer.EndDocument(); - TessModule._free(pdfRenderer); - - const data = TessModule.FS.readFile(`/${tessjs_pdf_name}.pdf`); - - if (tessjs_pdf_bin) { - files = { pdf: data, ...files }; - } - - if (tessjs_pdf_auto_download) { - adapter.exportFile(`${tessjs_pdf_name}.pdf`, data, 'application/pdf'); - } - } - - return files; -}; diff --git a/src/worker-script/utils/getLangStr.js b/src/worker-script/utils/getLangStr.js deleted file mode 100644 index da07f70..0000000 --- a/src/worker-script/utils/getLangStr.js +++ /dev/null @@ -1,5 +0,0 @@ -module.exports = langs => ( - typeof langs === 'string' - ? langs - : langs.map(lang => (typeof lang === 'string' ? lang : lang.data)).join('+') -); diff --git a/src/worker-script/utils/setImage.js b/src/worker-script/utils/setImage.js index 89c3117..8357be7 100644 --- a/src/worker-script/utils/setImage.js +++ b/src/worker-script/utils/setImage.js @@ -1,4 +1,5 @@ -const { readImage } = require('tesseract.js-utils'); +const bmp = require('bmp-js'); +const fileType = require('file-type'); /** * setImage @@ -9,16 +10,43 @@ const { readImage } = require('tesseract.js-utils'); * @param {array} image - binary array in array format * @returns {number} - an emscripten pointer of the image */ -module.exports = (TessModule, api, image, params) => { - const { - tessjs_image_rectangle_left: left, - tessjs_image_rectangle_top: top, - tessjs_image_rectangle_width: width, - tessjs_image_rectangle_height: height, - } = params; - const { - w, h, bytesPerPixel, data, pix, - } = readImage(TessModule, Array.from(image)); +module.exports = (TessModule, api, image) => { + const buf = Buffer.from(Array.from(image)); + const type = fileType(buf); + let bytesPerPixel = 0; + let data = null; + let pix = null; + let w = 0; + let h = 0; + + /* + * Although leptonica should support reading bmp, there is a bug of "compressed BMP files". + * As there is no solution, we need to use bmp-js for now. + * @see https://groups.google.com/forum/#!topic/tesseract-ocr/4mPD9zTxdxE + */ + if (type && type.mime === 'image/bmp') { + const bmpBuf = bmp.decode(buf); + data = TessModule._malloc(bmpBuf.data.length * Uint8Array.BYTES_PER_ELEMENT); + TessModule.HEAPU8.set(bmpBuf.data, data); + w = bmpBuf.width; + h = bmpBuf.height; + bytesPerPixel = 4; + } else { + const ptr = TessModule._malloc(buf.length * Uint8Array.BYTES_PER_ELEMENT); + TessModule.HEAPU8.set(buf, ptr); + pix = TessModule._pixReadMem(ptr, buf.length); + if (TessModule.getValue(pix + (7 * 4), 'i32') === 0) { + /* + * Set a yres default value to prevent warning from tesseract + * See kMinCredibleResolution in tesseract/src/ccstruct/publictypes.h + */ + TessModule.setValue(pix + (7 * 4), 300, 'i32'); + } + [w, h] = Array(2).fill(0) + .map((v, idx) => ( + TessModule.getValue(pix + (idx * 4), 'i32') + )); + } /* * As some image format (ex. bmp) is not supported natiely by tesseract, @@ -31,11 +59,5 @@ module.exports = (TessModule, api, image, params) => { } else { api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel); } - api.SetRectangle( - (left < 0) ? 0 : left, - (top < 0) ? 0 : top, - (width < 0) ? w : width, - (height < 0) ? h : height, - ); return data === null ? pix : data; }; diff --git a/src/worker/node/send.js b/src/worker/node/send.js index b3ee04b..bcc9e07 100644 --- a/src/worker/node/send.js +++ b/src/worker/node/send.js @@ -46,7 +46,7 @@ const loadImage = (image) => { * @param {object} instance TesseractWorker instance * @param {object} iPacket data for worker */ -module.exports = ({ worker }, packet) => { +module.exports = (worker, packet) => { const p = { ...packet }; if (['recognize', 'detect'].includes(p.action)) { loadImage(p.payload.image)