Browse Source

Keep refactoring

develop
Jerome Wu 5 years ago
parent
commit
3ec7092105
  1. 28
      examples/node/recognize.js
  2. 11
      package-lock.json
  3. 6
      package.json
  4. 33
      src/Tesseract.js
  5. 8
      src/createJob.js
  6. 34
      src/createScheduler.js
  7. 7
      src/createTesseract.js
  8. 44
      src/createWorker.js
  9. 2
      src/index.js
  10. 1
      src/worker-script/browser/resolveURL.js
  11. 5
      src/worker-script/constants/defaultParams.js
  12. 207
      src/worker-script/index.js
  13. 16
      src/worker-script/node/cache.js
  14. 7
      src/worker-script/node/exportFile.js
  15. 13
      src/worker-script/node/getCore.js
  16. 1
      src/worker-script/node/gunzip.js
  17. 24
      src/worker-script/node/index.js
  18. 1
      src/worker-script/node/resolveURL.js
  19. 39
      src/worker-script/utils/getFiles.js
  20. 5
      src/worker-script/utils/getLangStr.js
  21. 56
      src/worker-script/utils/setImage.js
  22. 2
      src/worker/node/send.js

28
examples/node/recognize.js

@ -1,7 +1,8 @@ @@ -1,7 +1,8 @@
#!/usr/bin/env node
const path = require('path');
const fs = require('fs');
const {
createScheduler, createWorker, createJob, PSM,
Tesseract, createScheduler, createWorker,
} = require('../../');
const [,, imagePath] = process.argv;
@ -15,16 +16,19 @@ console.log(`Recognizing ${image}`); @@ -15,16 +16,19 @@ console.log(`Recognizing ${image}`);
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({
tessedit_char_whitelist: 'ABCDEFGH',
});
scheduler.addWorker(worker);
const { text: t1 } = await scheduler.addJob(createJob('recognize', { image }));
console.log(t1);
await worker.setParameters({
tessedit_char_whitelist: 'abcdefg',
});
const { text: t2 } = await scheduler.addJob(createJob('recognize', { image }));
console.log(t2);
scheduler.terminate();
console.log((await scheduler.addJob('recognize', image)).text);
const data = await worker.getPDF('ocr', 'Tesseract OCR');
fs.writeFileSync('test.pdf', Buffer.from(data));
await scheduler.terminate();
})();
//Tesseract.recognize(image, 'eng', { logger: m => console.log(m) })
// .then(({ text }) => {
// console.log(text);
// });
//Tesseract.detect(image, { logger: m => console.log(m) })
// .then((data) => {
// console.log(data);
// });

11
package-lock.json generated

@ -3558,9 +3558,9 @@ @@ -3558,9 +3558,9 @@
}
},
"file-type": {
"version": "10.11.0",
"resolved": "https://registry.npmjs.org/file-type/-/file-type-10.11.0.tgz",
"integrity": "sha512-uzk64HRpUZyTGZtVuvrjP0FYxzQrBf4rojot6J65YMEbwBLB0CWm0CLojVpwpmFmxcE/lkvYICgfcGozbBq6rw=="
"version": "12.3.0",
"resolved": "https://registry.npmjs.org/file-type/-/file-type-12.3.0.tgz",
"integrity": "sha512-4E4Esq9KLwjYCY32E7qSmd0h7LefcniZHX+XcdJ4Wfx1uGJX7QCigiqw/U0yT7WOslm28yhxl87DJ0wHYv0RAA=="
},
"finalhandler": {
"version": "1.1.1",
@ -8565,6 +8565,11 @@ @@ -8565,6 +8565,11 @@
"zlibjs": "^0.3.1"
},
"dependencies": {
"file-type": {
"version": "10.11.0",
"resolved": "https://registry.npmjs.org/file-type/-/file-type-10.11.0.tgz",
"integrity": "sha512-uzk64HRpUZyTGZtVuvrjP0FYxzQrBf4rojot6J65YMEbwBLB0CWm0CLojVpwpmFmxcE/lkvYICgfcGozbBq6rw=="
},
"is-url": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",

6
package.json

@ -53,12 +53,16 @@ @@ -53,12 +53,16 @@
},
"dependencies": {
"axios": "^0.18.0",
"bmp-js": "^0.1.0",
"check-types": "^7.4.0",
"file-type": "^12.3.0",
"idb-keyval": "^3.2.0",
"is-url": "1.2.2",
"opencollective-postinstall": "^2.0.2",
"resolve-url": "^0.2.1",
"tesseract.js-core": "^2.0.0-beta.13",
"tesseract.js-utils": "^1.0.0-beta.8"
"tesseract.js-utils": "^1.0.0-beta.8",
"zlibjs": "^0.3.1"
},
"repository": {
"type": "git",

33
src/Tesseract.js

@ -0,0 +1,33 @@ @@ -0,0 +1,33 @@
const createScheduler = require('./createScheduler');
const createWorker = require('./createWorker');
/**
 * Run a single `recognize` job on a throw-away scheduler and worker.
 *
 * A worker is created with `options`, loaded, and prepared for `langs`
 * (loadLanguage + initialize) before being added to a fresh scheduler.
 * The scheduler is always terminated once the job has settled.
 *
 * @param {*} image - image handed unchanged to the `recognize` job
 * @param {*} langs - language(s) forwarded to worker.loadLanguage and worker.initialize
 * @param {object} [options] - forwarded to createWorker
 * @returns {Promise<*>} fulfils with the result of the `recognize` job;
 *   rejects if the job fails or if terminating the scheduler fails
 */
const recognize = async (image, langs, options) => {
  const scheduler = createScheduler();
  const worker = createWorker(options);
  await worker.load();
  await worker.loadLanguage(langs);
  await worker.initialize(langs);
  scheduler.addWorker(worker);
  try {
    return await scheduler.addJob('recognize', image);
  } finally {
    /*
     * terminate() is async. Awaiting it keeps the returned promise pending
     * until shutdown has finished and routes a termination failure to the
     * caller instead of leaving it as an unhandled rejection.
     */
    await scheduler.terminate();
  }
};
/**
 * Run a single `detect` job on a throw-away scheduler and worker.
 *
 * A worker is created with `options`, loaded, and prepared with the `osd`
 * language (loadLanguage + initialize) before being added to a fresh
 * scheduler. The scheduler is always terminated once the job has settled.
 *
 * @param {*} image - image handed unchanged to the `detect` job
 * @param {object} [options] - forwarded to createWorker
 * @returns {Promise<*>} fulfils with the result of the `detect` job;
 *   rejects if the job fails or if terminating the scheduler fails
 */
const detect = async (image, options) => {
  const scheduler = createScheduler();
  const worker = createWorker(options);
  await worker.load();
  await worker.loadLanguage('osd');
  await worker.initialize('osd');
  scheduler.addWorker(worker);
  try {
    return await scheduler.addJob('detect', image);
  } finally {
    /*
     * terminate() is async. Awaiting it keeps the returned promise pending
     * until shutdown has finished and routes a termination failure to the
     * caller instead of leaving it as an unhandled rejection.
     */
    await scheduler.terminate();
  }
};
module.exports = {
recognize,
detect,
};

8
src/createJob.js

@ -9,10 +9,10 @@ module.exports = ( @@ -9,10 +9,10 @@ module.exports = (
jobCounter += 1;
const id = `Job-${jobCounter}-${Math.random().toString(16).slice(3, 8)}`;
const start = (worker) => {
console.log(`[${worker.id}]: Start ${id}, action=${action}`);
send(worker, {
workerId: worker.id,
const start = (w) => {
console.log(`[${w.id}]: Start ${id}, action=${action}`);
send(w.worker, {
workerId: w.id,
jobId: id,
action,
payload,

34
src/createScheduler.js

@ -1,13 +1,13 @@ @@ -1,13 +1,13 @@
module.exports = () => {
const workers = {};
const runningJobs = {};
const runningWorkers = {};
let jobQueue = [];
const dequeue = () => {
if (jobQueue.length !== 0) {
const wIds = Object.keys(workers);
for (let i = 0; i < wIds.length; i += 1) {
if (typeof runningJobs[wIds[i]] === 'undefined') {
if (typeof runningWorkers[wIds[i]] === 'undefined') {
jobQueue[0](workers[wIds[i]]);
break;
}
@ -15,19 +15,19 @@ module.exports = () => { @@ -15,19 +15,19 @@ module.exports = () => {
}
};
const queue = job => (
const queue = (action, payload) => (
new Promise((resolve, reject) => {
jobQueue.push((w) => {
const { action } = job;
jobQueue.push(async (w) => {
jobQueue.shift();
w.setResolve(action, (data) => {
delete runningJobs[w.id];
runningWorkers[w.id] = true;
try {
resolve(await w[action].apply(this, payload));
} catch (err) {
reject(err);
} finally {
delete runningWorkers[w.id];
dequeue();
resolve(data);
});
w.setReject(action, reject);
runningJobs[w.id] = job;
job.start(w);
}
});
dequeue();
})
@ -38,13 +38,13 @@ module.exports = () => { @@ -38,13 +38,13 @@ module.exports = () => {
return w.id;
};
const addJob = job => (
queue(job)
const addJob = (action, ...payload) => (
queue(action, payload)
);
const terminate = () => {
Object.keys(workers).forEach((id) => {
workers[id].terminate();
const terminate = async () => {
Object.keys(workers).forEach(async (id) => {
await workers[id].terminate();
});
jobQueue = [];
};

7
src/createTesseract.js

@ -1,7 +0,0 @@ @@ -1,7 +0,0 @@
module.exports = (options = {}, nWorkers = 1) => {
return {
init: () => {},
loadLanguauge: () => {},
recognize: () => {},
};
};

44
src/createWorker.js

@ -44,7 +44,7 @@ module.exports = (_options = {}) => { @@ -44,7 +44,7 @@ module.exports = (_options = {}) => {
);
const loadLanguage = (langs = 'eng') => (
doJob('load-language', { langs, options })
doJob('loadLanguage', { langs, options })
);
const initialize = (langs = 'eng', oem = defaultOEM) => (
@ -52,32 +52,39 @@ module.exports = (_options = {}) => { @@ -52,32 +52,39 @@ module.exports = (_options = {}) => {
);
const setParameters = (params = {}) => (
doJob('set-parameters', { params })
doJob('setParameters', { params })
);
const terminate = () => {
const recognize = (image, opts = {}) => (
doJob('recognize', { image, options: opts })
);
const getPDF = (title = 'Tesseract OCR Result', textonly = false) => (
doJob('getPDF', { title, textonly })
);
const detect = image => (
doJob('detect', { image })
);
const terminate = async () => {
if (worker !== null) {
await doJob('terminate');
terminateWorker(worker);
worker = null;
}
return Promise.resolve();
};
onMessage(worker, (packet) => {
const { status, action, data } = packet;
onMessage(worker, ({ status, action, data }) => {
if (status === 'resolve') {
if (action === 'load') {
resolves.load(data);
} else if (action === 'initialize') {
resolves.initialize({ id });
} else if (action === 'set-parameters') {
resolves['set-parameters'](data);
} else if (action === 'load-language') {
resolves['load-language'](data);
} else if (action === 'recognize') {
resolves.recognize(circularize(data));
} else if (action === 'detect') {
resolves.detect(data);
let d = data;
if (action === 'recognize') {
d = circularize(data);
} else if (action === 'getPDF') {
d = Array.from({ ...data, length: Object.keys(data).length });
}
resolves[action](d);
} else if (status === 'reject') {
rejects[action](data);
throw Error(data);
@ -95,6 +102,9 @@ module.exports = (_options = {}) => { @@ -95,6 +102,9 @@ module.exports = (_options = {}) => {
loadLanguage,
initialize,
setParameters,
recognize,
getPDF,
detect,
terminate,
};
};

2
src/index.js

@ -10,6 +10,7 @@ @@ -10,6 +10,7 @@
const createScheduler = require('./createScheduler');
const createWorker = require('./createWorker');
const createJob = require('./createJob');
const Tesseract = require('./Tesseract');
const OEM = require('./constants/OEM');
const PSM = require('./constants/PSM');
@ -19,4 +20,5 @@ module.exports = { @@ -19,4 +20,5 @@ module.exports = {
createScheduler,
createWorker,
createJob,
Tesseract,
};

1
src/worker-script/browser/resolveURL.js

@ -0,0 +1 @@ @@ -0,0 +1 @@
module.exports = require('resolve-url');

5
src/worker-script/constants/defaultParams.js

@ -6,6 +6,7 @@ const PSM = require('../../constants/PSM'); @@ -6,6 +6,7 @@ const PSM = require('../../constants/PSM');
module.exports = {
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '',
user_defined_dpi: '300',
tessjs_create_pdf: '0',
tessjs_create_hocr: '1',
tessjs_create_tsv: '1',
@ -17,8 +18,4 @@ module.exports = { @@ -17,8 +18,4 @@ module.exports = {
tessjs_pdf_title: 'Tesseract.js OCR Result',
tessjs_pdf_auto_download: true,
tessjs_pdf_bin: false,
tessjs_image_rectangle_left: 0,
tessjs_image_rectangle_top: 0,
tessjs_image_rectangle_width: -1,
tessjs_image_rectangle_height: -1,
};

207
src/worker-script/index.js

@ -7,11 +7,12 @@ @@ -7,11 +7,12 @@
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const { loadLang } = require('tesseract.js-utils');
const fileType = require('file-type');
const axios = require('axios');
const isURL = require('is-url');
const dump = require('./utils/dump');
const isBrowser = require('../utils/getEnvironment')('type') === 'browser';
const setImage = require('./utils/setImage');
const getFiles = require('./utils/getFiles');
const defaultParams = require('./constants/defaultParams');
/*
@ -26,18 +27,6 @@ let latestJob; @@ -26,18 +27,6 @@ let latestJob;
let adapter = {};
let params = defaultParams;
/**
* handleInit
*
* @name handleInit
* @function handle initialization of TessModule
* @access public
* @param {object} req - job payload
* @param {string} req.corePath - path to the tesseract-core.js
* @param {object} res - job instance
* @returns {Promise} A Promise for callback
*/
const load = ({ workerId, jobId, payload: { options: { corePath } } }, res) => {
if (!TessModule) {
const Core = adapter.getCore(corePath, res);
@ -53,35 +42,99 @@ const load = ({ workerId, jobId, payload: { options: { corePath } } }, res) => { @@ -53,35 +42,99 @@ const load = ({ workerId, jobId, payload: { options: { corePath } } }, res) => {
progress: Math.max(0, (percent - 30) / 70),
});
},
})
.then((tessModule) => {
TessModule = tessModule;
res.progress({ workerId, status: 'initialized tesseract', progress: 1 });
res.resolve({ loaded: true });
});
}).then((tessModule) => {
TessModule = tessModule;
res.progress({ workerId, status: 'initialized tesseract', progress: 1 });
res.resolve({ loaded: true });
});
} else {
res.resolve({ loaded: true });
}
};
/**
* loadLanguage
*
* @name loadLanguage
* @function load language from remote or local cache
* @access public
* @param {object} req - job payload
* @param {string} req.langs - languages to load, ex: eng, eng+chi_tra
* @param {object} req.options - other options for loadLang function
* @param {object} res - job instance
* @returns {Promise} A Promise for callback
*/
const loadLanguage = ({ workerId, payload: { langs, options } }, res) => {
const loadLanguage = async ({
workerId,
payload: {
langs,
options: {
langPath,
dataPath,
cachePath,
cacheMethod,
gzip = true,
},
},
},
res) => {
const loadAndGunzipFile = async (_lang) => {
const lang = typeof _lang === 'string' ? _lang : _lang.code;
const readCache = ['refresh', 'none'].includes(cacheMethod)
? () => Promise.resolve()
: adapter.readCache;
let data = null;
try {
const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`);
if (typeof _data !== 'undefined') {
data = _data;
} else {
throw Error('Not found in cache');
}
} catch (e) {
if (typeof _lang === 'string') {
let path = null;
if (isURL(langPath)) { /** When langPath is an URL */
path = langPath;
} else if (process.browser) { /** When langPath is not an URL in browser */
path = adapter.resolveURL(langPath);
}
if (path !== null) {
const { data: _data } = await axios.get(
`${path}/${lang}.traineddata${gzip ? '.gz' : ''}`,
{ responseType: 'arraybuffer' },
);
data = _data;
} else {
data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`);
}
} else {
data = _lang.data; // eslint-disable-line
}
}
data = new Uint8Array(data);
const type = fileType(data);
if (typeof type !== 'undefined' && type.mime === 'application/gzip') {
data = adapter.gunzip(data);
}
if (TessModule) {
if (dataPath) {
try {
TessModule.FS.mkdir(dataPath);
} catch (err) {
res.reject(err.toString());
}
}
TessModule.FS.writeFile(`${dataPath || '.'}/${lang}.traineddata`, data);
}
if (['write', 'refresh', undefined].includes(cacheMethod)) {
await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data);
}
return Promise.resolve(data);
};
res.progress({ workerId, status: 'loading language traineddata', progress: 0 });
loadLang({ langs, TessModule, ...options }).then(() => {
try {
await Promise.all((typeof langs === 'string' ? langs.split('+') : langs).map(loadAndGunzipFile));
res.progress({ workerId, status: 'loaded language traineddata', progress: 1 });
res.resolve(langs);
}).catch((err) => {
} catch (err) {
if (isBrowser && err instanceof DOMException) {
/*
* For some reason google chrome throw DOMException in loadLang,
@ -91,7 +144,7 @@ const loadLanguage = ({ workerId, payload: { langs, options } }, res) => { @@ -91,7 +144,7 @@ const loadLanguage = ({ workerId, payload: { langs, options } }, res) => {
} else {
res.reject(err.toString());
}
});
}
};
const setParameters = ({ payload: { params: _params } }, res) => {
@ -109,7 +162,6 @@ const setParameters = ({ payload: { params: _params } }, res) => { @@ -109,7 +162,6 @@ const setParameters = ({ payload: { params: _params } }, res) => {
const initialize = ({
workerId,
jobId,
payload: { langs: _langs, oem },
}, res) => {
const langs = (typeof _langs === 'string')
@ -118,13 +170,13 @@ const initialize = ({ @@ -118,13 +170,13 @@ const initialize = ({
try {
res.progress({
workerId, jobId, status: 'initializing api', progress: 0,
workerId, status: 'initializing api', progress: 0,
});
api = new TessModule.TessBaseAPI();
api.Init(null, langs, oem);
setParameters({ payload: { params } });
res.progress({
workerId, jobId, status: 'initialized api', progress: 1,
workerId, status: 'initialized api', progress: 1,
});
res.resolve();
} catch (err) {
@ -132,48 +184,35 @@ const initialize = ({ @@ -132,48 +184,35 @@ const initialize = ({
}
};
/**
* handleRecognize
*
* @name handleRecognize
* @function handle recognition job
* @access public
* @param {object} req - job payload
* @param {array} req.image - binary image in array format
* @param {string} req.langs - languages to load, ex: eng, eng+chi_tra
* @param {object} req.options - other options for loadLang function
* @param {object} req.params - parameters for tesseract
* @param {object} res - job instance
*/
const recognize = ({ payload: { image } }, res) => {
const recognize = ({ payload: { image, options: { rectangles = [] } } }, res) => {
try {
const ptr = setImage(TessModule, api, image, params);
api.Recognize(null);
res.resolve({
files: getFiles(TessModule, api, adapter, params),
...dump(TessModule, api, params),
const ptr = setImage(TessModule, api, image);
rectangles.forEach(({
left, top, width, height,
}) => {
api.SetRectangle(left, top, width, height);
});
api.Recognize(null);
res.resolve(dump(TessModule, api, params));
TessModule._free(ptr);
} catch (err) {
res.reject(err.toString());
}
};
/**
* handleDetect
*
* @name handleDetect
* @function handle detect (Orientation and Script Detection / OSD) job
* @access public
* @param {object} req - job payload
* @param {array} req.image - binary image in array format
* @param {string} req.langs - languages to load, ex: eng, eng+chi_tra
* @param {object} req.options - other options for loadLang function
* @param {object} res - job instance
*/
/**
 * getPDF
 *
 * Renders the image currently held by `api` through a TessPDFRenderer and
 * resolves the job with the content read back from '/tesseract-ocr.pdf'.
 *
 * @param {object} packet - job packet
 * @param {string} packet.payload.title - passed to BeginDocument
 * @param {boolean} packet.payload.textonly - third TessPDFRenderer argument
 * @param {object} res - job instance; only `res.resolve` is used here
 */
const getPDF = (packet, res) => {
  const { payload: { title, textonly } } = packet;
  const renderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
  renderer.BeginDocument(title);
  renderer.AddImage(api);
  renderer.EndDocument();
  TessModule._free(renderer);
  const pdf = TessModule.FS.readFile('/tesseract-ocr.pdf');
  res.resolve(pdf);
};
const detect = ({ payload: { image } }, res) => {
try {
const ptr = setImage(TessModule, api, image, params);
const ptr = setImage(TessModule, api, image);
const results = new TessModule.OSResults();
if (!api.DetectOS(results)) {
@ -236,22 +275,16 @@ exports.dispatchHandlers = (packet, send) => { @@ -236,22 +275,16 @@ exports.dispatchHandlers = (packet, send) => {
latestJob = res;
try {
const { action } = packet;
if (action === 'load') {
load(packet, res);
} else if (action === 'load-language') {
loadLanguage(packet, res);
} else if (action === 'initialize') {
initialize(packet, res);
} else if (action === 'set-parameters') {
setParameters(packet, res);
} else if (action === 'recognize') {
recognize(packet, res);
} else if (action === 'detect') {
detect(packet, res);
} else if (action === 'terminate') {
terminate(packet, res);
}
({
load,
loadLanguage,
initialize,
setParameters,
recognize,
getPDF,
detect,
terminate,
})[packet.action](packet, res);
} catch (err) {
/** Prepare exception to travel through postMessage */
res.reject(err.toString());

16
src/worker-script/node/cache.js

@ -0,0 +1,16 @@ @@ -0,0 +1,16 @@
const util = require('util');
const fs = require('fs');
module.exports = {
readCache: util.promisify(fs.readFile),
writeCache: util.promisify(fs.writeFile),
deleteCache: path => (
util.promisify(fs.unlink)(path)
.catch(() => {})
),
checkCache: path => (
util.promisify(fs.access)(path, fs.F_OK)
.then(err => (err === null))
.catch(() => false)
),
};

7
src/worker-script/node/exportFile.js

@ -1,7 +0,0 @@ @@ -1,7 +0,0 @@
const fs = require('fs');
module.exports = (path, data) => {
fs.writeFile(path, data, (err) => {
if (err) throw err;
});
};

13
src/worker-script/node/getCore.js

@ -0,0 +1,13 @@ @@ -0,0 +1,13 @@
let TesseractCore = null;
/*
* getCore is a sync function to load and return
* TesseractCore.
*/
module.exports = (_, res) => {
if (TesseractCore === null) {
res.progress({ status: 'loading tesseract core', progress: 0 });
TesseractCore = require('tesseract.js-core');
res.progress({ status: 'loaded tesseract core', progress: 1 });
}
return TesseractCore;
};

1
src/worker-script/node/gunzip.js

@ -0,0 +1 @@ @@ -0,0 +1 @@
module.exports = require('zlib').gunzipSync;

24
src/worker-script/node/index.js

@ -9,9 +9,10 @@ @@ -9,9 +9,10 @@
*/
const worker = require('../');
const exportFile = require('./exportFile');
let TesseractCore = null;
const getCore = require('./getCore');
const resolveURL = require('./resolveURL');
const gunzip = require('./gunzip');
const cache = require('./cache');
/*
* register message handler
@ -20,20 +21,9 @@ process.on('message', (packet) => { @@ -20,20 +21,9 @@ process.on('message', (packet) => {
worker.dispatchHandlers(packet, obj => process.send(obj));
});
/*
* getCore is a sync function to load and return
* TesseractCore.
*/
const getCore = (_, res) => {
if (TesseractCore === null) {
res.progress({ status: 'loading tesseract core', progress: 0 });
TesseractCore = require('tesseract.js-core');
res.progress({ status: 'loaded tesseract core', progress: 1 });
}
return TesseractCore;
};
worker.setAdapter({
getCore,
exportFile,
gunzip,
resolveURL,
...cache,
});

1
src/worker-script/node/resolveURL.js

@ -0,0 +1 @@ @@ -0,0 +1 @@
module.exports = s => s;

39
src/worker-script/utils/getFiles.js

@ -1,39 +0,0 @@ @@ -1,39 +0,0 @@
/**
* handleOutput
*
* @name handleOutput
* @function handle file output
* @access private
* @param {object} customParams - an object of params
*/
module.exports = (TessModule, api, adapter, params) => {
let files = {};
const {
tessjs_create_pdf,
tessjs_textonly_pdf,
tessjs_pdf_name,
tessjs_pdf_title,
tessjs_pdf_auto_download,
tessjs_pdf_bin,
} = params;
if (tessjs_create_pdf === '1') {
const pdfRenderer = new TessModule.TessPDFRenderer(tessjs_pdf_name, '/', tessjs_textonly_pdf === '1');
pdfRenderer.BeginDocument(tessjs_pdf_title);
pdfRenderer.AddImage(api);
pdfRenderer.EndDocument();
TessModule._free(pdfRenderer);
const data = TessModule.FS.readFile(`/${tessjs_pdf_name}.pdf`);
if (tessjs_pdf_bin) {
files = { pdf: data, ...files };
}
if (tessjs_pdf_auto_download) {
adapter.exportFile(`${tessjs_pdf_name}.pdf`, data, 'application/pdf');
}
}
return files;
};

5
src/worker-script/utils/getLangStr.js

@ -1,5 +0,0 @@ @@ -1,5 +0,0 @@
module.exports = langs => (
typeof langs === 'string'
? langs
: langs.map(lang => (typeof lang === 'string' ? lang : lang.data)).join('+')
);

56
src/worker-script/utils/setImage.js

@ -1,4 +1,5 @@ @@ -1,4 +1,5 @@
const { readImage } = require('tesseract.js-utils');
const bmp = require('bmp-js');
const fileType = require('file-type');
/**
* setImage
@ -9,16 +10,43 @@ const { readImage } = require('tesseract.js-utils'); @@ -9,16 +10,43 @@ const { readImage } = require('tesseract.js-utils');
* @param {array} image - binary array in array format
* @returns {number} - an emscripten pointer of the image
*/
module.exports = (TessModule, api, image, params) => {
const {
tessjs_image_rectangle_left: left,
tessjs_image_rectangle_top: top,
tessjs_image_rectangle_width: width,
tessjs_image_rectangle_height: height,
} = params;
const {
w, h, bytesPerPixel, data, pix,
} = readImage(TessModule, Array.from(image));
module.exports = (TessModule, api, image) => {
const buf = Buffer.from(Array.from(image));
const type = fileType(buf);
let bytesPerPixel = 0;
let data = null;
let pix = null;
let w = 0;
let h = 0;
/*
* Although leptonica should support reading bmp, there is a bug of "compressed BMP files".
* As there is no solution, we need to use bmp-js for now.
* @see https://groups.google.com/forum/#!topic/tesseract-ocr/4mPD9zTxdxE
*/
if (type && type.mime === 'image/bmp') {
const bmpBuf = bmp.decode(buf);
data = TessModule._malloc(bmpBuf.data.length * Uint8Array.BYTES_PER_ELEMENT);
TessModule.HEAPU8.set(bmpBuf.data, data);
w = bmpBuf.width;
h = bmpBuf.height;
bytesPerPixel = 4;
} else {
const ptr = TessModule._malloc(buf.length * Uint8Array.BYTES_PER_ELEMENT);
TessModule.HEAPU8.set(buf, ptr);
pix = TessModule._pixReadMem(ptr, buf.length);
if (TessModule.getValue(pix + (7 * 4), 'i32') === 0) {
/*
* Set a yres default value to prevent warning from tesseract
* See kMinCredibleResolution in tesseract/src/ccstruct/publictypes.h
*/
TessModule.setValue(pix + (7 * 4), 300, 'i32');
}
[w, h] = Array(2).fill(0)
.map((v, idx) => (
TessModule.getValue(pix + (idx * 4), 'i32')
));
}
/*
* As some image formats (e.g. bmp) are not supported natively by tesseract,
@ -31,11 +59,5 @@ module.exports = (TessModule, api, image, params) => { @@ -31,11 +59,5 @@ module.exports = (TessModule, api, image, params) => {
} else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel);
}
api.SetRectangle(
(left < 0) ? 0 : left,
(top < 0) ? 0 : top,
(width < 0) ? w : width,
(height < 0) ? h : height,
);
return data === null ? pix : data;
};

2
src/worker/node/send.js

@ -46,7 +46,7 @@ const loadImage = (image) => { @@ -46,7 +46,7 @@ const loadImage = (image) => {
* @param {object} instance TesseractWorker instance
* @param {object} iPacket data for worker
*/
module.exports = ({ worker }, packet) => {
module.exports = (worker, packet) => {
const p = { ...packet };
if (['recognize', 'detect'].includes(p.action)) {
loadImage(p.payload.image)

Loading…
Cancel
Save