Browse Source

Update folder structure and refactor

develop
Jerome Wu 5 years ago
parent
commit
97fa545936
  1. 20
      examples/node/recognize.js
  2. 6
      package-lock.json
  3. 2
      package.json
  4. 2
      scripts/webpack.config.dev.js
  5. 163
      src/common/TesseractJob.js
  6. 196
      src/common/TesseractWorker.js
  7. 86
      src/common/circularize.js
  8. 129
      src/common/createWorker.js
  9. 1
      src/common/env.js
  10. 40
      src/common/options.js
  11. 1
      src/common/pdf-ttf.js
  12. 34
      src/common/types.js
  13. 87
      src/common/utils.js
  14. 12
      src/constants/OEM.js
  15. 18
      src/constants/PSM.js
  16. 5
      src/constants/config.js
  17. 13
      src/constants/defaultOptions.js
  18. 4
      src/createJob.js
  19. 0
      src/createScheduler.js
  20. 0
      src/createTesseract.js
  21. 100
      src/createWorker.js
  22. 19
      src/index.js
  23. 1
      src/node/b64toU8Array.js
  24. 111
      src/node/index.js
  25. 44
      src/node/worker.js
  26. 57
      src/utils/circularize.js
  27. 10
      src/utils/getEnvironment.js
  28. 12
      src/utils/resolvePaths.js
  29. 0
      src/worker-script/browser/index.js
  30. 24
      src/worker-script/constants/defaultParams.js
  31. 110
      src/worker-script/index.js
  32. 7
      src/worker-script/node/exportFile.js
  33. 39
      src/worker-script/node/index.js
  34. 62
      src/worker-script/utils/dump.js
  35. 39
      src/worker-script/utils/getFiles.js
  36. 5
      src/worker-script/utils/getLangStr.js
  37. 41
      src/worker-script/utils/setImage.js
  38. 0
      src/worker/browser/b64toU8Array.js
  39. 0
      src/worker/browser/index.js
  40. 8
      src/worker/browser/worker.js
  41. 10
      src/worker/node/defaultOptions.js
  42. 22
      src/worker/node/index.js
  43. 3
      src/worker/node/onMessage.js
  44. 61
      src/worker/node/send.js
  45. 15
      src/worker/node/spawnWorker.js
  46. 11
      src/worker/node/terminateWorker.js

20
examples/node/recognize.js

@ -1,6 +1,8 @@ @@ -1,6 +1,8 @@
#!/usr/bin/env node
const path = require('path');
const { createScheduler, createWorker, createJob, OEM } = require('../../');
const {
createScheduler, createWorker, createJob, PSM,
} = require('../../');
const [,, imagePath] = process.argv;
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png'));
@ -11,12 +13,18 @@ console.log(`Recognizing ${image}`); @@ -11,12 +13,18 @@ console.log(`Recognizing ${image}`);
const scheduler = createScheduler();
const worker = createWorker();
await worker.load();
await worker.loadLanguage('osd');
await worker.initialize('osd', {
tessedit_ocr_engine_mode: OEM.OSD_ONLY,
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({
tessedit_char_whitelist: 'ABCDEFGH',
});
scheduler.addWorker(worker);
const data = await scheduler.addJob(createJob('detect', { image }));
console.log(data);
const { text: t1 } = await scheduler.addJob(createJob('recognize', { image }));
console.log(t1);
await worker.setParameters({
tessedit_char_whitelist: 'abcdefg',
});
const { text: t2 } = await scheduler.addJob(createJob('recognize', { image }));
console.log(t2);
scheduler.terminate();
})();

6
package-lock.json generated

@ -8548,9 +8548,9 @@ @@ -8548,9 +8548,9 @@
}
},
"tesseract.js-core": {
"version": "2.0.0-beta.12",
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-2.0.0-beta.12.tgz",
"integrity": "sha512-/CJhrDO82u1Nix4BQXYdL98+ctPZK4ZBYIiPlVu9uu9DHH65HsTEUV86Kd/3hdg67wCPmO1FzRGOaskvLv5O5A=="
"version": "2.0.0-beta.13",
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-2.0.0-beta.13.tgz",
"integrity": "sha512-GboWV/aV5h+Whito6L6Q3WCFZ2+lgxZGgjY84wSpWbTLEkkZgHsU+dz1or+3rWSABH/nuzHDco1bZRk5+f94mw=="
},
"tesseract.js-utils": {
"version": "1.0.0-beta.8",

2
package.json

@ -57,7 +57,7 @@ @@ -57,7 +57,7 @@
"is-url": "1.2.2",
"opencollective-postinstall": "^2.0.2",
"resolve-url": "^0.2.1",
"tesseract.js-core": "^2.0.0-beta.12",
"tesseract.js-core": "^2.0.0-beta.13",
"tesseract.js-utils": "^1.0.0-beta.8"
},
"repository": {

2
scripts/webpack.config.dev.js

@ -21,7 +21,7 @@ const genConfig = ({ @@ -21,7 +21,7 @@ const genConfig = ({
}),
],
devServer: {
allowedHosts: ['localhost', '.gitpod.io'],
allowedHosts: ['localhost', '.gitpod.io'],
},
});

163
src/common/TesseractJob.js

@ -1,163 +0,0 @@ @@ -1,163 +0,0 @@
/**
*
* The job executed by a worker; each job is basically a recognition of an image.
*
* @fileoverview Job executed by Worker
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const adapter = require('../node/');
/** A global job counter as part of job id */
let jobCounter = 0;

/**
 * A promise-like handle for one unit of work sent to a worker.
 *
 * Callbacks for success, failure, progress and completion are kept in
 * arrays, so more than one callback can be registered for each of them.
 * The settled state is tracked explicitly (`_status` / `_result`) so that
 * callbacks registered after the job has finished still observe the outcome.
 */
class TesseractJob {
  /**
   * constructor
   *
   * @name constructor
   * @function initial a TesseractJob
   * @access public
   * @param {object} worker - An instance of TesseractWorker
   */
  constructor(worker) {
    jobCounter += 1;
    this.id = `Job-${jobCounter}-${Math.random().toString(16).slice(3, 8)}`;
    this._worker = worker;

    this._resolve = [];
    this._reject = [];
    this._progress = [];
    this._finally = [];

    /* 'pending' until a 'resolve' or 'reject' packet has been handled */
    this._status = 'pending';
    /* payload of the packet that settled the job */
    this._result = undefined;
  }

  /**
   * then
   *
   * @name then
   * @function A function to chain like Promise
   * @access public
   * @param {function} resolve - called when the job succeeds
   * @param {function} reject - called when the job fails
   * @returns {Promise} a real Promise chained with resolve / reject
   */
  then(resolve, reject) {
    return new Promise((res, rej) => {
      if (this._status === 'resolve') {
        // Already finished: hand over the stored result right away.
        res(this._result);
      } else if (this._status === 'pending') {
        this._resolve.push(res);
      }
      this.catch(rej);
    }).then(resolve, reject);
  }

  /**
   * catch
   *
   * @name catch
   * @function register a function to call when there is an error
   * @access public
   * @param {function} reject - callback function for error
   * @returns {TesseractJob} this job, for chaining
   */
  catch(reject) {
    if (this._status === 'reject') {
      // Already failed: report the stored error right away.
      reject(this._result);
    } else if (this._status === 'pending') {
      this._reject.push(reject);
    }
    return this;
  }

  /**
   * progress
   *
   * @name progress
   * @function register a function to show progress of the recognition,
   *   use res.progress to print the message
   * @access public
   * @param {function} fn - callback function for progress information
   * @returns {TesseractJob} this job, for chaining
   */
  progress(fn) {
    this._progress.push(fn);
    return this;
  }

  /**
   * finally
   *
   * @name finally
   * @function registry a callback function for final
   * @access public
   * @param {function} fn - callback function for final
   * @returns {TesseractJob} this job, for chaining
   */
  finally(fn) {
    this._finally.push(fn);
    return this;
  }

  /**
   * send
   *
   * @name send
   * @function send specific action with payload a worker
   * @access public
   * @param {string} action - action to trigger, should be "recognize" or "detect"
   * @param {object} payload - data to be consumed
   */
  send(action, payload) {
    adapter.sendPacket(this._worker, {
      jobId: this.id,
      action,
      payload,
    });
  }

  /**
   * handle
   *
   * @name handle
   * @function execute packet action
   * @access public
   * @param {object} packet action and payload to handle
   */
  handle(packet) {
    const { data } = packet;
    let runFinallyCbs = false;

    if (packet.status === 'resolve') {
      if (this._resolve.length === 0) console.log(data);
      this._resolve.forEach((fn) => {
        const ret = fn(data);
        if (ret && typeof ret.then === 'function') {
          console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.');
        }
      });
      this._settle('resolve', data);
      this._worker.dequeue();
      runFinallyCbs = true;
    } else if (packet.status === 'reject') {
      if (this._reject.length === 0) console.error(data);
      this._reject.forEach(fn => fn(data));
      this._settle('reject', data);
      this._worker.dequeue();
      runFinallyCbs = true;
    } else if (packet.status === 'progress') {
      this._progress.forEach(fn => fn(data));
    } else {
      console.warn('Message type unknown', packet.status);
    }

    if (runFinallyCbs) {
      this._finally.forEach(fn => fn(data));
    }
  }

  /**
   * _settle
   *
   * @name _settle
   * @function record the final outcome and drop the pending callbacks
   * @access private
   * @param {string} status - 'resolve' or 'reject'
   * @param {*} data - payload of the settling packet
   */
  _settle(status, data) {
    this._status = status;
    this._result = data;
    this._resolve = [];
    this._reject = [];
  }
}
module.exports = TesseractJob;

196
src/common/TesseractWorker.js

@ -1,196 +0,0 @@ @@ -1,196 +0,0 @@
/**
*
* The core part of tesseract.js to execute the OCR jobs.
*
* @fileoverview Worker for OCR jobs
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const check = require('check-types');
const resolveURL = (typeof window !== 'undefined' && typeof window.document !== 'undefined') ? require('resolve-url') : s => s;
const adapter = require('../node');
const circularize = require('./circularize');
const TesseractJob = require('./TesseractJob');
/**
 * TesseractWorker
 * @name TesseractWorker
 * @function execute TesseractJob with a queue mechanism
 * @access public
 */
class TesseractWorker {
  /**
   * constructor
   *
   * @name constructor
   * @function initialize the worker
   * @access public
   * @param {object} options - worker configurations
   * @param {string} options.workerPath -
   *     A remote path to load worker script.
   *     In browser-like environment, it is downloaded from a CDN service.
   *     Please update this option if you self-host the worker script.
   *     In Node.js environment, this option is not used as the worker script is in local.
   * @param {boolean} [options.workerBlobURL=true] - Use a blob: URL for the worker script
   * @param {string} options.corePath -
   *     A remote path to load tesseract.js-core script.
   *     In browser-like environment, it is downloaded from a CDN service.
   *     Please update this option if you self-host the core script.
   *     In Node.js environment, this option is not used as the core script is in local.
   * @param {string} options.langPath -
   *     A remote path to load *.traineddata.gz, it is download from a CDN service.
   *     Please update this option if you self-host the worker script.
   * @param {string} [options.cachePath=.] - @see {@link https://github.com/jeromewu/tesseract.js-utils/blob/master/src/loadLang.js}
   * @param {string} [options.cacheMethod=write] - @see {@link https://github.com/jeromewu/tesseract.js-utils/blob/master/src/loadLang.js}
   * @param {string} [options.dataPath=.] - @see {@link https://github.com/jeromewu/tesseract.js-utils/blob/master/src/loadLang.js}
   *
   */
  constructor(options = {}) {
    this.worker = null;
    this.options = {
      ...adapter.defaultOptions,
      ...options,
    };
    // Only paths supplied by the caller are passed through resolveURL.
    ['corePath', 'workerPath', 'langPath'].forEach((key) => {
      if (check.not.undefined(options[key])) {
        this.options = { ...this.options, [key]: resolveURL(options[key]) };
      }
    });
    this._currentJob = null;
    this._queue = [];
  }

  /**
   * recognize
   *
   * @name recognize
   * @function recognize text in given image
   * @access public
   * @param {Buffer, string} image - image to be recognized
   * @param {string, array} [langs=eng] - languages to recognize
   * @param {object} params - tesseract parameters
   * @returns {TesseractJob} the queued job
   *
   */
  recognize(image, langs = 'eng', params = {}) {
    return this._sendJob('recognize', image, langs, params);
  }

  /**
   * detect
   *
   * @name detect
   * @function detect language of the text in the image
   * @access public
   * @param {Buffer, string} image - image to be recognized
   * @param {object} params - tesseract parameters
   * @returns {TesseractJob} the queued job
   *
   */
  detect(image, params = {}) {
    return this._sendJob('detect', image, 'osd', params);
  }

  /**
   * recv
   *
   * @name recv
   * @function forward a packet to the current job; packets that do not
   *   belong to the current job (including any packet received while no
   *   job is running) are reported with a warning and dropped
   * @access public
   * @param {object} packet job data
   */
  recv(packet) {
    const job = this._currentJob;
    // _currentJob is null while idle and after terminate(); a stray packet
    // must not crash the message handler.
    if (!job || job.id !== packet.jobId) {
      console.warn(`Job ID ${packet.jobId} not known.`);
      return;
    }
    job.handle({
      ...packet,
      data: packet.status === 'resolve' && packet.action === 'recognize'
        ? circularize(packet.data)
        : packet.data,
    });
  }

  /**
   * dequeue
   *
   * @name dequeue
   * @function clear the current job and start the job at the front of the queue, if any
   * @access public
   */
  dequeue() {
    this._currentJob = null;
    if (this._queue.length) {
      this._queue[0]();
    }
  }

  /**
   * terminate
   *
   * @name terminate
   * @function terminate the worker and discard the current and queued jobs
   * @access public
   *
   */
  terminate() {
    if (this.worker) {
      adapter.terminateWorker(this);
    }
    this.worker = null;
    this._currentJob = null;
    this._queue = [];
  }

  /**
   * _sendJob
   *
   * @name _sendJob
   * @function append a new job to the job queue
   * @access private
   * @param {string} type job type, should be recognize or detect
   * @param {Buffer, string} image image to recognize
   * @param {string} langs language to recognize
   * @param {object} params tesseract parameters
   * @returns {TesseractJob} the queued job
   */
  _sendJob(type, image, langs, params) {
    return this._delay((job) => {
      job.send(
        type,
        {
          image,
          langs,
          params,
          options: this.options,
        },
      );
    });
  }

  /**
   * _delay
   *
   * @name _delay
   * @function delays the fn to execute until its entry reaches the front of the queue
   * @access private
   * @param {function} fn A handler function for the job
   * @returns {TesseractJob} the job created for fn
   */
  _delay(fn) {
    // The underlying worker is spawned lazily, on the first job.
    if (check.null(this.worker)) {
      this.worker = adapter.spawnWorker(this, this.options);
    }

    const job = new TesseractJob(this);
    this._queue.push(() => {
      this._queue.shift();
      this._currentJob = job;
      fn(job);
    });
    if (check.null(this._currentJob)) {
      this.dequeue();
    }
    return job;
  }
}
module.exports = TesseractWorker;

86
src/common/circularize.js

@ -1,86 +0,0 @@ @@ -1,86 +0,0 @@
/**
* The result of dump.js is a big JSON tree
* which can be easily serialized (for instance
* to be sent from a webworker to the main app
* or through Node's IPC), but we want
* a (circular) DOM-like interface for walking
* through the data.
*
* @fileoverview DOM-like interface for walking through data
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
module.exports = (iPage) => {
const page = {
...iPage,
paragraphs: [],
lines: [],
words: [],
symbols: [],
};
page.blocks.forEach((iBlock) => {
const block = {
...iBlock,
page,
lines: [],
words: [],
symbols: [],
};
block.paragraphs.forEach((iPara) => {
const para = {
...iPara,
block,
page,
words: [],
symbols: [],
};
para.lines.forEach((iLine) => {
const line = {
...iLine,
paragraph: para,
block,
page,
symbols: [],
};
line.words.forEach((iWord) => {
const word = {
...iWord,
line,
paragraph: para,
block,
page,
};
word.symbols.forEach((iSym) => {
const sym = {
...iSym,
word,
line,
paragraph: para,
block,
page,
};
sym.line.symbols.push(sym);
sym.paragraph.symbols.push(sym);
sym.block.symbols.push(sym);
sym.page.symbols.push(sym);
});
word.paragraph.words.push(word);
word.block.words.push(word);
word.page.words.push(word);
});
line.block.lines.push(line);
line.page.lines.push(line);
});
para.page.paragraphs.push(para);
});
});
return page;
};

129
src/common/createWorker.js

@ -1,129 +0,0 @@ @@ -1,129 +0,0 @@
const { isBrowser } = require('./env');
const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disable-line
const circularize = require('./circularize');
const createJob = require('./createJob');
const { defaultParams } = require('./options');
const {
defaultOptions,
spawnWorker,
terminateWorker,
setOnMessage,
} = require('../node');
let workerCounter = 0;
/*
 * Return a shallow copy of `options` in which each of corePath, workerPath
 * and langPath that is defined has been passed through resolveURL.
 */
const resolvePaths = (options) => {
  const resolved = { ...options };
  for (const key of ['corePath', 'workerPath', 'langPath']) {
    if (typeof options[key] !== 'undefined') {
      resolved[key] = resolveURL(resolved[key]);
    }
  }
  return resolved;
};
module.exports = (options = {}) => {
workerCounter += 1;
const id = `Worker-${workerCounter}-${Math.random().toString(16).slice(3, 8)}`;
const opts = resolvePaths({
...defaultOptions,
...options,
});
const { logger } = opts;
const resolves = {};
const rejects = {};
let worker = spawnWorker(opts);
const setResolve = (action, res) => {
resolves[action] = res;
};
const setReject = (action, rej) => {
rejects[action] = rej;
};
const load = () => (
new Promise((resolve, reject) => {
const job = createJob(
'load',
opts,
);
setResolve('load', resolve);
setReject('load', reject);
job.start({ worker, id });
})
);
const loadLanguage = (langs = 'eng') => (
new Promise((resolve, reject) => {
const job = createJob(
'load-language',
{
langs,
options: opts,
},
);
setResolve('load-language', resolve);
setReject('load-language', reject);
job.start({ worker, id });
})
);
const initialize = (langs = 'eng', params = {}) => (
new Promise((resolve, reject) => {
const job = createJob(
'initialize',
{
langs,
params: {
...defaultParams,
...params,
},
},
);
setResolve('initialize', resolve);
setReject('initialize', reject);
job.start({ worker, id });
})
);
const terminate = () => {
if (worker !== null) {
terminateWorker({ worker });
worker = null;
}
};
setOnMessage(worker, (packet) => {
const { status, action, data } = packet;
if (status === 'resolve') {
if (action === 'load') {
resolves.load(data);
} else if (action === 'initialize') {
resolves.initialize({ id });
} else if (action === 'load-language') {
resolves['load-language'](data);
} else if (action === 'recognize') {
resolves.recognize(circularize(data));
} else if (action === 'detect') {
resolves.detect(data);
}
} else if (status === 'reject') {
rejects[action](data);
throw Error(data);
} else if (status === 'progress') {
logger(data);
}
});
return {
id,
worker,
setResolve,
setReject,
load,
loadLanguage,
initialize,
terminate,
};
};

1
src/common/env.js

@ -1 +0,0 @@ @@ -1 +0,0 @@
/* True when both a global `window` and `window.document` are defined. */
const hasWindow = typeof window !== 'undefined';
exports.isBrowser = hasWindow && typeof window.document !== 'undefined';

40
src/common/options.js

@ -1,40 +0,0 @@ @@ -1,40 +0,0 @@
const { OEM, PSM } = require('./types');
module.exports = {
defaultOptions: {
/*
* default path for downloading *.traineddata, this URL basically
* points to a github page, not using jsDelivr as there is is limitation
* of 20 MB.
*/
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
/*
* Use BlobURL for worker script by default
*/
workerBlobURL: true,
logger: () => {},
},
/*
* default params for recognize()
*/
defaultParams: {
tessedit_ocr_engine_mode: OEM.LSTM_ONLY,
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '',
tessjs_create_pdf: '0',
tessjs_create_hocr: '1',
tessjs_create_tsv: '1',
tessjs_create_box: '0',
tessjs_create_unlv: '0',
tessjs_create_osd: '0',
tessjs_textonly_pdf: '0',
tessjs_pdf_name: 'tesseract.js-ocr-result',
tessjs_pdf_title: 'Tesseract.js OCR Result',
tessjs_pdf_auto_download: true,
tessjs_pdf_bin: false,
tessjs_image_rectangle_left: 0,
tessjs_image_rectangle_top: 0,
tessjs_image_rectangle_width: -1,
tessjs_image_rectangle_height: -1,
},
};

1
src/common/pdf-ttf.js

@ -1 +0,0 @@ @@ -1 +0,0 @@
module.exports = 'AAEAAAAKAIAAAwAgT1MvMlbeyJQAAAEoAAAAYGNtYXAACgA0AAABkAAAAB5nbHlmFSJBJAAAAbgAAAAYaGVhZAt48WUAAACsAAAANmhoZWEMAgQCAAAA5AAAACRobXR4BAAAAAAAAYgAAAAIbG9jYQAMAAAAAAGwAAAABm1heHAABAAFAAABCAAAACBuYW1l8usW2gAAAdAAAABLcG9zdAABAAEAAAIcAAAAIAABAAAAAQAAsJRxEF8PPPUEBwgAAAAAAM+a/G4AAAAA1MOn8gAAAAAEAAgAAAAAEAACAAAAAAAAAAEAAAgA//8AAAQAAAAAAAQAAAEAAAAAAAAAAAAAAAAAAAACAAEAAAACAAQAAQAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAwAAAZAABQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAUAAQABAAAAAAAAAAAAAAAAAAAAAAAAAAAAR09PRwBAAAAAAAAB//8AAAABAAGAAAAAAAAAAAAAAAAAAAABAAAAAAAABAAAAAAAAAIAAQAAAAAAFAADAAAAAAAUAAYACgAAAAAAAAAAAAAAAAAMAAAAAQAAAAAEAAgAAAMAADEhESEEAPwACAAAAAADACoAAAADAAAABQAWAAAAAQAAAAAABQALABYAAwABBAkABQAWAAAAVgBlAHIAcwBpAG8AbgAgADEALgAwVmVyc2lvbiAxLjAAAAEAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAA=';

34
src/common/types.js

@ -1,34 +0,0 @@ @@ -1,34 +0,0 @@
module.exports = {
/*
* OEM = OCR Engine Mode, and there are 5 possible modes.
*
* By default tesseract.js uses TESSERACT_LSTM_COMBINED mode, which uses LSTM when possible.
* If you need to use some tesseract v3 features (like tessedit_char_whitelist),
* you need to use TESSERACT_ONLY mode.
*
*/
OEM: {
TESSERACT_ONLY: 0,
LSTM_ONLY: 1,
TESSERACT_LSTM_COMBINED: 2,
DEFAULT: 3,
},
/*
* PSM = Page Segmentation Mode
*/
PSM: {
OSD_ONLY: '0',
AUTO_OSD: '1',
AUTO_ONLY: '2',
AUTO: '3',
SINGLE_COLUMN: '4',
SINGLE_BLOCK_VERT_TEXT: '5',
SINGLE_BLOCK: '6',
SINGLE_LINE: '7',
SINGLE_WORD: '8',
SINGLE_CHAR: '9',
SPARSE_TEXT: '10',
SPARSE_TEXT_OSD: '11',
RAW_LINE: '12',
},
};

87
src/common/utils.js

@ -1,87 +0,0 @@ @@ -1,87 +0,0 @@
const { readImage } = require('tesseract.js-utils');
/**
* setImage
*
* @name setImage
* @function set image in tesseract for recognition
* @access public
* @param {array} image - binary array in array format
* @returns {number} - an emscripten pointer of the image
*/
exports.setImage = (TessModule, api, image, params) => {
const {
tessjs_image_rectangle_left: left,
tessjs_image_rectangle_top: top,
tessjs_image_rectangle_width: width,
tessjs_image_rectangle_height: height,
} = params;
const {
w, h, bytesPerPixel, data, pix,
} = readImage(TessModule, Array.from(image));
/*
* As some image format (ex. bmp) is not supported natiely by tesseract,
* sometimes it will not return pix directly, but data and bytesPerPixel
* for another SetImage usage.
*
*/
if (data === null) {
api.SetImage(pix);
} else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel);
}
api.SetRectangle(
(left < 0) ? 0 : left,
(top < 0) ? 0 : top,
(width < 0) ? w : width,
(height < 0) ? h : height,
);
return data === null ? pix : data;
};
exports.getLangsStr = langs => (
typeof langs === 'string'
? langs
: langs.map(lang => (typeof lang === 'string' ? lang : lang.data)).join('+')
);
/**
* handleOutput
*
* @name handleOutput
* @function handle file output
* @access private
* @param {object} customParams - an object of params
*/
exports.getFiles = (TessModule, api, adapter, params) => {
let files = {};
const {
tessjs_create_pdf,
tessjs_textonly_pdf,
tessjs_pdf_name,
tessjs_pdf_title,
tessjs_pdf_auto_download,
tessjs_pdf_bin,
} = params;
if (tessjs_create_pdf === '1') {
const pdfRenderer = new TessModule.TessPDFRenderer(tessjs_pdf_name, '/', tessjs_textonly_pdf === '1');
pdfRenderer.BeginDocument(tessjs_pdf_title);
pdfRenderer.AddImage(api);
pdfRenderer.EndDocument();
TessModule._free(pdfRenderer);
const data = TessModule.FS.readFile(`/${tessjs_pdf_name}.pdf`);
if (tessjs_pdf_bin) {
files = { pdf: data, ...files };
}
if (tessjs_pdf_auto_download) {
adapter.writeFile(`${tessjs_pdf_name}.pdf`, data, 'application/pdf');
}
}
return files;
};

12
src/constants/OEM.js

@ -0,0 +1,12 @@ @@ -0,0 +1,12 @@
/*
* OEM = OCR Engine Mode, and there are 4 possible modes.
*
* The mode used when none is given is configured separately as
* `defaultOEM` in src/constants/config.js, which is set to OEM.DEFAULT.
*
*/
module.exports = {
TESSERACT_ONLY: 0,
LSTM_ONLY: 1,
TESSERACT_LSTM_COMBINED: 2,
DEFAULT: 3,
};

18
src/constants/PSM.js

@ -0,0 +1,18 @@ @@ -0,0 +1,18 @@
/*
* PSM = Page Segmentation Mode
*/
module.exports = {
OSD_ONLY: '0',
AUTO_OSD: '1',
AUTO_ONLY: '2',
AUTO: '3',
SINGLE_COLUMN: '4',
SINGLE_BLOCK_VERT_TEXT: '5',
SINGLE_BLOCK: '6',
SINGLE_LINE: '7',
SINGLE_WORD: '8',
CIRCLE_WORD: '9',
SINGLE_CHAR: '10',
SPARSE_TEXT: '11',
SPARSE_TEXT_OSD: '12',
};

5
src/constants/config.js

@ -0,0 +1,5 @@ @@ -0,0 +1,5 @@
const OEM = require('./OEM');
module.exports = {
defaultOEM: OEM.DEFAULT,
};

13
src/constants/defaultOptions.js

@ -0,0 +1,13 @@ @@ -0,0 +1,13 @@
/*
* Default worker options; createWorker spreads the caller's options over these.
*/
module.exports = {
/*
* default path for downloading *.traineddata
*/
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
/*
* Use BlobURL for worker script by default
* TODO: remove this option
*
*/
workerBlobURL: true,
/*
* No-op by default; createWorker calls it with the data of every
* 'progress' packet.
*/
logger: () => {},
};

4
src/common/createJob.js → src/createJob.js

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
const { sendPacket } = require('../node');
const { send } = require('./worker/node');
let jobCounter = 0;
@ -11,7 +11,7 @@ module.exports = ( @@ -11,7 +11,7 @@ module.exports = (
const start = (worker) => {
console.log(`[${worker.id}]: Start ${id}, action=${action}`);
sendPacket(worker, {
send(worker, {
workerId: worker.id,
jobId: id,
action,

0
src/common/createScheduler.js → src/createScheduler.js

0
src/common/createTesseract.js → src/createTesseract.js

100
src/createWorker.js

@ -0,0 +1,100 @@ @@ -0,0 +1,100 @@
const resolvePaths = require('./utils/resolvePaths');
const circularize = require('./utils/circularize');
const createJob = require('./createJob');
const { defaultOEM } = require('./constants/config');
const {
defaultOptions,
spawnWorker,
terminateWorker,
onMessage,
} = require('./worker/node');
let workerCounter = 0;
module.exports = (_options = {}) => {
workerCounter += 1;
const id = `Worker-${workerCounter}-${Math.random().toString(16).slice(3, 8)}`;
const options = resolvePaths({
...defaultOptions,
..._options,
});
const { logger } = options;
const resolves = {};
const rejects = {};
let worker = spawnWorker(options);
const setResolve = (action, res) => {
resolves[action] = res;
};
const setReject = (action, rej) => {
rejects[action] = rej;
};
const doJob = (action, payload) => (
new Promise((resolve, reject) => {
setResolve(action, resolve);
setReject(action, reject);
createJob(action, payload).start({ worker, id });
})
);
const load = () => (
doJob('load', { options })
);
const loadLanguage = (langs = 'eng') => (
doJob('load-language', { langs, options })
);
const initialize = (langs = 'eng', oem = defaultOEM) => (
doJob('initialize', { langs, oem })
);
const setParameters = (params = {}) => (
doJob('set-parameters', { params })
);
const terminate = () => {
if (worker !== null) {
terminateWorker(worker);
worker = null;
}
};
onMessage(worker, (packet) => {
const { status, action, data } = packet;
if (status === 'resolve') {
if (action === 'load') {
resolves.load(data);
} else if (action === 'initialize') {
resolves.initialize({ id });
} else if (action === 'set-parameters') {
resolves['set-parameters'](data);
} else if (action === 'load-language') {
resolves['load-language'](data);
} else if (action === 'recognize') {
resolves.recognize(circularize(data));
} else if (action === 'detect') {
resolves.detect(data);
}
} else if (status === 'reject') {
rejects[action](data);
throw Error(data);
} else if (status === 'progress') {
logger(data);
}
});
return {
id,
worker,
setResolve,
setReject,
load,
loadLanguage,
initialize,
setParameters,
terminate,
};
};

19
src/index.js

@ -7,20 +7,15 @@ @@ -7,20 +7,15 @@
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const utils = require('tesseract.js-utils');
const TesseractWorker = require('./common/TesseractWorker');
const createScheduler = require('./common/createScheduler');
const createWorker = require('./common/createWorker');
const createJob = require('./common/createJob');
const types = require('./common/types');
const createScheduler = require('./createScheduler');
const createWorker = require('./createWorker');
const createJob = require('./createJob');
const OEM = require('./constants/OEM');
const PSM = require('./constants/PSM');
module.exports = {
/** Worker for OCR, @see common/TesseractWorker.js */
TesseractWorker,
/** Utilities for tesseract.js, @see {@link https://www.npmjs.com/package/tesseract.js-utils} */
utils,
/** Check ./common/types for more details */
...types,
OEM,
PSM,
createScheduler,
createWorker,
createJob,

1
src/node/b64toU8Array.js

@ -1 +0,0 @@ @@ -1 +0,0 @@
module.exports = s => Buffer.from(s, 'base64');

111
src/node/index.js

@ -1,111 +0,0 @@ @@ -1,111 +0,0 @@
/**
*
* Tesseract Worker adapter for node
*
* @fileoverview Tesseract Worker adapter for node
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const util = require('util');
const fs = require('fs');
const axios = require('axios');
const isURL = require('is-url');
const { fork } = require('child_process');
const path = require('path');
const b64toU8Array = require('./b64toU8Array');
const { defaultOptions } = require('../common/options');
const readFile = util.promisify(fs.readFile);
/**
 * loadImage
 *
 * @name loadImage
 * @function load image from different source
 * @access public
 * @param {string|Buffer} image - image source, supported formats:
 *   buffer: image buffer (returned as is)
 *   string: URL string (downloaded with axios)
 *   string: base64 data URI (decoded)
 *   string: file path (read from disk)
 * @returns {Promise} resolves with the binary image
 */
const loadImage = (image) => {
  /*
   * Buffers are checked first: the string checks below would stringify the
   * whole buffer, and a buffer whose text looks like a data URI would reach
   * `image.split`, which Buffers do not have.
   */
  if (Buffer.isBuffer(image)) {
    return Promise.resolve(image);
  }

  if (isURL(image)) {
    return axios.get(image, {
      responseType: 'arraybuffer',
    })
      .then(resp => resp.data);
  }

  if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
    return Promise.resolve(b64toU8Array(image.split(',')[1]));
  }

  // Anything else is treated as a path on the local file system.
  return readFile(image);
};
/*
* Default options for node worker
*/
exports.defaultOptions = {
...defaultOptions,
workerPath: path.join(__dirname, 'worker.js'),
};
/**
* spawnWorker
*
* @name spawnWorker
* @function fork a new process in node
* @access public
* @param {object} instance - TesseractWorker instance
* @param {object} options
* @param {string} options.workerPath - worker script path
*/
exports.spawnWorker = ({ workerPath }) => (
fork(workerPath)
);
exports.setOnMessage = (worker, handler) => {
worker.on('message', handler);
};
/**
* terminateWorker
*
* @name terminateWorker
* @function kill worker
* @access public
* @param {object} instance TesseractWorker instance
*/
exports.terminateWorker = ({ worker }) => {
worker.kill();
};
/**
* sendPacket
*
* @name sendPacket
* @function send packet to worker and create a job
* @access public
* @param {object} instance TesseractWorker instance
* @param {object} iPacket data for worker
*/
exports.sendPacket = ({ worker }, packet) => {
const p = { ...packet };
if (['recognize', 'detect'].includes(p.action)) {
loadImage(p.payload.image)
.then(buf => new Uint8Array(buf))
.then((img) => {
p.payload.image = Array.from(img);
worker.send(p);
});
} else {
worker.send(p);
}
};

44
src/node/worker.js

@ -1,44 +0,0 @@ @@ -1,44 +0,0 @@
/**
*
* Node worker implementation
*
* @fileoverview Node worker implementation
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const check = require('check-types');
const workerUtils = require('../common/workerUtils');
const b64toU8Array = require('./b64toU8Array');
/* Cached tesseract.js-core module; required lazily on the first getCore call */
let TesseractCore = null;
/*
* register message handler
*
* Every packet received from the parent process is passed to
* workerUtils.dispatchHandlers together with a callback that sends the
* reply back with process.send.
*/
process.on('message', (packet) => {
workerUtils.dispatchHandlers(packet, obj => process.send(obj));
});
/*
* getCore is a sync function to load and return
* TesseractCore.
*
* corePath is not used in this adapter; the core is required from the
* installed tesseract.js-core package.
*/
workerUtils.setAdapter({
getCore: (corePath, res) => {
if (check.null(TesseractCore)) {
res.progress({ status: 'loading tesseract core', progress: 0 });
TesseractCore = require('tesseract.js-core');
res.progress({ status: 'loaded tesseract core', progress: 1 });
}
return TesseractCore;
},
b64toU8Array,
/* Write data to path asynchronously; a write error is thrown from the fs callback */
writeFile: (path, data) => {
const fs = require('fs');
fs.writeFile(path, data, (err) => {
if (err) throw err;
});
},
});

57
src/utils/circularize.js

@ -0,0 +1,57 @@ @@ -0,0 +1,57 @@
/**
* In the recognition result of tesseract, there
* is a deep JSON object for details, it has around
*
* The result of dump.js is a big JSON tree
* which can be easily serialized (for instance
* to be sent from a webworker to the main app
* or through Node's IPC), but we want
* a (circular) DOM-like interface for walking
* through the data.
*
* A (circular) DOM-like interface here means that
* each child element
*
* @fileoverview DOM-like interface for walking through data
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
module.exports = (page) => {
const blocks = [];
const paragraphs = [];
const lines = [];
const words = [];
const symbols = [];
page.blocks.forEach((block) => {
block.paragraphs.forEach((paragraph) => {
paragraph.lines.forEach((line) => {
line.words.forEach((word) => {
word.symbols.forEach((sym) => {
symbols.push({
...sym, page, block, paragraph, line, word,
});
});
words.push({
...word, page, block, paragraph, line,
});
});
lines.push({
...line, page, block, paragraph,
});
});
paragraphs.push({
...paragraph, page, block,
});
});
blocks.push({
...block, page,
});
});
return {
...page, blocks, paragraphs, lines, words, symbols,
};
};

10
src/utils/getEnvironment.js

@ -0,0 +1,10 @@ @@ -0,0 +1,10 @@
module.exports = (key) => {
const env = {
type: (typeof window !== 'undefined') && (typeof window.document !== 'undefined') ? 'browser' : 'node',
};
if (typeof key === 'undefined') {
return env;
}
return env[key];
};

12
src/utils/resolvePaths.js

@ -0,0 +1,12 @@ @@ -0,0 +1,12 @@
const isBrowser = require('./getEnvironment')('type') === 'browser';
const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disable-line
module.exports = (options) => {
const opts = { ...options };
['corePath', 'workerPath', 'langPath'].forEach((key) => {
if (typeof options[key] !== 'undefined') {
opts[key] = resolveURL(opts[key]);
}
});
return opts;
};

0
src/worker-script/browser/index.js

24
src/worker-script/constants/defaultParams.js

@ -0,0 +1,24 @@ @@ -0,0 +1,24 @@
/*
* default params for tesseract.js
*/
const PSM = require('../../constants/PSM');
module.exports = {
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '',
tessjs_create_pdf: '0',
tessjs_create_hocr: '1',
tessjs_create_tsv: '1',
tessjs_create_box: '0',
tessjs_create_unlv: '0',
tessjs_create_osd: '0',
tessjs_textonly_pdf: '0',
tessjs_pdf_name: 'tesseract.js-ocr-result',
tessjs_pdf_title: 'Tesseract.js OCR Result',
tessjs_pdf_auto_download: true,
tessjs_pdf_bin: false,
tessjs_image_rectangle_left: 0,
tessjs_image_rectangle_top: 0,
tessjs_image_rectangle_width: -1,
tessjs_image_rectangle_height: -1,
};

110
src/common/workerUtils.js → src/worker-script/index.js

@ -1,18 +1,18 @@ @@ -1,18 +1,18 @@
/**
*
* Worker utilities for browser and node
* Worker script for browser and node
*
* @fileoverview Worker utilities for browser and node
* @fileoverview Worker script for browser and node
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const { loadLang } = require('tesseract.js-utils');
const pdfTTF = require('./pdf-ttf');
const dump = require('./dump');
const { OEM, PSM } = require('./types');
const { isBrowser } = require('./env');
const { setImage, getLangsStr, getFiles } = require('./utils');
const dump = require('./utils/dump');
const isBrowser = require('../utils/getEnvironment')('type') === 'browser';
const setImage = require('./utils/setImage');
const getFiles = require('./utils/getFiles');
const defaultParams = require('./constants/defaultParams');
/*
* Tesseract Module returned by TesseractCore.
@ -24,7 +24,7 @@ let TessModule; @@ -24,7 +24,7 @@ let TessModule;
let api;
let latestJob;
let adapter = {};
let curParams = {};
let params = defaultParams;
/**
@ -38,7 +38,7 @@ let curParams = {}; @@ -38,7 +38,7 @@ let curParams = {};
* @param {object} res - job instance
* @returns {Promise} A Promise for callback
*/
const load = ({ workerId, jobId, payload: { corePath } }, res) => {
const load = ({ workerId, jobId, payload: { options: { corePath } } }, res) => {
if (!TessModule) {
const Core = adapter.getCore(corePath, res);
@ -56,8 +56,6 @@ const load = ({ workerId, jobId, payload: { corePath } }, res) => { @@ -56,8 +56,6 @@ const load = ({ workerId, jobId, payload: { corePath } }, res) => {
})
.then((tessModule) => {
TessModule = tessModule;
TessModule.FS.writeFile('/pdf.ttf', adapter.b64toU8Array(pdfTTF));
api = new TessModule.TessBaseAPI();
res.progress({ workerId, status: 'initialized tesseract', progress: 1 });
res.resolve({ loaded: true });
});
@ -83,52 +81,55 @@ const loadLanguage = ({ workerId, payload: { langs, options } }, res) => { @@ -83,52 +81,55 @@ const loadLanguage = ({ workerId, payload: { langs, options } }, res) => {
loadLang({ langs, TessModule, ...options }).then(() => {
res.progress({ workerId, status: 'loaded language traineddata', progress: 1 });
res.resolve(langs);
}).catch((e) => {
if (isBrowser && e instanceof DOMException) {
}).catch((err) => {
if (isBrowser && err instanceof DOMException) {
/*
* For some reason google chrome throw DOMException in loadLang,
* while other browser is OK, for now we ignore this exception
* and hopefully to find the root cause one day.
*/
} else {
res.reject(e.toString());
res.reject(err.toString());
}
});
};
/**
 * Apply parameters to the current API instance and remember them.
 *
 * Every key that does not start with `tessjs_` is passed to api.SetVariable;
 * all keys (including `tessjs_` ones) are merged into the module-level params.
 *
 * @param {object} packet - packet whose payload.params holds the new values
 * @param {object} [res] - job instance; when given, it is resolved with the
 *   merged parameters
 */
const setParameters = ({ payload: { params: _params } }, res) => {
  Object.keys(_params).forEach((name) => {
    if (name.startsWith('tessjs_')) {
      return;
    }
    api.SetVariable(name, _params[name]);
  });
  params = { ...params, ..._params };
  if (typeof res === 'undefined') {
    return;
  }
  res.resolve(params);
};
const initialize = ({
workerId,
jobId,
payload: { langs, params },
payload: { langs: _langs, oem },
}, res) => {
let { tessedit_ocr_engine_mode: oem } = params;
let l = langs;
const langs = (typeof _langs === 'string')
? _langs
: _langs.map(l => ((typeof l === 'string') ? l : l.data)).join('+');
res.progress({
workerId, jobId, status: 'initializing api', progress: 0,
});
if ([
PSM.OSD_ONLY,
PSM.AUTO_OSD,
PSM.RAW_LINE,
].includes(params.tessedit_pageseg_mode)) {
l = (typeof l === 'string') ? `${l}+osd` : [...l, 'osd'];
// oem = OEM.TESSERACT_ONLY;
try {
res.progress({
workerId, jobId, status: 'initializing api', progress: 0,
});
api = new TessModule.TessBaseAPI();
api.Init(null, langs, oem);
setParameters({ payload: { params } });
res.progress({
workerId, jobId, status: 'initialized api', progress: 1,
});
res.resolve();
} catch (err) {
res.reject(err.toString());
}
api.Init(null, getLangsStr(l), oem);
Object.keys(params).forEach((key) => {
if (!key.startsWith('tessjs')) {
api.SetVariable(key, params[key]);
}
});
curParams = {
tessedit_ocr_engine_mode: oem,
...params,
};
res.progress({
workerId, jobId, status: 'initialized api', progress: 1,
});
res.resolve();
};
/**
@ -146,11 +147,11 @@ const initialize = ({ @@ -146,11 +147,11 @@ const initialize = ({
*/
const recognize = ({ payload: { image } }, res) => {
try {
const ptr = setImage(TessModule, api, image, curParams);
const ptr = setImage(TessModule, api, image, params);
api.Recognize(null);
res.resolve({
files: getFiles(TessModule, api, adapter, curParams),
...dump(TessModule, api, curParams),
files: getFiles(TessModule, api, adapter, params),
...dump(TessModule, api, params),
});
TessModule._free(ptr);
} catch (err) {
@ -172,7 +173,7 @@ const recognize = ({ payload: { image } }, res) => { @@ -172,7 +173,7 @@ const recognize = ({ payload: { image } }, res) => {
*/
const detect = ({ payload: { image } }, res) => {
try {
const ptr = setImage(TessModule, api, image, curParams);
const ptr = setImage(TessModule, api, image, params);
const results = new TessModule.OSResults();
if (!api.DetectOS(results)) {
@ -199,6 +200,15 @@ const detect = ({ payload: { image } }, res) => { @@ -199,6 +200,15 @@ const detect = ({ payload: { image } }, res) => {
}
};
/**
* terminate
*
* Calls End() on the module-level Tesseract API instance and resolves the
* job with `{ terminated: true }`. Any exception thrown inside the try block
* is converted to a string and used to reject the job.
*
* @param {object} _ - packet (unused)
* @param {object} res - job instance providing resolve() and reject()
*/
const terminate = (_, res) => {
try {
api.End();
res.resolve({ terminated: true });
} catch (err) {
res.reject(err.toString());
}
};
/**
* dispatchHandlers
*
@ -233,10 +243,14 @@ exports.dispatchHandlers = (packet, send) => { @@ -233,10 +243,14 @@ exports.dispatchHandlers = (packet, send) => {
loadLanguage(packet, res);
} else if (action === 'initialize') {
initialize(packet, res);
} else if (action === 'set-parameters') {
setParameters(packet, res);
} else if (action === 'recognize') {
recognize(packet, res);
} else if (action === 'detect') {
detect(packet, res);
} else if (action === 'terminate') {
terminate(packet, res);
}
} catch (err) {
/** Prepare exception to travel through postMessage */
@ -250,8 +264,8 @@ exports.dispatchHandlers = (packet, send) => { @@ -250,8 +264,8 @@ exports.dispatchHandlers = (packet, send) => {
* @name setAdapter
* @function
* @access public
* @param {object} impl - implementation of the worker, different in browser and node environment
* @param {object} adapter - implementation of the worker, different in browser and node environment
*/
exports.setAdapter = (impl) => {
adapter = impl;
exports.setAdapter = (_adapter) => {
adapter = _adapter;
};

7
src/worker-script/node/exportFile.js

@ -0,0 +1,7 @@ @@ -0,0 +1,7 @@
const fs = require('fs');
module.exports = (path, data) => {
fs.writeFile(path, data, (err) => {
if (err) throw err;
});
};

39
src/worker-script/node/index.js

@ -0,0 +1,39 @@ @@ -0,0 +1,39 @@
/**
*
* Tesseract Worker Script for Node
*
* @fileoverview Node worker implementation
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const worker = require('../');
const exportFile = require('./exportFile');
let TesseractCore = null;
/*
* register message handler: every packet received on the process 'message'
* event is handed to worker.dispatchHandlers, together with a callback that
* forwards reply objects through process.send
*/
process.on('message', (packet) => {
worker.dispatchHandlers(packet, obj => process.send(obj));
});
/*
 * getCore synchronously loads tesseract.js-core on first use, caches it in
 * TesseractCore and returns the cached module on every call. Progress is
 * reported through res.progress only when the core is actually loaded.
 * The first argument is unused.
 */
const getCore = (_, res) => {
  if (TesseractCore !== null) {
    return TesseractCore;
  }
  res.progress({ status: 'loading tesseract core', progress: 0 });
  TesseractCore = require('tesseract.js-core');
  res.progress({ status: 'loaded tesseract core', progress: 1 });
  return TesseractCore;
};
/*
* register the node-specific adapter (core loader and file export)
* with the shared worker script
*/
worker.setAdapter({
getCore,
exportFile,
});

62
src/common/dump.js → src/worker-script/utils/dump.js

@ -50,6 +50,13 @@ module.exports = (TessModule, api, { @@ -50,6 +50,13 @@ module.exports = (TessModule, api, {
tessjs_create_osd,
}) => {
const ri = api.GetIterator();
const {
RIL_BLOCK,
RIL_PARA,
RIL_TEXTLINE,
RIL_WORD,
RIL_SYMBOL,
} = TessModule;
const blocks = [];
let block;
let para;
@ -59,14 +66,13 @@ module.exports = (TessModule, api, { @@ -59,14 +66,13 @@ module.exports = (TessModule, api, {
const enumToString = (value, prefix) => (
Object.keys(TessModule)
.filter(e => (e.substr(0, prefix.length + 1) === `${prefix}_`))
.filter(e => TessModule[e] === value)
.filter(e => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
.map(e => e.slice(prefix.length + 1))[0]
);
ri.Begin();
do {
if (ri.IsAtBeginningOf(TessModule.RIL_BLOCK)) {
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
const poly = ri.BlockPolygon();
let polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
@ -86,47 +92,47 @@ module.exports = (TessModule, api, { @@ -86,47 +92,47 @@ module.exports = (TessModule, api, {
block = {
paragraphs: [],
text: ri.GetUTF8Text(TessModule.RIL_BLOCK),
confidence: ri.Confidence(TessModule.RIL_BLOCK),
baseline: ri.getBaseline(TessModule.RIL_BLOCK),
bbox: ri.getBoundingBox(TessModule.RIL_BLOCK),
text: ri.GetUTF8Text(RIL_BLOCK),
confidence: ri.Confidence(RIL_BLOCK),
baseline: ri.getBaseline(RIL_BLOCK),
bbox: ri.getBoundingBox(RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon,
};
blocks.push(block);
}
if (ri.IsAtBeginningOf(TessModule.RIL_PARA)) {
if (ri.IsAtBeginningOf(RIL_PARA)) {
para = {
lines: [],
text: ri.GetUTF8Text(TessModule.RIL_PARA),
confidence: ri.Confidence(TessModule.RIL_PARA),
baseline: ri.getBaseline(TessModule.RIL_PARA),
bbox: ri.getBoundingBox(TessModule.RIL_PARA),
text: ri.GetUTF8Text(RIL_PARA),
confidence: ri.Confidence(RIL_PARA),
baseline: ri.getBaseline(RIL_PARA),
bbox: ri.getBoundingBox(RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr(),
};
block.paragraphs.push(para);
}
if (ri.IsAtBeginningOf(TessModule.RIL_TEXTLINE)) {
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
textline = {
words: [],
text: ri.GetUTF8Text(TessModule.RIL_TEXTLINE),
confidence: ri.Confidence(TessModule.RIL_TEXTLINE),
baseline: ri.getBaseline(TessModule.RIL_TEXTLINE),
bbox: ri.getBoundingBox(TessModule.RIL_TEXTLINE),
text: ri.GetUTF8Text(RIL_TEXTLINE),
confidence: ri.Confidence(RIL_TEXTLINE),
baseline: ri.getBaseline(RIL_TEXTLINE),
bbox: ri.getBoundingBox(RIL_TEXTLINE),
};
para.lines.push(textline);
}
if (ri.IsAtBeginningOf(TessModule.RIL_WORD)) {
if (ri.IsAtBeginningOf(RIL_WORD)) {
const fontInfo = ri.getWordFontAttributes();
const wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
text: ri.GetUTF8Text(TessModule.RIL_WORD),
confidence: ri.Confidence(TessModule.RIL_WORD),
baseline: ri.getBaseline(TessModule.RIL_WORD),
bbox: ri.getBoundingBox(TessModule.RIL_WORD),
text: ri.GetUTF8Text(RIL_WORD),
confidence: ri.Confidence(RIL_WORD),
baseline: ri.getBaseline(RIL_WORD),
bbox: ri.getBoundingBox(RIL_WORD),
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
@ -159,14 +165,14 @@ module.exports = (TessModule, api, { @@ -159,14 +165,14 @@ module.exports = (TessModule, api, {
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// TessModule._pixDestroy(TessModule.getPointer(pix));
if (ri.IsAtBeginningOf(TessModule.RIL_SYMBOL)) {
if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
symbol = {
choices: [],
image: null,
text: ri.GetUTF8Text(TessModule.RIL_SYMBOL),
confidence: ri.Confidence(TessModule.RIL_SYMBOL),
baseline: ri.getBaseline(TessModule.RIL_SYMBOL),
bbox: ri.getBoundingBox(TessModule.RIL_SYMBOL),
text: ri.GetUTF8Text(RIL_SYMBOL),
confidence: ri.Confidence(RIL_SYMBOL),
baseline: ri.getBaseline(RIL_SYMBOL),
bbox: ri.getBoundingBox(RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
@ -181,7 +187,7 @@ module.exports = (TessModule, api, { @@ -181,7 +187,7 @@ module.exports = (TessModule, api, {
} while (ci.Next());
// TessModule.destroy(i);
}
} while (ri.Next(TessModule.RIL_SYMBOL));
} while (ri.Next(RIL_SYMBOL));
TessModule.destroy(ri);
return {

39
src/worker-script/utils/getFiles.js

@ -0,0 +1,39 @@ @@ -0,0 +1,39 @@
/**
* handleOutput
*
* @name handleOutput
* @function handle file output
* @access private
* @param {object} customParams - an object of params
*/
module.exports = (TessModule, api, adapter, params) => {
let files = {};
const {
tessjs_create_pdf,
tessjs_textonly_pdf,
tessjs_pdf_name,
tessjs_pdf_title,
tessjs_pdf_auto_download,
tessjs_pdf_bin,
} = params;
if (tessjs_create_pdf === '1') {
const pdfRenderer = new TessModule.TessPDFRenderer(tessjs_pdf_name, '/', tessjs_textonly_pdf === '1');
pdfRenderer.BeginDocument(tessjs_pdf_title);
pdfRenderer.AddImage(api);
pdfRenderer.EndDocument();
TessModule._free(pdfRenderer);
const data = TessModule.FS.readFile(`/${tessjs_pdf_name}.pdf`);
if (tessjs_pdf_bin) {
files = { pdf: data, ...files };
}
if (tessjs_pdf_auto_download) {
adapter.exportFile(`${tessjs_pdf_name}.pdf`, data, 'application/pdf');
}
}
return files;
};

5
src/worker-script/utils/getLangStr.js

@ -0,0 +1,5 @@ @@ -0,0 +1,5 @@
module.exports = langs => (
typeof langs === 'string'
? langs
: langs.map(lang => (typeof lang === 'string' ? lang : lang.data)).join('+')
);

41
src/worker-script/utils/setImage.js

@ -0,0 +1,41 @@ @@ -0,0 +1,41 @@
const { readImage } = require('tesseract.js-utils');
/**
* setImage
*
* @name setImage
* @function set image in tesseract for recognition
* @access public
* @param {array} image - binary array in array format
* @returns {number} - an emscripten pointer of the image
*/
module.exports = (TessModule, api, image, params) => {
const {
tessjs_image_rectangle_left: left,
tessjs_image_rectangle_top: top,
tessjs_image_rectangle_width: width,
tessjs_image_rectangle_height: height,
} = params;
const {
w, h, bytesPerPixel, data, pix,
} = readImage(TessModule, Array.from(image));
/*
* As some image format (ex. bmp) is not supported natiely by tesseract,
* sometimes it will not return pix directly, but data and bytesPerPixel
* for another SetImage usage.
*
*/
if (data === null) {
api.SetImage(pix);
} else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel);
}
api.SetRectangle(
(left < 0) ? 0 : left,
(top < 0) ? 0 : top,
(width < 0) ? w : width,
(height < 0) ? h : height,
);
return data === null ? pix : data;
};

0
src/browser/b64toU8Array.js → src/worker/browser/b64toU8Array.js

0
src/browser/index.js → src/worker/browser/index.js

8
src/browser/worker.js → src/worker/browser/worker.js

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
/**
*
* Browser worker implementation
* Browser worker scripts
*
* @fileoverview Browser worker implementation
* @author Kevin Kwok <antimatter15@gmail.com>
@ -9,21 +9,21 @@ @@ -9,21 +9,21 @@
*/
const check = require('check-types');
const workerUtils = require('../common/workerUtils');
const workerWrapper = require('../../workerWrapper');
const b64toU8Array = require('./b64toU8Array');
/*
* register message handler
*/
global.addEventListener('message', ({ data }) => {
workerUtils.dispatchHandlers(data, obj => postMessage(obj));
workerWrapper.dispatchHandlers(data, obj => postMessage(obj));
});
/*
* getCore is a sync function to load and return
* TesseractCore.
*/
workerUtils.setAdapter({
workerWrapper.setAdapter({
getCore: (corePath, res) => {
if (check.undefined(global.TesseractCore)) {
res.progress({ status: 'loading tesseract core', progress: 0 });

10
src/worker/node/defaultOptions.js

@ -0,0 +1,10 @@ @@ -0,0 +1,10 @@
const path = require('path');
const defaultOptions = require('../../constants/defaultOptions');
/*
* Default options for node worker: the shared default options extended with
* workerPath, the location of the node worker script (worker-script/node/index.js)
* computed relative to this file's directory.
*/
module.exports = {
...defaultOptions,
workerPath: path.join(__dirname, '..', '..', 'worker-script', 'node', 'index.js'),
};

22
src/worker/node/index.js

@ -0,0 +1,22 @@ @@ -0,0 +1,22 @@
/**
*
* Tesseract Worker impl. for node (using child_process)
*
* @fileoverview Tesseract Worker impl. for node
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const defaultOptions = require('./defaultOptions');
const spawnWorker = require('./spawnWorker');
const terminateWorker = require('./terminateWorker');
const onMessage = require('./onMessage');
const send = require('./send');
/*
* Re-export the node worker building blocks (default options, process
* spawn/terminate helpers, message subscription and packet sending)
* as a single object.
*/
module.exports = {
defaultOptions,
spawnWorker,
terminateWorker,
onMessage,
send,
};

3
src/worker/node/onMessage.js

@ -0,0 +1,3 @@ @@ -0,0 +1,3 @@
/**
* onMessage
*
* @name onMessage
* @function subscribe handler to the worker's 'message' event
* @access public
* @param {object} worker - object exposing on(); presumably the child process
*   returned by spawnWorker — confirm against callers
* @param {function} handler - callback registered for 'message'
*/
module.exports = (worker, handler) => {
worker.on('message', handler);
};

61
src/worker/node/send.js

@ -0,0 +1,61 @@ @@ -0,0 +1,61 @@
const util = require('util');
const fs = require('fs');
const axios = require('axios');
const isURL = require('is-url');
const readFile = util.promisify(fs.readFile);
/**
 * loadImage
 *
 * @name loadImage
 * @function load image from different source
 * @access public
 * @param {string|Buffer} image - image source, supported formats:
 *   string: URL string or file path
 *   string: base64 image (data URI)
 *   buffer: image buffer
 * @returns {Promise} resolves with the binary image data (the response body
 *   for URLs, a Buffer for data URIs and buffers, the file content for paths)
 */
const loadImage = (image) => {
  // Buffers are handled first: running the data-URI regex below on a Buffer
  // stringifies the whole buffer, and a buffer whose bytes happen to match
  // would then fail on the non-existent Buffer#split.
  if (Buffer.isBuffer(image)) {
    return Promise.resolve(image);
  }

  if (isURL(image)) {
    return axios.get(image, {
      responseType: 'arraybuffer',
    })
      .then(resp => resp.data);
  }

  if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
    // Everything after the first comma is the base64 payload.
    return Promise.resolve(Buffer.from(image.split(',')[1], 'base64'));
  }

  // Anything else is treated as a file path.
  return readFile(image);
};
/**
* send
*
* @name send
* @function send packet to worker and create a job
* @access public
* @param {object} instance TesseractWorker instance
* @param {object} iPacket data for worker
*/
module.exports = ({ worker }, packet) => {
const p = { ...packet };
if (['recognize', 'detect'].includes(p.action)) {
loadImage(p.payload.image)
.then(buf => new Uint8Array(buf))
.then((img) => {
p.payload.image = Array.from(img);
worker.send(p);
});
} else {
worker.send(p);
}
};

15
src/worker/node/spawnWorker.js

@ -0,0 +1,15 @@ @@ -0,0 +1,15 @@
const { fork } = require('child_process');
/**
* spawnWorker
*
* @name spawnWorker
* @function fork a new process in node
* @access public
* @param {object} options
* @param {string} options.workerPath - worker script path
* @returns {object} the value returned by child_process.fork(workerPath)
*/
module.exports = ({ workerPath }) => (
fork(workerPath)
);

11
src/worker/node/terminateWorker.js

@ -0,0 +1,11 @@ @@ -0,0 +1,11 @@
/**
* terminateWorker
*
* @name terminateWorker
* @function kill worker
* @access public
* @param {object} worker - object exposing kill(); presumably the child
*   process returned by spawnWorker — confirm against callers
*/
module.exports = (worker) => {
worker.kill();
};
Loading…
Cancel
Save