Browse Source

Add comments and remove unused files

pull/265/head
Jerome Wu 6 years ago
parent
commit
cfb7d5673c
  1. 56
      src/browser/index.js
  2. 26
      src/browser/worker.js
  3. 19
      src/common/circularize.js
  4. 123
      src/common/dump.js
  5. 1
      src/common/langdata.json
  6. 5
      src/common/options.js
  7. 117
      src/common/workerUtils.js
  8. 49
      src/node/index.js
  9. 19
      src/node/worker.js

56
src/browser/index.js

@ -1,8 +1,28 @@ @@ -1,8 +1,28 @@
/**
*
* Tesseract Worker adapter for browser
*
* @fileoverview Tesseract Worker adapter for browser
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const check = require('check-types');
const resolveURL = require('resolve-url');
const { defaultOptions } = require('../common/options');
const { version } = require('../../package.json');
/**
* loadImage
*
* @name loadImage
* @function load image from different source
* @access public
* @param {string|object} image - image source, supported formats:
* string: URL string, can be relative path
* File instance: data from <input type="file" />
* @returns {Promise} a Promise for the binary image in array format
*/
const loadImage = (image) => {
if (check.string(image)) {
return fetch(resolveURL(image))
@ -20,14 +40,31 @@ const loadImage = (image) => { @@ -20,14 +40,31 @@ const loadImage = (image) => {
return Promise.reject();
};
/*
* Default options for browser worker
*/
exports.defaultOptions = {
...defaultOptions,
workerPath: process.env.NODE_ENV === 'development'
? resolveURL(`/dist/worker.dev.js?nocache=${Math.random().toString(36).slice(3)}`)
: `https://cdn.jsdelivr.net/gh/naptha/tesseract.js@v${version}/dist/worker.min.js`,
: `https://cdn.jsdelivr.net/gh/naptha/tesseract.js@v${version}/dist/worker.min.js`,
/*
* If browser doesn't support WebAssembly,
* load ASM version instead
*/
corePath: `https://cdn.jsdelivr.net/gh/naptha/tesseract.js-core@v2.0.0-beta.5/tesseract-core${typeof WebAssembly === 'object' ? '' : '.asm'}.js`,
};
/**
* spawnWorker
*
* @name spawnWorker
* @function create a new Worker in browser
* @access public
* @param {object} instance - TesseractWorker instance
* @param {object} options
* @param {string} options.workerPath - worker script path
*/
exports.spawnWorker = (instance, { workerPath }) => {
let worker;
if (window.Blob && window.URL) {
@ -44,10 +81,27 @@ exports.spawnWorker = (instance, { workerPath }) => { @@ -44,10 +81,27 @@ exports.spawnWorker = (instance, { workerPath }) => {
return worker;
};
/**
* terminateWorker
*
* @name terminateWorker
* @function terminate worker
* @access public
* @param {object} instance TesseractWorker instance
*/
exports.terminateWorker = (instance) => {
instance.worker.terminate();
};
/**
* sendPacket
*
* @name sendPacket
* @function send packet to worker and create a job
* @access public
* @param {object} instance TesseractWorker instance
* @param {object} iPacket data for worker
*/
exports.sendPacket = (instance, iPacket) => {
const packet = { ...iPacket };
loadImage(packet.payload.image)

26
src/browser/worker.js

@ -1,15 +1,37 @@ @@ -1,15 +1,37 @@
/**
*
* Browser worker implementation
*
* @fileoverview Browser worker implementation
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const check = require('check-types');
const resolveURL = require('resolve-url');
const workerUtils = require('../common/worker');
const workerUtils = require('../common/workerUtils');
/*
 * Register the message handler: the data of every incoming 'message'
 * event is handed to workerUtils.dispatchHandlers, together with a
 * callback that posts the response object back via postMessage.
 */
global.addEventListener('message', (event) => {
  const reply = (obj) => postMessage(obj);
  workerUtils.dispatchHandlers(event.data, reply);
});
/*
* getCore is a sync function to load and return
* TesseractCore.
*/
workerUtils.setAdapter({
getCore: (corePath, res) => {
if (!global.TesseractCore) {
if (check.undefined(global.TesseractCore)) {
res.progress({ status: 'loading tesseract core', progress: 0 });
global.importScripts(resolveURL(corePath));
/*
* Depending on whether the browser supports WebAssembly,
* the version of the TesseractCore will be different.
*/
global.TesseractCore = typeof WebAssembly === 'object' ? global.TesseractCoreWASM : global.TesseractCoreASM;
res.progress({ status: 'loading tesseract core', progress: 1 });
}

19
src/common/circularize.js

@ -1,9 +1,16 @@ @@ -1,9 +1,16 @@
// The result of dump.js is a big JSON tree
// which can be easily serialized (for instance
// to be sent from a webworker to the main app
// or through Node's IPC), but we want
// a (circular) DOM-like interface for walking
// through the data.
/**
* The result of dump.js is a big JSON tree
* which can be easily serialized (for instance
* to be sent from a webworker to the main app
* or through Node's IPC), but we want
* a (circular) DOM-like interface for walking
* through the data.
*
* @fileoverview DOM-like interface for walking through data
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
module.exports = (iPage) => {
const page = {

123
src/common/dump.js

@ -1,6 +1,25 @@ @@ -1,6 +1,25 @@
// the generated HOCR is excessively indented, so
// we get rid of that indentation
/**
*
* Dump data to a big JSON tree
*
* @fileoverview dump data to JSON tree
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
/**
* deindent
*
* The generated HOCR is excessively indented, so
* we get rid of that indentation
*
* @name deindent
* @function deindent string
* @access public
* @param {string} html HOCR in html format
* @returns {string} deindented HTML string
*/
const deindent = (html) => {
const lines = html.split('\n');
if (lines[0].substring(0, 2) === ' ') {
@ -13,8 +32,18 @@ const deindent = (html) => { @@ -13,8 +32,18 @@ const deindent = (html) => {
return lines.join('\n');
};
module.exports = (Module, base) => {
const ri = base.GetIterator();
/**
* dump
*
* @name dump
* @function dump recognition result to a JSON object
* @access public
* @param {object} TessModule TessModule from TesseractCore
* @param {object} api TesseractBaseAPI instance
* @returns {object} dumped JSON object
*/
module.exports = (TessModule, api) => {
const ri = api.GetIterator();
const blocks = [];
let block;
let para;
@ -23,19 +52,19 @@ module.exports = (Module, base) => { @@ -23,19 +52,19 @@ module.exports = (Module, base) => {
let symbol;
const enumToString = (value, prefix) => (
Object.keys(Module)
Object.keys(TessModule)
.filter(e => (e.substr(0, prefix.length + 1) === `${prefix}_`))
.filter(e => Module[e] === value)
.filter(e => TessModule[e] === value)
.map(e => e.slice(prefix.length + 1))[0]
);
ri.Begin();
do {
if (ri.IsAtBeginningOf(Module.RIL_BLOCK)) {
if (ri.IsAtBeginningOf(TessModule.RIL_BLOCK)) {
const poly = ri.BlockPolygon();
let polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if (Module.getPointer(poly) > 0) {
if (TessModule.getPointer(poly) > 0) {
const n = poly.get_n();
const px = poly.get_x();
const py = poly.get_y();
@ -43,52 +72,52 @@ module.exports = (Module, base) => { @@ -43,52 +72,52 @@ module.exports = (Module, base) => {
for (let i = 0; i < n; i += 1) {
polygon.push([px.getValue(i), py.getValue(i)]);
}
Module._ptaDestroy(Module.getPointer(poly));
TessModule._ptaDestroy(TessModule.getPointer(poly));
}
block = {
paragraphs: [],
text: ri.GetUTF8Text(Module.RIL_BLOCK),
confidence: ri.Confidence(Module.RIL_BLOCK),
baseline: ri.getBaseline(Module.RIL_BLOCK),
bbox: ri.getBoundingBox(Module.RIL_BLOCK),
text: ri.GetUTF8Text(TessModule.RIL_BLOCK),
confidence: ri.Confidence(TessModule.RIL_BLOCK),
baseline: ri.getBaseline(TessModule.RIL_BLOCK),
bbox: ri.getBoundingBox(TessModule.RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon,
};
blocks.push(block);
}
if (ri.IsAtBeginningOf(Module.RIL_PARA)) {
if (ri.IsAtBeginningOf(TessModule.RIL_PARA)) {
para = {
lines: [],
text: ri.GetUTF8Text(Module.RIL_PARA),
confidence: ri.Confidence(Module.RIL_PARA),
baseline: ri.getBaseline(Module.RIL_PARA),
bbox: ri.getBoundingBox(Module.RIL_PARA),
text: ri.GetUTF8Text(TessModule.RIL_PARA),
confidence: ri.Confidence(TessModule.RIL_PARA),
baseline: ri.getBaseline(TessModule.RIL_PARA),
bbox: ri.getBoundingBox(TessModule.RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr(),
};
block.paragraphs.push(para);
}
if (ri.IsAtBeginningOf(Module.RIL_TEXTLINE)) {
if (ri.IsAtBeginningOf(TessModule.RIL_TEXTLINE)) {
textline = {
words: [],
text: ri.GetUTF8Text(Module.RIL_TEXTLINE),
confidence: ri.Confidence(Module.RIL_TEXTLINE),
baseline: ri.getBaseline(Module.RIL_TEXTLINE),
bbox: ri.getBoundingBox(Module.RIL_TEXTLINE),
text: ri.GetUTF8Text(TessModule.RIL_TEXTLINE),
confidence: ri.Confidence(TessModule.RIL_TEXTLINE),
baseline: ri.getBaseline(TessModule.RIL_TEXTLINE),
bbox: ri.getBoundingBox(TessModule.RIL_TEXTLINE),
};
para.lines.push(textline);
}
if (ri.IsAtBeginningOf(Module.RIL_WORD)) {
if (ri.IsAtBeginningOf(TessModule.RIL_WORD)) {
const fontInfo = ri.getWordFontAttributes();
const wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
text: ri.GetUTF8Text(Module.RIL_WORD),
confidence: ri.Confidence(Module.RIL_WORD),
baseline: ri.getBaseline(Module.RIL_WORD),
bbox: ri.getBoundingBox(Module.RIL_WORD),
text: ri.GetUTF8Text(TessModule.RIL_WORD),
confidence: ri.Confidence(TessModule.RIL_WORD),
baseline: ri.getBaseline(TessModule.RIL_WORD),
bbox: ri.getBoundingBox(TessModule.RIL_WORD),
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
@ -105,54 +134,54 @@ module.exports = (Module, base) => { @@ -105,54 +134,54 @@ module.exports = (Module, base) => {
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
};
const wc = new Module.WordChoiceIterator(ri);
const wc = new TessModule.WordChoiceIterator(ri);
do {
word.choices.push({
text: wc.GetUTF8Text(),
confidence: wc.Confidence(),
});
} while (wc.Next());
Module.destroy(wc);
TessModule.destroy(wc);
textline.words.push(word);
}
// let image = null;
// var pix = ri.GetBinaryImage(Module.RIL_SYMBOL)
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// Module._pixDestroy(Module.getPointer(pix));
if (ri.IsAtBeginningOf(Module.RIL_SYMBOL)) {
// TessModule._pixDestroy(TessModule.getPointer(pix));
if (ri.IsAtBeginningOf(TessModule.RIL_SYMBOL)) {
symbol = {
choices: [],
image: null,
text: ri.GetUTF8Text(Module.RIL_SYMBOL),
confidence: ri.Confidence(Module.RIL_SYMBOL),
baseline: ri.getBaseline(Module.RIL_SYMBOL),
bbox: ri.getBoundingBox(Module.RIL_SYMBOL),
text: ri.GetUTF8Text(TessModule.RIL_SYMBOL),
confidence: ri.Confidence(TessModule.RIL_SYMBOL),
baseline: ri.getBaseline(TessModule.RIL_SYMBOL),
bbox: ri.getBoundingBox(TessModule.RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
};
word.symbols.push(symbol);
const ci = new Module.ChoiceIterator(ri);
const ci = new TessModule.ChoiceIterator(ri);
do {
symbol.choices.push({
text: ci.GetUTF8Text(),
confidence: ci.Confidence(),
});
} while (ci.Next());
// Module.destroy(i);
// TessModule.destroy(i);
}
} while (ri.Next(Module.RIL_SYMBOL));
Module.destroy(ri);
} while (ri.Next(TessModule.RIL_SYMBOL));
TessModule.destroy(ri);
return {
text: base.GetUTF8Text(),
html: deindent(base.GetHOCRText()),
confidence: base.MeanTextConf(),
text: api.GetUTF8Text(),
html: deindent(api.GetHOCRText()),
confidence: api.MeanTextConf(),
blocks,
psm: enumToString(base.GetPageSegMode(), 'PSM'),
oem: enumToString(base.oem(), 'OEM'),
version: base.Version(),
psm: enumToString(api.GetPageSegMode(), 'PSM'),
oem: enumToString(api.oem(), 'OEM'),
version: api.Version(),
};
};

1
src/common/langdata.json

@ -1 +0,0 @@ @@ -1 +0,0 @@
{"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441, "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656, "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719, "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130, "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221, "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615, "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473, "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571, "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944, "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087, "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610, "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680, "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338, "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244, "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246, "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566, "vie": 2195922}

5
src/common/options.js

@ -1,5 +1,10 @@ @@ -1,5 +1,10 @@
module.exports = {
defaultOptions: {
/*
* default path for downloading *.traineddata, this URL basically
* points to a github page, not using jsDelivr as there is is limitation
* of 20 MB.
*/
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
},
};

117
src/common/worker.js → src/common/workerUtils.js

@ -1,25 +1,66 @@ @@ -1,25 +1,66 @@
/**
*
* Worker utilities for browser and node
*
* @fileoverview Worker utilities for browser and node
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const { readImage, loadLang } = require('tesseract.js-utils');
const dump = require('./dump');
/*
* Tesseract Module returned by TesseractCore.
*/
let TessModule;
/*
* TesseractBaseAPI instance
*/
let api;
let latestJob;
let adapter = {};
/**
* setImage
*
* @name setImage
* @function set image in tesseract for recognition
* @access public
* @param {array} image - binary image in array format
* @returns {number} - an emscripten pointer of the image
*/
const setImage = (image) => {
const {
w, h, bytesPerPixel, data, pix,
} = readImage(TessModule, Array.from(image));
/*
* As some image formats (e.g. bmp) are not supported natively by tesseract,
* readImage sometimes does not return pix directly, but data and bytesPerPixel
* for the other SetImage signature.
*
*/
if (data === null) {
api.SetImage(pix);
} else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel);
}
api.SetRectangle(0, 0, w, h);
return data;
return data === null ? pix : data;
};
/**
* handleInit
*
* @name handleInit
* @function handle initialization of TessModule
* @access public
* @param {object} req - job payload
* @param {string} req.corePath - path to the tesseract-core.js
* @param {object} res - job instance
* @returns {Promise} A Promise for callback
*/
const handleInit = ({ corePath }, res) => {
if (!TessModule) {
const Core = adapter.getCore(corePath, res);
@ -41,6 +82,18 @@ const handleInit = ({ corePath }, res) => { @@ -41,6 +82,18 @@ const handleInit = ({ corePath }, res) => {
return Promise.resolve();
};
/**
* loadLanguage
*
* @name loadLanguage
* @function load language from remote or local cache
* @access public
* @param {object} req - job payload
* @param {string} req.lang - languages to load, ex: eng, eng+chi_tra
* @param {object} req.options - other options for loadLang function
* @param {object} res - job instance
* @returns {Promise} A Promise for callback
*/
const loadLanguage = ({ lang, options }, res) => {
res.progress({ status: 'loading language traineddata', progress: 0 });
return loadLang({ lang, TessModule, ...options }).then((...args) => {
@ -49,6 +102,19 @@ const loadLanguage = ({ lang, options }, res) => { @@ -49,6 +102,19 @@ const loadLanguage = ({ lang, options }, res) => {
});
};
/**
* handleRecognize
*
* @name handleRecognize
* @function handle recognition job
* @access public
* @param {object} req - job payload
* @param {array} req.image - binary image in array format
* @param {string} req.lang - languages to load, ex: eng, eng+chi_tra
* @param {object} req.options - other options for loadLang function
* @param {object} req.params - parameters for tesseract
* @param {object} res - job instance
*/
const handleRecognize = ({
image, lang, options, params,
}, res) => (
@ -77,7 +143,18 @@ const handleRecognize = ({ @@ -77,7 +143,18 @@ const handleRecognize = ({
))
);
/**
* handleDetect
*
* @name handleDetect
* @function handle detect (Orientation and Script Detection / OSD) job
* @access public
* @param {object} req - job payload
* @param {array} req.image - binary image in array format
* @param {string} req.lang - languages to load, ex: eng, eng+chi_tra
* @param {object} req.options - other options for loadLang function
* @param {object} res - job instance
*/
const handleDetect = ({
image, lang, options,
}, res) => (
@ -115,8 +192,20 @@ const handleDetect = ({ @@ -115,8 +192,20 @@ const handleDetect = ({
))
);
/**
* dispatchHandlers
*
* @name dispatchHandlers
* @function worker data handler
* @access public
* @param {object} data
* @param {string} data.jobId - unique job id
* @param {string} data.action - action of the job, only recognize and detect for now
* @param {object} data.payload - data for the job
* @param {function} send - callback used to send the response packet ({ jobId, status, ..., data }) back to the caller
*/
exports.dispatchHandlers = ({ jobId, action, payload }, send) => {
const respond = (status, data) => {
const res = (status, data) => {
send({
jobId,
status,
@ -124,24 +213,32 @@ exports.dispatchHandlers = ({ jobId, action, payload }, send) => { @@ -124,24 +213,32 @@ exports.dispatchHandlers = ({ jobId, action, payload }, send) => {
data,
});
};
respond.resolve = respond.bind(this, 'resolve');
respond.reject = respond.bind(this, 'reject');
respond.progress = respond.bind(this, 'progress');
res.resolve = res.bind(this, 'resolve');
res.reject = res.bind(this, 'reject');
res.progress = res.bind(this, 'progress');
latestJob = respond;
latestJob = res;
try {
if (action === 'recognize') {
handleRecognize(payload, respond);
handleRecognize(payload, res);
} else if (action === 'detect') {
handleDetect(payload, respond);
handleDetect(payload, res);
}
} catch (err) {
/** Prepare exception to travel through postMessage */
respond.reject(err.toString());
res.reject(err.toString());
}
};
/**
* setAdapter
*
* @name setAdapter
* @function install the environment-specific adapter used by the worker utilities
* @access public
* @param {object} impl - implementation of the worker, different in browser and node environment;
* handleInit calls its getCore(corePath, res) method
*/
exports.setAdapter = (impl) => {
adapter = impl;
};

49
src/node/index.js

@ -1,3 +1,12 @@ @@ -1,3 +1,12 @@
/**
*
* Tesseract Worker adapter for node
*
* @fileoverview Tesseract Worker adapter for node
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const util = require('util');
const fs = require('fs');
const fetch = require('node-fetch');
@ -8,6 +17,16 @@ const { defaultOptions } = require('../common/options'); @@ -8,6 +17,16 @@ const { defaultOptions } = require('../common/options');
const readFile = util.promisify(fs.readFile);
/**
* loadImage
*
* @name loadImage
* @function load image from different source
* @access public
* @param {string} image - image source, supported formats:
* string: URL string or file path
* @returns {Promise} a Promise for the binary image in array format
*/
const loadImage = (image) => {
if (isURL(image)) {
return fetch(image)
@ -16,11 +35,24 @@ const loadImage = (image) => { @@ -16,11 +35,24 @@ const loadImage = (image) => {
return readFile(image);
};
/*
* Default options for node worker
*/
exports.defaultOptions = {
...defaultOptions,
workerPath: path.join(__dirname, 'worker.js'),
};
/**
* spawnWorker
*
* @name spawnWorker
* @function fork a new process in node
* @access public
* @param {object} instance - TesseractWorker instance
* @param {object} options
* @param {string} options.workerPath - worker script path
*/
exports.spawnWorker = (instance, { workerPath }) => {
const cp = fork(workerPath);
cp.on('message', (packet) => {
@ -29,10 +61,27 @@ exports.spawnWorker = (instance, { workerPath }) => { @@ -29,10 +61,27 @@ exports.spawnWorker = (instance, { workerPath }) => {
return cp;
};
/**
* terminateWorker
*
* @name terminateWorker
* @function kill worker
* @access public
* @param {object} instance TesseractWorker instance
*/
exports.terminateWorker = (instance) => {
instance.worker.kill();
};
/**
* sendPacket
*
* @name sendPacket
* @function send packet to worker and create a job
* @access public
* @param {object} instance TesseractWorker instance
* @param {object} iPacket data for worker
*/
exports.sendPacket = (instance, iPacket) => {
const packet = { ...iPacket };
loadImage(packet.payload.image)

19
src/node/worker.js

@ -1,12 +1,29 @@ @@ -1,12 +1,29 @@
/**
*
* Node worker implementation
*
* @fileoverview Node worker implementation
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const check = require('check-types');
const workerUtils = require('../common/worker');
const workerUtils = require('../common/workerUtils');
let TesseractCore = null;
/*
 * Register the message handler: every 'message' event on `process` is
 * handed to workerUtils.dispatchHandlers, together with a callback that
 * sends the response object back via process.send.
 */
process.on('message', (packet) => {
  const reply = (obj) => process.send(obj);
  workerUtils.dispatchHandlers(packet, reply);
});
/*
* getCore is a sync function to load and return
* TesseractCore.
*/
workerUtils.setAdapter({
getCore: (corePath, res) => {
if (check.null(TesseractCore)) {

Loading…
Cancel
Save