Pure Javascript OCR for more than 100 Languages 📖🎉🖥
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

248 lines
6.7 KiB

/**
*
* Worker utilities for browser and node
*
* @fileoverview Worker utilities for browser and node
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const { readImage, loadLang } = require('tesseract.js-utils');
const check = require('check-types');
const dump = require('./dump');
/*
* Tesseract Module returned by TesseractCore.
*/
let TessModule;
/*
* TessearctBaseAPI instance
*/
let api;
let latestJob;
let adapter = {};
/**
* setImage
*
* @name setImage
* @function set image in tesseract for recognition
* @access public
* @param {array} image - binary array in array format
* @returns {number} - an emscripten pointer of the image
*/
const setImage = (image) => {
const {
w, h, bytesPerPixel, data, pix,
} = readImage(TessModule, Array.from(image));
/*
* As some image format (ex. bmp) is not supported natiely by tesseract,
* sometimes it will not return pix directly, but data and bytesPerPixel
* for another SetImage usage.
*
*/
if (data === null) {
api.SetImage(pix);
} else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel);
}
api.SetRectangle(0, 0, w, h);
return data === null ? pix : data;
};
/**
* handleInit
*
* @name handleInit
* @function handle initialization of TessModule
* @access public
* @param {object} req - job payload
* @param {string} req.corePath - path to the tesseract-core.js
* @param {object} res - job instance
* @returns {Promise} A Promise for callback
*/
const handleInit = ({ corePath }, res) => {
if (!TessModule) {
const Core = adapter.getCore(corePath, res);
res.progress({ status: 'initializing tesseract', progress: 0 });
return Core({
TesseractProgress(percent) {
latestJob.progress({ status: 'recognizing text', progress: Math.max(0, (percent - 30) / 70) });
},
})
.then((tessModule) => {
TessModule = tessModule;
api = new TessModule.TessBaseAPI();
res.progress({ status: 'initialized tesseract', progress: 1 });
});
}
return Promise.resolve();
};
/**
* loadLanguage
*
* @name loadLanguage
* @function load language from remote or local cache
* @access public
* @param {object} req - job payload
* @param {string} req.lang - languages to load, ex: eng, eng+chi_tra
* @param {object} req.options - other options for loadLang function
* @param {object} res - job instance
* @returns {Promise} A Promise for callback
*/
const loadLanguage = ({ lang, options }, res) => {
res.progress({ status: 'loading language traineddata', progress: 0 });
return loadLang({ lang, TessModule, ...options }).then((...args) => {
res.progress({ status: 'loaded language traineddata', progress: 1 });
return args;
});
};
/**
* handleRecognize
*
* @name handleRecognize
* @function handle recognition job
* @access public
* @param {object} req - job payload
* @param {array} req.image - binary image in array format
* @param {string} req.lang - languages to load, ex: eng, eng+chi_tra
* @param {object} req.options - other options for loadLang function
* @param {object} req.params - parameters for tesseract
* @param {object} res - job instance
*/
const handleRecognize = ({
image, lang, options, params,
}, res) => (
handleInit(options, res)
.then(() => (
loadLanguage({ lang, options }, res)
.then(() => {
const OEM = check.undefined(params['init_oem'])
? TessModule.OEM_DEFAULT
: params['init_oem'];
const progressUpdate = (progress) => {
res.progress({ status: 'initializing api', progress });
};
progressUpdate(0);
api.Init(null, lang, OEM);
progressUpdate(0.3);
Object.keys(params).filter(key => !key.startsWith('init_')).forEach((key) => {
api.SetVariable(key, params[key]);
});
progressUpdate(0.6);
const ptr = setImage(image);
progressUpdate(1);
api.Recognize(null);
const result = dump(TessModule, api);
api.End();
TessModule._free(ptr);
res.resolve(result);
})
))
);
/**
* handleDetect
*
* @name handleDetect
* @function handle detect (Orientation and Script Detection / OSD) job
* @access public
* @param {object} req - job payload
* @param {array} req.image - binary image in array format
* @param {string} req.lang - languages to load, ex: eng, eng+chi_tra
* @param {object} req.options - other options for loadLang function
* @param {object} res - job instance
*/
const handleDetect = ({
image, lang, options,
}, res) => (
handleInit(options, res)
.then(() => (
loadLanguage({ lang, options }, res)
.then(() => {
api.Init(null, lang);
api.SetPageSegMode(TessModule.PSM_OSD_ONLY);
const ptr = setImage(image);
const results = new TessModule.OSResults();
if (!api.DetectOS(results)) {
api.End();
TessModule._free(ptr);
res.reject('Failed to detect OS');
} else {
const best = results.best_result;
const oid = best.orientation_id;
const sid = best.script_id;
api.End();
TessModule._free(ptr);
res.resolve({
tesseract_script_id: sid,
script: results.unicharset.get_script_from_script_id(sid),
script_confidence: best.sconfidence,
orientation_degrees: [0, 270, 180, 90][oid],
orientation_confidence: best.oconfidence,
});
}
})
))
);
/**
* dispatchHandlers
*
* @name dispatchHandlers
* @function worker data handler
* @access public
* @param {object} data
* @param {string} data.jobId - unique job id
* @param {string} data.action - action of the job, only recognize and detect for now
* @param {object} data.payload - data for the job
* @param {function} send - trigger job to work
*/
exports.dispatchHandlers = ({ jobId, action, payload }, send) => {
const res = (status, data) => {
send({
jobId,
status,
action,
data,
});
};
res.resolve = res.bind(this, 'resolve');
res.reject = res.bind(this, 'reject');
res.progress = res.bind(this, 'progress');
latestJob = res;
try {
if (action === 'recognize') {
handleRecognize(payload, res);
} else if (action === 'detect') {
handleDetect(payload, res);
}
} catch (err) {
/** Prepare exception to travel through postMessage */
res.reject(err.toString());
}
};
/**
* setAdapter
*
* @name setAdapter
* @function
* @access public
* @param {object} impl - implementation of the worker, different in browser and node environment
*/
exports.setAdapter = (impl) => {
adapter = impl;
};