Pure Javascript OCR for more than 100 Languages 📖🎉🖥
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

453 lines
13 KiB

/**
*
* Worker script for browser and node
*
* @fileoverview Worker script for browser and node
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
require('regenerator-runtime/runtime');
const fileType = require('file-type');
const isURL = require('is-url');
const dump = require('./utils/dump');
const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker';
const setImage = require('./utils/setImage');
const defaultParams = require('./constants/defaultParams');
const defaultOutput = require('./constants/defaultOutput');
const { log, setLogging } = require('../utils/log');
const imageType = require('../constants/imageType');
const PSM = require('../constants/PSM');
/*
 * Tesseract Module returned by TesseractCore.
 * Stays undefined until the `load` action has instantiated the core.
 */
let TessModule;
/*
 * TessBaseAPI instance (null until the `initialize` action creates one).
 */
let api = null;
/*
 * Responder of the most recently dispatched job; the TesseractProgress
 * callback registered in `load` reports through it.
 */
let latestJob;
/*
 * Environment-specific implementation, injected through `setAdapter`.
 */
let adapter = {};
/*
 * Parameters currently in effect; starts out as the defaults.
 */
let params = defaultParams;
/**
 * Instantiates the Tesseract core module (at most once per worker).
 *
 * Answers `{ loaded: true }` through `res.resolve` as soon as `TessModule`
 * is available. When instantiation fails the returned promise rejects, so
 * the caller can report the failure instead of leaving the job pending.
 *
 * @param {object} packet - job packet
 * @param {string} packet.workerId - worker id echoed in progress messages
 * @param {string} packet.jobId - id of this load job
 * @param {object} packet.payload.options - `corePath` is handed to
 *   `adapter.getCore`, `logging` is handed to `setLogging`
 * @param {object} res - responder exposing `progress` and `resolve`
 * @returns {Promise<void>}
 */
const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
  setLogging(logging);

  if (TessModule) {
    res.resolve({ loaded: true });
    return;
  }

  const Core = await adapter.getCore(corePath, res);
  res.progress({ workerId, status: 'initializing tesseract', progress: 0 });

  // Wait for the core so that a failed instantiation rejects this function.
  // The promise is settled WITHOUT a value on purpose: the module object is
  // never handed to promise resolution (NOTE(review): some Emscripten builds
  // are said to expose a `then` method on the module itself - confirm).
  await new Promise((resolve, reject) => {
    Core({
      TesseractProgress(percent) {
        // Reported through whichever job is current when Tesseract calls back.
        latestJob.progress({
          workerId,
          jobId,
          status: 'recognizing text',
          progress: Math.max(0, (percent - 30) / 70),
        });
      },
    }).then((tessModule) => {
      TessModule = tessModule;
      resolve();
    }, reject);
  });

  res.progress({ workerId, status: 'initialized tesseract', progress: 1 });
  res.resolve({ loaded: true });
};
/**
 * Calls a method of the file system object exposed as `TessModule.FS` and
 * answers with its return value.
 *
 * @param {object} packet - job packet
 * @param {string} packet.workerId - worker id, used in the log line
 * @param {string} packet.payload.method - name of the `TessModule.FS` method
 * @param {Array} packet.payload.args - arguments spread into that method
 * @param {object} res - responder exposing `resolve`
 */
const FS = async ({ workerId, payload: { method, args } }, res) => {
  log(`[${workerId}]: FS.${method} with args ${args}`);
  res.resolve(TessModule.FS[method](...args));
};
/**
 * Loads the traineddata of every requested language, writes it into the
 * Tesseract file system (when the core is loaded) and optionally caches it.
 *
 * `langs` is either a '+'-separated string or an array whose items are
 * language codes or `{ code, data }` objects carrying the data themselves.
 * Exactly one of `res.resolve(langs)` / `res.reject(message)` is sent, except
 * for the DOMException case documented near the end of the function.
 *
 * @param {object} packet - job packet
 * @param {string} packet.workerId - worker id echoed in logs and progress
 * @param {string|Array} packet.payload.langs - languages to load
 * @param {object} packet.payload.options - `langPath` (URL or local path of
 *   the traineddata), `dataPath` (target directory inside the Tesseract file
 *   system), `cachePath`, `cacheMethod` ('write' | 'refresh' | 'none' |
 *   undefined, other values only read the cache) and `gzip` (default true:
 *   request the `.gz` variant)
 * @param {object} res - responder exposing `progress`, `resolve`, `reject`
 */
const loadLanguage = async ({
  workerId,
  payload: {
    langs,
    options: {
      langPath,
      dataPath,
      cachePath,
      cacheMethod,
      gzip = true,
    },
  },
},
res) => {
  const loadAndGunzipFile = async (_lang) => {
    const lang = typeof _lang === 'string' ? _lang : _lang.code;
    // 'refresh' and 'none' skip the cache lookup entirely.
    const readCache = ['refresh', 'none'].includes(cacheMethod)
      ? () => Promise.resolve()
      : adapter.readCache;
    let data = null;
    let newData = false;

    try {
      const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`);
      if (typeof _data !== 'undefined') {
        log(`[${workerId}]: Load ${lang}.traineddata from cache`);
        res.progress({ workerId, status: 'loading language traineddata (from cache)', progress: 0.5 });
        data = _data;
      } else {
        throw Error('Not found in cache');
      }
    } catch (e) {
      // Cache miss or unreadable cache: obtain the data from its source.
      newData = true;
      log(`[${workerId}]: Load ${lang}.traineddata from ${langPath}`);
      if (typeof _lang === 'string') {
        const source = `${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`;
        const isRemote = isURL(langPath)
          || langPath.startsWith('moz-extension://')
          || langPath.startsWith('chrome-extension://')
          || langPath.startsWith('file://');
        if (isRemote) {
          const resp = await (isWebWorker ? fetch : adapter.fetch)(source);
          if (!resp.ok) {
            throw Error(`Network error while fetching ${source}. Response code: ${resp.status}`);
          }
          data = await resp.arrayBuffer();
        } else {
          // Not a URL: the adapter's cache reader doubles as a local file reader.
          data = await adapter.readCache(source);
        }
      } else {
        data = _lang.data; // eslint-disable-line
      }
    }

    data = new Uint8Array(data);

    // Decompress based on the detected content, not on the `gzip` option.
    const type = fileType(data);
    if (typeof type !== 'undefined' && type.mime === 'application/gzip') {
      data = adapter.gunzip(data);
    }

    if (TessModule) {
      if (dataPath) {
        try {
          TessModule.FS.mkdir(dataPath);
        } catch (err) {
          // Deliberately not reported via res.reject: this also fails when the
          // directory already exists (e.g. created for a previous language),
          // and rejecting here while execution continues would send both a
          // reject and a resolve. A directory that is really unusable makes
          // the writeFile below throw, which rejects the job exactly once.
          log(`[${workerId}]: FS.mkdir(${dataPath}) failed: ${err}`);
        }
      }
      TessModule.FS.writeFile(`${dataPath || '.'}/${lang}.traineddata`, data);
    }

    if (newData && ['write', 'refresh', undefined].includes(cacheMethod)) {
      await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data);
    }

    return data;
  };

  res.progress({ workerId, status: 'loading language traineddata', progress: 0 });
  try {
    await Promise.all((typeof langs === 'string' ? langs.split('+') : langs).map(loadAndGunzipFile));
    res.progress({ workerId, status: 'loaded language traineddata', progress: 1 });
    res.resolve(langs);
  } catch (err) {
    if (isWebWorker && err instanceof DOMException) {
      /*
       * For some reason Google Chrome throws a DOMException in loadLang,
       * while other browsers are OK. For now this exception is ignored,
       * in the hope of finding the root cause one day.
       * NOTE(review): no reply is sent on this path - confirm the caller
       * does not wait for one.
       */
      log(`[${workerId}]: Ignoring DOMException while loading languages: ${err}`);
    } else {
      res.reject(err.toString());
    }
  }
};
/**
 * Applies parameters to the current Tesseract API instance and records them
 * in the module-level `params`.
 *
 * Keys prefixed with "tessjs_" are not passed to `api.SetVariable`; they are
 * only merged into `params`.
 *
 * @param {object} packet - job packet
 * @param {object} packet.payload.params - parameter name/value pairs
 * @param {object} [res] - responder; when given, the merged parameters are
 *   sent through `res.resolve`. Internal callers may omit it.
 */
const setParameters = async ({ payload: { params: _params } }, res) => {
  Object.keys(_params)
    .filter((key) => !key.startsWith('tessjs_'))
    .forEach((key) => {
      api.SetVariable(key, _params[key]);
    });

  params = { ...params, ..._params };

  if (typeof res !== 'undefined') {
    res.resolve(params);
  }
};
/**
 * Creates a fresh Tesseract API instance for the given languages and applies
 * the default parameters to it. A previously created instance is ended first.
 *
 * @param {object} packet - job packet
 * @param {string} packet.workerId - worker id echoed in progress messages
 * @param {string|Array} packet.payload.langs - '+'-separated language string,
 *   or an array of language codes / `{ code, data }` objects
 * @param {*} packet.payload.oem - engine mode, passed through to `api.Init`
 * @param {string|object} [packet.payload.config] - config file text, or an
 *   object of key/value pairs that is converted to config file text
 * @param {object} res - responder exposing `progress`, `resolve`, `reject`
 */
const initialize = async ({
  workerId,
  payload: { langs: _langs, oem, config },
}, res) => {
  // Language objects are identified by `code`; `data` holds the traineddata
  // bytes and must not end up in the language string.
  const langs = (typeof _langs === 'string')
    ? _langs
    : _langs.map((l) => ((typeof l === 'string') ? l : l.code)).join('+');

  try {
    res.progress({
      workerId, status: 'initializing api', progress: 0,
    });
    if (api !== null) {
      api.End();
    }

    // The config argument may be config file text or an object of key/value
    // pairs. Objects become one "key value" line per entry; building the
    // lines directly keeps values containing ',' or ':' intact.
    let configFile;
    let configStr;
    if (config !== null && typeof config === 'object') {
      configStr = Object.entries(config)
        .map(([key, value]) => `${key} ${value}`)
        .join('\n');
    } else {
      configStr = config;
    }
    if (typeof configStr === 'string') {
      configFile = '/config';
      TessModule.FS.writeFile(configFile, configStr);
    }

    api = new TessModule.TessBaseAPI();
    const status = api.Init(null, langs, oem, configFile);
    // NOTE(review): Tesseract's Init is documented to return -1 on failure;
    // confirm the binding forwards that return value.
    if (status === -1) {
      throw new Error('initialization failed');
    }

    params = defaultParams;
    await setParameters({ payload: { params } });
    res.progress({
      workerId, status: 'initialized api', progress: 1,
    });
    res.resolve();
  } catch (err) {
    res.reject(err.toString());
  }
};
/**
 * Produces a PDF for the image currently held by `api`.
 *
 * A `TessPDFRenderer` is constructed with 'tesseract-ocr', '/' and the
 * `textonly` flag, driven through begin / add image / end, released with
 * `TessModule._free`, and the file '/tesseract-ocr.pdf' is then read back.
 *
 * @param {string} title - passed to the renderer's `BeginDocument`
 * @param {boolean} textonly - passed to the renderer constructor
 * @returns {*} whatever `TessModule.FS.readFile` returns for the PDF file
 */
const getPDFInternal = (title, textonly) => {
  const { TessPDFRenderer } = TessModule;
  const renderer = new TessPDFRenderer('tesseract-ocr', '/', textonly);

  renderer.BeginDocument(title);
  renderer.AddImage(api);
  renderer.EndDocument();
  TessModule._free(renderer);

  const pdfContents = TessModule.FS.readFile('/tesseract-ocr.pdf');
  return pdfContents;
};
/**
 * Job handler: answers with the PDF built by `getPDFInternal`.
 *
 * @param {object} packet - job packet; `payload.title` and `payload.textonly`
 *   are forwarded to `getPDFInternal`
 * @param {object} res - responder exposing `resolve`
 */
const getPDF = async (packet, res) => {
  const { title, textonly } = packet.payload;
  const pdf = getPDFInternal(title, textonly);
  res.resolve(pdf);
};
// Builds the effective output selection for one job: a copy of the default
// output flags, adjusted by legacy `tessjs_create_*` parameters and then by
// the caller's `output` object. Also counts how many enabled outputs need
// recognition to run (everything except the image outputs).
const processOutput = (output) => {
  // Deep copy so the shared defaults are never modified.
  const workingOutput = JSON.parse(JSON.stringify(defaultOutput));

  // Earlier versions selected output formats through `setParameters`;
  // those settings are still honoured for compatibility.
  const legacyFlags = [
    ['tessjs_create_box', 'box'],
    ['tessjs_create_hocr', 'hocr'],
    ['tessjs_create_osd', 'osd'],
    ['tessjs_create_tsv', 'tsv'],
    ['tessjs_create_unlv', 'unlv'],
  ];
  legacyFlags.forEach(([paramName, format]) => {
    if (params[paramName] === '1') {
      workingOutput[format] = true;
    }
  });

  // Explicit per-job choices take precedence over defaults and legacy flags.
  for (const key in output) {
    workingOutput[key] = output[key];
  }

  const nonRecOutputs = ['imageColor', 'imageGrey', 'imageBinary'];
  const recOutputCount = Object.keys(workingOutput)
    .filter((key) => workingOutput[key] && !nonRecOutputs.includes(key))
    .length;

  return { workingOutput, recOutputCount };
};
// Option names that Tesseract.js consumes itself: `recognize` does not pass
// them through to Tesseract. Options carrying the "tessjs_" prefix are
// filtered separately and are therefore not listed here.
const tessjsOptions = [
  'rectangle',
  'pdfTitle',
  'pdfTextOnly',
  'rotateAuto',
  'rotateRadians',
];
/**
 * Runs recognition on one image and answers with the dumped result.
 *
 * `options` mixes Tesseract.js options (see `tessjsOptions` and the
 * "tessjs_" prefix) with Tesseract parameters. The latter are applied only
 * for this job: they are set after `api.SaveParameters()` and undone with
 * `api.RestoreParameters()`, also when the job fails. The buffer returned by
 * `setImage` is released on every path.
 *
 * @param {object} packet - job packet
 * @param {*} packet.payload.image - image handed to `setImage`
 * @param {object} [packet.payload.options] - per-job options; may be omitted
 * @param {object} [packet.payload.output] - output selection, see `processOutput`
 * @param {object} res - responder exposing `resolve` and `reject`
 */
const recognize = async ({
  payload: {
    image, options, output,
  },
}, res) => {
  // A missing or null options object behaves like an empty one.
  const opts = (options !== null && typeof options === 'object') ? options : {};
  // Cleanup must never send a second reply, so its failures are only logged.
  const cleanUp = (label, action) => {
    try {
      action();
    } catch (cleanupErr) {
      log(`recognize: ${label} failed: ${cleanupErr}`);
    }
  };

  let ptr;
  let overridesActive = false;

  try {
    // Everything that is neither a Tesseract.js option nor "tessjs_"-prefixed
    // is passed through to Tesseract as a temporary parameter.
    const overrideNames = Object.keys(opts)
      .filter((name) => !name.startsWith('tessjs_') && !tessjsOptions.includes(name));
    if (overrideNames.length > 0) {
      api.SaveParameters();
      overridesActive = true;
      overrideNames.forEach((name) => {
        api.SetVariable(name, opts[name]);
      });
    }

    const { workingOutput, recOutputCount } = processOutput(output);

    // With rotateAuto, setImage is first called without an angle, Tesseract
    // computes the angle, and setImage is called again with it. Otherwise
    // setImage is called once with the user-provided angle.
    let rotateRadiansFinal;
    if (opts.rotateAuto) {
      // The angle is only detected with automatic page segmentation, so that
      // mode is enabled temporarily when the user selected another one.
      const psmInit = api.GetPageSegMode();
      let psmEdit = false;
      if (![PSM.AUTO, PSM.AUTO_ONLY, PSM.OSD].includes(psmInit)) {
        psmEdit = true;
        api.SetVariable('tessedit_pageseg_mode', String(PSM.AUTO));
      }

      ptr = setImage(TessModule, api, image);
      api.FindLines();
      const rotateRadiansCalc = api.GetAngle();

      // Restore the user-provided PSM setting.
      if (psmEdit) {
        api.SetVariable('tessedit_pageseg_mode', String(psmInit));
      }

      // Small angles (<0.005 radians/~0.3 degrees) are ignored to save on runtime.
      if (Math.abs(rotateRadiansCalc) >= 0.005) {
        rotateRadiansFinal = rotateRadiansCalc;
        ptr = setImage(TessModule, api, image, rotateRadiansFinal);
      } else {
        // The image needs to be reset if it was analysed with a different PSM.
        if (psmEdit) {
          ptr = setImage(TessModule, api, image);
        }
        rotateRadiansFinal = 0;
      }
    } else {
      rotateRadiansFinal = opts.rotateRadians || 0;
      ptr = setImage(TessModule, api, image, rotateRadiansFinal);
    }

    const rec = opts.rectangle;
    if (rec !== null && typeof rec === 'object') {
      api.SetRectangle(rec.left, rec.top, rec.width, rec.height);
    }

    if (recOutputCount > 0) {
      api.Recognize(null);
    } else {
      log(`Skipping recognition: all output options requiring recognition are disabled.`);
    }

    const { pdfTitle, pdfTextOnly } = opts;
    const result = dump(TessModule, api, workingOutput, { pdfTitle, pdfTextOnly });
    result.rotateRadians = rotateRadiansFinal;

    if (overridesActive) {
      // Flag is cleared first so the finally block does not restore twice.
      overridesActive = false;
      api.RestoreParameters();
    }
    res.resolve(result);
  } catch (err) {
    res.reject(err.toString());
  } finally {
    // Reached on success (after the reply, as before) and on failure.
    if (overridesActive) {
      cleanUp('RestoreParameters', () => api.RestoreParameters());
    }
    if (typeof ptr !== 'undefined') {
      cleanUp('_free', () => TessModule._free(ptr));
    }
  }
};
/**
 * Detects orientation and script of an image and answers with the result.
 *
 * When `api.DetectOS` reports failure, every field of the answer is `null`.
 * The buffer returned by `setImage` is released on every path, including
 * when detection throws.
 *
 * @param {object} packet - job packet
 * @param {*} packet.payload.image - image handed to `setImage`
 * @param {object} res - responder exposing `resolve` and `reject`
 */
const detect = async ({ payload: { image } }, res) => {
  try {
    const ptr = setImage(TessModule, api, image);
    let detection;

    try {
      const results = new TessModule.OSResults();

      if (!api.DetectOS(results)) {
        detection = {
          tesseract_script_id: null,
          script: null,
          script_confidence: null,
          orientation_degrees: null,
          orientation_confidence: null,
        };
      } else {
        const best = results.best_result;
        const oid = best.orientation_id;
        const sid = best.script_id;

        detection = {
          tesseract_script_id: sid,
          script: results.unicharset.get_script_from_script_id(sid),
          script_confidence: best.sconfidence,
          // Lookup table from orientation id to degrees.
          orientation_degrees: [0, 270, 180, 90][oid],
          orientation_confidence: best.oconfidence,
        };
      }
    } finally {
      // Freed before replying, as before, but now also when detection throws.
      TessModule._free(ptr);
    }

    res.resolve(detection);
  } catch (err) {
    res.reject(err.toString());
  }
};
/**
 * Ends the current Tesseract API instance (if any) and answers with
 * `{ terminated: true }`.
 *
 * @param {object} _ - job packet (unused)
 * @param {object} res - responder exposing `resolve` and `reject`
 */
const terminate = async (_, res) => {
  try {
    if (api !== null) {
      api.End();
      // Drop the ended instance so a later `initialize` does not call
      // End() on it a second time.
      api = null;
    }
    res.resolve({ terminated: true });
  } catch (err) {
    res.reject(err.toString());
  }
};
/**
* dispatchHandlers
*
* @name dispatchHandlers
* @function worker data handler
* @access public
* @param {object} data
* @param {string} data.jobId - unique job id
* @param {string} data.action - action of the job, only recognize and detect for now
* @param {object} data.payload - data for the job
* @param {function} send - trigger job to work
*/
exports.dispatchHandlers = (packet, send) => {
const res = (status, data) => {
send({
...packet,
status,
data,
});
};
res.resolve = res.bind(this, 'resolve');
res.reject = res.bind(this, 'reject');
res.progress = res.bind(this, 'progress');
latestJob = res;
({
load,
FS,
loadLanguage,
initialize,
setParameters,
recognize,
getPDF,
detect,
terminate,
})[packet.action](packet, res)
.catch((err) => res.reject(err.toString()));
};
/**
* setAdapter
*
* @name setAdapter
* @function
* @access public
* @param {object} adapter - implementation of the worker, different in browser and node environment
*/
exports.setAdapter = (_adapter) => {
adapter = _adapter;
};