Browse Source

Added OutputFormats option/interface for setting output

dev/v4
Balearica 2 years ago
parent
commit
c407aeb559
  1. 1
      examples/browser/basic.html
  2. 2
      examples/browser/download-pdf.html
  3. 4
      examples/browser/image-processing.html
  4. 2
      examples/node/download-pdf.js
  5. 26
      examples/node/scheduler.js
  6. 2
      src/constants/imageType.js
  7. 13
      src/createWorker.js
  8. 28
      src/index.d.ts
  9. 2
      src/utils/circularize.js
  10. 17
      src/worker-script/constants/defaultOutput.js
  11. 133
      src/worker-script/index.js
  12. 50
      src/worker-script/utils/dump.js

1
examples/browser/basic.html

@ -8,6 +8,7 @@ @@ -8,6 +8,7 @@
const recognize = async ({ target: { files } }) => {
const { data: { text } } = await Tesseract.recognize(files[0], 'eng', {
corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js',
workerPath: "/dist/worker.dev.js",
logger: m => console.log(m),
});
console.log(text);

2
examples/browser/download-pdf.html

@ -21,7 +21,7 @@ @@ -21,7 +21,7 @@
const recognize = async ({ target: { files } }) => {
await worker.loadLanguage('eng');
await worker.initialize('eng');
const res = await worker.recognize(files[0], {savePDF: true});
const res = await worker.recognize(files[0],{pdfTitle: "Example PDF"},{pdf: true});
pdf = res.data.pdf;
const text = res.data.text;
const board = document.getElementById('board');

4
examples/browser/image-processing.html

@ -45,8 +45,8 @@ @@ -45,8 +45,8 @@
await worker.initialize('eng');
await worker.initialize();
const ret = await worker.recognize(files[0], { saveImageOriginal: true, saveImageGrey: true, saveImageBinary: true, rotateAuto: true })
document.getElementById("imgOriginal").src = ret.data.imageOriginal;
const ret = await worker.recognize(files[0], {rotateAuto: true}, {imageColor: true, imageGrey: true, imageBinary: true});
document.getElementById("imgOriginal").src = ret.data.imageColor;
document.getElementById("imgGrey").src = ret.data.imageGrey;
document.getElementById("imgBinary").src = ret.data.imageBinary;

2
examples/node/download-pdf.js

@ -12,7 +12,7 @@ console.log(`Recognizing ${image}`); @@ -12,7 +12,7 @@ console.log(`Recognizing ${image}`);
const worker = await createWorker();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text, pdf } } = await worker.recognize(image, {savePDF: true});
const { data: { text, pdf } } = await worker.recognize(image, {pdfTitle: "Example PDF"}, {pdf: true});
console.log(text);
fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(pdf));
console.log('Generate PDF: tesseract-ocr-result.pdf');

26
examples/node/scheduler.js

@ -0,0 +1,26 @@ @@ -0,0 +1,26 @@
// Example: spread OCR jobs over a pool of workers managed by a scheduler.
const { createWorker, createScheduler } = require('../../');

const scheduler = createScheduler();

// Size of the worker pool and number of recognition jobs to queue.
const workerN = 4;
const jobN = 10;

// Creates a worker, prepares it for English and adds it to the scheduler.
const workerGen = async () => {
  // createWorker returns a promise (src/createWorker.js exports an async
  // function), so it must be awaited before the worker can be used.
  const worker = await createWorker({ cachePath: '.' });
  await worker.load();
  await worker.loadLanguage('eng');
  await worker.initialize('eng');
  scheduler.addWorker(worker);
};

(async () => {
  // Start all workers concurrently and wait until every one is registered.
  const resArr = Array(workerN);
  for (let i = 0; i < workerN; i++) {
    resArr[i] = workerGen();
  }
  await Promise.all(resArr);

  /** Add 10 recognition jobs; each prints its text as soon as it finishes. */
  await Promise.all(Array(jobN).fill(0).map(() => (
    scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png').then((x) => console.log(x.data.text))
  )));

  await scheduler.terminate(); // It also terminates all workers.
})();

2
src/constants/imageType.js

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
module.exports = {
ORIGINAL: 0,
COLOR: 0,
GREY: 1,
BINARY: 2,
};

13
src/createWorker.js

@ -129,19 +129,11 @@ module.exports = async (_options = {}) => { @@ -129,19 +129,11 @@ module.exports = async (_options = {}) => {
}))
);
const recognize = async (image, opts = {}, jobId) => (
const recognize = async (image, opts = {}, output = {blocks: true, text: true, hocr: true, tsv: true}, jobId) => (
startJob(createJob({
id: jobId,
action: 'recognize',
payload: { image: await loadImage(image), options: opts },
}))
);
const threshold = async (image, opts = {}, jobId) => (
startJob(createJob({
id: jobId,
action: 'threshold',
payload: { image: await loadImage(image), options: opts },
payload: { image: await loadImage(image), options: opts, output },
}))
);
@ -215,7 +207,6 @@ module.exports = async (_options = {}) => { @@ -215,7 +207,6 @@ module.exports = async (_options = {}) => {
initialize,
setParameters,
recognize,
threshold,
getPDF,
detect,
terminate,

28
src/index.d.ts vendored

@ -23,8 +23,7 @@ declare namespace Tesseract { @@ -23,8 +23,7 @@ declare namespace Tesseract {
initialize(langs?: string, oem?: OEM, jobId?: string): Promise<ConfigResult>
setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
getImage(type: imageType): string
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult>
threshold(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult>
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, output?: Partial<OutputFormats>, jobId?: string): Promise<RecognizeResult>
detect(image: ImageLike, jobId?: string): Promise<DetectResult>
terminate(jobId?: string): Promise<ConfigResult>
getPDF(title?: string, textonly?: boolean, jobId?: string):Promise<GetPDFResult>
@ -54,16 +53,25 @@ declare namespace Tesseract { @@ -54,16 +53,25 @@ declare namespace Tesseract {
tessjs_create_unlv: string
tessjs_create_osd: string
}
/**
 * Selects which outputs a `recognize` call returns. It is passed (as a
 * `Partial`) in the `output` argument of `recognize`; outputs that are not
 * enabled come back as `null` in the result.
 * Defaults are defined in src/worker-script/constants/defaultOutput.js:
 * `text`, `blocks`, `hocr` and `tsv` are on, everything else is off.
 */
interface OutputFormats {
/** Plain recognized text (`GetUTF8Text`). */
text: boolean;
/** Structured block data collected while iterating the result. */
blocks: boolean;
/** hOCR markup (`GetHOCRText`). */
hocr: boolean;
/** TSV output (`GetTSVText`). */
tsv: boolean;
/** Box output (`GetBoxText`). */
box: boolean;
/** UNLV output (`GetUNLVText`). */
unlv: boolean;
/** OSD output (`GetOsdText`). */
osd: boolean;
/** PDF of the result; uses the `pdfTitle` / `pdfTextOnly` recognize options. */
pdf: boolean;
/** Request the colour image as a PNG data URL. */
imageColor: boolean;
/** Request the grey image as a PNG data URL. */
imageGrey: boolean;
/** Request the binary image as a PNG data URL. */
imageBinary: boolean;
}
interface RecognizeOptions {
rectangle: Rectangle
saveImageOriginal: boolean
saveImageGrey: boolean
saveImageBinary: boolean
savePDF: boolean
pdfTitle: string
pdfTextOnly: boolean
rotateAuto: boolean
rotateRadians: float
rotateRadians: number
}
interface ConfigResult {
jobId: string
@ -117,7 +125,7 @@ declare namespace Tesseract { @@ -117,7 +125,7 @@ declare namespace Tesseract {
RAW_LINE = '13'
}
const enum imageType {
ORIGINAL = 0,
COLOR = 0,
GREY = 1,
BINARY = 2
}
@ -218,7 +226,7 @@ declare namespace Tesseract { @@ -218,7 +226,7 @@ declare namespace Tesseract {
page: Page;
}
interface Page {
blocks: Block[];
blocks: Block[] | null;
confidence: number;
lines: Line[];
oem: string;
@ -234,7 +242,7 @@ declare namespace Tesseract { @@ -234,7 +242,7 @@ declare namespace Tesseract {
box: string | null;
unlv: string | null;
sd: string | null;
imageOriginal: string | null;
imageColor: string | null;
imageGrey: string | null;
imageBinary: string | null;
rotateRadians: number | null;

2
src/utils/circularize.js

@ -22,6 +22,7 @@ module.exports = (page) => { @@ -22,6 +22,7 @@ module.exports = (page) => {
const words = [];
const symbols = [];
if (page.blocks) {
page.blocks.forEach((block) => {
block.paragraphs.forEach((paragraph) => {
paragraph.lines.forEach((line) => {
@ -47,6 +48,7 @@ module.exports = (page) => { @@ -47,6 +48,7 @@ module.exports = (page) => {
...block, page,
});
});
}
return {
...page, blocks, paragraphs, lines, words, symbols,

17
src/worker-script/constants/defaultOutput.js

@ -0,0 +1,17 @@ @@ -0,0 +1,17 @@
/*
* Default output formats for tesseract.js.
*
* Each key names one output of a recognize call; `true` means the output is
* produced unless the caller overrides it. processOutput in
* src/worker-script/index.js clones this object and applies the legacy
* `tessjs_create_*` parameters and the caller's `output` settings on top.
*/
module.exports = {
// Enabled by default.
text: true,
blocks: true,
hocr: true,
tsv: true,
// Disabled by default.
box: false,
unlv: false,
osd: false,
pdf: false,
// Image outputs; processOutput does not count these as requiring recognition.
imageColor: false,
imageGrey: false,
imageBinary: false
};

133
src/worker-script/index.js

@ -14,8 +14,8 @@ const dump = require('./utils/dump'); @@ -14,8 +14,8 @@ const dump = require('./utils/dump');
const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker';
const setImage = require('./utils/setImage');
const defaultParams = require('./constants/defaultParams');
const defaultOutput = require('./constants/defaultOutput');
const { log, setLogging } = require('../utils/log');
const arrayBufferToBase64 = require('./utils/arrayBufferToBase64');
const imageType = require('../constants/imageType');
const PSM = require('../constants/PSM');
@ -214,23 +214,44 @@ const getPDF = async ({ payload: { title, textonly } }, res) => { @@ -214,23 +214,44 @@ const getPDF = async ({ payload: { title, textonly } }, res) => {
res.resolve(getPDFInternal(title, textonly));
};
const getImage = (type) => {
api.WriteImage(type, '/image.png');
const pngBuffer = TessModule.FS.readFile('/image.png');
const pngStr = `data:image/png;base64,${arrayBufferToBase64(pngBuffer.buffer)}`;
TessModule.FS.unlink('/image.png');
return pngStr;
};
// Combines default output with user-specified options and
// counts (1) total output formats requested and (2) outputs that require OCR
const processOutput = (output) => {
const workingOutput = JSON.parse(JSON.stringify(defaultOutput));
// Output formats were set using `setParameters` in previous versions
// These settings are copied over for compatability
if (params.tessjs_create_box === "1") workingOutput.box = true;
if (params.tessjs_create_hocr === "1") workingOutput.hocr = true;
if (params.tessjs_create_osd === "1") workingOutput.osd = true;
if (params.tessjs_create_tsv === "1") workingOutput.tsv = true;
if (params.tessjs_create_unlv === "1") workingOutput.unlv = true;
const nonRecOutputs = ["imageColor", "imageGrey", "imageBinary"];
let recOutputCount = 0;
for (const prop in output) {
workingOutput[prop] = output[prop];
}
for (const prop in workingOutput) {
if (workingOutput[prop]) {
if (!nonRecOutputs.includes(prop)) {
recOutputCount++;
}
}
}
return {workingOutput, recOutputCount}
}
const recognize = async ({
payload: {
image, options: {
rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, savePDF, pdfTitle,
rectangle: rec, pdfTitle,
pdfTextOnly, rotateAuto, rotateRadians,
},
}, output
},
}, res) => {
try {
const {workingOutput, recOutputCount} = processOutput(output);
// When the auto-rotate option is True, setImage is called with no angle,
// then the angle is calculated by Tesseract and then setImage is re-called.
// Otherwise, setImage is called once using the user-provided rotateRadiansFinal value.
@ -274,96 +295,14 @@ const recognize = async ({ @@ -274,96 +295,14 @@ const recognize = async ({
if (typeof rec === 'object') {
api.SetRectangle(rec.left, rec.top, rec.width, rec.height);
}
api.Recognize(null);
const result = dump(TessModule, api, params);
if (saveImageOriginal) {
result.imageOriginal = getImage(imageType.ORIGINAL);
} else {
result.imageOriginal = null;
}
if (saveImageGrey) {
result.imageGrey = getImage(imageType.GREY);
} else {
result.imageGrey = null;
}
if (saveImageBinary) {
result.imageBinary = getImage(imageType.BINARY);
} else {
result.imageBinary = null;
}
if (savePDF) {
result.pdf = getPDFInternal(pdfTitle ?? 'Tesseract OCR Result', pdfTextOnly ?? false);
} else {
result.pdf = null;
}
result.rotateRadians = rotateRadiansFinal;
res.resolve(result);
TessModule._free(ptr);
} catch (err) {
res.reject(err.toString());
}
};
// `threshold` is similar to `recognize` except it skips the recognition step
// Useful for getting rotated/binarized images without running recognition
const threshold = async ({
payload: {
image, options: {
rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, rotateAuto, rotateRadians,
},
},
}, res) => {
try {
let ptr;
let rotateRadiansFinal;
if (rotateAuto) {
const psmInit = api.GetPageSegMode();
let psmEdit = false;
if (![PSM.AUTO, PSM.AUTO_ONLY, PSM.OSD].includes(psmInit)) {
psmEdit = true;
api.SetVariable('tessedit_pageseg_mode', String(PSM.AUTO));
}
ptr = setImage(TessModule, api, image);
api.FindLines();
const rotateRadiansCalc = api.GetAngle();
// Restore user-provided PSM setting
if (psmEdit) {
api.SetVariable('tessedit_pageseg_mode', String(psmInit));
}
// Small angles (<0.005 radians/~0.3 degrees) are ignored to save on runtime
if (Math.abs(rotateRadiansCalc) >= 0.005) {
rotateRadiansFinal = rotateRadiansCalc;
ptr = setImage(TessModule, api, image, rotateRadiansFinal);
if (recOutputCount > 0) {
api.Recognize(null);
} else {
rotateRadiansFinal = 0;
}
} else {
rotateRadiansFinal = rotateRadians || 0;
ptr = setImage(TessModule, api, image, rotateRadiansFinal);
log(`Skipping recognition: all output options requiring recognition are disabled.`);
}
if (typeof rec === 'object') {
api.SetRectangle(rec.left, rec.top, rec.width, rec.height);
}
const result = {};
if (saveImageOriginal) {
result.imageOriginal = getImage(imageType.ORIGINAL);
} else {
result.imageOriginal = null;
}
if (saveImageGrey) {
result.imageGrey = getImage(imageType.GREY);
} else {
result.imageGrey = null;
}
if (saveImageBinary) {
result.imageBinary = getImage(imageType.BINARY);
} else {
result.imageBinary = null;
}
const result = dump(TessModule, api, workingOutput, {pdfTitle, pdfTextOnly});
result.rotateRadians = rotateRadiansFinal;
res.resolve(result);
TessModule._free(ptr);
@ -372,6 +311,7 @@ const threshold = async ({ @@ -372,6 +311,7 @@ const threshold = async ({
}
};
const detect = async ({ payload: { image } }, res) => {
try {
const ptr = setImage(TessModule, api, image);
@ -451,7 +391,6 @@ exports.dispatchHandlers = (packet, send) => { @@ -451,7 +391,6 @@ exports.dispatchHandlers = (packet, send) => {
initialize,
setParameters,
recognize,
threshold,
getPDF,
detect,
terminate,

50
src/worker-script/utils/dump.js

@ -7,6 +7,8 @@ @@ -7,6 +7,8 @@
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const arrayBufferToBase64 = require('./arrayBufferToBase64');
const imageType = require('../../constants/imageType');
/**
* deindent
@ -37,13 +39,7 @@ const deindent = (html) => { @@ -37,13 +39,7 @@ const deindent = (html) => {
* @function dump recognition result to a JSON object
* @access public
*/
module.exports = (TessModule, api, {
tessjs_create_hocr,
tessjs_create_tsv,
tessjs_create_box,
tessjs_create_unlv,
tessjs_create_osd,
}) => {
module.exports = (TessModule, api, output, options) => {
const ri = api.GetIterator();
const {
RIL_BLOCK,
@ -65,6 +61,26 @@ module.exports = (TessModule, api, { @@ -65,6 +61,26 @@ module.exports = (TessModule, api, {
.map((e) => e.slice(prefix.length + 1))[0]
);
const getImage = (type) => {
api.WriteImage(type, '/image.png');
const pngBuffer = TessModule.FS.readFile('/image.png');
const pngStr = `data:image/png;base64,${arrayBufferToBase64(pngBuffer.buffer)}`;
TessModule.FS.unlink('/image.png');
return pngStr;
};
const getPDFInternal = (title, textonly) => {
const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
pdfRenderer.BeginDocument(title);
pdfRenderer.AddImage(api);
pdfRenderer.EndDocument();
TessModule._free(pdfRenderer);
return TessModule.FS.readFile('/tesseract-ocr.pdf');
};
if (output.blocks) {
ri.Begin();
do {
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
@ -185,15 +201,21 @@ module.exports = (TessModule, api, { @@ -185,15 +201,21 @@ module.exports = (TessModule, api, {
} while (ri.Next(RIL_SYMBOL));
TessModule.destroy(ri);
}
return {
text: api.GetUTF8Text(),
hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null,
tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null,
box: tessjs_create_box === '1' ? api.GetBoxText() : null,
unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null,
osd: tessjs_create_osd === '1' ? api.GetOsdText() : null,
text: output.text ? api.GetUTF8Text() : null,
hocr: output.hocr ? deindent(api.GetHOCRText()) : null,
tsv: output.tsv ? api.GetTSVText() : null,
box: output.box ? api.GetBoxText() : null,
unlv: output.unlv ? api.GetUNLVText() : null,
osd: output.osd ? api.GetOsdText() : null,
pdf: output.pdf ? getPDFInternal(options.pdfTitle ?? 'Tesseract OCR Result', options.pdfTextOnly ?? false) : null,
imageColor: output.imageColor ? getImage(imageType.COLOR) : null,
imageGrey: output.imageColor ? getImage(imageType.GREY) : null,
imageBinary: output.imageColor ? getImage(imageType.BINARY) : null,
confidence: api.MeanTextConf(),
blocks,
blocks: output.blocks ? blocks : null,
psm: enumToString(api.GetPageSegMode(), 'PSM'),
oem: enumToString(api.oem(), 'OEM'),
version: api.Version(),

Loading…
Cancel
Save