From c407aeb559cad33e02bd30af0441f4cbc3233e43 Mon Sep 17 00:00:00 2001 From: Balearica Date: Sun, 18 Sep 2022 13:53:33 -0700 Subject: [PATCH] Added OutputFormats option/interface for setting output --- examples/browser/basic.html | 1 + examples/browser/download-pdf.html | 2 +- examples/browser/image-processing.html | 4 +- examples/node/download-pdf.js | 2 +- examples/node/scheduler.js | 26 ++ src/constants/imageType.js | 2 +- src/createWorker.js | 13 +- src/index.d.ts | 28 +- src/utils/circularize.js | 36 +-- src/worker-script/constants/defaultOutput.js | 17 ++ src/worker-script/index.js | 135 +++------ src/worker-script/utils/dump.js | 274 ++++++++++--------- 12 files changed, 273 insertions(+), 267 deletions(-) create mode 100755 examples/node/scheduler.js create mode 100644 src/worker-script/constants/defaultOutput.js diff --git a/examples/browser/basic.html b/examples/browser/basic.html index f9e64fb..8fee3b6 100644 --- a/examples/browser/basic.html +++ b/examples/browser/basic.html @@ -8,6 +8,7 @@ const recognize = async ({ target: { files } }) => { const { data: { text } } = await Tesseract.recognize(files[0], 'eng', { corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js', + workerPath: "/dist/worker.dev.js", logger: m => console.log(m), }); console.log(text); diff --git a/examples/browser/download-pdf.html b/examples/browser/download-pdf.html index e562795..0485df9 100644 --- a/examples/browser/download-pdf.html +++ b/examples/browser/download-pdf.html @@ -21,7 +21,7 @@ const recognize = async ({ target: { files } }) => { await worker.loadLanguage('eng'); await worker.initialize('eng'); - const res = await worker.recognize(files[0], {savePDF: true}); + const res = await worker.recognize(files[0],{pdfTitle: "Example PDF"},{pdf: true}); pdf = res.data.pdf; const text = res.data.text; const board = document.getElementById('board'); diff --git a/examples/browser/image-processing.html b/examples/browser/image-processing.html index 
17ad2d7..3f1a845 100644 --- a/examples/browser/image-processing.html +++ b/examples/browser/image-processing.html @@ -45,8 +45,8 @@ await worker.initialize('eng'); await worker.initialize(); - const ret = await worker.recognize(files[0], { saveImageOriginal: true, saveImageGrey: true, saveImageBinary: true, rotateAuto: true }) - document.getElementById("imgOriginal").src = ret.data.imageOriginal; + const ret = await worker.recognize(files[0], {rotateAuto: true}, {imageColor: true, imageGrey: true, imageBinary: true}); + document.getElementById("imgOriginal").src = ret.data.imageColor; document.getElementById("imgGrey").src = ret.data.imageGrey; document.getElementById("imgBinary").src = ret.data.imageBinary; diff --git a/examples/node/download-pdf.js b/examples/node/download-pdf.js index 6a82216..3c45997 100755 --- a/examples/node/download-pdf.js +++ b/examples/node/download-pdf.js @@ -12,7 +12,7 @@ console.log(`Recognizing ${image}`); const worker = await createWorker(); await worker.loadLanguage('eng'); await worker.initialize('eng'); - const { data: { text, pdf } } = await worker.recognize(image, {savePDF: true}); + const { data: { text, pdf } } = await worker.recognize(image, {pdfTitle: "Example PDF"}, {pdf: true}); console.log(text); fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(pdf)); console.log('Generate PDF: tesseract-ocr-result.pdf'); diff --git a/examples/node/scheduler.js b/examples/node/scheduler.js new file mode 100755 index 0000000..0087158 --- /dev/null +++ b/examples/node/scheduler.js @@ -0,0 +1,26 @@ +const { createWorker, createScheduler } = require('../../'); + +const scheduler = createScheduler(); + +// Creates worker and adds to scheduler +const workerGen = async () => { + const worker = await createWorker({cachePath: "."}); + await worker.load(); + await worker.loadLanguage('eng'); + await worker.initialize('eng'); + scheduler.addWorker(worker); +} + +const workerN = 4; +(async () => { + const resArr = Array(workerN); + for (let i=0; i<workerN; i++) { + resArr[i] = workerGen(); + } + await Promise.all(resArr); + /** Add 10 recognition jobs */ + const results = await Promise.all(Array(10).fill(0).map(() => ( 
+ scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png').then((x) => console.log(x.data.text)) + ))) + await scheduler.terminate(); // It also terminates all workers. +})(); \ No newline at end of file diff --git a/src/constants/imageType.js b/src/constants/imageType.js index e21ccfe..df5cb2a 100644 --- a/src/constants/imageType.js +++ b/src/constants/imageType.js @@ -1,5 +1,5 @@ module.exports = { - ORIGINAL: 0, + COLOR: 0, GREY: 1, BINARY: 2, }; diff --git a/src/createWorker.js b/src/createWorker.js index d78c8e8..2f04fe8 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -129,19 +129,11 @@ module.exports = async (_options = {}) => { })) ); - const recognize = async (image, opts = {}, jobId) => ( + const recognize = async (image, opts = {}, output = {blocks: true, text: true, hocr: true, tsv: true}, jobId) => ( startJob(createJob({ id: jobId, action: 'recognize', - payload: { image: await loadImage(image), options: opts }, - })) - ); - - const threshold = async (image, opts = {}, jobId) => ( - startJob(createJob({ - id: jobId, - action: 'threshold', - payload: { image: await loadImage(image), options: opts }, + payload: { image: await loadImage(image), options: opts, output }, })) ); @@ -215,7 +207,6 @@ module.exports = async (_options = {}) => { initialize, setParameters, recognize, - threshold, getPDF, detect, terminate, diff --git a/src/index.d.ts b/src/index.d.ts index 688302d..db8fb22 100644 --- a/src/index.d.ts +++ b/src/index.d.ts @@ -23,8 +23,7 @@ declare namespace Tesseract { initialize(langs?: string, oem?: OEM, jobId?: string): Promise setParameters(params: Partial, jobId?: string): Promise getImage(type: imageType): string - recognize(image: ImageLike, options?: Partial, jobId?: string): Promise - threshold(image: ImageLike, options?: Partial, jobId?: string): Promise + recognize(image: ImageLike, options?: Partial, output?: Partial, jobId?: string): Promise detect(image: ImageLike, jobId?: string): Promise 
terminate(jobId?: string): Promise getPDF(title?: string, textonly?: boolean, jobId?: string):Promise @@ -54,16 +53,25 @@ declare namespace Tesseract { tessjs_create_unlv: string tessjs_create_osd: string } + interface OutputFormats { + text: boolean; + blocks: boolean; + hocr: boolean; + tsv: boolean; + box: boolean; + unlv: boolean; + osd: boolean; + pdf: boolean; + imageColor: boolean; + imageGrey: boolean; + imageBinary: boolean; + } interface RecognizeOptions { rectangle: Rectangle - saveImageOriginal: boolean - saveImageGrey: boolean - saveImageBinary: boolean - savePDF: boolean pdfTitle: string pdfTextOnly: boolean rotateAuto: boolean - rotateRadians: float + rotateRadians: number } interface ConfigResult { jobId: string @@ -117,7 +125,7 @@ declare namespace Tesseract { RAW_LINE = '13' } const enum imageType { - ORIGINAL = 0, + COLOR = 0, GREY = 1, BINARY = 2 } @@ -218,7 +226,7 @@ declare namespace Tesseract { page: Page; } interface Page { - blocks: Block[]; + blocks: Block[] | null; confidence: number; lines: Line[]; oem: string; @@ -234,7 +242,7 @@ declare namespace Tesseract { box: string | null; unlv: string | null; sd: string | null; - imageOriginal: string | null; + imageColor: string | null; imageGrey: string | null; imageBinary: string | null; rotateRadians: number | null; diff --git a/src/utils/circularize.js b/src/utils/circularize.js index 55486da..89a4f0e 100644 --- a/src/utils/circularize.js +++ b/src/utils/circularize.js @@ -22,31 +22,33 @@ module.exports = (page) => { const words = []; const symbols = []; - page.blocks.forEach((block) => { - block.paragraphs.forEach((paragraph) => { - paragraph.lines.forEach((line) => { - line.words.forEach((word) => { - word.symbols.forEach((sym) => { - symbols.push({ - ...sym, page, block, paragraph, line, word, + if (page.blocks) { + page.blocks.forEach((block) => { + block.paragraphs.forEach((paragraph) => { + paragraph.lines.forEach((line) => { + line.words.forEach((word) => { + 
word.symbols.forEach((sym) => { + symbols.push({ + ...sym, page, block, paragraph, line, word, + }); + }); + words.push({ + ...word, page, block, paragraph, line, }); }); - words.push({ - ...word, page, block, paragraph, line, + lines.push({ + ...line, page, block, paragraph, }); }); - lines.push({ - ...line, page, block, paragraph, + paragraphs.push({ + ...paragraph, page, block, }); }); - paragraphs.push({ - ...paragraph, page, block, + blocks.push({ + ...block, page, }); }); - blocks.push({ - ...block, page, - }); - }); + } return { ...page, blocks, paragraphs, lines, words, symbols, diff --git a/src/worker-script/constants/defaultOutput.js b/src/worker-script/constants/defaultOutput.js new file mode 100644 index 0000000..3fec901 --- /dev/null +++ b/src/worker-script/constants/defaultOutput.js @@ -0,0 +1,17 @@ +/* + * default output formats for tesseract.js + */ + +module.exports = { + text: true, + blocks: true, + hocr: true, + tsv: true, + box: false, + unlv: false, + osd: false, + pdf: false, + imageColor: false, + imageGrey: false, + imageBinary: false +}; diff --git a/src/worker-script/index.js b/src/worker-script/index.js index c30f0a0..0bac9ba 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -14,8 +14,8 @@ const dump = require('./utils/dump'); const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker'; const setImage = require('./utils/setImage'); const defaultParams = require('./constants/defaultParams'); +const defaultOutput = require('./constants/defaultOutput'); const { log, setLogging } = require('../utils/log'); -const arrayBufferToBase64 = require('./utils/arrayBufferToBase64'); const imageType = require('../constants/imageType'); const PSM = require('../constants/PSM'); @@ -214,23 +214,44 @@ const getPDF = async ({ payload: { title, textonly } }, res) => { res.resolve(getPDFInternal(title, textonly)); }; -const getImage = (type) => { - api.WriteImage(type, '/image.png'); - const pngBuffer = 
TessModule.FS.readFile('/image.png'); - const pngStr = `data:image/png;base64,${arrayBufferToBase64(pngBuffer.buffer)}`; - TessModule.FS.unlink('/image.png'); - return pngStr; -}; +// Combines default output with user-specified options and +// counts the enabled output formats that require OCR (all except the image outputs) +const processOutput = (output) => { + const workingOutput = JSON.parse(JSON.stringify(defaultOutput)); + // Output formats were set using `setParameters` in previous versions + // These settings are copied over for compatibility + if (params.tessjs_create_box === "1") workingOutput.box = true; + if (params.tessjs_create_hocr === "1") workingOutput.hocr = true; + if (params.tessjs_create_osd === "1") workingOutput.osd = true; + if (params.tessjs_create_tsv === "1") workingOutput.tsv = true; + if (params.tessjs_create_unlv === "1") workingOutput.unlv = true; + + const nonRecOutputs = ["imageColor", "imageGrey", "imageBinary"]; + let recOutputCount = 0; + for (const prop in output) { + workingOutput[prop] = output[prop]; + } + for (const prop in workingOutput) { + if (workingOutput[prop]) { + if (!nonRecOutputs.includes(prop)) { + recOutputCount++; + } + } + } + return {workingOutput, recOutputCount} +} const recognize = async ({ payload: { image, options: { - rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, savePDF, pdfTitle, + rectangle: rec, pdfTitle, pdfTextOnly, rotateAuto, rotateRadians, - }, + }, output }, }, res) => { try { + const {workingOutput, recOutputCount} = processOutput(output); + // When the auto-rotate option is True, setImage is called with no angle, // then the angle is calculated by Tesseract and then setImage is re-called. // Otherwise, setImage is called once using the user-provided rotateRadiansFinal value. 
@@ -274,96 +295,14 @@ const recognize = async ({ if (typeof rec === 'object') { api.SetRectangle(rec.left, rec.top, rec.width, rec.height); } - api.Recognize(null); - const result = dump(TessModule, api, params); - if (saveImageOriginal) { - result.imageOriginal = getImage(imageType.ORIGINAL); - } else { - result.imageOriginal = null; - } - if (saveImageGrey) { - result.imageGrey = getImage(imageType.GREY); - } else { - result.imageGrey = null; - } - if (saveImageBinary) { - result.imageBinary = getImage(imageType.BINARY); - } else { - result.imageBinary = null; - } - if (savePDF) { - result.pdf = getPDFInternal(pdfTitle ?? 'Tesseract OCR Result', pdfTextOnly ?? false); - } else { - result.pdf = null; - } - result.rotateRadians = rotateRadiansFinal; - res.resolve(result); - TessModule._free(ptr); - } catch (err) { - res.reject(err.toString()); - } -}; -// `threshold` is similar to `recognize` except it skips the recognition step -// Useful for getting rotated/binarized images without running recognition -const threshold = async ({ - payload: { - image, options: { - rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, rotateAuto, rotateRadians, - }, - }, -}, res) => { - try { - let ptr; - let rotateRadiansFinal; - if (rotateAuto) { - const psmInit = api.GetPageSegMode(); - let psmEdit = false; - if (![PSM.AUTO, PSM.AUTO_ONLY, PSM.OSD].includes(psmInit)) { - psmEdit = true; - api.SetVariable('tessedit_pageseg_mode', String(PSM.AUTO)); - } - - ptr = setImage(TessModule, api, image); - api.FindLines(); - const rotateRadiansCalc = api.GetAngle(); - - // Restore user-provided PSM setting - if (psmEdit) { - api.SetVariable('tessedit_pageseg_mode', String(psmInit)); - } - - // Small angles (<0.005 radians/~0.3 degrees) are ignored to save on runtime - if (Math.abs(rotateRadiansCalc) >= 0.005) { - rotateRadiansFinal = rotateRadiansCalc; - ptr = setImage(TessModule, api, image, rotateRadiansFinal); - } else { - rotateRadiansFinal = 0; - } - } else { - 
rotateRadiansFinal = rotateRadians || 0; - ptr = setImage(TessModule, api, image, rotateRadiansFinal); - } - - if (typeof rec === 'object') { - api.SetRectangle(rec.left, rec.top, rec.width, rec.height); - } - const result = {}; - if (saveImageOriginal) { - result.imageOriginal = getImage(imageType.ORIGINAL); - } else { - result.imageOriginal = null; - } - if (saveImageGrey) { - result.imageGrey = getImage(imageType.GREY); - } else { - result.imageGrey = null; - } - if (saveImageBinary) { - result.imageBinary = getImage(imageType.BINARY); + if (recOutputCount > 0) { + api.Recognize(null); } else { - result.imageBinary = null; + log(`Skipping recognition: all output options requiring recognition are disabled.`); } + + const result = dump(TessModule, api, workingOutput, {pdfTitle, pdfTextOnly}); result.rotateRadians = rotateRadiansFinal; res.resolve(result); TessModule._free(ptr); @@ -372,6 +311,7 @@ const threshold = async ({ } }; + const detect = async ({ payload: { image } }, res) => { try { const ptr = setImage(TessModule, api, image); @@ -451,7 +391,6 @@ exports.dispatchHandlers = (packet, send) => { initialize, setParameters, recognize, - threshold, getPDF, detect, terminate, diff --git a/src/worker-script/utils/dump.js b/src/worker-script/utils/dump.js index 77b318e..ce44f1c 100644 --- a/src/worker-script/utils/dump.js +++ b/src/worker-script/utils/dump.js @@ -7,6 +7,8 @@ * @author Guillermo Webster * @author Jerome Wu */ +const arrayBufferToBase64 = require('./arrayBufferToBase64'); +const imageType = require('../../constants/imageType'); /** * deindent @@ -37,13 +39,7 @@ const deindent = (html) => { * @function dump recognition result to a JSON object * @access public */ -module.exports = (TessModule, api, { - tessjs_create_hocr, - tessjs_create_tsv, - tessjs_create_box, - tessjs_create_unlv, - tessjs_create_osd, -}) => { +module.exports = (TessModule, api, output, options) => { const ri = api.GetIterator(); const { RIL_BLOCK, @@ -65,135 +61,161 @@ 
module.exports = (TessModule, api, { .map((e) => e.slice(prefix.length + 1))[0] ); - ri.Begin(); - do { - if (ri.IsAtBeginningOf(RIL_BLOCK)) { - const poly = ri.BlockPolygon(); - let polygon = null; - // BlockPolygon() returns null when automatic page segmentation is off - if (TessModule.getPointer(poly) > 0) { - const n = poly.get_n(); - const px = poly.get_x(); - const py = poly.get_y(); - polygon = []; - for (let i = 0; i < n; i += 1) { - polygon.push([px.getValue(i), py.getValue(i)]); + const getImage = (type) => { + api.WriteImage(type, '/image.png'); + const pngBuffer = TessModule.FS.readFile('/image.png'); + const pngStr = `data:image/png;base64,${arrayBufferToBase64(pngBuffer.buffer)}`; + TessModule.FS.unlink('/image.png'); + return pngStr; + }; + + const getPDFInternal = (title, textonly) => { + const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly); + pdfRenderer.BeginDocument(title); + pdfRenderer.AddImage(api); + pdfRenderer.EndDocument(); + TessModule._free(pdfRenderer); + + return TessModule.FS.readFile('/tesseract-ocr.pdf'); + }; + + if (output.blocks) { + + ri.Begin(); + do { + if (ri.IsAtBeginningOf(RIL_BLOCK)) { + const poly = ri.BlockPolygon(); + let polygon = null; + // BlockPolygon() returns null when automatic page segmentation is off + if (TessModule.getPointer(poly) > 0) { + const n = poly.get_n(); + const px = poly.get_x(); + const py = poly.get_y(); + polygon = []; + for (let i = 0; i < n; i += 1) { + polygon.push([px.getValue(i), py.getValue(i)]); + } + /* + * TODO: find out why _ptaDestroy doesn't work + */ + // TessModule._ptaDestroy(TessModule.getPointer(poly)); } - /* - * TODO: find out why _ptaDestroy doesn't work - */ - // TessModule._ptaDestroy(TessModule.getPointer(poly)); + + block = { + paragraphs: [], + text: ri.GetUTF8Text(RIL_BLOCK), + confidence: ri.Confidence(RIL_BLOCK), + baseline: ri.getBaseline(RIL_BLOCK), + bbox: ri.getBoundingBox(RIL_BLOCK), + blocktype: enumToString(ri.BlockType(), 'PT'), + 
polygon, + }; + blocks.push(block); + } + if (ri.IsAtBeginningOf(RIL_PARA)) { + para = { + lines: [], + text: ri.GetUTF8Text(RIL_PARA), + confidence: ri.Confidence(RIL_PARA), + baseline: ri.getBaseline(RIL_PARA), + bbox: ri.getBoundingBox(RIL_PARA), + is_ltr: !!ri.ParagraphIsLtr(), + }; + block.paragraphs.push(para); + } + if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { + textline = { + words: [], + text: ri.GetUTF8Text(RIL_TEXTLINE), + confidence: ri.Confidence(RIL_TEXTLINE), + baseline: ri.getBaseline(RIL_TEXTLINE), + bbox: ri.getBoundingBox(RIL_TEXTLINE), + }; + para.lines.push(textline); } + if (ri.IsAtBeginningOf(RIL_WORD)) { + const fontInfo = ri.getWordFontAttributes(); + const wordDir = ri.WordDirection(); + word = { + symbols: [], + choices: [], - block = { - paragraphs: [], - text: ri.GetUTF8Text(RIL_BLOCK), - confidence: ri.Confidence(RIL_BLOCK), - baseline: ri.getBaseline(RIL_BLOCK), - bbox: ri.getBoundingBox(RIL_BLOCK), - blocktype: enumToString(ri.BlockType(), 'PT'), - polygon, - }; - blocks.push(block); - } - if (ri.IsAtBeginningOf(RIL_PARA)) { - para = { - lines: [], - text: ri.GetUTF8Text(RIL_PARA), - confidence: ri.Confidence(RIL_PARA), - baseline: ri.getBaseline(RIL_PARA), - bbox: ri.getBoundingBox(RIL_PARA), - is_ltr: !!ri.ParagraphIsLtr(), - }; - block.paragraphs.push(para); - } - if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { - textline = { - words: [], - text: ri.GetUTF8Text(RIL_TEXTLINE), - confidence: ri.Confidence(RIL_TEXTLINE), - baseline: ri.getBaseline(RIL_TEXTLINE), - bbox: ri.getBoundingBox(RIL_TEXTLINE), - }; - para.lines.push(textline); - } - if (ri.IsAtBeginningOf(RIL_WORD)) { - const fontInfo = ri.getWordFontAttributes(); - const wordDir = ri.WordDirection(); - word = { - symbols: [], - choices: [], + text: ri.GetUTF8Text(RIL_WORD), + confidence: ri.Confidence(RIL_WORD), + baseline: ri.getBaseline(RIL_WORD), + bbox: ri.getBoundingBox(RIL_WORD), - text: ri.GetUTF8Text(RIL_WORD), - confidence: ri.Confidence(RIL_WORD), - baseline: 
ri.getBaseline(RIL_WORD), - bbox: ri.getBoundingBox(RIL_WORD), + is_numeric: !!ri.WordIsNumeric(), + in_dictionary: !!ri.WordIsFromDictionary(), + direction: enumToString(wordDir, 'DIR'), + language: ri.WordRecognitionLanguage(), - is_numeric: !!ri.WordIsNumeric(), - in_dictionary: !!ri.WordIsFromDictionary(), - direction: enumToString(wordDir, 'DIR'), - language: ri.WordRecognitionLanguage(), + is_bold: fontInfo.is_bold, + is_italic: fontInfo.is_italic, + is_underlined: fontInfo.is_underlined, + is_monospace: fontInfo.is_monospace, + is_serif: fontInfo.is_serif, + is_smallcaps: fontInfo.is_smallcaps, + font_size: fontInfo.pointsize, + font_id: fontInfo.font_id, + font_name: fontInfo.font_name, + }; + const wc = new TessModule.WordChoiceIterator(ri); + do { + word.choices.push({ + text: wc.GetUTF8Text(), + confidence: wc.Confidence(), + }); + } while (wc.Next()); + TessModule.destroy(wc); + textline.words.push(word); + } - is_bold: fontInfo.is_bold, - is_italic: fontInfo.is_italic, - is_underlined: fontInfo.is_underlined, - is_monospace: fontInfo.is_monospace, - is_serif: fontInfo.is_serif, - is_smallcaps: fontInfo.is_smallcaps, - font_size: fontInfo.pointsize, - font_id: fontInfo.font_id, - font_name: fontInfo.font_name, - }; - const wc = new TessModule.WordChoiceIterator(ri); - do { - word.choices.push({ - text: wc.GetUTF8Text(), - confidence: wc.Confidence(), - }); - } while (wc.Next()); - TessModule.destroy(wc); - textline.words.push(word); - } + // let image = null; + // var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL) + // var image = pix2array(pix); + // // for some reason it seems that things stop working if you destroy pics + // TessModule._pixDestroy(TessModule.getPointer(pix)); + if (ri.IsAtBeginningOf(RIL_SYMBOL)) { + symbol = { + choices: [], + image: null, + text: ri.GetUTF8Text(RIL_SYMBOL), + confidence: ri.Confidence(RIL_SYMBOL), + baseline: ri.getBaseline(RIL_SYMBOL), + bbox: ri.getBoundingBox(RIL_SYMBOL), + is_superscript: 
!!ri.SymbolIsSuperscript(), + is_subscript: !!ri.SymbolIsSubscript(), + is_dropcap: !!ri.SymbolIsDropcap(), + }; + word.symbols.push(symbol); + const ci = new TessModule.ChoiceIterator(ri); + do { + symbol.choices.push({ + text: ci.GetUTF8Text(), + confidence: ci.Confidence(), + }); + } while (ci.Next()); + // TessModule.destroy(i); + } + } while (ri.Next(RIL_SYMBOL)); + TessModule.destroy(ri); - // let image = null; - // var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL) - // var image = pix2array(pix); - // // for some reason it seems that things stop working if you destroy pics - // TessModule._pixDestroy(TessModule.getPointer(pix)); - if (ri.IsAtBeginningOf(RIL_SYMBOL)) { - symbol = { - choices: [], - image: null, - text: ri.GetUTF8Text(RIL_SYMBOL), - confidence: ri.Confidence(RIL_SYMBOL), - baseline: ri.getBaseline(RIL_SYMBOL), - bbox: ri.getBoundingBox(RIL_SYMBOL), - is_superscript: !!ri.SymbolIsSuperscript(), - is_subscript: !!ri.SymbolIsSubscript(), - is_dropcap: !!ri.SymbolIsDropcap(), - }; - word.symbols.push(symbol); - const ci = new TessModule.ChoiceIterator(ri); - do { - symbol.choices.push({ - text: ci.GetUTF8Text(), - confidence: ci.Confidence(), - }); - } while (ci.Next()); - // TessModule.destroy(i); - } - } while (ri.Next(RIL_SYMBOL)); - TessModule.destroy(ri); + } return { - text: api.GetUTF8Text(), - hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, - tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null, - box: tessjs_create_box === '1' ? api.GetBoxText() : null, - unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null, - osd: tessjs_create_osd === '1' ? api.GetOsdText() : null, + text: output.text ? api.GetUTF8Text() : null, + hocr: output.hocr ? deindent(api.GetHOCRText()) : null, + tsv: output.tsv ? api.GetTSVText() : null, + box: output.box ? api.GetBoxText() : null, + unlv: output.unlv ? api.GetUNLVText() : null, + osd: output.osd ? api.GetOsdText() : null, + pdf: output.pdf ? getPDFInternal(options.pdfTitle ?? 
'Tesseract OCR Result', options.pdfTextOnly ?? false) : null, + imageColor: output.imageColor ? getImage(imageType.COLOR) : null, + imageGrey: output.imageGrey ? getImage(imageType.GREY) : null, + imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null, confidence: api.MeanTextConf(), - blocks, + blocks: output.blocks ? blocks : null, psm: enumToString(api.GetPageSegMode(), 'PSM'), oem: enumToString(api.oem(), 'OEM'), version: api.Version(),