From af115e4940493b44814c95756319bfa895db7a59 Mon Sep 17 00:00:00 2001 From: Jerome Wu Date: Sun, 26 May 2019 01:38:46 +0800 Subject: [PATCH] Make output of hocr, tsv, box, unlv, osd to be optional --- docs/tesseract_parameters.md | 7 ++++++- src/common/dump.js | 18 ++++++++++++------ src/common/options.js | 5 +++++ src/common/workerUtils.js | 15 ++++++++------- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/docs/tesseract_parameters.md b/docs/tesseract_parameters.md index 72fde6a..9d8105d 100644 --- a/docs/tesseract_parameters.md +++ b/docs/tesseract_parameters.md @@ -24,7 +24,12 @@ worker | tessedit\_ocr\_engine\_mode | enum | OEM.TESSERACT\_LSTM\_COMBINED | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode | | tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode | | tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contains these characters, useful the content in image is limited | -| tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js will generate a pdf output | +| tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js generates a pdf output | +| tessedit\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result | +| tessedit\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | +| tessedit\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result | +| tessedit\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result | +| tessedit\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result | | pdf\_name | string | 'tesseract.js-ocr-result' | the name of the generated pdf file | | pdf\_title | string | 'Tesseract.js OCR Result' | the title of the generated pdf file | | pdf\_auto\_download | boolean | true | If the value is true, tesseract.js will automatic download/writeFile pdf file | diff --git a/src/common/dump.js b/src/common/dump.js index ecd68aa..affc56d 100644 --- a/src/common/dump.js +++ b/src/common/dump.js @@ -42,7 +42,13 @@ const deindent = (html) => { * @param {object} api TesseractBaseAPI instance * @returns {object} dumpped JSON object */ -module.exports = (TessModule, api) => { +module.exports = (TessModule, api, { + tessedit_create_hocr, + tessedit_create_tsv, + tessedit_create_box, + tessedit_create_unlv, + tessedit_create_osd, +}) => { const ri = api.GetIterator(); const blocks = []; let block; @@ -177,11 +183,11 @@ module.exports = (TessModule, api) => { return { text: api.GetUTF8Text(), - hocr: deindent(api.GetHOCRText()), - tsv: api.GetTSVText(), - box: api.GetBoxText(), - unlv: api.GetUNLVText(), - osd: api.GetOsdText(), + hocr: tessedit_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, + tsv: tessedit_create_tsv === '1' ? api.GetTSVText() : null, + box: tessedit_create_box === '1' ? api.GetBoxText() : null, + unlv: tessedit_create_unlv === '1' ? api.GetUNLVText() : null, + osd: tessedit_create_osd === '1' ? api.GetOsdText() : null, confidence: api.MeanTextConf(), blocks, psm: enumToString(api.GetPageSegMode(), 'PSM'), diff --git a/src/common/options.js b/src/common/options.js index b9c17ce..53f20c2 100644 --- a/src/common/options.js +++ b/src/common/options.js @@ -17,6 +17,11 @@ module.exports = { tessedit_pageseg_mode: PSM.SINGLE_BLOCK, tessedit_char_whiltelist: '', tessedit_create_pdf: '0', + tessedit_create_hocr: '1', + tessedit_create_tsv: '1', + tessedit_create_box: '0', + tessedit_create_unlv: '0', + tessedit_create_osd: '0', textonly_pdf: '0', pdf_name: 'tesseract.js-ocr-result', pdf_title: 'Tesseract.js OCR Result', diff --git a/src/common/workerUtils.js b/src/common/workerUtils.js index 43aeb74..d74af60 100644 --- a/src/common/workerUtils.js +++ b/src/common/workerUtils.js @@ -67,14 +67,11 @@ const getLangsStr = langs => ( * @param {string} langs - lang string for Init() * @param {object} customParams - an object of params */ -const handleParams = (langs, customParams) => { +const handleParams = (langs, iParams) => { const { tessedit_ocr_engine_mode, ...params - } = { - ...defaultParams, - ...customParams, - }; + } = iParams; api.Init(null, getLangsStr(langs), tessedit_ocr_engine_mode); Object.keys(params).forEach((key) => { api.SetVariable(key, params[key]); @@ -191,7 +188,7 @@ const loadLanguage = ({ langs, options }, res) => { * @param {object} res - job instance */ const handleRecognize = ({ - image, langs, options, params, + image, langs, options, params: customParams, }, res) => ( handleInit(options, res) .then(() => ( @@ -211,6 +208,10 @@ const handleRecognize = ({ const progressUpdate = (progress) => { res.progress({ status: 'initializing api', progress }); }; + const params = { + ...defaultParams, + ...customParams, + }; progressUpdate(0); handleParams(langs, params); progressUpdate(0.5); @@ -218,7 +219,7 @@ const handleRecognize = ({ progressUpdate(1); api.Recognize(null); const files = handleOutput(params); - const result = dump(TessModule, api); + const result = dump(TessModule, api, params); api.End(); TessModule._free(ptr); res.resolve({ files, ...result });