Browse Source

Make output of hocr, tsv, box, unlv, osd to be optional

pull/288/head
Jerome Wu 6 years ago
parent
commit
af115e4940
  1. 7
      docs/tesseract_parameters.md
  2. 18
      src/common/dump.js
  3. 5
      src/common/options.js
  4. 15
      src/common/workerUtils.js

7
docs/tesseract_parameters.md

@ -24,7 +24,12 @@ worker
| tessedit\_ocr\_engine\_mode | enum | OEM.TESSERACT\_LSTM\_COMBINED | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode | | tessedit\_ocr\_engine\_mode | enum | OEM.TESSERACT\_LSTM\_COMBINED | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode | | tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode |
| tessedit\_char\_whitelist | string | '' | setting whitelist characters makes the result contain only these characters; useful when the content of the image is limited | | tessedit\_char\_whitelist | string | '' | setting whitelist characters makes the result contain only these characters; useful when the content of the image is limited |
| tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js will generate a pdf output | | tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js generates a pdf output |
| tessedit\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessedit\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result |
| tessedit\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |
| tessedit\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result |
| tessedit\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
| pdf\_name | string | 'tesseract.js-ocr-result' | the name of the generated pdf file | | pdf\_name | string | 'tesseract.js-ocr-result' | the name of the generated pdf file |
| pdf\_title | string | 'Tesseract.js OCR Result' | the title of the generated pdf file | | pdf\_title | string | 'Tesseract.js OCR Result' | the title of the generated pdf file |
| pdf\_auto\_download | boolean | true | If the value is true, tesseract.js will automatically download/writeFile the pdf file | | pdf\_auto\_download | boolean | true | If the value is true, tesseract.js will automatically download/writeFile the pdf file |

18
src/common/dump.js

@ -42,7 +42,13 @@ const deindent = (html) => {
* @param {object} api TesseractBaseAPI instance * @param {object} api TesseractBaseAPI instance
* @returns {object} dumped JSON object * @returns {object} dumped JSON object
*/ */
module.exports = (TessModule, api) => { module.exports = (TessModule, api, {
tessedit_create_hocr,
tessedit_create_tsv,
tessedit_create_box,
tessedit_create_unlv,
tessedit_create_osd,
}) => {
const ri = api.GetIterator(); const ri = api.GetIterator();
const blocks = []; const blocks = [];
let block; let block;
@ -177,11 +183,11 @@ module.exports = (TessModule, api) => {
return { return {
text: api.GetUTF8Text(), text: api.GetUTF8Text(),
hocr: deindent(api.GetHOCRText()), hocr: tessedit_create_hocr === '1' ? deindent(api.GetHOCRText()) : null,
tsv: api.GetTSVText(), tsv: tessedit_create_tsv === '1' ? api.GetTSVText() : null,
box: api.GetBoxText(), box: tessedit_create_box === '1' ? api.GetBoxText() : null,
unlv: api.GetUNLVText(), unlv: tessedit_create_unlv === '1' ? api.GetUNLVText() : null,
osd: api.GetOsdText(), osd: tessedit_create_osd === '1' ? api.GetOsdText() : null,
confidence: api.MeanTextConf(), confidence: api.MeanTextConf(),
blocks, blocks,
psm: enumToString(api.GetPageSegMode(), 'PSM'), psm: enumToString(api.GetPageSegMode(), 'PSM'),

5
src/common/options.js

@ -17,6 +17,11 @@ module.exports = {
tessedit_pageseg_mode: PSM.SINGLE_BLOCK, tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '', tessedit_char_whiltelist: '',
tessedit_create_pdf: '0', tessedit_create_pdf: '0',
tessedit_create_hocr: '1',
tessedit_create_tsv: '1',
tessedit_create_box: '0',
tessedit_create_unlv: '0',
tessedit_create_osd: '0',
textonly_pdf: '0', textonly_pdf: '0',
pdf_name: 'tesseract.js-ocr-result', pdf_name: 'tesseract.js-ocr-result',
pdf_title: 'Tesseract.js OCR Result', pdf_title: 'Tesseract.js OCR Result',

15
src/common/workerUtils.js

@ -67,14 +67,11 @@ const getLangsStr = langs => (
* @param {string} langs - lang string for Init() * @param {string} langs - lang string for Init()
* @param {object} customParams - an object of params * @param {object} customParams - an object of params
*/ */
const handleParams = (langs, customParams) => { const handleParams = (langs, iParams) => {
const { const {
tessedit_ocr_engine_mode, tessedit_ocr_engine_mode,
...params ...params
} = { } = iParams;
...defaultParams,
...customParams,
};
api.Init(null, getLangsStr(langs), tessedit_ocr_engine_mode); api.Init(null, getLangsStr(langs), tessedit_ocr_engine_mode);
Object.keys(params).forEach((key) => { Object.keys(params).forEach((key) => {
api.SetVariable(key, params[key]); api.SetVariable(key, params[key]);
@ -191,7 +188,7 @@ const loadLanguage = ({ langs, options }, res) => {
* @param {object} res - job instance * @param {object} res - job instance
*/ */
const handleRecognize = ({ const handleRecognize = ({
image, langs, options, params, image, langs, options, params: customParams,
}, res) => ( }, res) => (
handleInit(options, res) handleInit(options, res)
.then(() => ( .then(() => (
@ -211,6 +208,10 @@ const handleRecognize = ({
const progressUpdate = (progress) => { const progressUpdate = (progress) => {
res.progress({ status: 'initializing api', progress }); res.progress({ status: 'initializing api', progress });
}; };
const params = {
...defaultParams,
...customParams,
};
progressUpdate(0); progressUpdate(0);
handleParams(langs, params); handleParams(langs, params);
progressUpdate(0.5); progressUpdate(0.5);
@ -218,7 +219,7 @@ const handleRecognize = ({
progressUpdate(1); progressUpdate(1);
api.Recognize(null); api.Recognize(null);
const files = handleOutput(params); const files = handleOutput(params);
const result = dump(TessModule, api); const result = dump(TessModule, api, params);
api.End(); api.End();
TessModule._free(ptr); TessModule._free(ptr);
res.resolve({ files, ...result }); res.resolve({ files, ...result });

Loading…
Cancel
Save