Browse Source

Make output of hocr, tsv, box, unlv, osd to be optional

pull/288/head
Jerome Wu 6 years ago
parent
commit
af115e4940
  1. 7
      docs/tesseract_parameters.md
  2. 18
      src/common/dump.js
  3. 5
      src/common/options.js
  4. 15
      src/common/workerUtils.js

7
docs/tesseract_parameters.md

@ -24,7 +24,12 @@ worker @@ -24,7 +24,12 @@ worker
| tessedit\_ocr\_engine\_mode | enum | OEM.TESSERACT\_LSTM\_COMBINED | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode |
| tessedit\_char\_whitelist | string | '' | setting whitelist characters makes the result contain only these characters, useful when the content in the image is limited |
| tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js will generate a pdf output |
| tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js generates a pdf output |
| tessedit\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessedit\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result |
| tessedit\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |
| tessedit\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result |
| tessedit\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
| pdf\_name | string | 'tesseract.js-ocr-result' | the name of the generated pdf file |
| pdf\_title | string | 'Tesseract.js OCR Result' | the title of the generated pdf file |
| pdf\_auto\_download | boolean | true | If the value is true, tesseract.js will automatically download/writeFile the pdf file |

18
src/common/dump.js

@ -42,7 +42,13 @@ const deindent = (html) => { @@ -42,7 +42,13 @@ const deindent = (html) => {
* @param {object} api TesseractBaseAPI instance
* @returns {object} dumped JSON object
*/
module.exports = (TessModule, api) => {
module.exports = (TessModule, api, {
tessedit_create_hocr,
tessedit_create_tsv,
tessedit_create_box,
tessedit_create_unlv,
tessedit_create_osd,
}) => {
const ri = api.GetIterator();
const blocks = [];
let block;
@ -177,11 +183,11 @@ module.exports = (TessModule, api) => { @@ -177,11 +183,11 @@ module.exports = (TessModule, api) => {
return {
text: api.GetUTF8Text(),
hocr: deindent(api.GetHOCRText()),
tsv: api.GetTSVText(),
box: api.GetBoxText(),
unlv: api.GetUNLVText(),
osd: api.GetOsdText(),
hocr: tessedit_create_hocr === '1' ? deindent(api.GetHOCRText()) : null,
tsv: tessedit_create_tsv === '1' ? api.GetTSVText() : null,
box: tessedit_create_box === '1' ? api.GetBoxText() : null,
unlv: tessedit_create_unlv === '1' ? api.GetUNLVText() : null,
osd: tessedit_create_osd === '1' ? api.GetOsdText() : null,
confidence: api.MeanTextConf(),
blocks,
psm: enumToString(api.GetPageSegMode(), 'PSM'),

5
src/common/options.js

@ -17,6 +17,11 @@ module.exports = { @@ -17,6 +17,11 @@ module.exports = {
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '',
tessedit_create_pdf: '0',
tessedit_create_hocr: '1',
tessedit_create_tsv: '1',
tessedit_create_box: '0',
tessedit_create_unlv: '0',
tessedit_create_osd: '0',
textonly_pdf: '0',
pdf_name: 'tesseract.js-ocr-result',
pdf_title: 'Tesseract.js OCR Result',

15
src/common/workerUtils.js

@ -67,14 +67,11 @@ const getLangsStr = langs => ( @@ -67,14 +67,11 @@ const getLangsStr = langs => (
* @param {string} langs - lang string for Init()
* @param {object} customParams - an object of params
*/
const handleParams = (langs, customParams) => {
const handleParams = (langs, iParams) => {
const {
tessedit_ocr_engine_mode,
...params
} = {
...defaultParams,
...customParams,
};
} = iParams;
api.Init(null, getLangsStr(langs), tessedit_ocr_engine_mode);
Object.keys(params).forEach((key) => {
api.SetVariable(key, params[key]);
@ -191,7 +188,7 @@ const loadLanguage = ({ langs, options }, res) => { @@ -191,7 +188,7 @@ const loadLanguage = ({ langs, options }, res) => {
* @param {object} res - job instance
*/
const handleRecognize = ({
image, langs, options, params,
image, langs, options, params: customParams,
}, res) => (
handleInit(options, res)
.then(() => (
@ -211,6 +208,10 @@ const handleRecognize = ({ @@ -211,6 +208,10 @@ const handleRecognize = ({
const progressUpdate = (progress) => {
res.progress({ status: 'initializing api', progress });
};
const params = {
...defaultParams,
...customParams,
};
progressUpdate(0);
handleParams(langs, params);
progressUpdate(0.5);
@ -218,7 +219,7 @@ const handleRecognize = ({ @@ -218,7 +219,7 @@ const handleRecognize = ({
progressUpdate(1);
api.Recognize(null);
const files = handleOutput(params);
const result = dump(TessModule, api);
const result = dump(TessModule, api, params);
api.End();
TessModule._free(ptr);
res.resolve({ files, ...result });

Loading…
Cancel
Save