Browse Source

Add tessedit_char_whitelist test case

develop
Jerome Wu 5 years ago
parent
commit
b5d0651698
  1. 1
      docs/api.md
  2. 22
      examples/node/preserve-interword-spaces.js
  3. 1
      src/worker-script/constants/defaultParams.js
  4. 2
      tests/constants.js
  5. 9
      tests/recognize.test.js

1
docs/api.md

@ -148,6 +148,7 @@ Worker.setParameters() set parameters for Tesseract API (using SetVariable()), i @@ -148,6 +148,7 @@ Worker.setParameters() set parameters for Tesseract API (using SetVariable()), i
| tessedit\_ocr\_engine\_mode | enum | OEM.LSTM\_ONLY | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode |
| tessedit\_char\_whitelist | string | '' | setting whitelist characters makes the result contain only these characters, useful when the content of the image is limited |
| preserve\_interword\_spaces | string | '0' | '0' or '1', keeps the space between words |
| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result |
| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |

22
examples/node/preserve-interword-spaces.js

@ -1,22 +0,0 @@ @@ -1,22 +0,0 @@
#!/usr/bin/env node
const path = require('path');
const fs = require('fs');
const { createWorker } = require('../../');
// Example: recognize an image with `preserve_interword_spaces` enabled and
// print the recognized text as JSON.
//
// Usage: preserve-interword-spaces.js [imagePath]
// The optional image path is resolved relative to this script's directory and
// falls back to the bundled bill.png test asset.
const [,, imagePath] = process.argv;
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/bill.png'));

console.log(`Recognizing ${image}`);

(async () => {
  const worker = createWorker();
  try {
    await worker.load();
    await worker.loadLanguage('eng');
    await worker.initialize('eng');
    // '1' keeps the spaces between words in the recognized text.
    await worker.setParameters({
      preserve_interword_spaces: '1',
    });
    const { data: { text } } = await worker.recognize(image);
    console.log(JSON.stringify({ text }));
  } finally {
    // Always release the worker, even when one of the steps above throws;
    // otherwise it is never terminated.
    await worker.terminate();
  }
})().catch((err) => {
  // Report the failure instead of leaving a floating promise rejection,
  // and signal it to the caller through a non-zero exit code.
  console.error(err);
  process.exitCode = 1;
});

1
src/worker-script/constants/defaultParams.js

@ -6,7 +6,6 @@ const PSM = require('../../constants/PSM'); @@ -6,7 +6,6 @@ const PSM = require('../../constants/PSM');
module.exports = {
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '',
user_defined_dpi: '300',
tessjs_create_hocr: '1',
tessjs_create_tsv: '1',
tessjs_create_box: '0',

2
tests/constants.js

File diff suppressed because one or more lines are too long

9
tests/recognize.test.js

@ -90,6 +90,15 @@ describe('recognize()', () => { @@ -90,6 +90,15 @@ describe('recognize()', () => {
const { data: { text } } = await worker.recognize(`${IMAGE_PATH}/bill.png`);
expect(text).to.be(BILL_SPACED_TEXT);
}).timeout(TIMEOUT);
// With the whitelist set to 'Tess', recognizing simple.png is expected to
// yield exactly SIMPLE_WHITELIST_TEXT.
it('support tessedit_char_whitelist', async () => {
  await worker.initialize('eng');

  const whitelistParams = { tessedit_char_whitelist: 'Tess' };
  await worker.setParameters(whitelistParams);

  const result = await worker.recognize(`${IMAGE_PATH}/simple.png`);
  expect(result.data.text).to.be(SIMPLE_WHITELIST_TEXT);
}).timeout(TIMEOUT);
});
describe('should support all page seg modes', () => {

Loading…
Cancel
Save