diff --git a/docs/api.md b/docs/api.md index 8e35ccf..95c932a 100644 --- a/docs/api.md +++ b/docs/api.md @@ -148,6 +148,7 @@ Worker.setParameters() set parameters for Tesseract API (using SetVariable()), i | tessedit\_ocr\_engine\_mode | enum | OEM.LSTM\_ONLY | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode | | tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode | | tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contains these characters, useful the content in image is limited | +| preserve\_interword\_spaces | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js keeps the spaces between words in the result | | tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result | | tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | | tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result | diff --git a/examples/node/preserve-interword-spaces.js b/examples/node/preserve-interword-spaces.js deleted file mode 100755 index 8cf26b6..0000000 --- a/examples/node/preserve-interword-spaces.js +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env node -const path = require('path'); -const fs = require('fs'); -const { createWorker } = require('../../'); - -const [,, imagePath] = process.argv; -const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/bill.png')); - -console.log(`Recognizing ${image}`); - -(async () => { - const worker = createWorker(); - await worker.load(); - await worker.loadLanguage('eng'); - await worker.initialize('eng'); - await worker.setParameters({ - preserve_interword_spaces: '1', - }); - const { data: { text } } = 
await worker.recognize(image); - console.log(JSON.stringify({ text })); - await worker.terminate(); -})(); diff --git a/src/worker-script/constants/defaultParams.js b/src/worker-script/constants/defaultParams.js index 2e45308..e0e946d 100644 --- a/src/worker-script/constants/defaultParams.js +++ b/src/worker-script/constants/defaultParams.js @@ -6,7 +6,6 @@ const PSM = require('../../constants/PSM'); module.exports = { tessedit_pageseg_mode: PSM.SINGLE_BLOCK, tessedit_char_whiltelist: '', - user_defined_dpi: '300', tessjs_create_hocr: '1', tessjs_create_tsv: '1', tessjs_create_box: '0', diff --git a/tests/constants.js b/tests/constants.js index 50dbb1a..7f334eb 100644 --- a/tests/constants.js +++ b/tests/constants.js @@ -13,6 +13,7 @@ const COMSIC_TEXT = 'HellO World\nfrom beyond\nthe Cosmic Void\n'; const TESTOCR_TEXT = 'This is a lot of 12 point text to test the\nocr code and see if it works on all types\nof file format.\n\nThe quick brown dog jumped over the\nlazy fox. The quick brown dog jumped\nover the lazy fox. The quick brown dog\njumped over the lazy fox. 
The quick\nbrown dog jumped over the lazy fox.\n'; const CHINESE_TEXT = '繁 體 中 文 測 試\n'; const BILL_SPACED_TEXT = 'FIRST CHEQUING\n\nLine of Credit 100,000.00 Rate 4.2000\n\nDate Description Number Debits Credits Balance\n31Jul2018 Balance Forward 99,878.08 -\n01Aug2018 Clearing Cheque 4987 36.07 99,914.15 -\n01Aug2018 Clearing Cheque 4986 60.93 99,975.08 -\n01Aug2018 Clearing Cheque 4982 800.04 100,775.12 EX\n01Aug2018 Clearing Cheque 4981 82334 101,598.46 EX\n01Aug2018 Incoming Interac e-Transfer 1454 101,583.92 EX\n01Aug2018 Incoming Interac e-Transfer 400.00 101,183.92 EX\n01Aug2018 Assisted Deposit 3241450 68,769.42 -\n01Aug2018 Transfer out to loan 7 1,500.00 70,269.42 -\n02Aug2018 Clearing Cheque 4984 48.08 70,317.50 -\n02Aug2018 Clearing Cheque 4985 7051 70,388.01 -\n02Aug2018 Clearing Cheque 4992 500.00 70.888.01 -\n'; +const SIMPLE_WHITELIST_TEXT = 'Tesses\n'; const FORMATS = ['png', 'jpg', 'bmp', 'pbm']; const SIMPLE_PNG_BASE64 = ''; const SIMPLE_JPG_BASE64 = ''; @@ -26,6 +27,7 @@ if (typeof module !== 'undefined') { SIMPLE_JPG_BASE64, CHINESE_TEXT, SIMPLE_TEXT, + SIMPLE_WHITELIST_TEXT, SIMPLE_TEXT_HALF, COMSIC_TEXT, TESTOCR_TEXT, diff --git a/tests/recognize.test.js b/tests/recognize.test.js index 874ef26..6904489 100644 --- a/tests/recognize.test.js +++ b/tests/recognize.test.js @@ -90,6 +90,15 @@ describe('recognize()', () => { const { data: { text } } = await worker.recognize(`${IMAGE_PATH}/bill.png`); expect(text).to.be(BILL_SPACED_TEXT); }).timeout(TIMEOUT); + + it('support tessedit_char_whitelist', async () => { + await worker.initialize('eng'); + await worker.setParameters({ + tessedit_char_whitelist: 'Tess', + }); + const { data: { text } } = await worker.recognize(`${IMAGE_PATH}/simple.png`); + expect(text).to.be(SIMPLE_WHITELIST_TEXT); + }).timeout(TIMEOUT); }); describe('should support all page seg modes', () => {