diff --git a/docs/examples.md b/docs/examples.md index 30aae38..cc08942 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -124,7 +124,7 @@ worker }); ``` -### with pdf output (^2.0.0-alpha.7) +### with pdf output (^2.0.0-alpha.12) In this example, pdf file will be downloaded in browser and write to file system in Node.js @@ -139,7 +139,7 @@ worker 'https://tesseract.projectnaptha.com/img/eng_bw.png', 'eng', { - 'tessedit_create_pdf': '1', + 'tessjs_create_pdf': '1', } ) .progress((p) => { @@ -164,9 +164,9 @@ worker 'https://tesseract.projectnaptha.com/img/eng_bw.png', 'eng', { - 'tessedit_create_pdf': '1', - 'pdf_auto_download': false, // disable auto download - 'pdf_bin': true, // add pdf file bin array in result + 'tessjs_create_pdf': '1', + 'tessjs_pdf_auto_download': false, // disable auto download + 'tessjs_pdf_bin': true, // add pdf file bin array in result } ) .progress((p) => { @@ -198,3 +198,31 @@ loadLang({ langs: 'eng', langPath: worker.options.langPath }) }); ``` + +### with only part of the image (^2.0.0-alpha.12) + +```javascript +import Tesseract from 'tesseract.js'; + +const { TesseractWorker } = Tesseract; +const worker = new TesseractWorker(); + +worker + .recognize( + 'https://tesseract.projectnaptha.com/img/eng_bw.png', + 'eng', + { + tessjs_image_rectangle_left: 0, + tessjs_image_rectangle_top: 0, + tessjs_image_rectangle_width: 500, + tessjs_image_rectangle_height: 250, + } + ) + .progress((p) => { + console.log('progress', p); + }) + .then(({ text }) => { + console.log(text); + worker.terminate(); + }); +``` diff --git a/docs/tesseract_parameters.md b/docs/tesseract_parameters.md index 1dbe13a..6b6d598 100644 --- a/docs/tesseract_parameters.md +++ b/docs/tesseract_parameters.md @@ -24,14 +24,17 @@ worker | tessedit\_ocr\_engine\_mode | enum | OEM.LSTM\_ONLY | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode | | tessedit\_pageseg\_mode | enum | 
PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode | | tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contains these characters, useful the content in image is limited | -| tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js generates a pdf output | -| tessedit\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result | -| tessedit\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | -| tessedit\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result | -| tessedit\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result | -| tessedit\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result | -| pdf\_name | string | 'tesseract.js-ocr-result' | the name of the generated pdf file | -| pdf\_title | string | 'Tesseract.js OCR Result' | the title of the generated pdf file | -| pdf\_auto\_download | boolean | true | If the value is true, tesseract.js will automatic download/writeFile pdf file | -| pdf\_bin | boolean | false | whether to include pdf binary array in the result object (result.files.pdf) | - +| tessjs\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js generates a pdf output | +| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result | +| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | +| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when 
the value is '1', tesseract.js includes box in the result | +| tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result | +| tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result | +| tessjs\_pdf\_name | string | 'tesseract.js-ocr-result' | the name of the generated pdf file | +| tessjs\_pdf\_title | string | 'Tesseract.js OCR Result' | the title of the generated pdf file | +| tessjs\_pdf\_auto\_download | boolean | true | If the value is true, tesseract.js will automatically download the pdf file (browser) or write it to the file system (Node.js) | +| tessjs\_pdf\_bin | boolean | false | whether to include pdf binary array in the result object (result.files.pdf) | +| tessjs\_image\_rectangle\_left | number | 0 | The left of the sub-rectangle of the image. | +| tessjs\_image\_rectangle\_top | number | 0 | The top of the sub-rectangle of the image. | +| tessjs\_image\_rectangle\_width | number | -1 | The width of the sub-rectangle of the image, -1 means auto width detection | +| tessjs\_image\_rectangle\_height | number | -1 | The height of the sub-rectangle of the image, -1 means auto height detection | diff --git a/src/common/dump.js b/src/common/dump.js index affc56d..4a4965f 100644 --- a/src/common/dump.js +++ b/src/common/dump.js @@ -43,11 +43,11 @@ const deindent = (html) => { * @returns {object} dumpped JSON object */ module.exports = (TessModule, api, { - tessedit_create_hocr, - tessedit_create_tsv, - tessedit_create_box, - tessedit_create_unlv, - tessedit_create_osd, + tessjs_create_hocr, + tessjs_create_tsv, + tessjs_create_box, + tessjs_create_unlv, + tessjs_create_osd, }) => { const ri = api.GetIterator(); const blocks = []; @@ -183,11 +183,11 @@ module.exports = (TessModule, api, { return { text: api.GetUTF8Text(), - hocr: tessedit_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, - tsv: tessedit_create_tsv === '1' ? 
api.GetTSVText() : null, - box: tessedit_create_box === '1' ? api.GetBoxText() : null, - unlv: tessedit_create_unlv === '1' ? api.GetUNLVText() : null, - osd: tessedit_create_osd === '1' ? api.GetOsdText() : null, + hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, + tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null, + box: tessjs_create_box === '1' ? api.GetBoxText() : null, + unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null, + osd: tessjs_create_osd === '1' ? api.GetOsdText() : null, confidence: api.MeanTextConf(), blocks, psm: enumToString(api.GetPageSegMode(), 'PSM'), diff --git a/src/common/options.js b/src/common/options.js index 83fbf33..36a9239 100644 --- a/src/common/options.js +++ b/src/common/options.js @@ -16,16 +16,20 @@ module.exports = { tessedit_ocr_engine_mode: OEM.LSTM_ONLY, tessedit_pageseg_mode: PSM.SINGLE_BLOCK, tessedit_char_whiltelist: '', - tessedit_create_pdf: '0', - tessedit_create_hocr: '1', - tessedit_create_tsv: '1', - tessedit_create_box: '0', - tessedit_create_unlv: '0', - tessedit_create_osd: '0', - textonly_pdf: '0', - pdf_name: 'tesseract.js-ocr-result', - pdf_title: 'Tesseract.js OCR Result', - pdf_auto_download: true, - pdf_bin: false, + tessjs_create_pdf: '0', + tessjs_create_hocr: '1', + tessjs_create_tsv: '1', + tessjs_create_box: '0', + tessjs_create_unlv: '0', + tessjs_create_osd: '0', + tessjs_textonly_pdf: '0', + tessjs_pdf_name: 'tesseract.js-ocr-result', + tessjs_pdf_title: 'Tesseract.js OCR Result', + tessjs_pdf_auto_download: true, + tessjs_pdf_bin: false, + tessjs_image_rectangle_left: 0, + tessjs_image_rectangle_top: 0, + tessjs_image_rectangle_width: -1, + tessjs_image_rectangle_height: -1, }, }; diff --git a/src/common/workerUtils.js b/src/common/workerUtils.js index 65692ab..296bf7a 100644 --- a/src/common/workerUtils.js +++ b/src/common/workerUtils.js @@ -32,7 +32,13 @@ let adapter = {}; * @param {array} image - binary array in array format * @returns {number} - an 
emscripten pointer of the image */ -const setImage = (image) => { +const setImage = (image, params) => { + const { + tessjs_image_rectangle_left: left, + tessjs_image_rectangle_top: top, + tessjs_image_rectangle_width: width, + tessjs_image_rectangle_height: height, + } = params; const { w, h, bytesPerPixel, data, pix, } = readImage(TessModule, Array.from(image)); @@ -48,7 +54,12 @@ const setImage = (image) => { } else { api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel); } - api.SetRectangle(0, 0, w, h); + api.SetRectangle( + (left < 0) ? 0 : left, + (top < 0) ? 0 : top, + (width < 0) ? w : width, + (height < 0) ? h : height, + ); return data === null ? pix : data; }; @@ -74,7 +85,9 @@ const handleParams = (langs, iParams) => { } = iParams; api.Init(null, getLangsStr(langs), tessedit_ocr_engine_mode); Object.keys(params).forEach((key) => { - api.SetVariable(key, params[key]); + if (!key.startsWith('tessjs')) { + api.SetVariable(key, params[key]); + } }); }; @@ -89,32 +102,32 @@ const handleParams = (langs, iParams) => { const handleOutput = (customParams) => { let files = {}; const { - tessedit_create_pdf, - textonly_pdf, - pdf_name, - pdf_title, - pdf_auto_download, - pdf_bin, + tessjs_create_pdf, + tessjs_textonly_pdf, + tessjs_pdf_name, + tessjs_pdf_title, + tessjs_pdf_auto_download, + tessjs_pdf_bin, } = { ...defaultParams, ...customParams, }; - if (tessedit_create_pdf === '1') { - const pdfRenderer = new TessModule.TessPDFRenderer(pdf_name, '/', textonly_pdf === '1'); - pdfRenderer.BeginDocument(pdf_title); + if (tessjs_create_pdf === '1') { + const pdfRenderer = new TessModule.TessPDFRenderer(tessjs_pdf_name, '/', tessjs_textonly_pdf === '1'); + pdfRenderer.BeginDocument(tessjs_pdf_title); pdfRenderer.AddImage(api); pdfRenderer.EndDocument(); TessModule._free(pdfRenderer); - const data = TessModule.FS.readFile(`/${pdf_name}.pdf`); + const data = TessModule.FS.readFile(`/${tessjs_pdf_name}.pdf`); - if (pdf_bin) { + if (tessjs_pdf_bin) { files = { pdf: 
data, ...files }; } - if (pdf_auto_download) { - adapter.writeFile(`${pdf_name}.pdf`, data, 'application/pdf'); + if (tessjs_pdf_auto_download) { + adapter.writeFile(`${tessjs_pdf_name}.pdf`, data, 'application/pdf'); } } @@ -216,7 +229,7 @@ const handleRecognize = ({ progressUpdate(0); handleParams(langs, params); progressUpdate(0.5); - const ptr = setImage(image); + const ptr = setImage(image, params); progressUpdate(1); api.Recognize(null); const files = handleOutput(params); @@ -244,7 +257,7 @@ const handleRecognize = ({ * @param {object} res - job instance */ const handleDetect = ({ - image, langs, options, + image, langs, options, params: customParams, }, res) => ( handleInit(options, res) .then(() => ( @@ -252,8 +265,12 @@ const handleDetect = ({ .then(() => { api.Init(null, getLangsStr(langs)); api.SetPageSegMode(TessModule.PSM_OSD_ONLY); + const params = { + ...defaultParams, + ...customParams, + }; - const ptr = setImage(image); + const ptr = setImage(image, params); const results = new TessModule.OSResults(); if (!api.DetectOS(results)) { diff --git a/tests/recognize.test.js b/tests/recognize.test.js index 3bd4dac..bf2c513 100644 --- a/tests/recognize.test.js +++ b/tests/recognize.test.js @@ -5,6 +5,7 @@ const SIMPLE_PNG = ' const SIMPLE_JPG = ''; const IMAGE_PATH = 'http://localhost:3000/tests/assets/images'; const SIMPLE_TEXT = 'Tesseract.js\n'; +const SIMPLE_TEXT_HALF = 'Tesse\n'; const COMSIC_TEXT = 'HellO World\nfrom beyond\nthe Cosmic Void\n'; const TESTOCR_TEXT = 'This is a lot of 12 point text to test the\nocr code and see if it works on all types\nof file format.\n\nThe quick brown dog jumped over the\nlazy fox. The quick brown dog jumped\nover the lazy fox. The quick brown dog\njumped over the lazy fox. 
The quick\nbrown dog jumped over the lazy fox.\n'; const CHINESE_TEXT = '繁 體 中 文 測 試\n'; @@ -57,6 +58,32 @@ describe('recognize()', () => { )); }); + describe('should recognize part of the image', () => { + [ + { + name: 'simple.png', left: 0, top: 0, width: 140, height: 180, ans: SIMPLE_TEXT_HALF, + }, + ].forEach(({ + name, left, top, width, height, ans, + }) => ( + it(`recongize half ${name}`, (done) => { + const worker = getWorker(); + worker + .recognize(`${IMAGE_PATH}/${name}`, 'eng', { + tessjs_image_rectangle_left: left, + tessjs_image_rectangle_top: top, + tessjs_image_rectangle_width: width, + tessjs_image_rectangle_height: height, + }) + .then(({ text }) => { + expect(text).to.be(ans); + worker.terminate(); + done(); + }); + }).timeout(30000) + )); + }); + describe('should be able to recognize multiple images with 1 worker', () => { [3, 10, 20].forEach(num => ( it(`recognize ${num} images with 1 worker`, (done) => { @@ -168,7 +195,7 @@ describe('recognize()', () => { }).timeout(10000) )); }); - + (isBrowser ? describe : describe.skip)('should read image from video DOM element (browser only)', () => { FORMATS.forEach(format => ( it(`support ${format} format`, (done) => {