Browse Source

Add rectangle capabilities

pull/309/head
Jerome Wu 6 years ago
parent
commit
8bc9f64177
  1. 38
      docs/examples.md
  2. 25
      docs/tesseract_parameters.md
  3. 20
      src/common/dump.js
  4. 26
      src/common/options.js
  5. 55
      src/common/workerUtils.js
  6. 29
      tests/recognize.test.js

38
docs/examples.md

@ -124,7 +124,7 @@ worker
}); });
``` ```
### with pdf output (^2.0.0-alpha.7) ### with pdf output (^2.0.0-alpha.12)
In this example, the pdf file will be downloaded in the browser and written to the file system in Node.js In this example, the pdf file will be downloaded in the browser and written to the file system in Node.js
@ -139,7 +139,7 @@ worker
'https://tesseract.projectnaptha.com/img/eng_bw.png', 'https://tesseract.projectnaptha.com/img/eng_bw.png',
'eng', 'eng',
{ {
'tessedit_create_pdf': '1', 'tessjs_create_pdf': '1',
} }
) )
.progress((p) => { .progress((p) => {
@ -164,9 +164,9 @@ worker
'https://tesseract.projectnaptha.com/img/eng_bw.png', 'https://tesseract.projectnaptha.com/img/eng_bw.png',
'eng', 'eng',
{ {
'tessedit_create_pdf': '1', 'tessjs_create_pdf': '1',
'pdf_auto_download': false, // disable auto download 'tessjs_pdf_auto_download': false, // disable auto download
'pdf_bin': true, // add pdf file bin array in result 'tessjs_pdf_bin': true, // add pdf file bin array in result
} }
) )
.progress((p) => { .progress((p) => {
@ -198,3 +198,31 @@ loadLang({ langs: 'eng', langPath: worker.options.langPath })
}); });
``` ```
### with only part of the image (^2.0.0-alpha.12)
```javascript
// Example: run recognition on a sub-rectangle of the image instead of the whole image.
import Tesseract from 'tesseract.js';
const { TesseractWorker } = Tesseract;
const worker = new TesseractWorker();
worker
.recognize(
'https://tesseract.projectnaptha.com/img/eng_bw.png',
'eng',
{
// Region to recognize: a 500x250 rectangle whose top-left corner is at (0, 0).
tessjs_image_rectangle_left: 0,
tessjs_image_rectangle_top: 0,
tessjs_image_rectangle_width: 500,
tessjs_image_rectangle_height: 250,
}
)
.progress((p) => {
// Log each progress update reported for the job.
console.log('progress', p);
})
.then(({ text }) => {
console.log(text);
// Terminate the worker once the recognized text has been printed.
worker.terminate();
});
```

25
docs/tesseract_parameters.md

@ -24,14 +24,17 @@ worker
| tessedit\_ocr\_engine\_mode | enum | OEM.LSTM\_ONLY | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode | | tessedit\_ocr\_engine\_mode | enum | OEM.LSTM\_ONLY | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode | | tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode |
| tessedit\_char\_whitelist | string | '' | setting whitelist characters makes the result contain only these characters, useful when the content in the image is limited | | tessedit\_char\_whitelist | string | '' | setting whitelist characters makes the result contain only these characters, useful when the content in the image is limited |
| tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js generates a pdf output | | tessjs\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js generates a pdf output |
| tessedit\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result | | tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessedit\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | | tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result |
| tessedit\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result | | tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |
| tessedit\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result | | tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result |
| tessedit\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result | | tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
| pdf\_name | string | 'tesseract.js-ocr-result' | the name of the generated pdf file | | tessjs\_pdf\_name | string | 'tesseract.js-ocr-result' | the name of the generated pdf file |
| pdf\_title | string | 'Tesseract.js OCR Result' | the title of the generated pdf file | | tessjs\_pdf\_title | string | 'Tesseract.js OCR Result' | the title of the generated pdf file |
| pdf\_auto\_download | boolean | true | If the value is true, tesseract.js will automatically download the pdf file (browser) or write it to the file system (Node.js) | | tessjs\_pdf\_auto\_download | boolean | true | If the value is true, tesseract.js will automatically download the pdf file (browser) or write it to the file system (Node.js) |
| pdf\_bin | boolean | false | whether to include pdf binary array in the result object (result.files.pdf) | | tessjs\_pdf\_bin | boolean | false | whether to include pdf binary array in the result object (result.files.pdf) |
| tessjs\_image\_rectangle\_left | number | 0 | The left offset of the sub-rectangle of the image; negative values are treated as 0 |
| tessjs\_image\_rectangle\_top | number | 0 | The top offset of the sub-rectangle of the image; negative values are treated as 0 |
| tessjs\_image\_rectangle\_width | number | -1 | The width of the sub-rectangle of the image; a negative value (default -1) means the full image width is used |
| tessjs\_image\_rectangle\_height | number | -1 | The height of the sub-rectangle of the image; a negative value (default -1) means the full image height is used |

20
src/common/dump.js

@ -43,11 +43,11 @@ const deindent = (html) => {
* @returns {object} dumped JSON object * @returns {object} dumped JSON object
*/ */
module.exports = (TessModule, api, { module.exports = (TessModule, api, {
tessedit_create_hocr, tessjs_create_hocr,
tessedit_create_tsv, tessjs_create_tsv,
tessedit_create_box, tessjs_create_box,
tessedit_create_unlv, tessjs_create_unlv,
tessedit_create_osd, tessjs_create_osd,
}) => { }) => {
const ri = api.GetIterator(); const ri = api.GetIterator();
const blocks = []; const blocks = [];
@ -183,11 +183,11 @@ module.exports = (TessModule, api, {
return { return {
text: api.GetUTF8Text(), text: api.GetUTF8Text(),
hocr: tessedit_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null,
tsv: tessedit_create_tsv === '1' ? api.GetTSVText() : null, tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null,
box: tessedit_create_box === '1' ? api.GetBoxText() : null, box: tessjs_create_box === '1' ? api.GetBoxText() : null,
unlv: tessedit_create_unlv === '1' ? api.GetUNLVText() : null, unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null,
osd: tessedit_create_osd === '1' ? api.GetOsdText() : null, osd: tessjs_create_osd === '1' ? api.GetOsdText() : null,
confidence: api.MeanTextConf(), confidence: api.MeanTextConf(),
blocks, blocks,
psm: enumToString(api.GetPageSegMode(), 'PSM'), psm: enumToString(api.GetPageSegMode(), 'PSM'),

26
src/common/options.js

@ -16,16 +16,20 @@ module.exports = {
tessedit_ocr_engine_mode: OEM.LSTM_ONLY, tessedit_ocr_engine_mode: OEM.LSTM_ONLY,
tessedit_pageseg_mode: PSM.SINGLE_BLOCK, tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '', tessedit_char_whiltelist: '',
tessedit_create_pdf: '0', tessjs_create_pdf: '0',
tessedit_create_hocr: '1', tessjs_create_hocr: '1',
tessedit_create_tsv: '1', tessjs_create_tsv: '1',
tessedit_create_box: '0', tessjs_create_box: '0',
tessedit_create_unlv: '0', tessjs_create_unlv: '0',
tessedit_create_osd: '0', tessjs_create_osd: '0',
textonly_pdf: '0', tessjs_textonly_pdf: '0',
pdf_name: 'tesseract.js-ocr-result', tessjs_pdf_name: 'tesseract.js-ocr-result',
pdf_title: 'Tesseract.js OCR Result', tessjs_pdf_title: 'Tesseract.js OCR Result',
pdf_auto_download: true, tessjs_pdf_auto_download: true,
pdf_bin: false, tessjs_pdf_bin: false,
tessjs_image_rectangle_left: 0,
tessjs_image_rectangle_top: 0,
tessjs_image_rectangle_width: -1,
tessjs_image_rectangle_height: -1,
}, },
}; };

55
src/common/workerUtils.js

@ -32,7 +32,13 @@ let adapter = {};
* @param {array} image - binary array in array format * @param {array} image - binary array in array format
* @returns {number} - an emscripten pointer of the image * @returns {number} - an emscripten pointer of the image
*/ */
const setImage = (image) => { const setImage = (image, params) => {
const {
tessjs_image_rectangle_left: left,
tessjs_image_rectangle_top: top,
tessjs_image_rectangle_width: width,
tessjs_image_rectangle_height: height,
} = params;
const { const {
w, h, bytesPerPixel, data, pix, w, h, bytesPerPixel, data, pix,
} = readImage(TessModule, Array.from(image)); } = readImage(TessModule, Array.from(image));
@ -48,7 +54,12 @@ const setImage = (image) => {
} else { } else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel); api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel);
} }
api.SetRectangle(0, 0, w, h); api.SetRectangle(
(left < 0) ? 0 : left,
(top < 0) ? 0 : top,
(width < 0) ? w : width,
(height < 0) ? h : height,
);
return data === null ? pix : data; return data === null ? pix : data;
}; };
@ -74,7 +85,9 @@ const handleParams = (langs, iParams) => {
} = iParams; } = iParams;
api.Init(null, getLangsStr(langs), tessedit_ocr_engine_mode); api.Init(null, getLangsStr(langs), tessedit_ocr_engine_mode);
Object.keys(params).forEach((key) => { Object.keys(params).forEach((key) => {
api.SetVariable(key, params[key]); if (!key.startsWith('tessjs')) {
api.SetVariable(key, params[key]);
}
}); });
}; };
@ -89,32 +102,32 @@ const handleParams = (langs, iParams) => {
const handleOutput = (customParams) => { const handleOutput = (customParams) => {
let files = {}; let files = {};
const { const {
tessedit_create_pdf, tessjs_create_pdf,
textonly_pdf, tessjs_textonly_pdf,
pdf_name, tessjs_pdf_name,
pdf_title, tessjs_pdf_title,
pdf_auto_download, tessjs_pdf_auto_download,
pdf_bin, tessjs_pdf_bin,
} = { } = {
...defaultParams, ...defaultParams,
...customParams, ...customParams,
}; };
if (tessedit_create_pdf === '1') { if (tessjs_create_pdf === '1') {
const pdfRenderer = new TessModule.TessPDFRenderer(pdf_name, '/', textonly_pdf === '1'); const pdfRenderer = new TessModule.TessPDFRenderer(tessjs_pdf_name, '/', tessjs_textonly_pdf === '1');
pdfRenderer.BeginDocument(pdf_title); pdfRenderer.BeginDocument(tessjs_pdf_title);
pdfRenderer.AddImage(api); pdfRenderer.AddImage(api);
pdfRenderer.EndDocument(); pdfRenderer.EndDocument();
TessModule._free(pdfRenderer); TessModule._free(pdfRenderer);
const data = TessModule.FS.readFile(`/${pdf_name}.pdf`); const data = TessModule.FS.readFile(`/${tessjs_pdf_name}.pdf`);
if (pdf_bin) { if (tessjs_pdf_bin) {
files = { pdf: data, ...files }; files = { pdf: data, ...files };
} }
if (pdf_auto_download) { if (tessjs_pdf_auto_download) {
adapter.writeFile(`${pdf_name}.pdf`, data, 'application/pdf'); adapter.writeFile(`${tessjs_pdf_name}.pdf`, data, 'application/pdf');
} }
} }
@ -216,7 +229,7 @@ const handleRecognize = ({
progressUpdate(0); progressUpdate(0);
handleParams(langs, params); handleParams(langs, params);
progressUpdate(0.5); progressUpdate(0.5);
const ptr = setImage(image); const ptr = setImage(image, params);
progressUpdate(1); progressUpdate(1);
api.Recognize(null); api.Recognize(null);
const files = handleOutput(params); const files = handleOutput(params);
@ -244,7 +257,7 @@ const handleRecognize = ({
* @param {object} res - job instance * @param {object} res - job instance
*/ */
const handleDetect = ({ const handleDetect = ({
image, langs, options, image, langs, options, params: customParams,
}, res) => ( }, res) => (
handleInit(options, res) handleInit(options, res)
.then(() => ( .then(() => (
@ -252,8 +265,12 @@ const handleDetect = ({
.then(() => { .then(() => {
api.Init(null, getLangsStr(langs)); api.Init(null, getLangsStr(langs));
api.SetPageSegMode(TessModule.PSM_OSD_ONLY); api.SetPageSegMode(TessModule.PSM_OSD_ONLY);
const params = {
...defaultParams,
...customParams,
};
const ptr = setImage(image); const ptr = setImage(image, params);
const results = new TessModule.OSResults(); const results = new TessModule.OSResults();
if (!api.DetectOS(results)) { if (!api.DetectOS(results)) {

29
tests/recognize.test.js

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save