diff --git a/README.md b/README.md
index 4da9c29..e4af5f1 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ Tesseract.js works with a `
+
```
After including your scripts, the `Tesseract` variable will be defined globally!
diff --git a/docs/examples.md b/docs/examples.md
index 99409d9..f2b332b 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -56,7 +56,7 @@ worker
});
```
-### with whitelist chars (^2.0.0-alpha.4)
+### with whitelist chars (^2.0.0-alpha.5)
Sadly, whitelist chars is not supported in tesseract.js v4, so in tesseract.js we need to switch to tesseract v3 mode to make it work.
@@ -71,7 +71,7 @@ worker
'http://jeroen.github.io/images/testocr.png',
'eng',
{
- 'init_oem': OEM.TESSERACT_ONLY,
+ 'tessedit_ocr_engine_mode': OEM.TESSERACT_ONLY,
'tessedit_char_whitelist': '0123456789-.',
}
)
@@ -82,3 +82,53 @@ worker
console.log(result);
});
```
+
+### with different pageseg mode (^2.0.0-alpha.5)
+
+Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163
+
+```javascript
+import Tesseract from 'tesseract.js';
+
+const { TesseractWorker, PSM } = Tesseract;
+const worker = new TesseractWorker();
+
+worker
+ .recognize(
+ 'http://jeroen.github.io/images/testocr.png',
+ 'eng',
+ {
+ 'tessedit_pageseg_mode': PSM.SINGLE_BLOCK,
+ }
+ )
+ .progress((p) => {
+ console.log('progress', p);
+ })
+ .then((result) => {
+ console.log(result);
+ });
+```
+
+### with pdf output (^2.0.0-alpha.5)
+
+```javascript
+import Tesseract from 'tesseract.js';
+
+const { TesseractWorker } = Tesseract;
+const worker = new TesseractWorker();
+
+worker
+ .recognize(
+ 'http://jeroen.github.io/images/testocr.png',
+ 'eng',
+ {
+ 'tessedit_create_pdf': '1',
+ }
+ )
+ .progress((p) => {
+ console.log('progress', p);
+ })
+ .then((result) => {
+ console.log(result);
+ });
+```
diff --git a/docs/local-installation.md b/docs/local-installation.md
index eb4667b..9f2c874 100644
--- a/docs/local-installation.md
+++ b/docs/local-installation.md
@@ -10,9 +10,9 @@ In Node.js environment, the only path you may want to customize is languages/lan
```javascript
const worker = Tesseract.TesseractWorker({
- workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.4/dist/worker.min.js',
+ workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.5/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
- corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.wasm.js',
+ corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.9/tesseract-core.wasm.js',
});
```
diff --git a/examples/browser/basic.html b/examples/browser/basic.html
index e44a6d6..84870d5 100644
--- a/examples/browser/basic.html
+++ b/examples/browser/basic.html
@@ -1,2 +1,2 @@
-
+
diff --git a/examples/browser/demo.html b/examples/browser/demo.html
index a68eab9..677ccbb 100644
--- a/examples/browser/demo.html
+++ b/examples/browser/demo.html
@@ -42,7 +42,9 @@ function recognizeFile(file){
const { TesseractWorker } = Tesseract;
- const worker = new TesseractWorker();
+ const worker = new TesseractWorker({
+ corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js',
+ });
worker.recognize(file,
document.querySelector('#langsel').value
diff --git a/src/browser/index.js b/src/browser/index.js
index 7e344e4..9f80d5e 100644
--- a/src/browser/index.js
+++ b/src/browser/index.js
@@ -71,6 +71,25 @@ const loadImage = (image) => {
return Promise.reject();
};
+const downloadFile = (path, blob) => {
+ if (navigator.msSaveBlob) {
+ // IE 10+
+ navigator.msSaveBlob(blob, path);
+ } else {
+ const link = document.createElement('a');
+ // Browsers that support HTML5 download attribute
+ if (link.download !== undefined) {
+ const url = URL.createObjectURL(blob);
+ link.setAttribute('href', url);
+ link.setAttribute('download', path);
+ link.style.visibility = 'hidden';
+ document.body.appendChild(link);
+ link.click();
+ document.body.removeChild(link);
+ }
+ }
+}
+
/*
* Default options for browser worker
*/
@@ -83,7 +102,7 @@ exports.defaultOptions = {
* If browser doesn't support WebAssembly,
* load ASM version instead
*/
- corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
+ corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.9/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
};
/**
@@ -108,7 +127,12 @@ exports.spawnWorker = (instance, { workerPath }) => {
}
worker.onmessage = ({ data }) => {
- instance.recv(data);
+ if (data.jobId.startsWith('Job')) {
+ instance.recv(data);
+ } else if (data.jobId.startsWith('Download')) {
+ const { path, blob } = data;
+ downloadFile(path, blob);
+ }
};
return worker;
diff --git a/src/browser/worker.js b/src/browser/worker.js
index d4f5b42..e558ef8 100644
--- a/src/browser/worker.js
+++ b/src/browser/worker.js
@@ -42,4 +42,13 @@ workerUtils.setAdapter({
}
return global.TesseractCore;
},
+ b64toU8Array: s => new Uint8Array(atob(s).split('').map(c => c.charCodeAt(0))),
+ writeFile: (path, data, type) => {
+ const blob = new Blob([data], { type });
+ self.postMessage({
+ jobId: 'Download',
+ path,
+ blob,
+ });
+ },
});
diff --git a/src/common/options.js b/src/common/options.js
index 8b49e1a..abbd45b 100644
--- a/src/common/options.js
+++ b/src/common/options.js
@@ -1,3 +1,5 @@
+const { OEM, PSM } = require('./types');
+
module.exports = {
defaultOptions: {
/*
@@ -7,4 +9,16 @@ module.exports = {
*/
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
},
+ /*
+ * default params for recognize()
+ */
+ defaultParams: {
+ tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED,
+ tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
+ tessedit_char_whiltelist: '',
+ tessedit_create_pdf: '0',
+ textonly_pdf: '0',
+ pdf_name: 'tesseract.js-ocr-result',
+ pdf_title: 'Tesseract.js OCR Result',
+ },
};
diff --git a/src/common/pdf-ttf.js b/src/common/pdf-ttf.js
new file mode 100644
index 0000000..8e24c21
--- /dev/null
+++ b/src/common/pdf-ttf.js
@@ -0,0 +1 @@
+module.exports = 'AAEAAAAKAIAAAwAgT1MvMlbeyJQAAAEoAAAAYGNtYXAACgA0AAABkAAAAB5nbHlmFSJBJAAAAbgAAAAYaGVhZAt48WUAAACsAAAANmhoZWEMAgQCAAAA5AAAACRobXR4BAAAAAAAAYgAAAAIbG9jYQAMAAAAAAGwAAAABm1heHAABAAFAAABCAAAACBuYW1l8usW2gAAAdAAAABLcG9zdAABAAEAAAIcAAAAIAABAAAAAQAAsJRxEF8PPPUEBwgAAAAAAM+a/G4AAAAA1MOn8gAAAAAEAAgAAAAAEAACAAAAAAAAAAEAAAgA//8AAAQAAAAAAAQAAAEAAAAAAAAAAAAAAAAAAAACAAEAAAACAAQAAQAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAwAAAZAABQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAUAAQABAAAAAAAAAAAAAAAAAAAAAAAAAAAAR09PRwBAAAAAAAAB//8AAAABAAGAAAAAAAAAAAAAAAAAAAABAAAAAAAABAAAAAAAAAIAAQAAAAAAFAADAAAAAAAUAAYACgAAAAAAAAAAAAAAAAAMAAAAAQAAAAAEAAgAAAMAADEhESEEAPwACAAAAAADACoAAAADAAAABQAWAAAAAQAAAAAABQALABYAAwABBAkABQAWAAAAVgBlAHIAcwBpAG8AbgAgADEALgAwVmVyc2lvbiAxLjAAAAEAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAA=';
diff --git a/src/common/types.js b/src/common/types.js
index c8f868f..68ae1f8 100644
--- a/src/common/types.js
+++ b/src/common/types.js
@@ -2,7 +2,7 @@ module.exports = {
/*
* OEM = OCR Engine Mode, and there are 5 possible modes.
*
- * By default tesseract.js uses DEFAULT mode, which uses LSTM when possible.
+ * By default tesseract.js uses TESSERACT_LSTM_COMBINED mode, which uses LSTM when possible.
* If you need to use some tesseract v3 features (like tessedit_char_whitelist),
* you need to use TESSERACT_ONLY mode.
*
@@ -14,4 +14,23 @@ module.exports = {
DEFAULT: 3,
COUNT: 4,
},
+ /*
+ * PSM = Page Segmentation Mode, string values for the tessedit_pageseg_mode param
+ */
+ PSM: {
+ OSD_ONLY: '0',
+ AUTO_OSD: '1',
+ AUTO_ONLY: '2',
+ AUTO: '3',
+ SINGLE_COLUMN: '4',
+ SINGLE_BLOCK_VERT_TEXT: '5',
+ SINGLE_BLOCK: '6',
+ SINGLE_LINE: '7',
+ SINGLE_WORD: '8',
+ SINGLE_CHAR: '9',
+ SPARSE_TEXT: '10',
+ SPARSE_TEXT_OSD: '11',
+ RAW_LINE: '12',
+ COUNT: '13',
+ },
};
diff --git a/src/common/workerUtils.js b/src/common/workerUtils.js
index 0f3905e..a49c9b6 100644
--- a/src/common/workerUtils.js
+++ b/src/common/workerUtils.js
@@ -9,7 +9,9 @@
*/
const { readImage, loadLang } = require('tesseract.js-utils');
const check = require('check-types');
+const pdfTTF = require('./pdf-ttf');
const dump = require('./dump');
+const { defaultParams } = require('./options');
/*
* Tesseract Module returned by TesseractCore.
@@ -51,6 +53,58 @@ const setImage = (image) => {
return data === null ? pix : data;
};
+/**
+ * handleParams
+ *
+ * @name handleParams
+ * @function hanlde params from users
+ * @access private
+ * @param {string} lang - lang string for Init()
+ * @param {object} customParams - an object of params
+ */
+const handleParams = (lang, customParams) => {
+ const {
+ tessedit_ocr_engine_mode,
+ ...params
+ } = {
+ ...defaultParams,
+ ...customParams,
+ };
+ api.Init(null, lang, tessedit_ocr_engine_mode);
+ Object.keys(params).forEach((key) => {
+ api.SetVariable(key, params[key]);
+ });
+};
+
+/**
+ * handleOutput
+ *
+ * @name handleOutput
+ * @function handle file output
+ * @access private
+ * @param {object} customParams - an object of params
+ */
+const handleOutput = (customParams) => {
+ const {
+ tessedit_create_pdf,
+ textonly_pdf,
+ pdf_name,
+ pdf_title,
+ } = {
+ ...defaultParams,
+ ...customParams,
+ };
+
+ if (tessedit_create_pdf === '1') {
+ const pdfRenderer = new TessModule.TessPDFRenderer(pdf_name, '/', textonly_pdf === '1');
+ pdfRenderer.BeginDocument(pdf_title);
+ pdfRenderer.AddImage(api);
+ pdfRenderer.EndDocument();
+ adapter.writeFile(`${pdf_name}.pdf`, TessModule.FS.readFile(`/${pdf_name}.pdf`), 'application/pdf');
+ TessModule._free(pdfRenderer);
+ }
+}
+
/**
* handleInit
*
@@ -75,6 +129,7 @@ const handleInit = ({ corePath }, res) => {
})
.then((tessModule) => {
TessModule = tessModule;
+ TessModule.FS.writeFile('/pdf.ttf', adapter.b64toU8Array(pdfTTF));
api = new TessModule.TessBaseAPI();
res.progress({ status: 'initialized tesseract', progress: 1 });
});
@@ -123,22 +178,16 @@ const handleRecognize = ({
.then(() => (
loadLanguage({ lang, options }, res)
.then(() => {
- const OEM = check.undefined(params['init_oem'])
- ? TessModule.OEM_DEFAULT
- : params['init_oem'];
const progressUpdate = (progress) => {
res.progress({ status: 'initializing api', progress });
};
progressUpdate(0);
- api.Init(null, lang, OEM);
- progressUpdate(0.3);
- Object.keys(params).filter(key => !key.startsWith('init_')).forEach((key) => {
- api.SetVariable(key, params[key]);
- });
- progressUpdate(0.6);
+ handleParams(lang, params);
+ progressUpdate(0.5);
const ptr = setImage(image);
progressUpdate(1);
api.Recognize(null);
+ handleOutput(params);
const result = dump(TessModule, api);
api.End();
TessModule._free(ptr);
diff --git a/src/index.js b/src/index.js
index ab9a505..5dc211b 100644
--- a/src/index.js
+++ b/src/index.js
@@ -9,7 +9,7 @@
*/
const utils = require('tesseract.js-utils');
const TesseractWorker = require('./common/TesseractWorker');
-const { OEM } = require('./common/types');
+const types = require('./common/types');
module.exports = {
/** Worker for OCR, @see common/TesseractWorker.js */
@@ -17,5 +17,5 @@ module.exports = {
/** Utilities for tesseract.js, @see {@link https://www.npmjs.com/package/tesseract.js-utils} */
utils,
/** Check ./common/types for more details */
- OEM,
+ ...types,
};
diff --git a/src/node/worker.js b/src/node/worker.js
index b526fe0..ae1e5cf 100644
--- a/src/node/worker.js
+++ b/src/node/worker.js
@@ -33,4 +33,11 @@ workerUtils.setAdapter({
}
return TesseractCore;
},
+ b64toU8Array: s => Buffer.from(s, 'base64'),
+ writeFile: (path, data) => {
+ const fs = require('fs');
+ fs.writeFile(path, data, () => {
+ console.log('File Write Succeeded!');
+ });
+ },
});