Browse Source

Add pdf output feature and rename oem params

feature/aio
Jerome Wu 6 years ago
parent
commit
e32272ea2e
  1. 2
      README.md
  2. 54
      docs/examples.md
  3. 4
      docs/local-installation.md
  4. 2
      examples/browser/basic.html
  5. 4
      examples/browser/demo.html
  6. 28
      src/browser/index.js
  7. 9
      src/browser/worker.js
  8. 14
      src/common/options.js
  9. 1
      src/common/pdf-ttf.js
  10. 21
      src/common/types.js
  11. 67
      src/common/workerUtils.js
  12. 4
      src/index.js
  13. 7
      src/node/worker.js

2
README.md

@ -39,7 +39,7 @@ Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via
You can simply include Tesseract.js with a CDN like this: You can simply include Tesseract.js with a CDN like this:
```html ```html
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.4/dist/tesseract.min.js'></script> <script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.5/dist/tesseract.min.js'></script>
``` ```
After including your scripts, the `Tesseract` variable will be defined globally! After including your scripts, the `Tesseract` variable will be defined globally!

54
docs/examples.md

@ -56,7 +56,7 @@ worker
}); });
``` ```
### with whitelist chars (^2.0.0-alpha.4) ### with whitelist char (^2.0.0-alpha.5)
Sadly, whitelist chars are not supported in tesseract v4, so in tesseract.js we need to switch to tesseract v3 mode to make it work. Sadly, whitelist chars are not supported in tesseract v4, so in tesseract.js we need to switch to tesseract v3 mode to make it work.
@ -71,7 +71,7 @@ worker
'http://jeroen.github.io/images/testocr.png', 'http://jeroen.github.io/images/testocr.png',
'eng', 'eng',
{ {
'init_oem': OEM.TESSERACT_ONLY, 'tessedit_ocr_engine_mode': OEM.TESSERACT_ONLY,
'tessedit_char_whitelist': '0123456789-.', 'tessedit_char_whitelist': '0123456789-.',
} }
) )
@ -82,3 +82,53 @@ worker
console.log(result); console.log(result);
}); });
``` ```
### with different pageseg mode (^2.0.0-alpha.5)
Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163
```javascript
import Tesseract from 'tesseract.js';
const { TesseractWorker, PSM } = Tesseract;
const worker = new TesseractWorker();
worker
.recognize(
'http://jeroen.github.io/images/testocr.png',
'eng',
{
'tessedit_pageseg_mode': PSM.SINGLE_BLOCK,
}
)
.progress((p) => {
console.log('progress', p);
})
.then((result) => {
console.log(result);
});
```
### with pdf output (^2.0.0-alpha.5)
```javascript
import Tesseract from 'tesseract.js';
const { TesseractWorker } = Tesseract;
const worker = new TesseractWorker();
worker
.recognize(
'http://jeroen.github.io/images/testocr.png',
'eng',
{
'tessedit_create_pdf': '1',
}
)
.progress((p) => {
console.log('progress', p);
})
.then((result) => {
console.log(result);
});
```

4
docs/local-installation.md

@ -10,9 +10,9 @@ In Node.js environment, the only path you may want to customize is languages/lan
```javascript ```javascript
const worker = Tesseract.TesseractWorker({ const worker = Tesseract.TesseractWorker({
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.4/dist/worker.min.js', workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.5/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0', langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.wasm.js', corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.9/tesseract-core.wasm.js',
}); });
``` ```

2
examples/browser/basic.html

@ -1,2 +1,2 @@
<script src="/dist/tesseract.dev.js"></script> <script src="/dist/tesseract.dev.js"></script>
<input type="file" onchange="const worker = new Tesseract.TesseractWorker();worker.recognize(this.files[0]).progress(function(data){console.log(data)}).then(function(data){console.log(data)})"> <input type="file" onchange="const worker = new Tesseract.TesseractWorker({ corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js' });worker.recognize(this.files[0]).progress(function(data){console.log(data)}).then(function(data){console.log(data)})">

4
examples/browser/demo.html

@ -42,7 +42,9 @@ function recognizeFile(file){
const { TesseractWorker } = Tesseract; const { TesseractWorker } = Tesseract;
const worker = new TesseractWorker(); const worker = new TesseractWorker({
corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js',
});
worker.recognize(file, worker.recognize(file,
document.querySelector('#langsel').value document.querySelector('#langsel').value

28
src/browser/index.js

@ -71,6 +71,25 @@ const loadImage = (image) => {
return Promise.reject(); return Promise.reject();
}; };
/**
 * downloadFile
 *
 * @name downloadFile
 * @function offer a blob to the user as a file download
 * @access private
 * @param {string} path - file name suggested for the download
 * @param {Blob} blob - content to be saved
 */
const downloadFile = (path, blob) => {
  if (navigator.msSaveBlob) {
    // IE 10+
    navigator.msSaveBlob(blob, path);
    return;
  }

  const link = document.createElement('a');
  // Only browsers that support the HTML5 download attribute can save this way
  if (link.download === undefined) {
    return;
  }

  const url = URL.createObjectURL(blob);
  link.setAttribute('href', url);
  link.setAttribute('download', path);
  link.style.visibility = 'hidden';
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  /*
   * Release the object URL so the blob is not kept alive for the lifetime
   * of the page. Deferred to a later task so the click above can start
   * the download before the URL becomes invalid.
   */
  setTimeout(() => URL.revokeObjectURL(url), 0);
};
/* /*
* Default options for browser worker * Default options for browser worker
*/ */
@ -83,7 +102,7 @@ exports.defaultOptions = {
* If browser doesn't support WebAssembly, * If browser doesn't support WebAssembly,
* load ASM version instead * load ASM version instead
*/ */
corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`, corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.9/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
}; };
/** /**
@ -108,7 +127,12 @@ exports.spawnWorker = (instance, { workerPath }) => {
} }
worker.onmessage = ({ data }) => { worker.onmessage = ({ data }) => {
instance.recv(data); if (data.jobId.startsWith('Job')) {
instance.recv(data);
} else if (data.jobId.startsWith('Download')) {
const { path, blob } = data;
downloadFile(path, blob);
}
}; };
return worker; return worker;

9
src/browser/worker.js

@ -42,4 +42,13 @@ workerUtils.setAdapter({
} }
return global.TesseractCore; return global.TesseractCore;
}, },
b64toU8Array: s => new Uint8Array(atob(s).split('').map(c => c.charCodeAt(0))),
writeFile: (path, data, type) => {
const blob = new Blob([data], { type });
self.postMessage({
jobId: 'Download',
path,
blob,
});
},
}); });

14
src/common/options.js

@ -1,3 +1,5 @@
const { OEM, PSM } = require('./types');
module.exports = { module.exports = {
defaultOptions: { defaultOptions: {
/* /*
@ -7,4 +9,16 @@ module.exports = {
*/ */
langPath: 'https://tessdata.projectnaptha.com/4.0.0', langPath: 'https://tessdata.projectnaptha.com/4.0.0',
}, },
/*
* default params for recognize()
*/
defaultParams: {
tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED,
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '',
tessedit_create_pdf: '0',
textonly_pdf: '0',
pdf_name: 'tesseract.js-ocr-result',
pdf_title: 'Tesseract.js OCR Result',
},
}; };

1
src/common/pdf-ttf.js

@ -0,0 +1 @@
module.exports = 'AAEAAAAKAIAAAwAgT1MvMlbeyJQAAAEoAAAAYGNtYXAACgA0AAABkAAAAB5nbHlmFSJBJAAAAbgAAAAYaGVhZAt48WUAAACsAAAANmhoZWEMAgQCAAAA5AAAACRobXR4BAAAAAAAAYgAAAAIbG9jYQAMAAAAAAGwAAAABm1heHAABAAFAAABCAAAACBuYW1l8usW2gAAAdAAAABLcG9zdAABAAEAAAIcAAAAIAABAAAAAQAAsJRxEF8PPPUEBwgAAAAAAM+a/G4AAAAA1MOn8gAAAAAEAAgAAAAAEAACAAAAAAAAAAEAAAgA//8AAAQAAAAAAAQAAAEAAAAAAAAAAAAAAAAAAAACAAEAAAACAAQAAQAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAwAAAZAABQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAUAAQABAAAAAAAAAAAAAAAAAAAAAAAAAAAAR09PRwBAAAAAAAAB//8AAAABAAGAAAAAAAAAAAAAAAAAAAABAAAAAAAABAAAAAAAAAIAAQAAAAAAFAADAAAAAAAUAAYACgAAAAAAAAAAAAAAAAAMAAAAAQAAAAAEAAgAAAMAADEhESEEAPwACAAAAAADACoAAAADAAAABQAWAAAAAQAAAAAABQALABYAAwABBAkABQAWAAAAVgBlAHIAcwBpAG8AbgAgADEALgAwVmVyc2lvbiAxLjAAAAEAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAA=';

21
src/common/types.js

@ -2,7 +2,7 @@ module.exports = {
/* /*
* OEM = OCR Engine Mode, and there are 5 possible modes. * OEM = OCR Engine Mode, and there are 5 possible modes.
* *
* By default tesseract.js uses DEFAULT mode, which uses LSTM when possible. * By default tesseract.js uses TESSERACT_LSTM_COMBINED mode, which uses LSTM when possible.
* If you need to use some tesseract v3 features (like tessedit_char_whitelist), * If you need to use some tesseract v3 features (like tessedit_char_whitelist),
* you need to use TESSERACT_ONLY mode. * you need to use TESSERACT_ONLY mode.
* *
@ -14,4 +14,23 @@ module.exports = {
DEFAULT: 3, DEFAULT: 3,
COUNT: 4, COUNT: 4,
}, },
/*
* PSM = Page Segmentation Mode
*
* Intended to be passed as the 'tessedit_pageseg_mode' param of recognize(),
* e.g. PSM.SINGLE_BLOCK. Unlike OEM, the values here are strings.
* NOTE(review): the names appear to mirror the PageSegMode enum in
* Tesseract's publictypes.h; check that header for the meaning of each mode.
*/
PSM: {
OSD_ONLY: '0',
AUTO_OSD: '1',
AUTO_ONLY: '2',
AUTO: '3',
SINGLE_COLUMN: '4',
SINGLE_BLOCK_VERT_TEXT: '5',
SINGLE_BLOCK: '6',
SINGLE_LINE: '7',
SINGLE_WORD: '8',
SINGLE_CHAR: '9',
SPARSE_TEXT: '10',
SPARSE_TEXT_OSD: '11',
RAW_LINE: '12',
// Number of modes listed above ('0' through '12'), not a mode to select.
COUNT: '13',
},
}; };

67
src/common/workerUtils.js

@ -9,7 +9,9 @@
*/ */
const { readImage, loadLang } = require('tesseract.js-utils'); const { readImage, loadLang } = require('tesseract.js-utils');
const check = require('check-types'); const check = require('check-types');
const pdfTTF = require('./pdf-ttf');
const dump = require('./dump'); const dump = require('./dump');
const { defaultParams } = require('./options');
/* /*
* Tesseract Module returned by TesseractCore. * Tesseract Module returned by TesseractCore.
@ -51,6 +53,58 @@ const setImage = (image) => {
return data === null ? pix : data; return data === null ? pix : data;
}; };
/**
 * handleParams
 *
 * @name handleParams
 * @function handle params from users: merge them over the defaults,
 *   initialize the api with the engine mode and set every other entry
 *   as a variable
 * @access private
 * @param {string} lang - lang string for Init()
 * @param {object} customParams - an object of params
 */
const handleParams = (lang, customParams) => {
  const merged = { ...defaultParams, ...customParams };
  // The engine mode goes to Init(); it is not set as a variable.
  const { tessedit_ocr_engine_mode: engineMode, ...variables } = merged;
  api.Init(null, lang, engineMode);
  Object.entries(variables).forEach(([name, value]) => {
    api.SetVariable(name, value);
  });
};
/**
 * handleOutput
 *
 * @name handleOutput
 * @function handle file output: when tessedit_create_pdf is '1', render
 *   the current api state with a TessPDFRenderer and pass the resulting
 *   file to the adapter's writeFile()
 * @access private
 * @param {object} customParams - an object of params
 */
const handleOutput = (customParams) => {
  const settings = { ...defaultParams, ...customParams };
  // Nothing to emit unless pdf output was explicitly requested.
  if (settings.tessedit_create_pdf !== '1') {
    return;
  }

  const { pdf_name: baseName, pdf_title: title } = settings;
  const textOnly = settings.textonly_pdf === '1';
  const renderer = new TessModule.TessPDFRenderer(baseName, '/', textOnly);
  renderer.BeginDocument(title);
  renderer.AddImage(api);
  renderer.EndDocument();
  // The rendered document is read back from the module's file system.
  const pdfBytes = TessModule.FS.readFile(`/${baseName}.pdf`);
  adapter.writeFile(`${baseName}.pdf`, pdfBytes, 'application/pdf');
  TessModule._free(renderer);
};
/** /**
* handleInit * handleInit
* *
@ -75,6 +129,7 @@ const handleInit = ({ corePath }, res) => {
}) })
.then((tessModule) => { .then((tessModule) => {
TessModule = tessModule; TessModule = tessModule;
TessModule.FS.writeFile('/pdf.ttf', adapter.b64toU8Array(pdfTTF));
api = new TessModule.TessBaseAPI(); api = new TessModule.TessBaseAPI();
res.progress({ status: 'initialized tesseract', progress: 1 }); res.progress({ status: 'initialized tesseract', progress: 1 });
}); });
@ -123,22 +178,16 @@ const handleRecognize = ({
.then(() => ( .then(() => (
loadLanguage({ lang, options }, res) loadLanguage({ lang, options }, res)
.then(() => { .then(() => {
const OEM = check.undefined(params['init_oem'])
? TessModule.OEM_DEFAULT
: params['init_oem'];
const progressUpdate = (progress) => { const progressUpdate = (progress) => {
res.progress({ status: 'initializing api', progress }); res.progress({ status: 'initializing api', progress });
}; };
progressUpdate(0); progressUpdate(0);
api.Init(null, lang, OEM); handleParams(lang, params);
progressUpdate(0.3); progressUpdate(0.5);
Object.keys(params).filter(key => !key.startsWith('init_')).forEach((key) => {
api.SetVariable(key, params[key]);
});
progressUpdate(0.6);
const ptr = setImage(image); const ptr = setImage(image);
progressUpdate(1); progressUpdate(1);
api.Recognize(null); api.Recognize(null);
handleOutput(params);
const result = dump(TessModule, api); const result = dump(TessModule, api);
api.End(); api.End();
TessModule._free(ptr); TessModule._free(ptr);

4
src/index.js

@ -9,7 +9,7 @@
*/ */
const utils = require('tesseract.js-utils'); const utils = require('tesseract.js-utils');
const TesseractWorker = require('./common/TesseractWorker'); const TesseractWorker = require('./common/TesseractWorker');
const { OEM } = require('./common/types'); const types = require('./common/types');
module.exports = { module.exports = {
/** Worker for OCR, @see common/TesseractWorker.js */ /** Worker for OCR, @see common/TesseractWorker.js */
@ -17,5 +17,5 @@ module.exports = {
/** Utilities for tesseract.js, @see {@link https://www.npmjs.com/package/tesseract.js-utils} */ /** Utilities for tesseract.js, @see {@link https://www.npmjs.com/package/tesseract.js-utils} */
utils, utils,
/** Check ./common/types for more details */ /** Check ./common/types for more details */
OEM, ...types,
}; };

7
src/node/worker.js

@ -33,4 +33,11 @@ workerUtils.setAdapter({
} }
return TesseractCore; return TesseractCore;
}, },
// Decode a base64 string into a Buffer.
b64toU8Array: s => Buffer.from(s, 'base64'),
writeFile: (path, data) => {
const fs = require('fs');
fs.writeFile(path, data, () => {
console.log('File Write Succeeded!');
});
},
}); });

Loading…
Cancel
Save