Browse Source

Add pdf output feature and rename oem params

feature/aio
Jerome Wu 6 years ago
parent
commit
e32272ea2e
  1. 2
      README.md
  2. 54
      docs/examples.md
  3. 4
      docs/local-installation.md
  4. 2
      examples/browser/basic.html
  5. 4
      examples/browser/demo.html
  6. 28
      src/browser/index.js
  7. 9
      src/browser/worker.js
  8. 14
      src/common/options.js
  9. 1
      src/common/pdf-ttf.js
  10. 21
      src/common/types.js
  11. 67
      src/common/workerUtils.js
  12. 4
      src/index.js
  13. 7
      src/node/worker.js

2
README.md

@ -39,7 +39,7 @@ Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via @@ -39,7 +39,7 @@ Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via
You can simply include Tesseract.js with a CDN like this:
```html
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.4/dist/tesseract.min.js'></script>
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.5/dist/tesseract.min.js'></script>
```
After including your scripts, the `Tesseract` variable will be defined globally!

54
docs/examples.md

@ -56,7 +56,7 @@ worker @@ -56,7 +56,7 @@ worker
});
```
### with whitelist chars (^2.0.0-alpha.4)
### with whitelist char (^2.0.0-alpha.5)
Sadly, character whitelisting is not supported by Tesseract v4 (the LSTM engine), so in tesseract.js we need to switch to Tesseract v3 mode (`OEM.TESSERACT_ONLY`) to make it work.
@ -71,7 +71,7 @@ worker @@ -71,7 +71,7 @@ worker
'http://jeroen.github.io/images/testocr.png',
'eng',
{
'init_oem': OEM.TESSERACT_ONLY,
'tessedit_ocr_engine_mode': OEM.TESSERACT_ONLY,
'tessedit_char_whitelist': '0123456789-.',
}
)
@ -82,3 +82,53 @@ worker @@ -82,3 +82,53 @@ worker
console.log(result);
});
```
### with different pageseg mode (^2.0.0-alpha.5)
Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163
```javascript
import Tesseract from 'tesseract.js';
const { TesseractWorker, PSM } = Tesseract;
const worker = new TesseractWorker();
worker
.recognize(
'http://jeroen.github.io/images/testocr.png',
'eng',
{
'tessedit_pageseg_mode': PSM.SINGLE_BLOCK,
}
)
.progress((p) => {
console.log('progress', p);
})
.then((result) => {
console.log(result);
});
```
### with pdf output (^2.0.0-alpha.5)
```javascript
import Tesseract from 'tesseract.js';
const { TesseractWorker } = Tesseract;
const worker = new TesseractWorker();
worker
.recognize(
'http://jeroen.github.io/images/testocr.png',
'eng',
{
'tessedit_create_pdf': '1',
}
)
.progress((p) => {
console.log('progress', p);
})
.then((result) => {
console.log(result);
});
```

4
docs/local-installation.md

@ -10,9 +10,9 @@ In Node.js environment, the only path you may want to customize is languages/lan @@ -10,9 +10,9 @@ In Node.js environment, the only path you may want to customize is languages/lan
```javascript
const worker = new Tesseract.TesseractWorker({
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.4/dist/worker.min.js',
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.5/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.wasm.js',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.9/tesseract-core.wasm.js',
});
```

2
examples/browser/basic.html

@ -1,2 +1,2 @@ @@ -1,2 +1,2 @@
<script src="/dist/tesseract.dev.js"></script>
<input type="file" onchange="const worker = new Tesseract.TesseractWorker();worker.recognize(this.files[0]).progress(function(data){console.log(data)}).then(function(data){console.log(data)})">
<input type="file" onchange="const worker = new Tesseract.TesseractWorker({ corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js' });worker.recognize(this.files[0]).progress(function(data){console.log(data)}).then(function(data){console.log(data)})">

4
examples/browser/demo.html

@ -42,7 +42,9 @@ function recognizeFile(file){ @@ -42,7 +42,9 @@ function recognizeFile(file){
const { TesseractWorker } = Tesseract;
const worker = new TesseractWorker();
const worker = new TesseractWorker({
corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js',
});
worker.recognize(file,
document.querySelector('#langsel').value

28
src/browser/index.js

@ -71,6 +71,25 @@ const loadImage = (image) => { @@ -71,6 +71,25 @@ const loadImage = (image) => {
return Promise.reject();
};
/**
 * downloadFile
 *
 * Offer a Blob to the user as a file download.
 *
 * Uses navigator.msSaveBlob when it exists; otherwise clicks a temporary,
 * hidden <a download> element pointing at an object URL for the blob.
 * Does nothing when neither mechanism is available.
 *
 * @name downloadFile
 * @function trigger a browser download for a blob
 * @access private
 * @param {string} path - file name suggested to the user
 * @param {Blob} blob - content to download
 */
const downloadFile = (path, blob) => {
  // Delay before the object URL is released (see the note at the bottom).
  const REVOKE_DELAY_MS = 40000;

  if (navigator.msSaveBlob) {
    // IE 10+
    navigator.msSaveBlob(blob, path);
    return;
  }

  const link = document.createElement('a');
  // Only browsers that support the HTML5 download attribute can go further
  if (link.download === undefined) {
    return;
  }

  const url = URL.createObjectURL(blob);
  link.setAttribute('href', url);
  link.setAttribute('download', path);
  link.style.visibility = 'hidden';
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);

  /*
   * Release the object URL so the blob is not kept alive for the rest of the
   * page's lifetime. The release is deferred instead of done right after
   * click() so the browser has time to start the download first.
   */
  setTimeout(() => {
    URL.revokeObjectURL(url);
  }, REVOKE_DELAY_MS);
};
/*
* Default options for browser worker
*/
@ -83,7 +102,7 @@ exports.defaultOptions = { @@ -83,7 +102,7 @@ exports.defaultOptions = {
* If browser doesn't support WebAssembly,
* load ASM version instead
*/
corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.9/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
};
/**
@ -108,7 +127,12 @@ exports.spawnWorker = (instance, { workerPath }) => { @@ -108,7 +127,12 @@ exports.spawnWorker = (instance, { workerPath }) => {
}
worker.onmessage = ({ data }) => {
instance.recv(data);
if (data.jobId.startsWith('Job')) {
instance.recv(data);
} else if (data.jobId.startsWith('Download')) {
const { path, blob } = data;
downloadFile(path, blob);
}
};
return worker;

9
src/browser/worker.js

@ -42,4 +42,13 @@ workerUtils.setAdapter({ @@ -42,4 +42,13 @@ workerUtils.setAdapter({
}
return global.TesseractCore;
},
b64toU8Array: s => new Uint8Array(atob(s).split('').map(c => c.charCodeAt(0))),
writeFile: (path, data, type) => {
const blob = new Blob([data], { type });
self.postMessage({
jobId: 'Download',
path,
blob,
});
},
});

14
src/common/options.js

@ -1,3 +1,5 @@ @@ -1,3 +1,5 @@
const { OEM, PSM } = require('./types');
module.exports = {
defaultOptions: {
/*
@ -7,4 +9,16 @@ module.exports = { @@ -7,4 +9,16 @@ module.exports = {
*/
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
},
/*
* default params for recognize()
*/
defaultParams: {
tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED,
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '',
tessedit_create_pdf: '0',
textonly_pdf: '0',
pdf_name: 'tesseract.js-ocr-result',
pdf_title: 'Tesseract.js OCR Result',
},
};

1
src/common/pdf-ttf.js

@ -0,0 +1 @@ @@ -0,0 +1 @@
module.exports = 'AAEAAAAKAIAAAwAgT1MvMlbeyJQAAAEoAAAAYGNtYXAACgA0AAABkAAAAB5nbHlmFSJBJAAAAbgAAAAYaGVhZAt48WUAAACsAAAANmhoZWEMAgQCAAAA5AAAACRobXR4BAAAAAAAAYgAAAAIbG9jYQAMAAAAAAGwAAAABm1heHAABAAFAAABCAAAACBuYW1l8usW2gAAAdAAAABLcG9zdAABAAEAAAIcAAAAIAABAAAAAQAAsJRxEF8PPPUEBwgAAAAAAM+a/G4AAAAA1MOn8gAAAAAEAAgAAAAAEAACAAAAAAAAAAEAAAgA//8AAAQAAAAAAAQAAAEAAAAAAAAAAAAAAAAAAAACAAEAAAACAAQAAQAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAwAAAZAABQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAUAAQABAAAAAAAAAAAAAAAAAAAAAAAAAAAAR09PRwBAAAAAAAAB//8AAAABAAGAAAAAAAAAAAAAAAAAAAABAAAAAAAABAAAAAAAAAIAAQAAAAAAFAADAAAAAAAUAAYACgAAAAAAAAAAAAAAAAAMAAAAAQAAAAAEAAgAAAMAADEhESEEAPwACAAAAAADACoAAAADAAAABQAWAAAAAQAAAAAABQALABYAAwABBAkABQAWAAAAVgBlAHIAcwBpAG8AbgAgADEALgAwVmVyc2lvbiAxLjAAAAEAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAA=';

21
src/common/types.js

@ -2,7 +2,7 @@ module.exports = { @@ -2,7 +2,7 @@ module.exports = {
/*
* OEM = OCR Engine Mode, and there are 5 possible modes.
*
* By default tesseract.js uses DEFAULT mode, which uses LSTM when possible.
* By default tesseract.js uses TESSERACT_LSTM_COMBINED mode, which uses LSTM when possible.
* If you need to use some tesseract v3 features (like tessedit_char_whitelist),
* you need to use TESSERACT_ONLY mode.
*
@ -14,4 +14,23 @@ module.exports = { @@ -14,4 +14,23 @@ module.exports = {
DEFAULT: 3,
COUNT: 4,
},
/*
* PSM = Page Segmentation Mode
*
* Values are strings, not numbers. They can be passed as the
* tessedit_pageseg_mode param of recognize().
*
* NOTE(review): names and numbering presumably mirror tesseract's
* PageSegMode enum (ccstruct/publictypes.h) - confirm when upgrading core.
*/
PSM: {
OSD_ONLY: '0',
AUTO_OSD: '1',
AUTO_ONLY: '2',
AUTO: '3',
SINGLE_COLUMN: '4',
SINGLE_BLOCK_VERT_TEXT: '5',
SINGLE_BLOCK: '6',
SINGLE_LINE: '7',
SINGLE_WORD: '8',
SINGLE_CHAR: '9',
SPARSE_TEXT: '10',
SPARSE_TEXT_OSD: '11',
RAW_LINE: '12',
// NOTE(review): looks like a count sentinel rather than a usable mode - confirm
COUNT: '13',
},
};

67
src/common/workerUtils.js

@ -9,7 +9,9 @@ @@ -9,7 +9,9 @@
*/
const { readImage, loadLang } = require('tesseract.js-utils');
const check = require('check-types');
const pdfTTF = require('./pdf-ttf');
const dump = require('./dump');
const { defaultParams } = require('./options');
/*
* Tesseract Module returned by TesseractCore.
@ -51,6 +53,58 @@ const setImage = (image) => { @@ -51,6 +53,58 @@ const setImage = (image) => {
return data === null ? pix : data;
};
/**
 * handleParams
 *
 * Merges the user's params over defaultParams, initializes the api with the
 * resulting OCR engine mode, then forwards every other merged param to
 * api.SetVariable().
 *
 * @name handleParams
 * @function handle params from users
 * @access private
 * @param {string} lang - lang string for Init()
 * @param {object} customParams - an object of params
 */
const handleParams = (lang, customParams) => {
  const ENGINE_MODE_KEY = 'tessedit_ocr_engine_mode';
  const merged = { ...defaultParams, ...customParams };

  // The engine mode is an argument of Init(), not a variable to set afterwards
  api.Init(null, lang, merged[ENGINE_MODE_KEY]);

  for (const name of Object.keys(merged)) {
    if (name !== ENGINE_MODE_KEY) {
      api.SetVariable(name, merged[name]);
    }
  }
};
/**
 * handleOutput
 *
 * Writes the recognition result out as a PDF when the merged params have
 * tessedit_create_pdf set to the string '1'; otherwise does nothing.
 * The PDF is rendered into the module's file system under '/', read back
 * from there and handed to adapter.writeFile().
 *
 * @name handleOutput
 * @function handle file output
 * @access private
 * @param {object} customParams - an object of params
 */
const handleOutput = (customParams) => {
  const settings = { ...defaultParams, ...customParams };

  if (settings.tessedit_create_pdf !== '1') {
    return;
  }

  const baseName = settings.pdf_name;
  const isTextOnly = settings.textonly_pdf === '1';
  const renderer = new TessModule.TessPDFRenderer(baseName, '/', isTextOnly);

  renderer.BeginDocument(settings.pdf_title);
  renderer.AddImage(api);
  renderer.EndDocument();

  const outputName = `${baseName}.pdf`;
  const pdfData = TessModule.FS.readFile(`/${baseName}.pdf`);
  adapter.writeFile(outputName, pdfData, 'application/pdf');

  TessModule._free(renderer);
};
/**
* handleInit
*
@ -75,6 +129,7 @@ const handleInit = ({ corePath }, res) => { @@ -75,6 +129,7 @@ const handleInit = ({ corePath }, res) => {
})
.then((tessModule) => {
TessModule = tessModule;
TessModule.FS.writeFile('/pdf.ttf', adapter.b64toU8Array(pdfTTF));
api = new TessModule.TessBaseAPI();
res.progress({ status: 'initialized tesseract', progress: 1 });
});
@ -123,22 +178,16 @@ const handleRecognize = ({ @@ -123,22 +178,16 @@ const handleRecognize = ({
.then(() => (
loadLanguage({ lang, options }, res)
.then(() => {
const OEM = check.undefined(params['init_oem'])
? TessModule.OEM_DEFAULT
: params['init_oem'];
const progressUpdate = (progress) => {
res.progress({ status: 'initializing api', progress });
};
progressUpdate(0);
api.Init(null, lang, OEM);
progressUpdate(0.3);
Object.keys(params).filter(key => !key.startsWith('init_')).forEach((key) => {
api.SetVariable(key, params[key]);
});
progressUpdate(0.6);
handleParams(lang, params);
progressUpdate(0.5);
const ptr = setImage(image);
progressUpdate(1);
api.Recognize(null);
handleOutput(params);
const result = dump(TessModule, api);
api.End();
TessModule._free(ptr);

4
src/index.js

@ -9,7 +9,7 @@ @@ -9,7 +9,7 @@
*/
const utils = require('tesseract.js-utils');
const TesseractWorker = require('./common/TesseractWorker');
const { OEM } = require('./common/types');
const types = require('./common/types');
module.exports = {
/** Worker for OCR, @see common/TesseractWorker.js */
@ -17,5 +17,5 @@ module.exports = { @@ -17,5 +17,5 @@ module.exports = {
/** Utilities for tesseract.js, @see {@link https://www.npmjs.com/package/tesseract.js-utils} */
utils,
/** Check ./common/types for more details */
OEM,
...types,
};

7
src/node/worker.js

@ -33,4 +33,11 @@ workerUtils.setAdapter({ @@ -33,4 +33,11 @@ workerUtils.setAdapter({
}
return TesseractCore;
},
b64toU8Array: s => Buffer.from(s, 'base64'),
writeFile: (path, data) => {
const fs = require('fs');
fs.writeFile(path, data, () => {
console.log('File Write Succeeded!');
});
},
});

Loading…
Cancel
Save