From e32272ea2e383a4e4a81113b946da9410f1101df Mon Sep 17 00:00:00 2001 From: Jerome Wu Date: Fri, 17 May 2019 13:19:07 +0800 Subject: [PATCH] Add pdf output feature and rename oem params --- README.md | 2 +- docs/examples.md | 54 ++++++++++++++++++++++++++++-- docs/local-installation.md | 4 +-- examples/browser/basic.html | 2 +- examples/browser/demo.html | 4 ++- src/browser/index.js | 28 ++++++++++++++-- src/browser/worker.js | 9 +++++ src/common/options.js | 14 ++++++++ src/common/pdf-ttf.js | 1 + src/common/types.js | 21 +++++++++++- src/common/workerUtils.js | 67 ++++++++++++++++++++++++++++++++----- src/index.js | 4 +-- src/node/worker.js | 7 ++++ 13 files changed, 196 insertions(+), 21 deletions(-) create mode 100644 src/common/pdf-ttf.js diff --git a/README.md b/README.md index 4da9c29..e4af5f1 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Tesseract.js works with a ` + ``` After including your scripts, the `Tesseract` variable will be defined globally! diff --git a/docs/examples.md b/docs/examples.md index 99409d9..f2b332b 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -56,7 +56,7 @@ worker }); ``` -### with whitelist chars (^2.0.0-alpha.4) +### with whitelist char (^2.0.0-alpha.5) Sadly, whitelist chars is not supported in tesseract.js v4, so in tesseract.js we need to switch to tesseract v3 mode to make it work. 
@@ -71,7 +71,7 @@ worker 'http://jeroen.github.io/images/testocr.png', 'eng', { - 'init_oem': OEM.TESSERACT_ONLY, + 'tessedit_ocr_engine_mode': OEM.TESSERACT_ONLY, 'tessedit_char_whitelist': '0123456789-.', } ) @@ -82,3 +82,53 @@ worker console.log(result); }); ``` + +### with different pageseg mode (^2.0.0-alpha.5) + +Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163 + +```javascript +import Tesseract from 'tesseract.js'; + +const { TesseractWorker, PSM } = Tesseract; +const worker = new TesseractWorker(); + +worker + .recognize( + 'http://jeroen.github.io/images/testocr.png', + 'eng', + { + 'tessedit_pageseg_mode': PSM.SINGLE_BLOCK, + } + ) + .progress((p) => { + console.log('progress', p); + }) + .then((result) => { + console.log(result); + }); +``` + +### with pdf output (^2.0.0-alpha.5) + +```javascript +import Tesseract from 'tesseract.js'; + +const { TesseractWorker } = Tesseract; +const worker = new TesseractWorker(); + +worker + .recognize( + 'http://jeroen.github.io/images/testocr.png', + 'eng', + { + 'tessedit_create_pdf': '1', + } + ) + .progress((p) => { + console.log('progress', p); + }) + .then((result) => { + console.log(result); + }); +``` diff --git a/docs/local-installation.md b/docs/local-installation.md index eb4667b..9f2c874 100644 --- a/docs/local-installation.md +++ b/docs/local-installation.md @@ -10,9 +10,9 @@ In Node.js environment, the only path you may want to customize is languages/lan ```javascript const worker = Tesseract.TesseractWorker({ - workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.4/dist/worker.min.js', + workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.5/dist/worker.min.js', langPath: 'https://tessdata.projectnaptha.com/4.0.0', - corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.wasm.js', + corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.9/tesseract-core.wasm.js', }); ``` diff --git 
a/examples/browser/basic.html b/examples/browser/basic.html index e44a6d6..84870d5 100644 --- a/examples/browser/basic.html +++ b/examples/browser/basic.html @@ -1,2 +1,2 @@ - + diff --git a/examples/browser/demo.html b/examples/browser/demo.html index a68eab9..677ccbb 100644 --- a/examples/browser/demo.html +++ b/examples/browser/demo.html @@ -42,7 +42,9 @@ function recognizeFile(file){ const { TesseractWorker } = Tesseract; - const worker = new TesseractWorker(); + const worker = new TesseractWorker({ + corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js', + }); worker.recognize(file, document.querySelector('#langsel').value diff --git a/src/browser/index.js b/src/browser/index.js index 7e344e4..9f80d5e 100644 --- a/src/browser/index.js +++ b/src/browser/index.js @@ -71,6 +71,25 @@ const loadImage = (image) => { return Promise.reject(); }; +const downloadFile = (path, blob) => { + if (navigator.msSaveBlob) { + // IE 10+ + navigator.msSaveBlob(blob, path); + } else { + const link = document.createElement('a'); + // Browsers that support HTML5 download attribute + if (link.download !== undefined) { + const url = URL.createObjectURL(blob); + link.setAttribute('href', url); + link.setAttribute('download', path); + link.style.visibility = 'hidden'; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + } + } +} + /* * Default options for browser worker */ @@ -83,7 +102,7 @@ exports.defaultOptions = { * If browser doesn't support WebAssembly, * load ASM version instead */ - corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`, + corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.9/tesseract-core.${typeof WebAssembly === 'object' ? 
'wasm' : 'asm'}.js`, }; /** @@ -108,7 +127,12 @@ exports.spawnWorker = (instance, { workerPath }) => { } worker.onmessage = ({ data }) => { - instance.recv(data); + if (data.jobId.startsWith('Job')) { + instance.recv(data); + } else if (data.jobId.startsWith('Download')) { + const { path, blob } = data; + downloadFile(path, blob); + } }; return worker; diff --git a/src/browser/worker.js b/src/browser/worker.js index d4f5b42..e558ef8 100644 --- a/src/browser/worker.js +++ b/src/browser/worker.js @@ -42,4 +42,13 @@ workerUtils.setAdapter({ } return global.TesseractCore; }, + b64toU8Array: s => new Uint8Array(atob(s).split('').map(c => c.charCodeAt(0))), + writeFile: (path, data, type) => { + const blob = new Blob([data], { type }); + self.postMessage({ + jobId: 'Download', + path, + blob, + }); + }, }); diff --git a/src/common/options.js b/src/common/options.js index 8b49e1a..abbd45b 100644 --- a/src/common/options.js +++ b/src/common/options.js @@ -1,3 +1,5 @@ +const { OEM, PSM } = require('./types'); + module.exports = { defaultOptions: { /* @@ -7,4 +9,16 @@ module.exports = { */ langPath: 'https://tessdata.projectnaptha.com/4.0.0', }, + /* + * default params for recognize() + */ + defaultParams: { + tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED, + tessedit_pageseg_mode: PSM.SINGLE_BLOCK, + tessedit_char_whitelist: '', + tessedit_create_pdf: '0', + textonly_pdf: '0', + pdf_name: 'tesseract.js-ocr-result', + pdf_title: 'Tesseract.js OCR Result', + }, }; diff --git a/src/common/pdf-ttf.js b/src/common/pdf-ttf.js new file mode 100644 index 0000000..8e24c21 --- /dev/null +++ b/src/common/pdf-ttf.js @@ -0,0 +1 @@ +module.exports = 
'AAEAAAAKAIAAAwAgT1MvMlbeyJQAAAEoAAAAYGNtYXAACgA0AAABkAAAAB5nbHlmFSJBJAAAAbgAAAAYaGVhZAt48WUAAACsAAAANmhoZWEMAgQCAAAA5AAAACRobXR4BAAAAAAAAYgAAAAIbG9jYQAMAAAAAAGwAAAABm1heHAABAAFAAABCAAAACBuYW1l8usW2gAAAdAAAABLcG9zdAABAAEAAAIcAAAAIAABAAAAAQAAsJRxEF8PPPUEBwgAAAAAAM+a/G4AAAAA1MOn8gAAAAAEAAgAAAAAEAACAAAAAAAAAAEAAAgA//8AAAQAAAAAAAQAAAEAAAAAAAAAAAAAAAAAAAACAAEAAAACAAQAAQAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAwAAAZAABQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAUAAQABAAAAAAAAAAAAAAAAAAAAAAAAAAAAR09PRwBAAAAAAAAB//8AAAABAAGAAAAAAAAAAAAAAAAAAAABAAAAAAAABAAAAAAAAAIAAQAAAAAAFAADAAAAAAAUAAYACgAAAAAAAAAAAAAAAAAMAAAAAQAAAAAEAAgAAAMAADEhESEEAPwACAAAAAADACoAAAADAAAABQAWAAAAAQAAAAAABQALABYAAwABBAkABQAWAAAAVgBlAHIAcwBpAG8AbgAgADEALgAwVmVyc2lvbiAxLjAAAAEAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAA='; diff --git a/src/common/types.js b/src/common/types.js index c8f868f..68ae1f8 100644 --- a/src/common/types.js +++ b/src/common/types.js @@ -2,7 +2,7 @@ module.exports = { /* * OEM = OCR Engine Mode, and there are 5 possible modes. * - * By default tesseract.js uses DEFAULT mode, which uses LSTM when possible. + * By default tesseract.js uses TESSERACT_LSTM_COMBINED mode, which uses LSTM when possible. * If you need to use some tesseract v3 features (like tessedit_char_whitelist), * you need to use TESSERACT_ONLY mode. 
* @@ -14,4 +14,23 @@ module.exports = { DEFAULT: 3, COUNT: 4, }, + /* + * PSM = Page Segmentation Mode + */ + PSM: { + OSD_ONLY: '0', + AUTO_OSD: '1', + AUTO_ONLY: '2', + AUTO: '3', + SINGLE_COLUMN: '4', + SINGLE_BLOCK_VERT_TEXT: '5', + SINGLE_BLOCK: '6', + SINGLE_LINE: '7', + SINGLE_WORD: '8', + SINGLE_CHAR: '9', + SPARSE_TEXT: '10', + SPARSE_TEXT_OSD: '11', + RAW_LINE: '12', + COUNT: '13', + }, }; diff --git a/src/common/workerUtils.js b/src/common/workerUtils.js index 0f3905e..a49c9b6 100644 --- a/src/common/workerUtils.js +++ b/src/common/workerUtils.js @@ -9,7 +9,9 @@ */ const { readImage, loadLang } = require('tesseract.js-utils'); const check = require('check-types'); +const pdfTTF = require('./pdf-ttf'); const dump = require('./dump'); +const { defaultParams } = require('./options'); /* * Tesseract Module returned by TesseractCore. @@ -51,6 +53,58 @@ const setImage = (image) => { return data === null ? pix : data; }; +/** + * handleParams + * + * @name handleParams + * @function handle params from users + * @access private + * @param {string} lang - lang string for Init() + * @param {object} customParams - an object of params + */ +const handleParams = (lang, customParams) => { + const { + tessedit_ocr_engine_mode, + ...params + } = { + ...defaultParams, + ...customParams, + }; + api.Init(null, lang, tessedit_ocr_engine_mode); + Object.keys(params).forEach((key) => { + api.SetVariable(key, params[key]); + }); +}; + +/** + * handleOutput + * + * @name handleOutput + * @function handle file output + * @access private + * @param {object} customParams - an object of params + */ +const handleOutput = (customParams) => { + const { + tessedit_create_pdf, + textonly_pdf, + pdf_name, + pdf_title, + } = { + ...defaultParams, + ...customParams, + }; + + if (tessedit_create_pdf === '1') { + const pdfRenderer = new TessModule.TessPDFRenderer(pdf_name, '/', textonly_pdf === '1'); + pdfRenderer.BeginDocument(pdf_title); + pdfRenderer.AddImage(api); + 
pdfRenderer.EndDocument(); + adapter.writeFile(`${pdf_name}.pdf`, TessModule.FS.readFile(`/${pdf_name}.pdf`), 'application/pdf'); + TessModule._free(pdfRenderer); + } +} + /** * handleInit * @@ -75,6 +129,7 @@ const handleInit = ({ corePath }, res) => { }) .then((tessModule) => { TessModule = tessModule; + TessModule.FS.writeFile('/pdf.ttf', adapter.b64toU8Array(pdfTTF)); api = new TessModule.TessBaseAPI(); res.progress({ status: 'initialized tesseract', progress: 1 }); }); @@ -123,22 +178,16 @@ const handleRecognize = ({ .then(() => ( loadLanguage({ lang, options }, res) .then(() => { - const OEM = check.undefined(params['init_oem']) - ? TessModule.OEM_DEFAULT - : params['init_oem']; const progressUpdate = (progress) => { res.progress({ status: 'initializing api', progress }); }; progressUpdate(0); - api.Init(null, lang, OEM); - progressUpdate(0.3); - Object.keys(params).filter(key => !key.startsWith('init_')).forEach((key) => { - api.SetVariable(key, params[key]); - }); - progressUpdate(0.6); + handleParams(lang, params); + progressUpdate(0.5); const ptr = setImage(image); progressUpdate(1); api.Recognize(null); + handleOutput(params); const result = dump(TessModule, api); api.End(); TessModule._free(ptr); diff --git a/src/index.js b/src/index.js index ab9a505..5dc211b 100644 --- a/src/index.js +++ b/src/index.js @@ -9,7 +9,7 @@ */ const utils = require('tesseract.js-utils'); const TesseractWorker = require('./common/TesseractWorker'); -const { OEM } = require('./common/types'); +const types = require('./common/types'); module.exports = { /** Worker for OCR, @see common/TesseractWorker.js */ @@ -17,5 +17,5 @@ module.exports = { /** Utilities for tesseract.js, @see {@link https://www.npmjs.com/package/tesseract.js-utils} */ utils, /** Check ./common/types for more details */ - OEM, + ...types, }; diff --git a/src/node/worker.js b/src/node/worker.js index b526fe0..ae1e5cf 100644 --- a/src/node/worker.js +++ b/src/node/worker.js @@ -33,4 +33,11 @@ 
workerUtils.setAdapter({ } return TesseractCore; }, + b64toU8Array: s => Buffer.from(s, 'base64'), + writeFile: (path, data) => { + const fs = require('fs'); + fs.writeFile(path, data, (err) => { /* err is set when the write failed; never report success in that case */ + console.log(err ? `File Write Failed: ${err.message}` : 'File Write Succeeded!'); + }); + }, });