Browse Source

rewriting with node support

pull/12/head
Kevin Kwok 8 years ago
parent
commit
c5a1726e1d
  1. 26
      devServer.js
  2. 252
      dist/tesseract.js
  3. 12293
      dist/worker.js
  4. 0
      docs/tesseract_lang_list.md
  5. 0
      docs/tesseract_parameters.md
  6. 5
      examples/file-input/demo.html
  7. 23
      examples/node/test.js
  8. 30
      index.html
  9. 47
      package.json
  10. 154
      src/browser/index.js
  11. 61
      src/browser/lang.js
  12. 21
      src/browser/worker.js
  13. 2
      src/common/circularize.js
  14. 23
      src/common/desaturate.js
  15. 161
      src/common/dump.js
  16. 1
      src/common/langdata.json
  17. 143
      src/common/worker.js
  18. 137
      src/index.js
  19. 83
      src/node/index.js
  20. 36
      src/node/lang.js
  21. 22
      src/node/worker.js
  22. 26
      src/shared/desaturate.js
  23. 166
      src/shared/dump.js
  24. 2
      src/shared/fileSizes.js
  25. 3
      src/worker/db.js
  26. 53
      src/worker/detect.js
  27. 37
      src/worker/index.js
  28. 99
      src/worker/loadLanguage.js
  29. 56
      src/worker/recognize.js
  30. 42
      webpack.config.dev.js
  31. 46
      webpack.config.prod.js

26
devServer.js

@ -1,26 +0,0 @@ @@ -1,26 +0,0 @@
// Development server: compiles the bundles through webpack-dev-middleware and
// serves the repository root as static files on localhost.
var path = require('path');
var express = require('express');
var webpack = require('webpack');
var config = require('./webpack.config.dev');

var app = express();
var compiler = webpack(config);
var devMiddleware = require('webpack-dev-middleware');
var listenPort = 7355;

// The webpack config is an array of configs; the first entry's publicPath is
// where the in-memory bundles are mounted.
var middlewareOptions = {
    noInfo: true,
    publicPath: config[0].output.publicPath
};
app.use(devMiddleware(compiler, middlewareOptions));

// app.use(require('webpack-hot-middleware')(compiler));

// Everything else is served straight from the working directory.
app.use('/', express.static('./'));

function onListening(err) {
    if (!err) {
        console.log('Listening at http://localhost:' + listenPort);
    } else {
        console.log(err);
    }
}

app.listen(listenPort, 'localhost', onListening);

252
dist/tesseract.js vendored

@ -1 +1,251 @@ @@ -1 +1,251 @@
!function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t():"function"==typeof define&&define.amd?define([],t):"object"==typeof exports?exports.Tesseract=t():e.Tesseract=t()}(this,function(){return function(e){function t(n){if(r[n])return r[n].exports;var o=r[n]={exports:{},id:n,loaded:!1};return e[n].call(o.exports,o,o.exports,t),o.loaded=!0,o.exports}var r={};return t.m=e,t.c=r,t.p="",t(0)}([function(e,t){"use strict";function r(e,t){return a||(a=o(g.coreUrl,g.workerUrl,g.langUrl)),a.recognize(e,t)}function n(e){return a||(a=o(g.coreUrl,g.workerUrl,g.langUrl)),a.detect(e)}function o(){function e(e,t){var r=s++;u[r]={};var n=0;return Object.getOwnPropertyNames(t).filter(function(e){return"function"==typeof t[e]}).forEach(function(o){n++,t[o](function(a){t[o]=a,0==--n&&i.postMessage({jobId:r,action:e,args:t})})}),0==n&&i.postMessage({jobId:r,action:e,args:t}),{then:function(e){return u[r].result=e,this},error:function(e){return u[r].error=e,this},progress:function(e){return u[r].progress=e,this}}}function t(e){if(e.match&&e.match(/^https?:\/\//))return function(r){var n=new Image;n.src=e,n.onload=function(){return r(t(n))}};if("string"==typeof e&&(e=document.querySelector(e)),e.getContext)e=e.getContext("2d");else if("IMG"==e.tagName||"VIDEO"==e.tagName){var r=document.createElement("canvas");r.width=e.naturalWidth||e.videoWidth,r.height=e.naturalHeight||e.videoHeight;var n=r.getContext("2d");n.drawImage(e,0,0),e=n}return e.getImageData&&(e=e.getImageData(0,0,e.canvas.width,e.canvas.height)),e}var r=arguments.length>0&&void 0!==arguments[0]?arguments[0]:g.coreUrl,n=arguments.length>1&&void 0!==arguments[1]?arguments[1]:g.workerUrl,o=arguments.length>2&&void 0!==arguments[2]?arguments[2]:g.langUrl,a=new Blob(["importScripts('"+r+"');\n\t\t importScripts('"+n+"');"]),i=new Worker(window.URL.createObjectURL(a)),c=!1,s=0,u={};return i.onmessage=function(e){var 
t=e.data,r=t.jobId,n=t.progress,o=t.error,a=t.result,i=u[r];n&&i.progress&&i.progress(n),o&&i.error&&i.error(o),a&&i.result&&i.result(a)},e("init",{mem:100663296,langUrl:o}),{detect:function(r){return e("detect",{image:t(r)})},recognize:function(r){var n=arguments.length>1&&void 0!==arguments[1]?arguments[1]:"eng";return"string"==typeof n?n={lang:n}:n.lang=n.lang||"eng",c||["chi_sim","chi_tra","jpn"].indexOf(n.lang)==-1||(e("init",{mem:167772160,langUrl:o}),c=!0),e("recognize",{options:n,image:t(r)})}}}var a,i="https://cdn.rawgit.com/naptha/tesseract.js-core/master/index.js",c="https://cdn.rawgit.com/naptha/tesseract.js/8b915dc/dist/tesseract.worker.js",s="https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/",g={coreUrl:i,workerUrl:c,langUrl:s,recognize:r,detect:n,createWorker:o};e.exports=g}])});
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.Tesseract = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){
'use strict';
exports.defaultOptions = {
langPath: 'https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/',
workerPath: 'dist/worker.js',
tesseractPath: 'https://cdn.rawgit.com/naptha/tesseract.js-core/0.1.0/index.js'
};
exports.spawnWorker = function spawnWorker(instance, workerOptions) {
var worker = new Worker(workerOptions.workerPath);
worker.onmessage = function (e) {
instance._recv(e.data);
};
return worker;
};
exports.terminateWorker = function (instance) {
instance.worker.terminate();
};
exports.sendPacket = function sendPacket(instance, packet) {
loadImage(packet.payload.image, function (img) {
packet.payload.image = img;
instance.worker.postMessage(packet);
});
};
function loadImage(image, cb) {
if (typeof image === 'string') {
if (/^\#/.test(image)) {
// element css selector
return loadImage(document.querySelector(image), cb);
} else {
// url or path
var im = new Image();
im.src = image;
im.onload = function (e) {
return loadImage(im, cb);
};
return;
}
} else if (image instanceof File) {
// files
var fr = new FileReader();
fr.onload = function (e) {
return loadImage(fr.result, cb);
};
fr.readAsDataURL(image);
return;
} else if (image instanceof Blob) {
return loadImage(URL.createObjectURL(image), cb);
} else if (image.getContext) {
// canvas element
return loadImage(image.getContext('2d'), cb);
} else if (image.tagName == "IMG" || image.tagName == "VIDEO") {
// image element or video element
var c = document.createElement('canvas');
c.width = image.naturalWidth || image.videoWidth;
c.height = image.naturalHeight || image.videoHeight;
var ctx = c.getContext('2d');
ctx.drawImage(image, 0, 0);
return loadImage(ctx, cb);
} else if (image.getImageData) {
// canvas context
var data = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
return loadImage(data, cb);
}
cb(image);
}
},{}],2:[function(require,module,exports){
"use strict";
var _createClass = function () { function defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } return function (Constructor, protoProps, staticProps) { if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor; }; }();
function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } }
var adapter = require('./node/index.js');
function createWorker(workerOptions) {
return new TesseractWorker(workerOptions);
}
var TesseractWorker = function () {
function TesseractWorker(workerOptions) {
_classCallCheck(this, TesseractWorker);
this.worker = null;
this.workerOptions = workerOptions;
this._currentJob = null;
this._queue = [];
}
_createClass(TesseractWorker, [{
key: 'recognize',
value: function recognize(image, options) {
var _this = this;
return this._delay(function (job) {
options = options || {};
options.lang = options.lang || 'eng';
job._send('recognize', { image: image, options: options, workerOptions: _this.workerOptions });
});
}
}, {
key: 'detect',
value: function detect(image, options) {
var _this2 = this;
options = options || {};
return this._delay(function (job) {
job._send('detect', { image: image, options: options, workerOptions: _this2.workerOptions });
});
}
}, {
key: 'terminate',
value: function terminate() {
if (this.worker) adapter.terminateWorker(this);
this.worker = null;
}
}, {
key: '_delay',
value: function _delay(fn) {
var _this3 = this;
if (!this.worker) this.worker = adapter.spawnWorker(this, this.workerOptions);
var job = new TesseractJob(this);
this._queue.push(function (e) {
_this3._queue.shift();
_this3._currentJob = job;
fn(job);
});
if (!this._currentJob) this._dequeue();
return job;
}
}, {
key: '_dequeue',
value: function _dequeue() {
this._currentJob = null;
if (this._queue.length > 0) {
this._queue[0]();
}
}
}, {
key: '_recv',
value: function _recv(packet) {
if (this._currentJob.id === packet.jobId) {
this._currentJob._handle(packet);
} else {
console.warn('Job ID ' + packet.jobId + ' not known.');
}
}
}]);
return TesseractWorker;
}();
var jobCounter = 0;
var TesseractJob = function () {
function TesseractJob(instance) {
_classCallCheck(this, TesseractJob);
this.id = 'Job-' + ++jobCounter + '-' + Math.random().toString(16).slice(3, 8);
this._instance = instance;
this._resolve = [];
this._reject = [];
this._progress = [];
}
_createClass(TesseractJob, [{
key: 'then',
value: function then(resolve, reject) {
if (this._resolve.push) {
this._resolve.push(resolve);
} else {
resolve(this._resolve);
}
if (reject) this.catch(reject);
return this;
}
}, {
key: 'catch',
value: function _catch(reject) {
if (this._reject.push) {
this._reject.push(reject);
} else {
reject(this._reject);
}
return this;
}
}, {
key: 'progress',
value: function progress(fn) {
this._progress.push(fn);
return this;
}
}, {
key: '_send',
value: function _send(action, payload) {
adapter.sendPacket(this._instance, {
jobId: this.id,
action: action,
payload: payload
});
}
}, {
key: '_handle',
value: function _handle(packet) {
var data = packet.data;
if (packet.status === 'resolve') {
if (this._resolve.length === 0) console.debug(data);
this._resolve.forEach(function (fn) {
var ret = fn(data);
if (ret && typeof ret.then == 'function') {
console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.');
}
});
this._resolve = data;
this._instance._dequeue();
} else if (packet.status === 'reject') {
if (this._reject.length === 0) console.error(data);
this._reject.forEach(function (fn) {
return fn(data);
});
this._reject = data;
this._instance._dequeue();
} else if (packet.status === 'progress') {
this._progress.forEach(function (fn) {
return fn(data);
});
} else {
console.warn('Message type unknown', packet.status);
}
}
}]);
return TesseractJob;
}();
var DefaultTesseract = createWorker(adapter.defaultOptions);
DefaultTesseract.createWorker = createWorker;
module.exports = DefaultTesseract;
},{"./node/index.js":1}]},{},[2])(2)
});

12293
dist/worker.js vendored

File diff suppressed because it is too large Load Diff

0
tesseract_lang_list.md → docs/tesseract_lang_list.md

0
tesseract_parameters.md → docs/tesseract_parameters.md

5
examples/file-input/demo.html

@ -0,0 +1,5 @@ @@ -0,0 +1,5 @@
<script src="dist/tesseract.js"></script>
<input type="file" onchange="Tesseract.recognize(this.files[0]).progress(console.log)">
<input type="file" onchange="Tesseract.recognize(this.files[0], {lang: 'chi_sim'}).progress(console.log)">

23
examples/node/test.js

@ -0,0 +1,23 @@ @@ -0,0 +1,23 @@
// Node smoke test: runs orientation/script detection on a local image and
// logs every progress packet followed by the final result.
// NOTE(review): './src/index.js' is resolved relative to this file's own
// directory - confirm the path is correct now that the script lives under
// examples/node/.
var Tesseract = require('./src/index.js')
// Expose the library as a global as well.
global.Tesseract = Tesseract;
// Alternative invocations kept for manual experimentation:
// Tesseract.recognize('yolop.png', {
// lang: 'eng'
// }).progress(function(info){
// console.log('--', info)
// })
// .then(function(data){
// console.log('--', data)
// })
// Tesseract.recognize('cosmic.jpg', {
// lang: 'eng'
// })
Tesseract.detect('cosmic.jpg')
.progress(function(info){
console.log(info)
})
.then(function(data){
console.log('done', data)
})

30
index.html

@ -1,30 +0,0 @@ @@ -1,30 +0,0 @@
<canvas id="c"></canvas>
<script type="text/javascript" src="./tesseract/tesseract.js"></script>
<script type="text/javascript">
var canvas = document.getElementById('c')
canvas.width = 400
canvas.height = 400
var ctx = canvas.getContext('2d');
ctx.font = '30px "Arial Black"'
ctx.fillText('Hell0 World', 100, 40)
// ctx.fillText("囚犯離奇掙脫囚犯離奇掙脫", 100, 40)
ctx.font = '30px "Times New Roman"'
ctx.fillText('from beyond', 100, 80)
// ctx.fillText('2小時可換乘2次2小時可換乘2次', 100, 80)
ctx.font = '30px sans-serif'
ctx.fillText('the Cosmic Void', 100, 120)
Tesseract.workerUrl = location.protocol+'//'+location.host+'/tesseract/tesseract.worker.js'
// Tesseract.detect(canvas)
// Tesseract.recognize('http://localhost:7355/westmorland.jpg')
// Tesseract.recognize(canvas, { tessedit_char_blacklist: 'e' })
Tesseract.recognize(canvas)
.progress(function(e){
console.log('progress', e)
})
.then(function(e){
console.log('result', e)
})
</script>

47
package.json

@ -2,34 +2,31 @@ @@ -2,34 +2,31 @@
"name": "tesseract.js",
"version": "1.0.0",
"description": "",
"main": "Tesseract.js",
"main": "index.js",
"scripts": {
"start": "node devServer.js",
"build": "webpack --config webpack.config.prod.js"
"test": "echo \"Error: no test specified\" & exit 1",
"start": "watchify src/index.js -o dist/tesseract.js --standalone Tesseract & watchify src/browser/worker.js -o dist/worker.js & http-server -p 7355",
"build": "browserify src/index.js -t [ babelify --presets [ es2015 ] ] -o dist/tesseract.js --standalone Tesseract && browserify src/browser/worker.js -t [ babelify --presets [ es2015 ] ] -o dist/worker.js"
},
"dependencies": {
"level-js": "^2.1.6",
"pako": "^0.2.7",
"tesseract.js-core": "^1.0.0"
},
"devDependencies": {
"babel": "^6.5.2",
"babel-core": "^6.7.0",
"babel-loader": "^6.2.4",
"express": "^4.13.4",
"webpack": "^1.13.0",
"webpack-dev-middleware": "^1.5.1",
"babel-preset-stage-1": "^6.5.0",
"babel-preset-es2015": "^6.6.0"
},
"repository": {
"type": "git",
"url": "https://github.com/naptha/tesseract.js.git"
"browser": {
"./src/node/index.js": "./src/browser/index.js"
},
"author": "",
"license": "MIT",
"bugs": {
"url": "https://github.com/naptha/tesseract.js/issues"
"license": "ISC",
"devDependencies": {
"babel-preset-es2015": "^6.16.0",
"babel-preset-react": "^6.16.0",
"babelify": "^7.3.0",
"browserify": "^13.1.0",
"http-server": "^0.9.0",
"watchify": "^3.7.0"
},
"homepage": "https://github.com/naptha/tesseract.js"
"dependencies": {
"file-type": "^3.8.0",
"jpeg-js": "^0.2.0",
"level-js": "^2.2.4",
"pako": "^1.0.3",
"png.js": "^0.2.1",
"tesseract.js-core": "^1.0.2"
}
}

154
src/browser/index.js

@ -1,106 +1,64 @@ @@ -1,106 +1,64 @@
var coreUrl = 'https://cdn.rawgit.com/naptha/tesseract.js-core/master/index.js',
workerUrl = 'https://cdn.rawgit.com/naptha/tesseract.js/8b915dc/dist/tesseract.worker.js',
langUrl = 'https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/',
worker;
function recognize(image, options){
if(!worker) worker = createWorker( Tesseract.coreUrl, Tesseract.workerUrl, Tesseract.langUrl )
return worker.recognize(image, options)
exports.defaultOptions = {
langPath: 'https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/',
workerPath: 'dist/worker.js',
tesseractPath: 'https://cdn.rawgit.com/naptha/tesseract.js-core/0.1.0/index.js',
}
function detect(image){
if(!worker) worker = createWorker( Tesseract.coreUrl, Tesseract.workerUrl, Tesseract.langUrl )
return worker.detect(image)
exports.spawnWorker = function spawnWorker(instance, workerOptions){
var worker = new Worker(workerOptions.workerPath)
worker.onmessage = function(e){
instance._recv(e.data)
}
return worker
}
function createWorker(coreUrl=Tesseract.coreUrl, workerUrl=Tesseract.workerUrl, langUrl=Tesseract.langUrl){
var blob = new Blob([`importScripts('${coreUrl}');
importScripts('${workerUrl}');`])
var worker = new Worker(window.URL.createObjectURL(blob));
var bigworker = false
var jobCounter = 0
var handlers = {}
function runAsync(action, args){
var jobId = jobCounter++
handlers[jobId] = {}
var waitingCount = 0
Object.getOwnPropertyNames(args)
.filter(name => typeof args[name] === 'function')
.forEach(name => {
waitingCount++
args[name](value => {
args[name] = value
if(--waitingCount == 0) worker.postMessage({jobId, action, args})
})
})
if(waitingCount == 0) worker.postMessage({jobId, action, args})
return {
then (f){ handlers[jobId].result = f; return this},
error (f){ handlers[jobId].error = f; return this},
progress(f){ handlers[jobId].progress = f; return this}
}
}
worker.onmessage = function(e){
var {jobId, progress, error, result} = e.data
var handler = handlers[jobId]
if(progress && handler.progress) handler.progress(progress);
if(error && handler.error) handler.error(error);
if(result && handler.result) handler.result(result);
}
function convertToImageData(image){
if(image.match && image.match(/^https?:\/\//)) {
return function thunk(cb){
var img = new Image()
img.src = image
img.onload = () => cb(convertToImageData(img))
}
}
if(typeof image === 'string') image = document.querySelector(image)
if(image.getContext) image = image.getContext('2d');
else if(image.tagName == "IMG" || image.tagName == "VIDEO"){
var c = document.createElement('canvas');
c.width = image.naturalWidth || image.videoWidth;
c.height = image.naturalHeight || image.videoHeight;
var ctx = c.getContext('2d');
ctx.drawImage(image, 0, 0);
image = ctx;
}
if(image.getImageData) image = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
return image
}
runAsync('init', {mem: (1<<24) * 6, langUrl})
return {
detect(image){
return runAsync('detect', {image: convertToImageData(image)})
},
recognize(image, options='eng'){
if (typeof options === 'string') options = {lang: options};
else options.lang = options.lang || 'eng';
if (!bigworker && ['chi_sim', 'chi_tra', 'jpn'].indexOf(options.lang) != -1){
runAsync('init', {mem: (1<<24) * 10, langUrl})
bigworker = true
}
exports.terminateWorker = function(instance){
instance.worker.terminate()
}
return runAsync('recognize', {options, image: convertToImageData(image)})
}
}
exports.sendPacket = function sendPacket(instance, packet){
loadImage(packet.payload.image, function(img){
packet.payload.image = img
instance.worker.postMessage(packet)
})
}
var Tesseract = {coreUrl, workerUrl, langUrl, recognize, detect, createWorker}
module.exports = Tesseract
/**
 * Normalises any supported image source and hands the result to `cb`.
 *
 * Each branch converts its input one step closer to pixel data and recurses:
 *   '#selector' string -> DOM element
 *   other string       -> Image element (treated as a URL or path)
 *   File               -> data URL (via FileReader)
 *   Blob               -> object URL
 *   canvas element     -> 2d context
 *   IMG / VIDEO        -> 2d context of a scratch canvas the size of the media
 *   2d context         -> result of getImageData over the whole canvas
 * Anything that matches no branch is passed to `cb` unchanged.
 *
 * @param {*} image - the image source in one of the forms listed above.
 * @param {function(*)} cb - receives the final, fully converted value.
 * @throws {TypeError} when `image` is null/undefined (for example a selector
 *   that matched no element).
 */
function loadImage(image, cb){
    if(image == null){
        // Previously this surfaced as "cannot read property 'getContext' of null".
        throw new TypeError('loadImage: no image to load (selector matched nothing or the argument was empty)')
    }
    if(typeof image === 'string'){
        if(/^\#/.test(image)){
            // element css selector
            return loadImage(document.querySelector(image), cb)
        }
        // url or path
        var im = new Image
        // Attach the handler before assigning src so the load event cannot be missed.
        im.onload = e => loadImage(im, cb);
        im.src = image;
        return
    }else if(image instanceof File){
        // files (checked before Blob, which File may also satisfy)
        var fr = new FileReader()
        fr.onload = e => loadImage(fr.result, cb);
        fr.readAsDataURL(image)
        return
    }else if(image instanceof Blob){
        var objectUrl = URL.createObjectURL(image)
        // By the time the callback fires the pixels have been copied out of the
        // decoded image, so the object URL can be released instead of leaking.
        return loadImage(objectUrl, function(data){
            URL.revokeObjectURL(objectUrl)
            cb(data)
        })
    }else if(image.getContext){
        // canvas element
        return loadImage(image.getContext('2d'), cb)
    }else if(image.tagName == "IMG" || image.tagName == "VIDEO"){
        // image element or video element
        var c = document.createElement('canvas');
        c.width = image.naturalWidth || image.videoWidth;
        c.height = image.naturalHeight || image.videoHeight;
        var ctx = c.getContext('2d');
        ctx.drawImage(image, 0, 0);
        return loadImage(ctx, cb)
    }else if(image.getImageData){
        // canvas context
        var data = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
        return loadImage(data, cb)
    }
    cb(image)
}

61
src/browser/lang.js

@ -0,0 +1,61 @@ @@ -0,0 +1,61 @@
var leveljs = require('level-js')
var db = typeof indexedDB === 'undefined' ? { open: (_, cb) => cb(true) } : leveljs('./tessdata2')
var langdata = require('../common/langdata.json')
module.exports = function getLanguageData(req, res, cb){
var lang = req.options.lang;
function saveDataFile(data){
db.put(lang, data, err => console.log('cached', lang, err))
cb(data)
}
db.open({ compression: false }, err => {
if (err) return fetchLanguageData(req, res, cb);
db.get(lang, (err, data) => {
if (err) return fetchLanguageData(req, res, saveDataFile);
res.progress({ status: 'found in cache ' + lang + '.traineddata' })
cb(data)
})
})
}
var ungzip = require('pako').ungzip;
/**
 * Downloads `<lang>.traineddata.gz` from `req.workerOptions.langPath`,
 * gunzips it as many times as it is gzip-wrapped, and passes the bytes to `cb`.
 *
 * Every failure (network error, bad HTTP status, corrupt gzip) is reported
 * through `res.reject`; `cb` is only ever called with usable language data.
 *
 * @param {Object} req - job payload; reads `req.options.lang` and
 *   `req.workerOptions.langPath`.
 * @param {Object} res - responder with `progress(info)` and `reject(message)`.
 * @param {function(Uint8Array)} cb - receives the decompressed traineddata.
 */
function fetchLanguageData(req, res, cb){
    var lang = req.options.lang;
    var langfile = lang + '.traineddata.gz';
    var url = req.workerOptions.langPath + langfile;

    var xhr = new XMLHttpRequest();
    xhr.open('GET', url, true);
    xhr.responseType = 'arraybuffer';

    xhr.onerror = e => {
        xhr.onprogress = xhr.onload = null
        // A network failure must fail the job. Previously the xhr object itself
        // was handed to the success callback as if it were language data.
        res.reject('Error downloading language ' + url)
    }

    xhr.onprogress = e =>
        res.progress({
            status: 'downloading ' + langfile,
            loaded: e.loaded,
            // langdata holds the expected byte size per language code
            progress: Math.min(1, e.loaded / langdata[lang])
        });

    xhr.onload = e => {
        // status 0 with a body is accepted as success as well (as the original did)
        if (!(xhr.status == 200 || (xhr.status == 0 && xhr.response))) return res.reject('Error downloading language ' + url);
        res.progress({ status: 'unzipping ' + langfile })

        // in case the gzips are already ungzipped or extra gzipped:
        // keep unwrapping while the gzip magic bytes (1f 8b) are present
        var response = new Uint8Array(xhr.response)
        try {
            while(response[0] == 0x1f && response[1] == 0x8b) response = ungzip(response);
        } catch (err) {
            return res.reject('Error unzipping language file ' + langfile + '\n' + err.message)
        }

        cb(response)
    }

    xhr.send()
}

21
src/browser/worker.js

@ -0,0 +1,21 @@ @@ -0,0 +1,21 @@
"use strict";
var workerUtils = require('../common/worker.js')
global.addEventListener('message', function(e){
var packet = e.data;
workerUtils.dispatchHandlers(packet, obj => postMessage(obj))
})
exports.getLanguageData = require('./lang.js')
exports.getCore = function(req, res){
if(!global.TesseractCore){
res.progress({ status: 'loading tesseract core' })
importScripts(req.workerOptions.tesseractPath)
res.progress({ status: 'loaded tesseract core' })
}
return TesseractCore
}
workerUtils.setAdapter(module.exports);

2
src/shared/circularize.js → src/common/circularize.js

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
export default function circularize(page){
module.exports = function circularize(page){
page.paragraphs = []
page.lines = []
page.words = []

23
src/common/desaturate.js

@ -0,0 +1,23 @@ @@ -0,0 +1,23 @@
module.exports = function desaturate(image){
var width, height;
if(image.data){
var src = image.data;
width = image.width, height = image.height;
var dst = new Uint8Array(width * height);
var srcLength = src.length | 0, srcLength_16 = (srcLength - 16) | 0;
for (var i = 0, j = 0; i <= srcLength_16; i += 16, j += 4) {
// convert to grayscale 4 pixels at a time; eveything with alpha gets put in front of 50% gray
dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16
dst[j+1] = (((src[i+4] * 77 + src[i+5] * 151 + src[i+6] * 28) * src[i+7]) + ((255-src[i+7]) << 15) + 32768) >> 16
dst[j+2] = (((src[i+8] * 77 + src[i+9] * 151 + src[i+10] * 28) * src[i+11]) + ((255-src[i+11]) << 15) + 32768) >> 16
dst[j+3] = (((src[i+12] * 77 + src[i+13] * 151 + src[i+14] * 28) * src[i+15]) + ((255-src[i+15]) << 15) + 32768) >> 16
}
for (; i < srcLength; i += 4, ++j) //finish up
dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16
image = dst;
} else {
throw 'Expected ImageData'
}
return image
}

161
src/common/dump.js

@ -0,0 +1,161 @@ @@ -0,0 +1,161 @@
/**
 * Removes one level of two-space indentation from a block of markup.
 *
 * When the first line starts with two spaces, that prefix is stripped from
 * every line that has it; lines without the prefix are left untouched. If
 * the first line is not indented the text is returned unchanged.
 *
 * @param {string} html - newline-separated markup.
 * @returns {string} the markup with the leading indent removed.
 */
function deindent(html){
    // The prefix must be as long as the substring(0, 2) it is compared with;
    // a one-character literal here can never match an indented line.
    var INDENT = "  "
    var lines = html.split('\n')
    if(lines[0].substring(0,2) === INDENT){
        for (var i = 0; i < lines.length; i++) {
            if (lines[i].substring(0,2) === INDENT) {
                lines[i] = lines[i].slice(2)
            }
        }
    }
    return lines.join('\n')
}
/**
 * Walks the recognition results symbol by symbol and builds a nested plain
 * object tree: blocks -> paragraphs -> lines -> words -> symbols.
 *
 * @param {Object} Module - the Tesseract core module; supplies the RIL_*
 *   level constants, the PT_/DIR_/PSM_/OEM_ enum values and the
 *   destroy/getPointer helpers used below.
 * @param {Object} base - the API object the results are read from.
 * @returns {Object} page summary: text, html (hOCR), confidence, blocks,
 *   psm, oem and version.
 */
module.exports = function DumpLiterallyEverything(Module, base){
var ri = base.GetIterator();
var blocks = [];
// Most recently opened node at each level; children are appended to these.
var block, para, textline, word, symbol;
// Reverse lookup: finds the Module key named `<prefix>_<NAME>` whose value is
// `value` and returns NAME (undefined when nothing matches).
function enumToString(value, prefix){
return (Object.keys(Module)
.filter(function(e){ return e.substr(0, prefix.length + 1) == prefix + '_' })
.filter(function(e){ return Module[e] === value })
.map(function(e){ return e.slice(prefix.length + 1) })[0])
}
ri.Begin()
// Each pass visits one symbol; a new node is opened at every level the
// iterator reports being at the beginning of, from block down to symbol.
do {
if(ri.IsAtBeginningOf(Module.RIL_BLOCK)){
var poly = ri.BlockPolygon();
var polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if(Module.getPointer(poly) > 0){
var n = poly.get_n(),
px = poly.get_x(),
py = poly.get_y(),
polygon = [];
for(var i = 0; i < n; i++){
polygon.push([px.getValue(i), py.getValue(i)]);
}
// The points have been copied into `polygon`; release the native object.
Module._ptaDestroy(Module.getPointer(poly));
}
block = {
paragraphs: [],
text: ri.GetUTF8Text(Module.RIL_BLOCK),
confidence: ri.Confidence(Module.RIL_BLOCK),
baseline: ri.getBaseline(Module.RIL_BLOCK),
bbox: ri.getBoundingBox(Module.RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon: polygon
}
blocks.push(block)
}
if(ri.IsAtBeginningOf(Module.RIL_PARA)){
para = {
lines: [],
text: ri.GetUTF8Text(Module.RIL_PARA),
confidence: ri.Confidence(Module.RIL_PARA),
baseline: ri.getBaseline(Module.RIL_PARA),
bbox: ri.getBoundingBox(Module.RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr()
}
block.paragraphs.push(para)
}
if(ri.IsAtBeginningOf(Module.RIL_TEXTLINE)){
textline = {
words: [],
text: ri.GetUTF8Text(Module.RIL_TEXTLINE),
confidence: ri.Confidence(Module.RIL_TEXTLINE),
baseline: ri.getBaseline(Module.RIL_TEXTLINE),
bbox: ri.getBoundingBox(Module.RIL_TEXTLINE)
}
para.lines.push(textline)
}
if(ri.IsAtBeginningOf(Module.RIL_WORD)){
var fontInfo = ri.getWordFontAttributes(),
wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
text: ri.GetUTF8Text(Module.RIL_WORD),
confidence: ri.Confidence(Module.RIL_WORD),
baseline: ri.getBaseline(Module.RIL_WORD),
bbox: ri.getBoundingBox(Module.RIL_WORD),
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),
is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
}
// Collect every choice the word-choice iterator yields, then free it.
var wc = new Module.WordChoiceIterator(ri);
do {
word.choices.push({
text: wc.GetUTF8Text(),
confidence: wc.Confidence()
})
} while (wc.Next());
Module.destroy(wc)
textline.words.push(word)
}
// Per-symbol image extraction is disabled, so `image` is always null.
var image = null;
// var pix = ri.GetBinaryImage(Module.RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// Module._pixDestroy(Module.getPointer(pix));
if(ri.IsAtBeginningOf(Module.RIL_SYMBOL)){
symbol = {
choices: [],
image: image,
text: ri.GetUTF8Text(Module.RIL_SYMBOL),
confidence: ri.Confidence(Module.RIL_SYMBOL),
baseline: ri.getBaseline(Module.RIL_SYMBOL),
bbox: ri.getBoundingBox(Module.RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
}
word.symbols.push(symbol)
// Collect every choice the symbol-choice iterator yields, then free it.
var ci = new Module.ChoiceIterator(ri);
do {
symbol.choices.push({
text: ci.GetUTF8Text(),
confidence: ci.Confidence()
})
} while (ci.Next());
Module.destroy(ci)
}
} while (ri.Next(Module.RIL_SYMBOL));
// Done walking; free the result iterator before assembling the summary.
Module.destroy(ri)
return {
text: base.GetUTF8Text(),
html: deindent(base.GetHOCRText()),
confidence: base.MeanTextConf(),
blocks: blocks,
psm: enumToString(base.GetPageSegMode(), 'PSM'),
oem: enumToString(base.oem(), 'OEM'),
version: base.Version(),
}
}

1
src/common/langdata.json

@ -0,0 +1 @@ @@ -0,0 +1 @@
{"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441, "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656, "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719, "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130, "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221, "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615, "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473, "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571, "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944, "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087, "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610, "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680, "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338, "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244, "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246, "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566, "vie": 2195922}

143
src/common/worker.js

@ -0,0 +1,143 @@ @@ -0,0 +1,143 @@
var latestJob;
var Module;
var base;
var adapter = {};
/**
 * Routes one incoming job packet to its handler and wires up the reply channel.
 *
 * The handler receives `packet.payload` and a responder `res`: calling
 * `res(status, data)`, or the shortcuts `res.resolve`, `res.reject` and
 * `res.progress`, sends `{ jobId, status, data }` back through `send`.
 * The responder is also stored in `latestJob` so progress callbacks fired
 * outside a handler's scope can reach the job that is currently running.
 *
 * An unknown action, or a handler that throws synchronously, is reported as a
 * rejection so the job is always settled instead of being left without a reply.
 *
 * @param {{jobId: *, action: string, payload: Object}} packet - the job request.
 * @param {function(Object)} send - transport used to deliver replies.
 */
function dispatchHandlers(packet, send){
    function respond(status, data){
        send({
            jobId: packet.jobId,
            status: status,
            data: data
        })
    }
    respond.resolve = respond.bind(this, 'resolve')
    respond.reject = respond.bind(this, 'reject')
    respond.progress = respond.bind(this, 'progress')

    latestJob = respond;

    try {
        if(packet.action === 'recognize'){
            handleRecognize(packet.payload, respond)
        }else if(packet.action === 'detect'){
            handleDetect(packet.payload, respond)
        }else{
            respond.reject('Unknown action: ' + packet.action)
        }
    } catch (err) {
        // Rejections elsewhere carry plain strings, so send the message text.
        respond.reject(err instanceof Error ? err.message : String(err))
    }
}
exports.dispatchHandlers = dispatchHandlers;
/**
 * Installs the platform adapter used by this module. The code in this file
 * calls `adapter.getCore(req, res)` and `adapter.getLanguageData(req, res, cb)`.
 *
 * @param {Object} impl - replaces the module-level `adapter`.
 */
exports.setAdapter = function setAdapter(impl){
adapter = impl;
}
/**
 * Lazily creates the shared Tesseract `Module` and `base` API object.
 * Does nothing when `Module` already exists.
 *
 * @param {Object} req - job payload; `req.memory` is forwarded as TOTAL_MEMORY.
 * @param {Object} res - responder; `res.progress` reports the init phases.
 */
function handleInit(req, res){
    if(Module) return;

    var createCore = adapter.getCore(req, res);
    res.progress({ status: 'initializing tesseract api' })

    var coreOptions = {
        TOTAL_MEMORY: req.memory,
        // Forwarded to whichever job is current; the first 30% is mapped to 0.
        TesseractProgress: function TesseractProgress(percent){
            var fraction = (percent-30)/70
            latestJob.progress({ status: 'recognizing text', progress: Math.max(0, fraction) })
        },
        onRuntimeInitialized: function onRuntimeInitialized() {}
    }
    Module = createCore(coreOptions)

    // Directory that loadLanguage later writes traineddata files into.
    Module.FS_createPath("/", "tessdata", true, true)
    base = new Module.TessBaseAPI()
    res.progress({ status: 'initialized tesseract api' })
}
var dump = require('./dump.js')
var desaturate = require('./desaturate.js')
/**
 * Hands an image to the Tesseract API as 8-bit grayscale.
 *
 * The desaturated pixels are copied into memory obtained from
 * `Module.allocate`, registered with `base.SetImage` (1 byte per pixel,
 * `width` bytes per row), and the rectangle is set to the whole image.
 *
 * @param {Object} Module - core module providing allocate/wrapPointer.
 * @param {Object} base - API object that receives the image.
 * @param {Object} image - pixel source accepted by `desaturate`; its `width`
 *   and `height` are used.
 * @returns {*} the pointer from `Module.allocate`; callers in this file
 *   release it with `Module._free`.
 */
function setImage(Module, base, image){
    var grayPixels = desaturate(image);
    var w = image.width;
    var h = image.height;

    var pixelPtr = Module.allocate(grayPixels, 'i8', Module.ALLOC_NORMAL);
    var wrapped = Module.wrapPointer(pixelPtr);
    base.SetImage(wrapped, w, h, 1, w);
    base.SetRectangle(0, 0, w, h)
    return pixelPtr;
}
/**
 * Makes sure `<lang>.traineddata` exists in the module's `tessdata` directory.
 *
 * Languages already recorded in `Module._loadedLanguages` are skipped;
 * otherwise the data comes from `adapter.getLanguageData` and is written as
 * a data file before `cb` runs.
 *
 * @param {Object} req - job payload; `req.options.lang` is the language code.
 * @param {Object} res - responder; `res.progress` is told when the file is loaded.
 * @param {function()} cb - called with no arguments once the language is available.
 * @returns {*} the result of `cb()` when the language was already loaded,
 *   otherwise undefined.
 */
function loadLanguage(req, res, cb){
    var lang = req.options.lang;

    var known = Module._loadedLanguages || (Module._loadedLanguages = {});
    if(lang in known) return cb();

    function installLanguage(data){
        var filename = lang + ".traineddata"
        Module.FS_createDataFile('tessdata', filename, data, true, false);
        res.progress({ status: 'loaded ' + lang + '.traineddata' })
        Module._loadedLanguages[lang] = true;
        cb()
    }

    adapter.getLanguageData(req, res, installLanguage)
}
/**
 * Runs text recognition for one job and resolves it with the dumped result.
 *
 * Sequence: initialise the API if needed, ensure the language data is
 * loaded, Init with that language, set the image, Recognize, dump the
 * results, then End the API session and free the image memory before
 * resolving.
 *
 * @param {Object} req - job payload; uses `req.options.lang` and `req.image`.
 * @param {Object} res - responder used for progress and the final resolve.
 */
function handleRecognize(req, res){
    handleInit(req, res)

    function recognizeWithLanguage(){
        base.Init(null, req.options.lang)
        res.progress({ status: 'initialized with language' })

        var imagePtr = setImage(Module, base, req.image);
        base.Recognize(null)
        var page = dump(Module, base)

        // Release native resources before handing the result back.
        base.End();
        Module._free(imagePtr);
        res.resolve(page);
    }

    loadLanguage(req, res, recognizeWithLanguage)
}
/**
 * Runs orientation and script detection for one job.
 *
 * Forces `req.options.lang` to 'osd', loads that data, runs `DetectOS` in
 * OSD-only page segmentation mode and resolves with the best result, or
 * rejects with "failed to detect os". The API session is ended and the
 * image memory freed before either reply is sent.
 *
 * @param {Object} req - job payload; `req.options` must be an object and
 *   `req.image` is the image to analyse.
 * @param {Object} res - responder used for resolve / reject.
 */
function handleDetect(req, res){
    handleInit(req, res)
    req.options.lang = 'osd';

    function detectWithOsd(){
        base.Init(null, 'osd')
        base.SetPageSegMode(Module.PSM_OSD_ONLY)

        var imagePtr = setImage(Module, base, req.image);
        var osResults = new Module.OSResults();
        var detected = base.DetectOS(osResults);

        var summary = null
        if(detected){
            // Read everything out of the native result before ending the session.
            var charset = osResults.get_unicharset()
            var best = osResults.get_best_result()
            var orientationId = best.get_orientation_id()
            var scriptId = best.get_script_id()
            var degreesByOrientationId = [0, 270, 180, 90]

            summary = {
                tesseract_script_id: scriptId,
                script: charset.get_script_from_script_id(scriptId),
                script_confidence: best.get_sconfidence(),
                orientation_degrees: degreesByOrientationId[orientationId],
                orientation_confidence: best.get_oconfidence()
            }
        }

        base.End();
        Module._free(imagePtr);

        if(detected){
            res.resolve(summary)
        } else {
            res.reject("failed to detect os")
        }
    }

    loadLanguage(req, res, detectWithOsd)
}

137
src/index.js

@ -0,0 +1,137 @@ @@ -0,0 +1,137 @@
"use strict";
var adapter = require('./node/index.js')
/**
 * Factory for TesseractWorker instances.
 *
 * @param {Object} workerOptions - options stored on the worker and forwarded with every job.
 * @returns {TesseractWorker} a new, not yet spawned worker wrapper.
 */
function createWorker(workerOptions){
    var instance = new TesseractWorker(workerOptions);
    return instance;
}
/**
 * Front-end for one background Tesseract worker.
 *
 * Jobs are executed strictly one at a time: each call to recognize()/detect()
 * creates a TesseractJob, queues it, and the next job is started when the
 * current one reports a result (TesseractJob calls _dequeue()).
 * The underlying worker is spawned lazily by the platform adapter.
 */
class TesseractWorker {
    /**
     * @param {Object} workerOptions - adapter-specific options, sent along with every job.
     */
    constructor(workerOptions){
        this.worker = null;
        this.workerOptions = workerOptions;
        this._currentJob = null;
        this._queue = []
    }

    /**
     * Queue an OCR job.
     * @param {*} image - image in any form the adapter understands.
     * @param {Object} [options] - recognition options; `lang` defaults to 'eng'.
     * @returns {TesseractJob} job handle (then / catch / progress).
     */
    recognize(image, options){
        return this._delay(job => {
            // Work on a copy so the caller's options object is never modified.
            var opts = Object.assign({}, options)
            opts.lang = opts.lang || 'eng';
            job._send('recognize', { image: image, options: opts, workerOptions: this.workerOptions })
        })
    }

    /**
     * Queue a script/orientation detection job.
     * @param {*} image - image in any form the adapter understands.
     * @param {Object} [options]
     * @returns {TesseractJob} job handle.
     */
    detect(image, options){
        options = options || {}
        return this._delay(job => {
            job._send('detect', { image: image, options: options, workerOptions: this.workerOptions })
        })
    }

    /**
     * Stop the background worker. Jobs that were running or queued are
     * abandoned (their callbacks never fire); the instance can be reused and
     * will spawn a fresh worker on the next request.
     */
    terminate(){
        if(this.worker) adapter.terminateWorker(this);
        this.worker = null;
        // Without this reset a job that was in flight would stay "current"
        // forever and every later request would sit in the queue unstarted.
        this._currentJob = null;
        this._queue = []
    }

    /**
     * Create a job and schedule `fn(job)` to run when it reaches the head of the queue.
     * @param {Function} fn - starts the job (sends the packet).
     * @returns {TesseractJob}
     */
    _delay(fn){
        if(!this.worker) this.worker = adapter.spawnWorker(this, this.workerOptions);
        var job = new TesseractJob(this);
        this._queue.push(e => {
            this._queue.shift()
            this._currentJob = job;
            fn(job)
        })
        if(!this._currentJob) this._dequeue();
        return job
    }

    /** Mark the worker idle and start the next queued job, if any. */
    _dequeue(){
        this._currentJob = null;
        if(this._queue.length > 0){
            this._queue[0]()
        }
    }

    /**
     * Route a packet coming back from the worker to the job it belongs to.
     * Packets that do not match the job in flight (including packets that
     * arrive while nothing is running) are reported and dropped.
     * @param {Object} packet - message with a `jobId` field.
     */
    _recv(packet){
        if(this._currentJob && this._currentJob.id === packet.jobId){
            this._currentJob._handle(packet)
        }else{
            console.warn('Job ID ' + packet.jobId + ' not known.')
        }
    }
}
var jobCounter = 0;

/**
 * Handle for a single recognize/detect request.
 *
 * Offers a small promise-like surface (then / catch / progress) but is NOT a
 * real Promise: callbacks do not chain and every method returns the job itself.
 * Callbacks registered after the job has settled are invoked immediately with
 * the stored result.
 */
class TesseractJob {
    /**
     * @param {Object} instance - owning worker wrapper; must provide _dequeue()
     *                            and is passed to adapter.sendPacket.
     */
    constructor(instance){
        this.id = 'Job-' + (++jobCounter) + '-' + Math.random().toString(16).slice(3, 8)
        this._instance = instance;
        this._resolve = []
        this._reject = []
        this._progress = []
        // Settlement is tracked explicitly. Inferring it from the type of
        // _resolve/_reject breaks when the result itself is an array or null.
        this._state = 'pending' // 'pending' | 'resolved' | 'rejected'
        this._result = undefined
    }

    /**
     * Register a success callback and, optionally, a failure callback.
     * @param {Function} [resolve] - called with the result.
     * @param {Function} [reject]  - called with the error.
     * @returns {TesseractJob} this job.
     */
    then(resolve, reject){
        if(typeof resolve === 'function'){
            if(this._state === 'resolved'){
                resolve(this._result)
            }else if(this._state === 'pending'){
                this._resolve.push(resolve)
            }
        }
        if(reject) this.catch(reject);
        return this;
    }

    /**
     * Register a failure callback.
     * @param {Function} reject - called with the error.
     * @returns {TesseractJob} this job.
     */
    catch(reject){
        if(typeof reject === 'function'){
            if(this._state === 'rejected'){
                reject(this._result)
            }else if(this._state === 'pending'){
                this._reject.push(reject)
            }
        }
        return this;
    }

    /**
     * Register a progress callback.
     * @param {Function} fn - called with every progress payload.
     * @returns {TesseractJob} this job.
     */
    progress(fn){
        this._progress.push(fn)
        return this;
    }

    /** Send this job's request to the worker through the adapter. */
    _send(action, payload){
        adapter.sendPacket(this._instance, {
            jobId: this.id,
            action: action,
            payload: payload
        })
    }

    /** Record the final outcome and release the registered callbacks. */
    _settle(state, value){
        this._state = state
        this._result = value
        this._resolve = []
        this._reject = []
    }

    /**
     * Process a packet from the worker addressed to this job.
     * 'resolve' and 'reject' settle the job and let the owning worker start
     * the next queued job, even if a callback throws.
     * @param {Object} packet - `{ status, data }`.
     */
    _handle(packet){
        var data = packet.data;
        if(packet.status === 'resolve'){
            var onResolve = this._resolve;
            this._settle('resolved', data)
            if(onResolve.length === 0) console.debug(data);
            try {
                onResolve.forEach(fn => {
                    var ret = fn(data);
                    if(ret && typeof ret.then == 'function'){
                        console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.')
                    }
                })
            } finally {
                this._instance._dequeue()
            }
        }else if(packet.status === 'reject'){
            var onReject = this._reject;
            this._settle('rejected', data)
            if(onReject.length === 0) console.error(data);
            try {
                onReject.forEach(fn => fn(data))
            } finally {
                this._instance._dequeue()
            }
        }else if(packet.status === 'progress'){
            this._progress.forEach(fn => fn(data))
        }else{
            console.warn('Message type unknown', packet.status)
        }
    }
}
// Module export: a ready-to-use worker wrapper built from the adapter's default options.
var DefaultTesseract = createWorker(adapter.defaultOptions)
// The factory is attached to the default instance so callers can create
// additional workers with their own options.
DefaultTesseract.createWorker = createWorker;
module.exports = DefaultTesseract

83
src/node/index.js

@ -0,0 +1,83 @@ @@ -0,0 +1,83 @@
var path = require('path')
// Default worker options for the Node adapter.
exports.defaultOptions = {
// Script that is forked as the worker child process (worker.js next to this file).
workerPath: path.join(__dirname, 'worker.js'),
// Base URL that language data file names are appended to when downloading.
// NOTE(review): plain http — presumably because the Node language loader uses the `http` module; confirm before changing.
langPath: 'http://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/',
}
var fork = require('child_process').fork;
var fs = require('fs')
exports.spawnWorker = function spawnWorker(instance, workerOptions){
var cp = fork(workerOptions.workerPath);
cp.on('message', function(packet){
instance._recv(packet)
})
return cp;
}
exports.terminateWorker = function(instance){
instance.worker.kill()
}
exports.sendPacket = function sendPacket(instance, packet){
loadImage(packet.payload.image, function(img){
packet.payload.image = img
instance.worker.send(packet)
})
}
function loadImage(image, cb){
if(typeof image === 'string'){
fs.readFile(image, function(err, buffer){
loadImage(buffer, cb)
})
return
}else if(image instanceof Buffer){
var fileType = require('file-type');
var mime = fileType(image).mime
if(mime === 'image/png'){
var PNGReader = require('png.js');
var reader = new PNGReader(image);
reader.parse(function(err, png){
if (err) throw err;
var image = {
width: png.getWidth(),
height: png.getHeight()
}
image.data = new Uint8Array(image.width * image.height * 4)
for(var j = 0; j < image.height; j++){
for(var i = 0; i < image.width; i++){
var offset = 4 * (i + j * image.width),
pix = png.getPixel(i, j);
image.data[offset] = pix[0]
image.data[offset + 1] = pix[1]
image.data[offset + 2] = pix[2]
image.data[offset + 3] = pix[3];
}
}
// console.log(image)
loadImage(image, cb)
});
return
}else if(mime === 'image/jpeg'){
var jpeg = require('jpeg-js');
loadImage(jpeg.decode(image), cb)
return
}
// TODO: support for TIFF, NetPBM, BMP, etc.
}
// node uses json.stringify for ipc which means we need to turn
// fancy arrays into raw arrays
if(image && image.data && image.data.length && !Array.isArray(image.data)){
image.data = Array.from(image.data)
return loadImage(image, cb)
}
cb(image)
}

36
src/node/lang.js

@ -0,0 +1,36 @@ @@ -0,0 +1,36 @@
var http = require("http"),
zlib = require("zlib"),
fs = require("fs"),
path = require("path");
var langdata = require('../common/langdata.json')
/**
 * Provide the trained data for `req.options.lang` as a Uint8Array.
 *
 * The data is read from `<lang>.traineddata` in the current working directory.
 * If that fails, `<lang>.traineddata.gz` is downloaded from
 * `req.workerOptions.langPath`, unzipped into that file, and the file is read
 * again once it has been written completely.
 *
 * Download failures (network error, non-200 status, corrupt gzip, write error)
 * remove the partially written file and reject the job through `res.reject`.
 *
 * @param {Object} req  - request with `options.lang` and `workerOptions.langPath`.
 * @param {Object} res  - response helper (`progress`, `reject`).
 * @param {Function} cb - called with the language data as a Uint8Array.
 */
function getLanguageData(req, res, cb){
    var lang = req.options.lang;
    var langfile = lang + '.traineddata.gz';
    var cachefile = lang + '.traineddata';
    var url = req.workerOptions.langPath + langfile;

    fs.readFile(cachefile, function (err, data) {
        if(!err) return cb(new Uint8Array(data));

        var failed = false;
        var file = null;

        function fail(reason){
            if(failed) return;
            failed = true;
            var message = 'failed to download ' + langfile + ': ' + reason;
            if(!file) return res.reject(message);
            // Best effort: do not leave a truncated file behind, it would be
            // picked up as a valid cache entry on the next run.
            fs.unlink(cachefile, function(){ res.reject(message) })
        }

        http.get(url, function(stream){
            if(stream.statusCode !== 200){
                stream.resume(); // discard the body
                return fail('HTTP status ' + stream.statusCode);
            }

            var received_bytes = 0;
            stream.on('data', function(chunk) {
                received_bytes += chunk.length;
                res.progress({
                    status: 'downloading ' + langfile,
                    loaded: received_bytes,
                    progress: Math.min(1, received_bytes / langdata[lang])
                });
            });

            var gunzip = zlib.createGunzip();
            file = fs.createWriteStream(cachefile);

            stream.on('error', function(e){ fail(e.message) })
            gunzip.on('error', function(e){ fail(e.message) })
            file.on('error', function(e){ fail(e.message) })

            // Re-read the cache only after the file has been flushed; the
            // gunzip 'end' event fires while writes may still be pending.
            file.on('finish', function(){
                if(!failed) getLanguageData(req, res, cb)
            })

            stream.pipe(gunzip).pipe(file)
        }).on('error', function(e){ fail(e.message) })
    });
}
module.exports = getLanguageData;

22
src/node/worker.js

@ -0,0 +1,22 @@ @@ -0,0 +1,22 @@
"use strict";
var workerUtils = require('../common/worker.js')
// Every IPC packet from the parent process is handed to the shared dispatcher;
// whatever the dispatcher wants to report is sent back with process.send.
process.on('message', function(packet){
workerUtils.dispatchHandlers(packet, obj => process.send(obj))
})
// Adapter hook: the language data loader implemented in ./lang.js.
exports.getLanguageData = require('./lang.js')
var TesseractCore;
exports.getCore = function(req, res){
if(!TesseractCore){
res.progress({ status: 'loading tesseract core' })
TesseractCore = require('tesseract.js-core')
res.progress({ status: 'loaded tesseract core' })
}
return TesseractCore
}
// Register this module's exports (getLanguageData, getCore) as the platform
// adapter of the shared worker code. Must run after both hooks are assigned.
workerUtils.setAdapter(module.exports);

26
src/shared/desaturate.js

@ -1,26 +0,0 @@ @@ -1,26 +0,0 @@
/**
 * Convert RGBA pixel data to 8-bit grayscale.
 *
 * Each pixel becomes a weighted sum of its channels (77/151/28 for R/G/B)
 * scaled by alpha; the transparent share is filled with 50% gray.
 *
 * @param {Object} image - object with `width`, `height` and RGBA `data`
 *                         (4 values per pixel).
 * @returns {Uint8Array} `width * height` gray values.
 * @throws {string} 'Expected ImageData' when `image.data` is missing.
 */
export default function desaturate(image){
    if(!image.data){
        throw 'Expected ImageData'
    }

    var rgba = image.data;
    var gray = new Uint8Array(image.width * image.height);
    var total = rgba.length | 0;

    // Fixed-point luminance of the pixel starting at `at`, composited over 50% gray.
    function luminance(at){
        var alpha = rgba[at + 3];
        var weighted = rgba[at] * 77 + rgba[at + 1] * 151 + rgba[at + 2] * 28;
        return ((weighted * alpha) + ((255 - alpha) << 15) + 32768) >> 16
    }

    for(var s = 0, d = 0; s < total; s += 4, ++d){
        gray[d] = luminance(s)
    }

    return gray
}

166
src/shared/dump.js

@ -1,166 +0,0 @@ @@ -1,166 +0,0 @@
/**
 * Remove one level of two-space indentation from a block of text.
 *
 * When the first line starts with two spaces, those two characters are
 * stripped from every line that has them; otherwise the text is returned
 * unchanged.
 *
 * @param {string} html - multi-line text (here: Tesseract's hOCR output).
 * @returns {string} the text with the leading indent removed.
 */
function deindent(html){
    var indent = '  '
    var lines = html.split('\n')
    // The prefix test must use the same width as the slice below; comparing a
    // two-character prefix with a single space can never strip an indent.
    if(lines[0].startsWith(indent)){
        lines = lines.map(function(line){
            return line.startsWith(indent) ? line.slice(indent.length) : line
        })
    }
    return lines.join('\n')
}
/**
 * Serialise the current recognition result into a plain object tree.
 *
 * Reads `module` and `base` from the worker global `self`, walks the result
 * iterator one symbol at a time and builds the hierarchy
 * blocks -> paragraphs -> lines -> words -> symbols. Whenever the iterator
 * sits at the beginning of a level, a new node for that level is created and
 * attached to the most recently created parent.
 *
 * @returns {Object} `{ text, html, confidence, blocks, psm, oem, version }`.
 */
export default function DumpLiterallyEverything(){
var {module, base} = self
var ri = base.GetIterator();
var blocks = [];
// Most recently created node of each level; children are attached to these.
var block, para, textline, word, symbol;
// Map a numeric enum value back to its name: find the key on `module` that
// starts with `prefix + '_'` and holds `value`, and return the key without the prefix.
function enumToString(value, prefix){
return (Object.keys(module)
.filter(function(e){ return e.substr(0, prefix.length + 1) == prefix + '_' })
.filter(function(e){ return module[e] === value })
.map(function(e){ return e.slice(prefix.length + 1) })[0])
}
const {RIL_BLOCK, RIL_PARA, RIL_TEXTLINE, RIL_WORD, RIL_SYMBOL} = module
ri.Begin()
do {
// --- block level ---
if(ri.IsAtBeginningOf(RIL_BLOCK)){
var poly = ri.BlockPolygon();
var polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if(module.getPointer(poly) > 0){
var n = poly.get_n(),
px = poly.get_x(),
py = poly.get_y(),
polygon = [];
for(var i = 0; i < n; i++){
polygon.push([px.getValue(i), py.getValue(i)]);
}
// The points have been copied into `polygon`, so the native polygon is destroyed here.
module._ptaDestroy(module.getPointer(poly));
}
block = {
paragraphs: [],
text: ri.GetUTF8Text(RIL_BLOCK),
confidence: ri.Confidence(RIL_BLOCK),
baseline: ri.getBaseline(RIL_BLOCK),
bbox: ri.getBoundingBox(RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon: polygon
}
blocks.push(block)
}
// --- paragraph level ---
if(ri.IsAtBeginningOf(RIL_PARA)){
para = {
lines: [],
text: ri.GetUTF8Text(RIL_PARA),
confidence: ri.Confidence(RIL_PARA),
baseline: ri.getBaseline(RIL_PARA),
bbox: ri.getBoundingBox(RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr()
}
block.paragraphs.push(para)
}
// --- text line level ---
if(ri.IsAtBeginningOf(RIL_TEXTLINE)){
textline = {
words: [],
text: ri.GetUTF8Text(RIL_TEXTLINE),
confidence: ri.Confidence(RIL_TEXTLINE),
baseline: ri.getBaseline(RIL_TEXTLINE),
bbox: ri.getBoundingBox(RIL_TEXTLINE)
}
para.lines.push(textline)
}
// --- word level ---
if(ri.IsAtBeginningOf(RIL_WORD)){
var fontInfo = ri.getWordFontAttributes(),
wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
text: ri.GetUTF8Text(RIL_WORD),
confidence: ri.Confidence(RIL_WORD),
baseline: ri.getBaseline(RIL_WORD),
bbox: ri.getBoundingBox(RIL_WORD),
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),
is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
}
// Collect every entry of the word choice iterator (text + confidence), then destroy it.
var wc = new module.WordChoiceIterator(ri);
do {
word.choices.push({
text: wc.GetUTF8Text(),
confidence: wc.Confidence()
})
} while (wc.Next());
module.destroy(wc)
textline.words.push(word)
}
// Per-symbol images are currently disabled; every symbol gets `image: null`.
var image = null;
// var pix = ri.GetBinaryImage(RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// module._pixDestroy(module.getPointer(pix));
// --- symbol level ---
if(ri.IsAtBeginningOf(RIL_SYMBOL)){
symbol = {
choices: [],
image: image,
text: ri.GetUTF8Text(RIL_SYMBOL),
confidence: ri.Confidence(RIL_SYMBOL),
baseline: ri.getBaseline(RIL_SYMBOL),
bbox: ri.getBoundingBox(RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
}
word.symbols.push(symbol)
// Collect every entry of the symbol choice iterator, then destroy it.
var ci = new module.ChoiceIterator(ri);
do {
symbol.choices.push({
text: ci.GetUTF8Text(),
confidence: ci.Confidence()
})
} while (ci.Next());
module.destroy(ci)
}
} while (ri.Next(RIL_SYMBOL));
module.destroy(ri)
// Page-level summary plus the tree collected above.
return {
text: base.GetUTF8Text(),
html: deindent(base.GetHOCRText()),
confidence: base.MeanTextConf(),
blocks: blocks,
psm: enumToString(base.GetPageSegMode(), 'PSM'),
oem: enumToString(base.oem(), 'OEM'),
version: base.Version(),
}
}

2
src/shared/fileSizes.js

@ -1,2 +0,0 @@ @@ -1,2 +0,0 @@
// Lookup table: language code -> size of that language's data file.
// NOTE(review): presumably the uncompressed size in bytes, used as the denominator for download progress — confirm.
const fileSizes = {"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441, "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656, "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719, "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130, "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221, "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615, "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473, "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571, "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944, "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087, "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610, "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680, "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338, "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244, "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246, "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566, "vie": 2195922}
export default fileSizes;

3
src/worker/db.js

@ -1,3 +0,0 @@ @@ -1,3 +0,0 @@
import leveljs from 'level-js'
// Store used to cache language data. Where indexedDB is not available, a stub
// is exported whose open() always calls back with a truthy error, so callers
// can detect that no cache exists; otherwise a level-js store named './tessdata'.
var db = typeof indexedDB === 'undefined' ? { open: (_, cb) => cb(true) } : leveljs('./tessdata')
export default db

53
src/worker/detect.js

@ -1,53 +0,0 @@ @@ -1,53 +0,0 @@
import desaturate from '../shared/desaturate'
import loadLanguage from './loadLanguage'
/**
 * Detect script and orientation of an image (worker side).
 *
 * The image is desaturated and copied into the Emscripten heap, the 'osd'
 * language is loaded and DetectOS is run. `cb` follows the node convention:
 * `cb(err)` on failure, `cb(null, result)` on success.
 *
 * @param {*} jobId      - id of the job, forwarded to loadLanguage.
 * @param {Object} image - object with `width`, `height` and pixel `data`.
 * @param {Function} cb  - completion callback.
 */
export default function detect(jobId, image, cb){
    var w = image.width;
    var h = image.height;
    var gray = desaturate(image)
    var ptr = self.module.allocate(gray, 'i8', self.module.ALLOC_NORMAL);

    // The language could not be loaded: release the pixels and report.
    function onLoadError(err){
        self.module._free(ptr);
        cb(err)
    }

    function onLoaded(){
        var api = self.base;
        api.Init(null, 'osd')
        api.SetPageSegMode(self.module.PSM_OSD_ONLY)
        api.SetImage(self.module.wrapPointer(ptr), w, h, 1, w)
        api.SetRectangle(0, 0, w, h)

        var osResults = new self.module.OSResults();
        var detected = api.DetectOS(osResults);

        if(!detected){
            self.base.End();
            self.module._free(ptr);
            cb("failed to detect os")
            return
        }

        var unicharset = osResults.get_unicharset()
        var bestGuess = osResults.get_best_result()
        var orientationId = bestGuess.get_orientation_id();
        var scriptId = bestGuess.get_script_id();

        // The result is delivered first, the API is torn down afterwards.
        cb(null, {
            tesseract_script_id: scriptId,
            script: unicharset.get_script_from_script_id(scriptId),
            script_confidence: bestGuess.get_sconfidence(),
            orientation_degrees: [0, 270, 180, 90][orientationId],
            orientation_confidence: bestGuess.get_oconfidence()
        })
        self.base.End();
        self.module._free(ptr);
    }

    loadLanguage(jobId, 'osd', onLoadError, onLoaded)
}

37
src/worker/index.js

@ -1,37 +0,0 @@ @@ -1,37 +0,0 @@
import recognize from './recognize'
import detect from './detect'
// `jobId` holds the id of the most recently received message; the core's
// progress callback reads it when reporting recognition progress.
// (`module` and `base` are not used here: the handlers work on self.module / self.base.)
var module, base, jobId

/**
 * Worker entry point. Handles three actions:
 *  - 'init':      create the Tesseract core and API objects on `self`;
 *  - 'recognize': run OCR and post `{ jobId, error, result }`;
 *  - 'detect':    run script/orientation detection and post `{ jobId, error, result }`.
 *
 * In replies, `error` is a message string when the job failed and undefined
 * when it succeeded.
 */
onmessage = function(e) {
    var {action, args} = e.data;
    jobId = e.data.jobId
    console.log('worker got action', action)

    // Bind the reply to this message's id: the module-level `jobId` may
    // already belong to a later message when an asynchronous job completes.
    var replyTo = jobId
    function reply(error, result){
        postMessage({
            jobId: replyTo,
            // Handlers call back with null on success and with either an
            // Error-like object or a plain string on failure.
            error: error ? (error.message || String(error)) : undefined,
            result
        })
    }

    if(action == 'init'){
        self.langUrl = args.langUrl
        self.module = TesseractCore({
            TOTAL_MEMORY: args.mem, //must be a multiple of 10 megabytes
            TesseractProgress(percent){
                postMessage({ jobId,
                    'progress': {
                        'recognized': Math.max(0,(percent-30)/70)
                    }
                })
            },
            onRuntimeInitialized() {}
        })
        self.module.FS_createPath("/","tessdata",true,true)
        self.base = new self.module.TessBaseAPI()
    } else if(action === 'recognize'){
        var {image, options} = args
        recognize(jobId, image, options, reply)
    } else if(action === 'detect'){
        detect(jobId, args.image, reply)
    }
}

99
src/worker/loadLanguage.js

@ -1,99 +0,0 @@ @@ -1,99 +0,0 @@
import {ungzip} from 'pako'
import db from './db'
import fileSizes from '../shared/fileSizes'
/**
 * Download `<lang>.traineddata.gz` from `self.langUrl` and unzip it.
 *
 * @param {string} lang       - language code.
 * @param {Function} progress - receives progress objects while loading.
 * @param {Function} cb       - `cb(null, Uint8Array)` on success,
 *                              `cb(xhr, null)` on a network or HTTP error.
 */
function getLanguageData(lang, progress, cb){
    var request = new XMLHttpRequest();
    request.responseType = 'arraybuffer';
    request.open('GET', self.langUrl + lang + '.traineddata.gz', true);

    request.onerror = function(){
        // Detach the other handlers so nothing else fires after the failure.
        request.onprogress = request.onload = null
        cb(request, null)
    }

    request.onprogress = function(evt){
        return progress({
            'loaded_lang_model': evt.loaded/fileSizes[lang], //this is kinda wrong on safari
            cached: false
        })
    }

    request.onload = function(){
        var succeeded = request.status == 200 || (request.status == 0 && request.response);
        if(!succeeded) return cb(request, null);

        progress({'unzipping_lang_model': true})

        // Unzip for as long as the payload still starts with the gzip magic bytes.
        var bytes = new Uint8Array(request.response)
        while(bytes[0] == 0x1f && bytes[1] == 0x8b){
            bytes = ungzip(bytes);
        }

        progress({
            'unzipped_lang_model': true,
            'lang_model_size': bytes.length
        })
        cb(null, bytes)
    }

    progress({
        'loaded_lang_model': 0,
        cached: false,
        requesting: true
    })
    request.send()
}
/**
 * Fetch the data of one language, preferring the local cache.
 *
 * - cache cannot be opened: download, no caching;
 * - cache miss: download, store the data in the cache, deliver it;
 * - cache hit: unzip if necessary and deliver it.
 * Progress is reported to the page with postMessage under `jobId`.
 *
 * @param {string} lang - language code.
 * @param {*} jobId     - id used for the progress messages.
 * @param {Function} cb - `cb(err)` or `cb(null, data)`.
 */
function load(lang, jobId, cb){
    console.log('loadLanguage jobId', jobId)

    var report = function(update){
        postMessage({ jobId: jobId, progress: update })
    }

    // Final step after a download: announce and hand over (or fail).
    var deliver = function(err, data){
        report({ created_virtual_datafile: true})
        if(err) return cb(err);
        cb(null, data)
    }

    // Same as deliver, but writes a successful download to the cache first.
    var cacheAndDeliver = function(err, data){
        if(err) return deliver(err);
        db.put(lang, data, function(putErr){
            return console.log('cached', lang, putErr)
        })
        report({cached_lang: lang})
        deliver(null, data)
    }

    db.open({compression: false}, function(openErr){
        if (openErr) return getLanguageData(lang, report, deliver);

        db.get(lang, function(getErr, data){
            if (getErr) return getLanguageData(lang, report, cacheAndDeliver)

            while(data[0] == 0x1f && data[1] == 0x8b){
                data = ungzip(data);
            }
            report({ loaded_lang_model: lang, from_cache: true })
            cb(null, data)
        })
    })
}
// Languages whose data file has already been written to the virtual file system.
var loaded_langs = []

/**
 * Make sure the data of `lang` is present in the virtual 'tessdata' directory.
 *
 * @param {*} jobId          - id used for progress reporting while loading.
 * @param {string} lang      - language code.
 * @param {Function} error   - called with the error when loading fails.
 * @param {Function} success - called without arguments once the language is available.
 */
export default function loadLanguage(jobId, lang, error, success){
    var known = loaded_langs.indexOf(lang) != -1;
    if(known){
        success();
        return;
    }

    load(lang, jobId, function onLoaded(err, result){
        if(err) return error(err)

        loaded_langs.push(lang)
        var filename = lang +".traineddata";
        self.module.FS_createDataFile('tessdata', filename, result, true, false);
        success()
    })
}

56
src/worker/recognize.js

@ -1,56 +0,0 @@ @@ -1,56 +0,0 @@
import desaturate from '../shared/desaturate'
import loadLanguage from './loadLanguage'
import circularize from '../shared/circularize'
import dump from '../shared/dump'
var loaded_langs = []

/**
 * Run OCR on an image (worker side).
 *
 * Every own property of `options` (including `lang`) is passed to the API
 * with SetVariable and reported to the page as a progress message.
 *
 * @param {*} jobId        - id attached to the progress messages.
 * @param {Object} image   - object with `width`, `height` and pixel `data`.
 * @param {Object} options - must contain `lang`.
 * @param {Function} cb    - `cb(err)` when the language cannot be loaded,
 *                           `cb(null, result)` otherwise.
 */
export default function recognize(jobId, image, options, cb){
    console.log('recognize id', jobId)

    var lang = options.lang
    var w = image.width;
    var h = image.height;
    var gray = desaturate(image)
    var ptr = self.module.allocate(gray, 'i8', self.module.ALLOC_NORMAL);

    function onLoadError(err){
        self.module._free(ptr)
        cb(err)
    }

    function onLoaded(){
        self.base.Init(null, lang)
        postMessage({
            jobId,
            'progress': {
                'initialized_with_lang': lang
            }
        })

        Object.keys(options).forEach(function(name){
            self.base.SetVariable(name, options[name]);
            postMessage({
                jobId: jobId,
                'progress': {
                    'set_variable': {
                        variable: name,
                        value: options[name]
                    }
                }
            })
        })

        self.base.SetImage(self.module.wrapPointer(ptr), w, h, 1, w)
        self.base.SetRectangle(0, 0, w, h)
        self.base.Recognize(null)

        var everything = circularize(dump())

        self.base.End();
        self.module._free(ptr);
        cb(null, everything)
    }

    loadLanguage(jobId, lang, onLoadError, onLoaded)
}

42
webpack.config.dev.js

@ -1,42 +0,0 @@ @@ -1,42 +0,0 @@
var path = require('path');
var webpack = require('webpack');
/**
 * Build the development webpack configuration for one bundle.
 *
 * @param {Object} opt - `{ entry, output, include }`: entry file, output
 *                       settings to extend, and the directories handed to babel.
 * @returns {Object} webpack configuration object.
 */
function config(opt) {
    // Caller's output settings plus the shared build directory and public path.
    var output = Object.assign({}, opt.output, {
        path: path.join(__dirname, 'build'),
        publicPath: '/tesseract/',
    });

    var babelRule = {
        test: /\.js$/,
        loaders: ['babel'],
        include: opt.include
    };

    return {
        devtool: 'cheap-module-eval-source-map',
        entry: opt.entry,
        output: output,
        plugins: [
            new webpack.NoErrorsPlugin()
        ],
        module: {
            loaders: [babelRule]
        },
        node: {
            fs: "empty"
        }
    }
}
// Two development bundles, each expanded into a full webpack config by config():
// the page-side library and the worker script.
module.exports = [{
// Library bundle, exposed as a UMD module named "Tesseract".
entry: './src/browser/index.js',
output: {
filename: 'tesseract.js',
library: "Tesseract",
libraryTarget: "umd"
},
include: [path.join(__dirname, 'src/browser'), path.join(__dirname, 'src/shared')]
}, {
// Worker bundle.
entry: './src/worker/index.js',
output: {
filename: 'tesseract.worker.js',
},
include: [path.join(__dirname, 'src/worker'), path.join(__dirname, 'src/shared')]
}].map(config);

46
webpack.config.prod.js

@ -1,46 +0,0 @@ @@ -1,46 +0,0 @@
var path = require('path');
var webpack = require('webpack');
/**
 * Build the production webpack configuration for one bundle.
 *
 * @param {Object} opt - `{ entry, output, include }`: entry file, output
 *                       settings to extend, and the directories handed to babel.
 * @returns {Object} webpack configuration object writing into `dist`.
 */
function config(opt) {
    var entry = opt.entry;
    var include = opt.include;

    var output = Object.assign({}, opt.output, {
        path: path.join(__dirname, 'dist')
    });

    var plugins = [
        new webpack.optimize.OccurenceOrderPlugin(),
        new webpack.optimize.DedupePlugin(),
        new webpack.optimize.UglifyJsPlugin({
            compressor: {
                warnings: false
            }
        })
    ];

    var babelRule = {
        test: /\.js$/,
        loaders: ['babel'],
        include: include
    };

    return {
        entry: entry,
        output: output,
        plugins: plugins,
        module: {
            loaders: [babelRule]
        },
        node: {
            fs: "empty"
        }
    }
}
// Two production bundles, each expanded into a full webpack config by config():
// the page-side library and the worker script.
module.exports = [{
// Library bundle, exposed as a UMD module named "Tesseract".
entry: './src/browser/index.js',
output: {
filename: 'tesseract.js',
library: "Tesseract",
libraryTarget: "umd"
},
include: [path.join(__dirname, 'src/browser')]
}, {
// Worker bundle.
entry: './src/worker/index.js',
output: {
filename: 'tesseract.worker.js',
},
include: [path.join(__dirname, 'src/worker')]
}].map(config);
Loading…
Cancel
Save