Guillermo
8 years ago
23 changed files with 725 additions and 13213 deletions
@ -1 +1,18 @@
@@ -1 +1,18 @@
|
||||
# tesseract.js |
||||
Tesseract.js is a pure JavaScript port of the Tesseract OCR engine that can recognize English, Chinese, Russian, and 60 other languages. |
||||
|
||||
<!-- ![alt text]( "Logo Title Text 1") --> |
||||
|
||||
# Installation |
||||
Tesseract.js works with a `<script>` tag or via `npm` (if you're using webpack or browserify). |
||||
|
||||
## `<script/>` |
||||
|
||||
```html |
||||
<script src=''></script> |
||||
``` |
||||
|
||||
## npm |
||||
```shell |
||||
npm install tesseract |
||||
``` |
@ -0,0 +1,26 @@
@@ -0,0 +1,26 @@
|
||||
var path = require('path'); |
||||
var express = require('express'); |
||||
var webpack = require('webpack'); |
||||
var config = require('./webpack.config.dev'); |
||||
|
||||
// Development server: webpack output is served through
// webpack-dev-middleware, everything else as static files from the
// current working directory.
var port = 7355;
var app = express();
var compiler = webpack(config);

var devMiddleware = require('webpack-dev-middleware');

app.use(devMiddleware(compiler, {
  noInfo: true,
  // config is an array of webpack configurations; the first one decides
  // the public path the bundles are served under.
  publicPath: config[0].output.publicPath
}));

// app.use(require('webpack-hot-middleware')(compiler));

app.use('/', express.static('./'));

// Reports either the startup error or the address being listened on.
function onListen(err) {
  if (!err) {
    console.log('Listening at http://localhost:' + port);
    return;
  }
  console.log(err);
}

app.listen(port, 'localhost', onListen);
@ -0,0 +1 @@
@@ -0,0 +1 @@
|
||||
// Minified UMD bundle of the Tesseract client factory: it is published as module.exports (CommonJS), through an AMD define(), as exports.Tesseract, or as a Tesseract property on the global object.
// NOTE(review): this looks like generated build output — presumably it should be regenerated rather than edited by hand; confirm against the build setup.
!function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t():"function"==typeof define&&define.amd?define([],t):"object"==typeof exports?exports.Tesseract=t():e.Tesseract=t()}(this,function(){return function(e){function t(n){if(r[n])return r[n].exports;var o=r[n]={exports:{},id:n,loaded:!1};return e[n].call(o.exports,o,o.exports,t),o.loaded=!0,o.exports}var r={};return t.m=e,t.c=r,t.p="",t(0)}([function(e,t){"use strict";e.exports=function(){function e(e,t){var r=a++;s[r]={};var n={jobId:r,action:e,args:t};return console.log(n),o.postMessage(n),{then:function(e){return s[r].result=e,this},error:function(e){return s[r].error=e,this},progress:function(e){return s[r].progress=e,this}}}function t(e){if(e.getContext)e=e.getContext("2d");else if("IMG"==e.tagName||"VIDEO"==e.tagName){var t=document.createElement("canvas");t.width=e.naturalWidth||e.videoWidth,t.height=e.naturalHeight||e.videoHeight;var r=t.getContext("2d");r.drawImage(e,0,0),e=r}return e.getImageData&&(e=e.getImageData(0,0,e.canvas.width,e.canvas.height)),e}var r=arguments.length>0&&void 0!==arguments[0]?arguments[0]:location.href+"build/tesseract.worker.js",n=new Blob(["importScripts('"+r+"');"]),o=new Worker(window.URL.createObjectURL(n)),i=!1,a=0,s={};return o.onmessage=function(e){var t=e.data,r=t.jobId,n=t.progress,o=t.error,i=t.result,a=s[r];n&&a.progress&&a.progress(n),o&&a.error&&a.error(o),i&&a.result&&a.result(i)},e("init",{mem:100663296}),{detect:function(r){return e("detect",{image:t(r)})},recognize:function(r){var n=arguments.length>1&&void 0!==arguments[1]?arguments[1]:"eng";return"string"==typeof n?n={lang:n}:n.lang=n.lang||"eng",i||["chi_sim","chi_tra","jpn"].indexOf(n.lang)==-1||(e("init",{mem:167772160}),i=!0),e("recognize",{options:n,image:t(r)})}}}}])}); |
@ -1,130 +0,0 @@
@@ -1,130 +0,0 @@
|
||||
/**
 * Browser-side client for the Tesseract OCR web worker.
 *
 * Boots a single Worker (through a Blob that importScripts() the worker
 * bundle) and exposes `Tesseract.detect` and `Tesseract.recognize`. Both
 * return a Promise and additionally accept a Node-style callback.
 */
var Tesseract = (function(){

	var Tesseract = {}

	//https://cdn.rawgit.com/naptha/tesseract.js/master/worker/worker.js
	//https://rawgit.com/naptha/tesseract.js/master/worker/worker.js for testing
	//https://cdn.rawgit.com/naptha/tesseract.js/master/worker/worker.js

	var blob = new Blob(["importScripts('https://cdn.rawgit.com/naptha/tesseract.js/master/lib/worker.2015.07.26.js');"]); // changed on build
	var worker = new Worker(window.URL.createObjectURL(blob));
	// Initial worker memory: 6 * 16 MiB.
	worker.postMessage({init: {mem: 16777216*6}})
	var bigworker = false

	var index = 0     // id handed to the next job
	var handlers = [] // per-job {resolve, reject, callback, progress}, indexed by job id

	// Shared no-op. Used instead of `new Function()`, which is evaluated like
	// eval() and is refused under a Content-Security-Policy without 'unsafe-eval'.
	function noop(){}

	worker.onmessage = function(e){
		var handler = handlers[e.data.index]
		// Ignore messages that do not map to a known job instead of throwing.
		if(!handler) return

		if(e.data.progress){
			handler.progress(e.data.progress)
		}
		else if(e.data.err){
			handler.reject(e.data.err)
			handler.callback(e.data.err)
		}
		else {
			handler.resolve(e.data.result)
			handler.callback(null,e.data.result)
		}
	}

	/**
	 * Turns a canvas, a 2d context, an <img> or a <video> into ImageData.
	 * Any other value (for example an ImageData object) is returned unchanged.
	 */
	function convertToImageData(image){
		if(image.getContext){
			image = image.getContext('2d');
		}else if(image.tagName == "IMG" || image.tagName == "VIDEO"){
			var c = document.createElement('canvas');
			if(image.tagName == "IMG"){
				c.width = image.naturalWidth;
				c.height = image.naturalHeight;
			}else{
				c.width = image.videoWidth;
				c.height = image.videoHeight;
			}
			var ctx = c.getContext('2d');
			ctx.drawImage(image, 0, 0);
			image = ctx;
		}
		if(image.getImageData) image = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
		return image
	}

	/**
	 * Registers the handlers of a new job and posts the job to the worker.
	 * Returns the Promise that settles when the worker answers.
	 */
	function submit(message, callback, progress){
		var i = index++
		message.index = i
		return new Promise(function(resolve, reject){
			handlers[i] = {
				resolve: resolve,
				reject: reject,
				callback: callback,
				progress: progress
			}
			worker.postMessage(message)
		})
	}

	/**
	 * Detects script and orientation of an image.
	 * Accepts detect(image), detect(image, callback) and
	 * detect(image, progress, callback).
	 */
	Tesseract.detect = function(image, progress, callback){
		image = convertToImageData(image)

		// With a single function argument, that function is the callback.
		if(typeof callback === "undefined"){
			callback = progress
			progress = undefined
		}
		if(typeof progress !== "function") progress = noop
		if(typeof callback !== "function") callback = noop

		return submit({fun: 'detect', image: image}, callback, progress)
	}

	/**
	 * Recognizes the text in an image.
	 * `options` may be omitted, a language string, a callback, or an object
	 * with an optional `lang` and an optional `progress` function; the
	 * remaining keys are forwarded to the worker.
	 */
	Tesseract.recognize = function(image, options, callback){
		var lang = 'eng'
		var progress = noop
		var settings = {}

		// Normalize the arguments BEFORE looking at the language, so that a
		// missing options argument does not throw and a string language is
		// taken into account by the big-worker check below.
		if(typeof options === "function"){
			callback = options
		}
		else if(typeof options === 'string'){
			lang = options
		}
		else if(options){
			if(typeof options.lang !== "undefined") lang = options.lang
			// Copy instead of deleting `progress` from the caller's object;
			// functions cannot be posted to a worker.
			for(var key in options){
				if(!Object.prototype.hasOwnProperty.call(options, key)) continue
				if(key === 'progress' && typeof options[key] === 'function'){
					progress = options[key]
				}
				else {
					settings[key] = options[key]
				}
			}
		}

		// These languages get a worker re-initialised with 10 * 16 MiB.
		if (!bigworker && ['chi_sim', 'chi_tra', 'jpn'].indexOf(lang) != -1){
			worker.postMessage({init: {mem: 16777216*10}})
			bigworker = true
			console.log('started big worker')
		}

		image = convertToImageData(image)

		return submit(
			{fun: 'recognize', image: image, lang: lang, options: settings},
			callback || noop,
			progress
		)
	}

	return Tesseract
})()
@ -1,130 +0,0 @@
@@ -1,130 +0,0 @@
|
||||
/**
 * Browser-side client for the Tesseract OCR web worker.
 *
 * Boots a single Worker (through a Blob that importScripts() the worker
 * bundle) and exposes `Tesseract.detect` and `Tesseract.recognize`. Both
 * return a Promise and additionally accept a Node-style callback.
 */
var Tesseract = (function(){

	var Tesseract = {}

	//https://cdn.rawgit.com/naptha/tesseract.js/master/worker/worker.js
	//https://rawgit.com/naptha/tesseract.js/master/worker/worker.js for testing
	//https://cdn.rawgit.com/naptha/tesseract.js/master/worker/worker.js

	var blob = new Blob(["importScripts('http://localhost:1234/master/lib/worker.2015.07.26.js');"]); // changed on build
	var worker = new Worker(window.URL.createObjectURL(blob));
	// Initial worker memory: 6 * 16 MiB.
	worker.postMessage({init: {mem: 16777216*6}})
	var bigworker = false

	var index = 0     // id handed to the next job
	var handlers = [] // per-job {resolve, reject, callback, progress}, indexed by job id

	// Shared no-op. Used instead of `new Function()`, which is evaluated like
	// eval() and is refused under a Content-Security-Policy without 'unsafe-eval'.
	function noop(){}

	worker.onmessage = function(e){
		var handler = handlers[e.data.index]
		// Ignore messages that do not map to a known job instead of throwing.
		if(!handler) return

		if(e.data.progress){
			handler.progress(e.data.progress)
		}
		else if(e.data.err){
			handler.reject(e.data.err)
			handler.callback(e.data.err)
		}
		else {
			handler.resolve(e.data.result)
			handler.callback(null,e.data.result)
		}
	}

	/**
	 * Turns a canvas, a 2d context, an <img> or a <video> into ImageData.
	 * Any other value (for example an ImageData object) is returned unchanged.
	 */
	function convertToImageData(image){
		if(image.getContext){
			image = image.getContext('2d');
		}else if(image.tagName == "IMG" || image.tagName == "VIDEO"){
			var c = document.createElement('canvas');
			if(image.tagName == "IMG"){
				c.width = image.naturalWidth;
				c.height = image.naturalHeight;
			}else{
				c.width = image.videoWidth;
				c.height = image.videoHeight;
			}
			var ctx = c.getContext('2d');
			ctx.drawImage(image, 0, 0);
			image = ctx;
		}
		if(image.getImageData) image = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
		return image
	}

	/**
	 * Registers the handlers of a new job and posts the job to the worker.
	 * Returns the Promise that settles when the worker answers.
	 */
	function submit(message, callback, progress){
		var i = index++
		message.index = i
		return new Promise(function(resolve, reject){
			handlers[i] = {
				resolve: resolve,
				reject: reject,
				callback: callback,
				progress: progress
			}
			worker.postMessage(message)
		})
	}

	/**
	 * Detects script and orientation of an image.
	 * Accepts detect(image), detect(image, callback) and
	 * detect(image, progress, callback).
	 */
	Tesseract.detect = function(image, progress, callback){
		image = convertToImageData(image)

		// With a single function argument, that function is the callback.
		if(typeof callback === "undefined"){
			callback = progress
			progress = undefined
		}
		if(typeof progress !== "function") progress = noop
		if(typeof callback !== "function") callback = noop

		return submit({fun: 'detect', image: image}, callback, progress)
	}

	/**
	 * Recognizes the text in an image.
	 * `options` may be omitted, a language string, a callback, or an object
	 * with an optional `lang` and an optional `progress` function; the
	 * remaining keys are forwarded to the worker.
	 */
	Tesseract.recognize = function(image, options, callback){
		var lang = 'eng'
		var progress = noop
		var settings = {}

		// Normalize the arguments BEFORE looking at the language, so that a
		// missing options argument does not throw and a string language is
		// taken into account by the big-worker check below.
		if(typeof options === "function"){
			callback = options
		}
		else if(typeof options === 'string'){
			lang = options
		}
		else if(options){
			if(typeof options.lang !== "undefined") lang = options.lang
			// Copy instead of deleting `progress` from the caller's object;
			// functions cannot be posted to a worker.
			for(var key in options){
				if(!Object.prototype.hasOwnProperty.call(options, key)) continue
				if(key === 'progress' && typeof options[key] === 'function'){
					progress = options[key]
				}
				else {
					settings[key] = options[key]
				}
			}
		}

		// These languages get a worker re-initialised with 10 * 16 MiB.
		if (!bigworker && ['chi_sim', 'chi_tra', 'jpn'].indexOf(lang) != -1){
			worker.postMessage({init: {mem: 16777216*10}})
			bigworker = true
			console.log('started big worker')
		}

		image = convertToImageData(image)

		return submit(
			{fun: 'recognize', image: image, lang: lang, options: settings},
			callback || noop,
			progress
		)
	}

	return Tesseract
})()
File diff suppressed because one or more lines are too long
@ -1,130 +0,0 @@
@@ -1,130 +0,0 @@
|
||||
/**
 * Browser-side client for the Tesseract OCR web worker.
 *
 * Boots a single Worker (through a Blob that importScripts() the worker
 * bundle) and exposes `Tesseract.detect` and `Tesseract.recognize`. Both
 * return a Promise and additionally accept a Node-style callback.
 */
var Tesseract = (function(){

	var Tesseract = {}

	//https://cdn.rawgit.com/naptha/tesseract.js/master/worker/worker.js
	//https://rawgit.com/naptha/tesseract.js/master/worker/worker.js for testing
	//https://cdn.rawgit.com/naptha/tesseract.js/master/worker/worker.js

	var blob = new Blob(["importScripts('__worker__');"]); // changed on build
	var worker = new Worker(window.URL.createObjectURL(blob));
	// Initial worker memory: 6 * 16 MiB.
	worker.postMessage({init: {mem: 16777216*6}})
	var bigworker = false

	var index = 0     // id handed to the next job
	var handlers = [] // per-job {resolve, reject, callback, progress}, indexed by job id

	// Shared no-op. Used instead of `new Function()`, which is evaluated like
	// eval() and is refused under a Content-Security-Policy without 'unsafe-eval'.
	function noop(){}

	worker.onmessage = function(e){
		var handler = handlers[e.data.index]
		// Ignore messages that do not map to a known job instead of throwing.
		if(!handler) return

		if(e.data.progress){
			handler.progress(e.data.progress)
		}
		else if(e.data.err){
			handler.reject(e.data.err)
			handler.callback(e.data.err)
		}
		else {
			handler.resolve(e.data.result)
			handler.callback(null,e.data.result)
		}
	}

	/**
	 * Turns a canvas, a 2d context, an <img> or a <video> into ImageData.
	 * Any other value (for example an ImageData object) is returned unchanged.
	 */
	function convertToImageData(image){
		if(image.getContext){
			image = image.getContext('2d');
		}else if(image.tagName == "IMG" || image.tagName == "VIDEO"){
			var c = document.createElement('canvas');
			if(image.tagName == "IMG"){
				c.width = image.naturalWidth;
				c.height = image.naturalHeight;
			}else{
				c.width = image.videoWidth;
				c.height = image.videoHeight;
			}
			var ctx = c.getContext('2d');
			ctx.drawImage(image, 0, 0);
			image = ctx;
		}
		if(image.getImageData) image = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
		return image
	}

	/**
	 * Registers the handlers of a new job and posts the job to the worker.
	 * Returns the Promise that settles when the worker answers.
	 */
	function submit(message, callback, progress){
		var i = index++
		message.index = i
		return new Promise(function(resolve, reject){
			handlers[i] = {
				resolve: resolve,
				reject: reject,
				callback: callback,
				progress: progress
			}
			worker.postMessage(message)
		})
	}

	/**
	 * Detects script and orientation of an image.
	 * Accepts detect(image), detect(image, callback) and
	 * detect(image, progress, callback).
	 */
	Tesseract.detect = function(image, progress, callback){
		image = convertToImageData(image)

		// With a single function argument, that function is the callback.
		if(typeof callback === "undefined"){
			callback = progress
			progress = undefined
		}
		if(typeof progress !== "function") progress = noop
		if(typeof callback !== "function") callback = noop

		return submit({fun: 'detect', image: image}, callback, progress)
	}

	/**
	 * Recognizes the text in an image.
	 * `options` may be omitted, a language string, a callback, or an object
	 * with an optional `lang` and an optional `progress` function; the
	 * remaining keys are forwarded to the worker.
	 */
	Tesseract.recognize = function(image, options, callback){
		var lang = 'eng'
		var progress = noop
		var settings = {}

		// Normalize the arguments BEFORE looking at the language, so that a
		// missing options argument does not throw and a string language is
		// taken into account by the big-worker check below.
		if(typeof options === "function"){
			callback = options
		}
		else if(typeof options === 'string'){
			lang = options
		}
		else if(options){
			if(typeof options.lang !== "undefined") lang = options.lang
			// Copy instead of deleting `progress` from the caller's object;
			// functions cannot be posted to a worker.
			for(var key in options){
				if(!Object.prototype.hasOwnProperty.call(options, key)) continue
				if(key === 'progress' && typeof options[key] === 'function'){
					progress = options[key]
				}
				else {
					settings[key] = options[key]
				}
			}
		}

		// These languages get a worker re-initialised with 10 * 16 MiB.
		if (!bigworker && ['chi_sim', 'chi_tra', 'jpn'].indexOf(lang) != -1){
			worker.postMessage({init: {mem: 16777216*10}})
			bigworker = true
			console.log('started big worker')
		}

		image = convertToImageData(image)

		return submit(
			{fun: 'recognize', image: image, lang: lang, options: settings},
			callback || noop,
			progress
		)
	}

	return Tesseract
})()
@ -0,0 +1,65 @@
@@ -0,0 +1,65 @@
|
||||
//TODO: replace with cdn url
|
||||
module.exports = function Tesseract(url=location.href+'build/tesseract.worker.js'){ |
||||
var blob = new Blob(["importScripts('"+url+"');"]) |
||||
var worker = new Worker(window.URL.createObjectURL(blob)); |
||||
|
||||
var bigworker = false |
||||
var jobCounter = 0 |
||||
var handlers = {} |
||||
|
||||
function runAsync(action, args){ |
||||
var jobId = jobCounter++ |
||||
handlers[jobId] = {} |
||||
var message = {jobId, action, args} |
||||
console.log(message) |
||||
worker.postMessage(message) |
||||
return { |
||||
then (f){ handlers[jobId].result = f; return this}, |
||||
error (f){ handlers[jobId].error = f; return this}, |
||||
progress(f){ handlers[jobId].progress = f; return this} |
||||
} |
||||
} |
||||
|
||||
worker.onmessage = function(e){ |
||||
var {jobId, progress, error, result} = e.data |
||||
var handler = handlers[jobId] |
||||
if(progress && handler.progress) handler.progress(progress); |
||||
if(error && handler.error) handler.error(error); |
||||
if(result && handler.result) handler.result(result); |
||||
} |
||||
|
||||
function convertToImageData(image){ |
||||
if(image.getContext) image = image.getContext('2d'); |
||||
else if(image.tagName == "IMG" || image.tagName == "VIDEO"){ |
||||
var c = document.createElement('canvas'); |
||||
c.width = image.naturalWidth || image.videoWidth; |
||||
c.height = image.naturalHeight || image.videoHeight; |
||||
var ctx = c.getContext('2d'); |
||||
ctx.drawImage(image, 0, 0); |
||||
image = ctx; |
||||
} |
||||
if(image.getImageData) image = image.getImageData(0, 0, image.canvas.width, image.canvas.height); |
||||
return image |
||||
} |
||||
|
||||
runAsync('init', {mem: (1<<24) * 6}) |
||||
|
||||
return { |
||||
detect(image){ |
||||
return runAsync('detect', {image: convertToImageData(image)}) |
||||
}, |
||||
|
||||
recognize(image, options='eng'){ |
||||
|
||||
if (typeof options === 'string') options = {lang: options}; |
||||
else options.lang = options.lang || 'eng'; |
||||
|
||||
if (!bigworker && ['chi_sim', 'chi_tra', 'jpn'].indexOf(options.lang) != -1){ |
||||
runAsync('init', {mem: (1<<24) * 10}) |
||||
bigworker = true |
||||
} |
||||
|
||||
return runAsync('recognize', {options, image: convertToImageData(image)}) |
||||
} |
||||
} |
||||
} |
@ -1,533 +0,0 @@
@@ -1,533 +0,0 @@
|
||||
var Tesseract304 = require('tesseract.js-core') |
||||
var leveljs = require('level-js') |
||||
// Store for downloaded language data. When IndexedDB is not available a stub
// is used whose open() always reports an error to its callback.
var db = (typeof indexedDB === 'undefined')
	? { open: function(opts, cb){ cb(true) /*err = true*/ } }
	: leveljs('./tessdata');
||||
|
||||
console.log('hallo') |
||||
|
||||
var filesizes = {"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441, "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656, "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719, "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130, "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221, "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615, "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473, "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571, "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944, "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087, "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610, "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680, "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338, "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244, "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246, "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566, "vie": 2195922} |
||||
|
||||
var pako = require('pako') |
||||
|
||||
var T; |
||||
|
||||
var tesseractinit = (function createTesseractInstance(memory){ |
||||
|
||||
curindex = 0 |
||||
|
||||
var Module = Tesseract304({ |
||||
TOTAL_MEMORY: memory, //must be a multiple of 10 megabytes
|
||||
TesseractProgress: function(percent){ |
||||
postMessage({ |
||||
index: curindex, |
||||
'progress': { |
||||
'recognized': Math.max(0,(percent-30)/70) |
||||
} |
||||
}) |
||||
}//,
|
||||
// onRuntimeInitialized: function(){
|
||||
// console.log('wau')
|
||||
// }
|
||||
}) |
||||
|
||||
var base = new Module.TessBaseAPI() |
||||
var loaded_langs = [] |
||||
/**
 * Makes the traineddata of `lang` available under /tessdata in the module's
 * virtual filesystem, taking it from the local store when possible and
 * downloading it otherwise. Progress is reported through postMessage.
 *
 * @param {string} lang   language code, e.g. 'eng'
 * @param {number} index  job id echoed in every progress message
 * @param {function} cb   Node-style callback: cb(null, lang) on success,
 *                        cb(xhr, null) when the download fails
 */
var loadLanguage = function(lang, index, cb){ // NodeJS style callback
	if(loaded_langs.indexOf(lang) != -1){
		cb(null, lang)
		return
	}

	Module.FS_createPath("/","tessdata",true,true)

	// Posts one progress message for this job.
	function report(progress){
		postMessage({index: index, 'progress': progress})
	}

	// Ungzips for as long as the data starts with the gzip magic bytes.
	function gunzipAll(bytes){
		while(bytes[0] == 0x1f && bytes[1] == 0x8b){
			bytes = pako.ungzip(bytes)
		}
		return bytes
	}

	// Writes the model into the virtual filesystem.
	function install(bytes){
		Module.FS_createDataFile('tessdata', lang +".traineddata", bytes, true, false);
	}

	// Downloads the model and, when `shouldcache` is set, stores it in db.
	function downloadlang(shouldcache){
		report({
			'loaded_lang_model': 0,
			cached: false,
			requesting: true
		})

		var xhr = new XMLHttpRequest();
		xhr.open('GET', 'https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/'+lang+'.traineddata.gz', true);
		xhr.responseType = 'arraybuffer';
		xhr.onerror = function(){ cb(xhr, null) }
		xhr.onprogress = function(e){
			report({
				'loaded_lang_model': e.loaded/filesizes[lang], //this is kinda wrong on safari
				cached: false
			})
		}
		xhr.onload = function(){
			if (!(xhr.status == 200 || (xhr.status == 0 && xhr.response))) {
				cb(xhr, null);
				return
			}

			report('unzipping_lang_model')

			var response = gunzipAll(new Uint8Array(xhr.response))

			report({
				'unzipped_lang_model': true,
				'lang_model_size': response.length
			})

			install(response)

			if(shouldcache){
				db.put(lang, response, function(err){
					// Caching is best effort, but a failure must not be
					// reported as success.
					if(err) console.error('failed to cache lang', lang, err)
					else console.log('cached lang')
				})
			}

			report({
				'created_virtual_datafile': true,
				'cached_file': shouldcache
			})

			loaded_langs.push(lang)
			cb(null, lang)
		}
		xhr.send(null)
	}

	db.open({compression: false},function(err){
		if (err) {
			// No usable store: download without caching.
			downloadlang(false)
			return
		}
		db.get(lang, function (err, value) {
			if (err) {
				// Not stored yet: download and cache.
				downloadlang(true)
				return
			}

			value = gunzipAll(value)

			report({
				loaded_lang_model:1,
				cached: true
			})

			install(value)
			loaded_langs.push(lang)
			cb(null, lang)
		})
	})
}
||||
|
||||
/**
 * Adds parent back-references and flattened descendant lists to a page.
 *
 * Every block, paragraph, line, word and symbol receives references to its
 * ancestors, and every ancestor receives flat arrays of its deeper
 * descendants (page.paragraphs, page.lines, block.words, line.symbols, ...).
 * The page is modified in place and returned.
 */
function circularize(page){
	function linkSymbol(sym, word, line, para, block){
		sym.word = word;
		sym.line = line;
		sym.paragraph = para;
		sym.block = block;
		sym.page = page;

		line.symbols.push(sym)
		para.symbols.push(sym)
		block.symbols.push(sym)
		page.symbols.push(sym)
	}

	function linkWord(word, line, para, block){
		word.line = line;
		word.paragraph = para;
		word.block = block;
		word.page = page;

		word.symbols.forEach(function(sym){
			linkSymbol(sym, word, line, para, block)
		})

		para.words.push(word)
		block.words.push(word)
		page.words.push(word)
	}

	function linkLine(line, para, block){
		line.paragraph = para;
		line.block = block;
		line.page = page;
		line.symbols = []

		line.words.forEach(function(word){
			linkWord(word, line, para, block)
		})

		block.lines.push(line)
		page.lines.push(line)
	}

	function linkParagraph(para, block){
		para.block = block;
		para.page = page;
		para.words = []
		para.symbols = []

		para.lines.forEach(function(line){
			linkLine(line, para, block)
		})

		page.paragraphs.push(para)
	}

	page.paragraphs = []
	page.lines = []
	page.words = []
	page.symbols = []

	page.blocks.forEach(function(block){
		block.page = page;
		block.lines = []
		block.words = []
		block.symbols = []

		block.paragraphs.forEach(function(para){
			linkParagraph(para, block)
		})
	})

	return page
}
||||
|
||||
/**
 * Walks the result iterator of `base` symbol by symbol and collects the
 * block > paragraph > line > word > symbol tree together with the page-level
 * text, hOCR markup, mean confidence and engine settings.
 */
function DumpLiterallyEverything(){
	var ri = base.GetIterator();
	var blocks = [];
	var block, para, textline, word, symbol;

	// Maps an enum value back to its name: the first Module key of the form
	// `<prefix>_<NAME>` whose value equals `value`, without the prefix.
	function enumToString(value, prefix){
		var head = prefix + '_'
		var names = Object.keys(Module)
		for(var k = 0; k < names.length; k++){
			var name = names[k]
			if(name.substr(0, head.length) == head && Module[name] === value){
				return name.slice(head.length)
			}
		}
	}

	// Adds the four measurements every level shares, in a fixed key order.
	function measure(node, level){
		node.text = ri.GetUTF8Text(level)
		node.confidence = ri.Confidence(level)
		node.baseline = ri.getBaseline(level)
		node.bbox = ri.getBoundingBox(level)
		return node
	}

	// Drains a choice iterator into `choices` and destroys it afterwards.
	function collectChoices(iterator, choices){
		do {
			choices.push({
				text: iterator.GetUTF8Text(),
				confidence: iterator.Confidence()
			})
		} while (iterator.Next());
		Module.destroy(iterator)
	}

	// Returns the polygon of the current block as [x, y] pairs, or null.
	function readBlockPolygon(){
		var poly = ri.BlockPolygon();
		// BlockPolygon() returns null when automatic page segmentation is off
		if(!(Module.getPointer(poly) > 0)) return null

		var n = poly.get_n(),
			px = poly.get_x(),
			py = poly.get_y();
		var points = [];
		for(var i = 0; i < n; i++){
			points.push([px.getValue(i), py.getValue(i)]);
		}
		Module._ptaDestroy(Module.getPointer(poly));
		return points
	}

	ri.Begin()
	do {
		if(ri.IsAtBeginningOf(Module.RIL_BLOCK)){
			var polygon = readBlockPolygon()
			block = measure({paragraphs: []}, Module.RIL_BLOCK)
			block.blocktype = enumToString(ri.BlockType(), 'PT')
			block.polygon = polygon
			blocks.push(block)
		}

		if(ri.IsAtBeginningOf(Module.RIL_PARA)){
			para = measure({lines: []}, Module.RIL_PARA)
			para.is_ltr = !!ri.ParagraphIsLtr()
			block.paragraphs.push(para)
		}

		if(ri.IsAtBeginningOf(Module.RIL_TEXTLINE)){
			textline = measure({words: []}, Module.RIL_TEXTLINE)
			para.lines.push(textline)
		}

		if(ri.IsAtBeginningOf(Module.RIL_WORD)){
			var fontInfo = ri.getWordFontAttributes(),
				wordDir = ri.WordDirection();

			word = measure({symbols: [], choices: []}, Module.RIL_WORD)

			word.is_numeric = !!ri.WordIsNumeric()
			word.in_dictionary = !!ri.WordIsFromDictionary()
			word.direction = enumToString(wordDir, 'DIR')
			word.language = ri.WordRecognitionLanguage()

			word.is_bold = fontInfo.is_bold
			word.is_italic = fontInfo.is_italic
			word.is_underlined = fontInfo.is_underlined
			word.is_monospace = fontInfo.is_monospace
			word.is_serif = fontInfo.is_serif
			word.is_smallcaps = fontInfo.is_smallcaps
			word.font_size = fontInfo.pointsize
			word.font_id = fontInfo.font_id
			word.font_name = fontInfo.font_name

			collectChoices(new Module.WordChoiceIterator(ri), word.choices)
			textline.words.push(word)
		}

		// Per-symbol images are not extracted; the field stays null.
		// var pix = ri.GetBinaryImage(Module.RIL_SYMBOL)
		// var image = pix2array(pix);
		// // for some reason it seems that things stop working if you destroy pics
		// Module._pixDestroy(Module.getPointer(pix));
		var image = null;

		if(ri.IsAtBeginningOf(Module.RIL_SYMBOL)){
			symbol = measure({choices: [], image: image}, Module.RIL_SYMBOL)

			symbol.is_superscript = !!ri.SymbolIsSuperscript()
			symbol.is_subscript = !!ri.SymbolIsSubscript()
			symbol.is_dropcap = !!ri.SymbolIsDropcap()

			word.symbols.push(symbol)
			collectChoices(new Module.ChoiceIterator(ri), symbol.choices)
		}
	} while (ri.Next(Module.RIL_SYMBOL));
	Module.destroy(ri)

	return {
		text: base.GetUTF8Text(),
		html: deindent(base.GetHOCRText()),

		confidence: base.MeanTextConf(),

		blocks: blocks,

		psm: enumToString(base.GetPageSegMode(), 'PSM'),
		oem: enumToString(base.oem(), 'OEM'),
		version: base.Version()
	}
}
||||
|
||||
/**
 * Removes one level (two spaces) of indentation from every line of `html`,
 * but only when the first line is indented that way. Lines that do not start
 * with two spaces are left untouched.
 *
 * @param {string} html multi-line markup
 * @returns {string} the markup with the leading two-space indent removed
 */
function deindent(html){
	// The prefix checked here must be as long as what slice(2) removes;
	// comparing the two-character prefix with a single space never matched
	// an indented line.
	var INDENT = "  "
	var lines = html.split('\n')
	if(lines[0].substring(0,2) === INDENT){
		for (var i = 0; i < lines.length; i++) {
			if (lines[i].substring(0,2) === INDENT) {
				lines[i] = lines[i].slice(2)
			}
		}
	}
	return lines.join('\n')
}
||||
|
||||
/**
 * Converts RGBA ImageData into one grayscale byte per pixel.
 *
 * @param image object with `data` (RGBA bytes), `width` and `height`
 * @returns {Uint8Array} width * height gray values
 * @throws the string 'Expected ImageData' when `image.data` is missing
 */
function desaturate(image){
	if(!image.data){
		throw 'Expected ImageData'
	}

	var rgba = image.data;
	var gray = new Uint8Array(image.width * image.height);
	var total = rgba.length | 0;

	// One pixel per step: weighted sum of R, G and B scaled by alpha;
	// everything with alpha gets put in front of 50% gray.
	for (var src = 0, dst = 0; src < total; src += 4, ++dst) {
		var alpha = rgba[src+3];
		gray[dst] = (((rgba[src] * 77 + rgba[src+1] * 151 + rgba[src+2] * 28) * alpha) + ((255-alpha) << 15) + 32768) >> 16
	}

	return gray
}
||||
|
||||
/**
 * Runs OCR on `image` in language `lang`.
 *
 * @param index    job id echoed in progress messages
 * @param image    ImageData-like object (`data`, `width`, `height`)
 * @param lang     language code whose model is loaded first
 * @param options  own properties are passed to base.SetVariable one by one
 * @param cb       Node-style callback: cb(err, null) or cb(null, result)
 */
function recognize(index, image, lang, options, cb){
	var width = image.width, height = image.height;

	// Copy the grayscale pixels into module memory before loading the model.
	var pixels = desaturate(image)
	var ptr = Module.allocate(pixels, 'i8', Module.ALLOC_NORMAL);

	// Posts one progress message for this job.
	function report(progress){
		postMessage({index: index, 'progress': progress})
	}

	loadLanguage(lang, index, function(err, result){
		if(err){
			console.error("error loading", lang);
			Module._free(ptr);
			cb(err, null)
			return
		}

		curindex = index

		base.Init(null, lang)

		report({
			'initialized_with_lang': true,
			'lang': lang
		})

		for (var option in options) {
			if (!options.hasOwnProperty(option)) continue

			base.SetVariable(option, options[option]);
			report({
				'set_variable': {
					variable: option,
					value: options[option]
				}
			})
		}

		// One byte per pixel, `width` bytes per row.
		base.SetImage(Module.wrapPointer(ptr), width, height, 1, width)
		base.SetRectangle(0, 0, width, height)
		// base.GetUTF8Text()
		base.Recognize(null)

		var everything = circularize(DumpLiterallyEverything())

		base.End();
		Module._free(ptr);
		cb(null, everything)
	})
}
||||
|
||||
/**
 * Detects orientation and script of `image` using the 'osd' model.
 *
 * @param index  job id echoed in progress messages
 * @param image  ImageData-like object (`data`, `width`, `height`)
 * @param cb     Node-style callback: cb(err) or cb(null, result)
 */
function detect(index, image, cb){
	var width = image.width, height = image.height;

	var pixels = desaturate(image)
	var ptr = Module.allocate(pixels, 'i8', Module.ALLOC_NORMAL);
	console.log('allocated image')
	// base = new Module.TessBaseAPI()

	// Ends the engine session and frees the image copy.
	function release(){
		base.End();
		Module._free(ptr);
	}

	loadLanguage('osd', index, function(err, result){
		if(err){
			Module._free(ptr);
			cb(err)
			return
		}

		curindex = index
		base.Init(null, 'osd')
		base.SetPageSegMode(Module.PSM_OSD_ONLY)
		console.log('loaded language')

		// One byte per pixel, `width` bytes per row.
		base.SetImage(Module.wrapPointer(ptr), width, height, 1, width)
		base.SetRectangle(0, 0, width, height)

		var results = new Module.OSResults();
		var success = base.DetectOS(results);
		if(!success){
			release()
			cb("failed to detect os")
			return
		}

		var charset = results.get_unicharset()
		console.log(charset)
		// results.print_scores()

		var best = results.get_best_result()
		var oid = best.get_orientation_id(),
			sid = best.get_script_id();
		// console.log('orientation id', oid, [0, 270, 180, 90][oid], best.get_oconfidence())
		// console.log('script id', sid, charset.get_script_from_script_id(sid), best.get_sconfidence())
		// console.log(best)

		cb(null, {
			tesseract_script_id: sid,
			script: charset.get_script_from_script_id(sid),
			script_confidence: best.get_sconfidence(),
			orientation_degrees: [0, 270, 180, 90][oid],
			orientation_confidence: best.get_oconfidence()
		})

		release()
	})
}
||||
|
||||
return { |
||||
recognize: recognize, |
||||
detect: detect |
||||
} |
||||
}) |
||||
|
||||
// Worker entry point. A message carrying `init` builds the engine via tesseractinit();
// otherwise `fun` selects the operation and the outcome is posted back tagged with `index`.
onmessage = function(e) {
    var msg = e.data;

    if(msg.init){
        T = tesseractinit(msg.init.mem)
        return
    }

    // Shared node-style callback for both operations.
    function reply(err, result){
        postMessage({index: msg.index, err:err, result: result})
    }

    if(msg.fun === 'recognize'){
        T.recognize(msg.index, msg.image, msg.lang, msg.options, reply)
    }
    else if(msg.fun === 'detect'){
        T.detect(msg.index, msg.image, reply)
    }
}
@ -0,0 +1,56 @@
@@ -0,0 +1,56 @@
|
||||
/**
 * Adds parent back-references and flattened descendant lists to a page tree.
 *
 * Input shape: page.blocks[].paragraphs[].lines[].words[].symbols[].
 * After the call every node points at each of its ancestors (`page`, `block`,
 * `paragraph`, `line`, `word`) and every ancestor holds a flat array of all its
 * non-direct descendants (`paragraphs`, `lines`, `words`, `symbols`).
 * The page is mutated in place and also returned.
 */
export default function circularize(page){
    page.paragraphs = []
    page.lines = []
    page.words = []
    page.symbols = []

    page.blocks.forEach(block => {
        block.page = page
        block.lines = []
        block.words = []
        block.symbols = []

        block.paragraphs.forEach(para => {
            para.block = block
            para.page = page
            para.words = []
            para.symbols = []

            para.lines.forEach(line => {
                line.paragraph = para
                line.block = block
                line.page = page
                line.symbols = []

                line.words.forEach(word => {
                    word.line = line
                    word.paragraph = para
                    word.block = block
                    word.page = page

                    word.symbols.forEach(sym => {
                        sym.word = word
                        sym.line = line
                        sym.paragraph = para
                        sym.block = block
                        sym.page = page

                        // The word already owns the symbol; register it with the outer levels.
                        line.symbols.push(sym)
                        para.symbols.push(sym)
                        block.symbols.push(sym)
                        page.symbols.push(sym)
                    })

                    // The line already owns the word.
                    para.words.push(word)
                    block.words.push(word)
                    page.words.push(word)
                })

                // The paragraph already owns the line.
                block.lines.push(line)
                page.lines.push(line)
            })

            // The block already owns the paragraph.
            page.paragraphs.push(para)
        })
    })

    return page
}
@ -0,0 +1,3 @@
@@ -0,0 +1,3 @@
|
||||
import leveljs from 'level-js' |
||||
// Key/value store handle for './tessdata'. Where `indexedDB` does not exist, a stub is
// exported instead whose open() immediately invokes its callback with a truthy error.
var db

if (typeof indexedDB === 'undefined') {
    db = { open: (_, cb) => cb(true) }
} else {
    db = leveljs('./tessdata')
}

export default db
@ -0,0 +1,26 @@
@@ -0,0 +1,26 @@
|
||||
/**
 * Converts RGBA pixel data to one grayscale byte per pixel.
 *
 * @param image object with `data` (RGBA bytes, 4 per pixel), `width` and `height`
 * @returns Uint8Array of length width * height
 * @throws the string 'Expected ImageData' when `image.data` is falsy
 */
export default function desaturate(image){
    if(!image.data){
        throw 'Expected ImageData'
    }

    var rgba = image.data;
    var gray = new Uint8Array(image.width * image.height);
    var total = rgba.length | 0;

    for (var i = 0, j = 0; i < total; i += 4, ++j) {
        var alpha = rgba[i+3];
        // Integer luma with weights 77/151/28 (they sum to 256).
        var luma = rgba[i] * 77 + rgba[i+1] * 151 + rgba[i+2] * 28;
        // Alpha-blend in front of 50% gray: (255-alpha) << 15 is (255-alpha) * 128 * 256.
        // 32768 rounds to nearest before the final >> 16.
        gray[j] = ((luma * alpha) + ((255-alpha) << 15) + 32768) >> 16
    }

    return gray
}
@ -0,0 +1,54 @@
@@ -0,0 +1,54 @@
import desaturate from './desaturate'
import loadLanguage from './loadLanguage'
// Modules whose virtual filesystem already received tessdata/osd.traineddata.
// Keyed by module instance because each instance has its own filesystem.
var osdInstalledModules = new WeakMap()

/**
 * Detects script and orientation of `image` using the 'osd' language data.
 *
 * @param jobId  job identifier forwarded to loadLanguage() for progress messages
 * @param module Emscripten module instance (allocate/_free/wrapPointer/FS_createDataFile/...)
 * @param base   TessBaseAPI instance created from `module`
 * @param image  object with `width`, `height` and pixel data accepted by desaturate()
 * @param cb     node-style callback: cb(err) on failure, cb(null, info) on success where
 *               info has tesseract_script_id, script, script_confidence,
 *               orientation_degrees and orientation_confidence
 */
export default function detect(jobId, module, base, image, cb){
    var width = image.width, height = image.height;
    image = desaturate(image)

    // Copy the grayscale bytes into the Emscripten heap; released with module._free below.
    var ptr = module.allocate(image, 'i8', module.ALLOC_NORMAL);
    console.log('allocated image')

    loadLanguage('osd', jobId, function(err, result){
        if(err){
            module._free(ptr);
            cb(err)
            return
        }

        // loadLanguage only returns the data bytes; they must be written into the module's
        // filesystem before base.Init can read them. Checked here, inside the callback, so
        // two overlapping detect() calls do not create the same file twice.
        // NOTE(review): a prior recognize() with lang 'osd' on the same module would already
        // have created this file — confirm whether that combination needs handling.
        if(!osdInstalledModules.has(module)){
            module.FS_createDataFile('tessdata', 'osd.traineddata', result, true, false);
            osdInstalledModules.set(module, true)
        }

        base.Init(null, 'osd')
        base.SetPageSegMode(module.PSM_OSD_ONLY)
        console.log('loaded language')

        base.SetImage(module.wrapPointer(ptr), width, height, 1, width)
        base.SetRectangle(0, 0, width, height)

        // NOTE(review): `results` is created with `new` and never destroyed here — confirm
        // whether module.destroy(results) is required for this binding.
        var results = new module.OSResults();
        var detection = null;

        if(base.DetectOS(results)){
            var charset = results.get_unicharset()
            console.log(charset)

            var best = results.get_best_result()
            var oid = best.get_orientation_id(),
                sid = best.get_script_id();

            // Every value is read out while the API is still initialised.
            detection = {
                tesseract_script_id: sid,
                script: charset.get_script_from_script_id(sid),
                script_confidence: best.get_sconfidence(),
                orientation_degrees: [0, 270, 180, 90][oid],
                orientation_confidence: best.get_oconfidence()
            }
        }

        // Release native resources BEFORE invoking the callback on both paths, so a
        // throwing callback can no longer leave the API open or the image buffer allocated.
        base.End();
        module._free(ptr);

        if(detection){
            cb(null, detection)
        }
        else {
            cb("failed to detect os")
        }
    })
}
@ -0,0 +1,163 @@
@@ -0,0 +1,163 @@
|
||||
/**
 * Strips one level (two spaces) of leading indentation from every line of `html`,
 * but only when the first line itself starts with two spaces. Lines that do not
 * start with two spaces are left untouched.
 *
 * @param html multi-line string
 * @returns the string with the indentation removed
 */
function deindent(html){
    var lines = html.split('\n')
    // The literal must be TWO spaces: it is compared against a two-character slice.
    if(lines[0].substring(0,2) === "  "){
        for (var i = 0; i < lines.length; i++) {
            if (lines[i].substring(0,2) === "  ") {
                lines[i] = lines[i].slice(2)
            }
        }
    }
    return lines.join('\n')
}
||||
|
||||
/**
 * Walks the recognition result symbol by symbol and builds a nested plain-object tree:
 * blocks -> paragraphs -> lines -> words -> symbols, plus page-level summary fields.
 *
 * @param module Emscripten module (RIL_* / PT_* / DIR_* / PSM_* / OEM_* constants,
 *               iterator constructors, getPointer, destroy, _ptaDestroy)
 * @param base   TessBaseAPI instance on which Recognize() has already been run
 * @returns {text, html, confidence, blocks, psm, oem, version}
 */
export default function DumpLiterallyEverything(module, base){
    var ri = base.GetIterator();
    var blocks = [];
    var block, para, textline, word, symbol;

    // Maps a numeric enum value back to its name: finds the first key of `module` that
    // starts with `prefix + '_'` and holds `value`, and returns the key without the prefix.
    // Returns undefined when nothing matches.
    function enumToString(value, prefix){
        var marker = prefix + '_'
        var names = Object.keys(module)
        for (var k = 0; k < names.length; k++) {
            var name = names[k]
            if (name.slice(0, marker.length) == marker && module[name] === value) {
                return name.slice(marker.length)
            }
        }
    }

    // Reads the outline of the current block as [[x, y], ...], or null when unavailable.
    function readBlockPolygon(){
        var poly = ri.BlockPolygon();
        // BlockPolygon() returns null when automatic page segmentation is off
        if(!(module.getPointer(poly) > 0)) return null;

        var count = poly.get_n(),
            xs = poly.get_x(),
            ys = poly.get_y(),
            points = [];
        for(var i = 0; i < count; i++){
            points.push([xs.getValue(i), ys.getValue(i)]);
        }
        module._ptaDestroy(module.getPointer(poly));
        return points
    }

    // Appends every {text, confidence} alternative from `iterator` to `target`,
    // then destroys the iterator.
    function drainChoices(iterator, target){
        do {
            target.push({
                text: iterator.GetUTF8Text(),
                confidence: iterator.Confidence()
            })
        } while (iterator.Next());
        module.destroy(iterator)
    }

    const {RIL_BLOCK, RIL_PARA, RIL_TEXTLINE, RIL_WORD, RIL_SYMBOL} = module

    ri.Begin()
    do {
        // The iterator advances one symbol at a time; each coarser level is opened
        // whenever the current symbol is the first one inside it.
        if(ri.IsAtBeginningOf(RIL_BLOCK)){
            var polygon = readBlockPolygon();

            block = {
                paragraphs: [],

                text: ri.GetUTF8Text(RIL_BLOCK),
                confidence: ri.Confidence(RIL_BLOCK),
                baseline: ri.getBaseline(RIL_BLOCK),
                bbox: ri.getBoundingBox(RIL_BLOCK),

                blocktype: enumToString(ri.BlockType(), 'PT'),
                polygon: polygon
            }
            blocks.push(block)
        }

        if(ri.IsAtBeginningOf(RIL_PARA)){
            para = {
                lines: [],

                text: ri.GetUTF8Text(RIL_PARA),
                confidence: ri.Confidence(RIL_PARA),
                baseline: ri.getBaseline(RIL_PARA),
                bbox: ri.getBoundingBox(RIL_PARA),

                is_ltr: !!ri.ParagraphIsLtr()
            }
            block.paragraphs.push(para)
        }

        if(ri.IsAtBeginningOf(RIL_TEXTLINE)){
            textline = {
                words: [],

                text: ri.GetUTF8Text(RIL_TEXTLINE),
                confidence: ri.Confidence(RIL_TEXTLINE),
                baseline: ri.getBaseline(RIL_TEXTLINE),
                bbox: ri.getBoundingBox(RIL_TEXTLINE)
            }
            para.lines.push(textline)
        }

        if(ri.IsAtBeginningOf(RIL_WORD)){
            var fontInfo = ri.getWordFontAttributes(),
                wordDir = ri.WordDirection();

            word = {
                symbols: [],
                choices: [],

                text: ri.GetUTF8Text(RIL_WORD),
                confidence: ri.Confidence(RIL_WORD),
                baseline: ri.getBaseline(RIL_WORD),
                bbox: ri.getBoundingBox(RIL_WORD),

                is_numeric: !!ri.WordIsNumeric(),
                in_dictionary: !!ri.WordIsFromDictionary(),
                direction: enumToString(wordDir, 'DIR'),
                language: ri.WordRecognitionLanguage(),

                is_bold: fontInfo.is_bold,
                is_italic: fontInfo.is_italic,
                is_underlined: fontInfo.is_underlined,
                is_monospace: fontInfo.is_monospace,
                is_serif: fontInfo.is_serif,
                is_smallcaps: fontInfo.is_smallcaps,
                font_size: fontInfo.pointsize,
                font_id: fontInfo.font_id,
                font_name: fontInfo.font_name,
            }
            drainChoices(new module.WordChoiceIterator(ri), word.choices)
            textline.words.push(word)
        }

        if(ri.IsAtBeginningOf(RIL_SYMBOL)){
            symbol = {
                choices: [],
                // Per-symbol binary image extraction is disabled, so this is always null.
                image: null,

                text: ri.GetUTF8Text(RIL_SYMBOL),
                confidence: ri.Confidence(RIL_SYMBOL),
                baseline: ri.getBaseline(RIL_SYMBOL),
                bbox: ri.getBoundingBox(RIL_SYMBOL),

                is_superscript: !!ri.SymbolIsSuperscript(),
                is_subscript: !!ri.SymbolIsSubscript(),
                is_dropcap: !!ri.SymbolIsDropcap(),
            }
            word.symbols.push(symbol)
            drainChoices(new module.ChoiceIterator(ri), symbol.choices)
        }
    } while (ri.Next(RIL_SYMBOL));
    module.destroy(ri)

    return {
        text: base.GetUTF8Text(),
        html: deindent(base.GetHOCRText()),

        confidence: base.MeanTextConf(),

        blocks: blocks,

        psm: enumToString(base.GetPageSegMode(), 'PSM'),
        oem: enumToString(base.oem(), 'OEM'),
        version: base.Version(),
    }
}
@ -0,0 +1,2 @@
@@ -0,0 +1,2 @@
|
||||
// Per-language size table keyed by language code.
// NOTE(review): presumably the byte size of each downloadable language file — confirm
// against the hosted tessdata before relying on the exact meaning.
const fileSizes = {
    "afr": 1079573,
    "ara": 1701536,
    "aze": 1420865,
    "bel": 1276820,
    "ben": 6772012,
    "bul": 1605615,
    "cat": 1652368,
    "ces": 1035441,
    "chi_sim": 17710414,
    "chi_tra": 24717749,
    "chr": 320649,
    "dan-frak": 677656,
    "dan": 1972936,
    "deu-frak": 822644,
    "deu": 991656,
    "ell": 859719,
    "eng": 9453554,
    "enm": 619254,
    "epo": 1241212,
    "equ": 821130,
    "est": 1905040,
    "eus": 1641190,
    "fin": 979418,
    "fra": 1376221,
    "frk": 5912963,
    "frm": 5147082,
    "glg": 1674938,
    "grc": 3012615,
    "heb": 1051501,
    "hin": 6590065,
    "hrv": 1926995,
    "hun": 3074473,
    "ind": 1874776,
    "isl": 1634041,
    "ita": 948593,
    "ita_old": 3436571,
    "jpn": 13507168,
    "kan": 4390317,
    "kor": 5353098,
    "lav": 1843944,
    "lit": 1779240,
    "mal": 5966263,
    "meme": 88453,
    "mkd": 1163087,
    "mlt": 1463001,
    "msa": 1665427,
    "nld": 1134708,
    "nor": 2191610,
    "osd": 4274649,
    "pol": 7024662,
    "por": 909359,
    "ron": 915680,
    "rus": 5969957,
    "slk-frak": 289885,
    "slk": 2217342,
    "slv": 1611338,
    "spa": 883170,
    "spa_old": 5647453,
    "sqi": 1667041,
    "srp": 1770244,
    "swa": 757916,
    "swe": 2451917,
    "tam": 3498763,
    "tel": 5795246,
    "tgl": 1496256,
    "tha": 3811136,
    "tur": 3563264,
    "ukr": 937566,
    "vie": 2195922
}

export default fileSizes;
@ -0,0 +1,39 @@
@@ -0,0 +1,39 @@
|
||||
import TesseractCore from 'tesseract.js-core' |
||||
import pako from 'pako' |
||||
|
||||
import recognize from './recognize' |
||||
import detect from './detect' |
||||
|
||||
// `module` is the Emscripten instance, `base` the TessBaseAPI built from it.
// `jobId` holds the id of the most recently received message; only the engine progress
// hook reads it, because that hook has no other way to learn the current job.
var module, base, jobId

// Worker entry point. Messages carry {jobId, action, args}; `action` is one of
// 'init', 'recognize' or 'detect'.
onmessage = function(e) {
    var {action, args} = e.data;

    // Per-message copy of the id. Language loading is asynchronous, so another message
    // can overwrite the shared `jobId` before this job's callback fires; replies must
    // therefore use this captured value rather than the shared variable.
    const currentJobId = e.data.jobId
    jobId = currentJobId

    console.log('worker got action', action)

    const reply = (error, result) => postMessage({jobId: currentJobId, error, result})

    if(action === 'init'){

        module = TesseractCore({
            // NOTE(review): the original note said "must be a multiple of 10 megabytes";
            // confirm the actual constraint against the core build.
            TOTAL_MEMORY: args.mem,
            TesseractProgress(percent){
                // NOTE(review): still reads the shared `jobId`, so progress may be
                // attributed to a newer job if messages overlap — confirm and thread
                // the id through if needed.
                postMessage({ jobId,
                    'progress': {
                        'recognized': Math.max(0,(percent-30)/70)
                    }
                })
            },
            onRuntimeInitialized() {}
        })
        module.FS_createPath("/","tessdata",true,true)
        base = new module.TessBaseAPI()

    } else if(action === 'recognize'){
        var {image, options} = args
        recognize(currentJobId, module, base, image, options, reply)
    } else if(action === 'detect'){
        detect(currentJobId, module, base, args.image, reply)
    }
}
@ -0,0 +1,88 @@
@@ -0,0 +1,88 @@
|
||||
import pako from 'pako' |
||||
import db from './db' |
||||
import fileSizes from './fileSizes' |
||||
|
||||
/**
 * Downloads (and gunzips) the trained data for `lang`.
 *
 * @param lang     language code; also the key into `fileSizes` for progress reporting
 * @param progress function called with progress payload objects
 * @param cb       node-style callback: cb(errorMessage, null) or cb(null, Uint8Array)
 * @param url      optional download URL; defaults to the hosted .traineddata.gz for `lang`
 */
function getLanguageData(lang, progress, cb, url='https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/'+lang+'.traineddata.gz'){
    var xhr = new XMLHttpRequest();

    // Errors are reported as plain strings (like the other error paths in this worker),
    // not as the XMLHttpRequest object: the error is eventually sent through
    // postMessage, and an XMLHttpRequest cannot be structured-cloned.
    function fail(reason){
        xhr.onprogress = xhr.onload = null
        cb('failed to load language data for ' + lang + ' from ' + url + ': ' + reason, null)
    }

    xhr.open('GET', url, true);
    // Set after open(): some older engines reject responseType on an unopened request.
    xhr.responseType = 'arraybuffer';

    xhr.onerror = e => fail('network error')

    xhr.onprogress = e => progress({
        'loaded_lang_model': e.loaded/fileSizes[lang], //this is kinda wrong on safari
        cached: false
    })

    xhr.onload = e => {
        // Status 0 with a body is accepted as success.
        if (!(xhr.status == 200 || (xhr.status == 0 && xhr.response))) return fail('status ' + xhr.status);

        progress({'unzipping_lang_model': true})

        var response = new Uint8Array(xhr.response)
        try {
            // Repeats while the gzip magic bytes (0x1f 0x8b) are present, so data that
            // was compressed more than once is fully unwrapped.
            while(response[0] == 0x1f && response[1] == 0x8b) response = pako.ungzip(response);
        } catch (err) {
            // Without this the callback would never fire on corrupt data.
            return fail('could not decompress (' + err + ')')
        }

        progress({
            'unzipped_lang_model': true,
            'lang_model_size': response.length
        })

        cb(null, response)
    }

    progress({
        'loaded_lang_model': 0,
        cached: false,
        requesting: true
    })

    xhr.send()
}
||||
|
||||
// var loaded_langs = []
|
||||
|
||||
/**
 * Obtains the trained data for `lang`, from the local `db` cache when possible and
 * otherwise by download, and passes it to `cb`.
 *
 * @param lang  language code, also used as the cache key
 * @param jobId id attached to every progress message posted from here
 * @param cb    node-style callback: cb(err) or cb(null, data)
 * @param url   optional download URL forwarded to getLanguageData()
 */
export default function loadLanguage(lang, jobId, cb, url){

    console.log('loadLanguage jobId', jobId)

    const report = progress => postMessage({ jobId, progress })

    // Download finished (or failed) and the result is not to be cached.
    // NOTE(review): 'created_virtual_datafile' is posted even when `err` is set and no
    // file is created in this function — confirm the message is still meaningful.
    function deliver(err, data){
        report({ created_virtual_datafile: true})
        if(err){
            cb(err)
        } else {
            cb(null, data)
        }
    }

    // Download finished (or failed) and a successful result should also be cached.
    function deliverAndCache(err, data) {
        if(err) return deliver(err);

        // The write is not awaited; its outcome is only logged.
        db.put(lang, data, err => console.log('cached', lang, err))
        report({cached_lang: lang})
        deliver(null, data)
    }

    db.open({compression: false}, openErr => {
        // No usable store: download without caching.
        if (openErr) return getLanguageData(lang, report, deliver, url);

        db.get(lang, (getErr, data) => {
            // Any read error is treated as a cache miss.
            if (getErr) return getLanguageData(lang, report, deliverAndCache, url)

            // Unwrap gzip layers while the magic bytes (0x1f 0x8b) are present.
            while(data[0] == 0x1f && data[1] == 0x8b) data = pako.ungzip(data);

            report({ loaded_lang_model: lang, from_cache: true })

            cb(null, data)
        })
    })
}
@ -0,0 +1,71 @@
@@ -0,0 +1,71 @@
|
||||
import desaturate from './desaturate' |
||||
import loadLanguage from './loadLanguage' |
||||
import circularize from './circularize' |
||||
import dump from './dump' |
||||
|
||||
// Languages whose .traineddata file has been written into a module's virtual filesystem.
// Tracked per module instance: the worker's 'init' action can build a new module with an
// empty filesystem, and a single global list would then wrongly report languages as loaded.
var loadedLangsByModule = new WeakMap()

// Returns the (mutable) list of languages already installed in `module`.
function loadedLangsFor(module){
    var langs = loadedLangsByModule.get(module)
    if(!langs){
        langs = []
        loadedLangsByModule.set(module, langs)
    }
    return langs
}

/**
 * Runs OCR on `image` and passes the full result tree to `cb`.
 *
 * @param jobId   id attached to every progress message posted from here
 * @param module  Emscripten module instance
 * @param base    TessBaseAPI instance created from `module`
 * @param image   object with `width`, `height` and pixel data accepted by desaturate()
 * @param options object with `lang`; every own property is forwarded to base.SetVariable()
 * @param cb      node-style callback: cb(err, null) on load failure, cb(null, result) on success
 */
export default function recognize(jobId, module, base, image, options, cb){

    console.log('recognize id', jobId)
    var {lang} = options
    var width = image.width, height = image.height;

    image = desaturate(image)

    // Copy the grayscale bytes into the Emscripten heap; released with module._free below.
    var ptr = module.allocate(image, 'i8', module.ALLOC_NORMAL);

    var loaded_langs = loadedLangsFor(module)

    function run() {
        base.Init(null, lang)

        postMessage({
            jobId,
            'progress': {
                'initialized_with_lang': lang
            }
        })

        // NOTE(review): `options` still contains `lang`, so it is also passed to
        // SetVariable and reported as a set_variable progress event — confirm intended.
        for (var option in options) {
            if (options.hasOwnProperty(option)) {
                base.SetVariable(option, options[option]);
                postMessage({
                    jobId: jobId,
                    'progress': {
                        'set_variable': {
                            variable: option,
                            value: options[option]
                        }
                    }
                })
            }
        }

        // One byte per pixel, `width` bytes per row.
        base.SetImage(module.wrapPointer(ptr), width, height, 1, width)
        base.SetRectangle(0, 0, width, height)
        base.Recognize(null)
        var everything = circularize(dump(module, base))
        base.End();
        module._free(ptr);
        cb(null, everything)
    }

    if(loaded_langs.indexOf(lang) != -1) return run();

    loadLanguage(lang, jobId, function(err, result){

        if(err){
            console.error("error loading", lang);
            module._free(ptr);
            return cb(err, null);
        }

        // Re-check inside the callback: loading is asynchronous, so another job for the
        // same language may have installed the file in the meantime, and creating it a
        // second time would fail. The language is recorded only after the file exists.
        if(loaded_langs.indexOf(lang) == -1){
            module.FS_createDataFile('tessdata', lang +".traineddata", result, true, false);
            loaded_langs.push(lang)
        }

        run()
    })
}
@ -0,0 +1,42 @@
@@ -0,0 +1,42 @@
|
||||
var path = require('path'); |
||||
var webpack = require('webpack'); |
||||
|
||||
// Expands one bundle description ({entry, output, include}) into a full development
// webpack configuration that writes to ./build and is served from /build/.
function config({entry, output, include}) {
    var devOutput = Object.assign({}, output, {
        path: path.join(__dirname, 'build'),
        publicPath: '/build/',
    })

    var plugins = [
        new webpack.NoErrorsPlugin()
    ]

    // Only .js files under `include` are run through babel.
    var babelLoader = {
        test: /\.js$/,
        loaders: ['babel'],
        include
    }

    return {
        devtool: 'cheap-module-eval-source-map',
        entry,
        output: devOutput,
        plugins: plugins,
        module: {
            loaders: [babelLoader]
        },
        node: {
            fs: "empty"
        }
    }
}
||||
|
||||
// Page-side library, exposed as the UMD global `Tesseract`.
var browserBundle = {
    entry: './src/browser/index.js',
    output: {
        filename: 'tesseract.js',
        library: "Tesseract",
        libraryTarget: "umd"
    },
    include: [path.join(__dirname, 'src/browser')]
}

// Worker-side script.
var workerBundle = {
    entry: './src/worker/index.js',
    output: {
        filename: 'tesseract.worker.js',
    },
    include: [path.join(__dirname, 'src/worker')]
}

// One webpack configuration per bundle, in this order (browser first, worker second).
module.exports = [browserBundle, workerBundle].map(config);
@ -0,0 +1,46 @@
@@ -0,0 +1,46 @@
|
||||
var path = require('path'); |
||||
var webpack = require('webpack'); |
||||
|
||||
// Expands one bundle description ({entry, output, include}) into a full production
// webpack configuration that writes minified output to ./dist.
function config({entry, output, include}) {
    var distOutput = Object.assign({}, output, {
        path: path.join(__dirname, 'dist')
    })

    var plugins = [
        new webpack.optimize.OccurenceOrderPlugin(),
        new webpack.optimize.DedupePlugin(),
        new webpack.optimize.UglifyJsPlugin({
            compressor: {
                warnings: false
            }
        })
    ]

    // Only .js files under `include` are run through babel.
    var babelLoader = {
        test: /\.js$/,
        loaders: ['babel'],
        include
    }

    return {
        entry,
        output: distOutput,
        plugins: plugins,
        module: {
            loaders: [babelLoader]
        },
        node: {
            fs: "empty"
        }
    }
}
||||
|
||||
// Page-side library, exposed as the UMD global `Tesseract`.
var browserBundle = {
    entry: './src/browser/index.js',
    output: {
        filename: 'tesseract.js',
        library: "Tesseract",
        libraryTarget: "umd"
    },
    include: [path.join(__dirname, 'src/browser')]
}

// Worker-side script.
var workerBundle = {
    entry: './src/worker/index.js',
    output: {
        filename: 'tesseract.worker.js',
    },
    include: [path.join(__dirname, 'src/worker')]
}

// One webpack configuration per bundle, in this order (browser first, worker second).
module.exports = [browserBundle, workerBundle].map(config);
Loading…
Reference in new issue