Browse Source

reuse workers, added language detect

pull/12/head
Guillermo 10 years ago
parent
commit
3901492461
  1. 160
      lib/Tesseract.js
  2. 134
      worker_src/worker.js

160
lib/Tesseract.js

@ -1,79 +1,119 @@ @@ -1,79 +1,119 @@
var Tesseract = {}
var Tesseract = (function(){
Tesseract.recognize = function(image, options, callback){
var lang = options.lang
if(typeof lang === "undefined"){
lang = 'eng'
}
var Tesseract = {}
var blob = new Blob(["importScripts('http://localhost:1234/master/worker/worker.js');"]);
console.log('localhost')
var worker = new Worker(window.URL.createObjectURL(blob));
var index = 0
var handlers = []
if (typeof options === 'string') {
lang = options
options = {}
// Route every message coming back from the shared worker to the handler set
// stored in `handlers` under the job index carried by the message.
worker.onmessage = function(e){
	var data = e.data
	var handler = handlers[data.index]

	// A message without a registered handler (missing index, or a job that has
	// already settled) has nobody to notify. Report it and bail out instead of
	// failing with a TypeError inside the message handler.
	if(!handler){
		console.warn('Tesseract: dropping worker message with no matching handler', data)
		return
	}

	if(data.progress){
		handler.progress(data.progress)
		return
	}

	// Terminal message: release the entry first so that finished jobs do not
	// accumulate in `handlers` for as long as the worker is reused.
	delete handlers[data.index]

	if(data.err){
		handler.reject(data.err)
		handler.callback(data.err)
	}
	else {
		handler.resolve(data.result)
		handler.callback(null, data.result)
	}
}
if (typeof options === "function") {
callback = options
options = {}
/**
 * Normalise the accepted image inputs before they are posted to the worker.
 *
 * - An object exposing `getContext` is replaced by its '2d' context.
 * - An object whose `tagName` is "IMG" or "VIDEO" is drawn onto a newly
 *   created canvas sized to its natural / video dimensions, and that canvas'
 *   '2d' context is used.
 * - If the value at that point exposes `getImageData`, the full area of its
 *   canvas is read and returned.
 * - Anything else is returned untouched.
 *
 * @param {*} image canvas-like object, 2d-context-like object, IMG/VIDEO element, or pixel data
 * @returns {*} the value returned by `getImageData(...)`, or `image` itself when no context was available
 */
function convertToImageData(image){
	if(image.getContext){
		image = image.getContext('2d');
	}
	else if(image.tagName === "IMG" || image.tagName === "VIDEO"){
		// Only the property names that carry the intrinsic size differ between
		// the two element kinds, so pick them once instead of branching twice.
		var isImg = image.tagName === "IMG"
		var c = document.createElement('canvas');
		c.width = isImg ? image.naturalWidth : image.videoWidth;
		c.height = isImg ? image.naturalHeight : image.videoHeight;
		var ctx = c.getContext('2d');
		ctx.drawImage(image, 0, 0);
		image = ctx;
	}

	if(image.getImageData){
		image = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
	}
	return image
}
Tesseract.detect = function(image, progress, callback){
image = convertToImageData(image)
if(image.getContext){
image = image.getContext('2d');
}else if(image.tagName == "IMG" || image.tagName == "VIDEO"){
var c = document.createElement('canvas');
if(image.tagName == "IMG"){
c.width = image.naturalWidth;
c.height = image.naturalHeight;
}else if(image.tagName == "VIDEO"){
c.width = image.videoWidth;
c.height = image.videoHeight;
if(typeof progress === "undefined"){
progress = callback = new Function()
}
var ctx = c.getContext('2d');
ctx.drawImage(image, 0, 0);
image = ctx;
}
if(image.getImageData) image = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
if (typeof callback === "undefined"){
callback = progress
progress = new Function()
}
var blob = new Blob(["importScripts('https://cdn.rawgit.com/naptha/tesseract.js/master/worker/worker.js');"]);
var i = index++
var worker = new Worker(window.URL.createObjectURL(blob));
handlers[i] = {
resolve: new Function(),
reject: new Function()
}
handlers[i].callback = callback
handlers[i].progress = progress
return new Promise(function(resolve, reject){
handlers[i].resolve = resolve
handlers[i].reject = reject
worker.postMessage({index: i, fun: 'detect', image: image})
})
}
var progress = (function(){
if(typeof options.progress === 'function'){
var p = options.progress
delete options.progress
return p
Tesseract.recognize = function(image, options, callback){
var lang = options.lang
if(typeof lang === "undefined"){
lang = 'eng'
}
return function(){}
})()
if (typeof options === 'string') {
lang = options
options = {}
}
if(typeof callback === "function"){
worker.onmessage = function(e){
if(e.data.progress){
progress(e.data.progress)
}
else{
callback(e.data.err, e.data.result)
}
if (typeof options === "function") {
callback = options
options = {}
}
worker.postMessage({image: image, lang: lang})
}
else {
return new Promise(function(resolve, reject){
worker.onmessage = function(e){
if(e.data.progress){
progress(e.data.progress)
}
else if(e.data.err){
reject(e.data.err)
}
else {
resolve(e.data.result)
}
image = convertToImageData(image)
var i = index++
handlers[i] = {
resolve: new Function(),
reject: new Function()
}
handlers[i].callback = callback || new Function()
handlers[i].progress = (function(){
if(typeof options.progress === 'function'){
var p = options.progress
delete options.progress
return p
}
worker.postMessage({image: image, lang: lang, options: options})
return function(){}
})()
return new Promise(function(resolve, reject){
handlers[i].resolve = resolve
handlers[i].reject = reject
worker.postMessage({index: i, fun: 'recognize', image: image, lang: lang, options: options})
})
}
}
return Tesseract
})()

134
worker_src/worker.js

@ -7,16 +7,21 @@ else { @@ -7,16 +7,21 @@ else {
db = leveljs('./tessdata')
}
console.log('hallo')
var filesizes = {"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441, "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656, "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719, "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130, "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221, "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615, "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473, "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571, "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944, "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087, "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610, "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680, "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338, "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244, "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246, "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566, "vie": 2195922}
var pako = require('pako')
var recognize = (function createTesseractInstance(){
var T = (function createTesseractInstance(){
curindex = 0
var Module = Tesseract304({
TOTAL_MEMORY: 6*16777216, //must be a multiple of 10 megabytes
TesseractProgress: function(percent){
postMessage({
index: curindex,
'progress': {
'recognized': Math.max(0,(percent-30)/70)
}
@ -29,7 +34,7 @@ var recognize = (function createTesseractInstance(){ @@ -29,7 +34,7 @@ var recognize = (function createTesseractInstance(){
var base = new Module.TessBaseAPI()
var loaded_langs = []
var loadLanguage = function(lang, cb){ // NodeJS style callback
var loadLanguage = function(lang, index, cb){ // NodeJS style callback
if(loaded_langs.indexOf(lang) != -1){
cb(null, lang)
}
@ -38,6 +43,7 @@ var recognize = (function createTesseractInstance(){ @@ -38,6 +43,7 @@ var recognize = (function createTesseractInstance(){
var downloadlang = function(shouldcache){
postMessage({
index: index,
'progress': {
'loaded_lang_model': 0,
cached: false,
@ -50,8 +56,9 @@ var recognize = (function createTesseractInstance(){ @@ -50,8 +56,9 @@ var recognize = (function createTesseractInstance(){
xhr.onerror = function(){ cb(xhr, null) }
xhr.onprogress = function(e){
postMessage({
index: index,
'progress': {
'loaded_lang_model': e.loaded/filesizes[lang],
'loaded_lang_model': e.loaded/filesizes[lang], //this is kinda wrong on safari
cached: false
}
})
@ -59,17 +66,26 @@ var recognize = (function createTesseractInstance(){ @@ -59,17 +66,26 @@ var recognize = (function createTesseractInstance(){
xhr.onload = function(){
if (xhr.status == 200 || (xhr.status == 0 && xhr.response)) {
postMessage({
index: index,
'progress': 'unzipping_lang_model'
})
var response = new Uint8Array(xhr.response)
var data = pako.inflate(response)
while(response[0] == 0x1f && response[1] == 0x8b){
response = pako.ungzip(response)
}
console.log('asdf')
postMessage({
'progress': 'unzipped_lang_model'
index: index,
'progress': {
'unzipped_lang_model': true,
'lang_model_size': response.length
}
})
Module.FS_createDataFile('tessdata', lang +".traineddata", data, true, false);
Module.FS_createDataFile('tessdata', lang +".traineddata", response, true, false);
if(shouldcache){
db.put(lang, response, function(err){
@ -77,6 +93,14 @@ var recognize = (function createTesseractInstance(){ @@ -77,6 +93,14 @@ var recognize = (function createTesseractInstance(){
})
}
postMessage({
index: index,
'progress': {
'created_virtual_datafile': true,
'cached_file': shouldcache
}
})
loaded_langs.push(lang)
cb(null, lang)
@ -86,6 +110,7 @@ var recognize = (function createTesseractInstance(){ @@ -86,6 +110,7 @@ var recognize = (function createTesseractInstance(){
}
db.open({compression: false},function(err){
// err = true
if (err) {
downloadlang(false)
}
@ -101,6 +126,7 @@ var recognize = (function createTesseractInstance(){ @@ -101,6 +126,7 @@ var recognize = (function createTesseractInstance(){
value = pako.inflate(value)
postMessage({
index: index,
'progress': {
loaded_lang_model:1,
cached: true
@ -117,7 +143,6 @@ var recognize = (function createTesseractInstance(){ @@ -117,7 +143,6 @@ var recognize = (function createTesseractInstance(){
}
}
function circularize(page){
page.paragraphs = []
page.lines = []
@ -175,18 +200,16 @@ var recognize = (function createTesseractInstance(){ @@ -175,18 +200,16 @@ var recognize = (function createTesseractInstance(){
return page
}
function DumpLiterallyEverything(){
var ri = base.GetIterator();
var blocks = [];
var block, para, textline, word, symbol;
// Reverse lookup of an enum constant exposed on `Module`: returns the name,
// with the leading "<prefix>_" stripped, of the first key of `Module` that
// starts with "<prefix>_" and whose value is strictly equal to `value`.
// Returns undefined when no key matches.
function enumToString(value, prefix){
	var head = prefix + '_'
	var names = Object.keys(Module)
	for(var i = 0; i < names.length; i++){
		var name = names[i]
		// indexOf(...) === 0 is a prefix test that relies on neither
		// String.prototype.startsWith nor the deprecated substr.
		if(name.indexOf(head) === 0 && Module[name] === value){
			return name.slice(head.length)
		}
	}
}
ri.Begin()
@ -327,7 +350,7 @@ var recognize = (function createTesseractInstance(){ @@ -327,7 +350,7 @@ var recognize = (function createTesseractInstance(){
}
}
function recognize(image, lang, options,cb){
function desaturate(image){
var width, height;
if(image.data){
var src = image.data;
@ -351,9 +374,19 @@ var recognize = (function createTesseractInstance(){ @@ -351,9 +374,19 @@ var recognize = (function createTesseractInstance(){
else {
throw 'Expected ImageData'
}
return image
}
function recognize(index, image, lang, options, cb){
var width = image.width, height = image.height;
image = desaturate(image)
var ptr = Module.allocate(image, 'i8', Module.ALLOC_NORMAL);
loadLanguage(lang, function(err, result){
loadLanguage(lang, index, function(err, result){
if(err){
console.error("error loading", lang);
@ -361,7 +394,18 @@ var recognize = (function createTesseractInstance(){ @@ -361,7 +394,18 @@ var recognize = (function createTesseractInstance(){
cb(err, null)
}
else {
curindex = index
base.Init(null, lang)
postMessage({
index: index,
'progress': {
'initialized_with_lang': true,
'lang': lang
}
})
for (var option in options) {
if (options.hasOwnProperty(option)) {
base.SetVariable(option, options[option]);
@ -390,11 +434,63 @@ var recognize = (function createTesseractInstance(){ @@ -390,11 +434,63 @@ var recognize = (function createTesseractInstance(){
})
}
return recognize
/**
 * Run orientation/script detection on an image using the 'osd' language data.
 *
 * The detected orientation and script are currently only written to the
 * console; on success `cb` receives the placeholder string 'wolo'.
 *
 * @param {*} index job index; stored in `curindex` and forwarded to loadLanguage
 * @param {Object} image object with `width` and `height`; passed through desaturate()
 * @param {function} cb Node-style callback: cb(err) when loading 'osd' fails, cb(null, 'wolo') on success
 */
function detect(index, image, cb){
	var width = image.width, height = image.height;
	image = desaturate(image)

	var ptr = Module.allocate(image, 'i8', Module.ALLOC_NORMAL);
	console.log('allocated image')

	loadLanguage('osd', index, function(err, result){
		if(err){
			Module._free(ptr);
			cb(err)
			return
		}

		curindex = index
		try {
			base.Init(null, 'osd')
			base.SetPageSegMode(Module.PSM_OSD_ONLY)
			console.log('loaded language')

			// One byte per pixel, so the row stride equals the width.
			base.SetImage(Module.wrapPointer(ptr), width, height, 1, width)
			base.SetRectangle(0, 0, width, height)

			var results = new Module.OSResults();
			var success = base.DetectOS(results);
			// NOTE(review): a falsy `success` is only logged and the results are
			// still read below - confirm whether it should be reported via cb.
			console.log('detected os successfully', !!success);

			var charset = results.get_unicharset()
			results.print_scores()

			var best = results.get_best_result()
			var oid = best.get_orientation_id(),
				sid = best.get_script_id();

			console.log('orientation id', oid, [0, 270, 180, 90][oid], best.get_oconfidence())
			console.log('script id', sid, charset.get_script_from_script_id(sid), best.get_sconfidence())

			cb(null, 'wolo')
		}
		finally {
			// Release the engine state and the allocated image buffer even when
			// something above threw; previously an exception skipped both.
			// NOTE(review): assumes base.End() is safe to call after a failed Init - confirm.
			try {
				base.End();
			}
			finally {
				Module._free(ptr);
			}
		}
	})
}
return {
recognize: recognize,
detect: detect
}
})()
// Worker entry point: dispatch on e.data.fun and post the outcome back tagged
// with the job index that arrived with the request.
onmessage = function(e) {
	var job = e.data

	function reply(err, result){
		postMessage({index: job.index, err: err, result: result})
	}

	// The former unconditional recognize(...) call was removed: it used the old
	// argument order and posted a reply without an index, in addition to the
	// dispatch below.
	if(job.fun === 'recognize'){
		T.recognize(job.index, job.image, job.lang, job.options, reply)
	}
	else if(job.fun === 'detect'){
		T.detect(job.index, job.image, reply)
	}
	else {
		// Answer unknown requests with an error so the sender is not left
		// waiting for a reply that would never come.
		reply('Unknown worker function: ' + job.fun, null)
	}
}
Loading…
Cancel
Save