diff --git a/example.htm b/example.htm index b1672b6..c189828 100644 --- a/example.htm +++ b/example.htm @@ -13,10 +13,12 @@ // ctx.fillText('2小時可換乘2次2小時可換乘2次', 100, 80) ctx.font = '30px sans-serif' ctx.fillText('the Cosmic Void', 100, 120) + Tesseract.recognize(canvas,{ tessedit_char_blacklist:'e', progress: function(e){ console.log(e) } }).then( function(d){ console.log(d) } ) + \ No newline at end of file diff --git a/package.json b/package.json index ee439bb..6ca6dad 100644 --- a/package.json +++ b/package.json @@ -4,6 +4,7 @@ "description": "", "main": "Tesseract.js", "dependencies": { + "level-js": "^2.1.6", "pako": "^0.2.7" }, "devDependencies": {}, diff --git a/worker_src/worker.js b/worker_src/worker.js index d1b2c87..4e029d6 100644 --- a/worker_src/worker.js +++ b/worker_src/worker.js @@ -1,76 +1,23 @@ importScripts('madeline.js') +var leveljs = require('level-js') +// var levelup = require('levelup') +var db = leveljs('./tessdata', function(){ -var filesizes = { - "afr": 1079573, - "ara": 1701536, - "aze": 1420865, - "bel": 1276820, - "ben": 6772012, - "bul": 1605615, - "cat": 1652368, - "ces": 1035441, - "chi_sim": 17710414, - "chi_tra": 24717749, - "chr": 320649, - "dan-frak": 677656, - "dan": 1972936, - "deu-frak": 822644, - "deu": 991656, - "ell": 859719, - "eng": 9453554, - "enm": 619254, - "epo": 1241212, - "equ": 821130, - "est": 1905040, - "eus": 1641190, - "fin": 979418, - "fra": 1376221, - "frk": 5912963, - "frm": 5147082, - "glg": 1674938, - "grc": 3012615, - "heb": 1051501, - "hin": 6590065, - "hrv": 1926995, - "hun": 3074473, - "ind": 1874776, - "isl": 1634041, - "ita": 948593, - "ita_old": 3436571, - "jpn": 13507168, - "kan": 4390317, - "kor": 5353098, - "lav": 1843944, - "lit": 1779240, - "mal": 5966263, - "meme": 88453, - "mkd": 1163087, - "mlt": 1463001, - "msa": 1665427, - "nld": 1134708, - "nor": 2191610, - "osd": 4274649, - "pol": 7024662, - "por": 909359, - "ron": 915680, - "rus": 5969957, - "slk-frak": 289885, - "slk": 2217342, - "slv": 1611338, - "spa": 883170, - "spa_old": 5647453, - "sqi": 1667041, - "srp": 1770244, - "swa": 757916, - "swe": 2451917, - "tam": 3498763, - "tel": 5795246, - "tgl": 1496256, - "tha": 3811136, - "tur": 3563264, - "ukr": 937566, - "vie": 2195922 -} +}) +// // 2) put a key & value +// db.put('name2', 'LevelUP', function (err) { +// if (err) return console.log('Ooops!', err) // some kind of I/O error + +// // 3) fetch by key +// db.get('name', function (err, value) { +// if (err) return console.log('Ooops!', err) // likely the key was not found +// console.log('my name is' + value) +// }) +// }) + + + +var filesizes = {"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441, "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656, "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719, "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130, "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221, "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615, "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473, "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571, "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944, "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087, "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610, "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680, "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338, "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244, "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246, "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566, "vie": 2195922} var pako = require('pako') @@ -95,32 +42,77 @@ var recognize = (function createTesseractInstance(){ } else{ Module.FS_createPath("/","tessdata",true,true) - var xhr = new XMLHttpRequest(); - xhr.open('GET', 'https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/'+lang+'.traineddata.gz', true); - xhr.responseType = 'arraybuffer'; - xhr.onerror = function(){ cb(xhr, null) } - xhr.onprogress = function(e){ + + var downloadlang = function(shouldcache){ postMessage({ - 'progress': { - 'loaded_lang_model': e.loaded/filesizes[lang] - } + 'progress': lang+' not found in cache, downloading' }) - } - xhr.onload = function(){ - if (xhr.status == 200 || (xhr.status == 0 && xhr.response)) { + var xhr = new XMLHttpRequest(); + xhr.open('GET', 'https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/'+lang+'.traineddata.gz', true); + xhr.responseType = 'arraybuffer'; + xhr.onerror = function(){ cb(xhr, null) } + xhr.onprogress = function(e){ postMessage({ - 'progress': 'unzipping_lang_model' - }) - var data = pako.inflate(new Uint8Array(xhr.response)) - postMessage({ - 'progress': 'unzipped_lang_model' + 'progress': { + 'loaded_lang_model': e.loaded/filesizes[lang] + } }) - Module.FS_createDataFile('tessdata', lang +".traineddata", data, true, false); - loaded_langs.push(lang) - cb(null, lang) - } else cb(xhr, null); + } + xhr.onload = function(){ + if (xhr.status == 200 || (xhr.status == 0 && xhr.response)) { + postMessage({ + 'progress': 'unzipping_lang_model' + }) + + var response = new Uint8Array(xhr.response) + + var data = pako.inflate(response) + postMessage({ + 'progress': 'unzipped_lang_model' + }) + + Module.FS_createDataFile('tessdata', lang +".traineddata", data, true, false); + + if(shouldcache){ + db.put(lang, response, function(err){ + console.log('cached lang') + }) + } + + loaded_langs.push(lang) + + cb(null, lang) + } else cb(xhr, null); + } + xhr.send(null) } - xhr.send(null) + + db.open({compression: false},function(err){ + if (err) { + downloadlang(false) + } + else { + db.get(lang, function (err, value) { + + // err = true + + if (err) { + downloadlang(true) + } + else { + value = pako.inflate(value) + + postMessage({ + 'progress': lang+' found in cache, length '+ value.length + }) + + Module.FS_createDataFile('tessdata', lang +".traineddata", value, true, false); + loaded_langs.push(lang) + cb(null, lang) + } + }) + } + }) } } @@ -305,31 +297,35 @@ var recognize = (function createTesseractInstance(){ if(err){ console.error("error loading", lang); + Module._free(ptr); cb(err, null) } - base.Init(null, lang) - for (var option in options) { - if (options.hasOwnProperty(option)) { - base.SetVariable(option, options[option]); - postMessage({ - progress: { - set_variable: { - variable: option, - value: options[option] + else { + base.Init(null, lang) + for (var option in options) { + if (options.hasOwnProperty(option)) { + base.SetVariable(option, options[option]); + postMessage({ + progress: { + set_variable: { + variable: option, + value: options[option] + } } - } - }) - } - } + }) + } + } + + base.SetImage(Module.wrapPointer(ptr), width, height, 1, width) + base.SetRectangle(0, 0, width, height) + base.GetUTF8Text() + var everything = DumpLiterallyEverything() + base.End(); + Module._free(ptr); + cb(null, everything) - base.SetImage(Module.wrapPointer(ptr), width, height, 1, width) - base.SetRectangle(0, 0, width, height) - base.GetUTF8Text() - var everything = DumpLiterallyEverything() - base.End(); - Module._free(ptr); - cb(null, everything) + } }) } @@ -337,7 +333,6 @@ var recognize = (function createTesseractInstance(){ })() onmessage = function(e) { - recognize(e.data.image, e.data.lang, e.data.options, function(err, result){ postMessage({err:err, result: result}) })