From 71fbe3d8eaa3a55396b91527c10d7626e0f3682f Mon Sep 17 00:00:00 2001 From: Guillermo Date: Tue, 14 Jul 2015 19:12:48 -0700 Subject: [PATCH] started working oden --- lib/Tesseract.js | 17 +++++++++++++---- worker_src/worker.js | 36 ++++++++++++++++++++++++++++-------- 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/lib/Tesseract.js b/lib/Tesseract.js index f826872..1904345 100644 --- a/lib/Tesseract.js +++ b/lib/Tesseract.js @@ -2,11 +2,14 @@ var Tesseract = (function(){ var Tesseract = {} - var blob = new Blob(["importScripts('http://localhost:1234/master/worker/worker.js');"]); + //https://cdn.rawgit.com/naptha/tesseract.js/master/worker/worker.js + //https://rawgit.com/naptha/tesseract.js/master/worker/worker.js for testing + + var blob = new Blob(["importScripts('https://cdn.rawgit.com/naptha/tessdata/worker/worker.js');"]); console.log('localhost') var worker = new Worker(window.URL.createObjectURL(blob)); - - console.log(worker) + worker.postMessage({init: {mem: 16777216*6}}) + var bigworker = false var index = 0 var handlers = [] @@ -77,10 +80,16 @@ var Tesseract = (function(){ Tesseract.recognize = function(image, options, callback){ var lang = options.lang - if(typeof lang === "undefined"){ + if (typeof lang === "undefined"){ lang = 'eng' } + if (!bigworker && ['chi_sim', 'chi_tra', 'jpn'].indexOf(lang) != -1){ + worker.postMessage({init: {mem: 16777216*10}}) + bigworker = true + console.log('started big worker') + } + if (typeof options === 'string') { lang = options options = {} diff --git a/worker_src/worker.js b/worker_src/worker.js index 5f99f2c..b773139 100644 --- a/worker_src/worker.js +++ b/worker_src/worker.js @@ -1,3 +1,4 @@ +var Tesseract304 = require('tesseract') var leveljs = require('level-js') var db; if (typeof indexedDB === 'undefined'){ @@ -13,12 +14,14 @@ var filesizes = {"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, var pako = require('pako') -var T = (function createTesseractInstance(){ +var T; + +var tesseractinit = (function createTesseractInstance(memory){ curindex = 0 var Module = Tesseract304({ - TOTAL_MEMORY: 6*16777216, //must be a multiple of 10 megabytes + TOTAL_MEMORY: memory, //must be a multiple of 10 megabytes TesseractProgress: function(percent){ postMessage({ index: curindex, @@ -341,7 +344,7 @@ var T = (function createTesseractInstance(){ return { text: base.GetUTF8Text(), - html: base.GetHOCRText(), + html: deindent(base.GetHOCRText()), confidence: base.MeanTextConf(), @@ -353,6 +356,18 @@ var T = (function createTesseractInstance(){ } } + function deindent(html){ + var lines = html.split('\n') + if(lines[0].substring(0,2) === " "){ + for (var i = 0; i < lines.length; i++) { + if (lines[i].substring(0,2) === " ") { + lines[i] = lines[i].slice(2) + } + }; + } + return lines.join('\n') + } + function desaturate(image){ var width, height; if(image.data){ @@ -413,8 +428,9 @@ var T = (function createTesseractInstance(){ if (options.hasOwnProperty(option)) { base.SetVariable(option, options[option]); postMessage({ - progress: { - set_variable: { + index: index, + 'progress': { + 'set_variable': { variable: option, value: options[option] } @@ -485,7 +501,7 @@ var T = (function createTesseractInstance(){ orientation_degrees: [0, 270, 180, 90][oid], orientation_confidence: best.get_oconfidence() }) - + base.End(); Module._free(ptr); } @@ -497,10 +513,14 @@ var T = (function createTesseractInstance(){ recognize: recognize, detect: detect } -})() +}) onmessage = function(e) { - if(e.data.fun === 'recognize'){ + + if(e.data.init){ + T = tesseractinit(e.data.init.mem) + } + else if(e.data.fun === 'recognize'){ T.recognize(e.data.index, e.data.image, e.data.lang, e.data.options, function(err, result){ postMessage({index: e.data.index, err:err, result: result}) })