diff --git a/src/common/circularize.js b/src/common/circularize.js index 804bdc2..c68d83a 100644 --- a/src/common/circularize.js +++ b/src/common/circularize.js @@ -3,61 +3,77 @@ // to be sent from a webworker to the main app // or through Node's IPC), but we want // a (circular) DOM-like interface for walking -// through the data. +// through the data. -module.exports = function circularize(page){ - page.paragraphs = [] - page.lines = [] - page.words = [] - page.symbols = [] +module.exports = (iPage) => { + const page = { + ...iPage, + paragraphs: [], + lines: [], + words: [], + symbols: [], + }; - page.blocks.forEach(function(block){ - block.page = page; + page.blocks.forEach((iBlock) => { + const block = { + ...iBlock, + page, + lines: [], + words: [], + symbols: [], + }; - block.lines = [] - block.words = [] - block.symbols = [] + block.paragraphs.forEach((iPara) => { + const para = { + ...iPara, + block, + page, + words: [], + symbols: [], + }; - block.paragraphs.forEach(function(para){ - para.block = block; - para.page = page; + para.lines.forEach((iLine) => { + const line = { + ...iLine, + paragraph: para, + block, + page, + symbols: [], + }; - para.words = [] - para.symbols = [] - - para.lines.forEach(function(line){ - line.paragraph = para; - line.block = block; - line.page = page; + line.words.forEach((iWord) => { + const word = { + ...iWord, + line, + paragraph: para, + block, + page, + }; - line.symbols = [] + word.symbols.forEach((iSym) => { + const sym = { + ...iSym, + word, + line, + paragraph: para, + block, + page, + }; - line.words.forEach(function(word){ - word.line = line; - word.paragraph = para; - word.block = block; - word.page = page; - word.symbols.forEach(function(sym){ - sym.word = word; - sym.line = line; - sym.paragraph = para; - sym.block = block; - sym.page = page; - - sym.line.symbols.push(sym) - sym.paragraph.symbols.push(sym) - sym.block.symbols.push(sym) - sym.page.symbols.push(sym) - }) - word.paragraph.words.push(word) - word.block.words.push(word) - word.page.words.push(word) - }) - line.block.lines.push(line) - line.page.lines.push(line) - }) - para.page.paragraphs.push(para) - }) - }) - return page -} \ No newline at end of file + sym.line.symbols.push(sym); + sym.paragraph.symbols.push(sym); + sym.block.symbols.push(sym); + sym.page.symbols.push(sym); + }); + word.paragraph.words.push(word); + word.block.words.push(word); + word.page.words.push(word); + }); + line.block.lines.push(line); + line.page.lines.push(line); + }); + para.page.paragraphs.push(para); + }); + }); + return page; +}; diff --git a/src/common/desaturate.js b/src/common/desaturate.js index ab83054..0136ab9 100644 --- a/src/common/desaturate.js +++ b/src/common/desaturate.js @@ -1,24 +1,30 @@ +/* eslint-disable no-bitwise */ +/* eslint-disable max-len */ + // This converts an image to grayscale +module.exports = (image) => { + if (image.data) { + const src = image.data; + const { width, height } = image; + const dst = new Uint8Array(width * height); + const srcLength = src.length | 0; + const srcLength16 = (srcLength - 16) | 0; + let i = 0; + let j = 0; -module.exports = function desaturate(image){ - var width, height; - if(image.data){ - var src = image.data; - width = image.width, - height = image.height; - var dst = new Uint8Array(width * height); - var srcLength = src.length | 0, srcLength_16 = (srcLength - 16) | 0; - - for (var i = 0, j = 0; i <= srcLength_16; i += 16, j += 4) { - // convert to grayscale 4 pixels at a time; eveything with alpha gets put in front of 50% gray - dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16 - dst[j+1] = (((src[i+4] * 77 + src[i+5] * 151 + src[i+6] * 28) * src[i+7]) + ((255-src[i+7]) << 15) + 32768) >> 16 - dst[j+2] = (((src[i+8] * 77 + src[i+9] * 151 + src[i+10] * 28) * src[i+11]) + ((255-src[i+11]) << 15) + 32768) >> 16 - dst[j+3] = (((src[i+12] * 77 + src[i+13] * 151 + src[i+14] * 28) * src[i+15]) + ((255-src[i+15]) << 15) + 32768) >> 16 - } - for (; i < srcLength; i += 4, ++j) //finish up - dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16 - image = dst; - } else { throw 'Invalid ImageData' } - return image -} \ No newline at end of file + for (; i <= srcLength16; i += 16, j += 4) { + // convert to grayscale 4 pixels at a time; eveything with alpha gets put in front of 50% gray + dst[j] = (((src[i] * 77 + src[i + 1] * 151 + src[i + 2] * 28) * src[i + 3]) + ((255 - src[i + 3]) << 15) + 32768) >> 16; + dst[j + 1] = (((src[i + 4] * 77 + src[i + 5] * 151 + src[i + 6] * 28) * src[i + 7]) + ((255 - src[i + 7]) << 15) + 32768) >> 16; + dst[j + 2] = (((src[i + 8] * 77 + src[i + 9] * 151 + src[i + 10] * 28) * src[i + 11]) + ((255 - src[i + 11]) << 15) + 32768) >> 16; + dst[j + 3] = (((src[i + 12] * 77 + src[i + 13] * 151 + src[i + 14] * 28) * src[i + 15]) + ((255 - src[i + 15]) << 15) + 32768) >> 16; + } + // finish up + for (; i < srcLength; i += 4, j += 1) { + dst[j] = (((src[i] * 77 + src[i + 1] * 151 + src[i + 2] * 28) * src[i + 3]) + ((255 - src[i + 3]) << 15) + 32768) >> 16; + } + return dst; + } + return null; + // throw { err: 'Invalid ImageData' }; +}; diff --git a/src/common/dump.js b/src/common/dump.js index 9019077..72223d4 100644 --- a/src/common/dump.js +++ b/src/common/dump.js @@ -1,164 +1,158 @@ -module.exports = function DumpLiterallyEverything(Module, base){ - var ri = base.GetIterator(); - var blocks = []; - var block, para, textline, word, symbol; +// the generated HOCR is excessively indented, so +// we get rid of that indentation - function enumToString(value, prefix){ - return (Object.keys(Module) - .filter(function(e){ return e.substr(0, prefix.length + 1) == prefix + '_' }) - .filter(function(e){ return Module[e] === value }) - .map(function(e){ return e.slice(prefix.length + 1) })[0]) +const deindent = (html) => { + const lines = html.split('\n'); + if (lines[0].substring(0, 2) === ' ') { + for (let i = 0; i < lines.length; i += 1) { + if (lines[i].substring(0, 2) === ' ') { + lines[i] = lines[i].slice(2); + } } - - ri.Begin() - do { - if(ri.IsAtBeginningOf(Module.RIL_BLOCK)){ - var poly = ri.BlockPolygon(); - var polygon = null; - // BlockPolygon() returns null when automatic page segmentation is off - if(Module.getPointer(poly) > 0){ - var n = poly.get_n(), - px = poly.get_x(), - py = poly.get_y(), - polygon = []; - for(var i = 0; i < n; i++){ - polygon.push([px.getValue(i), py.getValue(i)]); - } - Module._ptaDestroy(Module.getPointer(poly)); - } - - block = { - paragraphs: [], - - text: ri.GetUTF8Text(Module.RIL_BLOCK), - confidence: ri.Confidence(Module.RIL_BLOCK), - baseline: ri.getBaseline(Module.RIL_BLOCK), - bbox: ri.getBoundingBox(Module.RIL_BLOCK), - - blocktype: enumToString(ri.BlockType(), 'PT'), - polygon: polygon - } - blocks.push(block) - } - if(ri.IsAtBeginningOf(Module.RIL_PARA)){ - para = { - lines: [], - - text: ri.GetUTF8Text(Module.RIL_PARA), - confidence: ri.Confidence(Module.RIL_PARA), - baseline: ri.getBaseline(Module.RIL_PARA), - bbox: ri.getBoundingBox(Module.RIL_PARA), - - is_ltr: !!ri.ParagraphIsLtr() - } - block.paragraphs.push(para) - } - if(ri.IsAtBeginningOf(Module.RIL_TEXTLINE)){ - textline = { - words: [], - - text: ri.GetUTF8Text(Module.RIL_TEXTLINE), - confidence: ri.Confidence(Module.RIL_TEXTLINE), - baseline: ri.getBaseline(Module.RIL_TEXTLINE), - bbox: ri.getBoundingBox(Module.RIL_TEXTLINE) - } - para.lines.push(textline) - } - if(ri.IsAtBeginningOf(Module.RIL_WORD)){ - var fontInfo = ri.getWordFontAttributes(), - wordDir = ri.WordDirection(); - word = { - symbols: [], - choices: [], - - text: ri.GetUTF8Text(Module.RIL_WORD), - confidence: ri.Confidence(Module.RIL_WORD), - baseline: ri.getBaseline(Module.RIL_WORD), - bbox: ri.getBoundingBox(Module.RIL_WORD), - - is_numeric: !!ri.WordIsNumeric(), - in_dictionary: !!ri.WordIsFromDictionary(), - direction: enumToString(wordDir, 'DIR'), - language: ri.WordRecognitionLanguage(), - - is_bold: fontInfo.is_bold, - is_italic: fontInfo.is_italic, - is_underlined: fontInfo.is_underlined, - is_monospace: fontInfo.is_monospace, - is_serif: fontInfo.is_serif, - is_smallcaps: fontInfo.is_smallcaps, - font_size: fontInfo.pointsize, - font_id: fontInfo.font_id, - font_name: fontInfo.font_name, - } - var wc = new Module.WordChoiceIterator(ri); - do { - word.choices.push({ - text: wc.GetUTF8Text(), - confidence: wc.Confidence() - }) - } while (wc.Next()); - Module.destroy(wc) - textline.words.push(word) + } + return lines.join('\n'); +}; + +module.exports = (Module, base) => { + const ri = base.GetIterator(); + const blocks = []; + let block; + let para; + let textline; + let word; + let symbol; + + const enumToString = (value, prefix) => ( + Object.keys(Module) + .filter(e => (e.substr(0, prefix.length + 1) === `${prefix}_`)) + .filter(e => Module[e] === value) + .map(e => e.slice(prefix.length + 1))[0] + ); + + ri.Begin(); + do { + if (ri.IsAtBeginningOf(Module.RIL_BLOCK)) { + const poly = ri.BlockPolygon(); + let polygon = null; + // BlockPolygon() returns null when automatic page segmentation is off + if (Module.getPointer(poly) > 0) { + const n = poly.get_n(); + const px = poly.get_x(); + const py = poly.get_y(); + polygon = []; + for (let i = 0; i < n; i += 1) { + polygon.push([px.getValue(i), py.getValue(i)]); } - - var image = null; - // var pix = ri.GetBinaryImage(Module.RIL_SYMBOL) - // var image = pix2array(pix); - // // for some reason it seems that things stop working if you destroy pics - // Module._pixDestroy(Module.getPointer(pix)); - if(ri.IsAtBeginningOf(Module.RIL_SYMBOL)){ - symbol = { - choices: [], - image: image, - - text: ri.GetUTF8Text(Module.RIL_SYMBOL), - confidence: ri.Confidence(Module.RIL_SYMBOL), - baseline: ri.getBaseline(Module.RIL_SYMBOL), - bbox: ri.getBoundingBox(Module.RIL_SYMBOL), - - is_superscript: !!ri.SymbolIsSuperscript(), - is_subscript: !!ri.SymbolIsSubscript(), - is_dropcap: !!ri.SymbolIsDropcap(), - } - word.symbols.push(symbol) - var ci = new Module.ChoiceIterator(ri); - do { - symbol.choices.push({ - text: ci.GetUTF8Text(), - confidence: ci.Confidence() - }) - } while (ci.Next()); - Module.destroy(ci) - } - } while (ri.Next(Module.RIL_SYMBOL)); - Module.destroy(ri) - - return { - text: base.GetUTF8Text(), - html: deindent(base.GetHOCRText()), - - confidence: base.MeanTextConf(), - - blocks: blocks, - - psm: enumToString(base.GetPageSegMode(), 'PSM'), - oem: enumToString(base.oem(), 'OEM'), - version: base.Version(), + Module._ptaDestroy(Module.getPointer(poly)); + } + + block = { + paragraphs: [], + text: ri.GetUTF8Text(Module.RIL_BLOCK), + confidence: ri.Confidence(Module.RIL_BLOCK), + baseline: ri.getBaseline(Module.RIL_BLOCK), + bbox: ri.getBoundingBox(Module.RIL_BLOCK), + blocktype: enumToString(ri.BlockType(), 'PT'), + polygon, + }; + blocks.push(block); + } + if (ri.IsAtBeginningOf(Module.RIL_PARA)) { + para = { + lines: [], + text: ri.GetUTF8Text(Module.RIL_PARA), + confidence: ri.Confidence(Module.RIL_PARA), + baseline: ri.getBaseline(Module.RIL_PARA), + bbox: ri.getBoundingBox(Module.RIL_PARA), + is_ltr: !!ri.ParagraphIsLtr(), + }; + block.paragraphs.push(para); + } + if (ri.IsAtBeginningOf(Module.RIL_TEXTLINE)) { + textline = { + words: [], + text: ri.GetUTF8Text(Module.RIL_TEXTLINE), + confidence: ri.Confidence(Module.RIL_TEXTLINE), + baseline: ri.getBaseline(Module.RIL_TEXTLINE), + bbox: ri.getBoundingBox(Module.RIL_TEXTLINE), + }; + para.lines.push(textline); + } + if (ri.IsAtBeginningOf(Module.RIL_WORD)) { + const fontInfo = ri.getWordFontAttributes(); + const wordDir = ri.WordDirection(); + word = { + symbols: [], + choices: [], + + text: ri.GetUTF8Text(Module.RIL_WORD), + confidence: ri.Confidence(Module.RIL_WORD), + baseline: ri.getBaseline(Module.RIL_WORD), + bbox: ri.getBoundingBox(Module.RIL_WORD), + + is_numeric: !!ri.WordIsNumeric(), + in_dictionary: !!ri.WordIsFromDictionary(), + direction: enumToString(wordDir, 'DIR'), + language: ri.WordRecognitionLanguage(), + + is_bold: fontInfo.is_bold, + is_italic: fontInfo.is_italic, + is_underlined: fontInfo.is_underlined, + is_monospace: fontInfo.is_monospace, + is_serif: fontInfo.is_serif, + is_smallcaps: fontInfo.is_smallcaps, + font_size: fontInfo.pointsize, + font_id: fontInfo.font_id, + font_name: fontInfo.font_name, + }; + const wc = new Module.WordChoiceIterator(ri); + do { + word.choices.push({ + text: wc.GetUTF8Text(), + confidence: wc.Confidence(), + }); + } while (wc.Next()); + Module.destroy(wc); + textline.words.push(word); } -} - -// the generated HOCR is excessively indented, so -// we get rid of that indentation -function deindent(html){ - var lines = html.split('\n') - if(lines[0].substring(0, 2) === " "){ - for (var i = 0; i < lines.length; i++) { - if (lines[i].substring(0,2) === " ") { - lines[i] = lines[i].slice(2) - } - }; + // let image = null; + // var pix = ri.GetBinaryImage(Module.RIL_SYMBOL) + // var image = pix2array(pix); + // // for some reason it seems that things stop working if you destroy pics + // Module._pixDestroy(Module.getPointer(pix)); + if (ri.IsAtBeginningOf(Module.RIL_SYMBOL)) { + symbol = { + choices: [], + image: null, + text: ri.GetUTF8Text(Module.RIL_SYMBOL), + confidence: ri.Confidence(Module.RIL_SYMBOL), + baseline: ri.getBaseline(Module.RIL_SYMBOL), + bbox: ri.getBoundingBox(Module.RIL_SYMBOL), + is_superscript: !!ri.SymbolIsSuperscript(), + is_subscript: !!ri.SymbolIsSubscript(), + is_dropcap: !!ri.SymbolIsDropcap(), + }; + word.symbols.push(symbol); + const ci = new Module.ChoiceIterator(ri); + do { + symbol.choices.push({ + text: ci.GetUTF8Text(), + confidence: ci.Confidence(), + }); + } while (ci.Next()); + // Module.destroy(i); } - return lines.join('\n') -} + } while (ri.Next(Module.RIL_SYMBOL)); + Module.destroy(ri); + + return { + text: base.GetUTF8Text(), + html: deindent(base.GetHOCRText()), + confidence: base.MeanTextConf(), + blocks, + psm: enumToString(base.GetPageSegMode(), 'PSM'), + oem: enumToString(base.oem(), 'OEM'), + version: base.Version(), + }; +}; diff --git a/src/common/job.js b/src/common/job.js index d867468..d691fad 100644 --- a/src/common/job.js +++ b/src/common/job.js @@ -1,81 +1,86 @@ -const adapter = require('../node/index.js') +const adapter = require('../node/'); let jobCounter = 0; module.exports = class TesseractJob { - constructor(instance){ - this.id = 'Job-' + (++jobCounter) + '-' + Math.random().toString(16).slice(3, 8) + constructor(instance) { + jobCounter += 1; + this.id = `Job-${jobCounter}-${Math.random().toString(16).slice(3, 8)}`; - this._instance = instance; - this._resolve = [] - this._reject = [] - this._progress = [] - this._finally = [] + this._instance = instance; + this._resolve = []; + this._reject = []; + this._progress = []; + this._finally = []; + } + + then(resolve, reject) { + if (this._resolve.push) { + this._resolve.push(resolve); + } else { + resolve(this._resolve); } - then(resolve, reject){ - if(this._resolve.push){ - this._resolve.push(resolve) - }else{ - resolve(this._resolve) - } + if (reject) this.catch(reject); + return this; + } - if(reject) this.catch(reject); - return this; - } - catch(reject){ - if(this._reject.push){ - this._reject.push(reject) - }else{ - reject(this._reject) - } - return this; - } - progress(fn){ - this._progress.push(fn) - return this; - } - finally(fn) { - this._finally.push(fn) - return this; - } - _send(action, payload){ - adapter.sendPacket(this._instance, { - jobId: this.id, - action: action, - payload: payload - }) + catch(reject) { + if (this._reject.push) { + this._reject.push(reject); + } else { + reject(this._reject); } + return this; + } - _handle(packet){ - var data = packet.data; - let runFinallyCbs = false; + progress(fn) { + this._progress.push(fn); + return this; + } - if(packet.status === 'resolve'){ - if(this._resolve.length === 0) console.log(data); - this._resolve.forEach(fn => { - var ret = fn(data); - if(ret && typeof ret.then == 'function'){ - console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.') - } - }) - this._resolve = data; - this._instance._dequeue() - runFinallyCbs = true; - }else if(packet.status === 'reject'){ - if(this._reject.length === 0) console.error(data); - this._reject.forEach(fn => fn(data)) - this._reject = data; - this._instance._dequeue() - runFinallyCbs = true; - }else if(packet.status === 'progress'){ - this._progress.forEach(fn => fn(data)) - }else{ - console.warn('Message type unknown', packet.status) - } + finally(fn) { + this._finally.push(fn); + return this; + } + + _send(action, payload) { + adapter.sendPacket(this._instance, { + jobId: this.id, + action, + payload, + }); + } - if (runFinallyCbs) { - this._finally.forEach(fn => fn(data)); + _handle(packet) { + const { data } = packet; + let runFinallyCbs = false; + + if (packet.status === 'resolve') { + if (this._resolve.length === 0) console.log(data); + this._resolve.forEach((fn) => { + const ret = fn(data); + if (ret && typeof ret.then === 'function') { + console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.'); } + }); + this._resolve = data; + this._instance._dequeue(); + runFinallyCbs = true; + } else if (packet.status === 'reject') { + if (this._reject.length === 0) console.error(data); + this._reject.forEach(fn => fn(data)); + this._reject = data; + this._instance._dequeue(); + runFinallyCbs = true; + } else if (packet.status === 'progress') { + this._progress.forEach(fn => fn(data)); + } else { + console.warn('Message type unknown', packet.status); + } + + if (runFinallyCbs) { + this._finally.forEach(fn => fn(data)); } -} + } +};