|
|
|
@ -1,6 +1,25 @@
@@ -1,6 +1,25 @@
|
|
|
|
|
// the generated HOCR is excessively indented, so
|
|
|
|
|
// we get rid of that indentation
|
|
|
|
|
/** |
|
|
|
|
* |
|
|
|
|
* Dump data to a big JSON tree |
|
|
|
|
* |
|
|
|
|
* @fileoverview dump data to JSON tree |
|
|
|
|
* @author Kevin Kwok <antimatter15@gmail.com> |
|
|
|
|
* @author Guillermo Webster <gui@mit.edu> |
|
|
|
|
* @author Jerome Wu <jeromewus@gmail.com> |
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* deindent |
|
|
|
|
* |
|
|
|
|
* The generated HOCR is excessively indented, so |
|
|
|
|
* we get rid of that indentation |
|
|
|
|
* |
|
|
|
|
* @name deindent |
|
|
|
|
* @function deindent string |
|
|
|
|
* @access public |
|
|
|
|
* @param {string} html HOCR in html format |
|
|
|
|
* @returns {string} deindented html string |
|
|
|
|
*/ |
|
|
|
|
const deindent = (html) => { |
|
|
|
|
const lines = html.split('\n'); |
|
|
|
|
if (lines[0].substring(0, 2) === ' ') { |
|
|
|
@ -13,8 +32,18 @@ const deindent = (html) => {
@@ -13,8 +32,18 @@ const deindent = (html) => {
|
|
|
|
|
return lines.join('\n'); |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
module.exports = (Module, base) => { |
|
|
|
|
const ri = base.GetIterator(); |
|
|
|
|
/** |
|
|
|
|
* dump |
|
|
|
|
* |
|
|
|
|
* @name dump |
|
|
|
|
* @function dump recognition result to a JSON object |
|
|
|
|
* @access public |
|
|
|
|
* @param {object} TessModule TessModule from TesseractCore |
|
|
|
|
* @param {object} api TesseractBaseAPI instance |
|
|
|
|
* @returns {object} dumped JSON object |
|
|
|
|
*/ |
|
|
|
|
module.exports = (TessModule, api) => { |
|
|
|
|
const ri = api.GetIterator(); |
|
|
|
|
const blocks = []; |
|
|
|
|
let block; |
|
|
|
|
let para; |
|
|
|
@ -23,19 +52,19 @@ module.exports = (Module, base) => {
@@ -23,19 +52,19 @@ module.exports = (Module, base) => {
|
|
|
|
|
let symbol; |
|
|
|
|
|
|
|
|
|
const enumToString = (value, prefix) => ( |
|
|
|
|
Object.keys(Module) |
|
|
|
|
Object.keys(TessModule) |
|
|
|
|
.filter(e => (e.substr(0, prefix.length + 1) === `${prefix}_`)) |
|
|
|
|
.filter(e => Module[e] === value) |
|
|
|
|
.filter(e => TessModule[e] === value) |
|
|
|
|
.map(e => e.slice(prefix.length + 1))[0] |
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
ri.Begin(); |
|
|
|
|
do { |
|
|
|
|
if (ri.IsAtBeginningOf(Module.RIL_BLOCK)) { |
|
|
|
|
if (ri.IsAtBeginningOf(TessModule.RIL_BLOCK)) { |
|
|
|
|
const poly = ri.BlockPolygon(); |
|
|
|
|
let polygon = null; |
|
|
|
|
// BlockPolygon() returns null when automatic page segmentation is off
|
|
|
|
|
if (Module.getPointer(poly) > 0) { |
|
|
|
|
if (TessModule.getPointer(poly) > 0) { |
|
|
|
|
const n = poly.get_n(); |
|
|
|
|
const px = poly.get_x(); |
|
|
|
|
const py = poly.get_y(); |
|
|
|
@ -43,52 +72,52 @@ module.exports = (Module, base) => {
@@ -43,52 +72,52 @@ module.exports = (Module, base) => {
|
|
|
|
|
for (let i = 0; i < n; i += 1) { |
|
|
|
|
polygon.push([px.getValue(i), py.getValue(i)]); |
|
|
|
|
} |
|
|
|
|
Module._ptaDestroy(Module.getPointer(poly)); |
|
|
|
|
TessModule._ptaDestroy(TessModule.getPointer(poly)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
block = { |
|
|
|
|
paragraphs: [], |
|
|
|
|
text: ri.GetUTF8Text(Module.RIL_BLOCK), |
|
|
|
|
confidence: ri.Confidence(Module.RIL_BLOCK), |
|
|
|
|
baseline: ri.getBaseline(Module.RIL_BLOCK), |
|
|
|
|
bbox: ri.getBoundingBox(Module.RIL_BLOCK), |
|
|
|
|
text: ri.GetUTF8Text(TessModule.RIL_BLOCK), |
|
|
|
|
confidence: ri.Confidence(TessModule.RIL_BLOCK), |
|
|
|
|
baseline: ri.getBaseline(TessModule.RIL_BLOCK), |
|
|
|
|
bbox: ri.getBoundingBox(TessModule.RIL_BLOCK), |
|
|
|
|
blocktype: enumToString(ri.BlockType(), 'PT'), |
|
|
|
|
polygon, |
|
|
|
|
}; |
|
|
|
|
blocks.push(block); |
|
|
|
|
} |
|
|
|
|
if (ri.IsAtBeginningOf(Module.RIL_PARA)) { |
|
|
|
|
if (ri.IsAtBeginningOf(TessModule.RIL_PARA)) { |
|
|
|
|
para = { |
|
|
|
|
lines: [], |
|
|
|
|
text: ri.GetUTF8Text(Module.RIL_PARA), |
|
|
|
|
confidence: ri.Confidence(Module.RIL_PARA), |
|
|
|
|
baseline: ri.getBaseline(Module.RIL_PARA), |
|
|
|
|
bbox: ri.getBoundingBox(Module.RIL_PARA), |
|
|
|
|
text: ri.GetUTF8Text(TessModule.RIL_PARA), |
|
|
|
|
confidence: ri.Confidence(TessModule.RIL_PARA), |
|
|
|
|
baseline: ri.getBaseline(TessModule.RIL_PARA), |
|
|
|
|
bbox: ri.getBoundingBox(TessModule.RIL_PARA), |
|
|
|
|
is_ltr: !!ri.ParagraphIsLtr(), |
|
|
|
|
}; |
|
|
|
|
block.paragraphs.push(para); |
|
|
|
|
} |
|
|
|
|
if (ri.IsAtBeginningOf(Module.RIL_TEXTLINE)) { |
|
|
|
|
if (ri.IsAtBeginningOf(TessModule.RIL_TEXTLINE)) { |
|
|
|
|
textline = { |
|
|
|
|
words: [], |
|
|
|
|
text: ri.GetUTF8Text(Module.RIL_TEXTLINE), |
|
|
|
|
confidence: ri.Confidence(Module.RIL_TEXTLINE), |
|
|
|
|
baseline: ri.getBaseline(Module.RIL_TEXTLINE), |
|
|
|
|
bbox: ri.getBoundingBox(Module.RIL_TEXTLINE), |
|
|
|
|
text: ri.GetUTF8Text(TessModule.RIL_TEXTLINE), |
|
|
|
|
confidence: ri.Confidence(TessModule.RIL_TEXTLINE), |
|
|
|
|
baseline: ri.getBaseline(TessModule.RIL_TEXTLINE), |
|
|
|
|
bbox: ri.getBoundingBox(TessModule.RIL_TEXTLINE), |
|
|
|
|
}; |
|
|
|
|
para.lines.push(textline); |
|
|
|
|
} |
|
|
|
|
if (ri.IsAtBeginningOf(Module.RIL_WORD)) { |
|
|
|
|
if (ri.IsAtBeginningOf(TessModule.RIL_WORD)) { |
|
|
|
|
const fontInfo = ri.getWordFontAttributes(); |
|
|
|
|
const wordDir = ri.WordDirection(); |
|
|
|
|
word = { |
|
|
|
|
symbols: [], |
|
|
|
|
choices: [], |
|
|
|
|
|
|
|
|
|
text: ri.GetUTF8Text(Module.RIL_WORD), |
|
|
|
|
confidence: ri.Confidence(Module.RIL_WORD), |
|
|
|
|
baseline: ri.getBaseline(Module.RIL_WORD), |
|
|
|
|
bbox: ri.getBoundingBox(Module.RIL_WORD), |
|
|
|
|
text: ri.GetUTF8Text(TessModule.RIL_WORD), |
|
|
|
|
confidence: ri.Confidence(TessModule.RIL_WORD), |
|
|
|
|
baseline: ri.getBaseline(TessModule.RIL_WORD), |
|
|
|
|
bbox: ri.getBoundingBox(TessModule.RIL_WORD), |
|
|
|
|
|
|
|
|
|
is_numeric: !!ri.WordIsNumeric(), |
|
|
|
|
in_dictionary: !!ri.WordIsFromDictionary(), |
|
|
|
@ -105,54 +134,54 @@ module.exports = (Module, base) => {
@@ -105,54 +134,54 @@ module.exports = (Module, base) => {
|
|
|
|
|
font_id: fontInfo.font_id, |
|
|
|
|
font_name: fontInfo.font_name, |
|
|
|
|
}; |
|
|
|
|
const wc = new Module.WordChoiceIterator(ri); |
|
|
|
|
const wc = new TessModule.WordChoiceIterator(ri); |
|
|
|
|
do { |
|
|
|
|
word.choices.push({ |
|
|
|
|
text: wc.GetUTF8Text(), |
|
|
|
|
confidence: wc.Confidence(), |
|
|
|
|
}); |
|
|
|
|
} while (wc.Next()); |
|
|
|
|
Module.destroy(wc); |
|
|
|
|
TessModule.destroy(wc); |
|
|
|
|
textline.words.push(word); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// let image = null;
|
|
|
|
|
// var pix = ri.GetBinaryImage(Module.RIL_SYMBOL)
|
|
|
|
|
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
|
|
|
|
|
// var image = pix2array(pix);
|
|
|
|
|
// // for some reason it seems that things stop working if you destroy pix
|
|
|
|
|
// Module._pixDestroy(Module.getPointer(pix));
|
|
|
|
|
if (ri.IsAtBeginningOf(Module.RIL_SYMBOL)) { |
|
|
|
|
// TessModule._pixDestroy(TessModule.getPointer(pix));
|
|
|
|
|
if (ri.IsAtBeginningOf(TessModule.RIL_SYMBOL)) { |
|
|
|
|
symbol = { |
|
|
|
|
choices: [], |
|
|
|
|
image: null, |
|
|
|
|
text: ri.GetUTF8Text(Module.RIL_SYMBOL), |
|
|
|
|
confidence: ri.Confidence(Module.RIL_SYMBOL), |
|
|
|
|
baseline: ri.getBaseline(Module.RIL_SYMBOL), |
|
|
|
|
bbox: ri.getBoundingBox(Module.RIL_SYMBOL), |
|
|
|
|
text: ri.GetUTF8Text(TessModule.RIL_SYMBOL), |
|
|
|
|
confidence: ri.Confidence(TessModule.RIL_SYMBOL), |
|
|
|
|
baseline: ri.getBaseline(TessModule.RIL_SYMBOL), |
|
|
|
|
bbox: ri.getBoundingBox(TessModule.RIL_SYMBOL), |
|
|
|
|
is_superscript: !!ri.SymbolIsSuperscript(), |
|
|
|
|
is_subscript: !!ri.SymbolIsSubscript(), |
|
|
|
|
is_dropcap: !!ri.SymbolIsDropcap(), |
|
|
|
|
}; |
|
|
|
|
word.symbols.push(symbol); |
|
|
|
|
const ci = new Module.ChoiceIterator(ri); |
|
|
|
|
const ci = new TessModule.ChoiceIterator(ri); |
|
|
|
|
do { |
|
|
|
|
symbol.choices.push({ |
|
|
|
|
text: ci.GetUTF8Text(), |
|
|
|
|
confidence: ci.Confidence(), |
|
|
|
|
}); |
|
|
|
|
} while (ci.Next()); |
|
|
|
|
// Module.destroy(i);
|
|
|
|
|
// TessModule.destroy(i);
|
|
|
|
|
} |
|
|
|
|
} while (ri.Next(Module.RIL_SYMBOL)); |
|
|
|
|
Module.destroy(ri); |
|
|
|
|
} while (ri.Next(TessModule.RIL_SYMBOL)); |
|
|
|
|
TessModule.destroy(ri); |
|
|
|
|
|
|
|
|
|
return { |
|
|
|
|
text: base.GetUTF8Text(), |
|
|
|
|
html: deindent(base.GetHOCRText()), |
|
|
|
|
confidence: base.MeanTextConf(), |
|
|
|
|
text: api.GetUTF8Text(), |
|
|
|
|
html: deindent(api.GetHOCRText()), |
|
|
|
|
confidence: api.MeanTextConf(), |
|
|
|
|
blocks, |
|
|
|
|
psm: enumToString(base.GetPageSegMode(), 'PSM'), |
|
|
|
|
oem: enumToString(base.oem(), 'OEM'), |
|
|
|
|
version: base.Version(), |
|
|
|
|
psm: enumToString(api.GetPageSegMode(), 'PSM'), |
|
|
|
|
oem: enumToString(api.oem(), 'OEM'), |
|
|
|
|
version: api.Version(), |
|
|
|
|
}; |
|
|
|
|
}; |
|
|
|
|