|
|
@ -7,6 +7,8 @@ |
|
|
|
* @author Guillermo Webster <gui@mit.edu> |
|
|
|
* @author Guillermo Webster <gui@mit.edu> |
|
|
|
* @author Jerome Wu <jeromewus@gmail.com> |
|
|
|
* @author Jerome Wu <jeromewus@gmail.com> |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
|
|
|
|
const arrayBufferToBase64 = require('./arrayBufferToBase64'); |
|
|
|
|
|
|
|
const imageType = require('../../constants/imageType'); |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
/** |
|
|
|
* deindent |
|
|
|
* deindent |
|
|
@ -37,13 +39,7 @@ const deindent = (html) => { |
|
|
|
* @function dump recognition result to a JSON object |
|
|
|
* @function dump recognition result to a JSON object |
|
|
|
* @access public |
|
|
|
* @access public |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
module.exports = (TessModule, api, { |
|
|
|
module.exports = (TessModule, api, output, options) => { |
|
|
|
tessjs_create_hocr, |
|
|
|
|
|
|
|
tessjs_create_tsv, |
|
|
|
|
|
|
|
tessjs_create_box, |
|
|
|
|
|
|
|
tessjs_create_unlv, |
|
|
|
|
|
|
|
tessjs_create_osd, |
|
|
|
|
|
|
|
}) => { |
|
|
|
|
|
|
|
const ri = api.GetIterator(); |
|
|
|
const ri = api.GetIterator(); |
|
|
|
const { |
|
|
|
const { |
|
|
|
RIL_BLOCK, |
|
|
|
RIL_BLOCK, |
|
|
@ -65,135 +61,161 @@ module.exports = (TessModule, api, { |
|
|
|
.map((e) => e.slice(prefix.length + 1))[0] |
|
|
|
.map((e) => e.slice(prefix.length + 1))[0] |
|
|
|
); |
|
|
|
); |
|
|
|
|
|
|
|
|
|
|
|
ri.Begin(); |
|
|
|
const getImage = (type) => { |
|
|
|
do { |
|
|
|
api.WriteImage(type, '/image.png'); |
|
|
|
if (ri.IsAtBeginningOf(RIL_BLOCK)) { |
|
|
|
const pngBuffer = TessModule.FS.readFile('/image.png'); |
|
|
|
const poly = ri.BlockPolygon(); |
|
|
|
const pngStr = `data:image/png;base64,${arrayBufferToBase64(pngBuffer.buffer)}`; |
|
|
|
let polygon = null; |
|
|
|
TessModule.FS.unlink('/image.png'); |
|
|
|
// BlockPolygon() returns null when automatic page segmentation is off
|
|
|
|
return pngStr; |
|
|
|
if (TessModule.getPointer(poly) > 0) { |
|
|
|
}; |
|
|
|
const n = poly.get_n(); |
|
|
|
|
|
|
|
const px = poly.get_x(); |
|
|
|
const getPDFInternal = (title, textonly) => { |
|
|
|
const py = poly.get_y(); |
|
|
|
const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly); |
|
|
|
polygon = []; |
|
|
|
pdfRenderer.BeginDocument(title); |
|
|
|
for (let i = 0; i < n; i += 1) { |
|
|
|
pdfRenderer.AddImage(api); |
|
|
|
polygon.push([px.getValue(i), py.getValue(i)]); |
|
|
|
pdfRenderer.EndDocument(); |
|
|
|
|
|
|
|
TessModule._free(pdfRenderer); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return TessModule.FS.readFile('/tesseract-ocr.pdf'); |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (output.blocks) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ri.Begin(); |
|
|
|
|
|
|
|
do { |
|
|
|
|
|
|
|
if (ri.IsAtBeginningOf(RIL_BLOCK)) { |
|
|
|
|
|
|
|
const poly = ri.BlockPolygon(); |
|
|
|
|
|
|
|
let polygon = null; |
|
|
|
|
|
|
|
// BlockPolygon() returns null when automatic page segmentation is off
|
|
|
|
|
|
|
|
if (TessModule.getPointer(poly) > 0) { |
|
|
|
|
|
|
|
const n = poly.get_n(); |
|
|
|
|
|
|
|
const px = poly.get_x(); |
|
|
|
|
|
|
|
const py = poly.get_y(); |
|
|
|
|
|
|
|
polygon = []; |
|
|
|
|
|
|
|
for (let i = 0; i < n; i += 1) { |
|
|
|
|
|
|
|
polygon.push([px.getValue(i), py.getValue(i)]); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
/* |
|
|
|
|
|
|
|
* TODO: find out why _ptaDestroy doesn't work |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
// TessModule._ptaDestroy(TessModule.getPointer(poly));
|
|
|
|
} |
|
|
|
} |
|
|
|
/* |
|
|
|
|
|
|
|
* TODO: find out why _ptaDestroy doesn't work |
|
|
|
block = { |
|
|
|
*/ |
|
|
|
paragraphs: [], |
|
|
|
// TessModule._ptaDestroy(TessModule.getPointer(poly));
|
|
|
|
text: ri.GetUTF8Text(RIL_BLOCK), |
|
|
|
|
|
|
|
confidence: ri.Confidence(RIL_BLOCK), |
|
|
|
|
|
|
|
baseline: ri.getBaseline(RIL_BLOCK), |
|
|
|
|
|
|
|
bbox: ri.getBoundingBox(RIL_BLOCK), |
|
|
|
|
|
|
|
blocktype: enumToString(ri.BlockType(), 'PT'), |
|
|
|
|
|
|
|
polygon, |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
blocks.push(block); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if (ri.IsAtBeginningOf(RIL_PARA)) { |
|
|
|
|
|
|
|
para = { |
|
|
|
|
|
|
|
lines: [], |
|
|
|
|
|
|
|
text: ri.GetUTF8Text(RIL_PARA), |
|
|
|
|
|
|
|
confidence: ri.Confidence(RIL_PARA), |
|
|
|
|
|
|
|
baseline: ri.getBaseline(RIL_PARA), |
|
|
|
|
|
|
|
bbox: ri.getBoundingBox(RIL_PARA), |
|
|
|
|
|
|
|
is_ltr: !!ri.ParagraphIsLtr(), |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
block.paragraphs.push(para); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { |
|
|
|
|
|
|
|
textline = { |
|
|
|
|
|
|
|
words: [], |
|
|
|
|
|
|
|
text: ri.GetUTF8Text(RIL_TEXTLINE), |
|
|
|
|
|
|
|
confidence: ri.Confidence(RIL_TEXTLINE), |
|
|
|
|
|
|
|
baseline: ri.getBaseline(RIL_TEXTLINE), |
|
|
|
|
|
|
|
bbox: ri.getBoundingBox(RIL_TEXTLINE), |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
para.lines.push(textline); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if (ri.IsAtBeginningOf(RIL_WORD)) { |
|
|
|
|
|
|
|
const fontInfo = ri.getWordFontAttributes(); |
|
|
|
|
|
|
|
const wordDir = ri.WordDirection(); |
|
|
|
|
|
|
|
word = { |
|
|
|
|
|
|
|
symbols: [], |
|
|
|
|
|
|
|
choices: [], |
|
|
|
|
|
|
|
|
|
|
|
block = { |
|
|
|
text: ri.GetUTF8Text(RIL_WORD), |
|
|
|
paragraphs: [], |
|
|
|
confidence: ri.Confidence(RIL_WORD), |
|
|
|
text: ri.GetUTF8Text(RIL_BLOCK), |
|
|
|
baseline: ri.getBaseline(RIL_WORD), |
|
|
|
confidence: ri.Confidence(RIL_BLOCK), |
|
|
|
bbox: ri.getBoundingBox(RIL_WORD), |
|
|
|
baseline: ri.getBaseline(RIL_BLOCK), |
|
|
|
|
|
|
|
bbox: ri.getBoundingBox(RIL_BLOCK), |
|
|
|
|
|
|
|
blocktype: enumToString(ri.BlockType(), 'PT'), |
|
|
|
|
|
|
|
polygon, |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
blocks.push(block); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if (ri.IsAtBeginningOf(RIL_PARA)) { |
|
|
|
|
|
|
|
para = { |
|
|
|
|
|
|
|
lines: [], |
|
|
|
|
|
|
|
text: ri.GetUTF8Text(RIL_PARA), |
|
|
|
|
|
|
|
confidence: ri.Confidence(RIL_PARA), |
|
|
|
|
|
|
|
baseline: ri.getBaseline(RIL_PARA), |
|
|
|
|
|
|
|
bbox: ri.getBoundingBox(RIL_PARA), |
|
|
|
|
|
|
|
is_ltr: !!ri.ParagraphIsLtr(), |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
block.paragraphs.push(para); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { |
|
|
|
|
|
|
|
textline = { |
|
|
|
|
|
|
|
words: [], |
|
|
|
|
|
|
|
text: ri.GetUTF8Text(RIL_TEXTLINE), |
|
|
|
|
|
|
|
confidence: ri.Confidence(RIL_TEXTLINE), |
|
|
|
|
|
|
|
baseline: ri.getBaseline(RIL_TEXTLINE), |
|
|
|
|
|
|
|
bbox: ri.getBoundingBox(RIL_TEXTLINE), |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
para.lines.push(textline); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if (ri.IsAtBeginningOf(RIL_WORD)) { |
|
|
|
|
|
|
|
const fontInfo = ri.getWordFontAttributes(); |
|
|
|
|
|
|
|
const wordDir = ri.WordDirection(); |
|
|
|
|
|
|
|
word = { |
|
|
|
|
|
|
|
symbols: [], |
|
|
|
|
|
|
|
choices: [], |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
text: ri.GetUTF8Text(RIL_WORD), |
|
|
|
is_numeric: !!ri.WordIsNumeric(), |
|
|
|
confidence: ri.Confidence(RIL_WORD), |
|
|
|
in_dictionary: !!ri.WordIsFromDictionary(), |
|
|
|
baseline: ri.getBaseline(RIL_WORD), |
|
|
|
direction: enumToString(wordDir, 'DIR'), |
|
|
|
bbox: ri.getBoundingBox(RIL_WORD), |
|
|
|
language: ri.WordRecognitionLanguage(), |
|
|
|
|
|
|
|
|
|
|
|
is_numeric: !!ri.WordIsNumeric(), |
|
|
|
is_bold: fontInfo.is_bold, |
|
|
|
in_dictionary: !!ri.WordIsFromDictionary(), |
|
|
|
is_italic: fontInfo.is_italic, |
|
|
|
direction: enumToString(wordDir, 'DIR'), |
|
|
|
is_underlined: fontInfo.is_underlined, |
|
|
|
language: ri.WordRecognitionLanguage(), |
|
|
|
is_monospace: fontInfo.is_monospace, |
|
|
|
|
|
|
|
is_serif: fontInfo.is_serif, |
|
|
|
|
|
|
|
is_smallcaps: fontInfo.is_smallcaps, |
|
|
|
|
|
|
|
font_size: fontInfo.pointsize, |
|
|
|
|
|
|
|
font_id: fontInfo.font_id, |
|
|
|
|
|
|
|
font_name: fontInfo.font_name, |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
const wc = new TessModule.WordChoiceIterator(ri); |
|
|
|
|
|
|
|
do { |
|
|
|
|
|
|
|
word.choices.push({ |
|
|
|
|
|
|
|
text: wc.GetUTF8Text(), |
|
|
|
|
|
|
|
confidence: wc.Confidence(), |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
} while (wc.Next()); |
|
|
|
|
|
|
|
TessModule.destroy(wc); |
|
|
|
|
|
|
|
textline.words.push(word); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
is_bold: fontInfo.is_bold, |
|
|
|
// let image = null;
|
|
|
|
is_italic: fontInfo.is_italic, |
|
|
|
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
|
|
|
|
is_underlined: fontInfo.is_underlined, |
|
|
|
// var image = pix2array(pix);
|
|
|
|
is_monospace: fontInfo.is_monospace, |
|
|
|
// // for some reason it seems that things stop working if you destroy pics
|
|
|
|
is_serif: fontInfo.is_serif, |
|
|
|
// TessModule._pixDestroy(TessModule.getPointer(pix));
|
|
|
|
is_smallcaps: fontInfo.is_smallcaps, |
|
|
|
if (ri.IsAtBeginningOf(RIL_SYMBOL)) { |
|
|
|
font_size: fontInfo.pointsize, |
|
|
|
symbol = { |
|
|
|
font_id: fontInfo.font_id, |
|
|
|
choices: [], |
|
|
|
font_name: fontInfo.font_name, |
|
|
|
image: null, |
|
|
|
}; |
|
|
|
text: ri.GetUTF8Text(RIL_SYMBOL), |
|
|
|
const wc = new TessModule.WordChoiceIterator(ri); |
|
|
|
confidence: ri.Confidence(RIL_SYMBOL), |
|
|
|
do { |
|
|
|
baseline: ri.getBaseline(RIL_SYMBOL), |
|
|
|
word.choices.push({ |
|
|
|
bbox: ri.getBoundingBox(RIL_SYMBOL), |
|
|
|
text: wc.GetUTF8Text(), |
|
|
|
is_superscript: !!ri.SymbolIsSuperscript(), |
|
|
|
confidence: wc.Confidence(), |
|
|
|
is_subscript: !!ri.SymbolIsSubscript(), |
|
|
|
}); |
|
|
|
is_dropcap: !!ri.SymbolIsDropcap(), |
|
|
|
} while (wc.Next()); |
|
|
|
}; |
|
|
|
TessModule.destroy(wc); |
|
|
|
word.symbols.push(symbol); |
|
|
|
textline.words.push(word); |
|
|
|
const ci = new TessModule.ChoiceIterator(ri); |
|
|
|
} |
|
|
|
do { |
|
|
|
|
|
|
|
symbol.choices.push({ |
|
|
|
|
|
|
|
text: ci.GetUTF8Text(), |
|
|
|
|
|
|
|
confidence: ci.Confidence(), |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
} while (ci.Next()); |
|
|
|
|
|
|
|
// TessModule.destroy(i);
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} while (ri.Next(RIL_SYMBOL)); |
|
|
|
|
|
|
|
TessModule.destroy(ri); |
|
|
|
|
|
|
|
|
|
|
|
// let image = null;
|
|
|
|
} |
|
|
|
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
|
|
|
|
|
|
|
|
// var image = pix2array(pix);
|
|
|
|
|
|
|
|
// // for some reason it seems that things stop working if you destroy pics
|
|
|
|
|
|
|
|
// TessModule._pixDestroy(TessModule.getPointer(pix));
|
|
|
|
|
|
|
|
if (ri.IsAtBeginningOf(RIL_SYMBOL)) { |
|
|
|
|
|
|
|
symbol = { |
|
|
|
|
|
|
|
choices: [], |
|
|
|
|
|
|
|
image: null, |
|
|
|
|
|
|
|
text: ri.GetUTF8Text(RIL_SYMBOL), |
|
|
|
|
|
|
|
confidence: ri.Confidence(RIL_SYMBOL), |
|
|
|
|
|
|
|
baseline: ri.getBaseline(RIL_SYMBOL), |
|
|
|
|
|
|
|
bbox: ri.getBoundingBox(RIL_SYMBOL), |
|
|
|
|
|
|
|
is_superscript: !!ri.SymbolIsSuperscript(), |
|
|
|
|
|
|
|
is_subscript: !!ri.SymbolIsSubscript(), |
|
|
|
|
|
|
|
is_dropcap: !!ri.SymbolIsDropcap(), |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
word.symbols.push(symbol); |
|
|
|
|
|
|
|
const ci = new TessModule.ChoiceIterator(ri); |
|
|
|
|
|
|
|
do { |
|
|
|
|
|
|
|
symbol.choices.push({ |
|
|
|
|
|
|
|
text: ci.GetUTF8Text(), |
|
|
|
|
|
|
|
confidence: ci.Confidence(), |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
} while (ci.Next()); |
|
|
|
|
|
|
|
// TessModule.destroy(i);
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} while (ri.Next(RIL_SYMBOL)); |
|
|
|
|
|
|
|
TessModule.destroy(ri); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return { |
|
|
|
return { |
|
|
|
text: api.GetUTF8Text(), |
|
|
|
text: output.text ? api.GetUTF8Text() : null, |
|
|
|
hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, |
|
|
|
hocr: output.hocr ? deindent(api.GetHOCRText()) : null, |
|
|
|
tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null, |
|
|
|
tsv: output.tsv ? api.GetTSVText() : null, |
|
|
|
box: tessjs_create_box === '1' ? api.GetBoxText() : null, |
|
|
|
box: output.box ? api.GetBoxText() : null, |
|
|
|
unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null, |
|
|
|
unlv: output.unlv ? api.GetUNLVText() : null, |
|
|
|
osd: tessjs_create_osd === '1' ? api.GetOsdText() : null, |
|
|
|
osd: output.osd ? api.GetOsdText() : null, |
|
|
|
|
|
|
|
pdf: output.pdf ? getPDFInternal(options.pdfTitle ?? 'Tesseract OCR Result', options.pdfTextOnly ?? false) : null, |
|
|
|
|
|
|
|
imageColor: output.imageColor ? getImage(imageType.COLOR) : null, |
|
|
|
|
|
|
|
imageGrey: output.imageColor ? getImage(imageType.GREY) : null, |
|
|
|
|
|
|
|
imageBinary: output.imageColor ? getImage(imageType.BINARY) : null, |
|
|
|
confidence: api.MeanTextConf(), |
|
|
|
confidence: api.MeanTextConf(), |
|
|
|
blocks, |
|
|
|
blocks: output.blocks ? blocks : null, |
|
|
|
psm: enumToString(api.GetPageSegMode(), 'PSM'), |
|
|
|
psm: enumToString(api.GetPageSegMode(), 'PSM'), |
|
|
|
oem: enumToString(api.oem(), 'OEM'), |
|
|
|
oem: enumToString(api.oem(), 'OEM'), |
|
|
|
version: api.Version(), |
|
|
|
version: api.Version(), |
|
|
|