Pure Javascript OCR for more than 100 Languages 📖🎉🖥
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

202 lines
5.6 KiB

/**
*
* Dump data to a big JSON tree
*
* @fileoverview dump data to JSON tree
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
/**
 * deindent
 *
 * The generated HOCR is excessively indented, so we get rid of that
 * indentation. When the first line starts with two spaces, one two-space
 * level is removed from every line that has it; otherwise the input is
 * returned unchanged.
 *
 * @name deindent
 * @function deindent string
 * @access public
 * @param {string} html - HOCR markup, lines separated by '\n'
 * @returns {string} the markup with one two-space indent level removed
 */
const deindent = (html) => {
  const indent = '  ';
  const lines = html.split('\n');
  // The first line decides whether the document as a whole is indented.
  if (!lines[0].startsWith(indent)) {
    return html;
  }
  return lines
    .map((line) => (line.startsWith(indent) ? line.slice(indent.length) : line))
    .join('\n');
};
/**
* dump
*
* @name dump
* @function dump recognition result to a JSON object
* @access public
*/
module.exports = (TessModule, api, {
tessjs_create_hocr,
tessjs_create_tsv,
tessjs_create_box,
tessjs_create_unlv,
tessjs_create_osd,
}) => {
const ri = api.GetIterator();
const {
RIL_BLOCK,
RIL_PARA,
RIL_TEXTLINE,
RIL_WORD,
RIL_SYMBOL,
} = TessModule;
7 years ago
const blocks = [];
let block;
let para;
let textline;
let word;
let symbol;
const enumToString = (value, prefix) => (
Object.keys(TessModule)
5 years ago
.filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
.map((e) => e.slice(prefix.length + 1))[0]
7 years ago
);
ri.Begin();
do {
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
7 years ago
const poly = ri.BlockPolygon();
let polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if (TessModule.getPointer(poly) > 0) {
7 years ago
const n = poly.get_n();
const px = poly.get_x();
const py = poly.get_y();
polygon = [];
for (let i = 0; i < n; i += 1) {
polygon.push([px.getValue(i), py.getValue(i)]);
}
6 years ago
/*
* TODO: find out why _ptaDestroy doesn't work
*/
// TessModule._ptaDestroy(TessModule.getPointer(poly));
7 years ago
}
block = {
paragraphs: [],
text: ri.GetUTF8Text(RIL_BLOCK),
confidence: ri.Confidence(RIL_BLOCK),
baseline: ri.getBaseline(RIL_BLOCK),
bbox: ri.getBoundingBox(RIL_BLOCK),
7 years ago
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon,
};
blocks.push(block);
}
if (ri.IsAtBeginningOf(RIL_PARA)) {
7 years ago
para = {
lines: [],
text: ri.GetUTF8Text(RIL_PARA),
confidence: ri.Confidence(RIL_PARA),
baseline: ri.getBaseline(RIL_PARA),
bbox: ri.getBoundingBox(RIL_PARA),
7 years ago
is_ltr: !!ri.ParagraphIsLtr(),
};
block.paragraphs.push(para);
}
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
7 years ago
textline = {
words: [],
text: ri.GetUTF8Text(RIL_TEXTLINE),
confidence: ri.Confidence(RIL_TEXTLINE),
baseline: ri.getBaseline(RIL_TEXTLINE),
bbox: ri.getBoundingBox(RIL_TEXTLINE),
7 years ago
};
para.lines.push(textline);
}
if (ri.IsAtBeginningOf(RIL_WORD)) {
7 years ago
const fontInfo = ri.getWordFontAttributes();
const wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
text: ri.GetUTF8Text(RIL_WORD),
confidence: ri.Confidence(RIL_WORD),
baseline: ri.getBaseline(RIL_WORD),
bbox: ri.getBoundingBox(RIL_WORD),
7 years ago
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),
is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
};
const wc = new TessModule.WordChoiceIterator(ri);
7 years ago
do {
word.choices.push({
text: wc.GetUTF8Text(),
confidence: wc.Confidence(),
});
} while (wc.Next());
TessModule.destroy(wc);
7 years ago
textline.words.push(word);
}
9 years ago
7 years ago
// let image = null;
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
7 years ago
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// TessModule._pixDestroy(TessModule.getPointer(pix));
if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
7 years ago
symbol = {
choices: [],
image: null,
text: ri.GetUTF8Text(RIL_SYMBOL),
confidence: ri.Confidence(RIL_SYMBOL),
baseline: ri.getBaseline(RIL_SYMBOL),
bbox: ri.getBoundingBox(RIL_SYMBOL),
7 years ago
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
};
word.symbols.push(symbol);
const ci = new TessModule.ChoiceIterator(ri);
7 years ago
do {
symbol.choices.push({
text: ci.GetUTF8Text(),
confidence: ci.Confidence(),
});
} while (ci.Next());
// TessModule.destroy(i);
9 years ago
}
} while (ri.Next(RIL_SYMBOL));
TessModule.destroy(ri);
7 years ago
return {
text: api.GetUTF8Text(),
hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null,
tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null,
box: tessjs_create_box === '1' ? api.GetBoxText() : null,
unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null,
osd: tessjs_create_osd === '1' ? api.GetOsdText() : null,
confidence: api.MeanTextConf(),
7 years ago
blocks,
psm: enumToString(api.GetPageSegMode(), 'PSM'),
oem: enumToString(api.oem(), 'OEM'),
version: api.Version(),
7 years ago
};
};