Pure Javascript OCR for more than 100 Languages 📖🎉🖥
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

55 lines
1.4 KiB

/**
* In the recognition result of tesseract, there
* is a deep JSON object holding the details.
*
* The result of dump.js is a big JSON tree
* which can be easily serialized (for instance
* to be sent from a webworker to the main app
* or through Node's IPC), but we want
* a (circular) DOM-like interface for walking
* through the data.
*
* @fileoverview DOM-like interface for walking through data
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
module.exports = (page) => {
const blocks = [];
const paragraphs = [];
const lines = [];
const words = [];
const symbols = [];
page.blocks.forEach((block) => {
block.paragraphs.forEach((paragraph) => {
paragraph.lines.forEach((line) => {
line.words.forEach((word) => {
word.symbols.forEach((sym) => {
symbols.push({
...sym, page, block, paragraph, line, word,
});
});
words.push({
...word, page, block, paragraph, line,
});
});
lines.push({
...line, page, block, paragraph,
});
});
paragraphs.push({
...paragraph, page, block,
});
});
blocks.push({
...block, page,
});
});
return {
...page, blocks, paragraphs, lines, words, symbols,
};
};