Compare commits
14 Commits
master
...
support/1.
Author | SHA1 | Date |
---|---|---|
Jerome Wu | aba237af2e | 6 years ago |
Jerome Wu | a26566be04 | 6 years ago |
Jerome Wu | 55e355eff5 | 6 years ago |
Jerome Wu | 1f497271b5 | 6 years ago |
jeromewu | 1a12ead46f | 6 years ago |
Urs Wolfer | 5c930514f5 | 6 years ago |
jeromewu | 9268572644 | 6 years ago |
HoldYourWaffle | 7911518b39 | 6 years ago |
Jerome Wu | 613a19c7e1 | 6 years ago |
Jerome Wu | 07ea31a9cd | 6 years ago |
Jerome Wu | 741ff413b3 | 6 years ago |
Jerome Wu | cdb86c694a | 6 years ago |
Jerome Wu | 06d32c6804 | 6 years ago |
Jerome Wu | 8e1b21cd2c | 6 years ago |
@ -1,17 +0,0 @@
@@ -1,17 +0,0 @@
|
||||
{ |
||||
"extends": "airbnb-base", |
||||
"parser": "babel-eslint", |
||||
"env": { |
||||
"browser": true, |
||||
"node": true, |
||||
"mocha": true, |
||||
"worker": true |
||||
}, |
||||
"rules": { |
||||
"no-underscore-dangle": 0, |
||||
"no-console": 0, |
||||
"global-require": 0, |
||||
"camelcase": 0, |
||||
"no-control-regex": 0 |
||||
} |
||||
} |
@ -1,9 +0,0 @@
@@ -1,9 +0,0 @@
|
||||
# These are supported funding model platforms |
||||
|
||||
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] |
||||
patreon: # Replace with a single Patreon username |
||||
open_collective: tesseractjs |
||||
ko_fi: # Replace with a single Ko-fi username |
||||
tidelift: npm/tesseract.js |
||||
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry |
||||
custom: ["https://etherscan.io/address/0x74ace8c74535d6dac03ebdc708ca2fba54796ef2"] |
@ -1,38 +0,0 @@
@@ -1,38 +0,0 @@
|
||||
--- |
||||
name: Bug report |
||||
about: Create a report to help us improve |
||||
title: '' |
||||
labels: '' |
||||
assignees: '' |
||||
|
||||
--- |
||||
|
||||
**Describe the bug** |
||||
A clear and concise description of what the bug is. |
||||
|
||||
**To Reproduce** |
||||
Steps to reproduce the behavior: |
||||
1. Go to '...' |
||||
2. Click on '....' |
||||
3. Scroll down to '....' |
||||
4. See error |
||||
|
||||
**Expected behavior** |
||||
A clear and concise description of what you expected to happen. |
||||
|
||||
**Screenshots** |
||||
If applicable, add screenshots to help explain your problem. |
||||
|
||||
**Desktop (please complete the following information):** |
||||
- OS: [e.g. iOS] |
||||
- Browser [e.g. chrome, safari] |
||||
- Version [e.g. 22] |
||||
|
||||
**Smartphone (please complete the following information):** |
||||
- Device: [e.g. iPhone6] |
||||
- OS: [e.g. iOS8.1] |
||||
- Browser [e.g. stock browser, safari] |
||||
- Version [e.g. 22] |
||||
|
||||
**Additional context** |
||||
Add any other context about the problem here. |
@ -1,20 +0,0 @@
@@ -1,20 +0,0 @@
|
||||
--- |
||||
name: Feature request |
||||
about: Suggest an idea for this project |
||||
title: '' |
||||
labels: '' |
||||
assignees: '' |
||||
|
||||
--- |
||||
|
||||
**Is your feature request related to a problem? Please describe.** |
||||
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] |
||||
|
||||
**Describe the solution you'd like** |
||||
A clear and concise description of what you want to happen. |
||||
|
||||
**Describe alternatives you've considered** |
||||
A clear and concise description of any alternative solutions or features you've considered. |
||||
|
||||
**Additional context** |
||||
Add any other context or screenshots about the feature request here. |
@ -1,5 +0,0 @@
@@ -1,5 +0,0 @@
|
||||
## Security contact information |
||||
|
||||
To report a security vulnerability, please use the |
||||
[Tidelift security contact](https://tidelift.com/security). |
||||
Tidelift will coordinate the fix and disclosure. |
@ -1,29 +0,0 @@
@@ -1,29 +0,0 @@
|
||||
# This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node |
||||
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions |
||||
|
||||
name: Node.js CI |
||||
|
||||
on: |
||||
push: |
||||
branches: [ master ] |
||||
pull_request: |
||||
branches: [ master ] |
||||
|
||||
jobs: |
||||
build: |
||||
|
||||
runs-on: ubuntu-latest |
||||
|
||||
strategy: |
||||
matrix: |
||||
node-version: [14.x, 16.x] |
||||
|
||||
steps: |
||||
- uses: actions/checkout@v2 |
||||
- name: Use Node.js ${{ matrix.node-version }} |
||||
uses: actions/setup-node@v1 |
||||
with: |
||||
node-version: ${{ matrix.node-version }} |
||||
- run: npm ci |
||||
- run: npm run lint |
||||
- run: npm test |
@ -1,2 +0,0 @@
@@ -1,2 +0,0 @@
|
||||
FROM gitpod/workspace-full |
||||
RUN sudo apt-get update && sudo apt-get install -y libgtk-3-0 libx11-xcb1 libnss3 libxss1 libasound2 |
@ -1,9 +0,0 @@
@@ -1,9 +0,0 @@
|
||||
image: |
||||
file: .gitpod.Dockerfile |
||||
tasks: |
||||
- command: gp await-port 3000 && sleep 3 && gp preview $(gp url 3000)/examples/browser/demo.html |
||||
- init: npm install |
||||
command: npm start |
||||
ports: |
||||
- port: 3000 |
||||
onOpen: ignore |
@ -1,197 +1,303 @@
@@ -1,197 +1,303 @@
|
||||
<p align="center"> |
||||
<a href="https://tesseract.projectnaptha.com/"><img width="256px" height="256px" alt="Tesseract.js" src="./docs/images/tesseract.png"></a> |
||||
</p> |
||||
|
||||
![Lint & Test](https://github.com/naptha/tesseract.js/workflows/Node.js%20CI/badge.svg) |
||||
![CodeQL](https://github.com/naptha/tesseract.js/workflows/CodeQL/badge.svg) |
||||
[![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://github.com/naptha/tesseract.js) |
||||
[![Financial Contributors on Open Collective](https://opencollective.com/tesseractjs/all/badge.svg?label=financial+contributors)](https://opencollective.com/tesseractjs) [![npm version](https://badge.fury.io/js/tesseract.js.svg)](https://badge.fury.io/js/tesseract.js) |
||||
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/naptha/tesseract.js/graphs/commit-activity) |
||||
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) |
||||
[![Code Style](https://badgen.net/badge/code%20style/airbnb/ff5a5f?icon=airbnb)](https://github.com/airbnb/javascript) |
||||
[![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js) |
||||
[![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js) |
||||
# [Tesseract.js](http://tesseract.projectnaptha.com/) |
||||
|
||||
Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/)) |
||||
|
||||
Image Recognition |
||||
[![NPM version][tesseractjs-npm-image]][tesseractjs-npm-url] |
||||
|
||||
[tesseractjs-npm-image]: https://img.shields.io/npm/v/tesseract.js.svg |
||||
[tesseractjs-npm-url]: https://npmjs.org/package/tesseract.js |
||||
|
||||
**Tesseract.js v2 alpha is now available!! Check [HERE](https://github.com/naptha/tesseract.js) for more information.** |
||||
|
||||
[![fancy demo gif](./docs/images/demo.gif)](http://tesseract.projectnaptha.com) |
||||
Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/)) |
||||
|
||||
[![fancy demo gif](https://raw.githubusercontent.com/naptha/tesseract.js/support/1.x/docs/images/demo.gif)](http://tesseract.projectnaptha.com) |
||||
|
||||
Video Real-time Recognition |
||||
Tesseract.js works with script tags, [webpack](https://webpack.js.org/)/[Browserify](http://browserify.org/), and [Node.js](https://nodejs.org/en/). [After you install it](#installation), using it is as simple as |
||||
|
||||
<p align="center"> |
||||
<a href="https://github.com/jeromewu/tesseract.js-video"><img alt="Tesseract.js Video" src="./docs/images/video-demo.gif"></a> |
||||
</p> |
||||
```javascript |
||||
Tesseract.recognize(myImage) |
||||
.progress(function (p) { console.log('progress', p) }) |
||||
.then(function (result) { console.log('result', result) }) |
||||
``` |
||||
|
||||
[Check out the docs](#docs) for a full treatment of the API. |
||||
|
||||
## Provenance |
||||
Tesseract.js wraps an [emscripten](https://github.com/kripken/emscripten) [port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) [OCR](https://en.wikipedia.org/wiki/Optical_character_recognition) Engine. |
||||
It works in the browser using [webpack](https://webpack.js.org/) or plain script tags with a [CDN](#CDN) and on the server with [Node.js](https://nodejs.org/en/). |
||||
After you [install it](#installation), using it is as simple as: |
||||
|
||||
|
||||
# Installation |
||||
|
||||
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack and Browserify via `npm`, and on Node.js via `npm`. [Check out the docs](#docs) for a full treatment of the API. |
||||
|
||||
## <script /> |
||||
|
||||
You can simply include Tesseract.js with a CDN like this: |
||||
```html |
||||
<script src='https://cdn.jsdelivr.net/gh/naptha/tesseract.js@v1.0.14/dist/tesseract.min.js'></script> |
||||
``` |
||||
|
||||
After including your scripts, the `Tesseract` variable will be defined globally! |
||||
|
||||
## Dependency |
||||
First: |
||||
```shell |
||||
> yarn add tesseract.js |
||||
``` |
||||
or |
||||
``` |
||||
> npm install tesseract.js --save |
||||
``` |
||||
> Note: Tesseract.js currently requires Node.js v6.8.0 or higher. |
||||
|
||||
|
||||
## Usage |
||||
```javascript |
||||
var Tesseract = require('tesseract.js') |
||||
``` |
||||
|
||||
or |
||||
```javascript |
||||
import Tesseract from 'tesseract.js' |
||||
``` |
||||
|
||||
|
||||
# Docs |
||||
|
||||
* [Tesseract.recognize](#tesseractrecognizeimage-imagelike-options---tesseractjob) |
||||
+ [Simple Example](#simple-example) |
||||
+ [More Complicated Example](#more-complicated-example) |
||||
* [Tesseract.detect](#tesseractdetectimage-imagelike---tesseractjob) |
||||
* [ImageLike](#imagelike) |
||||
* [TesseractJob](#tesseractjob) |
||||
+ [TesseractJob.progress](#tesseractjobprogresscallback-function---tesseractjob) |
||||
+ [TesseractJob.then](#tesseractjobthencallback-function---tesseractjob) |
||||
+ [TesseractJob.catch](#tesseractjobcatchcallback-function---tesseractjob) |
||||
+ [TesseractJob.finally](#tesseractjobfinallycallback-function---tesseractjob) |
||||
* [Local Installation](#local-installation) |
||||
+ [corePath](#corepath) |
||||
+ [workerPath](#workerpath) |
||||
+ [langPath](#langpath) |
||||
* [Contributing](#contributing) |
||||
+ [Development](#development) |
||||
+ [Building Static Files](#building-static-files) |
||||
+ [Send us a Pull Request!](#send-us-a-pull-request) |
||||
|
||||
|
||||
## Tesseract.recognize(image: [ImageLike](#imagelike)[, options]) -> [TesseractJob](#tesseractjob) |
||||
Figures out what words are in `image`, where the words are in `image`, etc. |
||||
> Note: `image` should be sufficiently high resolution. |
||||
> Often, the same image will get much better results if you upscale it before calling `recognize`. |
||||
|
||||
- `image` is any [ImageLike](#imagelike) object. |
||||
- `options` is either absent (in which case it is interpreted as `'eng'`), a string specifying a language short code from the [language list](./docs/tesseract_lang_list.md), or a flat json object that may: |
||||
+ include properties that override some subset of the [default tesseract parameters](./docs/tesseract_parameters.md) |
||||
+ include a `lang` property with a value from the [list of lang parameters](./docs/tesseract_lang_list.md) |
||||
|
||||
Returns a [TesseractJob](#tesseractjob) whose `then`, `progress`, `catch` and `finally` methods can be used to act on the result. |
||||
|
||||
### Simple Example: |
||||
```javascript |
||||
import Tesseract from 'tesseract.js'; |
||||
|
||||
Tesseract.recognize( |
||||
'https://tesseract.projectnaptha.com/img/eng_bw.png', |
||||
'eng', |
||||
{ logger: m => console.log(m) } |
||||
).then(({ data: { text } }) => { |
||||
console.log(text); |
||||
Tesseract.recognize(myImage) |
||||
.then(function(result){ |
||||
console.log(result) |
||||
}) |
||||
``` |
||||
|
||||
### More Complicated Example: |
||||
```javascript |
||||
// if we know our image is of spanish words without the letter 'e': |
||||
Tesseract.recognize(myImage, { |
||||
lang: 'spa', |
||||
tessedit_char_blacklist: 'e' |
||||
}) |
||||
.then(function(result){ |
||||
console.log(result) |
||||
}) |
||||
``` |
||||
|
||||
Or more imperative |
||||
|
||||
|
||||
|
||||
## Tesseract.detect(image: [ImageLike](#imagelike)) -> [TesseractJob](#tesseractjob) |
||||
|
||||
Figures out what script (e.g. 'Latin', 'Chinese') the words in image are written in. |
||||
|
||||
- `image` is any [ImageLike](#imagelike) object. |
||||
|
||||
Returns a [TesseractJob](#tesseractjob) whose `then`, `progress`, `catch` and `finally` methods can be used to act on the result of the script. |
||||
|
||||
|
||||
```javascript |
||||
import { createWorker } from 'tesseract.js'; |
||||
|
||||
const worker = createWorker({ |
||||
logger: m => console.log(m) |
||||
}); |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); |
||||
console.log(text); |
||||
await worker.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
[Check out the docs](#documentation) for a full explanation of the API. |
||||
|
||||
## Major changes in v3 |
||||
- Significantly faster performance |
||||
- Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data) |
||||
- Upgrade to Tesseract v5.1.0 (using emscripten 3.1.18) |
||||
- Added SIMD-enabled build for supported devices |
||||
- Added support: |
||||
- Node.js version 18 |
||||
- Removed support: |
||||
- ASM.js version, any other old versions of Tesseract.js-core (<3.0.0) |
||||
- Node.js versions 10 and 12 |
||||
|
||||
## Major changes in v2 |
||||
- Upgrade to tesseract v4.1.1 (using emscripten 1.39.10 upstream) |
||||
- Support multiple languages at the same time, eg: eng+chi\_tra for English and Traditional Chinese |
||||
- Supported image formats: png, jpg, bmp, pbm |
||||
- Support WebAssembly (fallback to ASM.js when browser doesn't support) |
||||
- Support Typescript |
||||
|
||||
Read a story about v2: <a href="https://jeromewu.github.io/why-i-refactor-tesseract.js-v2/">Why I refactor tesseract.js v2?</a><br> |
||||
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1 |
||||
## Installation |
||||
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`. |
||||
|
||||
|
||||
### CDN |
||||
```html |
||||
<!-- v2 --> |
||||
<script src='https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js'></script> |
||||
Tesseract.detect(myImage) |
||||
.then(function(result){ |
||||
console.log(result) |
||||
}) |
||||
``` |
||||
|
||||
<!-- v1 --> |
||||
<script src='https://unpkg.com/tesseract.js@1.0.19/src/index.js'></script> |
||||
|
||||
## ImageLike |
||||
|
||||
The main Tesseract.js functions take an `image` parameter, which should be something that is like an image. What's considered "image-like" differs depending on whether it is being run from the browser or through NodeJS. |
||||
|
||||
|
||||
On a browser, an image can be: |
||||
- an `img`, `video`, or `canvas` element |
||||
- a CanvasRenderingContext2D (returned by `canvas.getContext('2d')`) |
||||
- a `File` object (from a file `<input>` or drag-drop event) |
||||
- a `Blob` object |
||||
- a `ImageData` instance (an object containing `width`, `height` and `data` properties) |
||||
- a path or URL to an accessible image (the image must either be hosted locally or accessible by CORS) |
||||
|
||||
|
||||
|
||||
|
||||
In Node.js, an image can be |
||||
- a path to a local image |
||||
- a `Buffer` instance containing a `PNG` or `JPEG` image |
||||
- a `ImageData` instance (an object containing `width`, `height` and `data` properties) |
||||
|
||||
|
||||
## TesseractJob |
||||
|
||||
A TesseractJob is an object returned by a call to `recognize` or `detect`. It's inspired by the ES6 Promise interface and provides `then` and `catch` methods. It also provides a `finally` method, which will be fired regardless of the job's fate. One important difference is that these methods return the job itself (to enable chaining) rather than a new object. |
||||
|
||||
Typical use is: |
||||
```javascript |
||||
Tesseract.recognize(myImage) |
||||
.progress(message => console.log(message)) |
||||
.catch(err => console.error(err)) |
||||
.then(result => console.log(result)) |
||||
.finally(resultOrError => console.log(resultOrError)) |
||||
``` |
||||
After including the script the `Tesseract` variable will be globally available. |
||||
|
||||
Which is equivalent to: |
||||
```javascript |
||||
var job1 = Tesseract.recognize(myImage); |
||||
|
||||
### Node.js |
||||
job1.progress(message => console.log(message)); |
||||
|
||||
**Tesseract.js v3 requires Node.js v14 or higher** |
||||
job1.catch(err => console.error(err)); |
||||
|
||||
```shell |
||||
# For v3 |
||||
npm install tesseract.js |
||||
yarn add tesseract.js |
||||
job1.then(result => console.log(result)); |
||||
|
||||
# For v2 |
||||
npm install tesseract.js@2 |
||||
yarn add tesseract.js@2 |
||||
job1.finally(resultOrError => console.log(resultOrError)); |
||||
``` |
||||
|
||||
|
||||
## Documentation |
||||
|
||||
* [Examples](./docs/examples.md) |
||||
* [Image Format](./docs/image-format.md) |
||||
* [API](./docs/api.md) |
||||
* [Local Installation](./docs/local-installation.md) |
||||
* [FAQ](./docs/faq.md) |
||||
### TesseractJob.progress(callback: function) -> TesseractJob |
||||
Sets `callback` as the function that will be called every time the job progresses. |
||||
- `callback` is a function with the signature `callback(progress)` where `progress` is a json object. |
||||
|
||||
## Use tesseract.js the way you like! |
||||
For example: |
||||
```javascript |
||||
Tesseract.recognize(myImage) |
||||
.progress(function(message){console.log('progress is: ', message)}) |
||||
``` |
||||
|
||||
- Offline Version: https://github.com/jeromewu/tesseract.js-offline |
||||
- Electron Version: https://github.com/jeromewu/tesseract.js-electron |
||||
- Custom Traineddata: https://github.com/jeromewu/tesseract.js-custom-traineddata |
||||
- Chrome Extension #1: https://github.com/jeromewu/tesseract.js-chrome-extension |
||||
- Chrome Extension #2: https://github.com/fxnoob/image-to-text |
||||
- Firefox Extension: https://github.com/gnonio/korporize |
||||
- With Vue: https://github.com/jeromewu/tesseract.js-vue-app |
||||
- With Angular: https://github.com/jeromewu/tesseract.js-angular-app |
||||
- With React: https://github.com/jeromewu/tesseract.js-react-app |
||||
- Typescript: https://github.com/jeromewu/tesseract.js-typescript |
||||
- Video Real-time Recognition: https://github.com/jeromewu/tesseract.js-video |
||||
The console will show something like: |
||||
```javascript |
||||
progress is: {loaded_lang_model: "eng", from_cache: true} |
||||
progress is: {initialized_with_lang: "eng"} |
||||
progress is: {set_variable: Object} |
||||
progress is: {set_variable: Object} |
||||
progress is: {recognized: 0} |
||||
progress is: {recognized: 0.3} |
||||
progress is: {recognized: 0.6} |
||||
progress is: {recognized: 0.9} |
||||
progress is: {recognized: 1} |
||||
``` |
||||
|
||||
## Contributing |
||||
|
||||
### Development |
||||
To run a development copy of Tesseract.js do the following: |
||||
```shell |
||||
# First we clone the repository |
||||
git clone https://github.com/naptha/tesseract.js.git |
||||
cd tesseract.js |
||||
### TesseractJob.then(callback: function) -> TesseractJob |
||||
Sets `callback` as the function that will be called if and when the job successfully completes. |
||||
- `callback` is a function with the signature `callback(result)` where `result` is a json object. |
||||
|
||||
# Then we install the dependencies |
||||
npm install |
||||
|
||||
# And finally we start the development server |
||||
npm start |
||||
For example: |
||||
```javascript |
||||
Tesseract.recognize(myImage) |
||||
.then(function(result){console.log('result is: ', result)}) |
||||
``` |
||||
|
||||
The development server will be available at http://localhost:3000/examples/browser/demo.html in your favorite browser. |
||||
It will automatically rebuild `tesseract.dev.js` and `worker.dev.js` when you change files in the **src** folder. |
||||
The console will show something like: |
||||
```javascript |
||||
result is: { |
||||
blocks: Array[1] |
||||
confidence: 87 |
||||
html: "<div class='ocr_page' id='page_1' ..." |
||||
lines: Array[3] |
||||
oem: "DEFAULT" |
||||
paragraphs: Array[1] |
||||
psm: "SINGLE_BLOCK" |
||||
symbols: Array[33] |
||||
text: "Hello World↵from beyond↵the Cosmic Void↵↵" |
||||
version: "3.04.00" |
||||
words: Array[7] |
||||
} |
||||
``` |
||||
|
||||
### Online Setup with a single Click |
||||
### TesseractJob.catch(callback: function) -> TesseractJob |
||||
Sets `callback` as the function that will be called if the job fails. |
||||
- `callback` is a function with the signature `callback(error)` where `error` is a json object. |
||||
|
||||
You can use Gitpod (a free online VS Code-like IDE) for contributing. With a single click it will launch a ready-to-code workspace with the build & start scripts already in progress, and within a few seconds it will spin up the dev server so that you can start contributing straight away without wasting any time. |
||||
### TesseractJob.finally(callback: function) -> TesseractJob |
||||
Sets `callback` as the function that will be called regardless of whether the job fails or succeeds. |
||||
- `callback` is a function with the signature `callback(resultOrError)` where `resultOrError` is a json object. |
||||
|
||||
[![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/naptha/tesseract.js/blob/master/examples/browser/demo.html) |
||||
## Local Installation |
||||
|
||||
### Building Static Files |
||||
To build the compiled static files just execute the following: |
||||
```shell |
||||
npm run build |
||||
In the browser, `tesseract.js` simply provides the API layer. Internally, it opens a WebWorker to handle requests. That worker itself loads code from the Emscripten-built `tesseract.js-core` which itself is hosted on a CDN. Then it dynamically loads language files hosted on another CDN. |
||||
|
||||
Because of this we recommend loading `tesseract.js` from a CDN. But if you really need to have all your files local, you can use the `Tesseract.create` function which allows you to specify custom paths for workers, languages, and core. |
||||
|
||||
```javascript |
||||
window.Tesseract = Tesseract.create({ |
||||
workerPath: '/path/to/worker.js', |
||||
langPath: 'https://tessdata.projectnaptha.com/3.02/', |
||||
corePath: 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js-core@0.1.0/index.js', |
||||
}) |
||||
``` |
||||
This will output the files into the `dist` directory. |
||||
|
||||
## Contributors |
||||
### corePath |
||||
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js-core@0.1.0/index.js'. Set this string before calling `Tesseract.recognize` and `Tesseract.detect` if you want Tesseract.js to use a different file. |
||||
|
||||
### Code Contributors |
||||
### workerPath |
||||
A string specifying the location of the [worker.js](./dist/worker.js) file. Set this string before calling `Tesseract.recognize` and `Tesseract.detect` if you want Tesseract.js to use a different file. |
||||
|
||||
This project exists thanks to all the people who contribute. [[Contribute](CONTRIBUTING.md)]. |
||||
<a href="https://github.com/naptha/tesseract.js/graphs/contributors"><img src="https://opencollective.com/tesseractjs/contributors.svg?width=890&button=false" /></a> |
||||
### langPath |
||||
A string specifying the location of the tesseract language files, with default value 'https://cdn.jsdelivr.net/gh/naptha/tessdata@gh-pages/3.02/'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`. Set this string before calling `Tesseract.recognize` and `Tesseract.detect` if you want Tesseract.js to use different language files. |
||||
|
||||
### Financial Contributors |
||||
|
||||
Become a financial contributor and help us sustain our community. [[Contribute](https://opencollective.com/tesseractjs/contribute)] |
||||
## Contributing |
||||
### Development |
||||
To run a development copy of tesseract.js, first clone this repo. |
||||
```shell |
||||
> git clone https://github.com/naptha/tesseract.js.git |
||||
``` |
||||
|
||||
Then, `cd tesseract.js && npm install && npm start` |
||||
```shell |
||||
> cd tesseract.js |
||||
> npm install && npm start |
||||
|
||||
#### Individuals |
||||
... a bunch of npm stuff ... |
||||
|
||||
<a href="https://opencollective.com/tesseractjs"><img src="https://opencollective.com/tesseractjs/individuals.svg?width=890"></a> |
||||
Starting up http-server, serving ./ |
||||
Available on: |
||||
http://127.0.0.1:7355 |
||||
http://[your ip]:7355 |
||||
|
||||
#### Organizations |
||||
``` |
||||
|
||||
Then open `http://localhost:7355/examples/file-input/demo.html` in your favorite browser. The devServer automatically rebuilds `tesseract.js` and `tesseract.worker.js` when you change files in the src folder. |
||||
|
||||
Support this project with your organization. Your logo will show up here with a link to your website. [[Contribute](https://opencollective.com/tesseractjs/contribute)] |
||||
### Building Static Files |
||||
After you've cloned the repo and run `npm install` as described in the [Development Section](#development), you can build static library files in the dist folder with |
||||
```shell |
||||
> npm run build |
||||
``` |
||||
|
||||
<a href="https://opencollective.com/tesseractjs/organization/0/website"><img src="https://opencollective.com/tesseractjs/organization/0/avatar.svg"></a> |
||||
<a href="https://opencollective.com/tesseractjs/organization/1/website"><img src="https://opencollective.com/tesseractjs/organization/1/avatar.svg"></a> |
||||
<a href="https://opencollective.com/tesseractjs/organization/2/website"><img src="https://opencollective.com/tesseractjs/organization/2/avatar.svg"></a> |
||||
<a href="https://opencollective.com/tesseractjs/organization/3/website"><img src="https://opencollective.com/tesseractjs/organization/3/avatar.svg"></a> |
||||
<a href="https://opencollective.com/tesseractjs/organization/4/website"><img src="https://opencollective.com/tesseractjs/organization/4/avatar.svg"></a> |
||||
<a href="https://opencollective.com/tesseractjs/organization/5/website"><img src="https://opencollective.com/tesseractjs/organization/5/avatar.svg"></a> |
||||
<a href="https://opencollective.com/tesseractjs/organization/6/website"><img src="https://opencollective.com/tesseractjs/organization/6/avatar.svg"></a> |
||||
<a href="https://opencollective.com/tesseractjs/organization/7/website"><img src="https://opencollective.com/tesseractjs/organization/7/avatar.svg"></a> |
||||
<a href="https://opencollective.com/tesseractjs/organization/8/website"><img src="https://opencollective.com/tesseractjs/organization/8/avatar.svg"></a> |
||||
<a href="https://opencollective.com/tesseractjs/organization/9/website"><img src="https://opencollective.com/tesseractjs/organization/9/avatar.svg"></a> |
||||
### Send us a Pull Request! |
||||
Thanks :) |
||||
|
@ -0,0 +1,640 @@
@@ -0,0 +1,640 @@
|
||||
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.Tesseract = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){ |
||||
// shim for using process in browser
|
||||
var process = module.exports = {}; |
||||
|
||||
// cached from whatever global is present so that test runners that stub it
|
||||
// don't break things. But we need to wrap it in a try catch in case it is
|
||||
// wrapped in strict mode code which doesn't define any globals. It's inside a
|
||||
// function because try/catches deoptimize in certain engines.
|
||||
|
||||
// Timer functions captured once when this module is evaluated. Each holds
// either the global timer function or a placeholder that throws.
var cachedSetTimeout;
var cachedClearTimeout;

/**
 * Placeholder stored in `cachedSetTimeout` when no global `setTimeout`
 * function could be captured at load time.
 * @throws {Error} Always.
 */
function defaultSetTimout() {
    throw new Error('setTimeout has not been defined');
}

/**
 * Placeholder stored in `cachedClearTimeout` when no global `clearTimeout`
 * function could be captured at load time.
 * @throws {Error} Always.
 */
function defaultClearTimeout () {
    throw new Error('clearTimeout has not been defined');
}

// Capture the timers. The lookups are wrapped in try/catch so that an
// environment where touching the globals throws still ends up with the
// placeholders instead of failing while the module loads.
(function () {
    try {
        if (typeof setTimeout === 'function') {
            cachedSetTimeout = setTimeout;
        } else {
            cachedSetTimeout = defaultSetTimout;
        }
    } catch (e) {
        cachedSetTimeout = defaultSetTimout;
    }
    try {
        if (typeof clearTimeout === 'function') {
            cachedClearTimeout = clearTimeout;
        } else {
            cachedClearTimeout = defaultClearTimeout;
        }
    } catch (e) {
        cachedClearTimeout = defaultClearTimeout;
    }
} ());
||||
/**
 * Schedules `fun` to run after a zero-millisecond timeout, using the cached
 * timer when possible.
 *
 * @param {Function} fun - callback to schedule.
 * @returns {*} whatever the underlying timer function returns (the handle
 *   later passed to `runClearTimeout`).
 */
function runTimeout(fun) {
    if (cachedSetTimeout === setTimeout) {
        // normal environments in sane situations
        return setTimeout(fun, 0);
    }
    // if setTimeout wasn't available at load time but was defined later,
    // switch over to it for this and all following calls
    if ((cachedSetTimeout === defaultSetTimout || !cachedSetTimeout) && setTimeout) {
        cachedSetTimeout = setTimeout;
        return setTimeout(fun, 0);
    }
    try {
        // when somebody has replaced setTimeout but there is no I.E. madness
        return cachedSetTimeout(fun, 0);
    } catch (e) {
        try {
            // When we are in I.E. but the script has been evaled, so I.E.
            // doesn't trust the global object when called normally
            return cachedSetTimeout.call(null, fun, 0);
        } catch (e) {
            // same as above, but for a version of I.E. that must have the
            // global object for 'this'; hopefully our context is correct,
            // otherwise it will throw a global error
            return cachedSetTimeout.call(this, fun, 0);
        }
    }
}
||||
/**
 * Cancels a timeout previously created through `runTimeout`, using the
 * cached clear function when possible.
 *
 * @param {*} marker - handle returned by the timer function.
 * @returns {*} whatever the underlying clear function returns.
 */
function runClearTimeout(marker) {
    if (cachedClearTimeout === clearTimeout) {
        // normal environments in sane situations
        return clearTimeout(marker);
    }
    // if clearTimeout wasn't available at load time but was defined later,
    // switch over to it for this and all following calls
    if ((cachedClearTimeout === defaultClearTimeout || !cachedClearTimeout) && clearTimeout) {
        cachedClearTimeout = clearTimeout;
        return clearTimeout(marker);
    }
    try {
        // when somebody has replaced clearTimeout but there is no I.E. madness
        return cachedClearTimeout(marker);
    } catch (e) {
        try {
            // When we are in I.E. but the script has been evaled, so I.E.
            // doesn't trust the global object when called normally
            return cachedClearTimeout.call(null, marker);
        } catch (e) {
            // same as above, but for a version of I.E. that must have the
            // global object for 'this'; hopefully our context is correct,
            // otherwise it will throw a global error.
            // Some versions of I.E. have different rules for clearTimeout vs setTimeout
            return cachedClearTimeout.call(this, marker);
        }
    }
}
||||
// Shared state of the nextTick emulation.
var queue = [];        // callbacks waiting for the next drain pass
var draining = false;  // true while drainQueue is running callbacks
var currentQueue;      // batch currently being executed by drainQueue
var queueIndex = -1;   // position inside currentQueue

/**
 * Timer callback armed by drainQueue. drainQueue cancels it when it finishes,
 * so it only has work to do when a drain pass is still marked as running
 * (`draining` set and `currentQueue` present). It then puts the unfinished
 * batch back in front of the pending queue and restarts draining.
 */
function cleanUpNextTick() {
    var drainWasInterrupted = draining && currentQueue;
    if (!drainWasInterrupted) {
        return;
    }

    draining = false;

    if (currentQueue.length === 0) {
        queueIndex = -1;
    } else {
        // queueIndex is left untouched so the restarted drain continues
        // after the entry that was running.
        queue = currentQueue.concat(queue);
    }

    if (queue.length > 0) {
        drainQueue();
    }
}
||||
|
||||
/**
 * Run every queued callback, including callbacks queued while draining.
 *
 * A cleanUpNextTick timer is armed before the first callback runs and is
 * cancelled once the queue has been emptied, so it only fires when this
 * function does not reach its end.
 */
function drainQueue() {
    if (draining) {
        return;
    }

    var cleanupTimer = runTimeout(cleanUpNextTick);
    draining = true;

    for (var pending = queue.length; pending; pending = queue.length) {
        // Take the current batch and start collecting a fresh one.
        currentQueue = queue;
        queue = [];

        for (queueIndex += 1; queueIndex < pending; queueIndex += 1) {
            if (currentQueue) {
                currentQueue[queueIndex].run();
            }
        }

        queueIndex = -1;
    }

    currentQueue = null;
    draining = false;
    runClearTimeout(cleanupTimer);
}
||||
|
||||
/**
 * Queue `fun` to be called later with any extra arguments passed here.
 * A drain is scheduled only when this call put the first entry into an idle
 * queue; otherwise a drain is already scheduled or running.
 *
 * @param {Function} fun - callback to run
 */
process.nextTick = function (fun) {
    var extraCount = arguments.length - 1;
    var args = new Array(extraCount);
    for (var slot = 0; slot < extraCount; slot += 1) {
        args[slot] = arguments[slot + 1];
    }

    queue.push(new Item(fun, args));

    var isFirstPending = queue.length === 1;
    if (isFirstPending && !draining) {
        runTimeout(drainQueue);
    }
};
||||
|
||||
// v8 likes predictable objects

/**
 * One queued nextTick entry: a callback plus the arguments to call it with.
 *
 * @param {Function} fun - callback to invoke
 * @param {Array} array - arguments for the callback
 */
function Item(fun, array) {
    this.fun = fun;
    this.array = array;
}

// Invoke the stored callback with its stored arguments and a null `this`.
Item.prototype.run = function () {
    var callback = this.fun;
    callback.apply(null, this.array);
};
||||
// Static fields of the browser stand-in for Node's `process`.
process.title = 'browser';
process.browser = true;
process.env = {};
process.argv = [];
process.version = ''; // empty string to avoid regexp issues
process.versions = {};

function noop() {}

// Every event-emitter method is the same do-nothing function.
[
    'on',
    'addListener',
    'once',
    'off',
    'removeListener',
    'removeAllListeners',
    'emit',
    'prependListener',
    'prependOnceListener'
].forEach(function (method) {
    process[method] = noop;
});

// No listeners are ever stored, so the list is always empty.
process.listeners = function (name) {
    return [];
};

process.binding = function (name) {
    throw new Error('process.binding is not supported');
};

// The working directory is fixed at '/' and cannot be changed.
process.cwd = function () {
    return '/';
};

process.chdir = function (dir) {
    throw new Error('process.chdir is not supported');
};

process.umask = function () {
    return 0;
};
||||
|
||||
},{}],2:[function(require,module,exports){ |
||||
module.exports={ |
||||
"name": "tesseract.js", |
||||
"version": "1.0.19", |
||||
"description": "Pure Javascript Multilingual OCR", |
||||
"main": "src/index.js", |
||||
"scripts": { |
||||
"start": "concurrently --kill-others \"watchify src/index.js -t [ envify --TESS_ENV development ] -t [ babelify --presets [ es2015 ] ] -o dist/tesseract.dev.js --standalone Tesseract\" \"watchify src/browser/worker.js -t [ envify --TESS_ENV development ] -t [ babelify --presets [ es2015 ] ] -o dist/worker.dev.js\" \"http-server -p 7355\"", |
||||
"build": "browserify src/index.js -t [ babelify --presets [ es2015 ] ] -o dist/tesseract.js --standalone Tesseract && browserify src/browser/worker.js -t [ babelify --presets [ es2015 ] ] -o dist/worker.js && uglifyjs dist/tesseract.js --source-map -o dist/tesseract.min.js && uglifyjs dist/worker.js --source-map -o dist/worker.min.js", |
||||
"release": "npm run build && git commit -am 'new release' && git push && git tag `jq -r '.version' package.json` && git push origin --tags && npm publish" |
||||
}, |
||||
"browser": { |
||||
"./src/node/index.js": "./src/browser/index.js" |
||||
}, |
||||
"author": "", |
||||
"license": "Apache-2.0", |
||||
"devDependencies": { |
||||
"babel-preset-es2015": "^6.16.0", |
||||
"babelify": "^7.3.0", |
||||
"browserify": "^13.1.0", |
||||
"concurrently": "^3.1.0", |
||||
"envify": "^3.4.1", |
||||
"http-server": "^0.9.0", |
||||
"pako": "^1.0.3", |
||||
"uglify-js": "^3.4.9", |
||||
"watchify": "^3.7.0" |
||||
}, |
||||
"dependencies": { |
||||
"file-type": "^3.8.0", |
||||
"isomorphic-fetch": "^2.2.1", |
||||
"is-url": "1.2.2", |
||||
"jpeg-js": "^0.2.0", |
||||
"level-js": "^2.2.4", |
||||
"node-fetch": "^1.6.3", |
||||
"object-assign": "^4.1.0", |
||||
"png.js": "^0.2.1", |
||||
"tesseract.js-core": "^1.0.2" |
||||
}, |
||||
"repository": { |
||||
"type": "git", |
||||
"url": "https://github.com/naptha/tesseract.js.git" |
||||
}, |
||||
"bugs": { |
||||
"url": "https://github.com/naptha/tesseract.js/issues" |
||||
}, |
||||
"homepage": "https://github.com/naptha/tesseract.js" |
||||
} |
||||
|
||||
},{}],3:[function(require,module,exports){ |
||||
(function (process){ |
||||
'use strict'; |
||||
|
||||
// Default locations of the Tesseract core script and the language data.
// workerPath is filled in below because it depends on the build flavour.
var defaultOptions = {
    corePath: 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js-core@0.1.0/index.js',
    langPath: 'https://tessdata.projectnaptha.com/3.02/'
};

var version;

if (process.env.TESS_ENV !== "development") {
    // Released build: load the worker script that matches this package version.
    version = require('../../package.json').version;
    defaultOptions.workerPath = 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js@' + version + '/dist/worker.js';
} else {
    // Development build: load the locally served worker; the random suffix
    // makes every page load request a fresh copy.
    console.debug('Using Development Configuration');
    var pageOrigin = location.protocol + '//' + location.host;
    var cacheBuster = Math.random().toString(36).slice(3);
    defaultOptions.workerPath = pageOrigin + '/dist/worker.dev.js?nocache=' + cacheBuster;
}

exports.defaultOptions = defaultOptions;
||||
|
||||
exports.spawnWorker = function spawnWorker(instance, workerOptions) { |
||||
if (Blob && URL) { |
||||
var blob = new Blob(['importScripts("' + workerOptions.workerPath + '");'], { |
||||
type: 'application/javascript' |
||||
}); |
||||
var worker = new Worker(URL.createObjectURL(blob)); |
||||
} else { |
||||
var worker = new Worker(workerOptions.workerPath); |
||||
} |
||||
|
||||
worker.onmessage = function (e) { |
||||
var packet = e.data; |
||||
instance._recv(packet); |
||||
}; |
||||
return worker; |
||||
}; |
||||
|
||||
exports.terminateWorker = function (instance) { |
||||
instance.worker.terminate(); |
||||
}; |
||||
|
||||
exports.sendPacket = function sendPacket(instance, packet) { |
||||
loadImage(packet.payload.image, function (img) { |
||||
packet.payload.image = img; |
||||
instance.worker.postMessage(packet); |
||||
}); |
||||
}; |
||||
|
||||
/**
 * Reduce any supported image input to its final form and pass it to `cb`.
 *
 * Each branch converts its input one step and recurses:
 *   string starting with '#'   -> element found by document.querySelector
 *   string starting blob:/data: -> Image element
 *   any other string           -> fetched with XMLHttpRequest as a Blob
 *   File                       -> data URL read with FileReader
 *   Blob                       -> temporary object URL (revoked when done)
 *   object with getContext     -> its '2d' context
 *   IMG / VIDEO element        -> 2d context of a canvas holding a drawn copy
 *   object with getImageData   -> the getImageData() result
 *   anything else              -> handed to `cb` unchanged
 *
 * @param {*} image - image source in one of the forms above
 * @param {Function} cb - receives the final value; may be called asynchronously
 * @throws {TypeError} if `image` is null/undefined or the selector matches nothing
 */
function loadImage(image, cb) {
    if (image === null || image === undefined) {
        // Previously surfaced as an opaque "cannot read property of null".
        throw new TypeError('loadImage: no image was provided');
    }

    if (typeof image === 'string') {
        if (/^#/.test(image)) {
            // element css selector
            var element = document.querySelector(image);
            if (!element) {
                throw new TypeError('loadImage: no element matches selector ' + image);
            }
            return loadImage(element, cb);
        } else if (/^(blob|data):/i.test(image)) {
            // blob or data url; the pattern is anchored so that an ordinary
            // URL that merely contains "data:" is not mistaken for one
            var im = new Image();
            // Handlers are attached before src so no event can be missed.
            im.onload = function (e) {
                return loadImage(im, cb);
            };
            im.onerror = function (e) {
                throw e;
            };
            im.src = image;
            return;
        } else {
            var xhr = new XMLHttpRequest();
            xhr.open('GET', image, true);
            xhr.responseType = "blob";

            xhr.onload = function (e) {
                if (xhr.status >= 400) {
                    throw new Error('Fail to get image as Blob');
                } else {
                    loadImage(xhr.response, cb);
                }
            };
            xhr.onerror = function (e) {
                throw e;
            };

            xhr.send(null);
            return;
        }
    } else if (image instanceof File) {
        // files
        var fr = new FileReader();
        fr.onload = function (e) {
            return loadImage(fr.result, cb);
        };
        fr.onerror = function (e) {
            throw e;
        };
        fr.readAsDataURL(image);
        return;
    } else if (image instanceof Blob) {
        // The object URL is only needed until the pixels have been read, so
        // release it right before handing the result on.
        var objectUrl = URL.createObjectURL(image);
        return loadImage(objectUrl, function (result) {
            URL.revokeObjectURL(objectUrl);
            return cb(result);
        });
    } else if (image.getContext) {
        // canvas element
        return loadImage(image.getContext('2d'), cb);
    } else if (image.tagName === "IMG" || image.tagName === "VIDEO") {
        // image element or video element
        var c = document.createElement('canvas');
        c.width = image.naturalWidth || image.videoWidth;
        c.height = image.naturalHeight || image.videoHeight;
        var ctx = c.getContext('2d');
        ctx.drawImage(image, 0, 0);
        return loadImage(ctx, cb);
    } else if (image.getImageData) {
        // canvas context
        var data = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
        return loadImage(data, cb);
    } else {
        return cb(image);
    }
    // Every branch above returns; this only guards against future edits.
    throw new Error('Missing return in loadImage cascade');
}
||||
|
||||
}).call(this,require('_process')) |
||||
},{"../../package.json":2,"_process":1}],4:[function(require,module,exports){ |
||||
"use strict"; |
||||
|
||||
// The result of dump.js is a big JSON tree
|
||||
// which can be easily serialized (for instance
|
||||
// to be sent from a webworker to the main app
|
||||
// or through Node's IPC), but we want
|
||||
// a (circular) DOM-like interface for walking
|
||||
// through the data.
|
||||
|
||||
module.exports = function circularize(page) { |
||||
page.paragraphs = []; |
||||
page.lines = []; |
||||
page.words = []; |
||||
page.symbols = []; |
||||
|
||||
page.blocks.forEach(function (block) { |
||||
block.page = page; |
||||
|
||||
block.lines = []; |
||||
block.words = []; |
||||
block.symbols = []; |
||||
|
||||
block.paragraphs.forEach(function (para) { |
||||
para.block = block; |
||||
para.page = page; |
||||
|
||||
para.words = []; |
||||
para.symbols = []; |
||||
|
||||
para.lines.forEach(function (line) { |
||||
line.paragraph = para; |
||||
line.block = block; |
||||
line.page = page; |
||||
|
||||
line.symbols = []; |
||||
|
||||
line.words.forEach(function (word) { |
||||
word.line = line; |
||||
word.paragraph = para; |
||||
word.block = block; |
||||
word.page = page; |
||||
word.symbols.forEach(function (sym) { |
||||
sym.word = word; |
||||
sym.line = line; |
||||
sym.paragraph = para; |
||||
sym.block = block; |
||||
sym.page = page; |
||||
|
||||
sym.line.symbols.push(sym); |
||||
sym.paragraph.symbols.push(sym); |
||||
sym.block.symbols.push(sym); |
||||
sym.page.symbols.push(sym); |
||||
}); |
||||
word.paragraph.words.push(word); |
||||
word.block.words.push(word); |
||||
word.page.words.push(word); |
||||
}); |
||||
line.block.lines.push(line); |
||||
line.page.lines.push(line); |
||||
}); |
||||
para.page.paragraphs.push(para); |
||||
}); |
||||
}); |
||||
return page; |
||||
}; |
||||
|
||||
},{}],5:[function(require,module,exports){ |
||||
'use strict'; |
||||
|
||||
/**
 * Transpiler helper: install method descriptors on a constructor.
 * `protoProps` go on Constructor.prototype, `staticProps` on the constructor
 * itself. Each descriptor is made configurable, non-enumerable unless it
 * says otherwise, and writable when it carries a `value`.
 */
var _createClass = (function () {
    function installAll(target, descriptors) {
        for (var n = 0; n < descriptors.length; n++) {
            var entry = descriptors[n];
            if (!entry.enumerable) {
                entry.enumerable = false;
            }
            entry.configurable = true;
            if ("value" in entry) {
                entry.writable = true;
            }
            Object.defineProperty(target, entry.key, entry);
        }
    }

    return function (Constructor, protoProps, staticProps) {
        if (protoProps) {
            installAll(Constructor.prototype, protoProps);
        }
        if (staticProps) {
            installAll(Constructor, staticProps);
        }
        return Constructor;
    };
}());
||||
|
||||
/**
 * Transpiler helper: throw unless `instance` was created from `Constructor`,
 * which catches a class constructor being called without `new`.
 */
function _classCallCheck(instance, Constructor) {
    if (instance instanceof Constructor) {
        return;
    }
    throw new TypeError("Cannot call a class as a function");
}
||||
|
||||
var adapter = require('../node/index.js'); |
||||
|
||||
var jobCounter = 0; |
||||
|
||||
module.exports = function () { |
||||
function TesseractJob(instance) { |
||||
_classCallCheck(this, TesseractJob); |
||||
|
||||
this.id = 'Job-' + ++jobCounter + '-' + Math.random().toString(16).slice(3, 8); |
||||
|
||||
this._instance = instance; |
||||
this._resolve = []; |
||||
this._reject = []; |
||||
this._progress = []; |
||||
this._finally = []; |
||||
} |
||||
|
||||
_createClass(TesseractJob, [{ |
||||
key: 'then', |
||||
value: function then(resolve, reject) { |
||||
if (this._resolve.push) { |
||||
this._resolve.push(resolve); |
||||
} else { |
||||
resolve(this._resolve); |
||||
} |
||||
|
||||
if (reject) this.catch(reject); |
||||
return this; |
||||
} |
||||
}, { |
||||
key: 'catch', |
||||
value: function _catch(reject) { |
||||
if (this._reject.push) { |
||||
this._reject.push(reject); |
||||
} else { |
||||
reject(this._reject); |
||||
} |
||||
return this; |
||||
} |
||||
}, { |
||||
key: 'progress', |
||||
value: function progress(fn) { |
||||
this._progress.push(fn); |
||||
return this; |
||||
} |
||||
}, { |
||||
key: 'finally', |
||||
value: function _finally(fn) { |
||||
this._finally.push(fn); |
||||
return this; |
||||
} |
||||
}, { |
||||
key: '_send', |
||||
value: function _send(action, payload) { |
||||
adapter.sendPacket(this._instance, { |
||||
jobId: this.id, |
||||
action: action, |
||||
payload: payload |
||||
}); |
||||
} |
||||
}, { |
||||
key: '_handle', |
||||
value: function _handle(packet) { |
||||
var data = packet.data; |
||||
var runFinallyCbs = false; |
||||
|
||||
if (packet.status === 'resolve') { |
||||
if (this._resolve.length === 0) console.log(data); |
||||
this._resolve.forEach(function (fn) { |
||||
var ret = fn(data); |
||||
if (ret && typeof ret.then == 'function') { |
||||
console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.'); |
||||
} |
||||
}); |
||||
this._resolve = data; |
||||
this._instance._dequeue(); |
||||
runFinallyCbs = true; |
||||
} else if (packet.status === 'reject') { |
||||
if (this._reject.length === 0) console.error(data); |
||||
this._reject.forEach(function (fn) { |
||||
return fn(data); |
||||
}); |
||||
this._reject = data; |
||||
this._instance._dequeue(); |
||||
runFinallyCbs = true; |
||||
} else if (packet.status === 'progress') { |
||||
this._progress.forEach(function (fn) { |
||||
return fn(data); |
||||
}); |
||||
} else { |
||||
console.warn('Message type unknown', packet.status); |
||||
} |
||||
|
||||
if (runFinallyCbs) { |
||||
this._finally.forEach(function (fn) { |
||||
return fn(data); |
||||
}); |
||||
} |
||||
} |
||||
}]); |
||||
|
||||
return TesseractJob; |
||||
}(); |
||||
|
||||
},{"../node/index.js":3}],6:[function(require,module,exports){ |
||||
'use strict'; |
||||
|
||||
/**
 * Transpiler helper: install method descriptors on a constructor.
 * `protoProps` go on Constructor.prototype, `staticProps` on the constructor
 * itself. Each descriptor is made configurable, non-enumerable unless it
 * says otherwise, and writable when it carries a `value`.
 */
var _createClass = (function () {
    function installAll(target, descriptors) {
        for (var n = 0; n < descriptors.length; n++) {
            var entry = descriptors[n];
            if (!entry.enumerable) {
                entry.enumerable = false;
            }
            entry.configurable = true;
            if ("value" in entry) {
                entry.writable = true;
            }
            Object.defineProperty(target, entry.key, entry);
        }
    }

    return function (Constructor, protoProps, staticProps) {
        if (protoProps) {
            installAll(Constructor.prototype, protoProps);
        }
        if (staticProps) {
            installAll(Constructor, staticProps);
        }
        return Constructor;
    };
}());
||||
|
||||
/**
 * Transpiler helper: throw unless `instance` was created from `Constructor`,
 * which catches a class constructor being called without `new`.
 */
function _classCallCheck(instance, Constructor) {
    if (instance instanceof Constructor) {
        return;
    }
    throw new TypeError("Cannot call a class as a function");
}
||||
|
||||
var adapter = require('./node/index.js'); |
||||
var circularize = require('./common/circularize.js'); |
||||
var TesseractJob = require('./common/job'); |
||||
var version = require('../package.json').version; |
||||
|
||||
/**
 * Build a TesseractWorker whose options are the adapter defaults overridden
 * by the optional first argument. The returned worker also carries this
 * factory as `create` and the package version as `version`.
 *
 * @returns {TesseractWorker} the configured worker
 */
var create = function create() {
    var supplied = arguments[0];
    var workerOptions = supplied === undefined ? {} : supplied;

    var mergedOptions = Object.assign({}, adapter.defaultOptions, workerOptions);
    var worker = new TesseractWorker(mergedOptions);

    worker.create = create;
    worker.version = version;
    return worker;
};
||||
|
||||
var TesseractWorker = function () {
    /**
     * Front end for one background worker. Jobs are queued and run one at a
     * time; the worker itself is spawned lazily by the first job.
     *
     * @param {Object} workerOptions - options passed to the adapter and sent
     *   along with every job
     */
    function TesseractWorker(workerOptions) {
        _classCallCheck(this, TesseractWorker);

        this.worker = null;
        this.workerOptions = workerOptions;
        this._currentJob = null;
        this._queue = [];
    }

    _createClass(TesseractWorker, [{
        key: 'recognize',
        /**
         * Queue a 'recognize' job for `image`.
         * The optional second argument is either a language string or an
         * options object; `lang` defaults to 'eng'.
         *
         * @returns {TesseractJob} the queued job
         */
        value: function recognize(image) {
            var _this = this;

            var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};

            return this._delay(function (job) {
                // Work on a copy so the default language is not written
                // into the object the caller passed in.
                var jobOptions = typeof options === 'string' ? { lang: options } : Object.assign({}, options);
                jobOptions.lang = jobOptions.lang || 'eng';

                job._send('recognize', { image: image, options: jobOptions, workerOptions: _this.workerOptions });
            });
        }
    }, {
        key: 'detect',
        /**
         * Queue a 'detect' job for `image`, with an optional options object
         * as second argument.
         *
         * @returns {TesseractJob} the queued job
         */
        value: function detect(image) {
            var _this2 = this;

            var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};

            return this._delay(function (job) {
                job._send('detect', { image: image, options: options, workerOptions: _this2.workerOptions });
            });
        }
    }, {
        key: 'terminate',
        /** Stop the worker (if one was spawned) and drop all queued jobs. */
        value: function terminate() {
            if (this.worker) adapter.terminateWorker(this);
            this.worker = null;
            this._currentJob = null;
            this._queue = [];
        }
    }, {
        key: '_delay',
        /**
         * Create a job, queue `fn(job)` to run when it is the job's turn,
         * and start it right away if nothing else is running.
         */
        value: function _delay(fn) {
            var _this3 = this;

            if (!this.worker) this.worker = adapter.spawnWorker(this, this.workerOptions);

            var job = new TesseractJob(this);
            this._queue.push(function (e) {
                _this3._queue.shift();
                _this3._currentJob = job;
                fn(job);
            });
            if (!this._currentJob) this._dequeue();
            return job;
        }
    }, {
        key: '_dequeue',
        /** Mark the current job as finished and start the next queued one, if any. */
        value: function _dequeue() {
            this._currentJob = null;
            if (this._queue.length) {
                this._queue[0]();
            }
        }
    }, {
        key: '_recv',
        /**
         * Handle a packet from the worker and forward it to the current job
         * when the job ids match.
         */
        value: function _recv(packet) {
            if (packet.status === 'resolve' && packet.action === 'recognize') {
                packet.data = circularize(packet.data);
            }

            // _currentJob is null once the last job has finished (or after
            // terminate()); a late packet must not crash on `.id`.
            var job = this._currentJob;
            if (job && job.id === packet.jobId) {
                job._handle(packet);
            } else {
                console.warn('Job ID ' + packet.jobId + ' not known.');
            }
        }
    }]);

    return TesseractWorker;
}();
||||
|
||||
module.exports = create(); |
||||
|
||||
},{"../package.json":2,"./common/circularize.js":4,"./common/job":5,"./node/index.js":3}]},{},[6])(6) |
||||
}); |
@ -1,448 +0,0 @@
@@ -1,448 +0,0 @@
|
||||
# API |
||||
|
||||
- [createWorker()](#create-worker) |
||||
- [Worker.load](#worker-load) |
||||
- [Worker.writeText](#worker-writeText) |
||||
- [Worker.readText](#worker-readText) |
||||
- [Worker.removeFile](#worker-removeFile) |
||||
- [Worker.FS](#worker-FS) |
||||
- [Worker.loadLanguage](#worker-load-language) |
||||
- [Worker.initialize](#worker-initialize) |
||||
- [Worker.setParameters](#worker-set-parameters) |
||||
- [Worker.recognize](#worker-recognize) |
||||
- [Worker.detect](#worker-detect) |
||||
- [Worker.terminate](#worker-terminate) |
||||
- [createScheduler()](#create-scheduler) |
||||
- [Scheduler.addWorker](#scheduler-add-worker) |
||||
- [Scheduler.addJob](#scheduler-add-job) |
||||
- [Scheduler.getQueueLen](#scheduler-get-queue-len) |
||||
- [Scheduler.getNumWorkers](#scheduler-get-num-workers) |
||||
- [setLogging()](#set-logging) |
||||
- [recognize()](#recognize) |
||||
- [detect()](#detect) |
||||
- [PSM](#psm) |
||||
- [OEM](#oem) |
||||
|
||||
--- |
||||
|
||||
<a name="create-worker"></a> |
||||
## createWorker(options): Worker |
||||
|
||||
createWorker is a factory function that creates a tesseract worker, a worker is basically a Web Worker in browser and Child Process in Node. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `options` an object of customized options |
||||
- `corePath` path for tesseract-core.js script |
||||
- `langPath` path for downloading traineddata, do not include `/` at the end of the path |
||||
- `workerPath` path for downloading worker script |
||||
- `dataPath` path for saving traineddata in WebAssembly file system, not common to modify |
||||
- `cachePath` path for the cached traineddata, more useful for Node, for browser it only changes the key in IndexedDB |
||||
- `cacheMethod` a string to indicate the method of cache management, should be one of the following options |
||||
- write: read cache and write back (default method) |
||||
- readOnly: read cache and not to write back |
||||
- refresh: not to read cache and write back |
||||
- none: not to read cache and not to write back |
||||
- `workerBlobURL` a boolean to define whether to use Blob URL for worker script, default: true |
||||
- `gzip` a boolean to define whether the traineddata from the remote is gzipped, default: true |
||||
- `logger` a function to log the progress, a quick example is `m => console.log(m)` |
||||
- `errorHandler` a function to handle worker errors, a quick example is `err => console.error(err)` |
||||
|
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
const { createWorker } = Tesseract; |
||||
const worker = createWorker({ |
||||
langPath: '...', |
||||
logger: m => console.log(m), |
||||
}); |
||||
``` |
||||
|
||||
## Worker |
||||
|
||||
A Worker helps you to do the OCR related tasks, it takes few steps to setup Worker before it is fully functional. The full flow is: |
||||
|
||||
- load |
||||
- FS functions // optional |
||||
- loadLanguage |
||||
- initialize |
||||
- setParameters // optional |
||||
- recognize or detect |
||||
- terminate |
||||
|
||||
Each function is async, so using async/await or Promise is required. When it is resolved, you get an object: |
||||
|
||||
```json |
||||
{ |
||||
"jobId": "Job-1-123", |
||||
"data": { ... } |
||||
} |
||||
``` |
||||
|
||||
jobId is generated by Tesseract.js, but you can provide your own when calling any of the functions above. |
||||
|
||||
<a name="worker-load"></a> |
||||
### Worker.load(jobId): Promise |
||||
|
||||
Worker.load() loads the tesseract.js-core scripts (downloading them from remote if not present); it makes the Web Worker/Child Process ready for the next action. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `jobId` Please see details above |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
await worker.load(); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="worker-writeText"></a> |
||||
### Worker.writeText(path, text, jobId): Promise |
||||
|
||||
Worker.writeText() writes a text file to the path specified in MEMFS, it is useful when you want to use some features that require tesseract.js |
||||
to read file from file system. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `path` text file path |
||||
- `text` content of the text file |
||||
- `jobId` Please see details above |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
await worker.writeText('tmp.txt', 'Hi\nTesseract.js\n'); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="worker-readText"></a> |
||||
### Worker.readText(path, jobId): Promise |
||||
|
||||
Worker.readText() reads a text file from the path specified in MEMFS, it is useful when you want to check the content. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `path` text file path |
||||
- `jobId` Please see details above |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
const { data } = await worker.readText('tmp.txt'); |
||||
console.log(data); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="worker-removeFile"></a> |
||||
### Worker.removeFile(path, jobId): Promise |
||||
|
||||
Worker.removeFile() removes a file in MEMFS, it is useful when you want to free the memory. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `path` file path |
||||
- `jobId` Please see details above |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
await worker.removeFile('tmp.txt'); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="worker-FS"></a> |
||||
### Worker.FS(method, args, jobId): Promise |
||||
|
||||
Worker.FS() is a generic FS function to do anything you want, you can check [HERE](https://emscripten.org/docs/api_reference/Filesystem-API.html) for all functions. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `method` method name |
||||
- `args` array of arguments to pass |
||||
- `jobId` Please see details above |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
await worker.FS('writeFile', ['tmp.txt', 'Hi\nTesseract.js\n']); |
||||
// equal to: |
||||
// await worker.writeText('tmp.txt', 'Hi\nTesseract.js\n'); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="worker-load-language"></a> |
||||
### Worker.loadLanguage(langs, jobId): Promise |
||||
|
||||
Worker.loadLanguage() loads traineddata from cache or download traineddata from remote, and put traineddata into the WebAssembly file system. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `langs` a string to indicate the languages traineddata to download, multiple languages are concatenated with **+**, ex: **eng+chi\_tra** |
||||
- `jobId` Please see details above |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
await worker.loadLanguage('eng+chi_tra'); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="worker-initialize"></a> |
||||
### Worker.initialize(langs, oem, jobId): Promise |
||||
|
||||
Worker.initialize() initializes the Tesseract API, make sure it is ready for doing OCR tasks. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `langs` a string to indicate the languages loaded by Tesseract API, it can be a subset of the language traineddata you loaded from Worker.loadLanguage. |
||||
- `oem` a enum to indicate the OCR Engine Mode you use |
||||
- `jobId` Please see details above |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
/** You can load more languages in advance, but use only part of them in Worker.initialize() */ |
||||
await worker.loadLanguage('eng+chi_tra'); |
||||
await worker.initialize('eng'); |
||||
})(); |
||||
``` |
||||
<a name="worker-set-parameters"></a> |
||||
### Worker.setParameters(params, jobId): Promise |
||||
|
||||
Worker.setParameters() sets parameters for Tesseract API (using SetVariable()), it changes the behavior of Tesseract and some parameters like tessedit\_char\_whitelist are very useful. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `params` an object with key and value of the parameters |
||||
- `jobId` Please see details above |
||||
|
||||
**Supported Parameters:** |
||||
|
||||
| name | type | default value | description | |
||||
| --------------------------- | ------ | ----------------- | ------------------------------------------------------------------------------------------------------------------------------- | |
||||
| tessedit\_ocr\_engine\_mode | enum | OEM.DEFAULT | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode | |
||||
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode | |
||||
| tessedit\_char\_whitelist | string | '' | setting whitelist characters makes the result contain only these characters, useful when the content in the image is limited | |
||||
| preserve\_interword\_spaces | string | '0' | '0' or '1', keeps the space between words | |
||||
| user\_defined\_dpi | string | '' | Define custom dpi, use to fix **Warning: Invalid resolution 0 dpi. Using 70 instead.** | |
||||
| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result | |
||||
| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | |
||||
| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result | |
||||
| tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result | |
||||
| tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result | |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
await worker.setParameters({ |
||||
tessedit_char_whitelist: '0123456789', |
||||
}); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="worker-recognize"></a> |
||||
### Worker.recognize(image, options, jobId): Promise |
||||
|
||||
Worker.recognize() provides core function of Tesseract.js as it executes OCR |
||||
|
||||
Figures out what words are in `image`, where the words are in `image`, etc. |
||||
> Note: `image` should be sufficiently high resolution. |
||||
> Often, the same image will get much better results if you upscale it before calling `recognize`. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `image` see [Image Format](./image-format.md) for more details. |
||||
- `options` a object of customized options |
||||
- `rectangle` an object to specify the region you want to recognize in the image, should contain top, left, width and height, see example below. |
||||
- `jobId` Please see details above |
||||
|
||||
**Output:** |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
const { createWorker } = Tesseract; |
||||
(async () => { |
||||
const worker = createWorker(); |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data: { text } } = await worker.recognize(image); |
||||
console.log(text); |
||||
})(); |
||||
``` |
||||
|
||||
With rectangle |
||||
|
||||
```javascript |
||||
const { createWorker } = Tesseract; |
||||
(async () => { |
||||
const worker = createWorker(); |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data: { text } } = await worker.recognize(image, { |
||||
rectangle: { top: 0, left: 0, width: 100, height: 100 }, |
||||
}); |
||||
console.log(text); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="worker-detect"></a> |
||||
### Worker.detect(image, jobId): Promise |
||||
|
||||
Worker.detect() does OSD (Orientation and Script Detection) to the image instead of OCR. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `image` see [Image Format](./image-format.md) for more details. |
||||
- `jobId` Please see details above |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
const { createWorker } = Tesseract; |
||||
(async () => { |
||||
const worker = createWorker(); |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data } = await worker.detect(image); |
||||
console.log(data); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="worker-terminate"></a> |
||||
### Worker.terminate(jobId): Promise |
||||
|
||||
Worker.terminate() terminates the worker and cleans up |
||||
|
||||
```javascript |
||||
(async () => { |
||||
await worker.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="create-scheduler"></a> |
||||
## createScheduler(): Scheduler |
||||
|
||||
createScheduler() is a factory function to create a scheduler, a scheduler manages a job queue and workers to enable multiple workers to work together, it is useful when you want to speed up your performance. |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
const { createScheduler } = Tesseract; |
||||
const scheduler = createScheduler(); |
||||
``` |
||||
|
||||
### Scheduler |
||||
|
||||
<a name="scheduler-add-worker"></a> |
||||
### Scheduler.addWorker(worker): string |
||||
|
||||
Scheduler.addWorker() adds a worker to the worker pool inside the scheduler. It is suggested to add each worker to only one scheduler. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `worker` see Worker above |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
const { createWorker, createScheduler } = Tesseract; |
||||
const scheduler = createScheduler(); |
||||
const worker = createWorker(); |
||||
scheduler.addWorker(worker); |
||||
``` |
||||
|
||||
<a name="scheduler-add-job"></a> |
||||
### Scheduler.addJob(action, ...payload): Promise |
||||
|
||||
Scheduler.addJob() adds a job to the job queue and scheduler waits and finds an idle worker to take the job. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `action` a string to indicate the action you want to do, right now only **recognize** and **detect** are supported |
||||
- `payload` an arbitrary number of args depending on the action you called. |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
const { data: { text } } = await scheduler.addJob('recognize', image, options); |
||||
const { data } = await scheduler.addJob('detect', image); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="scheduler-get-queue-len"></a> |
||||
### Scheduler.getQueueLen(): number |
||||
|
||||
Scheduler.getQueueLen() returns the length of the job queue. |
||||
|
||||
<a name="scheduler-get-num-workers"></a> |
||||
### Scheduler.getNumWorkers(): number |
||||
|
||||
Scheduler.getNumWorkers() returns number of workers added into the scheduler |
||||
|
||||
<a name="scheduler-terminate"></a> |
||||
### Scheduler.terminate(): Promise |
||||
|
||||
Scheduler.terminate() terminates all workers added, useful to do quick clean up. |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
(async () => { |
||||
await scheduler.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
<a name="set-logging"></a> |
||||
## setLogging(logging: boolean) |
||||
|
||||
setLogging() sets the logging flag, you can `setLogging(true)` to see detailed information, useful for debugging. |
||||
|
||||
**Arguments:** |
||||
|
||||
- `logging` boolean to define whether to see detailed logs, default: false |
||||
|
||||
**Examples:** |
||||
|
||||
```javascript |
||||
const { setLogging } = Tesseract; |
||||
setLogging(true); |
||||
``` |
||||
|
||||
<a name="recognize"></a> |
||||
## recognize(image, langs, options): Promise |
||||
|
||||
recognize() is a shortcut for quickly running a recognize() task. It is not recommended for use in real applications, but it is useful when you want to save some time. |
||||
|
||||
See [Tesseract.js](../src/Tesseract.js) |
||||
|
||||
<a name="detect"></a> |
||||
## detect(image, options): Promise |
||||
|
||||
Same background as recognize(), but it does detect instead. |
||||
|
||||
See [Tesseract.js](../src/Tesseract.js) |
||||
|
||||
<a name="psm"></a> |
||||
## PSM |
||||
|
||||
See [PSM.js](../src/constants/PSM.js) |
||||
|
||||
<a name="oem"></a> |
||||
## OEM |
||||
|
||||
See [OEM.js](../src/constants/OEM.js) |
@ -1,226 +0,0 @@
@@ -1,226 +0,0 @@
|
||||
# Tesseract.js Examples |
||||
|
||||
You can also check [examples](../examples) folder. |
||||
|
||||
### basic |
||||
|
||||
```javascript |
||||
const { createWorker } = require('tesseract.js'); |
||||
|
||||
const worker = createWorker(); |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); |
||||
console.log(text); |
||||
await worker.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
### with detailed progress |
||||
|
||||
```javascript |
||||
const { createWorker } = require('tesseract.js'); |
||||
|
||||
const worker = createWorker({ |
||||
logger: m => console.log(m), // Add logger here |
||||
}); |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); |
||||
console.log(text); |
||||
await worker.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
### with multiple languages, separate by '+' |
||||
|
||||
```javascript |
||||
const { createWorker } = require('tesseract.js'); |
||||
|
||||
const worker = createWorker(); |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng+chi_tra'); |
||||
await worker.initialize('eng+chi_tra'); |
||||
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); |
||||
console.log(text); |
||||
await worker.terminate(); |
||||
})(); |
||||
``` |
||||
### with whitelist char (^2.0.0-beta.1) |
||||
|
||||
```javascript |
||||
const { createWorker } = require('tesseract.js'); |
||||
|
||||
const worker = createWorker(); |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
await worker.setParameters({ |
||||
tessedit_char_whitelist: '0123456789', |
||||
}); |
||||
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); |
||||
console.log(text); |
||||
await worker.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
### with different pageseg mode (^2.0.0-beta.1) |
||||
|
||||
Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163 |
||||
|
||||
```javascript |
||||
const { createWorker, PSM } = require('tesseract.js'); |
||||
|
||||
const worker = createWorker(); |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
await worker.setParameters({ |
||||
tessedit_pageseg_mode: PSM.SINGLE_BLOCK, |
||||
}); |
||||
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); |
||||
console.log(text); |
||||
await worker.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
### with pdf output (^2.0.0-beta.1) |
||||
|
||||
Please check **examples** folder for details. |
||||
|
||||
Browser: [download-pdf.html](../examples/browser/download-pdf.html) |
||||
Node: [download-pdf.js](../examples/node/download-pdf.js) |
||||
|
||||
### with only part of the image (^2.0.1) |
||||
|
||||
**One rectangle** |
||||
|
||||
```javascript |
||||
const { createWorker } = require('tesseract.js'); |
||||
|
||||
const worker = createWorker(); |
||||
const rectangle = { left: 0, top: 0, width: 500, height: 250 }; |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle }); |
||||
console.log(text); |
||||
await worker.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
**Multiple Rectangles** |
||||
|
||||
```javascript |
||||
const { createWorker } = require('tesseract.js'); |
||||
|
||||
const worker = createWorker(); |
||||
const rectangles = [ |
||||
{ |
||||
left: 0, |
||||
top: 0, |
||||
width: 500, |
||||
height: 250, |
||||
}, |
||||
{ |
||||
left: 500, |
||||
top: 0, |
||||
width: 500, |
||||
height: 250, |
||||
}, |
||||
]; |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const values = []; |
||||
for (let i = 0; i < rectangles.length; i++) { |
||||
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle: rectangles[i] }); |
||||
values.push(text); |
||||
} |
||||
console.log(values); |
||||
await worker.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
**Multiple Rectangles (with scheduler to do recognition in parallel)** |
||||
|
||||
```javascript |
||||
const { createWorker, createScheduler } = require('tesseract.js'); |
||||
|
||||
const scheduler = createScheduler(); |
||||
const worker1 = createWorker(); |
||||
const worker2 = createWorker(); |
||||
const rectangles = [ |
||||
{ |
||||
left: 0, |
||||
top: 0, |
||||
width: 500, |
||||
height: 250, |
||||
}, |
||||
{ |
||||
left: 500, |
||||
top: 0, |
||||
width: 500, |
||||
height: 250, |
||||
}, |
||||
]; |
||||
|
||||
(async () => { |
||||
await worker1.load(); |
||||
await worker2.load(); |
||||
await worker1.loadLanguage('eng'); |
||||
await worker2.loadLanguage('eng'); |
||||
await worker1.initialize('eng'); |
||||
await worker2.initialize('eng'); |
||||
scheduler.addWorker(worker1); |
||||
scheduler.addWorker(worker2); |
||||
const results = await Promise.all(rectangles.map((rectangle) => ( |
||||
scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle }) |
||||
))); |
||||
console.log(results.map(r => r.data.text)); |
||||
await scheduler.terminate(); |
||||
})(); |
||||
``` |
||||
|
||||
### with multiple workers to speed up (^2.0.0-beta.1) |
||||
|
||||
```javascript |
||||
const { createWorker, createScheduler } = require('tesseract.js'); |
||||
|
||||
const scheduler = createScheduler(); |
||||
const worker1 = createWorker(); |
||||
const worker2 = createWorker(); |
||||
|
||||
(async () => { |
||||
await worker1.load(); |
||||
await worker2.load(); |
||||
await worker1.loadLanguage('eng'); |
||||
await worker2.loadLanguage('eng'); |
||||
await worker1.initialize('eng'); |
||||
await worker2.initialize('eng'); |
||||
scheduler.addWorker(worker1); |
||||
scheduler.addWorker(worker2); |
||||
/** Add 10 recognition jobs */ |
||||
const results = await Promise.all(Array(10).fill(0).map(() => ( |
||||
scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png') |
||||
))) |
||||
console.log(results); |
||||
await scheduler.terminate(); // It also terminates all workers. |
||||
})(); |
||||
``` |
@ -1,42 +0,0 @@
@@ -1,42 +0,0 @@
|
||||
FAQ |
||||
=== |
||||
|
||||
## How does tesseract.js download and keep \*.traineddata? |
||||
|
||||
The language model is downloaded by `worker.loadLanguage()` and you need to pass the langs to `worker.initialize()`. |
||||
|
||||
During the downloading of language model, Tesseract.js will first check if \*.traineddata already exists. (browser: [IndexedDB](https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API), Node.js: fs, in the folder you execute the command) If the \*.traineddata doesn't exist, it will fetch \*.traineddata.gz from [tessdata](https://github.com/naptha/tessdata), ungzip and store in IndexedDB or fs, you can delete it manually and it will download again for you. |
||||
|
||||
## How can I train my own \*.traineddata? |
||||
|
||||
For tesseract.js v2, check [TrainingTesseract 4.00](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00) |
||||
|
||||
For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://tesseract-ocr.github.io/tessdoc/Training-Tesseract-3.03%E2%80%933.05) |
||||
|
||||
## How can I get HOCR, TSV, Box, UNLV, OSD? |
||||
|
||||
Starting from 2.0.0-beta.1, you can get all of this information in the final result. |
||||
|
||||
```javascript |
||||
import { createWorker } from 'tesseract.js'; |
||||
const worker = createWorker({ |
||||
logger: m => console.log(m) |
||||
}); |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
await worker.setParameters({ |
||||
tessedit_create_box: '1', |
||||
tessedit_create_unlv: '1', |
||||
tessedit_create_osd: '1', |
||||
}); |
||||
const { data: { text, hocr, tsv, box, unlv } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); |
||||
console.log(text); |
||||
console.log(hocr); |
||||
console.log(tsv); |
||||
console.log(box); |
||||
console.log(unlv); |
||||
})(); |
||||
``` |
@ -1,18 +0,0 @@
@@ -1,18 +0,0 @@
|
||||
# Image Format |
||||
|
||||
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter. The image formats and data types supported are listed below. |
||||
|
||||
Supported Image Formats: **bmp, jpg, png, pbm, webp** |
||||
|
||||
For browser and Node, supported data types are: |
||||
- string with base64 encoded image (fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp) |
||||
- buffer |
||||
|
||||
For browser only, supported data types are: |
||||
- `File` or `Blob` object |
||||
- `img` or `canvas` element |
||||
|
||||
For Node only, supported data types are: |
||||
- string containing a path to local image |
||||
|
||||
Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported. |
Before Width: | Height: | Size: 105 KiB |
Before Width: | Height: | Size: 237 KiB |
@ -1,38 +0,0 @@
@@ -1,38 +0,0 @@
|
||||
## Local Installation |
||||
|
||||
Check here for examples: https://github.com/naptha/tesseract.js/blob/master/docs/examples.md |
||||
|
||||
In browser environment, `tesseract.js` simply provides the API layer. Internally, it opens a WebWorker to handle requests. That worker itself loads code from the Emscripten-built `tesseract.js-core` which itself is hosted on a CDN. Then it dynamically loads language files hosted on another CDN. |
||||
|
||||
Because of this we recommend loading `tesseract.js` from a CDN. But if you really need to have all your files local, you can pass extra arguments to `TesseractWorker` to specify custom paths for workers, languages, and core. |
||||
|
||||
In Node.js environment, the only path you may want to customize is languages/langPath. |
||||
|
||||
```javascript |
||||
Tesseract.recognize(image, langs, { |
||||
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js', |
||||
langPath: 'https://tessdata.projectnaptha.com/4.0.0', |
||||
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js', |
||||
}) |
||||
``` |
||||
|
||||
Or |
||||
|
||||
```javascript |
||||
const worker = createWorker({ |
||||
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js', |
||||
langPath: 'https://tessdata.projectnaptha.com/4.0.0', |
||||
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js', |
||||
}); |
||||
``` |
||||
|
||||
### workerPath |
||||
A string specifying the location of the [worker.js](./dist/worker.min.js) file. |
||||
|
||||
### langPath |
||||
A string specifying the location of the tesseract language files, with default value 'https://tessdata.projectnaptha.com/4.0.0'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`. |
||||
|
||||
### corePath |
||||
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available). |
||||
|
||||
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm'. But it fails to fetch at this moment. |
@ -1,3 +1,72 @@
@@ -1,3 +1,72 @@
|
||||
# Tesseract Languages |
||||
|
||||
Please check [HERE](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) for supported languages |
||||
The `lang` property of the options object passed to `Tesseract.recognize` can have one of the following values (the default is `'eng'`): |
||||
|
||||
| `lang` | Language | |
||||
|-----------|-----------------------| |
||||
| 'afr' | Afrikaans | |
||||
| 'ara' | Arabic | |
||||
| 'aze' | Azerbaijani | |
||||
| 'bel' | Belarusian | |
||||
| 'ben' | Bengali | |
||||
| 'bul' | Bulgarian | |
||||
| 'cat' | Catalan | |
||||
| 'ces' | Czech | |
||||
| 'chi_sim' | Chinese | |
||||
| 'chi_tra' | Traditional Chinese | |
||||
| 'chr' | Cherokee | |
||||
| 'dan' | Danish | |
||||
| 'deu' | German | |
||||
| 'ell' | Greek | |
||||
| 'eng' | English | |
||||
| 'enm' | English (Old) | |
||||
| 'epo' | Esperanto | |
||||
| 'epo_alt' | Esperanto alternative | |
||||
| 'equ' | Math | |
||||
| 'est' | Estonian | |
||||
| 'eus' | Basque | |
||||
| 'fas' |Persian (Farsi) | |
||||
| 'fin' | Finnish | |
||||
| 'fra' | French | |
||||
| 'frk' | Frankish | |
||||
| 'frm' | French (Old) | |
||||
| 'glg' | Galician | |
||||
| 'grc' | Ancient Greek | |
||||
| 'heb' | Hebrew | |
||||
| 'hin' | Hindi | |
||||
| 'hrv' | Croatian | |
||||
| 'hun' | Hungarian | |
||||
| 'ind' | Indonesian | |
||||
| 'isl' | Icelandic | |
||||
| 'ita' | Italian | |
||||
| 'ita_old' | Italian (Old) | |
||||
| 'jpn' | Japanese | |
||||
| 'kan' | Kannada | |
||||
| 'kor' | Korean | |
||||
| 'lav' | Latvian | |
||||
| 'lit' | Lithuanian | |
||||
| 'mal' | Malayalam | |
||||
| 'mkd' | Macedonian | |
||||
| 'mlt' | Maltese | |
||||
| 'msa' | Malay | |
||||
| 'nld' | Dutch | |
||||
| 'nor' | Norwegian | |
||||
| 'pol' | Polish | |
||||
| 'por' | Portuguese | |
||||
| 'ron' | Romanian | |
||||
| 'rus' | Russian | |
||||
| 'slk' | Slovakian | |
||||
| 'slv' | Slovenian | |
||||
| 'spa' | Spanish | |
||||
| 'spa_old' | Old Spanish | |
||||
| 'sqi' | Albanian | |
||||
| 'srp' | Serbian (Latin) | |
||||
| 'swa' | Swahili | |
||||
| 'swe' | Swedish | |
||||
| 'tam' | Tamil | |
||||
| 'tel' | Telugu | |
||||
| 'tgl' | Tagalog | |
||||
| 'tha' | Thai | |
||||
| 'tur' | Turkish | |
||||
| 'ukr' | Ukrainian | |
||||
| 'vie' | Vietnamese | |
||||
|
Before Width: | Height: | Size: 169 KiB After Width: | Height: | Size: 215 KiB |
@ -1,37 +0,0 @@
@@ -1,37 +0,0 @@
|
||||
<!DOCTYPE HTML> |
||||
<html> |
||||
<head> |
||||
<script src="/dist/tesseract.dev.js"></script> |
||||
</head> |
||||
<body> |
||||
<input type="file" id="uploader"> |
||||
<script> |
||||
const recognize = function(evt){ |
||||
const files = evt.target.files; |
||||
const worker = Tesseract.createWorker({ |
||||
/* |
||||
* As Edge don't support webassembly, |
||||
* here we force to use asm.js version. |
||||
*/ |
||||
corePath: '../../node_modules/tesseract.js-core/tesseract-core.asm.js', |
||||
logger: function(m){console.log(m);}, |
||||
/* |
||||
* As there is no indexedDB in earlier version |
||||
* of Edge, here we disable cache. |
||||
*/ |
||||
cacheMethod: 'none', |
||||
}); |
||||
Promise.resolve() |
||||
.then(() => worker.load()) |
||||
.then(() => worker.loadLanguage('eng')) |
||||
.then(() => worker.initialize('eng')) |
||||
.then(() => worker.recognize(files[0])) |
||||
.then((ret) => { |
||||
console.log(ret.data.text); |
||||
}); |
||||
} |
||||
const elm = document.getElementById('uploader'); |
||||
elm.addEventListener('change', recognize); |
||||
</script> |
||||
</body> |
||||
</html> |
@ -1,19 +0,0 @@
@@ -1,19 +0,0 @@
|
||||
<html> |
||||
<head> |
||||
<script src="/dist/tesseract.dev.js"></script> |
||||
</head> |
||||
<body> |
||||
<input type="file" id="uploader"> |
||||
<script> |
||||
const recognize = async ({ target: { files } }) => { |
||||
const { data: { text } } = await Tesseract.recognize(files[0], 'eng', { |
||||
corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js', |
||||
logger: m => console.log(m), |
||||
}); |
||||
console.log(text); |
||||
} |
||||
const elm = document.getElementById('uploader'); |
||||
elm.addEventListener('change', recognize); |
||||
</script> |
||||
</body> |
||||
</html> |
@ -1,33 +0,0 @@
@@ -1,33 +0,0 @@
|
||||
<html> |
||||
<head> |
||||
<script src="/dist/tesseract.dev.js"></script> |
||||
</head> |
||||
<body> |
||||
<textarea id="message">Working...</textarea> |
||||
|
||||
<script> |
||||
const { createWorker } = Tesseract; |
||||
const worker = createWorker(); |
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
|
||||
const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"]; |
||||
let timeTotal = 0; |
||||
for (let file of fileArr) { |
||||
let time1 = Date.now(); |
||||
for (let i=0; i < 10; i++) { |
||||
await worker.recognize(file); |
||||
} |
||||
let time2 = Date.now(); |
||||
const timeDif = (time2 - time1) / 1e3; |
||||
timeTotal += timeDif; |
||||
document.getElementById('message').innerHTML += "\n" + file + " [x10] runtime: " + timeDif + "s"; |
||||
} |
||||
document.getElementById('message').innerHTML += "\nTotal runtime: " + timeTotal + "s"; |
||||
|
||||
})(); |
||||
</script> |
||||
</body> |
||||
</html> |
@ -1,52 +0,0 @@
@@ -1,52 +0,0 @@
|
||||
<html> |
||||
<head> |
||||
<script src="/dist/tesseract.dev.js"></script> |
||||
</head> |
||||
<body> |
||||
<div> |
||||
<input type="file" id="uploader"> |
||||
<button id="download-pdf" disabled="true">Download PDF</button> |
||||
</div> |
||||
<textarea id="board" readonly rows="8" cols="80">Upload an image file</textarea> |
||||
<script> |
||||
const { createWorker } = Tesseract; |
||||
const worker = createWorker({ |
||||
corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js', |
||||
logger: m => console.log(m), |
||||
}); |
||||
const uploader = document.getElementById('uploader'); |
||||
const dlBtn = document.getElementById('download-pdf'); |
||||
const recognize = async ({ target: { files } }) => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data: { text } } = await worker.recognize(files[0]); |
||||
const board = document.getElementById('board'); |
||||
board.value = text; |
||||
dlBtn.disabled = false; |
||||
}; |
||||
const downloadPDF = async () => { |
||||
const filename = 'tesseract-ocr-result.pdf'; |
||||
const { data } = await worker.getPDF('Tesseract OCR Result'); |
||||
const blob = new Blob([new Uint8Array(data)], { type: 'application/pdf' }); |
||||
if (navigator.msSaveBlob) { |
||||
// IE 10+ |
||||
navigator.msSaveBlob(blob, filename); |
||||
} else { |
||||
const link = document.createElement('a'); |
||||
if (link.download !== undefined) { |
||||
const url = URL.createObjectURL(blob); |
||||
link.setAttribute('href', url); |
||||
link.setAttribute('download', filename); |
||||
link.style.visibility = 'hidden'; |
||||
document.body.appendChild(link); |
||||
link.click(); |
||||
document.body.removeChild(link); |
||||
} |
||||
} |
||||
}; |
||||
uploader.addEventListener('change', recognize); |
||||
dlBtn.addEventListener('click', downloadPDF); |
||||
</script> |
||||
</body> |
||||
</html> |
Before Width: | Height: | Size: 1011 KiB |
Before Width: | Height: | Size: 23 KiB |
Before Width: | Height: | Size: 408 KiB |
@ -0,0 +1,2 @@
@@ -0,0 +1,2 @@
|
||||
<script src="/dist/tesseract.dev.js"></script> |
||||
<input type="file" onchange="Tesseract.recognize(this.files[0]).progress(function(data){console.log(data)}).then(function(data){console.log(data)})"> |
@ -0,0 +1,15 @@
@@ -0,0 +1,15 @@
|
||||
// replace this with require('tesseract.js')
|
||||
var Tesseract = require('../../'), |
||||
image = require('path').resolve(__dirname, 'cosmic.png'); |
||||
|
||||
Tesseract.recognize(image) |
||||
.then(data => { |
||||
console.log('then\n', data.text) |
||||
}) |
||||
.catch(err => { |
||||
console.log('catch\n', err); |
||||
}) |
||||
.finally(e => { |
||||
console.log('finally\n'); |
||||
process.exit(); |
||||
}); |
@ -1,27 +0,0 @@
@@ -1,27 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
const path = require('path'); |
||||
const { createWorker } = require('../../'); |
||||
|
||||
const worker = createWorker(); |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"]; |
||||
let timeTotal = 0; |
||||
for (let file of fileArr) { |
||||
let time1 = Date.now(); |
||||
for (let i=0; i < 10; i++) { |
||||
await worker.recognize(file) |
||||
} |
||||
let time2 = Date.now(); |
||||
const timeDif = (time2 - time1) / 1e3; |
||||
timeTotal += timeDif; |
||||
|
||||
console.log(file + " [x10] runtime: " + timeDif + "s"); |
||||
} |
||||
console.log("Total runtime: " + timeTotal + "s"); |
||||
|
||||
await worker.terminate(); |
||||
})(); |
Before Width: | Height: | Size: 13 KiB After Width: | Height: | Size: 13 KiB |
@ -1,13 +1,12 @@
@@ -1,13 +1,12 @@
|
||||
#!/usr/bin/env node
|
||||
const path = require('path'); |
||||
const Tesseract = require('../../'); |
||||
// replace this with require('tesseract.js')
|
||||
var Tesseract = require('../../'), |
||||
image = require('path').resolve(__dirname, 'cosmic.png'); |
||||
|
||||
const [,, imagePath] = process.argv; |
||||
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png')); |
||||
|
||||
console.log(`Recognizing ${image}`); |
||||
|
||||
Tesseract.detect(image, { logger: m => console.log(m) }) |
||||
.then(({ data }) => { |
||||
console.log(data); |
||||
}); |
||||
Tesseract.detect(image) |
||||
.progress(function(info){ |
||||
console.log(info); |
||||
}) |
||||
.then(function(data){ |
||||
console.log('done', data); |
||||
process.exit(); |
||||
}) |
@ -1,22 +0,0 @@
@@ -1,22 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
const path = require('path'); |
||||
const fs = require('fs'); |
||||
const { createWorker } = require('../../'); |
||||
|
||||
const [,, imagePath] = process.argv; |
||||
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png')); |
||||
|
||||
console.log(`Recognizing ${image}`); |
||||
|
||||
(async () => { |
||||
const worker = createWorker(); |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data: { text } } = await worker.recognize(image); |
||||
console.log(text); |
||||
const { data } = await worker.getPDF('Tesseract OCR Result'); |
||||
fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(data)); |
||||
console.log('Generate PDF: tesseract-ocr-result.pdf'); |
||||
await worker.terminate(); |
||||
})(); |
@ -1,20 +0,0 @@
@@ -1,20 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
const path = require('path'); |
||||
const { createWorker } = require('../../'); |
||||
|
||||
const [,, imagePath] = process.argv; |
||||
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png')); |
||||
|
||||
console.log(`Recognizing ${image}`); |
||||
const worker = createWorker({ |
||||
logger: m => console.log(m), |
||||
}); |
||||
|
||||
(async () => { |
||||
await worker.load(); |
||||
await worker.loadLanguage('eng'); |
||||
await worker.initialize('eng'); |
||||
const { data: { text } } = await worker.recognize(image); |
||||
console.log(text); |
||||
await worker.terminate(); |
||||
})(); |
@ -1,5 +0,0 @@
@@ -1,5 +0,0 @@
|
||||
{ |
||||
"rules": { |
||||
"import/no-extraneous-dependencies": 0 |
||||
} |
||||
} |
@ -1,13 +0,0 @@
@@ -1,13 +0,0 @@
|
||||
import commonjs from "@rollup/plugin-commonjs"; |
||||
|
||||
export default [ |
||||
{ |
||||
input: "dist/tesseract.min.js", |
||||
output: { |
||||
file: "dist/tesseract.esm.min.js", |
||||
format: "esm", |
||||
banner: "/* eslint-disable */", |
||||
}, |
||||
plugins: [commonjs()], |
||||
}, |
||||
]; |
@ -1,17 +0,0 @@
@@ -1,17 +0,0 @@
|
||||
const webpack = require('webpack'); |
||||
const middleware = require('webpack-dev-middleware'); |
||||
const express = require('express'); |
||||
const path = require('path'); |
||||
const cors = require('cors'); |
||||
const webpackConfig = require('./webpack.config.dev'); |
||||
|
||||
const compiler = webpack(webpackConfig); |
||||
const app = express(); |
||||
|
||||
app.use(cors()); |
||||
app.use('/', express.static(path.resolve(__dirname, '..'))); |
||||
app.use(middleware(compiler, { publicPath: '/dist', writeToDisk: true })); |
||||
|
||||
module.exports = app.listen(3000, () => { |
||||
console.log('Server is running on the port no. 3000'); |
||||
}); |
@ -1,9 +0,0 @@
@@ -1,9 +0,0 @@
|
||||
const constants = require('../tests/constants'); |
||||
global.expect = require('expect.js'); |
||||
global.fs = require('fs'); |
||||
global.path = require('path'); |
||||
global.Tesseract = require('../src'); |
||||
|
||||
Object.keys(constants).forEach((key) => { |
||||
global[key] = constants[key]; |
||||
}); |
@ -1,28 +0,0 @@
@@ -1,28 +0,0 @@
|
||||
module.exports = { |
||||
resolve: { |
||||
fallback: { |
||||
buffer: require.resolve('buffer/'), |
||||
}, |
||||
}, |
||||
module: { |
||||
rules: [ |
||||
{ |
||||
test: /\.m?js$/, |
||||
// exclude: /(node_modules|bower_components)/,
|
||||
use: { |
||||
loader: 'babel-loader', |
||||
options: { |
||||
presets: [ |
||||
[ |
||||
'@babel/preset-env', |
||||
{ |
||||
targets: 'last 2 versions', |
||||
}, |
||||
], |
||||
], |
||||
}, |
||||
}, |
||||
}, |
||||
], |
||||
}, |
||||
}; |
@ -1,48 +0,0 @@
@@ -1,48 +0,0 @@
|
||||
const path = require('path'); |
||||
const webpack = require('webpack'); |
||||
const { BundleAnalyzerPlugin } = require('webpack-bundle-analyzer'); |
||||
const common = require('./webpack.config.common'); |
||||
|
||||
const genConfig = ({ |
||||
entry, filename, library, libraryTarget, |
||||
}) => ({ |
||||
...common, |
||||
mode: 'development', |
||||
entry, |
||||
output: { |
||||
filename, |
||||
library, |
||||
libraryTarget, |
||||
}, |
||||
plugins: [ |
||||
new webpack.ProvidePlugin({ |
||||
Buffer: ['buffer', 'Buffer'], |
||||
}), |
||||
new webpack.DefinePlugin({ |
||||
'process.env': { |
||||
TESS_ENV: JSON.stringify('development'), |
||||
}, |
||||
}), |
||||
new BundleAnalyzerPlugin({ |
||||
analyzerMode: 'disable', |
||||
statsFilename: `${filename.split('.')[0]}-stats.json`, |
||||
generateStatsFile: true |
||||
}), |
||||
], |
||||
devServer: { |
||||
allowedHosts: ['localhost', '.gitpod.io'], |
||||
}, |
||||
}); |
||||
|
||||
module.exports = [ |
||||
genConfig({ |
||||
entry: path.resolve(__dirname, '..', 'src', 'index.js'), |
||||
filename: 'tesseract.dev.js', |
||||
library: 'Tesseract', |
||||
libraryTarget: 'umd', |
||||
}), |
||||
genConfig({ |
||||
entry: path.resolve(__dirname, '..', 'src', 'worker-script', 'browser', 'index.js'), |
||||
filename: 'worker.dev.js', |
||||
}), |
||||
]; |
@ -1,36 +0,0 @@
@@ -1,36 +0,0 @@
|
||||
const path = require('path'); |
||||
const common = require('./webpack.config.common'); |
||||
const webpack = require('webpack'); |
||||
|
||||
const genConfig = ({ |
||||
entry, filename, library, libraryTarget, |
||||
}) => ({ |
||||
...common, |
||||
mode: 'production', |
||||
devtool: 'source-map', |
||||
entry, |
||||
output: { |
||||
path: path.resolve(__dirname, '..', 'dist'), |
||||
filename, |
||||
library, |
||||
libraryTarget, |
||||
}, |
||||
plugins: [ |
||||
new webpack.ProvidePlugin({ |
||||
Buffer: ['buffer', 'Buffer'], |
||||
}), |
||||
] |
||||
}); |
||||
|
||||
module.exports = [ |
||||
genConfig({ |
||||
entry: path.resolve(__dirname, '..', 'src', 'index.js'), |
||||
filename: 'tesseract.min.js', |
||||
library: 'Tesseract', |
||||
libraryTarget: 'umd', |
||||
}), |
||||
genConfig({ |
||||
entry: path.resolve(__dirname, '..', 'src', 'worker-script', 'browser', 'index.js'), |
||||
filename: 'worker.min.js', |
||||
}), |
||||
]; |
@ -1,28 +0,0 @@
@@ -1,28 +0,0 @@
|
||||
const createWorker = require('./createWorker'); |
||||
|
||||
/**
 * One-shot OCR helper: creates a temporary worker, runs recognition on
 * `image`, and terminates the worker again.
 *
 * @param {*} image - image to recognize; forwarded unchanged to worker.recognize()
 * @param {string} langs - language code(s) passed to both loadLanguage() and initialize()
 * @param {Object} [options] - options forwarded unchanged to createWorker()
 * @returns {Promise<*>} resolves with whatever worker.recognize(image) resolves to;
 *   rejects if any step (including terminate) rejects
 */
const recognize = async (image, langs, options) => {
  const worker = createWorker(options);
  try {
    await worker.load();
    await worker.loadLanguage(langs);
    await worker.initialize(langs);
    return await worker.recognize(image);
  } finally {
    // Terminate on every path -- including a failed load/loadLanguage/
    // initialize -- so a half-initialized worker is never left running.
    await worker.terminate();
  }
};
||||
|
||||
/**
 * One-shot orientation/script detection helper: creates a worker, loads and
 * initializes the `osd` data, runs `worker.detect(image)` and always
 * terminates the worker afterwards.
 *
 * The worker is terminated even when a setup step fails, so a failed call
 * does not leave a worker running.
 *
 * @param {*} image - image handed unchanged to `worker.detect`
 * @param {object} [options] - options forwarded to `createWorker`
 * @returns {Promise<*>} whatever `worker.detect(image)` resolves to
 */
const detect = async (image, options) => {
  const worker = createWorker(options);
  try {
    await worker.load();
    await worker.loadLanguage('osd');
    await worker.initialize('osd');
    return await worker.detect(image);
  } finally {
    await worker.terminate();
  }
};
||||
|
||||
module.exports = { |
||||
recognize, |
||||
detect, |
||||
}; |
@ -0,0 +1,105 @@
@@ -0,0 +1,105 @@
|
||||
// Default locations of the tesseract core script and the language data.
// `workerPath` is filled in below because it depends on the build flavour.
const defaultOptions = {
  corePath: 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js-core@0.1.0/index.js',
  langPath: 'https://tessdata.projectnaptha.com/3.02/',
};

if (process.env.TESS_ENV === 'development') {
  console.debug('Using Development Configuration');
  // Load the dev worker from the current origin; the random query string
  // gives every page load a distinct URL.
  const cacheBuster = Math.random().toString(36).slice(3);
  defaultOptions.workerPath = `${location.protocol}//${location.host}/dist/worker.dev.js?nocache=${cacheBuster}`;
} else {
  // Pin the CDN worker to the version recorded in package.json.
  const { version } = require('../../package.json');
  defaultOptions.workerPath = `https://cdn.jsdelivr.net/gh/naptha/tesseract.js@${version}/dist/worker.js`;
}

exports.defaultOptions = defaultOptions;
||||
|
||||
|
||||
exports.spawnWorker = function spawnWorker(instance, workerOptions){ |
||||
if(Blob && URL){ |
||||
var blob = new Blob(['importScripts("' + workerOptions.workerPath + '");'], { |
||||
type: 'application/javascript' |
||||
}); |
||||
var worker = new Worker(URL.createObjectURL(blob)); |
||||
}else{ |
||||
var worker = new Worker(workerOptions.workerPath) |
||||
} |
||||
|
||||
worker.onmessage = function(e){ |
||||
var packet = e.data; |
||||
instance._recv(packet) |
||||
} |
||||
return worker |
||||
} |
||||
|
||||
exports.terminateWorker = function(instance){ |
||||
instance.worker.terminate() |
||||
} |
||||
|
||||
exports.sendPacket = function sendPacket(instance, packet){ |
||||
loadImage(packet.payload.image, function(img){ |
||||
packet.payload.image = img |
||||
instance.worker.postMessage(packet) |
||||
}) |
||||
} |
||||
|
||||
|
||||
/**
 * Converts any supported image source, step by step, until nothing more can
 * be converted, then passes the result to `cb`.
 *
 * Conversion chain as implemented below:
 *   '#id' selector string -> DOM element
 *   'blob:' / 'data:' URL -> <img> element
 *   any other string      -> fetched with XHR as a Blob
 *   File                  -> data URL (FileReader)
 *   Blob                  -> object URL
 *   canvas element        -> 2d context
 *   IMG / VIDEO element   -> drawn onto a canvas -> 2d context
 *   2d context            -> getImageData() result
 *   anything else         -> handed to `cb` unchanged
 *
 * Failures inside the asynchronous steps are thrown from the event handler
 * because the callback contract has no error channel.
 *
 * @param {*} image - one of the sources listed above
 * @param {function(*): *} cb - receives the fully converted image
 * @returns {*} the return value of `cb` for synchronous conversions,
 *   `undefined` when the conversion continues asynchronously
 * @throws {Error} when `image` is null/undefined or a selector matches nothing
 */
function loadImage(image, cb) {
  if (image == null) {
    throw new Error('loadImage: no image provided');
  }

  if (typeof image === 'string') {
    if (/^#/.test(image)) {
      // element css selector
      const element = document.querySelector(image);
      if (!element) {
        throw new Error('loadImage: no element matches selector ' + image);
      }
      return loadImage(element, cb);
    }

    // Anchored on purpose: only strings that START with the scheme are
    // blob/data URLs; an http URL merely containing "data:" must be fetched.
    if (/^(blob|data):/.test(image)) {
      const im = new Image();
      // Handlers are attached before `src` so an immediate load is not missed.
      im.onload = () => loadImage(im, cb);
      im.onerror = (e) => { throw e; };
      im.src = image;
      return undefined;
    }

    // Any other string is treated as a URL and downloaded as a Blob.
    const xhr = new XMLHttpRequest();
    xhr.open('GET', image, true);
    xhr.responseType = 'blob';
    xhr.onload = () => {
      if (xhr.status >= 400) {
        throw new Error('Fail to get image as Blob');
      }
      loadImage(xhr.response, cb);
    };
    xhr.onerror = (e) => { throw e; };
    xhr.send(null);
    return undefined;
  }

  if (image instanceof File) {
    // files
    const fr = new FileReader();
    fr.onload = () => loadImage(fr.result, cb);
    fr.onerror = (e) => { throw e; };
    fr.readAsDataURL(image);
    return undefined;
  }

  if (image instanceof Blob) {
    return loadImage(URL.createObjectURL(image), cb);
  }

  if (image.getContext) {
    // canvas element
    return loadImage(image.getContext('2d'), cb);
  }

  if (image.tagName === 'IMG' || image.tagName === 'VIDEO') {
    // image element or video element: rasterize onto a canvas of equal size
    const canvas = document.createElement('canvas');
    canvas.width = image.naturalWidth || image.videoWidth;
    canvas.height = image.naturalHeight || image.videoHeight;
    const ctx = canvas.getContext('2d');
    ctx.drawImage(image, 0, 0);
    return loadImage(ctx, cb);
  }

  if (image.getImageData) {
    // canvas context
    const data = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
    return loadImage(data, cb);
  }

  return cb(image);
}
@ -0,0 +1,76 @@
@@ -0,0 +1,76 @@
|
||||
const leveljs = require('level-js') |
||||
|
||||
// Storing the language files in IndexedDB makes iOS Safari crash (the exact
// cause is unknown), so iOS devices are treated as if IndexedDB were missing.
const iOS = /iPad|iPhone|iPod/.test(navigator.userAgent);
const noIDB = typeof indexedDB === 'undefined' || iOS;

// Without IndexedDB a stub stands in for the store: its `open` always reports
// an error, which sends every request down the download path.
let db;
if (noIDB) {
  db = { open: (_, cb) => cb(true) };
} else {
  db = leveljs('./tessdata2');
}
||||
|
||||
var langdata = require('../common/langdata.json') |
||||
|
||||
module.exports = function getLanguageData(req, res, cb){ |
||||
var lang = req.options.lang; |
||||
|
||||
function saveDataFile(data){ |
||||
try { |
||||
db.put(lang, data, err => console.log('cached', lang, err)) |
||||
} finally { |
||||
cb(data) |
||||
} |
||||
} |
||||
|
||||
db.open({ compression: false }, err => { |
||||
if (err) return fetchLanguageData(req, res, cb); |
||||
db.get(lang, (err, data) => { |
||||
if (err) return fetchLanguageData(req, res, saveDataFile); |
||||
res.progress({ status: 'found in cache ' + lang + '.traineddata' }) |
||||
cb(data) |
||||
}) |
||||
}) |
||||
} |
||||
|
||||
|
||||
const ungzip = require('pako/lib/inflate.js').ungzip; |
||||
|
||||
/**
 * Downloads `<lang>.traineddata.gz` from `req.workerOptions.langPath`,
 * un-gzips it as often as needed and hands the raw bytes to `cb`.
 *
 * Every failure (network error, unexpected HTTP status, corrupt gzip) is
 * reported through `res.reject`; `cb` is only ever called with usable data.
 *
 * @param {object} req - reads `req.options.lang` and `req.workerOptions.langPath`
 * @param {object} res - responder exposing `progress` and `reject`
 * @param {function(Uint8Array): void} cb - receives the decompressed file
 */
function fetchLanguageData(req, res, cb) {
  const lang = req.options.lang;
  const langfile = lang + '.traineddata.gz';
  const url = req.workerOptions.langPath + langfile;
  const downloadError = 'Error downloading language ' + url;

  const xhr = new XMLHttpRequest();
  xhr.open('GET', url, true);
  xhr.responseType = 'arraybuffer';

  xhr.onerror = () => {
    xhr.onprogress = xhr.onload = null;
    // Reject instead of invoking cb: cb expects language data, not the xhr.
    res.reject(downloadError);
  };

  xhr.onprogress = (e) => {
    // Expected size comes from the bundled size table; fall back to the
    // event's own total so unknown languages do not produce NaN.
    const expected = langdata[lang] || e.total;
    res.progress({
      status: 'downloading ' + langfile,
      loaded: e.loaded,
      progress: expected > 0 ? Math.min(1, e.loaded / expected) : 0,
    });
  };

  xhr.onload = () => {
    // status 0 with a body is accepted as well (non-HTTP sources report 0).
    const ok = xhr.status === 200 || (xhr.status === 0 && xhr.response);
    if (!ok) return res.reject(downloadError);

    res.progress({ status: 'unzipping ' + langfile, progress: 0 });

    // in case the gzips are already ungzipped or extra gzipped:
    // keep inflating while the gzip magic bytes (1f 8b) are present
    let response = new Uint8Array(xhr.response);
    try {
      let n = 2;
      while (response[0] === 0x1f && response[1] === 0x8b) {
        response = ungzip(response);
        res.progress({ status: 'unzipping ' + langfile, progress: 1 - 1 / n });
        n += 1;
      }
    } catch (err) {
      return res.reject('Error unzipping language file ' + langfile + '\n' + err.message);
    }
    res.progress({ status: 'unzipping ' + langfile, progress: 1 });

    return cb(response);
  };

  xhr.send();
}
@ -0,0 +1,23 @@
@@ -0,0 +1,23 @@
|
||||
const workerUtils = require('../common/worker.js') |
||||
|
||||
if (process.env.TESS_ENV === 'development') {
  console.debug('Using Development Worker');
}

// Each message received by this worker is a job packet: hand it to the shared
// dispatcher and post whatever the dispatcher sends back.
global.addEventListener('message', (event) => {
  const packet = event.data;
  workerUtils.dispatchHandlers(packet, (reply) => postMessage(reply));
});
||||
|
||||
exports.getCore = function(req, res){ |
||||
if(!global.TesseractCore){ |
||||
res.progress({ status: 'loading tesseract core', progress: 0 }) |
||||
importScripts(req.workerOptions.corePath) |
||||
res.progress({ status: 'loading tesseract core', progress: 1 }) |
||||
} |
||||
return TesseractCore |
||||
} |
||||
|
||||
exports.getLanguageData = require('./lang.js') |
||||
|
||||
workerUtils.setAdapter(module.exports); |
@ -0,0 +1,63 @@
@@ -0,0 +1,63 @@
|
||||
// The result of dump.js is a big JSON tree
|
||||
// which can be easily serialized (for instance
|
||||
// to be sent from a webworker to the main app
|
||||
// or through Node's IPC), but we want
|
||||
// a (circular) DOM-like interface for walking
|
||||
// through the data.
|
||||
|
||||
module.exports = function circularize(page){ |
||||
page.paragraphs = [] |
||||
page.lines = [] |
||||
page.words = [] |
||||
page.symbols = [] |
||||
|
||||
page.blocks.forEach(function(block){ |
||||
block.page = page; |
||||
|
||||
block.lines = [] |
||||
block.words = [] |
||||
block.symbols = [] |
||||
|
||||
block.paragraphs.forEach(function(para){ |
||||
para.block = block; |
||||
para.page = page; |
||||
|
||||
para.words = [] |
||||
para.symbols = [] |
||||
|
||||
para.lines.forEach(function(line){ |
||||
line.paragraph = para; |
||||
line.block = block; |
||||
line.page = page; |
||||
|
||||
line.symbols = [] |
||||
|
||||
line.words.forEach(function(word){ |
||||
word.line = line; |
||||
word.paragraph = para; |
||||
word.block = block; |
||||
word.page = page; |
||||
word.symbols.forEach(function(sym){ |
||||
sym.word = word; |
||||
sym.line = line; |
||||
sym.paragraph = para; |
||||
sym.block = block; |
||||
sym.page = page; |
||||
|
||||
sym.line.symbols.push(sym) |
||||
sym.paragraph.symbols.push(sym) |
||||
sym.block.symbols.push(sym) |
||||
sym.page.symbols.push(sym) |
||||
}) |
||||
word.paragraph.words.push(word) |
||||
word.block.words.push(word) |
||||
word.page.words.push(word) |
||||
}) |
||||
line.block.lines.push(line) |
||||
line.page.lines.push(line) |
||||
}) |
||||
para.page.paragraphs.push(para) |
||||
}) |
||||
}) |
||||
return page |
||||
} |
@ -0,0 +1,24 @@
@@ -0,0 +1,24 @@
|
||||
// This converts an image to grayscale
|
||||
|
||||
module.exports = function desaturate(image){ |
||||
var width, height; |
||||
if(image.data){ |
||||
var src = image.data; |
||||
width = image.width, |
||||
height = image.height; |
||||
var dst = new Uint8Array(width * height); |
||||
var srcLength = src.length | 0, srcLength_16 = (srcLength - 16) | 0; |
||||
|
||||
for (var i = 0, j = 0; i <= srcLength_16; i += 16, j += 4) { |
||||
// convert to grayscale 4 pixels at a time; eveything with alpha gets put in front of 50% gray
|
||||
dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16 |
||||
dst[j+1] = (((src[i+4] * 77 + src[i+5] * 151 + src[i+6] * 28) * src[i+7]) + ((255-src[i+7]) << 15) + 32768) >> 16 |
||||
dst[j+2] = (((src[i+8] * 77 + src[i+9] * 151 + src[i+10] * 28) * src[i+11]) + ((255-src[i+11]) << 15) + 32768) >> 16 |
||||
dst[j+3] = (((src[i+12] * 77 + src[i+13] * 151 + src[i+14] * 28) * src[i+15]) + ((255-src[i+15]) << 15) + 32768) >> 16 |
||||
} |
||||
for (; i < srcLength; i += 4, ++j) //finish up
|
||||
dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16 |
||||
image = dst; |
||||
} else { throw 'Invalid ImageData' } |
||||
return image |
||||
} |
@ -0,0 +1,164 @@
@@ -0,0 +1,164 @@
|
||||
module.exports = function DumpLiterallyEverything(Module, base){ |
||||
var ri = base.GetIterator(); |
||||
var blocks = []; |
||||
var block, para, textline, word, symbol; |
||||
|
||||
function enumToString(value, prefix){ |
||||
return (Object.keys(Module) |
||||
.filter(function(e){ return e.substr(0, prefix.length + 1) == prefix + '_' }) |
||||
.filter(function(e){ return Module[e] === value }) |
||||
.map(function(e){ return e.slice(prefix.length + 1) })[0]) |
||||
} |
||||
|
||||
ri.Begin() |
||||
do { |
||||
if(ri.IsAtBeginningOf(Module.RIL_BLOCK)){ |
||||
var poly = ri.BlockPolygon(); |
||||
var polygon = null; |
||||
// BlockPolygon() returns null when automatic page segmentation is off
|
||||
if(Module.getPointer(poly) > 0){ |
||||
var n = poly.get_n(), |
||||
px = poly.get_x(), |
||||
py = poly.get_y(), |
||||
polygon = []; |
||||
for(var i = 0; i < n; i++){ |
||||
polygon.push([px.getValue(i), py.getValue(i)]); |
||||
} |
||||
Module._ptaDestroy(Module.getPointer(poly)); |
||||
} |
||||
|
||||
block = { |
||||
paragraphs: [], |
||||
|
||||
text: ri.GetUTF8Text(Module.RIL_BLOCK), |
||||
confidence: ri.Confidence(Module.RIL_BLOCK), |
||||
baseline: ri.getBaseline(Module.RIL_BLOCK), |
||||
bbox: ri.getBoundingBox(Module.RIL_BLOCK), |
||||
|
||||
blocktype: enumToString(ri.BlockType(), 'PT'), |
||||
polygon: polygon |
||||
} |
||||
blocks.push(block) |
||||
} |
||||
if(ri.IsAtBeginningOf(Module.RIL_PARA)){ |
||||
para = { |
||||
lines: [], |
||||
|
||||
text: ri.GetUTF8Text(Module.RIL_PARA), |
||||
confidence: ri.Confidence(Module.RIL_PARA), |
||||
baseline: ri.getBaseline(Module.RIL_PARA), |
||||
bbox: ri.getBoundingBox(Module.RIL_PARA), |
||||
|
||||
is_ltr: !!ri.ParagraphIsLtr() |
||||
} |
||||
block.paragraphs.push(para) |
||||
} |
||||
if(ri.IsAtBeginningOf(Module.RIL_TEXTLINE)){ |
||||
textline = { |
||||
words: [], |
||||
|
||||
text: ri.GetUTF8Text(Module.RIL_TEXTLINE), |
||||
confidence: ri.Confidence(Module.RIL_TEXTLINE), |
||||
baseline: ri.getBaseline(Module.RIL_TEXTLINE), |
||||
bbox: ri.getBoundingBox(Module.RIL_TEXTLINE) |
||||
} |
||||
para.lines.push(textline) |
||||
} |
||||
if(ri.IsAtBeginningOf(Module.RIL_WORD)){ |
||||
var fontInfo = ri.getWordFontAttributes(), |
||||
wordDir = ri.WordDirection(); |
||||
word = { |
||||
symbols: [], |
||||
choices: [], |
||||
|
||||
text: ri.GetUTF8Text(Module.RIL_WORD), |
||||
confidence: ri.Confidence(Module.RIL_WORD), |
||||
baseline: ri.getBaseline(Module.RIL_WORD), |
||||
bbox: ri.getBoundingBox(Module.RIL_WORD), |
||||
|
||||
is_numeric: !!ri.WordIsNumeric(), |
||||
in_dictionary: !!ri.WordIsFromDictionary(), |
||||
direction: enumToString(wordDir, 'DIR'), |
||||
language: ri.WordRecognitionLanguage(), |
||||
|
||||
is_bold: fontInfo.is_bold, |
||||
is_italic: fontInfo.is_italic, |
||||
is_underlined: fontInfo.is_underlined, |
||||
is_monospace: fontInfo.is_monospace, |
||||
is_serif: fontInfo.is_serif, |
||||
is_smallcaps: fontInfo.is_smallcaps, |
||||
font_size: fontInfo.pointsize, |
||||
font_id: fontInfo.font_id, |
||||
font_name: fontInfo.font_name, |
||||
} |
||||
var wc = new Module.WordChoiceIterator(ri); |
||||
do { |
||||
word.choices.push({ |
||||
text: wc.GetUTF8Text(), |
||||
confidence: wc.Confidence() |
||||
}) |
||||
} while (wc.Next()); |
||||
Module.destroy(wc) |
||||
textline.words.push(word) |
||||
} |
||||
|
||||
var image = null; |
||||
// var pix = ri.GetBinaryImage(Module.RIL_SYMBOL)
|
||||
// var image = pix2array(pix);
|
||||
// // for some reason it seems that things stop working if you destroy pics
|
||||
// Module._pixDestroy(Module.getPointer(pix));
|
||||
if(ri.IsAtBeginningOf(Module.RIL_SYMBOL)){ |
||||
symbol = { |
||||
choices: [], |
||||
image: image, |
||||
|
||||
text: ri.GetUTF8Text(Module.RIL_SYMBOL), |
||||
confidence: ri.Confidence(Module.RIL_SYMBOL), |
||||
baseline: ri.getBaseline(Module.RIL_SYMBOL), |
||||
bbox: ri.getBoundingBox(Module.RIL_SYMBOL), |
||||
|
||||
is_superscript: !!ri.SymbolIsSuperscript(), |
||||
is_subscript: !!ri.SymbolIsSubscript(), |
||||
is_dropcap: !!ri.SymbolIsDropcap(), |
||||
} |
||||
word.symbols.push(symbol) |
||||
var ci = new Module.ChoiceIterator(ri); |
||||
do { |
||||
symbol.choices.push({ |
||||
text: ci.GetUTF8Text(), |
||||
confidence: ci.Confidence() |
||||
}) |
||||
} while (ci.Next()); |
||||
Module.destroy(ci) |
||||
} |
||||
} while (ri.Next(Module.RIL_SYMBOL)); |
||||
Module.destroy(ri) |
||||
|
||||
return { |
||||
text: base.GetUTF8Text(), |
||||
html: deindent(base.GetHOCRText()), |
||||
|
||||
confidence: base.MeanTextConf(), |
||||
|
||||
blocks: blocks, |
||||
|
||||
psm: enumToString(base.GetPageSegMode(), 'PSM'), |
||||
oem: enumToString(base.oem(), 'OEM'), |
||||
version: base.Version(), |
||||
} |
||||
} |
||||
|
||||
// the generated HOCR is excessively indented, so
|
||||
// we get rid of that indentation
|
||||
|
||||
/**
 * Removes one level (two spaces) of leading indentation from every line of
 * `html`, but only when the first line itself is indented that way.
 *
 * The check compares against a two-space prefix; comparing the two-character
 * prefix with a single space can never match and would turn this function
 * into a no-op.
 *
 * @param {string} html - hOCR markup
 * @returns {string} the markup with the shared two-space indent removed
 */
function deindent(html) {
  const INDENT = '  ';
  const lines = html.split('\n');
  if (!lines[0].startsWith(INDENT)) {
    return html;
  }
  return lines
    .map((line) => (line.startsWith(INDENT) ? line.slice(INDENT.length) : line))
    .join('\n');
}
@ -0,0 +1,81 @@
@@ -0,0 +1,81 @@
|
||||
const adapter = require('../node/index.js') |
||||
|
||||
let jobCounter = 0; |
||||
|
||||
module.exports = class TesseractJob { |
||||
constructor(instance){ |
||||
this.id = 'Job-' + (++jobCounter) + '-' + Math.random().toString(16).slice(3, 8) |
||||
|
||||
this._instance = instance; |
||||
this._resolve = [] |
||||
this._reject = [] |
||||
this._progress = [] |
||||
this._finally = [] |
||||
} |
||||
|
||||
then(resolve, reject){ |
||||
if(this._resolve.push){ |
||||
this._resolve.push(resolve) |
||||
}else{ |
||||
resolve(this._resolve) |
||||
} |
||||
|
||||
if(reject) this.catch(reject); |
||||
return this; |
||||
} |
||||
catch(reject){ |
||||
if(this._reject.push){ |
||||
this._reject.push(reject) |
||||
}else{ |
||||
reject(this._reject) |
||||
} |
||||
return this; |
||||
} |
||||
progress(fn){ |
||||
this._progress.push(fn) |
||||
return this; |
||||
} |
||||
finally(fn) { |
||||
this._finally.push(fn) |
||||
return this; |
||||
} |
||||
_send(action, payload){ |
||||
adapter.sendPacket(this._instance, { |
||||
jobId: this.id, |
||||
action: action, |
||||
payload: payload |
||||
}) |
||||
} |
||||
|
||||
_handle(packet){ |
||||
var data = packet.data; |
||||
let runFinallyCbs = false; |
||||
|
||||
if(packet.status === 'resolve'){ |
||||
if(this._resolve.length === 0) console.log(data); |
||||
this._resolve.forEach(fn => { |
||||
var ret = fn(data); |
||||
if(ret && typeof ret.then == 'function'){ |
||||
console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.') |
||||
} |
||||
}) |
||||
this._resolve = data; |
||||
this._instance._dequeue() |
||||
runFinallyCbs = true; |
||||
}else if(packet.status === 'reject'){ |
||||
if(this._reject.length === 0) console.error(data); |
||||
this._reject.forEach(fn => fn(data)) |
||||
this._reject = data; |
||||
this._instance._dequeue() |
||||
runFinallyCbs = true; |
||||
}else if(packet.status === 'progress'){ |
||||
this._progress.forEach(fn => fn(data)) |
||||
}else{ |
||||
console.warn('Message type unknown', packet.status) |
||||
} |
||||
|
||||
if (runFinallyCbs) { |
||||
this._finally.forEach(fn => fn(data)); |
||||
} |
||||
} |
||||
} |
@ -0,0 +1 @@
@@ -0,0 +1 @@
|
||||
{"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441, "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656, "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719, "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130, "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221, "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615, "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473, "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571, "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944, "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087, "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610, "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680, "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338, "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244, "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246, "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566, "vie": 2195922} |
@ -0,0 +1,165 @@
@@ -0,0 +1,165 @@
|
||||
var latestJob, |
||||
Module, |
||||
base, |
||||
adapter = {}, |
||||
dump = require('./dump.js'), |
||||
desaturate = require('./desaturate.js'); |
||||
|
||||
function dispatchHandlers(packet, send){ |
||||
function respond(status, data){ |
||||
send({ |
||||
jobId: packet.jobId, |
||||
status, |
||||
action: packet.action, |
||||
data |
||||
}); |
||||
} |
||||
respond.resolve = respond.bind(this, 'resolve'); |
||||
respond.reject = respond.bind(this, 'reject'); |
||||
respond.progress = respond.bind(this, 'progress'); |
||||
|
||||
latestJob = respond; |
||||
|
||||
try { |
||||
if(packet.action === 'recognize'){ |
||||
handleRecognize(packet.payload, respond); |
||||
} else if (packet.action === 'detect'){ |
||||
handleDetect(packet.payload, respond); |
||||
} |
||||
} catch (err) { |
||||
// Prepare exception to travel through postMessage
|
||||
err = err.toString(); |
||||
|
||||
respond.reject(err) |
||||
} |
||||
} |
||||
exports.dispatchHandlers = dispatchHandlers; |
||||
|
||||
exports.setAdapter = function setAdapter(impl){ |
||||
adapter = impl; |
||||
}; |
||||
|
||||
|
||||
/**
 * Makes sure a tesseract module with enough memory exists for the requested
 * language, creating (or re-creating) `Module` and `base` when necessary.
 *
 * Nothing happens when a module already exists and its TOTAL_MEMORY is not
 * below the required minimum.
 *
 * @param {object} req - reads `req.options.lang`; also passed to the adapter
 * @param {object} res - responder used for progress reports
 */
function handleInit(req, res) {
  // chi_sim, chi_tra and jpn get the larger of the two memory budgets.
  const needsLargeHeap = ['chi_sim', 'chi_tra', 'jpn'].includes(req.options.lang);
  const MIN_MEMORY = needsLargeHeap ? 167772160 : 100663296;

  if (Module && !(Module.TOTAL_MEMORY < MIN_MEMORY)) {
    return;
  }

  const Core = adapter.getCore(req, res);
  const status = 'initializing tesseract';

  res.progress({ status, progress: 0 });

  Module = Core({
    TOTAL_MEMORY: MIN_MEMORY,
    TesseractProgress(percent) {
      // Rescales the reported percentage so that 30 maps to 0 and 100 to 1.
      latestJob.progress({ status: 'recognizing text', progress: Math.max(0, (percent - 30) / 70) });
    },
    onRuntimeInitialized() {},
  });

  Module.FS_createPath('/', 'tessdata', true, true);
  base = new Module.TessBaseAPI();
  res.progress({ status, progress: 1 });
}
||||
|
||||
/**
 * Converts `image` to grayscale, copies it into the module's memory and
 * registers it (with a rectangle covering the whole image) on `base`.
 *
 * @param {object} Module - provides `allocate`, `ALLOC_NORMAL`, `wrapPointer`
 * @param {object} base - provides `SetImage` and `SetRectangle`
 * @param {object} image - reads `width` and `height`; passed to desaturate
 * @returns {*} the pointer returned by `Module.allocate`; the caller frees it
 */
function setImage(Module, base, image) {
  const imgbin = desaturate(image);
  const { width, height } = image;

  const ptr = Module.allocate(imgbin, 'i8', Module.ALLOC_NORMAL);
  const wrapped = Module.wrapPointer(ptr);
  // One byte per pixel, so each row is `width` bytes long.
  base.SetImage(wrapped, width, height, 1, width);
  base.SetRectangle(0, 0, width, height);
  return ptr;
}
||||
|
||||
/**
 * Ensures `<lang>.traineddata` exists in the module's `tessdata` directory,
 * obtaining it through the adapter the first time a language is requested.
 *
 * @param {object} req - reads `req.options.lang`; passed on to the adapter
 * @param {object} res - responder used for progress reports
 * @param {function(): void} cb - called once the language file is available
 */
function loadLanguage(req, res, cb) {
  const { lang } = req.options;
  const langFile = lang + '.traineddata';

  // Languages already written to this module's file system are remembered
  // on the module itself.
  if (!Module._loadedLanguages) {
    Module._loadedLanguages = {};
  }
  if (lang in Module._loadedLanguages) {
    return cb();
  }

  const status = 'loading ' + langFile;
  adapter.getLanguageData(req, res, (data) => {
    res.progress({ status, progress: 0 });
    Module.FS_createDataFile('tessdata', langFile, data, true, false);
    Module._loadedLanguages[lang] = true;
    res.progress({ status, progress: 1 });
    cb();
  });
}
||||
|
||||
|
||||
|
||||
/**
 * Handles a `recognize` job: initializes the core, loads the language, runs
 * recognition on `req.image` and resolves with the dumped result.
 *
 * The recognition step runs inside a callback that may fire long after this
 * function has returned, so the caller's try/catch cannot see its failures.
 * They are therefore caught here and reported with `res.reject`; the API is
 * ended and the image buffer freed on success and on failure alike.
 *
 * @param {object} req - reads `req.options` (incl. `lang`) and `req.image`
 * @param {object} res - responder exposing `progress`, `resolve`, `reject`
 */
function handleRecognize(req, res) {
  handleInit(req, res);

  loadLanguage(req, res, () => {
    const options = req.options;
    const progressUpdate = (progress) => {
      res.progress({ status: 'initializing api', progress });
    };

    let ptr = null;
    let result;
    try {
      progressUpdate(0);
      base.Init(null, options.lang);
      progressUpdate(0.3);

      // Every own option is forwarded as a tesseract variable.
      Object.keys(options).forEach((name) => {
        base.SetVariable(name, options[name]);
      });

      progressUpdate(0.6);
      ptr = setImage(Module, base, req.image);
      progressUpdate(1);

      base.Recognize(null);
      result = dump(Module, base);
    } catch (err) {
      // Stringified so the error survives postMessage.
      res.reject(String(err));
      return;
    } finally {
      base.End();
      if (ptr !== null) Module._free(ptr);
    }

    res.resolve(result);
  });
}
||||
|
||||
|
||||
/**
 * Handles a `detect` job: runs orientation and script detection on
 * `req.image` using the `osd` data and resolves with the best result.
 *
 * All values are read from the detection results BEFORE the API is ended,
 * and failures inside the (possibly asynchronous) callback are reported with
 * `res.reject` instead of escaping. The API is ended and the image buffer
 * freed on every path.
 *
 * @param {object} req - `req.options.lang` is overwritten with 'osd';
 *   reads `req.image`
 * @param {object} res - responder exposing `resolve` and `reject`
 */
function handleDetect(req, res) {
  handleInit(req, res);
  req.options.lang = 'osd';

  loadLanguage(req, res, () => {
    let ptr = null;
    let detection = null;
    try {
      base.Init(null, 'osd');
      base.SetPageSegMode(Module.PSM_OSD_ONLY);

      ptr = setImage(Module, base, req.image);
      const results = new Module.OSResults();

      if (base.DetectOS(results)) {
        const best = results.get_best_result();
        const oid = best.get_orientation_id();
        const sid = best.get_script_id();

        detection = {
          tesseract_script_id: sid,
          script: results.get_unicharset().get_script_from_script_id(sid),
          script_confidence: best.get_sconfidence(),
          // orientation ids 0..3 map to these clockwise angles
          orientation_degrees: [0, 270, 180, 90][oid],
          orientation_confidence: best.get_oconfidence(),
        };
      }
    } catch (err) {
      // Stringified so the error survives postMessage.
      res.reject(String(err));
      return;
    } finally {
      base.End();
      if (ptr !== null) Module._free(ptr);
    }

    if (detection === null) {
      res.reject('Failed to detect OS');
    } else {
      res.resolve(detection);
    }
  });
}
@ -1,12 +0,0 @@
@@ -1,12 +0,0 @@
|
||||
/* |
||||
* OEM = OCR Engine Mode, and there are 4 possible modes. |
||||
* |
||||
* By default tesseract.js uses LSTM_ONLY mode. |
||||
* |
||||
*/ |
||||
module.exports = { |
||||
TESSERACT_ONLY: 0, |
||||
LSTM_ONLY: 1, |
||||
TESSERACT_LSTM_COMBINED: 2, |
||||
DEFAULT: 3, |
||||
}; |
@ -1,19 +0,0 @@
@@ -1,19 +0,0 @@
|
||||
/* |
||||
* PSM = Page Segmentation Mode |
||||
*/ |
||||
module.exports = { |
||||
OSD_ONLY: '0', |
||||
AUTO_OSD: '1', |
||||
AUTO_ONLY: '2', |
||||
AUTO: '3', |
||||
SINGLE_COLUMN: '4', |
||||
SINGLE_BLOCK_VERT_TEXT: '5', |
||||
SINGLE_BLOCK: '6', |
||||
SINGLE_LINE: '7', |
||||
SINGLE_WORD: '8', |
||||
CIRCLE_WORD: '9', |
||||
SINGLE_CHAR: '10', |
||||
SPARSE_TEXT: '11', |
||||
SPARSE_TEXT_OSD: '12', |
||||
RAW_LINE: '13', |
||||
}; |
@ -1,5 +0,0 @@
@@ -1,5 +0,0 @@
|
||||
const OEM = require('./OEM'); |
||||
|
||||
// Engine mode used when a caller does not pick one explicitly.
const defaultOEM = OEM.DEFAULT;

module.exports = { defaultOEM };
@ -1,13 +0,0 @@
@@ -1,13 +0,0 @@
|
||||
module.exports = { |
||||
/* |
||||
* default path for downloading *.traineddata |
||||
*/ |
||||
langPath: 'https://tessdata.projectnaptha.com/4.0.0', |
||||
/* |
||||
* Use BlobURL for worker script by default |
||||
* TODO: remove this option |
||||
* |
||||
*/ |
||||
workerBlobURL: true, |
||||
logger: () => {}, |
||||
}; |
@ -1,218 +0,0 @@
@@ -1,218 +0,0 @@
|
||||
/* |
||||
* languages with existing tesseract traineddata |
||||
* https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
|
||||
*/ |
||||
|
||||
/** |
||||
* @typedef {object} Languages |
||||
* @property {string} AFR Afrikaans |
||||
* @property {string} AMH Amharic |
||||
* @property {string} ARA Arabic |
||||
* @property {string} ASM Assamese |
||||
* @property {string} AZE Azerbaijani |
||||
* @property {string} AZE_CYRL Azerbaijani - Cyrillic |
||||
* @property {string} BEL Belarusian |
||||
* @property {string} BEN Bengali |
||||
* @property {string} BOD Tibetan |
||||
* @property {string} BOS Bosnian |
||||
* @property {string} BUL Bulgarian |
||||
* @property {string} CAT Catalan; Valencian |
||||
* @property {string} CEB Cebuano |
||||
* @property {string} CES Czech |
||||
* @property {string} CHI_SIM Chinese - Simplified |
||||
* @property {string} CHI_TRA Chinese - Traditional |
||||
* @property {string} CHR Cherokee |
||||
* @property {string} CYM Welsh |
||||
* @property {string} DAN Danish |
||||
* @property {string} DEU German |
||||
* @property {string} DZO Dzongkha |
||||
* @property {string} ELL Greek, Modern (1453-) |
||||
* @property {string} ENG English |
||||
* @property {string} ENM English, Middle (1100-1500) |
||||
* @property {string} EPO Esperanto |
||||
* @property {string} EST Estonian |
||||
* @property {string} EUS Basque |
||||
* @property {string} FAS Persian |
||||
* @property {string} FIN Finnish |
||||
* @property {string} FRA French |
||||
* @property {string} FRK German Fraktur |
||||
* @property {string} FRM French, Middle (ca. 1400-1600) |
||||
* @property {string} GLE Irish |
||||
* @property {string} GLG Galician |
||||
* @property {string} GRC Greek, Ancient (-1453) |
||||
* @property {string} GUJ Gujarati |
||||
* @property {string} HAT Haitian; Haitian Creole |
||||
* @property {string} HEB Hebrew |
||||
* @property {string} HIN Hindi |
||||
* @property {string} HRV Croatian |
||||
* @property {string} HUN Hungarian |
||||
* @property {string} IKU Inuktitut |
||||
* @property {string} IND Indonesian |
||||
* @property {string} ISL Icelandic |
||||
* @property {string} ITA Italian |
||||
* @property {string} ITA_OLD Italian - Old |
||||
* @property {string} JAV Javanese |
||||
* @property {string} JPN Japanese |
||||
* @property {string} KAN Kannada |
||||
* @property {string} KAT Georgian |
||||
* @property {string} KAT_OLD Georgian - Old |
||||
* @property {string} KAZ Kazakh |
||||
* @property {string} KHM Central Khmer |
||||
* @property {string} KIR Kirghiz; Kyrgyz |
||||
* @property {string} KOR Korean |
||||
* @property {string} KUR Kurdish |
||||
* @property {string} LAO Lao |
||||
* @property {string} LAT Latin |
||||
* @property {string} LAV Latvian |
||||
* @property {string} LIT Lithuanian |
||||
* @property {string} MAL Malayalam |
||||
* @property {string} MAR Marathi |
||||
* @property {string} MKD Macedonian |
||||
* @property {string} MLT Maltese |
||||
* @property {string} MSA Malay |
||||
* @property {string} MYA Burmese |
||||
* @property {string} NEP Nepali |
||||
* @property {string} NLD Dutch; Flemish |
||||
* @property {string} NOR Norwegian |
||||
* @property {string} ORI Oriya |
||||
* @property {string} PAN Panjabi; Punjabi |
||||
* @property {string} POL Polish |
||||
* @property {string} POR Portuguese |
||||
* @property {string} PUS Pushto; Pashto |
||||
* @property {string} RON Romanian; Moldavian; Moldovan |
||||
* @property {string} RUS Russian |
||||
* @property {string} SAN Sanskrit |
||||
* @property {string} SIN Sinhala; Sinhalese |
||||
* @property {string} SLK Slovak |
||||
* @property {string} SLV Slovenian |
||||
* @property {string} SPA Spanish; Castilian |
||||
* @property {string} SPA_OLD Spanish; Castilian - Old |
||||
* @property {string} SQI Albanian |
||||
* @property {string} SRP Serbian |
||||
* @property {string} SRP_LATN Serbian - Latin |
||||
* @property {string} SWA Swahili |
||||
* @property {string} SWE Swedish |
||||
* @property {string} SYR Syriac |
||||
* @property {string} TAM Tamil |
||||
* @property {string} TEL Telugu |
||||
* @property {string} TGK Tajik |
||||
* @property {string} TGL Tagalog |
||||
* @property {string} THA Thai |
||||
* @property {string} TIR Tigrinya |
||||
* @property {string} TUR Turkish |
||||
* @property {string} UIG Uighur; Uyghur |
||||
* @property {string} UKR Ukrainian |
||||
* @property {string} URD Urdu |
||||
* @property {string} UZB Uzbek |
||||
* @property {string} UZB_CYRL Uzbek - Cyrillic |
||||
* @property {string} VIE Vietnamese |
||||
* @property {string} YID Yiddish |
||||
*/ |
||||
|
||||
/** |
||||
* @type {Languages} |
||||
*/ |
||||
module.exports = { |
||||
AFR: 'afr', |
||||
AMH: 'amh', |
||||
ARA: 'ara', |
||||
ASM: 'asm', |
||||
AZE: 'aze', |
||||
AZE_CYRL: 'aze_cyrl', |
||||
BEL: 'bel', |
||||
BEN: 'ben', |
||||
BOD: 'bod', |
||||
BOS: 'bos', |
||||
BUL: 'bul', |
||||
CAT: 'cat', |
||||
CEB: 'ceb', |
||||
CES: 'ces', |
||||
CHI_SIM: 'chi_sim', |
||||
CHI_TRA: 'chi_tra', |
||||
CHR: 'chr', |
||||
CYM: 'cym', |
||||
DAN: 'dan', |
||||
DEU: 'deu', |
||||
DZO: 'dzo', |
||||
ELL: 'ell', |
||||
ENG: 'eng', |
||||
ENM: 'enm', |
||||
EPO: 'epo', |
||||
EST: 'est', |
||||
EUS: 'eus', |
||||
FAS: 'fas', |
||||
FIN: 'fin', |
||||
FRA: 'fra', |
||||
FRK: 'frk', |
||||
FRM: 'frm', |
||||
GLE: 'gle', |
||||
GLG: 'glg', |
||||
GRC: 'grc', |
||||
GUJ: 'guj', |
||||
HAT: 'hat', |
||||
HEB: 'heb', |
||||
HIN: 'hin', |
||||
HRV: 'hrv', |
||||
HUN: 'hun', |
||||
IKU: 'iku', |
||||
IND: 'ind', |
||||
ISL: 'isl', |
||||
ITA: 'ita', |
||||
ITA_OLD: 'ita_old', |
||||
JAV: 'jav', |
||||
JPN: 'jpn', |
||||
KAN: 'kan', |
||||
KAT: 'kat', |
||||
KAT_OLD: 'kat_old', |
||||
KAZ: 'kaz', |
||||
KHM: 'khm', |
||||
KIR: 'kir', |
||||
KOR: 'kor', |
||||
KUR: 'kur', |
||||
LAO: 'lao', |
||||
LAT: 'lat', |
||||
LAV: 'lav', |
||||
LIT: 'lit', |
||||
MAL: 'mal', |
||||
MAR: 'mar', |
||||
MKD: 'mkd', |
||||
MLT: 'mlt', |
||||
MSA: 'msa', |
||||
MYA: 'mya', |
||||
NEP: 'nep', |
||||
NLD: 'nld', |
||||
NOR: 'nor', |
||||
ORI: 'ori', |
||||
PAN: 'pan', |
||||
POL: 'pol', |
||||
POR: 'por', |
||||
PUS: 'pus', |
||||
RON: 'ron', |
||||
RUS: 'rus', |
||||
SAN: 'san', |
||||
SIN: 'sin', |
||||
SLK: 'slk', |
||||
SLV: 'slv', |
||||
SPA: 'spa', |
||||
SPA_OLD: 'spa_old', |
||||
SQI: 'sqi', |
||||
SRP: 'srp', |
||||
SRP_LATN: 'srp_latn', |
||||
SWA: 'swa', |
||||
SWE: 'swe', |
||||
SYR: 'syr', |
||||
TAM: 'tam', |
||||
TEL: 'tel', |
||||
TGK: 'tgk', |
||||
TGL: 'tgl', |
||||
THA: 'tha', |
||||
TIR: 'tir', |
||||
TUR: 'tur', |
||||
UIG: 'uig', |
||||
UKR: 'ukr', |
||||
URD: 'urd', |
||||
UZB: 'uzb', |
||||
UZB_CYRL: 'uzb_cyrl', |
||||
VIE: 'vie', |
||||
YID: 'yid', |
||||
}; |
@ -1,21 +0,0 @@
@@ -1,21 +0,0 @@
|
||||
const getId = require('./utils/getId'); |
||||
|
||||
let jobCounter = 0; |
||||
|
||||
module.exports = ({ |
||||
id: _id, |
||||
action, |
||||
payload = {}, |
||||
}) => { |
||||
let id = _id; |
||||
if (typeof id === 'undefined') { |
||||
id = getId('Job', jobCounter); |
||||
jobCounter += 1; |
||||
} |
||||
|
||||
return { |
||||
id, |
||||
action, |
||||
payload, |
||||
}; |
||||
}; |
@ -1,80 +0,0 @@
@@ -1,80 +0,0 @@
|
||||
const createJob = require('./createJob'); |
||||
const { log } = require('./utils/log'); |
||||
const getId = require('./utils/getId'); |
||||
|
||||
let schedulerCounter = 0; |
||||
|
||||
module.exports = () => { |
||||
const id = getId('Scheduler', schedulerCounter); |
||||
const workers = {}; |
||||
const runningWorkers = {}; |
||||
let jobQueue = []; |
||||
|
||||
schedulerCounter += 1; |
||||
|
||||
const getQueueLen = () => jobQueue.length; |
||||
const getNumWorkers = () => Object.keys(workers).length; |
||||
|
||||
const dequeue = () => { |
||||
if (jobQueue.length !== 0) { |
||||
const wIds = Object.keys(workers); |
||||
for (let i = 0; i < wIds.length; i += 1) { |
||||
if (typeof runningWorkers[wIds[i]] === 'undefined') { |
||||
jobQueue[0](workers[wIds[i]]); |
||||
break; |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
|
||||
const queue = (action, payload) => ( |
||||
new Promise((resolve, reject) => { |
||||
const job = createJob({ action, payload }); |
||||
jobQueue.push(async (w) => { |
||||
jobQueue.shift(); |
||||
runningWorkers[w.id] = job; |
||||
try { |
||||
resolve(await w[action].apply(this, [...payload, job.id])); |
||||
} catch (err) { |
||||
reject(err); |
||||
} finally { |
||||
delete runningWorkers[w.id]; |
||||
dequeue(); |
||||
} |
||||
}); |
||||
log(`[${id}]: Add ${job.id} to JobQueue`); |
||||
log(`[${id}]: JobQueue length=${jobQueue.length}`); |
||||
dequeue(); |
||||
}) |
||||
); |
||||
|
||||
const addWorker = (w) => { |
||||
workers[w.id] = w; |
||||
log(`[${id}]: Add ${w.id}`); |
||||
log(`[${id}]: Number of workers=${getNumWorkers()}`); |
||||
dequeue(); |
||||
return w.id; |
||||
}; |
||||
|
||||
const addJob = async (action, ...payload) => { |
||||
if (getNumWorkers() === 0) { |
||||
throw Error(`[${id}]: You need to have at least one worker before adding jobs`); |
||||
} |
||||
return queue(action, payload); |
||||
}; |
||||
|
||||
const terminate = async () => { |
||||
Object.keys(workers).forEach(async (wid) => { |
||||
await workers[wid].terminate(); |
||||
}); |
||||
jobQueue = []; |
||||
}; |
||||
|
||||
return { |
||||
addWorker, |
||||
addJob, |
||||
terminate, |
||||
getQueueLen, |
||||
getNumWorkers, |
||||
}; |
||||
}; |
@ -1,198 +0,0 @@
@@ -1,198 +0,0 @@
|
||||
const resolvePaths = require('./utils/resolvePaths'); |
||||
const circularize = require('./utils/circularize'); |
||||
const createJob = require('./createJob'); |
||||
const { log } = require('./utils/log'); |
||||
const getId = require('./utils/getId'); |
||||
const { defaultOEM } = require('./constants/config'); |
||||
const { |
||||
defaultOptions, |
||||
spawnWorker, |
||||
terminateWorker, |
||||
onMessage, |
||||
loadImage, |
||||
send, |
||||
} = require('./worker/node'); |
||||
|
||||
let workerCounter = 0; |
||||
|
||||
module.exports = (_options = {}) => { |
||||
const id = getId('Worker', workerCounter); |
||||
const { |
||||
logger, |
||||
errorHandler, |
||||
...options |
||||
} = resolvePaths({ |
||||
...defaultOptions, |
||||
..._options, |
||||
}); |
||||
const resolves = {}; |
||||
const rejects = {}; |
||||
let worker = spawnWorker(options); |
||||
|
||||
workerCounter += 1; |
||||
|
||||
const setResolve = (action, res) => { |
||||
resolves[action] = res; |
||||
}; |
||||
|
||||
const setReject = (action, rej) => { |
||||
rejects[action] = rej; |
||||
}; |
||||
|
||||
const startJob = ({ id: jobId, action, payload }) => ( |
||||
new Promise((resolve, reject) => { |
||||
log(`[${id}]: Start ${jobId}, action=${action}`); |
||||
setResolve(action, resolve); |
||||
setReject(action, reject); |
||||
send(worker, { |
||||
workerId: id, |
||||
jobId, |
||||
action, |
||||
payload, |
||||
}); |
||||
}) |
||||
); |
||||
|
||||
const load = (jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, action: 'load', payload: { options }, |
||||
})) |
||||
); |
||||
|
||||
const writeText = (path, text, jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'FS', |
||||
payload: { method: 'writeFile', args: [path, text] }, |
||||
})) |
||||
); |
||||
|
||||
const readText = (path, jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'FS', |
||||
payload: { method: 'readFile', args: [path, { encoding: 'utf8' }] }, |
||||
})) |
||||
); |
||||
|
||||
const removeFile = (path, jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'FS', |
||||
payload: { method: 'unlink', args: [path] }, |
||||
})) |
||||
); |
||||
|
||||
const FS = (method, args, jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'FS', |
||||
payload: { method, args }, |
||||
})) |
||||
); |
||||
|
||||
const loadLanguage = (langs = 'eng', jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'loadLanguage', |
||||
payload: { langs, options }, |
||||
})) |
||||
); |
||||
|
||||
const initialize = (langs = 'eng', oem = defaultOEM, jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'initialize', |
||||
payload: { langs, oem }, |
||||
})) |
||||
); |
||||
|
||||
const setParameters = (params = {}, jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'setParameters', |
||||
payload: { params }, |
||||
})) |
||||
); |
||||
|
||||
const recognize = async (image, opts = {}, jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'recognize', |
||||
payload: { image: await loadImage(image), options: opts }, |
||||
})) |
||||
); |
||||
|
||||
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'getPDF', |
||||
payload: { title, textonly }, |
||||
})) |
||||
); |
||||
|
||||
const detect = async (image, jobId) => ( |
||||
startJob(createJob({ |
||||
id: jobId, |
||||
action: 'detect', |
||||
payload: { image: await loadImage(image) }, |
||||
})) |
||||
); |
||||
|
||||
const terminate = async () => { |
||||
if (worker !== null) { |
||||
/* |
||||
await startJob(createJob({ |
||||
id: jobId, |
||||
action: 'terminate', |
||||
})); |
||||
*/ |
||||
terminateWorker(worker); |
||||
worker = null; |
||||
} |
||||
return Promise.resolve(); |
||||
}; |
||||
|
||||
onMessage(worker, ({ |
||||
workerId, jobId, status, action, data, |
||||
}) => { |
||||
if (status === 'resolve') { |
||||
log(`[${workerId}]: Complete ${jobId}`); |
||||
let d = data; |
||||
if (action === 'recognize') { |
||||
d = circularize(data); |
||||
} else if (action === 'getPDF') { |
||||
d = Array.from({ ...data, length: Object.keys(data).length }); |
||||
} |
||||
resolves[action]({ jobId, data: d }); |
||||
} else if (status === 'reject') { |
||||
rejects[action](data); |
||||
if (errorHandler) { |
||||
errorHandler(data); |
||||
} else { |
||||
throw Error(data); |
||||
} |
||||
} else if (status === 'progress') { |
||||
logger({ ...data, userJobId: jobId }); |
||||
} |
||||
}); |
||||
|
||||
return { |
||||
id, |
||||
worker, |
||||
setResolve, |
||||
setReject, |
||||
load, |
||||
writeText, |
||||
readText, |
||||
removeFile, |
||||
FS, |
||||
loadLanguage, |
||||
initialize, |
||||
setParameters, |
||||
recognize, |
||||
getPDF, |
||||
detect, |
||||
terminate, |
||||
}; |
||||
}; |
@ -1,231 +0,0 @@
@@ -1,231 +0,0 @@
|
||||
/**
 * Type declarations for the tesseract.js public API.
 */
declare namespace Tesseract {
  function createScheduler(): Scheduler
  function createWorker(options?: Partial<WorkerOptions>): Worker
  function setLogging(logging: boolean): void
  function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult>
  function detect(image: ImageLike, options?: Partial<WorkerOptions>): any

  /** Queues jobs and distributes them over the workers added to it. */
  interface Scheduler {
    addWorker(worker: Worker): string
    addJob(action: string, ...args: any[]): Promise<ConfigResult | RecognizeResult | DetectResult>
    terminate(): Promise<any>
    getQueueLen(): number
    getNumWorkers(): number
  }

  /** Handle returned by createWorker(); every action accepts an optional job id. */
  interface Worker {
    load(jobId?: string): Promise<ConfigResult>
    writeText(path: string, text: string, jobId?: string): Promise<ConfigResult>
    readText(path: string, jobId?: string): Promise<ConfigResult>
    /** Deletes a file from the worker's file system (FS `unlink`). */
    removeFile(path: string, jobId?: string): Promise<ConfigResult>
    FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>
    loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult>
    initialize(langs?: string | Lang[], oem?: OEM, jobId?: string): Promise<ConfigResult>
    setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
    recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult>
    detect(image: ImageLike, jobId?: string): Promise<DetectResult>
    terminate(jobId?: string): Promise<ConfigResult>
    getPDF(title?: string, textonly?: boolean, jobId?: string): Promise<GetPDFResult>
  }

  /** A language given as a code together with its traineddata. */
  interface Lang {
    code: string;
    data: unknown;
  }

  interface WorkerOptions {
    corePath: string
    langPath: string
    cachePath: string
    dataPath: string
    workerPath: string
    cacheMethod: string
    workerBlobURL: boolean
    gzip: boolean
    logger: (arg: any) => void,
    errorHandler: (arg: any) => void
  }

  interface WorkerParams {
    tessedit_ocr_engine_mode: OEM
    tessedit_pageseg_mode: PSM
    tessedit_char_whitelist: string
    preserve_interword_spaces: string
    user_defined_dpi: string
    tessjs_create_hocr: string
    tessjs_create_tsv: string
    tessjs_create_box: string
    tessjs_create_unlv: string
    tessjs_create_osd: string
  }

  interface RecognizeOptions {
    rectangle: Rectangle
  }

  interface ConfigResult {
    jobId: string
    data: any
  }

  interface RecognizeResult {
    jobId: string
    data: Page
  }

  interface GetPDFResult {
    jobId: string
    data: number[]
  }

  interface DetectResult {
    jobId: string
    data: DetectData
  }

  interface DetectData {
    tesseract_script_id: number
    script: string
    script_confidence: number
    orientation_degrees: number
    orientation_confidence: number
  }

  interface Rectangle {
    left: number
    top: number
    width: number
    height: number
  }

  enum OEM {
    TESSERACT_ONLY,
    LSTM_ONLY,
    TESSERACT_LSTM_COMBINED,
    DEFAULT,
  }

  enum PSM {
    OSD_ONLY = '0',
    AUTO_OSD = '1',
    AUTO_ONLY = '2',
    AUTO = '3',
    SINGLE_COLUMN = '4',
    SINGLE_BLOCK_VERT_TEXT = '5',
    SINGLE_BLOCK = '6',
    SINGLE_LINE = '7',
    SINGLE_WORD = '8',
    CIRCLE_WORD = '9',
    SINGLE_CHAR = '10',
    SPARSE_TEXT = '11',
    SPARSE_TEXT_OSD = '12',
    RAW_LINE = '13'
  }

  type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement
    | CanvasRenderingContext2D | File | Blob | ImageData | Buffer;

  interface Block {
    paragraphs: Paragraph[];
    text: string;
    confidence: number;
    baseline: Baseline;
    bbox: Bbox;
    blocktype: string;
    polygon: any;
    page: Page;
    lines: Line[];
    words: Word[];
    symbols: Symbol[];
  }

  interface Baseline {
    x0: number;
    y0: number;
    x1: number;
    y1: number;
    has_baseline: boolean;
  }

  interface Bbox {
    x0: number;
    y0: number;
    x1: number;
    y1: number;
  }

  interface Line {
    words: Word[];
    text: string;
    confidence: number;
    baseline: Baseline;
    bbox: Bbox;
    paragraph: Paragraph;
    block: Block;
    page: Page;
    symbols: Symbol[];
  }

  interface Paragraph {
    lines: Line[];
    text: string;
    confidence: number;
    baseline: Baseline;
    bbox: Bbox;
    is_ltr: boolean;
    block: Block;
    page: Page;
    words: Word[];
    symbols: Symbol[];
  }

  interface Symbol {
    choices: Choice[];
    image: any;
    text: string;
    confidence: number;
    baseline: Baseline;
    bbox: Bbox;
    is_superscript: boolean;
    is_subscript: boolean;
    is_dropcap: boolean;
    word: Word;
    line: Line;
    paragraph: Paragraph;
    block: Block;
    page: Page;
  }

  interface Choice {
    text: string;
    confidence: number;
  }

  interface Word {
    symbols: Symbol[];
    choices: Choice[];
    text: string;
    confidence: number;
    baseline: Baseline;
    bbox: Bbox;
    is_numeric: boolean;
    in_dictionary: boolean;
    direction: string;
    language: string;
    is_bold: boolean;
    is_italic: boolean;
    is_underlined: boolean;
    is_monospace: boolean;
    is_serif: boolean;
    is_smallcaps: boolean;
    font_size: number;
    font_id: number;
    font_name: string;
    line: Line;
    paragraph: Paragraph;
    block: Block;
    page: Page;
  }

  /** Recognition result; the flat lists mirror the nested block structure. */
  interface Page {
    blocks: Block[];
    confidence: number;
    lines: Line[];
    oem: string;
    osd: string;
    paragraphs: Paragraph[];
    psm: string;
    symbols: Symbol[];
    text: string;
    version: string;
    words: Word[];
    hocr: string | null;
    tsv: string | null;
    box: string | null;
    unlv: string | null;
    // NOTE(review): `sd` sits next to `osd` above and may be a typo for it;
    // confirm against the object produced by the worker's dump step.
    sd: string | null;
  }
}
||||
|
||||
export = Tesseract; |
||||
export as namespace Tesseract; |
@ -1,27 +1,75 @@
@@ -1,27 +1,75 @@
|
||||
/** |
||||
* |
||||
* Entry point for tesseract.js, should be the entry when bundling. |
||||
* |
||||
* @fileoverview entry point for tesseract.js |
||||
* @author Kevin Kwok <antimatter15@gmail.com> |
||||
* @author Guillermo Webster <gui@mit.edu> |
||||
* @author Jerome Wu <jeromewus@gmail.com> |
||||
*/ |
||||
require('regenerator-runtime/runtime'); |
||||
const createScheduler = require('./createScheduler'); |
||||
const createWorker = require('./createWorker'); |
||||
const Tesseract = require('./Tesseract'); |
||||
const languages = require('./constants/languages'); |
||||
const OEM = require('./constants/OEM'); |
||||
const PSM = require('./constants/PSM'); |
||||
const { setLogging } = require('./utils/log'); |
||||
|
||||
module.exports = { |
||||
languages, |
||||
OEM, |
||||
PSM, |
||||
createScheduler, |
||||
createWorker, |
||||
setLogging, |
||||
...Tesseract, |
||||
}; |
||||
const adapter = require('./node/index.js') |
||||
const circularize = require('./common/circularize.js') |
||||
const TesseractJob = require('./common/job'); |
||||
const version = require('../package.json').version; |
||||
|
||||
/**
 * Build a TesseractWorker whose options are the adapter defaults overlaid
 * with `workerOptions`. The instance also carries `create` (this factory)
 * and `version` (taken from package.json).
 *
 * @param {Object} [workerOptions={}] - overrides for adapter.defaultOptions
 * @returns {TesseractWorker} the configured worker
 */
const create = function (workerOptions = {}) {
  const mergedOptions = Object.assign({}, adapter.defaultOptions, workerOptions);
  const instance = new TesseractWorker(mergedOptions);
  instance.create = create;
  instance.version = version;
  return instance;
};
||||
|
||||
/**
 * Front-end for one lazily spawned worker that runs jobs one at a time.
 *
 * Jobs are queued by `_delay`; only the job stored in `_currentJob` receives
 * the packets delivered through `_recv`.
 */
class TesseractWorker {
  /**
   * @param {Object} workerOptions - options handed to the adapter and sent
   *   along with every job
   */
  constructor(workerOptions) {
    this.worker = null;
    this.workerOptions = workerOptions;
    this._currentJob = null;
    this._queue = [];
  }

  /**
   * Queue a recognition job.
   * @param {*} image - image passed through to the worker
   * @param {Object|string} [options={}] - options object, or a language
   *   string used as `{ lang }`; `lang` falls back to 'eng'
   * @returns {TesseractJob} the queued job
   */
  recognize(image, options = {}) {
    return this._delay((job) => {
      // Work on a copy so the caller's options object is never modified.
      const jobOptions = typeof options === 'string'
        ? { lang: options }
        : Object.assign({}, options);
      jobOptions.lang = jobOptions.lang || 'eng';

      job._send('recognize', { image, options: jobOptions, workerOptions: this.workerOptions });
    });
  }

  /**
   * Queue a detection job.
   * @param {*} image - image passed through to the worker
   * @param {Object} [options={}] - options passed through to the worker
   * @returns {TesseractJob} the queued job
   */
  detect(image, options = {}) {
    return this._delay((job) => {
      job._send('detect', { image, options, workerOptions: this.workerOptions });
    });
  }

  /** Terminate the spawned worker (if any) and forget all queued jobs. */
  terminate() {
    if (this.worker) adapter.terminateWorker(this);
    this.worker = null;
    this._currentJob = null;
    this._queue = [];
  }

  /**
   * Create a job, queue `fn` to run with it, and start it right away when
   * no job is currently running. Spawns the worker on first use.
   * @param {function(TesseractJob): void} fn - starts the job
   * @returns {TesseractJob} the created job
   */
  _delay(fn) {
    if (!this.worker) this.worker = adapter.spawnWorker(this, this.workerOptions);

    const job = new TesseractJob(this);
    this._queue.push(() => {
      this._queue.shift();
      this._currentJob = job;
      fn(job);
    });
    if (!this._currentJob) this._dequeue();
    return job;
  }

  /** Clear the current job and start the next queued one, if any. */
  _dequeue() {
    this._currentJob = null;
    if (this._queue.length) {
      this._queue[0]();
    }
  }

  /**
   * Deliver a packet from the worker to the current job.
   * @param {Object} packet - message with `jobId`, `status`, `action`, `data`
   */
  _recv(packet) {
    if (packet.status === 'resolve' && packet.action === 'recognize') {
      packet.data = circularize(packet.data);
    }

    // A packet may arrive when no job is current (e.g. after terminate());
    // treat it like any other packet for an unknown job instead of throwing.
    const current = this._currentJob;
    if (current && current.id === packet.jobId) {
      current._handle(packet);
    } else {
      console.warn('Job ID ' + packet.jobId + ' not known.');
    }
  }
}
||||
|
||||
module.exports = create(); |
||||
|
@ -0,0 +1,89 @@
@@ -0,0 +1,89 @@
|
||||
const fetch = require('isomorphic-fetch'), |
||||
isURL = require('is-url'), |
||||
fork = require('child_process').fork, |
||||
fs = require('fs'); |
||||
|
||||
exports.defaultOptions = { |
||||
workerPath: require('path').join(__dirname, 'worker.js'), |
||||
langPath: 'https://tessdata.projectnaptha.com/3.02/', |
||||
} |
||||
|
||||
exports.spawnWorker = function spawnWorker(instance, workerOptions){ |
||||
var cp = fork(workerOptions.workerPath); |
||||
cp.on('message', packet => { |
||||
instance._recv(packet); |
||||
}); |
||||
return cp; |
||||
} |
||||
|
||||
exports.terminateWorker = function(instance){ |
||||
instance.worker.kill(); |
||||
} |
||||
|
||||
exports.sendPacket = function sendPacket(instance, packet){ |
||||
loadImage(packet.payload.image, img => { |
||||
packet.payload.image = img; |
||||
instance.worker.send(packet); |
||||
}); |
||||
} |
||||
|
||||
|
||||
function loadImage(image, cb){ |
||||
|
||||
if(typeof image === 'string'){ |
||||
if (isURL(image)) { |
||||
fetch(image) |
||||
.then(resp => resp.buffer()) |
||||
.then(buffer => loadImage(buffer, cb)) |
||||
.catch(err => console.error(err)); |
||||
} else { |
||||
fs.readFile(image, function(err, buffer){ |
||||
if (err) throw err; |
||||
loadImage(buffer, cb); |
||||
}); |
||||
} |
||||
return; |
||||
} else if (image instanceof Buffer){ |
||||
var mime = require('file-type')(image).mime |
||||
|
||||
if(mime === 'image/png'){ |
||||
var PNGReader = require('png.js'); |
||||
var reader = new PNGReader(image); |
||||
reader.parse(function(err, png){ |
||||
if (err) throw err; |
||||
|
||||
var image = { |
||||
width: png.getWidth(), |
||||
height: png.getHeight() |
||||
} |
||||
image.data = new Uint8Array(image.width * image.height * 4) |
||||
for(var j = 0; j < image.height; j++){ |
||||
for(var i = 0; i < image.width; i++){ |
||||
var offset = 4 * (i + j * image.width), |
||||
pix = png.getPixel(i, j); |
||||
|
||||
image.data[offset] = pix[0]; |
||||
image.data[offset + 1] = pix[1]; |
||||
image.data[offset + 2] = pix[2]; |
||||
image.data[offset + 3] = pix[3]; |
||||
} |
||||
} |
||||
loadImage(image, cb); |
||||
}); |
||||
return; |
||||
} else if (mime === 'image/jpeg'){ |
||||
loadImage(require('jpeg-js').decode(image), cb); |
||||
return; |
||||
} |
||||
|
||||
// TODO: support for TIFF, NetPBM, BMP, etc.
|
||||
} |
||||
|
||||
// node uses json.stringify for ipc which means we need to turn
|
||||
// fancy arrays into raw arrays
|
||||
if(image && image.data && image.data.length && !Array.isArray(image.data)){ |
||||
image.data = Array.from(image.data); |
||||
return loadImage(image, cb) |
||||
} |
||||
cb(image); |
||||
} |
@ -0,0 +1,47 @@
@@ -0,0 +1,47 @@
|
||||
const https = require("https"), |
||||
http = require("http"), |
||||
zlib = require("zlib"), |
||||
fs = require("fs"), |
||||
path = require("path"), |
||||
isURL = require("is-url"); |
||||
|
||||
var langdata = require('../common/langdata.json') |
||||
|
||||
/**
 * Provide the traineddata of `req.options.lang` to `cb` as a Uint8Array.
 *
 * The data is read from disk when present. Otherwise the gzipped file is
 * downloaded from `req.workerOptions.langPath`, unpacked to
 * `<lang>.traineddata` in the current working directory, and the lookup is
 * retried.
 *
 * @param {Object} req - request with `options.lang` and `workerOptions.langPath`
 * @param {Object} res - responder whose `progress()` receives download updates
 * @param {function(Uint8Array): void} cb - receives the language data
 */
function getLanguageData(req, res, cb) {
  const lang = req.options.lang;
  const langfile = lang + '.traineddata.gz';
  const langPath = req.workerOptions.langPath;

  // langPath defaults to a URL where languages can be downloaded. If a custom path is specified
  // and it is a local path, use that instead
  const localPath = isURL(langPath)
    ? lang + '.traineddata'
    : path.join(langPath, lang + '.traineddata');

  const fetchProtocol = langPath.startsWith('http://') ? http : https;

  fs.readFile(localPath, (err, data) => {
    if (!err) return cb(new Uint8Array(data));

    fetchProtocol.get(langPath + langfile, (stream) => {
      let receivedBytes = 0;
      stream.on('data', (chunk) => {
        receivedBytes += chunk.length;
        res.progress({
          status: 'downloading ' + langfile,
          loaded: receivedBytes,
          progress: Math.min(1, receivedBytes / langdata[lang]),
        });
      });

      const writer = fs.createWriteStream(lang + '.traineddata');
      stream.pipe(zlib.createGunzip()).pipe(writer);
      // Retry only once the file is completely written ('finish' of the file
      // stream, not 'end' of gunzip), and keep passing the real responder so
      // progress reporting still works on the retry.
      writer.on('finish', () => {
        getLanguageData(req, res, cb);
      });
    });
  });
}
||||
|
||||
|
||||
module.exports = getLanguageData; |
@ -0,0 +1,19 @@
@@ -0,0 +1,19 @@
|
||||
const workerUtils = require('../common/worker.js') |
||||
|
||||
// Hand every packet received from the parent process to the shared
// dispatcher; responses are sent back over the same IPC channel.
process.on('message', (packet) => {
  const reply = (obj) => process.send(obj);
  workerUtils.dispatchHandlers(packet, reply);
});
||||
|
||||
var TesseractCore; |
||||
exports.getCore = function(req, res){ |
||||
if(!TesseractCore){ |
||||
res.progress({ status: 'loading tesseract core' }) |
||||
TesseractCore = require('tesseract.js-core') |
||||
res.progress({ status: 'loaded tesseract core' }) |
||||
} |
||||
return TesseractCore |
||||
} |
||||
|
||||
exports.getLanguageData = require('./lang.js') |
||||
|
||||
workerUtils.setAdapter(module.exports); |
@ -1,54 +0,0 @@
@@ -1,54 +0,0 @@
|
||||
/** |
||||
* The recognition result of tesseract contains a |
||||
* deep JSON object holding the details. |
||||
* |
||||
* The result of dump.js is a big JSON tree |
||||
* which can be easily serialized (for instance |
||||
* to be sent from a webworker to the main app |
||||
* or through Node's IPC), but we want |
||||
* a (circular) DOM-like interface for walking |
||||
* through the data. |
||||
* |
||||
* @fileoverview DOM-like interface for walking through data |
||||
* @author Kevin Kwok <antimatter15@gmail.com> |
||||
* @author Guillermo Webster <gui@mit.edu> |
||||
* @author Jerome Wu <jeromewus@gmail.com> |
||||
*/ |
||||
|
||||
module.exports = (page) => { |
||||
const blocks = []; |
||||
const paragraphs = []; |
||||
const lines = []; |
||||
const words = []; |
||||
const symbols = []; |
||||
|
||||
page.blocks.forEach((block) => { |
||||
block.paragraphs.forEach((paragraph) => { |
||||
paragraph.lines.forEach((line) => { |
||||
line.words.forEach((word) => { |
||||
word.symbols.forEach((sym) => { |
||||
symbols.push({ |
||||
...sym, page, block, paragraph, line, word, |
||||
}); |
||||
}); |
||||
words.push({ |
||||
...word, page, block, paragraph, line, |
||||
}); |
||||
}); |
||||
lines.push({ |
||||
...line, page, block, paragraph, |
||||
}); |
||||
}); |
||||
paragraphs.push({ |
||||
...paragraph, page, block, |
||||
}); |
||||
}); |
||||
blocks.push({ |
||||
...block, page, |
||||
}); |
||||
}); |
||||
|
||||
return { |
||||
...page, blocks, paragraphs, lines, words, symbols, |
||||
}; |
||||
}; |
@ -1,21 +0,0 @@
@@ -1,21 +0,0 @@
|
||||
const isElectron = require('is-electron'); |
||||
|
||||
module.exports = (key) => { |
||||
const env = {}; |
||||
|
||||
if (typeof WorkerGlobalScope !== 'undefined') { |
||||
env.type = 'webworker'; |
||||
} else if (isElectron()) { |
||||
env.type = 'electron'; |
||||
} else if (typeof window === 'object') { |
||||
env.type = 'browser'; |
||||
} else if (typeof process === 'object' && typeof require === 'function') { |
||||
env.type = 'node'; |
||||
} |
||||
|
||||
if (typeof key === 'undefined') { |
||||
return env; |
||||
} |
||||
|
||||
return env[key]; |
||||
}; |
@ -1,3 +0,0 @@
@@ -1,3 +0,0 @@
|
||||
module.exports = (prefix, cnt) => ( |
||||
`${prefix}-${cnt}-${Math.random().toString(16).slice(3, 8)}` |
||||
); |
@ -1,9 +0,0 @@
@@ -1,9 +0,0 @@
|
||||
let logging = false; |
||||
|
||||
exports.logging = logging; |
||||
|
||||
exports.setLogging = (_logging) => { |
||||
logging = _logging; |
||||
}; |
||||
|
||||
exports.log = (...args) => (logging ? console.log.apply(this, args) : null); |
@ -1,12 +0,0 @@
@@ -1,12 +0,0 @@
|
||||
const isBrowser = require('./getEnvironment')('type') === 'browser'; |
||||
const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disable-line
|
||||
|
||||
module.exports = (options) => { |
||||
const opts = { ...options }; |
||||
['corePath', 'workerPath', 'langPath'].forEach((key) => { |
||||
if (options[key]) { |
||||
opts[key] = resolveURL(opts[key]); |
||||
} |
||||
}); |
||||
return opts; |
||||
}; |
@ -1,10 +0,0 @@
@@ -1,10 +0,0 @@
|
||||
const { set, get, del } = require('idb-keyval'); |
||||
|
||||
module.exports = { |
||||
readCache: get, |
||||
writeCache: set, |
||||
deleteCache: del, |
||||
checkCache: (path) => ( |
||||
get(path).then((v) => typeof v !== 'undefined') |
||||
), |
||||
}; |
@ -1,30 +0,0 @@
@@ -1,30 +0,0 @@
|
||||
const { simd } = require('wasm-feature-detect'); |
||||
const { dependencies } = require('../../../package.json'); |
||||
|
||||
module.exports = async (corePath, res) => { |
||||
if (typeof global.TesseractCore === 'undefined') { |
||||
res.progress({ status: 'loading tesseract core', progress: 0 }); |
||||
|
||||
// If the user specifies a core path, we use that
|
||||
// Otherwise, we detect the correct core based on SIMD support
|
||||
let corePathImport = corePath; |
||||
if (!corePathImport) { |
||||
const simdSupport = await simd(); |
||||
if (simdSupport) { |
||||
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core-simd.wasm.js`; |
||||
} else { |
||||
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.wasm.js`; |
||||
} |
||||
} |
||||
|
||||
global.importScripts(corePathImport); |
||||
|
||||
if (typeof global.TesseractCoreWASM !== 'undefined' && typeof WebAssembly === 'object') { |
||||
global.TesseractCore = global.TesseractCoreWASM; |
||||
} else { |
||||
throw Error('Failed to load TesseractCore'); |
||||
} |
||||
res.progress({ status: 'loading tesseract core', progress: 1 }); |
||||
} |
||||
return global.TesseractCore; |
||||
}; |
@ -1 +0,0 @@
@@ -1 +0,0 @@
|
||||
module.exports = require('zlibjs').gunzipSync; |
@ -1,32 +0,0 @@
@@ -1,32 +0,0 @@
|
||||
/** |
||||
* |
||||
* Browser worker scripts |
||||
* |
||||
* @fileoverview Browser worker implementation |
||||
* @author Kevin Kwok <antimatter15@gmail.com> |
||||
* @author Guillermo Webster <gui@mit.edu> |
||||
* @author Jerome Wu <jeromewus@gmail.com> |
||||
*/ |
||||
|
||||
const worker = require('..'); |
||||
const getCore = require('./getCore'); |
||||
const gunzip = require('./gunzip'); |
||||
const cache = require('./cache'); |
||||
|
||||
/* |
||||
* register message handler |
||||
*/ |
||||
global.addEventListener('message', (event) => {
  // Replies are posted back to whoever owns this worker.
  const reply = (obj) => postMessage(obj);
  worker.dispatchHandlers(event.data, reply);
});
||||
|
||||
/* |
||||
* getCore is an async function that loads and returns |
||||
* TesseractCore. |
||||
*/ |
||||
// Browser adapter: core loader, gunzip, a no-op fetch, plus the cache
// helpers (which override any same-named key listed before them).
worker.setAdapter(Object.assign(
  {
    getCore,
    gunzip,
    fetch: () => {},
  },
  cache,
));
@ -1,14 +0,0 @@
@@ -1,14 +0,0 @@
|
||||
/* |
||||
* default params for tesseract.js |
||||
*/ |
||||
const PSM = require('../../constants/PSM'); |
||||
|
||||
module.exports = { |
||||
tessedit_pageseg_mode: PSM.SINGLE_BLOCK, |
||||
tessedit_char_whitelist: '', |
||||
tessjs_create_hocr: '1', |
||||
tessjs_create_tsv: '1', |
||||
tessjs_create_box: '0', |
||||
tessjs_create_unlv: '0', |
||||
tessjs_create_osd: '0', |
||||
}; |
@ -1,313 +0,0 @@
@@ -1,313 +0,0 @@
|
||||
/** |
||||
* |
||||
* Worker script for browser and node |
||||
* |
||||
* @fileoverview Worker script for browser and node |
||||
* @author Kevin Kwok <antimatter15@gmail.com> |
||||
* @author Guillermo Webster <gui@mit.edu> |
||||
* @author Jerome Wu <jeromewus@gmail.com> |
||||
*/ |
||||
require('regenerator-runtime/runtime'); |
||||
const fileType = require('file-type'); |
||||
const isURL = require('is-url'); |
||||
const dump = require('./utils/dump'); |
||||
const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker'; |
||||
const setImage = require('./utils/setImage'); |
||||
const defaultParams = require('./constants/defaultParams'); |
||||
const { log, setLogging } = require('../utils/log'); |
||||
|
||||
/* |
||||
* Tesseract Module returned by TesseractCore. |
||||
*/ |
||||
let TessModule; |
||||
/* |
||||
* TesseractBaseAPI instance |
||||
*/ |
||||
let api = null; |
||||
let latestJob; |
||||
let adapter = {}; |
||||
let params = defaultParams; |
||||
|
||||
/**
 * Handle the 'load' action: apply the logging flag and initialise the
 * Tesseract module once, then resolve with `{ loaded: true }`.
 *
 * Core initialisation is awaited, so a failure rejects the promise returned
 * by this handler instead of surfacing as a detached, unhandled rejection.
 *
 * @param {Object} packet - message with `workerId`, `jobId` and
 *   `payload.options` ({ corePath, logging })
 * @param {Object} res - responder exposing `progress()` and `resolve()`
 * @returns {Promise<void>}
 */
const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
  setLogging(logging);

  if (TessModule) {
    res.resolve({ loaded: true });
    return;
  }

  const Core = await adapter.getCore(corePath, res);

  res.progress({ workerId, status: 'initializing tesseract', progress: 0 });

  TessModule = await Core({
    // Progress is reported through whichever job is the latest one.
    TesseractProgress(percent) {
      latestJob.progress({
        workerId,
        jobId,
        status: 'recognizing text',
        progress: Math.max(0, (percent - 30) / 70),
      });
    },
  });

  res.progress({ workerId, status: 'initialized tesseract', progress: 1 });
  res.resolve({ loaded: true });
};
||||
|
||||
/**
 * Handle the 'FS' action: call `TessModule.FS[method]` with `args` and
 * resolve with whatever it returns.
 *
 * @param {Object} packet - message with `workerId` and `payload` ({ method, args })
 * @param {Object} res - responder exposing `resolve()`
 */
const FS = ({ workerId, payload: { method, args } }, res) => {
  log(`[${workerId}]: FS.${method} with args ${args}`);
  const result = TessModule.FS[method](...args);
  res.resolve(result);
};
||||
|
||||
/**
 * Handle the 'loadLanguage' action: obtain the traineddata of every
 * requested language, write it into the Tesseract file system (when the
 * module is loaded) and optionally into the cache, then resolve with `langs`.
 *
 * Per language the data comes from, in order: the cache (unless cacheMethod
 * is 'refresh' or 'none'); a download from `langPath` when it looks like a
 * URL; a read through `adapter.readCache` from `langPath` otherwise; or the
 * `data` of a `{ code, data }` language object.
 *
 * @param {Object} packet - message with `workerId` and
 *   `payload` ({ langs, options: { langPath, dataPath, cachePath, cacheMethod, gzip } })
 * @param {Object} res - responder exposing `progress()`, `resolve()`, `reject()`
 * @returns {Promise<void>}
 */
const loadLanguage = async ({
  workerId,
  payload: {
    langs,
    options: {
      langPath, dataPath, cachePath, cacheMethod, gzip = true,
    },
  },
},
res) => {
  const loadAndGunzipFile = async (_lang) => {
    const givenAsCode = typeof _lang === 'string';
    const lang = givenAsCode ? _lang : _lang.code;
    // With 'refresh' or 'none' the cache lookup is replaced by a miss.
    const readCache = ['refresh', 'none'].includes(cacheMethod)
      ? () => Promise.resolve()
      : adapter.readCache;
    let data = null;

    try {
      const cached = await readCache(`${cachePath || '.'}/${lang}.traineddata`);
      if (typeof cached === 'undefined') {
        throw Error('Not found in cache');
      }
      log(`[${workerId}]: Load ${lang}.traineddata from cache`);
      res.progress({ workerId, status: 'loading language traineddata (from cache)', progress: 0.5 });
      data = cached;
    } catch (e) {
      // Any failure above (including a plain cache miss) falls back to langPath.
      log(`[${workerId}]: Load ${lang}.traineddata from ${langPath}`);
      if (!givenAsCode) {
        data = _lang.data; // eslint-disable-line
      } else {
        /** When langPath is an URL */
        const isRemote = isURL(langPath)
          || langPath.startsWith('moz-extension://')
          || langPath.startsWith('chrome-extension://')
          || langPath.startsWith('file://');

        if (isRemote) {
          const fetchUrl = `${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`;
          const resp = await (isWebWorker ? fetch : adapter.fetch)(fetchUrl);
          if (!resp.ok) {
            throw Error(`Network error while fetching ${fetchUrl}. Response code: ${resp.status}`);
          }
          data = await resp.arrayBuffer();
        } else {
          data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`);
        }
      }
    }

    data = new Uint8Array(data);

    // Unpack only when the content really is gzip, whatever the file name says.
    const detected = fileType(data);
    if (typeof detected !== 'undefined' && detected.mime === 'application/gzip') {
      data = adapter.gunzip(data);
    }

    if (TessModule) {
      if (dataPath) {
        try {
          TessModule.FS.mkdir(dataPath);
        } catch (err) {
          res.reject(err.toString());
        }
      }
      TessModule.FS.writeFile(`${dataPath || '.'}/${lang}.traineddata`, data);
    }

    if (['write', 'refresh', undefined].includes(cacheMethod)) {
      await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data);
    }

    return data;
  };

  res.progress({ workerId, status: 'loading language traineddata', progress: 0 });
  try {
    // A string lists language codes joined by '+'; otherwise langs is a list.
    const requested = typeof langs === 'string' ? langs.split('+') : langs;
    await Promise.all(requested.map(loadAndGunzipFile));
    res.progress({ workerId, status: 'loaded language traineddata', progress: 1 });
    res.resolve(langs);
  } catch (err) {
    res.reject(err.toString());
  }
};
||||
|
||||
/**
 * Forward parameters to the tesseract api and remember them in `params`.
 *
 * Keys starting with `tessjs_` are not passed to `api.SetVariable`; they are
 * only merged into the module-level `params` object.
 *
 * @param {object} packet - job packet, `payload.params` holds the key/value pairs
 * @param {object} [res] - job responder; when omitted nothing is resolved
 */
const setParameters = ({ payload: { params: _params } }, res) => {
  Object.entries(_params).forEach(([name, value]) => {
    if (name.startsWith('tessjs_')) {
      return;
    }
    api.SetVariable(name, value);
  });
  params = { ...params, ..._params };

  if (typeof res === 'undefined') {
    return;
  }
  res.resolve(params);
};
||||
|
||||
/**
 * (Re)create the tesseract api for the given languages and apply the default
 * parameters.
 *
 * Any previous api instance is ended first. The job is settled exactly once:
 * rejected when `Init` reports -1 or something throws, resolved otherwise.
 *
 * @param {object} packet - job packet
 * @param {string} packet.workerId - id used in progress messages
 * @param {string|Array} packet.payload.langs - 'eng+fra' style string, or an
 *   array whose items are language names or `{ code, data }` objects
 * @param {*} packet.payload.oem - passed to `api.Init` unchanged
 * @param {object} res - job responder (progress / resolve / reject)
 */
const initialize = ({
  workerId,
  payload: { langs: _langs, oem },
}, res) => {
  // Objects carry the language name in `code`; `data` is the traineddata
  // content and must not end up in the language string.
  const langs = (typeof _langs === 'string')
    ? _langs
    : _langs.map((l) => ((typeof l === 'string') ? l : l.code)).join('+');

  try {
    res.progress({
      workerId, status: 'initializing api', progress: 0,
    });
    if (api !== null) {
      api.End();
    }
    api = new TessModule.TessBaseAPI();
    const status = api.Init(null, langs, oem);
    if (status === -1) {
      res.reject('initialization failed');
      // Stop here so a failed job is not also reported as resolved.
      return;
    }
    params = defaultParams;
    setParameters({ payload: { params } });
    res.progress({
      workerId, status: 'initialized api', progress: 1,
    });
    res.resolve();
  } catch (err) {
    res.reject(err.toString());
  }
};
||||
|
||||
/**
 * Run recognition on an image and resolve the job with the dumped result.
 *
 * @param {object} packet - job packet
 * @param {*} packet.payload.image - image data handed to `setImage`
 * @param {object} [packet.payload.options.rectangle] - optional region
 *   `{ left, top, width, height }`; ignored when absent or null
 * @param {object} res - job responder (resolve / reject)
 */
const recognize = ({ payload: { image, options: { rectangle: rec } } }, res) => {
  let ptr = null;
  try {
    ptr = setImage(TessModule, api, image);
    // typeof null is 'object', so null has to be excluded explicitly.
    if (rec !== null && typeof rec === 'object') {
      api.SetRectangle(rec.left, rec.top, rec.width, rec.height);
    }
    api.Recognize(null);
    res.resolve(dump(TessModule, api, params));
  } catch (err) {
    res.reject(err.toString());
  } finally {
    // Release the image memory on failure as well as on success.
    if (ptr !== null) {
      TessModule._free(ptr);
    }
  }
};
||||
|
||||
/**
 * Render the current api state into a PDF and resolve the job with the
 * content of '/tesseract-ocr.pdf' read from `TessModule.FS`.
 *
 * @param {object} packet - job packet
 * @param {string} packet.payload.title - passed to `BeginDocument`
 * @param {boolean} packet.payload.textonly - passed to the renderer constructor
 * @param {object} res - job responder, only `resolve` is used here
 */
const getPDF = ({ payload: { title, textonly } }, res) => {
  const renderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
  renderer.BeginDocument(title);
  renderer.AddImage(api);
  renderer.EndDocument();
  TessModule._free(renderer);

  const pdf = TessModule.FS.readFile('/tesseract-ocr.pdf');
  res.resolve(pdf);
};
||||
|
||||
/**
 * Detect orientation and script of an image.
 *
 * Resolves with the script id/name, the orientation in degrees and both
 * confidences taken from `results.best_result`; rejects when `DetectOS`
 * reports failure or anything throws.
 *
 * @param {object} packet - job packet, `payload.image` is handed to `setImage`
 * @param {object} res - job responder (resolve / reject)
 */
const detect = ({ payload: { image } }, res) => {
  let ptr = null;
  try {
    ptr = setImage(TessModule, api, image);
    const results = new TessModule.OSResults();

    if (!api.DetectOS(results)) {
      api.End();
      res.reject('Failed to detect OS');
      return;
    }

    const best = results.best_result;
    const oid = best.orientation_id;
    const sid = best.script_id;

    res.resolve({
      tesseract_script_id: sid,
      script: results.unicharset.get_script_from_script_id(sid),
      script_confidence: best.sconfidence,
      // orientation ids 0..3 are translated to degrees through this table
      orientation_degrees: [0, 270, 180, 90][oid],
      orientation_confidence: best.oconfidence,
    });
  } catch (err) {
    res.reject(err.toString());
  } finally {
    // Release the image memory on every path, including thrown errors.
    if (ptr !== null) {
      TessModule._free(ptr);
    }
  }
};
||||
|
||||
/**
 * End the current api instance, if there is one, and report the outcome.
 *
 * @param {object} _ - job packet, unused
 * @param {object} res - job responder (resolve / reject)
 */
const terminate = (_, res) => {
  try {
    const hasLiveApi = api !== null;
    if (hasLiveApi) {
      api.End();
    }
    const outcome = { terminated: true };
    res.resolve(outcome);
  } catch (failure) {
    res.reject(failure.toString());
  }
};
||||
|
||||
/** |
||||
* dispatchHandlers |
||||
* |
||||
* @name dispatchHandlers |
||||
* @function worker data handler |
||||
* @access public |
||||
* @param {object} data |
||||
* @param {string} data.jobId - unique job id |
||||
* @param {string} data.action - action of the job, only recognize and detect for now |
||||
* @param {object} data.payload - data for the job |
||||
* @param {function} send - trigger job to work |
||||
*/ |
||||
exports.dispatchHandlers = (packet, send) => { |
||||
const res = (status, data) => { |
||||
send({ |
||||
...packet, |
||||
status, |
||||
data, |
||||
}); |
||||
}; |
||||
res.resolve = res.bind(this, 'resolve'); |
||||
res.reject = res.bind(this, 'reject'); |
||||
res.progress = res.bind(this, 'progress'); |
||||
|
||||
latestJob = res; |
||||
|
||||
try { |
||||
({ |
||||
load, |
||||
FS, |
||||
loadLanguage, |
||||
initialize, |
||||
setParameters, |
||||
recognize, |
||||
getPDF, |
||||
detect, |
||||
terminate, |
||||
})[packet.action](packet, res); |
||||
} catch (err) { |
||||
/** Prepare exception to travel through postMessage */ |
||||
res.reject(err.toString()); |
||||
} |
||||
}; |
||||
|
||||
/** |
||||
* setAdapter |
||||
* |
||||
* @name setAdapter |
||||
* @function |
||||
* @access public |
||||
* @param {object} adapter - implementation of the worker, different in browser and node environment |
||||
*/ |
||||
exports.setAdapter = (_adapter) => { |
||||
adapter = _adapter; |
||||
}; |
@ -1,16 +0,0 @@
@@ -1,16 +0,0 @@
|
||||
const util = require('util'); |
||||
const fs = require('fs'); |
||||
|
||||
module.exports = { |
||||
readCache: util.promisify(fs.readFile), |
||||
writeCache: util.promisify(fs.writeFile), |
||||
deleteCache: (path) => ( |
||||
util.promisify(fs.unlink)(path) |
||||
.catch(() => {}) |
||||
), |
||||
checkCache: (path) => ( |
||||
util.promisify(fs.access)(path, fs.F_OK) |
||||
.then((err) => (err === null)) |
||||
.catch(() => false) |
||||
), |
||||
}; |
@ -1,20 +0,0 @@
@@ -1,20 +0,0 @@
|
||||
const { simd } = require('wasm-feature-detect'); |
||||
|
||||
let TesseractCore = null; |
||||
/* |
||||
* getCore is a sync function to load and return |
||||
* TesseractCore. |
||||
*/ |
||||
module.exports = async (_, res) => { |
||||
if (TesseractCore === null) { |
||||
const simdSupport = await simd(); |
||||
res.progress({ status: 'loading tesseract core', progress: 0 }); |
||||
if (simdSupport) { |
||||
TesseractCore = require('tesseract.js-core/tesseract-core-simd'); |
||||
} else { |
||||
TesseractCore = require('tesseract.js-core/tesseract-core'); |
||||
} |
||||
res.progress({ status: 'loaded tesseract core', progress: 1 }); |
||||
} |
||||
return TesseractCore; |
||||
}; |
@ -1 +0,0 @@
@@ -1 +0,0 @@
|
||||
/* Synchronous gunzip implementation for the node worker, taken from zlib. */
const { gunzipSync } = require('zlib');

module.exports = gunzipSync;
@ -1,30 +0,0 @@
@@ -1,30 +0,0 @@
|
||||
/** |
||||
* |
||||
* Tesseract Worker Script for Node |
||||
* |
||||
* @fileoverview Node worker implementation |
||||
* @author Kevin Kwok <antimatter15@gmail.com> |
||||
* @author Guillermo Webster <gui@mit.edu> |
||||
* @author Jerome Wu <jeromewus@gmail.com> |
||||
*/ |
||||
|
||||
const fetch = require('node-fetch'); |
||||
const { parentPort } = require('worker_threads'); |
||||
const worker = require('..'); |
||||
const getCore = require('./getCore'); |
||||
const gunzip = require('./gunzip'); |
||||
const cache = require('./cache'); |
||||
|
||||
/*
 * Every message from the parent thread is dispatched to the worker
 * handlers; responses go back through parentPort.postMessage.
 */
const reply = (obj) => parentPort.postMessage(obj);

parentPort.on('message', (packet) => {
  worker.dispatchHandlers(packet, reply);
});

/*
 * Node implementation of the adapter: core loader, gunzip, fetch
 * and the cache functions.
 */
const nodeAdapter = {
  getCore,
  gunzip,
  fetch,
  ...cache,
};

worker.setAdapter(nodeAdapter);
@ -1,201 +0,0 @@
@@ -1,201 +0,0 @@
|
||||
/** |
||||
* |
||||
* Dump data to a big JSON tree |
||||
* |
||||
* @fileoverview dump data to JSON tree |
||||
* @author Kevin Kwok <antimatter15@gmail.com> |
||||
* @author Guillermo Webster <gui@mit.edu> |
||||
* @author Jerome Wu <jeromewus@gmail.com> |
||||
*/ |
||||
|
||||
/**
 * deindent
 *
 * The generated HOCR is excessively indented, so
 * we get rid of that indentation: when the first line starts with two
 * spaces, two leading spaces are removed from every line that has them.
 *
 * @name deindent
 * @function deindent string
 * @access public
 * @param {string} html - text to deindent
 * @returns {string} the text with one two-space indentation level removed,
 *   or the text unchanged when its first line is not indented
 */
const deindent = (html) => {
  const INDENT = '  ';
  const lines = html.split('\n');
  if (!lines[0].startsWith(INDENT)) {
    return html;
  }
  return lines
    .map((line) => (line.startsWith(INDENT) ? line.slice(INDENT.length) : line))
    .join('\n');
};
||||
|
||||
/** |
||||
* dump |
||||
* |
||||
* @name dump |
||||
* @function dump recognition result to a JSON object |
||||
* @access public |
||||
*/ |
||||
module.exports = (TessModule, api, { |
||||
tessjs_create_hocr, |
||||
tessjs_create_tsv, |
||||
tessjs_create_box, |
||||
tessjs_create_unlv, |
||||
tessjs_create_osd, |
||||
}) => { |
||||
const ri = api.GetIterator(); |
||||
const { |
||||
RIL_BLOCK, |
||||
RIL_PARA, |
||||
RIL_TEXTLINE, |
||||
RIL_WORD, |
||||
RIL_SYMBOL, |
||||
} = TessModule; |
||||
const blocks = []; |
||||
let block; |
||||
let para; |
||||
let textline; |
||||
let word; |
||||
let symbol; |
||||
|
||||
const enumToString = (value, prefix) => ( |
||||
Object.keys(TessModule) |
||||
.filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value)) |
||||
.map((e) => e.slice(prefix.length + 1))[0] |
||||
); |
||||
|
||||
ri.Begin(); |
||||
do { |
||||
if (ri.IsAtBeginningOf(RIL_BLOCK)) { |
||||
const poly = ri.BlockPolygon(); |
||||
let polygon = null; |
||||
// BlockPolygon() returns null when automatic page segmentation is off
|
||||
if (TessModule.getPointer(poly) > 0) { |
||||
const n = poly.get_n(); |
||||
const px = poly.get_x(); |
||||
const py = poly.get_y(); |
||||
polygon = []; |
||||
for (let i = 0; i < n; i += 1) { |
||||
polygon.push([px.getValue(i), py.getValue(i)]); |
||||
} |
||||
/* |
||||
* TODO: find out why _ptaDestroy doesn't work |
||||
*/ |
||||
// TessModule._ptaDestroy(TessModule.getPointer(poly));
|
||||
} |
||||
|
||||
block = { |
||||
paragraphs: [], |
||||
text: ri.GetUTF8Text(RIL_BLOCK), |
||||
confidence: ri.Confidence(RIL_BLOCK), |
||||
baseline: ri.getBaseline(RIL_BLOCK), |
||||
bbox: ri.getBoundingBox(RIL_BLOCK), |
||||
blocktype: enumToString(ri.BlockType(), 'PT'), |
||||
polygon, |
||||
}; |
||||
blocks.push(block); |
||||
} |
||||
if (ri.IsAtBeginningOf(RIL_PARA)) { |
||||
para = { |
||||
lines: [], |
||||
text: ri.GetUTF8Text(RIL_PARA), |
||||
confidence: ri.Confidence(RIL_PARA), |
||||
baseline: ri.getBaseline(RIL_PARA), |
||||
bbox: ri.getBoundingBox(RIL_PARA), |
||||
is_ltr: !!ri.ParagraphIsLtr(), |
||||
}; |
||||
block.paragraphs.push(para); |
||||
} |
||||
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { |
||||
textline = { |
||||
words: [], |
||||
text: ri.GetUTF8Text(RIL_TEXTLINE), |
||||
confidence: ri.Confidence(RIL_TEXTLINE), |
||||
baseline: ri.getBaseline(RIL_TEXTLINE), |
||||
bbox: ri.getBoundingBox(RIL_TEXTLINE), |
||||
}; |
||||
para.lines.push(textline); |
||||
} |
||||
if (ri.IsAtBeginningOf(RIL_WORD)) { |
||||
const fontInfo = ri.getWordFontAttributes(); |
||||
const wordDir = ri.WordDirection(); |
||||
word = { |
||||
symbols: [], |
||||
choices: [], |
||||
|
||||
text: ri.GetUTF8Text(RIL_WORD), |
||||
confidence: ri.Confidence(RIL_WORD), |
||||
baseline: ri.getBaseline(RIL_WORD), |
||||
bbox: ri.getBoundingBox(RIL_WORD), |
||||
|
||||
is_numeric: !!ri.WordIsNumeric(), |
||||
in_dictionary: !!ri.WordIsFromDictionary(), |
||||
direction: enumToString(wordDir, 'DIR'), |
||||
language: ri.WordRecognitionLanguage(), |
||||
|
||||
is_bold: fontInfo.is_bold, |
||||
is_italic: fontInfo.is_italic, |
||||
is_underlined: fontInfo.is_underlined, |
||||
is_monospace: fontInfo.is_monospace, |
||||
is_serif: fontInfo.is_serif, |
||||
is_smallcaps: fontInfo.is_smallcaps, |
||||
font_size: fontInfo.pointsize, |
||||
font_id: fontInfo.font_id, |
||||
font_name: fontInfo.font_name, |
||||
}; |
||||
const wc = new TessModule.WordChoiceIterator(ri); |
||||
do { |
||||
word.choices.push({ |
||||
text: wc.GetUTF8Text(), |
||||
confidence: wc.Confidence(), |
||||
}); |
||||
} while (wc.Next()); |
||||
TessModule.destroy(wc); |
||||
textline.words.push(word); |
||||
} |
||||
|
||||
// let image = null;
|
||||
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
|
||||
// var image = pix2array(pix);
|
||||
// // for some reason it seems that things stop working if you destroy pics
|
||||
// TessModule._pixDestroy(TessModule.getPointer(pix));
|
||||
if (ri.IsAtBeginningOf(RIL_SYMBOL)) { |
||||
symbol = { |
||||
choices: [], |
||||
image: null, |
||||
text: ri.GetUTF8Text(RIL_SYMBOL), |
||||
confidence: ri.Confidence(RIL_SYMBOL), |
||||
baseline: ri.getBaseline(RIL_SYMBOL), |
||||
bbox: ri.getBoundingBox(RIL_SYMBOL), |
||||
is_superscript: !!ri.SymbolIsSuperscript(), |
||||
is_subscript: !!ri.SymbolIsSubscript(), |
||||
is_dropcap: !!ri.SymbolIsDropcap(), |
||||
}; |
||||
word.symbols.push(symbol); |
||||
const ci = new TessModule.ChoiceIterator(ri); |
||||
do { |
||||
symbol.choices.push({ |
||||
text: ci.GetUTF8Text(), |
||||
confidence: ci.Confidence(), |
||||
}); |
||||
} while (ci.Next()); |
||||
// TessModule.destroy(i);
|
||||
} |
||||
} while (ri.Next(RIL_SYMBOL)); |
||||
TessModule.destroy(ri); |
||||
|
||||
return { |
||||
text: api.GetUTF8Text(), |
||||
hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, |
||||
tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null, |
||||
box: tessjs_create_box === '1' ? api.GetBoxText() : null, |
||||
unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null, |
||||
osd: tessjs_create_osd === '1' ? api.GetOsdText() : null, |
||||
confidence: api.MeanTextConf(), |
||||
blocks, |
||||
psm: enumToString(api.GetPageSegMode(), 'PSM'), |
||||
oem: enumToString(api.oem(), 'OEM'), |
||||
version: api.Version(), |
||||
}; |
||||
}; |
@ -1,63 +0,0 @@
@@ -1,63 +0,0 @@
|
||||
const bmp = require('bmp-js'); |
||||
const fileType = require('file-type'); |
||||
|
||||
/** |
||||
* setImage |
||||
* |
||||
* @name setImage |
||||
* @function set image in tesseract for recognition |
||||
* @access public |
||||
*/ |
||||
module.exports = (TessModule, api, image) => { |
||||
const buf = Buffer.from(Array.from({ ...image, length: Object.keys(image).length })); |
||||
const type = fileType(buf); |
||||
let bytesPerPixel = 0; |
||||
let data = null; |
||||
let pix = null; |
||||
let w = 0; |
||||
let h = 0; |
||||
|
||||
const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1; |
||||
|
||||
/* |
||||
* Leptonica supports uncompressed but not compressed bmp files |
||||
* @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516
|
||||
* We therefore use bmp-js to process all bmp files |
||||
*/ |
||||
if (type && type.mime === 'image/bmp') { |
||||
const bmpBuf = bmp.decode(buf); |
||||
data = TessModule._malloc(bmpBuf.data.length * Uint8Array.BYTES_PER_ELEMENT); |
||||
TessModule.HEAPU8.set(bmpBuf.data, data); |
||||
w = bmpBuf.width; |
||||
h = bmpBuf.height; |
||||
bytesPerPixel = 4; |
||||
} else { |
||||
const ptr = TessModule._malloc(buf.length * Uint8Array.BYTES_PER_ELEMENT); |
||||
TessModule.HEAPU8.set(buf, ptr); |
||||
pix = TessModule._pixReadMem(ptr, buf.length); |
||||
if (TessModule.getValue(pix + (7 * 4), 'i32') === 0) { |
||||
/* |
||||
* Set a yres default value to prevent warning from tesseract |
||||
* See kMinCredibleResolution in tesseract/src/ccstruct/publictypes.h |
||||
*/ |
||||
TessModule.setValue(pix + (7 * 4), 300, 'i32'); |
||||
} |
||||
[w, h] = Array(2).fill(0) |
||||
.map((v, idx) => ( |
||||
TessModule.getValue(pix + (idx * 4), 'i32') |
||||
)); |
||||
} |
||||
|
||||
/* |
||||
* As some image format (ex. bmp) is not supported natiely by tesseract, |
||||
* sometimes it will not return pix directly, but data and bytesPerPixel |
||||
* for another SetImage usage. |
||||
* |
||||
*/ |
||||
if (data === null) { |
||||
api.SetImage(pix, undefined, undefined, undefined, undefined, exif); |
||||
} else { |
||||
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif); |
||||
} |
||||
return data === null ? pix : data; |
||||
}; |
@ -1,18 +0,0 @@
@@ -1,18 +0,0 @@
|
||||
const resolveURL = require('resolve-url'); |
||||
const { version } = require('../../../package.json'); |
||||
const defaultOptions = require('../../constants/defaultOptions'); |
||||
|
||||
/* |
||||
* Default options for browser worker |
||||
*/ |
||||
module.exports = { |
||||
...defaultOptions, |
||||
workerPath: (typeof process !== 'undefined' && process.env.TESS_ENV === 'development') |
||||
? resolveURL(`/dist/worker.dev.js?nocache=${Math.random().toString(36).slice(3)}`) |
||||
: `https://unpkg.com/tesseract.js@v${version}/dist/worker.min.js`, |
||||
/* |
||||
* If browser doesn't support WebAssembly, |
||||
* load ASM version instead |
||||
*/ |
||||
corePath: null, |
||||
}; |
@ -1,24 +0,0 @@
@@ -1,24 +0,0 @@
|
||||
/** |
||||
* |
||||
* Tesseract Worker adapter for browser |
||||
* |
||||
* @fileoverview Tesseract Worker adapter for browser |
||||
* @author Kevin Kwok <antimatter15@gmail.com> |
||||
* @author Guillermo Webster <gui@mit.edu> |
||||
* @author Jerome Wu <jeromewus@gmail.com> |
||||
*/ |
||||
const defaultOptions = require('./defaultOptions'); |
||||
const spawnWorker = require('./spawnWorker'); |
||||
const terminateWorker = require('./terminateWorker'); |
||||
const onMessage = require('./onMessage'); |
||||
const send = require('./send'); |
||||
const loadImage = require('./loadImage'); |
||||
|
||||
module.exports = { |
||||
defaultOptions, |
||||
spawnWorker, |
||||
terminateWorker, |
||||
onMessage, |
||||
send, |
||||
loadImage, |
||||
}; |
@ -1,68 +0,0 @@
@@ -1,68 +0,0 @@
|
||||
const resolveURL = require('resolve-url'); |
||||
|
||||
/**
 * readFromBlobOrFile
 *
 * Read a Blob or File with a FileReader and resolve with the resulting
 * ArrayBuffer; rejects with an Error carrying the reader's error code.
 *
 * @name readFromBlobOrFile
 * @function
 * @access private
 */
const readFromBlobOrFile = (blob) => new Promise((resolve, reject) => {
  const reader = new FileReader();

  reader.onload = () => {
    resolve(reader.result);
  };
  reader.onerror = (event) => {
    const { code } = event.target.error;
    reject(Error(`File could not be read! Code=${code}`));
  };

  reader.readAsArrayBuffer(blob);
});
||||
|
||||
/**
 * loadImage
 *
 * Turn the supported image sources into a Uint8Array:
 * - base64 data URL string: decoded in place
 * - any other string: treated as a URL and fetched
 * - IMG / VIDEO element: loaded from `src` / `poster`
 * - CANVAS element: exported with toBlob and read
 * - File / Blob: read
 * - anything else: passed to the Uint8Array constructor as is
 *
 * @name loadImage
 * @function load image from different source
 * @access private
 * @param {*} image - image source
 * @returns {Promise<Uint8Array|string>} the image bytes, or the string
 *   'undefined' when no image is given
 */
const loadImage = async (image) => {
  let data = image;
  if (typeof image === 'undefined') {
    return 'undefined';
  }

  if (typeof image === 'string') {
    // Base64 Image
    if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
      data = atob(image.split(',')[1])
        .split('')
        .map((c) => c.charCodeAt(0));
    } else {
      const resp = await fetch(resolveURL(image));
      data = await resp.arrayBuffer();
    }
  } else if (image instanceof HTMLElement) {
    if (image.tagName === 'IMG') {
      data = await loadImage(image.src);
    }
    if (image.tagName === 'VIDEO') {
      data = await loadImage(image.poster);
    }
    if (image.tagName === 'CANVAS') {
      // Only toBlob is wrapped in a promise; the read happens outside the
      // callback so a failed read rejects this function instead of leaving
      // it pending forever.
      const blob = await new Promise((resolve) => {
        image.toBlob(resolve);
      });
      data = await readFromBlobOrFile(blob);
    }
  } else if (image instanceof File || image instanceof Blob) {
    data = await readFromBlobOrFile(image);
  }

  return new Uint8Array(data);
};
||||
|
||||
module.exports = loadImage; |
@ -1,5 +0,0 @@
@@ -1,5 +0,0 @@
|
||||
module.exports = (worker, handler) => { |
||||
worker.onmessage = ({ data }) => { // eslint-disable-line
|
||||
handler(data); |
||||
}; |
||||
}; |
@ -1,10 +0,0 @@
@@ -1,10 +0,0 @@
|
||||
/** |
||||
* send |
||||
* |
||||
* @name send |
||||
* @function send packet to worker and create a job |
||||
* @access public |
||||
*/ |
||||
module.exports = async (worker, packet) => { |
||||
worker.postMessage(packet); |
||||
}; |
@ -1,20 +0,0 @@
@@ -1,20 +0,0 @@
|
||||
/** |
||||
* spawnWorker |
||||
* |
||||
* @name spawnWorker |
||||
* @function create a new Worker in browser |
||||
* @access public |
||||
*/ |
||||
module.exports = ({ workerPath, workerBlobURL }) => { |
||||
let worker; |
||||
if (Blob && URL && workerBlobURL) { |
||||
const blob = new Blob([`importScripts("${workerPath}");`], { |
||||
type: 'application/javascript', |
||||
}); |
||||
worker = new Worker(URL.createObjectURL(blob)); |
||||
} else { |
||||
worker = new Worker(workerPath); |
||||
} |
||||
|
||||
return worker; |
||||
}; |
@ -1,10 +0,0 @@
@@ -1,10 +0,0 @@
|
||||
/** |
||||
* terminateWorker |
||||
* |
||||
* @name terminateWorker |
||||
* @function terminate worker |
||||
* @access public |
||||
*/ |
||||
module.exports = (worker) => { |
||||
worker.terminate(); |
||||
}; |
@ -1,10 +0,0 @@
@@ -1,10 +0,0 @@
|
||||
const path = require('path'); |
||||
const defaultOptions = require('../../constants/defaultOptions'); |
||||
|
||||
/* |
||||
* Default options for node worker |
||||
*/ |
||||
module.exports = { |
||||
...defaultOptions, |
||||
workerPath: path.join(__dirname, '..', '..', 'worker-script', 'node', 'index.js'), |
||||
}; |