Compare commits

..

14 Commits

Author SHA1 Message Date
Jerome Wu aba237af2e Fix image url in README.md and release 1.0.19 6 years ago
Jerome Wu a26566be04 Update README.md 6 years ago
Jerome Wu 55e355eff5 Release 1.0.17 6 years ago
Jerome Wu 1f497271b5 Update README.md and add error handler in loadImage 6 years ago
jeromewu 1a12ead46f
Merge pull request #280 from uwolfer/fix-call-from-worker 6 years ago
Urs Wolfer 5c930514f5 Fix initialization when calling from other Worker 6 years ago
jeromewu 9268572644
Merge pull request #267 from HoldYourWaffle/pr-load-error-handling 6 years ago
HoldYourWaffle 7911518b39 Add stubs for error handling 6 years ago
Jerome Wu 613a19c7e1 Force upgrade to 1.0.16 due to npm issue 6 years ago
Jerome Wu 07ea31a9cd Update dist/tesseract.js to 1.0.15 6 years ago
Jerome Wu 741ff413b3 1.0.15 6 years ago
Jerome Wu cdb86c694a Fix module is not defined issue 6 years ago
Jerome Wu 06d32c6804 1.0.14 6 years ago
Jerome Wu 8e1b21cd2c Replace langPath from jsDelivr to Github Page as there is 20MB limitation 6 years ago
  1. 17
      .eslintrc
  2. 9
      .github/FUNDING.yml
  3. 38
      .github/ISSUE_TEMPLATE/bug_report.md
  4. 20
      .github/ISSUE_TEMPLATE/feature_request.md
  5. 5
      .github/SECURITY.md
  6. 71
      .github/workflows/codeql-analysis.yml
  7. 29
      .github/workflows/node.js.yml
  8. 6
      .gitignore
  9. 2
      .gitpod.Dockerfile
  10. 9
      .gitpod.yml
  11. 3
      .npmignore
  12. 402
      README.md
  13. 640
      dist/tesseract.js
  14. 1
      dist/tesseract.min.js
  15. 1
      dist/tesseract.min.js.map
  16. 9051
      dist/worker.js
  17. 1
      dist/worker.min.js
  18. 1
      dist/worker.min.js.map
  19. 448
      docs/api.md
  20. 226
      docs/examples.md
  21. 42
      docs/faq.md
  22. 18
      docs/image-format.md
  23. BIN
      docs/images/tesseract.png
  24. BIN
      docs/images/video-demo.gif
  25. 38
      docs/local-installation.md
  26. 71
      docs/tesseract_lang_list.md
  27. BIN
      docs/tesseract_parameters.md
  28. 37
      examples/browser/basic-edge.html
  29. 19
      examples/browser/basic.html
  30. 33
      examples/browser/benchmark.html
  31. 52
      examples/browser/download-pdf.html
  32. BIN
      examples/data/meditations.jpg
  33. BIN
      examples/data/testocr.png
  34. BIN
      examples/data/tyger.jpg
  35. 1
      examples/file-input/README.md
  36. 2
      examples/file-input/basic.html
  37. 32
      examples/file-input/demo.html
  38. 15
      examples/node/basic.js
  39. 27
      examples/node/benchmark.js
  40. 0
      examples/node/cosmic.png
  41. 23
      examples/node/detect.js
  42. 22
      examples/node/download-pdf.js
  43. 20
      examples/node/recognize.js
  44. 18807
      package-lock.json
  45. 91
      package.json
  46. 5
      scripts/.eslintrc
  47. 13
      scripts/rollup.esm.js
  48. 17
      scripts/server.js
  49. 9
      scripts/test-helper.js
  50. 28
      scripts/webpack.config.common.js
  51. 48
      scripts/webpack.config.dev.js
  52. 36
      scripts/webpack.config.prod.js
  53. 28
      src/Tesseract.js
  54. 105
      src/browser/index.js
  55. 76
      src/browser/lang.js
  56. 23
      src/browser/worker.js
  57. 63
      src/common/circularize.js
  58. 24
      src/common/desaturate.js
  59. 164
      src/common/dump.js
  60. 81
      src/common/job.js
  61. 1
      src/common/langdata.json
  62. 165
      src/common/worker.js
  63. 12
      src/constants/OEM.js
  64. 19
      src/constants/PSM.js
  65. 5
      src/constants/config.js
  66. 13
      src/constants/defaultOptions.js
  67. 218
      src/constants/languages.js
  68. 21
      src/createJob.js
  69. 80
      src/createScheduler.js
  70. 198
      src/createWorker.js
  71. 231
      src/index.d.ts
  72. 102
      src/index.js
  73. 89
      src/node/index.js
  74. 47
      src/node/lang.js
  75. 19
      src/node/worker.js
  76. 54
      src/utils/circularize.js
  77. 21
      src/utils/getEnvironment.js
  78. 3
      src/utils/getId.js
  79. 9
      src/utils/log.js
  80. 12
      src/utils/resolvePaths.js
  81. 10
      src/worker-script/browser/cache.js
  82. 30
      src/worker-script/browser/getCore.js
  83. 1
      src/worker-script/browser/gunzip.js
  84. 32
      src/worker-script/browser/index.js
  85. 14
      src/worker-script/constants/defaultParams.js
  86. 313
      src/worker-script/index.js
  87. 16
      src/worker-script/node/cache.js
  88. 20
      src/worker-script/node/getCore.js
  89. 1
      src/worker-script/node/gunzip.js
  90. 30
      src/worker-script/node/index.js
  91. 201
      src/worker-script/utils/dump.js
  92. 63
      src/worker-script/utils/setImage.js
  93. 18
      src/worker/browser/defaultOptions.js
  94. 24
      src/worker/browser/index.js
  95. 68
      src/worker/browser/loadImage.js
  96. 5
      src/worker/browser/onMessage.js
  97. 10
      src/worker/browser/send.js
  98. 20
      src/worker/browser/spawnWorker.js
  99. 10
      src/worker/browser/terminateWorker.js
  100. 10
      src/worker/node/defaultOptions.js
  101. Some files were not shown because too many files have changed in this diff Show More

17
.eslintrc

@ -1,17 +0,0 @@ @@ -1,17 +0,0 @@
{
"extends": "airbnb-base",
"parser": "babel-eslint",
"env": {
"browser": true,
"node": true,
"mocha": true,
"worker": true
},
"rules": {
"no-underscore-dangle": 0,
"no-console": 0,
"global-require": 0,
"camelcase": 0,
"no-control-regex": 0
}
}

9
.github/FUNDING.yml

@ -1,9 +0,0 @@ @@ -1,9 +0,0 @@
# These are supported funding model platforms
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: tesseractjs
ko_fi: # Replace with a single Ko-fi username
tidelift: npm/tesseract.js
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
custom: ["https://etherscan.io/address/0x74ace8c74535d6dac03ebdc708ca2fba54796ef2"]

38
.github/ISSUE_TEMPLATE/bug_report.md

@ -1,38 +0,0 @@ @@ -1,38 +0,0 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]
**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]
**Additional context**
Add any other context about the problem here.

20
.github/ISSUE_TEMPLATE/feature_request.md

@ -1,20 +0,0 @@ @@ -1,20 +0,0 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.

5
.github/SECURITY.md

@ -1,5 +0,0 @@ @@ -1,5 +0,0 @@
## Security contact information
To report a security vulnerability, please use the
[Tidelift security contact](https://tidelift.com/security).
Tidelift will coordinate the fix and disclosure.

71
.github/workflows/codeql-analysis.yml

@ -1,71 +0,0 @@ @@ -1,71 +0,0 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
name: "CodeQL"
on:
push:
branches: [master]
pull_request:
# The branches below must be a subset of the branches above
branches: [master]
schedule:
- cron: '0 17 * * 6'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
# Override automatic language detection by changing the below list
# Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python']
language: ['javascript']
# Learn more...
# https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection
steps:
- name: Checkout repository
uses: actions/checkout@v2
with:
# We must fetch at least the immediate parents so that if this is
# a pull request then we can checkout the head.
fetch-depth: 2
# If this run was triggered by a pull request event, then checkout
# the head of the pull request instead of the merge commit.
- run: git checkout HEAD^2
if: ${{ github.event_name == 'pull_request' }}
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v1
# ℹ Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
# ✏ If the Autobuild fails above, remove it and uncomment the following three lines
# and modify them (or add more) to build your code if your project
# uses a compiled language
#- run: |
# make bootstrap
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1

29
.github/workflows/node.js.yml

@ -1,29 +0,0 @@ @@ -1,29 +0,0 @@
# This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions
name: Node.js CI
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
node-version: [14.x, 16.x]
steps:
- uses: actions/checkout@v2
- name: Use Node.js ${{ matrix.node-version }}
uses: actions/setup-node@v1
with:
node-version: ${{ matrix.node-version }}
- run: npm ci
- run: npm run lint
- run: npm test

6
.gitignore vendored

@ -3,8 +3,4 @@ node_modules/* @@ -3,8 +3,4 @@ node_modules/*
yarn.lock
tesseract.dev.js
worker.dev.js
/*.traineddata
/examples/**/*.traineddata
.nyc_output
dist/
*.swp
*.traineddata

2
.gitpod.Dockerfile

@ -1,2 +0,0 @@ @@ -1,2 +0,0 @@
FROM gitpod/workspace-full
RUN sudo apt-get update && sudo apt-get install -y libgtk-3-0 libx11-xcb1 libnss3 libxss1 libasound2

9
.gitpod.yml

@ -1,9 +0,0 @@ @@ -1,9 +0,0 @@
image:
file: .gitpod.Dockerfile
tasks:
- command: gp await-port 3000 && sleep 3 && gp preview $(gp url 3000)/examples/browser/demo.html
- init: npm install
command: npm start
ports:
- port: 3000
onOpen: ignore

3
.npmignore

@ -1,3 +0,0 @@ @@ -1,3 +0,0 @@
tests
.nyc_output
.github

402
README.md

@ -1,197 +1,303 @@ @@ -1,197 +1,303 @@
<p align="center">
<a href="https://tesseract.projectnaptha.com/"><img width="256px" height="256px" alt="Tesseract.js" src="./docs/images/tesseract.png"></a>
</p>
![Lint & Test](https://github.com/naptha/tesseract.js/workflows/Node.js%20CI/badge.svg)
![CodeQL](https://github.com/naptha/tesseract.js/workflows/CodeQL/badge.svg)
[![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://github.com/naptha/tesseract.js)
[![Financial Contributors on Open Collective](https://opencollective.com/tesseractjs/all/badge.svg?label=financial+contributors)](https://opencollective.com/tesseractjs) [![npm version](https://badge.fury.io/js/tesseract.js.svg)](https://badge.fury.io/js/tesseract.js)
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/naptha/tesseract.js/graphs/commit-activity)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Code Style](https://badgen.net/badge/code%20style/airbnb/ff5a5f?icon=airbnb)](https://github.com/airbnb/javascript)
[![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
[![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
# [Tesseract.js](http://tesseract.projectnaptha.com/)
Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/))
Image Recognition
[![NPM version][tesseractjs-npm-image]][tesseractjs-npm-url]
[tesseractjs-npm-image]: https://img.shields.io/npm/v/tesseract.js.svg
[tesseractjs-npm-url]: https://npmjs.org/package/tesseract.js
**Tesseract.js v2 alpha is now available!! Check [HERE](https://github.com/naptha/tesseract.js) for more information.**
[![fancy demo gif](./docs/images/demo.gif)](http://tesseract.projectnaptha.com)
Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/))
[![fancy demo gif](https://raw.githubusercontent.com/naptha/tesseract.js/support/1.x/docs/images/demo.gif)](http://tesseract.projectnaptha.com)
Video Real-time Recognition
Tesseract.js works with script tags, [webpack](https://webpack.js.org/)/[Browserify](http://browserify.org/), and [Node.js](https://nodejs.org/en/). [After you install it](#installation), using it is as simple as
<p align="center">
<a href="https://github.com/jeromewu/tesseract.js-video"><img alt="Tesseract.js Video" src="./docs/images/video-demo.gif"></a>
</p>
```javascript
Tesseract.recognize(myImage)
.progress(function (p) { console.log('progress', p) })
.then(function (result) { console.log('result', result) })
```
[Check out the docs](#docs) for a full treatment of the API.
## Provenance
Tesseract.js wraps an [emscripten](https://github.com/kripken/emscripten) [port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) [OCR](https://en.wikipedia.org/wiki/Optical_character_recognition) Engine.
It works in the browser using [webpack](https://webpack.js.org/) or plain script tags with a [CDN](#CDN) and on the server with [Node.js](https://nodejs.org/en/).
After you [install it](#installation), using it is as simple as:
# Installation
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack and Browserify via `npm`, and on Node.js via `npm`. [Check out the docs](#docs) for a full treatment of the API.
## &lt;script />
You can simply include Tesseract.js with a CDN like this:
```html
<script src='https://cdn.jsdelivr.net/gh/naptha/tesseract.js@v1.0.14/dist/tesseract.min.js'></script>
```
After including your scripts, the `Tesseract` variable will be defined globally!
## Dependency
First:
```shell
> yarn add tesseract.js
```
or
```
> npm install tesseract.js --save
```
> Note: Tesseract.js currently requires Node.js v6.8.0 or higher.
## Usage
```javascript
var Tesseract = require('tesseract.js')
```
or
```javascript
import Tesseract from 'tesseract.js'
```
# Docs
* [Tesseract.recognize](#tesseractrecognizeimage-imagelike-options---tesseractjob)
+ [Simple Example](#simple-example)
+ [More Complicated Example](#more-complicated-example)
* [Tesseract.detect](#tesseractdetectimage-imagelike---tesseractjob)
* [ImageLike](#imagelike)
* [TesseractJob](#tesseractjob)
+ [TesseractJob.progress](#tesseractjobprogresscallback-function---tesseractjob)
+ [TesseractJob.then](#tesseractjobthencallback-function---tesseractjob)
+ [TesseractJob.catch](#tesseractjobcatchcallback-function---tesseractjob)
+ [TesseractJob.finally](#tesseractjobfinallycallback-function---tesseractjob)
* [Local Installation](#local-installation)
+ [corePath](#corepath)
+ [workerPath](#workerpath)
+ [langPath](#langpath)
* [Contributing](#contributing)
+ [Development](#development)
+ [Building Static Files](#building-static-files)
+ [Send us a Pull Request!](#send-us-a-pull-request)
## Tesseract.recognize(image: [ImageLike](#imagelike)[, options]) -> [TesseractJob](#tesseractjob)
Figures out what words are in `image`, where the words are in `image`, etc.
> Note: `image` should be sufficiently high resolution.
> Often, the same image will get much better results if you upscale it before calling `recognize`.
- `image` is any [ImageLike](#imagelike) object.
- `options` is either absent (in which case it is interpreted as `'eng'`), a string specifying a language short code from the [language list](./docs/tesseract_lang_list.md), or a flat json object that may:
+ include properties that override some subset of the [default tesseract parameters](./docs/tesseract_parameters.md)
+ include a `lang` property with a value from the [list of lang parameters](./docs/tesseract_lang_list.md)
Returns a [TesseractJob](#tesseractjob) whose `then`, `progress`, `catch` and `finally` methods can be used to act on the result.
### Simple Example:
```javascript
import Tesseract from 'tesseract.js';
Tesseract.recognize(
'https://tesseract.projectnaptha.com/img/eng_bw.png',
'eng',
{ logger: m => console.log(m) }
).then(({ data: { text } }) => {
console.log(text);
Tesseract.recognize(myImage)
.then(function(result){
console.log(result)
})
```
### More Complicated Example:
```javascript
// if we know our image is of spanish words without the letter 'e':
Tesseract.recognize(myImage, {
lang: 'spa',
tessedit_char_blacklist: 'e'
})
.then(function(result){
console.log(result)
})
```
Or more imperative
## Tesseract.detect(image: [ImageLike](#imagelike)) -> [TesseractJob](#tesseractjob)
Figures out what script (e.g. 'Latin', 'Chinese') the words in image are written in.
- `image` is any [ImageLike](#imagelike) object.
Returns a [TesseractJob](#tesseractjob) whose `then`, `progress`, `catch` and `finally` methods can be used to act on the result of the script.
```javascript
import { createWorker } from 'tesseract.js';
const worker = createWorker({
logger: m => console.log(m)
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
await worker.terminate();
})();
```
[Check out the docs](#documentation) for a full explanation of the API.
## Major changes in v3
- Significantly faster performance
- Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data)
- Upgrade to Tesseract v5.1.0 (using emscripten 3.1.18)
- Added SIMD-enabled build for supported devices
- Added support:
- Node.js version 18
- Removed support:
- ASM.js version, any other old versions of Tesseract.js-core (<3.0.0)
- Node.js versions 10 and 12
## Major changes in v2
- Upgrade to tesseract v4.1.1 (using emscripten 1.39.10 upstream)
- Support multiple languages at the same time, eg: eng+chi\_tra for English and Traditional Chinese
- Supported image formats: png, jpg, bmp, pbm
- Support WebAssembly (fallback to ASM.js when browser doesn't support)
- Support Typescript
Read a story about v2: <a href="https://jeromewu.github.io/why-i-refactor-tesseract.js-v2/">Why I refactor tesseract.js v2?</a><br>
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
## Installation
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`.
### CDN
```html
<!-- v2 -->
<script src='https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js'></script>
Tesseract.detect(myImage)
.then(function(result){
console.log(result)
})
```
<!-- v1 -->
<script src='https://unpkg.com/tesseract.js@1.0.19/src/index.js'></script>
## ImageLike
The main Tesseract.js functions take an `image` parameter, which should be something that is like an image. What's considered "image-like" differs depending on whether it is being run from the browser or through NodeJS.
On a browser, an image can be:
- an `img`, `video`, or `canvas` element
- a CanvasRenderingContext2D (returned by `canvas.getContext('2d')`)
- a `File` object (from a file `<input>` or drag-drop event)
- a `Blob` object
- a `ImageData` instance (an object containing `width`, `height` and `data` properties)
- a path or URL to an accessible image (the image must either be hosted locally or accessible by CORS)
In Node.js, an image can be
- a path to a local image
- a `Buffer` instance containing a `PNG` or `JPEG` image
- a `ImageData` instance (an object containing `width`, `height` and `data` properties)
## TesseractJob
A TesseractJob is an object returned by a call to `recognize` or `detect`. It's inspired by the ES6 Promise interface and provides `then` and `catch` methods. It also provides a `finally` method, which will be fired regardless of the job's fate. One important difference is that these methods return the job itself (to enable chaining) rather than a new object.
Typical use is:
```javascript
Tesseract.recognize(myImage)
.progress(message => console.log(message))
.catch(err => console.error(err))
.then(result => console.log(result))
.finally(resultOrError => console.log(resultOrError))
```
After including the script the `Tesseract` variable will be globally available.
Which is equivalent to:
```javascript
var job1 = Tesseract.recognize(myImage);
### Node.js
job1.progress(message => console.log(message));
**Tesseract.js v3 requires Node.js v14 or higher**
job1.catch(err => console.error(err));
```shell
# For v3
npm install tesseract.js
yarn add tesseract.js
job1.then(result => console.log(result));
# For v2
npm install tesseract.js@2
yarn add tesseract.js@2
job1.finally(resultOrError => console.log(resultOrError));
```
## Documentation
* [Examples](./docs/examples.md)
* [Image Format](./docs/image-format.md)
* [API](./docs/api.md)
* [Local Installation](./docs/local-installation.md)
* [FAQ](./docs/faq.md)
### TesseractJob.progress(callback: function) -> TesseractJob
Sets `callback` as the function that will be called every time the job progresses.
- `callback` is a function with the signature `callback(progress)` where `progress` is a json object.
## Use tesseract.js the way you like!
For example:
```javascript
Tesseract.recognize(myImage)
.progress(function(message){console.log('progress is: ', message)})
```
- Offline Version: https://github.com/jeromewu/tesseract.js-offline
- Electron Version: https://github.com/jeromewu/tesseract.js-electron
- Custom Traineddata: https://github.com/jeromewu/tesseract.js-custom-traineddata
- Chrome Extension #1: https://github.com/jeromewu/tesseract.js-chrome-extension
- Chrome Extension #2: https://github.com/fxnoob/image-to-text
- Firefox Extension: https://github.com/gnonio/korporize
- With Vue: https://github.com/jeromewu/tesseract.js-vue-app
- With Angular: https://github.com/jeromewu/tesseract.js-angular-app
- With React: https://github.com/jeromewu/tesseract.js-react-app
- Typescript: https://github.com/jeromewu/tesseract.js-typescript
- Video Real-time Recognition: https://github.com/jeromewu/tesseract.js-video
The console will show something like:
```javascript
progress is: {loaded_lang_model: "eng", from_cache: true}
progress is: {initialized_with_lang: "eng"}
progress is: {set_variable: Object}
progress is: {set_variable: Object}
progress is: {recognized: 0}
progress is: {recognized: 0.3}
progress is: {recognized: 0.6}
progress is: {recognized: 0.9}
progress is: {recognized: 1}
```
## Contributing
### Development
To run a development copy of Tesseract.js do the following:
```shell
# First we clone the repository
git clone https://github.com/naptha/tesseract.js.git
cd tesseract.js
### TesseractJob.then(callback: function) -> TesseractJob
Sets `callback` as the function that will be called if and when the job successfully completes.
- `callback` is a function with the signature `callback(result)` where `result` is a json object.
# Then we install the dependencies
npm install
# And finally we start the development server
npm start
For example:
```javascript
Tesseract.recognize(myImage)
.then(function(result){console.log('result is: ', result)})
```
The development server will be available at http://localhost:3000/examples/browser/demo.html in your favorite browser.
It will automatically rebuild `tesseract.dev.js` and `worker.dev.js` when you change files in the **src** folder.
The console will show something like:
```javascript
result is: {
blocks: Array[1]
confidence: 87
html: "<div class='ocr_page' id='page_1' ..."
lines: Array[3]
oem: "DEFAULT"
paragraphs: Array[1]
psm: "SINGLE_BLOCK"
symbols: Array[33]
text: "Hello World↵from beyond↵the Cosmic Void↵↵"
version: "3.04.00"
words: Array[7]
}
```
### Online Setup with a single Click
### TesseractJob.catch(callback: function) -> TesseractJob
Sets `callback` as the function that will be called if the job fails.
- `callback` is a function with the signature `callback(error)` where `error` is a json object.
You can use Gitpod (a free online VS Code-like IDE) for contributing. With a single click it will launch a ready-to-code workspace with the build & start scripts already in progress, and within a few seconds it will spin up the dev server so that you can start contributing straight away without wasting any time.
### TesseractJob.finally(callback: function) -> TesseractJob
Sets `callback` as the function that will be called regardless of whether the job fails or succeeds.
- `callback` is a function with the signature `callback(resultOrError)` where `resultOrError` is a json object.
[![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/naptha/tesseract.js/blob/master/examples/browser/demo.html)
## Local Installation
### Building Static Files
To build the compiled static files just execute the following:
```shell
npm run build
In the browser, `tesseract.js` simply provides the API layer. Internally, it opens a WebWorker to handle requests. That worker itself loads code from the Emscripten-built `tesseract.js-core` which itself is hosted on a CDN. Then it dynamically loads language files hosted on another CDN.
Because of this we recommend loading `tesseract.js` from a CDN. But if you really need to have all your files local, you can use the `Tesseract.create` function which allows you to specify custom paths for workers, languages, and core.
```javascript
window.Tesseract = Tesseract.create({
workerPath: '/path/to/worker.js',
langPath: 'https://tessdata.projectnaptha.com/3.02/',
corePath: 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js-core@0.1.0/index.js',
})
```
This will output the files into the `dist` directory.
## Contributors
### corePath
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js-core@0.1.0/index.js'. Set this string before calling `Tesseract.recognize` and `Tesseract.detect` if you want Tesseract.js to use a different file.
### Code Contributors
### workerPath
A string specifying the location of the [worker.js](./dist/worker.js) file. Set this string before calling `Tesseract.recognize` and `Tesseract.detect` if you want Tesseract.js to use a different file.
This project exists thanks to all the people who contribute. [[Contribute](CONTRIBUTING.md)].
<a href="https://github.com/naptha/tesseract.js/graphs/contributors"><img src="https://opencollective.com/tesseractjs/contributors.svg?width=890&button=false" /></a>
### langPath
A string specifying the location of the tesseract language files, with default value 'https://cdn.jsdelivr.net/gh/naptha/tessdata@gh-pages/3.02/'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`. Set this string before calling `Tesseract.recognize` and `Tesseract.detect` if you want Tesseract.js to use different language files.
### Financial Contributors
Become a financial contributor and help us sustain our community. [[Contribute](https://opencollective.com/tesseractjs/contribute)]
## Contributing
### Development
To run a development copy of tesseract.js, first clone this repo.
```shell
> git clone https://github.com/naptha/tesseract.js.git
```
Then, `cd tesseract.js && npm install && npm start`
```shell
> cd tesseract.js
> npm install && npm start
#### Individuals
... a bunch of npm stuff ...
<a href="https://opencollective.com/tesseractjs"><img src="https://opencollective.com/tesseractjs/individuals.svg?width=890"></a>
Starting up http-server, serving ./
Available on:
http://127.0.0.1:7355
http://[your ip]:7355
#### Organizations
```
Then open `http://localhost:7355/examples/file-input/demo.html` in your favorite browser. The devServer automatically rebuilds `tesseract.js` and `tesseract.worker.js` when you change files in the src folder.
Support this project with your organization. Your logo will show up here with a link to your website. [[Contribute](https://opencollective.com/tesseractjs/contribute)]
### Building Static Files
After you've cloned the repo and run `npm install` as described in the [Development Section](#development), you can build static library files in the dist folder with
```shell
> npm run build
```
<a href="https://opencollective.com/tesseractjs/organization/0/website"><img src="https://opencollective.com/tesseractjs/organization/0/avatar.svg"></a>
<a href="https://opencollective.com/tesseractjs/organization/1/website"><img src="https://opencollective.com/tesseractjs/organization/1/avatar.svg"></a>
<a href="https://opencollective.com/tesseractjs/organization/2/website"><img src="https://opencollective.com/tesseractjs/organization/2/avatar.svg"></a>
<a href="https://opencollective.com/tesseractjs/organization/3/website"><img src="https://opencollective.com/tesseractjs/organization/3/avatar.svg"></a>
<a href="https://opencollective.com/tesseractjs/organization/4/website"><img src="https://opencollective.com/tesseractjs/organization/4/avatar.svg"></a>
<a href="https://opencollective.com/tesseractjs/organization/5/website"><img src="https://opencollective.com/tesseractjs/organization/5/avatar.svg"></a>
<a href="https://opencollective.com/tesseractjs/organization/6/website"><img src="https://opencollective.com/tesseractjs/organization/6/avatar.svg"></a>
<a href="https://opencollective.com/tesseractjs/organization/7/website"><img src="https://opencollective.com/tesseractjs/organization/7/avatar.svg"></a>
<a href="https://opencollective.com/tesseractjs/organization/8/website"><img src="https://opencollective.com/tesseractjs/organization/8/avatar.svg"></a>
<a href="https://opencollective.com/tesseractjs/organization/9/website"><img src="https://opencollective.com/tesseractjs/organization/9/avatar.svg"></a>
### Send us a Pull Request!
Thanks :)

640
dist/tesseract.js vendored

@ -0,0 +1,640 @@ @@ -0,0 +1,640 @@
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.Tesseract = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){
// shim for using process in browser
var process = module.exports = {};
// cached from whatever global is present so that test runners that stub it
// don't break things. But we need to wrap it in a try catch in case it is
// wrapped in strict mode code which doesn't define any globals. It's inside a
// function because try/catches deoptimize in certain engines.
var cachedSetTimeout;
var cachedClearTimeout;
// Placeholder stored in cachedSetTimeout when no global setTimeout function
// could be captured at load time; calling it always throws.
function defaultSetTimout() {
throw new Error('setTimeout has not been defined');
}
// Placeholder stored in cachedClearTimeout when no global clearTimeout function
// could be captured at load time; calling it always throws.
function defaultClearTimeout () {
throw new Error('clearTimeout has not been defined');
}
(function () {
    // Each timer is resolved in its own try block so that a failure while
    // reading one global does not prevent the other from being captured.
    try {
        cachedSetTimeout = typeof setTimeout === 'function' ? setTimeout : defaultSetTimout;
    } catch (err) {
        cachedSetTimeout = defaultSetTimout;
    }
    try {
        cachedClearTimeout = typeof clearTimeout === 'function' ? clearTimeout : defaultClearTimeout;
    } catch (err) {
        cachedClearTimeout = defaultClearTimeout;
    }
}());
// Schedules `fun` to run on a zero-delay timer and returns whatever the
// underlying timer function returns. Prefers the current global setTimeout and
// falls back to the cached reference with progressively different `this`
// bindings when a plain call throws.
function runTimeout(fun) {
if (cachedSetTimeout === setTimeout) {
// normal environments in sane situations
return setTimeout(fun, 0);
}
// if setTimeout wasn't available at load time but was defined later
if ((cachedSetTimeout === defaultSetTimout || !cachedSetTimeout) && setTimeout) {
cachedSetTimeout = setTimeout;
return setTimeout(fun, 0);
}
try {
// when somebody has replaced setTimeout, but without the I.E. quirks handled below
return cachedSetTimeout(fun, 0);
} catch(e){
try {
// When we are in I.E. but the script has been evaled, so I.E. doesn't trust the global object when called normally
return cachedSetTimeout.call(null, fun, 0);
} catch(e){
// same as above, but for a version of I.E. that must have the global object as 'this'; hopefully our context is correct, otherwise it will throw a global error
return cachedSetTimeout.call(this, fun, 0);
}
}
}
// Cancels the timer identified by `marker` (a value previously returned by
// runTimeout). Mirrors runTimeout: prefers the current global clearTimeout and
// falls back to the cached reference with different `this` bindings.
function runClearTimeout(marker) {
if (cachedClearTimeout === clearTimeout) {
// normal environments in sane situations
return clearTimeout(marker);
}
// if clearTimeout wasn't available at load time but was defined later
if ((cachedClearTimeout === defaultClearTimeout || !cachedClearTimeout) && clearTimeout) {
cachedClearTimeout = clearTimeout;
return clearTimeout(marker);
}
try {
// when somebody has replaced clearTimeout, but without the I.E. quirks handled below
return cachedClearTimeout(marker);
} catch (e){
try {
// When we are in I.E. but the script has been evaled, so I.E. doesn't trust the global object when called normally
return cachedClearTimeout.call(null, marker);
} catch (e){
// same as above, but for a version of I.E. that must have the global object as 'this'; hopefully our context is correct, otherwise it will throw a global error.
// Some versions of I.E. have different rules for clearTimeout vs setTimeout
return cachedClearTimeout.call(this, marker);
}
}
}
// Items waiting to run; process.nextTick pushes onto this array.
var queue = [];
// True while drainQueue is executing items.
var draining = false;
// The batch drainQueue is currently walking; null when no drain is in progress.
var currentQueue;
// Index into currentQueue of the item being run; -1 between batches.
var queueIndex = -1;
// Timer callback armed by drainQueue before it starts running items and
// cancelled by drainQueue once it finishes. It therefore only does work when a
// drain was left unfinished (draining still true and a batch still set).
function cleanUpNextTick() {
if (!draining || !currentQueue) {
return;
}
draining = false;
if (currentQueue.length) {
// Put the interrupted batch back in front of anything queued since.
// queueIndex is left as-is, so the next drain's `++queueIndex` continues
// after the item that was executing.
queue = currentQueue.concat(queue);
} else {
queueIndex = -1;
}
if (queue.length) {
drainQueue();
}
}
// Runs every queued Item, including items queued while draining, and then
// resets the drain state. Does nothing if a drain is already in progress.
function drainQueue() {
if (draining) {
return;
}
// Arm cleanUpNextTick before running anything; it is cancelled at the end of
// this function, so it only fires if the loop below does not complete.
var timeout = runTimeout(cleanUpNextTick);
draining = true;
var len = queue.length;
while(len) {
// Swap in a fresh queue so items added by the running callbacks are
// collected separately and handled by the next pass of the outer loop.
currentQueue = queue;
queue = [];
while (++queueIndex < len) {
if (currentQueue) {
currentQueue[queueIndex].run();
}
}
queueIndex = -1;
len = queue.length;
}
currentQueue = null;
draining = false;
runClearTimeout(timeout);
}
// Queues `fun` to be called later with any additional arguments that were
// passed after it, and arms a drain when this is the first pending item.
process.nextTick = function (fun) {
    var extraCount = arguments.length - 1;
    var args = new Array(extraCount);
    // Copy everything after `fun` into a plain array.
    for (var k = 0; k < extraCount; k++) {
        args[k] = arguments[k + 1];
    }
    queue.push(new Item(fun, args));
    var isFirstPending = queue.length === 1;
    if (isFirstPending && !draining) {
        runTimeout(drainQueue);
    }
};
// v8 likes predictable objects: every queued entry has the same two fields,
// the callback and the argument array it will be applied with.
function Item(fun, array) {
    this.fun = fun;
    this.array = array;
}
// Invokes the stored callback with the stored arguments and a null `this`.
Item.prototype.run = function () {
    var callback = this.fun;
    var callArgs = this.array;
    callback.apply(null, callArgs);
};
// Static fields describing the browser stand-in for Node's `process`.
process.title = 'browser';
process.browser = true;
process.env = {};
process.argv = [];
process.version = ''; // empty string to avoid regexp issues
process.versions = {};

function noop() {}

// The event-emitter surface is present but inert: every method is the same
// do-nothing function.
var inertEmitterMethods = [
    'on',
    'addListener',
    'once',
    'off',
    'removeListener',
    'removeAllListeners',
    'emit',
    'prependListener',
    'prependOnceListener'
];
inertEmitterMethods.forEach(function (methodName) {
    process[methodName] = noop;
});

process.listeners = function (name) {
    return [];
};

// Operations that have no browser equivalent fail loudly.
process.binding = function (name) {
    throw new Error('process.binding is not supported');
};
process.chdir = function (dir) {
    throw new Error('process.chdir is not supported');
};

// Fixed answers for the remaining queries.
process.cwd = function () {
    return '/';
};
process.umask = function () {
    return 0;
};
},{}],2:[function(require,module,exports){
module.exports={
"name": "tesseract.js",
"version": "1.0.19",
"description": "Pure Javascript Multilingual OCR",
"main": "src/index.js",
"scripts": {
"start": "concurrently --kill-others \"watchify src/index.js -t [ envify --TESS_ENV development ] -t [ babelify --presets [ es2015 ] ] -o dist/tesseract.dev.js --standalone Tesseract\" \"watchify src/browser/worker.js -t [ envify --TESS_ENV development ] -t [ babelify --presets [ es2015 ] ] -o dist/worker.dev.js\" \"http-server -p 7355\"",
"build": "browserify src/index.js -t [ babelify --presets [ es2015 ] ] -o dist/tesseract.js --standalone Tesseract && browserify src/browser/worker.js -t [ babelify --presets [ es2015 ] ] -o dist/worker.js && uglifyjs dist/tesseract.js --source-map -o dist/tesseract.min.js && uglifyjs dist/worker.js --source-map -o dist/worker.min.js",
"release": "npm run build && git commit -am 'new release' && git push && git tag `jq -r '.version' package.json` && git push origin --tags && npm publish"
},
"browser": {
"./src/node/index.js": "./src/browser/index.js"
},
"author": "",
"license": "Apache-2.0",
"devDependencies": {
"babel-preset-es2015": "^6.16.0",
"babelify": "^7.3.0",
"browserify": "^13.1.0",
"concurrently": "^3.1.0",
"envify": "^3.4.1",
"http-server": "^0.9.0",
"pako": "^1.0.3",
"uglify-js": "^3.4.9",
"watchify": "^3.7.0"
},
"dependencies": {
"file-type": "^3.8.0",
"isomorphic-fetch": "^2.2.1",
"is-url": "1.2.2",
"jpeg-js": "^0.2.0",
"level-js": "^2.2.4",
"node-fetch": "^1.6.3",
"object-assign": "^4.1.0",
"png.js": "^0.2.1",
"tesseract.js-core": "^1.0.2"
},
"repository": {
"type": "git",
"url": "https://github.com/naptha/tesseract.js.git"
},
"bugs": {
"url": "https://github.com/naptha/tesseract.js/issues"
},
"homepage": "https://github.com/naptha/tesseract.js"
}
},{}],3:[function(require,module,exports){
(function (process){
'use strict';
// Default locations of the scripts and language data used by the browser
// adapter. `workerPath` is filled in below because it depends on the build
// environment.
var defaultOptions = {
    corePath: 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js-core@0.1.0/index.js',
    langPath: 'https://tessdata.projectnaptha.com/3.02/'
};

var resolvedWorkerPath;
if (process.env.TESS_ENV !== "development") {
    // Released builds load the worker script that matches this package version.
    resolvedWorkerPath = 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js@' + require('../../package.json').version + '/dist/worker.js';
} else {
    console.debug('Using Development Configuration');
    // Development builds load the locally served worker; the random suffix
    // makes every page load request a fresh copy.
    var devOrigin = location.protocol + '//' + location.host;
    resolvedWorkerPath = devOrigin + '/dist/worker.dev.js?nocache=' + Math.random().toString(36).slice(3);
}
defaultOptions.workerPath = resolvedWorkerPath;

exports.defaultOptions = defaultOptions;
exports.spawnWorker = function spawnWorker(instance, workerOptions) {
if (Blob && URL) {
var blob = new Blob(['importScripts("' + workerOptions.workerPath + '");'], {
type: 'application/javascript'
});
var worker = new Worker(URL.createObjectURL(blob));
} else {
var worker = new Worker(workerOptions.workerPath);
}
worker.onmessage = function (e) {
var packet = e.data;
instance._recv(packet);
};
return worker;
};
// Stops the Web Worker held in `instance.worker` by calling its terminate().
// Assumes instance.worker is set; the caller is responsible for checking.
exports.terminateWorker = function (instance) {
instance.worker.terminate();
};
exports.sendPacket = function sendPacket(instance, packet) {
loadImage(packet.payload.image, function (img) {
packet.payload.image = img;
instance.worker.postMessage(packet);
});
};
/**
 * Reduces any supported image source to a value that can be posted to the
 * worker and hands it to `cb`. Each branch converts its input one step and
 * recurses:
 *
 *   '#id' selector -> element
 *   blob:/data: URL -> Image element
 *   other URL -> Blob (fetched with XMLHttpRequest)
 *   File -> data URL
 *   Blob -> object URL
 *   canvas -> 2d context
 *   IMG / VIDEO element -> 2d context of a new canvas
 *   2d context -> ImageData
 *   anything else -> passed to `cb` unchanged
 *
 * @param {*} image - the image source, in any of the forms listed above.
 * @param {Function} cb - called with the final value; may be called
 *   synchronously or after asynchronous loading, depending on the input.
 * @throws {TypeError} when `image` is null or undefined (for example when a
 *   selector matches no element).
 */
function loadImage(image, cb) {
    if (image === null || image === undefined) {
        throw new TypeError('loadImage: no image to load (got ' + image + '); check that the selector matches an element');
    }

    if (typeof image === 'string') {
        if (/^\#/.test(image)) {
            // element css selector
            return loadImage(document.querySelector(image), cb);
        }
        // Anchored at the start: an ordinary URL that merely contains "data:" or
        // "blob:" somewhere in its path or query must go through the XHR branch.
        if (/^\s*(blob|data)\:/.test(image)) {
            // data url / object url
            var im = new Image();
            // Handlers are attached before `src` so no load event can be missed.
            im.onload = function (e) {
                return loadImage(im, cb);
            };
            // NOTE: this throws from an event handler, so callers cannot catch it.
            im.onerror = function (e) {
                throw e;
            };
            im.src = image;
            return;
        }
        var xhr = new XMLHttpRequest();
        xhr.open('GET', image, true);
        xhr.responseType = "blob";
        xhr.onload = function (e) {
            if (xhr.status >= 400) {
                throw new Error('Fail to get image as Blob');
            }
            loadImage(xhr.response, cb);
        };
        xhr.onerror = function (e) {
            throw e;
        };
        xhr.send(null);
        return;
    }

    // File is tested before Blob so that files go through FileReader.
    if (image instanceof File) {
        var fr = new FileReader();
        fr.onload = function (e) {
            return loadImage(fr.result, cb);
        };
        fr.onerror = function (e) {
            throw e;
        };
        fr.readAsDataURL(image);
        return;
    }

    if (image instanceof Blob) {
        var objectUrl = URL.createObjectURL(image);
        // Release the object URL once the pixels have been extracted; otherwise
        // the Blob stays referenced for the lifetime of the page.
        return loadImage(objectUrl, function (loaded) {
            URL.revokeObjectURL(objectUrl);
            return cb(loaded);
        });
    }

    if (image.getContext) {
        // canvas element
        return loadImage(image.getContext('2d'), cb);
    }

    if (image.tagName == "IMG" || image.tagName == "VIDEO") {
        // image element or video element: draw it onto a canvas of the same size
        var c = document.createElement('canvas');
        c.width = image.naturalWidth || image.videoWidth;
        c.height = image.naturalHeight || image.videoHeight;
        var ctx = c.getContext('2d');
        ctx.drawImage(image, 0, 0);
        return loadImage(ctx, cb);
    }

    if (image.getImageData) {
        // canvas context
        var data = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
        return loadImage(data, cb);
    }

    return cb(image);
}
}).call(this,require('_process'))
},{"../../package.json":2,"_process":1}],4:[function(require,module,exports){
"use strict";
// The result of dump.js is a big JSON tree
// which can be easily serialized (for instance
// to be sent from a webworker to the main app
// or through Node's IPC), but we want
// a (circular) DOM-like interface for walking
// through the data.
module.exports = function circularize(page) {
page.paragraphs = [];
page.lines = [];
page.words = [];
page.symbols = [];
page.blocks.forEach(function (block) {
block.page = page;
block.lines = [];
block.words = [];
block.symbols = [];
block.paragraphs.forEach(function (para) {
para.block = block;
para.page = page;
para.words = [];
para.symbols = [];
para.lines.forEach(function (line) {
line.paragraph = para;
line.block = block;
line.page = page;
line.symbols = [];
line.words.forEach(function (word) {
word.line = line;
word.paragraph = para;
word.block = block;
word.page = page;
word.symbols.forEach(function (sym) {
sym.word = word;
sym.line = line;
sym.paragraph = para;
sym.block = block;
sym.page = page;
sym.line.symbols.push(sym);
sym.paragraph.symbols.push(sym);
sym.block.symbols.push(sym);
sym.page.symbols.push(sym);
});
word.paragraph.words.push(word);
word.block.words.push(word);
word.page.words.push(word);
});
line.block.lines.push(line);
line.page.lines.push(line);
});
para.page.paragraphs.push(para);
});
});
return page;
};
},{}],5:[function(require,module,exports){
'use strict';
// Transpiler helper: installs method descriptors on a constructor's prototype
// (protoProps) and on the constructor itself (staticProps), then returns the
// constructor. Each descriptor is made configurable, non-enumerable unless it
// says otherwise, and writable when it carries a `value`.
var _createClass = (function () {
    function defineProperties(target, props) {
        var index = 0;
        while (index < props.length) {
            var descriptor = props[index];
            index += 1;
            descriptor.enumerable = descriptor.enumerable || false;
            descriptor.configurable = true;
            if ("value" in descriptor) {
                descriptor.writable = true;
            }
            Object.defineProperty(target, descriptor.key, descriptor);
        }
    }
    return function (Constructor, protoProps, staticProps) {
        if (protoProps) {
            defineProperties(Constructor.prototype, protoProps);
        }
        if (staticProps) {
            defineProperties(Constructor, staticProps);
        }
        return Constructor;
    };
}());
// Transpiler helper: rejects calling a class constructor without `new`.
function _classCallCheck(instance, Constructor) {
    if (instance instanceof Constructor) {
        return;
    }
    throw new TypeError("Cannot call a class as a function");
}
var adapter = require('../node/index.js');
var jobCounter = 0;
module.exports = function () {
function TesseractJob(instance) {
_classCallCheck(this, TesseractJob);
this.id = 'Job-' + ++jobCounter + '-' + Math.random().toString(16).slice(3, 8);
this._instance = instance;
this._resolve = [];
this._reject = [];
this._progress = [];
this._finally = [];
}
_createClass(TesseractJob, [{
key: 'then',
value: function then(resolve, reject) {
if (this._resolve.push) {
this._resolve.push(resolve);
} else {
resolve(this._resolve);
}
if (reject) this.catch(reject);
return this;
}
}, {
key: 'catch',
value: function _catch(reject) {
if (this._reject.push) {
this._reject.push(reject);
} else {
reject(this._reject);
}
return this;
}
}, {
key: 'progress',
value: function progress(fn) {
this._progress.push(fn);
return this;
}
}, {
key: 'finally',
value: function _finally(fn) {
this._finally.push(fn);
return this;
}
}, {
key: '_send',
value: function _send(action, payload) {
adapter.sendPacket(this._instance, {
jobId: this.id,
action: action,
payload: payload
});
}
}, {
key: '_handle',
value: function _handle(packet) {
var data = packet.data;
var runFinallyCbs = false;
if (packet.status === 'resolve') {
if (this._resolve.length === 0) console.log(data);
this._resolve.forEach(function (fn) {
var ret = fn(data);
if (ret && typeof ret.then == 'function') {
console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.');
}
});
this._resolve = data;
this._instance._dequeue();
runFinallyCbs = true;
} else if (packet.status === 'reject') {
if (this._reject.length === 0) console.error(data);
this._reject.forEach(function (fn) {
return fn(data);
});
this._reject = data;
this._instance._dequeue();
runFinallyCbs = true;
} else if (packet.status === 'progress') {
this._progress.forEach(function (fn) {
return fn(data);
});
} else {
console.warn('Message type unknown', packet.status);
}
if (runFinallyCbs) {
this._finally.forEach(function (fn) {
return fn(data);
});
}
}
}]);
return TesseractJob;
}();
},{"../node/index.js":3}],6:[function(require,module,exports){
'use strict';
// Transpiler helper: installs method descriptors on a constructor's prototype
// (protoProps) and on the constructor itself (staticProps), then returns the
// constructor. Each descriptor is made configurable, non-enumerable unless it
// says otherwise, and writable when it carries a `value`.
var _createClass = (function () {
    function defineProperties(target, props) {
        var index = 0;
        while (index < props.length) {
            var descriptor = props[index];
            index += 1;
            descriptor.enumerable = descriptor.enumerable || false;
            descriptor.configurable = true;
            if ("value" in descriptor) {
                descriptor.writable = true;
            }
            Object.defineProperty(target, descriptor.key, descriptor);
        }
    }
    return function (Constructor, protoProps, staticProps) {
        if (protoProps) {
            defineProperties(Constructor.prototype, protoProps);
        }
        if (staticProps) {
            defineProperties(Constructor, staticProps);
        }
        return Constructor;
    };
}());
// Transpiler helper: rejects calling a class constructor without `new`.
function _classCallCheck(instance, Constructor) {
    if (instance instanceof Constructor) {
        return;
    }
    throw new TypeError("Cannot call a class as a function");
}
var adapter = require('./node/index.js');
var circularize = require('./common/circularize.js');
var TesseractJob = require('./common/job');
var version = require('../package.json').version;
// Factory for worker wrappers. The optional first argument overrides entries
// of the adapter's default options. The returned wrapper also carries the
// factory itself (`create`) and the package version (`version`).
var create = function create() {
    var overrides = arguments[0];
    if (overrides === undefined) {
        overrides = {};
    }
    var mergedOptions = Object.assign({}, adapter.defaultOptions, overrides);
    var instance = new TesseractWorker(mergedOptions);
    instance.create = create;
    instance.version = version;
    return instance;
};
var TesseractWorker = function () {
    /**
     * Front end for one lazily spawned worker. Jobs are queued and run one at
     * a time; packets coming back from the worker are routed to the job that
     * is currently running.
     *
     * @param {Object} workerOptions - options handed to the adapter when the
     *   worker is spawned and forwarded with every job.
     */
    function TesseractWorker(workerOptions) {
        _classCallCheck(this, TesseractWorker);
        this.worker = null;
        this.workerOptions = workerOptions;
        this._currentJob = null;
        this._queue = [];
    }
    _createClass(TesseractWorker, [{
        key: 'recognize',
        /**
         * Queues an OCR job for `image`.
         * @param {*} image - image source, passed through to the adapter.
         * @param {Object|string} [options] - job options, or a language string
         *   as shorthand for `{ lang: ... }`. `lang` defaults to 'eng'.
         * @returns {TesseractJob} the queued job.
         */
        value: function recognize(image) {
            var _this = this;
            var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
            return this._delay(function (job) {
                // Work on a copy so the caller's options object is never modified.
                var jobOptions = typeof options === 'string' ? { lang: options } : Object.assign({}, options);
                jobOptions.lang = jobOptions.lang || 'eng';
                job._send('recognize', { image: image, options: jobOptions, workerOptions: _this.workerOptions });
            });
        }
    }, {
        key: 'detect',
        /**
         * Queues a detection job for `image`.
         * @param {*} image - image source, passed through to the adapter.
         * @param {Object} [options] - job options, forwarded unchanged.
         * @returns {TesseractJob} the queued job.
         */
        value: function detect(image) {
            var _this2 = this;
            var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
            return this._delay(function (job) {
                job._send('detect', { image: image, options: options, workerOptions: _this2.workerOptions });
            });
        }
    }, {
        key: 'terminate',
        /** Terminates the worker (if one was spawned) and drops the current job and every queued job. */
        value: function terminate() {
            if (this.worker) adapter.terminateWorker(this);
            this.worker = null;
            this._currentJob = null;
            this._queue = [];
        }
    }, {
        key: '_delay',
        /**
         * Creates a job, spawning the worker on first use, and queues `fn(job)`
         * to run when the job reaches the front of the queue.
         * @returns {TesseractJob} the new job.
         */
        value: function _delay(fn) {
            var _this3 = this;
            if (!this.worker) this.worker = adapter.spawnWorker(this, this.workerOptions);
            var job = new TesseractJob(this);
            this._queue.push(function (e) {
                _this3._queue.shift();
                _this3._currentJob = job;
                fn(job);
            });
            if (!this._currentJob) this._dequeue();
            return job;
        }
    }, {
        key: '_dequeue',
        /** Clears the current job and starts the next queued one, if any. */
        value: function _dequeue() {
            this._currentJob = null;
            if (this._queue.length) {
                this._queue[0]();
            }
        }
    }, {
        key: '_recv',
        /**
         * Routes a packet from the worker to the running job. Packets that do not
         * belong to the running job are reported and dropped. This includes
         * packets that arrive when no job is running, e.g. after the job settled
         * or after terminate().
         */
        value: function _recv(packet) {
            var job = this._currentJob;
            if (!job || job.id !== packet.jobId) {
                console.warn('Job ID ' + packet.jobId + ' not known.');
                return;
            }
            if (packet.status === 'resolve' && packet.action === 'recognize') {
                packet.data = circularize(packet.data);
            }
            job._handle(packet);
        }
    }]);
    return TesseractWorker;
}();
module.exports = create();
},{"../package.json":2,"./common/circularize.js":4,"./common/job":5,"./node/index.js":3}]},{},[6])(6)
});

1
dist/tesseract.min.js vendored

File diff suppressed because one or more lines are too long

1
dist/tesseract.min.js.map vendored

File diff suppressed because one or more lines are too long

9051
dist/worker.js vendored

File diff suppressed because it is too large Load Diff

1
dist/worker.min.js vendored

File diff suppressed because one or more lines are too long

1
dist/worker.min.js.map vendored

File diff suppressed because one or more lines are too long

448
docs/api.md

@ -1,448 +0,0 @@ @@ -1,448 +0,0 @@
# API
- [createWorker()](#create-worker)
- [Worker.load](#worker-load)
- [Worker.writeText](#worker-writeText)
- [Worker.readText](#worker-readText)
- [Worker.removeFile](#worker-removeFile)
- [Worker.FS](#worker-FS)
- [Worker.loadLanguage](#worker-load-language)
- [Worker.initialize](#worker-initialize)
- [Worker.setParameters](#worker-set-parameters)
- [Worker.recognize](#worker-recognize)
- [Worker.detect](#worker-detect)
- [Worker.terminate](#worker-terminate)
- [createScheduler()](#create-scheduler)
- [Scheduler.addWorker](#scheduler-add-worker)
- [Scheduler.addJob](#scheduler-add-job)
- [Scheduler.getQueueLen](#scheduler-get-queue-len)
- [Scheduler.getNumWorkers](#scheduler-get-num-workers)
- [setLogging()](#set-logging)
- [recognize()](#recognize)
- [detect()](#detect)
- [PSM](#psm)
- [OEM](#oem)
---
<a name="create-worker"></a>
## createWorker(options): Worker
createWorker is a factory function that creates a tesseract worker, a worker is basically a Web Worker in browser and Child Process in Node.
**Arguments:**
- `options` an object of customized options
- `corePath` path for tesseract-core.js script
- `langPath` path for downloading traineddata, do not include `/` at the end of the path
- `workerPath` path for downloading worker script
- `dataPath` path for saving traineddata in WebAssembly file system, not common to modify
- `cachePath` path for the cached traineddata, more useful for Node, for browser it only changes the key in IndexedDB
- `cacheMethod` a string to indicate the method of cache management, should be one of the following options
- write: read cache and write back (default method)
- readOnly: read cache and not to write back
- refresh: not to read cache and write back
- none: not to read cache and not to write back
- `workerBlobURL` a boolean to define whether to use Blob URL for worker script, default: true
- `gzip` a boolean to define whether the traineddata from the remote is gzipped, default: true
- `logger` a function to log the progress, a quick example is `m => console.log(m)`
- `errorHandler` a function to handle worker errors, a quick example is `err => console.error(err)`
**Examples:**
```javascript
const { createWorker } = Tesseract;
const worker = createWorker({
langPath: '...',
logger: m => console.log(m),
});
```
## Worker
A Worker helps you to do the OCR related tasks, it takes few steps to setup Worker before it is fully functional. The full flow is:
- load
- FS functions // optional
- loadLanguage
- initialize
- setParameters // optional
- recognize or detect
- terminate
Each function is async, so using async/await or Promise is required. When it is resolved, you get an object:
```json
{
"jobId": "Job-1-123",
"data": { ... }
}
```
jobId is generated by Tesseract.js, but you can supply your own when calling any of the functions above.
<a name="worker-load"></a>
### Worker.load(jobId): Promise
Worker.load() loads tesseract.js-core scripts (download from remote if not presented), it makes Web Worker/Child Process ready for next action.
**Arguments:**
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.load();
})();
```
<a name="worker-writeText"></a>
### Worker.writeText(path, text, jobId): Promise
Worker.writeText() writes a text file to the path specified in MEMFS, it is useful when you want to use features that require tesseract.js
to read a file from the file system.
**Arguments:**
- `path` text file path
- `text` content of the text file
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.writeText('tmp.txt', 'Hi\nTesseract.js\n');
})();
```
<a name="worker-readText"></a>
### Worker.readText(path, jobId): Promise
Worker.readText() reads a text file from the path specified in MEMFS, it is useful when you want to check the content.
**Arguments:**
- `path` text file path
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
const { data } = await worker.readText('tmp.txt');
console.log(data);
})();
```
<a name="worker-removeFile"></a>
### Worker.removeFile(path, jobId): Promise
Worker.removeFile() removes a file in MEMFS, it is useful when you want to free the memory.
**Arguments:**
- `path` file path
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.removeFile('tmp.txt');
})();
```
<a name="worker-FS"></a>
### Worker.FS(method, args, jobId): Promise
Worker.FS() is a generic FS function to do anything you want, you can check [HERE](https://emscripten.org/docs/api_reference/Filesystem-API.html) for all functions.
**Arguments:**
- `method` method name
- `args` array of arguments to pass
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.FS('writeFile', ['tmp.txt', 'Hi\nTesseract.js\n']);
// equal to:
// await worker.writeText('tmp.txt', 'Hi\nTesseract.js\n');
})();
```
<a name="worker-load-language"></a>
### Worker.loadLanguage(langs, jobId): Promise
Worker.loadLanguage() loads traineddata from cache or download traineddata from remote, and put traineddata into the WebAssembly file system.
**Arguments:**
- `langs` a string to indicate the languages traineddata to download, multiple languages are concatenated with **+**, ex: **eng+chi\_tra**
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.loadLanguage('eng+chi_tra');
})();
```
<a name="worker-initialize"></a>
### Worker.initialize(langs, oem, jobId): Promise
Worker.initialize() initializes the Tesseract API, make sure it is ready for doing OCR tasks.
**Arguments:**
- `langs` a string to indicate the languages loaded by Tesseract API, it can be a subset of the language traineddata you loaded with Worker.loadLanguage.
- `oem` an enum to indicate the OCR Engine Mode you use
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
/** You can load more languages in advance, but use only part of them in Worker.initialize() */
await worker.loadLanguage('eng+chi_tra');
await worker.initialize('eng');
})();
```
<a name="worker-set-parameters"></a>
### Worker.setParameters(params, jobId): Promise
Worker.setParameters() sets parameters for Tesseract API (using SetVariable()), it changes the behavior of Tesseract and some parameters like tessedit\_char\_whitelist are very useful.
**Arguments:**
- `params` an object with key and value of the parameters
- `jobId` Please see details above
**Supported Parameters:**
| name | type | default value | description |
| --------------------------- | ------ | ----------------- | ------------------------------------------------------------------------------------------------------------------------------- |
| tessedit\_ocr\_engine\_mode | enum | OEM.DEFAULT | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode |
| tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contain these characters, useful when the content in the image is limited |
| preserve\_interword\_spaces | string | '0' | '0' or '1', keeps the space between words |
| user\_defined\_dpi | string | '' | Define custom dpi, use to fix **Warning: Invalid resolution 0 dpi. Using 70 instead.** |
| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result |
| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |
| tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result |
| tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
**Examples:**
```javascript
(async () => {
await worker.setParameters({
tessedit_char_whitelist: '0123456789',
});
})();
```
<a name="worker-recognize"></a>
### Worker.recognize(image, options, jobId): Promise
Worker.recognize() provides the core function of Tesseract.js: it executes OCR and
figures out what words are in `image`, where the words are in `image`, etc.
> Note: `image` should be sufficiently high resolution.
> Often, the same image will get much better results if you upscale it before calling `recognize`.
**Arguments:**
- `image` see [Image Format](./image-format.md) for more details.
- `options` an object of customized options
- `rectangle` an object to specify the regions you want to recognized in the image, should contain top, left, width and height, see example below.
- `jobId` Please see details above
**Output:**
**Examples:**
```javascript
const { createWorker } = Tesseract;
(async () => {
const worker = createWorker();
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image);
console.log(text);
})();
```
With rectangle
```javascript
const { createWorker } = Tesseract;
(async () => {
const worker = createWorker();
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image, {
rectangle: { top: 0, left: 0, width: 100, height: 100 },
});
console.log(text);
})();
```
<a name="worker-detect"></a>
### Worker.detect(image, jobId): Promise
Worker.detect() does OSD (Orientation and Script Detection) to the image instead of OCR.
**Arguments:**
- `image` see [Image Format](./image-format.md) for more details.
- `jobId` Please see details above
**Examples:**
```javascript
const { createWorker } = Tesseract;
(async () => {
const worker = createWorker();
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data } = await worker.detect(image);
console.log(data);
})();
```
<a name="worker-terminate"></a>
### Worker.terminate(jobId): Promise
Worker.terminate() terminates the worker and cleans up
```javascript
(async () => {
await worker.terminate();
})();
```
<a name="create-scheduler"></a>
## createScheduler(): Scheduler
createScheduler() is a factory function to create a scheduler, a scheduler manages a job queue and workers to enable multiple workers to work together, it is useful when you want to speed up your performance.
**Examples:**
```javascript
const { createScheduler } = Tesseract;
const scheduler = createScheduler();
```
### Scheduler
<a name="scheduler-add-worker"></a>
### Scheduler.addWorker(worker): string
Scheduler.addWorker() adds a worker into the worker pool inside scheduler, it is suggested to add one worker to only one scheduler.
**Arguments:**
- `worker` see Worker above
**Examples:**
```javascript
const { createWorker, createScheduler } = Tesseract;
const scheduler = createScheduler();
const worker = createWorker();
scheduler.addWorker(worker);
```
<a name="scheduler-add-job"></a>
### Scheduler.addJob(action, ...payload): Promise
Scheduler.addJob() adds a job to the job queue and scheduler waits and finds an idle worker to take the job.
**Arguments:**
- `action` a string to indicate the action you want to do, right now only **recognize** and **detect** are supported
- `payload` an arbitrary number of args depending on the action you called.
**Examples:**
```javascript
(async () => {
const { data: { text } } = await scheduler.addJob('recognize', image, options);
const { data } = await scheduler.addJob('detect', image);
})();
```
<a name="scheduler-get-queue-len"></a>
### Scheduler.getQueueLen(): number
Scheduler.getQueueLen() returns the length of job queue.
<a name="scheduler-get-num-workers"></a>
### Scheduler.getNumWorkers(): number
Scheduler.getNumWorkers() returns number of workers added into the scheduler
<a name="scheduler-terminate"></a>
### Scheduler.terminate(): Promise
Scheduler.terminate() terminates all workers added, useful to do quick clean up.
**Examples:**
```javascript
(async () => {
await scheduler.terminate();
})();
```
<a name="set-logging"></a>
## setLogging(logging: boolean)
setLogging() sets the logging flag, you can `setLogging(true)` to see detailed information, useful for debugging.
**Arguments:**
- `logging` boolean to define whether to see detailed logs, default: false
**Examples:**
```javascript
const { setLogging } = Tesseract;
setLogging(true);
```
<a name="recognize"></a>
## recognize(image, langs, options): Promise
recognize() is a function to quickly do recognize() task, it is not recommended to use in real application, but useful when you want to save some time.
See [Tesseract.js](../src/Tesseract.js)
<a name="detect"></a>
## detect(image, options): Promise
Same background as recognize(), but it does detect instead.
See [Tesseract.js](../src/Tesseract.js)
<a name="psm"></a>
## PSM
See [PSM.js](../src/constants/PSM.js)
<a name="oem"></a>
## OEM
See [OEM.js](../src/constants/OEM.js)

226
docs/examples.md

@ -1,226 +0,0 @@ @@ -1,226 +0,0 @@
# Tesseract.js Examples
You can also check [examples](../examples) folder.
### basic
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
await worker.terminate();
})();
```
### with detailed progress
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker({
logger: m => console.log(m), // Add logger here
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
await worker.terminate();
})();
```
### with multiple languages, separate by '+'
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng+chi_tra');
await worker.initialize('eng+chi_tra');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
await worker.terminate();
})();
```
### with whitelist char (^2.0.0-beta.1)
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({
tessedit_char_whitelist: '0123456789',
});
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
await worker.terminate();
})();
```
### with different pageseg mode (^2.0.0-beta.1)
Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163
```javascript
const { createWorker, PSM } = require('tesseract.js');
const worker = createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
});
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
await worker.terminate();
})();
```
### with pdf output (^2.0.0-beta.1)
Please check **examples** folder for details.
Browser: [download-pdf.html](../examples/browser/download-pdf.html)
Node: [download-pdf.js](../examples/node/download-pdf.js)
### with only part of the image (^2.0.1)
**One rectangle**
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const rectangle = { left: 0, top: 0, width: 500, height: 250 };
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle });
console.log(text);
await worker.terminate();
})();
```
**Multiple Rectangles**
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const rectangles = [
{
left: 0,
top: 0,
width: 500,
height: 250,
},
{
left: 500,
top: 0,
width: 500,
height: 250,
},
];
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const values = [];
for (let i = 0; i < rectangles.length; i++) {
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle: rectangles[i] });
values.push(text);
}
console.log(values);
await worker.terminate();
})();
```
**Multiple Rectangles (with scheduler to do recognition in parallel)**
```javascript
const { createWorker, createScheduler } = require('tesseract.js');
const scheduler = createScheduler();
const worker1 = createWorker();
const worker2 = createWorker();
const rectangles = [
{
left: 0,
top: 0,
width: 500,
height: 250,
},
{
left: 500,
top: 0,
width: 500,
height: 250,
},
];
(async () => {
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng');
await worker2.loadLanguage('eng');
await worker1.initialize('eng');
await worker2.initialize('eng');
scheduler.addWorker(worker1);
scheduler.addWorker(worker2);
const results = await Promise.all(rectangles.map((rectangle) => (
scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle })
)));
console.log(results.map(r => r.data.text));
await scheduler.terminate();
})();
```
### with multiple workers to speed up (^2.0.0-beta.1)
```javascript
const { createWorker, createScheduler } = require('tesseract.js');
const scheduler = createScheduler();
const worker1 = createWorker();
const worker2 = createWorker();
(async () => {
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng');
await worker2.loadLanguage('eng');
await worker1.initialize('eng');
await worker2.initialize('eng');
scheduler.addWorker(worker1);
scheduler.addWorker(worker2);
/** Add 10 recognition jobs */
const results = await Promise.all(Array(10).fill(0).map(() => (
scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png')
)))
console.log(results);
await scheduler.terminate(); // It also terminates all workers.
})();
```

42
docs/faq.md

@ -1,42 +0,0 @@ @@ -1,42 +0,0 @@
FAQ
===
## How does tesseract.js download and keep \*.traineddata?
The language model is downloaded by `worker.loadLanguage()` and you need to pass the langs to `worker.initialize()`.
During the download of a language model, Tesseract.js first checks whether \*.traineddata already exists (browser: [IndexedDB](https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API), Node.js: fs, in the folder where you execute the command). If the \*.traineddata doesn't exist, it fetches \*.traineddata.gz from [tessdata](https://github.com/naptha/tessdata), ungzips it and stores it in IndexedDB or fs. You can delete it manually and it will be downloaded again for you.
## How can I train my own \*.traineddata?
For tesseract.js v2, check [TrainingTesseract 4.00](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00)
For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://tesseract-ocr.github.io/tessdoc/Training-Tesseract-3.03%E2%80%933.05)
## How can I get HOCR, TSV, Box, UNLV, OSD?
Starting from 2.0.0-beta.1, you can get all of this information in the final result.
```javascript
import { createWorker } from 'tesseract.js';
const worker = createWorker({
logger: m => console.log(m)
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({
tessedit_create_box: '1',
tessedit_create_unlv: '1',
tessedit_create_osd: '1',
});
const { data: { text, hocr, tsv, box, unlv } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
console.log(hocr);
console.log(tsv);
console.log(box);
console.log(unlv);
})();
```

18
docs/image-format.md

@ -1,18 +0,0 @@ @@ -1,18 +0,0 @@
# Image Format
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter. The image formats and data types supported are listed below.
Supported Image Formats: **bmp, jpg, png, pbm, webp**
For browser and Node, supported data types are:
- string with base64 encoded image (fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp)
- buffer
For browser only, supported data types are:
- `File` or `Blob` object
- `img` or `canvas` element
For Node only, supported data types are:
- string containing a path to local image
Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported.

BIN
docs/images/tesseract.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 105 KiB

BIN
docs/images/video-demo.gif

Binary file not shown.

Before

Width:  |  Height:  |  Size: 237 KiB

38
docs/local-installation.md

@ -1,38 +0,0 @@ @@ -1,38 +0,0 @@
## Local Installation
Check here for examples: https://github.com/naptha/tesseract.js/blob/master/docs/examples.md
In browser environment, `tesseract.js` simply provides the API layer. Internally, it opens a WebWorker to handle requests. That worker itself loads code from the Emscripten-built `tesseract.js-core` which itself is hosted on a CDN. Then it dynamically loads language files hosted on another CDN.
Because of this we recommend loading `tesseract.js` from a CDN. But if you really need to have all your files local, you can pass extra options to `Tesseract.recognize` or `createWorker` to specify custom paths for workers, languages, and core.
In Node.js environment, the only path you may want to customize is languages/langPath.
```javascript
Tesseract.recognize(image, langs, {
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js',
})
```
Or
```javascript
const worker = createWorker({
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js',
});
```
### workerPath
A string specifying the location of the [worker.js](./dist/worker.min.js) file.
### langPath
A string specifying the location of the tesseract language files, with default value 'https://tessdata.projectnaptha.com/4.0.0'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`.
### corePath
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available).
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm'. But it fails to fetch at this moment.

71
docs/tesseract_lang_list.md

@ -1,3 +1,72 @@ @@ -1,3 +1,72 @@
# Tesseract Languages
Please check [HERE](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) for supported languages
The `lang` property of the options object passed to `Tesseract.recognize` can have one of the following values (the default is `'eng'`):
| `lang` | Language |
|-----------|-----------------------|
| 'afr' | Afrikaans |
| 'ara' | Arabic |
| 'aze' | Azerbaijani |
| 'bel' | Belarusian |
| 'ben' | Bengali |
| 'bul' | Bulgarian |
| 'cat' | Catalan |
| 'ces' | Czech |
| 'chi_sim' | Chinese |
| 'chi_tra' | Traditional Chinese |
| 'chr' | Cherokee |
| 'dan' | Danish |
| 'deu' | German |
| 'ell' | Greek |
| 'eng' | English |
| 'enm' | English (Old) |
| 'epo' | Esperanto |
| 'epo_alt' | Esperanto alternative |
| 'equ' | Math |
| 'est' | Estonian |
| 'eus' | Basque |
| 'fas' | Persian (Farsi) |
| 'fin' | Finnish |
| 'fra' | French |
| 'frk' | Frankish |
| 'frm' | French (Old) |
| 'glg' | Galician |
| 'grc' | Ancient Greek |
| 'heb' | Hebrew |
| 'hin' | Hindi |
| 'hrv' | Croatian |
| 'hun' | Hungarian |
| 'ind' | Indonesian |
| 'isl' | Icelandic |
| 'ita' | Italian |
| 'ita_old' | Italian (Old) |
| 'jpn' | Japanese |
| 'kan' | Kannada |
| 'kor' | Korean |
| 'lav' | Latvian |
| 'lit' | Lithuanian |
| 'mal' | Malayalam |
| 'mkd' | Macedonian |
| 'mlt' | Maltese |
| 'msa' | Malay |
| 'nld' | Dutch |
| 'nor' | Norwegian |
| 'pol' | Polish |
| 'por' | Portuguese |
| 'ron' | Romanian |
| 'rus' | Russian |
| 'slk' | Slovakian |
| 'slv' | Slovenian |
| 'spa' | Spanish |
| 'spa_old' | Old Spanish |
| 'sqi' | Albanian |
| 'srp' | Serbian (Latin) |
| 'swa' | Swahili |
| 'swe' | Swedish |
| 'tam' | Tamil |
| 'tel' | Telugu |
| 'tgl' | Tagalog |
| 'tha' | Thai |
| 'tur' | Turkish |
| 'ukr' | Ukrainian |
| 'vie' | Vietnamese |

BIN
tests/assets/images/simple.bmp → docs/tesseract_parameters.md

Binary file not shown.

Before

Width:  |  Height:  |  Size: 169 KiB

After

Width:  |  Height:  |  Size: 215 KiB

37
examples/browser/basic-edge.html

@ -1,37 +0,0 @@ @@ -1,37 +0,0 @@
<!DOCTYPE HTML>
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<input type="file" id="uploader">
<script>
/**
 * Change handler for the file input: runs OCR on the first selected file and
 * logs the recognized text.
 *
 * A new worker is created on every invocation, so it is terminated once the
 * job has settled (successfully or not); otherwise each upload would leave a
 * worker behind.
 *
 * @param {Event} evt - change event whose target exposes the selected `files`.
 * @returns {Promise} settles after the worker has been terminated.
 */
const recognize = function(evt){
  const files = evt.target.files;
  const worker = Tesseract.createWorker({
    /*
     * As Edge doesn't support WebAssembly,
     * here we force the asm.js version.
     */
    corePath: '../../node_modules/tesseract.js-core/tesseract-core.asm.js',
    logger: function(m){console.log(m);},
    /*
     * As there is no IndexedDB in earlier versions
     * of Edge, here we disable the cache.
     */
    cacheMethod: 'none',
  });
  return Promise.resolve()
    .then(() => worker.load())
    .then(() => worker.loadLanguage('eng'))
    .then(() => worker.initialize('eng'))
    .then(() => worker.recognize(files[0]))
    .then((ret) => {
      console.log(ret.data.text);
    })
    // Report failures instead of leaving an unhandled rejection.
    .catch((err) => {
      console.error(err);
    })
    // A `.then` placed after `.catch` always runs. It is used instead of
    // `.finally`, which older Edge builds may not provide.
    .then(() => worker.terminate());
}
const elm = document.getElementById('uploader');
elm.addEventListener('change', recognize);
</script>
</body>
</html>

19
examples/browser/basic.html

@ -1,19 +0,0 @@ @@ -1,19 +0,0 @@
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<input type="file" id="uploader">
<script>
/**
 * Change handler for the file input: recognizes the first selected file as
 * English text via the one-shot `Tesseract.recognize` helper and logs the
 * result. Progress messages from the library are logged as they arrive.
 */
const recognize = async (event) => {
  const selectedFile = event.target.files[0];
  const ocrOptions = {
    corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js',
    logger: (message) => console.log(message),
  };
  const result = await Tesseract.recognize(selectedFile, 'eng', ocrOptions);
  console.log(result.data.text);
}
const elm = document.getElementById('uploader');
elm.addEventListener('change', recognize);
</script>
</body>
</html>

33
examples/browser/benchmark.html

@ -1,33 +0,0 @@ @@ -1,33 +0,0 @@
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<textarea id="message">Working...</textarea>
<script>
const { createWorker } = Tesseract;
const worker = createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"];
let timeTotal = 0;
for (let file of fileArr) {
let time1 = Date.now();
for (let i=0; i < 10; i++) {
await worker.recognize(file);
}
let time2 = Date.now();
const timeDif = (time2 - time1) / 1e3;
timeTotal += timeDif;
document.getElementById('message').innerHTML += "\n" + file + " [x10] runtime: " + timeDif + "s";
}
document.getElementById('message').innerHTML += "\nTotal runtime: " + timeTotal + "s";
})();
</script>
</body>
</html>

52
examples/browser/download-pdf.html

@ -1,52 +0,0 @@ @@ -1,52 +0,0 @@
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<div>
<input type="file" id="uploader">
<button id="download-pdf" disabled="true">Download PDF</button>
</div>
<textarea id="board" readonly rows="8" cols="80">Upload an image file</textarea>
<script>
const { createWorker } = Tesseract;
const worker = createWorker({
corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js',
logger: m => console.log(m),
});
const uploader = document.getElementById('uploader');
const dlBtn = document.getElementById('download-pdf');
const recognize = async ({ target: { files } }) => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(files[0]);
const board = document.getElementById('board');
board.value = text;
dlBtn.disabled = false;
};
const downloadPDF = async () => {
const filename = 'tesseract-ocr-result.pdf';
const { data } = await worker.getPDF('Tesseract OCR Result');
const blob = new Blob([new Uint8Array(data)], { type: 'application/pdf' });
if (navigator.msSaveBlob) {
// IE 10+
navigator.msSaveBlob(blob, filename);
} else {
const link = document.createElement('a');
if (link.download !== undefined) {
const url = URL.createObjectURL(blob);
link.setAttribute('href', url);
link.setAttribute('download', filename);
link.style.visibility = 'hidden';
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
}
}
};
uploader.addEventListener('change', recognize);
dlBtn.addEventListener('click', downloadPDF);
</script>
</body>
</html>

BIN
examples/data/meditations.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1011 KiB

BIN
examples/data/testocr.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

BIN
examples/data/tyger.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 408 KiB

1
examples/file-input/README.md

@ -0,0 +1 @@ @@ -0,0 +1 @@
#

2
examples/file-input/basic.html

@ -0,0 +1,2 @@ @@ -0,0 +1,2 @@
<script src="/dist/tesseract.dev.js"></script>
<input type="file" onchange="Tesseract.recognize(this.files[0]).progress(function(data){console.log(data)}).then(function(data){console.log(data)})">

32
examples/browser/demo.html → examples/file-input/demo.html

@ -27,7 +27,7 @@ function progressUpdate(packet){ @@ -27,7 +27,7 @@ function progressUpdate(packet){
if(packet.status == 'done'){
var pre = document.createElement('pre')
pre.appendChild(document.createTextNode(packet.data.data.text))
pre.appendChild(document.createTextNode(packet.data.text))
line.innerHTML = ''
line.appendChild(pre)
@ -37,18 +37,21 @@ function progressUpdate(packet){ @@ -37,18 +37,21 @@ function progressUpdate(packet){
}
}
async function recognizeFile(file) {
function recognizeFile(file){
document.querySelector("#log").innerHTML = ''
const corePath = window.navigator.userAgent.indexOf("Edge") > -1
? '../../node_modules/tesseract.js-core/tesseract-core.asm.js'
: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js';
const lang = document.querySelector('#langsel').value
const data = await Tesseract.recognize(file, lang, {
corePath,
logger: progressUpdate,
});
progressUpdate({ status: 'done', data });
Tesseract.recognize(file, {
lang: document.querySelector('#langsel').value
})
.progress(function(packet){
console.info(packet)
progressUpdate(packet)
})
.then(function(data){
console.log(data)
progressUpdate({ status: 'done', data: data })
})
}
</script>
<select id="langsel" onchange="window.lastFile && recognizeFile(window.lastFile)">
@ -71,6 +74,7 @@ async function recognizeFile(file) { @@ -71,6 +74,7 @@ async function recognizeFile(file) {
<option value='meme' > Internet Meme </option>
<option value='epo' > Esperanto </option>
<option value='epo_alt' > Esperanto alternative </option>
<option value='equ' > Math </option>
<option value='est' > Estonian </option>
<option value='eus' > Basque </option>
<option value='fin' > Finnish </option>
@ -119,7 +123,7 @@ async function recognizeFile(file) { @@ -119,7 +123,7 @@ async function recognizeFile(file) {
<option value='vie' > Vietnamese </option>
</select>
<button onclick="recognizeFile('../../tests/assets/images/simple.png')">Sample Image</button>
<button onclick="recognizeFile('../node/cosmic.png')">Sample Image</button>
<input type="file" onchange="recognizeFile(window.lastFile=this.files[0])">
<div id="log"></div>
@ -159,4 +163,4 @@ progress { @@ -159,4 +163,4 @@ progress {
progress[value="1"] {
opacity: 0.5;
}
</style>
</style>

15
examples/node/basic.js

@ -0,0 +1,15 @@ @@ -0,0 +1,15 @@
// replace this with require('tesseract.js')
var Tesseract = require('../../'),
image = require('path').resolve(__dirname, 'cosmic.png');
Tesseract.recognize(image)
.then(data => {
console.log('then\n', data.text)
})
.catch(err => {
console.log('catch\n', err);
})
.finally(e => {
console.log('finally\n');
process.exit();
});

27
examples/node/benchmark.js

@ -1,27 +0,0 @@ @@ -1,27 +0,0 @@
#!/usr/bin/env node
const path = require('path');
const { createWorker } = require('../../');
const worker = createWorker();
/**
 * Benchmark entry point: recognizes each sample image ten times with a single
 * English worker and prints the per-image and total runtime in seconds.
 */
(async () => {
  try {
    await worker.load();
    await worker.loadLanguage('eng');
    await worker.initialize('eng');
    const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"];
    let timeTotal = 0;
    for (const file of fileArr) {
      // The sample paths are relative to this script, not to the current
      // working directory, so resolve them before handing them to the worker.
      const imagePath = path.resolve(__dirname, file);
      const time1 = Date.now();
      for (let i = 0; i < 10; i++) {
        await worker.recognize(imagePath);
      }
      const time2 = Date.now();
      const timeDif = (time2 - time1) / 1e3;
      timeTotal += timeDif;
      console.log(file + " [x10] runtime: " + timeDif + "s");
    }
    console.log("Total runtime: " + timeTotal + "s");
  } finally {
    // Always release the worker, even when loading or recognition fails.
    await worker.terminate();
  }
})();

0
tests/assets/images/cosmic.png → examples/node/cosmic.png

Before

Width:  |  Height:  |  Size: 13 KiB

After

Width:  |  Height:  |  Size: 13 KiB

23
examples/node/detect.js

@ -1,13 +1,12 @@ @@ -1,13 +1,12 @@
#!/usr/bin/env node
const path = require('path');
const Tesseract = require('../../');
// replace this with require('tesseract.js')
var Tesseract = require('../../'),
image = require('path').resolve(__dirname, 'cosmic.png');
const [,, imagePath] = process.argv;
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png'));
console.log(`Recognizing ${image}`);
Tesseract.detect(image, { logger: m => console.log(m) })
.then(({ data }) => {
console.log(data);
});
Tesseract.detect(image)
.progress(function(info){
console.log(info);
})
.then(function(data){
console.log('done', data);
process.exit();
})

22
examples/node/download-pdf.js

@ -1,22 +0,0 @@ @@ -1,22 +0,0 @@
#!/usr/bin/env node
const path = require('path');
const fs = require('fs');
const { createWorker } = require('../../');
const [,, imagePath] = process.argv;
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png'));
console.log(`Recognizing ${image}`);
(async () => {
const worker = createWorker();
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image);
console.log(text);
const { data } = await worker.getPDF('Tesseract OCR Result');
fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(data));
console.log('Generate PDF: tesseract-ocr-result.pdf');
await worker.terminate();
})();

20
examples/node/recognize.js

@ -1,20 +0,0 @@ @@ -1,20 +0,0 @@
#!/usr/bin/env node
const path = require('path');
const { createWorker } = require('../../');
const [,, imagePath] = process.argv;
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png'));
console.log(`Recognizing ${image}`);
const worker = createWorker({
logger: m => console.log(m),
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image);
console.log(text);
await worker.terminate();
})();

18807
package-lock.json generated

File diff suppressed because it is too large Load Diff

91
package.json

@ -1,78 +1,39 @@ @@ -1,78 +1,39 @@
{
"name": "tesseract.js",
"version": "3.0.3",
"version": "1.0.19",
"description": "Pure Javascript Multilingual OCR",
"main": "src/index.js",
"types": "src/index.d.ts",
"unpkg": "dist/tesseract.min.js",
"jsdelivr": "dist/tesseract.min.js",
"scripts": {
"start": "node scripts/server.js",
"build": "rimraf dist && webpack --config scripts/webpack.config.prod.js && rollup -c scripts/rollup.esm.js",
"profile:tesseract": "webpack-bundle-analyzer dist/tesseract-stats.json",
"profile:worker": "webpack-bundle-analyzer dist/worker-stats.json",
"prepublishOnly": "npm run build",
"wait": "rimraf dist && wait-on http://localhost:3000/dist/tesseract.dev.js",
"test": "npm-run-all -p -r start test:all",
"test:all": "npm-run-all wait test:browser:* test:node:all",
"test:node": "nyc mocha --exit --bail --require ./scripts/test-helper.js",
"test:node:all": "npm run test:node -- ./tests/*.test.js",
"test:browser-tpl": "mocha-headless-chrome -a incognito -a no-sandbox -a disable-setuid-sandbox -a disable-logging -t 300000",
"test:browser:detect": "npm run test:browser-tpl -- -f ./tests/detect.test.html",
"test:browser:recognize": "npm run test:browser-tpl -- -f ./tests/recognize.test.html",
"test:browser:scheduler": "npm run test:browser-tpl -- -f ./tests/scheduler.test.html",
"test:browser:FS": "npm run test:browser-tpl -- -f ./tests/FS.test.html",
"lint": "eslint src",
"lint:fix": "eslint --fix src",
"postinstall": "opencollective-postinstall || true"
"start": "concurrently --kill-others \"watchify src/index.js -t [ envify --TESS_ENV development ] -t [ babelify --presets [ es2015 ] ] -o dist/tesseract.dev.js --standalone Tesseract\" \"watchify src/browser/worker.js -t [ envify --TESS_ENV development ] -t [ babelify --presets [ es2015 ] ] -o dist/worker.dev.js\" \"http-server -p 7355\"",
"build": "browserify src/index.js -t [ babelify --presets [ es2015 ] ] -o dist/tesseract.js --standalone Tesseract && browserify src/browser/worker.js -t [ babelify --presets [ es2015 ] ] -o dist/worker.js && uglifyjs dist/tesseract.js --source-map -o dist/tesseract.min.js && uglifyjs dist/worker.js --source-map -o dist/worker.min.js",
"release": "npm run build && git commit -am 'new release' && git push && git tag `jq -r '.version' package.json` && git push origin --tags && npm publish"
},
"browser": {
"./src/worker/node/index.js": "./src/worker/browser/index.js"
"./src/node/index.js": "./src/browser/index.js"
},
"author": "",
"contributors": [
"jeromewu"
],
"license": "Apache-2.0",
"devDependencies": {
"@babel/core": "^7.18.7",
"@babel/preset-env": "^7.18.7",
"@rollup/plugin-commonjs": "^22.0.2",
"acorn": "^6.4.0",
"babel-loader": "^8.2.0",
"buffer": "^6.0.3",
"cors": "^2.8.5",
"eslint": "^7.2.0",
"eslint-config-airbnb-base": "^14.2.0",
"eslint-plugin-import": "^2.22.1",
"expect.js": "^0.3.1",
"express": "^4.17.1",
"mocha": "^8.1.3",
"mocha-headless-chrome": "^2.0.3",
"npm-run-all": "^4.1.5",
"nyc": "^15.1.0",
"rimraf": "^2.7.1",
"rollup": "^2.79.0",
"wait-on": "^3.3.0",
"webpack": "^5.74.0",
"webpack-bundle-analyzer": "^4.6.0",
"webpack-cli": "^4.10.0",
"webpack-dev-middleware": "^5.3.3"
"babel-preset-es2015": "^6.16.0",
"babelify": "^7.3.0",
"browserify": "^13.1.0",
"concurrently": "^3.1.0",
"envify": "^3.4.1",
"http-server": "^0.9.0",
"pako": "^1.0.3",
"uglify-js": "^3.4.9",
"watchify": "^3.7.0"
},
"dependencies": {
"babel-eslint": "^10.1.0",
"bmp-js": "^0.1.0",
"file-type": "^12.4.1",
"idb-keyval": "^3.2.0",
"is-electron": "^2.2.0",
"is-url": "^1.2.4",
"node-fetch": "^2.6.0",
"opencollective-postinstall": "^2.0.2",
"regenerator-runtime": "^0.13.3",
"resolve-url": "^0.2.1",
"tesseract.js-core": "^3.0.2",
"wasm-feature-detect": "^1.2.11",
"zlibjs": "^0.3.1"
"file-type": "^3.8.0",
"isomorphic-fetch": "^2.2.1",
"is-url": "1.2.2",
"jpeg-js": "^0.2.0",
"level-js": "^2.2.4",
"node-fetch": "^1.6.3",
"object-assign": "^4.1.0",
"png.js": "^0.2.1",
"tesseract.js-core": "^1.0.2"
},
"repository": {
"type": "git",
@ -81,9 +42,5 @@ @@ -81,9 +42,5 @@
"bugs": {
"url": "https://github.com/naptha/tesseract.js/issues"
},
"homepage": "https://github.com/naptha/tesseract.js",
"collective": {
"type": "opencollective",
"url": "https://opencollective.com/tesseractjs"
}
"homepage": "https://github.com/naptha/tesseract.js"
}

5
scripts/.eslintrc

@ -1,5 +0,0 @@ @@ -1,5 +0,0 @@
{
"rules": {
"import/no-extraneous-dependencies": 0
}
}

13
scripts/rollup.esm.js

@ -1,13 +0,0 @@ @@ -1,13 +0,0 @@
import commonjs from "@rollup/plugin-commonjs";
export default [
{
input: "dist/tesseract.min.js",
output: {
file: "dist/tesseract.esm.min.js",
format: "esm",
banner: "/* eslint-disable */",
},
plugins: [commonjs()],
},
];

17
scripts/server.js

@ -1,17 +0,0 @@ @@ -1,17 +0,0 @@
const webpack = require('webpack');
const middleware = require('webpack-dev-middleware');
const express = require('express');
const path = require('path');
const cors = require('cors');
const webpackConfig = require('./webpack.config.dev');
const compiler = webpack(webpackConfig);
const app = express();
app.use(cors());
app.use('/', express.static(path.resolve(__dirname, '..')));
app.use(middleware(compiler, { publicPath: '/dist', writeToDisk: true }));
module.exports = app.listen(3000, () => {
console.log('Server is running on the port no. 3000');
});

9
scripts/test-helper.js

@ -1,9 +0,0 @@ @@ -1,9 +0,0 @@
const constants = require('../tests/constants');
global.expect = require('expect.js');
global.fs = require('fs');
global.path = require('path');
global.Tesseract = require('../src');
Object.keys(constants).forEach((key) => {
global[key] = constants[key];
});

28
scripts/webpack.config.common.js

@ -1,28 +0,0 @@ @@ -1,28 +0,0 @@
module.exports = {
resolve: {
fallback: {
buffer: require.resolve('buffer/'),
},
},
module: {
rules: [
{
test: /\.m?js$/,
// exclude: /(node_modules|bower_components)/,
use: {
loader: 'babel-loader',
options: {
presets: [
[
'@babel/preset-env',
{
targets: 'last 2 versions',
},
],
],
},
},
},
],
},
};

48
scripts/webpack.config.dev.js

@ -1,48 +0,0 @@ @@ -1,48 +0,0 @@
const path = require('path');
const webpack = require('webpack');
const { BundleAnalyzerPlugin } = require('webpack-bundle-analyzer');
const common = require('./webpack.config.common');
/**
 * Builds one development webpack configuration on top of the shared `common`
 * settings.
 *
 * @param {Object} params
 * @param {string} params.entry - entry module of the bundle.
 * @param {string} params.filename - output file name; the part before the
 *   first '.' also names the generated stats file (`<name>-stats.json`).
 * @param {string} [params.library] - exported library name.
 * @param {string} [params.libraryTarget] - webpack library target.
 * @returns {Object} webpack configuration object.
 */
const genConfig = ({
  entry, filename, library, libraryTarget,
}) => ({
  ...common,
  mode: 'development',
  entry,
  output: {
    filename,
    library,
    libraryTarget,
  },
  plugins: [
    new webpack.ProvidePlugin({
      Buffer: ['buffer', 'Buffer'],
    }),
    new webpack.DefinePlugin({
      'process.env': {
        TESS_ENV: JSON.stringify('development'),
      },
    }),
    new BundleAnalyzerPlugin({
      // 'disabled' is the documented mode for "only write the stats file";
      // the previous value 'disable' is not a documented analyzerMode.
      analyzerMode: 'disabled',
      statsFilename: `${filename.split('.')[0]}-stats.json`,
      generateStatsFile: true,
    }),
  ],
  devServer: {
    allowedHosts: ['localhost', '.gitpod.io'],
  },
});
module.exports = [
genConfig({
entry: path.resolve(__dirname, '..', 'src', 'index.js'),
filename: 'tesseract.dev.js',
library: 'Tesseract',
libraryTarget: 'umd',
}),
genConfig({
entry: path.resolve(__dirname, '..', 'src', 'worker-script', 'browser', 'index.js'),
filename: 'worker.dev.js',
}),
];

36
scripts/webpack.config.prod.js

@ -1,36 +0,0 @@ @@ -1,36 +0,0 @@
const path = require('path');
const common = require('./webpack.config.common');
const webpack = require('webpack');
/**
 * Builds one production webpack configuration on top of the shared `common`
 * settings. Bundles are written to the `dist` folder next to `scripts`, with
 * source maps, and `Buffer` is provided from the `buffer` package.
 *
 * @param {Object} params - entry, filename, library and libraryTarget of the bundle.
 * @returns {Object} webpack configuration object.
 */
const genConfig = ({ entry, filename, library, libraryTarget }) => {
  const output = {
    path: path.resolve(__dirname, '..', 'dist'),
    filename,
    library,
    libraryTarget,
  };
  const plugins = [
    new webpack.ProvidePlugin({ Buffer: ['buffer', 'Buffer'] }),
  ];
  return {
    ...common,
    mode: 'production',
    devtool: 'source-map',
    entry,
    output,
    plugins,
  };
};
module.exports = [
genConfig({
entry: path.resolve(__dirname, '..', 'src', 'index.js'),
filename: 'tesseract.min.js',
library: 'Tesseract',
libraryTarget: 'umd',
}),
genConfig({
entry: path.resolve(__dirname, '..', 'src', 'worker-script', 'browser', 'index.js'),
filename: 'worker.min.js',
}),
];

28
src/Tesseract.js

@ -1,28 +0,0 @@ @@ -1,28 +0,0 @@
const createWorker = require('./createWorker');
/**
 * One-shot OCR helper: creates a worker, loads and initializes `langs`,
 * recognizes `image` and terminates the worker again.
 *
 * The worker is terminated on every exit path, including a failure while
 * loading the core or the language data, so a failed call does not leave a
 * worker running.
 *
 * @param {*} image - image input forwarded to `worker.recognize`.
 * @param {string} langs - language code(s) passed to `loadLanguage`/`initialize`.
 * @param {Object} [options] - options forwarded to `createWorker`.
 * @returns {Promise<*>} whatever `worker.recognize(image)` resolves to.
 */
const recognize = async (image, langs, options) => {
  const worker = createWorker(options);
  try {
    await worker.load();
    await worker.loadLanguage(langs);
    await worker.initialize(langs);
    return await worker.recognize(image);
  } finally {
    await worker.terminate();
  }
};
/**
 * One-shot detection helper: creates a worker, loads and initializes the
 * `osd` data, runs `worker.detect` on `image` and terminates the worker again.
 *
 * The worker is terminated on every exit path, including a failure while
 * loading the core or the `osd` data, so a failed call does not leave a
 * worker running.
 *
 * @param {*} image - image input forwarded to `worker.detect`.
 * @param {Object} [options] - options forwarded to `createWorker`.
 * @returns {Promise<*>} whatever `worker.detect(image)` resolves to.
 */
const detect = async (image, options) => {
  const worker = createWorker(options);
  try {
    await worker.load();
    await worker.loadLanguage('osd');
    await worker.initialize('osd');
    return await worker.detect(image);
  } finally {
    await worker.terminate();
  }
};
module.exports = {
recognize,
detect,
};

105
src/browser/index.js

@ -0,0 +1,105 @@ @@ -0,0 +1,105 @@
// Default worker options for the browser build. `corePath` and `langPath` are
// fixed here; `workerPath` is filled in below because it depends on the build.
var defaultOptions = {
// workerPath: 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js@0.2.0/dist/worker.js',
corePath: 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js-core@0.1.0/index.js',
langPath: 'https://tessdata.projectnaptha.com/3.02/',
}
// NOTE(review): TESS_ENV looks like it is injected at bundle time (the `start`
// script passes `--TESS_ENV development` to envify) - confirm before
// rewriting this expression.
if (process.env.TESS_ENV === "development") {
console.debug('Using Development Configuration')
// Development: load the worker from the host serving the page. The random
// `nocache` query value makes the URL different on every page load.
defaultOptions.workerPath = location.protocol + '//' + location.host + '/dist/worker.dev.js?nocache=' + Math.random().toString(36).slice(3)
}else{
// Otherwise: load the worker script from jsDelivr, pinned to the version
// declared in package.json.
var version = require('../../package.json').version;
defaultOptions.workerPath = 'https://cdn.jsdelivr.net/gh/naptha/tesseract.js@' + version + '/dist/worker.js'
}
exports.defaultOptions = defaultOptions;
exports.spawnWorker = function spawnWorker(instance, workerOptions){
if(Blob && URL){
var blob = new Blob(['importScripts("' + workerOptions.workerPath + '");'], {
type: 'application/javascript'
});
var worker = new Worker(URL.createObjectURL(blob));
}else{
var worker = new Worker(workerOptions.workerPath)
}
worker.onmessage = function(e){
var packet = e.data;
instance._recv(packet)
}
return worker
}
exports.terminateWorker = function(instance){
instance.worker.terminate()
}
exports.sendPacket = function sendPacket(instance, packet){
loadImage(packet.payload.image, function(img){
packet.payload.image = img
instance.worker.postMessage(packet)
})
}
/**
 * Converts a supported image input step by step and passes the final value to
 * `cb`. Every branch reduces its input by one step and recurses:
 *
 *   '#selector' string          -> the matching DOM element
 *   string containing blob:/data: -> Image element loaded from it
 *   any other string            -> fetched with XHR as a Blob
 *   File                        -> data URL (FileReader)
 *   Blob                        -> object URL
 *   object with getContext      -> its 2d context (canvas element)
 *   IMG / VIDEO element         -> drawn onto a temporary canvas
 *   object with getImageData    -> ImageData of the whole canvas
 *   anything else               -> handed to `cb` unchanged
 *
 * Failures in the asynchronous steps (image, XHR, FileReader) are thrown from
 * inside their event handlers, so the caller cannot catch them and `cb` is
 * not invoked for them.
 *
 * @param {*} image - image input in one of the forms listed above.
 * @param {function(*)} cb - receives the converted image.
 * @returns {*} the return value of `cb` when the conversion finishes
 *   synchronously, otherwise undefined.
 * @throws {TypeError} when `image` is null/undefined or a selector matches
 *   no element.
 */
function loadImage(image, cb){
  if(image === null || image === undefined){
    // Fail with a readable message instead of an opaque property-access error.
    throw new TypeError('loadImage: no image to load (got ' + image + ')');
  }
  if(typeof image === 'string'){
    if(/^\#/.test(image)){
      // element css selector
      const element = document.querySelector(image);
      if(element === null){
        throw new TypeError('loadImage: no element matches selector "' + image + '"');
      }
      return loadImage(element, cb);
    }
    if(/(blob|data)\:/.test(image)){
      // blob or data url; handlers are attached before `src` is assigned
      const im = new Image();
      im.onload = e => loadImage(im, cb);
      im.onerror = e => { throw e; };
      im.src = image;
      return;
    }
    // any other string is requested as a URL and read as a Blob
    const xhr = new XMLHttpRequest();
    xhr.open('GET', image, true);
    xhr.responseType = "blob";
    xhr.onload = e => {
      if (xhr.status >= 400){
        throw new Error('Fail to get image as Blob');
      }else{
        loadImage(xhr.response, cb);
      }
    };
    xhr.onerror = e => { throw e; };
    xhr.send(null);
    return;
  }
  if(image instanceof File){
    // files
    const fr = new FileReader();
    fr.onload = e => loadImage(fr.result, cb);
    fr.onerror = e => { throw e; };
    fr.readAsDataURL(image);
    return;
  }
  if(image instanceof Blob){
    return loadImage(URL.createObjectURL(image), cb);
  }
  if(image.getContext){
    // canvas element
    return loadImage(image.getContext('2d'), cb);
  }
  if(image.tagName === "IMG" || image.tagName === "VIDEO"){
    // image element or video element
    const c = document.createElement('canvas');
    c.width = image.naturalWidth || image.videoWidth;
    c.height = image.naturalHeight || image.videoHeight;
    const ctx = c.getContext('2d');
    ctx.drawImage(image, 0, 0);
    return loadImage(ctx, cb);
  }
  if(image.getImageData){
    // canvas context
    const data = image.getImageData(0, 0, image.canvas.width, image.canvas.height);
    return loadImage(data, cb);
  }
  return cb(image);
}

76
src/browser/lang.js

@ -0,0 +1,76 @@ @@ -0,0 +1,76 @@
const leveljs = require('level-js')
// something about trying to store these language files in indexedDB
// causes iOS Safari to crash
var iOS = /iPad|iPhone|iPod/.test(navigator.userAgent);
var noIDB = typeof indexedDB === 'undefined' || iOS;
var db = noIDB ? { open: (_, cb) => cb(true) } : leveljs('./tessdata2')
var langdata = require('../common/langdata.json')
module.exports = function getLanguageData(req, res, cb){
var lang = req.options.lang;
function saveDataFile(data){
try {
db.put(lang, data, err => console.log('cached', lang, err))
} finally {
cb(data)
}
}
db.open({ compression: false }, err => {
if (err) return fetchLanguageData(req, res, cb);
db.get(lang, (err, data) => {
if (err) return fetchLanguageData(req, res, saveDataFile);
res.progress({ status: 'found in cache ' + lang + '.traineddata' })
cb(data)
})
})
}
const ungzip = require('pako/lib/inflate.js').ungzip;
function fetchLanguageData(req, res, cb){
var lang = req.options.lang;
var langfile = lang + '.traineddata.gz';
var url = req.workerOptions.langPath + langfile;
var xhr = new XMLHttpRequest();
xhr.open('GET', url, true);
xhr.responseType = 'arraybuffer';
xhr.onerror = e => {
xhr.onprogress = xhr.onload = null
cb(xhr, null)
}
xhr.onprogress = e =>
res.progress({
status: 'downloading ' + langfile,
loaded: e.loaded,
progress: Math.min(1, e.loaded / langdata[lang])
});
xhr.onload = e => {
if (!(xhr.status == 200 || (xhr.status == 0 && xhr.response))) return res.reject('Error downloading language ' + url);
res.progress({ status: 'unzipping ' + langfile, progress: 0 })
// in case the gzips are already ungzipped or extra gzipped
var response = new Uint8Array(xhr.response)
try {
var n = 2;
while(response[0] == 0x1f && response[1] == 0x8b){
response = ungzip(response);
res.progress({ status: 'unzipping ' + langfile, progress: 1 - 1 / (n++) })
}
} catch (err) {
return res.reject('Error unzipping language file ' + langfile + '\n' + err.message)
}
res.progress({ status: 'unzipping ' + langfile, progress: 1 })
cb(response)
}
xhr.send()
}

23
src/browser/worker.js

@ -0,0 +1,23 @@ @@ -0,0 +1,23 @@
// Browser worker entry point: receives job packets from the page, hands them
// to the shared dispatcher in ../common/worker.js and registers the
// browser-specific adapter (core loading + language data) with it.
const workerUtils = require('../common/worker.js')
if (process.env.TESS_ENV === "development") {
console.debug('Using Development Worker')
}
// Every message's data is treated as a job packet; whatever the dispatcher
// wants to send back is posted to the owner of this worker.
global.addEventListener('message', function(e){
var packet = e.data;
workerUtils.dispatchHandlers(packet, obj => postMessage(obj))
})
/**
 * Return the TesseractCore factory, importing the script at
 * req.workerOptions.corePath first when global.TesseractCore is not set yet.
 * The import is bracketed by 'loading tesseract core' progress events.
 */
exports.getCore = function(req, res){
if(!global.TesseractCore){
res.progress({ status: 'loading tesseract core', progress: 0 })
importScripts(req.workerOptions.corePath)
res.progress({ status: 'loading tesseract core', progress: 1 })
}
return TesseractCore
}
exports.getLanguageData = require('./lang.js')
// Must stay last: the adapter object is this module's exports, so getCore and
// getLanguageData have to be assigned before it is registered.
workerUtils.setAdapter(module.exports);

63
src/common/circularize.js

@ -0,0 +1,63 @@ @@ -0,0 +1,63 @@
// The result of dump.js is a big JSON tree
// which can be easily serialized (for instance
// to be sent from a webworker to the main app
// or through Node's IPC), but we want
// a (circular) DOM-like interface for walking
// through the data.
module.exports = function circularize(page){
page.paragraphs = []
page.lines = []
page.words = []
page.symbols = []
page.blocks.forEach(function(block){
block.page = page;
block.lines = []
block.words = []
block.symbols = []
block.paragraphs.forEach(function(para){
para.block = block;
para.page = page;
para.words = []
para.symbols = []
para.lines.forEach(function(line){
line.paragraph = para;
line.block = block;
line.page = page;
line.symbols = []
line.words.forEach(function(word){
word.line = line;
word.paragraph = para;
word.block = block;
word.page = page;
word.symbols.forEach(function(sym){
sym.word = word;
sym.line = line;
sym.paragraph = para;
sym.block = block;
sym.page = page;
sym.line.symbols.push(sym)
sym.paragraph.symbols.push(sym)
sym.block.symbols.push(sym)
sym.page.symbols.push(sym)
})
word.paragraph.words.push(word)
word.block.words.push(word)
word.page.words.push(word)
})
line.block.lines.push(line)
line.page.lines.push(line)
})
para.page.paragraphs.push(para)
})
})
return page
}

24
src/common/desaturate.js

@ -0,0 +1,24 @@ @@ -0,0 +1,24 @@
// This converts an image to grayscale
module.exports = function desaturate(image){
var width, height;
if(image.data){
var src = image.data;
width = image.width,
height = image.height;
var dst = new Uint8Array(width * height);
var srcLength = src.length | 0, srcLength_16 = (srcLength - 16) | 0;
for (var i = 0, j = 0; i <= srcLength_16; i += 16, j += 4) {
// convert to grayscale 4 pixels at a time; eveything with alpha gets put in front of 50% gray
dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16
dst[j+1] = (((src[i+4] * 77 + src[i+5] * 151 + src[i+6] * 28) * src[i+7]) + ((255-src[i+7]) << 15) + 32768) >> 16
dst[j+2] = (((src[i+8] * 77 + src[i+9] * 151 + src[i+10] * 28) * src[i+11]) + ((255-src[i+11]) << 15) + 32768) >> 16
dst[j+3] = (((src[i+12] * 77 + src[i+13] * 151 + src[i+14] * 28) * src[i+15]) + ((255-src[i+15]) << 15) + 32768) >> 16
}
for (; i < srcLength; i += 4, ++j) //finish up
dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16
image = dst;
} else { throw 'Invalid ImageData' }
return image
}

164
src/common/dump.js

@ -0,0 +1,164 @@ @@ -0,0 +1,164 @@
/**
 * Walk the recognition results of `base` and copy them into a plain tree of
 * blocks > paragraphs > lines > words > symbols, plus page-level fields.
 *
 * The result iterator is advanced one symbol at a time; whenever it is at
 * the beginning of a block / paragraph / text line / word, a new node of
 * that level is created and appended to the node currently open one level
 * up. Iterators created here are destroyed through Module.destroy.
 *
 * @param {object} Module - the Tesseract core module (enum constants, destroy, getPointer, ...)
 * @param {object} base - the API object recognition was run on
 * @returns {object} { text, html, confidence, blocks, psm, oem, version }
 */
module.exports = function DumpLiterallyEverything(Module, base){
var ri = base.GetIterator();
var blocks = [];
// most recently opened node of each level; children are attached to these
var block, para, textline, word, symbol;
// Reverse lookup of an enum: finds the key of Module named `<prefix>_<NAME>`
// whose value equals `value` and returns NAME (undefined when none matches).
function enumToString(value, prefix){
return (Object.keys(Module)
.filter(function(e){ return e.substr(0, prefix.length + 1) == prefix + '_' })
.filter(function(e){ return Module[e] === value })
.map(function(e){ return e.slice(prefix.length + 1) })[0])
}
ri.Begin()
do {
if(ri.IsAtBeginningOf(Module.RIL_BLOCK)){
var poly = ri.BlockPolygon();
var polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if(Module.getPointer(poly) > 0){
var n = poly.get_n(),
px = poly.get_x(),
py = poly.get_y(),
polygon = [];
// copy the points out as [x, y] pairs before the native polygon is destroyed
for(var i = 0; i < n; i++){
polygon.push([px.getValue(i), py.getValue(i)]);
}
Module._ptaDestroy(Module.getPointer(poly));
}
block = {
paragraphs: [],
text: ri.GetUTF8Text(Module.RIL_BLOCK),
confidence: ri.Confidence(Module.RIL_BLOCK),
baseline: ri.getBaseline(Module.RIL_BLOCK),
bbox: ri.getBoundingBox(Module.RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon: polygon
}
blocks.push(block)
}
if(ri.IsAtBeginningOf(Module.RIL_PARA)){
para = {
lines: [],
text: ri.GetUTF8Text(Module.RIL_PARA),
confidence: ri.Confidence(Module.RIL_PARA),
baseline: ri.getBaseline(Module.RIL_PARA),
bbox: ri.getBoundingBox(Module.RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr()
}
block.paragraphs.push(para)
}
if(ri.IsAtBeginningOf(Module.RIL_TEXTLINE)){
textline = {
words: [],
text: ri.GetUTF8Text(Module.RIL_TEXTLINE),
confidence: ri.Confidence(Module.RIL_TEXTLINE),
baseline: ri.getBaseline(Module.RIL_TEXTLINE),
bbox: ri.getBoundingBox(Module.RIL_TEXTLINE)
}
para.lines.push(textline)
}
if(ri.IsAtBeginningOf(Module.RIL_WORD)){
var fontInfo = ri.getWordFontAttributes(),
wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
text: ri.GetUTF8Text(Module.RIL_WORD),
confidence: ri.Confidence(Module.RIL_WORD),
baseline: ri.getBaseline(Module.RIL_WORD),
bbox: ri.getBoundingBox(Module.RIL_WORD),
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),
is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
}
// collect the alternative readings of this word
var wc = new Module.WordChoiceIterator(ri);
do {
word.choices.push({
text: wc.GetUTF8Text(),
confidence: wc.Confidence()
})
} while (wc.Next());
Module.destroy(wc)
textline.words.push(word)
}
// per-symbol images are currently disabled, so every symbol gets image: null
var image = null;
// var pix = ri.GetBinaryImage(Module.RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// Module._pixDestroy(Module.getPointer(pix));
if(ri.IsAtBeginningOf(Module.RIL_SYMBOL)){
symbol = {
choices: [],
image: image,
text: ri.GetUTF8Text(Module.RIL_SYMBOL),
confidence: ri.Confidence(Module.RIL_SYMBOL),
baseline: ri.getBaseline(Module.RIL_SYMBOL),
bbox: ri.getBoundingBox(Module.RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
}
word.symbols.push(symbol)
// collect the alternative readings of this symbol
var ci = new Module.ChoiceIterator(ri);
do {
symbol.choices.push({
text: ci.GetUTF8Text(),
confidence: ci.Confidence()
})
} while (ci.Next());
Module.destroy(ci)
}
} while (ri.Next(Module.RIL_SYMBOL));
Module.destroy(ri)
// page-level summary; `html` is the HOCR output with its indentation reduced
return {
text: base.GetUTF8Text(),
html: deindent(base.GetHOCRText()),
confidence: base.MeanTextConf(),
blocks: blocks,
psm: enumToString(base.GetPageSegMode(), 'PSM'),
oem: enumToString(base.oem(), 'OEM'),
version: base.Version(),
}
}
// the generated HOCR is excessively indented, so
// we get rid of that indentation
/**
 * Remove one level (two spaces) of indentation from every line of `html`,
 * but only when the first line itself starts with two spaces.
 *
 * @param {string} html - HOCR markup
 * @returns {string} the markup with the leading two spaces removed per line
 */
function deindent(html){
  var INDENT = '  '; // two spaces; must be as long as the prefix it is compared with
  var lines = html.split('\n');
  if (lines[0].substring(0, INDENT.length) === INDENT) {
    for (var i = 0; i < lines.length; i++) {
      if (lines[i].substring(0, INDENT.length) === INDENT) {
        lines[i] = lines[i].slice(INDENT.length);
      }
    }
  }
  return lines.join('\n');
}

81
src/common/job.js

@ -0,0 +1,81 @@ @@ -0,0 +1,81 @@
const adapter = require('../node/index.js')
let jobCounter = 0;
module.exports = class TesseractJob {
constructor(instance){
this.id = 'Job-' + (++jobCounter) + '-' + Math.random().toString(16).slice(3, 8)
this._instance = instance;
this._resolve = []
this._reject = []
this._progress = []
this._finally = []
}
then(resolve, reject){
if(this._resolve.push){
this._resolve.push(resolve)
}else{
resolve(this._resolve)
}
if(reject) this.catch(reject);
return this;
}
catch(reject){
if(this._reject.push){
this._reject.push(reject)
}else{
reject(this._reject)
}
return this;
}
progress(fn){
this._progress.push(fn)
return this;
}
finally(fn) {
this._finally.push(fn)
return this;
}
_send(action, payload){
adapter.sendPacket(this._instance, {
jobId: this.id,
action: action,
payload: payload
})
}
_handle(packet){
var data = packet.data;
let runFinallyCbs = false;
if(packet.status === 'resolve'){
if(this._resolve.length === 0) console.log(data);
this._resolve.forEach(fn => {
var ret = fn(data);
if(ret && typeof ret.then == 'function'){
console.warn('TesseractJob instances do not chain like ES6 Promises. To convert it into a real promise, use Promise.resolve.')
}
})
this._resolve = data;
this._instance._dequeue()
runFinallyCbs = true;
}else if(packet.status === 'reject'){
if(this._reject.length === 0) console.error(data);
this._reject.forEach(fn => fn(data))
this._reject = data;
this._instance._dequeue()
runFinallyCbs = true;
}else if(packet.status === 'progress'){
this._progress.forEach(fn => fn(data))
}else{
console.warn('Message type unknown', packet.status)
}
if (runFinallyCbs) {
this._finally.forEach(fn => fn(data));
}
}
}

1
src/common/langdata.json

@ -0,0 +1 @@ @@ -0,0 +1 @@
{"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441, "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656, "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719, "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130, "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221, "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615, "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473, "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571, "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944, "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087, "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610, "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680, "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338, "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244, "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246, "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566, "vie": 2195922}

165
src/common/worker.js

@ -0,0 +1,165 @@ @@ -0,0 +1,165 @@
var latestJob,
Module,
base,
adapter = {},
dump = require('./dump.js'),
desaturate = require('./desaturate.js');
function dispatchHandlers(packet, send){
function respond(status, data){
send({
jobId: packet.jobId,
status,
action: packet.action,
data
});
}
respond.resolve = respond.bind(this, 'resolve');
respond.reject = respond.bind(this, 'reject');
respond.progress = respond.bind(this, 'progress');
latestJob = respond;
try {
if(packet.action === 'recognize'){
handleRecognize(packet.payload, respond);
} else if (packet.action === 'detect'){
handleDetect(packet.payload, respond);
}
} catch (err) {
// Prepare exception to travel through postMessage
err = err.toString();
respond.reject(err)
}
}
exports.dispatchHandlers = dispatchHandlers;
exports.setAdapter = function setAdapter(impl){
adapter = impl;
};
/**
 * Make sure a Tesseract module with enough memory for the requested language
 * exists, creating (or re-creating) `Module` and `base` when needed.
 *
 * chi_sim, chi_tra and jpn ask for 167772160 bytes, every other language for
 * 100663296. Nothing happens when the current module already has at least
 * that much.
 *
 * @param {object} req - reads `req.options.lang`; passed on to adapter.getCore
 * @param {object} res - job responder, used for progress events
 */
function handleInit(req, res){
  var needsLargeHeap = ['chi_sim', 'chi_tra', 'jpn'].includes(req.options.lang);
  var MIN_MEMORY = needsLargeHeap ? 167772160 : 100663296;

  var mustCreateModule = !Module || Module.TOTAL_MEMORY < MIN_MEMORY;
  if (!mustCreateModule) {
    return;
  }

  var Core = adapter.getCore(req, res);
  res.progress({ status: 'initializing tesseract', progress: 0 });

  var moduleConfig = {
    TOTAL_MEMORY: MIN_MEMORY,
    // Engine progress is rescaled so that 30% maps to 0 and 100% to 1,
    // and is reported on whichever job was dispatched last.
    TesseractProgress: function(percent){
      latestJob.progress({ status: 'recognizing text', progress: Math.max(0, (percent-30)/70) });
    },
    onRuntimeInitialized: function(){}
  };
  Module = Core(moduleConfig);

  Module.FS_createPath("/", "tessdata", true, true);
  base = new Module.TessBaseAPI();
  res.progress({ status: 'initializing tesseract', progress: 1 });
}
/**
 * Convert `image` to grayscale, copy it into module memory and make it the
 * current image of `base`, with the recognition rectangle covering all of it.
 *
 * @param {object} Module - Tesseract core module
 * @param {object} base - API object receiving the image
 * @param {{width: number, height: number}} image - also passed to desaturate()
 * @returns {number} pointer to the allocated buffer; the caller must free it
 */
function setImage(Module, base, image){
  var grayscale = desaturate(image);
  var w = image.width;
  var h = image.height;

  var ptr = Module.allocate(grayscale, 'i8', Module.ALLOC_NORMAL);
  var wrapped = Module.wrapPointer(ptr);
  // 1 byte per pixel, one row is `w` bytes long
  base.SetImage(wrapped, w, h, 1, w);
  base.SetRectangle(0, 0, w, h);
  return ptr;
}
/**
 * Ensure `<lang>.traineddata` exists in the module's virtual `tessdata`
 * directory, then call `cb`.
 *
 * Languages already written to the current module are remembered in
 * `Module._loadedLanguages`; for those `cb` is called right away. Otherwise
 * the data is requested from the adapter and written once it arrives.
 *
 * @param {object} req - reads `req.options.lang`; passed on to the adapter
 * @param {object} res - job responder, used for progress events
 * @param {function(): void} cb - called with no arguments when the file is in place
 */
function loadLanguage(req, res, cb){
  var lang = req.options.lang;
  var langFile = lang + '.traineddata';

  if (!Module._loadedLanguages) {
    Module._loadedLanguages = {};
  }
  if (lang in Module._loadedLanguages) {
    return cb();
  }

  function storeLanguage(data){
    var status = 'loading ' + langFile;
    res.progress({ status: status, progress: 0 });
    Module.FS_createDataFile('tessdata', langFile, data, true, false);
    Module._loadedLanguages[lang] = true;
    res.progress({ status: status, progress: 1 });
    cb();
  }

  adapter.getLanguageData(req, res, storeLanguage);
}
/**
 * Run OCR for one job: initialise the module, load the language, configure
 * the engine from `req.options`, recognise `req.image` and resolve the job
 * with the dumped result.
 *
 * The work happens in the loadLanguage callback, which may run long after
 * the dispatcher's try/catch has returned. Failures are therefore caught
 * here and turned into a rejection, and the engine / image buffer are
 * released on every path.
 *
 * @param {{options: object, image: object}} req - every own property of
 *   `options` is passed to the engine with SetVariable
 * @param {object} res - job responder (progress / resolve / reject)
 */
function handleRecognize(req, res){
  handleInit(req, res);

  loadLanguage(req, res, () => {
    var options = req.options;
    var ptr = null;
    var initialized = false;
    var outcome;

    function progressUpdate(progress){
      res.progress({ status: 'initializing api', progress: progress });
    }

    try {
      progressUpdate(0);
      base.Init(null, req.options.lang);
      initialized = true;
      progressUpdate(.3);

      for (var option in options) {
        if (Object.prototype.hasOwnProperty.call(options, option)) {
          base.SetVariable(option, options[option]);
        }
      }

      progressUpdate(.6);
      ptr = setImage(Module, base, req.image);
      progressUpdate(1);

      base.Recognize(null);
      outcome = { ok: true, data: dump(Module, base) };
    } catch (err) {
      // stringified so that it can travel through postMessage
      outcome = { ok: false, data: String(err) };
    } finally {
      if (initialized) base.End();
      if (ptr !== null) Module._free(ptr);
    }

    if (outcome.ok) {
      res.resolve(outcome.data);
    } else {
      res.reject(outcome.data);
    }
  })
}
/**
 * Run orientation and script detection for one job.
 *
 * Forces `req.options.lang` to 'osd', loads that language, runs DetectOS on
 * `req.image` and resolves with the best result's script and orientation, or
 * rejects with "Failed to detect OS".
 *
 * As in handleRecognize, the work happens in a callback that may run
 * asynchronously, so failures are caught here and turned into a rejection,
 * and the engine / image buffer are released on every path. The result
 * values are read before the engine is shut down.
 *
 * @param {{options: object, image: object}} req - `options.lang` is overwritten
 * @param {object} res - job responder (progress / resolve / reject)
 */
function handleDetect(req, res){
  handleInit(req, res);
  req.options.lang = 'osd';

  loadLanguage(req, res, () => {
    var ptr = null;
    var initialized = false;
    var outcome;

    try {
      base.Init(null, 'osd');
      initialized = true;
      base.SetPageSegMode(Module.PSM_OSD_ONLY);

      ptr = setImage(Module, base, req.image);
      var results = new Module.OSResults();

      if (!base.DetectOS(results)) {
        outcome = { ok: false, data: "Failed to detect OS" };
      } else {
        var best = results.get_best_result(),
          oid = best.get_orientation_id(),
          sid = best.get_script_id();

        outcome = {
          ok: true,
          data: {
            tesseract_script_id: sid,
            script: results.get_unicharset().get_script_from_script_id(sid),
            script_confidence: best.get_sconfidence(),
            // orientation id -> degrees
            orientation_degrees: [0, 270, 180, 90][oid],
            orientation_confidence: best.get_oconfidence()
          }
        };
      }
    } catch (err) {
      // stringified so that it can travel through postMessage
      outcome = { ok: false, data: String(err) };
    } finally {
      if (initialized) base.End();
      if (ptr !== null) Module._free(ptr);
    }

    if (outcome.ok) {
      res.resolve(outcome.data);
    } else {
      res.reject(outcome.data);
    }
  });
}

12
src/constants/OEM.js

@ -1,12 +0,0 @@ @@ -1,12 +0,0 @@
/*
* OEM = OCR Engine Mode, and there are 4 possible modes.
*
* The numeric values are what is sent to the engine as `oem`.
*
* NOTE(review): this comment used to say that tesseract.js uses LSTM_ONLY by
* default, but constants/config.js sets defaultOEM to DEFAULT (3), and that is
* what worker.initialize() falls back to. Which engine DEFAULT selects is
* decided by the Tesseract core, not here.
*
*/
module.exports = {
TESSERACT_ONLY: 0,
LSTM_ONLY: 1,
TESSERACT_LSTM_COMBINED: 2,
DEFAULT: 3,
};

19
src/constants/PSM.js

@ -1,19 +0,0 @@ @@ -1,19 +0,0 @@
/*
* PSM = Page Segmentation Mode
*
* Symbolic names for the 14 modes. The values are the mode numbers written
* as strings ('0' to '13'), not as numbers.
*/
module.exports = {
OSD_ONLY: '0',
AUTO_OSD: '1',
AUTO_ONLY: '2',
AUTO: '3',
SINGLE_COLUMN: '4',
SINGLE_BLOCK_VERT_TEXT: '5',
SINGLE_BLOCK: '6',
SINGLE_LINE: '7',
SINGLE_WORD: '8',
CIRCLE_WORD: '9',
SINGLE_CHAR: '10',
SPARSE_TEXT: '11',
SPARSE_TEXT_OSD: '12',
RAW_LINE: '13',
};

5
src/constants/config.js

@ -1,5 +0,0 @@ @@ -1,5 +0,0 @@
// Library-wide defaults derived from the constant tables.
const OEM = require('./OEM');
module.exports = {
// engine mode used by worker.initialize() when the caller passes none
defaultOEM: OEM.DEFAULT,
};

13
src/constants/defaultOptions.js

@ -1,13 +0,0 @@ @@ -1,13 +0,0 @@
// Platform-independent default worker options; createWorker() spreads the
// platform defaults first and the caller's options on top of them.
module.exports = {
/*
* default path for downloading *.traineddata
*/
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
/*
* Use BlobURL for worker script by default
* TODO: remove this option
*
*/
workerBlobURL: true,
// progress messages are passed to this function; the default discards them
logger: () => {},
};

218
src/constants/languages.js

@ -1,218 +0,0 @@ @@ -1,218 +0,0 @@
/*
* languages with existing tesseract traineddata
* https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
*/
/**
* @typedef {object} Languages
* @property {string} AFR Afrikaans
* @property {string} AMH Amharic
* @property {string} ARA Arabic
* @property {string} ASM Assamese
* @property {string} AZE Azerbaijani
* @property {string} AZE_CYRL Azerbaijani - Cyrillic
* @property {string} BEL Belarusian
* @property {string} BEN Bengali
* @property {string} BOD Tibetan
* @property {string} BOS Bosnian
* @property {string} BUL Bulgarian
* @property {string} CAT Catalan; Valencian
* @property {string} CEB Cebuano
* @property {string} CES Czech
* @property {string} CHI_SIM Chinese - Simplified
* @property {string} CHI_TRA Chinese - Traditional
* @property {string} CHR Cherokee
* @property {string} CYM Welsh
* @property {string} DAN Danish
* @property {string} DEU German
* @property {string} DZO Dzongkha
* @property {string} ELL Greek, Modern (1453-)
* @property {string} ENG English
* @property {string} ENM English, Middle (1100-1500)
* @property {string} EPO Esperanto
* @property {string} EST Estonian
* @property {string} EUS Basque
* @property {string} FAS Persian
* @property {string} FIN Finnish
* @property {string} FRA French
* @property {string} FRK German Fraktur
* @property {string} FRM French, Middle (ca. 1400-1600)
* @property {string} GLE Irish
* @property {string} GLG Galician
* @property {string} GRC Greek, Ancient (-1453)
* @property {string} GUJ Gujarati
* @property {string} HAT Haitian; Haitian Creole
* @property {string} HEB Hebrew
* @property {string} HIN Hindi
* @property {string} HRV Croatian
* @property {string} HUN Hungarian
* @property {string} IKU Inuktitut
* @property {string} IND Indonesian
* @property {string} ISL Icelandic
* @property {string} ITA Italian
* @property {string} ITA_OLD Italian - Old
* @property {string} JAV Javanese
* @property {string} JPN Japanese
* @property {string} KAN Kannada
* @property {string} KAT Georgian
* @property {string} KAT_OLD Georgian - Old
* @property {string} KAZ Kazakh
* @property {string} KHM Central Khmer
* @property {string} KIR Kirghiz; Kyrgyz
* @property {string} KOR Korean
* @property {string} KUR Kurdish
* @property {string} LAO Lao
* @property {string} LAT Latin
* @property {string} LAV Latvian
* @property {string} LIT Lithuanian
* @property {string} MAL Malayalam
* @property {string} MAR Marathi
* @property {string} MKD Macedonian
* @property {string} MLT Maltese
* @property {string} MSA Malay
* @property {string} MYA Burmese
* @property {string} NEP Nepali
* @property {string} NLD Dutch; Flemish
* @property {string} NOR Norwegian
* @property {string} ORI Oriya
* @property {string} PAN Panjabi; Punjabi
* @property {string} POL Polish
* @property {string} POR Portuguese
* @property {string} PUS Pushto; Pashto
* @property {string} RON Romanian; Moldavian; Moldovan
* @property {string} RUS Russian
* @property {string} SAN Sanskrit
* @property {string} SIN Sinhala; Sinhalese
* @property {string} SLK Slovak
* @property {string} SLV Slovenian
* @property {string} SPA Spanish; Castilian
* @property {string} SPA_OLD Spanish; Castilian - Old
* @property {string} SQI Albanian
* @property {string} SRP Serbian
* @property {string} SRP_LATN Serbian - Latin
* @property {string} SWA Swahili
* @property {string} SWE Swedish
* @property {string} SYR Syriac
* @property {string} TAM Tamil
* @property {string} TEL Telugu
* @property {string} TGK Tajik
* @property {string} TGL Tagalog
* @property {string} THA Thai
* @property {string} TIR Tigrinya
* @property {string} TUR Turkish
* @property {string} UIG Uighur; Uyghur
* @property {string} UKR Ukrainian
* @property {string} URD Urdu
* @property {string} UZB Uzbek
* @property {string} UZB_CYRL Uzbek - Cyrillic
* @property {string} VIE Vietnamese
* @property {string} YID Yiddish
*/
/**
* Lookup table from an upper-case constant name to the lower-case language
* code string of the same spelling (e.g. CHI_SIM -> 'chi_sim').
*
* @type {Languages}
*/
module.exports = {
AFR: 'afr',
AMH: 'amh',
ARA: 'ara',
ASM: 'asm',
AZE: 'aze',
AZE_CYRL: 'aze_cyrl',
BEL: 'bel',
BEN: 'ben',
BOD: 'bod',
BOS: 'bos',
BUL: 'bul',
CAT: 'cat',
CEB: 'ceb',
CES: 'ces',
CHI_SIM: 'chi_sim',
CHI_TRA: 'chi_tra',
CHR: 'chr',
CYM: 'cym',
DAN: 'dan',
DEU: 'deu',
DZO: 'dzo',
ELL: 'ell',
ENG: 'eng',
ENM: 'enm',
EPO: 'epo',
EST: 'est',
EUS: 'eus',
FAS: 'fas',
FIN: 'fin',
FRA: 'fra',
FRK: 'frk',
FRM: 'frm',
GLE: 'gle',
GLG: 'glg',
GRC: 'grc',
GUJ: 'guj',
HAT: 'hat',
HEB: 'heb',
HIN: 'hin',
HRV: 'hrv',
HUN: 'hun',
IKU: 'iku',
IND: 'ind',
ISL: 'isl',
ITA: 'ita',
ITA_OLD: 'ita_old',
JAV: 'jav',
JPN: 'jpn',
KAN: 'kan',
KAT: 'kat',
KAT_OLD: 'kat_old',
KAZ: 'kaz',
KHM: 'khm',
KIR: 'kir',
KOR: 'kor',
KUR: 'kur',
LAO: 'lao',
LAT: 'lat',
LAV: 'lav',
LIT: 'lit',
MAL: 'mal',
MAR: 'mar',
MKD: 'mkd',
MLT: 'mlt',
MSA: 'msa',
MYA: 'mya',
NEP: 'nep',
NLD: 'nld',
NOR: 'nor',
ORI: 'ori',
PAN: 'pan',
POL: 'pol',
POR: 'por',
PUS: 'pus',
RON: 'ron',
RUS: 'rus',
SAN: 'san',
SIN: 'sin',
SLK: 'slk',
SLV: 'slv',
SPA: 'spa',
SPA_OLD: 'spa_old',
SQI: 'sqi',
SRP: 'srp',
SRP_LATN: 'srp_latn',
SWA: 'swa',
SWE: 'swe',
SYR: 'syr',
TAM: 'tam',
TEL: 'tel',
TGK: 'tgk',
TGL: 'tgl',
THA: 'tha',
TIR: 'tir',
TUR: 'tur',
UIG: 'uig',
UKR: 'ukr',
URD: 'urd',
UZB: 'uzb',
UZB_CYRL: 'uzb_cyrl',
VIE: 'vie',
YID: 'yid',
};

21
src/createJob.js

@ -1,21 +0,0 @@ @@ -1,21 +0,0 @@
const getId = require('./utils/getId');
let jobCounter = 0;
module.exports = ({
id: _id,
action,
payload = {},
}) => {
let id = _id;
if (typeof id === 'undefined') {
id = getId('Job', jobCounter);
jobCounter += 1;
}
return {
id,
action,
payload,
};
};

80
src/createScheduler.js

@ -1,80 +0,0 @@ @@ -1,80 +0,0 @@
const createJob = require('./createJob');
const { log } = require('./utils/log');
const getId = require('./utils/getId');
let schedulerCounter = 0;
module.exports = () => {
const id = getId('Scheduler', schedulerCounter);
const workers = {};
const runningWorkers = {};
let jobQueue = [];
schedulerCounter += 1;
const getQueueLen = () => jobQueue.length;
const getNumWorkers = () => Object.keys(workers).length;
const dequeue = () => {
if (jobQueue.length !== 0) {
const wIds = Object.keys(workers);
for (let i = 0; i < wIds.length; i += 1) {
if (typeof runningWorkers[wIds[i]] === 'undefined') {
jobQueue[0](workers[wIds[i]]);
break;
}
}
}
};
const queue = (action, payload) => (
new Promise((resolve, reject) => {
const job = createJob({ action, payload });
jobQueue.push(async (w) => {
jobQueue.shift();
runningWorkers[w.id] = job;
try {
resolve(await w[action].apply(this, [...payload, job.id]));
} catch (err) {
reject(err);
} finally {
delete runningWorkers[w.id];
dequeue();
}
});
log(`[${id}]: Add ${job.id} to JobQueue`);
log(`[${id}]: JobQueue length=${jobQueue.length}`);
dequeue();
})
);
const addWorker = (w) => {
workers[w.id] = w;
log(`[${id}]: Add ${w.id}`);
log(`[${id}]: Number of workers=${getNumWorkers()}`);
dequeue();
return w.id;
};
const addJob = async (action, ...payload) => {
if (getNumWorkers() === 0) {
throw Error(`[${id}]: You need to have at least one worker before adding jobs`);
}
return queue(action, payload);
};
const terminate = async () => {
Object.keys(workers).forEach(async (wid) => {
await workers[wid].terminate();
});
jobQueue = [];
};
return {
addWorker,
addJob,
terminate,
getQueueLen,
getNumWorkers,
};
};

198
src/createWorker.js

@ -1,198 +0,0 @@ @@ -1,198 +0,0 @@
const resolvePaths = require('./utils/resolvePaths');
const circularize = require('./utils/circularize');
const createJob = require('./createJob');
const { log } = require('./utils/log');
const getId = require('./utils/getId');
const { defaultOEM } = require('./constants/config');
const {
defaultOptions,
spawnWorker,
terminateWorker,
onMessage,
loadImage,
send,
} = require('./worker/node');
let workerCounter = 0;
module.exports = (_options = {}) => {
const id = getId('Worker', workerCounter);
const {
logger,
errorHandler,
...options
} = resolvePaths({
...defaultOptions,
..._options,
});
const resolves = {};
const rejects = {};
let worker = spawnWorker(options);
workerCounter += 1;
const setResolve = (action, res) => {
resolves[action] = res;
};
const setReject = (action, rej) => {
rejects[action] = rej;
};
const startJob = ({ id: jobId, action, payload }) => (
new Promise((resolve, reject) => {
log(`[${id}]: Start ${jobId}, action=${action}`);
setResolve(action, resolve);
setReject(action, reject);
send(worker, {
workerId: id,
jobId,
action,
payload,
});
})
);
const load = (jobId) => (
startJob(createJob({
id: jobId, action: 'load', payload: { options },
}))
);
const writeText = (path, text, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method: 'writeFile', args: [path, text] },
}))
);
const readText = (path, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method: 'readFile', args: [path, { encoding: 'utf8' }] },
}))
);
const removeFile = (path, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method: 'unlink', args: [path] },
}))
);
const FS = (method, args, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method, args },
}))
);
const loadLanguage = (langs = 'eng', jobId) => (
startJob(createJob({
id: jobId,
action: 'loadLanguage',
payload: { langs, options },
}))
);
const initialize = (langs = 'eng', oem = defaultOEM, jobId) => (
startJob(createJob({
id: jobId,
action: 'initialize',
payload: { langs, oem },
}))
);
const setParameters = (params = {}, jobId) => (
startJob(createJob({
id: jobId,
action: 'setParameters',
payload: { params },
}))
);
const recognize = async (image, opts = {}, jobId) => (
startJob(createJob({
id: jobId,
action: 'recognize',
payload: { image: await loadImage(image), options: opts },
}))
);
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => (
startJob(createJob({
id: jobId,
action: 'getPDF',
payload: { title, textonly },
}))
);
const detect = async (image, jobId) => (
startJob(createJob({
id: jobId,
action: 'detect',
payload: { image: await loadImage(image) },
}))
);
const terminate = async () => {
if (worker !== null) {
/*
await startJob(createJob({
id: jobId,
action: 'terminate',
}));
*/
terminateWorker(worker);
worker = null;
}
return Promise.resolve();
};
onMessage(worker, ({
workerId, jobId, status, action, data,
}) => {
if (status === 'resolve') {
log(`[${workerId}]: Complete ${jobId}`);
let d = data;
if (action === 'recognize') {
d = circularize(data);
} else if (action === 'getPDF') {
d = Array.from({ ...data, length: Object.keys(data).length });
}
resolves[action]({ jobId, data: d });
} else if (status === 'reject') {
rejects[action](data);
if (errorHandler) {
errorHandler(data);
} else {
throw Error(data);
}
} else if (status === 'progress') {
logger({ ...data, userJobId: jobId });
}
});
return {
id,
worker,
setResolve,
setReject,
load,
writeText,
readText,
removeFile,
FS,
loadLanguage,
initialize,
setParameters,
recognize,
getPDF,
detect,
terminate,
};
};

231
src/index.d.ts vendored

@ -1,231 +0,0 @@ @@ -1,231 +0,0 @@
declare namespace Tesseract {
function createScheduler(): Scheduler
function createWorker(options?: Partial<WorkerOptions>): Worker
function setLogging(logging: boolean): void
function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult>
function detect(image: ImageLike, options?: Partial<WorkerOptions>): any
interface Scheduler {
addWorker(worker: Worker): string
addJob(action: string, ...args: any[]): Promise<ConfigResult | RecognizeResult | DetectResult>
terminate(): Promise<any>
getQueueLen(): number
getNumWorkers(): number
}
interface Worker {
load(jobId?: string): Promise<ConfigResult>
writeText(path: string, text: string, jobId?: string): Promise<ConfigResult>
readText(path: string, jobId?: string): Promise<ConfigResult>
removeText(path: string, jobId?: string): Promise<ConfigResult>
FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult>
initialize(langs?: string | Lang[], oem?: OEM, jobId?: string): Promise<ConfigResult>
setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult>
detect(image: ImageLike, jobId?: string): Promise<DetectResult>
terminate(jobId?: string): Promise<ConfigResult>
getPDF(title?: string, textonly?: boolean, jobId?: string):Promise<GetPDFResult>
}
interface Lang {
code: string;
data: unknown;
}
interface WorkerOptions {
corePath: string
langPath: string
cachePath: string
dataPath: string
workerPath: string
cacheMethod: string
workerBlobURL: boolean
gzip: boolean
logger: (arg: any) => void,
errorHandler: (arg: any) => void
}
interface WorkerParams {
tessedit_ocr_engine_mode: OEM
tessedit_pageseg_mode: PSM
tessedit_char_whitelist: string
preserve_interword_spaces: string
user_defined_dpi: string
tessjs_create_hocr: string
tessjs_create_tsv: string
tessjs_create_box: string
tessjs_create_unlv: string
tessjs_create_osd: string
}
interface RecognizeOptions {
rectangle: Rectangle
}
interface ConfigResult {
jobId: string
data: any
}
interface RecognizeResult {
jobId: string
data: Page
}
interface GetPDFResult {
jobId: string
data: number[]
}
interface DetectResult {
jobId: string
data: DetectData
}
interface DetectData {
tesseract_script_id: number
script: string
script_confidence: number
orientation_degrees: number
orientation_confidence: number
}
interface Rectangle {
left: number
top: number
width: number
height: number
}
enum OEM {
TESSERACT_ONLY,
LSTM_ONLY,
TESSERACT_LSTM_COMBINED,
DEFAULT,
}
enum PSM {
OSD_ONLY = '0',
AUTO_OSD = '1',
AUTO_ONLY = '2',
AUTO = '3',
SINGLE_COLUMN = '4',
SINGLE_BLOCK_VERT_TEXT = '5',
SINGLE_BLOCK = '6',
SINGLE_LINE = '7',
SINGLE_WORD = '8',
CIRCLE_WORD = '9',
SINGLE_CHAR = '10',
SPARSE_TEXT = '11',
SPARSE_TEXT_OSD = '12',
RAW_LINE = '13'
}
type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement
| CanvasRenderingContext2D | File | Blob | ImageData | Buffer;
/**
 * A layout block of the page. `page` is the back-reference added when the
 * result tree is made circular.
 * NOTE(review): the flattening step only attaches `page` to blocks; confirm
 * that `lines`, `words` and `symbols` are really populated on a Block.
 */
interface Block {
paragraphs: Paragraph[];
text: string;
confidence: number;
baseline: Baseline;
bbox: Bbox;
// Name of the engine's block-type constant with its `PT_` prefix removed.
blocktype: string;
// List of [x, y] pairs, or null when the engine returns no block polygon.
polygon: any;
page: Page;
lines: Line[];
words: Word[];
symbols: Symbol[];
}
/** Baseline of an element as reported by the engine. */
interface Baseline {
x0: number;
y0: number;
x1: number;
y1: number;
has_baseline: boolean;
}
/** Bounding box of an element as reported by the engine. */
interface Bbox {
x0: number;
y0: number;
x1: number;
y1: number;
}
/**
 * A text line. `paragraph`, `block` and `page` are back-references.
 * NOTE(review): no visible code attaches `symbols` to a Line — confirm.
 */
interface Line {
words: Word[];
text: string;
confidence: number;
baseline: Baseline;
bbox: Bbox;
paragraph: Paragraph;
block: Block;
page: Page;
symbols: Symbol[];
}
/**
 * A paragraph. `block` and `page` are back-references.
 * NOTE(review): no visible code attaches `words` or `symbols` to a Paragraph — confirm.
 */
interface Paragraph {
lines: Line[];
text: string;
confidence: number;
baseline: Baseline;
bbox: Bbox;
is_ltr: boolean;
block: Block;
page: Page;
words: Word[];
symbols: Symbol[];
}
/** A single recognised symbol with its alternative choices and back-references to its ancestors. */
interface Symbol {
choices: Choice[];
// The worker currently always sets this to null.
image: any;
text: string;
confidence: number;
baseline: Baseline;
bbox: Bbox;
is_superscript: boolean;
is_subscript: boolean;
is_dropcap: boolean;
word: Word;
line: Line;
paragraph: Paragraph;
block: Block;
page: Page;
}
/** One alternative reading of a word or symbol with its confidence. */
interface Choice {
text: string;
confidence: number;
}
/** A recognised word with font attributes, alternative choices and back-references to its ancestors. */
interface Word {
symbols: Symbol[];
choices: Choice[];
text: string;
confidence: number;
baseline: Baseline;
bbox: Bbox;
is_numeric: boolean;
in_dictionary: boolean;
// Name of the engine's direction constant with its `DIR_` prefix removed.
direction: string;
language: string;
is_bold: boolean;
is_italic: boolean;
is_underlined: boolean;
is_monospace: boolean;
is_serif: boolean;
is_smallcaps: boolean;
font_size: number;
font_id: number;
font_name: string;
line: Line;
paragraph: Paragraph;
block: Block;
page: Page;
}
/**
 * Top-level recognition result: the block tree plus flattened lists of every
 * paragraph, line, word and symbol, and the optional text renderings.
 */
interface Page {
blocks: Block[];
confidence: number;
lines: Line[];
oem: string;
// NOTE(review): the worker sets `osd` to null unless tessjs_create_osd is '1', so this may need `| null`.
osd: string;
paragraphs: Paragraph[];
psm: string;
symbols: Symbol[];
text: string;
version: string;
words: Word[];
// The renderings below are null unless the matching tessjs_create_* parameter is '1'.
hocr: string | null;
tsv: string | null;
box: string | null;
unlv: string | null;
// NOTE(review): no producer of an `sd` field is visible; possibly a typo for `osd` — confirm before relying on it.
sd: string | null;
}
}
export = Tesseract;
export as namespace Tesseract;

102
src/index.js

@ -1,27 +1,75 @@ @@ -1,27 +1,75 @@
/**
*
* Entry point for tesseract.js, should be the entry when bundling.
*
* @fileoverview entry point for tesseract.js
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
require('regenerator-runtime/runtime');
const createScheduler = require('./createScheduler');
const createWorker = require('./createWorker');
const Tesseract = require('./Tesseract');
const languages = require('./constants/languages');
const OEM = require('./constants/OEM');
const PSM = require('./constants/PSM');
const { setLogging } = require('./utils/log');
// Public API: the constants and factories required above, plus every
// property of the Tesseract module spread in last.
module.exports = {
languages,
OEM,
PSM,
createScheduler,
createWorker,
setLogging,
...Tesseract,
};
const adapter = require('./node/index.js')
const circularize = require('./common/circularize.js')
const TesseractJob = require('./common/job');
const version = require('../package.json').version;
/**
 * Build a TesseractWorker whose options are the adapter defaults overlaid
 * with the caller's overrides, then expose the factory itself and the
 * package version on the returned instance.
 *
 * @param {object} [workerOptions] - overrides for adapter.defaultOptions
 * @returns {TesseractWorker}
 */
const create = function(workerOptions = {}){
  const mergedOptions = Object.assign({}, adapter.defaultOptions, workerOptions);
  const instance = new TesseractWorker(mergedOptions);
  instance.create = create;
  instance.version = version;
  return instance;
};
/**
 * Front-end handle for one OCR worker process/thread.
 *
 * Jobs are queued and started one at a time: `_delay` enqueues a starter
 * function, `_dequeue` runs the starter at the head of the queue, and
 * `_recv` routes packets coming back from the worker to the current job.
 * The underlying worker is spawned lazily on the first job.
 */
class TesseractWorker {
  /**
   * @param {object} workerOptions - options forwarded to the adapter and to every job
   */
  constructor(workerOptions){
    this.worker = null;
    this.workerOptions = workerOptions;
    this._currentJob = null;
    this._queue = [];
  }

  /**
   * Queue a recognition job.
   *
   * @param {*} image - image source, passed through to the worker
   * @param {object|string} [options] - options object, or a language string
   * @returns {TesseractJob} the queued job
   */
  recognize(image, options = {}){
    return this._delay(job => {
      // Work on a copy so the caller's options object is never modified.
      const jobOptions = typeof options === 'string'
        ? { lang: options }
        : Object.assign({}, options);
      jobOptions.lang = jobOptions.lang || 'eng';
      job._send('recognize', { image, options: jobOptions, workerOptions: this.workerOptions });
    })
  }

  /**
   * Queue a detection job.
   *
   * @param {*} image - image source, passed through to the worker
   * @param {object} [options]
   * @returns {TesseractJob} the queued job
   */
  detect(image, options = {}){
    return this._delay(job => {
      job._send('detect', { image, options, workerOptions: this.workerOptions });
    })
  }

  /** Stop the underlying worker (if any) and drop the current and queued jobs. */
  terminate(){
    if(this.worker) adapter.terminateWorker(this);
    this.worker = null;
    this._currentJob = null;
    this._queue = [];
  }

  /**
   * Create a job and enqueue a starter for it; the starter runs immediately
   * when no other job is in flight.
   *
   * @param {function(TesseractJob)} fn - called with the job once it is its turn
   * @returns {TesseractJob}
   */
  _delay(fn){
    if(!this.worker) this.worker = adapter.spawnWorker(this, this.workerOptions);
    var job = new TesseractJob(this);
    this._queue.push(() => {
      this._queue.shift();
      this._currentJob = job;
      fn(job);
    });
    if(!this._currentJob) this._dequeue();
    return job;
  }

  /** Clear the current job and start the next queued one, if any. */
  _dequeue(){
    this._currentJob = null;
    if(this._queue.length){
      this._queue[0]();
    }
  }

  /**
   * Handle a packet sent back by the worker.
   *
   * @param {object} packet - carries `status`, `action`, `jobId` and `data`
   */
  _recv(packet){
    if(packet.status === 'resolve' && packet.action === 'recognize'){
      packet.data = circularize(packet.data);
    }
    // A packet can arrive when no job is active (for instance after
    // terminate()); treat it like any other unknown job instead of throwing.
    const current = this._currentJob;
    if(current && current.id === packet.jobId){
      current._handle(packet)
    } else {
      console.warn('Job ID ' + packet.jobId + ' not known.')
    }
  }
}
module.exports = create();

89
src/node/index.js

@ -0,0 +1,89 @@ @@ -0,0 +1,89 @@
const fetch = require('isomorphic-fetch'),
isURL = require('is-url'),
fork = require('child_process').fork,
fs = require('fs');
// Default worker options for node: the worker script that sits next to this
// file, and the remote base URL language data is downloaded from.
exports.defaultOptions = {
workerPath: require('path').join(__dirname, 'worker.js'),
langPath: 'https://tessdata.projectnaptha.com/3.02/',
}
exports.spawnWorker = function spawnWorker(instance, workerOptions){
var cp = fork(workerOptions.workerPath);
cp.on('message', packet => {
instance._recv(packet);
});
return cp;
}
exports.terminateWorker = function(instance){
instance.worker.kill();
}
exports.sendPacket = function sendPacket(instance, packet){
loadImage(packet.payload.image, img => {
packet.payload.image = img;
instance.worker.send(packet);
});
}
/**
 * Turn an image source into something that survives node's JSON-based IPC,
 * then hand it to `cb`.
 *
 * - string: fetched when it is a URL, otherwise read from disk, then
 *   re-processed as a Buffer;
 * - Buffer: decoded when its detected type is PNG or JPEG; any other (or
 *   undetectable) type is passed to `cb` unchanged;
 * - object with a non-array `data`: `data` is converted to a plain array
 *   (the object is modified in place);
 * - anything else: passed to `cb` unchanged.
 *
 * NOTE(review): `cb` has no error channel. Fetch failures are only logged
 * (so `cb` is never called), while read/parse errors are thrown from inside
 * asynchronous callbacks.
 *
 * @param {*} image - image source
 * @param {function(*)} cb - receives the loaded image
 */
function loadImage(image, cb){
  if (typeof image === 'string') {
    if (isURL(image)) {
      fetch(image)
        .then(resp => resp.buffer())
        .then(buffer => loadImage(buffer, cb))
        .catch(err => console.error(err));
    } else {
      fs.readFile(image, function(err, buffer){
        if (err) throw err;
        loadImage(buffer, cb);
      });
    }
    return;
  }

  if (image instanceof Buffer) {
    // The detector yields nothing for unrecognised content; guard the
    // lookup so such buffers fall through instead of raising a TypeError.
    const detected = require('file-type')(image);
    const mime = detected ? detected.mime : undefined;
    if (mime === 'image/png') {
      const PNGReader = require('png.js');
      const reader = new PNGReader(image);
      reader.parse(function(err, png){
        if (err) throw err;
        const width = png.getWidth();
        const height = png.getHeight();
        // Copy the pixels into a flat buffer, 4 bytes per pixel, row by row.
        const decoded = {
          width,
          height,
          data: new Uint8Array(width * height * 4),
        };
        for (let j = 0; j < height; j++) {
          for (let i = 0; i < width; i++) {
            const offset = 4 * (i + j * width);
            const pix = png.getPixel(i, j);
            decoded.data[offset] = pix[0];
            decoded.data[offset + 1] = pix[1];
            decoded.data[offset + 2] = pix[2];
            decoded.data[offset + 3] = pix[3];
          }
        }
        loadImage(decoded, cb);
      });
      return;
    }
    if (mime === 'image/jpeg') {
      loadImage(require('jpeg-js').decode(image), cb);
      return;
    }
    // TODO: support for TIFF, NetPBM, BMP, etc.
  }

  // node uses json.stringify for ipc which means we need to turn
  // fancy arrays into raw arrays
  if (image && image.data && image.data.length && !Array.isArray(image.data)) {
    image.data = Array.from(image.data);
    return loadImage(image, cb);
  }
  cb(image);
}

47
src/node/lang.js

@ -0,0 +1,47 @@ @@ -0,0 +1,47 @@
const https = require("https"),
http = require("http"),
zlib = require("zlib"),
fs = require("fs"),
path = require("path"),
isURL = require("is-url");
var langdata = require('../common/langdata.json')
/**
 * Provide the traineddata for `req.options.lang` to `cb` as a Uint8Array.
 *
 * The data is read from a local file when available; otherwise the gzipped
 * file is downloaded from `req.workerOptions.langPath`, decompressed into
 * `<lang>.traineddata` in the current directory, and the lookup is retried.
 *
 * NOTE(review): download errors are not handled, and a download that never
 * produces a readable file would retry indefinitely.
 *
 * @param {object} req - carries `options.lang` and `workerOptions.langPath`
 * @param {object} res - provides `progress(info)` for download progress
 * @param {function(Uint8Array)} cb - receives the language data
 */
function getLanguageData(req, res, cb){
  const lang = req.options.lang;
  const langfile = lang + '.traineddata.gz';
  const langPath = req.workerOptions.langPath;

  // langPath defaults to a URL where languages can be downloaded. If a custom path is specified
  // and it is a local path, use that instead
  const localPath = isURL(langPath) ?
    lang + '.traineddata' :
    path.join(langPath, lang + '.traineddata');

  const fetchProtocol = langPath.startsWith('http://') ? http : https;

  fs.readFile(localPath, function (err, data) {
    if(!err) return cb(new Uint8Array(data));

    fetchProtocol.get(langPath + langfile, stream => {
      let receivedBytes = 0;
      stream.on('data', function(chunk) {
        receivedBytes += chunk.length;
        res.progress({
          status: 'downloading ' + langfile,
          loaded: receivedBytes,
          progress: Math.min(1, receivedBytes / langdata[lang])
        });
      });

      const output = fs.createWriteStream(lang + '.traineddata');
      stream.pipe(zlib.createGunzip()).pipe(output);
      // Retry only once the file has been written out completely, and keep
      // reporting progress through the original `res` object.
      output.on('finish', () => {
        getLanguageData(req, res, cb);
      });
    });
  });
}
module.exports = getLanguageData;

19
src/node/worker.js

@ -0,0 +1,19 @@ @@ -0,0 +1,19 @@
const workerUtils = require('../common/worker.js')
// Every packet received from the parent process is dispatched to the shared
// worker handlers; their replies are sent back over the same IPC channel.
process.on('message', function(packet){
workerUtils.dispatchHandlers(packet, obj => process.send(obj))
})
var TesseractCore;
exports.getCore = function(req, res){
if(!TesseractCore){
res.progress({ status: 'loading tesseract core' })
TesseractCore = require('tesseract.js-core')
res.progress({ status: 'loaded tesseract core' })
}
return TesseractCore
}
exports.getLanguageData = require('./lang.js')
workerUtils.setAdapter(module.exports);

54
src/utils/circularize.js

@ -1,54 +0,0 @@ @@ -1,54 +0,0 @@
/**
* In the recognition result of tesseract, there
* is a deep JSON object for details, it has around
*
* The result of dump.js is a big JSON tree
* which can be easily serialized (for instance
* to be sent from a webworker to the main app
* or through Node's IPC), but we want
* a (circular) DOM-like interface for walking
* through the data.
*
* @fileoverview DOM-like interface for walking through data
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
module.exports = (page) => {
const blocks = [];
const paragraphs = [];
const lines = [];
const words = [];
const symbols = [];
page.blocks.forEach((block) => {
block.paragraphs.forEach((paragraph) => {
paragraph.lines.forEach((line) => {
line.words.forEach((word) => {
word.symbols.forEach((sym) => {
symbols.push({
...sym, page, block, paragraph, line, word,
});
});
words.push({
...word, page, block, paragraph, line,
});
});
lines.push({
...line, page, block, paragraph,
});
});
paragraphs.push({
...paragraph, page, block,
});
});
blocks.push({
...block, page,
});
});
return {
...page, blocks, paragraphs, lines, words, symbols,
};
};

21
src/utils/getEnvironment.js

@ -1,21 +0,0 @@ @@ -1,21 +0,0 @@
const isElectron = require('is-electron');
module.exports = (key) => {
const env = {};
if (typeof WorkerGlobalScope !== 'undefined') {
env.type = 'webworker';
} else if (isElectron()) {
env.type = 'electron';
} else if (typeof window === 'object') {
env.type = 'browser';
} else if (typeof process === 'object' && typeof require === 'function') {
env.type = 'node';
}
if (typeof key === 'undefined') {
return env;
}
return env[key];
};

3
src/utils/getId.js

@ -1,3 +0,0 @@ @@ -1,3 +0,0 @@
module.exports = (prefix, cnt) => (
`${prefix}-${cnt}-${Math.random().toString(16).slice(3, 8)}`
);

9
src/utils/log.js

@ -1,9 +0,0 @@ @@ -1,9 +0,0 @@
let logging = false;
exports.logging = logging;
exports.setLogging = (_logging) => {
logging = _logging;
};
exports.log = (...args) => (logging ? console.log.apply(this, args) : null);

12
src/utils/resolvePaths.js

@ -1,12 +0,0 @@ @@ -1,12 +0,0 @@
const isBrowser = require('./getEnvironment')('type') === 'browser';
const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disable-line
module.exports = (options) => {
const opts = { ...options };
['corePath', 'workerPath', 'langPath'].forEach((key) => {
if (options[key]) {
opts[key] = resolveURL(opts[key]);
}
});
return opts;
};

10
src/worker-script/browser/cache.js

@ -1,10 +0,0 @@ @@ -1,10 +0,0 @@
const { set, get, del } = require('idb-keyval');
module.exports = {
readCache: get,
writeCache: set,
deleteCache: del,
checkCache: (path) => (
get(path).then((v) => typeof v !== 'undefined')
),
};

30
src/worker-script/browser/getCore.js

@ -1,30 +0,0 @@ @@ -1,30 +0,0 @@
const { simd } = require('wasm-feature-detect');
const { dependencies } = require('../../../package.json');
module.exports = async (corePath, res) => {
if (typeof global.TesseractCore === 'undefined') {
res.progress({ status: 'loading tesseract core', progress: 0 });
// If the user specifies a core path, we use that
// Otherwise, we detect the correct core based on SIMD support
let corePathImport = corePath;
if (!corePathImport) {
const simdSupport = await simd();
if (simdSupport) {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core-simd.wasm.js`;
} else {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.wasm.js`;
}
}
global.importScripts(corePathImport);
if (typeof global.TesseractCoreWASM !== 'undefined' && typeof WebAssembly === 'object') {
global.TesseractCore = global.TesseractCoreWASM;
} else {
throw Error('Failed to load TesseractCore');
}
res.progress({ status: 'loading tesseract core', progress: 1 });
}
return global.TesseractCore;
};

1
src/worker-script/browser/gunzip.js

@ -1 +0,0 @@ @@ -1 +0,0 @@
module.exports = require('zlibjs').gunzipSync;

32
src/worker-script/browser/index.js

@ -1,32 +0,0 @@ @@ -1,32 +0,0 @@
/**
*
* Browser worker scripts
*
* @fileoverview Browser worker implementation
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const worker = require('..');
const getCore = require('./getCore');
const gunzip = require('./gunzip');
const cache = require('./cache');
/*
 * Register the message handler: each incoming message's data is dispatched
 * to the shared worker handlers, and replies go back through postMessage.
 */
global.addEventListener('message', ({ data }) => {
worker.dispatchHandlers(data, (obj) => postMessage(obj));
});
/*
 * Install the browser implementations. getCore is asynchronous; it loads
 * the core script and resolves to TesseractCore. The `fetch` entry is only
 * a placeholder here.
 */
worker.setAdapter({
getCore,
gunzip,
fetch: () => {},
...cache,
});

14
src/worker-script/constants/defaultParams.js

@ -1,14 +0,0 @@ @@ -1,14 +0,0 @@
/*
 * Default parameters for tesseract.js.
 *
 * The tessjs_create_* entries are '1'/'0' string flags: by default the hOCR
 * and TSV outputs are enabled and the box, UNLV and OSD outputs are disabled.
 */
const PSM = require('../../constants/PSM');
module.exports = {
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whitelist: '',
tessjs_create_hocr: '1',
tessjs_create_tsv: '1',
tessjs_create_box: '0',
tessjs_create_unlv: '0',
tessjs_create_osd: '0',
};

313
src/worker-script/index.js

@ -1,313 +0,0 @@ @@ -1,313 +0,0 @@
/**
*
* Worker script for browser and node
*
* @fileoverview Worker script for browser and node
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
require('regenerator-runtime/runtime');
const fileType = require('file-type');
const isURL = require('is-url');
const dump = require('./utils/dump');
const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker';
const setImage = require('./utils/setImage');
const defaultParams = require('./constants/defaultParams');
const { log, setLogging } = require('../utils/log');
/*
* Tesseract Module returned by TesseractCore.
*/
let TessModule;
/*
* TessearctBaseAPI instance
*/
let api = null;
let latestJob;
let adapter = {};
let params = defaultParams;
/**
 * Load and initialise the tesseract core module (once per worker).
 *
 * Resolves the job with `{ loaded: true }`; any failure while obtaining or
 * initialising the core rejects the job instead of being lost as an
 * unhandled promise rejection.
 *
 * @param {object} packet - carries `workerId`, `jobId` and `payload.options`
 * @param {object} res - job responder (`progress`, `resolve`, `reject`)
 */
const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
  setLogging(logging);
  if (TessModule) {
    res.resolve({ loaded: true });
    return;
  }

  const rejectJob = (err) => {
    res.reject(err.toString());
  };

  try {
    const Core = await adapter.getCore(corePath, res);

    res.progress({ workerId, status: 'initializing tesseract', progress: 0 });

    Core({
      TesseractProgress(percent) {
        // Progress is reported against whichever job was dispatched last.
        latestJob.progress({
          workerId,
          jobId,
          status: 'recognizing text',
          progress: Math.max(0, (percent - 30) / 70),
        });
      },
    }).then((tessModule) => {
      TessModule = tessModule;
      res.progress({ workerId, status: 'initialized tesseract', progress: 1 });
      res.resolve({ loaded: true });
    }, rejectJob);
  } catch (err) {
    rejectJob(err);
  }
};
/**
 * Call a method of the core module's FS object and resolve the job with
 * its return value.
 *
 * @param {object} packet - carries `workerId`, `payload.method` and `payload.args`
 * @param {object} res - job responder
 */
const FS = ({ workerId, payload: { method, args } }, res) => {
  log(`[${workerId}]: FS.${method} with args ${args}`);
  const outcome = TessModule.FS[method](...args);
  res.resolve(outcome);
};
/**
 * Load the traineddata of one or more languages.
 *
 * `langs` is either a '+'-separated string of language codes or an array
 * whose entries are codes or objects with `code` and `data`. Each language
 * is taken from the cache when possible, otherwise fetched/read from
 * `langPath` (or taken from the entry's own `data`), gunzipped when needed,
 * written into the core's file system and optionally written to the cache.
 * The job resolves with `langs`, or rejects with the error text.
 */
const loadLanguage = async ({
workerId,
payload: {
langs,
options: {
langPath,
dataPath,
cachePath,
cacheMethod,
gzip = true,
},
},
},
res) => {
// Loads a single language entry and resolves to its (decompressed) bytes.
const loadAndGunzipFile = async (_lang) => {
const lang = typeof _lang === 'string' ? _lang : _lang.code;
// With 'refresh' or 'none' the cache is never read: the stub resolves to
// undefined, which is handled below as a cache miss.
const readCache = ['refresh', 'none'].includes(cacheMethod)
? () => Promise.resolve()
: adapter.readCache;
let data = null;
try {
const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`);
if (typeof _data !== 'undefined') {
log(`[${workerId}]: Load ${lang}.traineddata from cache`);
res.progress({ workerId, status: 'loading language traineddata (from cache)', progress: 0.5 });
data = _data;
} else {
// Thrown on purpose to reach the catch block, which is the cache-miss path.
throw Error('Not found in cache');
}
} catch (e) {
log(`[${workerId}]: Load ${lang}.traineddata from ${langPath}`);
if (typeof _lang === 'string') {
let path = null;
if (isURL(langPath) || langPath.startsWith('moz-extension://') || langPath.startsWith('chrome-extension://') || langPath.startsWith('file://')) { /** When langPath is an URL */
path = langPath;
}
if (path !== null) {
const fetchUrl = `${path}/${lang}.traineddata${gzip ? '.gz' : ''}`;
// In a web worker the global fetch is used; elsewhere the adapter supplies one.
const resp = await (isWebWorker ? fetch : adapter.fetch)(fetchUrl);
if (!resp.ok) {
throw Error(`Network error while fetching ${fetchUrl}. Response code: ${resp.status}`);
}
data = await resp.arrayBuffer();
} else {
// langPath is not a URL: read the file through the adapter's cache reader.
data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`);
}
} else {
data = _lang.data; // eslint-disable-line
}
}
data = new Uint8Array(data);
// Decompress based on the detected content type rather than on the `gzip` option.
const type = fileType(data);
if (typeof type !== 'undefined' && type.mime === 'application/gzip') {
data = adapter.gunzip(data);
}
if (TessModule) {
if (dataPath) {
try {
TessModule.FS.mkdir(dataPath);
} catch (err) {
// NOTE(review): this rejects the job but execution continues with the
// write below; presumably mkdir also throws when the directory already
// exists (e.g. for a second language) — confirm the intended behaviour.
res.reject(err.toString());
}
}
TessModule.FS.writeFile(`${dataPath || '.'}/${lang}.traineddata`, data);
}
// The cache is written when cacheMethod is 'write', 'refresh' or not set at all.
if (['write', 'refresh', undefined].includes(cacheMethod)) {
await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data);
}
return Promise.resolve(data);
};
res.progress({ workerId, status: 'loading language traineddata', progress: 0 });
try {
// All languages are loaded concurrently.
await Promise.all((typeof langs === 'string' ? langs.split('+') : langs).map(loadAndGunzipFile));
res.progress({ workerId, status: 'loaded language traineddata', progress: 1 });
res.resolve(langs);
} catch (err) {
res.reject(err.toString());
}
};
/**
 * Apply parameters to the engine and remember them.
 *
 * Keys starting with `tessjs_` are kept only in the worker's own parameter
 * set; every other key is also passed to the engine through SetVariable.
 *
 * @param {object} packet - carries `payload.params`
 * @param {object} [res] - job responder; when given, it is resolved with the merged parameters
 */
const setParameters = ({ payload: { params: _params } }, res) => {
  for (const key of Object.keys(_params)) {
    if (!key.startsWith('tessjs_')) {
      api.SetVariable(key, _params[key]);
    }
  }
  params = { ...params, ..._params };

  if (typeof res === 'undefined') {
    return;
  }
  res.resolve(params);
};
/**
 * (Re)create the engine API instance and initialise it for the given
 * languages, then apply the default parameters.
 *
 * `langs` may be a '+'-separated string or an array whose entries are
 * language codes or objects with a `code`; the engine always receives the
 * codes joined with '+', matching the `<code>.traineddata` files written
 * when the languages were loaded.
 *
 * @param {object} packet - carries `workerId`, `payload.langs` and `payload.oem`
 * @param {object} res - job responder
 */
const initialize = ({
  workerId,
  payload: { langs: _langs, oem },
}, res) => {
  const langs = (typeof _langs === 'string')
    ? _langs
    : _langs.map((l) => ((typeof l === 'string') ? l : l.code)).join('+');

  try {
    res.progress({
      workerId, status: 'initializing api', progress: 0,
    });
    // Release a previous instance before creating a new one.
    if (api !== null) {
      api.End();
    }
    api = new TessModule.TessBaseAPI();
    const status = api.Init(null, langs, oem);
    if (status === -1) {
      // Stop here so a failed initialisation is not reported as success afterwards.
      res.reject('initialization failed');
      return;
    }
    params = defaultParams;
    setParameters({ payload: { params } });
    res.progress({
      workerId, status: 'initialized api', progress: 1,
    });
    res.resolve();
  } catch (err) {
    res.reject(err.toString());
  }
};
/**
 * Run recognition on an image and resolve the job with the dumped result.
 *
 * @param {object} packet - carries `payload.image` and `payload.options.rectangle`
 * @param {object} res - job responder; rejected with the error text on failure
 */
const recognize = ({ payload: { image, options: { rectangle: rec } } }, res) => {
  try {
    const imagePtr = setImage(TessModule, api, image);
    // Restrict recognition to a region when one was supplied.
    if (typeof rec === 'object') {
      const {
        left, top, width, height,
      } = rec;
      api.SetRectangle(left, top, width, height);
    }
    api.Recognize(null);
    const result = dump(TessModule, api, params);
    res.resolve(result);
    TessModule._free(imagePtr);
  } catch (err) {
    res.reject(err.toString());
  }
};
/**
 * Render the current recognition state to a PDF and resolve the job with
 * the content of the generated `/tesseract-ocr.pdf` file.
 *
 * @param {object} packet - carries `payload.title` and `payload.textonly`
 * @param {object} res - job responder
 */
const getPDF = ({ payload: { title, textonly } }, res) => {
  const renderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
  renderer.BeginDocument(title);
  renderer.AddImage(api);
  renderer.EndDocument();
  TessModule._free(renderer);

  const pdf = TessModule.FS.readFile('/tesseract-ocr.pdf');
  res.resolve(pdf);
};
/**
 * Detect script and orientation of an image.
 *
 * Resolves with the best result's script and orientation data; rejects with
 * 'Failed to detect OS' when detection fails, or with the error text when
 * an exception is raised.
 *
 * @param {object} packet - carries `payload.image`
 * @param {object} res - job responder
 */
const detect = ({ payload: { image } }, res) => {
  try {
    const imagePtr = setImage(TessModule, api, image);
    const osResults = new TessModule.OSResults();

    if (!api.DetectOS(osResults)) {
      api.End();
      TessModule._free(imagePtr);
      res.reject('Failed to detect OS');
      return;
    }

    const best = osResults.best_result;
    const oid = best.orientation_id;
    const sid = best.script_id;

    TessModule._free(imagePtr);

    // The orientation id indexes this table of rotation angles.
    const degreesById = [0, 270, 180, 90];
    res.resolve({
      tesseract_script_id: sid,
      script: osResults.unicharset.get_script_from_script_id(sid),
      script_confidence: best.sconfidence,
      orientation_degrees: degreesById[oid],
      orientation_confidence: best.oconfidence,
    });
  } catch (err) {
    res.reject(err.toString());
  }
};
/**
 * End the engine API instance, if one exists, and resolve the job with
 * `{ terminated: true }`.
 *
 * @param {object} _ - unused packet
 * @param {object} res - job responder; rejected with the error text on failure
 */
const terminate = (_, res) => {
  const hasApi = api !== null;
  try {
    if (hasApi) {
      api.End();
    }
    res.resolve({ terminated: true });
  } catch (err) {
    res.reject(err.toString());
  }
};
/**
* dispatchHandlers
*
* @name dispatchHandlers
* @function worker data handler
* @access public
* @param {object} data
* @param {string} data.jobId - unique job id
* @param {string} data.action - action of the job, only recognize and detect for now
* @param {object} data.payload - data for the job
* @param {function} send - trigger job to work
*/
exports.dispatchHandlers = (packet, send) => {
const res = (status, data) => {
send({
...packet,
status,
data,
});
};
res.resolve = res.bind(this, 'resolve');
res.reject = res.bind(this, 'reject');
res.progress = res.bind(this, 'progress');
latestJob = res;
try {
({
load,
FS,
loadLanguage,
initialize,
setParameters,
recognize,
getPDF,
detect,
terminate,
})[packet.action](packet, res);
} catch (err) {
/** Prepare exception to travel through postMessage */
res.reject(err.toString());
}
};
/**
* setAdapter
*
* @name setAdapter
* @function
* @access public
* @param {object} adapter - implementation of the worker, different in browser and node environment
*/
exports.setAdapter = (_adapter) => {
adapter = _adapter;
};

16
src/worker-script/node/cache.js

@ -1,16 +0,0 @@ @@ -1,16 +0,0 @@
const util = require('util');
const fs = require('fs');
module.exports = {
readCache: util.promisify(fs.readFile),
writeCache: util.promisify(fs.writeFile),
deleteCache: (path) => (
util.promisify(fs.unlink)(path)
.catch(() => {})
),
checkCache: (path) => (
util.promisify(fs.access)(path, fs.F_OK)
.then((err) => (err === null))
.catch(() => false)
),
};

20
src/worker-script/node/getCore.js

@ -1,20 +0,0 @@ @@ -1,20 +0,0 @@
const { simd } = require('wasm-feature-detect');
let TesseractCore = null;
/*
* getCore is a sync function to load and return
* TesseractCore.
*/
module.exports = async (_, res) => {
if (TesseractCore === null) {
const simdSupport = await simd();
res.progress({ status: 'loading tesseract core', progress: 0 });
if (simdSupport) {
TesseractCore = require('tesseract.js-core/tesseract-core-simd');
} else {
TesseractCore = require('tesseract.js-core/tesseract-core');
}
res.progress({ status: 'loaded tesseract core', progress: 1 });
}
return TesseractCore;
};

1
src/worker-script/node/gunzip.js

@ -1 +0,0 @@ @@ -1 +0,0 @@
module.exports = require('zlib').gunzipSync;

30
src/worker-script/node/index.js

@ -1,30 +0,0 @@ @@ -1,30 +0,0 @@
/**
*
* Tesseract Worker Script for Node
*
* @fileoverview Node worker implementation
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const fetch = require('node-fetch');
const { parentPort } = require('worker_threads');
const worker = require('..');
const getCore = require('./getCore');
const gunzip = require('./gunzip');
const cache = require('./cache');
/*
 * Register the message handler: packets from the parent thread are
 * dispatched to the shared worker handlers, and replies are posted back
 * through the same port.
 */
parentPort.on('message', (packet) => {
worker.dispatchHandlers(packet, (obj) => parentPort.postMessage(obj));
});
// Install the node implementations (core loader, gunzip, fetch and the cache helpers).
worker.setAdapter({
getCore,
gunzip,
fetch,
...cache,
});

201
src/worker-script/utils/dump.js

@ -1,201 +0,0 @@ @@ -1,201 +0,0 @@
/**
*
* Dump data to a big JSON tree
*
* @fileoverview dump data to JSON tree
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
/**
 * deindent
 *
 * The generated HOCR is excessively indented, so we get rid of that
 * indentation: when the first line starts with two spaces, two leading
 * spaces are removed from every line that has them. Otherwise the text is
 * returned unchanged.
 *
 * @name deindent
 * @function deindent string
 * @access public
 * @param {string} html - text to de-indent
 * @returns {string} the de-indented text
 */
const deindent = (html) => {
  const INDENT = '  ';
  const lines = html.split('\n');
  if (!lines[0].startsWith(INDENT)) {
    return html;
  }
  return lines
    .map((line) => (line.startsWith(INDENT) ? line.slice(INDENT.length) : line))
    .join('\n');
};
/**
 * dump
 *
 * Walks the engine's result iterator symbol by symbol and builds a plain
 * tree of blocks -> paragraphs -> lines -> words -> symbols, then returns
 * it together with the page-level text, confidence, mode names, version and
 * the optional renderings selected by the tessjs_create_* parameters.
 *
 * @name dump
 * @function dump recognition result to a JSON object
 * @access public
 */
module.exports = (TessModule, api, {
tessjs_create_hocr,
tessjs_create_tsv,
tessjs_create_box,
tessjs_create_unlv,
tessjs_create_osd,
}) => {
const ri = api.GetIterator();
const {
RIL_BLOCK,
RIL_PARA,
RIL_TEXTLINE,
RIL_WORD,
RIL_SYMBOL,
} = TessModule;
const blocks = [];
// Most recently opened element at each level; children attach to these.
let block;
let para;
let textline;
let word;
let symbol;
// Maps a numeric constant back to its name on TessModule, without the `<prefix>_` part.
const enumToString = (value, prefix) => (
Object.keys(TessModule)
.filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
.map((e) => e.slice(prefix.length + 1))[0]
);
ri.Begin();
do {
// A symbol can open several levels at once; they are checked outermost
// first so that each parent exists before its child is attached.
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
const poly = ri.BlockPolygon();
let polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if (TessModule.getPointer(poly) > 0) {
const n = poly.get_n();
const px = poly.get_x();
const py = poly.get_y();
polygon = [];
for (let i = 0; i < n; i += 1) {
polygon.push([px.getValue(i), py.getValue(i)]);
}
/*
* TODO: find out why _ptaDestroy doesn't work
*/
// TessModule._ptaDestroy(TessModule.getPointer(poly));
}
block = {
paragraphs: [],
text: ri.GetUTF8Text(RIL_BLOCK),
confidence: ri.Confidence(RIL_BLOCK),
baseline: ri.getBaseline(RIL_BLOCK),
bbox: ri.getBoundingBox(RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon,
};
blocks.push(block);
}
if (ri.IsAtBeginningOf(RIL_PARA)) {
para = {
lines: [],
text: ri.GetUTF8Text(RIL_PARA),
confidence: ri.Confidence(RIL_PARA),
baseline: ri.getBaseline(RIL_PARA),
bbox: ri.getBoundingBox(RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr(),
};
block.paragraphs.push(para);
}
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
textline = {
words: [],
text: ri.GetUTF8Text(RIL_TEXTLINE),
confidence: ri.Confidence(RIL_TEXTLINE),
baseline: ri.getBaseline(RIL_TEXTLINE),
bbox: ri.getBoundingBox(RIL_TEXTLINE),
};
para.lines.push(textline);
}
if (ri.IsAtBeginningOf(RIL_WORD)) {
const fontInfo = ri.getWordFontAttributes();
const wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
text: ri.GetUTF8Text(RIL_WORD),
confidence: ri.Confidence(RIL_WORD),
baseline: ri.getBaseline(RIL_WORD),
bbox: ri.getBoundingBox(RIL_WORD),
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),
is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
};
// Collect the alternative readings of this word.
const wc = new TessModule.WordChoiceIterator(ri);
do {
word.choices.push({
text: wc.GetUTF8Text(),
confidence: wc.Confidence(),
});
} while (wc.Next());
TessModule.destroy(wc);
textline.words.push(word);
}
// let image = null;
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// TessModule._pixDestroy(TessModule.getPointer(pix));
if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
symbol = {
choices: [],
image: null,
text: ri.GetUTF8Text(RIL_SYMBOL),
confidence: ri.Confidence(RIL_SYMBOL),
baseline: ri.getBaseline(RIL_SYMBOL),
bbox: ri.getBoundingBox(RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
};
word.symbols.push(symbol);
// Collect the alternative readings of this symbol.
const ci = new TessModule.ChoiceIterator(ri);
do {
symbol.choices.push({
text: ci.GetUTF8Text(),
confidence: ci.Confidence(),
});
} while (ci.Next());
// NOTE(review): unlike `wc` above, `ci` is never destroyed; the disabled
// call below also names `i` rather than `ci` — confirm whether destroying
// it is safe before enabling.
// TessModule.destroy(i);
}
} while (ri.Next(RIL_SYMBOL));
TessModule.destroy(ri);
return {
text: api.GetUTF8Text(),
// Each optional rendering is produced only when its flag is the string '1'.
hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null,
tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null,
box: tessjs_create_box === '1' ? api.GetBoxText() : null,
unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null,
osd: tessjs_create_osd === '1' ? api.GetOsdText() : null,
confidence: api.MeanTextConf(),
blocks,
psm: enumToString(api.GetPageSegMode(), 'PSM'),
oem: enumToString(api.oem(), 'OEM'),
version: api.Version(),
};
};

63
src/worker-script/utils/setImage.js

@ -1,63 +0,0 @@ @@ -1,63 +0,0 @@
const bmp = require('bmp-js');
const fileType = require('file-type');
/**
 * setImage
 *
 * Copies the image bytes into the core module's heap and hands the image to
 * the engine. BMP files are decoded with bmp-js and passed as raw pixels;
 * everything else is read through pixReadMem. Returns the pointer that was
 * passed to SetImage (the pix, or the raw pixel buffer for BMP).
 *
 * @name setImage
 * @function set image in tesseract for recognition
 * @access public
 */
module.exports = (TessModule, api, image) => {
// `image` arrives as an index-keyed object; rebuild a Buffer from its values.
const buf = Buffer.from(Array.from({ ...image, length: Object.keys(image).length }));
const type = fileType(buf);
let bytesPerPixel = 0;
let data = null;
let pix = null;
let w = 0;
let h = 0;
// Looks for a byte pattern in the first 500 bytes and takes the byte that
// follows it, defaulting to 1.
// NOTE(review): presumably this is the EXIF orientation tag — confirm.
const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1;
/*
* Leptonica supports uncompressed but not compressed bmp files
* @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516
* We therefore use bmp-js to process all bmp files
*/
if (type && type.mime === 'image/bmp') {
const bmpBuf = bmp.decode(buf);
data = TessModule._malloc(bmpBuf.data.length * Uint8Array.BYTES_PER_ELEMENT);
TessModule.HEAPU8.set(bmpBuf.data, data);
w = bmpBuf.width;
h = bmpBuf.height;
bytesPerPixel = 4;
} else {
// NOTE(review): `ptr` is neither freed nor returned from this function —
// confirm whether the copy it points to must outlive pixReadMem.
const ptr = TessModule._malloc(buf.length * Uint8Array.BYTES_PER_ELEMENT);
TessModule.HEAPU8.set(buf, ptr);
pix = TessModule._pixReadMem(ptr, buf.length);
// The i32 at byte offset 7 * 4 of the pix is treated as its y resolution.
if (TessModule.getValue(pix + (7 * 4), 'i32') === 0) {
/*
* Set a yres default value to prevent warning from tesseract
* See kMinCredibleResolution in tesseract/src/ccstruct/publictypes.h
*/
TessModule.setValue(pix + (7 * 4), 300, 'i32');
}
// Width and height are read from the first two i32 fields of the pix.
[w, h] = Array(2).fill(0)
.map((v, idx) => (
TessModule.getValue(pix + (idx * 4), 'i32')
));
}
/*
* As some image formats (ex. bmp) are not supported natively by tesseract,
* sometimes it will not return pix directly, but data and bytesPerPixel
* for another SetImage usage.
*
*/
if (data === null) {
api.SetImage(pix, undefined, undefined, undefined, undefined, exif);
} else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif);
}
return data === null ? pix : data;
};

18
src/worker/browser/defaultOptions.js

@ -1,18 +0,0 @@ @@ -1,18 +0,0 @@
const resolveURL = require('resolve-url');
const { version } = require('../../../package.json');
const defaultOptions = require('../../constants/defaultOptions');
/*
 * Default options for browser worker
 */
module.exports = {
...defaultOptions,
// In development (TESS_ENV === 'development') the locally built worker is
// used, with a random query string appended to its URL; otherwise the
// minified worker of this package version is loaded from unpkg.
workerPath: (typeof process !== 'undefined' && process.env.TESS_ENV === 'development')
? resolveURL(`/dist/worker.dev.js?nocache=${Math.random().toString(36).slice(3)}`)
: `https://unpkg.com/tesseract.js@v${version}/dist/worker.min.js`,
/*
 * No core path by default; it is left for the worker to decide which core
 * build to load.
 */
corePath: null,
};

24
src/worker/browser/index.js

@ -1,24 +0,0 @@ @@ -1,24 +0,0 @@
/**
*
* Tesseract Worker adapter for browser
*
* @fileoverview Tesseract Worker adapter for browser
* @author Kevin Kwok <antimatter15@gmail.com>
* @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com>
*/
const defaultOptions = require('./defaultOptions');
const spawnWorker = require('./spawnWorker');
const terminateWorker = require('./terminateWorker');
const onMessage = require('./onMessage');
const send = require('./send');
const loadImage = require('./loadImage');
// The browser adapter: the default options plus the functions that spawn
// and terminate a worker, exchange messages with it and load images.
module.exports = {
defaultOptions,
spawnWorker,
terminateWorker,
onMessage,
send,
loadImage,
};

68
src/worker/browser/loadImage.js

@ -1,68 +0,0 @@ @@ -1,68 +0,0 @@
const resolveURL = require('resolve-url');
/**
 * readFromBlobOrFile
 *
 * Read a Blob or File with a FileReader and resolve with the resulting
 * ArrayBuffer; rejects with an Error carrying the reader's error code when
 * the read fails.
 *
 * @name readFromBlobOrFile
 * @function
 * @access private
 */
const readFromBlobOrFile = (blob) => {
  const startRead = (resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => {
      resolve(reader.result);
    };
    reader.onerror = (event) => {
      const { target: { error: { code } } } = event;
      reject(Error(`File could not be read! Code=${code}`));
    };
    reader.readAsArrayBuffer(blob);
  };
  return new Promise(startRead);
};
/**
 * loadImage
 *
 * Load an image from one of the supported sources and return its bytes:
 * - a base64 `data:image/...` string is decoded in place;
 * - any other string is resolved as a URL and fetched;
 * - an IMG element is loaded from its `src`, a VIDEO element from its
 *   `poster`, and a CANVAS element through `toBlob`;
 * - a File or Blob is read directly.
 * Any other value is passed to the Uint8Array constructor as-is.
 *
 * @name loadImage
 * @function load image from different source
 * @access private
 * @param {*} image - image source
 * @returns {Promise<Uint8Array|string>} the image bytes, or the string
 *   'undefined' when no image was given
 */
const loadImage = async (image) => {
  let data = image;
  if (typeof image === 'undefined') {
    return 'undefined';
  }

  if (typeof image === 'string') {
    // Base64 Image
    if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
      data = atob(image.split(',')[1])
        .split('')
        .map((c) => c.charCodeAt(0));
    } else {
      const resp = await fetch(resolveURL(image));
      data = await resp.arrayBuffer();
    }
  } else if (image instanceof HTMLElement) {
    if (image.tagName === 'IMG') {
      data = await loadImage(image.src);
    }
    if (image.tagName === 'VIDEO') {
      data = await loadImage(image.poster);
    }
    if (image.tagName === 'CANVAS') {
      // Forward both outcomes of the read: a failure inside the toBlob
      // callback would otherwise be lost and this promise would never settle.
      data = await new Promise((resolve, reject) => {
        image.toBlob((blob) => {
          readFromBlobOrFile(blob).then(resolve, reject);
        });
      });
    }
  } else if (image instanceof File || image instanceof Blob) {
    data = await readFromBlobOrFile(image);
  }

  return new Uint8Array(data);
};
module.exports = loadImage;

5
src/worker/browser/onMessage.js

@ -1,5 +0,0 @@ @@ -1,5 +0,0 @@
module.exports = (worker, handler) => {
worker.onmessage = ({ data }) => { // eslint-disable-line
handler(data);
};
};

10
src/worker/browser/send.js

@ -1,10 +0,0 @@ @@ -1,10 +0,0 @@
/**
* send
*
* @name send
* @function send packet to worker and create a job
* @access public
*/
module.exports = async (worker, packet) => {
worker.postMessage(packet);
};

20
src/worker/browser/spawnWorker.js

@ -1,20 +0,0 @@ @@ -1,20 +0,0 @@
/**
* spawnWorker
*
* @name spawnWorker
* @function create a new Worker in browser
* @access public
*/
module.exports = ({ workerPath, workerBlobURL }) => {
let worker;
if (Blob && URL && workerBlobURL) {
const blob = new Blob([`importScripts("${workerPath}");`], {
type: 'application/javascript',
});
worker = new Worker(URL.createObjectURL(blob));
} else {
worker = new Worker(workerPath);
}
return worker;
};

10
src/worker/browser/terminateWorker.js

@ -1,10 +0,0 @@ @@ -1,10 +0,0 @@
/**
* terminateWorker
*
* @name terminateWorker
* @function terminate worker
* @access public
*/
module.exports = (worker) => {
worker.terminate();
};

10
src/worker/node/defaultOptions.js

@ -1,10 +0,0 @@ @@ -1,10 +0,0 @@
const path = require('path');
const defaultOptions = require('../../constants/defaultOptions');
/*
* Default options for node worker
*/
module.exports = {
...defaultOptions,
workerPath: path.join(__dirname, '..', '..', 'worker-script', 'node', 'index.js'),
};

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save