Compare commits

..

No commits in common. 'master' and 'develop' have entirely different histories.

  1. 6
      .eslintrc
  2. 4
      .github/FUNDING.yml
  3. 5
      .github/SECURITY.md
  4. 71
      .github/workflows/codeql-analysis.yml
  5. 29
      .github/workflows/node.js.yml
  6. 2
      .gitpod.Dockerfile
  7. 2
      .gitpod.yml
  8. 7
      .travis.yml
  9. 80
      README.md
  10. 133
      docs/api.md
  11. 106
      docs/examples.md
  12. 4
      docs/faq.md
  13. 25
      docs/image-format.md
  14. BIN
      docs/images/tesseract.png
  15. BIN
      docs/images/video-demo.gif
  16. 12
      docs/local-installation.md
  17. 2
      docs/tesseract_lang_list.md
  18. 37
      examples/browser/basic-edge.html
  19. 33
      examples/browser/benchmark.html
  20. 3
      examples/browser/demo.html
  21. BIN
      examples/data/meditations.jpg
  22. BIN
      examples/data/testocr.png
  23. BIN
      examples/data/tyger.jpg
  24. 27
      examples/node/benchmark.js
  25. 17
      examples/node/recognize.js
  26. 22452
      package-lock.json
  27. 59
      package.json
  28. 13
      scripts/rollup.esm.js
  29. 2
      scripts/server.js
  30. 5
      scripts/webpack.config.common.js
  31. 9
      scripts/webpack.config.dev.js
  32. 6
      scripts/webpack.config.prod.js
  33. 1
      src/constants/PSM.js
  34. 218
      src/constants/languages.js
  35. 51
      src/createWorker.js
  36. 37
      src/index.d.ts
  37. 2
      src/index.js
  38. 17
      src/utils/getEnvironment.js
  39. 2
      src/utils/resolvePaths.js
  40. 4
      src/worker-script/browser/cache.js
  41. 27
      src/worker-script/browser/getCore.js
  42. 7
      src/worker-script/browser/index.js
  43. 1
      src/worker-script/browser/resolveURL.js
  44. 2
      src/worker-script/constants/defaultParams.js
  45. 60
      src/worker-script/index.js
  46. 6
      src/worker-script/node/cache.js
  47. 11
      src/worker-script/node/getCore.js
  48. 11
      src/worker-script/node/index.js
  49. 1
      src/worker-script/node/resolveURL.js
  50. 4
      src/worker-script/utils/dump.js
  51. 12
      src/worker-script/utils/setImage.js
  52. 4
      src/worker/browser/defaultOptions.js
  53. 9
      src/worker/browser/loadImage.js
  54. 8
      src/worker/node/loadImage.js
  55. 4
      src/worker/node/send.js
  56. 6
      src/worker/node/spawnWorker.js
  57. 2
      src/worker/node/terminateWorker.js
  58. 18
      tests/FS.test.html
  59. 37
      tests/FS.test.js
  60. BIN
      tests/assets/images/simple.gif
  61. BIN
      tests/assets/images/simple.webp
  62. 5
      tests/constants.js
  63. 8
      tests/recognize.test.js

6
.eslintrc

@ -1,6 +1,5 @@
{ {
"extends": "airbnb-base", "extends": "airbnb",
"parser": "babel-eslint",
"env": { "env": {
"browser": true, "browser": true,
"node": true, "node": true,
@ -11,7 +10,6 @@
"no-underscore-dangle": 0, "no-underscore-dangle": 0,
"no-console": 0, "no-console": 0,
"global-require": 0, "global-require": 0,
"camelcase": 0, "camelcase": 0
"no-control-regex": 0
} }
} }

4
.github/FUNDING.yml

@ -4,6 +4,6 @@ github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, u
patreon: # Replace with a single Patreon username patreon: # Replace with a single Patreon username
open_collective: tesseractjs open_collective: tesseractjs
ko_fi: # Replace with a single Ko-fi username ko_fi: # Replace with a single Ko-fi username
tidelift: npm/tesseract.js tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
custom: ["https://etherscan.io/address/0x74ace8c74535d6dac03ebdc708ca2fba54796ef2"] custom: # Replace with a single custom sponsorship URL

5
.github/SECURITY.md

@ -1,5 +0,0 @@
## Security contact information
To report a security vulnerability, please use the
[Tidelift security contact](https://tidelift.com/security).
Tidelift will coordinate the fix and disclosure.

71
.github/workflows/codeql-analysis.yml

@ -1,71 +0,0 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
name: "CodeQL"
on:
push:
branches: [master]
pull_request:
# The branches below must be a subset of the branches above
branches: [master]
schedule:
- cron: '0 17 * * 6'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
# Override automatic language detection by changing the below list
# Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python']
language: ['javascript']
# Learn more...
# https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection
steps:
- name: Checkout repository
uses: actions/checkout@v2
with:
# We must fetch at least the immediate parents so that if this is
# a pull request then we can checkout the head.
fetch-depth: 2
# If this run was triggered by a pull request event, then checkout
# the head of the pull request instead of the merge commit.
- run: git checkout HEAD^2
if: ${{ github.event_name == 'pull_request' }}
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v1
# ℹ Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
# ✏ If the Autobuild fails above, remove it and uncomment the following three lines
# and modify them (or add more) to build your code if your project
# uses a compiled language
#- run: |
# make bootstrap
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1

29
.github/workflows/node.js.yml

@ -1,29 +0,0 @@
# This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions
name: Node.js CI
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
node-version: [14.x, 16.x]
steps:
- uses: actions/checkout@v2
- name: Use Node.js ${{ matrix.node-version }}
uses: actions/setup-node@v1
with:
node-version: ${{ matrix.node-version }}
- run: npm ci
- run: npm run lint
- run: npm test

2
.gitpod.Dockerfile

@ -1,2 +0,0 @@
FROM gitpod/workspace-full
RUN sudo apt-get update && sudo apt-get install -y libgtk-3-0 libx11-xcb1 libnss3 libxss1 libasound2

2
.gitpod.yml

@ -1,5 +1,3 @@
image:
file: .gitpod.Dockerfile
tasks: tasks:
- command: gp await-port 3000 && sleep 3 && gp preview $(gp url 3000)/examples/browser/demo.html - command: gp await-port 3000 && sleep 3 && gp preview $(gp url 3000)/examples/browser/demo.html
- init: npm install - init: npm install

7
.travis.yml

@ -0,0 +1,7 @@
language: node_js
node_js:
- "lts/*" # Use LTS version
script:
- npm run lint
- npm test

80
README.md

@ -1,10 +1,8 @@
<p align="center"> <p align="center">
<a href="https://tesseract.projectnaptha.com/"><img width="256px" height="256px" alt="Tesseract.js" src="./docs/images/tesseract.png"></a> <a href="https://tesseract.projectnaptha.com/"><img alt="Tesseract.js" src="https://tesseract.projectnaptha.com/img/logo_small.png"></a>
</p> </p>
![Lint & Test](https://github.com/naptha/tesseract.js/workflows/Node.js%20CI/badge.svg) [![Build Status](https://travis-ci.org/naptha/tesseract.js.svg?branch=master)](https://travis-ci.org/naptha/tesseract.js)
![CodeQL](https://github.com/naptha/tesseract.js/workflows/CodeQL/badge.svg)
[![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://github.com/naptha/tesseract.js)
[![Financial Contributors on Open Collective](https://opencollective.com/tesseractjs/all/badge.svg?label=financial+contributors)](https://opencollective.com/tesseractjs) [![npm version](https://badge.fury.io/js/tesseract.js.svg)](https://badge.fury.io/js/tesseract.js) [![Financial Contributors on Open Collective](https://opencollective.com/tesseractjs/all/badge.svg?label=financial+contributors)](https://opencollective.com/tesseractjs) [![npm version](https://badge.fury.io/js/tesseract.js.svg)](https://badge.fury.io/js/tesseract.js)
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/naptha/tesseract.js/graphs/commit-activity) [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/naptha/tesseract.js/graphs/commit-activity)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
@ -12,18 +10,16 @@
[![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js) [![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
[![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js) [![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/)) <h3 align="center">
Version 2 beta is now available and under development in the master branch<br>
Image Recognition Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
</h3>
[![fancy demo gif](./docs/images/demo.gif)](http://tesseract.projectnaptha.com)
Video Real-time Recognition <br>
<p align="center"> Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/))
<a href="https://github.com/jeromewu/tesseract.js-video"><img alt="Tesseract.js Video" src="./docs/images/video-demo.gif"></a>
</p>
[![fancy demo gif](./docs/images/demo.gif)](http://tesseract.projectnaptha.com)
Tesseract.js wraps an [emscripten](https://github.com/kripken/emscripten) [port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) [OCR](https://en.wikipedia.org/wiki/Optical_character_recognition) Engine. Tesseract.js wraps an [emscripten](https://github.com/kripken/emscripten) [port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) [OCR](https://en.wikipedia.org/wiki/Optical_character_recognition) Engine.
It works in the browser using [webpack](https://webpack.js.org/) or plain script tags with a [CDN](#CDN) and on the server with [Node.js](https://nodejs.org/en/). It works in the browser using [webpack](https://webpack.js.org/) or plain script tags with a [CDN](#CDN) and on the server with [Node.js](https://nodejs.org/en/).
@ -56,32 +52,21 @@ const worker = createWorker({
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text); console.log(text);
await worker.terminate(); await woker.terminate();
})(); })();
``` ```
[Check out the docs](#documentation) for a full explanation of the API. [Check out the docs](#docs) for a full explanation of the API.
## Major changes in v3
- Significantly faster performance ## Major changes in v2 beta
- Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data) - Upgrade to tesseract v4.1 (using emscripten 1.38.45)
- Upgrade to Tesseract v5.1.0 (using emscripten 3.1.18) - Support multiple languages at the same time, eg: eng+chi_tra for English and Traditional Chinese
- Added SIMD-enabled build for supported devices
- Added support:
- Node.js version 18
- Removed support:
- ASM.js version, any other old versions of Tesseract.js-core (<3.0.0)
- Node.js versions 10 and 12
## Major changes in v2
- Upgrade to tesseract v4.1.1 (using emscripten 1.39.10 upstream)
- Support multiple languages at the same time, eg: eng+chi\_tra for English and Traditional Chinese
- Supported image formats: png, jpg, bmp, pbm - Supported image formats: png, jpg, bmp, pbm
- Support WebAssembly (fallback to ASM.js when browser doesn't support) - Support WebAssembly (fallback to ASM.js when browser doesn't support)
- Support Typescript - Support Typescript
Read a story about v2: <a href="https://jeromewu.github.io/why-i-refactor-tesseract.js-v2/">Why I refactor tesseract.js v2?</a><br>
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
## Installation ## Installation
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`. Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`.
@ -89,7 +74,7 @@ Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via
### CDN ### CDN
```html ```html
<!-- v2 --> <!-- v2 -->
<script src='https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js'></script> <script src='https://unpkg.com/tesseract.js@v2.0.0-beta.1/dist/tesseract.min.js'></script>
<!-- v1 --> <!-- v1 -->
<script src='https://unpkg.com/tesseract.js@1.0.19/src/index.js'></script> <script src='https://unpkg.com/tesseract.js@1.0.19/src/index.js'></script>
@ -99,16 +84,16 @@ After including the script the `Tesseract` variable will be globally available.
### Node.js ### Node.js
**Tesseract.js v3 requires Node.js v14 or higher** **Tesseract.js currently requires Node.js v6.8.0 or higher**
```shell ```shell
# For v3 # For v2
npm install tesseract.js@next
yarn add tesseract.js@next
# For v1
npm install tesseract.js npm install tesseract.js
yarn add tesseract.js yarn add tesseract.js
# For v2
npm install tesseract.js@2
yarn add tesseract.js@2
``` ```
@ -120,19 +105,6 @@ yarn add tesseract.js@2
* [Local Installation](./docs/local-installation.md) * [Local Installation](./docs/local-installation.md)
* [FAQ](./docs/faq.md) * [FAQ](./docs/faq.md)
## Use tesseract.js the way you like!
- Offline Version: https://github.com/jeromewu/tesseract.js-offline
- Electron Version: https://github.com/jeromewu/tesseract.js-electron
- Custom Traineddata: https://github.com/jeromewu/tesseract.js-custom-traineddata
- Chrome Extension #1: https://github.com/jeromewu/tesseract.js-chrome-extension
- Chrome Extension #2: https://github.com/fxnoob/image-to-text
- Firefox Extension: https://github.com/gnonio/korporize
- With Vue: https://github.com/jeromewu/tesseract.js-vue-app
- With Angular: https://github.com/jeromewu/tesseract.js-angular-app
- With React: https://github.com/jeromewu/tesseract.js-react-app
- Typescript: https://github.com/jeromewu/tesseract.js-typescript
- Video Real-time Recognition: https://github.com/jeromewu/tesseract.js-video
## Contributing ## Contributing
@ -153,9 +125,7 @@ npm start
The development server will be available at http://localhost:3000/examples/browser/demo.html in your favorite browser. The development server will be available at http://localhost:3000/examples/browser/demo.html in your favorite browser.
It will automatically rebuild `tesseract.dev.js` and `worker.dev.js` when you change files in the **src** folder. It will automatically rebuild `tesseract.dev.js` and `worker.dev.js` when you change files in the **src** folder.
### Online Setup with a single Click You can also run the development server in Gitpod ( a free online IDE and dev environment for GitHub that will automate your dev setup ) with a single click.
You can use Gitpod(A free online VS Code like IDE) for contributing. With a single click it will launch a ready to code workspace with the build & start scripts already in process and within a few seconds it will spin up the dev server so that you can start contributing straight away without wasting any time.
[![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/naptha/tesseract.js/blob/master/examples/browser/demo.html) [![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/naptha/tesseract.js/blob/master/examples/browser/demo.html)

133
docs/api.md

@ -2,10 +2,6 @@
- [createWorker()](#create-worker) - [createWorker()](#create-worker)
- [Worker.load](#worker-load) - [Worker.load](#worker-load)
- [Worker.writeText](#worker-writeText)
- [Worker.readText](#worker-readText)
- [Worker.removeFile](#worker-removeFile)
- [Worker.FS](#worker-FS)
- [Worker.loadLanguage](#worker-load-language) - [Worker.loadLanguage](#worker-load-language)
- [Worker.initialize](#worker-initialize) - [Worker.initialize](#worker-initialize)
- [Worker.setParameters](#worker-set-parameters) - [Worker.setParameters](#worker-set-parameters)
@ -46,7 +42,6 @@ createWorker is a factory function that creates a tesseract worker, a worker is
- `workerBlobURL` a boolean to define whether to use Blob URL for worker script, default: true - `workerBlobURL` a boolean to define whether to use Blob URL for worker script, default: true
- `gzip` a boolean to define whether the traineddata from the remote is gzipped, default: true - `gzip` a boolean to define whether the traineddata from the remote is gzipped, default: true
- `logger` a function to log the progress, a quick example is `m => console.log(m)` - `logger` a function to log the progress, a quick example is `m => console.log(m)`
- `errorHandler` a function to handle worker errors, a quick example is `err => console.error(err)`
**Examples:** **Examples:**
@ -64,7 +59,6 @@ const worker = createWorker({
A Worker helps you to do the OCR related tasks, it takes few steps to setup Worker before it is fully functional. The full flow is: A Worker helps you to do the OCR related tasks, it takes few steps to setup Worker before it is fully functional. The full flow is:
- load - load
- FS functions // optional
- loadLanguage - loadLanguage
- initialize - initialize
- setParameters // optional - setParameters // optional
@ -99,84 +93,6 @@ Worker.load() loads tesseract.js-core scripts (download from remote if not prese
})(); })();
``` ```
<a name="worker-writeText"></a>
### Worker.writeText(path, text, jobId): Promise
Worker.writeText() writes a text file to the path specified in MEMFS; it is useful when you want to use features that require tesseract.js
to read a file from the file system.
**Arguments:**
- `path` text file path
- `text` content of the text file
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.writeText('tmp.txt', 'Hi\nTesseract.js\n');
})();
```
<a name="worker-readText"></a>
### Worker.readText(path, jobId): Promise
Worker.readText() reads a text file from the path specified in MEMFS; it is useful when you want to check the content.
**Arguments:**
- `path` text file path
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
const { data } = await worker.readText('tmp.txt');
console.log(data);
})();
```
<a name="worker-removeFile"></a>
### Worker.removeFile(path, jobId): Promise
Worker.removeFile() removes a file in MEMFS; it is useful when you want to free the memory.
**Arguments:**
- `path` file path
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.removeFile('tmp.txt');
})();
```
<a name="worker-FS"></a>
### Worker.FS(method, args, jobId): Promise
Worker.FS() is a generic FS function to do anything you want; you can check [HERE](https://emscripten.org/docs/api_reference/Filesystem-API.html) for all functions.
**Arguments:**
- `method` method name
- `args` array of arguments to pass
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.FS('writeFile', ['tmp.txt', 'Hi\nTesseract.js\n']);
// equal to:
// await worker.writeText('tmp.txt', 'Hi\nTesseract.js\n');
})();
```
<a name="worker-load-language"></a> <a name="worker-load-language"></a>
### Worker.loadLanguage(langs, jobId): Promise ### Worker.loadLanguage(langs, jobId): Promise
@ -227,18 +143,17 @@ Worker.setParameters() set parameters for Tesseract API (using SetVariable()), i
**Supported Parameters:** **Supported Parameters:**
| name | type | default value | description | | name | type | default value | description |
| --------------------------- | ------ | ----------------- | ------------------------------------------------------------------------------------------------------------------------------- | | ---- | ---- | ------------- | ----------- |
| tessedit\_ocr\_engine\_mode | enum | OEM.DEFAULT | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode | | tessedit\_ocr\_engine\_mode | enum | OEM.LSTM\_ONLY | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode | | tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode |
| tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contains these characters, useful the content in image is limited | | tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contains these characters, useful the content in image is limited |
| preserve\_interword\_spaces | string | '0' | '0' or '1', keeps the space between words | | preserve\_interword\_spaces | string | '0' | '0' or '1', keeps the space between words |
| user\_defined\_dpi | string | '' | Define custom dpi, use to fix **Warning: Invalid resolution 0 dpi. Using 70 instead.** | | tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result | | tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result |
| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | | tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |
| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result | | tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result |
| tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result | | tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
| tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
**Examples:** **Examples:**
@ -262,8 +177,8 @@ Figures out what words are in `image`, where the words are in `image`, etc.
**Arguments:** **Arguments:**
- `image` see [Image Format](./image-format.md) for more details. - `image` see [Image Format](./image-format.md) for more details.
- `options` a object of customized options - `options` a object of customized optons
- `rectangle` an object to specify the regions you want to recognize in the image, should contain top, left, width and height, see example below. - `rectangles` an array of objects to specify the region you want to recognize in the image, the object should contain top, left, width and height, see example below.
- `jobId` Please see details above - `jobId` Please see details above
**Output:** **Output:**
@ -282,7 +197,7 @@ const { createWorker } = Tesseract;
})(); })();
``` ```
With rectangle With rectangles
```javascript ```javascript
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
@ -292,7 +207,9 @@ const { createWorker } = Tesseract;
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image, { const { data: { text } } = await worker.recognize(image, {
rectangle: { top: 0, left: 0, width: 100, height: 100 }, rectangles: [
{ top: 0, left: 0, width: 100, height: 100 },
],
}); });
console.log(text); console.log(text);
})(); })();
@ -325,7 +242,11 @@ const { createWorker } = Tesseract;
<a name="worker-terminate"></a> <a name="worker-terminate"></a>
### Worker.terminate(jobId): Promise ### Worker.terminate(jobId): Promise
Worker.terminate() terminates the worker and cleans up Worker.terminate() terminates the worker and clean up
**Arguments:**
- `jobId` Please see details above
```javascript ```javascript
(async () => { (async () => {
@ -336,7 +257,7 @@ Worker.terminate() terminates the worker and cleans up
<a name="create-scheduler"></a> <a name="create-scheduler"></a>
## createScheduler(): Scheduler ## createScheduler(): Scheduler
createScheduler() is a factory function to create a scheduler, a scheduler manages a job queue and workers to enable multiple workers to work together, it is useful when you want to speed up your performance. createScheduler() is a factory function to create a scheduler, a scheduler manage a job queue and workers to enable multiple workers to work together, it is useful when you want to speed up your performance.
**Examples:** **Examples:**
@ -350,7 +271,7 @@ const scheduler = createScheduler();
<a name="scheduler-add-worker"></a> <a name="scheduler-add-worker"></a>
### Scheduler.addWorker(worker): string ### Scheduler.addWorker(worker): string
Scheduler.addWorker() adds a worker into the worker pool inside scheduler, it is suggested to add one worker to only one scheduler. Scheduler.addWorker() adds a worker into the worker pool inside scheduler, it is suggested to add one worker to only one sheduler.
**Arguments:** **Arguments:**
@ -433,16 +354,16 @@ See [Tesseract.js](../src/Tesseract.js)
<a name="detect"></a> <a name="detect"></a>
## detect(image, options): Promise ## detect(image, options): Promise
Same background as recognize(), but it does detect instead. Same background as recongize(), but it does detect instead.
See [Tesseract.js](../src/Tesseract.js) See [Tesseract.js](../src/Tesseract.js)
<a name="psm"></a> <a name="psm"></a>
## PSM ## PSM
See [PSM.js](../src/constants/PSM.js) See [PSM.js](../src/constatns/PSM.js)
<a name="oem"></a> <a name="oem"></a>
## OEM ## OEM
See [OEM.js](../src/constants/OEM.js) See [OEM.js](../src/constatns/OEM.js)

106
docs/examples.md

@ -2,10 +2,17 @@
You can also check [examples](../examples) folder. You can also check [examples](../examples) folder.
Example repositories:
- Offline version: https://github.com/jeromewu/tesseract.js-offline
- With Vue: https://github.com/jeromewu/tesseract.js-vue-app
- With Angular: https://github.com/jeromewu/tesseract.js-angular-app
- Chrome Extension: https://github.com/jeromewu/tesseract.js-chrome-extension
### basic ### basic
```javascript ```javascript
const { createWorker } = require('tesseract.js'); import { createWorker } from 'tesseract.js';
const worker = createWorker(); const worker = createWorker();
@ -22,7 +29,7 @@ const worker = createWorker();
### with detailed progress ### with detailed progress
```javascript ```javascript
const { createWorker } = require('tesseract.js'); import { createWorker } from 'tesseract.js';
const worker = createWorker({ const worker = createWorker({
logger: m => console.log(m), // Add logger here logger: m => console.log(m), // Add logger here
@ -41,7 +48,7 @@ const worker = createWorker({
### with multiple languages, separate by '+' ### with multiple languages, separate by '+'
```javascript ```javascript
const { createWorker } = require('tesseract.js'); import { createWorker } from 'tesseract.js';
const worker = createWorker(); const worker = createWorker();
@ -57,7 +64,7 @@ const worker = createWorker();
### with whitelist char (^2.0.0-beta.1) ### with whitelist char (^2.0.0-beta.1)
```javascript ```javascript
const { createWorker } = require('tesseract.js'); import { createWorker } from 'tesseract.js';
const worker = createWorker(); const worker = createWorker();
@ -79,7 +86,7 @@ const worker = createWorker();
Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163 Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163
```javascript ```javascript
const { createWorker, PSM } = require('tesseract.js'); import { createWorker, PSM } from 'tesseract.js';
const worker = createWorker(); const worker = createWorker();
@ -103,105 +110,30 @@ Please check **examples** folder for details.
Browser: [download-pdf.html](../examples/browser/download-pdf.html) Browser: [download-pdf.html](../examples/browser/download-pdf.html)
Node: [download-pdf.js](../examples/node/download-pdf.js) Node: [download-pdf.js](../examples/node/download-pdf.js)
### with only part of the image (^2.0.1) ### with only part of the image (^2.0.0-beta.1)
**One rectangle**
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const rectangle = { left: 0, top: 0, width: 500, height: 250 };
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle });
console.log(text);
await worker.terminate();
})();
```
**Multiple Rectangles**
```javascript ```javascript
const { createWorker } = require('tesseract.js'); import { createWorker } from 'tesseract.js';
const worker = createWorker(); const worker = createWorker();
const rectangles = [ const rectangles = [
{ { left: 0, top: 0, width: 500, height: 250 },
left: 0,
top: 0,
width: 500,
height: 250,
},
{
left: 500,
top: 0,
width: 500,
height: 250,
},
]; ];
(async () => { (async () => {
await worker.load(); await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const values = []; const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', 'eng', { rectangles });
for (let i = 0; i < rectangles.length; i++) { console.log(text);
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle: rectangles[i] });
values.push(text);
}
console.log(values);
await worker.terminate(); await worker.terminate();
})(); })();
``` ```
**Multiple Rectangles (with scheduler to do recognition in parallel)**
```javascript
const { createWorker, createScheduler } = require('tesseract.js');
// One scheduler with two workers, so the two rectangles can be recognized in parallel.
const scheduler = createScheduler();
const worker1 = createWorker();
const worker2 = createWorker();
// Regions of the image to recognize, one job per rectangle.
const rectangles = [
{
left: 0,
top: 0,
width: 500,
height: 250,
},
{
left: 500,
top: 0,
width: 500,
height: 250,
},
];
(async () => {
// Each worker is loaded and initialized with 'eng' before it is handed to the scheduler.
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng');
await worker2.loadLanguage('eng');
await worker1.initialize('eng');
await worker2.initialize('eng');
scheduler.addWorker(worker1);
scheduler.addWorker(worker2);
// Queue one 'recognize' job per rectangle and wait for all of them to finish.
const results = await Promise.all(rectangles.map((rectangle) => (
scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle })
)));
console.log(results.map(r => r.data.text));
// Terminating the scheduler is the only cleanup call in this example
// (presumably it also terminates the added workers; confirm against the API docs).
await scheduler.terminate();
})();
```
### with multiple workers to speed up (^2.0.0-beta.1) ### with multiple workers to speed up (^2.0.0-beta.1)
```javascript ```javascript
const { createWorker, createScheduler } = require('tesseract.js'); import { createWorker, createScheduler } from 'tesseract.js';
const scheduler = createScheduler(); const scheduler = createScheduler();
const worker1 = createWorker(); const worker1 = createWorker();
@ -218,7 +150,7 @@ const worker2 = createWorker();
scheduler.addWorker(worker2); scheduler.addWorker(worker2);
/** Add 10 recognition jobs */ /** Add 10 recognition jobs */
const results = await Promise.all(Array(10).fill(0).map(() => ( const results = await Promise.all(Array(10).fill(0).map(() => (
scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png') await scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png')
))) )))
console.log(results); console.log(results);
await scheduler.terminate(); // It also terminates all workers. await scheduler.terminate(); // It also terminates all workers.

4
docs/faq.md

@ -9,9 +9,9 @@ During the downloading of language model, Tesseract.js will first check if \*.tr
## How can I train my own \*.traineddata? ## How can I train my own \*.traineddata?
For tesseract.js v2, check [TrainingTesseract 4.00](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00) For tesseract.js v2, check [TrainingTesseract 4.00](https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00)
For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://tesseract-ocr.github.io/tessdoc/Training-Tesseract-3.03%E2%80%933.05) For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://github.com/tesseract-ocr/tesseract/wiki/Training-Tesseract-3.03%E2%80%933.05)
## How can I get HOCR, TSV, Box, UNLV, OSD? ## How can I get HOCR, TSV, Box, UNLV, OSD?

25
docs/image-format.md

@ -1,18 +1,17 @@
# Image Format # Image Format
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter. The image formats and data types supported are listed below. Support Format: **bmp, jpg, png, pbm**
Support Image Formats: **bmp, jpg, png, pbm, webp** The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter, which should be something that is like an image. What's considered "image-like" differs depending on whether it is being run from the browser or through NodeJS.
For browser and Node, supported data types are: On a browser, an image can be:
- string with base64 encoded image (fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp) - an `img`, `video`, or `canvas` element
- buffer - a `File` object (from a file `<input>`)
- a `Blob` object
- a path or URL to an accessible image
- a base64-encoded image that matches the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser only, supported data types are: In Node.js, an image can be
- `File` or `Blob` object - a path to a local image
- `img` or `canvas` element - a Buffer storing binary image
- a base64-encoded image that matches the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For Node only, supported data types are:
- string containing a path to local image
Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported.

BIN
docs/images/tesseract.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 105 KiB

BIN
docs/images/video-demo.gif

Binary file not shown.

Before

Width:  |  Height:  |  Size: 237 KiB

12
docs/local-installation.md

@ -10,9 +10,9 @@ In Node.js environment, the only path you may want to customize is languages/lan
```javascript ```javascript
Tesseract.recognize(image, langs, { Tesseract.recognize(image, langs, {
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js', workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-beta.1/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0', langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js', corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.wasm.js',
}) })
``` ```
@ -20,9 +20,9 @@ Or
```javascript ```javascript
const worker = createWorker({ const worker = createWorker({
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js', workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-beta.1/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0', langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js', corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.wasm.js',
}); });
``` ```
@ -33,6 +33,6 @@ A string specifying the location of the [worker.js](./dist/worker.min.js) file.
A string specifying the location of the tesseract language files, with default value 'https://tessdata.projectnaptha.com/4.0.0'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`. A string specifying the location of the tesseract language files, with default value 'https://tessdata.projectnaptha.com/4.0.0'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`.
### corePath ### corePath
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available). A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available).
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm'. But it fails to fetch at this moment. Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.wasm'. But it fails to fetch at this moment.

2
docs/tesseract_lang_list.md

@ -1,3 +1,3 @@
# Tesseract Languages # Tesseract Languages
Please check [HERE](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) for supported languages Please check [HERE](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files#data-files-for-version-400-november-29-2016) for supported languages

37
examples/browser/basic-edge.html

@ -1,37 +0,0 @@
<!DOCTYPE HTML>
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<input type="file" id="uploader">
<script>
/*
 * Change handler for the file input: runs OCR on the first selected file
 * with a freshly created worker and logs the recognized text.
 * The worker is terminated afterwards, whether recognition succeeded or
 * failed, so repeated selections do not accumulate live workers.
 */
const recognize = function(evt){
  const files = evt.target.files;
  // The change event can fire with an empty selection; nothing to recognize then.
  if (!files || files.length === 0) {
    return;
  }
  const worker = Tesseract.createWorker({
    /*
     * As Edge doesn't support WebAssembly,
     * here we force the asm.js version.
     */
    corePath: '../../node_modules/tesseract.js-core/tesseract-core.asm.js',
    logger: function(m){console.log(m);},
    /*
     * As there is no indexedDB in earlier versions
     * of Edge, here we disable the cache.
     */
    cacheMethod: 'none',
  });
  Promise.resolve()
    .then(() => worker.load())
    .then(() => worker.loadLanguage('eng'))
    .then(() => worker.initialize('eng'))
    .then(() => worker.recognize(files[0]))
    .then((ret) => {
      console.log(ret.data.text);
    })
    .catch((err) => {
      // Report the failure instead of leaving an unhandled rejection.
      console.error(err);
    })
    // Plain then() after catch() acts as a "finally" step without relying on
    // Promise.prototype.finally, which older Edge versions lack.
    .then(() => worker.terminate());
};
const elm = document.getElementById('uploader');
elm.addEventListener('change', recognize);
</script>
</body>
</html>

33
examples/browser/benchmark.html

@ -1,33 +0,0 @@
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<textarea id="message">Working...</textarea>
<script>
const { createWorker } = Tesseract;
const worker = createWorker();
/*
 * Benchmark: recognizes each sample image 10 times with a single worker and
 * appends the per-image and total runtimes (in seconds) to the #message textarea.
 */
(async () => {
  const message = document.getElementById('message');
  try {
    await worker.load();
    await worker.loadLanguage('eng');
    await worker.initialize('eng');
    const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"];
    let timeTotal = 0;
    for (const file of fileArr) {
      const time1 = Date.now();
      for (let i = 0; i < 10; i++) {
        await worker.recognize(file);
      }
      const time2 = Date.now();
      const timeDif = (time2 - time1) / 1e3;
      timeTotal += timeDif;
      // A textarea's displayed text is its value; appending there avoids re-parsing markup.
      message.value += "\n" + file + " [x10] runtime: " + timeDif + "s";
    }
    message.value += "\nTotal runtime: " + timeTotal + "s";
  } finally {
    // Release the worker once the benchmark is over, even if a step failed.
    await worker.terminate();
  }
})();
</script>
</body>
</html>

3
examples/browser/demo.html

@ -27,7 +27,7 @@ function progressUpdate(packet){
if(packet.status == 'done'){ if(packet.status == 'done'){
var pre = document.createElement('pre') var pre = document.createElement('pre')
pre.appendChild(document.createTextNode(packet.data.data.text)) pre.appendChild(document.createTextNode(packet.data.text))
line.innerHTML = '' line.innerHTML = ''
line.appendChild(pre) line.appendChild(pre)
@ -71,6 +71,7 @@ async function recognizeFile(file) {
<option value='meme' > Internet Meme </option> <option value='meme' > Internet Meme </option>
<option value='epo' > Esperanto </option> <option value='epo' > Esperanto </option>
<option value='epo_alt' > Esperanto alternative </option> <option value='epo_alt' > Esperanto alternative </option>
<option value='equ' > Math </option>
<option value='est' > Estonian </option> <option value='est' > Estonian </option>
<option value='eus' > Basque </option> <option value='eus' > Basque </option>
<option value='fin' > Finnish </option> <option value='fin' > Finnish </option>

BIN
examples/data/meditations.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1011 KiB

BIN
examples/data/testocr.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

BIN
examples/data/tyger.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 408 KiB

27
examples/node/benchmark.js

@ -1,27 +0,0 @@
#!/usr/bin/env node
const path = require('path');
const { createWorker } = require('../../');
const worker = createWorker();
/*
 * Benchmark: recognizes each sample image 10 times with a single worker and
 * prints the per-image and total runtimes in seconds.
 */
(async () => {
  try {
    await worker.load();
    await worker.loadLanguage('eng');
    await worker.initialize('eng');
    const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"];
    let timeTotal = 0;
    for (const file of fileArr) {
      // Resolve against this script's directory so the benchmark works from any
      // working directory, not only when launched from examples/node.
      const image = path.resolve(__dirname, file);
      const time1 = Date.now();
      for (let i = 0; i < 10; i++) {
        await worker.recognize(image);
      }
      const time2 = Date.now();
      const timeDif = (time2 - time1) / 1e3;
      timeTotal += timeDif;
      console.log(file + " [x10] runtime: " + timeDif + "s");
    }
    console.log("Total runtime: " + timeTotal + "s");
  } finally {
    // Always release the worker, even if loading or recognition failed.
    await worker.terminate();
  }
})();

17
examples/node/recognize.js

@ -1,20 +1,13 @@
#!/usr/bin/env node #!/usr/bin/env node
const path = require('path'); const path = require('path');
const { createWorker } = require('../../'); const Tesseract = require('../../');
const [,, imagePath] = process.argv; const [,, imagePath] = process.argv;
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png')); const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png'));
console.log(`Recognizing ${image}`); console.log(`Recognizing ${image}`);
const worker = createWorker({
logger: m => console.log(m),
});
(async () => { Tesseract.recognize(image, 'eng', { logger: m => console.log(m) })
await worker.load(); .then(({ data: { text } }) => {
await worker.loadLanguage('eng'); console.log(text);
await worker.initialize('eng'); });
const { data: { text } } = await worker.recognize(image);
console.log(text);
await worker.terminate();
})();

22452
package-lock.json generated

File diff suppressed because it is too large Load Diff

59
package.json

@ -1,6 +1,6 @@
{ {
"name": "tesseract.js", "name": "tesseract.js",
"version": "3.0.3", "version": "2.0.0-beta.1",
"description": "Pure Javascript Multilingual OCR", "description": "Pure Javascript Multilingual OCR",
"main": "src/index.js", "main": "src/index.js",
"types": "src/index.d.ts", "types": "src/index.d.ts",
@ -8,22 +8,18 @@
"jsdelivr": "dist/tesseract.min.js", "jsdelivr": "dist/tesseract.min.js",
"scripts": { "scripts": {
"start": "node scripts/server.js", "start": "node scripts/server.js",
"build": "rimraf dist && webpack --config scripts/webpack.config.prod.js && rollup -c scripts/rollup.esm.js", "build": "rimraf dist && webpack --config scripts/webpack.config.prod.js",
"profile:tesseract": "webpack-bundle-analyzer dist/tesseract-stats.json",
"profile:worker": "webpack-bundle-analyzer dist/worker-stats.json",
"prepublishOnly": "npm run build", "prepublishOnly": "npm run build",
"wait": "rimraf dist && wait-on http://localhost:3000/dist/tesseract.dev.js", "wait": "rimraf dist && wait-on http://localhost:3000/dist/tesseract.dev.js",
"test": "npm-run-all -p -r start test:all", "test": "npm-run-all -p -r start test:all",
"test:all": "npm-run-all wait test:browser:* test:node:all", "test:all": "npm-run-all wait test:browser:* test:node:all",
"test:node": "nyc mocha --exit --bail --require ./scripts/test-helper.js", "test:node": "nyc mocha --exit --bail --require ./scripts/test-helper.js",
"test:node:all": "npm run test:node -- ./tests/*.test.js", "test:node:all": "npm run test:node:one -- ./tests/*.test.js",
"test:browser-tpl": "mocha-headless-chrome -a incognito -a no-sandbox -a disable-setuid-sandbox -a disable-logging -t 300000", "test:browser-tpl": "mocha-headless-chrome -a incognito -a no-sandbox -a disable-setuid-sandbox -a disable-logging -t 300000",
"test:browser:detect": "npm run test:browser-tpl -- -f ./tests/detect.test.html", "test:browser:detect": "npm run test:browser-tpl -- -f ./tests/detect.test.html",
"test:browser:recognize": "npm run test:browser-tpl -- -f ./tests/recognize.test.html", "test:browser:recognize": "npm run test:browser-tpl -- -f ./tests/recognize.test.html",
"test:browser:scheduler": "npm run test:browser-tpl -- -f ./tests/scheduler.test.html", "test:browser:scheduler": "npm run test:browser-tpl -- -f ./tests/scheduler.test.html",
"test:browser:FS": "npm run test:browser-tpl -- -f ./tests/FS.test.html",
"lint": "eslint src", "lint": "eslint src",
"lint:fix": "eslint --fix src",
"postinstall": "opencollective-postinstall || true" "postinstall": "opencollective-postinstall || true"
}, },
"browser": { "browser": {
@ -35,43 +31,38 @@
], ],
"license": "Apache-2.0", "license": "Apache-2.0",
"devDependencies": { "devDependencies": {
"@babel/core": "^7.18.7", "@babel/core": "^7.4.5",
"@babel/preset-env": "^7.18.7", "@babel/preset-env": "^7.4.5",
"@rollup/plugin-commonjs": "^22.0.2", "acorn": "^6.1.1",
"acorn": "^6.4.0", "babel-loader": "^8.0.6",
"babel-loader": "^8.2.0",
"buffer": "^6.0.3",
"cors": "^2.8.5", "cors": "^2.8.5",
"eslint": "^7.2.0", "eslint": "^5.9.0",
"eslint-config-airbnb-base": "^14.2.0", "eslint-config-airbnb": "^17.1.0",
"eslint-plugin-import": "^2.22.1", "eslint-plugin-import": "^2.14.0",
"eslint-plugin-jsx-a11y": "^6.1.2",
"eslint-plugin-react": "^7.11.1",
"expect.js": "^0.3.1", "expect.js": "^0.3.1",
"express": "^4.17.1", "express": "^4.16.4",
"mocha": "^8.1.3", "mocha": "^5.2.0",
"mocha-headless-chrome": "^2.0.3", "mocha-headless-chrome": "^2.0.2",
"npm-run-all": "^4.1.5", "npm-run-all": "^4.1.5",
"nyc": "^15.1.0", "nyc": "^13.1.0",
"rimraf": "^2.7.1", "rimraf": "^2.6.3",
"rollup": "^2.79.0", "wait-on": "^3.2.0",
"wait-on": "^3.3.0", "webpack": "^4.26.0",
"webpack": "^5.74.0", "webpack-cli": "^3.1.2",
"webpack-bundle-analyzer": "^4.6.0", "webpack-dev-middleware": "^3.4.0"
"webpack-cli": "^4.10.0",
"webpack-dev-middleware": "^5.3.3"
}, },
"dependencies": { "dependencies": {
"babel-eslint": "^10.1.0", "axios": "^0.18.0",
"bmp-js": "^0.1.0", "bmp-js": "^0.1.0",
"file-type": "^12.4.1", "file-type": "^12.3.0",
"idb-keyval": "^3.2.0", "idb-keyval": "^3.2.0",
"is-electron": "^2.2.0", "is-url": "1.2.2",
"is-url": "^1.2.4",
"node-fetch": "^2.6.0",
"opencollective-postinstall": "^2.0.2", "opencollective-postinstall": "^2.0.2",
"regenerator-runtime": "^0.13.3", "regenerator-runtime": "^0.13.3",
"resolve-url": "^0.2.1", "resolve-url": "^0.2.1",
"tesseract.js-core": "^3.0.2", "tesseract.js-core": "^2.0.0-beta.13",
"wasm-feature-detect": "^1.2.11",
"zlibjs": "^0.3.1" "zlibjs": "^0.3.1"
}, },
"repository": { "repository": {

13
scripts/rollup.esm.js

@ -1,13 +0,0 @@
import commonjs from "@rollup/plugin-commonjs";
// Rollup configuration that repackages the minified bundle in dist/ as an ES module.
export default [
{
// The input is an already-built file in dist/, so this config expects an earlier build step to have produced it.
input: "dist/tesseract.min.js",
output: {
file: "dist/tesseract.esm.min.js",
format: "esm",
// Emitted at the top of the generated file so that ESLint ignores it.
banner: "/* eslint-disable */",
},
// The CommonJS plugin lets Rollup consume the non-ESM input bundle.
plugins: [commonjs()],
},
];

2
scripts/server.js

@ -13,5 +13,5 @@ app.use('/', express.static(path.resolve(__dirname, '..')));
app.use(middleware(compiler, { publicPath: '/dist', writeToDisk: true })); app.use(middleware(compiler, { publicPath: '/dist', writeToDisk: true }));
module.exports = app.listen(3000, () => { module.exports = app.listen(3000, () => {
console.log('Server is running on the port no. 3000'); console.log('Server is running on port 3000');
}); });

5
scripts/webpack.config.common.js

@ -1,9 +1,4 @@
module.exports = { module.exports = {
resolve: {
fallback: {
buffer: require.resolve('buffer/'),
},
},
module: { module: {
rules: [ rules: [
{ {

9
scripts/webpack.config.dev.js

@ -1,6 +1,5 @@
const path = require('path'); const path = require('path');
const webpack = require('webpack'); const webpack = require('webpack');
const { BundleAnalyzerPlugin } = require('webpack-bundle-analyzer');
const common = require('./webpack.config.common'); const common = require('./webpack.config.common');
const genConfig = ({ const genConfig = ({
@ -15,19 +14,11 @@ const genConfig = ({
libraryTarget, libraryTarget,
}, },
plugins: [ plugins: [
new webpack.ProvidePlugin({
Buffer: ['buffer', 'Buffer'],
}),
new webpack.DefinePlugin({ new webpack.DefinePlugin({
'process.env': { 'process.env': {
TESS_ENV: JSON.stringify('development'), TESS_ENV: JSON.stringify('development'),
}, },
}), }),
new BundleAnalyzerPlugin({
analyzerMode: 'disable',
statsFilename: `${filename.split('.')[0]}-stats.json`,
generateStatsFile: true
}),
], ],
devServer: { devServer: {
allowedHosts: ['localhost', '.gitpod.io'], allowedHosts: ['localhost', '.gitpod.io'],

6
scripts/webpack.config.prod.js

@ -1,6 +1,5 @@
const path = require('path'); const path = require('path');
const common = require('./webpack.config.common'); const common = require('./webpack.config.common');
const webpack = require('webpack');
const genConfig = ({ const genConfig = ({
entry, filename, library, libraryTarget, entry, filename, library, libraryTarget,
@ -15,11 +14,6 @@ const genConfig = ({
library, library,
libraryTarget, libraryTarget,
}, },
plugins: [
new webpack.ProvidePlugin({
Buffer: ['buffer', 'Buffer'],
}),
]
}); });
module.exports = [ module.exports = [

1
src/constants/PSM.js

@ -15,5 +15,4 @@ module.exports = {
SINGLE_CHAR: '10', SINGLE_CHAR: '10',
SPARSE_TEXT: '11', SPARSE_TEXT: '11',
SPARSE_TEXT_OSD: '12', SPARSE_TEXT_OSD: '12',
RAW_LINE: '13',
}; };

218
src/constants/languages.js

@ -1,218 +0,0 @@
/*
* languages with existing tesseract traineddata
* https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
*/
/**
* @typedef {object} Languages
* @property {string} AFR Afrikaans
* @property {string} AMH Amharic
* @property {string} ARA Arabic
* @property {string} ASM Assamese
* @property {string} AZE Azerbaijani
* @property {string} AZE_CYRL Azerbaijani - Cyrillic
* @property {string} BEL Belarusian
* @property {string} BEN Bengali
* @property {string} BOD Tibetan
* @property {string} BOS Bosnian
* @property {string} BUL Bulgarian
* @property {string} CAT Catalan; Valencian
* @property {string} CEB Cebuano
* @property {string} CES Czech
* @property {string} CHI_SIM Chinese - Simplified
* @property {string} CHI_TRA Chinese - Traditional
* @property {string} CHR Cherokee
* @property {string} CYM Welsh
* @property {string} DAN Danish
* @property {string} DEU German
* @property {string} DZO Dzongkha
* @property {string} ELL Greek, Modern (1453-)
* @property {string} ENG English
* @property {string} ENM English, Middle (1100-1500)
* @property {string} EPO Esperanto
* @property {string} EST Estonian
* @property {string} EUS Basque
* @property {string} FAS Persian
* @property {string} FIN Finnish
* @property {string} FRA French
* @property {string} FRK German Fraktur
* @property {string} FRM French, Middle (ca. 1400-1600)
* @property {string} GLE Irish
* @property {string} GLG Galician
* @property {string} GRC Greek, Ancient (-1453)
* @property {string} GUJ Gujarati
* @property {string} HAT Haitian; Haitian Creole
* @property {string} HEB Hebrew
* @property {string} HIN Hindi
* @property {string} HRV Croatian
* @property {string} HUN Hungarian
* @property {string} IKU Inuktitut
* @property {string} IND Indonesian
* @property {string} ISL Icelandic
* @property {string} ITA Italian
* @property {string} ITA_OLD Italian - Old
* @property {string} JAV Javanese
* @property {string} JPN Japanese
* @property {string} KAN Kannada
* @property {string} KAT Georgian
* @property {string} KAT_OLD Georgian - Old
* @property {string} KAZ Kazakh
* @property {string} KHM Central Khmer
* @property {string} KIR Kirghiz; Kyrgyz
* @property {string} KOR Korean
* @property {string} KUR Kurdish
* @property {string} LAO Lao
* @property {string} LAT Latin
* @property {string} LAV Latvian
* @property {string} LIT Lithuanian
* @property {string} MAL Malayalam
* @property {string} MAR Marathi
* @property {string} MKD Macedonian
* @property {string} MLT Maltese
* @property {string} MSA Malay
* @property {string} MYA Burmese
* @property {string} NEP Nepali
* @property {string} NLD Dutch; Flemish
* @property {string} NOR Norwegian
* @property {string} ORI Oriya
* @property {string} PAN Panjabi; Punjabi
* @property {string} POL Polish
* @property {string} POR Portuguese
* @property {string} PUS Pushto; Pashto
* @property {string} RON Romanian; Moldavian; Moldovan
* @property {string} RUS Russian
* @property {string} SAN Sanskrit
* @property {string} SIN Sinhala; Sinhalese
* @property {string} SLK Slovak
* @property {string} SLV Slovenian
* @property {string} SPA Spanish; Castilian
* @property {string} SPA_OLD Spanish; Castilian - Old
* @property {string} SQI Albanian
* @property {string} SRP Serbian
* @property {string} SRP_LATN Serbian - Latin
* @property {string} SWA Swahili
* @property {string} SWE Swedish
* @property {string} SYR Syriac
* @property {string} TAM Tamil
* @property {string} TEL Telugu
* @property {string} TGK Tajik
* @property {string} TGL Tagalog
* @property {string} THA Thai
* @property {string} TIR Tigrinya
* @property {string} TUR Turkish
* @property {string} UIG Uighur; Uyghur
* @property {string} UKR Ukrainian
* @property {string} URD Urdu
* @property {string} UZB Uzbek
* @property {string} UZB_CYRL Uzbek - Cyrillic
* @property {string} VIE Vietnamese
* @property {string} YID Yiddish
*/
/**
 * Lookup table of language codes, keyed by an upper-case identifier.
 * Every value is its key written in lower case (for example `CHI_SIM` maps to
 * `'chi_sim'`); the human-readable language names are listed in the
 * `Languages` typedef.
 *
 * @type {Languages}
 */
module.exports = {
AFR: 'afr',
AMH: 'amh',
ARA: 'ara',
ASM: 'asm',
AZE: 'aze',
AZE_CYRL: 'aze_cyrl',
BEL: 'bel',
BEN: 'ben',
BOD: 'bod',
BOS: 'bos',
BUL: 'bul',
CAT: 'cat',
CEB: 'ceb',
CES: 'ces',
CHI_SIM: 'chi_sim',
CHI_TRA: 'chi_tra',
CHR: 'chr',
CYM: 'cym',
DAN: 'dan',
DEU: 'deu',
DZO: 'dzo',
ELL: 'ell',
ENG: 'eng',
ENM: 'enm',
EPO: 'epo',
EST: 'est',
EUS: 'eus',
FAS: 'fas',
FIN: 'fin',
FRA: 'fra',
FRK: 'frk',
FRM: 'frm',
GLE: 'gle',
GLG: 'glg',
GRC: 'grc',
GUJ: 'guj',
HAT: 'hat',
HEB: 'heb',
HIN: 'hin',
HRV: 'hrv',
HUN: 'hun',
IKU: 'iku',
IND: 'ind',
ISL: 'isl',
ITA: 'ita',
ITA_OLD: 'ita_old',
JAV: 'jav',
JPN: 'jpn',
KAN: 'kan',
KAT: 'kat',
KAT_OLD: 'kat_old',
KAZ: 'kaz',
KHM: 'khm',
KIR: 'kir',
KOR: 'kor',
KUR: 'kur',
LAO: 'lao',
LAT: 'lat',
LAV: 'lav',
LIT: 'lit',
MAL: 'mal',
MAR: 'mar',
MKD: 'mkd',
MLT: 'mlt',
MSA: 'msa',
MYA: 'mya',
NEP: 'nep',
NLD: 'nld',
NOR: 'nor',
ORI: 'ori',
PAN: 'pan',
POL: 'pol',
POR: 'por',
PUS: 'pus',
RON: 'ron',
RUS: 'rus',
SAN: 'san',
SIN: 'sin',
SLK: 'slk',
SLV: 'slv',
SPA: 'spa',
SPA_OLD: 'spa_old',
SQI: 'sqi',
SRP: 'srp',
SRP_LATN: 'srp_latn',
SWA: 'swa',
SWE: 'swe',
SYR: 'syr',
TAM: 'tam',
TEL: 'tel',
TGK: 'tgk',
TGL: 'tgl',
THA: 'tha',
TIR: 'tir',
TUR: 'tur',
UIG: 'uig',
UKR: 'ukr',
URD: 'urd',
UZB: 'uzb',
UZB_CYRL: 'uzb_cyrl',
VIE: 'vie',
YID: 'yid',
};

51
src/createWorker.js

@ -19,7 +19,6 @@ module.exports = (_options = {}) => {
const id = getId('Worker', workerCounter); const id = getId('Worker', workerCounter);
const { const {
logger, logger,
errorHandler,
...options ...options
} = resolvePaths({ } = resolvePaths({
...defaultOptions, ...defaultOptions,
@ -53,44 +52,12 @@ module.exports = (_options = {}) => {
}) })
); );
const load = (jobId) => ( const load = jobId => (
startJob(createJob({ startJob(createJob({
id: jobId, action: 'load', payload: { options }, id: jobId, action: 'load', payload: { options },
})) }))
); );
// Starts an 'FS' job whose payload asks for the `writeFile` method with
// arguments [path, text]. `jobId` is forwarded as the job's id.
// Returns whatever startJob returns for the created job.
const writeText = (path, text, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method: 'writeFile', args: [path, text] },
}))
);
// Starts an 'FS' job whose payload asks for the `readFile` method on `path`
// with utf8 encoding. `jobId` is forwarded as the job's id.
// Returns whatever startJob returns for the created job.
const readText = (path, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method: 'readFile', args: [path, { encoding: 'utf8' }] },
}))
);
// Starts an 'FS' job whose payload asks for the `unlink` method on `path`.
// `jobId` is forwarded as the job's id.
// Returns whatever startJob returns for the created job.
// NOTE(review): the type declarations appear to list this method as
// `removeText` rather than `removeFile`; confirm which name is intended.
const removeFile = (path, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method: 'unlink', args: [path] },
}))
);
// Generic form of the file-system helpers: starts an 'FS' job with the given
// `method` name and `args` array passed through unchanged in the payload.
// `jobId` is forwarded as the job's id.
// Returns whatever startJob returns for the created job.
const FS = (method, args, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method, args },
}))
);
const loadLanguage = (langs = 'eng', jobId) => ( const loadLanguage = (langs = 'eng', jobId) => (
startJob(createJob({ startJob(createJob({
id: jobId, id: jobId,
@ -139,14 +106,12 @@ module.exports = (_options = {}) => {
})) }))
); );
const terminate = async () => { const terminate = async (jobId) => {
if (worker !== null) { if (worker !== null) {
/*
await startJob(createJob({ await startJob(createJob({
id: jobId, id: jobId,
action: 'terminate', action: 'terminate',
})); }));
*/
terminateWorker(worker); terminateWorker(worker);
worker = null; worker = null;
} }
@ -167,13 +132,9 @@ module.exports = (_options = {}) => {
resolves[action]({ jobId, data: d }); resolves[action]({ jobId, data: d });
} else if (status === 'reject') { } else if (status === 'reject') {
rejects[action](data); rejects[action](data);
if (errorHandler) { throw Error(data);
errorHandler(data);
} else {
throw Error(data);
}
} else if (status === 'progress') { } else if (status === 'progress') {
logger({ ...data, userJobId: jobId }); logger(data);
} }
}); });
@ -183,10 +144,6 @@ module.exports = (_options = {}) => {
setResolve, setResolve,
setReject, setReject,
load, load,
writeText,
readText,
removeFile,
FS,
loadLanguage, loadLanguage,
initialize, initialize,
setParameters, setParameters,

37
src/index.d.ts vendored

@ -3,7 +3,7 @@ declare namespace Tesseract {
function createWorker(options?: Partial<WorkerOptions>): Worker function createWorker(options?: Partial<WorkerOptions>): Worker
function setLogging(logging: boolean): void function setLogging(logging: boolean): void
function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult> function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult>
function detect(image: ImageLike, options?: Partial<WorkerOptions>): any function detect(image: ImageLike, options?: Partial<WorkerOptions>)
interface Scheduler { interface Scheduler {
addWorker(worker: Worker): string addWorker(worker: Worker): string
@ -15,22 +15,12 @@ declare namespace Tesseract {
interface Worker { interface Worker {
load(jobId?: string): Promise<ConfigResult> load(jobId?: string): Promise<ConfigResult>
writeText(path: string, text: string, jobId?: string): Promise<ConfigResult> loadLanguage(langs?: string, jobId?: string): Promise<ConfigResult>
readText(path: string, jobId?: string): Promise<ConfigResult> initialize(langs?: string, oem?: OEM, jobId?: string): Promise<ConfigResult>
removeText(path: string, jobId?: string): Promise<ConfigResult>
FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult>
initialize(langs?: string | Lang[], oem?: OEM, jobId?: string): Promise<ConfigResult>
setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult> setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult> recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult>
detect(image: ImageLike, jobId?: string): Promise<DetectResult> detect(image: ImageLike, jobId?: string): Promise<DetectResult>
terminate(jobId?: string): Promise<ConfigResult> terminate(jobId?: string): Promise<ConfigResult>
getPDF(title?: string, textonly?: boolean, jobId?: string):Promise<GetPDFResult>
}
interface Lang {
code: string;
data: unknown;
} }
interface WorkerOptions { interface WorkerOptions {
@ -42,15 +32,12 @@ declare namespace Tesseract {
cacheMethod: string cacheMethod: string
workerBlobURL: boolean workerBlobURL: boolean
gzip: boolean gzip: boolean
logger: (arg: any) => void, logger: (any) => void
errorHandler: (arg: any) => void
} }
interface WorkerParams { interface WorkerParams {
tessedit_ocr_engine_mode: OEM tessedit_ocr_engine_mode: OEM
tessedit_pageseg_mode: PSM tessedit_pageseg_mode: PSM
tessedit_char_whitelist: string tessedit_char_whiltelist: string
preserve_interword_spaces: string
user_defined_dpi: string
tessjs_create_hocr: string tessjs_create_hocr: string
tessjs_create_tsv: string tessjs_create_tsv: string
tessjs_create_box: string tessjs_create_box: string
@ -58,7 +45,7 @@ declare namespace Tesseract {
tessjs_create_osd: string tessjs_create_osd: string
} }
interface RecognizeOptions { interface RecognizeOptions {
rectangle: Rectangle rectangles: Rectangle[]
} }
interface ConfigResult { interface ConfigResult {
jobId: string jobId: string
@ -68,10 +55,6 @@ declare namespace Tesseract {
jobId: string jobId: string
data: Page data: Page
} }
interface GetPDFResult {
jobId: string
data: number[]
}
interface DetectResult { interface DetectResult {
jobId: string jobId: string
data: DetectData data: DetectData
@ -89,13 +72,13 @@ declare namespace Tesseract {
width: number width: number
height: number height: number
} }
enum OEM { const enum OEM {
TESSERACT_ONLY, TESSERACT_ONLY,
LSTM_ONLY, LSTM_ONLY,
TESSERACT_LSTM_COMBINED, TESSERACT_LSTM_COMBINED,
DEFAULT, DEFAULT,
} }
enum PSM { const enum PSM {
OSD_ONLY = '0', OSD_ONLY = '0',
AUTO_OSD = '1', AUTO_OSD = '1',
AUTO_ONLY = '2', AUTO_ONLY = '2',
@ -109,12 +92,11 @@ declare namespace Tesseract {
SINGLE_CHAR = '10', SINGLE_CHAR = '10',
SPARSE_TEXT = '11', SPARSE_TEXT = '11',
SPARSE_TEXT_OSD = '12', SPARSE_TEXT_OSD = '12',
RAW_LINE = '13'
} }
type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement
| CanvasRenderingContext2D | File | Blob | ImageData | Buffer; | CanvasRenderingContext2D | File | Blob | ImageData | Buffer;
interface Block { interface Block {
paragraphs: Paragraph[]; paragraphs: Paragraph;
text: string; text: string;
confidence: number; confidence: number;
baseline: Baseline; baseline: Baseline;
@ -212,7 +194,6 @@ declare namespace Tesseract {
confidence: number; confidence: number;
lines: Line[]; lines: Line[];
oem: string; oem: string;
osd: string;
paragraphs: Paragraph[]; paragraphs: Paragraph[];
psm: string; psm: string;
symbols: Symbol[]; symbols: Symbol[];

2
src/index.js

@ -11,13 +11,11 @@ require('regenerator-runtime/runtime');
const createScheduler = require('./createScheduler'); const createScheduler = require('./createScheduler');
const createWorker = require('./createWorker'); const createWorker = require('./createWorker');
const Tesseract = require('./Tesseract'); const Tesseract = require('./Tesseract');
const languages = require('./constants/languages');
const OEM = require('./constants/OEM'); const OEM = require('./constants/OEM');
const PSM = require('./constants/PSM'); const PSM = require('./constants/PSM');
const { setLogging } = require('./utils/log'); const { setLogging } = require('./utils/log');
module.exports = { module.exports = {
languages,
OEM, OEM,
PSM, PSM,
createScheduler, createScheduler,

17
src/utils/getEnvironment.js

@ -1,21 +1,10 @@
const isElectron = require('is-electron');
module.exports = (key) => { module.exports = (key) => {
const env = {}; const env = {
type: (typeof window !== 'undefined') && (typeof window.document !== 'undefined') ? 'browser' : 'node',
if (typeof WorkerGlobalScope !== 'undefined') { };
env.type = 'webworker';
} else if (isElectron()) {
env.type = 'electron';
} else if (typeof window === 'object') {
env.type = 'browser';
} else if (typeof process === 'object' && typeof require === 'function') {
env.type = 'node';
}
if (typeof key === 'undefined') { if (typeof key === 'undefined') {
return env; return env;
} }
return env[key]; return env[key];
}; };

2
src/utils/resolvePaths.js

@ -4,7 +4,7 @@ const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disabl
module.exports = (options) => { module.exports = (options) => {
const opts = { ...options }; const opts = { ...options };
['corePath', 'workerPath', 'langPath'].forEach((key) => { ['corePath', 'workerPath', 'langPath'].forEach((key) => {
if (options[key]) { if (typeof options[key] !== 'undefined') {
opts[key] = resolveURL(opts[key]); opts[key] = resolveURL(opts[key]);
} }
}); });

4
src/worker-script/browser/cache.js

@ -4,7 +4,7 @@ module.exports = {
readCache: get, readCache: get,
writeCache: set, writeCache: set,
deleteCache: del, deleteCache: del,
checkCache: (path) => ( checkCache: path => (
get(path).then((v) => typeof v !== 'undefined') get(path).then(v => typeof v !== 'undefined')
), ),
}; };

27
src/worker-script/browser/getCore.js

@ -1,26 +1,15 @@
const { simd } = require('wasm-feature-detect'); module.exports = (corePath, res) => {
const { dependencies } = require('../../../package.json');
module.exports = async (corePath, res) => {
if (typeof global.TesseractCore === 'undefined') { if (typeof global.TesseractCore === 'undefined') {
res.progress({ status: 'loading tesseract core', progress: 0 }); res.progress({ status: 'loading tesseract core', progress: 0 });
global.importScripts(corePath);
// If the user specifies a core path, we use that /*
// Otherwise, we detect the correct core based on SIMD support * Depending on whether the browser supports WebAssembly,
let corePathImport = corePath; * the version of the TesseractCore will be different.
if (!corePathImport) { */
const simdSupport = await simd();
if (simdSupport) {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core-simd.wasm.js`;
} else {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.wasm.js`;
}
}
global.importScripts(corePathImport);
if (typeof global.TesseractCoreWASM !== 'undefined' && typeof WebAssembly === 'object') { if (typeof global.TesseractCoreWASM !== 'undefined' && typeof WebAssembly === 'object') {
global.TesseractCore = global.TesseractCoreWASM; global.TesseractCore = global.TesseractCoreWASM;
} else if (typeof global.TesseractCoreASM !== 'undefined') {
global.TesseractCore = global.TesseractCoreASM;
} else { } else {
throw Error('Failed to load TesseractCore'); throw Error('Failed to load TesseractCore');
} }

7
src/worker-script/browser/index.js

@ -8,16 +8,17 @@
* @author Jerome Wu <jeromewus@gmail.com> * @author Jerome Wu <jeromewus@gmail.com>
*/ */
const worker = require('..'); const worker = require('../');
const getCore = require('./getCore'); const getCore = require('./getCore');
const gunzip = require('./gunzip'); const gunzip = require('./gunzip');
const resolveURL = require('./resolveURL');
const cache = require('./cache'); const cache = require('./cache');
/* /*
* register message handler * register message handler
*/ */
global.addEventListener('message', ({ data }) => { global.addEventListener('message', ({ data }) => {
worker.dispatchHandlers(data, (obj) => postMessage(obj)); worker.dispatchHandlers(data, obj => postMessage(obj));
}); });
/* /*
@ -27,6 +28,6 @@ global.addEventListener('message', ({ data }) => {
worker.setAdapter({ worker.setAdapter({
getCore, getCore,
gunzip, gunzip,
fetch: () => {}, resolveURL,
...cache, ...cache,
}); });

1
src/worker-script/browser/resolveURL.js

@ -0,0 +1 @@
module.exports = require('resolve-url');

2
src/worker-script/constants/defaultParams.js

@ -5,7 +5,7 @@ const PSM = require('../../constants/PSM');
module.exports = { module.exports = {
tessedit_pageseg_mode: PSM.SINGLE_BLOCK, tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whitelist: '', tessedit_char_whiltelist: '',
tessjs_create_hocr: '1', tessjs_create_hocr: '1',
tessjs_create_tsv: '1', tessjs_create_tsv: '1',
tessjs_create_box: '0', tessjs_create_box: '0',

60
src/worker-script/index.js

@ -9,9 +9,10 @@
*/ */
require('regenerator-runtime/runtime'); require('regenerator-runtime/runtime');
const fileType = require('file-type'); const fileType = require('file-type');
const axios = require('axios');
const isURL = require('is-url'); const isURL = require('is-url');
const dump = require('./utils/dump'); const dump = require('./utils/dump');
const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker'; const isBrowser = require('../utils/getEnvironment')('type') === 'browser';
const setImage = require('./utils/setImage'); const setImage = require('./utils/setImage');
const defaultParams = require('./constants/defaultParams'); const defaultParams = require('./constants/defaultParams');
const { log, setLogging } = require('../utils/log'); const { log, setLogging } = require('../utils/log');
@ -28,10 +29,10 @@ let latestJob;
let adapter = {}; let adapter = {};
let params = defaultParams; let params = defaultParams;
const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => { const load = ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
setLogging(logging); setLogging(logging);
if (!TessModule) { if (!TessModule) {
const Core = await adapter.getCore(corePath, res); const Core = adapter.getCore(corePath, res);
res.progress({ workerId, status: 'initializing tesseract', progress: 0 }); res.progress({ workerId, status: 'initializing tesseract', progress: 0 });
@ -54,11 +55,6 @@ const load = async ({ workerId, jobId, payload: { options: { corePath, logging }
} }
}; };
const FS = ({ workerId, payload: { method, args } }, res) => {
log(`[${workerId}]: FS.${method} with args ${args}`);
res.resolve(TessModule.FS[method](...args));
};
const loadLanguage = async ({ const loadLanguage = async ({
workerId, workerId,
payload: { payload: {
@ -72,7 +68,7 @@ const loadLanguage = async ({
}, },
}, },
}, },
res) => { res) => {
const loadAndGunzipFile = async (_lang) => { const loadAndGunzipFile = async (_lang) => {
const lang = typeof _lang === 'string' ? _lang : _lang.code; const lang = typeof _lang === 'string' ? _lang : _lang.code;
const readCache = ['refresh', 'none'].includes(cacheMethod) const readCache = ['refresh', 'none'].includes(cacheMethod)
@ -84,7 +80,6 @@ res) => {
const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`); const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`);
if (typeof _data !== 'undefined') { if (typeof _data !== 'undefined') {
log(`[${workerId}]: Load ${lang}.traineddata from cache`); log(`[${workerId}]: Load ${lang}.traineddata from cache`);
res.progress({ workerId, status: 'loading language traineddata (from cache)', progress: 0.5 });
data = _data; data = _data;
} else { } else {
throw Error('Not found in cache'); throw Error('Not found in cache');
@ -94,17 +89,18 @@ res) => {
if (typeof _lang === 'string') { if (typeof _lang === 'string') {
let path = null; let path = null;
if (isURL(langPath) || langPath.startsWith('moz-extension://') || langPath.startsWith('chrome-extension://') || langPath.startsWith('file://')) { /** When langPath is an URL */ if (isURL(langPath) || langPath.startsWith('chrome-extension://') || langPath.startsWith('file://')) { /** When langPath is an URL */
path = langPath; path = langPath;
} else if (process.browser) { /** When langPath is not an URL in browser */
path = adapter.resolveURL(langPath);
} }
if (path !== null) { if (path !== null) {
const fetchUrl = `${path}/${lang}.traineddata${gzip ? '.gz' : ''}`; const { data: _data } = await axios.get(
const resp = await (isWebWorker ? fetch : adapter.fetch)(fetchUrl); `${path}/${lang}.traineddata${gzip ? '.gz' : ''}`,
if (!resp.ok) { { responseType: 'arraybuffer' },
throw Error(`Network error while fetching ${fetchUrl}. Response code: ${resp.status}`); );
} data = _data;
data = await resp.arrayBuffer();
} else { } else {
data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`); data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`);
} }
@ -144,13 +140,21 @@ res) => {
res.progress({ workerId, status: 'loaded language traineddata', progress: 1 }); res.progress({ workerId, status: 'loaded language traineddata', progress: 1 });
res.resolve(langs); res.resolve(langs);
} catch (err) { } catch (err) {
res.reject(err.toString()); if (isBrowser && err instanceof DOMException) {
/*
* For some reason google chrome throw DOMException in loadLang,
* while other browser is OK, for now we ignore this exception
* and hopefully to find the root cause one day.
*/
} else {
res.reject(err.toString());
}
} }
}; };
const setParameters = ({ payload: { params: _params } }, res) => { const setParameters = ({ payload: { params: _params } }, res) => {
Object.keys(_params) Object.keys(_params)
.filter((k) => !k.startsWith('tessjs_')) .filter(k => !k.startsWith('tessjs_'))
.forEach((key) => { .forEach((key) => {
api.SetVariable(key, _params[key]); api.SetVariable(key, _params[key]);
}); });
@ -167,7 +171,7 @@ const initialize = ({
}, res) => { }, res) => {
const langs = (typeof _langs === 'string') const langs = (typeof _langs === 'string')
? _langs ? _langs
: _langs.map((l) => ((typeof l === 'string') ? l : l.data)).join('+'); : _langs.map(l => ((typeof l === 'string') ? l : l.data)).join('+');
try { try {
res.progress({ res.progress({
@ -177,10 +181,7 @@ const initialize = ({
api.End(); api.End();
} }
api = new TessModule.TessBaseAPI(); api = new TessModule.TessBaseAPI();
const status = api.Init(null, langs, oem); api.Init(null, langs, oem);
if (status === -1) {
res.reject('initialization failed');
}
params = defaultParams; params = defaultParams;
setParameters({ payload: { params } }); setParameters({ payload: { params } });
res.progress({ res.progress({
@ -192,12 +193,14 @@ const initialize = ({
} }
}; };
const recognize = ({ payload: { image, options: { rectangle: rec } } }, res) => { const recognize = ({ payload: { image, options: { rectangles = [] } } }, res) => {
try { try {
const ptr = setImage(TessModule, api, image); const ptr = setImage(TessModule, api, image);
if (typeof rec === 'object') { rectangles.forEach(({
api.SetRectangle(rec.left, rec.top, rec.width, rec.height); left, top, width, height,
} }) => {
api.SetRectangle(left, top, width, height);
});
api.Recognize(null); api.Recognize(null);
res.resolve(dump(TessModule, api, params)); res.resolve(dump(TessModule, api, params));
TessModule._free(ptr); TessModule._free(ptr);
@ -285,7 +288,6 @@ exports.dispatchHandlers = (packet, send) => {
try { try {
({ ({
load, load,
FS,
loadLanguage, loadLanguage,
initialize, initialize,
setParameters, setParameters,

6
src/worker-script/node/cache.js

@ -4,13 +4,13 @@ const fs = require('fs');
module.exports = { module.exports = {
readCache: util.promisify(fs.readFile), readCache: util.promisify(fs.readFile),
writeCache: util.promisify(fs.writeFile), writeCache: util.promisify(fs.writeFile),
deleteCache: (path) => ( deleteCache: path => (
util.promisify(fs.unlink)(path) util.promisify(fs.unlink)(path)
.catch(() => {}) .catch(() => {})
), ),
checkCache: (path) => ( checkCache: path => (
util.promisify(fs.access)(path, fs.F_OK) util.promisify(fs.access)(path, fs.F_OK)
.then((err) => (err === null)) .then(err => (err === null))
.catch(() => false) .catch(() => false)
), ),
}; };

11
src/worker-script/node/getCore.js

@ -1,19 +1,12 @@
const { simd } = require('wasm-feature-detect');
let TesseractCore = null; let TesseractCore = null;
/* /*
* getCore is a sync function to load and return * getCore is a sync function to load and return
* TesseractCore. * TesseractCore.
*/ */
module.exports = async (_, res) => { module.exports = (_, res) => {
if (TesseractCore === null) { if (TesseractCore === null) {
const simdSupport = await simd();
res.progress({ status: 'loading tesseract core', progress: 0 }); res.progress({ status: 'loading tesseract core', progress: 0 });
if (simdSupport) { TesseractCore = require('tesseract.js-core');
TesseractCore = require('tesseract.js-core/tesseract-core-simd');
} else {
TesseractCore = require('tesseract.js-core/tesseract-core');
}
res.progress({ status: 'loaded tesseract core', progress: 1 }); res.progress({ status: 'loaded tesseract core', progress: 1 });
} }
return TesseractCore; return TesseractCore;

11
src/worker-script/node/index.js

@ -8,23 +8,22 @@
* @author Jerome Wu <jeromewus@gmail.com> * @author Jerome Wu <jeromewus@gmail.com>
*/ */
const fetch = require('node-fetch'); const worker = require('../');
const { parentPort } = require('worker_threads');
const worker = require('..');
const getCore = require('./getCore'); const getCore = require('./getCore');
const resolveURL = require('./resolveURL');
const gunzip = require('./gunzip'); const gunzip = require('./gunzip');
const cache = require('./cache'); const cache = require('./cache');
/* /*
* register message handler * register message handler
*/ */
parentPort.on('message', (packet) => { process.on('message', (packet) => {
worker.dispatchHandlers(packet, (obj) => parentPort.postMessage(obj)); worker.dispatchHandlers(packet, obj => process.send(obj));
}); });
worker.setAdapter({ worker.setAdapter({
getCore, getCore,
gunzip, gunzip,
fetch, resolveURL,
...cache, ...cache,
}); });

1
src/worker-script/node/resolveURL.js

@ -0,0 +1 @@
module.exports = s => s;

4
src/worker-script/utils/dump.js

@ -61,8 +61,8 @@ module.exports = (TessModule, api, {
const enumToString = (value, prefix) => ( const enumToString = (value, prefix) => (
Object.keys(TessModule) Object.keys(TessModule)
.filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value)) .filter(e => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
.map((e) => e.slice(prefix.length + 1))[0] .map(e => e.slice(prefix.length + 1))[0]
); );
ri.Begin(); ri.Begin();

12
src/worker-script/utils/setImage.js

@ -17,12 +17,10 @@ module.exports = (TessModule, api, image) => {
let w = 0; let w = 0;
let h = 0; let h = 0;
const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1;
/* /*
* Leptonica supports uncompressed but not compressed bmp files * Although leptonica should support reading bmp, there is a bug of "compressed BMP files".
* @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516 * As there is no solution, we need to use bmp-js for now.
* We therefore use bmp-js to process all bmp files * @see https://groups.google.com/forum/#!topic/tesseract-ocr/4mPD9zTxdxE
*/ */
if (type && type.mime === 'image/bmp') { if (type && type.mime === 'image/bmp') {
const bmpBuf = bmp.decode(buf); const bmpBuf = bmp.decode(buf);
@ -55,9 +53,9 @@ module.exports = (TessModule, api, image) => {
* *
*/ */
if (data === null) { if (data === null) {
api.SetImage(pix, undefined, undefined, undefined, undefined, exif); api.SetImage(pix);
} else { } else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif); api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel);
} }
return data === null ? pix : data; return data === null ? pix : data;
}; };

4
src/worker/browser/defaultOptions.js

@ -1,5 +1,5 @@
const resolveURL = require('resolve-url'); const resolveURL = require('resolve-url');
const { version } = require('../../../package.json'); const { version, dependencies } = require('../../../package.json');
const defaultOptions = require('../../constants/defaultOptions'); const defaultOptions = require('../../constants/defaultOptions');
/* /*
@ -14,5 +14,5 @@ module.exports = {
* If browser doesn't support WebAssembly, * If browser doesn't support WebAssembly,
* load ASM version instead * load ASM version instead
*/ */
corePath: null, corePath: `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
}; };

9
src/worker/browser/loadImage.js

@ -1,3 +1,4 @@
const axios = require('axios');
const resolveURL = require('resolve-url'); const resolveURL = require('resolve-url');
/** /**
@ -7,7 +8,7 @@ const resolveURL = require('resolve-url');
* @function * @function
* @access private * @access private
*/ */
const readFromBlobOrFile = (blob) => ( const readFromBlobOrFile = blob => (
new Promise((resolve, reject) => { new Promise((resolve, reject) => {
const fileReader = new FileReader(); const fileReader = new FileReader();
fileReader.onload = () => { fileReader.onload = () => {
@ -38,10 +39,10 @@ const loadImage = async (image) => {
if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) { if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
data = atob(image.split(',')[1]) data = atob(image.split(',')[1])
.split('') .split('')
.map((c) => c.charCodeAt(0)); .map(c => c.charCodeAt(0));
} else { } else {
const resp = await fetch(resolveURL(image)); const { data: _data } = await axios.get(resolveURL(image), { responseType: 'arraybuffer' });
data = await resp.arrayBuffer(); data = _data;
} }
} else if (image instanceof HTMLElement) { } else if (image instanceof HTMLElement) {
if (image.tagName === 'IMG') { if (image.tagName === 'IMG') {

8
src/worker/node/loadImage.js

@ -1,6 +1,6 @@
const util = require('util'); const util = require('util');
const fs = require('fs'); const fs = require('fs');
const fetch = require('node-fetch'); const axios = require('axios');
const isURL = require('is-url'); const isURL = require('is-url');
const readFile = util.promisify(fs.readFile); const readFile = util.promisify(fs.readFile);
@ -19,9 +19,9 @@ module.exports = async (image) => {
} }
if (typeof image === 'string') { if (typeof image === 'string') {
if (isURL(image) || image.startsWith('moz-extension://') || image.startsWith('chrome-extension://') || image.startsWith('file://')) { if (isURL(image) || image.startsWith('chrome-extension://') || image.startsWith('file://')) {
const resp = await fetch(image); const { data: _data } = await axios.get(image, { responseType: 'arraybuffer' });
data = await resp.arrayBuffer(); data = _data;
} else if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) { } else if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
data = Buffer.from(image.split(',')[1], 'base64'); data = Buffer.from(image.split(',')[1], 'base64');
} else { } else {

4
src/worker/node/send.js

@ -5,6 +5,6 @@
* @function send packet to worker and create a job * @function send packet to worker and create a job
* @access public * @access public
*/ */
module.exports = async (worker, packet) => { module.exports = (worker, packet) => {
worker.postMessage(packet); worker.send(packet);
}; };

6
src/worker/node/spawnWorker.js

@ -1,4 +1,4 @@
const { Worker } = require('worker_threads'); const { fork } = require('child_process');
/** /**
* spawnWorker * spawnWorker
@ -7,4 +7,6 @@ const { Worker } = require('worker_threads');
* @function fork a new process in node * @function fork a new process in node
* @access public * @access public
*/ */
module.exports = ({ workerPath }) => new Worker(workerPath); module.exports = ({ workerPath }) => (
fork(workerPath)
);

2
src/worker/node/terminateWorker.js

@ -6,5 +6,5 @@
* @access public * @access public
*/ */
module.exports = (worker) => { module.exports = (worker) => {
worker.terminate(); worker.kill();
}; };

18
tests/FS.test.html

@ -1,18 +0,0 @@
<html>
<head>
<meta charset="utf-8">
<link rel="stylesheet" href="../node_modules/mocha/mocha.css">
</head>
<body>
<div id="mocha"></div>
<script src="../node_modules/mocha/mocha.js"></script>
<script src="../node_modules/expect.js/index.js"></script>
<script src="../dist/tesseract.dev.js"></script>
<script src="./constants.js"></script>
<script>mocha.setup('bdd');</script>
<script src="./FS.test.js"></script>
<script>
mocha.run();
</script>
</body>
</html>

37
tests/FS.test.js

@ -1,37 +0,0 @@
const { createWorker } = Tesseract;
const FS_WAIT = 500;
const worker = createWorker(OPTIONS);
before(function cb() {
this.timeout(0);
return worker.load();
});
describe('FS', async () => {
it('should write and read text from FS (using FS only)', () => {
[
SIMPLE_TEXT,
].forEach(async (text) => {
const path = 'tmp.txt';
await worker.FS('writeFile', [path, SIMPLE_TEXT]);
setTimeout(async () => {
const { data } = await worker.FS('readFile', [path]);
await worker.FS('unlink', [path]);
expect(data.toString()).to.be(text);
}, FS_WAIT);
});
}).timeout(TIMEOUT);
it('should write and read text from FS (using writeFile, readFile)', () => {
[
SIMPLE_TEXT,
].forEach(async (text) => {
const path = 'tmp2.txt';
await worker.writeText(path, SIMPLE_TEXT);
setTimeout(async () => {
const { data } = await worker.readText(path);
await worker.removeFile(path);
expect(data.toString()).to.be(text);
}, FS_WAIT);
});
}).timeout(TIMEOUT);
});

BIN
tests/assets/images/simple.gif

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1011 B

BIN
tests/assets/images/simple.webp

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.7 KiB

5
tests/constants.js

File diff suppressed because one or more lines are too long

8
tests/recognize.test.js

@ -69,9 +69,11 @@ describe('recognize()', () => {
const { data: { text } } = await worker.recognize( const { data: { text } } = await worker.recognize(
`${IMAGE_PATH}/${name}`, `${IMAGE_PATH}/${name}`,
{ {
rectangle: { rectangles: [
top, left, width, height, {
}, top, left, width, height,
},
],
}, },
); );
expect(text).to.be(ans); expect(text).to.be(ans);

Loading…
Cancel
Save