Compare commits

...

20 Commits

  1. 30
      README.md
  2. 25
      docs/image-format.md
  3. 1
      examples/browser/demo.html
  4. 230
      package-lock.json
  5. 9
      package.json
  6. 13
      scripts/rollup.esm.js
  7. 1
      src/constants/PSM.js
  8. 19
      src/index.d.ts
  9. 2
      src/utils/resolvePaths.js
  10. 27
      src/worker-script/browser/getCore.js
  11. 17
      src/worker-script/index.js
  12. 11
      src/worker-script/node/getCore.js
  13. 6
      src/worker-script/utils/setImage.js
  14. 4
      src/worker/browser/defaultOptions.js
  15. BIN
      tests/assets/images/simple.gif
  16. BIN
      tests/assets/images/simple.webp
  17. 2
      tests/constants.js

30
README.md

@ -12,13 +12,6 @@ @@ -12,13 +12,6 @@
[![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
[![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
<h3 align="center">
Version 2 is now available and under development in the master branch, read a story about v2: <a href="https://jeromewu.github.io/why-i-refactor-tesseract.js-v2/">Why I refactor tesseract.js v2?</a><br>
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
</h3>
<br>
Tesseract.js is a JavaScript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/))
Image Recognition
@ -69,6 +62,16 @@ const worker = createWorker({ @@ -69,6 +62,16 @@ const worker = createWorker({
[Check out the docs](#documentation) for a full explanation of the API.
## Major changes in v3
- Significantly faster performance
- Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data)
- Upgrade to Tesseract v5.1.0 (using emscripten 3.1.18)
- Added SIMD-enabled build for supported devices
- Added support:
- Node.js version 18
- Removed support:
- ASM.js version, any other old versions of Tesseract.js-core (<3.0.0)
- Node.js versions 10 and 12
## Major changes in v2
- Upgrade to tesseract v4.1.1 (using emscripten 1.39.10 upstream)
@ -77,7 +80,8 @@ const worker = createWorker({ @@ -77,7 +80,8 @@ const worker = createWorker({
- Support WebAssembly (falls back to ASM.js when the browser doesn't support it)
- Support Typescript
Read a story about v2: <a href="https://jeromewu.github.io/why-i-refactor-tesseract.js-v2/">Why I refactor tesseract.js v2?</a><br>
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
## Installation
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`.
@ -95,16 +99,16 @@ After including the script the `Tesseract` variable will be globally available. @@ -95,16 +99,16 @@ After including the script the `Tesseract` variable will be globally available.
### Node.js
**Tesseract.js currently requires Node.js v6.8.0 or higher**
**Tesseract.js v3 requires Node.js v14 or higher**
```shell
# For v2
# For v3
npm install tesseract.js
yarn add tesseract.js
# For v1
npm install tesseract.js@1
yarn add tesseract.js@1
# For v2
npm install tesseract.js@2
yarn add tesseract.js@2
```

25
docs/image-format.md

@ -1,17 +1,18 @@ @@ -1,17 +1,18 @@
# Image Format
Supported Formats: **bmp, jpg, png, pbm**
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter. The image formats and data types supported are listed below.
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter, which should be something that is like an image. What's considered "image-like" differs depending on whether it is being run from the browser or through NodeJS.
Supported Image Formats: **bmp, jpg, png, pbm, webp**
On a browser, an image can be:
- an `img`, `video`, or `canvas` element
- a `File` object (from a file `<input>`)
- a `Blob` object
- a path or URL to an accessible image
- a base64 encoded image that fits the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser and Node, supported data types are:
- string containing a base64 encoded image (must fit the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp)
- buffer
In Node.js, an image can be:
- a path to a local image
- a Buffer storing a binary image
- a base64 encoded image that fits the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser only, supported data types are:
- `File` or `Blob` object
- `img` or `canvas` element
For Node only, supported data types are:
- string containing a path to local image
Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported.

1
examples/browser/demo.html

@ -71,7 +71,6 @@ async function recognizeFile(file) { @@ -71,7 +71,6 @@ async function recognizeFile(file) {
<option value='meme' > Internet Meme </option>
<option value='epo' > Esperanto </option>
<option value='epo_alt' > Esperanto alternative </option>
<option value='equ' > Math </option>
<option value='est' > Estonian </option>
<option value='eus' > Basque </option>
<option value='fin' > Finnish </option>

230
package-lock.json generated

@ -1,12 +1,12 @@ @@ -1,12 +1,12 @@
{
"name": "tesseract.js",
"version": "2.1.5",
"version": "3.0.3",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "tesseract.js",
"version": "2.1.5",
"version": "3.0.3",
"hasInstallScript": true,
"license": "Apache-2.0",
"dependencies": {
@ -20,12 +20,14 @@ @@ -20,12 +20,14 @@
"opencollective-postinstall": "^2.0.2",
"regenerator-runtime": "^0.13.3",
"resolve-url": "^0.2.1",
"tesseract.js-core": "^3.0.1",
"tesseract.js-core": "^3.0.2",
"wasm-feature-detect": "^1.2.11",
"zlibjs": "^0.3.1"
},
"devDependencies": {
"@babel/core": "^7.18.7",
"@babel/preset-env": "^7.18.7",
"@rollup/plugin-commonjs": "^22.0.2",
"acorn": "^6.4.0",
"babel-loader": "^8.2.0",
"buffer": "^6.0.3",
@ -40,6 +42,7 @@ @@ -40,6 +42,7 @@
"npm-run-all": "^4.1.5",
"nyc": "^15.1.0",
"rimraf": "^2.7.1",
"rollup": "^2.79.0",
"wait-on": "^3.3.0",
"webpack": "^5.74.0",
"webpack-bundle-analyzer": "^4.6.0",
@ -2058,6 +2061,62 @@ @@ -2058,6 +2061,62 @@
"integrity": "sha512-a5Sab1C4/icpTZVzZc5Ghpz88yQtGOyNqYXcZgOssB2uuAr+wF/MvN6bgtW32q7HHrvBki+BsZ0OuNv6EV3K9g==",
"dev": true
},
"node_modules/@rollup/plugin-commonjs": {
"version": "22.0.2",
"resolved": "https://registry.npmjs.org/@rollup/plugin-commonjs/-/plugin-commonjs-22.0.2.tgz",
"integrity": "sha512-//NdP6iIwPbMTcazYsiBMbJW7gfmpHom33u1beiIoHDEM0Q9clvtQB1T0efvMqHeKsGohiHo97BCPCkBXdscwg==",
"dev": true,
"dependencies": {
"@rollup/pluginutils": "^3.1.0",
"commondir": "^1.0.1",
"estree-walker": "^2.0.1",
"glob": "^7.1.6",
"is-reference": "^1.2.1",
"magic-string": "^0.25.7",
"resolve": "^1.17.0"
},
"engines": {
"node": ">= 12.0.0"
},
"peerDependencies": {
"rollup": "^2.68.0"
}
},
"node_modules/@rollup/plugin-commonjs/node_modules/estree-walker": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz",
"integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==",
"dev": true
},
"node_modules/@rollup/pluginutils": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/@rollup/pluginutils/-/pluginutils-3.1.0.tgz",
"integrity": "sha512-GksZ6pr6TpIjHm8h9lSQ8pi8BE9VeubNT0OMJ3B5uZJ8pz73NPiqOtCog/x2/QzM1ENChPKxMDhiQuRHsqc+lg==",
"dev": true,
"dependencies": {
"@types/estree": "0.0.39",
"estree-walker": "^1.0.1",
"picomatch": "^2.2.2"
},
"engines": {
"node": ">= 8.0.0"
},
"peerDependencies": {
"rollup": "^1.20.0||^2.0.0"
}
},
"node_modules/@rollup/pluginutils/node_modules/@types/estree": {
"version": "0.0.39",
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-0.0.39.tgz",
"integrity": "sha512-EYNwp3bU+98cpU4lAWYYL7Zz+2gryWH1qbdDTidVd6hkiR6weksdbMadyXKXNPEkQFhXM+hVO9ZygomHXp+AIw==",
"dev": true
},
"node_modules/@rollup/pluginutils/node_modules/estree-walker": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-1.0.1.tgz",
"integrity": "sha512-1fMXF3YP4pZZVozF8j/ZLfvnR8NSIljt56UhbZ5PeeDmmGHpgpdwQt7ITlGvYaQukCvuBRMLEiKiYC+oeIg4cg==",
"dev": true
},
"node_modules/@types/eslint": {
"version": "8.4.6",
"resolved": "https://registry.npmjs.org/@types/eslint/-/eslint-8.4.6.tgz",
@ -5364,6 +5423,15 @@ @@ -5364,6 +5423,15 @@
"node": ">=0.10.0"
}
},
"node_modules/is-reference": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/is-reference/-/is-reference-1.2.1.tgz",
"integrity": "sha512-U82MsXXiFIrjCK4otLT+o2NA2Cd2g5MLoOVXUZjIOhLurrRxpEXzI8O0KZHr3IjLvlAH1kTPYSuqer5T9ZVBKQ==",
"dev": true,
"dependencies": {
"@types/estree": "*"
}
},
"node_modules/is-regex": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
@ -6070,6 +6138,15 @@ @@ -6070,6 +6138,15 @@
"node": ">=8"
}
},
"node_modules/magic-string": {
"version": "0.25.9",
"resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.25.9.tgz",
"integrity": "sha512-RmF0AsMzgt25qzqqLc1+MbHmhdx0ojF2Fvs4XnOqz2ZOBXzzkEwc/dJQZCYHAn7v1jbVOjAZfK8msRn4BxO4VQ==",
"dev": true,
"dependencies": {
"sourcemap-codec": "^1.4.8"
}
},
"node_modules/make-dir": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/make-dir/-/make-dir-3.1.0.tgz",
@ -7724,6 +7801,35 @@ @@ -7724,6 +7801,35 @@
"url": "https://github.com/sponsors/isaacs"
}
},
"node_modules/rollup": {
"version": "2.79.0",
"resolved": "https://registry.npmjs.org/rollup/-/rollup-2.79.0.tgz",
"integrity": "sha512-x4KsrCgwQ7ZJPcFA/SUu6QVcYlO7uRLfLAy0DSA4NS2eG8japdbpM50ToH7z4iObodRYOJ0soneF0iaQRJ6zhA==",
"dev": true,
"bin": {
"rollup": "dist/bin/rollup"
},
"engines": {
"node": ">=10.0.0"
},
"optionalDependencies": {
"fsevents": "~2.3.2"
}
},
"node_modules/rollup/node_modules/fsevents": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
"dev": true,
"hasInstallScript": true,
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
}
},
"node_modules/rx": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/rx/-/rx-4.1.0.tgz",
@ -8002,6 +8108,12 @@ @@ -8002,6 +8108,12 @@
"source-map": "^0.6.0"
}
},
"node_modules/sourcemap-codec": {
"version": "1.4.8",
"resolved": "https://registry.npmjs.org/sourcemap-codec/-/sourcemap-codec-1.4.8.tgz",
"integrity": "sha512-9NykojV5Uih4lgo5So5dtw+f0JgJX30KCNI8gwhz2J9A15wD0Ml6tjHKwf6fTSa6fAdVBdZeNOs9eJ71qCk8vA==",
"dev": true
},
"node_modules/spawn-wrap": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/spawn-wrap/-/spawn-wrap-2.0.0.tgz",
@ -8388,9 +8500,9 @@ @@ -8388,9 +8500,9 @@
}
},
"node_modules/tesseract.js-core": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-3.0.1.tgz",
"integrity": "sha512-kcEGcZG4Vl8tmdsPgLacPYoXFRo8IhG9SQxTLbK/6vG+aVTwD89voIJs1KgL3x+RcWDR7sNmd2sMVcpgcLBMMg=="
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-3.0.2.tgz",
"integrity": "sha512-2fD76ka9nO/C616R0fq+M9Zu91DA3vEfyozp0jlxaJOBmpfeprtgRP3cqVweZh2darE1kK/DazoxZ65g7WU99Q=="
},
"node_modules/test-exclude": {
"version": "6.0.0",
@ -8740,6 +8852,11 @@ @@ -8740,6 +8852,11 @@
"node": ">=4.0.0"
}
},
"node_modules/wasm-feature-detect": {
"version": "1.2.11",
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.2.11.tgz",
"integrity": "sha512-HUqwaodrQGaZgz1lZaNioIkog9tkeEJjrM3eq4aUL04whXOVDRc/o2EGb/8kV0QX411iAYWEqq7fMBmJ6dKS6w=="
},
"node_modules/watchpack": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.4.0.tgz",
@ -10960,6 +11077,54 @@ @@ -10960,6 +11077,54 @@
"integrity": "sha512-a5Sab1C4/icpTZVzZc5Ghpz88yQtGOyNqYXcZgOssB2uuAr+wF/MvN6bgtW32q7HHrvBki+BsZ0OuNv6EV3K9g==",
"dev": true
},
"@rollup/plugin-commonjs": {
"version": "22.0.2",
"resolved": "https://registry.npmjs.org/@rollup/plugin-commonjs/-/plugin-commonjs-22.0.2.tgz",
"integrity": "sha512-//NdP6iIwPbMTcazYsiBMbJW7gfmpHom33u1beiIoHDEM0Q9clvtQB1T0efvMqHeKsGohiHo97BCPCkBXdscwg==",
"dev": true,
"requires": {
"@rollup/pluginutils": "^3.1.0",
"commondir": "^1.0.1",
"estree-walker": "^2.0.1",
"glob": "^7.1.6",
"is-reference": "^1.2.1",
"magic-string": "^0.25.7",
"resolve": "^1.17.0"
},
"dependencies": {
"estree-walker": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz",
"integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==",
"dev": true
}
}
},
"@rollup/pluginutils": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/@rollup/pluginutils/-/pluginutils-3.1.0.tgz",
"integrity": "sha512-GksZ6pr6TpIjHm8h9lSQ8pi8BE9VeubNT0OMJ3B5uZJ8pz73NPiqOtCog/x2/QzM1ENChPKxMDhiQuRHsqc+lg==",
"dev": true,
"requires": {
"@types/estree": "0.0.39",
"estree-walker": "^1.0.1",
"picomatch": "^2.2.2"
},
"dependencies": {
"@types/estree": {
"version": "0.0.39",
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-0.0.39.tgz",
"integrity": "sha512-EYNwp3bU+98cpU4lAWYYL7Zz+2gryWH1qbdDTidVd6hkiR6weksdbMadyXKXNPEkQFhXM+hVO9ZygomHXp+AIw==",
"dev": true
},
"estree-walker": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-1.0.1.tgz",
"integrity": "sha512-1fMXF3YP4pZZVozF8j/ZLfvnR8NSIljt56UhbZ5PeeDmmGHpgpdwQt7ITlGvYaQukCvuBRMLEiKiYC+oeIg4cg==",
"dev": true
}
}
},
"@types/eslint": {
"version": "8.4.6",
"resolved": "https://registry.npmjs.org/@types/eslint/-/eslint-8.4.6.tgz",
@ -13475,6 +13640,15 @@ @@ -13475,6 +13640,15 @@
"isobject": "^3.0.1"
}
},
"is-reference": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/is-reference/-/is-reference-1.2.1.tgz",
"integrity": "sha512-U82MsXXiFIrjCK4otLT+o2NA2Cd2g5MLoOVXUZjIOhLurrRxpEXzI8O0KZHr3IjLvlAH1kTPYSuqer5T9ZVBKQ==",
"dev": true,
"requires": {
"@types/estree": "*"
}
},
"is-regex": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
@ -14016,6 +14190,15 @@ @@ -14016,6 +14190,15 @@
}
}
},
"magic-string": {
"version": "0.25.9",
"resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.25.9.tgz",
"integrity": "sha512-RmF0AsMzgt25qzqqLc1+MbHmhdx0ojF2Fvs4XnOqz2ZOBXzzkEwc/dJQZCYHAn7v1jbVOjAZfK8msRn4BxO4VQ==",
"dev": true,
"requires": {
"sourcemap-codec": "^1.4.8"
}
},
"make-dir": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/make-dir/-/make-dir-3.1.0.tgz",
@ -15265,6 +15448,24 @@ @@ -15265,6 +15448,24 @@
}
}
},
"rollup": {
"version": "2.79.0",
"resolved": "https://registry.npmjs.org/rollup/-/rollup-2.79.0.tgz",
"integrity": "sha512-x4KsrCgwQ7ZJPcFA/SUu6QVcYlO7uRLfLAy0DSA4NS2eG8japdbpM50ToH7z4iObodRYOJ0soneF0iaQRJ6zhA==",
"dev": true,
"requires": {
"fsevents": "~2.3.2"
},
"dependencies": {
"fsevents": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
"dev": true,
"optional": true
}
}
},
"rx": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/rx/-/rx-4.1.0.tgz",
@ -15497,6 +15698,12 @@ @@ -15497,6 +15698,12 @@
"source-map": "^0.6.0"
}
},
"sourcemap-codec": {
"version": "1.4.8",
"resolved": "https://registry.npmjs.org/sourcemap-codec/-/sourcemap-codec-1.4.8.tgz",
"integrity": "sha512-9NykojV5Uih4lgo5So5dtw+f0JgJX30KCNI8gwhz2J9A15wD0Ml6tjHKwf6fTSa6fAdVBdZeNOs9eJ71qCk8vA==",
"dev": true
},
"spawn-wrap": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/spawn-wrap/-/spawn-wrap-2.0.0.tgz",
@ -15782,9 +15989,9 @@ @@ -15782,9 +15989,9 @@
}
},
"tesseract.js-core": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-3.0.1.tgz",
"integrity": "sha512-kcEGcZG4Vl8tmdsPgLacPYoXFRo8IhG9SQxTLbK/6vG+aVTwD89voIJs1KgL3x+RcWDR7sNmd2sMVcpgcLBMMg=="
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-3.0.2.tgz",
"integrity": "sha512-2fD76ka9nO/C616R0fq+M9Zu91DA3vEfyozp0jlxaJOBmpfeprtgRP3cqVweZh2darE1kK/DazoxZ65g7WU99Q=="
},
"test-exclude": {
"version": "6.0.0",
@ -16052,6 +16259,11 @@ @@ -16052,6 +16259,11 @@
"rx": "^4.1.0"
}
},
"wasm-feature-detect": {
"version": "1.2.11",
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.2.11.tgz",
"integrity": "sha512-HUqwaodrQGaZgz1lZaNioIkog9tkeEJjrM3eq4aUL04whXOVDRc/o2EGb/8kV0QX411iAYWEqq7fMBmJ6dKS6w=="
},
"watchpack": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.4.0.tgz",

9
package.json

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
{
"name": "tesseract.js",
"version": "2.1.5",
"version": "3.0.3",
"description": "Pure Javascript Multilingual OCR",
"main": "src/index.js",
"types": "src/index.d.ts",
@ -8,7 +8,7 @@ @@ -8,7 +8,7 @@
"jsdelivr": "dist/tesseract.min.js",
"scripts": {
"start": "node scripts/server.js",
"build": "rimraf dist && webpack --config scripts/webpack.config.prod.js",
"build": "rimraf dist && webpack --config scripts/webpack.config.prod.js && rollup -c scripts/rollup.esm.js",
"profile:tesseract": "webpack-bundle-analyzer dist/tesseract-stats.json",
"profile:worker": "webpack-bundle-analyzer dist/worker-stats.json",
"prepublishOnly": "npm run build",
@ -37,6 +37,7 @@ @@ -37,6 +37,7 @@
"devDependencies": {
"@babel/core": "^7.18.7",
"@babel/preset-env": "^7.18.7",
"@rollup/plugin-commonjs": "^22.0.2",
"acorn": "^6.4.0",
"babel-loader": "^8.2.0",
"buffer": "^6.0.3",
@ -51,6 +52,7 @@ @@ -51,6 +52,7 @@
"npm-run-all": "^4.1.5",
"nyc": "^15.1.0",
"rimraf": "^2.7.1",
"rollup": "^2.79.0",
"wait-on": "^3.3.0",
"webpack": "^5.74.0",
"webpack-bundle-analyzer": "^4.6.0",
@ -68,7 +70,8 @@ @@ -68,7 +70,8 @@
"opencollective-postinstall": "^2.0.2",
"regenerator-runtime": "^0.13.3",
"resolve-url": "^0.2.1",
"tesseract.js-core": "^3.0.1",
"tesseract.js-core": "^3.0.2",
"wasm-feature-detect": "^1.2.11",
"zlibjs": "^0.3.1"
},
"repository": {

13
scripts/rollup.esm.js

@ -0,0 +1,13 @@ @@ -0,0 +1,13 @@
import commonjs from "@rollup/plugin-commonjs";
// Rollup build description: takes the existing dist/tesseract.min.js bundle
// as input and writes an ES-module flavoured copy next to it.
const esmBundle = {
  input: "dist/tesseract.min.js",
  output: {
    file: "dist/tesseract.esm.min.js",
    format: "esm",
    // Emitted at the very top of the generated file.
    banner: "/* eslint-disable */",
  },
  // The input is not an ES module, so it goes through the commonjs plugin.
  plugins: [commonjs()],
};

// Rollup accepts an array of configurations; this file defines only one.
export default [esmBundle];

1
src/constants/PSM.js

@ -15,4 +15,5 @@ module.exports = { @@ -15,4 +15,5 @@ module.exports = {
SINGLE_CHAR: '10',
SPARSE_TEXT: '11',
SPARSE_TEXT_OSD: '12',
RAW_LINE: '13',
};

19
src/index.d.ts vendored

@ -19,12 +19,18 @@ declare namespace Tesseract { @@ -19,12 +19,18 @@ declare namespace Tesseract {
readText(path: string, jobId?: string): Promise<ConfigResult>
removeText(path: string, jobId?: string): Promise<ConfigResult>
FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string, jobId?: string): Promise<ConfigResult>
initialize(langs?: string, oem?: OEM, jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult>
initialize(langs?: string | Lang[], oem?: OEM, jobId?: string): Promise<ConfigResult>
setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult>
detect(image: ImageLike, jobId?: string): Promise<DetectResult>
terminate(jobId?: string): Promise<ConfigResult>
getPDF(title?: string, textonly?: boolean, jobId?: string):Promise<GetPDFResult>
}
interface Lang {
code: string;
data: unknown;
}
interface WorkerOptions {
@ -62,6 +68,10 @@ declare namespace Tesseract { @@ -62,6 +68,10 @@ declare namespace Tesseract {
jobId: string
data: Page
}
interface GetPDFResult {
jobId: string
data: number[]
}
interface DetectResult {
jobId: string
data: DetectData
@ -79,13 +89,13 @@ declare namespace Tesseract { @@ -79,13 +89,13 @@ declare namespace Tesseract {
width: number
height: number
}
const enum OEM {
enum OEM {
TESSERACT_ONLY,
LSTM_ONLY,
TESSERACT_LSTM_COMBINED,
DEFAULT,
}
const enum PSM {
enum PSM {
OSD_ONLY = '0',
AUTO_OSD = '1',
AUTO_ONLY = '2',
@ -99,6 +109,7 @@ declare namespace Tesseract { @@ -99,6 +109,7 @@ declare namespace Tesseract {
SINGLE_CHAR = '10',
SPARSE_TEXT = '11',
SPARSE_TEXT_OSD = '12',
RAW_LINE = '13'
}
type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement
| CanvasRenderingContext2D | File | Blob | ImageData | Buffer;

2
src/utils/resolvePaths.js

@ -4,7 +4,7 @@ const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disabl @@ -4,7 +4,7 @@ const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disabl
module.exports = (options) => {
const opts = { ...options };
['corePath', 'workerPath', 'langPath'].forEach((key) => {
if (typeof options[key] !== 'undefined') {
if (options[key]) {
opts[key] = resolveURL(opts[key]);
}
});

27
src/worker-script/browser/getCore.js

@ -1,15 +1,26 @@ @@ -1,15 +1,26 @@
module.exports = (corePath, res) => {
const { simd } = require('wasm-feature-detect');
const { dependencies } = require('../../../package.json');
module.exports = async (corePath, res) => {
if (typeof global.TesseractCore === 'undefined') {
res.progress({ status: 'loading tesseract core', progress: 0 });
global.importScripts(corePath);
/*
* Depending on whether the browser supports WebAssembly,
* the version of the TesseractCore will be different.
*/
// If the user specifies a core path, we use that
// Otherwise, we detect the correct core based on SIMD support
let corePathImport = corePath;
if (!corePathImport) {
const simdSupport = await simd();
if (simdSupport) {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core-simd.wasm.js`;
} else {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.wasm.js`;
}
}
global.importScripts(corePathImport);
if (typeof global.TesseractCoreWASM !== 'undefined' && typeof WebAssembly === 'object') {
global.TesseractCore = global.TesseractCoreWASM;
} else if (typeof global.TesseractCoreASM !== 'undefined') {
global.TesseractCore = global.TesseractCoreASM;
} else {
throw Error('Failed to load TesseractCore');
}

17
src/worker-script/index.js

@ -28,10 +28,10 @@ let latestJob; @@ -28,10 +28,10 @@ let latestJob;
let adapter = {};
let params = defaultParams;
const load = ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
setLogging(logging);
if (!TessModule) {
const Core = adapter.getCore(corePath, res);
const Core = await adapter.getCore(corePath, res);
res.progress({ workerId, status: 'initializing tesseract', progress: 0 });
@ -144,16 +144,8 @@ res) => { @@ -144,16 +144,8 @@ res) => {
res.progress({ workerId, status: 'loaded language traineddata', progress: 1 });
res.resolve(langs);
} catch (err) {
if (isWebWorker && err instanceof DOMException) {
/*
* For some reason google chrome throw DOMException in loadLang,
* while other browser is OK, for now we ignore this exception
* and hopefully to find the root cause one day.
*/
} else {
res.reject(err.toString());
}
}
};
const setParameters = ({ payload: { params: _params } }, res) => {
@ -185,7 +177,10 @@ const initialize = ({ @@ -185,7 +177,10 @@ const initialize = ({
api.End();
}
api = new TessModule.TessBaseAPI();
api.Init(null, langs, oem);
const status = api.Init(null, langs, oem);
if (status === -1) {
res.reject('initialization failed');
}
params = defaultParams;
setParameters({ payload: { params } });
res.progress({

11
src/worker-script/node/getCore.js

@ -1,12 +1,19 @@ @@ -1,12 +1,19 @@
const { simd } = require('wasm-feature-detect');
let TesseractCore = null;
/*
* getCore is an async function that loads and resolves to
* TesseractCore (it awaits SIMD feature detection first).
*/
module.exports = (_, res) => {
module.exports = async (_, res) => {
if (TesseractCore === null) {
const simdSupport = await simd();
res.progress({ status: 'loading tesseract core', progress: 0 });
TesseractCore = require('tesseract.js-core');
if (simdSupport) {
TesseractCore = require('tesseract.js-core/tesseract-core-simd');
} else {
TesseractCore = require('tesseract.js-core/tesseract-core');
}
res.progress({ status: 'loaded tesseract core', progress: 1 });
}
return TesseractCore;

6
src/worker-script/utils/setImage.js

@ -20,9 +20,9 @@ module.exports = (TessModule, api, image) => { @@ -20,9 +20,9 @@ module.exports = (TessModule, api, image) => {
const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1;
/*
* Although leptonica should support reading bmp, there is a bug of "compressed BMP files".
* As there is no solution, we need to use bmp-js for now.
* @see https://groups.google.com/forum/#!topic/tesseract-ocr/4mPD9zTxdxE
* Leptonica supports uncompressed but not compressed bmp files
* @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516
* We therefore use bmp-js to process all bmp files
*/
if (type && type.mime === 'image/bmp') {
const bmpBuf = bmp.decode(buf);

4
src/worker/browser/defaultOptions.js

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
const resolveURL = require('resolve-url');
const { version, dependencies } = require('../../../package.json');
const { version } = require('../../../package.json');
const defaultOptions = require('../../constants/defaultOptions');
/*
@ -14,5 +14,5 @@ module.exports = { @@ -14,5 +14,5 @@ module.exports = {
* If browser doesn't support WebAssembly,
* load ASM version instead
*/
corePath: `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
corePath: null,
};

BIN
tests/assets/images/simple.gif

Binary file not shown.

After

Width:  |  Height:  |  Size: 1011 B

BIN
tests/assets/images/simple.webp

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

2
tests/constants.js

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save