Compare commits

..

1 Commits

Author SHA1 Message Date
dependabot[bot] 965bd8a5a4
Bump y18n from 4.0.0 to 4.0.1 4 years ago
  1. 4
      .eslintrc
  2. 2
      .github/FUNDING.yml
  3. 2
      .github/workflows/node.js.yml
  4. 30
      README.md
  5. 25
      docs/image-format.md
  6. 33
      examples/browser/benchmark.html
  7. 1
      examples/browser/demo.html
  8. BIN
      examples/data/meditations.jpg
  9. BIN
      examples/data/testocr.png
  10. BIN
      examples/data/tyger.jpg
  11. 27
      examples/node/benchmark.js
  12. 17978
      package-lock.json
  13. 27
      package.json
  14. 13
      scripts/rollup.esm.js
  15. 5
      scripts/webpack.config.common.js
  16. 3
      scripts/webpack.config.dev.js
  17. 6
      scripts/webpack.config.prod.js
  18. 1
      src/constants/PSM.js
  19. 19
      src/index.d.ts
  20. 2
      src/utils/resolvePaths.js
  21. 27
      src/worker-script/browser/getCore.js
  22. 27
      src/worker-script/index.js
  23. 11
      src/worker-script/node/getCore.js
  24. 5
      src/worker-script/node/index.js
  25. 12
      src/worker-script/utils/setImage.js
  26. 4
      src/worker/browser/defaultOptions.js
  27. 36
      src/worker/browser/loadImage.js
  28. 5
      src/worker/node/loadImage.js
  29. 4
      src/worker/node/send.js
  30. 9
      src/worker/node/spawnWorker.js
  31. 2
      src/worker/node/terminateWorker.js
  32. BIN
      tests/assets/images/simple.gif
  33. BIN
      tests/assets/images/simple.webp
  34. 4
      tests/constants.js

4
.eslintrc

@ -1,6 +1,5 @@
{ {
"extends": "airbnb-base", "extends": "airbnb-base",
"parser": "babel-eslint",
"env": { "env": {
"browser": true, "browser": true,
"node": true, "node": true,
@ -11,7 +10,6 @@
"no-underscore-dangle": 0, "no-underscore-dangle": 0,
"no-console": 0, "no-console": 0,
"global-require": 0, "global-require": 0,
"camelcase": 0, "camelcase": 0
"no-control-regex": 0
} }
} }

2
.github/FUNDING.yml

@ -6,4 +6,4 @@ open_collective: tesseractjs
ko_fi: # Replace with a single Ko-fi username ko_fi: # Replace with a single Ko-fi username
tidelift: npm/tesseract.js tidelift: npm/tesseract.js
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
custom: ["https://etherscan.io/address/0x74ace8c74535d6dac03ebdc708ca2fba54796ef2"] custom: ["https://etherscan.io/address/0x74ace8c74535d6dac03ebdc708ca2fba54796ef2", "https://www.paypal.me/jeromewusg"]

2
.github/workflows/node.js.yml

@ -16,7 +16,7 @@ jobs:
strategy: strategy:
matrix: matrix:
node-version: [14.x, 16.x] node-version: [10.x, 12.x, 14.x]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2

30
README.md

@ -12,6 +12,13 @@
[![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js) [![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
[![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js) [![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
<h3 align="center">
Version 2 is now available and under development in the master branch, read a story about v2: <a href="https://jeromewu.github.io/why-i-refactor-tesseract.js-v2/">Why I refactor tesseract.js v2?</a><br>
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
</h3>
<br>
Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/)) Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/))
Image Recognition Image Recognition
@ -62,16 +69,6 @@ const worker = createWorker({
[Check out the docs](#documentation) for a full explanation of the API. [Check out the docs](#documentation) for a full explanation of the API.
## Major changes in v3
- Significantly faster performance
- Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data)
- Upgrade to Tesseract v5.1.0 (using emscripten 3.1.18)
- Added SIMD-enabled build for supported devices
- Added support:
- Node.js version 18
- Removed support:
- ASM.js version, any other old versions of Tesseract.js-core (<3.0.0)
- Node.js versions 10 and 12
## Major changes in v2 ## Major changes in v2
- Upgrade to tesseract v4.1.1 (using emscripten 1.39.10 upstream) - Upgrade to tesseract v4.1.1 (using emscripten 1.39.10 upstream)
@ -80,8 +77,7 @@ const worker = createWorker({
- Support WebAssembly (fallback to ASM.js when browser doesn't support) - Support WebAssembly (fallback to ASM.js when browser doesn't support)
- Support Typescript - Support Typescript
Read a story about v2: <a href="https://jeromewu.github.io/why-i-refactor-tesseract.js-v2/">Why I refactor tesseract.js v2?</a><br>
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
## Installation ## Installation
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`. Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`.
@ -99,16 +95,16 @@ After including the script the `Tesseract` variable will be globally available.
### Node.js ### Node.js
**Tesseract.js v3 requires Node.js v14 or higher** **Tesseract.js currently requires Node.js v6.8.0 or higher**
```shell ```shell
# For v3 # For v2
npm install tesseract.js npm install tesseract.js
yarn add tesseract.js yarn add tesseract.js
# For v2 # For v1
npm install tesseract.js@2 npm install tesseract.js@1
yarn add tesseract.js@2 yarn add tesseract.js@1
``` ```

25
docs/image-format.md

@ -1,18 +1,17 @@
# Image Format # Image Format
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter. The image formats and data types supported are listed below. Support Format: **bmp, jpg, png, pbm**
Support Image Formats: **bmp, jpg, png, pbm, webp** The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter, which should be something that is like an image. What's considered "image-like" differs depending on whether it is being run from the browser or through NodeJS.
For browser and Node, supported data types are: On a browser, an image can be:
- string with base64 encoded image (fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp) - an `img`, `video`, or `canvas` element
- buffer - a `File` object (from a file `<input>`)
- a `Blob` object
- a path or URL to an accessible image
- a base64 encoded image that matches the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser only, supported data types are: In Node.js, an image can be
- `File` or `Blob` object - a path to a local image
- `img` or `canvas` element - a Buffer storing binary image
- a base64 encoded image that matches the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For Node only, supported data types are:
- string containing a path to local image
Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported.

33
examples/browser/benchmark.html

@ -1,33 +0,0 @@
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<textarea id="message">Working...</textarea>
<script>
// Browser benchmark: recognizes each sample image 10 times with a single
// worker and appends the per-image and total wall-clock runtimes (in seconds)
// to the #message textarea.
const { createWorker } = Tesseract;
const worker = createWorker();
(async () => {
  const output = document.getElementById('message');
  // Append through `value` rather than `innerHTML`: for a <textarea>,
  // innerHTML only rewrites the default content, which is no longer displayed
  // once the control has been edited, and it would parse the text as markup.
  const log = (line) => {
    output.value += `\n${line}`;
  };

  try {
    await worker.load();
    await worker.loadLanguage('eng');
    await worker.initialize('eng');
    const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"];
    let timeTotal = 0;
    for (const file of fileArr) {
      const time1 = Date.now();
      // Recognitions run strictly one after another so the timings are not
      // skewed by overlapping jobs.
      for (let i = 0; i < 10; i++) {
        await worker.recognize(file);
      }
      const timeDif = (Date.now() - time1) / 1e3;
      timeTotal += timeDif;
      log(`${file} [x10] runtime: ${timeDif}s`);
    }
    log(`Total runtime: ${timeTotal}s`);
  } finally {
    // Release the worker even when a step fails, as the Node benchmark does.
    await worker.terminate();
  }
})();
</script>
</body>
</html>

1
examples/browser/demo.html

@ -71,6 +71,7 @@ async function recognizeFile(file) {
<option value='meme' > Internet Meme </option> <option value='meme' > Internet Meme </option>
<option value='epo' > Esperanto </option> <option value='epo' > Esperanto </option>
<option value='epo_alt' > Esperanto alternative </option> <option value='epo_alt' > Esperanto alternative </option>
<option value='equ' > Math </option>
<option value='est' > Estonian </option> <option value='est' > Estonian </option>
<option value='eus' > Basque </option> <option value='eus' > Basque </option>
<option value='fin' > Finnish </option> <option value='fin' > Finnish </option>

BIN
examples/data/meditations.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1011 KiB

BIN
examples/data/testocr.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

BIN
examples/data/tyger.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 408 KiB

27
examples/node/benchmark.js

@ -1,27 +0,0 @@
#!/usr/bin/env node
/*
 * Node.js benchmark: recognizes each sample image 10 times with a single
 * worker and prints the per-image and total wall-clock runtimes in seconds.
 */
const path = require('path');
const { createWorker } = require('../../');

const worker = createWorker();

(async () => {
  try {
    await worker.load();
    await worker.loadLanguage('eng');
    await worker.initialize('eng');
    const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"];
    let timeTotal = 0;
    for (const file of fileArr) {
      // Resolve against this script's directory so the benchmark does not
      // depend on the directory it is launched from. The relative form is
      // kept for the log output.
      const imagePath = path.resolve(__dirname, file);
      const time1 = Date.now();
      // Recognitions run strictly one after another so the timings are not
      // skewed by overlapping jobs.
      for (let i = 0; i < 10; i++) {
        await worker.recognize(imagePath);
      }
      const timeDif = (Date.now() - time1) / 1e3;
      timeTotal += timeDif;
      console.log(`${file} [x10] runtime: ${timeDif}s`);
    }
    console.log(`Total runtime: ${timeTotal}s`);
  } finally {
    // Always shut the worker down, even when a step above throws.
    await worker.terminate();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

17978
package-lock.json generated

File diff suppressed because it is too large Load Diff

27
package.json

@ -1,6 +1,6 @@
{ {
"name": "tesseract.js", "name": "tesseract.js",
"version": "3.0.3", "version": "2.1.4",
"description": "Pure Javascript Multilingual OCR", "description": "Pure Javascript Multilingual OCR",
"main": "src/index.js", "main": "src/index.js",
"types": "src/index.d.ts", "types": "src/index.d.ts",
@ -8,7 +8,7 @@
"jsdelivr": "dist/tesseract.min.js", "jsdelivr": "dist/tesseract.min.js",
"scripts": { "scripts": {
"start": "node scripts/server.js", "start": "node scripts/server.js",
"build": "rimraf dist && webpack --config scripts/webpack.config.prod.js && rollup -c scripts/rollup.esm.js", "build": "rimraf dist && webpack --config scripts/webpack.config.prod.js",
"profile:tesseract": "webpack-bundle-analyzer dist/tesseract-stats.json", "profile:tesseract": "webpack-bundle-analyzer dist/tesseract-stats.json",
"profile:worker": "webpack-bundle-analyzer dist/worker-stats.json", "profile:worker": "webpack-bundle-analyzer dist/worker-stats.json",
"prepublishOnly": "npm run build", "prepublishOnly": "npm run build",
@ -35,12 +35,10 @@
], ],
"license": "Apache-2.0", "license": "Apache-2.0",
"devDependencies": { "devDependencies": {
"@babel/core": "^7.18.7", "@babel/core": "^7.7.7",
"@babel/preset-env": "^7.18.7", "@babel/preset-env": "^7.7.7",
"@rollup/plugin-commonjs": "^22.0.2",
"acorn": "^6.4.0", "acorn": "^6.4.0",
"babel-loader": "^8.2.0", "babel-loader": "^8.1.0",
"buffer": "^6.0.3",
"cors": "^2.8.5", "cors": "^2.8.5",
"eslint": "^7.2.0", "eslint": "^7.2.0",
"eslint-config-airbnb-base": "^14.2.0", "eslint-config-airbnb-base": "^14.2.0",
@ -52,26 +50,25 @@
"npm-run-all": "^4.1.5", "npm-run-all": "^4.1.5",
"nyc": "^15.1.0", "nyc": "^15.1.0",
"rimraf": "^2.7.1", "rimraf": "^2.7.1",
"rollup": "^2.79.0",
"wait-on": "^3.3.0", "wait-on": "^3.3.0",
"webpack": "^5.74.0", "webpack": "^4.44.2",
"webpack-bundle-analyzer": "^4.6.0", "webpack-bundle-analyzer": "^3.6.0",
"webpack-cli": "^4.10.0", "webpack-cli": "^3.3.12",
"webpack-dev-middleware": "^5.3.3" "webpack-dev-middleware": "^3.7.2"
}, },
"dependencies": { "dependencies": {
"babel-eslint": "^10.1.0", "blueimp-load-image": "^3.0.0",
"bmp-js": "^0.1.0", "bmp-js": "^0.1.0",
"file-type": "^12.4.1", "file-type": "^12.4.1",
"idb-keyval": "^3.2.0", "idb-keyval": "^3.2.0",
"is-electron": "^2.2.0", "is-electron": "^2.2.0",
"is-url": "^1.2.4", "is-url": "^1.2.4",
"jpeg-autorotate": "^7.1.1",
"node-fetch": "^2.6.0", "node-fetch": "^2.6.0",
"opencollective-postinstall": "^2.0.2", "opencollective-postinstall": "^2.0.2",
"regenerator-runtime": "^0.13.3", "regenerator-runtime": "^0.13.3",
"resolve-url": "^0.2.1", "resolve-url": "^0.2.1",
"tesseract.js-core": "^3.0.2", "tesseract.js-core": "^2.2.0",
"wasm-feature-detect": "^1.2.11",
"zlibjs": "^0.3.1" "zlibjs": "^0.3.1"
}, },
"repository": { "repository": {

13
scripts/rollup.esm.js

@ -1,13 +0,0 @@
import commonjs from "@rollup/plugin-commonjs";
// Single Rollup build: reads dist/tesseract.min.js, runs it through the
// commonjs plugin and writes an ES-module bundle to dist/tesseract.esm.min.js.
// The banner puts an eslint-disable comment at the top of the output.
const esmBundle = {
  input: "dist/tesseract.min.js",
  plugins: [commonjs()],
  output: {
    format: "esm",
    file: "dist/tesseract.esm.min.js",
    banner: "/* eslint-disable */",
  },
};

export default [esmBundle];
];

5
scripts/webpack.config.common.js

@ -1,9 +1,4 @@
module.exports = { module.exports = {
resolve: {
fallback: {
buffer: require.resolve('buffer/'),
},
},
module: { module: {
rules: [ rules: [
{ {

3
scripts/webpack.config.dev.js

@ -15,9 +15,6 @@ const genConfig = ({
libraryTarget, libraryTarget,
}, },
plugins: [ plugins: [
new webpack.ProvidePlugin({
Buffer: ['buffer', 'Buffer'],
}),
new webpack.DefinePlugin({ new webpack.DefinePlugin({
'process.env': { 'process.env': {
TESS_ENV: JSON.stringify('development'), TESS_ENV: JSON.stringify('development'),

6
scripts/webpack.config.prod.js

@ -1,6 +1,5 @@
const path = require('path'); const path = require('path');
const common = require('./webpack.config.common'); const common = require('./webpack.config.common');
const webpack = require('webpack');
const genConfig = ({ const genConfig = ({
entry, filename, library, libraryTarget, entry, filename, library, libraryTarget,
@ -15,11 +14,6 @@ const genConfig = ({
library, library,
libraryTarget, libraryTarget,
}, },
plugins: [
new webpack.ProvidePlugin({
Buffer: ['buffer', 'Buffer'],
}),
]
}); });
module.exports = [ module.exports = [

1
src/constants/PSM.js

@ -15,5 +15,4 @@ module.exports = {
SINGLE_CHAR: '10', SINGLE_CHAR: '10',
SPARSE_TEXT: '11', SPARSE_TEXT: '11',
SPARSE_TEXT_OSD: '12', SPARSE_TEXT_OSD: '12',
RAW_LINE: '13',
}; };

19
src/index.d.ts vendored

@ -19,18 +19,12 @@ declare namespace Tesseract {
readText(path: string, jobId?: string): Promise<ConfigResult> readText(path: string, jobId?: string): Promise<ConfigResult>
removeText(path: string, jobId?: string): Promise<ConfigResult> removeText(path: string, jobId?: string): Promise<ConfigResult>
FS(method: string, args: any[], jobId?: string): Promise<ConfigResult> FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult> loadLanguage(langs?: string, jobId?: string): Promise<ConfigResult>
initialize(langs?: string | Lang[], oem?: OEM, jobId?: string): Promise<ConfigResult> initialize(langs?: string, oem?: OEM, jobId?: string): Promise<ConfigResult>
setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult> setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult> recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult>
detect(image: ImageLike, jobId?: string): Promise<DetectResult> detect(image: ImageLike, jobId?: string): Promise<DetectResult>
terminate(jobId?: string): Promise<ConfigResult> terminate(jobId?: string): Promise<ConfigResult>
getPDF(title?: string, textonly?: boolean, jobId?: string):Promise<GetPDFResult>
}
interface Lang {
code: string;
data: unknown;
} }
interface WorkerOptions { interface WorkerOptions {
@ -68,10 +62,6 @@ declare namespace Tesseract {
jobId: string jobId: string
data: Page data: Page
} }
interface GetPDFResult {
jobId: string
data: number[]
}
interface DetectResult { interface DetectResult {
jobId: string jobId: string
data: DetectData data: DetectData
@ -89,13 +79,13 @@ declare namespace Tesseract {
width: number width: number
height: number height: number
} }
enum OEM { const enum OEM {
TESSERACT_ONLY, TESSERACT_ONLY,
LSTM_ONLY, LSTM_ONLY,
TESSERACT_LSTM_COMBINED, TESSERACT_LSTM_COMBINED,
DEFAULT, DEFAULT,
} }
enum PSM { const enum PSM {
OSD_ONLY = '0', OSD_ONLY = '0',
AUTO_OSD = '1', AUTO_OSD = '1',
AUTO_ONLY = '2', AUTO_ONLY = '2',
@ -109,7 +99,6 @@ declare namespace Tesseract {
SINGLE_CHAR = '10', SINGLE_CHAR = '10',
SPARSE_TEXT = '11', SPARSE_TEXT = '11',
SPARSE_TEXT_OSD = '12', SPARSE_TEXT_OSD = '12',
RAW_LINE = '13'
} }
type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement
| CanvasRenderingContext2D | File | Blob | ImageData | Buffer; | CanvasRenderingContext2D | File | Blob | ImageData | Buffer;

2
src/utils/resolvePaths.js

@ -4,7 +4,7 @@ const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disabl
module.exports = (options) => { module.exports = (options) => {
const opts = { ...options }; const opts = { ...options };
['corePath', 'workerPath', 'langPath'].forEach((key) => { ['corePath', 'workerPath', 'langPath'].forEach((key) => {
if (options[key]) { if (typeof options[key] !== 'undefined') {
opts[key] = resolveURL(opts[key]); opts[key] = resolveURL(opts[key]);
} }
}); });

27
src/worker-script/browser/getCore.js

@ -1,26 +1,15 @@
const { simd } = require('wasm-feature-detect'); module.exports = (corePath, res) => {
const { dependencies } = require('../../../package.json');
module.exports = async (corePath, res) => {
if (typeof global.TesseractCore === 'undefined') { if (typeof global.TesseractCore === 'undefined') {
res.progress({ status: 'loading tesseract core', progress: 0 }); res.progress({ status: 'loading tesseract core', progress: 0 });
global.importScripts(corePath);
// If the user specifies a core path, we use that /*
// Otherwise, we detect the correct core based on SIMD support * Depending on whether the browser supports WebAssembly,
let corePathImport = corePath; * the version of the TesseractCore will be different.
if (!corePathImport) { */
const simdSupport = await simd();
if (simdSupport) {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core-simd.wasm.js`;
} else {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.wasm.js`;
}
}
global.importScripts(corePathImport);
if (typeof global.TesseractCoreWASM !== 'undefined' && typeof WebAssembly === 'object') { if (typeof global.TesseractCoreWASM !== 'undefined' && typeof WebAssembly === 'object') {
global.TesseractCore = global.TesseractCoreWASM; global.TesseractCore = global.TesseractCoreWASM;
} else if (typeof global.TesseractCoreASM !== 'undefined') {
global.TesseractCore = global.TesseractCoreASM;
} else { } else {
throw Error('Failed to load TesseractCore'); throw Error('Failed to load TesseractCore');
} }

27
src/worker-script/index.js

@ -28,10 +28,10 @@ let latestJob;
let adapter = {}; let adapter = {};
let params = defaultParams; let params = defaultParams;
const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => { const load = ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
setLogging(logging); setLogging(logging);
if (!TessModule) { if (!TessModule) {
const Core = await adapter.getCore(corePath, res); const Core = adapter.getCore(corePath, res);
res.progress({ workerId, status: 'initializing tesseract', progress: 0 }); res.progress({ workerId, status: 'initializing tesseract', progress: 0 });
@ -72,7 +72,7 @@ const loadLanguage = async ({
}, },
}, },
}, },
res) => { res) => {
const loadAndGunzipFile = async (_lang) => { const loadAndGunzipFile = async (_lang) => {
const lang = typeof _lang === 'string' ? _lang : _lang.code; const lang = typeof _lang === 'string' ? _lang : _lang.code;
const readCache = ['refresh', 'none'].includes(cacheMethod) const readCache = ['refresh', 'none'].includes(cacheMethod)
@ -99,11 +99,7 @@ res) => {
} }
if (path !== null) { if (path !== null) {
const fetchUrl = `${path}/${lang}.traineddata${gzip ? '.gz' : ''}`; const resp = await (isWebWorker ? fetch : adapter.fetch)(`${path}/${lang}.traineddata${gzip ? '.gz' : ''}`);
const resp = await (isWebWorker ? fetch : adapter.fetch)(fetchUrl);
if (!resp.ok) {
throw Error(`Network error while fetching ${fetchUrl}. Response code: ${resp.status}`);
}
data = await resp.arrayBuffer(); data = await resp.arrayBuffer();
} else { } else {
data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`); data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`);
@ -144,7 +140,15 @@ res) => {
res.progress({ workerId, status: 'loaded language traineddata', progress: 1 }); res.progress({ workerId, status: 'loaded language traineddata', progress: 1 });
res.resolve(langs); res.resolve(langs);
} catch (err) { } catch (err) {
res.reject(err.toString()); if (isWebWorker && err instanceof DOMException) {
/*
* For some reason Google Chrome throws a DOMException in loadLang
* while other browsers do not. For now we ignore this exception
* and hope to find the root cause one day.
*/
} else {
res.reject(err.toString());
}
} }
}; };
@ -177,10 +181,7 @@ const initialize = ({
api.End(); api.End();
} }
api = new TessModule.TessBaseAPI(); api = new TessModule.TessBaseAPI();
const status = api.Init(null, langs, oem); api.Init(null, langs, oem);
if (status === -1) {
res.reject('initialization failed');
}
params = defaultParams; params = defaultParams;
setParameters({ payload: { params } }); setParameters({ payload: { params } });
res.progress({ res.progress({

11
src/worker-script/node/getCore.js

@ -1,19 +1,12 @@
const { simd } = require('wasm-feature-detect');
let TesseractCore = null; let TesseractCore = null;
/* /*
* getCore is a sync function to load and return * getCore is a sync function to load and return
* TesseractCore. * TesseractCore.
*/ */
module.exports = async (_, res) => { module.exports = (_, res) => {
if (TesseractCore === null) { if (TesseractCore === null) {
const simdSupport = await simd();
res.progress({ status: 'loading tesseract core', progress: 0 }); res.progress({ status: 'loading tesseract core', progress: 0 });
if (simdSupport) { TesseractCore = require('tesseract.js-core');
TesseractCore = require('tesseract.js-core/tesseract-core-simd');
} else {
TesseractCore = require('tesseract.js-core/tesseract-core');
}
res.progress({ status: 'loaded tesseract core', progress: 1 }); res.progress({ status: 'loaded tesseract core', progress: 1 });
} }
return TesseractCore; return TesseractCore;

5
src/worker-script/node/index.js

@ -9,7 +9,6 @@
*/ */
const fetch = require('node-fetch'); const fetch = require('node-fetch');
const { parentPort } = require('worker_threads');
const worker = require('..'); const worker = require('..');
const getCore = require('./getCore'); const getCore = require('./getCore');
const gunzip = require('./gunzip'); const gunzip = require('./gunzip');
@ -18,8 +17,8 @@ const cache = require('./cache');
/* /*
* register message handler * register message handler
*/ */
parentPort.on('message', (packet) => { process.on('message', (packet) => {
worker.dispatchHandlers(packet, (obj) => parentPort.postMessage(obj)); worker.dispatchHandlers(packet, (obj) => process.send(obj));
}); });
worker.setAdapter({ worker.setAdapter({

12
src/worker-script/utils/setImage.js

@ -17,12 +17,10 @@ module.exports = (TessModule, api, image) => {
let w = 0; let w = 0;
let h = 0; let h = 0;
const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1;
/* /*
* Leptonica supports uncompressed but not compressed bmp files * Although leptonica should support reading bmp, there is a bug of "compressed BMP files".
* @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516 * As there is no solution, we need to use bmp-js for now.
* We therefore use bmp-js to process all bmp files * @see https://groups.google.com/forum/#!topic/tesseract-ocr/4mPD9zTxdxE
*/ */
if (type && type.mime === 'image/bmp') { if (type && type.mime === 'image/bmp') {
const bmpBuf = bmp.decode(buf); const bmpBuf = bmp.decode(buf);
@ -55,9 +53,9 @@ module.exports = (TessModule, api, image) => {
* *
*/ */
if (data === null) { if (data === null) {
api.SetImage(pix, undefined, undefined, undefined, undefined, exif); api.SetImage(pix);
} else { } else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif); api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel);
} }
return data === null ? pix : data; return data === null ? pix : data;
}; };

4
src/worker/browser/defaultOptions.js

@ -1,5 +1,5 @@
const resolveURL = require('resolve-url'); const resolveURL = require('resolve-url');
const { version } = require('../../../package.json'); const { version, dependencies } = require('../../../package.json');
const defaultOptions = require('../../constants/defaultOptions'); const defaultOptions = require('../../constants/defaultOptions');
/* /*
@ -14,5 +14,5 @@ module.exports = {
* If browser doesn't support WebAssembly, * If browser doesn't support WebAssembly,
* load ASM version instead * load ASM version instead
*/ */
corePath: null, corePath: `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
}; };

36
src/worker/browser/loadImage.js

@ -1,4 +1,5 @@
const resolveURL = require('resolve-url'); const resolveURL = require('resolve-url');
const blueimpLoadImage = require('blueimp-load-image');
/** /**
* readFromBlobOrFile * readFromBlobOrFile
@ -20,6 +21,19 @@ const readFromBlobOrFile = (blob) => (
}) })
); );
/**
 * fixOrientationFromUrlOrBlobOrFile
 *
 * Loads the image through blueimp-load-image with the `orientation` and
 * `canvas` options enabled and resolves with the result of calling
 * `toBlob` on the object handed to the callback.
 *
 * @name fixOrientationFromUrlOrBlobOrFile
 * @function load an image with orientation handling and convert it to a Blob
 * @param {string|Blob|File} blob image source passed straight to blueimp-load-image
 * @returns {Promise<Blob>} rejects with an Error when the image cannot be loaded
 */
const fixOrientationFromUrlOrBlobOrFile = (blob) => (
  new Promise((resolve, reject) => {
    blueimpLoadImage(
      blob,
      (img) => {
        // On a failed load the callback receives an error Event instead of a
        // canvas; without this guard `img.toBlob` would throw inside the
        // callback and the promise would never settle.
        if (!img || typeof img.toBlob !== 'function') {
          reject(new Error('Failed to load image for orientation fix'));
          return;
        }
        img.toBlob(resolve);
      },
      {
        orientation: true,
        canvas: true,
      },
    );
  })
);
/** /**
* loadImage * loadImage
* *
@ -34,14 +48,18 @@ const loadImage = async (image) => {
} }
if (typeof image === 'string') { if (typeof image === 'string') {
// Base64 Image if (image.endsWith('.pbm')) {
if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
data = atob(image.split(',')[1])
.split('')
.map((c) => c.charCodeAt(0));
} else {
const resp = await fetch(resolveURL(image)); const resp = await fetch(resolveURL(image));
data = await resp.arrayBuffer(); data = await resp.arrayBuffer();
} else {
let img = image;
// If not Base64 Image
if (!/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
img = resolveURL(image);
}
data = await readFromBlobOrFile(
await fixOrientationFromUrlOrBlobOrFile(img),
);
} }
} else if (image instanceof HTMLElement) { } else if (image instanceof HTMLElement) {
if (image.tagName === 'IMG') { if (image.tagName === 'IMG') {
@ -59,7 +77,11 @@ const loadImage = async (image) => {
}); });
} }
} else if (image instanceof File || image instanceof Blob) { } else if (image instanceof File || image instanceof Blob) {
data = await readFromBlobOrFile(image); let img = image;
if (!image.name.endsWith('.pbm')) {
img = await fixOrientationFromUrlOrBlobOrFile(img);
}
data = await readFromBlobOrFile(img);
} }
return new Uint8Array(data); return new Uint8Array(data);

5
src/worker/node/loadImage.js

@ -2,6 +2,7 @@ const util = require('util');
const fs = require('fs'); const fs = require('fs');
const fetch = require('node-fetch'); const fetch = require('node-fetch');
const isURL = require('is-url'); const isURL = require('is-url');
const jo = require('jpeg-autorotate');
const readFile = util.promisify(fs.readFile); const readFile = util.promisify(fs.readFile);
@ -31,5 +32,9 @@ module.exports = async (image) => {
data = image; data = image;
} }
try {
data = (await jo.rotate(data, { quality: 100 })).buffer;
} catch (_) {} /* eslint-disable-line */
return new Uint8Array(data); return new Uint8Array(data);
}; };

4
src/worker/node/send.js

@ -5,6 +5,6 @@
* @function send packet to worker and create a job * @function send packet to worker and create a job
* @access public * @access public
*/ */
module.exports = async (worker, packet) => { module.exports = (worker, packet) => {
worker.postMessage(packet); worker.send(packet);
}; };

9
src/worker/node/spawnWorker.js

@ -1,4 +1,6 @@
const { Worker } = require('worker_threads'); const { fork } = require('child_process');
let debugPort = 9229;
/** /**
* spawnWorker * spawnWorker
@ -7,4 +9,7 @@ const { Worker } = require('worker_threads');
* @function fork a new process in node * @function fork a new process in node
* @access public * @access public
*/ */
module.exports = ({ workerPath }) => new Worker(workerPath); module.exports = ({ workerPath }) => {
debugPort += 1;
return fork(workerPath, { execArgv: [`--debug-port=${debugPort}`] });
};

2
src/worker/node/terminateWorker.js

@ -6,5 +6,5 @@
* @access public * @access public
*/ */
module.exports = (worker) => { module.exports = (worker) => {
worker.terminate(); worker.kill();
}; };

BIN
tests/assets/images/simple.gif

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1011 B

BIN
tests/assets/images/simple.webp

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.7 KiB

4
tests/constants.js

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save