diff --git a/package-lock.json b/package-lock.json
index b3dca23..095b0b3 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -21,6 +21,7 @@
         "regenerator-runtime": "^0.13.3",
         "resolve-url": "^0.2.1",
         "tesseract.js-core": "^3.0.1",
+        "wasm-feature-detect": "^1.2.11",
         "zlibjs": "^0.3.1"
       },
       "devDependencies": {
@@ -8740,6 +8741,11 @@
         "node": ">=4.0.0"
       }
     },
+    "node_modules/wasm-feature-detect": {
+      "version": "1.2.11",
+      "resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.2.11.tgz",
+      "integrity": "sha512-HUqwaodrQGaZgz1lZaNioIkog9tkeEJjrM3eq4aUL04whXOVDRc/o2EGb/8kV0QX411iAYWEqq7fMBmJ6dKS6w=="
+    },
     "node_modules/watchpack": {
       "version": "2.4.0",
       "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.4.0.tgz",
@@ -16052,6 +16058,11 @@
         "rx": "^4.1.0"
       }
     },
+    "wasm-feature-detect": {
+      "version": "1.2.11",
+      "resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.2.11.tgz",
+      "integrity": "sha512-HUqwaodrQGaZgz1lZaNioIkog9tkeEJjrM3eq4aUL04whXOVDRc/o2EGb/8kV0QX411iAYWEqq7fMBmJ6dKS6w=="
+    },
     "watchpack": {
       "version": "2.4.0",
       "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.4.0.tgz",
diff --git a/package.json b/package.json
index 4aaba7c..dbf1dd4 100644
--- a/package.json
+++ b/package.json
@@ -69,6 +69,7 @@
     "regenerator-runtime": "^0.13.3",
     "resolve-url": "^0.2.1",
     "tesseract.js-core": "^3.0.1",
+    "wasm-feature-detect": "^1.2.11",
     "zlibjs": "^0.3.1"
   },
   "repository": {
diff --git a/src/worker-script/browser/getCore.js b/src/worker-script/browser/getCore.js
index faea34e..f9d256e 100644
--- a/src/worker-script/browser/getCore.js
+++ b/src/worker-script/browser/getCore.js
@@ -1,15 +1,26 @@
-module.exports = (corePath, res) => {
+const { simd } = require('wasm-feature-detect');
+const { dependencies } = require('../../../package.json');
+
+module.exports = async (corePath, res) => {
   if (typeof global.TesseractCore === 'undefined') {
     res.progress({ status: 'loading tesseract core', progress: 0 });
-    global.importScripts(corePath);
-    /*
-     * Depending on whether the browser supports WebAssembly,
-     * the version of the TesseractCore will be different.
-     */
+
+    // If the user specifies a core path, we use that
+    // Otherwise, we detect the correct core based on SIMD support
+    let corePathImport = corePath;
+    if (!corePathImport) {
+      const simdSupport = await simd();
+      if (simdSupport) {
+        corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core-simd.wasm.js`;
+      } else {
+        corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.wasm.js`;
+      }
+    }
+
+    global.importScripts(corePathImport);
+
     if (typeof global.TesseractCoreWASM !== 'undefined' && typeof WebAssembly === 'object') {
       global.TesseractCore = global.TesseractCoreWASM;
-    } else if (typeof global.TesseractCoreASM !== 'undefined') {
-      global.TesseractCore = global.TesseractCoreASM;
     } else {
       throw Error('Failed to load TesseractCore');
     }
diff --git a/src/worker-script/index.js b/src/worker-script/index.js
index 8eeda62..1e2cfb6 100644
--- a/src/worker-script/index.js
+++ b/src/worker-script/index.js
@@ -28,10 +28,10 @@ let latestJob;
 let adapter = {};
 let params = defaultParams;
 
-const load = ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
+const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
   setLogging(logging);
   if (!TessModule) {
-    const Core = adapter.getCore(corePath, res);
+    const Core = await adapter.getCore(corePath, res);
 
     res.progress({ workerId, status: 'initializing tesseract', progress: 0 });
 
diff --git a/src/worker-script/node/getCore.js b/src/worker-script/node/getCore.js
index f3783b5..03469dd 100644
--- a/src/worker-script/node/getCore.js
+++ b/src/worker-script/node/getCore.js
@@ -1,12 +1,19 @@
+const { simd } = require('wasm-feature-detect');
+
 let TesseractCore = null;
 /*
- * getCore is a sync function to load and return
- * TesseractCore.
+ * getCore is an async function that loads and returns the
+ * TesseractCore build matching the runtime's SIMD support.
  */
-module.exports = (_, res) => {
+module.exports = async (_, res) => {
   if (TesseractCore === null) {
+    const simdSupport = await simd();
     res.progress({ status: 'loading tesseract core', progress: 0 });
-    TesseractCore = require('tesseract.js-core');
+    if (simdSupport) {
+      TesseractCore = require('tesseract.js-core/tesseract-core-simd');
+    } else {
+      TesseractCore = require('tesseract.js-core/tesseract-core');
+    }
     res.progress({ status: 'loaded tesseract core', progress: 1 });
   }
   return TesseractCore;
diff --git a/src/worker-script/utils/setImage.js b/src/worker-script/utils/setImage.js
index 351205c..3e09045 100644
--- a/src/worker-script/utils/setImage.js
+++ b/src/worker-script/utils/setImage.js
@@ -20,9 +20,9 @@ module.exports = (TessModule, api, image) => {
   const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1;
 
   /*
-   * Although leptonica should support reading bmp, there is a bug of "compressed BMP files".
-   * As there is no solution, we need to use bmp-js for now.
-   * @see https://groups.google.com/forum/#!topic/tesseract-ocr/4mPD9zTxdxE
+   * Leptonica supports uncompressed but not compressed bmp files
+   * @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516
+   * We therefore use bmp-js to process all bmp files
    */
   if (type && type.mime === 'image/bmp') {
     const bmpBuf = bmp.decode(buf);
diff --git a/src/worker/browser/defaultOptions.js b/src/worker/browser/defaultOptions.js
index cef5e58..8127078 100644
--- a/src/worker/browser/defaultOptions.js
+++ b/src/worker/browser/defaultOptions.js
@@ -1,5 +1,5 @@
 const resolveURL = require('resolve-url');
-const { version, dependencies } = require('../../../package.json');
+const { version } = require('../../../package.json');
 const defaultOptions = require('../../constants/defaultOptions');
 
 /*
@@ -14,5 +14,5 @@ module.exports = {
-   * If browser doesn't support WebAssembly,
-   * load ASM version instead
+   * When corePath is null, the worker script selects the
+   * SIMD or non-SIMD core at load time
    */
-  corePath: `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
+  corePath: null,
 };