Browse Source

Reworked createWorker to be async and throw errors per #654

dev/v4
Balearica 2 years ago
parent
commit
ca99c35d14
  1. 3
      README.md
  2. 32
      docs/api.md
  3. 33
      docs/examples.md
  4. 3
      docs/faq.md
  5. 2
      docs/local-installation.md
  6. 18
      examples/browser/basic-edge.html
  7. 3
      examples/browser/benchmark.html
  8. 3
      examples/browser/download-pdf.html
  9. 5
      examples/browser/image-processing.html
  10. 2887
      package-lock.json
  11. 4
      package.json
  12. 6
      src/Tesseract.js
  13. 13
      src/createWorker.js
  14. 2
      src/index.d.ts
  15. 8
      src/worker-script/index.js
  16. 1
      tests/FS.test.js
  17. 13
      tests/detect.test.js
  18. 45
      tests/error.test.js
  19. 14
      tests/recognize.test.js
  20. 1
      tests/scheduler.test.js

3
README.md

@ -46,12 +46,11 @@ Or more imperative @@ -46,12 +46,11 @@ Or more imperative
```javascript
import { createWorker } from 'tesseract.js';
const worker = createWorker({
const worker = await createWorker({
logger: m => console.log(m)
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');

32
docs/api.md

@ -1,7 +1,6 @@ @@ -1,7 +1,6 @@
# API
- [createWorker()](#create-worker)
- [Worker.load](#worker-load)
- [Worker.writeText](#worker-writeText)
- [Worker.readText](#worker-readText)
- [Worker.removeFile](#worker-removeFile)
@ -53,7 +52,7 @@ createWorker is a factory function that creates a tesseract worker, a worker is @@ -53,7 +52,7 @@ createWorker is a factory function that creates a tesseract worker, a worker is
```javascript
const { createWorker } = Tesseract;
const worker = createWorker({
const worker = await createWorker({
langPath: '...',
logger: m => console.log(m),
});
@ -63,7 +62,6 @@ const worker = createWorker({ @@ -63,7 +62,6 @@ const worker = createWorker({
A Worker helps you perform OCR-related tasks. It takes a few steps to set up a Worker before it is fully functional. The full flow is:
- load
- FS functions // optional
- loadLanguage
- initialize
@ -82,23 +80,6 @@ Each function is async, so using async/await or Promise is required. When it is @@ -82,23 +80,6 @@ Each function is async, so using async/await or Promise is required. When it is
jobId is generated by Tesseract.js, but you can supply your own when calling any of the functions above.
<a name="worker-load"></a>
### Worker.load(jobId): Promise
Worker.load() loads the tesseract.js-core scripts (downloading them from the remote source if they are not present), which makes the Web Worker/Child Process ready for the next action.
**Arguments:**
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.load();
})();
```
<a name="worker-writeText"></a>
### Worker.writeText(path, text, jobId): Promise
@ -273,8 +254,7 @@ Figures out what words are in `image`, where the words are in `image`, etc. @@ -273,8 +254,7 @@ Figures out what words are in `image`, where the words are in `image`, etc.
```javascript
const { createWorker } = Tesseract;
(async () => {
const worker = createWorker();
await worker.load();
const worker = await createWorker();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image);
@ -287,8 +267,7 @@ With rectangle @@ -287,8 +267,7 @@ With rectangle
```javascript
const { createWorker } = Tesseract;
(async () => {
const worker = createWorker();
await worker.load();
const worker = await createWorker();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image, {
@ -313,8 +292,7 @@ Worker.detect() does OSD (Orientation and Script Detection) to the image instead @@ -313,8 +292,7 @@ Worker.detect() does OSD (Orientation and Script Detection) to the image instead
```javascript
const { createWorker } = Tesseract;
(async () => {
const worker = createWorker();
await worker.load();
const worker = await createWorker();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data } = await worker.detect(image);
@ -361,7 +339,7 @@ Scheduler.addWorker() adds a worker into the worker pool inside scheduler, it is @@ -361,7 +339,7 @@ Scheduler.addWorker() adds a worker into the worker pool inside scheduler, it is
```javascript
const { createWorker, createScheduler } = Tesseract;
const scheduler = createScheduler();
const worker = createWorker();
const worker = await createWorker();
scheduler.addWorker(worker);
```

33
docs/examples.md

@ -7,10 +7,9 @@ You can also check [examples](../examples) folder. @@ -7,10 +7,9 @@ You can also check [examples](../examples) folder.
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const worker = await createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
@ -24,12 +23,11 @@ const worker = createWorker(); @@ -24,12 +23,11 @@ const worker = createWorker();
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker({
const worker = await createWorker({
logger: m => console.log(m), // Add logger here
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
@ -43,10 +41,9 @@ const worker = createWorker({ @@ -43,10 +41,9 @@ const worker = createWorker({
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const worker = await createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng+chi_tra');
await worker.initialize('eng+chi_tra');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
@ -59,10 +56,9 @@ const worker = createWorker(); @@ -59,10 +56,9 @@ const worker = createWorker();
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const worker = await createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({
@ -81,10 +77,9 @@ Check here for more details of pageseg mode: https://github.com/tesseract-ocr/te @@ -81,10 +77,9 @@ Check here for more details of pageseg mode: https://github.com/tesseract-ocr/te
```javascript
const { createWorker, PSM } = require('tesseract.js');
const worker = createWorker();
const worker = await createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({
@ -110,11 +105,10 @@ Node: [download-pdf.js](../examples/node/download-pdf.js) @@ -110,11 +105,10 @@ Node: [download-pdf.js](../examples/node/download-pdf.js)
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const worker = await createWorker();
const rectangle = { left: 0, top: 0, width: 500, height: 250 };
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle });
@ -128,7 +122,7 @@ const rectangle = { left: 0, top: 0, width: 500, height: 250 }; @@ -128,7 +122,7 @@ const rectangle = { left: 0, top: 0, width: 500, height: 250 };
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const worker = await createWorker();
const rectangles = [
{
left: 0,
@ -145,7 +139,6 @@ const rectangles = [ @@ -145,7 +139,6 @@ const rectangles = [
];
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const values = [];
@ -164,8 +157,8 @@ const rectangles = [ @@ -164,8 +157,8 @@ const rectangles = [
const { createWorker, createScheduler } = require('tesseract.js');
const scheduler = createScheduler();
const worker1 = createWorker();
const worker2 = createWorker();
const worker1 = await createWorker();
const worker2 = await createWorker();
const rectangles = [
{
left: 0,
@ -182,8 +175,6 @@ const rectangles = [ @@ -182,8 +175,6 @@ const rectangles = [
];
(async () => {
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng');
await worker2.loadLanguage('eng');
await worker1.initialize('eng');
@ -204,12 +195,10 @@ const rectangles = [ @@ -204,12 +195,10 @@ const rectangles = [
const { createWorker, createScheduler } = require('tesseract.js');
const scheduler = createScheduler();
const worker1 = createWorker();
const worker2 = createWorker();
const worker1 = await createWorker();
const worker2 = await createWorker();
(async () => {
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng');
await worker2.loadLanguage('eng');
await worker1.initialize('eng');

3
docs/faq.md

@ -19,12 +19,11 @@ Starting from 2.0.0-beta.1, you can get all these information in the final resul @@ -19,12 +19,11 @@ Starting from 2.0.0-beta.1, you can get all these information in the final resul
```javascript
import { createWorker } from 'tesseract.js';
const worker = createWorker({
const worker = await createWorker({
logger: m => console.log(m)
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({

2
docs/local-installation.md

@ -19,7 +19,7 @@ Tesseract.recognize(image, langs, { @@ -19,7 +19,7 @@ Tesseract.recognize(image, langs, {
Or
```javascript
const worker = createWorker({
const worker = await createWorker({
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js',

18
examples/browser/basic-edge.html

@ -6,9 +6,9 @@ @@ -6,9 +6,9 @@
<body>
<input type="file" id="uploader">
<script>
const recognize = function(evt){
const recognize = async function(evt){
const files = evt.target.files;
const worker = Tesseract.createWorker({
const worker = await Tesseract.createWorker({
/*
* As Edge doesn't support WebAssembly,
* here we force to use asm.js version.
@ -21,14 +21,12 @@ @@ -21,14 +21,12 @@
*/
cacheMethod: 'none',
});
Promise.resolve()
.then(() => worker.load())
.then(() => worker.loadLanguage('eng'))
.then(() => worker.initialize('eng'))
.then(() => worker.recognize(files[0]))
.then((ret) => {
console.log(ret.data.text);
});
await worker.loadLanguage('eng');
await worker.initialize('eng');
const ret = await worker.recognize(files[0]);
console.log(ret.data.text);
}
const elm = document.getElementById('uploader');
elm.addEventListener('change', recognize);

3
examples/browser/benchmark.html

@ -7,9 +7,8 @@ @@ -7,9 +7,8 @@
<script>
const { createWorker } = Tesseract;
const worker = createWorker();
const worker = await createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');

3
examples/browser/download-pdf.html

@ -10,14 +10,13 @@ @@ -10,14 +10,13 @@
<textarea id="board" readonly rows="8" cols="80">Upload an image file</textarea>
<script>
const { createWorker } = Tesseract;
const worker = createWorker({
const worker = await createWorker({
corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js',
logger: m => console.log(m),
});
const uploader = document.getElementById('uploader');
const dlBtn = document.getElementById('download-pdf');
const recognize = async ({ target: { files } }) => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(files[0]);

5
examples/browser/image-processing.html

@ -37,11 +37,10 @@ @@ -37,11 +37,10 @@
<script>
const recognize = async ({ target: { files } }) => {
document.getElementById("imgInput").src = URL.createObjectURL(files[0]);
const worker = Tesseract.createWorker({
corePath: '/tesseract-core-simd.wasm.js',
const worker = await Tesseract.createWorker({
// corePath: '/tesseract-core-simd.wasm.js',
workerPath: "/dist/worker.dev.js"
});
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');

2887
package-lock.json generated

File diff suppressed because it is too large Load Diff

4
package.json

@ -47,8 +47,8 @@ @@ -47,8 +47,8 @@
"eslint-plugin-import": "^2.22.1",
"expect.js": "^0.3.1",
"express": "^4.17.1",
"mocha": "^8.1.3",
"mocha-headless-chrome": "^2.0.3",
"mocha": "^10.0.0",
"mocha-headless-chrome": "^4.0.0",
"npm-run-all": "^4.1.5",
"nyc": "^15.1.0",
"rimraf": "^2.7.1",

6
src/Tesseract.js

@ -1,8 +1,7 @@ @@ -1,8 +1,7 @@
const createWorker = require('./createWorker');
const recognize = async (image, langs, options) => {
const worker = createWorker(options);
await worker.load();
const worker = await createWorker(options);
await worker.loadLanguage(langs);
await worker.initialize(langs);
return worker.recognize(image)
@ -12,8 +11,7 @@ const recognize = async (image, langs, options) => { @@ -12,8 +11,7 @@ const recognize = async (image, langs, options) => {
};
const detect = async (image, options) => {
const worker = createWorker(options);
await worker.load();
const worker = await createWorker(options);
await worker.loadLanguage('osd');
await worker.initialize('osd');
return worker.detect(image)

13
src/createWorker.js

@ -37,7 +37,6 @@ module.exports = async (_options = {}) => { @@ -37,7 +37,6 @@ module.exports = async (_options = {}) => {
let workerError = (event) => {resReject(event.message)};
let worker = spawnWorker(options);
// worker.addEventListener("error", workerError);
worker.onerror = workerError;
workerCounter += 1;
@ -65,6 +64,10 @@ module.exports = async (_options = {}) => { @@ -65,6 +64,10 @@ module.exports = async (_options = {}) => {
);
/**
 * Deprecated no-op kept so existing `worker.load()` call sites keep working.
 *
 * It only emits a deprecation warning; it does not start a job. It is `async`
 * so that it still returns a Promise, which keeps both `await worker.load()`
 * and `worker.load().then(...)` working for callers that have not migrated.
 *
 * @param {string} [jobId] - Ignored; accepted only for signature compatibility.
 * @returns {Promise<void>} Resolves immediately after the warning is logged.
 */
const load = async (jobId) => {
  console.warn('`load` is deprecated and should be removed from code (workers now come pre-loaded)');
};
const loadInternal = (jobId) => (
startJob(createJob({
id: jobId, action: 'load', payload: { options },
}))
@ -186,6 +189,7 @@ module.exports = async (_options = {}) => { @@ -186,6 +189,7 @@ module.exports = async (_options = {}) => {
resolves[action]({ jobId, data: d });
} else if (status === 'reject') {
rejects[action](data);
if (action === "load") resReject(data);
if (errorHandler) {
errorHandler(data);
} else {
@ -216,12 +220,7 @@ module.exports = async (_options = {}) => { @@ -216,12 +220,7 @@ module.exports = async (_options = {}) => {
terminate,
};
startJob(createJob({
id: undefined, action: 'checkWorker',
})).then(() => {
console.log("Created worker");
// worker.removeEventListener("error", workerError);
resResolve(resolveObj)});
loadInternal().then(() => resResolve(resolveObj)).catch(() => {});
return res;

2
src/index.d.ts vendored

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
declare namespace Tesseract {
function createScheduler(): Scheduler
function createWorker(options?: Partial<WorkerOptions>): Worker
function createWorker(options?: Partial<WorkerOptions>): Promise<Worker>
function setLogging(logging: boolean): void
function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult>
function detect(image: ImageLike, options?: Partial<WorkerOptions>): any

8
src/worker-script/index.js

@ -390,13 +390,6 @@ const terminate = async (_, res) => { @@ -390,13 +390,6 @@ const terminate = async (_, res) => {
}
};
// Handshake handler: does no work and resolves the job right away.
// A resolved reply confirms that the worker was created successfully.
async function checkWorker(_packet, res) {
  res.resolve();
}
/**
* dispatchHandlers
*
@ -434,7 +427,6 @@ exports.dispatchHandlers = (packet, send) => { @@ -434,7 +427,6 @@ exports.dispatchHandlers = (packet, send) => {
getPDF,
detect,
terminate,
checkWorker
})[packet.action](packet, res)
.catch((err) => res.reject(err.toString()));
};

1
tests/FS.test.js

@ -4,7 +4,6 @@ let worker; @@ -4,7 +4,6 @@ let worker;
before(async function cb() {
this.timeout(0);
worker = await createWorker(OPTIONS);
return worker.load();
});
describe('FS', async () => {

13
tests/detect.test.js

@ -3,7 +3,6 @@ let worker; @@ -3,7 +3,6 @@ let worker;
before(async function cb() {
this.timeout(0);
worker = await createWorker(OPTIONS);
return worker.load();
});
describe('detect()', async () => {
@ -18,3 +17,15 @@ describe('detect()', async () => { @@ -18,3 +17,15 @@ describe('detect()', async () => {
});
}).timeout(TIMEOUT);
});
// Exercises the one-shot `Tesseract.detect` helper (no explicit worker setup).
describe('detect()', async () => {
  it('should detect OSD (simplified interface)', async () => {
    const cases = [
      { name: 'cosmic.png', ans: { script: 'Latin' } },
    ];
    // Await every case: an async callback handed to `forEach` is fire-and-forget,
    // so the test would end before detection finished and a failed expectation
    // would never fail the test.
    await Promise.all(cases.map(async ({ name, ans: { script } }) => {
      const { data: { script: s } } = await Tesseract.detect(`${IMAGE_PATH}/${name}`);
      expect(s).to.be(script);
    }));
  }).timeout(TIMEOUT);
});

45
tests/error.test.js

@ -1,29 +1,50 @@ @@ -1,29 +1,50 @@
// const { createWorker } = Tesseract;
// const worker = createWorker(OPTIONS);
// const worker = await createWorker(OPTIONS);
// before(function cb() {
// this.timeout(0);
// return worker.load();
// });
(IS_BROWSER ? describe : describe.skip)('Invalid paths should result in promise rejection', () => {
it('Invalid workerPath', async () => {
const OPTIONS1 = JSON.parse(JSON.stringify(OPTIONS));
OPTIONS1.corePath = "badpath.js";
OPTIONS1.workerPath = "badpath.js";
let errorThrown;
try {
const worker = Tesseract.createWorker(OPTIONS1);
await worker.load()
errorThrown = false;
} catch (error) {
errorThrown = true;
}
// try {
// const worker = await Tesseract.createWorker(OPTIONS1);
// errorThrown = false;
// } catch (error) {
// errorThrown = true;
// }
expect(errorThrown).to.equal(true);
// Tesseract.createWorker(OPTIONS1).catch(() => errorThrown = true);
// await Tesseract.createWorker(OPTIONS1).catch(() => {
// errorThrown = true;
// })
// const func = async () => {
// await Tesseract.createWorker(OPTIONS1).catch(() => {
// errorThrown = true;
// })
// return;
// };
// await func();
await (async () => {
await Tesseract.createWorker(OPTIONS1).catch((x) => { console.log("stuff") })
// .then((x) => { throw new Error('was not supposed to succeed'); })
// .catch((x) => { console.log("stuff") })
return;
})();
// await func().catch(() => console.log("caught"));
// expect(errorThrown).to.equal(true);
// expect(func).to.throwError();
// const ret = await (worker.load().then(() => true).catch(() => false));
// expect(ret).to.equal(false);
}).timeout(TIMEOUT);
});

14
tests/recognize.test.js

@ -3,7 +3,6 @@ let worker; @@ -3,7 +3,6 @@ let worker;
before(async function cb() {
this.timeout(0);
worker = await createWorker(OPTIONS);
await worker.load();
await worker.loadLanguage('eng+chi_tra+osd');
});
@ -31,6 +30,19 @@ describe('recognize()', () => { @@ -31,6 +30,19 @@ describe('recognize()', () => {
));
});
// Runs the one-shot `Tesseract.recognize` helper against base64-encoded images,
// registering one test case per image format.
describe('should recognize base64 image (simplified interface)', () => {
  const samples = [
    { format: 'png', image: SIMPLE_PNG_BASE64, ans: SIMPLE_TEXT },
    { format: 'jpg', image: SIMPLE_JPG_BASE64, ans: SIMPLE_TEXT },
  ];
  for (const { format, image, ans } of samples) {
    it(`recongize ${format} in base64`, async () => {
      const result = await Tesseract.recognize(image);
      expect(result.data.text).to.be(ans);
    }).timeout(TIMEOUT);
  }
});
describe('should recognize different langs', () => {
[
{ name: 'chinese.png', lang: 'chi_tra', ans: CHINESE_TEXT },

1
tests/scheduler.test.js

@ -8,7 +8,6 @@ before(async function cb() { @@ -8,7 +8,6 @@ before(async function cb() {
console.log(`Initializing ${NUM_WORKERS} workers`);
workers = await Promise.all(Array(NUM_WORKERS).fill(0).map(async () => {
const w = await createWorker(OPTIONS);
await w.load();
await w.loadLanguage('eng');
await w.initialize('eng');
return w;

Loading…
Cancel
Save