Compare commits

...

186 Commits

Author SHA1 Message Date
Réda Housni Alaoui 80aef15861
Fix langs TS typing (#570) 2 years ago
Balearica 263dbb87b1
Updated enums per #434 (#671) 2 years ago
Balearica c9200839df
Clarified image format documentation per #359 (#670) 2 years ago
Balearica 1b87e30ae3
Made all errors in loadLanguage lead to promise rejection per #414 (#669) 2 years ago
Balearica bce7cd84fe 3.0.3 2 years ago
Balearica 2c77f33461 Updated Tesseract.js-core version 2 years ago
Balearica d8b29522c4
Reject promise on init failure per #602 (#667) 2 years ago
Balearica dd6c40b681
Updated types per #606 and #580 (#663) 2 years ago
Balearica 5ff17fdeb1
Added build targeting esm module (#659) 2 years ago
Balearica 363690a421
Removed reference to 'equ' support per #569 (#657) 2 years ago
Balearica b64eba3db0
Updated readme (#653) 2 years ago
Balearica 85e73216be
Removed statement that video elements are supported per #647 (#652) 2 years ago
Balearica b419e45114 3.0.2 2 years ago
Balearica ea33463120
Fix for default browser corePath per #643 (#644) 2 years ago
Balearica 90c8d99b3c
Add tests for .webp and .gif images (#642) 2 years ago
Balearica 0e368c69d6 3.0.1 2 years ago
Balearica ba394673bd Fixed bug with simd detection 2 years ago
Balearica 44d322e6ff 3.0.0 2 years ago
Balearica e3c4a6bc6e Updated README for version 3 2 years ago
Balearica f372818146
Added automatic detection of simd support (#641) 2 years ago
Balearica 8b567609e3
Updated to webpack 5 for compatibility with Node.js 18 (#640) 2 years ago
Balearica 13b95f6371
Updated to Tesseract.js v.3; added exif-based rotation (#638) 2 years ago
Balearica a9ac00ccac
Removed exif auto-rotation for browser per #604 (#634) 3 years ago
Balearica 75ddd63041 Revert "Add support for ImageData and fix a hang in buffer handling (#610)" 3 years ago
Balearica 1136e0a941 Revert "Ran linter" 3 years ago
Balearica 2e478bd8a5 Ran linter 3 years ago
WintrySnowman 67848464ac
Add support for ImageData and fix a hang in buffer handling (#610) 3 years ago
Balearica be956cd889
Replaced child_process with worker_threads per #630 (#631) 3 years ago
Balearica 61d0e553c6
Temporarily removed Node 18 since this fails on master. 3 years ago
Balearica 74be03c5b9
Updated versions of node used in workflows 3 years ago
Balearica 9442d9cb69
Merge pull request #629 from naptha/feat/benchmark 3 years ago
Your Name 6aba9599ec Added benchmark code and assets per #628 3 years ago
Your Name 58d28944d3 Ran linter 3 years ago
Balearica a8287a99aa
Merge pull request #621 from SusanDoggie/patch-1 3 years ago
Balearica 66085a7d70
Merge pull request #585 from andreialecu/fix-cachebadresponse 3 years ago
Susan Cheng 50a53f51d9
Fix for passing wrong arguments while calling fork function 3 years ago
Andrei Alecu 01e8335768 Fix caching of bad langData responses 3 years ago
jeromewu adcb5b8759
Update FUNDING.yml 3 years ago
Jerome Wu 294ced5c85 Release v2.1.5 4 years ago
jeromewu 90466c3b55
Merge pull request #508 from stonefruit/add-language-constant 4 years ago
jeromewu 7b7f9afaa6
Merge pull request #496 from miguelm3/master 4 years ago
stonefruit 8f2c33fd89 Add languages constant for languages 4 years ago
Ilya 83a424f9b9
Fixed method for selecting env type (#498) 4 years ago
miguelm3 bf43f447b6 lint fix 4 years ago
miguelm3 dec119fa7b modified logger to have the user JobId 4 years ago
jeromewu 909263b94b
Merge pull request #491 from bertyhell/patch-1 4 years ago
jeromewu eb287e763a
Update article link 4 years ago
Bert Verhelst 1781487273
fix link to documentation in readme 4 years ago
Jerome Wu 69355a7c07 Release v2.1.4 4 years ago
jeromewu 4f3aa3145c
Merge pull request #489 from CedricCouton/fix-electron-webview 4 years ago
Jerome Wu ed016bdc26 Update dependencies to fix security issues 4 years ago
Jerome Wu 804c238950 Upgrade mocha to v8 4 years ago
Jerome Wu 88290dc541 Upgrade nyc to 15.1.0 4 years ago
Jerome Wu 920dd902f8 Optimize eslint 4 years ago
Jerome Wu e883f87ab7 Upgrade babel-loader to 8.1.0 4 years ago
Jerome Wu 34839a7a0c Upgrade webpack 4 years ago
Jerome Wu e68d520155 Add CodeQL badge 4 years ago
Jerome Wu d98eeab91f Upgrade jpeg-autorotate to v7 4 years ago
jeromewu c2f4ecef9a
Create codeql-analysis.yml 4 years ago
Jerome Wu eb6b8594d3 Add lint stage & update README.md 4 years ago
Jerome Wu afadfffa6a Remove Travis CI 4 years ago
jeromewu ccb8ec3d67
Create node.js.yml 4 years ago
jeromewu d64ebcaea1
Merge pull request #487 from abhishek7553/master 4 years ago
jeromewu 341360bf44
Merge pull request #483 from naptha/dependabot/npm_and_yarn/node-fetch-2.6.1 4 years ago
jeromewu 2cd68fb97e
Merge pull request #482 from naptha/dependabot/npm_and_yarn/yargs-parser-13.1.2 4 years ago
jeromewu 066c59d1f9
Merge pull request #473 from naptha/dependabot/npm_and_yarn/elliptic-6.5.3 4 years ago
Jerome Wu 820c91fa52 Fix electron webview by updating src/utils/getEnvironment.js 4 years ago
Cédric Couton 3f98fdeb0f Fix fetch when running in electron webview 4 years ago
abhishek7553 cd08357833
updated server.js 4 years ago
Jerome Wu 2bbd1e896c Release v2.1.3 4 years ago
Jerome Wu 5bad2e68ce Fix exif issue for base64 images 4 years ago
Jerome Wu 59392e96ff Update FS tests 4 years ago
dependabot[bot] 0f9cecd544
Bump node-fetch from 2.6.0 to 2.6.1 4 years ago
Jerome Wu 13ab9cec70 Release v2.1.2 4 years ago
dependabot[bot] 389bf71381
Bump yargs-parser from 13.1.1 to 13.1.2 4 years ago
Jerome Wu 6481256f5e Fix lint and test error 4 years ago
Jerome Wu 5e295b75b6 Merge branch 'rogerxaic-master' into master 4 years ago
Jerome Wu f3cbc3ee43 Merge branch 'master' of https://github.com/rogerxaic/tesseract.js into rogerxaic-master 4 years ago
jeromewu 3e97749585
Update FUNDING.yml 5 years ago
dependabot[bot] 8435f4c6e2
Bump elliptic from 6.5.2 to 6.5.3 5 years ago
jeromewu 6ec5a5a092
Merge pull request #466 from naptha/dependabot/npm_and_yarn/lodash-4.17.19 5 years ago
dependabot[bot] d3cf791e6a
Bump lodash from 4.17.15 to 4.17.19 5 years ago
jeromewu cc1f2bfe81
Merge pull request #456 from isc/patch-1 5 years ago
jeromewu fcd01ceb32
Update README.md 5 years ago
Ivan Schneider 6ef2e116cd
Correct some typing errors in api.md 5 years ago
jeromewu 885908f22e
Merge pull request #452 from mackncheesiest/patch-1 5 years ago
Joshua Mack 6df9ef2f1f
Fix documentation links in faq.md 5 years ago
Joshua Mack bb7d7093f4
Fixed "supported languages" link 5 years ago
jeromewu 40051c6761
Merge pull request #443 from connorads/patch-1 5 years ago
Connor Adams 0ce8e139ab
Fix typo in README.md 5 years ago
rogerxaic b221071b8e fix images' orientation based on exif data using Tesseract on node. 5 years ago
rogerxaic a54dbc345b fix images' rotation based on exif data 5 years ago
jeromewu 1df208ffab Release v2.1.1 5 years ago
jeromewu 7a398f6420 Add from cache message when loading traineddata 5 years ago
jeromewu ecf03503f1 Update README.md 5 years ago
jeromewu 0c7ade85e3 Release v2.1.0 5 years ago
jeromewu d6e434338f Fix FS.test.js 5 years ago
jeromewu d37f045e99 Add FS functions to api.md 5 years ago
jeromewu 50df652f33 Rename removeText to removeFile 5 years ago
jeromewu 1ff4b79f75 Add tests for FS functions 5 years ago
jeromewu 93dab17bf1 Add FS functions 5 years ago
Jerome Wu 20cd04a848 Update package-lock.json 5 years ago
Jerome Wu 363018bffe Fix security issues with force 5 years ago
Jerome Wu 89d0a6b12d Fix security issue without force 5 years ago
Jerome Wu 25d6664937 Upgrade to tesseract.js-core v2.2.0 5 years ago
jeromewu a008d0780a Update tests to use local corePath 5 years ago
jeromewu 0778add1c4 Update gitpod configuration 5 years ago
jeromewu 331fbbe6be
Merge pull request #427 from naptha/dependabot/npm_and_yarn/acorn-6.4.1 5 years ago
dependabot[bot] 6d9adfd2ce
Bump acorn from 6.4.0 to 6.4.1 5 years ago
Jerome Wu 3d456e3eb0 Add fxnoob chrome extension example 5 years ago
Jerome Wu 205178646b Update rectangle usage in api.md 5 years ago
Jerome Wu 40aea6d886 Update example/recognize.js 5 years ago
Jerome Wu 6f8e69206b Add user_defined_dpi and preserve_interword_spaces to index.d.ts 5 years ago
Jerome Wu c5935b2636 Update example/recognize.js 5 years ago
jeromewu 5a40559db2
Fix missing type in index.d.ts 5 years ago
jeromewu 023d484209
Fix type error in index.d.ts 5 years ago
jeromewu 641d0c5fe4
Merge pull request #407 from nironater/patch-1 5 years ago
Nir a56de79f22
Update documentation 5 years ago
jeromewu b418554cf0
Fix #401 5 years ago
jeromewu f8aa46ec9d
Merge pull request #392 from tomaszferens/fix-demo-example 5 years ago
Tomasz Ferens 13c16f472e Fix displaying recognized text in demo example 5 years ago
jeromewu 247a1635f8
Update README.md 5 years ago
Jerome Wu 66e2ce842e Release v2.0.2 5 years ago
jeromewu 2956afc013 Fix lint error 5 years ago
jeromewu 781f2f80ba Remove api.End() as it doesn't stop api.Recognize(), fix #387 5 years ago
jeromewu 548a5a5142
Merge pull request #386 from nisarhassan12/master 5 years ago
Nisar Hassan Naqvi 9b91579b16 [readme] Add ready-to-code badge + describe gitpod in a better way. 5 years ago
jeromewu 7cc3427a38
Merge pull request #383 from OliverCole/update-is-url 5 years ago
Oliver Cole 71dc2a80ed Update is-url package for vuln fix. Fixes https://snyk.io/vuln/npm:is-url:20180319 5 years ago
jeromewu 9db6ebf36e
Merge pull request #379 from frinyvonnick/patch-1 5 years ago
Yvonnick FRIN db3eed3915
Fix broken links in api.md file 5 years ago
Jerome Wu e975d29f38 Add electron link to README.md 5 years ago
Jerome Wu f9c76a9b7f Release v2.0.1 5 years ago
Jerome Wu 945f5d3f35 Update rectangles to rectangle as only one region can be assigned, fix #378 5 years ago
Jerome Wu b8aba2eddd Add electron environment check, fix #376 5 years ago
Jerome Wu b603d42547 Resize logo image 5 years ago
Jerome Wu c7a74a6575 Add logo image 5 years ago
Jerome Wu 3bb543d3cd Add new logo 5 years ago
Jerome Wu fa5b267f17 Fix lint error 5 years ago
Jerome Wu 15b7983619 Update docs 5 years ago
Jerome Wu b2bc416dd2 Remove axios and add webpack-bundle-analyzer, close #353 5 years ago
Jerome Wu a5fa14cc3b Add Edge example 5 years ago
Jerome Wu 2aba7285e9 Release v2.0.0 5 years ago
jeromewu 1051b2ab41
Create SECURITY.md 5 years ago
jeromewu 5c22cd52bd
Merge pull request #374 from WebReflection/moz-extension 5 years ago
Andrea Giammarchi 5a7576ceb4
Enable moz-extension:// too 5 years ago
jeromewu 2db7607496
Merge pull request #373 from WebReflection/moz-extension 5 years ago
Andrea Giammarchi 953689fedf
Enable Firefox extesions too 5 years ago
jeromewu 0e8aee660f
Merge pull request #366 from naptha/dependabot/npm_and_yarn/eslint-utils-1.4.3 5 years ago
jeromewu 625ded7921
Merge pull request #365 from naptha/dependabot/npm_and_yarn/js-yaml-3.13.1 5 years ago
jeromewu 73147afd87
Merge pull request #364 from naptha/dependabot/npm_and_yarn/mixin-deep-1.3.2 5 years ago
jeromewu 1a781a6ae4
Merge pull request #363 from naptha/dependabot/npm_and_yarn/debug-2.6.9 5 years ago
dependabot[bot] 9c62422573
Bump debug from 2.6.8 to 2.6.9 5 years ago
jeromewu 1fddc9a86c
Merge pull request #362 from naptha/dependabot/npm_and_yarn/lodash-4.17.15 5 years ago
jeromewu 4aaf462564
Merge pull request #361 from naptha/dependabot/npm_and_yarn/axios-0.18.1 5 years ago
jeromewu 7b06e0edcb
Merge pull request #368 from cstar-industries/master 5 years ago
jeromewu 663be93420
Update FUNDING.yml 5 years ago
Charles Francoise d51d7bd8ac update types 5 years ago
Charles Francoise 67ccfcfb6e add errorHandler documentation 5 years ago
Charles Francoise ebc56a1899 add errorHandler 5 years ago
Charles Francoise bcdcace9ec Don't throw error on job rejection 5 years ago
jeromewu dd61663b31
Merge pull request #357 from elderapo/fix-typings 5 years ago
dependabot[bot] f524945084
Bump eslint-utils from 1.3.1 to 1.4.3 5 years ago
dependabot[bot] 7663c7c589
Bump js-yaml from 3.12.0 to 3.13.1 5 years ago
dependabot[bot] 571c8c2452
Bump mixin-deep from 1.3.1 to 1.3.2 5 years ago
dependabot[bot] 775e48176c
Bump lodash from 4.17.4 to 4.17.15 5 years ago
dependabot[bot] 4182c5d55a
Bump axios from 0.18.0 to 0.18.1 5 years ago
jeromewu a97e2dad98
Merge pull request #360 from gustafsson/typo_whiltelist 5 years ago
Johan Gustafsson 8d926873b3 whiltelist -> whitelist 5 years ago
Tomasz Martyński b588e9f618 fix typings 5 years ago
jeromewu 086c7a507a
Merge pull request #355 from newyork-anthonyng/newyork-anthonyng-patch-1 5 years ago
Anthony Ng 2de597c1a9
Update api.md 5 years ago
Jerome Wu 6fd12b5012 Release v2.0.0-beta.2 5 years ago
Jerome Wu 15da58127d Fix debugger port conflict and resolve-url issue 5 years ago
Jerome Wu 1d28cc5e52 Fix README.md 5 years ago
Jerome Wu 6d772c7070 Center video demo gif 5 years ago
Jerome Wu e7a57363c2 Add video demo gif 5 years ago
jeromewu fd53a9d29e
Update README.md 5 years ago
jeromewu 06967cb7f3
Update README.md 5 years ago
jeromewu 61a7531acb
Update FUNDING.yml 5 years ago
Jerome Wu dc83e6f437 Update docs 5 years ago
jeromewu e9017f3ed6
Merge pull request #345 from jasonharrison/fix-readme 5 years ago
Jerome Wu 41d82da35b Fix test script 5 years ago
Jerome Wu d947f6e554 Update README.md 5 years ago
Jerome Wu 5db60be949 Update api.md 5 years ago
Jason 7c5172846b
Fix typo in README.md 5 years ago
  1. 6
      .eslintrc
  2. 4
      .github/FUNDING.yml
  3. 5
      .github/SECURITY.md
  4. 71
      .github/workflows/codeql-analysis.yml
  5. 29
      .github/workflows/node.js.yml
  6. 2
      .gitpod.Dockerfile
  7. 4
      .gitpod.yml
  8. 7
      .travis.yml
  9. 80
      README.md
  10. 133
      docs/api.md
  11. 106
      docs/examples.md
  12. 4
      docs/faq.md
  13. 25
      docs/image-format.md
  14. BIN
      docs/images/tesseract.png
  15. BIN
      docs/images/video-demo.gif
  16. 12
      docs/local-installation.md
  17. 2
      docs/tesseract_lang_list.md
  18. 37
      examples/browser/basic-edge.html
  19. 33
      examples/browser/benchmark.html
  20. 3
      examples/browser/demo.html
  21. BIN
      examples/data/meditations.jpg
  22. BIN
      examples/data/testocr.png
  23. BIN
      examples/data/tyger.jpg
  24. 27
      examples/node/benchmark.js
  25. 17
      examples/node/recognize.js
  26. 22438
      package-lock.json
  27. 59
      package.json
  28. 13
      scripts/rollup.esm.js
  29. 2
      scripts/server.js
  30. 5
      scripts/webpack.config.common.js
  31. 9
      scripts/webpack.config.dev.js
  32. 6
      scripts/webpack.config.prod.js
  33. 1
      src/constants/PSM.js
  34. 218
      src/constants/languages.js
  35. 51
      src/createWorker.js
  36. 37
      src/index.d.ts
  37. 2
      src/index.js
  38. 17
      src/utils/getEnvironment.js
  39. 2
      src/utils/resolvePaths.js
  40. 4
      src/worker-script/browser/cache.js
  41. 27
      src/worker-script/browser/getCore.js
  42. 7
      src/worker-script/browser/index.js
  43. 1
      src/worker-script/browser/resolveURL.js
  44. 2
      src/worker-script/constants/defaultParams.js
  45. 60
      src/worker-script/index.js
  46. 6
      src/worker-script/node/cache.js
  47. 11
      src/worker-script/node/getCore.js
  48. 11
      src/worker-script/node/index.js
  49. 1
      src/worker-script/node/resolveURL.js
  50. 4
      src/worker-script/utils/dump.js
  51. 12
      src/worker-script/utils/setImage.js
  52. 4
      src/worker/browser/defaultOptions.js
  53. 9
      src/worker/browser/loadImage.js
  54. 8
      src/worker/node/loadImage.js
  55. 4
      src/worker/node/send.js
  56. 6
      src/worker/node/spawnWorker.js
  57. 2
      src/worker/node/terminateWorker.js
  58. 18
      tests/FS.test.html
  59. 37
      tests/FS.test.js
  60. BIN
      tests/assets/images/simple.gif
  61. BIN
      tests/assets/images/simple.webp
  62. 5
      tests/constants.js
  63. 8
      tests/recognize.test.js

6
.eslintrc

@ -1,5 +1,6 @@ @@ -1,5 +1,6 @@
{
"extends": "airbnb",
"extends": "airbnb-base",
"parser": "babel-eslint",
"env": {
"browser": true,
"node": true,
@ -10,6 +11,7 @@ @@ -10,6 +11,7 @@
"no-underscore-dangle": 0,
"no-console": 0,
"global-require": 0,
"camelcase": 0
"camelcase": 0,
"no-control-regex": 0
}
}

4
.github/FUNDING.yml

@ -4,6 +4,6 @@ github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, u @@ -4,6 +4,6 @@ github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, u
patreon: # Replace with a single Patreon username
open_collective: tesseractjs
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
tidelift: npm/tesseract.js
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
custom: # Replace with a single custom sponsorship URL
custom: ["https://etherscan.io/address/0x74ace8c74535d6dac03ebdc708ca2fba54796ef2"]

5
.github/SECURITY.md

@ -0,0 +1,5 @@ @@ -0,0 +1,5 @@
## Security contact information
To report a security vulnerability, please use the
[Tidelift security contact](https://tidelift.com/security).
Tidelift will coordinate the fix and disclosure.

71
.github/workflows/codeql-analysis.yml

@ -0,0 +1,71 @@ @@ -0,0 +1,71 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
name: "CodeQL"
on:
push:
branches: [master]
pull_request:
# The branches below must be a subset of the branches above
branches: [master]
schedule:
- cron: '0 17 * * 6'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
# Override automatic language detection by changing the below list
# Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python']
language: ['javascript']
# Learn more...
# https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection
steps:
- name: Checkout repository
uses: actions/checkout@v2
with:
# We must fetch at least the immediate parents so that if this is
# a pull request then we can checkout the head.
fetch-depth: 2
# If this run was triggered by a pull request event, then checkout
# the head of the pull request instead of the merge commit.
- run: git checkout HEAD^2
if: ${{ github.event_name == 'pull_request' }}
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v1
# ℹ Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
# ✏ If the Autobuild fails above, remove it and uncomment the following three lines
# and modify them (or add more) to build your code if your project
# uses a compiled language
#- run: |
# make bootstrap
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1

29
.github/workflows/node.js.yml

@ -0,0 +1,29 @@ @@ -0,0 +1,29 @@
# This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions
name: Node.js CI
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
node-version: [14.x, 16.x]
steps:
- uses: actions/checkout@v2
- name: Use Node.js ${{ matrix.node-version }}
uses: actions/setup-node@v1
with:
node-version: ${{ matrix.node-version }}
- run: npm ci
- run: npm run lint
- run: npm test

2
.gitpod.Dockerfile

@ -0,0 +1,2 @@ @@ -0,0 +1,2 @@
FROM gitpod/workspace-full
RUN sudo apt-get update && sudo apt-get install -y libgtk-3-0 libx11-xcb1 libnss3 libxss1 libasound2

4
.gitpod.yml

@ -1,7 +1,9 @@ @@ -1,7 +1,9 @@
image:
file: .gitpod.Dockerfile
tasks:
- command: gp await-port 3000 && sleep 3 && gp preview $(gp url 3000)/examples/browser/demo.html
- init: npm install
command: npm start
ports:
- port: 3000
onOpen: ignore
onOpen: ignore

7
.travis.yml

@ -1,7 +0,0 @@ @@ -1,7 +0,0 @@
language: node_js
node_js:
- "lts/*" # Use LTS version
script:
- npm run lint
- npm test

80
README.md

@ -1,8 +1,10 @@ @@ -1,8 +1,10 @@
<p align="center">
<a href="https://tesseract.projectnaptha.com/"><img alt="Tesseract.js" src="https://tesseract.projectnaptha.com/img/logo_small.png"></a>
<a href="https://tesseract.projectnaptha.com/"><img width="256px" height="256px" alt="Tesseract.js" src="./docs/images/tesseract.png"></a>
</p>
[![Build Status](https://travis-ci.org/naptha/tesseract.js.svg?branch=master)](https://travis-ci.org/naptha/tesseract.js)
![Lint & Test](https://github.com/naptha/tesseract.js/workflows/Node.js%20CI/badge.svg)
![CodeQL](https://github.com/naptha/tesseract.js/workflows/CodeQL/badge.svg)
[![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://github.com/naptha/tesseract.js)
[![Financial Contributors on Open Collective](https://opencollective.com/tesseractjs/all/badge.svg?label=financial+contributors)](https://opencollective.com/tesseractjs) [![npm version](https://badge.fury.io/js/tesseract.js.svg)](https://badge.fury.io/js/tesseract.js)
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/naptha/tesseract.js/graphs/commit-activity)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
@ -10,17 +12,19 @@ @@ -10,17 +12,19 @@
[![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
[![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
<h3 align="center">
Version 2 beta is now available and under development in the master branch<br>
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
</h3>
<br>
Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/))
Image Recognition
[![fancy demo gif](./docs/images/demo.gif)](http://tesseract.projectnaptha.com)
Video Real-time Recognition
<p align="center">
<a href="https://github.com/jeromewu/tesseract.js-video"><img alt="Tesseract.js Video" src="./docs/images/video-demo.gif"></a>
</p>
Tesseract.js wraps an [emscripten](https://github.com/kripken/emscripten) [port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) [OCR](https://en.wikipedia.org/wiki/Optical_character_recognition) Engine.
It works in the browser using [webpack](https://webpack.js.org/) or plain script tags with a [CDN](#CDN) and on the server with [Node.js](https://nodejs.org/en/).
After you [install it](#installation), using it is as simple as:
@ -52,21 +56,32 @@ const worker = createWorker({ @@ -52,21 +56,32 @@ const worker = createWorker({
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
await woker.terminate();
await worker.terminate();
})();
```
[Check out the docs](#docs) for a full explanation of the API.
## Major changes in v2 beta
- Upgrade to tesseract v4.1 (using emscripten 1.38.45)
- Support multiple languages at the same time, eg: eng+chi_tra for English and Traditional Chinese
[Check out the docs](#documentation) for a full explanation of the API.
## Major changes in v3
- Significantly faster performance
- Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data)
- Upgrade to Tesseract v5.1.0 (using emscripten 3.1.18)
- Added SIMD-enabled build for supported devices
- Added support:
- Node.js version 18
- Removed support:
- ASM.js version, any other old versions of Tesseract.js-core (<3.0.0)
- Node.js versions 10 and 12
## Major changes in v2
- Upgrade to tesseract v4.1.1 (using emscripten 1.39.10 upstream)
- Support multiple languages at the same time, eg: eng+chi\_tra for English and Traditional Chinese
- Supported image formats: png, jpg, bmp, pbm
- Support WebAssembly (fallback to ASM.js when browser doesn't support)
- Support Typescript
Read a story about v2: <a href="https://jeromewu.github.io/why-i-refactor-tesseract.js-v2/">Why I refactor tesseract.js v2?</a><br>
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
## Installation
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`.
@ -74,7 +89,7 @@ Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via @@ -74,7 +89,7 @@ Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via
### CDN
```html
<!-- v2 -->
<script src='https://unpkg.com/tesseract.js@v2.0.0-beta.1/dist/tesseract.min.js'></script>
<script src='https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js'></script>
<!-- v1 -->
<script src='https://unpkg.com/tesseract.js@1.0.19/src/index.js'></script>
@ -84,16 +99,16 @@ After including the script the `Tesseract` variable will be globally available. @@ -84,16 +99,16 @@ After including the script the `Tesseract` variable will be globally available.
### Node.js
**Tesseract.js currently requires Node.js v6.8.0 or higher**
**Tesseract.js v3 requires Node.js v14 or higher**
```shell
# For v2
npm install tesseract.js@next
yarn add tesseract.js@next
# For v1
# For v3
npm install tesseract.js
yarn add tesseract.js
# For v2
npm install tesseract.js@2
yarn add tesseract.js@2
```
@ -105,6 +120,19 @@ yarn add tesseract.js @@ -105,6 +120,19 @@ yarn add tesseract.js
* [Local Installation](./docs/local-installation.md)
* [FAQ](./docs/faq.md)
## Use tesseract.js the way you like!
- Offline Version: https://github.com/jeromewu/tesseract.js-offline
- Electron Version: https://github.com/jeromewu/tesseract.js-electron
- Custom Traineddata: https://github.com/jeromewu/tesseract.js-custom-traineddata
- Chrome Extension #1: https://github.com/jeromewu/tesseract.js-chrome-extension
- Chrome Extension #2: https://github.com/fxnoob/image-to-text
- Firefox Extension: https://github.com/gnonio/korporize
- With Vue: https://github.com/jeromewu/tesseract.js-vue-app
- With Angular: https://github.com/jeromewu/tesseract.js-angular-app
- With React: https://github.com/jeromewu/tesseract.js-react-app
- Typescript: https://github.com/jeromewu/tesseract.js-typescript
- Video Real-time Recognition: https://github.com/jeromewu/tesseract.js-video
## Contributing
@ -125,7 +153,9 @@ npm start @@ -125,7 +153,9 @@ npm start
The development server will be available at http://localhost:3000/examples/browser/demo.html in your favorite browser.
It will automatically rebuild `tesseract.dev.js` and `worker.dev.js` when you change files in the **src** folder.
You can also run the development server in Gitpod ( a free online IDE and dev environment for GitHub that will automate your dev setup ) with a single click.
### Online Setup with a single Click
You can use Gitpod (a free online VS Code-like IDE) for contributing. With a single click it launches a ready-to-code workspace with the build and start scripts already running, and within a few seconds it spins up the dev server so that you can start contributing straight away without wasting any time.
[![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/naptha/tesseract.js/blob/master/examples/browser/demo.html)

133
docs/api.md

@ -2,6 +2,10 @@ @@ -2,6 +2,10 @@
- [createWorker()](#create-worker)
- [Worker.load](#worker-load)
- [Worker.writeText](#worker-writeText)
- [Worker.readText](#worker-readText)
- [Worker.removeFile](#worker-removeFile)
- [Worker.FS](#worker-FS)
- [Worker.loadLanguage](#worker-load-language)
- [Worker.initialize](#worker-initialize)
- [Worker.setParameters](#worker-set-parameters)
@ -42,6 +46,7 @@ createWorker is a factory function that creates a tesseract worker, a worker is @@ -42,6 +46,7 @@ createWorker is a factory function that creates a tesseract worker, a worker is
- `workerBlobURL` a boolean to define whether to use Blob URL for worker script, default: true
- `gzip` a boolean to define whether the traineddata from the remote is gzipped, default: true
- `logger` a function to log the progress, a quick example is `m => console.log(m)`
- `errorHandler` a function to handle worker errors, a quick example is `err => console.error(err)`
**Examples:**
@ -59,6 +64,7 @@ const worker = createWorker({ @@ -59,6 +64,7 @@ const worker = createWorker({
A Worker helps you do OCR-related tasks; it takes a few steps to set up a Worker before it is fully functional. The full flow is:
- load
- FS functions // optional
- loadLanguage
- initialize
- setParameters // optional
@ -93,6 +99,84 @@ Worker.load() loads tesseract.js-core scripts (download from remote if not prese @@ -93,6 +99,84 @@ Worker.load() loads tesseract.js-core scripts (download from remote if not prese
})();
```
<a name="worker-writeText"></a>
### Worker.writeText(path, text, jobId): Promise
Worker.writeText() writes a text file to the path specified in MEMFS; it is useful when you want to use features that require tesseract.js
to read a file from the file system.
**Arguments:**
- `path` text file path
- `text` content of the text file
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.writeText('tmp.txt', 'Hi\nTesseract.js\n');
})();
```
<a name="worker-readText"></a>
### Worker.readText(path, jobId): Promise
Worker.readText() reads a text file from the path specified in MEMFS; it is useful when you want to check the content.
**Arguments:**
- `path` text file path
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
const { data } = await worker.readText('tmp.txt');
console.log(data);
})();
```
<a name="worker-removeFile"></a>
### Worker.removeFile(path, jobId): Promise
Worker.removeFile() removes a file in MEMFS; it is useful when you want to free the memory.
**Arguments:**
- `path` file path
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.removeFile('tmp.txt');
})();
```
<a name="worker-FS"></a>
### Worker.FS(method, args, jobId): Promise
Worker.FS() is a generic FS function to do anything you want; you can check [HERE](https://emscripten.org/docs/api_reference/Filesystem-API.html) for all functions.
**Arguments:**
- `method` method name
- `args` array of arguments to pass
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.FS('writeFile', ['tmp.txt', 'Hi\nTesseract.js\n']);
// equal to:
// await worker.writeText('tmp.txt', 'Hi\nTesseract.js\n');
})();
```
<a name="worker-load-language"></a>
### Worker.loadLanguage(langs, jobId): Promise
@ -143,17 +227,18 @@ Worker.setParameters() set parameters for Tesseract API (using SetVariable()), i @@ -143,17 +227,18 @@ Worker.setParameters() set parameters for Tesseract API (using SetVariable()), i
**Supported Parameters:**
| name | type | default value | description |
| ---- | ---- | ------------- | ----------- |
| tessedit\_ocr\_engine\_mode | enum | OEM.LSTM\_ONLY | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode |
| tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contains these characters, useful the content in image is limited |
| preserve\_interword\_spaces | string | '0' | '0' or '1', keeps the space between words |
| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result |
| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |
| tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result |
| tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
| name | type | default value | description |
| --------------------------- | ------ | ----------------- | ------------------------------------------------------------------------------------------------------------------------------- |
| tessedit\_ocr\_engine\_mode | enum | OEM.DEFAULT | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode |
| tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contain these characters, useful when the content in the image is limited |
| preserve\_interword\_spaces | string | '0' | '0' or '1', keeps the space between words |
| user\_defined\_dpi | string | '' | Define custom dpi, use to fix **Warning: Invalid resolution 0 dpi. Using 70 instead.** |
| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result |
| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |
| tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result |
| tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
**Examples:**
@ -177,8 +262,8 @@ Figures out what words are in `image`, where the words are in `image`, etc. @@ -177,8 +262,8 @@ Figures out what words are in `image`, where the words are in `image`, etc.
**Arguments:**
- `image` see [Image Format](./image-format.md) for more details.
- `options` a object of customized optons
- `rectangles` an array of objects to specify the region you want to recognized in the image, the object should contain top, left, width and height, see example below.
- `options` an object of customized options
- `rectangle` an object to specify the region you want to recognize in the image, should contain top, left, width and height, see example below.
- `jobId` Please see details above
**Output:**
@ -197,7 +282,7 @@ const { createWorker } = Tesseract; @@ -197,7 +282,7 @@ const { createWorker } = Tesseract;
})();
```
With rectangles
With rectangle
```javascript
const { createWorker } = Tesseract;
@ -207,9 +292,7 @@ const { createWorker } = Tesseract; @@ -207,9 +292,7 @@ const { createWorker } = Tesseract;
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image, {
rectangles: [
{ top: 0, left: 0, width: 100, height: 100 },
],
rectangle: { top: 0, left: 0, width: 100, height: 100 },
});
console.log(text);
})();
@ -242,11 +325,7 @@ const { createWorker } = Tesseract; @@ -242,11 +325,7 @@ const { createWorker } = Tesseract;
<a name="worker-terminate"></a>
### Worker.terminate(jobId): Promise
Worker.terminate() terminates the worker and clean up
**Arguments:**
- `jobId` Please see details above
Worker.terminate() terminates the worker and cleans up
```javascript
(async () => {
@ -257,7 +336,7 @@ Worker.terminate() terminates the worker and clean up @@ -257,7 +336,7 @@ Worker.terminate() terminates the worker and clean up
<a name="create-scheduler"></a>
## createScheduler(): Scheduler
createScheduler() is a factory function to create a scheduler, a scheduler manage a job queue and workers to enable multiple workers to work together, it is useful when you want to speed up your performance.
createScheduler() is a factory function to create a scheduler, a scheduler manages a job queue and workers to enable multiple workers to work together, it is useful when you want to speed up your performance.
**Examples:**
@ -271,7 +350,7 @@ const scheduler = createScheduler(); @@ -271,7 +350,7 @@ const scheduler = createScheduler();
<a name="scheduler-add-worker"></a>
### Scheduler.addWorker(worker): string
Scheduler.addWorker() adds a worker into the worker pool inside scheduler, it is suggested to add one worker to only one sheduler.
Scheduler.addWorker() adds a worker into the worker pool inside scheduler, it is suggested to add one worker to only one scheduler.
**Arguments:**
@ -354,16 +433,16 @@ See [Tesseract.js](../src/Tesseract.js) @@ -354,16 +433,16 @@ See [Tesseract.js](../src/Tesseract.js)
<a name="detect"></a>
## detect(image, options): Promise
Same background as recongize(), but it does detect instead.
Same background as recognize(), but it does detect instead.
See [Tesseract.js](../src/Tesseract.js)
<a name="psm"></a>
## PSM
See [PSM.js](../src/constatns/PSM.js)
See [PSM.js](../src/constants/PSM.js)
<a name="oem"></a>
## OEM
See [OEM.js](../src/constatns/OEM.js)
See [OEM.js](../src/constants/OEM.js)

106
docs/examples.md

@ -2,17 +2,10 @@ @@ -2,17 +2,10 @@
You can also check [examples](../examples) folder.
Example repositories:
- Offline version: https://github.com/jeromewu/tesseract.js-offline
- With Vue: https://github.com/jeromewu/tesseract.js-vue-app
- With Angular: https://github.com/jeromewu/tesseract.js-angular-app
- Chrome Extension: https://github.com/jeromewu/tesseract.js-chrome-extension
### basic
```javascript
import { createWorker } from 'tesseract.js';
const { createWorker } = require('tesseract.js');
const worker = createWorker();
@ -29,7 +22,7 @@ const worker = createWorker(); @@ -29,7 +22,7 @@ const worker = createWorker();
### with detailed progress
```javascript
import { createWorker } from 'tesseract.js';
const { createWorker } = require('tesseract.js');
const worker = createWorker({
logger: m => console.log(m), // Add logger here
@ -48,7 +41,7 @@ const worker = createWorker({ @@ -48,7 +41,7 @@ const worker = createWorker({
### with multiple languages, separate by '+'
```javascript
import { createWorker } from 'tesseract.js';
const { createWorker } = require('tesseract.js');
const worker = createWorker();
@ -64,7 +57,7 @@ const worker = createWorker(); @@ -64,7 +57,7 @@ const worker = createWorker();
### with whitelist char (^2.0.0-beta.1)
```javascript
import { createWorker } from 'tesseract.js';
const { createWorker } = require('tesseract.js');
const worker = createWorker();
@ -86,7 +79,7 @@ const worker = createWorker(); @@ -86,7 +79,7 @@ const worker = createWorker();
Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163
```javascript
import { createWorker, PSM } from 'tesseract.js';
const { createWorker, PSM } = require('tesseract.js');
const worker = createWorker();
@ -110,30 +103,105 @@ Please check **examples** folder for details. @@ -110,30 +103,105 @@ Please check **examples** folder for details.
Browser: [download-pdf.html](../examples/browser/download-pdf.html)
Node: [download-pdf.js](../examples/node/download-pdf.js)
### with only part of the image (^2.0.0-beta.1)
### with only part of the image (^2.0.1)
**One rectangle**
```javascript
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const rectangle = { left: 0, top: 0, width: 500, height: 250 };
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle });
console.log(text);
await worker.terminate();
})();
```
**Multiple Rectangles**
```javascript
import { createWorker } from 'tesseract.js';
const { createWorker } = require('tesseract.js');
const worker = createWorker();
const rectangles = [
{ left: 0, top: 0, width: 500, height: 250 },
{
left: 0,
top: 0,
width: 500,
height: 250,
},
{
left: 500,
top: 0,
width: 500,
height: 250,
},
];
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', 'eng', { rectangles });
console.log(text);
const values = [];
for (let i = 0; i < rectangles.length; i++) {
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle: rectangles[i] });
values.push(text);
}
console.log(values);
await worker.terminate();
})();
```
**Multiple Rectangles (with scheduler to do recognition in parallel)**
```javascript
const { createWorker, createScheduler } = require('tesseract.js');
const scheduler = createScheduler();
const worker1 = createWorker();
const worker2 = createWorker();
const rectangles = [
{
left: 0,
top: 0,
width: 500,
height: 250,
},
{
left: 500,
top: 0,
width: 500,
height: 250,
},
];
(async () => {
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng');
await worker2.loadLanguage('eng');
await worker1.initialize('eng');
await worker2.initialize('eng');
scheduler.addWorker(worker1);
scheduler.addWorker(worker2);
const results = await Promise.all(rectangles.map((rectangle) => (
scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle })
)));
console.log(results.map(r => r.data.text));
await scheduler.terminate();
})();
```
### with multiple workers to speed up (^2.0.0-beta.1)
```javascript
import { createWorker, createScheduler } from 'tesseract.js';
const { createWorker, createScheduler } = require('tesseract.js');
const scheduler = createScheduler();
const worker1 = createWorker();
@ -150,7 +218,7 @@ const worker2 = createWorker(); @@ -150,7 +218,7 @@ const worker2 = createWorker();
scheduler.addWorker(worker2);
/** Add 10 recognition jobs */
const results = await Promise.all(Array(10).fill(0).map(() => (
await scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png')
scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png')
)))
console.log(results);
await scheduler.terminate(); // It also terminates all workers.

4
docs/faq.md

@ -9,9 +9,9 @@ During the downloading of language model, Tesseract.js will first check if \*.tr @@ -9,9 +9,9 @@ During the downloading of language model, Tesseract.js will first check if \*.tr
## How can I train my own \*.traineddata?
For tesseract.js v2, check [TrainingTesseract 4.00](https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00)
For tesseract.js v2, check [TrainingTesseract 4.00](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00)
For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://github.com/tesseract-ocr/tesseract/wiki/Training-Tesseract-3.03%E2%80%933.05)
For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://tesseract-ocr.github.io/tessdoc/Training-Tesseract-3.03%E2%80%933.05)
## How can I get HOCR, TSV, Box, UNLV, OSD?

25
docs/image-format.md

@ -1,17 +1,18 @@ @@ -1,17 +1,18 @@
# Image Format
Support Format: **bmp, jpg, png, pbm**
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter. The image formats and data types supported are listed below.
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter, which should be something that is like an image. What's considered "image-like" differs depending on whether it is being run from the browser or through NodeJS.
Supported Image Formats: **bmp, jpg, png, pbm, webp**
On a browser, an image can be:
- an `img`, `video`, or `canvas` element
- a `File` object (from a file `<input>`)
- a `Blob` object
- a path or URL to an accessible image
- a base64 encoded image fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser and Node, supported data types are:
- string with base64 encoded image (fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp)
- buffer
In Node.js, an image can be
- a path to a local image
- a Buffer storing binary image
- a base64 encoded image fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser only, supported data types are:
- `File` or `Blob` object
- `img` or `canvas` element
For Node only, supported data types are:
- string containing a path to local image
Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported.

BIN
docs/images/tesseract.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 105 KiB

BIN
docs/images/video-demo.gif

Binary file not shown.

After

Width:  |  Height:  |  Size: 237 KiB

12
docs/local-installation.md

@ -10,9 +10,9 @@ In Node.js environment, the only path you may want to customize is languages/lan @@ -10,9 +10,9 @@ In Node.js environment, the only path you may want to customize is languages/lan
```javascript
Tesseract.recognize(image, langs, {
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-beta.1/dist/worker.min.js',
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.wasm.js',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js',
})
```
@ -20,9 +20,9 @@ Or @@ -20,9 +20,9 @@ Or
```javascript
const worker = createWorker({
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-beta.1/dist/worker.min.js',
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.wasm.js',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js',
});
```
@ -33,6 +33,6 @@ A string specifying the location of the [worker.js](./dist/worker.min.js) file. @@ -33,6 +33,6 @@ A string specifying the location of the [worker.js](./dist/worker.min.js) file.
A string specifying the location of the tesseract language files, with default value 'https://tessdata.projectnaptha.com/4.0.0'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`.
### corePath
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available).
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available).
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.13/tesseract-core.wasm'. But it fails to fetch at this moment.
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm'. But it fails to fetch at this moment.

2
docs/tesseract_lang_list.md

@ -1,3 +1,3 @@ @@ -1,3 +1,3 @@
# Tesseract Languages
Please check [HERE](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files#data-files-for-version-400-november-29-2016) for supported languages
Please check [HERE](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) for supported languages

37
examples/browser/basic-edge.html

@ -0,0 +1,37 @@ @@ -0,0 +1,37 @@
<!DOCTYPE HTML>
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<input type="file" id="uploader">
<script>
/*
 * Change handler for the file <input>: runs OCR on the first selected file
 * and logs the recognized text to the console.
 *
 * A fresh worker is created for every invocation, so it is terminated once
 * the job finishes (successfully or not) to avoid leaking workers.
 */
const recognize = function(evt){
  const files = evt.target.files;
  // Nothing to do when the selection was cleared.
  if (!files || files.length === 0) {
    return;
  }
  const worker = Tesseract.createWorker({
    /*
     * As Edge doesn't support WebAssembly,
     * here we force the use of the asm.js version.
     */
    corePath: '../../node_modules/tesseract.js-core/tesseract-core.asm.js',
    logger: function(m){console.log(m);},
    /*
     * As there is no indexedDB in earlier versions
     * of Edge, here we disable the cache.
     */
    cacheMethod: 'none',
  });
  Promise.resolve()
    .then(() => worker.load())
    .then(() => worker.loadLanguage('eng'))
    .then(() => worker.initialize('eng'))
    .then(() => worker.recognize(files[0]))
    .then((ret) => {
      console.log(ret.data.text);
    })
    // Surface failures instead of leaving an unhandled rejection.
    .catch((err) => {
      console.error(err);
    })
    // Runs after success or failure, so the worker is always released.
    .then(() => worker.terminate());
}
const elm = document.getElementById('uploader');
elm.addEventListener('change', recognize);
</script>
</body>
</html>

33
examples/browser/benchmark.html

@ -0,0 +1,33 @@ @@ -0,0 +1,33 @@
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<textarea id="message">Working...</textarea>
<script>
const { createWorker } = Tesseract;
const worker = createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"];
let timeTotal = 0;
for (let file of fileArr) {
let time1 = Date.now();
for (let i=0; i < 10; i++) {
await worker.recognize(file);
}
let time2 = Date.now();
const timeDif = (time2 - time1) / 1e3;
timeTotal += timeDif;
document.getElementById('message').innerHTML += "\n" + file + " [x10] runtime: " + timeDif + "s";
}
document.getElementById('message').innerHTML += "\nTotal runtime: " + timeTotal + "s";
})();
</script>
</body>
</html>

3
examples/browser/demo.html

@ -27,7 +27,7 @@ function progressUpdate(packet){ @@ -27,7 +27,7 @@ function progressUpdate(packet){
if(packet.status == 'done'){
var pre = document.createElement('pre')
pre.appendChild(document.createTextNode(packet.data.text))
pre.appendChild(document.createTextNode(packet.data.data.text))
line.innerHTML = ''
line.appendChild(pre)
@ -71,7 +71,6 @@ async function recognizeFile(file) { @@ -71,7 +71,6 @@ async function recognizeFile(file) {
<option value='meme' > Internet Meme </option>
<option value='epo' > Esperanto </option>
<option value='epo_alt' > Esperanto alternative </option>
<option value='equ' > Math </option>
<option value='est' > Estonian </option>
<option value='eus' > Basque </option>
<option value='fin' > Finnish </option>

BIN
examples/data/meditations.jpg

Binary file not shown.

After

Width:  |  Height:  |  Size: 1011 KiB

BIN
examples/data/testocr.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

BIN
examples/data/tyger.jpg

Binary file not shown.

After

Width:  |  Height:  |  Size: 408 KiB

27
examples/node/benchmark.js

@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
#!/usr/bin/env node
const path = require('path');
const { createWorker } = require('../../');
const worker = createWorker();
/*
 * Benchmark entry point: recognizes each sample image 10 times with a single
 * worker and prints the per-image and total wall-clock runtime in seconds.
 */
(async () => {
  try {
    await worker.load();
    await worker.loadLanguage('eng');
    await worker.initialize('eng');
    const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"];
    let timeTotal = 0;
    for (const file of fileArr) {
      // Resolve against this script's directory so the benchmark can be
      // launched from any working directory (the labels stay relative).
      const imagePath = path.resolve(__dirname, file);
      const time1 = Date.now();
      for (let i = 0; i < 10; i++) {
        await worker.recognize(imagePath);
      }
      const time2 = Date.now();
      const timeDif = (time2 - time1) / 1e3;
      timeTotal += timeDif;
      console.log(file + " [x10] runtime: " + timeDif + "s");
    }
    console.log("Total runtime: " + timeTotal + "s");
  } finally {
    // Always release the worker, even when loading or recognition fails.
    await worker.terminate();
  }
})();

17
examples/node/recognize.js

@ -1,13 +1,20 @@ @@ -1,13 +1,20 @@
#!/usr/bin/env node
const path = require('path');
const Tesseract = require('../../');
const { createWorker } = require('../../');
const [,, imagePath] = process.argv;
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png'));
console.log(`Recognizing ${image}`);
const worker = createWorker({
logger: m => console.log(m),
});
Tesseract.recognize(image, 'eng', { logger: m => console.log(m) })
.then(({ data: { text } }) => {
console.log(text);
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image);
console.log(text);
await worker.terminate();
})();

22438
package-lock.json generated

File diff suppressed because it is too large Load Diff

59
package.json

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
{
"name": "tesseract.js",
"version": "2.0.0-beta.1",
"version": "3.0.3",
"description": "Pure Javascript Multilingual OCR",
"main": "src/index.js",
"types": "src/index.d.ts",
@ -8,18 +8,22 @@ @@ -8,18 +8,22 @@
"jsdelivr": "dist/tesseract.min.js",
"scripts": {
"start": "node scripts/server.js",
"build": "rimraf dist && webpack --config scripts/webpack.config.prod.js",
"build": "rimraf dist && webpack --config scripts/webpack.config.prod.js && rollup -c scripts/rollup.esm.js",
"profile:tesseract": "webpack-bundle-analyzer dist/tesseract-stats.json",
"profile:worker": "webpack-bundle-analyzer dist/worker-stats.json",
"prepublishOnly": "npm run build",
"wait": "rimraf dist && wait-on http://localhost:3000/dist/tesseract.dev.js",
"test": "npm-run-all -p -r start test:all",
"test:all": "npm-run-all wait test:browser:* test:node:all",
"test:node": "nyc mocha --exit --bail --require ./scripts/test-helper.js",
"test:node:all": "npm run test:node:one -- ./tests/*.test.js",
"test:node:all": "npm run test:node -- ./tests/*.test.js",
"test:browser-tpl": "mocha-headless-chrome -a incognito -a no-sandbox -a disable-setuid-sandbox -a disable-logging -t 300000",
"test:browser:detect": "npm run test:browser-tpl -- -f ./tests/detect.test.html",
"test:browser:recognize": "npm run test:browser-tpl -- -f ./tests/recognize.test.html",
"test:browser:scheduler": "npm run test:browser-tpl -- -f ./tests/scheduler.test.html",
"test:browser:FS": "npm run test:browser-tpl -- -f ./tests/FS.test.html",
"lint": "eslint src",
"lint:fix": "eslint --fix src",
"postinstall": "opencollective-postinstall || true"
},
"browser": {
@ -31,38 +35,43 @@ @@ -31,38 +35,43 @@
],
"license": "Apache-2.0",
"devDependencies": {
"@babel/core": "^7.4.5",
"@babel/preset-env": "^7.4.5",
"acorn": "^6.1.1",
"babel-loader": "^8.0.6",
"@babel/core": "^7.18.7",
"@babel/preset-env": "^7.18.7",
"@rollup/plugin-commonjs": "^22.0.2",
"acorn": "^6.4.0",
"babel-loader": "^8.2.0",
"buffer": "^6.0.3",
"cors": "^2.8.5",
"eslint": "^5.9.0",
"eslint-config-airbnb": "^17.1.0",
"eslint-plugin-import": "^2.14.0",
"eslint-plugin-jsx-a11y": "^6.1.2",
"eslint-plugin-react": "^7.11.1",
"eslint": "^7.2.0",
"eslint-config-airbnb-base": "^14.2.0",
"eslint-plugin-import": "^2.22.1",
"expect.js": "^0.3.1",
"express": "^4.16.4",
"mocha": "^5.2.0",
"mocha-headless-chrome": "^2.0.2",
"express": "^4.17.1",
"mocha": "^8.1.3",
"mocha-headless-chrome": "^2.0.3",
"npm-run-all": "^4.1.5",
"nyc": "^13.1.0",
"rimraf": "^2.6.3",
"wait-on": "^3.2.0",
"webpack": "^4.26.0",
"webpack-cli": "^3.1.2",
"webpack-dev-middleware": "^3.4.0"
"nyc": "^15.1.0",
"rimraf": "^2.7.1",
"rollup": "^2.79.0",
"wait-on": "^3.3.0",
"webpack": "^5.74.0",
"webpack-bundle-analyzer": "^4.6.0",
"webpack-cli": "^4.10.0",
"webpack-dev-middleware": "^5.3.3"
},
"dependencies": {
"axios": "^0.18.0",
"babel-eslint": "^10.1.0",
"bmp-js": "^0.1.0",
"file-type": "^12.3.0",
"file-type": "^12.4.1",
"idb-keyval": "^3.2.0",
"is-url": "1.2.2",
"is-electron": "^2.2.0",
"is-url": "^1.2.4",
"node-fetch": "^2.6.0",
"opencollective-postinstall": "^2.0.2",
"regenerator-runtime": "^0.13.3",
"resolve-url": "^0.2.1",
"tesseract.js-core": "^2.0.0-beta.13",
"tesseract.js-core": "^3.0.2",
"wasm-feature-detect": "^1.2.11",
"zlibjs": "^0.3.1"
},
"repository": {

13
scripts/rollup.esm.js

@ -0,0 +1,13 @@ @@ -0,0 +1,13 @@
import commonjs from "@rollup/plugin-commonjs";
export default [
{
input: "dist/tesseract.min.js",
output: {
file: "dist/tesseract.esm.min.js",
format: "esm",
banner: "/* eslint-disable */",
},
plugins: [commonjs()],
},
];

2
scripts/server.js

@ -13,5 +13,5 @@ app.use('/', express.static(path.resolve(__dirname, '..'))); @@ -13,5 +13,5 @@ app.use('/', express.static(path.resolve(__dirname, '..')));
app.use(middleware(compiler, { publicPath: '/dist', writeToDisk: true }));
module.exports = app.listen(3000, () => {
console.log('Server is running on port 3000');
console.log('Server is running on the port no. 3000');
});

5
scripts/webpack.config.common.js

@ -1,4 +1,9 @@ @@ -1,4 +1,9 @@
module.exports = {
resolve: {
fallback: {
buffer: require.resolve('buffer/'),
},
},
module: {
rules: [
{

9
scripts/webpack.config.dev.js

@ -1,5 +1,6 @@ @@ -1,5 +1,6 @@
const path = require('path');
const webpack = require('webpack');
const { BundleAnalyzerPlugin } = require('webpack-bundle-analyzer');
const common = require('./webpack.config.common');
const genConfig = ({
@ -14,11 +15,19 @@ const genConfig = ({ @@ -14,11 +15,19 @@ const genConfig = ({
libraryTarget,
},
plugins: [
new webpack.ProvidePlugin({
Buffer: ['buffer', 'Buffer'],
}),
new webpack.DefinePlugin({
'process.env': {
TESS_ENV: JSON.stringify('development'),
},
}),
new BundleAnalyzerPlugin({
analyzerMode: 'disable',
statsFilename: `${filename.split('.')[0]}-stats.json`,
generateStatsFile: true
}),
],
devServer: {
allowedHosts: ['localhost', '.gitpod.io'],

6
scripts/webpack.config.prod.js

@ -1,5 +1,6 @@ @@ -1,5 +1,6 @@
const path = require('path');
const common = require('./webpack.config.common');
const webpack = require('webpack');
const genConfig = ({
entry, filename, library, libraryTarget,
@ -14,6 +15,11 @@ const genConfig = ({ @@ -14,6 +15,11 @@ const genConfig = ({
library,
libraryTarget,
},
plugins: [
new webpack.ProvidePlugin({
Buffer: ['buffer', 'Buffer'],
}),
]
});
module.exports = [

1
src/constants/PSM.js

@ -15,4 +15,5 @@ module.exports = { @@ -15,4 +15,5 @@ module.exports = {
SINGLE_CHAR: '10',
SPARSE_TEXT: '11',
SPARSE_TEXT_OSD: '12',
RAW_LINE: '13',
};

218
src/constants/languages.js

@ -0,0 +1,218 @@ @@ -0,0 +1,218 @@
/*
* languages with existing tesseract traineddata
* https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
*/
/**
* @typedef {object} Languages
* @property {string} AFR Afrikaans
* @property {string} AMH Amharic
* @property {string} ARA Arabic
* @property {string} ASM Assamese
* @property {string} AZE Azerbaijani
* @property {string} AZE_CYRL Azerbaijani - Cyrillic
* @property {string} BEL Belarusian
* @property {string} BEN Bengali
* @property {string} BOD Tibetan
* @property {string} BOS Bosnian
* @property {string} BUL Bulgarian
* @property {string} CAT Catalan; Valencian
* @property {string} CEB Cebuano
* @property {string} CES Czech
* @property {string} CHI_SIM Chinese - Simplified
* @property {string} CHI_TRA Chinese - Traditional
* @property {string} CHR Cherokee
* @property {string} CYM Welsh
* @property {string} DAN Danish
* @property {string} DEU German
* @property {string} DZO Dzongkha
* @property {string} ELL Greek, Modern (1453-)
* @property {string} ENG English
* @property {string} ENM English, Middle (1100-1500)
* @property {string} EPO Esperanto
* @property {string} EST Estonian
* @property {string} EUS Basque
* @property {string} FAS Persian
* @property {string} FIN Finnish
* @property {string} FRA French
* @property {string} FRK German Fraktur
* @property {string} FRM French, Middle (ca. 1400-1600)
* @property {string} GLE Irish
* @property {string} GLG Galician
* @property {string} GRC Greek, Ancient (-1453)
* @property {string} GUJ Gujarati
* @property {string} HAT Haitian; Haitian Creole
* @property {string} HEB Hebrew
* @property {string} HIN Hindi
* @property {string} HRV Croatian
* @property {string} HUN Hungarian
* @property {string} IKU Inuktitut
* @property {string} IND Indonesian
* @property {string} ISL Icelandic
* @property {string} ITA Italian
* @property {string} ITA_OLD Italian - Old
* @property {string} JAV Javanese
* @property {string} JPN Japanese
* @property {string} KAN Kannada
* @property {string} KAT Georgian
* @property {string} KAT_OLD Georgian - Old
* @property {string} KAZ Kazakh
* @property {string} KHM Central Khmer
* @property {string} KIR Kirghiz; Kyrgyz
* @property {string} KOR Korean
* @property {string} KUR Kurdish
* @property {string} LAO Lao
* @property {string} LAT Latin
* @property {string} LAV Latvian
* @property {string} LIT Lithuanian
* @property {string} MAL Malayalam
* @property {string} MAR Marathi
* @property {string} MKD Macedonian
* @property {string} MLT Maltese
* @property {string} MSA Malay
* @property {string} MYA Burmese
* @property {string} NEP Nepali
* @property {string} NLD Dutch; Flemish
* @property {string} NOR Norwegian
* @property {string} ORI Oriya
* @property {string} PAN Panjabi; Punjabi
* @property {string} POL Polish
* @property {string} POR Portuguese
* @property {string} PUS Pushto; Pashto
* @property {string} RON Romanian; Moldavian; Moldovan
* @property {string} RUS Russian
* @property {string} SAN Sanskrit
* @property {string} SIN Sinhala; Sinhalese
* @property {string} SLK Slovak
* @property {string} SLV Slovenian
* @property {string} SPA Spanish; Castilian
* @property {string} SPA_OLD Spanish; Castilian - Old
* @property {string} SQI Albanian
* @property {string} SRP Serbian
* @property {string} SRP_LATN Serbian - Latin
* @property {string} SWA Swahili
* @property {string} SWE Swedish
* @property {string} SYR Syriac
* @property {string} TAM Tamil
* @property {string} TEL Telugu
* @property {string} TGK Tajik
* @property {string} TGL Tagalog
* @property {string} THA Thai
* @property {string} TIR Tigrinya
* @property {string} TUR Turkish
* @property {string} UIG Uighur; Uyghur
* @property {string} UKR Ukrainian
* @property {string} URD Urdu
* @property {string} UZB Uzbek
* @property {string} UZB_CYRL Uzbek - Cyrillic
* @property {string} VIE Vietnamese
* @property {string} YID Yiddish
*/
/**
* @type {Languages}
*/
module.exports = {
AFR: 'afr',
AMH: 'amh',
ARA: 'ara',
ASM: 'asm',
AZE: 'aze',
AZE_CYRL: 'aze_cyrl',
BEL: 'bel',
BEN: 'ben',
BOD: 'bod',
BOS: 'bos',
BUL: 'bul',
CAT: 'cat',
CEB: 'ceb',
CES: 'ces',
CHI_SIM: 'chi_sim',
CHI_TRA: 'chi_tra',
CHR: 'chr',
CYM: 'cym',
DAN: 'dan',
DEU: 'deu',
DZO: 'dzo',
ELL: 'ell',
ENG: 'eng',
ENM: 'enm',
EPO: 'epo',
EST: 'est',
EUS: 'eus',
FAS: 'fas',
FIN: 'fin',
FRA: 'fra',
FRK: 'frk',
FRM: 'frm',
GLE: 'gle',
GLG: 'glg',
GRC: 'grc',
GUJ: 'guj',
HAT: 'hat',
HEB: 'heb',
HIN: 'hin',
HRV: 'hrv',
HUN: 'hun',
IKU: 'iku',
IND: 'ind',
ISL: 'isl',
ITA: 'ita',
ITA_OLD: 'ita_old',
JAV: 'jav',
JPN: 'jpn',
KAN: 'kan',
KAT: 'kat',
KAT_OLD: 'kat_old',
KAZ: 'kaz',
KHM: 'khm',
KIR: 'kir',
KOR: 'kor',
KUR: 'kur',
LAO: 'lao',
LAT: 'lat',
LAV: 'lav',
LIT: 'lit',
MAL: 'mal',
MAR: 'mar',
MKD: 'mkd',
MLT: 'mlt',
MSA: 'msa',
MYA: 'mya',
NEP: 'nep',
NLD: 'nld',
NOR: 'nor',
ORI: 'ori',
PAN: 'pan',
POL: 'pol',
POR: 'por',
PUS: 'pus',
RON: 'ron',
RUS: 'rus',
SAN: 'san',
SIN: 'sin',
SLK: 'slk',
SLV: 'slv',
SPA: 'spa',
SPA_OLD: 'spa_old',
SQI: 'sqi',
SRP: 'srp',
SRP_LATN: 'srp_latn',
SWA: 'swa',
SWE: 'swe',
SYR: 'syr',
TAM: 'tam',
TEL: 'tel',
TGK: 'tgk',
TGL: 'tgl',
THA: 'tha',
TIR: 'tir',
TUR: 'tur',
UIG: 'uig',
UKR: 'ukr',
URD: 'urd',
UZB: 'uzb',
UZB_CYRL: 'uzb_cyrl',
VIE: 'vie',
YID: 'yid',
};

51
src/createWorker.js

@ -19,6 +19,7 @@ module.exports = (_options = {}) => { @@ -19,6 +19,7 @@ module.exports = (_options = {}) => {
const id = getId('Worker', workerCounter);
const {
logger,
errorHandler,
...options
} = resolvePaths({
...defaultOptions,
@ -52,12 +53,44 @@ module.exports = (_options = {}) => { @@ -52,12 +53,44 @@ module.exports = (_options = {}) => {
})
);
const load = jobId => (
const load = (jobId) => (
startJob(createJob({
id: jobId, action: 'load', payload: { options },
}))
);
const writeText = (path, text, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method: 'writeFile', args: [path, text] },
}))
);
const readText = (path, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method: 'readFile', args: [path, { encoding: 'utf8' }] },
}))
);
const removeFile = (path, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method: 'unlink', args: [path] },
}))
);
const FS = (method, args, jobId) => (
startJob(createJob({
id: jobId,
action: 'FS',
payload: { method, args },
}))
);
const loadLanguage = (langs = 'eng', jobId) => (
startJob(createJob({
id: jobId,
@ -106,12 +139,14 @@ module.exports = (_options = {}) => { @@ -106,12 +139,14 @@ module.exports = (_options = {}) => {
}))
);
const terminate = async (jobId) => {
const terminate = async () => {
if (worker !== null) {
/*
await startJob(createJob({
id: jobId,
action: 'terminate',
}));
*/
terminateWorker(worker);
worker = null;
}
@ -132,9 +167,13 @@ module.exports = (_options = {}) => { @@ -132,9 +167,13 @@ module.exports = (_options = {}) => {
resolves[action]({ jobId, data: d });
} else if (status === 'reject') {
rejects[action](data);
throw Error(data);
if (errorHandler) {
errorHandler(data);
} else {
throw Error(data);
}
} else if (status === 'progress') {
logger(data);
logger({ ...data, userJobId: jobId });
}
});
@ -144,6 +183,10 @@ module.exports = (_options = {}) => { @@ -144,6 +183,10 @@ module.exports = (_options = {}) => {
setResolve,
setReject,
load,
writeText,
readText,
removeFile,
FS,
loadLanguage,
initialize,
setParameters,

37
src/index.d.ts vendored

@ -3,7 +3,7 @@ declare namespace Tesseract { @@ -3,7 +3,7 @@ declare namespace Tesseract {
function createWorker(options?: Partial<WorkerOptions>): Worker
function setLogging(logging: boolean): void
function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult>
function detect(image: ImageLike, options?: Partial<WorkerOptions>)
function detect(image: ImageLike, options?: Partial<WorkerOptions>): any
interface Scheduler {
addWorker(worker: Worker): string
@ -15,12 +15,22 @@ declare namespace Tesseract { @@ -15,12 +15,22 @@ declare namespace Tesseract {
interface Worker {
load(jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string, jobId?: string): Promise<ConfigResult>
initialize(langs?: string, oem?: OEM, jobId?: string): Promise<ConfigResult>
writeText(path: string, text: string, jobId?: string): Promise<ConfigResult>
readText(path: string, jobId?: string): Promise<ConfigResult>
removeText(path: string, jobId?: string): Promise<ConfigResult>
FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult>
initialize(langs?: string | Lang[], oem?: OEM, jobId?: string): Promise<ConfigResult>
setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult>
detect(image: ImageLike, jobId?: string): Promise<DetectResult>
terminate(jobId?: string): Promise<ConfigResult>
getPDF(title?: string, textonly?: boolean, jobId?: string):Promise<GetPDFResult>
}
interface Lang {
code: string;
data: unknown;
}
interface WorkerOptions {
@ -32,12 +42,15 @@ declare namespace Tesseract { @@ -32,12 +42,15 @@ declare namespace Tesseract {
cacheMethod: string
workerBlobURL: boolean
gzip: boolean
logger: (any) => void
logger: (arg: any) => void,
errorHandler: (arg: any) => void
}
interface WorkerParams {
tessedit_ocr_engine_mode: OEM
tessedit_pageseg_mode: PSM
tessedit_char_whiltelist: string
tessedit_char_whitelist: string
preserve_interword_spaces: string
user_defined_dpi: string
tessjs_create_hocr: string
tessjs_create_tsv: string
tessjs_create_box: string
@ -45,7 +58,7 @@ declare namespace Tesseract { @@ -45,7 +58,7 @@ declare namespace Tesseract {
tessjs_create_osd: string
}
interface RecognizeOptions {
rectangles: Rectangle[]
rectangle: Rectangle
}
interface ConfigResult {
jobId: string
@ -55,6 +68,10 @@ declare namespace Tesseract { @@ -55,6 +68,10 @@ declare namespace Tesseract {
jobId: string
data: Page
}
interface GetPDFResult {
jobId: string
data: number[]
}
interface DetectResult {
jobId: string
data: DetectData
@ -72,13 +89,13 @@ declare namespace Tesseract { @@ -72,13 +89,13 @@ declare namespace Tesseract {
width: number
height: number
}
const enum OEM {
enum OEM {
TESSERACT_ONLY,
LSTM_ONLY,
TESSERACT_LSTM_COMBINED,
DEFAULT,
}
const enum PSM {
enum PSM {
OSD_ONLY = '0',
AUTO_OSD = '1',
AUTO_ONLY = '2',
@ -92,11 +109,12 @@ declare namespace Tesseract { @@ -92,11 +109,12 @@ declare namespace Tesseract {
SINGLE_CHAR = '10',
SPARSE_TEXT = '11',
SPARSE_TEXT_OSD = '12',
RAW_LINE = '13'
}
type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement
| CanvasRenderingContext2D | File | Blob | ImageData | Buffer;
interface Block {
paragraphs: Paragraph;
paragraphs: Paragraph[];
text: string;
confidence: number;
baseline: Baseline;
@ -194,6 +212,7 @@ declare namespace Tesseract { @@ -194,6 +212,7 @@ declare namespace Tesseract {
confidence: number;
lines: Line[];
oem: string;
osd: string;
paragraphs: Paragraph[];
psm: string;
symbols: Symbol[];

2
src/index.js

@ -11,11 +11,13 @@ require('regenerator-runtime/runtime'); @@ -11,11 +11,13 @@ require('regenerator-runtime/runtime');
const createScheduler = require('./createScheduler');
const createWorker = require('./createWorker');
const Tesseract = require('./Tesseract');
const languages = require('./constants/languages');
const OEM = require('./constants/OEM');
const PSM = require('./constants/PSM');
const { setLogging } = require('./utils/log');
module.exports = {
languages,
OEM,
PSM,
createScheduler,

17
src/utils/getEnvironment.js

@ -1,10 +1,21 @@ @@ -1,10 +1,21 @@
const isElectron = require('is-electron');
module.exports = (key) => {
const env = {
type: (typeof window !== 'undefined') && (typeof window.document !== 'undefined') ? 'browser' : 'node',
};
const env = {};
if (typeof WorkerGlobalScope !== 'undefined') {
env.type = 'webworker';
} else if (isElectron()) {
env.type = 'electron';
} else if (typeof window === 'object') {
env.type = 'browser';
} else if (typeof process === 'object' && typeof require === 'function') {
env.type = 'node';
}
if (typeof key === 'undefined') {
return env;
}
return env[key];
};

2
src/utils/resolvePaths.js

@ -4,7 +4,7 @@ const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disabl @@ -4,7 +4,7 @@ const resolveURL = isBrowser ? require('resolve-url') : s => s; // eslint-disabl
module.exports = (options) => {
const opts = { ...options };
['corePath', 'workerPath', 'langPath'].forEach((key) => {
if (typeof options[key] !== 'undefined') {
if (options[key]) {
opts[key] = resolveURL(opts[key]);
}
});

4
src/worker-script/browser/cache.js

@ -4,7 +4,7 @@ module.exports = { @@ -4,7 +4,7 @@ module.exports = {
readCache: get,
writeCache: set,
deleteCache: del,
checkCache: path => (
get(path).then(v => typeof v !== 'undefined')
checkCache: (path) => (
get(path).then((v) => typeof v !== 'undefined')
),
};

27
src/worker-script/browser/getCore.js

@ -1,15 +1,26 @@ @@ -1,15 +1,26 @@
module.exports = (corePath, res) => {
const { simd } = require('wasm-feature-detect');
const { dependencies } = require('../../../package.json');
module.exports = async (corePath, res) => {
if (typeof global.TesseractCore === 'undefined') {
res.progress({ status: 'loading tesseract core', progress: 0 });
global.importScripts(corePath);
/*
* Depending on whether the browser supports WebAssembly,
* the version of the TesseractCore will be different.
*/
// If the user specifies a core path, we use that
// Otherwise, we detect the correct core based on SIMD support
let corePathImport = corePath;
if (!corePathImport) {
const simdSupport = await simd();
if (simdSupport) {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core-simd.wasm.js`;
} else {
corePathImport = `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.wasm.js`;
}
}
global.importScripts(corePathImport);
if (typeof global.TesseractCoreWASM !== 'undefined' && typeof WebAssembly === 'object') {
global.TesseractCore = global.TesseractCoreWASM;
} else if (typeof global.TesseractCoreASM !== 'undefined') {
global.TesseractCore = global.TesseractCoreASM;
} else {
throw Error('Failed to load TesseractCore');
}

7
src/worker-script/browser/index.js

@ -8,17 +8,16 @@ @@ -8,17 +8,16 @@
* @author Jerome Wu <jeromewus@gmail.com>
*/
const worker = require('../');
const worker = require('..');
const getCore = require('./getCore');
const gunzip = require('./gunzip');
const resolveURL = require('./resolveURL');
const cache = require('./cache');
/*
* register message handler
*/
global.addEventListener('message', ({ data }) => {
worker.dispatchHandlers(data, obj => postMessage(obj));
worker.dispatchHandlers(data, (obj) => postMessage(obj));
});
/*
@ -28,6 +27,6 @@ global.addEventListener('message', ({ data }) => { @@ -28,6 +27,6 @@ global.addEventListener('message', ({ data }) => {
worker.setAdapter({
getCore,
gunzip,
resolveURL,
fetch: () => {},
...cache,
});

1
src/worker-script/browser/resolveURL.js

@ -1 +0,0 @@ @@ -1 +0,0 @@
module.exports = require('resolve-url');

2
src/worker-script/constants/defaultParams.js

@ -5,7 +5,7 @@ const PSM = require('../../constants/PSM'); @@ -5,7 +5,7 @@ const PSM = require('../../constants/PSM');
module.exports = {
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,
tessedit_char_whiltelist: '',
tessedit_char_whitelist: '',
tessjs_create_hocr: '1',
tessjs_create_tsv: '1',
tessjs_create_box: '0',

60
src/worker-script/index.js

@ -9,10 +9,9 @@ @@ -9,10 +9,9 @@
*/
require('regenerator-runtime/runtime');
const fileType = require('file-type');
const axios = require('axios');
const isURL = require('is-url');
const dump = require('./utils/dump');
const isBrowser = require('../utils/getEnvironment')('type') === 'browser';
const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker';
const setImage = require('./utils/setImage');
const defaultParams = require('./constants/defaultParams');
const { log, setLogging } = require('../utils/log');
@ -29,10 +28,10 @@ let latestJob; @@ -29,10 +28,10 @@ let latestJob;
let adapter = {};
let params = defaultParams;
const load = ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => {
setLogging(logging);
if (!TessModule) {
const Core = adapter.getCore(corePath, res);
const Core = await adapter.getCore(corePath, res);
res.progress({ workerId, status: 'initializing tesseract', progress: 0 });
@ -55,6 +54,11 @@ const load = ({ workerId, jobId, payload: { options: { corePath, logging } } }, @@ -55,6 +54,11 @@ const load = ({ workerId, jobId, payload: { options: { corePath, logging } } },
}
};
const FS = ({ workerId, payload: { method, args } }, res) => {
log(`[${workerId}]: FS.${method} with args ${args}`);
res.resolve(TessModule.FS[method](...args));
};
const loadLanguage = async ({
workerId,
payload: {
@ -68,7 +72,7 @@ const loadLanguage = async ({ @@ -68,7 +72,7 @@ const loadLanguage = async ({
},
},
},
res) => {
res) => {
const loadAndGunzipFile = async (_lang) => {
const lang = typeof _lang === 'string' ? _lang : _lang.code;
const readCache = ['refresh', 'none'].includes(cacheMethod)
@ -80,6 +84,7 @@ const loadLanguage = async ({ @@ -80,6 +84,7 @@ const loadLanguage = async ({
const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`);
if (typeof _data !== 'undefined') {
log(`[${workerId}]: Load ${lang}.traineddata from cache`);
res.progress({ workerId, status: 'loading language traineddata (from cache)', progress: 0.5 });
data = _data;
} else {
throw Error('Not found in cache');
@ -89,18 +94,17 @@ const loadLanguage = async ({ @@ -89,18 +94,17 @@ const loadLanguage = async ({
if (typeof _lang === 'string') {
let path = null;
if (isURL(langPath) || langPath.startsWith('chrome-extension://') || langPath.startsWith('file://')) { /** When langPath is an URL */
if (isURL(langPath) || langPath.startsWith('moz-extension://') || langPath.startsWith('chrome-extension://') || langPath.startsWith('file://')) { /** When langPath is an URL */
path = langPath;
} else if (process.browser) { /** When langPath is not an URL in browser */
path = adapter.resolveURL(langPath);
}
if (path !== null) {
const { data: _data } = await axios.get(
`${path}/${lang}.traineddata${gzip ? '.gz' : ''}`,
{ responseType: 'arraybuffer' },
);
data = _data;
const fetchUrl = `${path}/${lang}.traineddata${gzip ? '.gz' : ''}`;
const resp = await (isWebWorker ? fetch : adapter.fetch)(fetchUrl);
if (!resp.ok) {
throw Error(`Network error while fetching ${fetchUrl}. Response code: ${resp.status}`);
}
data = await resp.arrayBuffer();
} else {
data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`);
}
@ -140,21 +144,13 @@ const loadLanguage = async ({ @@ -140,21 +144,13 @@ const loadLanguage = async ({
res.progress({ workerId, status: 'loaded language traineddata', progress: 1 });
res.resolve(langs);
} catch (err) {
if (isBrowser && err instanceof DOMException) {
/*
* For some reason google chrome throw DOMException in loadLang,
* while other browser is OK, for now we ignore this exception
* and hopefully to find the root cause one day.
*/
} else {
res.reject(err.toString());
}
res.reject(err.toString());
}
};
const setParameters = ({ payload: { params: _params } }, res) => {
Object.keys(_params)
.filter(k => !k.startsWith('tessjs_'))
.filter((k) => !k.startsWith('tessjs_'))
.forEach((key) => {
api.SetVariable(key, _params[key]);
});
@ -171,7 +167,7 @@ const initialize = ({ @@ -171,7 +167,7 @@ const initialize = ({
}, res) => {
const langs = (typeof _langs === 'string')
? _langs
: _langs.map(l => ((typeof l === 'string') ? l : l.data)).join('+');
: _langs.map((l) => ((typeof l === 'string') ? l : l.data)).join('+');
try {
res.progress({
@ -181,7 +177,10 @@ const initialize = ({ @@ -181,7 +177,10 @@ const initialize = ({
api.End();
}
api = new TessModule.TessBaseAPI();
api.Init(null, langs, oem);
const status = api.Init(null, langs, oem);
if (status === -1) {
res.reject('initialization failed');
}
params = defaultParams;
setParameters({ payload: { params } });
res.progress({
@ -193,14 +192,12 @@ const initialize = ({ @@ -193,14 +192,12 @@ const initialize = ({
}
};
const recognize = ({ payload: { image, options: { rectangles = [] } } }, res) => {
const recognize = ({ payload: { image, options: { rectangle: rec } } }, res) => {
try {
const ptr = setImage(TessModule, api, image);
rectangles.forEach(({
left, top, width, height,
}) => {
api.SetRectangle(left, top, width, height);
});
if (typeof rec === 'object') {
api.SetRectangle(rec.left, rec.top, rec.width, rec.height);
}
api.Recognize(null);
res.resolve(dump(TessModule, api, params));
TessModule._free(ptr);
@ -288,6 +285,7 @@ exports.dispatchHandlers = (packet, send) => { @@ -288,6 +285,7 @@ exports.dispatchHandlers = (packet, send) => {
try {
({
load,
FS,
loadLanguage,
initialize,
setParameters,

6
src/worker-script/node/cache.js

@ -4,13 +4,13 @@ const fs = require('fs'); @@ -4,13 +4,13 @@ const fs = require('fs');
module.exports = {
readCache: util.promisify(fs.readFile),
writeCache: util.promisify(fs.writeFile),
deleteCache: path => (
deleteCache: (path) => (
util.promisify(fs.unlink)(path)
.catch(() => {})
),
checkCache: path => (
checkCache: (path) => (
util.promisify(fs.access)(path, fs.F_OK)
.then(err => (err === null))
.then((err) => (err === null))
.catch(() => false)
),
};

11
src/worker-script/node/getCore.js

@ -1,12 +1,19 @@ @@ -1,12 +1,19 @@
const { simd } = require('wasm-feature-detect');
let TesseractCore = null;
/*
* getCore is an async function that loads and returns
* TesseractCore (the SIMD build when SIMD support is detected).
*/
module.exports = (_, res) => {
module.exports = async (_, res) => {
if (TesseractCore === null) {
const simdSupport = await simd();
res.progress({ status: 'loading tesseract core', progress: 0 });
TesseractCore = require('tesseract.js-core');
if (simdSupport) {
TesseractCore = require('tesseract.js-core/tesseract-core-simd');
} else {
TesseractCore = require('tesseract.js-core/tesseract-core');
}
res.progress({ status: 'loaded tesseract core', progress: 1 });
}
return TesseractCore;

11
src/worker-script/node/index.js

@ -8,22 +8,23 @@ @@ -8,22 +8,23 @@
* @author Jerome Wu <jeromewus@gmail.com>
*/
const worker = require('../');
const fetch = require('node-fetch');
const { parentPort } = require('worker_threads');
const worker = require('..');
const getCore = require('./getCore');
const resolveURL = require('./resolveURL');
const gunzip = require('./gunzip');
const cache = require('./cache');
/*
* register message handler
*/
process.on('message', (packet) => {
worker.dispatchHandlers(packet, obj => process.send(obj));
parentPort.on('message', (packet) => {
worker.dispatchHandlers(packet, (obj) => parentPort.postMessage(obj));
});
worker.setAdapter({
getCore,
gunzip,
resolveURL,
fetch,
...cache,
});

1
src/worker-script/node/resolveURL.js

@ -1 +0,0 @@ @@ -1 +0,0 @@
module.exports = s => s;

4
src/worker-script/utils/dump.js

@ -61,8 +61,8 @@ module.exports = (TessModule, api, { @@ -61,8 +61,8 @@ module.exports = (TessModule, api, {
const enumToString = (value, prefix) => (
Object.keys(TessModule)
.filter(e => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
.map(e => e.slice(prefix.length + 1))[0]
.filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
.map((e) => e.slice(prefix.length + 1))[0]
);
ri.Begin();

12
src/worker-script/utils/setImage.js

@ -17,10 +17,12 @@ module.exports = (TessModule, api, image) => { @@ -17,10 +17,12 @@ module.exports = (TessModule, api, image) => {
let w = 0;
let h = 0;
const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1;
/*
* Although leptonica should support reading bmp, there is a bug of "compressed BMP files".
* As there is no solution, we need to use bmp-js for now.
* @see https://groups.google.com/forum/#!topic/tesseract-ocr/4mPD9zTxdxE
* Leptonica supports uncompressed but not compressed bmp files
* @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516
* We therefore use bmp-js to process all bmp files
*/
if (type && type.mime === 'image/bmp') {
const bmpBuf = bmp.decode(buf);
@ -53,9 +55,9 @@ module.exports = (TessModule, api, image) => { @@ -53,9 +55,9 @@ module.exports = (TessModule, api, image) => {
*
*/
if (data === null) {
api.SetImage(pix);
api.SetImage(pix, undefined, undefined, undefined, undefined, exif);
} else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel);
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif);
}
return data === null ? pix : data;
};

4
src/worker/browser/defaultOptions.js

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
const resolveURL = require('resolve-url');
const { version, dependencies } = require('../../../package.json');
const { version } = require('../../../package.json');
const defaultOptions = require('../../constants/defaultOptions');
/*
@ -14,5 +14,5 @@ module.exports = { @@ -14,5 +14,5 @@ module.exports = {
* If browser doesn't support WebAssembly,
* load ASM version instead
*/
corePath: `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`,
corePath: null,
};

9
src/worker/browser/loadImage.js

@ -1,4 +1,3 @@ @@ -1,4 +1,3 @@
const axios = require('axios');
const resolveURL = require('resolve-url');
/**
@ -8,7 +7,7 @@ const resolveURL = require('resolve-url'); @@ -8,7 +7,7 @@ const resolveURL = require('resolve-url');
* @function
* @access private
*/
const readFromBlobOrFile = blob => (
const readFromBlobOrFile = (blob) => (
new Promise((resolve, reject) => {
const fileReader = new FileReader();
fileReader.onload = () => {
@ -39,10 +38,10 @@ const loadImage = async (image) => { @@ -39,10 +38,10 @@ const loadImage = async (image) => {
if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
data = atob(image.split(',')[1])
.split('')
.map(c => c.charCodeAt(0));
.map((c) => c.charCodeAt(0));
} else {
const { data: _data } = await axios.get(resolveURL(image), { responseType: 'arraybuffer' });
data = _data;
const resp = await fetch(resolveURL(image));
data = await resp.arrayBuffer();
}
} else if (image instanceof HTMLElement) {
if (image.tagName === 'IMG') {

8
src/worker/node/loadImage.js

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
const util = require('util');
const fs = require('fs');
const axios = require('axios');
const fetch = require('node-fetch');
const isURL = require('is-url');
const readFile = util.promisify(fs.readFile);
@ -19,9 +19,9 @@ module.exports = async (image) => { @@ -19,9 +19,9 @@ module.exports = async (image) => {
}
if (typeof image === 'string') {
if (isURL(image) || image.startsWith('chrome-extension://') || image.startsWith('file://')) {
const { data: _data } = await axios.get(image, { responseType: 'arraybuffer' });
data = _data;
if (isURL(image) || image.startsWith('moz-extension://') || image.startsWith('chrome-extension://') || image.startsWith('file://')) {
const resp = await fetch(image);
data = await resp.arrayBuffer();
} else if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
data = Buffer.from(image.split(',')[1], 'base64');
} else {

4
src/worker/node/send.js

@ -5,6 +5,6 @@ @@ -5,6 +5,6 @@
* @function send packet to worker and create a job
* @access public
*/
module.exports = (worker, packet) => {
worker.send(packet);
module.exports = async (worker, packet) => {
worker.postMessage(packet);
};

6
src/worker/node/spawnWorker.js

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
const { fork } = require('child_process');
const { Worker } = require('worker_threads');
/**
* spawnWorker
@ -7,6 +7,4 @@ const { fork } = require('child_process'); @@ -7,6 +7,4 @@ const { fork } = require('child_process');
* @function fork a new process in node
* @access public
*/
module.exports = ({ workerPath }) => (
fork(workerPath)
);
module.exports = ({ workerPath }) => new Worker(workerPath);

2
src/worker/node/terminateWorker.js

@ -6,5 +6,5 @@ @@ -6,5 +6,5 @@
* @access public
*/
module.exports = (worker) => {
worker.kill();
worker.terminate();
};

18
tests/FS.test.html

@ -0,0 +1,18 @@ @@ -0,0 +1,18 @@
<!--
Browser harness for the FS test suite: loads mocha and expect.js from
node_modules, the ../dist/tesseract.dev.js bundle and ./constants.js,
then loads ./FS.test.js and starts the mocha run.
-->
<html>
<head>
<meta charset="utf-8">
<link rel="stylesheet" href="../node_modules/mocha/mocha.css">
</head>
<body>
<!-- Element with the id "mocha"; presumably where mocha renders its report. -->
<div id="mocha"></div>
<script src="../node_modules/mocha/mocha.js"></script>
<script src="../node_modules/expect.js/index.js"></script>
<script src="../dist/tesseract.dev.js"></script>
<script src="./constants.js"></script>
<!-- Select the BDD interface before the spec file is loaded. -->
<script>mocha.setup('bdd');</script>
<script src="./FS.test.js"></script>
<script>
mocha.run();
</script>
</body>
</html>

37
tests/FS.test.js

@ -0,0 +1,37 @@ @@ -0,0 +1,37 @@
/*
 * Mocha suite for the worker's virtual file-system helpers:
 * the generic worker.FS(method, args) call and the writeText / readText /
 * removeFile shortcuts.
 */
const { createWorker } = Tesseract;

// One worker is shared by every test in this file.
const worker = createWorker(OPTIONS);

// Load the worker once before any test runs; the hook's timeout is disabled.
before(function cb() {
  this.timeout(0);
  return worker.load();
});

describe('FS', () => {
  /*
   * Every test is an async function whose steps are all awaited, so mocha
   * waits for the complete write -> read -> delete round trip and a failed
   * expectation actually fails the test. Starting the work from forEach or
   * setTimeout callbacks would let the test finish before the assertion runs.
   */
  it('should write and read text from FS (using FS only)', async () => {
    const path = 'tmp.txt';
    // Texts are handled one at a time because they all reuse the same path.
    for (const text of [SIMPLE_TEXT]) {
      await worker.FS('writeFile', [path, text]);
      // Request utf8 so readFile yields the text itself rather than raw bytes.
      const { data } = await worker.FS('readFile', [path, { encoding: 'utf8' }]);
      await worker.FS('unlink', [path]);
      expect(data.toString()).to.be(text);
    }
  }).timeout(TIMEOUT);

  it('should write and read text from FS (using writeFile, readFile)', async () => {
    const path = 'tmp2.txt';
    // Texts are handled one at a time because they all reuse the same path.
    for (const text of [SIMPLE_TEXT]) {
      await worker.writeText(path, text);
      const { data } = await worker.readText(path);
      await worker.removeFile(path);
      expect(data.toString()).to.be(text);
    }
  }).timeout(TIMEOUT);
});

BIN
tests/assets/images/simple.gif

Binary file not shown.

After

Width:  |  Height:  |  Size: 1011 B

BIN
tests/assets/images/simple.webp

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

5
tests/constants.js

File diff suppressed because one or more lines are too long

8
tests/recognize.test.js

@ -69,11 +69,9 @@ describe('recognize()', () => { @@ -69,11 +69,9 @@ describe('recognize()', () => {
const { data: { text } } = await worker.recognize(
`${IMAGE_PATH}/${name}`,
{
rectangles: [
{
top, left, width, height,
},
],
rectangle: {
top, left, width, height,
},
},
);
expect(text).to.be(ans);

Loading…
Cancel
Save