diff --git a/README.md b/README.md index f5dc0c7..be31062 100644 --- a/README.md +++ b/README.md @@ -105,7 +105,7 @@ Tesseract.recognize(myImage) ```javascript // if we know our image is of spanish words without the letter 'e': Tesseract.recognize(myImage, { - lang: 'spa', + langs: 'spa', tessedit_char_blacklist: 'e' }) .then(function(result){ @@ -137,22 +137,13 @@ Tesseract.detect(myImage) The main Tesseract.js functions take an `image` parameter, which should be something that is like an image. What's considered "image-like" differs depending on whether it is being run from the browser or through NodeJS. - On a browser, an image can be: - an `img`, `video`, or `canvas` element -- a CanvasRenderingContext2D (returned by `canvas.getContext('2d')`) - a `File` object (from a file `` or drag-drop event) -- a `Blob` object -- a `ImageData` instance (an object containing `width`, `height` and `data` properties) -- a path or URL to an accessible image (the image must either be hosted locally or accessible by CORS) - - - +- a path or URL to an accessible image (the image must be hosted locally) In Node.js, an image can be - a path to a local image -- a `Buffer` instance containing a `PNG` or `JPEG` image -- a `ImageData` instance (an object containing `width`, `height` and `data` properties) ## TesseractJob diff --git a/package-lock.json b/package-lock.json index c471a9d..f0f6d08 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2277,8 +2277,7 @@ "ansi-regex": { "version": "2.1.1", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "aproba": { "version": "1.2.0", @@ -2299,14 +2298,12 @@ "balanced-match": { "version": "1.0.0", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "brace-expansion": { "version": "1.1.11", "bundled": true, "dev": true, - "optional": true, "requires": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -2321,20 +2318,17 @@ "code-point-at": { "version": "1.1.0", "bundled": true, - "dev": true, 
- "optional": true + "dev": true }, "concat-map": { "version": "0.0.1", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "console-control-strings": { "version": "1.1.0", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "core-util-is": { "version": "1.0.2", @@ -2451,8 +2445,7 @@ "inherits": { "version": "2.0.3", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "ini": { "version": "1.3.5", @@ -2464,7 +2457,6 @@ "version": "1.0.0", "bundled": true, "dev": true, - "optional": true, "requires": { "number-is-nan": "^1.0.0" } @@ -2479,7 +2471,6 @@ "version": "3.0.4", "bundled": true, "dev": true, - "optional": true, "requires": { "brace-expansion": "^1.1.7" } @@ -2487,14 +2478,12 @@ "minimist": { "version": "0.0.8", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "minipass": { "version": "2.2.4", "bundled": true, "dev": true, - "optional": true, "requires": { "safe-buffer": "^5.1.1", "yallist": "^3.0.0" @@ -2513,7 +2502,6 @@ "version": "0.5.1", "bundled": true, "dev": true, - "optional": true, "requires": { "minimist": "0.0.8" } @@ -2594,8 +2582,7 @@ "number-is-nan": { "version": "1.0.1", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "object-assign": { "version": "4.1.1", @@ -2607,7 +2594,6 @@ "version": "1.4.0", "bundled": true, "dev": true, - "optional": true, "requires": { "wrappy": "1" } @@ -2693,8 +2679,7 @@ "safe-buffer": { "version": "5.1.1", "bundled": true, - "dev": true, - "optional": true + "dev": true }, "safer-buffer": { "version": "2.1.2", @@ -2730,7 +2715,6 @@ "version": "1.0.2", "bundled": true, "dev": true, - "optional": true, "requires": { "code-point-at": "^1.0.0", "is-fullwidth-code-point": "^1.0.0", @@ -2750,7 +2734,6 @@ "version": "3.0.1", "bundled": true, "dev": true, - "optional": true, "requires": { "ansi-regex": "^2.0.0" } @@ -2794,14 +2777,12 @@ "wrappy": { "version": "1.0.2", "bundled": true, - "dev": true, - "optional": true + "dev": true }, 
"yallist": { "version": "3.0.2", "bundled": true, - "dev": true, - "optional": true + "dev": true } } }, diff --git a/src/browser/index.js b/src/browser/index.js index 5f93ca5..2e02fb5 100644 --- a/src/browser/index.js +++ b/src/browser/index.js @@ -12,14 +12,34 @@ const resolveURL = require('resolve-url'); const { defaultOptions } = require('../common/options'); const { version } = require('../../package.json'); +/** + * readFromBlobOrFile + * + * @name readFromBlobOrFile + * @function + * @access private + * @param {object} blob A blob or file objec to read + * @param {function} res callback function after reading completes + */ +const readFromBlobOrFile = (blob, res) => { + const fileReader = new FileReader(); + fileReader.onload = () => { + res(fileReader.result); + }; + fileReader.readAsArrayBuffer(blob); +}; + /** * loadImage * * @name loadImage * @function load image from different source - * @access public + * @access private * @param {string, object} image - image source, supported formats: * string: URL string, can be relative path + * img HTMLElement: extract image source from src attribute + * video HTMLElement: extract image source from poster attribute + * canvas HTMLElement: extract image data by converting to Blob * File instance: data from * @returns {array} binary image in array format */ @@ -28,13 +48,24 @@ const loadImage = (image) => { return fetch(resolveURL(image)) .then(resp => resp.arrayBuffer()); } + if (check.instance(image, HTMLElement)) { + if (image.tagName === 'IMG') { + return loadImage(image.src); + } + if (image.tagName === 'VIDEO') { + return loadImage(image.poster); + } + if (image.tagName === 'CANVAS') { + return new Promise((res) => { + image.toBlob((blob) => { + readFromBlobOrFile(blob, res); + }); + }); + } + } if (check.instance(image, File)) { return new Promise((res) => { - const fileReader = new FileReader(); - fileReader.onload = () => { - res(fileReader.result); - }; - fileReader.readAsArrayBuffer(image); + 
readFromBlobOrFile(image, res); }); } return Promise.reject(); diff --git a/tests/recognize.test.js b/tests/recognize.test.js index ec38da1..a249bc7 100644 --- a/tests/recognize.test.js +++ b/tests/recognize.test.js @@ -1,10 +1,12 @@ const { TesseractWorker, utils: { loadLang } } = Tesseract; +const isBrowser = typeof window !== 'undefined' && typeof window.document !== 'undefined'; const IMAGE_PATH = 'http://localhost:3000/tests/assets/images'; const SIMPLE_TEXT = 'Tesseract.js\n'; const COMSIC_TEXT = 'HellO World\nfrom beyond\nthe Cosmic Void\n'; const TESTOCR_TEXT = 'This is a lot of 12 point text to test the\nocr code and see if it works on all types\nof file format.\n\nThe quick brown dog jumped over the\nlazy fox. The quick brown dog jumped\nover the lazy fox. The quick brown dog\njumped over the lazy fox. The quick\nbrown dog jumped over the lazy fox.\n'; const CHINESE_TEXT = '繁 體 中 文 測 試\n'; +const FORMATS = ['png', 'jpg', 'bmp', 'pbm']; const loadLangOptions = { langPath: 'http://localhost:3000/tests/assets/traineddata', @@ -45,7 +47,8 @@ after((done) => { } }); -describe('recognize()', () => { +describe('recognize()',() => { + describe('should recognize different langs', () => { [ { name: 'chinese.png', lang: 'chi_tra', ans: CHINESE_TEXT }, @@ -64,7 +67,7 @@ describe('recognize()', () => { }); describe('should read bmp, jpg, png and pbm format images', () => { - ['bmp', 'jpg', 'png', 'pbm'].forEach(format => ( + FORMATS.forEach(format => ( it(`support ${format} format`, (done) => { const worker = getWorker(); worker @@ -138,4 +141,78 @@ describe('recognize()', () => { }).timeout(60000) )); }); + + (isBrowser ? 
describe : describe.skip)('should read image from img DOM element (browser only)', () => { + FORMATS.forEach(format => ( + it(`support ${format} format`, (done) => { + const imageDOM = document.createElement('img'); + imageDOM.setAttribute('src', `${IMAGE_PATH}/simple.${format}`); + const worker = getWorker(); + worker + .recognize(imageDOM) + .then(({ text }) => { + expect(text).to.be(SIMPLE_TEXT); + worker.terminate(); + imageDOM.remove(); + done(); + }); + }).timeout(10000) + )); + }); + + (isBrowser ? describe : describe.skip)('should read image from video DOM element (browser only)', () => { + FORMATS.forEach(format => ( + it(`support ${format} format`, (done) => { + const videoDOM = document.createElement('video'); + videoDOM.setAttribute('poster', `${IMAGE_PATH}/simple.${format}`); + const worker = getWorker(); + worker + .recognize(videoDOM) + .then(({ text }) => { + expect(text).to.be(SIMPLE_TEXT); + worker.terminate(); + videoDOM.remove(); + done(); + }); + }).timeout(10000) + )); + }); + + (isBrowser ? describe : describe.skip)('should read video from canvas DOM element (browser only)', () => { + /* + * img tag is unable to render pbm, so let's skip it. + */ + const formats = FORMATS.filter(f => f !== 'pbm'); + let canvasDOM = null; + let imageDOM = null; + let idx = 0; + beforeEach(function cb(done) { + canvasDOM = document.createElement('canvas'); + imageDOM = document.createElement('img'); + imageDOM.onload = () => { + canvasDOM.getContext('2d').drawImage(imageDOM, 0, 0); + done(); + } + imageDOM.setAttribute('src', `${IMAGE_PATH}/simple.${formats[idx++]}`); + }); + + afterEach(() => { + canvasDOM.remove(); + imageDOM.remove(); + }); + + formats.forEach(format => ( + it(`support ${format} format`, (done) => { + const worker = getWorker(); + worker + .recognize(canvasDOM) + .then(({ text }) => { + expect(text).to.be(SIMPLE_TEXT); + worker.terminate(); + done(); + }); + }).timeout(10000) + )); + }); + });