Browse Source

Added savePDF option to recognize per #488; cleaned up code for linter

dev/v4
Balearica 2 years ago
parent
commit
622c841f33
  1. 11
      examples/browser/download-pdf.html
  2. 34
      src/createWorker.js
  3. 12
      src/index.d.ts
  4. 70
      src/worker-script/index.js

11
examples/browser/download-pdf.html

@ -8,26 +8,29 @@
<button id="download-pdf" disabled="true">Download PDF</button> <button id="download-pdf" disabled="true">Download PDF</button>
</div> </div>
<textarea id="board" readonly rows="8" cols="80">Upload an image file</textarea> <textarea id="board" readonly rows="8" cols="80">Upload an image file</textarea>
<script> <script type="module">
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
const worker = await createWorker({ const worker = await createWorker({
corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js', corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js',
workerPath: "/dist/worker.dev.js",
logger: m => console.log(m), logger: m => console.log(m),
}); });
const uploader = document.getElementById('uploader'); const uploader = document.getElementById('uploader');
const dlBtn = document.getElementById('download-pdf'); const dlBtn = document.getElementById('download-pdf');
let pdf;
const recognize = async ({ target: { files } }) => { const recognize = async ({ target: { files } }) => {
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize(files[0]); const res = await worker.recognize(files[0], {savePDF: true});
pdf = res.data.pdf;
const text = res.data.text;
const board = document.getElementById('board'); const board = document.getElementById('board');
board.value = text; board.value = text;
dlBtn.disabled = false; dlBtn.disabled = false;
}; };
const downloadPDF = async () => { const downloadPDF = async () => {
const filename = 'tesseract-ocr-result.pdf'; const filename = 'tesseract-ocr-result.pdf';
const { data } = await worker.getPDF('Tesseract OCR Result'); const blob = new Blob([new Uint8Array(pdf)], { type: 'application/pdf' });
const blob = new Blob([new Uint8Array(data)], { type: 'application/pdf' });
if (navigator.msSaveBlob) { if (navigator.msSaveBlob) {
// IE 10+ // IE 10+
navigator.msSaveBlob(blob, filename); navigator.msSaveBlob(blob, filename);

34
src/createWorker.js

@ -28,14 +28,14 @@ module.exports = async (_options = {}) => {
const resolves = {}; const resolves = {};
const rejects = {}; const rejects = {};
let resReject; let workerResReject;
let resResolve; let workerResResolve;
const res = new Promise((resolve, reject) => { const workerRes = new Promise((resolve, reject) => {
resResolve = resolve; workerResResolve = resolve;
resReject = reject; workerResReject = reject;
}); });
let workerError = (event) => {resReject(event.message)}; const workerError = (event) => { workerResReject(event.message); };
let worker = spawnWorker(options); let worker = spawnWorker(options);
worker.onerror = workerError; worker.onerror = workerError;
@ -63,8 +63,8 @@ module.exports = async (_options = {}) => {
}) })
); );
const load = (jobId) => ( const load = () => (
console.warn("`load` is depreciated and should be removed from code (workers now come pre-loaded)") console.warn('`load` is depreciated and should be removed from code (workers now come pre-loaded)')
); );
const loadInternal = (jobId) => ( const loadInternal = (jobId) => (
@ -145,13 +145,14 @@ module.exports = async (_options = {}) => {
})) }))
); );
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => ( const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => {
startJob(createJob({ console.log('`getPDF` function is depreciated. `recognize` option `savePDF` should be used instead.');
return startJob(createJob({
id: jobId, id: jobId,
action: 'getPDF', action: 'getPDF',
payload: { title, textonly }, payload: { title, textonly },
})) }));
); };
const detect = async (image, jobId) => ( const detect = async (image, jobId) => (
startJob(createJob({ startJob(createJob({
@ -189,7 +190,7 @@ module.exports = async (_options = {}) => {
resolves[action]({ jobId, data: d }); resolves[action]({ jobId, data: d });
} else if (status === 'reject') { } else if (status === 'reject') {
rejects[action](data); rejects[action](data);
if (action === "load") resReject(data); if (action === 'load') workerResReject(data);
if (errorHandler) { if (errorHandler) {
errorHandler(data); errorHandler(data);
} else { } else {
@ -220,8 +221,7 @@ module.exports = async (_options = {}) => {
terminate, terminate,
}; };
loadInternal().then(() => resResolve(resolveObj)).catch(() => {}); loadInternal().then(() => workerResResolve(resolveObj)).catch(() => {});
return res;
return workerRes;
}; };

12
src/index.d.ts vendored

@ -59,6 +59,9 @@ declare namespace Tesseract {
saveImageOriginal: boolean saveImageOriginal: boolean
saveImageGrey: boolean saveImageGrey: boolean
saveImageBinary: boolean saveImageBinary: boolean
savePDF: boolean
pdfTitle: string
pdfTextOnly: boolean
rotateAuto: boolean rotateAuto: boolean
rotateRadians: float rotateRadians: float
} }
@ -231,10 +234,11 @@ declare namespace Tesseract {
box: string | null; box: string | null;
unlv: string | null; unlv: string | null;
sd: string | null; sd: string | null;
imageOriginal: string; imageOriginal: string | null;
imageGrey: string; imageGrey: string | null;
imageBinary: string; imageBinary: string | null;
rotateRadians: number; rotateRadians: number | null;
pdf: number[] | null;
} }
} }

70
src/worker-script/index.js

@ -200,6 +200,20 @@ const initialize = async ({
} }
}; };
/**
 * Runs the module-level `api` through a `TessPDFRenderer` and returns the
 * contents of `/tesseract-ocr.pdf` from the module's virtual file system.
 *
 * @param {string} title - Forwarded to the renderer's `BeginDocument`.
 * @param {boolean} textonly - Forwarded as the third `TessPDFRenderer`
 *   constructor argument.
 * @returns {*} Whatever `TessModule.FS.readFile('/tesseract-ocr.pdf')` yields.
 * @throws Re-throws any error raised by the renderer calls or by `readFile`.
 */
const getPDFInternal = (title, textonly) => {
  // The renderer is constructed with base name 'tesseract-ocr' and directory '/';
  // presumably that is why the result is read back from '/tesseract-ocr.pdf'.
  const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
  try {
    pdfRenderer.BeginDocument(title);
    pdfRenderer.AddImage(api);
    pdfRenderer.EndDocument();
  } finally {
    // Release the renderer even when one of the rendering steps throws, so a
    // failed job does not skip the cleanup call.
    // NOTE(review): `_free` is handed the renderer object itself - confirm this
    // is the intended release call for this binding.
    TessModule._free(pdfRenderer);
  }

  return TessModule.FS.readFile('/tesseract-ocr.pdf');
};
/**
 * Worker action handler for `getPDF`: builds the PDF via `getPDFInternal`
 * and hands the result to the job's `resolve` callback.
 *
 * @param {{payload: {title: string, textonly: boolean}}} packet - Job packet;
 *   only `payload.title` and `payload.textonly` are read.
 * @param {{resolve: Function}} res - Job response object; `resolve` is called
 *   once with the value returned by `getPDFInternal`.
 * @returns {Promise<void>}
 */
const getPDF = async (packet, res) => {
  const { title, textonly } = packet.payload;
  const pdfContents = getPDFInternal(title, textonly);
  res.resolve(pdfContents);
};
const getImage = (type) => { const getImage = (type) => {
api.WriteImage(type, '/image.png'); api.WriteImage(type, '/image.png');
const pngBuffer = TessModule.FS.readFile('/image.png'); const pngBuffer = TessModule.FS.readFile('/image.png');
@ -211,7 +225,8 @@ const getImage = (type) => {
const recognize = async ({ const recognize = async ({
payload: { payload: {
image, options: { image, options: {
rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, rotateAuto, rotateRadians, rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, savePDF, pdfTitle,
pdfTextOnly, rotateAuto, rotateRadians,
}, },
}, },
}, res) => { }, res) => {
@ -263,12 +278,23 @@ const recognize = async ({
const result = dump(TessModule, api, params); const result = dump(TessModule, api, params);
if (saveImageOriginal) { if (saveImageOriginal) {
result.imageOriginal = getImage(imageType.ORIGINAL); result.imageOriginal = getImage(imageType.ORIGINAL);
} else {
result.imageOriginal = null;
} }
if (saveImageGrey) { if (saveImageGrey) {
result.imageGrey = getImage(imageType.GREY); result.imageGrey = getImage(imageType.GREY);
} else {
result.imageGrey = null;
} }
if (saveImageBinary) { if (saveImageBinary) {
result.imageBinary = getImage(imageType.BINARY); result.imageBinary = getImage(imageType.BINARY);
} else {
result.imageBinary = null;
}
if (savePDF) {
result.pdf = getPDFInternal(pdfTitle ?? 'Tesseract OCR Result', pdfTextOnly ?? false);
} else {
result.pdf = null;
} }
result.rotateRadians = rotateRadiansFinal; result.rotateRadians = rotateRadiansFinal;
res.resolve(result); res.resolve(result);
@ -325,12 +351,18 @@ const threshold = async ({
const result = {}; const result = {};
if (saveImageOriginal) { if (saveImageOriginal) {
result.imageOriginal = getImage(imageType.ORIGINAL); result.imageOriginal = getImage(imageType.ORIGINAL);
} else {
result.imageOriginal = null;
} }
if (saveImageGrey) { if (saveImageGrey) {
result.imageGrey = getImage(imageType.GREY); result.imageGrey = getImage(imageType.GREY);
} else {
result.imageGrey = null;
} }
if (saveImageBinary) { if (saveImageBinary) {
result.imageBinary = getImage(imageType.BINARY); result.imageBinary = getImage(imageType.BINARY);
} else {
result.imageBinary = null;
} }
result.rotateRadians = rotateRadiansFinal; result.rotateRadians = rotateRadiansFinal;
res.resolve(result); res.resolve(result);
@ -340,16 +372,6 @@ const threshold = async ({
} }
}; };
/**
 * Worker action handler for `getPDF`: runs the module-level `api` through a
 * `TessPDFRenderer` and resolves the job with the contents of
 * `/tesseract-ocr.pdf` from the module's virtual file system.
 *
 * @param {{payload: {title: string, textonly: boolean}}} packet - Job packet;
 *   `title` goes to `BeginDocument`, `textonly` to the renderer constructor.
 * @param {{resolve: Function}} res - Job response object; `resolve` receives
 *   whatever `TessModule.FS.readFile` returns.
 * @returns {Promise<void>} Rejects if a renderer call or `readFile` throws.
 */
const getPDF = async ({ payload: { title, textonly } }, res) => {
  const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
  try {
    pdfRenderer.BeginDocument(title);
    pdfRenderer.AddImage(api);
    pdfRenderer.EndDocument();
  } finally {
    // Release the renderer even when one of the rendering steps throws, so a
    // failed job does not skip the cleanup call.
    // NOTE(review): `_free` is handed the renderer object itself - confirm this
    // is the intended release call for this binding.
    TessModule._free(pdfRenderer);
  }
  res.resolve(TessModule.FS.readFile('/tesseract-ocr.pdf'));
};
const detect = async ({ payload: { image } }, res) => { const detect = async ({ payload: { image } }, res) => {
try { try {
const ptr = setImage(TessModule, api, image); const ptr = setImage(TessModule, api, image);
@ -357,7 +379,7 @@ const detect = async ({ payload: { image } }, res) => {
if (!api.DetectOS(results)) { if (!api.DetectOS(results)) {
TessModule._free(ptr); TessModule._free(ptr);
res.resolve({ res.resolve({
tesseract_script_id: null, tesseract_script_id: null,
script: null, script: null,
@ -422,18 +444,18 @@ exports.dispatchHandlers = (packet, send) => {
latestJob = res; latestJob = res;
({ ({
load, load,
FS, FS,
loadLanguage, loadLanguage,
initialize, initialize,
setParameters, setParameters,
recognize, recognize,
threshold, threshold,
getPDF, getPDF,
detect, detect,
terminate, terminate,
})[packet.action](packet, res) })[packet.action](packet, res)
.catch((err) => res.reject(err.toString())); .catch((err) => res.reject(err.toString()));
}; };

Loading…
Cancel
Save