Browse Source

Added savePDF option to recognize per #488; cleaned up code for linter

dev/v4
Balearica 2 years ago
parent
commit
622c841f33
  1. 11
      examples/browser/download-pdf.html
  2. 34
      src/createWorker.js
  3. 12
      src/index.d.ts
  4. 70
      src/worker-script/index.js

11
examples/browser/download-pdf.html

@ -8,26 +8,29 @@ @@ -8,26 +8,29 @@
<button id="download-pdf" disabled="true">Download PDF</button>
</div>
<textarea id="board" readonly rows="8" cols="80">Upload an image file</textarea>
<script>
<script type="module">
const { createWorker } = Tesseract;
const worker = await createWorker({
corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js',
workerPath: "/dist/worker.dev.js",
logger: m => console.log(m),
});
const uploader = document.getElementById('uploader');
const dlBtn = document.getElementById('download-pdf');
let pdf;
const recognize = async ({ target: { files } }) => {
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(files[0]);
const res = await worker.recognize(files[0], {savePDF: true});
pdf = res.data.pdf;
const text = res.data.text;
const board = document.getElementById('board');
board.value = text;
dlBtn.disabled = false;
};
const downloadPDF = async () => {
const filename = 'tesseract-ocr-result.pdf';
const { data } = await worker.getPDF('Tesseract OCR Result');
const blob = new Blob([new Uint8Array(data)], { type: 'application/pdf' });
const blob = new Blob([new Uint8Array(pdf)], { type: 'application/pdf' });
if (navigator.msSaveBlob) {
// IE 10+
navigator.msSaveBlob(blob, filename);

34
src/createWorker.js

@ -28,14 +28,14 @@ module.exports = async (_options = {}) => { @@ -28,14 +28,14 @@ module.exports = async (_options = {}) => {
const resolves = {};
const rejects = {};
let resReject;
let resResolve;
const res = new Promise((resolve, reject) => {
resResolve = resolve;
resReject = reject;
let workerResReject;
let workerResResolve;
const workerRes = new Promise((resolve, reject) => {
workerResResolve = resolve;
workerResReject = reject;
});
let workerError = (event) => {resReject(event.message)};
/**
 * Error handler assigned to `worker.onerror`: rejects the pending worker
 * promise with the message carried by the error event.
 *
 * @param {{ message: * }} event - error event delivered by the worker.
 */
const workerError = ({ message }) => {
  workerResReject(message);
};
let worker = spawnWorker(options);
worker.onerror = workerError;
@ -63,8 +63,8 @@ module.exports = async (_options = {}) => { @@ -63,8 +63,8 @@ module.exports = async (_options = {}) => {
})
);
const load = (jobId) => (
console.warn("`load` is depreciated and should be removed from code (workers now come pre-loaded)")
/**
 * Deprecated entry point kept so existing callers of `load` keep working.
 * It only emits a console warning; no job is created here.
 *
 * @returns {undefined} the return value of `console.warn`.
 */
const load = () => (
  console.warn('`load` is deprecated and should be removed from code (workers now come pre-loaded)')
);
const loadInternal = (jobId) => (
@ -145,13 +145,14 @@ module.exports = async (_options = {}) => { @@ -145,13 +145,14 @@ module.exports = async (_options = {}) => {
}))
);
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => (
startJob(createJob({
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => {
console.log('`getPDF` function is depreciated. `recognize` option `savePDF` should be used instead.');
return startJob(createJob({
id: jobId,
action: 'getPDF',
payload: { title, textonly },
}))
);
}));
};
const detect = async (image, jobId) => (
startJob(createJob({
@ -189,7 +190,7 @@ module.exports = async (_options = {}) => { @@ -189,7 +190,7 @@ module.exports = async (_options = {}) => {
resolves[action]({ jobId, data: d });
} else if (status === 'reject') {
rejects[action](data);
if (action === "load") resReject(data);
if (action === 'load') workerResReject(data);
if (errorHandler) {
errorHandler(data);
} else {
@ -220,8 +221,7 @@ module.exports = async (_options = {}) => { @@ -220,8 +221,7 @@ module.exports = async (_options = {}) => {
terminate,
};
loadInternal().then(() => resResolve(resolveObj)).catch(() => {});
return res;
loadInternal().then(() => workerResResolve(resolveObj)).catch(() => {});
return workerRes;
};

12
src/index.d.ts vendored

@ -59,6 +59,9 @@ declare namespace Tesseract { @@ -59,6 +59,9 @@ declare namespace Tesseract {
saveImageOriginal: boolean
saveImageGrey: boolean
saveImageBinary: boolean
savePDF: boolean
pdfTitle: string
pdfTextOnly: boolean
rotateAuto: boolean
rotateRadians: float
}
@ -231,10 +234,11 @@ declare namespace Tesseract { @@ -231,10 +234,11 @@ declare namespace Tesseract {
box: string | null;
unlv: string | null;
sd: string | null;
imageOriginal: string;
imageGrey: string;
imageBinary: string;
rotateRadians: number;
imageOriginal: string | null;
imageGrey: string | null;
imageBinary: string | null;
rotateRadians: number | null;
pdf: number[] | null;
}
}

70
src/worker-script/index.js

@ -200,6 +200,20 @@ const initialize = async ({ @@ -200,6 +200,20 @@ const initialize = async ({
}
};
/**
 * Feeds the module-level `api` to a `TessPDFRenderer` and returns the
 * contents of the file read back from '/tesseract-ocr.pdf'.
 *
 * @param {string} title - passed to the renderer's `BeginDocument`.
 * @param {boolean} textonly - third argument of the `TessPDFRenderer` constructor.
 * @returns {*} whatever `TessModule.FS.readFile` yields for the PDF path
 *   (presumably the PDF bytes — TODO confirm the runtime type).
 */
const getPDFInternal = (title, textonly) => {
  const renderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);

  // A single AddImage call sits between BeginDocument and EndDocument.
  renderer.BeginDocument(title);
  renderer.AddImage(api);
  renderer.EndDocument();

  // The renderer is handed to _free before the output file is read back.
  TessModule._free(renderer);

  const pdfPath = '/tesseract-ocr.pdf';
  return TessModule.FS.readFile(pdfPath);
};
/**
 * Handler for the `getPDF` action: builds the PDF through `getPDFInternal`
 * and resolves the job with the result.
 *
 * @param {{ payload: { title: *, textonly: * } }} packet - job packet.
 * @param {{ resolve: Function }} res - response handle for the job.
 * @returns {Promise<void>}
 */
const getPDF = async (packet, res) => {
  const { title, textonly } = packet.payload;
  const pdf = getPDFInternal(title, textonly);
  res.resolve(pdf);
};
const getImage = (type) => {
api.WriteImage(type, '/image.png');
const pngBuffer = TessModule.FS.readFile('/image.png');
@ -211,7 +225,8 @@ const getImage = (type) => { @@ -211,7 +225,8 @@ const getImage = (type) => {
const recognize = async ({
payload: {
image, options: {
rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, rotateAuto, rotateRadians,
rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, savePDF, pdfTitle,
pdfTextOnly, rotateAuto, rotateRadians,
},
},
}, res) => {
@ -263,12 +278,23 @@ const recognize = async ({ @@ -263,12 +278,23 @@ const recognize = async ({
const result = dump(TessModule, api, params);
if (saveImageOriginal) {
result.imageOriginal = getImage(imageType.ORIGINAL);
} else {
result.imageOriginal = null;
}
if (saveImageGrey) {
result.imageGrey = getImage(imageType.GREY);
} else {
result.imageGrey = null;
}
if (saveImageBinary) {
result.imageBinary = getImage(imageType.BINARY);
} else {
result.imageBinary = null;
}
if (savePDF) {
result.pdf = getPDFInternal(pdfTitle ?? 'Tesseract OCR Result', pdfTextOnly ?? false);
} else {
result.pdf = null;
}
result.rotateRadians = rotateRadiansFinal;
res.resolve(result);
@ -325,12 +351,18 @@ const threshold = async ({ @@ -325,12 +351,18 @@ const threshold = async ({
const result = {};
if (saveImageOriginal) {
result.imageOriginal = getImage(imageType.ORIGINAL);
} else {
result.imageOriginal = null;
}
if (saveImageGrey) {
result.imageGrey = getImage(imageType.GREY);
} else {
result.imageGrey = null;
}
if (saveImageBinary) {
result.imageBinary = getImage(imageType.BINARY);
} else {
result.imageBinary = null;
}
result.rotateRadians = rotateRadiansFinal;
res.resolve(result);
@ -340,16 +372,6 @@ const threshold = async ({ @@ -340,16 +372,6 @@ const threshold = async ({
}
};
// Handler for the `getPDF` action in the form that builds the PDF inline:
// it constructs a TessPDFRenderer from the payload's `textonly` flag, feeds it
// the module-level `api`, and resolves the job with the contents read from
// '/tesseract-ocr.pdf'.
const getPDF = async ({ payload: { title, textonly } }, res) => {
const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
// A single AddImage call sits between BeginDocument and EndDocument.
pdfRenderer.BeginDocument(title);
pdfRenderer.AddImage(api);
pdfRenderer.EndDocument();
// The renderer is handed to _free before the output file is read back.
TessModule._free(pdfRenderer);
res.resolve(TessModule.FS.readFile('/tesseract-ocr.pdf'));
};
const detect = async ({ payload: { image } }, res) => {
try {
const ptr = setImage(TessModule, api, image);
@ -357,7 +379,7 @@ const detect = async ({ payload: { image } }, res) => { @@ -357,7 +379,7 @@ const detect = async ({ payload: { image } }, res) => {
if (!api.DetectOS(results)) {
TessModule._free(ptr);
res.resolve({
tesseract_script_id: null,
script: null,
@ -422,18 +444,18 @@ exports.dispatchHandlers = (packet, send) => { @@ -422,18 +444,18 @@ exports.dispatchHandlers = (packet, send) => {
latestJob = res;
({
load,
FS,
loadLanguage,
initialize,
setParameters,
recognize,
threshold,
getPDF,
detect,
terminate,
})[packet.action](packet, res)
({
load,
FS,
loadLanguage,
initialize,
setParameters,
recognize,
threshold,
getPDF,
detect,
terminate,
})[packet.action](packet, res)
.catch((err) => res.reject(err.toString()));
};

Loading…
Cancel
Save