mirror of
https://github.com/gchq/CyberChef
synced 2025-01-16 14:33:55 +00:00
Fixed Optical Character Recognition and added tests
This commit is contained in:
parent
c23a8de5a0
commit
ab37c1e562
4 changed files with 34 additions and 24 deletions
|
@ -12,9 +12,10 @@ import { isImage } from "../lib/FileType.mjs";
|
||||||
import { toBase64 } from "../lib/Base64.mjs";
|
import { toBase64 } from "../lib/Base64.mjs";
|
||||||
import { isWorkerEnvironment } from "../Utils.mjs";
|
import { isWorkerEnvironment } from "../Utils.mjs";
|
||||||
|
|
||||||
import process from "process";
|
|
||||||
import { createWorker } from "tesseract.js";
|
import { createWorker } from "tesseract.js";
|
||||||
|
|
||||||
|
const OEM_MODES = ["Tesseract only", "LSTM only", "Tesseract/LSTM Combined"];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Optical Character Recognition operation
|
* Optical Character Recognition operation
|
||||||
*/
|
*/
|
||||||
|
@ -37,6 +38,12 @@ class OpticalCharacterRecognition extends Operation {
|
||||||
name: "Show confidence",
|
name: "Show confidence",
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
value: true
|
value: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "OCR Engine Mode",
|
||||||
|
type: "option",
|
||||||
|
value: OEM_MODES,
|
||||||
|
defaultIndex: 1
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
@ -47,7 +54,7 @@ class OpticalCharacterRecognition extends Operation {
|
||||||
* @returns {string}
|
* @returns {string}
|
||||||
*/
|
*/
|
||||||
async run(input, args) {
|
async run(input, args) {
|
||||||
const [showConfidence] = args;
|
const [showConfidence, oemChoice] = args;
|
||||||
|
|
||||||
if (!isWorkerEnvironment()) throw new OperationError("This operation only works in a browser");
|
if (!isWorkerEnvironment()) throw new OperationError("This operation only works in a browser");
|
||||||
|
|
||||||
|
@ -56,12 +63,13 @@ class OpticalCharacterRecognition extends Operation {
|
||||||
throw new OperationError("Unsupported file type (supported: jpg,png,pbm,bmp) or no file provided");
|
throw new OperationError("Unsupported file type (supported: jpg,png,pbm,bmp) or no file provided");
|
||||||
}
|
}
|
||||||
|
|
||||||
const assetDir = isWorkerEnvironment() ? `${self.docURL}/assets/` : `${process.cwd()}/src/core/vendor/`;
|
const assetDir = `${self.docURL}/assets/`;
|
||||||
|
const oem = OEM_MODES.indexOf(oemChoice);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
self.sendStatusMessage("Spinning up Tesseract worker...");
|
self.sendStatusMessage("Spinning up Tesseract worker...");
|
||||||
const image = `data:${type};base64,${toBase64(input)}`;
|
const image = `data:${type};base64,${toBase64(input)}`;
|
||||||
const worker = createWorker({
|
const worker = await createWorker("eng", oem, {
|
||||||
workerPath: `${assetDir}tesseract/worker.min.js`,
|
workerPath: `${assetDir}tesseract/worker.min.js`,
|
||||||
langPath: `${assetDir}tesseract/lang-data`,
|
langPath: `${assetDir}tesseract/lang-data`,
|
||||||
corePath: `${assetDir}tesseract/tesseract-core.wasm.js`,
|
corePath: `${assetDir}tesseract/tesseract-core.wasm.js`,
|
||||||
|
@ -71,11 +79,6 @@ class OpticalCharacterRecognition extends Operation {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
await worker.load();
|
|
||||||
self.sendStatusMessage(`Loading English language pack...`);
|
|
||||||
await worker.loadLanguage("eng");
|
|
||||||
self.sendStatusMessage("Intialising Tesseract API...");
|
|
||||||
await worker.initialize("eng");
|
|
||||||
self.sendStatusMessage("Finding text...");
|
self.sendStatusMessage("Finding text...");
|
||||||
const result = await worker.recognize(image);
|
const result = await worker.recognize(image);
|
||||||
|
|
||||||
|
|
|
@ -236,7 +236,7 @@ module.exports = {
|
||||||
// testOp(browser, "OR", "test input", "test_output");
|
// testOp(browser, "OR", "test input", "test_output");
|
||||||
// testOp(browser, "Object Identifier to Hex", "test input", "test_output");
|
// testOp(browser, "Object Identifier to Hex", "test input", "test_output");
|
||||||
testOpHtml(browser, "Offset checker", "test input\n\nbest input", ".hl5", "est input");
|
testOpHtml(browser, "Offset checker", "test input\n\nbest input", ".hl5", "est input");
|
||||||
// testOp(browser, "Optical Character Recognition", "test input", "test_output");
|
testOpFile(browser, "Optical Character Recognition", "files/testocr.png", false, /This is a lot of 12 point text to test the/, [], 10000);
|
||||||
// testOp(browser, "PEM to Hex", "test input", "test_output");
|
// testOp(browser, "PEM to Hex", "test input", "test_output");
|
||||||
// testOp(browser, "PGP Decrypt", "test input", "test_output");
|
// testOp(browser, "PGP Decrypt", "test input", "test_output");
|
||||||
// testOp(browser, "PGP Decrypt and Verify", "test input", "test_output");
|
// testOp(browser, "PGP Decrypt and Verify", "test input", "test_output");
|
||||||
|
@ -408,7 +408,7 @@ module.exports = {
|
||||||
* @param {Browser} browser - Nightwatch client
|
* @param {Browser} browser - Nightwatch client
|
||||||
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
||||||
* @param {string} input - input text for test
|
* @param {string} input - input text for test
|
||||||
* @param {Array<string>|Array<Array<string>>} args - arguments, nested if multiple ops
|
* @param {Array<string>|Array<Array<string>>} [args=[]] - arguments, nested if multiple ops
|
||||||
*/
|
*/
|
||||||
function bakeOp(browser, opName, input, args=[]) {
|
function bakeOp(browser, opName, input, args=[]) {
|
||||||
browser.perform(function() {
|
browser.perform(function() {
|
||||||
|
@ -425,8 +425,8 @@ function bakeOp(browser, opName, input, args=[]) {
|
||||||
* @param {Browser} browser - Nightwatch client
|
* @param {Browser} browser - Nightwatch client
|
||||||
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
||||||
* @param {string} input - input text
|
* @param {string} input - input text
|
||||||
* @param {string} output - expected output
|
* @param {string|RegExp} output - expected output
|
||||||
* @param {Array<string>|Array<Array<string>>} args - arguments, nested if multiple ops
|
* @param {Array<string>|Array<Array<string>>} [args=[]] - arguments, nested if multiple ops
|
||||||
*/
|
*/
|
||||||
function testOp(browser, opName, input, output, args=[]) {
|
function testOp(browser, opName, input, output, args=[]) {
|
||||||
bakeOp(browser, opName, input, args);
|
bakeOp(browser, opName, input, args);
|
||||||
|
@ -440,8 +440,8 @@ function testOp(browser, opName, input, output, args=[]) {
|
||||||
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
||||||
* @param {string} input - input text
|
* @param {string} input - input text
|
||||||
* @param {string} cssSelector - CSS selector for HTML output
|
* @param {string} cssSelector - CSS selector for HTML output
|
||||||
* @param {string} output - expected output
|
* @param {string|RegExp} output - expected output
|
||||||
* @param {Array<string>|Array<Array<string>>} args - arguments, nested if multiple ops
|
* @param {Array<string>|Array<Array<string>>} [args=[]] - arguments, nested if multiple ops
|
||||||
*/
|
*/
|
||||||
function testOpHtml(browser, opName, input, cssSelector, output, args=[]) {
|
function testOpHtml(browser, opName, input, cssSelector, output, args=[]) {
|
||||||
bakeOp(browser, opName, input, args);
|
bakeOp(browser, opName, input, args);
|
||||||
|
@ -459,9 +459,9 @@ function testOpHtml(browser, opName, input, cssSelector, output, args=[]) {
|
||||||
* @param {Browser} browser - Nightwatch client
|
* @param {Browser} browser - Nightwatch client
|
||||||
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
||||||
* @param {string} filename - filename of image file from samples directory
|
* @param {string} filename - filename of image file from samples directory
|
||||||
* @param {Array<string>|Array<Array<string>>} args - arguments, nested if multiple ops
|
* @param {Array<string>|Array<Array<string>>} [args=[]] - arguments, nested if multiple ops
|
||||||
*/
|
*/
|
||||||
function testOpImage(browser, opName, filename, args) {
|
function testOpImage(browser, opName, filename, args=[]) {
|
||||||
browser.perform(function() {
|
browser.perform(function() {
|
||||||
console.log(`Current test: ${opName}`);
|
console.log(`Current test: ${opName}`);
|
||||||
});
|
});
|
||||||
|
@ -481,11 +481,12 @@ function testOpImage(browser, opName, filename, args) {
|
||||||
* @param {Browser} browser - Nightwatch client
|
* @param {Browser} browser - Nightwatch client
|
||||||
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
* @param {string|Array<string>} opName - name of operation to be tested, array for multiple ops
|
||||||
* @param {string} filename - filename of file from samples directory
|
* @param {string} filename - filename of file from samples directory
|
||||||
* @param {string} cssSelector - CSS selector for HTML output
|
* @param {string|boolean} cssSelector - CSS selector for HTML output or false for normal text output
|
||||||
* @param {string} output - expected output
|
* @param {string|RegExp} output - expected output
|
||||||
* @param {Array<string>|Array<Array<string>>} args - arguments, nested if multiple ops
|
* @param {Array<string>|Array<Array<string>>} [args=[]] - arguments, nested if multiple ops
|
||||||
|
* @param {number} [waitWindow=1000] - The number of milliseconds to wait for the output to be correct
|
||||||
*/
|
*/
|
||||||
function testOpFile(browser, opName, filename, cssSelector, output, args) {
|
function testOpFile(browser, opName, filename, cssSelector, output, args=[], waitWindow=1000) {
|
||||||
browser.perform(function() {
|
browser.perform(function() {
|
||||||
console.log(`Current test: ${opName}`);
|
console.log(`Current test: ${opName}`);
|
||||||
});
|
});
|
||||||
|
@ -494,9 +495,14 @@ function testOpFile(browser, opName, filename, cssSelector, output, args) {
|
||||||
browser.pause(100).waitForElementVisible("#stale-indicator", 5000);
|
browser.pause(100).waitForElementVisible("#stale-indicator", 5000);
|
||||||
utils.bake(browser);
|
utils.bake(browser);
|
||||||
|
|
||||||
if (typeof output === "string") {
|
if (!cssSelector) {
|
||||||
|
// Text output
|
||||||
|
utils.expectOutput(browser, output, true, waitWindow);
|
||||||
|
} else if (typeof output === "string") {
|
||||||
|
// HTML output - string match
|
||||||
browser.expect.element("#output-html " + cssSelector).text.that.equals(output);
|
browser.expect.element("#output-html " + cssSelector).text.that.equals(output);
|
||||||
} else if (output instanceof RegExp) {
|
} else if (output instanceof RegExp) {
|
||||||
|
// HTML output - RegEx match
|
||||||
browser.expect.element("#output-html " + cssSelector).text.that.matches(output);
|
browser.expect.element("#output-html " + cssSelector).text.that.matches(output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -180,15 +180,16 @@ function loadRecipe(browser, opName, input, args) {
|
||||||
* @param {Browser} browser - Nightwatch client
|
* @param {Browser} browser - Nightwatch client
|
||||||
* @param {string|RegExp} expected - The expected output value
|
* @param {string|RegExp} expected - The expected output value
|
||||||
* @param {boolean} [waitNotNull=false] - Wait for the output to not be empty before testing the value
|
* @param {boolean} [waitNotNull=false] - Wait for the output to not be empty before testing the value
|
||||||
|
* @param {number} [waitWindow=1000] - The number of milliseconds to wait for the output to be correct
|
||||||
*/
|
*/
|
||||||
function expectOutput(browser, expected, waitNotNull=false) {
|
function expectOutput(browser, expected, waitNotNull=false, waitWindow=1000) {
|
||||||
if (waitNotNull && expected !== "") {
|
if (waitNotNull && expected !== "") {
|
||||||
browser.waitUntil(async function() {
|
browser.waitUntil(async function() {
|
||||||
const output = await this.execute(function() {
|
const output = await this.execute(function() {
|
||||||
return window.app.manager.output.outputEditorView.state.doc.toString();
|
return window.app.manager.output.outputEditorView.state.doc.toString();
|
||||||
});
|
});
|
||||||
return output.length;
|
return output.length;
|
||||||
}, 1000);
|
}, waitWindow);
|
||||||
}
|
}
|
||||||
|
|
||||||
browser.execute(expected => {
|
browser.execute(expected => {
|
||||||
|
|
BIN
tests/samples/files/testocr.png
Normal file
BIN
tests/samples/files/testocr.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 23 KiB |
Loading…
Reference in a new issue