diff --git a/src/core/lib/ChrEnc.mjs b/src/core/lib/ChrEnc.mjs index 6879d736..55fe3761 100644 --- a/src/core/lib/ChrEnc.mjs +++ b/src/core/lib/ChrEnc.mjs @@ -224,8 +224,85 @@ export function chrEncWidth(page) { * @copyright Crown Copyright 2019 * @license Apache-2.0 */ +export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"]; + /** - * Character encoding format mappings. + * Detects whether the input buffer is valid UTF8. + * + * @param {ArrayBuffer} data + * @returns {number} - 0 = not UTF8, 1 = ASCII, 2 = UTF8 */ -export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"]; +export function isUTF8(data) { + const bytes = new Uint8Array(data); + let i = 0; + let onlyASCII = true; + while (i < bytes.length) { + if (( // ASCII + bytes[i] === 0x09 || + bytes[i] === 0x0A || + bytes[i] === 0x0D || + (0x20 <= bytes[i] && bytes[i] <= 0x7E) + )) { + i += 1; + continue; + } + + onlyASCII = false; + + if (( // non-overlong 2-byte + (0xC2 <= bytes[i] && bytes[i] <= 0xDF) && + (0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF) + )) { + i += 2; + continue; + } + + if (( // excluding overlongs + bytes[i] === 0xE0 && + (0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) && + (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) + ) || + ( // straight 3-byte + ((0xE1 <= bytes[i] && bytes[i] <= 0xEC) || + bytes[i] === 0xEE || + bytes[i] === 0xEF) && + (0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) && + (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF) + ) || + ( // excluding surrogates + bytes[i] === 0xED && + (0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) && + (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF) + )) { + i += 3; + continue; + } + + if (( // planes 1-3 + bytes[i] === 0xF0 && + (0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) && + (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) && + (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF) + ) || + ( // planes 4-15 + (0xF1 <= bytes[i] && bytes[i] <= 0xF3) && + (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) && + (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) && + (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF) + ) || + ( // plane 16 + bytes[i] === 0xF4 && + (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) && + (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) && + (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF) + )) { + i += 4; + continue; + } + + return 0; + } + + return onlyASCII ? 1 : 2; +} diff --git a/src/core/lib/Magic.mjs b/src/core/lib/Magic.mjs index 921fc3f6..14111ec7 100644 --- a/src/core/lib/Magic.mjs +++ b/src/core/lib/Magic.mjs @@ -3,6 +3,7 @@ import Utils, { isWorkerEnvironment } from "../Utils.mjs"; import Recipe from "../Recipe.mjs"; import Dish from "../Dish.mjs"; import {detectFileType, isType} from "./FileType.mjs"; +import {isUTF8} from "./ChrEnc.mjs"; import chiSquared from "chi-squared"; /** @@ -111,82 +112,6 @@ class Magic { }; } - /** - * Detects whether the input buffer is valid UTF8. - * - * @returns {boolean} - */ - isUTF8() { - const bytes = new Uint8Array(this.inputBuffer); - let i = 0; - while (i < bytes.length) { - if (( // ASCII - bytes[i] === 0x09 || - bytes[i] === 0x0A || - bytes[i] === 0x0D || - (0x20 <= bytes[i] && bytes[i] <= 0x7E) - )) { - i += 1; - continue; - } - - if (( // non-overlong 2-byte - (0xC2 <= bytes[i] && bytes[i] <= 0xDF) && - (0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF) - )) { - i += 2; - continue; - } - - if (( // excluding overlongs - bytes[i] === 0xE0 && - (0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) && - (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) - ) || - ( // straight 3-byte - ((0xE1 <= bytes[i] && bytes[i] <= 0xEC) || - bytes[i] === 0xEE || - bytes[i] === 0xEF) && - (0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) && - (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF) - ) || - ( // excluding surrogates - bytes[i] === 0xED && - (0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) && - (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF) - )) { - i += 3; - continue; - } - - if (( // planes 1-3 - bytes[i] === 0xF0 && - (0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) && - (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) && - (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF) - ) || - ( // planes 4-15 - (0xF1 <= bytes[i] && bytes[i] <= 0xF3) && - (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) && - (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) && - (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF) - ) || - ( // plane 16 - bytes[i] === 0xF4 && - (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) && - (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) && - (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF) - )) { - i += 4; - continue; - } - - return false; - } - - return true; - } - /** * Calculates the Shannon entropy of the input data. * @@ -336,7 +261,7 @@ class Magic { data: this.inputStr.slice(0, 100), languageScores: this.detectLanguage(extLang), fileType: this.detectFileType(), - isUTF8: this.isUTF8(), + isUTF8: !!isUTF8(this.inputBuffer), entropy: this.calcEntropy(), matchingOps: matchingOps, useful: useful, diff --git a/src/web/App.mjs b/src/web/App.mjs index cce91b1e..eeae264f 100755 --- a/src/web/App.mjs +++ b/src/web/App.mjs @@ -500,22 +500,22 @@ class App { // Input Character Encoding // Must be set before the input is loaded if (this.uriParams.ienc) { - this.manager.input.chrEncChange(parseInt(this.uriParams.ienc, 10)); + this.manager.input.chrEncChange(parseInt(this.uriParams.ienc, 10), true); } // Output Character Encoding if (this.uriParams.oenc) { - this.manager.output.chrEncChange(parseInt(this.uriParams.oenc, 10)); + this.manager.output.chrEncChange(parseInt(this.uriParams.oenc, 10), true); } // Input EOL sequence if (this.uriParams.ieol) { - this.manager.input.eolChange(this.uriParams.ieol); + this.manager.input.eolChange(this.uriParams.ieol, true); } // Output EOL sequence if (this.uriParams.oeol) { - this.manager.output.eolChange(this.uriParams.oeol); + this.manager.output.eolChange(this.uriParams.oeol, true); } // Read in input data from URI params diff --git a/src/web/stylesheets/components/_operation.css b/src/web/stylesheets/components/_operation.css index 685a368a..a97fed70 100755 --- a/src/web/stylesheets/components/_operation.css +++ b/src/web/stylesheets/components/_operation.css @@ -69,6 +69,10 @@ select.arg { min-width: 100px; } +select.arg.form-control:not([size]):not([multiple]), select.custom-file-control:not([size]):not([multiple]) { + height: 100% !important; +} + textarea.arg { min-height: 74px; resize: vertical; @@ -80,7 +84,7 @@ div.toggle-string { input.toggle-string { border-top-right-radius: 0 !important; - height: 42px !important; + height: 100%; } .operation [class^='bmd-label'], diff --git a/src/web/utils/statusBar.mjs b/src/web/utils/statusBar.mjs index 69c4dd51..1adcd5be 100644 --- a/src/web/utils/statusBar.mjs +++ b/src/web/utils/statusBar.mjs @@ -24,6 +24,8 @@ class StatusBarPanel { this.eolHandler = opts.eolHandler; this.chrEncHandler = opts.chrEncHandler; this.chrEncGetter = opts.chrEncGetter; + this.getEncodingState = opts.getEncodingState; + this.getEOLState = opts.getEOLState; this.htmlOutput = opts.htmlOutput; this.eolVal = null; @@ -115,7 +117,7 @@ class StatusBarPanel { if (isNaN(chrEncVal)) return; - this.chrEncHandler(chrEncVal); + this.chrEncHandler(chrEncVal, true); this.updateCharEnc(chrEncVal); hideElement(e.target.closest(".cm-status-bar-select-content")); } @@ -212,12 +214,31 @@ class StatusBarPanel { * @param {EditorState} state */ updateEOL(state) { - if (state.lineBreak === this.eolVal) return; + if (this.getEOLState() < 2 && state.lineBreak === this.eolVal) return; const val = this.dom.querySelector(".eol-value"); const button = val.closest(".cm-status-bar-select-btn"); - const eolCode = eolSeqToCode[state.lineBreak]; - const eolName = eolCodeToName[eolCode]; + let eolCode = eolSeqToCode[state.lineBreak]; + let eolName = eolCodeToName[eolCode]; + + switch (this.getEOLState()) { + case 1: // Detected + val.classList.add("font-italic"); + eolCode += " (detected)"; + eolName += " (detected)"; + // Pulse + val.classList.add("pulse"); + setTimeout(() => { + val.classList.remove("pulse"); + }, 2000); + break; + case 0: // Unset + case 2: // Manually set + default: + val.classList.remove("font-italic"); + break; + } + val.textContent = eolCode; button.setAttribute("title", `End of line sequence:
${eolName}`); button.setAttribute("data-original-title", `End of line sequence:
${eolName}`); @@ -230,12 +251,30 @@ class StatusBarPanel { */ updateCharEnc() { const chrEncVal = this.chrEncGetter(); - if (chrEncVal === this.chrEncVal) return; + if (this.getEncodingState() < 2 && chrEncVal === this.chrEncVal) return; - const name = CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] ? CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] : "Raw Bytes"; + let name = CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] ? CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] : "Raw Bytes"; const val = this.dom.querySelector(".chr-enc-value"); const button = val.closest(".cm-status-bar-select-btn"); + + switch (this.getEncodingState()) { + case 1: // Detected + val.classList.add("font-italic"); + name += " (detected)"; + // Pulse + val.classList.add("pulse"); + setTimeout(() => { + val.classList.remove("pulse"); + }, 2000); + break; + case 0: // Unset + case 2: // Manually set + default: + val.classList.remove("font-italic"); + break; + } + val.textContent = name; button.setAttribute("title", `${this.label} character encoding:
${name}`); button.setAttribute("data-original-title", `${this.label} character encoding:
${name}`); diff --git a/src/web/waiters/InputWaiter.mjs b/src/web/waiters/InputWaiter.mjs index ad8eb38c..bffca98c 100644 --- a/src/web/waiters/InputWaiter.mjs +++ b/src/web/waiters/InputWaiter.mjs @@ -62,7 +62,8 @@ class InputWaiter { this.inputTextEl = document.getElementById("input-text"); this.inputChrEnc = 0; - this.eolSetManually = false; + this.eolState = 0; // 0 = unset, 1 = detected, 2 = manual + this.encodingState = 0; // 0 = unset, 1 = detected, 2 = manual this.initEditor(); this.inputWorker = null; @@ -116,7 +117,9 @@ class InputWaiter { label: "Input", eolHandler: this.eolChange.bind(this), chrEncHandler: this.chrEncChange.bind(this), - chrEncGetter: this.getChrEnc.bind(this) + chrEncGetter: this.getChrEnc.bind(this), + getEncodingState: this.getEncodingState.bind(this), + getEOLState: this.getEOLState.bind(this) }), // Mutable state @@ -156,6 +159,8 @@ class InputWaiter { ] }); + + if (this.inputEditorView) this.inputEditorView.destroy(); this.inputEditorView = new EditorView({ state: initialState, parent: this.inputTextEl @@ -166,30 +171,18 @@ class InputWaiter { * Handler for EOL change events * Sets the line separator * @param {string} eol - * @param {boolean} manual - a flag for whether this was set by the user or automatically + * @param {boolean} [manual=false] */ eolChange(eol, manual=false) { const eolVal = eolCodeToSeq[eol]; if (eolVal === undefined) return; - const eolBtn = document.querySelector("#input-text .eol-value"); - if (manual) { - this.eolSetManually = true; - eolBtn.classList.remove("font-italic"); - } else { - eolBtn.classList.add("font-italic"); - } + this.eolState = manual ? 2 : this.eolState; + if (this.eolState < 2 && eolVal === this.getEOLSeq()) return; - if (eolVal === this.getEOLSeq()) return; - - if (!manual) { - // Pulse - eolBtn.classList.add("pulse"); - setTimeout(() => { - eolBtn.classList.remove("pulse"); - }, 2000); + if (this.eolState === 1) { // Alert - this.app.alert(`Input EOL separator has been changed to ${eolCodeToName[eol]}`, 5000); + this.app.alert(`Input end of line separator has been detected and changed to ${eolCodeToName[eol]}`, 5000); } // Update the EOL value @@ -210,14 +203,24 @@ class InputWaiter { return this.inputEditorView.state.lineBreak; } + /** + * Returns whether the input EOL sequence was set manually or has been detected automatically + * @returns {number} - 0 = unset, 1 = detected, 2 = manual + */ + getEOLState() { + return this.eolState; + } + /** * Handler for Chr Enc change events * Sets the input character encoding * @param {number} chrEncVal + * @param {boolean} [manual=false] */ - chrEncChange(chrEncVal) { + chrEncChange(chrEncVal, manual=false) { if (typeof chrEncVal !== "number") return; this.inputChrEnc = chrEncVal; + this.encodingState = manual ? 2 : this.encodingState; this.inputChange(); } @@ -229,6 +232,14 @@ class InputWaiter { return this.inputChrEnc; } + /** + * Returns whether the input character encoding was set manually or has been detected automatically + * @returns {number} - 0 = unset, 1 = detected, 2 = manual + */ + getEncodingState() { + return this.encodingState; + } + /** * Sets word wrap on the input editor * @param {boolean} wrap @@ -908,7 +919,7 @@ class InputWaiter { */ afterPaste(e) { // If EOL has been fixed, skip this. - if (this.eolSetManually) return; + if (this.eolState > 1) return; const inputText = this.getInput(); @@ -930,17 +941,23 @@ class InputWaiter { }, 0); if (total === 0) return; - // If CRLF not zero and more than half the highest alternative, choose CRLF + // Find most prevalent line ending sequence const highest = Object.entries(eolCharCounts).reduce((acc, curr) => { return curr[1] > acc[1] ? curr : acc; }, ["LF", 0]); + let choice = highest[0]; + + // If CRLF not zero and more than half the highest alternative, choose CRLF if ((eolCharCounts.CRLF * 2) > highest[1]) { - this.eolChange("CRLF"); - return; + choice = "CRLF"; } - // Else choose max - this.eolChange(highest[0]); + const eolVal = eolCodeToSeq[choice]; + if (eolVal === this.getEOLSeq()) return; + + // Setting automatically + this.eolState = 1; + this.eolChange(choice); } /** @@ -1276,8 +1293,13 @@ class InputWaiter { this.manager.output.removeAllOutputs(); this.manager.output.terminateZipWorker(); - this.eolSetManually = false; - this.manager.output.eolSetManually = false; + this.eolState = 0; + this.encodingState = 0; + this.manager.output.eolState = 0; + this.manager.output.encodingState = 0; + + this.initEditor(); + this.manager.output.initEditor(); const tabsList = document.getElementById("input-tabs"); const tabsListChildren = tabsList.children; diff --git a/src/web/waiters/OutputWaiter.mjs b/src/web/waiters/OutputWaiter.mjs index 6acd6752..190d2ad9 100755 --- a/src/web/waiters/OutputWaiter.mjs +++ b/src/web/waiters/OutputWaiter.mjs @@ -7,6 +7,7 @@ import Utils, {debounce} from "../../core/Utils.mjs"; import Dish from "../../core/Dish.mjs"; +import {isUTF8, CHR_ENC_SIMPLE_REVERSE_LOOKUP} from "../../core/lib/ChrEnc.mjs"; import {detectFileType} from "../../core/lib/FileType.mjs"; import FileSaver from "file-saver"; import ZipWorker from "worker-loader?inline=no-fallback!../workers/ZipWorker.mjs"; @@ -70,7 +71,8 @@ class OutputWaiter { this.zipWorker = null; this.maxTabs = this.manager.tabs.calcMaxTabs(); this.tabTimeout = null; - this.eolSetManually = false; + this.eolState = 0; // 0 = unset, 1 = detected, 2 = manual + this.encodingState = 0; // 0 = unset, 1 = detected, 2 = manual } /** @@ -110,6 +112,8 @@ class OutputWaiter { eolHandler: this.eolChange.bind(this), chrEncHandler: this.chrEncChange.bind(this), chrEncGetter: this.getChrEnc.bind(this), + getEncodingState: this.getEncodingState.bind(this), + getEOLState: this.getEOLState.bind(this), htmlOutput: this.htmlOutput }), htmlPlugin(this.htmlOutput), @@ -138,6 +142,7 @@ class OutputWaiter { ] }); + if (this.outputEditorView) this.outputEditorView.destroy(); this.outputEditorView = new EditorView({ state: initialState, parent: this.outputTextEl @@ -148,30 +153,18 @@ class OutputWaiter { * Handler for EOL change events * Sets the line separator * @param {string} eol - * @param {boolean} manual - a flag for whether this was set by the user or automatically + * @param {boolean} [manual=false] */ async eolChange(eol, manual=false) { const eolVal = eolCodeToSeq[eol]; if (eolVal === undefined) return; - const eolBtn = document.querySelector("#output-text .eol-value"); - if (manual) { - this.eolSetManually = true; - eolBtn.classList.remove("font-italic"); - } else { - eolBtn.classList.add("font-italic"); - } + this.eolState = manual ? 2 : this.eolState; + if (this.eolState < 2 && eolVal === this.getEOLSeq()) return; - if (eolVal === this.getEOLSeq()) return; - - if (!manual) { - // Pulse - eolBtn.classList.add("pulse"); - setTimeout(() => { - eolBtn.classList.remove("pulse"); - }, 2000); + if (this.eolState === 1) { // Alert - this.app.alert(`Output EOL separator has been changed to ${eolCodeToName[eol]}`, 5000); + this.app.alert(`Output end of line separator has been detected and changed to ${eolCodeToName[eol]}`, 5000); } const currentTabNum = this.manager.tabs.getActiveTab("output"); @@ -205,13 +198,23 @@ class OutputWaiter { return this.outputs[currentTabNum].eolSequence; } + /** + * Returns whether the output EOL sequence was set manually or has been detected automatically + * @returns {number} - 0 = unset, 1 = detected, 2 = manual + */ + getEOLState() { + return this.eolState; + } + /** * Handler for Chr Enc change events * Sets the output character encoding * @param {number} chrEncVal + * @param {boolean} [manual=false] */ - async chrEncChange(chrEncVal) { + async chrEncChange(chrEncVal, manual=false) { if (typeof chrEncVal !== "number") return; + const currentEnc = this.getChrEnc(); const currentTabNum = this.manager.tabs.getActiveTab("output"); if (currentTabNum >= 0) { @@ -220,10 +223,17 @@ class OutputWaiter { throw new Error(`Cannot change output ${currentTabNum} chrEnc to ${chrEncVal}`); } - // Reset the output, forcing it to re-decode the data with the new character encoding - await this.setOutput(this.currentOutputCache, true); - // Update the URL manually since we aren't firing a statechange event - this.app.updateURL(true); + this.encodingState = manual ? 2 : this.encodingState; + + if (this.encodingState > 1) { + // Reset the output, forcing it to re-decode the data with the new character encoding + await this.setOutput(this.currentOutputCache, true); + // Update the URL manually since we aren't firing a statechange event + this.app.updateURL(true); + } else if (currentEnc !== chrEncVal) { + // Alert + this.app.alert(`Output character encoding has been detected and changed to ${CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] || "Raw Bytes"}`, 5000); + } } /** @@ -238,6 +248,14 @@ class OutputWaiter { return this.outputs[currentTabNum].encoding; } + /** + * Returns whether the output character encoding was set manually or has been detected automatically + * @returns {number} - 0 = unset, 1 = detected, 2 = manual + */ + getEncodingState() { + return this.encodingState; + } + /** * Sets word wrap on the output editor * @param {boolean} wrap @@ -273,6 +291,7 @@ class OutputWaiter { const tabNum = this.manager.tabs.getActiveTab("output"); this.manager.timing.recordTime("outputDecodingStart", tabNum); if (data instanceof ArrayBuffer) { + await this.detectEncoding(data); data = await this.bufferToStr(data); } this.manager.timing.recordTime("outputDecodingEnd", tabNum); @@ -380,7 +399,7 @@ class OutputWaiter { */ detectEOLSequence(data) { // If EOL has been fixed, skip this. - if (this.eolSetManually) return; + if (this.eolState > 1) return; // If data is too long, skip this. if (data.length > 1000000) return; @@ -402,17 +421,54 @@ class OutputWaiter { }, 0); if (total === 0) return; - // If CRLF not zero and more than half the highest alternative, choose CRLF + // Find most prevalent line ending sequence const highest = Object.entries(eolCharCounts).reduce((acc, curr) => { return curr[1] > acc[1] ? curr : acc; }, ["LF", 0]); + let choice = highest[0]; + + // If CRLF not zero and more than half the highest alternative, choose CRLF if ((eolCharCounts.CRLF * 2) > highest[1]) { - this.eolChange("CRLF"); - return; + choice = "CRLF"; } - // Else choose max - this.eolChange(highest[0]); + const eolVal = eolCodeToSeq[choice]; + if (eolVal === this.getEOLSeq()) return; + + // Setting automatically + this.eolState = 1; + this.eolChange(choice); + } + + /** + * Checks whether the character encoding should be updated. + * + * @param {ArrayBuffer} data + */ + async detectEncoding(data) { + // If encoding has been fixed, skip this. + if (this.encodingState > 1) return; + // If data is too long, skip this. + if (data.byteLength > 1000000) return; + + const enc = isUTF8(data); // 0 = not UTF8, 1 = ASCII, 2 = UTF8 + + switch (enc) { + case 0: // not UTF8 + // Set to Raw Bytes + this.encodingState = 1; + await this.chrEncChange(0, false); + break; + case 2: // UTF8 + // Set to UTF8 + this.encodingState = 1; + await this.chrEncChange(65001, false); + break; + case 1: // ASCII + default: + // Ignore + break; + } } /**