From 88b0982dac6256c3bd3bdf9cc41a14fa7fb7b261 Mon Sep 17 00:00:00 2001 From: Darren Schroeder <343840+fdncred@users.noreply.github.com> Date: Tue, 29 Oct 2024 06:32:35 -0500 Subject: [PATCH] allow oem code pages to be used to decode text (#14187) # Description This PR allows oem code pages to be used in decoding by specifying the code page number. ## Before ![image](https://github.com/user-attachments/assets/27f5d288-49f1-4743-a2fc-154f5291d190) ## After (umlauts) ![image](https://github.com/user-attachments/assets/d37c11be-b1fe-4159-822d-7d38018e1c57) closes https://github.com/nushell/nushell/issues/14168 I abstracted the decoding a bit. Here are my function comments on how/why. ```rust // Since we have two different decoding mechanisms, we allow oem_cp to be // specified by only a number like `open file | decode 850`. If this decode // parameter parses as a usize then we assume it was intentional and use oem_cp // crate. Otherwise, if it doesn't parse as a usize, we assume it was a string // and use the encoding_rs crate to try and decode it. 
``` # User-Facing Changes # Tests + Formatting # After Submitting --- Cargo.lock | 13 +++++ Cargo.toml | 1 + crates/nu-command/Cargo.toml | 1 + .../src/strings/encode_decode/decode.rs | 56 ++++++++++++++++++- 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 560aa76c9d..c0f20c0380 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3162,6 +3162,7 @@ dependencies = [ "num-format", "num-traits", "nuon", + "oem_cp", "once_cell", "open", "os_pipe", @@ -3868,6 +3869,18 @@ dependencies = [ "memchr", ] +[[package]] +name = "oem_cp" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "330138902ab4dab09a86e6b7ab7ddeffb5f8435d52fe0df1bce8b06a17b10ee4" +dependencies = [ + "phf 0.11.2", + "phf_codegen 0.11.2", + "serde", + "serde_json", +] + [[package]] name = "omnipath" version = "0.1.6" diff --git a/Cargo.toml b/Cargo.toml index 92cfba87e3..9d9078d423 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -117,6 +117,7 @@ notify-debouncer-full = { version = "0.3", default-features = false } nu-ansi-term = "0.50.1" num-format = "0.4" num-traits = "0.2" +oem_cp = "2.0.0" omnipath = "0.1" once_cell = "1.20" open = "5.3" diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml index a4b65b49fb..8d6e704ead 100644 --- a/crates/nu-command/Cargo.toml +++ b/crates/nu-command/Cargo.toml @@ -66,6 +66,7 @@ native-tls = { workspace = true } notify-debouncer-full = { workspace = true, default-features = false } num-format = { workspace = true } num-traits = { workspace = true } +oem_cp = { workspace = true } once_cell = { workspace = true } open = { workspace = true } os_pipe = { workspace = true } diff --git a/crates/nu-command/src/strings/encode_decode/decode.rs b/crates/nu-command/src/strings/encode_decode/decode.rs index 99bc1b9681..169667645d 100644 --- a/crates/nu-command/src/strings/encode_decode/decode.rs +++ b/crates/nu-command/src/strings/encode_decode/decode.rs @@ -1,4 +1,35 @@ use 
nu_engine::command_prelude::*;
+use oem_cp::decode_string_complete_table;
+use once_cell::sync::Lazy;
+use std::collections::HashMap;
+
+// Lazily initialized map from code page number to its oem_cp decoding table.
+// Only "Complete" code pages are registered; the commented out code pages are
+// "Incomplete", which means they hold `Option<char>` entries, not `[char; 128]`.
+static OEM_DECODE: Lazy<HashMap<usize, &[char; 128]>> = Lazy::new(|| {
+    let mut m = HashMap::new();
+    m.insert(437, &oem_cp::code_table::DECODING_TABLE_CP437);
+    // m.insert(720, &oem_cp::code_table::DECODING_TABLE_CP720);
+    m.insert(737, &oem_cp::code_table::DECODING_TABLE_CP737);
+    m.insert(775, &oem_cp::code_table::DECODING_TABLE_CP775);
+
+    m.insert(850, &oem_cp::code_table::DECODING_TABLE_CP850);
+    m.insert(852, &oem_cp::code_table::DECODING_TABLE_CP852);
+    m.insert(855, &oem_cp::code_table::DECODING_TABLE_CP855);
+    // m.insert(857, &oem_cp::code_table::DECODING_TABLE_CP857);
+    m.insert(858, &oem_cp::code_table::DECODING_TABLE_CP858);
+    m.insert(860, &oem_cp::code_table::DECODING_TABLE_CP860);
+    m.insert(861, &oem_cp::code_table::DECODING_TABLE_CP861);
+    m.insert(862, &oem_cp::code_table::DECODING_TABLE_CP862);
+    m.insert(863, &oem_cp::code_table::DECODING_TABLE_CP863);
+    // m.insert(864, &oem_cp::code_table::DECODING_TABLE_CP864);
+    m.insert(865, &oem_cp::code_table::DECODING_TABLE_CP865);
+    m.insert(866, &oem_cp::code_table::DECODING_TABLE_CP866);
+    // m.insert(869, &oem_cp::code_table::DECODING_TABLE_CP869);
+    // m.insert(874, &oem_cp::code_table::DECODING_TABLE_CP874);
+
+    m
+});
 
 #[derive(Clone)]
 pub struct Decode;
@@ -84,7 +115,7 @@ fn run(
             let span = stream.span();
             let bytes = stream.into_bytes()?;
             match encoding {
-                Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
+                Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
                 None => super::encoding::detect_encoding_name(head, span, &bytes)
                     .map(|encoding| encoding.decode(&bytes).0.into_owned())
                     .map(|s| Value::string(s, head)),
@@ -95,7 +126,7 @@ fn run(
             let input_span = v.span();
match v {
                 Value::Binary { val: bytes, .. } => match encoding {
-                    Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
+                    Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
                     None => super::encoding::detect_encoding_name(head, input_span, &bytes)
                         .map(|encoding| encoding.decode(&bytes).0.into_owned())
                         .map(|s| Value::string(s, head)),
@@ -121,6 +152,33 @@ fn run(
     }
 }
 
+/// Decodes `bytes` with the encoding selected by `encoding_name`.
+///
+/// Since we have two different decoding mechanisms, we allow oem_cp to be
+/// specified by only a number like `open file | decode 850`. If the decode
+/// parameter parses as a usize and that number is a key of `OEM_DECODE`, the
+/// oem_cp crate decodes the bytes. In every other case (not a number, `0`, or
+/// a number without a registered table) the name is passed on to
+/// `super::encoding::decode` and its result is returned unchanged.
+fn detect_and_decode(
+    encoding_name: Spanned<String>,
+    head: Span,
+    bytes: Vec<u8>,
+) -> Result<Value, ShellError> {
+    // Checked lookup: indexing with `OEM_DECODE[&id]` panics when the number
+    // is not a registered code page (e.g. `decode 720` or `decode 123`).
+    let oem_table = encoding_name
+        .item
+        .parse::<usize>()
+        .ok()
+        .and_then(|code_page| OEM_DECODE.get(&code_page).copied());
+
+    match oem_table {
+        Some(table) => Ok(Value::string(decode_string_complete_table(&bytes, table), head)),
+        None => super::encoding::decode(head, encoding_name, &bytes),
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;