mirror of
https://github.com/nushell/nushell
synced 2025-01-27 12:25:19 +00:00
allow oem code pages to be used to decode text (#14187)
# Description This PR allows oem code pages to be used in decoding by specifying the code page number. ## Before ![image](https://github.com/user-attachments/assets/27f5d288-49f1-4743-a2fc-154f5291d190) ## After (umlauts) ![image](https://github.com/user-attachments/assets/d37c11be-b1fe-4159-822d-7d38018e1c57) closes https://github.com/nushell/nushell/issues/14168 I abstracted the decoding a bit. Here are my function comments on how/why. ```rust // Since we have two different decoding mechanisms, we allow oem_cp to be // specified by only a number like `open file | decode 850`. If this decode // parameter parses as a usize then we assume it was intentional and use oem_cp // crate. Otherwise, if it doesn't parse as a usize, we assume it was a string // and use the encoding_rs crate to try and decode it. ``` # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. 
This will help us keep the docs up to date. -->
This commit is contained in:
parent
8c2e12ad79
commit
88b0982dac
4 changed files with 69 additions and 2 deletions
13
Cargo.lock
generated
13
Cargo.lock
generated
|
@ -3162,6 +3162,7 @@ dependencies = [
|
|||
"num-format",
|
||||
"num-traits",
|
||||
"nuon",
|
||||
"oem_cp",
|
||||
"once_cell",
|
||||
"open",
|
||||
"os_pipe",
|
||||
|
@ -3868,6 +3869,18 @@ dependencies = [
|
|||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "oem_cp"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "330138902ab4dab09a86e6b7ab7ddeffb5f8435d52fe0df1bce8b06a17b10ee4"
|
||||
dependencies = [
|
||||
"phf 0.11.2",
|
||||
"phf_codegen 0.11.2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "omnipath"
|
||||
version = "0.1.6"
|
||||
|
|
|
@ -117,6 +117,7 @@ notify-debouncer-full = { version = "0.3", default-features = false }
|
|||
nu-ansi-term = "0.50.1"
|
||||
num-format = "0.4"
|
||||
num-traits = "0.2"
|
||||
oem_cp = "2.0.0"
|
||||
omnipath = "0.1"
|
||||
once_cell = "1.20"
|
||||
open = "5.3"
|
||||
|
|
|
@ -66,6 +66,7 @@ native-tls = { workspace = true }
|
|||
notify-debouncer-full = { workspace = true, default-features = false }
|
||||
num-format = { workspace = true }
|
||||
num-traits = { workspace = true }
|
||||
oem_cp = { workspace = true }
|
||||
once_cell = { workspace = true }
|
||||
open = { workspace = true }
|
||||
os_pipe = { workspace = true }
|
||||
|
|
|
@ -1,4 +1,35 @@
|
|||
use nu_engine::command_prelude::*;
|
||||
use oem_cp::decode_string_complete_table;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::collections::HashMap;
|
||||
|
||||
// create a lazycell of all the code_table "Complete" code pages
|
||||
// the commented out code pages are "Incomplete", which means they
|
||||
// are stored as Option<char> and not &[char; 128]
|
||||
static OEM_DECODE: Lazy<HashMap<usize, &[char; 128]>> = Lazy::new(|| {
|
||||
let mut m = HashMap::new();
|
||||
m.insert(437, &oem_cp::code_table::DECODING_TABLE_CP437);
|
||||
// m.insert(720, &oem_cp::code_table::DECODING_TABLE_CP720);
|
||||
m.insert(737, &oem_cp::code_table::DECODING_TABLE_CP737);
|
||||
m.insert(775, &oem_cp::code_table::DECODING_TABLE_CP775);
|
||||
|
||||
m.insert(850, &oem_cp::code_table::DECODING_TABLE_CP850);
|
||||
m.insert(852, &oem_cp::code_table::DECODING_TABLE_CP852);
|
||||
m.insert(855, &oem_cp::code_table::DECODING_TABLE_CP855);
|
||||
// m.insert(857, &oem_cp::code_table::DECODING_TABLE_CP857);
|
||||
m.insert(858, &oem_cp::code_table::DECODING_TABLE_CP858);
|
||||
m.insert(860, &oem_cp::code_table::DECODING_TABLE_CP860);
|
||||
m.insert(861, &oem_cp::code_table::DECODING_TABLE_CP861);
|
||||
m.insert(862, &oem_cp::code_table::DECODING_TABLE_CP862);
|
||||
m.insert(863, &oem_cp::code_table::DECODING_TABLE_CP863);
|
||||
// m.insert(864, &oem_cp::code_table::DECODING_TABLE_CP864);
|
||||
m.insert(865, &oem_cp::code_table::DECODING_TABLE_CP865);
|
||||
m.insert(866, &oem_cp::code_table::DECODING_TABLE_CP866);
|
||||
// m.insert(869, &oem_cp::code_table::DECODING_TABLE_CP869);
|
||||
// m.insert(874, &oem_cp::code_table::DECODING_TABLE_CP874);
|
||||
|
||||
m
|
||||
});
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Unit struct for the `decode` command.
///
/// NOTE(review): the trait implementation that wires this type up as a command
/// is outside the visible excerpt — confirm the details there.
pub struct Decode;
|
||||
|
@ -84,7 +115,7 @@ fn run(
|
|||
let span = stream.span();
|
||||
let bytes = stream.into_bytes()?;
|
||||
match encoding {
|
||||
Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
|
||||
Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
|
||||
None => super::encoding::detect_encoding_name(head, span, &bytes)
|
||||
.map(|encoding| encoding.decode(&bytes).0.into_owned())
|
||||
.map(|s| Value::string(s, head)),
|
||||
|
@ -95,7 +126,7 @@ fn run(
|
|||
let input_span = v.span();
|
||||
match v {
|
||||
Value::Binary { val: bytes, .. } => match encoding {
|
||||
Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
|
||||
Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
|
||||
None => super::encoding::detect_encoding_name(head, input_span, &bytes)
|
||||
.map(|encoding| encoding.decode(&bytes).0.into_owned())
|
||||
.map(|s| Value::string(s, head)),
|
||||
|
@ -121,6 +152,27 @@ fn run(
|
|||
}
|
||||
}
|
||||
|
||||
// Since we have two different decoding mechanisms, we allow oem_cp to be
|
||||
// specified by only a number like `open file | decode 850`. If this decode
|
||||
// parameter parses as a usize then we assume it was intentional and use oem_cp
|
||||
// crate. Otherwise, if it doesn't parse as a usize, we assume it was a string
|
||||
// and use the encoding_rs crate to try and decode it.
|
||||
fn detect_and_decode(
|
||||
encoding_name: Spanned<String>,
|
||||
head: Span,
|
||||
bytes: Vec<u8>,
|
||||
) -> Result<Value, ShellError> {
|
||||
let dec_table_id = encoding_name.item.parse::<usize>().unwrap_or(0usize);
|
||||
if dec_table_id == 0 {
|
||||
super::encoding::decode(head, encoding_name, &bytes)
|
||||
} else {
|
||||
Ok(Value::string(
|
||||
decode_string_complete_table(&bytes, OEM_DECODE[&dec_table_id]),
|
||||
head,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
|
Loading…
Reference in a new issue