allow oem code pages to be used to decode text (#14187)

# Description

This PR allows OEM code pages to be used for decoding by passing the
code page number to `decode`, e.g. `open file | decode 850`.

## Before

![image](https://github.com/user-attachments/assets/27f5d288-49f1-4743-a2fc-154f5291d190)
## After (umlauts)

![image](https://github.com/user-attachments/assets/d37c11be-b1fe-4159-822d-7d38018e1c57)

closes https://github.com/nushell/nushell/issues/14168

I moved the decoding into a small helper function. Its comment explains
how the decoder is chosen and why:
```rust
// Since we have two different decoding mechanisms, we allow oem_cp to be
// specified by only a number like `open file | decode 850`. If this decode
// parameter parses as a usize then we assume it was intentional and use oem_cp
// crate. Otherwise, if it doesn't parse as a usize, we assume it was a string
// and use the encoding_rs crate to try and decode it.
```

# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->

# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the
tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
This commit is contained in:
Darren Schroeder 2024-10-29 06:32:35 -05:00 committed by GitHub
parent 8c2e12ad79
commit 88b0982dac
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 69 additions and 2 deletions

13
Cargo.lock generated
View file

@ -3162,6 +3162,7 @@ dependencies = [
"num-format", "num-format",
"num-traits", "num-traits",
"nuon", "nuon",
"oem_cp",
"once_cell", "once_cell",
"open", "open",
"os_pipe", "os_pipe",
@ -3868,6 +3869,18 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "oem_cp"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "330138902ab4dab09a86e6b7ab7ddeffb5f8435d52fe0df1bce8b06a17b10ee4"
dependencies = [
"phf 0.11.2",
"phf_codegen 0.11.2",
"serde",
"serde_json",
]
[[package]] [[package]]
name = "omnipath" name = "omnipath"
version = "0.1.6" version = "0.1.6"

View file

@ -117,6 +117,7 @@ notify-debouncer-full = { version = "0.3", default-features = false }
nu-ansi-term = "0.50.1" nu-ansi-term = "0.50.1"
num-format = "0.4" num-format = "0.4"
num-traits = "0.2" num-traits = "0.2"
oem_cp = "2.0.0"
omnipath = "0.1" omnipath = "0.1"
once_cell = "1.20" once_cell = "1.20"
open = "5.3" open = "5.3"

View file

@ -66,6 +66,7 @@ native-tls = { workspace = true }
notify-debouncer-full = { workspace = true, default-features = false } notify-debouncer-full = { workspace = true, default-features = false }
num-format = { workspace = true } num-format = { workspace = true }
num-traits = { workspace = true } num-traits = { workspace = true }
oem_cp = { workspace = true }
once_cell = { workspace = true } once_cell = { workspace = true }
open = { workspace = true } open = { workspace = true }
os_pipe = { workspace = true } os_pipe = { workspace = true }

View file

@ -1,4 +1,35 @@
use nu_engine::command_prelude::*; use nu_engine::command_prelude::*;
use oem_cp::decode_string_complete_table;
use once_cell::sync::Lazy;
use std::collections::HashMap;
// Lazily-built lookup from an OEM code page number to its decoding table.
//
// Only the "Complete" tables from `oem_cp::code_table` are registered here,
// i.e. the ones exposed as `[char; 128]`. The "Incomplete" code pages -- 720,
// 857, 864, 869 and 874 -- are stored as `Option<char>` entries instead and
// therefore do not fit this map's value type, so they are left out.
static OEM_DECODE: Lazy<HashMap<usize, &[char; 128]>> = Lazy::new(|| {
    HashMap::from([
        (437, &oem_cp::code_table::DECODING_TABLE_CP437),
        (737, &oem_cp::code_table::DECODING_TABLE_CP737),
        (775, &oem_cp::code_table::DECODING_TABLE_CP775),
        (850, &oem_cp::code_table::DECODING_TABLE_CP850),
        (852, &oem_cp::code_table::DECODING_TABLE_CP852),
        (855, &oem_cp::code_table::DECODING_TABLE_CP855),
        (858, &oem_cp::code_table::DECODING_TABLE_CP858),
        (860, &oem_cp::code_table::DECODING_TABLE_CP860),
        (861, &oem_cp::code_table::DECODING_TABLE_CP861),
        (862, &oem_cp::code_table::DECODING_TABLE_CP862),
        (863, &oem_cp::code_table::DECODING_TABLE_CP863),
        (865, &oem_cp::code_table::DECODING_TABLE_CP865),
        (866, &oem_cp::code_table::DECODING_TABLE_CP866),
    ])
});
#[derive(Clone)] #[derive(Clone)]
pub struct Decode; pub struct Decode;
@ -84,7 +115,7 @@ fn run(
let span = stream.span(); let span = stream.span();
let bytes = stream.into_bytes()?; let bytes = stream.into_bytes()?;
match encoding { match encoding {
Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes), Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
None => super::encoding::detect_encoding_name(head, span, &bytes) None => super::encoding::detect_encoding_name(head, span, &bytes)
.map(|encoding| encoding.decode(&bytes).0.into_owned()) .map(|encoding| encoding.decode(&bytes).0.into_owned())
.map(|s| Value::string(s, head)), .map(|s| Value::string(s, head)),
@ -95,7 +126,7 @@ fn run(
let input_span = v.span(); let input_span = v.span();
match v { match v {
Value::Binary { val: bytes, .. } => match encoding { Value::Binary { val: bytes, .. } => match encoding {
Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes), Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
None => super::encoding::detect_encoding_name(head, input_span, &bytes) None => super::encoding::detect_encoding_name(head, input_span, &bytes)
.map(|encoding| encoding.decode(&bytes).0.into_owned()) .map(|encoding| encoding.decode(&bytes).0.into_owned())
.map(|s| Value::string(s, head)), .map(|s| Value::string(s, head)),
@ -121,6 +152,27 @@ fn run(
} }
} }
// Decode `bytes` with the encoding the user asked for.
//
// There are two decoding mechanisms. OEM code pages are selected by giving only
// a number, like `open file | decode 850`: if `encoding_name` parses as a
// usize AND that number has a table registered in `OEM_DECODE`, the bytes are
// decoded with the oem_cp crate.
//
// In every other case -- the name is not a number, the number is 0, or the
// number is a code page without an entry in `OEM_DECODE` -- the request is
// handed to `super::encoding::decode` (the string-label decoding path). An
// unsupported code page number therefore produces whatever result that
// function reports for the label instead of panicking on a missing map key.
//
// Returns the decoded text as a string `Value` tagged with `head`, or the
// `ShellError` produced by `super::encoding::decode`.
fn detect_and_decode(
    encoding_name: Spanned<String>,
    head: Span,
    bytes: Vec<u8>,
) -> Result<Value, ShellError> {
    // `None` covers both "not a number" and "number without a known table",
    // so no sentinel code page value is needed.
    let oem_table = encoding_name
        .item
        .parse::<usize>()
        .ok()
        .and_then(|code_page| OEM_DECODE.get(&code_page).copied());

    match oem_table {
        Some(table) => Ok(Value::string(
            decode_string_complete_table(&bytes, table),
            head,
        )),
        None => super::encoding::decode(head, encoding_name, &bytes),
    }
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;