From 6e05b8075ba9d6a3edc2a8d293114196c3727b5d Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 12 Apr 2022 13:52:21 -0500 Subject: [PATCH] refactor(lex): Expand lexer design In considering the design for this, we want: - Ability to modify the argument list while maintaining the `Cursor` for replacements - Allow picking up subcommand parsing in the middle of short flags - Ability to peek at the next item to determine if we want to treat it as a flag or as a value - Ability to detect started short and long arguments for completions Longer term, we also want to consider: - Allowing users to customize the lexer to support different syntaxes --- src/build/command.rs | 2 +- src/parse/lexer.rs | 195 ++++++++++++++++++++++++++++++++++++++++++- src/parse/parser.rs | 4 +- 3 files changed, 195 insertions(+), 6 deletions(-) diff --git a/src/build/command.rs b/src/build/command.rs index 1f0ec2f6..51c83ff6 100644 --- a/src/build/command.rs +++ b/src/build/command.rs @@ -665,7 +665,7 @@ impl<'help> App<'help> { // to display // the full path when displaying help messages and such if !self.settings.is_set(AppSettings::NoBinaryName) { - if let Some(name) = raw_args.next(&mut cursor) { + if let Some(name) = raw_args.next_os(&mut cursor) { let p = Path::new(name); if let Some(f) = p.file_name() { diff --git a/src/parse/lexer.rs b/src/parse/lexer.rs index f043b3c1..eb84ac34 100644 --- a/src/parse/lexer.rs +++ b/src/parse/lexer.rs @@ -3,6 +3,8 @@ use std::ffi::OsString; pub use std::io::SeekFrom; +use os_str_bytes::RawOsStr; + #[derive(Default, Clone, Debug, PartialEq, Eq)] pub(crate) struct RawArgs { items: Vec<OsString>, @@ -13,13 +15,21 @@ impl RawArgs { ArgCursor::new() } - pub fn next(&self, cursor: &mut ArgCursor) -> Option<&OsStr> { + pub fn next(&self, cursor: &mut ArgCursor) -> Option<ParsedArg<'_>> { + self.next_os(cursor).map(ParsedArg::new) + } + + pub fn next_os(&self, cursor: &mut ArgCursor) -> Option<&OsStr> { let next = self.items.get(cursor.cursor).map(|s| s.as_os_str()); cursor.cursor = 
cursor.cursor.saturating_add(1); next } - pub fn peek(&self, cursor: &ArgCursor) -> Option<&OsStr> { + pub fn peek(&self, cursor: &ArgCursor) -> Option<ParsedArg<'_>> { + self.peek_os(cursor).map(ParsedArg::new) + } + + pub fn peek_os(&self, cursor: &ArgCursor) -> Option<&OsStr> { self.items.get(cursor.cursor).map(|s| s.as_os_str()) } @@ -60,7 +70,7 @@ where } } -#[derive(Default, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Default, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub(crate) struct ArgCursor { cursor: usize, } @@ -70,3 +80,182 @@ impl ArgCursor { Default::default() } } + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(crate) struct ParsedArg<'s> { + inner: std::borrow::Cow<'s, RawOsStr>, + utf8: Option<&'s str>, +} + +impl<'s> ParsedArg<'s> { + fn new(inner: &'s OsStr) -> Self { + let utf8 = inner.to_str(); + let inner = RawOsStr::new(inner); + Self { inner, utf8 } + } + + pub fn is_stdio(&self) -> bool { + self.inner.as_ref() == "-" + } + + pub fn is_escape(&self) -> bool { + self.inner.as_ref() == "--" + } + + pub fn is_number(&self) -> bool { + self.to_value() + .map(|s| s.parse::<f64>().is_ok()) + .unwrap_or_default() + } + + /// Treat as a long-flag + /// + /// **NOTE:** May return an empty flag. Check [`ParsedArg::is_escape`] to separately detect `--`. + pub fn to_long(&self) -> Option<(&RawOsStr, Option<&RawOsStr>)> { + let remainder = self.inner.as_ref().strip_prefix("--")?; + let parts = if let Some((p0, p1)) = remainder.split_once("=") { + (p0, Some(p1)) + } else { + (remainder, None) + }; + Some(parts) + } + + /// Can treat as a long-flag + /// + /// **NOTE:** May return an empty flag. Check [`ParsedArg::is_escape`] to separately detect `--`. + pub fn is_long(&self) -> bool { + self.inner.as_ref().starts_with("--") + } + + /// Treat as a short-flag + /// + /// **NOTE:** May return an empty flag. Check [`ParsedArg::is_stdio`] to separately detect + /// `-`. 
+ pub fn to_short(&self) -> Option<ShortFlags<'_>> { + if let Some(remainder_os) = self.inner.as_ref().strip_prefix('-') { + if remainder_os.starts_with('-') { + None + } else { + let remainder = self.utf8.map(|s| &s[1..]); + Some(ShortFlags::new(remainder_os, remainder)) + } + } else { + None + } + } + + /// Can treat as a short-flag + /// + /// **NOTE:** May return an empty flag. Check [`ParsedArg::is_stdio`] to separately detect + /// `-`. + pub fn is_short(&self) -> bool { + self.inner.as_ref().starts_with('-') && !self.is_long() + } + + /// Treat as a value + /// + /// **NOTE:** May return a flag or an escape. + pub fn to_value_os(&self) -> &RawOsStr { + self.inner.as_ref() + } + + /// Treat as a value + /// + /// **NOTE:** May return a flag or an escape. + pub fn to_value(&self) -> Option<&str> { + self.utf8 + } + + /// Safely print an argument that may contain non-UTF8 content + /// + /// This may perform lossy conversion, depending on the platform. If you would like an implementation which escapes the path please use Debug instead. 
+ pub fn display(&self) -> impl std::fmt::Display + '_ { + self.inner.to_str_lossy() + } +} + +#[derive(Clone, Debug)] +pub(crate) struct ShortFlags<'s> { + inner: &'s RawOsStr, + utf8_prefix: std::str::CharIndices<'s>, + invalid_suffix: Option<&'s RawOsStr>, +} + +impl<'s> ShortFlags<'s> { + fn new(inner: &'s RawOsStr, utf8: Option<&'s str>) -> Self { + let (utf8_prefix, invalid_suffix) = if let Some(utf8) = utf8 { + (utf8, None) + } else { + split_nonutf8_once(inner) + }; + let utf8_prefix = utf8_prefix.char_indices(); + Self { + inner, + utf8_prefix, + invalid_suffix, + } + } + + pub fn advance_by(&mut self, n: usize) -> Result<(), usize> { + for i in 0..n { + self.next().ok_or(i)?.map_err(|_| i)?; + } + Ok(()) + } + + pub fn is_empty(&self) -> bool { + self.invalid_suffix.is_none() && self.utf8_prefix.as_str().is_empty() + } + + pub fn is_number(&self) -> bool { + self.invalid_suffix.is_none() && self.utf8_prefix.as_str().parse::<f64>().is_ok() + } + + pub fn next(&mut self) -> Option<Result<char, &'s RawOsStr>> { + if let Some((_, flag)) = self.utf8_prefix.next() { + return Some(Ok(flag)); + } + + if let Some(suffix) = self.invalid_suffix { + self.invalid_suffix = None; + return Some(Err(suffix)); + } + + None + } + + pub fn value_os(&mut self) -> Option<&'s RawOsStr> { + if let Some((index, _)) = self.utf8_prefix.next() { + self.utf8_prefix = "".char_indices(); + self.invalid_suffix = None; + return Some(&self.inner[index..]); + } + + if let Some(suffix) = self.invalid_suffix { + self.invalid_suffix = None; + return Some(suffix); + } + + None + } +} + +impl<'s> Iterator for ShortFlags<'s> { + type Item = Result<char, &'s RawOsStr>; + + fn next(&mut self) -> Option<Self::Item> { + self.next() + } +} + +fn split_nonutf8_once(b: &RawOsStr) -> (&str, Option<&RawOsStr>) { + match std::str::from_utf8(b.as_raw_bytes()) { + Ok(s) => (s, None), + Err(err) => { + let (valid, after_valid) = b.split_at(err.valid_up_to()); + let valid = std::str::from_utf8(valid.as_raw_bytes()).unwrap(); + (valid, Some(after_valid)) + } + } +} diff 
--git a/src/parse/parser.rs b/src/parse/parser.rs index ed137ef7..3ceadf31 100644 --- a/src/parse/parser.rs +++ b/src/parse/parser.rs @@ -90,7 +90,7 @@ impl<'help, 'cmd> Parser<'help, 'cmd> { // If any arg sets .last(true) let contains_last = self.cmd.get_arguments().any(|x| x.is_last_set()); - while let Some(arg_os) = raw_args.next(&mut args_cursor) { + while let Some(arg_os) = raw_args.next_os(&mut args_cursor) { // Recover the replaced items if any. if let Some(replaced_items) = arg_os.to_str().and_then(|a| self.cmd.get_replacement(a)) { @@ -140,7 +140,7 @@ impl<'help, 'cmd> Parser<'help, 'cmd> { ); if low_index_mults || missing_pos { - let skip_current = if let Some(n) = raw_args.peek(&args_cursor) { + let skip_current = if let Some(n) = raw_args.peek_os(&args_cursor) { if let Some(p) = self .cmd .get_positionals()