Merge pull request #4802 from epage/osstr

fix(lex): Deprecate unsound `OsStrExt::split_at`
2024-12-13 06:12:40 +00:00 · 2023-03-28 10:00:48 -05:00 · 2023-03-28 10:00:48 -05:00 · 4b180f8cd0
commit 4b180f8cd0
parent 53cb165b3f 48dc66f652
2 changed files with 60 additions and 23 deletions
--- a/clap_lex/src/ext.rs
+++ b/clap_lex/src/ext.rs
@ -193,6 +193,7 @@ pub trait OsStrExt: private::Sealed {
    /// assert_eq!("Per", first);
    /// assert_eq!(" Martin-Löf", last);
    /// ```
    #[deprecated(since = "4.1.0", note = "This is not sound for all `index`")]
    fn split_at(&self, index: usize) -> (&OsStr, &OsStr);
    /// Splits the string on the first occurrence of the specified delimiter and
    /// returns prefix before delimiter and suffix after delimiter.
@ -212,7 +213,8 @@ pub trait OsStrExt: private::Sealed {
 impl OsStrExt for OsStr {
    fn try_str(&self) -> Result<&str, std::str::Utf8Error> {
-        let bytes = to_bytes(self);
+        // SAFETY: Only interacting with `OsStr` as `&str
        let bytes = unsafe { to_bytes(self) };
        std::str::from_utf8(bytes)
    }
@ -221,17 +223,24 @@ impl OsStrExt for OsStr {
    }
    fn find(&self, needle: &str) -> Option<usize> {
        // SAFETY: Only interacting with `OsStr` as `&str
        let bytes = unsafe { to_bytes(self) };
        (0..=self.len().checked_sub(needle.len())?)
-            .find(|&x| to_bytes(self)[x..].starts_with(needle.as_bytes()))
+            .find(|&x| bytes[x..].starts_with(needle.as_bytes()))
    }
    fn strip_prefix(&self, prefix: &str) -> Option<&OsStr> {
-        to_bytes(self)
+        // SAFETY: Only interacting with `OsStr` as `&str
-            .strip_prefix(prefix.as_bytes())
+        let bytes = unsafe { to_bytes(self) };
-            .map(to_os_str)
+        bytes.strip_prefix(prefix.as_bytes()).map(|s| {
            // SAFETY: Only interacting with `OsStr` as `&str
            unsafe { to_os_str(s) }
        })
    }
    fn starts_with(&self, prefix: &str) -> bool {
-        to_bytes(self).starts_with(prefix.as_bytes())
+        // SAFETY: Only interacting with `OsStr` as `&str
        let bytes = unsafe { to_bytes(self) };
        bytes.starts_with(prefix.as_bytes())
    }
    fn split<'s, 'n>(&'s self, needle: &'n str) -> Split<'s, 'n> {
@ -243,17 +252,24 @@ impl OsStrExt for OsStr {
    }
    fn split_at(&self, index: usize) -> (&OsStr, &OsStr) {
-        let (first, second) = to_bytes(self).split_at(index);
+        // BUG: This is unsafe and has been deprecated
-        (to_os_str(first), to_os_str(second))
+        unsafe {
            let bytes = to_bytes(self);
            let (first, second) = bytes.split_at(index);
            (to_os_str(first), to_os_str(second))
        }
    }
    fn split_once(&self, needle: &'_ str) -> Option<(&OsStr, &OsStr)> {
        let start = self.find(needle)?;
        let end = start + needle.len();
-        let haystack = to_bytes(self);
+        // SAFETY: Only interacting with `OsStr` as `&str
-        let first = &haystack[0..start];
+        unsafe {
-        let second = &haystack[end..];
+            let haystack = to_bytes(self);
-        Some((to_os_str(first), to_os_str(second)))
+            let first = &haystack[0..start];
            let second = &haystack[end..];
            Some((to_os_str(first), to_os_str(second)))
        }
    }
 }
@ -265,33 +281,40 @@ mod private {
 /// Allow access to raw bytes
 ///
-/// **Note:** the bytes only make sense when compared with ASCII or `&str`
+/// # Safety
 ///
-/// **Note:** This must never be serialized as there is no guarantee at how invalid UTF-8 will be
+/// - The bytes only make sense when compared with ASCII or `&str`
-/// encoded, even within the same version of this crate (since its dependent on rustc version)
+/// - This must never be serialized as there is no guarantee at how invalid UTF-8 will be
-fn to_bytes(s: &OsStr) -> &[u8] {
+///   encoded, even within the same version of this crate (since its dependent on rustc version)
 unsafe fn to_bytes(s: &OsStr) -> &[u8] {
    // SAFETY:
    // - Lifetimes are the same
-    // - Types are compatible (`OsStr` is a transparent wrapper for `[u8]`)
+    // - Types are compatible (`OsStr` is effectively a transparent wrapper for `[u8]`)
    // - The primary contract is that the encoding for invalid surrogate code points is not
    //   guaranteed which isn't a problem here
    //
    // There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290)
    // but its in limbo
-    unsafe { std::mem::transmute(s) }
+    std::mem::transmute(s)
 }
 /// Restore raw bytes as `OsStr`
-fn to_os_str(s: &[u8]) -> &OsStr {
+///
 /// # Safety
 ///
 /// - The bytes only make sense when compared with ASCII or `&str`
 /// - This must never be serialized as there is no guarantee at how invalid UTF-8 will be
 ///   encoded, even within the same version of this crate (since its dependent on rustc version)
 unsafe fn to_os_str(s: &[u8]) -> &OsStr {
    // SAFETY:
    // - Lifetimes are the same
-    // - Types are compatible (`OsStr` is a transparent wrapper for `[u8]`)
+    // - Types are compatible (`OsStr` is effectively a transparent wrapper for `[u8]`)
    // - The primary contract is that the encoding for invalid surrogate code points is not
    //   guaranteed which isn't a problem here
    //
    // There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290)
    // but its in limbo
-    unsafe { std::mem::transmute(s) }
+    std::mem::transmute(s)
 }
 pub struct Split<'s, 'n> {
@ -319,3 +342,14 @@ impl<'s, 'n> Iterator for Split<'s, 'n> {
        }
    }
 }
 /// Split an `OsStr`
 ///
 /// # Safety
 ///
 /// `index` must be at a valid UTF-8 boundary
 pub(crate) unsafe fn split_at(os: &OsStr, index: usize) -> (&OsStr, &OsStr) {
    let bytes = to_bytes(os);
    let (first, second) = bytes.split_at(index);
    (to_os_str(first), to_os_str(second))
 }
--- a/clap_lex/src/lib.rs
+++ b/clap_lex/src/lib.rs
@ -433,7 +433,9 @@ impl<'s> ShortFlags<'s> {
        if let Some((index, _)) = self.utf8_prefix.next() {
            self.utf8_prefix = "".char_indices();
            self.invalid_suffix = None;
-            return Some(self.inner.split_at(index).1);
+            // SAFETY: `char_indices` ensures `index` is at a valid UTF-8 boundary
            let remainder = unsafe { ext::split_at(self.inner, index).1 };
            return Some(remainder);
        }
        if let Some(suffix) = self.invalid_suffix {
@ -457,7 +459,8 @@ fn split_nonutf8_once(b: &OsStr) -> (&str, Option<&OsStr>) {
    match b.try_str() {
        Ok(s) => (s, None),
        Err(err) => {
-            let (valid, after_valid) = b.split_at(err.valid_up_to());
+            // SAFETY: `char_indices` ensures `index` is at a valid UTF-8 boundary
            let (valid, after_valid) = unsafe { ext::split_at(b, err.valid_up_to()) };
            let valid = valid.try_str().unwrap();
            (valid, Some(after_valid))
        }