From 59a3e42ac9d6ad36b4af3732b46852a61647540c Mon Sep 17 00:00:00 2001 From: oxalica Date: Tue, 18 Jul 2023 18:42:02 +0800 Subject: [PATCH] Fix unescaping of C string literals --- crates/hir-def/src/body/pretty.rs | 2 +- crates/hir-def/src/hir.rs | 2 +- crates/hir-ty/src/mir/lower.rs | 1 - .../test_data/highlight_strings.html | 5 +- crates/ide/src/syntax_highlighting/tests.rs | 5 +- crates/syntax/src/ast/token_ext.rs | 55 ++++++++++++++----- 6 files changed, 49 insertions(+), 21 deletions(-) diff --git a/crates/hir-def/src/body/pretty.rs b/crates/hir-def/src/body/pretty.rs index 0c6cf0b49a..eeaed87164 100644 --- a/crates/hir-def/src/body/pretty.rs +++ b/crates/hir-def/src/body/pretty.rs @@ -634,7 +634,7 @@ impl Printer<'_> { match literal { Literal::String(it) => w!(self, "{:?}", it), Literal::ByteString(it) => w!(self, "\"{}\"", it.escape_ascii()), - Literal::CString(it) => w!(self, "\"{}\\0\"", it), + Literal::CString(it) => w!(self, "\"{}\\0\"", it.escape_ascii()), Literal::Char(it) => w!(self, "'{}'", it.escape_debug()), Literal::Bool(it) => w!(self, "{}", it), Literal::Int(i, suffix) => { diff --git a/crates/hir-def/src/hir.rs b/crates/hir-def/src/hir.rs index 500e880061..8a140a1ec1 100644 --- a/crates/hir-def/src/hir.rs +++ b/crates/hir-def/src/hir.rs @@ -85,7 +85,7 @@ impl fmt::Display for FloatTypeWrapper { pub enum Literal { String(Box), ByteString(Box<[u8]>), - CString(Box), + CString(Box<[u8]>), Char(char), Bool(bool), Int(i128, Option), diff --git a/crates/hir-ty/src/mir/lower.rs b/crates/hir-ty/src/mir/lower.rs index 3610858790..8da12f9e39 100644 --- a/crates/hir-ty/src/mir/lower.rs +++ b/crates/hir-ty/src/mir/lower.rs @@ -1355,7 +1355,6 @@ impl<'ctx> MirLowerCtx<'ctx> { return Ok(Operand::from_concrete_const(data, mm, ty)); } hir_def::hir::Literal::CString(b) => { - let b = b.as_bytes(); let bytes = b.iter().copied().chain(iter::once(0)).collect::>(); let mut data = Vec::with_capacity(mem::size_of::() * 2); diff --git a/crates/ide/src/syntax_highlighting/test_data/highlight_strings.html b/crates/ide/src/syntax_highlighting/test_data/highlight_strings.html index 061329d239..33523e4af7 100644 --- a/crates/ide/src/syntax_highlighting/test_data/highlight_strings.html +++ b/crates/ide/src/syntax_highlighting/test_data/highlight_strings.html @@ -161,8 +161,9 @@ pre { color: #DCDCCC; background: #3F3F3F; font-size: 22px; padd println!("Hello\nWorld"); println!("\u{48}\x65\x6C\x6C\x6F World"); - let _ = "\x28\x28\x00\x63\xFF\n"; // invalid non-UTF8 escape sequences - let _ = b"\x28\x28\x00\x63\xFF\n"; // valid bytes + let _ = "\x28\x28\x00\x63\xFF\u{FF}\n"; // invalid non-UTF8 escape sequences + let _ = b"\x28\x28\x00\x63\xFF\u{FF}\n"; // valid bytes, invalid unicodes + let _ = c"\u{FF}\xFF"; // valid bytes, valid unicodes let backslash = r"\\"; println!("{\x41}", A = 92); diff --git a/crates/ide/src/syntax_highlighting/tests.rs b/crates/ide/src/syntax_highlighting/tests.rs index 80a49bcaa3..696aa59002 100644 --- a/crates/ide/src/syntax_highlighting/tests.rs +++ b/crates/ide/src/syntax_highlighting/tests.rs @@ -507,8 +507,9 @@ fn main() { println!("Hello\nWorld"); println!("\u{48}\x65\x6C\x6C\x6F World"); - let _ = "\x28\x28\x00\x63\xFF\n"; // invalid non-UTF8 escape sequences - let _ = b"\x28\x28\x00\x63\xFF\n"; // valid bytes + let _ = "\x28\x28\x00\x63\xFF\u{FF}\n"; // invalid non-UTF8 escape sequences + let _ = b"\x28\x28\x00\x63\xFF\u{FF}\n"; // valid bytes, invalid unicodes + let _ = c"\u{FF}\xFF"; // valid bytes, valid unicodes let backslash = r"\\"; println!("{\x41}", A = 92); diff --git a/crates/syntax/src/ast/token_ext.rs b/crates/syntax/src/ast/token_ext.rs index aa8c9bbc0f..87fd51d703 100644 --- a/crates/syntax/src/ast/token_ext.rs +++ b/crates/syntax/src/ast/token_ext.rs @@ -2,7 +2,9 @@ use std::borrow::Cow; -use rustc_lexer::unescape::{unescape_byte, unescape_char, unescape_literal, Mode}; +use rustc_lexer::unescape::{ + unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, Mode, +}; use crate::{ ast::{self, AstToken}, @@ -285,45 +287,70 @@ impl ast::ByteString { impl IsString for ast::CString { const RAW_PREFIX: &'static str = "cr"; - // XXX: `Mode::CStr` is not supported by `unescape_literal` of ra-ap-rustc_lexer yet. - // Here we pretend it to be a byte string. - const MODE: Mode = Mode::ByteStr; + const MODE: Mode = Mode::CStr; + + fn escaped_char_ranges( + &self, + cb: &mut dyn FnMut(TextRange, Result), + ) { + let text_range_no_quotes = match self.text_range_between_quotes() { + Some(it) => it, + None => return, + }; + + let start = self.syntax().text_range().start(); + let text = &self.text()[text_range_no_quotes - start]; + let offset = text_range_no_quotes.start() - start; + + unescape_c_string(text, Self::MODE, &mut |range, unescaped_char| { + let text_range = + TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); + // XXX: This method should only be used for highlighting ranges. The unescaped + // char/byte is not used. For simplicity, we return an arbitrary placeholder char. + cb(text_range + offset, unescaped_char.map(|_| ' ')); + }); + } } impl ast::CString { - pub fn value(&self) -> Option> { + pub fn value(&self) -> Option> { if self.is_raw() { let text = self.text(); let text = &text[self.text_range_between_quotes()? - self.syntax().text_range().start()]; - return Some(Cow::Borrowed(text)); + return Some(Cow::Borrowed(text.as_bytes())); } let text = self.text(); let text = &text[self.text_range_between_quotes()? - self.syntax().text_range().start()]; - let mut buf = String::new(); + let mut buf = Vec::new(); let mut prev_end = 0; let mut has_error = false; - unescape_literal(text, Self::MODE, &mut |char_range, unescaped_char| match ( - unescaped_char, + let mut char_buf = [0u8; 4]; + let mut extend_unit = |buf: &mut Vec, unit: CStrUnit| match unit { + CStrUnit::Byte(b) => buf.push(b), + CStrUnit::Char(c) => buf.extend(c.encode_utf8(&mut char_buf).as_bytes()), + }; + unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match ( + unescaped, buf.capacity() == 0, ) { - (Ok(c), false) => buf.push(c), + (Ok(u), false) => extend_unit(&mut buf, u), (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { prev_end = char_range.end } - (Ok(c), true) => { + (Ok(u), true) => { buf.reserve_exact(text.len()); - buf.push_str(&text[..prev_end]); - buf.push(c); + buf.extend(text[..prev_end].as_bytes()); + extend_unit(&mut buf, u); } (Err(_), _) => has_error = true, }); match (has_error, buf.capacity() == 0) { (true, _) => None, - (false, true) => Some(Cow::Borrowed(text)), + (false, true) => Some(Cow::Borrowed(text.as_bytes())), (false, false) => Some(Cow::Owned(buf)), } }