4276: Don't count start of non-ASCII characters as being inside of them r=matklad a=lnicola

I'm still not sure that `utf16_to_utf8_col` is correct for code points from the Supplementary Planes. These take two UTF-16 code units each, and I suspect we don't count them correctly.
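
To make that concern concrete, here is a small standalone check (not from this PR) of the code-unit counts involved; `𐐀` (U+10400) stands in for any Supplementary Plane character:

```rust
fn main() {
    let bmp = 'メ';    // U+30E1, Basic Multilingual Plane
    let astral = '𐐀';  // U+10400, Supplementary Plane (Deseret)

    // (UTF-8 bytes, UTF-16 code units) per character.
    assert_eq!((bmp.len_utf8(), bmp.len_utf16()), (3, 1));
    assert_eq!((astral.len_utf8(), astral.len_utf16()), (4, 2));
}
```

Any per-character bookkeeping that assumes one UTF-16 code unit per non-ASCII character is therefore off by one for every Supplementary Plane character on the line.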

Fixes the crash in https://github.com/rust-analyzer/rust-analyzer/issues/4263#issuecomment-622988258.

Co-authored-by: Laurențiu Nicola <lnicola@dend.ro>
commit 682c079043
Author: bors[bot]
Date:   2020-05-03 08:57:02 +00:00 (committed by GitHub)

@@ -8,7 +8,9 @@ use superslice::Ext;
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub struct LineIndex {
+    /// Offset of the beginning of each line, zero-based
     pub(crate) newlines: Vec<TextSize>,
+    /// List of non-ASCII characters on each line
     pub(crate) utf16_lines: FxHashMap<u32, Vec<Utf16Char>>,
 }
@ -22,7 +24,9 @@ pub struct LineCol {
#[derive(Clone, Debug, Hash, PartialEq, Eq)] #[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub(crate) struct Utf16Char { pub(crate) struct Utf16Char {
/// Start offset of a character inside a line, zero-based
pub(crate) start: TextSize, pub(crate) start: TextSize,
/// End offset of a character inside a line, zero-based
pub(crate) end: TextSize, pub(crate) end: TextSize,
} }
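
The fields documented above only record positions: `utf16_lines` maps a line number to the non-ASCII characters on that line, each described by its UTF-8 byte range within the line. A simplified sketch of how such a table could be built follows; this is an illustration only, using `usize` and `std::collections::HashMap` in place of `TextSize` and `FxHashMap`, and `collect_utf16_chars` is a hypothetical helper, not a rust-analyzer API:

```rust
use std::collections::HashMap;

/// Simplified stand-in for `Utf16Char`: the UTF-8 byte range of one
/// non-ASCII character, relative to the start of its line.
#[derive(Debug, PartialEq)]
struct Utf16Char {
    start: usize,
    end: usize,
}

/// Build a line-number -> non-ASCII-characters table, mirroring the idea
/// behind `LineIndex::utf16_lines` (not its exact implementation).
fn collect_utf16_chars(text: &str) -> HashMap<u32, Vec<Utf16Char>> {
    let mut map: HashMap<u32, Vec<Utf16Char>> = HashMap::new();
    for (line_no, line) in text.lines().enumerate() {
        for (byte_idx, ch) in line.char_indices() {
            if !ch.is_ascii() {
                map.entry(line_no as u32).or_default().push(Utf16Char {
                    start: byte_idx,
                    end: byte_idx + ch.len_utf8(),
                });
            }
        }
    }
    map
}

fn main() {
    let text = "\nconst C: char = \"メ メ\";\n";
    // Line 1 holds the two メ characters, at bytes 17..20 and 21..24.
    assert_eq!(
        collect_utf16_chars(text)[&1],
        vec![Utf16Char { start: 17, end: 20 }, Utf16Char { start: 21, end: 24 }]
    );
}
```
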
@@ -120,7 +124,7 @@ impl LineIndex {
     fn utf16_to_utf8_col(&self, line: u32, mut col: u32) -> TextSize {
         if let Some(utf16_chars) = self.utf16_lines.get(&line) {
            for c in utf16_chars {
-                if col >= u32::from(c.start) {
+                if col > u32::from(c.start) {
                     col += u32::from(c.len()) - 1;
                 } else {
                     // From here on, all utf16 characters come *after* the character we are mapping,
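
This one-character change is the actual fix: with `>=`, a UTF-16 column pointing exactly at the start of a non-ASCII character was already widened by that character's extra UTF-8 bytes, so the resulting offset landed inside its encoding, which is what triggered the crash linked above. Below is a standalone sketch of the same loop, with plain integers in place of `TextSize` and assuming `Utf16Char::len()` is the character's UTF-8 byte length (`end - start`):

```rust
/// Sketch of the UTF-16 -> UTF-8 column conversion for one line.
/// `chars` lists the non-ASCII characters as (utf8_start, utf8_end) pairs,
/// sorted by position; each is assumed to be a single UTF-16 code unit
/// (the Supplementary Plane question from the PR description still applies).
fn utf16_to_utf8_col(chars: &[(u32, u32)], mut col: u32) -> u32 {
    for &(start, end) in chars {
        // The fix: shift only if the column is *strictly past* the start of
        // the character. A column equal to `start` points at the character
        // itself and must not be pushed into the middle of its encoding.
        if col > start {
            col += (end - start) - 1; // UTF-8 length minus the one UTF-16 unit
        } else {
            // All remaining characters start after `col`; nothing more to add.
            break;
        }
    }
    col
}

fn main() {
    // Non-ASCII characters of `const C: char = "メ メ";`: メ at 17..20 and 21..24.
    let chars = [(17, 20), (21, 24)];
    assert_eq!(utf16_to_utf8_col(&chars, 17), 17); // start of the first メ
    assert_eq!(utf16_to_utf8_col(&chars, 18), 20); // the space between them
    assert_eq!(utf16_to_utf8_col(&chars, 19), 21); // start of the second メ
}
```
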
@@ -226,8 +230,10 @@ const C: char = \"メ メ\";
         // UTF-16 to UTF-8
         assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
-        assert_eq!(col_index.utf16_to_utf8_col(1, 18), TextSize::from(20));
-        assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(23));
+
+        // メ UTF-8: 0xE3 0x83 0xA1, UTF-16: 0x30E1
+        assert_eq!(col_index.utf16_to_utf8_col(1, 17), TextSize::from(17)); // first メ at 17..20
+        assert_eq!(col_index.utf16_to_utf8_col(1, 18), TextSize::from(20)); // space
+        assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21)); // second メ at 21..24
+
         assert_eq!(col_index.utf16_to_utf8_col(2, 15), TextSize::from(15));
     }
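
The encodings quoted in the new test comment can be verified directly with a small standalone snippet (again, not part of the PR):

```rust
fn main() {
    // メ is U+30E1: three bytes in UTF-8, a single UTF-16 code unit.
    let mut buf8 = [0u8; 4];
    assert_eq!('メ'.encode_utf8(&mut buf8).as_bytes(), &[0xE3, 0x83, 0xA1][..]);

    let mut buf16 = [0u16; 2];
    assert_eq!(&'メ'.encode_utf16(&mut buf16)[..], &[0x30E1][..]);
}
```

The byte offsets in the comments follow from the line itself: the ASCII prefix `const C: char = "` occupies bytes 0..17, so the first メ sits at 17..20, the space at 20, and the second メ at 21..24.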