Mirror of https://github.com/rust-lang/rust-analyzer, synced 2024-12-29 14:33:29 +00:00
Auto merge of #14141 - matklad:utf-32, r=lnicola
Support UTF-32 position encoding. Looks like this is a native encoding for Emacs, at least!
Commit: 31486a639d
18 changed files with 210 additions and 158 deletions
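For context (not part of this commit), here is a minimal sketch of why the column math differs between encodings. The standalone `WideEncoding` enum and `utf8_to_wide_col` function below only mirror the names the diff introduces on `LineIndex`; a supplementary-plane character such as '𐐏' occupies 4 UTF-8 bytes, 2 UTF-16 code units, but only 1 UTF-32 code unit.

```rust
// Illustrative sketch only: convert a UTF-8 byte column within one line of
// text into a UTF-16 or UTF-32 column, the same kind of mapping LineIndex does.
#[derive(Clone, Copy)]
enum WideEncoding {
    Utf16,
    Utf32,
}

fn utf8_to_wide_col(line: &str, utf8_col: usize, enc: WideEncoding) -> usize {
    let mut wide_col = 0;
    for (offset, c) in line.char_indices() {
        if offset >= utf8_col {
            break;
        }
        wide_col += match enc {
            WideEncoding::Utf16 => c.len_utf16(), // 1 or 2 code units per char
            WideEncoding::Utf32 => 1,             // always one code point
        };
    }
    wide_col
}

fn main() {
    let line = "a𐐏b"; // 'a' (1 byte), '𐐏' (4 bytes), so 'b' starts at byte offset 5
    assert_eq!(utf8_to_wide_col(line, 5, WideEncoding::Utf16), 3); // 'a' + 2 units
    assert_eq!(utf8_to_wide_col(line, 5, WideEncoding::Utf32), 2); // 'a' + 1 code point
}
```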
Cargo.lock (generated, 1 change)
@@ -711,6 +711,7 @@ dependencies = [
"limit",
"memchr",
"once_cell",
"oorandom",
"parser",
"profile",
"rayon",

@@ -37,8 +37,9 @@ text-edit.workspace = true
hir.workspace = true

[dev-dependencies]
xshell = "0.2.2"
expect-test = "1.4.0"
oorandom = "11.1.3"
xshell = "0.2.2"

# local deps
test-utils.workspace = true

@@ -7,20 +7,13 @@ use syntax::{TextRange, TextSize};

#[derive(Clone, Debug, PartialEq, Eq)]
pub struct LineIndex {
/// Offset the the beginning of each line, zero-based
/// Offset the beginning of each line, zero-based.
pub(crate) newlines: Vec<TextSize>,
/// List of non-ASCII characters on each line
pub(crate) utf16_lines: NoHashHashMap<u32, Vec<Utf16Char>>,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct LineColUtf16 {
/// Zero-based
pub line: u32,
/// Zero-based
pub col: u32,
/// List of non-ASCII characters on each line.
pub(crate) line_wide_chars: NoHashHashMap<u32, Vec<WideChar>>,
}

/// Line/Column information in native, utf8 format.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct LineCol {
/// Zero-based

@@ -29,34 +22,57 @@ pub struct LineCol {
pub col: u32,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum WideEncoding {
Utf16,
Utf32,
}

/// Line/Column information in legacy encodings.
///
/// Deliberately not a generic type and different from `LineCol`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct WideLineCol {
/// Zero-based
pub line: u32,
/// Zero-based
pub col: u32,
}

#[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub(crate) struct Utf16Char {
pub(crate) struct WideChar {
/// Start offset of a character inside a line, zero-based
pub(crate) start: TextSize,
/// End offset of a character inside a line, zero-based
pub(crate) end: TextSize,
}

impl Utf16Char {
impl WideChar {
/// Returns the length in 8-bit UTF-8 code units.
fn len(&self) -> TextSize {
self.end - self.start
}

/// Returns the length in 16-bit UTF-16 code units.
fn len_utf16(&self) -> usize {
if self.len() == TextSize::from(4) {
2
} else {
1
/// Returns the length in UTF-16 or UTF-32 code units.
fn wide_len(&self, enc: WideEncoding) -> usize {
match enc {
WideEncoding::Utf16 => {
if self.len() == TextSize::from(4) {
2
} else {
1
}
}

WideEncoding::Utf32 => 1,
}
}
}

impl LineIndex {
pub fn new(text: &str) -> LineIndex {
let mut utf16_lines = NoHashHashMap::default();
let mut utf16_chars = Vec::new();
let mut line_wide_chars = NoHashHashMap::default();
let mut wide_chars = Vec::new();

let mut newlines = Vec::with_capacity(16);
newlines.push(TextSize::from(0));

@@ -71,8 +87,8 @@ impl LineIndex {
newlines.push(curr_row);

// Save any utf-16 characters seen in the previous line
if !utf16_chars.is_empty() {
utf16_lines.insert(line, mem::take(&mut utf16_chars));
if !wide_chars.is_empty() {
line_wide_chars.insert(line, mem::take(&mut wide_chars));
}

// Prepare for processing the next line

@@ -82,18 +98,18 @@
}

if !c.is_ascii() {
utf16_chars.push(Utf16Char { start: curr_col, end: curr_col + c_len });
wide_chars.push(WideChar { start: curr_col, end: curr_col + c_len });
}

curr_col += c_len;
}

// Save any utf-16 characters seen in the last line
if !utf16_chars.is_empty() {
utf16_lines.insert(line, utf16_chars);
if !wide_chars.is_empty() {
line_wide_chars.insert(line, wide_chars);
}

LineIndex { newlines, utf16_lines }
LineIndex { newlines, line_wide_chars }
}

pub fn line_col(&self, offset: TextSize) -> LineCol {

@@ -109,13 +125,13 @@ impl LineIndex {
.map(|offset| offset + TextSize::from(line_col.col))
}

pub fn to_utf16(&self, line_col: LineCol) -> LineColUtf16 {
let col = self.utf8_to_utf16_col(line_col.line, line_col.col.into());
LineColUtf16 { line: line_col.line, col: col as u32 }
pub fn to_wide(&self, enc: WideEncoding, line_col: LineCol) -> WideLineCol {
let col = self.utf8_to_wide_col(enc, line_col.line, line_col.col.into());
WideLineCol { line: line_col.line, col: col as u32 }
}

pub fn to_utf8(&self, line_col: LineColUtf16) -> LineCol {
let col = self.utf16_to_utf8_col(line_col.line, line_col.col);
pub fn to_utf8(&self, enc: WideEncoding, line_col: WideLineCol) -> LineCol {
let col = self.wide_to_utf8_col(enc, line_col.line, line_col.col);
LineCol { line: line_col.line, col: col.into() }
}

@@ -132,12 +148,12 @@ impl LineIndex {
.filter(|it| !it.is_empty())
}

fn utf8_to_utf16_col(&self, line: u32, col: TextSize) -> usize {
fn utf8_to_wide_col(&self, enc: WideEncoding, line: u32, col: TextSize) -> usize {
let mut res: usize = col.into();
if let Some(utf16_chars) = self.utf16_lines.get(&line) {
for c in utf16_chars {
if let Some(wide_chars) = self.line_wide_chars.get(&line) {
for c in wide_chars {
if c.end <= col {
res -= usize::from(c.len()) - c.len_utf16();
res -= usize::from(c.len()) - c.wide_len(enc);
} else {
// From here on, all utf16 characters come *after* the character we are mapping,
// so we don't need to take them into account

@@ -148,11 +164,11 @@ impl LineIndex {
res
}

fn utf16_to_utf8_col(&self, line: u32, mut col: u32) -> TextSize {
if let Some(utf16_chars) = self.utf16_lines.get(&line) {
for c in utf16_chars {
fn wide_to_utf8_col(&self, enc: WideEncoding, line: u32, mut col: u32) -> TextSize {
if let Some(wide_chars) = self.line_wide_chars.get(&line) {
for c in wide_chars {
if col > u32::from(c.start) {
col += u32::from(c.len()) - c.len_utf16() as u32;
col += u32::from(c.len()) - c.wide_len(enc) as u32;
} else {
// From here on, all utf16 characters come *after* the character we are mapping,
// so we don't need to take them into account

@@ -167,6 +183,9 @@ impl LineIndex {

#[cfg(test)]
mod tests {
use test_utils::skip_slow_tests;

use super::WideEncoding::{Utf16, Utf32};
use super::*;

#[test]

@@ -210,67 +229,59 @@ mod tests {
const C: char = 'x';
",
);
assert_eq!(col_index.utf16_lines.len(), 0);
assert_eq!(col_index.line_wide_chars.len(), 0);
}

#[test]
fn test_single_char() {
let col_index = LineIndex::new(
"
const C: char = 'メ';
",
);
fn test_every_chars() {
if skip_slow_tests() {
return;
}

assert_eq!(col_index.utf16_lines.len(), 1);
assert_eq!(col_index.utf16_lines[&1].len(), 1);
assert_eq!(col_index.utf16_lines[&1][0], Utf16Char { start: 17.into(), end: 20.into() });
let text: String = {
let mut chars: Vec<char> = ((0 as char)..char::MAX).collect(); // Neat!
chars.extend("\n".repeat(chars.len() / 16).chars());
let mut rng = oorandom::Rand32::new(stdx::rand::seed());
stdx::rand::shuffle(&mut chars, |i| rng.rand_range(0..i as u32) as usize);
chars.into_iter().collect()
};
assert!(text.contains('💩')); // Sanity check.

// UTF-8 to UTF-16, no changes
assert_eq!(col_index.utf8_to_utf16_col(1, 15.into()), 15);
let line_index = LineIndex::new(&text);

// UTF-8 to UTF-16
assert_eq!(col_index.utf8_to_utf16_col(1, 22.into()), 20);
let mut lin_col = LineCol { line: 0, col: 0 };
let mut col_utf16 = 0;
let mut col_utf32 = 0;
for (offset, c) in text.char_indices() {
let got_offset = line_index.offset(lin_col).unwrap();
assert_eq!(usize::from(got_offset), offset);

// UTF-16 to UTF-8, no changes
assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
let got_lin_col = line_index.line_col(got_offset);
assert_eq!(got_lin_col, lin_col);

// UTF-16 to UTF-8
assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21));
for enc in [Utf16, Utf32] {
let wide_lin_col = line_index.to_wide(enc, lin_col);
let got_lin_col = line_index.to_utf8(enc, wide_lin_col);
assert_eq!(got_lin_col, lin_col);

let col_index = LineIndex::new("a𐐏b");
assert_eq!(col_index.utf16_to_utf8_col(0, 3), TextSize::from(5));
}
let want_col = match enc {
Utf16 => col_utf16,
Utf32 => col_utf32,
};
assert_eq!(wide_lin_col.col, want_col)
}

#[test]
fn test_string() {
let col_index = LineIndex::new(
"
const C: char = \"メ メ\";
",
);

assert_eq!(col_index.utf16_lines.len(), 1);
assert_eq!(col_index.utf16_lines[&1].len(), 2);
assert_eq!(col_index.utf16_lines[&1][0], Utf16Char { start: 17.into(), end: 20.into() });
assert_eq!(col_index.utf16_lines[&1][1], Utf16Char { start: 21.into(), end: 24.into() });

// UTF-8 to UTF-16
assert_eq!(col_index.utf8_to_utf16_col(1, 15.into()), 15);

assert_eq!(col_index.utf8_to_utf16_col(1, 21.into()), 19);
assert_eq!(col_index.utf8_to_utf16_col(1, 25.into()), 21);

assert!(col_index.utf8_to_utf16_col(2, 15.into()) == 15);

// UTF-16 to UTF-8
assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));

// メ UTF-8: 0xE3 0x83 0xA1, UTF-16: 0x30E1
assert_eq!(col_index.utf16_to_utf8_col(1, 17), TextSize::from(17)); // first メ at 17..20
assert_eq!(col_index.utf16_to_utf8_col(1, 18), TextSize::from(20)); // space
assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21)); // second メ at 21..24

assert_eq!(col_index.utf16_to_utf8_col(2, 15), TextSize::from(15));
if c == '\n' {
lin_col.line += 1;
lin_col.col = 0;
col_utf16 = 0;
col_utf32 = 0;
} else {
lin_col.col += c.len_utf8() as u32;
col_utf16 += c.len_utf16() as u32;
col_utf32 += 1;
}
}
}

#[test]

@@ -115,7 +115,7 @@ pub use ide_db::{
SourceRoot, SourceRootId,
},
label::Label,
line_index::{LineCol, LineColUtf16, LineIndex},
line_index::{LineCol, LineIndex},
search::{ReferenceCategory, SearchScope},
source_change::{FileSystemEdit, SourceChange},
symbol_index::Query,

@@ -18,7 +18,9 @@ pub(crate) fn shuffle_crate_graph(db: &mut RootDatabase) {
let crate_graph = db.crate_graph();

let mut shuffled_ids = crate_graph.iter().collect::<Vec<_>>();
shuffle(&mut shuffled_ids);

let mut rng = oorandom::Rand32::new(stdx::rand::seed());
stdx::rand::shuffle(&mut shuffled_ids, |i| rng.rand_range(0..i as u32) as usize);

let mut new_graph = CrateGraph::default();

@@ -52,21 +54,3 @@ pub(crate) fn shuffle_crate_graph(db: &mut RootDatabase) {

db.set_crate_graph_with_durability(Arc::new(new_graph), Durability::HIGH);
}

fn shuffle<T>(slice: &mut [T]) {
let mut rng = oorandom::Rand32::new(seed());

let mut remaining = slice.len() - 1;
while remaining > 0 {
let index = rng.rand_range(0..remaining as u32);
slice.swap(remaining, index as usize);
remaining -= 1;
}
}

fn seed() -> u64 {
use std::collections::hash_map::RandomState;
use std::hash::{BuildHasher, Hasher};

RandomState::new().build_hasher().finish()
}

@@ -1,4 +1,5 @@
//! Advertises the capabilities of the LSP Server.
use ide_db::line_index::WideEncoding;
use lsp_types::{
CallHierarchyServerCapability, ClientCapabilities, CodeActionKind, CodeActionOptions,
CodeActionProviderCapability, CodeLensOptions, CompletionOptions,

@@ -16,16 +17,19 @@ use lsp_types::{
use serde_json::json;

use crate::config::{Config, RustfmtConfig};
use crate::lsp_ext::supports_utf8;
use crate::line_index::PositionEncoding;
use crate::lsp_ext::negotiated_encoding;
use crate::semantic_tokens;

pub fn server_capabilities(config: &Config) -> ServerCapabilities {
ServerCapabilities {
position_encoding: if supports_utf8(config.caps()) {
Some(PositionEncodingKind::UTF8)
} else {
None
},
position_encoding: Some(match negotiated_encoding(config.caps()) {
PositionEncoding::Utf8 => PositionEncodingKind::UTF8,
PositionEncoding::Wide(wide) => match wide {
WideEncoding::Utf16 => PositionEncodingKind::UTF16,
WideEncoding::Utf32 => PositionEncodingKind::UTF32,
},
}),
text_document_sync: Some(TextDocumentSyncCapability::Options(TextDocumentSyncOptions {
open_close: Some(true),
change: Some(TextDocumentSyncKind::INCREMENTAL),

@@ -11,6 +11,7 @@ use ide::{
use ide_db::LineIndexDatabase;

use ide_db::base_db::salsa::{self, ParallelDatabase};
use ide_db::line_index::WideEncoding;
use lsp_types::{self, lsif};
use project_model::{CargoConfig, ProjectManifest, ProjectWorkspace};
use vfs::{AbsPathBuf, Vfs};

@@ -127,7 +128,7 @@ impl LsifManager<'_> {
let line_index = self.db.line_index(file_id);
let line_index = LineIndex {
index: line_index,
encoding: PositionEncoding::Utf16,
encoding: PositionEncoding::Wide(WideEncoding::Utf16),
endings: LineEndings::Unix,
};
let range_id = self.add_vertex(lsif::Vertex::Range {

@@ -249,7 +250,7 @@ impl LsifManager<'_> {
let line_index = self.db.line_index(file_id);
let line_index = LineIndex {
index: line_index,
encoding: PositionEncoding::Utf16,
encoding: PositionEncoding::Wide(WideEncoding::Utf16),
endings: LineEndings::Unix,
};
let result = folds

@@ -33,7 +33,7 @@ use crate::{
caps::completion_item_edit_resolve,
diagnostics::DiagnosticsMapConfig,
line_index::PositionEncoding,
lsp_ext::{self, supports_utf8, WorkspaceSymbolSearchKind, WorkspaceSymbolSearchScope},
lsp_ext::{self, negotiated_encoding, WorkspaceSymbolSearchKind, WorkspaceSymbolSearchScope},
};

mod patch_old_style;

@@ -999,11 +999,7 @@ impl Config {
}

pub fn position_encoding(&self) -> PositionEncoding {
if supports_utf8(&self.caps) {
PositionEncoding::Utf8
} else {
PositionEncoding::Utf16
}
negotiated_encoding(&self.caps)
}

fn experimental(&self, index: &'static str) -> bool {

@@ -3,6 +3,7 @@
use std::collections::HashMap;

use flycheck::{Applicability, DiagnosticLevel, DiagnosticSpan};
use ide_db::line_index::WideEncoding;
use itertools::Itertools;
use stdx::format_to;
use vfs::{AbsPath, AbsPathBuf};

@@ -95,7 +96,8 @@ fn position(
let mut char_offset = 0;
let len_func = match position_encoding {
PositionEncoding::Utf8 => char::len_utf8,
PositionEncoding::Utf16 => char::len_utf16,
PositionEncoding::Wide(WideEncoding::Utf16) => char::len_utf16,
PositionEncoding::Wide(WideEncoding::Utf32) => |_| 1,
};
for c in line.text.chars() {
char_offset += 1;

@@ -1,7 +1,10 @@
//! Conversion lsp_types types to rust-analyzer specific ones.
use anyhow::format_err;
use ide::{Annotation, AnnotationKind, AssistKind, LineCol, LineColUtf16};
use ide_db::base_db::{FileId, FilePosition, FileRange};
use ide::{Annotation, AnnotationKind, AssistKind, LineCol};
use ide_db::{
base_db::{FileId, FilePosition, FileRange},
line_index::WideLineCol,
};
use syntax::{TextRange, TextSize};
use vfs::AbsPathBuf;

@@ -26,9 +29,9 @@ pub(crate) fn vfs_path(url: &lsp_types::Url) -> Result<vfs::VfsPath> {
pub(crate) fn offset(line_index: &LineIndex, position: lsp_types::Position) -> Result<TextSize> {
let line_col = match line_index.encoding {
PositionEncoding::Utf8 => LineCol { line: position.line, col: position.character },
PositionEncoding::Utf16 => {
let line_col = LineColUtf16 { line: position.line, col: position.character };
line_index.index.to_utf8(line_col)
PositionEncoding::Wide(enc) => {
let line_col = WideLineCol { line: position.line, col: position.character };
line_index.index.to_utf8(enc, line_col)
}
};
let text_size =

@@ -7,9 +7,12 @@

use std::sync::Arc;

use ide_db::line_index::WideEncoding;

#[derive(Clone, Copy)]
pub enum PositionEncoding {
Utf8,
Utf16,
Wide(WideEncoding),
}

pub(crate) struct LineIndex {

@@ -2,6 +2,7 @@

use std::{collections::HashMap, path::PathBuf};

use ide_db::line_index::WideEncoding;
use lsp_types::request::Request;
use lsp_types::PositionEncodingKind;
use lsp_types::{

@@ -10,6 +11,8 @@ use lsp_types::{
};
use serde::{Deserialize, Serialize};

use crate::line_index::PositionEncoding;

pub enum AnalyzerStatus {}

impl Request for AnalyzerStatus {

@@ -481,16 +484,22 @@ pub(crate) enum CodeLensResolveData {
References(lsp_types::TextDocumentPositionParams),
}

pub fn supports_utf8(caps: &lsp_types::ClientCapabilities) -> bool {
match &caps.general {
Some(general) => general
.position_encodings
.as_deref()
.unwrap_or_default()
.iter()
.any(|it| it == &PositionEncodingKind::UTF8),
_ => false,
pub fn negotiated_encoding(caps: &lsp_types::ClientCapabilities) -> PositionEncoding {
let client_encodings = match &caps.general {
Some(general) => general.position_encodings.as_deref().unwrap_or_default(),
None => &[],
};

for enc in client_encodings {
if enc == &PositionEncodingKind::UTF8 {
return PositionEncoding::Utf8;
} else if enc == &PositionEncodingKind::UTF32 {
return PositionEncoding::Wide(WideEncoding::Utf32);
}
// NB: intentionally prefer just about anything else to utf-16.
}

PositionEncoding::Wide(WideEncoding::Utf16)
}

pub enum MoveItem {}

@@ -161,6 +161,7 @@ impl GlobalState {
}

pub(crate) fn apply_document_changes(
encoding: PositionEncoding,
file_contents: impl FnOnce() -> String,
mut content_changes: Vec<lsp_types::TextDocumentContentChangeEvent>,
) -> String {

@@ -192,9 +193,9 @@ pub(crate) fn apply_document_changes(
let mut line_index = LineIndex {
// the index will be overwritten in the bottom loop's first iteration
index: Arc::new(ide::LineIndex::new(&text)),
// We don't care about line endings or offset encoding here.
// We don't care about line endings here.
endings: LineEndings::Unix,
encoding: PositionEncoding::Utf16,
encoding,
};

// The changes we got must be applied sequentially, but can cross lines so we

@@ -256,6 +257,7 @@ pub(crate) fn all_edits_are_disjoint(

#[cfg(test)]
mod tests {
use ide_db::line_index::WideEncoding;
use lsp_types::{
CompletionItem, CompletionTextEdit, InsertReplaceEdit, Position, Range,
TextDocumentContentChangeEvent,

@@ -278,9 +280,11 @@ mod tests {
};
}

let text = apply_document_changes(|| String::new(), vec![]);
let encoding = PositionEncoding::Wide(WideEncoding::Utf16);
let text = apply_document_changes(encoding, || String::new(), vec![]);
assert_eq!(text, "");
let text = apply_document_changes(
encoding,
|| text,
vec![TextDocumentContentChangeEvent {
range: None,

@@ -289,39 +293,49 @@ mod tests {
}],
);
assert_eq!(text, "the");
let text = apply_document_changes(|| text, c![0, 3; 0, 3 => " quick"]);
let text = apply_document_changes(encoding, || text, c![0, 3; 0, 3 => " quick"]);
assert_eq!(text, "the quick");
let text = apply_document_changes(|| text, c![0, 0; 0, 4 => "", 0, 5; 0, 5 => " foxes"]);
let text =
apply_document_changes(encoding, || text, c![0, 0; 0, 4 => "", 0, 5; 0, 5 => " foxes"]);
assert_eq!(text, "quick foxes");
let text = apply_document_changes(|| text, c![0, 11; 0, 11 => "\ndream"]);
let text = apply_document_changes(encoding, || text, c![0, 11; 0, 11 => "\ndream"]);
assert_eq!(text, "quick foxes\ndream");
let text = apply_document_changes(|| text, c![1, 0; 1, 0 => "have "]);
let text = apply_document_changes(encoding, || text, c![1, 0; 1, 0 => "have "]);
assert_eq!(text, "quick foxes\nhave dream");
let text = apply_document_changes(
encoding,
|| text,
c![0, 0; 0, 0 => "the ", 1, 4; 1, 4 => " quiet", 1, 16; 1, 16 => "s\n"],
);
assert_eq!(text, "the quick foxes\nhave quiet dreams\n");
let text = apply_document_changes(|| text, c![0, 15; 0, 15 => "\n", 2, 17; 2, 17 => "\n"]);
let text = apply_document_changes(
encoding,
|| text,
c![0, 15; 0, 15 => "\n", 2, 17; 2, 17 => "\n"],
);
assert_eq!(text, "the quick foxes\n\nhave quiet dreams\n\n");
let text = apply_document_changes(
encoding,
|| text,
c![1, 0; 1, 0 => "DREAM", 2, 0; 2, 0 => "they ", 3, 0; 3, 0 => "DON'T THEY?"],
);
assert_eq!(text, "the quick foxes\nDREAM\nthey have quiet dreams\nDON'T THEY?\n");
let text = apply_document_changes(|| text, c![0, 10; 1, 5 => "", 2, 0; 2, 12 => ""]);
let text =
apply_document_changes(encoding, || text, c![0, 10; 1, 5 => "", 2, 0; 2, 12 => ""]);
assert_eq!(text, "the quick \nthey have quiet dreams\n");

let text = String::from("❤️");
let text = apply_document_changes(|| text, c![0, 0; 0, 0 => "a"]);
let text = apply_document_changes(encoding, || text, c![0, 0; 0, 0 => "a"]);
assert_eq!(text, "a❤️");

let text = String::from("a\nb");
let text = apply_document_changes(|| text, c![0, 1; 1, 0 => "\nțc", 0, 1; 1, 1 => "d"]);
let text =
apply_document_changes(encoding, || text, c![0, 1; 1, 0 => "\nțc", 0, 1; 1, 1 => "d"]);
assert_eq!(text, "adcb");

let text = String::from("a\nb");
let text = apply_document_changes(|| text, c![0, 1; 1, 0 => "ț\nc", 0, 2; 0, 2 => "c"]);
let text =
apply_document_changes(encoding, || text, c![0, 1; 1, 0 => "ț\nc", 0, 2; 0, 2 => "c"]);
assert_eq!(text, "ațc\ncb");
}

@@ -831,6 +831,7 @@ impl GlobalState {
let vfs = &mut this.vfs.write().0;
let file_id = vfs.file_id(&path).unwrap();
let text = apply_document_changes(
this.config.position_encoding(),
|| std::str::from_utf8(vfs.file_contents(file_id)).unwrap().into(),
params.content_changes,
);

@@ -31,8 +31,8 @@ pub(crate) fn position(line_index: &LineIndex, offset: TextSize) -> lsp_types::P
let line_col = line_index.index.line_col(offset);
match line_index.encoding {
PositionEncoding::Utf8 => lsp_types::Position::new(line_col.line, line_col.col),
PositionEncoding::Utf16 => {
let line_col = line_index.index.to_utf16(line_col);
PositionEncoding::Wide(enc) => {
let line_col = line_index.index.to_wide(enc, line_col);
lsp_types::Position::new(line_col.line, line_col.col)
}
}

@@ -1429,7 +1429,7 @@ fn main() {
let line_index = LineIndex {
index: Arc::new(ide::LineIndex::new(text)),
endings: LineEndings::Unix,
encoding: PositionEncoding::Utf16,
encoding: PositionEncoding::Utf8,
};
let converted: Vec<lsp_types::FoldingRange> =
folds.into_iter().map(|it| folding_range(text, &line_index, true, it)).collect();

@@ -11,6 +11,7 @@ pub mod hash;
pub mod process;
pub mod panic_context;
pub mod non_empty_vec;
pub mod rand;

pub use always_assert::{always, never};

crates/stdx/src/rand.rs (new file, 21 lines)
@@ -0,0 +1,21 @@
//! We don't use `rand`, as that's too many things for us.
//!
//! Currently, we use oorandom instead, but it misses these two utilities.
//! Perhaps we should switch to `fastrand`, or our own small prng, it's not like
//! we need anything move complicatied that xor-shift.

pub fn shuffle<T>(slice: &mut [T], mut rand_index: impl FnMut(usize) -> usize) {
let mut remaining = slice.len() - 1;
while remaining > 0 {
let index = rand_index(remaining);
slice.swap(remaining, index);
remaining -= 1;
}
}

pub fn seed() -> u64 {
use std::collections::hash_map::RandomState;
use std::hash::{BuildHasher, Hasher};

RandomState::new().build_hasher().finish()
}

@@ -1,5 +1,5 @@
<!---
lsp_ext.rs hash: ec29403e67dfd15b
lsp_ext.rs hash: d87477896dfe41d4

If you need to change the above hash to make the test pass, please check if you
need to adjust this doc as well and ping this issue: