2023-05-04 02:18:41 +00:00
//! See [`LineIndex`].
2023-05-04 23:21:42 +00:00
#![ deny(missing_debug_implementations, missing_docs, rust_2018_idioms) ]
2023-05-04 02:18:41 +00:00
#[ cfg(test) ]
mod tests ;
2023-05-04 23:28:15 +00:00
use nohash_hasher ::IntMap ;
2023-05-04 23:21:29 +00:00
pub use text_size ::{ TextRange , TextSize } ;
2018-08-10 18:13:39 +00:00
2023-05-06 07:52:11 +00:00
/// A `(line, column)` position expressed in the native UTF-8 encoding.
///
/// Both components are zero-based; the column counts UTF-8 code units
/// (bytes) from the start of the line.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct LineCol {
    /// Zero-based line number.
    pub line: u32,
    /// Zero-based UTF-8 (byte) offset within the line.
    pub col: u32,
}
2023-05-04 02:18:41 +00:00
/// A kind of wide character encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum WideEncoding {
    /// UTF-16.
    Utf16,
    /// UTF-32.
    Utf32,
}

impl WideEncoding {
    /// Returns the number of code units it takes to encode `text` in this encoding.
    ///
    /// For UTF-16 this counts 16-bit units (so a character outside the Basic
    /// Multilingual Plane counts as two); for UTF-32 it counts `char`s.
    pub fn measure(&self, text: &str) -> usize {
        match self {
            WideEncoding::Utf16 => text.encode_utf16().count(),
            WideEncoding::Utf32 => text.chars().count(),
        }
    }
}
2023-05-06 07:52:11 +00:00
/// A `(line, column)` position expressed in a wide encoding.
///
/// See [`WideEncoding`] for the kinds of wide encodings available.
//
// Deliberately not a generic type and different from `LineCol`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct WideLineCol {
    /// Zero-based line number.
    pub line: u32,
    /// Zero-based column, counted in code units of the wide encoding.
    pub col: u32,
}
2023-05-06 07:52:11 +00:00
#[ derive(Debug, Clone, Copy, PartialEq, Eq) ]
2023-05-04 23:20:53 +00:00
struct WideChar {
2023-05-06 07:52:11 +00:00
/// Start offset of a character inside a line, zero-based.
2023-05-04 23:20:53 +00:00
start : TextSize ,
2023-05-06 07:52:11 +00:00
/// End offset of a character inside a line, zero-based.
2023-05-04 23:20:53 +00:00
end : TextSize ,
2018-11-15 16:34:05 +00:00
}
2023-02-14 00:56:28 +00:00
impl WideChar {
2020-05-05 17:29:04 +00:00
/// Returns the length in 8-bit UTF-8 code units.
2020-04-24 21:40:41 +00:00
fn len ( & self ) -> TextSize {
2018-11-15 16:34:05 +00:00
self . end - self . start
}
2020-05-05 17:29:04 +00:00
2023-02-14 00:56:28 +00:00
/// Returns the length in UTF-16 or UTF-32 code units.
2023-05-06 08:03:18 +00:00
fn wide_len ( & self , enc : WideEncoding ) -> u32 {
2023-02-14 00:56:28 +00:00
match enc {
WideEncoding ::Utf16 = > {
if self . len ( ) = = TextSize ::from ( 4 ) {
2
} else {
1
}
}
WideEncoding ::Utf32 = > 1 ,
2020-05-05 17:29:04 +00:00
}
}
2018-08-10 18:13:39 +00:00
}
2023-05-06 07:51:25 +00:00
/// Maps flat [`TextSize`] offsets to/from `(line, column)` representation.
#[ derive(Debug, Clone, PartialEq, Eq) ]
2023-05-06 00:25:10 +00:00
pub struct LineIndex {
2023-05-06 08:46:33 +00:00
/// Offset the beginning of each line (except the first, which always has offset 0).
2023-05-06 00:25:10 +00:00
newlines : Box < [ TextSize ] > ,
/// List of non-ASCII characters on each line.
line_wide_chars : IntMap < u32 , Box < [ WideChar ] > > ,
2023-05-06 22:44:09 +00:00
/// The length of the entire text.
len : TextSize ,
2023-05-06 00:25:10 +00:00
}
2018-08-10 18:13:39 +00:00
impl LineIndex {
2023-05-04 02:18:41 +00:00
/// Returns a `LineIndex` for the `text`.
2018-08-10 18:13:39 +00:00
pub fn new ( text : & str ) -> LineIndex {
2023-06-27 13:21:58 +00:00
let ( newlines , line_wide_chars ) = analyze_source_file ( text ) ;
2023-05-06 22:10:35 +00:00
LineIndex {
newlines : newlines . into_boxed_slice ( ) ,
line_wide_chars ,
2023-05-06 22:44:09 +00:00
len : TextSize ::of ( text ) ,
2023-05-06 22:10:35 +00:00
}
2018-08-10 18:13:39 +00:00
}
2023-05-04 02:18:41 +00:00
/// Transforms the `TextSize` into a `LineCol`.
2023-05-06 07:56:30 +00:00
///
/// # Panics
///
2023-05-06 22:14:02 +00:00
/// If the offset is invalid. See [`Self::try_line_col`].
2021-02-12 19:09:53 +00:00
pub fn line_col ( & self , offset : TextSize ) -> LineCol {
2023-05-06 08:04:41 +00:00
self . try_line_col ( offset ) . expect ( " invalid offset " )
}
2023-05-06 22:14:02 +00:00
/// Transforms the `TextSize` into a `LineCol`.
///
/// Returns `None` if the `offset` was invalid, e.g. if it extends past the end of the text or
/// points to the middle of a multi-byte character.
2023-05-06 08:04:41 +00:00
pub fn try_line_col ( & self , offset : TextSize ) -> Option < LineCol > {
2023-05-06 22:44:09 +00:00
if offset > self . len {
2023-05-06 09:08:47 +00:00
return None ;
}
2023-05-06 08:46:33 +00:00
let line = self . newlines . partition_point ( | & it | it < = offset ) ;
let start = self . start_offset ( line ) ? ;
2023-05-06 08:37:25 +00:00
let col = offset - start ;
2023-05-06 08:05:28 +00:00
let ret = LineCol { line : line as u32 , col : col . into ( ) } ;
self . line_wide_chars
. get ( & ret . line )
. into_iter ( )
. flat_map ( | it | it . iter ( ) )
2023-05-06 22:06:51 +00:00
. all ( | it | col < = it . start | | it . end < = col )
2023-05-06 08:05:28 +00:00
. then_some ( ret )
2021-02-12 19:09:53 +00:00
}
2023-05-04 02:18:41 +00:00
/// Transforms the `LineCol` into a `TextSize`.
2022-01-03 14:49:47 +00:00
pub fn offset ( & self , line_col : LineCol ) -> Option < TextSize > {
2023-05-06 08:46:33 +00:00
self . start_offset ( line_col . line as usize ) . map ( | start | start + TextSize ::from ( line_col . col ) )
}
fn start_offset ( & self , line : usize ) -> Option < TextSize > {
match line . checked_sub ( 1 ) {
None = > Some ( TextSize ::from ( 0 ) ) ,
Some ( it ) = > self . newlines . get ( it ) . copied ( ) ,
}
2021-02-12 19:09:53 +00:00
}
2018-11-15 16:34:05 +00:00
2023-05-04 02:18:41 +00:00
/// Transforms the `LineCol` with the given `WideEncoding` into a `WideLineCol`.
2023-05-06 07:57:57 +00:00
pub fn to_wide ( & self , enc : WideEncoding , line_col : LineCol ) -> Option < WideLineCol > {
2023-05-06 08:03:18 +00:00
let mut col = line_col . col ;
2023-05-06 07:59:56 +00:00
if let Some ( wide_chars ) = self . line_wide_chars . get ( & line_col . line ) {
2023-05-04 23:40:41 +00:00
for c in wide_chars . iter ( ) {
2023-05-06 08:02:37 +00:00
if u32 ::from ( c . end ) < = line_col . col {
2023-05-06 22:05:03 +00:00
col = col . checked_sub ( u32 ::from ( c . len ( ) ) - c . wide_len ( enc ) ) ? ;
2018-11-15 16:34:05 +00:00
} else {
// From here on, all utf16 characters come *after* the character we are mapping,
// so we don't need to take them into account
break ;
}
}
}
2023-05-06 08:03:18 +00:00
Some ( WideLineCol { line : line_col . line , col } )
2018-11-15 16:34:05 +00:00
}
2023-05-06 07:59:56 +00:00
/// Transforms the `WideLineCol` with the given `WideEncoding` into a `LineCol`.
pub fn to_utf8 ( & self , enc : WideEncoding , line_col : WideLineCol ) -> Option < LineCol > {
let mut col = line_col . col ;
if let Some ( wide_chars ) = self . line_wide_chars . get ( & line_col . line ) {
2023-05-04 23:40:41 +00:00
for c in wide_chars . iter ( ) {
2020-05-03 06:54:15 +00:00
if col > u32 ::from ( c . start ) {
2023-05-06 22:05:03 +00:00
col = col . checked_add ( u32 ::from ( c . len ( ) ) - c . wide_len ( enc ) ) ? ;
2018-11-15 16:34:05 +00:00
} else {
// From here on, all utf16 characters come *after* the character we are mapping,
// so we don't need to take them into account
break ;
}
}
}
2023-05-06 22:05:38 +00:00
Some ( LineCol { line : line_col . line , col } )
2023-05-06 07:59:56 +00:00
}
2023-05-06 22:42:15 +00:00
/// Given a range [start, end), returns a sorted iterator of non-empty ranges [start, x1), [x1,
/// x2), ..., [xn, end) where all the xi, which are positions of newlines, are inside the range
/// [start, end).
2023-05-06 07:59:56 +00:00
pub fn lines ( & self , range : TextRange ) -> impl Iterator < Item = TextRange > + '_ {
let lo = self . newlines . partition_point ( | & it | it < range . start ( ) ) ;
let hi = self . newlines . partition_point ( | & it | it < = range . end ( ) ) ;
let all = std ::iter ::once ( range . start ( ) )
. chain ( self . newlines [ lo .. hi ] . iter ( ) . copied ( ) )
. chain ( std ::iter ::once ( range . end ( ) ) ) ;
2018-11-15 16:34:05 +00:00
2023-05-06 07:59:56 +00:00
all . clone ( )
. zip ( all . skip ( 1 ) )
. map ( | ( lo , hi ) | TextRange ::new ( lo , hi ) )
. filter ( | it | ! it . is_empty ( ) )
2018-08-10 18:13:39 +00:00
}
2023-05-06 22:44:37 +00:00
/// Returns the length of the original text.
pub fn len ( & self ) -> TextSize {
self . len
}
2018-08-10 18:13:39 +00:00
}
2023-06-27 13:21:58 +00:00
2023-07-12 17:51:20 +00:00
/// This is adapted from the rustc_span crate, https://github.com/rust-lang/rust/blob/de59844c98f7925242a798a72c59dc3610dd0e2c/compiler/rustc_span/src/analyze_source_file.rs
2023-06-27 13:21:58 +00:00
fn analyze_source_file ( src : & str ) -> ( Vec < TextSize > , IntMap < u32 , Box < [ WideChar ] > > ) {
assert! ( src . len ( ) < ! 0 u32 as usize ) ;
let mut lines = vec! [ ] ;
let mut line_wide_chars = IntMap ::< u32 , Vec < WideChar > > ::default ( ) ;
// Calls the right implementation, depending on hardware support available.
analyze_source_file_dispatch ( src , & mut lines , & mut line_wide_chars ) ;
( lines , line_wide_chars . into_iter ( ) . map ( | ( k , v ) | ( k , v . into_boxed_slice ( ) ) ) . collect ( ) )
}
#[ cfg(any(target_arch = " x86 " , target_arch = " x86_64 " )) ]
fn analyze_source_file_dispatch (
src : & str ,
lines : & mut Vec < TextSize > ,
multi_byte_chars : & mut IntMap < u32 , Vec < WideChar > > ,
) {
if is_x86_feature_detected! ( " sse2 " ) {
// SAFETY: SSE2 support was checked
unsafe {
analyze_source_file_sse2 ( src , lines , multi_byte_chars ) ;
}
} else {
analyze_source_file_generic ( src , src . len ( ) , TextSize ::from ( 0 ) , lines , multi_byte_chars ) ;
}
}
2024-01-12 07:57:38 +00:00
/// aarch64 entry point: uses the NEON scanner when the CPU supports it at
/// runtime and the generic scanner otherwise.
#[cfg(target_arch = "aarch64")]
fn analyze_source_file_dispatch(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    if std::arch::is_aarch64_feature_detected!("neon") {
        // SAFETY: NEON support was checked
        unsafe {
            analyze_source_file_neon(src, lines, multi_byte_chars);
        }
    } else {
        analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
    }
}
2023-06-27 13:21:58 +00:00
/// Checks 16 byte chunks of text at a time. If the chunk contains
/// something other than printable ASCII characters and newlines, the
/// function falls back to the generic implementation. Otherwise it uses
/// SSE2 intrinsics to quickly find all newlines.
#[ target_feature(enable = " sse2 " ) ]
#[ cfg(any(target_arch = " x86 " , target_arch = " x86_64 " )) ]
unsafe fn analyze_source_file_sse2 (
src : & str ,
lines : & mut Vec < TextSize > ,
multi_byte_chars : & mut IntMap < u32 , Vec < WideChar > > ,
) {
#[ cfg(target_arch = " x86 " ) ]
use std ::arch ::x86 ::* ;
#[ cfg(target_arch = " x86_64 " ) ]
use std ::arch ::x86_64 ::* ;
const CHUNK_SIZE : usize = 16 ;
let src_bytes = src . as_bytes ( ) ;
let chunk_count = src . len ( ) / CHUNK_SIZE ;
// This variable keeps track of where we should start decoding a
// chunk. If a multi-byte character spans across chunk boundaries,
// we need to skip that part in the next chunk because we already
// handled it.
let mut intra_chunk_offset = 0 ;
for chunk_index in 0 .. chunk_count {
let ptr = src_bytes . as_ptr ( ) as * const __m128i ;
// We don't know if the pointer is aligned to 16 bytes, so we
// use `loadu`, which supports unaligned loading.
let chunk = _mm_loadu_si128 ( ptr . add ( chunk_index ) ) ;
// For character in the chunk, see if its byte value is < 0, which
// indicates that it's part of a UTF-8 char.
let multibyte_test = _mm_cmplt_epi8 ( chunk , _mm_set1_epi8 ( 0 ) ) ;
// Create a bit mask from the comparison results.
let multibyte_mask = _mm_movemask_epi8 ( multibyte_test ) ;
// If the bit mask is all zero, we only have ASCII chars here:
if multibyte_mask = = 0 {
assert! ( intra_chunk_offset = = 0 ) ;
// Check for newlines in the chunk
let newlines_test = _mm_cmpeq_epi8 ( chunk , _mm_set1_epi8 ( b '\n' as i8 ) ) ;
let newlines_mask = _mm_movemask_epi8 ( newlines_test ) ;
if newlines_mask ! = 0 {
// All control characters are newlines, record them
let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32 ;
let output_offset = TextSize ::from ( ( chunk_index * CHUNK_SIZE + 1 ) as u32 ) ;
loop {
let index = newlines_mask . trailing_zeros ( ) ;
if index > = CHUNK_SIZE as u32 {
// We have arrived at the end of the chunk.
break ;
}
lines . push ( TextSize ::from ( index ) + output_offset ) ;
// Clear the bit, so we can find the next one.
newlines_mask & = ( ! 1 ) < < index ;
}
}
continue ;
}
// The slow path.
// There are control chars in here, fallback to generic decoding.
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset ;
intra_chunk_offset = analyze_source_file_generic (
& src [ scan_start .. ] ,
CHUNK_SIZE - intra_chunk_offset ,
TextSize ::from ( scan_start as u32 ) ,
lines ,
multi_byte_chars ,
) ;
}
// There might still be a tail left to analyze
let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset ;
if tail_start < src . len ( ) {
analyze_source_file_generic (
& src [ tail_start .. ] ,
src . len ( ) - tail_start ,
TextSize ::from ( tail_start as u32 ) ,
lines ,
multi_byte_chars ,
) ;
}
}
2024-01-12 07:57:38 +00:00
/// NEON stand-in for x86's `movemask`.
///
/// See <https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon>
///
/// The mask is a 64-bit integer, where each 4-bit group corresponds to a u8
/// in the input vector. The least significant 4 bits correspond to the first
/// byte in the vector.
///
/// # Safety
///
/// The caller must ensure that the CPU supports NEON.
#[target_feature(enable = "neon")]
#[cfg(target_arch = "aarch64")]
#[inline]
unsafe fn move_mask(v: std::arch::aarch64::uint8x16_t) -> u64 {
    use std::arch::aarch64::*;

    // Shift every 16-bit lane right by 4 and narrow it to 8 bits: this keeps
    // one nibble from each of the two bytes in the lane.
    let nibble_mask = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
    vget_lane_u64(vreinterpret_u64_u8(nibble_mask), 0)
}
/// Checks 16 byte chunks of text at a time. If the chunk contains any
/// non-ASCII byte, the function falls back to the generic implementation
/// for that chunk. Otherwise it uses NEON intrinsics to quickly find all
/// newlines.
///
/// # Safety
///
/// The caller must ensure that the CPU supports NEON.
#[target_feature(enable = "neon")]
#[cfg(target_arch = "aarch64")]
unsafe fn analyze_source_file_neon(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    use std::arch::aarch64::*;

    const CHUNK_SIZE: usize = 16;

    let src_bytes = src.as_bytes();

    let chunk_count = src.len() / CHUNK_SIZE;

    let newline = vdupq_n_s8(b'\n' as i8);

    // This variable keeps track of where we should start decoding a
    // chunk. If a multi-byte character spans across chunk boundaries,
    // we need to skip that part in the next chunk because we already
    // handled it.
    let mut intra_chunk_offset = 0;

    for chunk_index in 0..chunk_count {
        let ptr = src_bytes.as_ptr() as *const i8;
        // `ptr` is a byte pointer, so the chunk offset is in bytes; every
        // full chunk lies inside `src_bytes`.
        let chunk = vld1q_s8(ptr.add(chunk_index * CHUNK_SIZE));

        // For each byte in the chunk, see if its value is < 0 when read as
        // `i8`, which indicates that it's part of a multi-byte UTF-8 char.
        let multibyte_test = vcltzq_s8(chunk);
        // Create a bit mask from the comparison results.
        let multibyte_mask = move_mask(multibyte_test);

        // If the bit mask is all zero, we only have ASCII chars here:
        if multibyte_mask == 0 {
            assert!(intra_chunk_offset == 0);

            // Check for newlines in the chunk
            let newlines_test = vceqq_s8(chunk, newline);
            let mut newlines_mask = move_mask(newlines_test);

            // If the bit mask is not all zero, there are newlines in this chunk.
            if newlines_mask != 0 {
                // `+ 1`: a line starts one byte after its newline.
                let output_offset = TextSize::from((chunk_index * CHUNK_SIZE + 1) as u32);

                while newlines_mask != 0 {
                    let trailing_zeros = newlines_mask.trailing_zeros();
                    // Each input byte owns four mask bits.
                    let index = trailing_zeros / 4;

                    lines.push(TextSize::from(index) + output_offset);

                    // Clear the current 4-bit group, so we can find the next one.
                    newlines_mask &= (!0xF) << trailing_zeros;
                }
            }
            continue;
        }

        // The slow path: there are non-ASCII bytes in here, fall back to
        // generic decoding.
        let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
        intra_chunk_offset = analyze_source_file_generic(
            &src[scan_start..],
            CHUNK_SIZE - intra_chunk_offset,
            TextSize::from(scan_start as u32),
            lines,
            multi_byte_chars,
        );
    }

    // There might still be a tail left to analyze
    let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
    if tail_start < src.len() {
        analyze_source_file_generic(
            &src[tail_start..],
            src.len() - tail_start,
            TextSize::from(tail_start as u32),
            lines,
            multi_byte_chars,
        );
    }
}
/// Fallback entry point for targets without a SIMD implementation in this
/// file (anything other than x86, x86_64 and aarch64): always uses the
/// generic scanner.
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
fn analyze_source_file_dispatch(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
}
// `scan_len` determines the number of bytes in `src` to scan. Note that the
// function can read past `scan_len` if a multi-byte character start within the
// range but extends past it. The overflow is returned by the function.
fn analyze_source_file_generic (
src : & str ,
scan_len : usize ,
output_offset : TextSize ,
lines : & mut Vec < TextSize > ,
multi_byte_chars : & mut IntMap < u32 , Vec < WideChar > > ,
) -> usize {
assert! ( src . len ( ) > = scan_len ) ;
let mut i = 0 ;
let src_bytes = src . as_bytes ( ) ;
while i < scan_len {
let byte = unsafe {
// We verified that i < scan_len <= src.len()
* src_bytes . get_unchecked ( i )
} ;
// How much to advance in order to get to the next UTF-8 char in the
// string.
let mut char_len = 1 ;
if byte = = b '\n' {
lines . push ( TextSize ::from ( i as u32 + 1 ) + output_offset ) ;
} else if byte > = 127 {
// The slow path: Just decode to `char`.
let c = src [ i .. ] . chars ( ) . next ( ) . unwrap ( ) ;
char_len = c . len_utf8 ( ) ;
2023-12-07 07:30:00 +00:00
// The last element of `lines` represents the offset of the start of
// current line. To get the offset inside the line, we subtract it.
let pos = TextSize ::from ( i as u32 ) + output_offset
- lines . last ( ) . unwrap_or ( & TextSize ::default ( ) ) ;
2023-06-27 13:21:58 +00:00
if char_len > 1 {
assert! ( ( 2 ..= 4 ) . contains ( & char_len ) ) ;
let mbc = WideChar { start : pos , end : pos + TextSize ::from ( char_len as u32 ) } ;
multi_byte_chars . entry ( lines . len ( ) as u32 ) . or_default ( ) . push ( mbc ) ;
}
}
i + = char_len ;
}
i - scan_len
}