2016-05-03 22:18:24 +00:00
// Helper functions for working with wcstring.
2014-09-22 02:18:56 +00:00
# ifndef FISH_WCSTRINGUTIL_H
# define FISH_WCSTRINGUTIL_H
2017-07-27 13:05:35 +00:00
# include <algorithm>
2020-07-30 00:16:51 +00:00
# include <cstring>
2015-07-25 15:14:25 +00:00
# include <string>
2014-09-22 02:18:56 +00:00
# include <utility>
2016-04-21 06:00:54 +00:00
2014-09-22 02:18:56 +00:00
# include "common.h"
2020-07-30 00:16:51 +00:00
# include "expand.h"
2014-09-22 02:18:56 +00:00
2020-01-15 21:16:43 +00:00
/// Test if a string prefixes another.
/// \return true if \p proposed_prefix is a prefix of \p value.
/// Overloads are declared for wcstring, wide C strings, narrow C strings and std::string.
bool string_prefixes_string ( const wcstring & proposed_prefix , const wcstring & value ) ;
bool string_prefixes_string ( const wchar_t * proposed_prefix , const wcstring & value ) ;
bool string_prefixes_string ( const wchar_t * proposed_prefix , const wchar_t * value ) ;
bool string_prefixes_string ( const char * proposed_prefix , const std : : string & value ) ;
bool string_prefixes_string ( const char * proposed_prefix , const char * value ) ;
/// Test if a string is a suffix of another, i.e. whether \p proposed_suffix is a suffix of
/// \p value.
bool string_suffixes_string ( const wcstring & proposed_suffix , const wcstring & value ) ;
bool string_suffixes_string ( const wchar_t * proposed_suffix , const wcstring & value ) ;
/// NOTE(review): presumably the same suffix test without regard to case, mirroring
/// string_prefixes_string_case_insensitive - the implementation is not visible here.
bool string_suffixes_string_case_insensitive ( const wcstring & proposed_suffix ,
const wcstring & value ) ;
/// Test if a string prefixes another without regard to case.
/// \return true if \p proposed_prefix is a prefix of \p value.
bool string_prefixes_string_case_insensitive ( const wcstring & proposed_prefix ,
const wcstring & value ) ;
/// Case-insensitive string search, modeled after std::string::find().
/// \param haystack the string to search in.
/// \param needle the string to search for.
/// \param fuzzy indicates this is being used for fuzzy matching and case insensitivity is
/// expanded to include symbolic characters (#3584). Defaults to false.
/// \return the offset of the first case-insensitive matching instance of `needle` within
/// `haystack`, or `npos` (e.g. `wcstring::npos`) if no results were found.
size_t ifind ( const wcstring & haystack , const wcstring & needle , bool fuzzy = false ) ;
size_t ifind ( const std : : string & haystack , const std : : string & needle , bool fuzzy = false ) ;
2020-11-28 21:39:36 +00:00
/// A lightweight value-type describing how closely a string fuzzy-matches another string.
struct string_fuzzy_match_t {
// The ways one string can contain another.
enum class contain_type_t : uint8_t {
exact , // exact match: foobar matches foo
prefix , // prefix match: foo matches foobar
substr , // substring match: ooba matches foobar
subseq , // subsequence match: fbr matches foobar
} ;
contain_type_t type ;
// The case-folding required for the match.
enum class case_fold_t : uint8_t {
2020-11-29 00:15:01 +00:00
samecase , // exact match: foobar matches foobar
smartcase , // case insensitive match with lowercase input. foobar matches FoBar.
icase , // case insensitive: FoBaR matches foobAr
2020-11-28 21:39:36 +00:00
} ;
case_fold_t case_fold ;
2020-11-27 23:43:07 +00:00
2020-11-28 21:39:36 +00:00
// Constructor.
constexpr string_fuzzy_match_t ( contain_type_t type , case_fold_t case_fold )
: type ( type ) , case_fold ( case_fold ) { }
2020-11-27 23:43:07 +00:00
2020-11-28 21:39:36 +00:00
// Helper to return an exact match.
static constexpr string_fuzzy_match_t exact_match ( ) {
return string_fuzzy_match_t ( contain_type_t : : exact , case_fold_t : : samecase ) ;
}
2020-11-27 23:43:07 +00:00
2020-11-28 21:39:36 +00:00
/// \return whether this is a samecase exact match.
bool is_samecase_exact ( ) const {
return type = = contain_type_t : : exact & & case_fold = = case_fold_t : : samecase ;
2020-11-27 23:43:07 +00:00
}
2020-11-28 21:39:36 +00:00
/// \return if we are exact or prefix match.
bool is_exact_or_prefix ( ) const {
switch ( type ) {
case contain_type_t : : exact :
case contain_type_t : : prefix :
return true ;
case contain_type_t : : substr :
case contain_type_t : : subseq :
return false ;
}
DIE ( " Unreachable " ) ;
return false ;
2020-11-27 23:43:07 +00:00
}
2020-11-28 21:39:36 +00:00
// \return if our match requires a full replacement, i.e. is not a strict extension of our
// existing string. This is false only if our case matches, and our type is prefix or exact.
bool requires_full_replacement ( ) const {
if ( case_fold ! = case_fold_t : : samecase ) return true ;
switch ( type ) {
case contain_type_t : : exact :
case contain_type_t : : prefix :
return false ;
case contain_type_t : : substr :
case contain_type_t : : subseq :
return true ;
}
DIE ( " Unreachable " ) ;
return false ;
}
2020-11-27 23:43:07 +00:00
2020-11-28 21:39:36 +00:00
/// Try creating a fuzzy match for \p string against \p match_against.
/// \p string is something like "foo" and \p match_against is like "FooBar".
/// If \p anchor_start is set, then only exact and prefix matches are permitted.
static maybe_t < string_fuzzy_match_t > try_create ( const wcstring & string ,
const wcstring & match_against ,
bool anchor_start ) ;
2020-11-27 23:43:07 +00:00
2020-11-28 21:39:36 +00:00
/// \return a rank for filtering matches.
/// Earlier (smaller) ranks are better matches.
uint32_t rank ( ) const ;
2020-11-27 23:43:07 +00:00
} ;
2020-11-28 21:39:36 +00:00
/// Free-function cover over string_fuzzy_match_t::try_create().
/// Forwards \p string, \p match_against and \p anchor_start unchanged and returns the result.
/// Unlike try_create(), \p anchor_start is optional here and defaults to false.
inline maybe_t < string_fuzzy_match_t > string_fuzzy_match_string ( const wcstring & string ,
const wcstring & match_against ,
bool anchor_start = false ) {
return string_fuzzy_match_t : : try_create ( string , match_against , anchor_start ) ;
}
2020-11-27 23:43:07 +00:00
2020-01-15 21:16:43 +00:00
/// Split a string by a separator character.
wcstring_list_t split_string ( const wcstring & val , wchar_t sep ) ;
2021-04-18 21:46:05 +00:00
/// Split a string by runs of any of the separator characters provided in \p seps.
/// Note the delimiters are the characters in \p seps, not \p seps itself.
/// \p seps may contain the NUL character.
/// Do not output more than \p max_results results. If we are to output exactly that many,
/// the last output is the remainder of the input, including leading delimiters,
/// except for the first. This is historical behavior.
/// Example: split_string_tok(" a b c ", " ", 3) -> {"a", "b", " c "}
/// \p max_results defaults to the largest size_t value.
wcstring_list_t split_string_tok ( const wcstring & val , const wcstring & seps ,
size_t max_results = std : : numeric_limits < size_t > : : max ( ) ) ;
2020-01-15 21:16:43 +00:00
/// Join a list of strings by a separator character.
wcstring join_strings ( const wcstring_list_t & vals , wchar_t sep ) ;
/// Convert a long to a wcstring. The digits are produced by format_long_safe into a
/// 64-element stack buffer, which is then copied into the returned string.
inline wcstring to_string(long x) {
    wchar_t digits[64];
    format_long_safe(digits, x);
    return digits;
}
/// Convert an unsigned long long to a wcstring. The digits are produced by format_ullong_safe
/// into a 64-element stack buffer, which is then copied into the returned string.
inline wcstring to_string(unsigned long long x) {
    wchar_t digits[64];
    format_ullong_safe(digits, x);
    return digits;
}
/// Convert an int to a wcstring by widening it to long and reusing to_string(long).
inline wcstring to_string ( int x ) { return to_string ( static_cast < long > ( x ) ) ; }
/// Convert a size_t to a wcstring by casting it to unsigned long long and reusing
/// to_string(unsigned long long).
inline wcstring to_string ( size_t x ) { return to_string ( static_cast < unsigned long long > ( x ) ) ; }
/// Interpret a narrow string as a boolean by looking only at its first character.
/// \return true if \p x is non-empty and starts with one of 'Y', 'T', 'y', 't' or '1';
/// false otherwise (including for the empty string).
inline bool bool_from_string(const std::string &x) {
    if (x.empty()) return false;
    const char first = x.front();
    return first == 'Y' || first == 'T' || first == 'y' || first == 't' || first == '1';
}
/// Interpret a wide string as a boolean by looking only at its first character.
/// \return true if \p x is non-empty and starts with one of L'Y', L'T', L'y', L't' or L'1';
/// false otherwise (including for the empty string).
inline bool bool_from_string(const wcstring &x) {
    if (x.empty()) return false;
    // An explicit switch (matching the narrow overload) is used instead of wcschr(): wcschr
    // treats the terminating NUL as part of the searched string, so a string whose first
    // character is NUL would otherwise be reported as true.
    switch (x.front()) {
        case L'Y':
        case L'T':
        case L'y':
        case L't':
        case L'1':
            return true;
        default:
            return false;
    }
}
2017-07-27 13:05:35 +00:00
/// Split the range [haystack_start, haystack_end) around occurrences of the needle
/// [needle_start, needle_end), appending each piece to \p output.
/// At most \p max splits are performed, so at most max + 1 pieces are appended.
/// If the needle is empty, the split happens between individual elements (characters).
/// With forward iterators this does the normal thing; with reverse iterators the pieces are
/// reversed strings, in reversed order!
/// If \p no_empty is set, empty pieces are not appended.
template <typename ITER>
void split_about(ITER haystack_start, ITER haystack_end, ITER needle_start, ITER needle_end,
                 wcstring_list_t *output, long max = LONG_MAX, bool no_empty = false) {
    const auto needle_len = std::distance(needle_start, needle_end);
    const bool split_each_element = (needle_start == needle_end);

    ITER piece_start = haystack_start;
    for (long splits_left = max; splits_left > 0 && piece_start != haystack_end; --splits_left) {
        // Where the current piece ends: one element on for an empty needle, otherwise at the
        // next occurrence of the needle.
        ITER piece_end = split_each_element
                             ? piece_start + 1
                             : std::search(piece_start, haystack_end, needle_start, needle_end);
        // Reaching the end means there is nothing further to split on; the rest is emitted as
        // the trailing piece below.
        if (piece_end == haystack_end) break;

        if (!(no_empty && piece_start == piece_end)) {
            output->emplace_back(piece_start, piece_end);
        }
        // Resume after the needle (which has zero length when it is empty).
        piece_start = piece_end + needle_len;
    }

    // Trailing piece, possibly empty.
    if (!(no_empty && piece_start == haystack_end)) {
        output->emplace_back(piece_start, haystack_end);
    }
}
2018-03-09 20:52:12 +00:00
/// Ellipsis styles; truncate() takes one of these as its \p etype argument.
enum class ellipsis_type {
    None,
    /// Prefer niceness over minimalness.
    Prettiest,
    /// Make every character count ($ instead of ...).
    Shortest,
};
2019-05-05 10:09:25 +00:00
/// Truncate \p input with respect to \p max_len and return the result as a new string.
/// \p etype selects the ellipsis style and defaults to ellipsis_type::Prettiest.
/// NOTE(review): whether \p max_len counts characters or display columns, and whether it
/// includes the ellipsis, is not visible from this declaration - confirm in the implementation.
wcstring truncate ( const wcstring & input , int max_len ,
ellipsis_type etype = ellipsis_type : : Prettiest ) ;
2019-08-25 20:37:06 +00:00
/// Trim \p input, which is taken by value, and return the trimmed string.
/// NOTE(review): the one-argument form presumably trims whitespace, and \p any_of is presumably
/// the set of characters to strip in the two-argument form - confirm in the implementation.
wcstring trim ( wcstring input ) ;
wcstring trim ( wcstring input , const wchar_t * any_of ) ;
2018-03-09 20:52:12 +00:00
2019-09-22 22:33:08 +00:00
/// Converts a string to lowercase.
wcstring wcstolower ( wcstring input ) ;
2021-01-31 00:20:20 +00:00
/// \return the number of escaping backslashes before a character.
/// \p idx may be "one past the end."
size_t count_preceding_backslashes ( const wcstring & text , size_t idx ) ;
2020-07-30 00:16:51 +00:00
// Out-of-line helper for wcs2string_callback.
void wcs2string_bad_char ( wchar_t ) ;
/// Implementation of wcs2string that accepts a callback.
/// Walks the \p len wide characters at \p input, converts each to bytes, and hands the bytes to
/// \p func as (const char*, size_t) pairs.
/// If \p func returns false, conversion stops at once; otherwise it continues.
/// \return false if the callback returned false, otherwise true.
template <typename Func>
bool wcs2string_callback(const wchar_t *input, size_t len, const Func &func) {
    mbstate_t state = {};
    char converted[MB_LEN_MAX];

    for (size_t i = 0; i < len; i++) {
        wchar_t wc = input[i];

        // TODO: this doesn't seem sound.
        // INTERNAL_SEPARATOR produces no output at all.
        if (wc == INTERNAL_SEPARATOR) continue;

        // Characters in the 256-wide range starting at ENCODE_DIRECT_BASE are emitted as the
        // single byte given by their offset into that range.
        if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256) {
            converted[0] = wc - ENCODE_DIRECT_BASE;
            if (!func(converted, 1)) return false;
            continue;
        }

        // Single-byte locale (C/POSIX/ISO-8859): anything that does not fit in one byte
        // becomes a question mark.
        if (MB_CUR_MAX == 1) {
            const bool fits_in_byte = (wc & ~0xFF) == 0;
            converted[0] = fits_in_byte ? static_cast<char>(wc) : '?';
            if (!func(converted, 1)) return false;
            continue;
        }

        // Multi-byte locale: convert through wcrtomb.
        std::memset(converted, 0, sizeof converted);
        const size_t nbytes = std::wcrtomb(converted, wc, &state);
        if (nbytes == static_cast<size_t>(-1)) {
            // Conversion failed: report the character, reset the shift state and emit nothing.
            wcs2string_bad_char(wc);
            std::memset(&state, 0, sizeof(state));
            continue;
        }
        if (!func(converted, nbytes)) return false;
    }
    return true;
}
2020-01-15 21:16:43 +00:00
/// Support for iterating over a newline-separated string, one line at a time.
template <typename Collection>
class line_iterator_t {
    // Holds a copy of the most recently read line.
    Collection storage;

    // The collection being scanned. It is held by reference, not copied, so it must stay alive
    // for as long as this iterator is used.
    const Collection &coll;

    // Position of the first element that has not been consumed yet.
    typename Collection::const_iterator current;

   public:
    /// Construct from a collection (presumably std::string or wcstring).
    line_iterator_t(const Collection &coll) : coll(coll), current(coll.cbegin()) {}

    /// Access the storage holding the line read by the last successful call to next().
    const Collection &line() const { return storage; }

    /// Advances to the next line, storing it (without its newline) for line().
    /// \return true on success, false if we have exhausted the string.
    bool next() {
        const auto end = coll.cend();
        if (current == end) return false;

        const auto line_end = std::find(current, end, '\n');
        storage.assign(current, line_end);

        // Continue after the newline if there was one, otherwise we are at the end.
        current = line_end;
        if (line_end != end) ++current;
        return true;
    }
};
2021-09-23 10:57:44 +00:00
/// Like fish_wcwidth, but returns 0 for characters with no real width instead of -1.
int fish_wcwidth_visible ( wchar_t widechar ) ;
/// The same, but for all chars in \p str. Note that this only makes sense if the string has an
/// arbitrarily long prefix - backslashes can move the cursor *before* the string.
///
/// In typical usage, you probably want to wrap fish_wcwidth_visible to accumulate the width, but
/// never go below 0.
int fish_wcswidth_visible ( const wcstring & str ) ;
2014-09-22 02:18:56 +00:00
# endif