deal with broken unicode implementations

Both GNU and BSD have bugs regarding the classification of
non-characters and private use area characters. Provide wrappers around
iswalnum(), iswalpha(), and iswgraph() to provide a consistent
experience. We don't bother to autoconf the use of these wrappers for
several reasons, including the fact that a binary built for one distro
release should behave correctly on another release (e.g., FreeBSD 10
does the right thing while FreeBSD 11 and 12 do not with respect to
iswalnum() of code points in the range 0xFDD0..0xFDFF).

Also move a few functions from common.* to wutil.* because they are wide
char specific and really belong in the latter module.

Fixes #3050
This commit is contained in:
Kurtis Rader 2016-09-27 21:07:10 -07:00
parent 01fa31f313
commit 92dd6de73c
4 changed files with 104 additions and 41 deletions

View file

@ -428,26 +428,6 @@ void append_format(wcstring &str, const wchar_t *format, ...) {
va_end(va);
}
/// Scan a nul-terminated wide string for the first character that may not appear in a variable
/// name. Only alphanumeric characters (per iswalnum()) and the underscore are accepted.
///
/// \return null if every character is acceptable (this includes the empty string), otherwise a
/// pointer to the first offending character.
const wchar_t *wcsvarname(const wchar_t *str) {
    for (const wchar_t *cursor = str; *cursor != L'\0'; ++cursor) {
        if (*cursor == L'_' || iswalnum(*cursor)) continue;
        return cursor;
    }
    return NULL;
}
const wchar_t *wcsvarname(const wcstring &str) { return wcsvarname(str.c_str()); }
/// Check a candidate function name. The only character rejected here is the slash.
///
/// \return null if the name contains no L'/', otherwise a pointer to the first L'/' found.
const wchar_t *wcsfuncname(const wcstring &str) {
    const wchar_t *name = str.c_str();
    return wcschr(name, L'/');
}
/// Test whether a single character may appear in a variable name.
///
/// \return true if the character is an underscore or is alphanumeric per iswalnum(), false
/// otherwise.
bool wcsvarchr(wchar_t chr) {
    if (chr == L'_') return true;
    return iswalnum(chr) != 0;
}
/// Convenience variant of fish_wcswidth() for a nul-terminated string: measures the string with
/// wcslen() and forwards to the (pointer, length) variant.
int fish_wcswidth(const wchar_t *str) {
    const size_t len = wcslen(str);
    return fish_wcswidth(str, len);
}
int fish_wcswidth(const wcstring &str) { return fish_wcswidth(str.c_str(), str.size()); }
wchar_t *quote_end(const wchar_t *pos) {
wchar_t c = *pos;

View file

@ -617,27 +617,6 @@ wcstring vformat_string(const wchar_t *format, va_list va_orig);
void append_format(wcstring &str, const wchar_t *format, ...);
void append_formatv(wcstring &str, const wchar_t *format, va_list ap);
/// Test if the given string is a valid variable name.
///
/// \return null if this is a valid name, and a pointer to the first invalid character otherwise.
const wchar_t *wcsvarname(const wchar_t *str);
const wchar_t *wcsvarname(const wcstring &str);
/// Test if the given string is a valid function name.
///
/// \return null if this is a valid name, and a pointer to the first invalid character otherwise.
const wchar_t *wcsfuncname(const wcstring &str);
/// Test if the given character is valid in a variable name.
///
/// \return true if the character may appear in a variable name, false otherwise.
bool wcsvarchr(wchar_t chr);
/// Convenience variants on fish_wcswidth().
///
/// See fallback.h for the normal definitions.
int fish_wcswidth(const wchar_t *str);
int fish_wcswidth(const wcstring &str);
/// This function returns the end of the quoted substring beginning at \c in. The type of quoting
/// character is determined by examining \c in. Returns 0 on error.

View file

@ -1,4 +1,5 @@
// Wide character equivalents of various standard unix functions.
#define FISH_NO_ISW_WRAPPERS
#include "config.h"
#include <assert.h>
@ -470,6 +471,84 @@ int wrename(const wcstring &old, const wcstring &newv) {
return rename(old_narrow.c_str(), new_narrow.c_str());
}
/// Report whether a code point lies in one of the three half-open ranges reserved for internal
/// use: the general reserved range, the expand range, or the wildcard range.
///
/// \return 1 if the code point is inside any of those ranges, 0 otherwise.
int fish_is_reserved_codepoint(wint_t wc) {
    const int is_general = (wc >= RESERVED_CHAR_BASE && wc < RESERVED_CHAR_END);
    const int is_expand = (wc >= EXPAND_RESERVED_BASE && wc < EXPAND_RESERVED_END);
    const int is_wildcard = (wc >= WILDCARD_RESERVED_BASE && wc < WILDCARD_RESERVED_END);
    return (is_general || is_expand || is_wildcard) ? 1 : 0;
}
/// Report whether a code point lies in one of the three Unicode private use areas, each described
/// by a half-open [PUAn_START, PUAn_END) range.
///
/// \return 1 if the code point is inside any private use area, 0 otherwise.
int fish_is_pua(wint_t wc) {
    const int in_first = (wc >= PUA1_START && wc < PUA1_END);
    const int in_second = (wc >= PUA2_START && wc < PUA2_END);
    const int in_third = (wc >= PUA3_START && wc < PUA3_END);
    return (in_first || in_second || in_third) ? 1 : 0;
}
/// Replacement for iswalnum() that gives the same answer on every platform for the code points
/// some implementations misclassify (see issue #3050). Reserved internal code points and private
/// use area code points are never alphanumeric; everything else is left to the system iswalnum().
int fish_iswalnum(wint_t wc) {
    const int excluded = fish_is_reserved_codepoint(wc) || fish_is_pua(wc);
    return excluded ? 0 : iswalnum(wc);
}
/// Replacement for iswalpha() that gives the same answer on every platform for the code points
/// some implementations misclassify (see issue #3050). Reserved internal code points and private
/// use area code points are never alphabetic; everything else is left to the system iswalpha().
int fish_iswalpha(wint_t wc) {
    const int excluded = fish_is_reserved_codepoint(wc) || fish_is_pua(wc);
    return excluded ? 0 : iswalpha(wc);
}
/// Replacement for iswgraph() that gives the same answer on every platform for the code points
/// some implementations misclassify (see issue #3050). Reserved internal code points are never
/// graphical, private use area code points always are, and everything else is left to the system
/// iswgraph(). The reserved check runs first, so it wins if the ranges ever overlap.
int fish_iswgraph(wint_t wc) {
    if (fish_is_reserved_codepoint(wc)) return 0;
    return fish_is_pua(wc) ? 1 : iswgraph(wc);
}
/// Scan a nul-terminated wide string for the first character that may not appear in a variable
/// name. Only characters accepted by fish_iswalnum() and the underscore are allowed.
///
/// \return null if every character is acceptable (this includes the empty string), otherwise a
/// pointer to the first offending character.
const wchar_t *wcsvarname(const wchar_t *str) {
    for (const wchar_t *cursor = str; *cursor != L'\0'; ++cursor) {
        if (*cursor == L'_' || fish_iswalnum(*cursor)) continue;
        return cursor;
    }
    return NULL;
}
/// Test if the given string is a valid variable name.
///
/// \return null if this is a valid name, and a pointer to the first invalid character otherwise.
const wchar_t *wcsvarname(const wcstring &str) { return wcsvarname(str.c_str()); }
/// Check a candidate function name. The only character rejected here is the slash.
///
/// \return null if the name contains no L'/', otherwise a pointer to the first L'/' found.
const wchar_t *wcsfuncname(const wcstring &str) {
    const wchar_t *name = str.c_str();
    return wcschr(name, L'/');
}
/// Test whether a single character may appear in a variable name.
///
/// \return true if the character is an underscore or is accepted by fish_iswalnum(), false
/// otherwise.
bool wcsvarchr(wchar_t chr) {
    if (chr == L'_') return true;
    return fish_iswalnum(chr) != 0;
}
/// Convenience variant of fish_wcswidth() for a nul-terminated string: measures the string with
/// wcslen() and forwards to the (pointer, length) variant.
///
/// See fallback.h for the normal definitions.
int fish_wcswidth(const wchar_t *str) {
    const size_t len = wcslen(str);
    return fish_wcswidth(str, len);
}
/// Convenience variants on fish_wcwswidth().
///
/// See fallback.h for the normal definitions.
int fish_wcswidth(const wcstring &str) { return fish_wcswidth(str.c_str(), str.size()); }
file_id_t file_id_t::file_id_from_stat(const struct stat *buf) {
assert(buf != NULL);

View file

@ -59,6 +59,31 @@ int wmkdir(const wcstring &dir, int mode);
int wrename(const wcstring &oldName, const wcstring &newName);
// Bounds of the three Unicode private use areas. Each range is half-open: the START value is
// included and the END value is excluded (fish_is_pua() tests START <= wc && wc < END).
// PUA1 covers U+E000..U+F8FF.
#define PUA1_START 0xE000
#define PUA1_END 0xF900
// PUA2 covers U+F0000..U+FFFFD.
#define PUA2_START 0xF0000
#define PUA2_END 0xFFFFE
// PUA3 covers U+100000..U+10FFFD.
#define PUA3_START 0x100000
#define PUA3_END 0x10FFFE
// We need this because there are too many implementations that don't return the proper answer for
// some code points. See issue #3050.
//
// Unless FISH_NO_ISW_WRAPPERS is defined before this header is included, every use of iswalnum(),
// iswalpha() and iswgraph() in the including file is redirected to the fish_* wrapper of the same
// name. A file that must reach the system functions (such as the one implementing the wrappers)
// defines FISH_NO_ISW_WRAPPERS first.
#ifndef FISH_NO_ISW_WRAPPERS
#define iswalnum fish_iswalnum
#define iswalpha fish_iswalpha
#define iswgraph fish_iswgraph
#endif
// Wrappers around the system classification functions that return consistent results for reserved
// and private use area code points.
int fish_iswalnum(wint_t wc);
int fish_iswalpha(wint_t wc);
int fish_iswgraph(wint_t wc);
// Return null if the string is a valid variable name, else a pointer to the first invalid
// character.
const wchar_t *wcsvarname(const wchar_t *str);
const wchar_t *wcsvarname(const wcstring &str);
// Return null if the string is a valid function name, else a pointer to the first invalid
// character.
const wchar_t *wcsfuncname(const wcstring &str);
// Return true if the character may appear in a variable name.
bool wcsvarchr(wchar_t chr);
// Convenience variants on fish_wcswidth(); see fallback.h for the normal definitions.
int fish_wcswidth(const wchar_t *str);
int fish_wcswidth(const wcstring &str);
/// Class for representing a file's inode. We use this to detect and avoid symlink loops, among
/// other things. While an inode / dev pair is sufficient to distinguish co-existing files, Linux
/// seems to aggressively re-use inodes, so it cannot determine if a file has been deleted (ABA