mirror of
https://github.com/fish-shell/fish-shell
synced 2025-01-16 06:54:03 +00:00
Try to set LC_CTYPE to something UTF-8 capable (#8031)
* Try to set LC_CTYPE to something UTF-8 capable

When fish is started with LC_CTYPE=C (even just effectively, often via LC_ALL=C!), it's basically broken. There's no way to handle non-ASCII characters with a C locale unless we want to write our locale-independent replacements for all of the system functions.

Since we're not going to do that, let's try to find *some locale* for LC_CTYPE.

We already do that in __fish_setlocale, but that's

- a bit of a weird thing that reads unstandardized system configuration files
- allows setting locale to C explicitly

So it's still easily possible to end up in a broken configuration.

Now, the issue with this is that there is (AFAICT) no portable way to get a list of all allowed locales and C.UTF-8 is not standardized, so we have no one locale to fall back on and are forced to try a few. The list we have here is quite arbitrary, but it's a start. Python does something similar and only tries C.UTF-8, C.utf8 and "UTF-8". Once C.UTF-8 is (hopefully) standardized, that will just start working (tm).

Note that we do not *export* the fixed LC_CTYPE variable, so external programs still have to deal with the C locale, but we have no real business messing with the user's environment.

To turn it off: $fish_allow_singlebyte_locale, if set to something true (like "1"), will re-run the locale initialization and skip the bit where we force LC_CTYPE to be utf8-capable. This is mainly used in our tests, but might also be useful if people are trying to do something weird.
This commit is contained in:
parent
e57c998d4c
commit
046db09f90
3 changed files with 36 additions and 1 deletions
|
@ -1409,6 +1409,7 @@ The locale variables are: ``LANG``, ``LC_ALL``, ``LC_COLLATE``, ``LC_CTYPE``, ``
|
||||||
|
|
||||||
The most common way to set the locale is to use a command like ``set -gx LANG en_GB.utf8``, which sets the current locale to be the English language, as used in Great Britain, using the UTF-8 character set. That way any program that requires one setting differently can easily override just that and doesn't have to resort to LC_ALL. For a list of available locales on your system, try ``locale -a``.
|
The most common way to set the locale is to use a command like ``set -gx LANG en_GB.utf8``, which sets the current locale to be the English language, as used in Great Britain, using the UTF-8 character set. That way any program that requires one setting differently can easily override just that and doesn't have to resort to LC_ALL. For a list of available locales on your system, try ``locale -a``.
|
||||||
|
|
||||||
|
Because it needs to handle output that might include multibyte characters (e.g. emojis), fish will try to set its own internal LC_CTYPE to one that is UTF-8-capable even if given an effective LC_CTYPE of "C" (the default). This prevents issues with e.g. filenames given in autosuggestions even if the user started fish with LC_ALL=C. To turn this handling off, set ``fish_allow_singlebyte_locale`` to "1".
|
||||||
|
|
||||||
.. _builtin-overview:
|
.. _builtin-overview:
|
||||||
|
|
||||||
|
|
|
@ -64,7 +64,7 @@ static const wcstring locale_variables[] = {
|
||||||
L"LANG", L"LANGUAGE", L"LC_ALL", L"LC_ADDRESS", L"LC_COLLATE",
|
L"LANG", L"LANGUAGE", L"LC_ALL", L"LC_ADDRESS", L"LC_COLLATE",
|
||||||
L"LC_CTYPE", L"LC_IDENTIFICATION", L"LC_MEASUREMENT", L"LC_MESSAGES", L"LC_MONETARY",
|
L"LC_CTYPE", L"LC_IDENTIFICATION", L"LC_MEASUREMENT", L"LC_MESSAGES", L"LC_MONETARY",
|
||||||
L"LC_NAME", L"LC_NUMERIC", L"LC_PAPER", L"LC_TELEPHONE", L"LC_TIME",
|
L"LC_NAME", L"LC_NUMERIC", L"LC_PAPER", L"LC_TELEPHONE", L"LC_TIME",
|
||||||
L"LOCPATH"};
|
L"fish_allow_singlebyte_locale", L"LOCPATH"};
|
||||||
|
|
||||||
/// List of all curses environment variable names that might trigger (re)initializing the curses
|
/// List of all curses environment variable names that might trigger (re)initializing the curses
|
||||||
/// subsystem.
|
/// subsystem.
|
||||||
|
@ -556,6 +556,15 @@ static void init_curses(const environment_t &vars) {
|
||||||
curses_initialized = true;
|
curses_initialized = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Candidate locale names that init_locale() passes to setlocale(LC_CTYPE, ...), in the order
/// listed, when the locale in effect is single-byte (MB_CUR_MAX == 1). The first candidate that
/// leaves MB_CUR_MAX > 1 is kept and the remaining entries are not tried.
static const char *utf8_locales[] = {
|
||||||
|
"C.UTF-8",
|
||||||
|
"en_US.UTF-8",
|
||||||
|
"en_GB.UTF-8",
|
||||||
|
"de_DE.UTF-8",
|
||||||
|
"C.utf8",
|
||||||
|
"UTF-8",
|
||||||
|
};
|
||||||
|
|
||||||
/// Initialize the locale subsystem.
|
/// Initialize the locale subsystem.
|
||||||
static void init_locale(const environment_t &vars) {
|
static void init_locale(const environment_t &vars) {
|
||||||
// We have to make a copy because the subsequent setlocale() call to change the locale will
|
// We have to make a copy because the subsequent setlocale() call to change the locale will
|
||||||
|
@ -576,6 +585,28 @@ static void init_locale(const environment_t &vars) {
|
||||||
}
|
}
|
||||||
|
|
||||||
char *locale = setlocale(LC_ALL, "");
|
char *locale = setlocale(LC_ALL, "");
|
||||||
|
|
||||||
|
// Try to get a multibyte-capable encoding
|
||||||
|
// A "C" locale is broken for our purposes - any wchar functions will break on it.
|
||||||
|
// So we try *really really really hard* to not have one.
|
||||||
|
bool fix_locale = true;
|
||||||
|
if (auto allow_c = vars.get(L"fish_allow_singlebyte_locale")) {
|
||||||
|
fix_locale = allow_c.missing_or_empty() ? true : !bool_from_string(allow_c->as_string());
|
||||||
|
}
|
||||||
|
if (fix_locale && MB_CUR_MAX == 1) {
|
||||||
|
FLOGF(env_locale, L"Have singlebyte locale, trying to fix");
|
||||||
|
for (auto loc : utf8_locales) {
|
||||||
|
setlocale(LC_CTYPE, loc);
|
||||||
|
if (MB_CUR_MAX > 1) {
|
||||||
|
FLOGF(env_locale, L"Fixed locale: '%s'", loc);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (MB_CUR_MAX == 1) {
|
||||||
|
FLOGF(env_locale, L"Failed to fix locale");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fish_setlocale();
|
fish_setlocale();
|
||||||
FLOGF(env_locale, L"init_locale() setlocale(): '%s'", locale);
|
FLOGF(env_locale, L"init_locale() setlocale(): '%s'", locale);
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,9 @@
|
||||||
# see #7934.
|
# see #7934.
|
||||||
#REQUIRES: test -z "$GITHUB_WORKFLOW"
|
#REQUIRES: test -z "$GITHUB_WORKFLOW"
|
||||||
|
|
||||||
|
# We typically try to force a utf8-capable locale,
|
||||||
|
# this turns that off.
|
||||||
|
set -gx fish_allow_singlebyte_locale 1
|
||||||
|
|
||||||
# A function to display bytes, necessary because GNU and BSD implementations of `od` have different output.
|
# A function to display bytes, necessary because GNU and BSD implementations of `od` have different output.
|
||||||
# We used to use xxd, but it's not available everywhere. See #3797.
|
# We used to use xxd, but it's not available everywhere. See #3797.
|
||||||
|
|
Loading…
Reference in a new issue