mirror of
https://github.com/fish-shell/fish-shell
synced 2025-01-26 11:45:08 +00:00
Naive reimplementation of utf2wcs and wcs2utf in
env_universal_common.cpp. These use the new utf8 functions exposed in utf8.h. This will allow us to drop the iconv dependency.
This commit is contained in:
parent
a67dd9fbdd
commit
9718e70260
2 changed files with 17 additions and 290 deletions
|
@ -21,7 +21,6 @@
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#include <dirent.h>
|
#include <dirent.h>
|
||||||
#include <wctype.h>
|
#include <wctype.h>
|
||||||
#include <iconv.h>
|
|
||||||
|
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <locale.h>
|
#include <locale.h>
|
||||||
|
@ -39,6 +38,7 @@
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "wutil.h"
|
#include "wutil.h"
|
||||||
|
#include "utf8.h"
|
||||||
#include "env_universal_common.h"
|
#include "env_universal_common.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -116,303 +116,28 @@ static void (*callback)(fish_message_type_t type,
|
||||||
const wchar_t *key,
|
const wchar_t *key,
|
||||||
const wchar_t *val);
|
const wchar_t *val);
|
||||||
|
|
||||||
/**
|
/* UTF <-> wchar conversions. These return a string allocated with malloc. These call sites could be cleaned up substantially to eliminate the dependence on malloc. */
|
||||||
List of names for the UTF-8 character set.
|
static wchar_t *utf2wcs(const char *input)
|
||||||
*/
|
|
||||||
static const char *iconv_utf8_names[]=
|
|
||||||
{
|
{
|
||||||
"utf-8", "UTF-8",
|
wchar_t *result = NULL;
|
||||||
"utf8", "UTF8",
|
wcstring converted;
|
||||||
0
|
if (utf8_to_wchar_string(input, &converted))
|
||||||
|
{
|
||||||
|
result = wcsdup(converted.c_str());
|
||||||
}
|
}
|
||||||
;
|
return result;
|
||||||
|
|
||||||
/**
|
|
||||||
List of wide character names, undefined byte length.
|
|
||||||
*/
|
|
||||||
static const char *iconv_wide_names_unknown[]=
|
|
||||||
{
|
|
||||||
"wchar_t", "WCHAR_T",
|
|
||||||
"wchar", "WCHAR",
|
|
||||||
0
|
|
||||||
}
|
|
||||||
;
|
|
||||||
|
|
||||||
/**
|
|
||||||
List of wide character names, 4 bytes long.
|
|
||||||
*/
|
|
||||||
static const char *iconv_wide_names_4[]=
|
|
||||||
{
|
|
||||||
"wchar_t", "WCHAR_T",
|
|
||||||
"wchar", "WCHAR",
|
|
||||||
"ucs-4", "UCS-4",
|
|
||||||
"ucs4", "UCS4",
|
|
||||||
"utf-32", "UTF-32",
|
|
||||||
"utf32", "UTF32",
|
|
||||||
0
|
|
||||||
}
|
|
||||||
;
|
|
||||||
|
|
||||||
/**
|
|
||||||
List of wide character names, 2 bytes long.
|
|
||||||
*/
|
|
||||||
static const char *iconv_wide_names_2[]=
|
|
||||||
{
|
|
||||||
"wchar_t", "WCHAR_T",
|
|
||||||
"wchar", "WCHAR",
|
|
||||||
"ucs-2", "UCS-2",
|
|
||||||
"ucs2", "UCS2",
|
|
||||||
"utf-16", "UTF-16",
|
|
||||||
"utf16", "UTF16",
|
|
||||||
0
|
|
||||||
}
|
|
||||||
;
|
|
||||||
|
|
||||||
template<class T>
|
|
||||||
class sloppy {};
|
|
||||||
|
|
||||||
static size_t hack_iconv(iconv_t cd, const char * const* inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
|
|
||||||
{
|
|
||||||
/* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
|
|
||||||
OS X and Linux this one: size_t iconv (iconv_t, char **...)
|
|
||||||
AFAIK there's no single type that can be passed as both char ** and const char **.
|
|
||||||
Therefore, we let C++ figure it out, by providing a struct with an implicit conversion to both char** and const char **.
|
|
||||||
*/
|
|
||||||
struct sloppy_char
|
|
||||||
{
|
|
||||||
const char * const * t;
|
|
||||||
operator char** () const
|
|
||||||
{
|
|
||||||
return (char **)t;
|
|
||||||
}
|
|
||||||
operator const char** () const
|
|
||||||
{
|
|
||||||
return (const char**)t;
|
|
||||||
}
|
|
||||||
} slop_inbuf = {inbuf};
|
|
||||||
|
|
||||||
return iconv(cd, slop_inbuf, inbytesleft, outbuf, outbytesleft);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
static char *wcs2utf(const wchar_t *input)
|
||||||
Convert utf-8 string to wide string
|
|
||||||
*/
|
|
||||||
static wchar_t *utf2wcs(const char *in)
|
|
||||||
{
|
{
|
||||||
iconv_t cd=(iconv_t) -1;
|
char *result = NULL;
|
||||||
int i,j;
|
std::string converted;
|
||||||
|
if (wchar_to_utf8_string(input, &converted))
|
||||||
wchar_t *out;
|
|
||||||
|
|
||||||
/*
|
|
||||||
Try to convert to wchar_t. If that is not a valid character set,
|
|
||||||
try various names for ucs-4. We can't be sure that ucs-4 is
|
|
||||||
really the character set used by wchar_t, but it is the best
|
|
||||||
assumption we can make.
|
|
||||||
*/
|
|
||||||
const char **to_name=0;
|
|
||||||
|
|
||||||
switch (sizeof(wchar_t))
|
|
||||||
{
|
{
|
||||||
|
result = strdup(converted.c_str());
|
||||||
case 2:
|
|
||||||
to_name = iconv_wide_names_2;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 4:
|
|
||||||
to_name = iconv_wide_names_4;
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
to_name = iconv_wide_names_unknown;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
return result;
|
||||||
|
|
||||||
/*
|
|
||||||
The line protocol fish uses is always utf-8.
|
|
||||||
*/
|
|
||||||
const char **from_name = iconv_utf8_names;
|
|
||||||
|
|
||||||
size_t in_len = strlen(in);
|
|
||||||
size_t out_len = sizeof(wchar_t)*(in_len+2);
|
|
||||||
size_t nconv;
|
|
||||||
char *nout;
|
|
||||||
|
|
||||||
out = (wchar_t *)malloc(out_len);
|
|
||||||
nout = (char *)out;
|
|
||||||
|
|
||||||
if (!out)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
for (i=0; to_name[i]; i++)
|
|
||||||
{
|
|
||||||
for (j=0; from_name[j]; j++)
|
|
||||||
{
|
|
||||||
cd = iconv_open(to_name[i], from_name[j]);
|
|
||||||
|
|
||||||
if (cd != (iconv_t) -1)
|
|
||||||
{
|
|
||||||
goto start_conversion;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
start_conversion:
|
|
||||||
|
|
||||||
if (cd == (iconv_t) -1)
|
|
||||||
{
|
|
||||||
/* Something went wrong. */
|
|
||||||
debug(0, L"Could not perform utf-8 conversion");
|
|
||||||
if (errno != EINVAL)
|
|
||||||
wperror(L"iconv_open");
|
|
||||||
|
|
||||||
/* Terminate the output string. */
|
|
||||||
free(out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
|
|
||||||
OS X and Linux this one: size_t iconv (iconv_t, char **...)
|
|
||||||
AFAIK there's no single type that can be passed as both char ** and const char **.
|
|
||||||
Hence this hack.
|
|
||||||
*/
|
|
||||||
nconv = hack_iconv(cd, &in, &in_len, &nout, &out_len);
|
|
||||||
|
|
||||||
if (nconv == (size_t) -1)
|
|
||||||
{
|
|
||||||
debug(0, L"Error while converting from utf string");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
*((wchar_t *) nout) = L'\0';
|
|
||||||
|
|
||||||
/*
|
|
||||||
Check for silly iconv behaviour inserting an bytemark in the output
|
|
||||||
string.
|
|
||||||
*/
|
|
||||||
if (*out == L'\xfeff' || *out == L'\xffef' || *out == L'\xefbbbf')
|
|
||||||
{
|
|
||||||
wchar_t *out_old = out;
|
|
||||||
out = wcsdup(out+1);
|
|
||||||
if (! out)
|
|
||||||
{
|
|
||||||
debug(0, L"FNORD!!!!");
|
|
||||||
free(out_old);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
free(out_old);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
if (iconv_close(cd) != 0)
|
|
||||||
wperror(L"iconv_close");
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
Convert wide string to utf-8
|
|
||||||
*/
|
|
||||||
static char *wcs2utf(const wchar_t *in)
|
|
||||||
{
|
|
||||||
iconv_t cd=(iconv_t) -1;
|
|
||||||
int i,j;
|
|
||||||
|
|
||||||
char *char_in = (char *)in;
|
|
||||||
char *out;
|
|
||||||
|
|
||||||
/*
|
|
||||||
Try to convert to wchar_t. If that is not a valid character set,
|
|
||||||
try various names for ucs-4. We can't be sure that ucs-4 is
|
|
||||||
really the character set used by wchar_t, but it is the best
|
|
||||||
assumption we can make.
|
|
||||||
*/
|
|
||||||
const char **from_name=0;
|
|
||||||
|
|
||||||
switch (sizeof(wchar_t))
|
|
||||||
{
|
|
||||||
|
|
||||||
case 2:
|
|
||||||
from_name = iconv_wide_names_2;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 4:
|
|
||||||
from_name = iconv_wide_names_4;
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
from_name = iconv_wide_names_unknown;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const char **to_name = iconv_utf8_names;
|
|
||||||
|
|
||||||
size_t in_len = wcslen(in);
|
|
||||||
size_t out_len = sizeof(char)*((MAX_UTF8_BYTES*in_len)+1);
|
|
||||||
size_t nconv;
|
|
||||||
char *nout;
|
|
||||||
|
|
||||||
out = (char *)malloc(out_len);
|
|
||||||
nout = (char *)out;
|
|
||||||
in_len *= sizeof(wchar_t);
|
|
||||||
|
|
||||||
if (!out)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
for (i=0; to_name[i]; i++)
|
|
||||||
{
|
|
||||||
for (j=0; from_name[j]; j++)
|
|
||||||
{
|
|
||||||
cd = iconv_open(to_name[i], from_name[j]);
|
|
||||||
|
|
||||||
if (cd != (iconv_t) -1)
|
|
||||||
{
|
|
||||||
goto start_conversion;
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
start_conversion:
|
|
||||||
|
|
||||||
if (cd == (iconv_t) -1)
|
|
||||||
{
|
|
||||||
/* Something went wrong. */
|
|
||||||
debug(0, L"Could not perform utf-8 conversion");
|
|
||||||
if (errno != EINVAL)
|
|
||||||
wperror(L"iconv_open");
|
|
||||||
|
|
||||||
/* Terminate the output string. */
|
|
||||||
free(out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
nconv = hack_iconv(cd, &char_in, &in_len, &nout, &out_len);
|
|
||||||
|
|
||||||
|
|
||||||
if (nconv == (size_t) -1)
|
|
||||||
{
|
|
||||||
debug(0, L"%d %d", in_len, out_len);
|
|
||||||
debug(0, L"Error while converting from to string");
|
|
||||||
|
|
||||||
/* Terminate the output string. */
|
|
||||||
free(out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
*nout = '\0';
|
|
||||||
|
|
||||||
if (iconv_close(cd) != 0)
|
|
||||||
wperror(L"iconv_close");
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void env_universal_common_init(void (*cb)(fish_message_type_t type, const wchar_t *key, const wchar_t *val))
|
void env_universal_common_init(void (*cb)(fish_message_type_t type, const wchar_t *key, const wchar_t *val))
|
||||||
{
|
{
|
||||||
|
|
2
utf8.h
2
utf8.h
|
@ -28,9 +28,11 @@
|
||||||
#define UTF8_IGNORE_ERROR 0x01
|
#define UTF8_IGNORE_ERROR 0x01
|
||||||
#define UTF8_SKIP_BOM 0x02
|
#define UTF8_SKIP_BOM 0x02
|
||||||
|
|
||||||
|
/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if successful, storing the result of the conversion in *result */
|
||||||
bool utf8_to_wchar_string(const std::string &input, std::wstring *result);
|
bool utf8_to_wchar_string(const std::string &input, std::wstring *result);
|
||||||
bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
|
bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
|
||||||
|
|
||||||
|
/* Variants exposed for testing */
|
||||||
size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags);
|
size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags);
|
||||||
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue