mirror of
https://github.com/fish-shell/fish-shell
synced 2024-12-25 12:23:09 +00:00
Naive reimplementation of utf2wcs and wcs2utf in
env_universal_common.cpp. These use the new utf8 functions exposed in utf8.h. This will allow us to drop the iconv dependency.
This commit is contained in:
parent
a67dd9fbdd
commit
9718e70260
2 changed files with 17 additions and 290 deletions
|
@ -21,7 +21,6 @@
|
|||
#include <sys/stat.h>
|
||||
#include <dirent.h>
|
||||
#include <wctype.h>
|
||||
#include <iconv.h>
|
||||
|
||||
#include <errno.h>
|
||||
#include <locale.h>
|
||||
|
@ -39,6 +38,7 @@
|
|||
|
||||
#include "common.h"
|
||||
#include "wutil.h"
|
||||
#include "utf8.h"
|
||||
#include "env_universal_common.h"
|
||||
|
||||
/**
|
||||
|
@ -116,304 +116,29 @@ static void (*callback)(fish_message_type_t type,
|
|||
const wchar_t *key,
|
||||
const wchar_t *val);
|
||||
|
||||
/**
|
||||
List of names for the UTF-8 character set.
|
||||
*/
|
||||
static const char *iconv_utf8_names[]=
|
||||
/* UTF <-> wchar conversions. These return a string allocated with malloc. These call sites could be cleaned up substantially to eliminate the dependence on malloc. */
|
||||
static wchar_t *utf2wcs(const char *input)
|
||||
{
|
||||
"utf-8", "UTF-8",
|
||||
"utf8", "UTF8",
|
||||
0
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
List of wide character names, undefined byte length.
|
||||
*/
|
||||
static const char *iconv_wide_names_unknown[]=
|
||||
{
|
||||
"wchar_t", "WCHAR_T",
|
||||
"wchar", "WCHAR",
|
||||
0
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
List of wide character names, 4 bytes long.
|
||||
*/
|
||||
static const char *iconv_wide_names_4[]=
|
||||
{
|
||||
"wchar_t", "WCHAR_T",
|
||||
"wchar", "WCHAR",
|
||||
"ucs-4", "UCS-4",
|
||||
"ucs4", "UCS4",
|
||||
"utf-32", "UTF-32",
|
||||
"utf32", "UTF32",
|
||||
0
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
List of wide character names, 2 bytes long.
|
||||
*/
|
||||
static const char *iconv_wide_names_2[]=
|
||||
{
|
||||
"wchar_t", "WCHAR_T",
|
||||
"wchar", "WCHAR",
|
||||
"ucs-2", "UCS-2",
|
||||
"ucs2", "UCS2",
|
||||
"utf-16", "UTF-16",
|
||||
"utf16", "UTF16",
|
||||
0
|
||||
}
|
||||
;
|
||||
|
||||
template<class T>
|
||||
class sloppy {};
|
||||
|
||||
static size_t hack_iconv(iconv_t cd, const char * const* inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
|
||||
{
|
||||
/* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
|
||||
OS X and Linux this one: size_t iconv (iconv_t, char **...)
|
||||
AFAIK there's no single type that can be passed as both char ** and const char **.
|
||||
Therefore, we let C++ figure it out, by providing a struct with an implicit conversion to both char** and const char **.
|
||||
*/
|
||||
struct sloppy_char
|
||||
wchar_t *result = NULL;
|
||||
wcstring converted;
|
||||
if (utf8_to_wchar_string(input, &converted))
|
||||
{
|
||||
const char * const * t;
|
||||
operator char** () const
|
||||
{
|
||||
return (char **)t;
|
||||
}
|
||||
operator const char** () const
|
||||
{
|
||||
return (const char**)t;
|
||||
}
|
||||
} slop_inbuf = {inbuf};
|
||||
|
||||
return iconv(cd, slop_inbuf, inbytesleft, outbuf, outbytesleft);
|
||||
result = wcsdup(converted.c_str());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
Convert utf-8 string to wide string
|
||||
*/
|
||||
static wchar_t *utf2wcs(const char *in)
|
||||
static char *wcs2utf(const wchar_t *input)
|
||||
{
|
||||
iconv_t cd=(iconv_t) -1;
|
||||
int i,j;
|
||||
|
||||
wchar_t *out;
|
||||
|
||||
/*
|
||||
Try to convert to wchar_t. If that is not a valid character set,
|
||||
try various names for ucs-4. We can't be sure that ucs-4 is
|
||||
really the character set used by wchar_t, but it is the best
|
||||
assumption we can make.
|
||||
*/
|
||||
const char **to_name=0;
|
||||
|
||||
switch (sizeof(wchar_t))
|
||||
char *result = NULL;
|
||||
std::string converted;
|
||||
if (wchar_to_utf8_string(input, &converted))
|
||||
{
|
||||
|
||||
case 2:
|
||||
to_name = iconv_wide_names_2;
|
||||
break;
|
||||
|
||||
case 4:
|
||||
to_name = iconv_wide_names_4;
|
||||
break;
|
||||
|
||||
default:
|
||||
to_name = iconv_wide_names_unknown;
|
||||
break;
|
||||
result = strdup(converted.c_str());
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
The line protocol fish uses is always utf-8.
|
||||
*/
|
||||
const char **from_name = iconv_utf8_names;
|
||||
|
||||
size_t in_len = strlen(in);
|
||||
size_t out_len = sizeof(wchar_t)*(in_len+2);
|
||||
size_t nconv;
|
||||
char *nout;
|
||||
|
||||
out = (wchar_t *)malloc(out_len);
|
||||
nout = (char *)out;
|
||||
|
||||
if (!out)
|
||||
return 0;
|
||||
|
||||
for (i=0; to_name[i]; i++)
|
||||
{
|
||||
for (j=0; from_name[j]; j++)
|
||||
{
|
||||
cd = iconv_open(to_name[i], from_name[j]);
|
||||
|
||||
if (cd != (iconv_t) -1)
|
||||
{
|
||||
goto start_conversion;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
start_conversion:
|
||||
|
||||
if (cd == (iconv_t) -1)
|
||||
{
|
||||
/* Something went wrong. */
|
||||
debug(0, L"Could not perform utf-8 conversion");
|
||||
if (errno != EINVAL)
|
||||
wperror(L"iconv_open");
|
||||
|
||||
/* Terminate the output string. */
|
||||
free(out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
|
||||
OS X and Linux this one: size_t iconv (iconv_t, char **...)
|
||||
AFAIK there's no single type that can be passed as both char ** and const char **.
|
||||
Hence this hack.
|
||||
*/
|
||||
nconv = hack_iconv(cd, &in, &in_len, &nout, &out_len);
|
||||
|
||||
if (nconv == (size_t) -1)
|
||||
{
|
||||
debug(0, L"Error while converting from utf string");
|
||||
return 0;
|
||||
}
|
||||
|
||||
*((wchar_t *) nout) = L'\0';
|
||||
|
||||
/*
|
||||
Check for silly iconv behaviour inserting an bytemark in the output
|
||||
string.
|
||||
*/
|
||||
if (*out == L'\xfeff' || *out == L'\xffef' || *out == L'\xefbbbf')
|
||||
{
|
||||
wchar_t *out_old = out;
|
||||
out = wcsdup(out+1);
|
||||
if (! out)
|
||||
{
|
||||
debug(0, L"FNORD!!!!");
|
||||
free(out_old);
|
||||
return 0;
|
||||
}
|
||||
free(out_old);
|
||||
}
|
||||
|
||||
|
||||
if (iconv_close(cd) != 0)
|
||||
wperror(L"iconv_close");
|
||||
|
||||
return out;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
Convert wide string to utf-8
|
||||
*/
|
||||
static char *wcs2utf(const wchar_t *in)
|
||||
{
|
||||
iconv_t cd=(iconv_t) -1;
|
||||
int i,j;
|
||||
|
||||
char *char_in = (char *)in;
|
||||
char *out;
|
||||
|
||||
/*
|
||||
Try to convert to wchar_t. If that is not a valid character set,
|
||||
try various names for ucs-4. We can't be sure that ucs-4 is
|
||||
really the character set used by wchar_t, but it is the best
|
||||
assumption we can make.
|
||||
*/
|
||||
const char **from_name=0;
|
||||
|
||||
switch (sizeof(wchar_t))
|
||||
{
|
||||
|
||||
case 2:
|
||||
from_name = iconv_wide_names_2;
|
||||
break;
|
||||
|
||||
case 4:
|
||||
from_name = iconv_wide_names_4;
|
||||
break;
|
||||
|
||||
default:
|
||||
from_name = iconv_wide_names_unknown;
|
||||
break;
|
||||
}
|
||||
|
||||
const char **to_name = iconv_utf8_names;
|
||||
|
||||
size_t in_len = wcslen(in);
|
||||
size_t out_len = sizeof(char)*((MAX_UTF8_BYTES*in_len)+1);
|
||||
size_t nconv;
|
||||
char *nout;
|
||||
|
||||
out = (char *)malloc(out_len);
|
||||
nout = (char *)out;
|
||||
in_len *= sizeof(wchar_t);
|
||||
|
||||
if (!out)
|
||||
return 0;
|
||||
|
||||
for (i=0; to_name[i]; i++)
|
||||
{
|
||||
for (j=0; from_name[j]; j++)
|
||||
{
|
||||
cd = iconv_open(to_name[i], from_name[j]);
|
||||
|
||||
if (cd != (iconv_t) -1)
|
||||
{
|
||||
goto start_conversion;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
start_conversion:
|
||||
|
||||
if (cd == (iconv_t) -1)
|
||||
{
|
||||
/* Something went wrong. */
|
||||
debug(0, L"Could not perform utf-8 conversion");
|
||||
if (errno != EINVAL)
|
||||
wperror(L"iconv_open");
|
||||
|
||||
/* Terminate the output string. */
|
||||
free(out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
nconv = hack_iconv(cd, &char_in, &in_len, &nout, &out_len);
|
||||
|
||||
|
||||
if (nconv == (size_t) -1)
|
||||
{
|
||||
debug(0, L"%d %d", in_len, out_len);
|
||||
debug(0, L"Error while converting from to string");
|
||||
|
||||
/* Terminate the output string. */
|
||||
free(out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*nout = '\0';
|
||||
|
||||
if (iconv_close(cd) != 0)
|
||||
wperror(L"iconv_close");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void env_universal_common_init(void (*cb)(fish_message_type_t type, const wchar_t *key, const wchar_t *val))
|
||||
{
|
||||
callback = cb;
|
||||
|
|
2
utf8.h
2
utf8.h
|
@ -28,9 +28,11 @@
|
|||
#define UTF8_IGNORE_ERROR 0x01
|
||||
#define UTF8_SKIP_BOM 0x02
|
||||
|
||||
/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if successful, storing the result of the conversion in *result */
|
||||
bool utf8_to_wchar_string(const std::string &input, std::wstring *result);
|
||||
bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
|
||||
|
||||
/* Variants exposed for testing */
|
||||
size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags);
|
||||
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
||||
|
||||
|
|
Loading…
Reference in a new issue