Naive reimplementation of utf2wcs and wcs2utf in env_universal_common.cpp.

These use the new UTF-8 functions exposed in utf8.h. This will allow us to drop the iconv dependency.
This commit is contained in:
ridiculousfish 2014-03-23 13:06:24 -07:00
parent a67dd9fbdd
commit 9718e70260
2 changed files with 17 additions and 290 deletions

View file

@ -21,7 +21,6 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <dirent.h> #include <dirent.h>
#include <wctype.h> #include <wctype.h>
#include <iconv.h>
#include <errno.h> #include <errno.h>
#include <locale.h> #include <locale.h>
@ -39,6 +38,7 @@
#include "common.h" #include "common.h"
#include "wutil.h" #include "wutil.h"
#include "utf8.h"
#include "env_universal_common.h" #include "env_universal_common.h"
/** /**
@ -116,303 +116,28 @@ static void (*callback)(fish_message_type_t type,
const wchar_t *key, const wchar_t *key,
const wchar_t *val); const wchar_t *val);
/** /* UTF <-> wchar conversions. These return a string allocated with malloc. These call sites could be cleaned up substantially to eliminate the dependence on malloc. */
List of names for the UTF-8 character set. static wchar_t *utf2wcs(const char *input)
*/
static const char *iconv_utf8_names[]=
{ {
"utf-8", "UTF-8", wchar_t *result = NULL;
"utf8", "UTF8", wcstring converted;
0 if (utf8_to_wchar_string(input, &converted))
{
result = wcsdup(converted.c_str());
} }
; return result;
/**
List of wide character names, undefined byte length.
*/
static const char *iconv_wide_names_unknown[]=
{
"wchar_t", "WCHAR_T",
"wchar", "WCHAR",
0
}
;
/**
List of wide character names, 4 bytes long.
*/
static const char *iconv_wide_names_4[]=
{
"wchar_t", "WCHAR_T",
"wchar", "WCHAR",
"ucs-4", "UCS-4",
"ucs4", "UCS4",
"utf-32", "UTF-32",
"utf32", "UTF32",
0
}
;
/**
List of wide character names, 2 bytes long.
*/
static const char *iconv_wide_names_2[]=
{
"wchar_t", "WCHAR_T",
"wchar", "WCHAR",
"ucs-2", "UCS-2",
"ucs2", "UCS2",
"utf-16", "UTF-16",
"utf16", "UTF16",
0
}
;
template<class T>
class sloppy {};
static size_t hack_iconv(iconv_t cd, const char * const* inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
{
/* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
OS X and Linux this one: size_t iconv (iconv_t, char **...)
AFAIK there's no single type that can be passed as both char ** and const char **.
Therefore, we let C++ figure it out, by providing a struct with an implicit conversion to both char** and const char **.
*/
struct sloppy_char
{
const char * const * t;
operator char** () const
{
return (char **)t;
}
operator const char** () const
{
return (const char**)t;
}
} slop_inbuf = {inbuf};
return iconv(cd, slop_inbuf, inbytesleft, outbuf, outbytesleft);
} }
/** static char *wcs2utf(const wchar_t *input)
Convert utf-8 string to wide string
*/
static wchar_t *utf2wcs(const char *in)
{ {
iconv_t cd=(iconv_t) -1; char *result = NULL;
int i,j; std::string converted;
if (wchar_to_utf8_string(input, &converted))
wchar_t *out;
/*
Try to convert to wchar_t. If that is not a valid character set,
try various names for ucs-4. We can't be sure that ucs-4 is
really the character set used by wchar_t, but it is the best
assumption we can make.
*/
const char **to_name=0;
switch (sizeof(wchar_t))
{ {
result = strdup(converted.c_str());
case 2:
to_name = iconv_wide_names_2;
break;
case 4:
to_name = iconv_wide_names_4;
break;
default:
to_name = iconv_wide_names_unknown;
break;
} }
return result;
/*
The line protocol fish uses is always utf-8.
*/
const char **from_name = iconv_utf8_names;
size_t in_len = strlen(in);
size_t out_len = sizeof(wchar_t)*(in_len+2);
size_t nconv;
char *nout;
out = (wchar_t *)malloc(out_len);
nout = (char *)out;
if (!out)
return 0;
for (i=0; to_name[i]; i++)
{
for (j=0; from_name[j]; j++)
{
cd = iconv_open(to_name[i], from_name[j]);
if (cd != (iconv_t) -1)
{
goto start_conversion;
} }
}
}
start_conversion:
if (cd == (iconv_t) -1)
{
/* Something went wrong. */
debug(0, L"Could not perform utf-8 conversion");
if (errno != EINVAL)
wperror(L"iconv_open");
/* Terminate the output string. */
free(out);
return 0;
}
/* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
OS X and Linux this one: size_t iconv (iconv_t, char **...)
AFAIK there's no single type that can be passed as both char ** and const char **.
Hence this hack.
*/
nconv = hack_iconv(cd, &in, &in_len, &nout, &out_len);
if (nconv == (size_t) -1)
{
debug(0, L"Error while converting from utf string");
return 0;
}
*((wchar_t *) nout) = L'\0';
/*
Check for silly iconv behaviour inserting an bytemark in the output
string.
*/
if (*out == L'\xfeff' || *out == L'\xffef' || *out == L'\xefbbbf')
{
wchar_t *out_old = out;
out = wcsdup(out+1);
if (! out)
{
debug(0, L"FNORD!!!!");
free(out_old);
return 0;
}
free(out_old);
}
if (iconv_close(cd) != 0)
wperror(L"iconv_close");
return out;
}
/**
Convert wide string to utf-8
*/
static char *wcs2utf(const wchar_t *in)
{
iconv_t cd=(iconv_t) -1;
int i,j;
char *char_in = (char *)in;
char *out;
/*
Try to convert to wchar_t. If that is not a valid character set,
try various names for ucs-4. We can't be sure that ucs-4 is
really the character set used by wchar_t, but it is the best
assumption we can make.
*/
const char **from_name=0;
switch (sizeof(wchar_t))
{
case 2:
from_name = iconv_wide_names_2;
break;
case 4:
from_name = iconv_wide_names_4;
break;
default:
from_name = iconv_wide_names_unknown;
break;
}
const char **to_name = iconv_utf8_names;
size_t in_len = wcslen(in);
size_t out_len = sizeof(char)*((MAX_UTF8_BYTES*in_len)+1);
size_t nconv;
char *nout;
out = (char *)malloc(out_len);
nout = (char *)out;
in_len *= sizeof(wchar_t);
if (!out)
return 0;
for (i=0; to_name[i]; i++)
{
for (j=0; from_name[j]; j++)
{
cd = iconv_open(to_name[i], from_name[j]);
if (cd != (iconv_t) -1)
{
goto start_conversion;
}
}
}
start_conversion:
if (cd == (iconv_t) -1)
{
/* Something went wrong. */
debug(0, L"Could not perform utf-8 conversion");
if (errno != EINVAL)
wperror(L"iconv_open");
/* Terminate the output string. */
free(out);
return 0;
}
nconv = hack_iconv(cd, &char_in, &in_len, &nout, &out_len);
if (nconv == (size_t) -1)
{
debug(0, L"%d %d", in_len, out_len);
debug(0, L"Error while converting from to string");
/* Terminate the output string. */
free(out);
return 0;
}
*nout = '\0';
if (iconv_close(cd) != 0)
wperror(L"iconv_close");
return out;
}
void env_universal_common_init(void (*cb)(fish_message_type_t type, const wchar_t *key, const wchar_t *val)) void env_universal_common_init(void (*cb)(fish_message_type_t type, const wchar_t *key, const wchar_t *val))
{ {

2
utf8.h
View file

@ -28,9 +28,11 @@
#define UTF8_IGNORE_ERROR 0x01 #define UTF8_IGNORE_ERROR 0x01
#define UTF8_SKIP_BOM 0x02 #define UTF8_SKIP_BOM 0x02
/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if successful, storing the result of the conversion in *result */
bool utf8_to_wchar_string(const std::string &input, std::wstring *result); bool utf8_to_wchar_string(const std::string &input, std::wstring *result);
bool wchar_to_utf8_string(const std::wstring &input, std::string *result); bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
/* Variants exposed for testing */
size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags); size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags);
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags); size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);