Naive reimplementation of utf2wcs and wcs2utf in env_universal_common.cpp.

These use the new utf8 functions exposed in utf8.h. This will allow us to drop the iconv dependency.
2025-01-12 13:08:49 +00:00 · 2014-03-23 13:06:24 -07:00 · 2014-03-23 13:06:24 -07:00 · 9718e70260
commit 9718e70260
parent a67dd9fbdd
2 changed files with 17 additions and 290 deletions
--- a/env_universal_common.cpp
+++ b/env_universal_common.cpp
@ -21,7 +21,6 @@
 #include <sys/stat.h>
 #include <dirent.h>
 #include <wctype.h>
 #include <iconv.h>
 #include <errno.h>
 #include <locale.h>
@ -39,6 +38,7 @@
 #include "common.h"
 #include "wutil.h"
 #include "utf8.h"
 #include "env_universal_common.h"
 /**
@ -116,304 +116,29 @@ static void (*callback)(fish_message_type_t type,
                        const wchar_t *key,
                        const wchar_t *val);
-/**
+/* UTF <-> wchar conversions. These return a string allocated with malloc. These call sites could be cleaned up substantially to eliminate the dependence on malloc. */
-   List of names for the UTF-8 character set.
+static wchar_t *utf2wcs(const char *input)
 */
 static const char *iconv_utf8_names[]=
 {
-    "utf-8", "UTF-8",
+    wchar_t *result = NULL;
-    "utf8", "UTF8",
+    wcstring converted;
-    0
+    if (utf8_to_wchar_string(input, &converted))
 }
 ;
 /**
    Names to try for the wide character set when the byte length of
    wchar_t is not known. The list is terminated by a null pointer.
 */
 static const char *iconv_wide_names_unknown[] = {
    "wchar_t",
    "WCHAR_T",
    "wchar",
    "WCHAR",
    0
 };
 /**
    Names to try for the wide character set when wchar_t is 4 bytes
    long. The list is terminated by a null pointer.
 */
 static const char *iconv_wide_names_4[] = {
    "wchar_t",
    "WCHAR_T",
    "wchar",
    "WCHAR",
    "ucs-4",
    "UCS-4",
    "ucs4",
    "UCS4",
    "utf-32",
    "UTF-32",
    "utf32",
    "UTF32",
    0
 };
 /**
    Names to try for the wide character set when wchar_t is 2 bytes
    long. The list is terminated by a null pointer.
 */
 static const char *iconv_wide_names_2[] = {
    "wchar_t",
    "WCHAR_T",
    "wchar",
    "WCHAR",
    "ucs-2",
    "UCS-2",
    "ucs2",
    "UCS2",
    "utf-16",
    "UTF-16",
    "utf16",
    "UTF16",
    0
 };
 /* Empty class template with no members. */
 template <typename T>
 class sloppy
 {
 };
 static size_t hack_iconv(iconv_t cd, const char * const* inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
 {
    /* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
       OS X and Linux this one: size_t iconv (iconv_t, char **...)
       AFAIK there's no single type that can be passed as both char ** and const char **.
       Therefore, we let C++ figure it out, by providing a struct with an implicit conversion to both char** and const char **.
    */
    struct sloppy_char
    {
-        const char * const * t;
+        result = wcsdup(converted.c_str());
-        operator char** () const
+    }
-        {
+    return result;
            return (char **)t;
        }
        operator const char** () const
        {
            return (const char**)t;
        }
    } slop_inbuf = {inbuf};
    return iconv(cd, slop_inbuf, inbytesleft, outbuf, outbytesleft);
 }
-/**
+static char *wcs2utf(const wchar_t *input)
   Convert utf-8 string to wide string
 */
 static wchar_t *utf2wcs(const char *in)
 {
-    iconv_t cd=(iconv_t) -1;
+    char *result = NULL;
-    int i,j;
+    std::string converted;
-
+    if (wchar_to_utf8_string(input, &converted))
    wchar_t *out;
    /*
      Try to convert to wchar_t. If that is not a valid character set,
      try various names for ucs-4. We can't be sure that ucs-4 is
      really the character set used by wchar_t, but it is the best
      assumption we can make.
    */
    const char **to_name=0;
    switch (sizeof(wchar_t))
    {
-
+        result = strdup(converted.c_str());
        case 2:
            to_name = iconv_wide_names_2;
            break;
        case 4:
            to_name = iconv_wide_names_4;
            break;
        default:
            to_name = iconv_wide_names_unknown;
            break;
    }
-
+    return result;
    /*
      The line protocol fish uses is always utf-8.
    */
    const char **from_name = iconv_utf8_names;
    size_t in_len = strlen(in);
    size_t out_len =  sizeof(wchar_t)*(in_len+2);
    size_t nconv;
    char *nout;
    out = (wchar_t *)malloc(out_len);
    nout = (char *)out;
    if (!out)
        return 0;
    for (i=0; to_name[i]; i++)
    {
        for (j=0; from_name[j]; j++)
        {
            cd = iconv_open(to_name[i], from_name[j]);
            if (cd != (iconv_t) -1)
            {
                goto start_conversion;
            }
        }
    }
 start_conversion:
    if (cd == (iconv_t) -1)
    {
        /* Something went wrong.  */
        debug(0, L"Could not perform utf-8 conversion");
        if (errno != EINVAL)
            wperror(L"iconv_open");
        /* Terminate the output string.  */
        free(out);
        return 0;
    }
    /* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
       OS X and Linux this one: size_t iconv (iconv_t, char **...)
       AFAIK there's no single type that can be passed as both char ** and const char **.
       Hence this hack.
    */
    nconv = hack_iconv(cd, &in, &in_len, &nout, &out_len);
    if (nconv == (size_t) -1)
    {
        debug(0, L"Error while converting from utf string");
        return 0;
    }
    *((wchar_t *) nout) = L'\0';
    /*
      Check for silly iconv behaviour inserting an bytemark in the output
      string.
     */
    if (*out == L'\xfeff' || *out == L'\xffef' || *out == L'\xefbbbf')
    {
        wchar_t *out_old = out;
        out = wcsdup(out+1);
        if (! out)
        {
            debug(0, L"FNORD!!!!");
            free(out_old);
            return 0;
        }
        free(out_old);
    }
    if (iconv_close(cd) != 0)
        wperror(L"iconv_close");
    return out;
 }
 /**
   Convert a wide string to utf-8 using iconv.
   \param in the null-terminated wide string to convert
   \return a null-terminated utf-8 string allocated with malloc(), which
   the caller must free(), or 0 if in is null, the allocation fails, no
   iconv conversion could be opened, or the conversion itself fails
 */
 static char *wcs2utf(const wchar_t *in)
 {
    /* wcslen() on a null pointer is undefined, so reject it up front */
    if (!in)
        return 0;
    /*
      Pick the candidate names for the wchar_t character set based on
      the size of wchar_t. We can't be sure that ucs-2/ucs-4 is really
      the character set used by wchar_t, but it is the best assumption
      we can make.
    */
    const char **from_name=0;
    switch (sizeof(wchar_t))
    {
        case 2:
            from_name = iconv_wide_names_2;
            break;
        case 4:
            from_name = iconv_wide_names_4;
            break;
        default:
            from_name = iconv_wide_names_unknown;
            break;
    }
    const char **to_name = iconv_utf8_names;
    size_t in_len = wcslen(in);
    /* Worst case: every wide character expands to MAX_UTF8_BYTES bytes, plus the terminator */
    size_t out_len =  sizeof(char)*((MAX_UTF8_BYTES*in_len)+1);
    char *out = (char *)malloc(out_len);
    if (!out)
        return 0;
    char *nout = out;
    char *char_in = (char *)in;
    /* iconv counts its input in bytes, not in wide characters */
    in_len *= sizeof(wchar_t);
    /* Use the first pair of character set names that iconv_open accepts */
    iconv_t cd=(iconv_t) -1;
    for (int i=0; to_name[i] && cd == (iconv_t) -1; i++)
    {
        for (int j=0; from_name[j] && cd == (iconv_t) -1; j++)
        {
            cd = iconv_open(to_name[i], from_name[j]);
        }
    }
    if (cd == (iconv_t) -1)
    {
        /* Something went wrong.  */
        debug(0, L"Could not perform utf-8 conversion");
        if (errno != EINVAL)
            wperror(L"iconv_open");
        /* No descriptor was opened, so only the buffer needs releasing */
        free(out);
        return 0;
    }
    size_t nconv = hack_iconv(cd, &char_in, &in_len, &nout, &out_len);
    if (nconv == (size_t) -1)
    {
        /* The lengths are size_t, so cast them to match the format specifiers */
        debug(0, L"%lu %lu", (unsigned long)in_len, (unsigned long)out_len);
        debug(0, L"Error while converting from to string");
        /* Release both the output buffer and the conversion descriptor */
        free(out);
        iconv_close(cd);
        return 0;
    }
    /* Terminate the output string.  */
    *nout = '\0';
    if (iconv_close(cd) != 0)
        wperror(L"iconv_close");
    return out;
 }
 void env_universal_common_init(void (*cb)(fish_message_type_t type, const wchar_t *key, const wchar_t *val))
 {
    callback = cb;
--- a/utf8.h
+++ b/utf8.h
@ -28,9 +28,11 @@
 #define UTF8_IGNORE_ERROR		0x01
 #define UTF8_SKIP_BOM			0x02
 /* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if successful, storing the result of the conversion in *result */
 bool utf8_to_wchar_string(const std::string &input, std::wstring *result);
 bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
 /* Variants exposed for testing */
 size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags);
 size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);