From 9718e70260fd44ede9409711d7d0a8efcf131c20 Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Sun, 23 Mar 2014 13:06:24 -0700
Subject: [PATCH] Naive reimplementation of utf2wcs and wcs2utf in
 env_universal_common.cpp. These use the new utf8 functions exposed in utf8.h.
 This will allow us to drop the iconv dependency.

---
 env_universal_common.cpp | 305 ++-------------------------------------
 utf8.h                   |   2 +
 2 files changed, 17 insertions(+), 290 deletions(-)
diff --git a/env_universal_common.cpp b/env_universal_common.cpp
index c175b4a14..048396afa 100644
--- a/env_universal_common.cpp
+++ b/env_universal_common.cpp
@@ -21,7 +21,6 @@
 #include <sys/stat.h>
 #include <dirent.h>
 #include <wctype.h>
-#include <iconv.h>
 
 #include <errno.h>
 #include <locale.h>
@@ -39,6 +38,7 @@
 
 #include "common.h"
 #include "wutil.h"
+#include "utf8.h"
 #include "env_universal_common.h"
 
 /**
@@ -116,304 +116,29 @@ static void (*callback)(fish_message_type_t type,
                         const wchar_t *key,
                         const wchar_t *val);
 
-/**
-   List of names for the UTF-8 character set.
- */
-static const char *iconv_utf8_names[]=
+/* UTF-8 <-> wchar_t conversions. Each returns a string allocated with malloc (which the caller must free), or NULL on failure. The call sites could be cleaned up substantially to eliminate the dependence on malloc. */
+static wchar_t *utf2wcs(const char *input)
 {
-    "utf-8", "UTF-8",
-    "utf8", "UTF8",
-    0
-}
-;
-
-/**
-    List of wide character names, undefined byte length.
- */
-static const char *iconv_wide_names_unknown[]=
-{
-    "wchar_t", "WCHAR_T",
-    "wchar", "WCHAR",
-    0
-}
-;
-
-/**
-   List of wide character names, 4 bytes long.
- */
-static const char *iconv_wide_names_4[]=
-{
-    "wchar_t", "WCHAR_T",
-    "wchar", "WCHAR",
-    "ucs-4", "UCS-4",
-    "ucs4", "UCS4",
-    "utf-32", "UTF-32",
-    "utf32", "UTF32",
-    0
-}
-;
-
-/**
-   List of wide character names, 2 bytes long.
- */
-static const char *iconv_wide_names_2[]=
-{
-    "wchar_t", "WCHAR_T",
-    "wchar", "WCHAR",
-    "ucs-2", "UCS-2",
-    "ucs2", "UCS2",
-    "utf-16", "UTF-16",
-    "utf16", "UTF16",
-    0
-}
-;
-
-template<class T>
-class sloppy {};
-
-static size_t hack_iconv(iconv_t cd, const char * const* inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
-{
-    /* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
-       OS X and Linux this one: size_t iconv (iconv_t, char **...)
-       AFAIK there's no single type that can be passed as both char ** and const char **.
-       Therefore, we let C++ figure it out, by providing a struct with an implicit conversion to both char** and const char **.
-    */
-    struct sloppy_char
+    wchar_t *result = NULL;
+    wcstring converted;
+    if (utf8_to_wchar_string(input, &converted))
     {
-        const char * const * t;
-        operator char** () const
-        {
-            return (char **)t;
-        }
-        operator const char** () const
-        {
-            return (const char**)t;
-        }
-    } slop_inbuf = {inbuf};
-
-    return iconv(cd, slop_inbuf, inbytesleft, outbuf, outbytesleft);
+        result = wcsdup(converted.c_str());
+    }
+    return result;
 }
 
-/**
-   Convert utf-8 string to wide string
- */
-static wchar_t *utf2wcs(const char *in)
+static char *wcs2utf(const wchar_t *input)
 {
-    iconv_t cd=(iconv_t) -1;
-    int i,j;
-
-    wchar_t *out;
-
-    /*
-      Try to convert to wchar_t. If that is not a valid character set,
-      try various names for ucs-4. We can't be sure that ucs-4 is
-      really the character set used by wchar_t, but it is the best
-      assumption we can make.
-    */
-    const char **to_name=0;
-
-    switch (sizeof(wchar_t))
+    char *result = NULL;
+    std::string converted;
+    if (wchar_to_utf8_string(input, &converted))
     {
-
-        case 2:
-            to_name = iconv_wide_names_2;
-            break;
-
-        case 4:
-            to_name = iconv_wide_names_4;
-            break;
-
-        default:
-            to_name = iconv_wide_names_unknown;
-            break;
+        result = strdup(converted.c_str());
     }
-
-
-    /*
-      The line protocol fish uses is always utf-8.
-    */
-    const char **from_name = iconv_utf8_names;
-
-    size_t in_len = strlen(in);
-    size_t out_len =  sizeof(wchar_t)*(in_len+2);
-    size_t nconv;
-    char *nout;
-
-    out = (wchar_t *)malloc(out_len);
-    nout = (char *)out;
-
-    if (!out)
-        return 0;
-
-    for (i=0; to_name[i]; i++)
-    {
-        for (j=0; from_name[j]; j++)
-        {
-            cd = iconv_open(to_name[i], from_name[j]);
-
-            if (cd != (iconv_t) -1)
-            {
-                goto start_conversion;
-
-            }
-        }
-    }
-
-start_conversion:
-
-    if (cd == (iconv_t) -1)
-    {
-        /* Something went wrong.  */
-        debug(0, L"Could not perform utf-8 conversion");
-        if (errno != EINVAL)
-            wperror(L"iconv_open");
-
-        /* Terminate the output string.  */
-        free(out);
-        return 0;
-    }
-
-    /* FreeBSD has this prototype: size_t iconv (iconv_t, const char **...)
-       OS X and Linux this one: size_t iconv (iconv_t, char **...)
-       AFAIK there's no single type that can be passed as both char ** and const char **.
-       Hence this hack.
-    */
-    nconv = hack_iconv(cd, &in, &in_len, &nout, &out_len);
-
-    if (nconv == (size_t) -1)
-    {
-        debug(0, L"Error while converting from utf string");
-        return 0;
-    }
-
-    *((wchar_t *) nout) = L'\0';
-
-    /*
-      Check for silly iconv behaviour inserting an bytemark in the output
-      string.
-     */
-    if (*out == L'\xfeff' || *out == L'\xffef' || *out == L'\xefbbbf')
-    {
-        wchar_t *out_old = out;
-        out = wcsdup(out+1);
-        if (! out)
-        {
-            debug(0, L"FNORD!!!!");
-            free(out_old);
-            return 0;
-        }
-        free(out_old);
-    }
-
-
-    if (iconv_close(cd) != 0)
-        wperror(L"iconv_close");
-
-    return out;
+    return result;
 }
 
-
-
-/**
-   Convert wide string to utf-8
- */
-static char *wcs2utf(const wchar_t *in)
-{
-    iconv_t cd=(iconv_t) -1;
-    int i,j;
-
-    char *char_in = (char *)in;
-    char *out;
-
-    /*
-      Try to convert to wchar_t. If that is not a valid character set,
-      try various names for ucs-4. We can't be sure that ucs-4 is
-      really the character set used by wchar_t, but it is the best
-      assumption we can make.
-    */
-    const char **from_name=0;
-
-    switch (sizeof(wchar_t))
-    {
-
-        case 2:
-            from_name = iconv_wide_names_2;
-            break;
-
-        case 4:
-            from_name = iconv_wide_names_4;
-            break;
-
-        default:
-            from_name = iconv_wide_names_unknown;
-            break;
-    }
-
-    const char **to_name = iconv_utf8_names;
-
-    size_t in_len = wcslen(in);
-    size_t out_len =  sizeof(char)*((MAX_UTF8_BYTES*in_len)+1);
-    size_t nconv;
-    char *nout;
-
-    out = (char *)malloc(out_len);
-    nout = (char *)out;
-    in_len *= sizeof(wchar_t);
-
-    if (!out)
-        return 0;
-
-    for (i=0; to_name[i]; i++)
-    {
-        for (j=0; from_name[j]; j++)
-        {
-            cd = iconv_open(to_name[i], from_name[j]);
-
-            if (cd != (iconv_t) -1)
-            {
-                goto start_conversion;
-
-            }
-        }
-    }
-
-start_conversion:
-
-    if (cd == (iconv_t) -1)
-    {
-        /* Something went wrong.  */
-        debug(0, L"Could not perform utf-8 conversion");
-        if (errno != EINVAL)
-            wperror(L"iconv_open");
-
-        /* Terminate the output string.  */
-        free(out);
-        return 0;
-    }
-
-    nconv = hack_iconv(cd, &char_in, &in_len, &nout, &out_len);
-
-
-    if (nconv == (size_t) -1)
-    {
-        debug(0, L"%d %d", in_len, out_len);
-        debug(0, L"Error while converting from to string");
-
-        /* Terminate the output string.  */
-        free(out);
-        return 0;
-    }
-
-    *nout = '\0';
-
-    if (iconv_close(cd) != 0)
-        wperror(L"iconv_close");
-
-    return out;
-}
-
-
-
 void env_universal_common_init(void (*cb)(fish_message_type_t type, const wchar_t *key, const wchar_t *val))
 {
     callback = cb;
diff --git a/utf8.h b/utf8.h
index 18aa52658..a1f501545 100644
--- a/utf8.h
+++ b/utf8.h
@@ -28,9 +28,11 @@
 #define UTF8_IGNORE_ERROR		0x01
 #define UTF8_SKIP_BOM			0x02
 
+/* Convert a string between UTF-8 and UCS-2/UCS-4 (depending on the size of wchar_t). Returns true if successful, storing the result of the conversion in *result. */
 bool utf8_to_wchar_string(const std::string &input, std::wstring *result);
 bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
 
+/* Variants exposed for testing */
 size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags);
 size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);