From ee44879d4d6ae28968885823db5f0ce13e5a6dec Mon Sep 17 00:00:00 2001 From: Kurtis Rader Date: Tue, 3 May 2016 15:05:47 -0700 Subject: [PATCH] restyle utf8 module to match project style Reduces lint errors from 63 to 57 (-10%). Line count from 518 to 418 (-19%). Another step in resolving issue #2902. --- src/utf8.cpp | 398 +++++++++++++++++++-------------------------------- src/utf8.h | 14 +- 2 files changed, 156 insertions(+), 256 deletions(-) diff --git a/src/utf8.cpp b/src/utf8.cpp index 7c876a311..ceec637e9 100644 --- a/src/utf8.cpp +++ b/src/utf8.cpp @@ -13,44 +13,40 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#include #include // IWYU pragma: keep -#include +#include #include +#include #include "utf8.h" -#define _NXT 0x80 -#define _SEQ2 0xc0 -#define _SEQ3 0xe0 -#define _SEQ4 0xf0 -#define _SEQ5 0xf8 -#define _SEQ6 0xfc +#define _NXT 0x80 +#define _SEQ2 0xc0 +#define _SEQ3 0xe0 +#define _SEQ4 0xf0 +#define _SEQ5 0xf8 +#define _SEQ6 0xfc -#define _BOM 0xfeff +#define _BOM 0xfeff -/* We can tweak the following typedef to allow us to simulate Windows-style 16 bit wchar's on Unix */ +// We can tweak the following typedef to allow us to simulate Windows-style 16 bit wchar's on Unix. 
typedef wchar_t utf8_wchar_t; #define UTF8_WCHAR_MAX ((size_t)std::numeric_limits::max()) typedef std::basic_string utf8_wstring_t; -bool is_wchar_ucs2() -{ - return UTF8_WCHAR_MAX <= 0xFFFF; -} +bool is_wchar_ucs2() { return UTF8_WCHAR_MAX <= 0xFFFF; } -static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *result, int flags); -static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags); +static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *result, + int flags); +static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, + size_t outsize, int flags); -static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, size_t count) -{ +static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, size_t count) { bool result = true; - for (size_t i=0; i < count; i++) - { + for (size_t i = 0; i < count; i++) { wchar_t c = in[i]; - if (c > UTF8_WCHAR_MAX) - { + if (c > UTF8_WCHAR_MAX) { result = false; break; } @@ -59,24 +55,20 @@ static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, return result; } -bool wchar_to_utf8_string(const std::wstring &str, std::string *result) -{ +bool wchar_to_utf8_string(const std::wstring &str, std::string *result) { result->clear(); const size_t inlen = str.size(); - if (inlen == 0) - { + if (inlen == 0) { return true; } bool success = false; const wchar_t *input = str.c_str(); size_t outlen = wchar_to_utf8(input, inlen, NULL, 0, 0); - if (outlen > 0) - { + if (outlen > 0) { char *tmp = new char[outlen]; size_t outlen2 = wchar_to_utf8(input, inlen, tmp, outlen, 0); - if (outlen2 > 0) - { + if (outlen2 > 0) { result->assign(tmp, outlen2); success = true; } @@ -85,27 +77,19 @@ bool wchar_to_utf8_string(const std::wstring &str, std::string *result) return success; } -size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int 
flags) -{ - if (in == NULL || insize == 0) - { +size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int flags) { + if (in == NULL || insize == 0) { return 0; } size_t result; - if (sizeof(wchar_t) == sizeof(utf8_wchar_t)) - { + if (sizeof(wchar_t) == sizeof(utf8_wchar_t)) { result = utf8_to_wchar_internal(in, insize, reinterpret_cast(out), flags); - } - else if (out == NULL) - { + } else if (out == NULL) { result = utf8_to_wchar_internal(in, insize, NULL, flags); - } - else - { - // Allocate a temporary buffer to hold the output, - // invoke the conversion with the temporary, - // and then copy it back + } else { + // Allocate a temporary buffer to hold the output, invoke the conversion with the temporary, + // and then copy it back. utf8_wstring_t tmp_output; result = utf8_to_wchar_internal(in, insize, &tmp_output, flags); out->insert(out->end(), tmp_output.begin(), tmp_output.end()); @@ -113,32 +97,24 @@ size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int flags return result; } -size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags) -{ - if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) - { +size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags) { + if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) { return 0; } size_t result; - if (sizeof(wchar_t) == sizeof(utf8_wchar_t)) - { - result = wchar_to_utf8_internal(reinterpret_cast(in), insize, out, outsize, flags); - } - else - { - // Allocate a temporary buffer to hold the input - // the std::copy performs the size conversion - // note: insize may be 0 + if (sizeof(wchar_t) == sizeof(utf8_wchar_t)) { + result = wchar_to_utf8_internal(reinterpret_cast(in), insize, out, + outsize, flags); + } else { + // Allocate a temporary buffer to hold the input; the copy performs the size conversion. + // Note: insize is nonzero here (a zero insize returns early above). 
utf8_wchar_t *tmp_input = new utf8_wchar_t[insize]; - if (! safe_copy_wchar_to_utf8_wchar(in, tmp_input, insize)) - { - // our utf8_wchar_t is UCS-16 and there was an astral character + if (!safe_copy_wchar_to_utf8_wchar(in, tmp_input, insize)) { + // Our utf8_wchar_t is UCS-2 and there was an astral character. result = 0; - } - else - { - // Invoke the conversion with the temporary, then clean up the input + } else { + // Invoke the conversion with the temporary, then clean up the input. result = wchar_to_utf8_internal(tmp_input, insize, out, outsize, flags); } delete[] tmp_input; @@ -146,191 +122,136 @@ size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize return result; } - static int __wchar_forbitten(utf8_wchar_t sym); static int __utf8_forbitten(unsigned char octet); -static int -__wchar_forbitten(utf8_wchar_t sym) -{ - - /* Surrogate pairs */ - if (sym >= 0xd800 && sym <= 0xdfff) - return (-1); +static int __wchar_forbitten(utf8_wchar_t sym) { + // Surrogate pairs. + if (sym >= 0xd800 && sym <= 0xdfff) return (-1); return (0); } -static int -__utf8_forbitten(unsigned char octet) -{ - - switch (octet) - { +static int __utf8_forbitten(unsigned char octet) { + switch (octet) { case 0xc0: case 0xc1: case 0xf5: - case 0xff: + case 0xff: { return (-1); + } } return (0); } -/* - * DESCRIPTION - * This function translates UTF-8 string into UCS-2 or UCS-4 string (all symbols - * will be in local machine byte order). - * - * It takes the following arguments: - * in - input UTF-8 string. It can be null-terminated. - * insize - size of input string in bytes. - * out_string - result buffer for UCS-2/4 string. - * - * RETURN VALUES - * The function returns size of result buffer (in wide characters). - * Zero is returned in case of error. - * - * CAVEATS - * 1. If UTF-8 string contains zero symbols, they will be translated - * as regular symbols. - * 2. 
If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary - * when `out' is NULL and not NULL. It's because of special UTF-8 - * sequences which may result in forbitten (by RFC3629) UNICODE - * characters. So, the caller must check return value every time and - * not prepare buffer in advance (\0 terminate) but after calling this - * function. - */ -static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *out_string, int flags) -{ +/// This function translates UTF-8 string into UCS-2 or UCS-4 string (all symbols will be in local +/// machine byte order). It takes the following arguments: +/// +/// in - input UTF-8 string. It can be null-terminated. +/// insize - size of input string in bytes. +/// out_string - result buffer for UCS-2/4 string. +/// +/// RETURN VALUES +/// The function returns size of result buffer (in wide characters). +/// Zero is returned in case of error. +/// +/// CAVEATS +/// 1. If UTF-8 string contains zero symbols, they will be translated +/// as regular symbols. +/// +/// 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary when `out_string' is NULL +/// and not NULL. This is because special UTF-8 sequences may decode to characters forbidden by +/// RFC3629. So the caller must check the return value every time and \0-terminate the buffer +/// only after calling this function, not in advance. 
+static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *out_string, + int flags) { unsigned char *p, *lim; utf8_wchar_t high; size_t n, total, i, n_bits; - if (in == NULL || insize == 0) - return (0); - - if (out_string != NULL) - out_string->clear(); + if (in == NULL || insize == 0) return (0); + + if (out_string != NULL) out_string->clear(); total = 0; p = (unsigned char *)in; lim = p + insize; - for (; p < lim; p += n) - { - if (__utf8_forbitten(*p) != 0 && - (flags & UTF8_IGNORE_ERROR) == 0) - return (0); + for (; p < lim; p += n) { + if (__utf8_forbitten(*p) != 0 && (flags & UTF8_IGNORE_ERROR) == 0) return (0); - /* - * Get number of bytes for one wide character. - */ - n = 1; /* default: 1 byte. Used when skipping bytes. */ + // Get number of bytes for one wide character. + n = 1; // default: 1 byte. Used when skipping bytes if ((*p & 0x80) == 0) high = (utf8_wchar_t)*p; - else if ((*p & 0xe0) == _SEQ2) - { + else if ((*p & 0xe0) == _SEQ2) { n = 2; high = (utf8_wchar_t)(*p & 0x1f); - } - else if ((*p & 0xf0) == _SEQ3) - { + } else if ((*p & 0xf0) == _SEQ3) { n = 3; high = (utf8_wchar_t)(*p & 0x0f); - } - else if ((*p & 0xf8) == _SEQ4) - { + } else if ((*p & 0xf8) == _SEQ4) { n = 4; high = (utf8_wchar_t)(*p & 0x07); - } - else if ((*p & 0xfc) == _SEQ5) - { + } else if ((*p & 0xfc) == _SEQ5) { n = 5; high = (utf8_wchar_t)(*p & 0x03); - } - else if ((*p & 0xfe) == _SEQ6) - { + } else if ((*p & 0xfe) == _SEQ6) { n = 6; high = (utf8_wchar_t)(*p & 0x01); - } - else - { - if ((flags & UTF8_IGNORE_ERROR) == 0) - return (0); + } else { + if ((flags & UTF8_IGNORE_ERROR) == 0) return (0); continue; } - /* does the sequence header tell us truth about length? */ - if (lim - p <= n - 1) - { - if ((flags & UTF8_IGNORE_ERROR) == 0) - return (0); + // Does the sequence header tell us truth about length? 
+ if (lim - p <= n - 1) { + if ((flags & UTF8_IGNORE_ERROR) == 0) return (0); n = 1; - continue; /* skip */ + continue; // skip } - /* - * Validate sequence. - * All symbols must have higher bits set to 10xxxxxx - */ - if (n > 1) - { - for (i = 1; i < n; i++) - { - if ((p[i] & 0xc0) != _NXT) - break; + // Validate sequence. All symbols must have higher bits set to 10xxxxxx. + if (n > 1) { + for (i = 1; i < n; i++) { + if ((p[i] & 0xc0) != _NXT) break; } - if (i != n) - { - if ((flags & UTF8_IGNORE_ERROR) == 0) - return (0); + if (i != n) { + if ((flags & UTF8_IGNORE_ERROR) == 0) return (0); n = 1; - continue; /* skip */ + continue; // skip } } total++; - if (out_string == NULL) - continue; + if (out_string == NULL) continue; uint32_t out_val = 0; n_bits = 0; - for (i = 1; i < n; i++) - { + for (i = 1; i < n; i++) { out_val |= (utf8_wchar_t)(p[n - i] & 0x3f) << n_bits; - n_bits += 6; /* 6 low bits in every byte */ + n_bits += 6; // 6 low bits in every byte } out_val |= high << n_bits; bool skip = false; - if (__wchar_forbitten(out_val) != 0) - { - if ((flags & UTF8_IGNORE_ERROR) == 0) - { - return 0; /* forbitten character */ - } - else - { + if (__wchar_forbitten(out_val) != 0) { + if ((flags & UTF8_IGNORE_ERROR) == 0) { + return 0; // forbidden character + } else { skip = true; } - } - else if (out_val == _BOM && (flags & UTF8_SKIP_BOM) != 0) - { + } else if (out_val == _BOM && (flags & UTF8_SKIP_BOM) != 0) { skip = true; } - if (skip) - { + if (skip) { total--; - } - else if (out_val > UTF8_WCHAR_MAX) - { - // wchar_t is UCS-2, but the UTF-8 specified an astral character + } else if (out_val > UTF8_WCHAR_MAX) { + // wchar_t is UCS-2, but the UTF-8 specified an astral character. 
return 0; - } - else - { + } else { out_string->push_back(out_val); } } @@ -338,61 +259,47 @@ static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring return (total); } -/* - * DESCRIPTION - * This function translates UCS-2/4 symbols (given in local machine - * byte order) into UTF-8 string. - * - * It takes the following arguments: - * in - input unicode string. It can be null-terminated. - * insize - size of input string in wide characters. - * out - result buffer for utf8 string. If out is NULL, - * function returns size of result buffer. - * outsize - size of result buffer. - * - * RETURN VALUES - * The function returns size of result buffer (in bytes). Zero is returned - * in case of error. - * - * CAVEATS - * If UCS-4 string contains zero symbols, they will be translated - * as regular symbols. - */ -static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags) -{ +/// This function translates UCS-2/4 symbols (given in local machine byte order) into UTF-8 string. +/// It takes the following arguments: +/// +/// in - input unicode string. It can be null-terminated. +/// insize - size of input string in wide characters. +/// out - result buffer for utf8 string. If out is NULL, function returns size of result buffer. +/// outsize - size of result buffer. +/// +/// RETURN VALUES +/// The function returns size of result buffer (in bytes). Zero is returned in case of error. +/// +/// CAVEATS +/// If UCS-4 string contains zero symbols, they will be translated as regular symbols. 
+static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, + size_t outsize, int flags) { const utf8_wchar_t *w, *wlim; unsigned char *p, *lim; size_t total, n; - if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) - return (0); + if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) return (0); w = in; wlim = w + insize; p = (unsigned char *)out; lim = p + outsize; total = 0; - for (; w < wlim; w++) - { - if (__wchar_forbitten(*w) != 0) - { + for (; w < wlim; w++) { + if (__wchar_forbitten(*w) != 0) { if ((flags & UTF8_IGNORE_ERROR) == 0) return (0); else continue; } - if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0) - continue; + if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0) continue; const int32_t w_wide = *w; - if (w_wide < 0) - { - if ((flags & UTF8_IGNORE_ERROR) == 0) - return (0); + if (w_wide < 0) { + if ((flags & UTF8_IGNORE_ERROR) == 0) return (0); continue; - } - else if (w_wide <= 0x0000007f) + } else if (w_wide <= 0x0000007f) n = 1; else if (w_wide <= 0x000007ff) n = 2; @@ -402,18 +309,16 @@ static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char n = 4; else if (w_wide <= 0x03ffffff) n = 5; - else /* if (w_wide <= 0x7fffffff) */ + else // if (w_wide <= 0x7fffffff) n = 6; total += n; - if (out == NULL) - continue; + if (out == NULL) continue; - if (lim - p <= n - 1) - return (0); /* no space left */ + if (lim - p <= n - 1) return (0); /* no space left */ - /* extract the wchar_t as big-endian. If wchar_t is UCS-16, the first two bytes will be 0 */ + // Extract the wchar_t as big-endian. If wchar_t is UCS-2, the first two bytes will be 0. 
unsigned char oc[4]; uint32_t w_tmp = *w; oc[3] = w_tmp & 0xFF; @@ -424,41 +329,38 @@ static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char w_tmp >>= 8; oc[0] = w_tmp & 0xFF; - switch (n) - { - case 1: + switch (n) { + case 1: { p[0] = oc[3]; break; - - case 2: + } + case 2: { p[1] = _NXT | (oc[3] & 0x3f); p[0] = _SEQ2 | (oc[3] >> 6) | ((oc[2] & 0x07) << 2); break; - - case 3: + } + case 3: { p[2] = _NXT | (oc[3] & 0x3f); p[1] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2); p[0] = _SEQ3 | ((oc[2] & 0xf0) >> 4); break; - - case 4: + } + case 4: { p[3] = _NXT | (oc[3] & 0x3f); p[2] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2); - p[1] = _NXT | ((oc[2] & 0xf0) >> 4) | - ((oc[1] & 0x03) << 4); + p[1] = _NXT | ((oc[2] & 0xf0) >> 4) | ((oc[1] & 0x03) << 4); p[0] = _SEQ4 | ((oc[1] & 0x1f) >> 2); break; - - case 5: + } + case 5: { p[4] = _NXT | (oc[3] & 0x3f); p[3] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2); - p[2] = _NXT | ((oc[2] & 0xf0) >> 4) | - ((oc[1] & 0x03) << 4); + p[2] = _NXT | ((oc[2] & 0xf0) >> 4) | ((oc[1] & 0x03) << 4); p[1] = _NXT | (oc[1] >> 2); p[0] = _SEQ5 | (oc[0] & 0x03); break; - - case 6: + } + case 6: { p[5] = _NXT | (oc[3] & 0x3f); p[4] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2); p[3] = _NXT | (oc[2] >> 4) | ((oc[1] & 0x03) << 4); @@ -466,13 +368,11 @@ static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char p[1] = _NXT | (oc[0] & 0x3f); p[0] = _SEQ6 | ((oc[0] & 0x40) >> 6); break; + } } - /* - * NOTE: do not check here for forbitten UTF-8 characters. - * They cannot appear here because we do proper convertion. - */ - + // NOTE: do not check here for forbidden UTF-8 characters. They cannot appear here because + // we do proper conversion. p += n; } diff --git a/src/utf8.h b/src/utf8.h index 72e1cc8bd..99a12a8e6 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -14,22 +14,22 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ -/* - * utf8: implementation of UTF-8 charset encoding (RFC3629). - */ +// Implementation of UTF-8 charset encoding (RFC3629). #ifndef _UTF8_H_ #define _UTF8_H_ #include #include -#define UTF8_IGNORE_ERROR 0x01 -#define UTF8_SKIP_BOM 0x02 +#define UTF8_IGNORE_ERROR 0x01 +#define UTF8_SKIP_BOM 0x02 -/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if successful, storing the result of the conversion in *result */ +/// Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if +/// successful, storing the result of the conversion in *result*. bool wchar_to_utf8_string(const std::wstring &input, std::string *result); -/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns nonzero if successful, storing the result of the conversion in *out */ +/// Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns nonzero if +/// successful, storing the result of the conversion in *out*. size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int flags); size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);