mirror of
https://github.com/fish-shell/fish-shell
synced 2025-01-14 05:53:59 +00:00
restyle utf8 module to match project style
Reduces lint errors from 63 to 57 (-10%). Line count from 518 to 418 (-19%). Another step in resolving issue #2902.
This commit is contained in:
parent
c14bac4284
commit
ee44879d4d
2 changed files with 156 additions and 256 deletions
398
src/utf8.cpp
398
src/utf8.cpp
|
@ -13,44 +13,40 @@
|
|||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h> // IWYU pragma: keep
|
||||
#include <string>
|
||||
#include <sys/types.h>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
|
||||
#include "utf8.h"
|
||||
|
||||
#define _NXT 0x80
|
||||
#define _SEQ2 0xc0
|
||||
#define _SEQ3 0xe0
|
||||
#define _SEQ4 0xf0
|
||||
#define _SEQ5 0xf8
|
||||
#define _SEQ6 0xfc
|
||||
#define _NXT 0x80
|
||||
#define _SEQ2 0xc0
|
||||
#define _SEQ3 0xe0
|
||||
#define _SEQ4 0xf0
|
||||
#define _SEQ5 0xf8
|
||||
#define _SEQ6 0xfc
|
||||
|
||||
#define _BOM 0xfeff
|
||||
#define _BOM 0xfeff
|
||||
|
||||
/* We can tweak the following typedef to allow us to simulate Windows-style 16 bit wchar's on Unix */
|
||||
// We can tweak the following typedef to allow us to simulate Windows-style 16 bit wchar's on Unix.
|
||||
typedef wchar_t utf8_wchar_t;
|
||||
#define UTF8_WCHAR_MAX ((size_t)std::numeric_limits<utf8_wchar_t>::max())
|
||||
|
||||
typedef std::basic_string<utf8_wchar_t> utf8_wstring_t;
|
||||
|
||||
bool is_wchar_ucs2()
|
||||
{
|
||||
return UTF8_WCHAR_MAX <= 0xFFFF;
|
||||
}
|
||||
bool is_wchar_ucs2() { return UTF8_WCHAR_MAX <= 0xFFFF; }
|
||||
|
||||
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *result, int flags);
|
||||
static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
||||
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *result,
|
||||
int flags);
|
||||
static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out,
|
||||
size_t outsize, int flags);
|
||||
|
||||
static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, size_t count)
|
||||
{
|
||||
static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, size_t count) {
|
||||
bool result = true;
|
||||
for (size_t i=0; i < count; i++)
|
||||
{
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
wchar_t c = in[i];
|
||||
if (c > UTF8_WCHAR_MAX)
|
||||
{
|
||||
if (c > UTF8_WCHAR_MAX) {
|
||||
result = false;
|
||||
break;
|
||||
}
|
||||
|
@ -59,24 +55,20 @@ static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out,
|
|||
return result;
|
||||
}
|
||||
|
||||
bool wchar_to_utf8_string(const std::wstring &str, std::string *result)
|
||||
{
|
||||
bool wchar_to_utf8_string(const std::wstring &str, std::string *result) {
|
||||
result->clear();
|
||||
const size_t inlen = str.size();
|
||||
if (inlen == 0)
|
||||
{
|
||||
if (inlen == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool success = false;
|
||||
const wchar_t *input = str.c_str();
|
||||
size_t outlen = wchar_to_utf8(input, inlen, NULL, 0, 0);
|
||||
if (outlen > 0)
|
||||
{
|
||||
if (outlen > 0) {
|
||||
char *tmp = new char[outlen];
|
||||
size_t outlen2 = wchar_to_utf8(input, inlen, tmp, outlen, 0);
|
||||
if (outlen2 > 0)
|
||||
{
|
||||
if (outlen2 > 0) {
|
||||
result->assign(tmp, outlen2);
|
||||
success = true;
|
||||
}
|
||||
|
@ -85,27 +77,19 @@ bool wchar_to_utf8_string(const std::wstring &str, std::string *result)
|
|||
return success;
|
||||
}
|
||||
|
||||
size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int flags)
|
||||
{
|
||||
if (in == NULL || insize == 0)
|
||||
{
|
||||
size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int flags) {
|
||||
if (in == NULL || insize == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t result;
|
||||
if (sizeof(wchar_t) == sizeof(utf8_wchar_t))
|
||||
{
|
||||
if (sizeof(wchar_t) == sizeof(utf8_wchar_t)) {
|
||||
result = utf8_to_wchar_internal(in, insize, reinterpret_cast<utf8_wstring_t *>(out), flags);
|
||||
}
|
||||
else if (out == NULL)
|
||||
{
|
||||
} else if (out == NULL) {
|
||||
result = utf8_to_wchar_internal(in, insize, NULL, flags);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Allocate a temporary buffer to hold the output,
|
||||
// invoke the conversion with the temporary,
|
||||
// and then copy it back
|
||||
} else {
|
||||
// Allocate a temporary buffer to hold the output, invoke the conversion with the temporary,
|
||||
// and then copy it back.
|
||||
utf8_wstring_t tmp_output;
|
||||
result = utf8_to_wchar_internal(in, insize, &tmp_output, flags);
|
||||
out->insert(out->end(), tmp_output.begin(), tmp_output.end());
|
||||
|
@ -113,32 +97,24 @@ size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int flags
|
|||
return result;
|
||||
}
|
||||
|
||||
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags)
|
||||
{
|
||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
|
||||
{
|
||||
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags) {
|
||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t result;
|
||||
if (sizeof(wchar_t) == sizeof(utf8_wchar_t))
|
||||
{
|
||||
result = wchar_to_utf8_internal(reinterpret_cast<const utf8_wchar_t *>(in), insize, out, outsize, flags);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Allocate a temporary buffer to hold the input
|
||||
// the std::copy performs the size conversion
|
||||
// note: insize may be 0
|
||||
if (sizeof(wchar_t) == sizeof(utf8_wchar_t)) {
|
||||
result = wchar_to_utf8_internal(reinterpret_cast<const utf8_wchar_t *>(in), insize, out,
|
||||
outsize, flags);
|
||||
} else {
|
||||
// Allocate a temporary buffer to hold the input. The copy into it performs the size conversion.
|
||||
// Note: insize may be 0.
|
||||
utf8_wchar_t *tmp_input = new utf8_wchar_t[insize];
|
||||
if (! safe_copy_wchar_to_utf8_wchar(in, tmp_input, insize))
|
||||
{
|
||||
// our utf8_wchar_t is UCS-16 and there was an astral character
|
||||
if (!safe_copy_wchar_to_utf8_wchar(in, tmp_input, insize)) {
|
||||
// Our utf8_wchar_t is UCS-2 and there was an astral character.
|
||||
result = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Invoke the conversion with the temporary, then clean up the input
|
||||
} else {
|
||||
// Invoke the conversion with the temporary, then clean up the input.
|
||||
result = wchar_to_utf8_internal(tmp_input, insize, out, outsize, flags);
|
||||
}
|
||||
delete[] tmp_input;
|
||||
|
@ -146,191 +122,136 @@ size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
static int __wchar_forbitten(utf8_wchar_t sym);
|
||||
static int __utf8_forbitten(unsigned char octet);
|
||||
|
||||
static int
|
||||
__wchar_forbitten(utf8_wchar_t sym)
|
||||
{
|
||||
|
||||
/* Surrogate pairs */
|
||||
if (sym >= 0xd800 && sym <= 0xdfff)
|
||||
return (-1);
|
||||
static int __wchar_forbitten(utf8_wchar_t sym) {
|
||||
// Surrogate pairs.
|
||||
if (sym >= 0xd800 && sym <= 0xdfff) return (-1);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
__utf8_forbitten(unsigned char octet)
|
||||
{
|
||||
|
||||
switch (octet)
|
||||
{
|
||||
static int __utf8_forbitten(unsigned char octet) {
|
||||
switch (octet) {
|
||||
case 0xc0:
|
||||
case 0xc1:
|
||||
case 0xf5:
|
||||
case 0xff:
|
||||
case 0xff: {
|
||||
return (-1);
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* DESCRIPTION
|
||||
* This function translates UTF-8 string into UCS-2 or UCS-4 string (all symbols
|
||||
* will be in local machine byte order).
|
||||
*
|
||||
* It takes the following arguments:
|
||||
* in - input UTF-8 string. It can be null-terminated.
|
||||
* insize - size of input string in bytes.
|
||||
* out_string - result buffer for UCS-2/4 string.
|
||||
*
|
||||
* RETURN VALUES
|
||||
* The function returns size of result buffer (in wide characters).
|
||||
* Zero is returned in case of error.
|
||||
*
|
||||
* CAVEATS
|
||||
* 1. If UTF-8 string contains zero symbols, they will be translated
|
||||
* as regular symbols.
|
||||
* 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
|
||||
* when `out' is NULL and not NULL. It's because of special UTF-8
|
||||
* sequences which may result in forbitten (by RFC3629) UNICODE
|
||||
* characters. So, the caller must check return value every time and
|
||||
* not prepare buffer in advance (\0 terminate) but after calling this
|
||||
* function.
|
||||
*/
|
||||
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *out_string, int flags)
|
||||
{
|
||||
/// This function translates UTF-8 string into UCS-2 or UCS-4 string (all symbols will be in local
|
||||
/// machine byte order). It takes the following arguments:
|
||||
///
|
||||
/// in - input UTF-8 string. It can be null-terminated.
|
||||
/// insize - size of input string in bytes.
|
||||
/// out_string - result buffer for UCS-2/4 string.
|
||||
///
|
||||
/// RETURN VALUES
|
||||
/// The function returns size of result buffer (in wide characters).
|
||||
/// Zero is returned in case of error.
|
||||
///
|
||||
/// CAVEATS
|
||||
/// 1. If UTF-8 string contains zero symbols, they will be translated
|
||||
/// as regular symbols.
|
||||
///
|
||||
/// 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary when `out' is NULL and not
|
||||
/// NULL. It's because of special UTF-8 sequences which may result in forbidden (by RFC3629) UNICODE
|
||||
/// characters. So, the caller must check return value every time and not prepare buffer in advance
|
||||
/// (\0 terminate) but after calling this function.
|
||||
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *out_string,
|
||||
int flags) {
|
||||
unsigned char *p, *lim;
|
||||
utf8_wchar_t high;
|
||||
size_t n, total, i, n_bits;
|
||||
|
||||
if (in == NULL || insize == 0)
|
||||
return (0);
|
||||
|
||||
if (out_string != NULL)
|
||||
out_string->clear();
|
||||
if (in == NULL || insize == 0) return (0);
|
||||
|
||||
if (out_string != NULL) out_string->clear();
|
||||
|
||||
total = 0;
|
||||
p = (unsigned char *)in;
|
||||
lim = p + insize;
|
||||
|
||||
for (; p < lim; p += n)
|
||||
{
|
||||
if (__utf8_forbitten(*p) != 0 &&
|
||||
(flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
for (; p < lim; p += n) {
|
||||
if (__utf8_forbitten(*p) != 0 && (flags & UTF8_IGNORE_ERROR) == 0) return (0);
|
||||
|
||||
/*
|
||||
* Get number of bytes for one wide character.
|
||||
*/
|
||||
n = 1; /* default: 1 byte. Used when skipping bytes. */
|
||||
// Get number of bytes for one wide character.
|
||||
n = 1; // default: 1 byte. Used when skipping bytes
|
||||
if ((*p & 0x80) == 0)
|
||||
high = (utf8_wchar_t)*p;
|
||||
else if ((*p & 0xe0) == _SEQ2)
|
||||
{
|
||||
else if ((*p & 0xe0) == _SEQ2) {
|
||||
n = 2;
|
||||
high = (utf8_wchar_t)(*p & 0x1f);
|
||||
}
|
||||
else if ((*p & 0xf0) == _SEQ3)
|
||||
{
|
||||
} else if ((*p & 0xf0) == _SEQ3) {
|
||||
n = 3;
|
||||
high = (utf8_wchar_t)(*p & 0x0f);
|
||||
}
|
||||
else if ((*p & 0xf8) == _SEQ4)
|
||||
{
|
||||
} else if ((*p & 0xf8) == _SEQ4) {
|
||||
n = 4;
|
||||
high = (utf8_wchar_t)(*p & 0x07);
|
||||
}
|
||||
else if ((*p & 0xfc) == _SEQ5)
|
||||
{
|
||||
} else if ((*p & 0xfc) == _SEQ5) {
|
||||
n = 5;
|
||||
high = (utf8_wchar_t)(*p & 0x03);
|
||||
}
|
||||
else if ((*p & 0xfe) == _SEQ6)
|
||||
{
|
||||
} else if ((*p & 0xfe) == _SEQ6) {
|
||||
n = 6;
|
||||
high = (utf8_wchar_t)(*p & 0x01);
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
} else {
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0) return (0);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* does the sequence header tell us truth about length? */
|
||||
if (lim - p <= n - 1)
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
// Does the sequence header tell us truth about length?
|
||||
if (lim - p <= n - 1) {
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0) return (0);
|
||||
n = 1;
|
||||
continue; /* skip */
|
||||
continue; // skip
|
||||
}
|
||||
|
||||
/*
|
||||
* Validate sequence.
|
||||
* All symbols must have higher bits set to 10xxxxxx
|
||||
*/
|
||||
if (n > 1)
|
||||
{
|
||||
for (i = 1; i < n; i++)
|
||||
{
|
||||
if ((p[i] & 0xc0) != _NXT)
|
||||
break;
|
||||
// Validate sequence. All symbols must have higher bits set to 10xxxxxx.
|
||||
if (n > 1) {
|
||||
for (i = 1; i < n; i++) {
|
||||
if ((p[i] & 0xc0) != _NXT) break;
|
||||
}
|
||||
if (i != n)
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
if (i != n) {
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0) return (0);
|
||||
n = 1;
|
||||
continue; /* skip */
|
||||
continue; // skip
|
||||
}
|
||||
}
|
||||
|
||||
total++;
|
||||
if (out_string == NULL)
|
||||
continue;
|
||||
if (out_string == NULL) continue;
|
||||
|
||||
uint32_t out_val = 0;
|
||||
n_bits = 0;
|
||||
for (i = 1; i < n; i++)
|
||||
{
|
||||
for (i = 1; i < n; i++) {
|
||||
out_val |= (utf8_wchar_t)(p[n - i] & 0x3f) << n_bits;
|
||||
n_bits += 6; /* 6 low bits in every byte */
|
||||
n_bits += 6; // 6 low bits in every byte
|
||||
}
|
||||
out_val |= high << n_bits;
|
||||
|
||||
bool skip = false;
|
||||
if (__wchar_forbitten(out_val) != 0)
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
{
|
||||
return 0; /* forbitten character */
|
||||
}
|
||||
else
|
||||
{
|
||||
if (__wchar_forbitten(out_val) != 0) {
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0) {
|
||||
return 0; // forbidden character
|
||||
} else {
|
||||
skip = true;
|
||||
}
|
||||
}
|
||||
else if (out_val == _BOM && (flags & UTF8_SKIP_BOM) != 0)
|
||||
{
|
||||
} else if (out_val == _BOM && (flags & UTF8_SKIP_BOM) != 0) {
|
||||
skip = true;
|
||||
}
|
||||
|
||||
if (skip)
|
||||
{
|
||||
if (skip) {
|
||||
total--;
|
||||
}
|
||||
else if (out_val > UTF8_WCHAR_MAX)
|
||||
{
|
||||
// wchar_t is UCS-2, but the UTF-8 specified an astral character
|
||||
} else if (out_val > UTF8_WCHAR_MAX) {
|
||||
// wchar_t is UCS-2, but the UTF-8 specified an astral character.
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
out_string->push_back(out_val);
|
||||
}
|
||||
}
|
||||
|
@ -338,61 +259,47 @@ static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring
|
|||
return (total);
|
||||
}
|
||||
|
||||
/*
|
||||
* DESCRIPTION
|
||||
* This function translates UCS-2/4 symbols (given in local machine
|
||||
* byte order) into UTF-8 string.
|
||||
*
|
||||
* It takes the following arguments:
|
||||
* in - input unicode string. It can be null-terminated.
|
||||
* insize - size of input string in wide characters.
|
||||
* out - result buffer for utf8 string. If out is NULL,
|
||||
* function returns size of result buffer.
|
||||
* outsize - size of result buffer.
|
||||
*
|
||||
* RETURN VALUES
|
||||
* The function returns size of result buffer (in bytes). Zero is returned
|
||||
* in case of error.
|
||||
*
|
||||
* CAVEATS
|
||||
* If UCS-4 string contains zero symbols, they will be translated
|
||||
* as regular symbols.
|
||||
*/
|
||||
static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags)
|
||||
{
|
||||
/// This function translates UCS-2/4 symbols (given in local machine byte order) into UTF-8 string.
|
||||
/// It takes the following arguments:
|
||||
///
|
||||
/// in - input unicode string. It can be null-terminated.
|
||||
/// insize - size of input string in wide characters.
|
||||
/// out - result buffer for utf8 string. If out is NULL, function returns size of result buffer.
|
||||
/// outsize - size of result buffer.
|
||||
///
|
||||
/// RETURN VALUES
|
||||
/// The function returns size of result buffer (in bytes). Zero is returned in case of error.
|
||||
///
|
||||
/// CAVEATS
|
||||
/// If UCS-4 string contains zero symbols, they will be translated as regular symbols.
|
||||
static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out,
|
||||
size_t outsize, int flags) {
|
||||
const utf8_wchar_t *w, *wlim;
|
||||
unsigned char *p, *lim;
|
||||
size_t total, n;
|
||||
|
||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
|
||||
return (0);
|
||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL)) return (0);
|
||||
|
||||
w = in;
|
||||
wlim = w + insize;
|
||||
p = (unsigned char *)out;
|
||||
lim = p + outsize;
|
||||
total = 0;
|
||||
for (; w < wlim; w++)
|
||||
{
|
||||
if (__wchar_forbitten(*w) != 0)
|
||||
{
|
||||
for (; w < wlim; w++) {
|
||||
if (__wchar_forbitten(*w) != 0) {
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
else
|
||||
continue;
|
||||
}
|
||||
|
||||
if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
|
||||
continue;
|
||||
if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0) continue;
|
||||
|
||||
const int32_t w_wide = *w;
|
||||
if (w_wide < 0)
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
if (w_wide < 0) {
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0) return (0);
|
||||
continue;
|
||||
}
|
||||
else if (w_wide <= 0x0000007f)
|
||||
} else if (w_wide <= 0x0000007f)
|
||||
n = 1;
|
||||
else if (w_wide <= 0x000007ff)
|
||||
n = 2;
|
||||
|
@ -402,18 +309,16 @@ static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char
|
|||
n = 4;
|
||||
else if (w_wide <= 0x03ffffff)
|
||||
n = 5;
|
||||
else /* if (w_wide <= 0x7fffffff) */
|
||||
else /// if (w_wide <= 0x7fffffff)
|
||||
n = 6;
|
||||
|
||||
total += n;
|
||||
|
||||
if (out == NULL)
|
||||
continue;
|
||||
if (out == NULL) continue;
|
||||
|
||||
if (lim - p <= n - 1)
|
||||
return (0); /* no space left */
|
||||
if (lim - p <= n - 1) return (0); /* no space left */
|
||||
|
||||
/* extract the wchar_t as big-endian. If wchar_t is UCS-16, the first two bytes will be 0 */
|
||||
// Extract the wchar_t as big-endian. If wchar_t is UCS-2, the first two bytes will be 0.
|
||||
unsigned char oc[4];
|
||||
uint32_t w_tmp = *w;
|
||||
oc[3] = w_tmp & 0xFF;
|
||||
|
@ -424,41 +329,38 @@ static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char
|
|||
w_tmp >>= 8;
|
||||
oc[0] = w_tmp & 0xFF;
|
||||
|
||||
switch (n)
|
||||
{
|
||||
case 1:
|
||||
switch (n) {
|
||||
case 1: {
|
||||
p[0] = oc[3];
|
||||
break;
|
||||
|
||||
case 2:
|
||||
}
|
||||
case 2: {
|
||||
p[1] = _NXT | (oc[3] & 0x3f);
|
||||
p[0] = _SEQ2 | (oc[3] >> 6) | ((oc[2] & 0x07) << 2);
|
||||
break;
|
||||
|
||||
case 3:
|
||||
}
|
||||
case 3: {
|
||||
p[2] = _NXT | (oc[3] & 0x3f);
|
||||
p[1] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
|
||||
p[0] = _SEQ3 | ((oc[2] & 0xf0) >> 4);
|
||||
break;
|
||||
|
||||
case 4:
|
||||
}
|
||||
case 4: {
|
||||
p[3] = _NXT | (oc[3] & 0x3f);
|
||||
p[2] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
|
||||
p[1] = _NXT | ((oc[2] & 0xf0) >> 4) |
|
||||
((oc[1] & 0x03) << 4);
|
||||
p[1] = _NXT | ((oc[2] & 0xf0) >> 4) | ((oc[1] & 0x03) << 4);
|
||||
p[0] = _SEQ4 | ((oc[1] & 0x1f) >> 2);
|
||||
break;
|
||||
|
||||
case 5:
|
||||
}
|
||||
case 5: {
|
||||
p[4] = _NXT | (oc[3] & 0x3f);
|
||||
p[3] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
|
||||
p[2] = _NXT | ((oc[2] & 0xf0) >> 4) |
|
||||
((oc[1] & 0x03) << 4);
|
||||
p[2] = _NXT | ((oc[2] & 0xf0) >> 4) | ((oc[1] & 0x03) << 4);
|
||||
p[1] = _NXT | (oc[1] >> 2);
|
||||
p[0] = _SEQ5 | (oc[0] & 0x03);
|
||||
break;
|
||||
|
||||
case 6:
|
||||
}
|
||||
case 6: {
|
||||
p[5] = _NXT | (oc[3] & 0x3f);
|
||||
p[4] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
|
||||
p[3] = _NXT | (oc[2] >> 4) | ((oc[1] & 0x03) << 4);
|
||||
|
@ -466,13 +368,11 @@ static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char
|
|||
p[1] = _NXT | (oc[0] & 0x3f);
|
||||
p[0] = _SEQ6 | ((oc[0] & 0x40) >> 6);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: do not check here for forbitten UTF-8 characters.
|
||||
* They cannot appear here because we do proper convertion.
|
||||
*/
|
||||
|
||||
// NOTE: do not check here for forbidden UTF-8 characters. They cannot appear here because
|
||||
// we do proper conversion.
|
||||
p += n;
|
||||
}
|
||||
|
||||
|
|
14
src/utf8.h
14
src/utf8.h
|
@ -14,22 +14,22 @@
|
|||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* utf8: implementation of UTF-8 charset encoding (RFC3629).
|
||||
*/
|
||||
// Implementation of UTF-8 charset encoding (RFC3629).
|
||||
#ifndef _UTF8_H_
|
||||
#define _UTF8_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string>
|
||||
|
||||
#define UTF8_IGNORE_ERROR 0x01
|
||||
#define UTF8_SKIP_BOM 0x02
|
||||
#define UTF8_IGNORE_ERROR 0x01
|
||||
#define UTF8_SKIP_BOM 0x02
|
||||
|
||||
/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if successful, storing the result of the conversion in *result */
|
||||
/// Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if
|
||||
/// successful, storing the result of the conversion in *result*.
|
||||
bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
|
||||
|
||||
/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns nonzero if successful, storing the result of the conversion in *out */
|
||||
/// Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns nonzero if
|
||||
/// successful, storing the result of the conversion in *out*.
|
||||
size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int flags);
|
||||
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
||||
|
||||
|
|
Loading…
Reference in a new issue