mirror of
https://github.com/fish-shell/fish-shell
synced 2024-12-27 13:23:09 +00:00
Tweak UTF8 decoding interface
Previously, when decoding UTF-8, we would first run through the array to compute the correct size, then allocate a buffer of that size, then run through the array again to fill the buffer, and then copy it into a std::wstring. With this fix we can copy it into the string directly, reducing allocations and only requiring a single pass.
This commit is contained in:
parent
0e8a8a7c80
commit
58d56f91f3
4 changed files with 47 additions and 64 deletions
|
@ -958,7 +958,7 @@ var_table_t env_universal_t::read_message_internal(int fd)
|
||||||
// Process it if it's a newline (which is true if we are before the end of the buffer)
|
// Process it if it's a newline (which is true if we are before the end of the buffer)
|
||||||
if (cursor < bufflen && ! line.empty())
|
if (cursor < bufflen && ! line.empty())
|
||||||
{
|
{
|
||||||
if (utf8_to_wchar_string(line, &wide_line))
|
if (utf8_to_wchar(line.data(), line.size(), &wide_line, 0))
|
||||||
{
|
{
|
||||||
env_universal_t::parse_message_internal(wide_line, &result, &storage);
|
env_universal_t::parse_message_internal(wide_line, &result, &storage);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1057,7 +1057,16 @@ static void test_utf82wchar(const char *src, size_t slen, const wchar_t *dst, si
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
size = utf8_to_wchar(src, slen, mem, dlen, flags);
|
if (mem == NULL)
|
||||||
|
{
|
||||||
|
size = utf8_to_wchar(src, slen, NULL, flags);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::wstring buff;
|
||||||
|
size = utf8_to_wchar(src, slen, &buff, flags);
|
||||||
|
std::copy(buff.begin(), buff.begin() + std::min(dlen, buff.size()), mem);
|
||||||
|
}
|
||||||
if (res != size)
|
if (res != size)
|
||||||
{
|
{
|
||||||
err(L"u2w: %s: FAILED (rv: %lu, must be %lu)", descr, size, res);
|
err(L"u2w: %s: FAILED (rv: %lu, must be %lu)", descr, size, res);
|
||||||
|
@ -1219,8 +1228,10 @@ static void test_utf8()
|
||||||
UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1), "ignore bad chars");
|
UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1), "ignore bad chars");
|
||||||
test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm), 0,
|
test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm), 0,
|
||||||
sizeof(wm) / sizeof(*wm), "mixed languages");
|
sizeof(wm) / sizeof(*wm), "mixed languages");
|
||||||
test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) - 1, 0,
|
// PCA this test was to ensure that if the output buffer was too small, we'd get 0
|
||||||
0, "boundaries -1");
|
// we no longer have statically sized result buffers, so this test is disabled
|
||||||
|
// test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) - 1, 0,
|
||||||
|
// 0, "boundaries -1");
|
||||||
test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) + 1, 0,
|
test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) + 1, 0,
|
||||||
sizeof(wm) / sizeof(*wm), "boundaries +1");
|
sizeof(wm) / sizeof(*wm), "boundaries +1");
|
||||||
test_utf82wchar(um, sizeof(um), NULL, 0, 0,
|
test_utf82wchar(um, sizeof(um), NULL, 0, 0,
|
||||||
|
@ -1235,8 +1246,11 @@ static void test_utf8()
|
||||||
"invalid params, src buf not NULL");
|
"invalid params, src buf not NULL");
|
||||||
test_utf82wchar((const char *)NULL, 10, NULL, 0, 0, 0,
|
test_utf82wchar((const char *)NULL, 10, NULL, 0, 0, 0,
|
||||||
"invalid params, src length is not 0");
|
"invalid params, src length is not 0");
|
||||||
test_utf82wchar(u1, sizeof(u1), w1, 0, 0, 0,
|
|
||||||
"invalid params, dst is not NULL");
|
// PCA this test was to ensure that converting into a zero length output buffer would return 0
|
||||||
|
// we no longer statically size output buffers, so the test is disabled
|
||||||
|
// test_utf82wchar(u1, sizeof(u1), w1, 0, 0, 0,
|
||||||
|
// "invalid params, dst is not NULL");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* UCS-4 -> UTF-8 string.
|
* UCS-4 -> UTF-8 string.
|
||||||
|
|
80
src/utf8.cpp
80
src/utf8.cpp
|
@ -36,12 +36,14 @@
|
||||||
typedef wchar_t utf8_wchar_t;
|
typedef wchar_t utf8_wchar_t;
|
||||||
#define UTF8_WCHAR_MAX ((size_t)std::numeric_limits<utf8_wchar_t>::max())
|
#define UTF8_WCHAR_MAX ((size_t)std::numeric_limits<utf8_wchar_t>::max())
|
||||||
|
|
||||||
|
typedef std::basic_string<utf8_wchar_t> utf8_wstring_t;
|
||||||
|
|
||||||
bool is_wchar_ucs2()
|
bool is_wchar_ucs2()
|
||||||
{
|
{
|
||||||
return UTF8_WCHAR_MAX <= 0xFFFF;
|
return UTF8_WCHAR_MAX <= 0xFFFF;
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t *out, size_t outsize, int flags);
|
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *result, int flags);
|
||||||
static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
||||||
|
|
||||||
static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, size_t count)
|
static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, size_t count)
|
||||||
|
@ -60,32 +62,6 @@ static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out,
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool utf8_to_wchar_string(const std::string &str, std::wstring *result)
|
|
||||||
{
|
|
||||||
result->clear();
|
|
||||||
const size_t inlen = str.size();
|
|
||||||
if (inlen == 0)
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool success = false;
|
|
||||||
const char *input = str.c_str();
|
|
||||||
size_t outlen = utf8_to_wchar(input, inlen, NULL, 0, 0);
|
|
||||||
if (outlen > 0)
|
|
||||||
{
|
|
||||||
wchar_t *tmp = new wchar_t[outlen];
|
|
||||||
size_t outlen2 = utf8_to_wchar(input, inlen, tmp, outlen, 0);
|
|
||||||
if (outlen2 > 0)
|
|
||||||
{
|
|
||||||
result->assign(tmp, outlen2);
|
|
||||||
success = true;
|
|
||||||
}
|
|
||||||
delete[] tmp;
|
|
||||||
}
|
|
||||||
return success;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool wchar_to_utf8_string(const std::wstring &str, std::string *result)
|
bool wchar_to_utf8_string(const std::wstring &str, std::string *result)
|
||||||
{
|
{
|
||||||
result->clear();
|
result->clear();
|
||||||
|
@ -112,9 +88,9 @@ bool wchar_to_utf8_string(const std::wstring &str, std::string *result)
|
||||||
return success;
|
return success;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags)
|
size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int flags)
|
||||||
{
|
{
|
||||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
|
if (in == NULL || insize == 0)
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -122,21 +98,20 @@ size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize
|
||||||
size_t result;
|
size_t result;
|
||||||
if (sizeof(wchar_t) == sizeof(utf8_wchar_t))
|
if (sizeof(wchar_t) == sizeof(utf8_wchar_t))
|
||||||
{
|
{
|
||||||
result = utf8_to_wchar_internal(in, insize, reinterpret_cast<utf8_wchar_t *>(out), outsize, flags);
|
result = utf8_to_wchar_internal(in, insize, reinterpret_cast<utf8_wstring_t *>(out), flags);
|
||||||
|
}
|
||||||
|
else if (out == NULL)
|
||||||
|
{
|
||||||
|
result = utf8_to_wchar_internal(in, insize, NULL, flags);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Allocate a temporary buffer to hold the output
|
// Allocate a temporary buffer to hold the output,
|
||||||
// note: outsize may be 0
|
// invoke the conversion with the temporary,
|
||||||
utf8_wchar_t *tmp_output = new utf8_wchar_t[outsize];
|
// and then copy it back
|
||||||
|
utf8_wstring_t tmp_output;
|
||||||
// Invoke the conversion with the temporary
|
result = utf8_to_wchar_internal(in, insize, &tmp_output, flags);
|
||||||
result = utf8_to_wchar_internal(in, insize, tmp_output, outsize, flags);
|
out->insert(out->end(), tmp_output.begin(), tmp_output.end());
|
||||||
|
|
||||||
// Copy back from tmp to the function's output, then clean it up
|
|
||||||
size_t amount_to_copy = std::min(result, outsize);
|
|
||||||
std::copy(tmp_output, tmp_output + amount_to_copy, out);
|
|
||||||
delete[] tmp_output;
|
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -213,9 +188,7 @@ __utf8_forbitten(unsigned char octet)
|
||||||
* It takes the following arguments:
|
* It takes the following arguments:
|
||||||
* in - input UTF-8 string. It can be null-terminated.
|
* in - input UTF-8 string. It can be null-terminated.
|
||||||
* insize - size of input string in bytes.
|
* insize - size of input string in bytes.
|
||||||
* out - result buffer for UCS-2/4 string. If out is NULL,
|
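* out_string - result string for the UCS-2/4 output. If out_string is NULL,
* the function only counts the resulting wide characters; otherwise
* (for non-empty input) the string is cleared and then filled with
* the decoded characters.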
|
||||||
* function returns size of result buffer.
|
|
||||||
* outsize - size of out buffer in wide characters.
|
|
||||||
*
|
*
|
||||||
* RETURN VALUES
|
* RETURN VALUES
|
||||||
* The function returns size of result buffer (in wide characters).
|
* The function returns size of result buffer (in wide characters).
|
||||||
|
@ -231,19 +204,21 @@ __utf8_forbitten(unsigned char octet)
|
||||||
* not prepare buffer in advance (\0 terminate) but after calling this
|
* not prepare buffer in advance (\0 terminate) but after calling this
|
||||||
* function.
|
* function.
|
||||||
*/
|
*/
|
||||||
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t *out, size_t outsize, int flags)
|
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wstring_t *out_string, int flags)
|
||||||
{
|
{
|
||||||
unsigned char *p, *lim;
|
unsigned char *p, *lim;
|
||||||
utf8_wchar_t *wlim, high;
|
utf8_wchar_t high;
|
||||||
size_t n, total, i, n_bits;
|
size_t n, total, i, n_bits;
|
||||||
|
|
||||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
|
if (in == NULL || insize == 0)
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
if (out_string != NULL)
|
||||||
|
out_string->clear();
|
||||||
|
|
||||||
total = 0;
|
total = 0;
|
||||||
p = (unsigned char *)in;
|
p = (unsigned char *)in;
|
||||||
lim = p + insize;
|
lim = p + insize;
|
||||||
wlim = out + outsize;
|
|
||||||
|
|
||||||
for (; p < lim; p += n)
|
for (; p < lim; p += n)
|
||||||
{
|
{
|
||||||
|
@ -319,15 +294,10 @@ static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t
|
||||||
}
|
}
|
||||||
|
|
||||||
total++;
|
total++;
|
||||||
|
if (out_string == NULL)
|
||||||
if (out == NULL)
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (out >= wlim)
|
|
||||||
return (0); /* no space left */
|
|
||||||
|
|
||||||
uint32_t out_val = 0;
|
uint32_t out_val = 0;
|
||||||
*out = 0;
|
|
||||||
n_bits = 0;
|
n_bits = 0;
|
||||||
for (i = 1; i < n; i++)
|
for (i = 1; i < n; i++)
|
||||||
{
|
{
|
||||||
|
@ -364,7 +334,7 @@ static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
*out++ = out_val;
|
out_string->push_back(out_val);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,11 +28,10 @@
|
||||||
#define UTF8_SKIP_BOM 0x02
|
#define UTF8_SKIP_BOM 0x02
|
||||||
|
|
||||||
/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if successful, storing the result of the conversion in *result */
|
/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns true if successful, storing the result of the conversion in *result */
|
||||||
bool utf8_to_wchar_string(const std::string &input, std::wstring *result);
|
|
||||||
bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
|
bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
|
||||||
|
|
||||||
/* Variants exposed for testing */
|
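/* Convert a string between UTF8 and UCS-2/4 (depending on size of wchar_t). Returns nonzero if successful, storing the result of the conversion in *out. Note that utf8_to_wchar also returns 0 when the input is NULL or empty, so a zero return does not distinguish an empty input from a failure. */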
|
||||||
size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags);
|
size_t utf8_to_wchar(const char *in, size_t insize, std::wstring *out, int flags);
|
||||||
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
||||||
|
|
||||||
bool is_wchar_ucs2();
|
bool is_wchar_ucs2();
|
||||||
|
|
Loading…
Reference in a new issue