mirror of
https://github.com/fish-shell/fish-shell
synced 2025-01-13 13:39:02 +00:00
Incorporate a modified UTF8 <-> wchar_t implementation from Alexey
Vatchenko (http://www.bsdua.org/libbsdua.html) in preparation for eliminating our dependency on iconv
This commit is contained in:
parent
06eb271bda
commit
aabed8279e
6 changed files with 861 additions and 27 deletions
|
@ -91,7 +91,7 @@ FISH_OBJS := function.o builtin.o complete.o env.o exec.o expand.o \
|
|||
signal.o io.o parse_util.o common.o screen.o path.o autoload.o \
|
||||
parser_keywords.o iothread.o color.o postfork.o \
|
||||
builtin_test.o parse_tree.o parse_productions.o parse_execution.cpp \
|
||||
pager.cpp
|
||||
pager.cpp utf8.o
|
||||
|
||||
FISH_INDENT_OBJS := fish_indent.o print_help.o common.o \
|
||||
parser_keywords.o wutil.o tokenizer.o
|
||||
|
@ -117,7 +117,7 @@ FISH_TESTS_OBJS := $(FISH_OBJS) fish_tests.o
|
|||
#
|
||||
|
||||
FISHD_OBJS := fishd.o env_universal_common.o wutil.o print_help.o \
|
||||
common.o
|
||||
common.o utf8.o
|
||||
|
||||
|
||||
#
|
||||
|
|
|
@ -1402,4 +1402,20 @@ POSSIBILITY OF SUCH DAMAGES.
|
|||
|
||||
*/
|
||||
|
||||
<h2>License for UTF8</h2>
|
||||
|
||||
<p>Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
|
||||
|
||||
<p>Permission to use, copy, modify, and/or distribute this software for any
|
||||
purpose with or without fee is hereby granted, provided that the above
|
||||
copyright notice and this permission notice appear in all copies.
|
||||
|
||||
<p>THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
\htmlonly </div> \endhtmlonly
|
||||
|
|
|
@ -117,6 +117,8 @@
|
|||
D0A564FE168D23D800AF6161 /* man in CopyFiles */ = {isa = PBXBuildFile; fileRef = D0A564F1168D0BAB00AF6161 /* man */; };
|
||||
D0A56501168D258300AF6161 /* man in Copy Files */ = {isa = PBXBuildFile; fileRef = D0A564F1168D0BAB00AF6161 /* man */; };
|
||||
D0C52F371765284C00BFAB82 /* parse_tree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0C52F351765284C00BFAB82 /* parse_tree.cpp */; };
|
||||
D0C9733818DE5449002D7C81 /* utf8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0C9733718DE5449002D7C81 /* utf8.cpp */; };
|
||||
D0C9733918DE5449002D7C81 /* utf8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0C9733718DE5449002D7C81 /* utf8.cpp */; };
|
||||
D0CBD587159EF0E10024809C /* launch_fish.scpt in Resources */ = {isa = PBXBuildFile; fileRef = D0CBD586159EF0E10024809C /* launch_fish.scpt */; };
|
||||
D0D02A67159837AD008E62BD /* complete.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0A0853713B3ACEE0099B651 /* complete.cpp */; };
|
||||
D0D02A69159837B2008E62BD /* env.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D0A0853A13B3ACEE0099B651 /* env.cpp */; };
|
||||
|
@ -475,6 +477,8 @@
|
|||
D0C6FCC914CFA4B0004CE8AD /* autoload.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = autoload.cpp; sourceTree = "<group>"; };
|
||||
D0C6FCCB14CFA4B7004CE8AD /* autoload.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = autoload.h; sourceTree = "<group>"; };
|
||||
D0C861EA16CC7054003B5A04 /* builtin_set_color.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = builtin_set_color.cpp; sourceTree = "<group>"; };
|
||||
D0C9733718DE5449002D7C81 /* utf8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = utf8.cpp; sourceTree = "<group>"; };
|
||||
D0C9733A18DE5451002D7C81 /* utf8.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = utf8.h; sourceTree = "<group>"; };
|
||||
D0CA63F316FC275F00093BD4 /* builtin_printf.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = builtin_printf.cpp; sourceTree = "<group>"; };
|
||||
D0CBD580159EE48F0024809C /* config.fish */ = {isa = PBXFileReference; lastKnownFileType = text; name = config.fish; path = share/config.fish; sourceTree = "<group>"; };
|
||||
D0CBD583159EEE010024809C /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
|
||||
|
@ -686,6 +690,8 @@
|
|||
D0A0855C13B3ACEE0099B651 /* signal.cpp */,
|
||||
D0A0852513B3ACEE0099B651 /* tokenizer.h */,
|
||||
D0A0855D13B3ACEE0099B651 /* tokenizer.cpp */,
|
||||
D0C9733A18DE5451002D7C81 /* utf8.h */,
|
||||
D0C9733718DE5449002D7C81 /* utf8.cpp */,
|
||||
D0A0852613B3ACEE0099B651 /* util.h */,
|
||||
D0A0855E13B3ACEE0099B651 /* util.cpp */,
|
||||
D0A0852713B3ACEE0099B651 /* wgetopt.h */,
|
||||
|
@ -1120,6 +1126,7 @@
|
|||
files = (
|
||||
D0D02AC215985F3F008E62BD /* fishd.cpp in Sources */,
|
||||
D0D02AC315985F43008E62BD /* env_universal_common.cpp in Sources */,
|
||||
D0C9733918DE5449002D7C81 /* utf8.cpp in Sources */,
|
||||
D0D02AC415985F4D008E62BD /* wutil.cpp in Sources */,
|
||||
D0D02AC515985F5B008E62BD /* print_help.cpp in Sources */,
|
||||
D0D02AC615985F65008E62BD /* common.cpp in Sources */,
|
||||
|
@ -1157,6 +1164,7 @@
|
|||
D0D02A86159839D5008E62BD /* postfork.cpp in Sources */,
|
||||
D0D02A87159839D5008E62BD /* screen.cpp in Sources */,
|
||||
D0D02A88159839D5008E62BD /* signal.cpp in Sources */,
|
||||
D0C9733818DE5449002D7C81 /* utf8.cpp in Sources */,
|
||||
D0D2694A15983779005D9B9C /* builtin.cpp in Sources */,
|
||||
D0D2694915983772005D9B9C /* function.cpp in Sources */,
|
||||
D0D02A67159837AD008E62BD /* complete.cpp in Sources */,
|
||||
|
|
259
fish_tests.cpp
259
fish_tests.cpp
|
@ -62,6 +62,7 @@
|
|||
#include "parse_util.h"
|
||||
#include "pager.h"
|
||||
#include "input.h"
|
||||
#include "utf8.h"
|
||||
|
||||
static const char * const * s_arguments;
|
||||
static int s_test_run_count = 0;
|
||||
|
@ -857,6 +858,260 @@ static void test_utils()
|
|||
if (begin != a + wcslen(L"echo (echo (")) err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
|
||||
}
|
||||
|
||||
/* UTF8 tests taken from Alexey Vatchenko's utf8 library. See http://www.bsdua.org/libbsdua.html */
|
||||
|
||||
static void test_utf82wchar(const char *src, size_t slen, const wchar_t *dst, size_t dlen,
|
||||
int flags, size_t res, const char *descr)
|
||||
{
|
||||
size_t size;
|
||||
wchar_t *mem = NULL;
|
||||
|
||||
/* Hack: if wchar is only UCS-2, and the UTF-8 input string contains astral characters, then tweak the expected size to 0 */
|
||||
if (src != NULL && is_wchar_ucs2())
|
||||
{
|
||||
/* A UTF-8 code unit may represent an astral code point if it has 4 or more leading 1s */
|
||||
const unsigned char astral_mask = 0xF0;
|
||||
for (size_t i=0; i < slen; i++)
|
||||
{
|
||||
if ((src[i] & astral_mask) == astral_mask)
|
||||
{
|
||||
/* Astral char. We expect this conversion to just fail. */
|
||||
res = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (dst != NULL)
|
||||
{
|
||||
mem = (wchar_t *)malloc(dlen * sizeof(*mem));
|
||||
if (mem == NULL)
|
||||
{
|
||||
err(L"u2w: %s: MALLOC FAILED\n", descr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
size = utf8_to_wchar(src, slen, mem, dlen, flags);
|
||||
if (res != size)
|
||||
{
|
||||
err(L"u2w: %s: FAILED (rv: %lu, must be %lu)", descr, size, res);
|
||||
break;
|
||||
}
|
||||
|
||||
if (mem == NULL)
|
||||
break; /* OK */
|
||||
|
||||
if (memcmp(mem, dst, size * sizeof(*mem)) != 0)
|
||||
{
|
||||
err(L"u2w: %s: BROKEN", descr);
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
while (0);
|
||||
|
||||
free(mem);
|
||||
}
|
||||
|
||||
static void test_wchar2utf8(const wchar_t *src, size_t slen, const char *dst, size_t dlen,
|
||||
int flags, size_t res, const char *descr)
|
||||
{
|
||||
size_t size;
|
||||
char *mem = NULL;
|
||||
|
||||
/* Hack: if wchar is simulating UCS-2, and the wchar_t input string contains astral characters, then tweak the expected size to 0 */
|
||||
if (src != NULL && is_wchar_ucs2())
|
||||
{
|
||||
const uint32_t astral_mask = 0xFFFF0000U;
|
||||
for (size_t i=0; i < slen; i++)
|
||||
{
|
||||
if ((src[i] & astral_mask) != 0)
|
||||
{
|
||||
/* astral char */
|
||||
res = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (dst != NULL)
|
||||
{
|
||||
mem = (char *)malloc(dlen);
|
||||
if (mem == NULL)
|
||||
{
|
||||
err(L"w2u: %s: MALLOC FAILED", descr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
size = wchar_to_utf8(src, slen, mem, dlen, flags);
|
||||
if (res != size)
|
||||
{
|
||||
err(L"w2u: %s: FAILED (rv: %lu, must be %lu)", descr, size, res);
|
||||
break;
|
||||
}
|
||||
|
||||
if (mem == NULL)
|
||||
break; /* OK */
|
||||
|
||||
if (memcmp(mem, dst, size) != 0)
|
||||
{
|
||||
err(L"w2u: %s: BROKEN", descr);
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
while (0);
|
||||
|
||||
if (mem != NULL);
|
||||
free(mem);
|
||||
}
|
||||
|
||||
static void test_utf8()
|
||||
{
|
||||
wchar_t w1[] = {0x54, 0x65, 0x73, 0x74};
|
||||
wchar_t w2[] = {0x0422, 0x0435, 0x0441, 0x0442};
|
||||
wchar_t w3[] = {0x800, 0x1e80, 0x98c4, 0x9910, 0xff00};
|
||||
wchar_t w4[] = {0x15555, 0xf7777, 0xa};
|
||||
wchar_t w5[] = {0x255555, 0x1fa04ff, 0xddfd04, 0xa};
|
||||
wchar_t w6[] = {0xf255555, 0x1dfa04ff, 0x7fddfd04, 0xa};
|
||||
wchar_t wb[] = {-2, 0xa, 0xffffffff, 0x0441};
|
||||
wchar_t wm[] = {0x41, 0x0441, 0x3042, 0xff67, 0x9b0d, 0x2e05da67};
|
||||
wchar_t wb1[] = {0xa, 0x0422};
|
||||
wchar_t wb2[] = {0xd800, 0xda00, 0x41, 0xdfff, 0xa};
|
||||
wchar_t wbom[] = {0xfeff, 0x41, 0xa};
|
||||
wchar_t wbom2[] = {0x41, 0xa};
|
||||
wchar_t wbom22[] = {0xfeff, 0x41, 0xa};
|
||||
char u1[] = {0x54, 0x65, 0x73, 0x74};
|
||||
char u2[] = {0xd0, 0xa2, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82};
|
||||
char u3[] = {0xe0, 0xa0, 0x80, 0xe1, 0xba, 0x80, 0xe9, 0xa3, 0x84,
|
||||
0xe9, 0xa4, 0x90, 0xef, 0xbc, 0x80
|
||||
};
|
||||
char u4[] = {0xf0, 0x95, 0x95, 0x95, 0xf3, 0xb7, 0x9d, 0xb7, 0xa};
|
||||
char u5[] = {0xf8, 0x89, 0x95, 0x95, 0x95, 0xf9, 0xbe, 0xa0, 0x93,
|
||||
0xbf, 0xf8, 0xb7, 0x9f, 0xb4, 0x84, 0x0a
|
||||
};
|
||||
char u6[] = {0xfc, 0x8f, 0x89, 0x95, 0x95, 0x95, 0xfc, 0x9d, 0xbe,
|
||||
0xa0, 0x93, 0xbf, 0xfd, 0xbf, 0xb7, 0x9f, 0xb4, 0x84, 0x0a
|
||||
};
|
||||
char ub[] = {0xa, 0xd1, 0x81};
|
||||
char um[] = {0x41, 0xd1, 0x81, 0xe3, 0x81, 0x82, 0xef, 0xbd, 0xa7,
|
||||
0xe9, 0xac, 0x8d, 0xfc, 0xae, 0x81, 0x9d, 0xa9, 0xa7
|
||||
};
|
||||
char ub1[] = {0xa, 0xff, 0xd0, 0xa2, 0xfe, 0x8f, 0xe0, 0x80};
|
||||
char uc080[] = {0xc0, 0x80};
|
||||
char ub2[] = {0xed, 0xa1, 0x8c, 0xed, 0xbe, 0xb4, 0xa};
|
||||
char ubom[] = {0x41, 0xa};
|
||||
char ubom2[] = {0xef, 0xbb, 0xbf, 0x41, 0xa};
|
||||
|
||||
/*
|
||||
* UTF-8 -> UCS-4 string.
|
||||
*/
|
||||
test_utf82wchar(ubom2, sizeof(ubom2), wbom2,
|
||||
sizeof(wbom2) / sizeof(*wbom2), UTF8_SKIP_BOM,
|
||||
sizeof(wbom2) / sizeof(*wbom2), "skip BOM");
|
||||
test_utf82wchar(ubom2, sizeof(ubom2), wbom22,
|
||||
sizeof(wbom22) / sizeof(*wbom22), 0,
|
||||
sizeof(wbom22) / sizeof(*wbom22), "BOM");
|
||||
test_utf82wchar(uc080, sizeof(uc080), NULL, 0, 0, 0,
|
||||
"c0 80 - forbitten by rfc3629");
|
||||
test_utf82wchar(ub2, sizeof(ub2), NULL, 0, 0, is_wchar_ucs2() ? 0 : 3,
|
||||
"resulted in forbitten wchars (len)");
|
||||
test_utf82wchar(ub2, sizeof(ub2), wb2, sizeof(wb2) / sizeof(*wb2), 0, 0,
|
||||
"resulted in forbitten wchars");
|
||||
test_utf82wchar(ub2, sizeof(ub2), L"\x0a", 1, UTF8_IGNORE_ERROR,
|
||||
1, "resulted in ignored forbitten wchars");
|
||||
test_utf82wchar(u1, sizeof(u1), w1, sizeof(w1) / sizeof(*w1), 0,
|
||||
sizeof(w1) / sizeof(*w1), "1 octet chars");
|
||||
test_utf82wchar(u2, sizeof(u2), w2, sizeof(w2) / sizeof(*w2), 0,
|
||||
sizeof(w2) / sizeof(*w2), "2 octets chars");
|
||||
test_utf82wchar(u3, sizeof(u3), w3, sizeof(w3) / sizeof(*w3), 0,
|
||||
sizeof(w3) / sizeof(*w3), "3 octets chars");
|
||||
test_utf82wchar(u4, sizeof(u4), w4, sizeof(w4) / sizeof(*w4), 0,
|
||||
sizeof(w4) / sizeof(*w4), "4 octets chars");
|
||||
test_utf82wchar(u5, sizeof(u5), w5, sizeof(w5) / sizeof(*w5), 0,
|
||||
sizeof(w5) / sizeof(*w5), "5 octets chars");
|
||||
test_utf82wchar(u6, sizeof(u6), w6, sizeof(w6) / sizeof(*w6), 0,
|
||||
sizeof(w6) / sizeof(*w6), "6 octets chars");
|
||||
test_utf82wchar("\xff", 1, NULL, 0, 0, 0, "broken utf-8 0xff symbol");
|
||||
test_utf82wchar("\xfe", 1, NULL, 0, 0, 0, "broken utf-8 0xfe symbol");
|
||||
test_utf82wchar("\x8f", 1, NULL, 0, 0, 0,
|
||||
"broken utf-8, start from 10 higher bits");
|
||||
if (! is_wchar_ucs2()) test_utf82wchar(ub1, sizeof(ub1), wb1, sizeof(wb1) / sizeof(*wb1),
|
||||
UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1), "ignore bad chars");
|
||||
test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm), 0,
|
||||
sizeof(wm) / sizeof(*wm), "mixed languages");
|
||||
test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) - 1, 0,
|
||||
0, "boundaries -1");
|
||||
test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) + 1, 0,
|
||||
sizeof(wm) / sizeof(*wm), "boundaries +1");
|
||||
test_utf82wchar(um, sizeof(um), NULL, 0, 0,
|
||||
sizeof(wm) / sizeof(*wm), "calculate length");
|
||||
test_utf82wchar(ub1, sizeof(ub1), NULL, 0, 0,
|
||||
0, "calculate length of bad chars");
|
||||
test_utf82wchar(ub1, sizeof(ub1), NULL, 0,
|
||||
UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1),
|
||||
"calculate length, ignore bad chars");
|
||||
test_utf82wchar(NULL, 0, NULL, 0, 0, 0, "invalid params, all 0");
|
||||
test_utf82wchar(u1, 0, NULL, 0, 0, 0,
|
||||
"invalid params, src buf not NULL");
|
||||
test_utf82wchar(NULL, 10, NULL, 0, 0, 0,
|
||||
"invalid params, src length is not 0");
|
||||
test_utf82wchar(u1, sizeof(u1), w1, 0, 0, 0,
|
||||
"invalid params, dst is not NULL");
|
||||
|
||||
/*
|
||||
* UCS-4 -> UTF-8 string.
|
||||
*/
|
||||
test_wchar2utf8(wbom, sizeof(wbom) / sizeof(*wbom), ubom, sizeof(ubom),
|
||||
UTF8_SKIP_BOM, sizeof(ubom), "BOM");
|
||||
test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), NULL, 0, 0,
|
||||
0, "prohibited wchars");
|
||||
test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), NULL, 0,
|
||||
UTF8_IGNORE_ERROR, 2, "ignore prohibited wchars");
|
||||
test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, sizeof(u1), 0,
|
||||
sizeof(u1), "1 octet chars");
|
||||
test_wchar2utf8(w2, sizeof(w2) / sizeof(*w2), u2, sizeof(u2), 0,
|
||||
sizeof(u2), "2 octets chars");
|
||||
test_wchar2utf8(w3, sizeof(w3) / sizeof(*w3), u3, sizeof(u3), 0,
|
||||
sizeof(u3), "3 octets chars");
|
||||
test_wchar2utf8(w4, sizeof(w4) / sizeof(*w4), u4, sizeof(u4), 0,
|
||||
sizeof(u4), "4 octets chars");
|
||||
test_wchar2utf8(w5, sizeof(w5) / sizeof(*w5), u5, sizeof(u5), 0,
|
||||
sizeof(u5), "5 octets chars");
|
||||
test_wchar2utf8(w6, sizeof(w6) / sizeof(*w6), u6, sizeof(u6), 0,
|
||||
sizeof(u6), "6 octets chars");
|
||||
test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub), 0,
|
||||
0, "bad chars");
|
||||
test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub),
|
||||
UTF8_IGNORE_ERROR, sizeof(ub), "ignore bad chars");
|
||||
test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um), 0,
|
||||
sizeof(um), "mixed languages");
|
||||
test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) - 1, 0,
|
||||
0, "boundaries -1");
|
||||
test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) + 1, 0,
|
||||
sizeof(um), "boundaries +1");
|
||||
test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), NULL, 0, 0,
|
||||
sizeof(um), "calculate length");
|
||||
test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), NULL, 0, 0,
|
||||
0, "calculate length of bad chars");
|
||||
test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), NULL, 0,
|
||||
UTF8_IGNORE_ERROR, sizeof(ub),
|
||||
"calculate length, ignore bad chars");
|
||||
test_wchar2utf8(NULL, 0, NULL, 0, 0, 0, "invalid params, all 0");
|
||||
test_wchar2utf8(w1, 0, NULL, 0, 0, 0,
|
||||
"invalid params, src buf not NULL");
|
||||
test_wchar2utf8(NULL, 10, NULL, 0, 0, 0,
|
||||
"invalid params, src length is not 0");
|
||||
test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, 0, 0, 0,
|
||||
"invalid params, dst is not NULL");
|
||||
}
|
||||
|
||||
static void test_escape_sequences(void)
|
||||
{
|
||||
say(L"Testing escape codes");
|
||||
|
@ -2867,6 +3122,7 @@ int main(int argc, char **argv)
|
|||
if (should_test_function("cancellation")) test_cancellation();
|
||||
if (should_test_function("indents")) test_indents();
|
||||
if (should_test_function("utils")) test_utils();
|
||||
if (should_test_function("utf8")) test_utf8();
|
||||
if (should_test_function("escape_sequences")) test_escape_sequences();
|
||||
if (should_test_function("lru")) test_lru();
|
||||
if (should_test_function("expand")) test_expand();
|
||||
|
@ -2906,7 +3162,8 @@ int main(int argc, char **argv)
|
|||
event_destroy();
|
||||
proc_destroy();
|
||||
|
||||
if(err_count != 0) {
|
||||
if (err_count != 0)
|
||||
{
|
||||
return(1);
|
||||
}
|
||||
}
|
||||
|
|
514
utf8.cpp
Normal file
514
utf8.cpp
Normal file
|
@ -0,0 +1,514 @@
|
|||
/*
|
||||
* Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
#include <sys/types.h>

#include <stdint.h>
#include <wchar.h>

#include "utf8.h"

#include <algorithm>
#include <limits>
#include <string>
#include <vector>
|
||||
|
||||
|
||||
/* Bit patterns of UTF-8 bytes, used by the converters in this file when decoding and encoding. */
#define _NXT 0x80 /* 10xxxxxx: marker bits of a continuation byte */
|
||||
#define _SEQ2 0xc0 /* 110xxxxx: lead byte of a 2-byte sequence */
|
||||
#define _SEQ3 0xe0 /* 1110xxxx: lead byte of a 3-byte sequence */
|
||||
#define _SEQ4 0xf0 /* 11110xxx: lead byte of a 4-byte sequence */
|
||||
#define _SEQ5 0xf8 /* 111110xx: lead byte of a 5-byte sequence */
|
||||
#define _SEQ6 0xfc /* 1111110x: lead byte of a 6-byte sequence */
|
||||
|
||||
#define _BOM 0xfeff /* code point dropped by the converters when UTF8_SKIP_BOM is set */
|
||||
|
||||
/* We can tweak the following typedef to allow us to simulate Windows-style 16 bit wchar's on Unix */
|
||||
typedef wchar_t utf8_wchar_t;
|
||||
#define UTF8_WCHAR_MAX ((size_t)std::numeric_limits<utf8_wchar_t>::max())
|
||||
|
||||
bool is_wchar_ucs2()
|
||||
{
|
||||
return UTF8_WCHAR_MAX <= 0xFFFF;
|
||||
}
|
||||
|
||||
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t *out, size_t outsize, int flags);
|
||||
static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
||||
|
||||
static bool safe_copy_wchar_to_utf8_wchar(const wchar_t *in, utf8_wchar_t *out, size_t count)
|
||||
{
|
||||
bool result = true;
|
||||
for (size_t i=0; i < count; i++)
|
||||
{
|
||||
wchar_t c = in[i];
|
||||
if (c > UTF8_WCHAR_MAX)
|
||||
{
|
||||
result = false;
|
||||
break;
|
||||
}
|
||||
out[i] = c;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
bool utf8_to_wchar_string(const std::string &str, std::wstring *result)
|
||||
{
|
||||
result->clear();
|
||||
const size_t inlen = str.size();
|
||||
if (inlen == 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
bool success = false;
|
||||
const char *input = str.c_str();
|
||||
size_t outlen = utf8_to_wchar(input, inlen, NULL, 0, 0);
|
||||
if (outlen > 0)
|
||||
{
|
||||
wchar_t *tmp = new wchar_t[outlen];
|
||||
size_t outlen2 = utf8_to_wchar(input, inlen, tmp, outlen, 0);
|
||||
if (outlen2 > 0)
|
||||
{
|
||||
result->assign(tmp, outlen2);
|
||||
success = true;
|
||||
}
|
||||
delete[] tmp;
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
bool wchar_to_utf8_string(const std::wstring &str, std::string *result)
|
||||
{
|
||||
result->clear();
|
||||
const size_t inlen = str.size();
|
||||
if (inlen == 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
bool success = false;
|
||||
const wchar_t *input = str.c_str();
|
||||
size_t outlen = wchar_to_utf8(input, inlen, NULL, 0, 0);
|
||||
if (outlen > 0)
|
||||
{
|
||||
char *tmp = new char[outlen];
|
||||
size_t outlen2 = wchar_to_utf8(input, inlen, tmp, outlen, 0);
|
||||
if (outlen2 > 0)
|
||||
{
|
||||
result->assign(tmp, outlen2);
|
||||
success = true;
|
||||
}
|
||||
delete[] tmp;
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags)
|
||||
{
|
||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t result;
|
||||
if (sizeof(wchar_t) == sizeof(utf8_wchar_t))
|
||||
{
|
||||
result = utf8_to_wchar_internal(in, insize, reinterpret_cast<utf8_wchar_t *>(out), outsize, flags);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Allocate a temporary buffer to hold the output
|
||||
// note: outsize may be 0
|
||||
utf8_wchar_t *tmp_output = new utf8_wchar_t[outsize];
|
||||
|
||||
// Invoke the conversion with the temporary
|
||||
result = utf8_to_wchar_internal(in, insize, tmp_output, outsize, flags);
|
||||
|
||||
// Copy back from tmp to the function's output, then clean it up
|
||||
size_t amount_to_copy = std::min(result, outsize);
|
||||
std::copy(tmp_output, tmp_output + amount_to_copy, out);
|
||||
delete[] tmp_output;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags)
|
||||
{
|
||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t result;
|
||||
if (sizeof(wchar_t) == sizeof(utf8_wchar_t))
|
||||
{
|
||||
result = wchar_to_utf8_internal(reinterpret_cast<const utf8_wchar_t *>(in), insize, out, outsize, flags);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Allocate a temporary buffer to hold the input
|
||||
// the std::copy performs the size conversion
|
||||
// note: insize may be 0
|
||||
utf8_wchar_t *tmp_input = new utf8_wchar_t[insize];
|
||||
if (! safe_copy_wchar_to_utf8_wchar(in, tmp_input, insize))
|
||||
{
|
||||
// our utf8_wchar_t is UCS-16 and there was an astral character
|
||||
result = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Invoke the conversion with the temporary, then clean up the input
|
||||
result = wchar_to_utf8_internal(tmp_input, insize, out, outsize, flags);
|
||||
}
|
||||
delete[] tmp_input;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static int __wchar_forbitten(utf8_wchar_t sym);
|
||||
static int __utf8_forbitten(unsigned char octet);
|
||||
|
||||
static int
|
||||
__wchar_forbitten(utf8_wchar_t sym)
|
||||
{
|
||||
|
||||
/* Surrogate pairs */
|
||||
if (sym >= 0xd800 && sym <= 0xdfff)
|
||||
return (-1);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/* Returns -1 if octet is one of the byte values this implementation rejects outright
   (0xc0, 0xc1, 0xf5, 0xff), 0 otherwise. */
static int
__utf8_forbitten(unsigned char octet)
{
    const bool rejected = (octet == 0xc0) || (octet == 0xc1) ||
                          (octet == 0xf5) || (octet == 0xff);
    return rejected ? -1 : 0;
}
|
||||
|
||||
/*
|
||||
* DESCRIPTION
|
||||
* This function translates UTF-8 string into UCS-2 or UCS-4 string (all symbols
|
||||
* will be in local machine byte order).
|
||||
*
|
||||
* It takes the following arguments:
|
||||
* in - input UTF-8 string. It can be null-terminated.
|
||||
* insize - size of input string in bytes.
|
||||
* out - result buffer for UCS-2/4 string. If out is NULL,
|
||||
* function returns size of result buffer.
|
||||
* outsize - size of out buffer in wide characters.
|
||||
*
|
||||
* RETURN VALUES
|
||||
* The function returns size of result buffer (in wide characters).
|
||||
* Zero is returned in case of error.
|
||||
*
|
||||
* CAVEATS
|
||||
* 1. If UTF-8 string contains zero symbols, they will be translated
|
||||
* as regular symbols.
|
||||
* 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
|
||||
* when `out' is NULL and not NULL. It's because of special UTF-8
|
||||
* sequences which may result in forbitten (by RFC3629) UNICODE
|
||||
* characters. So, the caller must check return value every time and
|
||||
* not prepare buffer in advance (\0 terminate) but after calling this
|
||||
* function.
|
||||
*/
|
||||
static size_t utf8_to_wchar_internal(const char *in, size_t insize, utf8_wchar_t *out, size_t outsize, int flags)
|
||||
{
|
||||
unsigned char *p, *lim;
|
||||
utf8_wchar_t *wlim, high;
|
||||
size_t n, total, i, n_bits;
|
||||
|
||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
|
||||
return (0);
|
||||
|
||||
total = 0;
|
||||
p = (unsigned char *)in;
|
||||
lim = p + insize;
|
||||
wlim = out + outsize;
|
||||
|
||||
for (; p < lim; p += n)
|
||||
{
|
||||
if (__utf8_forbitten(*p) != 0 &&
|
||||
(flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
|
||||
/*
|
||||
* Get number of bytes for one wide character.
|
||||
*/
|
||||
n = 1; /* default: 1 byte. Used when skipping bytes. */
|
||||
if ((*p & 0x80) == 0)
|
||||
high = (utf8_wchar_t)*p;
|
||||
else if ((*p & 0xe0) == _SEQ2)
|
||||
{
|
||||
n = 2;
|
||||
high = (utf8_wchar_t)(*p & 0x1f);
|
||||
}
|
||||
else if ((*p & 0xf0) == _SEQ3)
|
||||
{
|
||||
n = 3;
|
||||
high = (utf8_wchar_t)(*p & 0x0f);
|
||||
}
|
||||
else if ((*p & 0xf8) == _SEQ4)
|
||||
{
|
||||
n = 4;
|
||||
high = (utf8_wchar_t)(*p & 0x07);
|
||||
}
|
||||
else if ((*p & 0xfc) == _SEQ5)
|
||||
{
|
||||
n = 5;
|
||||
high = (utf8_wchar_t)(*p & 0x03);
|
||||
}
|
||||
else if ((*p & 0xfe) == _SEQ6)
|
||||
{
|
||||
n = 6;
|
||||
high = (utf8_wchar_t)(*p & 0x01);
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* does the sequence header tell us truth about length? */
|
||||
if (lim - p <= n - 1)
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
n = 1;
|
||||
continue; /* skip */
|
||||
}
|
||||
|
||||
/*
|
||||
* Validate sequence.
|
||||
* All symbols must have higher bits set to 10xxxxxx
|
||||
*/
|
||||
if (n > 1)
|
||||
{
|
||||
for (i = 1; i < n; i++)
|
||||
{
|
||||
if ((p[i] & 0xc0) != _NXT)
|
||||
break;
|
||||
}
|
||||
if (i != n)
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
n = 1;
|
||||
continue; /* skip */
|
||||
}
|
||||
}
|
||||
|
||||
total++;
|
||||
|
||||
if (out == NULL)
|
||||
continue;
|
||||
|
||||
if (out >= wlim)
|
||||
return (0); /* no space left */
|
||||
|
||||
uint32_t out_val = 0;
|
||||
*out = 0;
|
||||
n_bits = 0;
|
||||
for (i = 1; i < n; i++)
|
||||
{
|
||||
out_val |= (utf8_wchar_t)(p[n - i] & 0x3f) << n_bits;
|
||||
n_bits += 6; /* 6 low bits in every byte */
|
||||
}
|
||||
out_val |= high << n_bits;
|
||||
|
||||
bool skip = false;
|
||||
if (__wchar_forbitten(out_val) != 0)
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
{
|
||||
return 0; /* forbitten character */
|
||||
}
|
||||
else
|
||||
{
|
||||
skip = true;
|
||||
}
|
||||
}
|
||||
else if (out_val == _BOM && (flags & UTF8_SKIP_BOM) != 0)
|
||||
{
|
||||
skip = true;
|
||||
}
|
||||
|
||||
if (skip)
|
||||
{
|
||||
total--;
|
||||
}
|
||||
else if (out_val > UTF8_WCHAR_MAX)
|
||||
{
|
||||
// wchar_t is UCS-2, but the UTF-8 specified an astral character
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
*out++ = out_val;
|
||||
}
|
||||
}
|
||||
|
||||
return (total);
|
||||
}
|
||||
|
||||
/*
|
||||
* DESCRIPTION
|
||||
* This function translates UCS-2/4 symbols (given in local machine
|
||||
* byte order) into UTF-8 string.
|
||||
*
|
||||
* It takes the following arguments:
|
||||
* in - input unicode string. It can be null-terminated.
|
||||
* insize - size of input string in wide characters.
|
||||
* out - result buffer for utf8 string. If out is NULL,
|
||||
* function returns size of result buffer.
|
||||
* outsize - size of result buffer.
|
||||
*
|
||||
* RETURN VALUES
|
||||
* The function returns size of result buffer (in bytes). Zero is returned
|
||||
* in case of error.
|
||||
*
|
||||
* CAVEATS
|
||||
* If UCS-4 string contains zero symbols, they will be translated
|
||||
* as regular symbols.
|
||||
*/
|
||||
static size_t wchar_to_utf8_internal(const utf8_wchar_t *in, size_t insize, char *out, size_t outsize, int flags)
|
||||
{
|
||||
const utf8_wchar_t *w, *wlim;
|
||||
unsigned char *p, *lim;
|
||||
size_t total, n;
|
||||
|
||||
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
|
||||
return (0);
|
||||
|
||||
w = in;
|
||||
wlim = w + insize;
|
||||
p = (unsigned char *)out;
|
||||
lim = p + outsize;
|
||||
total = 0;
|
||||
for (; w < wlim; w++)
|
||||
{
|
||||
if (__wchar_forbitten(*w) != 0)
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
else
|
||||
continue;
|
||||
}
|
||||
|
||||
if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
|
||||
continue;
|
||||
|
||||
const int32_t w_wide = *w;
|
||||
if (w_wide < 0)
|
||||
{
|
||||
if ((flags & UTF8_IGNORE_ERROR) == 0)
|
||||
return (0);
|
||||
continue;
|
||||
}
|
||||
else if (w_wide <= 0x0000007f)
|
||||
n = 1;
|
||||
else if (w_wide <= 0x000007ff)
|
||||
n = 2;
|
||||
else if (w_wide <= 0x0000ffff)
|
||||
n = 3;
|
||||
else if (w_wide <= 0x001fffff)
|
||||
n = 4;
|
||||
else if (w_wide <= 0x03ffffff)
|
||||
n = 5;
|
||||
else /* if (w_wide <= 0x7fffffff) */
|
||||
n = 6;
|
||||
|
||||
total += n;
|
||||
|
||||
if (out == NULL)
|
||||
continue;
|
||||
|
||||
if (lim - p <= n - 1)
|
||||
return (0); /* no space left */
|
||||
|
||||
/* extract the wchar_t as big-endian. If wchar_t is UCS-16, the first two bytes will be 0 */
|
||||
unsigned char oc[4];
|
||||
uint32_t w_tmp = *w;
|
||||
oc[3] = w_tmp & 0xFF;
|
||||
w_tmp >>= 8;
|
||||
oc[2] = w_tmp & 0xFF;
|
||||
w_tmp >>= 8;
|
||||
oc[1] = w_tmp & 0xFF;
|
||||
w_tmp >>= 8;
|
||||
oc[0] = w_tmp & 0xFF;
|
||||
|
||||
switch (n)
|
||||
{
|
||||
case 1:
|
||||
p[0] = oc[3];
|
||||
break;
|
||||
|
||||
case 2:
|
||||
p[1] = _NXT | (oc[3] & 0x3f);
|
||||
p[0] = _SEQ2 | (oc[3] >> 6) | ((oc[2] & 0x07) << 2);
|
||||
break;
|
||||
|
||||
case 3:
|
||||
p[2] = _NXT | (oc[3] & 0x3f);
|
||||
p[1] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
|
||||
p[0] = _SEQ3 | ((oc[2] & 0xf0) >> 4);
|
||||
break;
|
||||
|
||||
case 4:
|
||||
p[3] = _NXT | (oc[3] & 0x3f);
|
||||
p[2] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
|
||||
p[1] = _NXT | ((oc[2] & 0xf0) >> 4) |
|
||||
((oc[1] & 0x03) << 4);
|
||||
p[0] = _SEQ4 | ((oc[1] & 0x1f) >> 2);
|
||||
break;
|
||||
|
||||
case 5:
|
||||
p[4] = _NXT | (oc[3] & 0x3f);
|
||||
p[3] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
|
||||
p[2] = _NXT | ((oc[2] & 0xf0) >> 4) |
|
||||
((oc[1] & 0x03) << 4);
|
||||
p[1] = _NXT | (oc[1] >> 2);
|
||||
p[0] = _SEQ5 | (oc[0] & 0x03);
|
||||
break;
|
||||
|
||||
case 6:
|
||||
p[5] = _NXT | (oc[3] & 0x3f);
|
||||
p[4] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
|
||||
p[3] = _NXT | (oc[2] >> 4) | ((oc[1] & 0x03) << 4);
|
||||
p[2] = _NXT | (oc[1] >> 2);
|
||||
p[1] = _NXT | (oc[0] & 0x3f);
|
||||
p[0] = _SEQ6 | ((oc[0] & 0x40) >> 6);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: do not check here for forbitten UTF-8 characters.
|
||||
* They cannot appear here because we do proper convertion.
|
||||
*/
|
||||
|
||||
p += n;
|
||||
}
|
||||
|
||||
return (total);
|
||||
}
|
39
utf8.h
Normal file
39
utf8.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
* Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* utf8: implementation of UTF-8 charset encoding (RFC3629).
|
||||
*/
|
||||
#ifndef _UTF8_H_
|
||||
#define _UTF8_H_
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <string>
|
||||
#include <wchar.h>
|
||||
|
||||
/* Flag: skip input that cannot be converted instead of failing the whole conversion */
#define UTF8_IGNORE_ERROR 0x01
|
||||
/* Flag: drop the byte order mark (U+FEFF) from the output */
#define UTF8_SKIP_BOM 0x02
|
||||
|
||||
/* Convert a UTF-8 string to a wide string. result is cleared first. Returns true on success (including empty input), false on failure. */
bool utf8_to_wchar_string(const std::string &input, std::wstring *result);
|
||||
/* Convert a wide string to a UTF-8 string. result is cleared first. Returns true on success (including empty input), false on failure. */
bool wchar_to_utf8_string(const std::wstring &input, std::string *result);
|
||||
|
||||
/* Convert insize bytes of UTF-8 into at most outsize wide characters. If out is NULL, only the required size is computed. Returns the size in wide characters, or 0 on error. */
size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize, int flags);
|
||||
/* Convert insize wide characters into at most outsize bytes of UTF-8. If out is NULL, only the required size is computed. Returns the size in bytes, or 0 on error. */
size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize, int flags);
|
||||
|
||||
/* Returns true if the wide character type used by the converters cannot hold values above 0xFFFF */
bool is_wchar_ucs2();
|
||||
|
||||
#endif /* !_UTF8_H_ */
|
Loading…
Reference in a new issue