lib: charset: utility functions for Unicode

utf8_get() - get next UTF-8 code point from buffer utf8_put() - write UTF-8 code point to buffer utf8_utf16_strnlen() - length of a utf-8 string after conversion to utf-16 utf8_utf16_strncpy() - copy a utf-8 string to utf-16 utf16_get() - get next UTF-16 code point from buffer utf16_put() - write UTF-16 code point to buffer utf16_strnlen() - number of code points in a utf-16 string utf16_utf8_strnlen() - length of a utf-16 string after conversion to utf-8 utf16_utf8_strncpy() - copy a utf-16 string to utf-8 Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de> Signed-off-by: Alexander Graf <agraf@suse.de>
2024-12-01 08:59:33 +00:00 · 2018-08-31 21:31:27 +02:00 · 2018-08-31 21:31:27 +02:00 · d8c28232c3
commit d8c28232c3
parent 1dde0d57a5
2 changed files with 363 additions and 3 deletions
--- a/include/charset.h
+++ b/include/charset.h
@ -8,10 +8,140 @@
 #ifndef __CHARSET_H_
 #define __CHARSET_H_
 #include <linux/kernel.h>
 #include <linux/types.h>
 #define MAX_UTF8_PER_UTF16 3
 /**
 * utf8_get() - get next UTF-8 code point from buffer
 *
 * @src:		pointer to current byte, updated to point to next byte
 * Return:		code point, or 0 for end of string, or -1 if no legal
 *			code point is found. In case of an error src points to
 *			the incorrect byte.
 */
 s32 utf8_get(const char **src);
 /**
 * utf8_put() - write UTF-8 code point to buffer
 *
 * @code:		code point
 * @dst:		pointer to destination buffer, updated to next position
 * Return:		0 on success, -1 if the input parameters are invalid
 */
 int utf8_put(s32 code, char **dst);
 /**
 * utf8_utf16_strnlen() - length of a truncated utf-8 string after conversion
 *			  to utf-16
 *
 * @src:		utf-8 string
 * @count:		maximum number of code points to convert
 * Return:		length in u16 words after conversion to utf-16 without
 *			the trailing \0. If an invalid UTF-8 sequence is hit one
 *			word will be reserved for a replacement character.
 */
 size_t utf8_utf16_strnlen(const char *src, size_t count);
 /**
 * utf8_utf16_strlen() - length of a utf-8 string after conversion to utf-16
 *
 * @src:		utf-8 string
 * Return:		length in u16 words after conversion to utf-16 without
 *			the trailing \0. An invalid UTF-8 sequence is counted as
 *			one word for a replacement character.
 */
 #define utf8_utf16_strlen(a) utf8_utf16_strnlen((a), SIZE_MAX)
 /**
 * utf8_utf16_strncpy() - copy utf-8 string to utf-16 string
 *
 * @dst:		destination buffer
 * @src:		source buffer
 * @count:		maximum number of code points to copy
 * Return:		0 on success, -1 if the input parameters are invalid
 */
 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count);
 /**
 * utf8_utf16_strcpy() - copy utf-8 string to utf-16 string
 *
 * @dst:		destination buffer
 * @src:		source buffer
 * Return:		0 on success, -1 if the input parameters are invalid
 */
 #define utf8_utf16_strcpy(d, s) utf8_utf16_strncpy((d), (s), SIZE_MAX)
 /**
 * utf16_get() - get next UTF-16 code point from buffer
 *
 * @src:		pointer to current word, updated to point to next word
 * Return:		code point, or 0 for end of string, or -1 if no legal
 *			code point is found. In case of an error src points to
 *			the incorrect word.
 */
 s32 utf16_get(const u16 **src);
 /**
 * utf16_put() - write UTF-16 code point to buffer
 *
 * @code:		code point
 * @dst:		pointer to destination buffer, updated to next position
 * Return:		0 on success, -1 if the input parameters are invalid
 */
 int utf16_put(s32 code, u16 **dst);
 /**
 * utf16_strnlen() - length of a truncated utf-16 string
 *
 * @src:		utf-16 string
 * @count:		maximum number of code points to convert
 * Return:		length in code points. If an invalid UTF-16 sequence is
 *			hit one position will be reserved for a replacement
 *			character.
 */
 size_t utf16_strnlen(const u16 *src, size_t count);
 /**
 * utf16_utf8_strnlen() - length of a truncated utf-16 string after conversion
 *			  to utf-8
 *
 * @src:		utf-16 string
 * @count:		maximum number of code points to convert
 * Return:		length in bytes after conversion to utf-8 without the
 *			trailing \0. If an invalid UTF-16 sequence is hit one
 *			byte will be reserved for a replacement character.
 */
 size_t utf16_utf8_strnlen(const u16 *src, size_t count);
 /**
 * utf16_utf8_strlen() - length of a utf-16 string after conversion to utf-8
 *
 * @src:		utf-16 string
 * Return:		length in bytes after conversion to utf-8 without the
 *			trailing \0. An invalid UTF-16 sequence is counted as
 *			one byte for a replacement character.
 */
 #define utf16_utf8_strlen(a) utf16_utf8_strnlen((a), SIZE_MAX)
 /**
 * utf16_utf8_strncpy() - copy utf-16 string to utf-8 string
 *
 * @dst:		destination buffer
 * @src:		source buffer
 * @count:		maximum number of code points to copy
 * Return:		0 on success, -1 if the input parameters are invalid
 */
 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count);
 /**
 * utf16_utf8_strcpy() - copy utf-16 string to utf-8 string
 *
 * @dst:		destination buffer
 * @src:		source buffer
 * Return:		0 on success, -1 if the input parameters are invalid
 */
 #define utf16_utf8_strcpy(d, s) utf16_utf8_strncpy((d), (s), SIZE_MAX)
 /**
 * u16_strlen - count non-zero words
 *
--- a/lib/charset.c
+++ b/lib/charset.c
@ -8,9 +8,239 @@
 #include <charset.h>
 #include <malloc.h>
/**
 * utf8_get() - get next UTF-8 code point from buffer
 *
 * Only well-formed sequences are accepted. Lead bytes outside 0xc2 - 0xf4,
 * overlong encodings, encoded surrogates (0xD800 - 0xDFFF) and values above
 * 0x10ffff are rejected.
 *
 * @src:	pointer to current byte, updated to point to next byte
 * Return:	code point, or 0 for end of string, or -1 if no legal
 *		code point is found. On error the lead byte has been consumed
 *		and *src points to the first byte that could not be accepted
 *		as part of the sequence, so a valid byte following a broken
 *		sequence is not lost and callers always make progress.
 */
s32 utf8_get(const char **src)
{
	s32 code;
	int trail;
	unsigned char c, lo = 0x80, hi = 0xbf;
	if (!src || !*src)
		return -1;
	c = **src;
	if (!c)
		return 0;
	++*src;
	if (c < 0x80)
		return c;
	/*
	 * We do not expect a continuation byte (0x80 - 0xbf).
	 * 0x80 is coded as 0xc2 0x80, so we cannot have less than 0xc2
	 * here.
	 * The highest code point is 0x10ffff which is coded as
	 * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4.
	 */
	if (c < 0xc2 || c > 0xf4)
		return -1;
	/*
	 * The first continuation byte has a narrower range for some lead
	 * bytes. This excludes overlong forms (0xe0, 0xf0), surrogates (0xed)
	 * and code points above 0x10ffff (0xf4).
	 */
	if (c < 0xe0) {
		trail = 1;
		code = c & 0x1f;
	} else if (c < 0xf0) {
		trail = 2;
		code = c & 0x0f;
		if (c == 0xe0)
			lo = 0xa0;
		else if (c == 0xed)
			hi = 0x9f;
	} else {
		trail = 3;
		code = c & 0x07;
		if (c == 0xf0)
			lo = 0x90;
		else if (c == 0xf4)
			hi = 0x8f;
	}
	for (; trail; --trail) {
		c = **src;
		/* A terminating NUL fails this test, so we never pass it */
		if (c < lo || c > hi)
			return -1;
		++*src;
		code = code << 6 | (c & 0x3f);
		lo = 0x80;
		hi = 0xbf;
	}
	return code;
}
 /**
  * utf8_put() - write UTF-8 code point to buffer
  *
  * Encodes @code as one to four bytes. No terminating NUL is appended and
  * nothing is written when an error is returned.
  *
  * @code:	code point
  * @dst:	pointer to destination buffer, updated to next position
  * Return:	0 on success, -1 if @dst or *@dst is NULL or if @code is
  *		negative, a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF
  */
 int utf8_put(s32 code, char **dst)
 {
 	char *out;
 	if (!dst || !*dst)
 		return -1;
 	/* A negative value would otherwise be truncated to a single byte */
 	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) ||
 	    code >= 0x110000)
 		return -1;
 	out = *dst;
 	if (code <= 0x007F) {
 		*out++ = code;
 	} else if (code <= 0x07FF) {
 		*out++ = code >> 6 | 0xC0;
 		*out++ = (code & 0x3F) | 0x80;
 	} else if (code < 0x10000) {
 		*out++ = code >> 12 | 0xE0;
 		*out++ = (code >> 6 & 0x3F) | 0x80;
 		*out++ = (code & 0x3F) | 0x80;
 	} else {
 		*out++ = code >> 18 | 0xF0;
 		*out++ = (code >> 12 & 0x3F) | 0x80;
 		*out++ = (code >> 6 & 0x3F) | 0x80;
 		*out++ = (code & 0x3F) | 0x80;
 	}
 	*dst = out;
 	return 0;
 }
 /**
  * utf8_utf16_strnlen() - length of a truncated utf-8 string after conversion
  *			  to utf-16
  *
  * @src:	utf-8 string
  * @count:	maximum number of code points to convert
  * Return:	number of u16 words needed for the utf-16 form, without the
  *		trailing \0. An invalid sequence is counted as one word for
  *		a replacement character.
  */
 size_t utf8_utf16_strnlen(const char *src, size_t count)
 {
 	size_t words = 0;
 	while (*src && count) {
 		s32 cp = utf8_get(&src);
 		if (!cp)
 			break;
 		/*
 		 * Code points beyond the BMP need a surrogate pair. Everything
 		 * else, including the replacement for an error, is one word.
 		 */
 		words += (cp >= 0x10000) ? 2 : 1;
 		--count;
 	}
 	return words;
 }
 /**
  * utf8_utf16_strncpy() - copy utf-8 string to utf-16 string
  *
  * Every code point that cannot be decoded or cannot be encoded as utf-16
  * is replaced by '?'. The output is always terminated with a 0 word.
  * The size of the destination buffer is not checked here.
  *
  * @dst:	destination buffer, updated to point to the terminating 0
  * @src:	source buffer
  * @count:	maximum number of code points to copy
  * Return:	0 on success, -1 if the input parameters are invalid
  */
 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
 {
 	if (!src || !dst || !*dst)
 		return -1;
 	for (; count && *src; --count) {
 		s32 code = utf8_get(&src);
 		/* Treat 0 as end of string like utf8_utf16_strnlen() does */
 		if (!code)
 			break;
 		/*
 		 * Do not silently drop a code point that utf16_put() rejects.
 		 * The length functions reserve one word for it.
 		 */
 		if (code < 0 || utf16_put(code, dst))
 			utf16_put('?', dst);
 	}
 	**dst = 0;
 	return 0;
 }
 /**
  * utf16_get() - get next UTF-16 code point from buffer
  *
  * @src:	pointer to current word, updated to point to next word
  * Return:	code point, or 0 for end of string, or -1 if no legal
  *		code point is found: a low surrogate without a preceding
  *		high surrogate, or a high surrogate that is not followed
  *		by a low surrogate. On error the words examined have been
  *		consumed, except for a terminating 0.
  */
 s32 utf16_get(const u16 **src)
 {
 	s32 code, code2;
 	if (!src || !*src)
 		return -1;
 	if (!**src)
 		return 0;
 	code = **src;
 	++*src;
 	/* A low surrogate must not come first */
 	if (code >= 0xDC00 && code <= 0xDFFF)
 		return -1;
 	if (code >= 0xD800 && code <= 0xDBFF) {
 		/* High surrogate at the end of the string */
 		if (!**src)
 			return -1;
 		code2 = **src;
 		++*src;
 		/* The full range 0xDC00 - 0xDFFF is valid, bounds included */
 		if (code2 < 0xDC00 || code2 > 0xDFFF)
 			return -1;
 		code = 0x10000 + ((code & 0x3ff) << 10) + (code2 & 0x3ff);
 	}
 	return code;
 }
 /**
  * utf16_put() - write UTF-16 code point to buffer
  *
  * Writes one word for code points below 0x10000 and a surrogate pair
  * otherwise. No terminating 0 is appended and nothing is written when an
  * error is returned.
  *
  * @code:	code point
  * @dst:	pointer to destination buffer, updated to next position
  * Return:	0 on success, -1 if @dst or *@dst is NULL or if @code is
  *		negative, a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF
  */
 int utf16_put(s32 code, u16 **dst)
 {
 	u16 *out;
 	if (!dst || !*dst)
 		return -1;
 	/* A negative value would otherwise be truncated to 16 bits */
 	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) ||
 	    code >= 0x110000)
 		return -1;
 	out = *dst;
 	if (code < 0x10000) {
 		*out++ = code;
 	} else {
 		code -= 0x10000;
 		*out++ = code >> 10 | 0xD800;
 		*out++ = (code & 0x3ff) | 0xDC00;
 	}
 	*dst = out;
 	return 0;
 }
 /**
  * utf16_strnlen() - length of a truncated utf-16 string
  *
  * @src:	utf-16 string
  * @count:	maximum number of code points to count
  * Return:	length in code points. An illegal sequence is counted as
  *		one position for a replacement character.
  */
 size_t utf16_strnlen(const u16 *src, size_t count)
 {
 	size_t points = 0;
 	while (*src && count) {
 		/* Negative (illegal) results are counted, only 0 ends the scan */
 		if (!utf16_get(&src))
 			break;
 		++points;
 		--count;
 	}
 	return points;
 }
 /**
  * utf16_utf8_strnlen() - length of a truncated utf-16 string after conversion
  *			  to utf-8
  *
  * @src:	utf-16 string
  * @count:	maximum number of code points to convert
  * Return:	length in bytes after conversion to utf-8 without the
  *		trailing \0. An invalid sequence is counted as one byte for
  *		a replacement character.
  */
 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
 {
 	size_t bytes = 0;
 	while (*src && count) {
 		s32 cp = utf16_get(&src);
 		if (!cp)
 			break;
 		/* cp < 0 (error) lands in the first branch: one byte */
 		if (cp < 0x80)
 			bytes += 1;
 		else if (cp < 0x800)
 			bytes += 2;
 		else if (cp < 0x10000)
 			bytes += 3;
 		else
 			bytes += 4;
 		--count;
 	}
 	return bytes;
 }
 /**
  * utf16_utf8_strncpy() - copy utf-16 string to utf-8 string
  *
  * Illegal utf-16 sequences are replaced by '?'. The output is always
  * terminated with a 0 byte. The size of the destination buffer is not
  * checked here.
  *
  * @dst:	destination buffer, updated to point to the terminating 0
  * @src:	source buffer
  * @count:	maximum number of code points to copy
  * Return:	0 on success, -1 if the input parameters are invalid
  */
 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
 {
 	if (!dst || !*dst || !src)
 		return -1;
 	while (count && *src) {
 		s32 cp = utf16_get(&src);
 		utf8_put(cp < 0 ? '?' : cp, dst);
 		--count;
 	}
 	**dst = 0;
 	return 0;
 }
 size_t u16_strlen(const u16 *in)
 {