mirror of
https://github.com/AsahiLinux/u-boot
synced 2024-11-28 15:41:40 +00:00
lib: charset: utility functions for Unicode
utf8_get() - get next UTF-8 code point from buffer utf8_put() - write UTF-8 code point to buffer utf8_utf16_strnlen() - length of a utf-8 string after conversion to utf-16 utf8_utf16_strncpy() - copy a utf-8 string to utf-16 utf16_get() - get next UTF-16 code point from buffer utf16_put() - write UTF-16 code point to buffer utf16_strnlen() - number of codes points in a utf-16 string utf16_utf8_strnlen() - length of a utf-16 string after conversion to utf-8 utf16_utf8_strncpy() - copy a utf-16 string to utf-8 Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de> Signed-off-by: Alexander Graf <agraf@suse.de>
This commit is contained in:
parent
1dde0d57a5
commit
d8c28232c3
2 changed files with 363 additions and 3 deletions
|
@ -8,10 +8,140 @@
|
|||
#ifndef __CHARSET_H_
|
||||
#define __CHARSET_H_
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#define MAX_UTF8_PER_UTF16 3
|
||||
|
||||
/**
|
||||
* utf8_get() - get next UTF-8 code point from buffer
|
||||
*
|
||||
* @src: pointer to current byte, updated to point to next byte
|
||||
* Return: code point, or 0 for end of string, or -1 if no legal
|
||||
* code point is found. In case of an error src points to
|
||||
* the incorrect byte.
|
||||
*/
|
||||
s32 utf8_get(const char **src);
|
||||
|
||||
/**
|
||||
* utf8_put() - write UTF-8 code point to buffer
|
||||
*
|
||||
* @code: code point
|
||||
* @dst: pointer to destination buffer, updated to next position
|
||||
* Return: 0 on success, -1 if the input parameters are invalid
|
||||
*/
|
||||
int utf8_put(s32 code, char **dst);
|
||||
|
||||
/**
|
||||
* utf8_utf16_strnlen() - length of a truncated utf-8 string after conversion
|
||||
* to utf-16
|
||||
*
|
||||
* @src: utf-8 string
|
||||
* @count: maximum number of code points to convert
|
||||
* Return: length in u16 words after conversion to utf-16 without the
|
||||
* trailing \0. If an invalid UTF-8 sequence is hit one
|
||||
* word will be reserved for a replacement character.
|
||||
*/
|
||||
size_t utf8_utf16_strnlen(const char *src, size_t count);
|
||||
|
||||
/**
|
||||
* utf8_utf16_strlen() - length of a utf-8 string after conversion to utf-16
|
||||
*
|
||||
* @src: utf-8 string
|
||||
* Return: length in u16 words after conversion to utf-16 without the
|
||||
* trailing \0. An invalid UTF-8 sequence is counted as one word.
|
||||
*/
|
||||
#define utf8_utf16_strlen(a) utf8_utf16_strnlen((a), SIZE_MAX)
|
||||
|
||||
/**
|
||||
* utf8_utf16_strncpy() - copy utf-8 string to utf-16 string
|
||||
*
|
||||
* @dst: destination buffer
|
||||
* @src: source buffer
|
||||
* @count: maximum number of code points to copy
|
||||
* Return: 0 on success, -1 if the input parameters are invalid
|
||||
*/
|
||||
int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count);
|
||||
|
||||
/**
|
||||
* utf8_utf16_strcpy() - copy utf-8 string to utf-16 string
|
||||
*
|
||||
* @dst: destination buffer
|
||||
* @src: source buffer
|
||||
* Return: 0 on success, -1 if the input parameters are invalid
|
||||
*/
|
||||
#define utf8_utf16_strcpy(d, s) utf8_utf16_strncpy((d), (s), SIZE_MAX)
|
||||
|
||||
/**
|
||||
* utf16_get() - get next UTF-16 code point from buffer
|
||||
*
|
||||
* @src: pointer to current word, updated to point to next word
|
||||
* Return: code point, or 0 for end of string, or -1 if no legal
|
||||
* code point is found. In case of an error src points to
|
||||
* the incorrect word.
|
||||
*/
|
||||
s32 utf16_get(const u16 **src);
|
||||
|
||||
/**
|
||||
* utf16_put() - write UTF-16 code point to buffer
|
||||
*
|
||||
* @code: code point
|
||||
* @dst: pointer to destination buffer, updated to next position
|
||||
* Return: 0 on success, -1 if the input parameters are invalid
|
||||
*/
|
||||
int utf16_put(s32 code, u16 **dst);
|
||||
|
||||
/**
|
||||
* utf16_strnlen() - length of a truncated utf-16 string
|
||||
*
|
||||
* @src: utf-16 string
|
||||
* @count: maximum number of code points to convert
|
||||
* Return: length in code points. If an invalid UTF-16 sequence is
|
||||
* hit one position will be reserved for a replacement
|
||||
* character.
|
||||
*/
|
||||
size_t utf16_strnlen(const u16 *src, size_t count);
|
||||
|
||||
/**
|
||||
* utf16_utf8_strnlen() - length of a truncated utf-16 string after conversion
|
||||
* to utf-8
|
||||
*
|
||||
* @src: utf-16 string
|
||||
* @count: maximum number of code points to convert
|
||||
* Return: length in bytes after conversion to utf-8 without the
|
||||
* trailing \0. If an invalid UTF-16 sequence is hit one
|
||||
* byte will be reserved for a replacement character.
|
||||
*/
|
||||
size_t utf16_utf8_strnlen(const u16 *src, size_t count);
|
||||
|
||||
/**
|
||||
* utf16_utf8_strlen() - length of a utf-16 string after conversion to utf-8
|
||||
*
|
||||
* @src: utf-16 string
|
||||
* Return: length in bytes after conversion to utf-8 without the
|
||||
* trailing \0. An invalid UTF-16 sequence is counted as one byte.
|
||||
*/
|
||||
#define utf16_utf8_strlen(a) utf16_utf8_strnlen((a), SIZE_MAX)
|
||||
|
||||
/**
|
||||
* utf16_utf8_strncpy() - copy utf-16 string to utf-8 string
|
||||
*
|
||||
* @dst: destination buffer
|
||||
* @src: source buffer
|
||||
* @count: maximum number of code points to copy
|
||||
* Return: 0 on success, -1 if the input parameters are invalid
|
||||
*/
|
||||
int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count);
|
||||
|
||||
/**
|
||||
* utf16_utf8_strcpy() - copy utf-16 string to utf-8 string
|
||||
*
|
||||
* @dst: destination buffer
|
||||
* @src: source buffer
|
||||
* Return: 0 on success, -1 if the input parameters are invalid
|
||||
*/
|
||||
#define utf16_utf8_strcpy(d, s) utf16_utf8_strncpy((d), (s), SIZE_MAX)
|
||||
|
||||
/**
|
||||
* u16_strlen - count non-zero words
|
||||
*
|
||||
|
|
236
lib/charset.c
236
lib/charset.c
|
@ -8,9 +8,239 @@
|
|||
#include <charset.h>
|
||||
#include <malloc.h>
|
||||
|
||||
/*
|
||||
* utf8/utf16 conversion mostly lifted from grub
|
||||
*/
|
||||
/**
 * utf8_get() - get next UTF-8 code point from buffer
 *
 * @src:	pointer to current byte, updated to point to the next byte
 * Return:	code point, or 0 for end of string (@src is not advanced),
 *		or -1 if @src or *@src is NULL or no legal code point is
 *		found. A sequence that is structurally complete but encodes
 *		a surrogate (0xD800 - 0xDFFF) or is an overlong form is
 *		consumed entirely before -1 is returned.
 */
s32 utf8_get(const char **src)
{
	s32 code = 0;
	/* Smallest code point that legitimately needs this sequence length */
	s32 min_code = 0x80;
	unsigned char c;

	if (!src || !*src)
		return -1;
	if (!**src)
		return 0;
	c = **src;
	if (c < 0x80) {
		/* Plain ASCII */
		++*src;
		return c;
	}

	++*src;
	if (!**src)
		return -1;
	/*
	 * We do not expect a continuation byte (0x80 - 0xbf).
	 * 0x80 is coded as 0xc2 0x80, so we cannot have less than 0xc2
	 * here.
	 * The highest code point is 0x10ffff which is coded as
	 * 0xf4 0x8f 0xbf 0xbf. So we cannot have a lead byte above 0xf4.
	 */
	if (c < 0xc2 || c > 0xf4)
		return -1;
	if (c >= 0xe0) {
		if (c >= 0xf0) {
			/* 0xf0 - 0xf4: four byte sequence */
			min_code = 0x10000;
			c &= 0x07;
			code = c << 18;
			c = **src;
			++*src;
			if (!**src)
				return -1;
			if (c < 0x80 || c > 0xbf)
				return -1;
			c &= 0x3f;
		} else {
			/* 0xe0 - 0xef: three byte sequence */
			min_code = 0x800;
			c &= 0x0f;
		}
		code += c << 12;
		/* 0xf4 followed by 0x90 or above exceeds 0x10ffff */
		if (code >= 0x110000)
			return -1;
		c = **src;
		++*src;
		if (!**src)
			return -1;
		if (c < 0x80 || c > 0xbf)
			return -1;
	}
	/* 0xc2 - 0xdf or continuation byte (0x80 - 0xbf) */
	c &= 0x3f;
	code += c << 6;
	c = **src;
	if (c < 0x80 || c > 0xbf)
		return -1;
	c &= 0x3f;
	code += c;
	++*src;
	/*
	 * Only the fully assembled value can be checked for surrogates and
	 * overlong forms. An overlong form could otherwise decode to 0 and
	 * be mistaken for the end of the string.
	 */
	if (code < min_code || (code >= 0xD800 && code <= 0xDFFF))
		return -1;
	return code;
}
|
||||
|
||||
/**
 * utf8_put() - write UTF-8 code point to buffer
 *
 * The caller must provide room for up to four bytes at *@dst. No terminating
 * \0 is written.
 *
 * @code:	code point
 * @dst:	pointer to destination buffer, updated to next position
 * Return:	0 on success, -1 if @dst or *@dst is NULL or @code is not a
 *		Unicode scalar value (negative, a surrogate 0xD800 - 0xDFFF,
 *		or above 0x10FFFF). On error nothing is written.
 */
int utf8_put(s32 code, char **dst)
{
	char *out;

	if (!dst || !*dst)
		return -1;
	/* Negative values would otherwise be emitted as a truncated byte */
	if (code < 0 || code >= 0x110000 ||
	    (code >= 0xD800 && code <= 0xDFFF))
		return -1;

	out = *dst;
	if (code <= 0x007F) {
		/* One byte: 0xxxxxxx */
		*out++ = code;
	} else if (code <= 0x07FF) {
		/* Two bytes: 110xxxxx 10xxxxxx */
		*out++ = code >> 6 | 0xC0;
		*out++ = (code & 0x3F) | 0x80;
	} else if (code < 0x10000) {
		/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		*out++ = code >> 12 | 0xE0;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	} else {
		/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		*out++ = code >> 18 | 0xF0;
		*out++ = (code >> 12 & 0x3F) | 0x80;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	}
	*dst = out;
	return 0;
}
|
||||
|
||||
size_t utf8_utf16_strnlen(const char *src, size_t count)
|
||||
{
|
||||
size_t len = 0;
|
||||
|
||||
for (; *src && count; --count) {
|
||||
s32 code = utf8_get(&src);
|
||||
|
||||
if (!code)
|
||||
break;
|
||||
if (code < 0) {
|
||||
/* Reserve space for a replacement character */
|
||||
len += 1;
|
||||
} else if (code < 0x10000) {
|
||||
len += 1;
|
||||
} else {
|
||||
len += 2;
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
 * utf8_utf16_strncpy() - copy utf-8 string to utf-16 string
 *
 * Items for which utf8_get() reports an error are replaced by '?'. A
 * terminating 0 word is always written; *@dst is left pointing at it.
 *
 * @dst:	destination buffer, updated to point to the terminating 0
 * @src:	source buffer
 * @count:	maximum number of code points to copy
 * Return:	0 on success, -1 if @src, @dst or *@dst is NULL
 */
int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
{
	size_t left = count;

	if (!dst || !*dst || !src)
		return -1;

	while (left && *src) {
		s32 cp = utf8_get(&src);

		utf16_put(cp < 0 ? '?' : cp, dst);
		--left;
	}
	**dst = 0;
	return 0;
}
|
||||
|
||||
/**
 * utf16_get() - get next UTF-16 code point from buffer
 *
 * @src:	pointer to current word, updated to point to the next word
 * Return:	code point, or 0 for end of string (@src is not advanced),
 *		or -1 if @src or *@src is NULL or no legal code point is
 *		found (lone low surrogate, or high surrogate not followed
 *		by a low surrogate).
 */
s32 utf16_get(const u16 **src)
{
	s32 code, code2;

	if (!src || !*src)
		return -1;
	if (!**src)
		return 0;
	code = **src;
	++*src;
	/* A low surrogate must not start a sequence */
	if (code >= 0xDC00 && code <= 0xDFFF)
		return -1;
	if (code >= 0xD800 && code <= 0xDBFF) {
		/* High surrogate: a low surrogate has to follow */
		if (!**src)
			return -1;
		code2 = **src;
		++*src;
		/* Low surrogates cover 0xDC00 - 0xDFFF, both ends included */
		if (code2 < 0xDC00 || code2 > 0xDFFF)
			return -1;
		code = 0x10000 + ((code & 0x3ff) << 10) + (code2 & 0x3ff);
	}
	return code;
}
|
||||
|
||||
/**
 * utf16_put() - write UTF-16 code point to buffer
 *
 * The caller must provide room for up to two words at *@dst. No terminating
 * 0 is written.
 *
 * @code:	code point
 * @dst:	pointer to destination buffer, updated to next position
 * Return:	0 on success, -1 if @dst or *@dst is NULL or @code is not a
 *		Unicode scalar value (negative, a surrogate 0xD800 - 0xDFFF,
 *		or above 0x10FFFF). On error nothing is written.
 */
int utf16_put(s32 code, u16 **dst)
{
	u16 *out;

	if (!dst || !*dst)
		return -1;
	/* Negative values would otherwise be emitted as a truncated word */
	if (code < 0 || code >= 0x110000 ||
	    (code >= 0xD800 && code <= 0xDFFF))
		return -1;

	out = *dst;
	if (code < 0x10000) {
		*out++ = code;
	} else {
		/* Split the 20 bit offset into a high and a low surrogate */
		code -= 0x10000;
		*out++ = code >> 10 | 0xD800;
		*out++ = (code & 0x3ff) | 0xDC00;
	}
	*dst = out;
	return 0;
}
|
||||
|
||||
/**
 * utf16_strnlen() - length of a truncated utf-16 string
 *
 * @src:	utf-16 string
 * @count:	maximum number of code points to count
 * Return:	length in code points. An item for which utf16_get() reports
 *		an error is counted as one position.
 */
size_t utf16_strnlen(const u16 *src, size_t count)
{
	size_t points = 0;

	while (*src && count) {
		if (!utf16_get(&src))
			break;
		/* Legal or not, the item occupies exactly one position */
		++points;
		--count;
	}
	return points;
}
|
||||
|
||||
size_t utf16_utf8_strnlen(const u16 *src, size_t count)
|
||||
{
|
||||
size_t len = 0;
|
||||
|
||||
for (; *src && count; --count) {
|
||||
s32 code = utf16_get(&src);
|
||||
|
||||
if (!code)
|
||||
break;
|
||||
if (code < 0)
|
||||
/* Reserve space for a replacement character */
|
||||
len += 1;
|
||||
else if (code < 0x80)
|
||||
len += 1;
|
||||
else if (code < 0x800)
|
||||
len += 2;
|
||||
else if (code < 0x10000)
|
||||
len += 3;
|
||||
else
|
||||
len += 4;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
|
||||
{
|
||||
if (!src || !dst || !*dst)
|
||||
return -1;
|
||||
|
||||
for (; count && *src; --count) {
|
||||
s32 code = utf16_get(&src);
|
||||
|
||||
if (code < 0)
|
||||
code = '?';
|
||||
utf8_put(code, dst);
|
||||
}
|
||||
**dst = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
size_t u16_strlen(const u16 *in)
|
||||
{
|
||||
|
|
Loading…
Reference in a new issue