diff --git a/include/linux/nls.h b/include/linux/nls.h new file mode 100644 index 0000000..62fb7b5 --- /dev/null +++ b/include/linux/nls.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NLS_H +#define _LINUX_NLS_H + +/* Unicode has changed over the years. Unicode code points no longer + * fit into 16 bits; as of Unicode 5 valid code points range from 0 + * to 0x10ffff (17 planes, where each plane holds 65536 code points). + * + * The original decision to represent Unicode characters as 16-bit + * wchar_t values is now outdated. But plane 0 still includes the + * most commonly used characters, so we will retain it. The newer + * 32-bit unicode_t type can be used when it is necessary to + * represent the full Unicode character set. + */ + +/* Plane-0 Unicode character */ +typedef u16 wchar_t; +#define MAX_WCHAR_T 0xffff + +/* Arbitrary Unicode character */ +typedef u32 unicode_t; + +/* this value hold the maximum octet of charset */ +#define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */ + +/* Byte order for UTF-16 strings */ +enum utf16_endian { + UTF16_HOST_ENDIAN, + UTF16_LITTLE_ENDIAN, + UTF16_BIG_ENDIAN +}; + +/* nls_base.c */ + +extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu); +extern int utf8s_to_utf16s(const u8 *s, int len, + enum utf16_endian endian, wchar_t *pwcs, int maxlen); + +#endif /* _LINUX_NLS_H */ + diff --git a/lib/Kconfig b/lib/Kconfig index e048ade..44f30d8 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -148,4 +148,7 @@ config GENERIC_LIB_MULDI3 bool +config NLS + bool "Native language support" + endmenu diff --git a/lib/Makefile b/lib/Makefile index e72aa66..763516d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -67,6 +67,7 @@ obj-y += clz_ctz.o obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_NLS) += nls_base.o # GCC library routines obj-$(CONFIG_GENERIC_LIB_ASHLDI3) += ashldi3.o diff --git a/lib/nls_base.c b/lib/nls_base.c new file mode 100644 index 0000000..cd6a5ff --- /dev/null +++ b/lib/nls_base.c @@ -0,0 +1,131 @@ +/* + * linux/fs/nls/nls_base.c + * + * Native language support--charsets and unicode translations. + * By Gordon Chaffee 1996, 1997 + * + * Unicode based case conversion 1999 by Wolfram Pienkoss + * + */ +#include +#include + +/* + * Sample implementation from Unicode home page. + * http://www.stonehand.com/unicode/standard/fss-utf.html + */ +struct utf8_table { + int cmask; + int cval; + int shift; + long lmask; + long lval; +}; + +static const struct utf8_table utf8_table[] = +{ + {0x80, 0x00, 0*6, 0x7F, 0, /* 1 byte sequence */}, + {0xE0, 0xC0, 1*6, 0x7FF, 0x80, /* 2 byte sequence */}, + {0xF0, 0xE0, 2*6, 0xFFFF, 0x800, /* 3 byte sequence */}, + {0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */}, + {0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */}, + {0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */}, + {0, /* end of table */} +}; + +#define UNICODE_MAX 0x0010ffff +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu) +{ + unsigned long l; + int c0, c, nc; + const struct utf8_table *t; + + nc = 0; + c0 = *s; + l = c0; + for (t = utf8_table; t->cmask; t++) { + nc++; + if ((c0 & t->cmask) == t->cval) { + l &= t->lmask; + if (l < t->lval || l > UNICODE_MAX || + (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + *pu = (unicode_t) l; + return nc; + } + if (inlen <= nc) + return -1; + s++; + c = (*s ^ 0x80) & 0xFF; + if (c & 0xC0) + return -1; + l = (l << 6) | c; + } + return -1; +} +EXPORT_SYMBOL(utf8_to_utf32); + +static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + *s = (wchar_t) c; + break; + case UTF16_LITTLE_ENDIAN: + *s = __cpu_to_le16(c); + break; + case UTF16_BIG_ENDIAN: + *s = __cpu_to_be16(c); + break; + } +} + +int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, + wchar_t *pwcs, int maxout) +{ + u16 *op; + int size; + unicode_t u; + + op = pwcs; + while (inlen > 0 && maxout > 0 && *s) { + if (*s & 0x80) { + size = utf8_to_utf32(s, inlen, &u); + if (size < 0) + return -EINVAL; + s += size; + inlen -= size; + + if (u >= PLANE_SIZE) { + if (maxout < 2) + break; + u -= PLANE_SIZE; + put_utf16(op++, SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS), + endian); + put_utf16(op++, SURROGATE_PAIR | + SURROGATE_LOW | + (u & SURROGATE_BITS), + endian); + maxout -= 2; + } else { + put_utf16(op++, u, endian); + maxout--; + } + } else { + put_utf16(op++, *s++, endian); + inlen--; + maxout--; + } + } + return op - pwcs; +} +EXPORT_SYMBOL(utf8s_to_utf16s); +