Blame - libutils/Unicode.cpp - AOSPA/android_system_core

blob: a66e3bbbb018451e97afeb5d1e2ed6b1e1ee1377 [file] [log] [blame]

Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2005 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include <utils/Unicode.h>
				18
				19	#include <stddef.h>
				20
				21	#ifdef HAVE_WINSOCK
				22	# undef nhtol
				23	# undef htonl
				24	# undef nhtos
				25	# undef htons
				26
				27	# ifdef HAVE_LITTLE_ENDIAN
				28	# define ntohl(x) ( ((x) << 24) \| (((x) >> 24) & 255) \| (((x) << 8) & 0xff0000) \| (((x) >> 8) & 0xff00) )
				29	# define htonl(x) ntohl(x)
				30	# define ntohs(x) ( (((x) << 8) & 0xff00) \| (((x) >> 8) & 255) )
				31	# define htons(x) ntohs(x)
				32	# else
				33	# define ntohl(x) (x)
				34	# define htonl(x) (x)
				35	# define ntohs(x) (x)
				36	# define htons(x) (x)
				37	# endif
				38	#else
				39	# include <netinet/in.h>
				40	#endif
				41
				42	extern "C" {
				43
				44	static const char32_t kByteMask = 0x000000BF;
				45	static const char32_t kByteMark = 0x00000080;
				46
				47	// Surrogates aren't valid for UTF-32 characters, so define some
				48	// constants that will let us screen them out.
				49	static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
				50	static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
				51	static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
				52	static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
				53	static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
				54	static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
				55	static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
				56
				57	// Mask used to set appropriate bits in first byte of UTF-8 sequence,
				58	// indexed by number of bytes in the sequence.
				59	// 0xxxxxxx
				60	// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
				61	// 110yyyyx 10xxxxxx
				62	// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
				63	// 1110yyyy 10yxxxxx 10xxxxxx
				64	// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
				65	// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
				66	// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
				67	static const char32_t kFirstByteMark[] = {
				68	0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
				69	};
				70
// --------------------------------------------------------------------------
// UTF-32
// --------------------------------------------------------------------------
				74
				75	/**
				76	* Return number of UTF-8 bytes required for the character. If the character
				77	* is invalid, return size of 0.
				78	*/
				79	static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
				80	{
				81	// Figure out how many bytes the result will require.
				82	if (srcChar < 0x00000080) {
				83	return 1;
				84	} else if (srcChar < 0x00000800) {
				85	return 2;
				86	} else if (srcChar < 0x00010000) {
				87	if ((srcChar < kUnicodeSurrogateStart) \|\| (srcChar > kUnicodeSurrogateEnd)) {
				88	return 3;
				89	} else {
				90	// Surrogates are invalid UTF-32 characters.
				91	return 0;
				92	}
				93	}
				94	// Max code point for Unicode is 0x0010FFFF.
				95	else if (srcChar <= kUnicodeMaxCodepoint) {
				96	return 4;
				97	} else {
				98	// Invalid UTF-32 character.
				99	return 0;
				100	}
				101	}
				102
				103	// Write out the source character to <dstP>.
				104
				105	static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
				106	{
				107	dstP += bytes;
				108	switch (bytes)
				109	{ /* note: everything falls through. */
				110	case 4: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				111	case 3: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				112	case 2: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				113	case 1: *--dstP = (uint8_t)(srcChar \| kFirstByteMark[bytes]);
				114	}
				115	}
				116
				117	size_t strlen32(const char32_t *s)
				118	{
				119	const char32_t *ss = s;
				120	while ( *ss )
				121	ss++;
				122	return ss-s;
				123	}
				124
				125	size_t strnlen32(const char32_t *s, size_t maxlen)
				126	{
				127	const char32_t *ss = s;
				128	while ((maxlen > 0) && *ss) {
				129	ss++;
				130	maxlen--;
				131	}
				132	return ss-s;
				133	}
				134
				135	static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
				136	{
				137	const char first_char = *cur;
				138	if ((first_char & 0x80) == 0) { // ASCII
				139	*num_read = 1;
				140	return *cur;
				141	}
				142	cur++;
				143	char32_t mask, to_ignore_mask;
				144	size_t num_to_read = 0;
				145	char32_t utf32 = first_char;
				146	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
				147	(first_char & mask);
				148	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				149	// 0x3F == 00111111
				150	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				151	}
				152	to_ignore_mask \|= mask;
				153	utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));
				154
				155	*num_read = num_to_read;
				156	return static_cast<int32_t>(utf32);
				157	}
				158
				159	int32_t utf32_from_utf8_at(const char src, size_t src_len, size_t index, size_t next_index)
				160	{
				161	if (index >= src_len) {
				162	return -1;
				163	}
				164	size_t dummy_index;
				165	if (next_index == NULL) {
				166	next_index = &dummy_index;
				167	}
				168	size_t num_read;
				169	int32_t ret = utf32_at_internal(src + index, &num_read);
				170	if (ret >= 0) {
				171	*next_index = index + num_read;
				172	}
				173
				174	return ret;
				175	}
				176
				177	ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
				178	{
				179	if (src == NULL \|\| src_len == 0) {
				180	return -1;
				181	}
				182
				183	size_t ret = 0;
				184	const char32_t *end = src + src_len;
				185	while (src < end) {
				186	ret += utf32_codepoint_utf8_length(*src++);
				187	}
				188	return ret;
				189	}
				190
				191	void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst)
				192	{
				193	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				194	return;
				195	}
				196
				197	const char32_t *cur_utf32 = src;
				198	const char32_t *end_utf32 = src + src_len;
				199	char *cur = dst;
				200	while (cur_utf32 < end_utf32) {
				201	size_t len = utf32_codepoint_utf8_length(*cur_utf32);
				202	utf32_codepoint_to_utf8((uint8_t )cur, cur_utf32++, len);
				203	cur += len;
				204	}
				205	*cur = '\0';
				206	}
				207
// --------------------------------------------------------------------------
// UTF-16
// --------------------------------------------------------------------------
				211
				212	int strcmp16(const char16_t s1, const char16_t s2)
				213	{
				214	char16_t ch;
				215	int d = 0;
				216
				217	while ( 1 ) {
				218	d = (int)(ch = s1++) - (int)s2++;
				219	if ( d \|\| !ch )
				220	break;
				221	}
				222
				223	return d;
				224	}
				225
				226	int strncmp16(const char16_t s1, const char16_t s2, size_t n)
				227	{
				228	char16_t ch;
				229	int d = 0;
				230
				231	while ( n-- ) {
				232	d = (int)(ch = s1++) - (int)s2++;
				233	if ( d \|\| !ch )
				234	break;
				235	}
				236
				237	return d;
				238	}
				239
				240	char16_t strcpy16(char16_t dst, const char16_t *src)
				241	{
				242	char16_t *q = dst;
				243	const char16_t *p = src;
				244	char16_t ch;
				245
				246	do {
				247	q++ = ch = p++;
				248	} while ( ch );
				249
				250	return dst;
				251	}
				252
				253	size_t strlen16(const char16_t *s)
				254	{
				255	const char16_t *ss = s;
				256	while ( *ss )
				257	ss++;
				258	return ss-s;
				259	}
				260
				261
				262	char16_t strncpy16(char16_t dst, const char16_t *src, size_t n)
				263	{
				264	char16_t *q = dst;
				265	const char16_t *p = src;
				266	char ch;
				267
				268	while (n) {
				269	n--;
				270	q++ = ch = p++;
				271	if ( !ch )
				272	break;
				273	}
				274
				275	*q = 0;
				276
				277	return dst;
				278	}
				279
				280	size_t strnlen16(const char16_t *s, size_t maxlen)
				281	{
				282	const char16_t *ss = s;
				283
				284	/* Important: the maxlen test must precede the reference through ss;
				285	since the byte beyond the maximum may segfault */
				286	while ((maxlen > 0) && *ss) {
				287	ss++;
				288	maxlen--;
				289	}
				290	return ss-s;
				291	}
				292
				293	int strzcmp16(const char16_t s1, size_t n1, const char16_t s2, size_t n2)
				294	{
				295	const char16_t* e1 = s1+n1;
				296	const char16_t* e2 = s2+n2;
				297
				298	while (s1 < e1 && s2 < e2) {
				299	const int d = (int)s1++ - (int)s2++;
				300	if (d) {
				301	return d;
				302	}
				303	}
				304
				305	return n1 < n2
				306	? (0 - (int)*s2)
				307	: (n1 > n2
				308	? ((int)*s1 - 0)
				309	: 0);
				310	}
				311
				312	int strzcmp16_h_n(const char16_t s1H, size_t n1, const char16_t s2N, size_t n2)
				313	{
				314	const char16_t* e1 = s1H+n1;
				315	const char16_t* e2 = s2N+n2;
				316
				317	while (s1H < e1 && s2N < e2) {
				318	const char16_t c2 = ntohs(*s2N);
				319	const int d = (int)*s1H++ - (int)c2;
				320	s2N++;
				321	if (d) {
				322	return d;
				323	}
				324	}
				325
				326	return n1 < n2
				327	? (0 - (int)ntohs(*s2N))
				328	: (n1 > n2
				329	? ((int)*s1H - 0)
				330	: 0);
				331	}
				332
				333	void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst)
				334	{
				335	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				336	return;
				337	}
				338
				339	const char16_t* cur_utf16 = src;
				340	const char16_t* const end_utf16 = src + src_len;
				341	char *cur = dst;
				342	while (cur_utf16 < end_utf16) {
				343	char32_t utf32;
				344	// surrogate pairs
				345	if ((*cur_utf16 & 0xFC00) == 0xD800) {
				346	utf32 = (*cur_utf16++ - 0xD800) << 10;
				347	utf32 \|= *cur_utf16++ - 0xDC00;
				348	utf32 += 0x10000;
				349	} else {
				350	utf32 = (char32_t) *cur_utf16++;
				351	}
				352	const size_t len = utf32_codepoint_utf8_length(utf32);
				353	utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
				354	cur += len;
				355	}
				356	*cur = '\0';
				357	}
				358
// --------------------------------------------------------------------------
// UTF-8
// --------------------------------------------------------------------------
				362
				363	ssize_t utf8_length(const char *src)
				364	{
				365	const char *cur = src;
				366	size_t ret = 0;
				367	while (*cur != '\0') {
				368	const char first_char = *cur++;
				369	if ((first_char & 0x80) == 0) { // ASCII
				370	ret += 1;
				371	continue;
				372	}
				373	// (UTF-8's character must not be like 10xxxxxx,
				374	// but 110xxxxx, 1110xxxx, ... or 1111110x)
				375	if ((first_char & 0x40) == 0) {
				376	return -1;
				377	}
				378
				379	int32_t mask, to_ignore_mask;
				380	size_t num_to_read = 0;
				381	char32_t utf32 = 0;
				382	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
				383	num_to_read < 5 && (first_char & mask);
				384	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				385	if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
				386	return -1;
				387	}
				388	// 0x3F == 00111111
				389	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				390	}
				391	// "first_char" must be (110xxxxx - 11110xxx)
				392	if (num_to_read == 5) {
				393	return -1;
				394	}
				395	to_ignore_mask \|= mask;
				396	utf32 \|= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
				397	if (utf32 > kUnicodeMaxCodepoint) {
				398	return -1;
				399	}
				400
				401	ret += num_to_read;
				402	}
				403	return ret;
				404	}
				405
				406	ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
				407	{
				408	if (src == NULL \|\| src_len == 0) {
				409	return -1;
				410	}
				411
				412	size_t ret = 0;
				413	const char16_t* const end = src + src_len;
				414	while (src < end) {
				415	if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
				416	&& (*++src & 0xFC00) == 0xDC00) {
				417	// surrogate pairs are always 4 bytes.
				418	ret += 4;
				419	src++;
				420	} else {
				421	ret += utf32_codepoint_utf8_length((char32_t) *src++);
				422	}
				423	}
				424	return ret;
				425	}
				426
				427	/**
				428	* Returns 1-4 based on the number of leading bits.
				429	*
				430	* 1111 -> 4
				431	* 1110 -> 3
				432	* 110x -> 2
				433	* 10xx -> 1
				434	* 0xxx -> 1
				435	*/
				436	static inline size_t utf8_codepoint_len(uint8_t ch)
				437	{
				438	return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1;
				439	}
				440
				441	static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
				442	{
				443	*codePoint <<= 6;
				444	*codePoint \|= 0x3F & byte;
				445	}
				446
				447	size_t utf8_to_utf32_length(const char *src, size_t src_len)
				448	{
				449	if (src == NULL \|\| src_len == 0) {
				450	return 0;
				451	}
				452	size_t ret = 0;
				453	const char* cur;
				454	const char* end;
				455	size_t num_to_skip;
				456	for (cur = src, end = src + src_len, num_to_skip = 1;
				457	cur < end;
				458	cur += num_to_skip, ret++) {
				459	const char first_char = *cur;
				460	num_to_skip = 1;
				461	if ((first_char & 0x80) == 0) { // ASCII
				462	continue;
				463	}
				464	int32_t mask;
				465
				466	for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) {
				467	}
				468	}
				469	return ret;
				470	}
				471
				472	void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
				473	{
				474	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				475	return;
				476	}
				477
				478	const char* cur = src;
				479	const char* const end = src + src_len;
				480	char32_t* cur_utf32 = dst;
				481	while (cur < end) {
				482	size_t num_read;
				483	*cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read));
				484	cur += num_read;
				485	}
				486	*cur_utf32 = 0;
				487	}
				488
				489	static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
				490	{
				491	uint32_t unicode;
				492
				493	switch (length)
				494	{
				495	case 1:
				496	return src[0];
				497	case 2:
				498	unicode = src[0] & 0x1f;
				499	utf8_shift_and_mask(&unicode, src[1]);
				500	return unicode;
				501	case 3:
				502	unicode = src[0] & 0x0f;
				503	utf8_shift_and_mask(&unicode, src[1]);
				504	utf8_shift_and_mask(&unicode, src[2]);
				505	return unicode;
				506	case 4:
				507	unicode = src[0] & 0x07;
				508	utf8_shift_and_mask(&unicode, src[1]);
				509	utf8_shift_and_mask(&unicode, src[2]);
				510	utf8_shift_and_mask(&unicode, src[3]);
				511	return unicode;
				512	default:
				513	return 0xffff;
				514	}
				515
				516	//printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
				517	}
				518
				519	ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len)
				520	{
				521	const uint8_t* const u8end = u8str + u8len;
				522	const uint8_t* u8cur = u8str;
				523
				524	/* Validate that the UTF-8 is the correct len */
				525	size_t u16measuredLen = 0;
				526	while (u8cur < u8end) {
				527	u16measuredLen++;
				528	int u8charLen = utf8_codepoint_len(*u8cur);
				529	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
				530	if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
				531	u8cur += u8charLen;
				532	}
				533
				534	/**
				535	* Make sure that we ended where we thought we would and the output UTF-16
				536	* will be exactly how long we were told it would be.
				537	*/
				538	if (u8cur != u8end) {
				539	return -1;
				540	}
				541
				542	return u16measuredLen;
				543	}
				544
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	545	char16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	546	{
				547	const uint8_t* const u8end = u8str + u8len;
				548	const uint8_t* u8cur = u8str;
				549	char16_t* u16cur = u16str;
				550
				551	while (u8cur < u8end) {
				552	size_t u8len = utf8_codepoint_len(*u8cur);
				553	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				554
				555	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				556	if (codepoint <= 0xFFFF) {
				557	// Single UTF16 character
				558	*u16cur++ = (char16_t) codepoint;
				559	} else {
				560	// Multiple UTF16 characters with surrogates
				561	codepoint = codepoint - 0x10000;
				562	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				563	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				564	}
				565
				566	u8cur += u8len;
				567	}
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	568	return u16cur;
				569	}
				570
				571	void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) {
				572	char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str);
				573	*end = 0;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	574	}
				575
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	576	char16_t* utf8_to_utf16_n(const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) {
				577	const uint8_t* const u8end = src + srcLen;
				578	const uint8_t* u8cur = src;
				579	const uint16_t* const u16end = dst + dstLen;
				580	char16_t* u16cur = dst;
				581
				582	while (u8cur < u8end && u16cur < u16end) {
				583	size_t u8len = utf8_codepoint_len(*u8cur);
				584	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				585
				586	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				587	if (codepoint <= 0xFFFF) {
				588	// Single UTF16 character
				589	*u16cur++ = (char16_t) codepoint;
				590	} else {
				591	// Multiple UTF16 characters with surrogates
				592	codepoint = codepoint - 0x10000;
				593	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				594	if (u16cur >= u16end) {
				595	// Ooops... not enough room for this surrogate pair.
				596	return u16cur-1;
				597	}
				598	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				599	}
				600
				601	u8cur += u8len;
				602	}
				603	return u16cur;
				604	}
				605
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	606	}