mirror of
https://gitee.com/zlgopen/awtk.git
synced 2024-12-03 04:27:44 +08:00
159 lines
4.6 KiB
C
159 lines
4.6 KiB
C
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
|
|
|
|
/*
 * Break processing in a Unicode sequence. Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2015-2016 Wu Yongwei <wuyongwei at gmail dot com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software. If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 */
|
|
|
|
/**
 * @file    unibreakdef.c
 *
 * Definition of utility functions used by the libunibreak library.
 *
 * @author  Wu Yongwei
 */
|
|
|
|
#include <assert.h>
|
|
#include <stddef.h>
|
|
#include "unibreakdef.h"
|
|
|
|
/**
|
|
* Gets the next Unicode character in a UTF-8 sequence. The index will
|
|
* be advanced to the next complete character, unless the end of string
|
|
* is reached in the middle of a UTF-8 sequence.
|
|
*
|
|
* @param[in] s input UTF-8 string
|
|
* @param[in] len length of the string in bytes
|
|
* @param[in,out] ip pointer to the index
|
|
* @return the Unicode character beginning at the index; or
|
|
* #EOS if end of input is encountered
|
|
*/
|
|
utf32_t ub_get_next_char_utf8(
|
|
const utf8_t *s,
|
|
size_t len,
|
|
size_t *ip)
|
|
{
|
|
utf8_t ch;
|
|
utf32_t res;
|
|
|
|
assert(*ip <= len);
|
|
if (*ip == len)
|
|
return EOS;
|
|
ch = s[*ip];
|
|
|
|
if (ch < 0xC2 || ch > 0xF4)
|
|
{ /* One-byte sequence, tail (should not occur), or invalid */
|
|
*ip += 1;
|
|
return ch;
|
|
}
|
|
else if (ch < 0xE0)
|
|
{ /* Two-byte sequence */
|
|
if (*ip + 2 > len)
|
|
return EOS;
|
|
res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
|
|
*ip += 2;
|
|
return res;
|
|
}
|
|
else if (ch < 0xF0)
|
|
{ /* Three-byte sequence */
|
|
if (*ip + 3 > len)
|
|
return EOS;
|
|
res = ((ch & 0x0F) << 12) +
|
|
((s[*ip + 1] & 0x3F) << 6) +
|
|
((s[*ip + 2] & 0x3F));
|
|
*ip += 3;
|
|
return res;
|
|
}
|
|
else
|
|
{ /* Four-byte sequence */
|
|
if (*ip + 4 > len)
|
|
return EOS;
|
|
res = ((ch & 0x07) << 18) +
|
|
((s[*ip + 1] & 0x3F) << 12) +
|
|
((s[*ip + 2] & 0x3F) << 6) +
|
|
((s[*ip + 3] & 0x3F));
|
|
*ip += 4;
|
|
return res;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Gets the next Unicode character in a UTF-16 sequence. The index will
|
|
* be advanced to the next complete character, unless the end of string
|
|
* is reached in the middle of a UTF-16 surrogate pair.
|
|
*
|
|
* @param[in] s input UTF-16 string
|
|
* @param[in] len length of the string in words
|
|
* @param[in,out] ip pointer to the index
|
|
* @return the Unicode character beginning at the index; or
|
|
* #EOS if end of input is encountered
|
|
*/
|
|
utf32_t ub_get_next_char_utf16(
|
|
const utf16_t *s,
|
|
size_t len,
|
|
size_t *ip)
|
|
{
|
|
utf16_t ch;
|
|
|
|
assert(*ip <= len);
|
|
if (*ip == len)
|
|
return EOS;
|
|
ch = s[(*ip)++];
|
|
|
|
if (ch < 0xD800 || ch > 0xDBFF)
|
|
{ /* If the character is not a high surrogate */
|
|
return ch;
|
|
}
|
|
if (*ip == len)
|
|
{ /* If the input ends here (an error) */
|
|
--(*ip);
|
|
return EOS;
|
|
}
|
|
if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
|
|
{ /* If the next character is not the low surrogate (an error) */
|
|
return ch;
|
|
}
|
|
/* Return the constructed character and advance the index again */
|
|
return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
|
|
}
|
|
|
|
/**
|
|
* Gets the next Unicode character in a UTF-32 sequence. The index will
|
|
* be advanced to the next character.
|
|
*
|
|
* @param[in] s input UTF-32 string
|
|
* @param[in] len length of the string in dwords
|
|
* @param[in,out] ip pointer to the index
|
|
* @return the Unicode character beginning at the index; or
|
|
* #EOS if end of input is encountered
|
|
*/
|
|
utf32_t ub_get_next_char_utf32(
|
|
const utf32_t *s,
|
|
size_t len,
|
|
size_t *ip)
|
|
{
|
|
assert(*ip <= len);
|
|
if (*ip == len)
|
|
return EOS;
|
|
return s[(*ip)++];
|
|
}
|