mirror of
https://gitee.com/zlgopen/awtk.git
synced 2024-11-30 02:58:26 +08:00
897 lines
32 KiB
C
897 lines
32 KiB
C
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
|
|
|
|
/*
 * Line breaking in a Unicode sequence. Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2008-2018 Wu Yongwei <wuyongwei at gmail dot com>
 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software. If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 *
 * The main reference is Unicode Standard Annex 14 (UAX #14):
 *      <URL:http://www.unicode.org/reports/tr14/>
 *
 * When this library was designed, this annex was at Revision 19, for
 * Unicode 5.0.0:
 *      <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
 *
 * This library has been updated according to Revision 41, for
 * Unicode 11.0.0:
 *      <URL:http://www.unicode.org/reports/tr14/tr14-41.html>
 *
 * The Unicode Terms of Use are available at
 *      <URL:http://www.unicode.org/copyright.html>
 */
|
|
|
|
/**
 * @file    linebreak.c
 *
 * Implementation of the line breaking algorithm as described in Unicode
 * Standard Annex 14.
 *
 * @author  Wu Yongwei
 * @author  Petr Filipsky
 */
|
|
|
|
#include <assert.h>
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
#include "linebreak.h"
|
|
#include "linebreakdef.h"
|
|
|
|
/**
 * Internal sentinel meaning "no break decision yet".  It is returned by
 * #get_lb_result_simple when the simple rules cannot decide and the
 * pair table has to be consulted.
 */
#define LINEBREAK_UNDEFINED (-1)

/**
 * Number of entries in the second-level index (#lb_prop_index) that is
 * laid over the default line breaking properties table.
 */
#define LINEBREAK_INDEX_SIZE (40)
|
|
|
|
/**
 * Actions stored in the break action pair table #baTable.  An action
 * describes what may happen between the current class (table row) and
 * the class of the next character (table column).
 */
enum BreakAction
{
    DIR_BRK = 0,    /**< Direct break opportunity */
    IND_BRK,        /**< Indirect break opportunity */
    CMI_BRK,        /**< Indirect break opportunity for combining marks */
    CMP_BRK,        /**< Prohibited break for combining marks */
    PRH_BRK         /**< Prohibited break */
};
|
|
|
|
/**
|
|
* Break action pair table. This is a direct mapping of Table 2 of
|
|
* Unicode Standard Annex 14, Revision 37, except for ZWJ (manually
|
|
* adjusted after special processing as per LB8a of Revision 41) and CB
|
|
* (manually added as per LB20).
|
|
*/
|
|
static const enum BreakAction baTable[LBP_CB][LBP_CB] = {
|
|
{ /* OP */
|
|
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
|
|
CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
|
|
{ /* CL */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* CP */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* QU */
|
|
PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
|
|
{ /* GL */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
|
|
{ /* NS */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* EX */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* SY */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, IND_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* IS */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* PR */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
|
|
{ /* PO */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* NU */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* AL */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* HL */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* ID */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* IN */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* HY */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* BA */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* BB */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
|
|
{ /* B2 */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* ZW */
|
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
|
{ /* CM */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* WJ */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
|
|
{ /* H2 */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* H3 */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* JL */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* JV */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* JT */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* RI */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
IND_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* EB */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK },
|
|
{ /* EM */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* ZWJ */
|
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
|
DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
{ /* CB */
|
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK,
|
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
|
|
CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
|
DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK },
|
|
};
|
|
|
|
/**
|
|
* Struct for the second-level index to the line breaking properties.
|
|
*/
|
|
struct LineBreakPropertiesIndex
|
|
{
|
|
utf32_t end; /**< End codepoint */
|
|
const struct LineBreakProperties *lbp; /**< Pointer to line breaking
|
|
properties */
|
|
};
|
|
|
|
/**
|
|
* Second-level index to the line breaking properties.
|
|
*/
|
|
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
|
|
{
|
|
{ 0xFFFFFFFF, lb_prop_default }
|
|
};
|
|
|
|
/**
 * Checks whether \a str ends with \a suffix, which has length
 * \a suffixLen.
 *
 * @param str        string whose ending is to be checked; \c NULL is
 *                   accepted and never matches
 * @param suffix     string to check; \c NULL never matches
 * @param suffixLen  length of \a suffix
 * @return           non-zero if true; zero otherwise
 */
static __inline int ends_with(const char *str, const char *suffix,
                              unsigned suffixLen)
{
    size_t len;     /* size_t: strlen's result must not be truncated */

    if (str == NULL || suffix == NULL)
    {
        return 0;
    }
    len = strlen(str);
    if (len < suffixLen)
    {
        return 0;   /* Too short to contain the suffix */
    }
    /* Compare only the trailing suffixLen bytes of str */
    return memcmp(str + (len - suffixLen), suffix, suffixLen) == 0;
}

/**
 * Convenience wrapper for #ends_with that computes the suffix length at
 * compile time.  \a suffix must be a string literal (or a char array),
 * because its length is taken with \c sizeof.
 */
#define ENDS_WITH(str, suffix) \
    ends_with((str), (suffix), (unsigned)(sizeof(suffix) - 1))
|
|
|
|
/**
|
|
* Initializes the second-level index to the line breaking properties.
|
|
* If it is not called, the performance of #get_char_lb_class_lang (and
|
|
* thus the main functionality) can be pretty bad, especially for big
|
|
* codepoints like those of Chinese.
|
|
*/
|
|
void init_linebreak(void)
|
|
{
|
|
size_t i;
|
|
size_t iPropDefault;
|
|
size_t len;
|
|
size_t step;
|
|
|
|
len = 0;
|
|
while (lb_prop_default[len].prop != LBP_Undefined)
|
|
++len;
|
|
step = len / LINEBREAK_INDEX_SIZE;
|
|
iPropDefault = 0;
|
|
for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
|
|
{
|
|
lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
|
|
iPropDefault += step;
|
|
lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
|
|
}
|
|
lb_prop_index[--i].end = 0xFFFFFFFF;
|
|
}
|
|
|
|
/**
|
|
* Gets the language-specific line breaking properties.
|
|
*
|
|
* @param lang language of the text
|
|
* @return pointer to the language-specific line breaking
|
|
* properties array if found; \c NULL otherwise
|
|
*/
|
|
static const struct LineBreakProperties *get_lb_prop_lang(const char *lang)
|
|
{
|
|
const struct LineBreakPropertiesLang *lbplIter;
|
|
if (lang != NULL)
|
|
{
|
|
for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
|
|
{
|
|
if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
|
|
{
|
|
return lbplIter->lbp;
|
|
}
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* Gets the line breaking class of a character from a line breaking
|
|
* properties array.
|
|
*
|
|
* @param ch character to check
|
|
* @param lbp pointer to the line breaking properties array
|
|
* @return the line breaking class if found; \c LBP_XX otherwise
|
|
*/
|
|
static enum LineBreakClass get_char_lb_class(
|
|
utf32_t ch,
|
|
const struct LineBreakProperties *lbp)
|
|
{
|
|
while (lbp->prop != LBP_Undefined && ch >= lbp->start)
|
|
{
|
|
if (ch <= lbp->end)
|
|
return lbp->prop;
|
|
++lbp;
|
|
}
|
|
return LBP_XX;
|
|
}
|
|
|
|
/**
|
|
* Gets the line breaking class of a character from the default line
|
|
* breaking properties array.
|
|
*
|
|
* @param ch character to check
|
|
* @return the line breaking class if found; \c LBP_XX otherwise
|
|
*/
|
|
static enum LineBreakClass get_char_lb_class_default(
|
|
utf32_t ch)
|
|
{
|
|
size_t i = 0;
|
|
while (ch > lb_prop_index[i].end)
|
|
++i;
|
|
assert(i < LINEBREAK_INDEX_SIZE);
|
|
return get_char_lb_class(ch, lb_prop_index[i].lbp);
|
|
}
|
|
|
|
/**
|
|
* Gets the line breaking class of a character for a specific
|
|
* language. This function will check the language-specific data first,
|
|
* and then the default data if there is no language-specific property
|
|
* available for the character.
|
|
*
|
|
* @param ch character to check
|
|
* @param lbpLang pointer to the language-specific line breaking
|
|
* properties array
|
|
* @return the line breaking class if found; \c LBP_XX
|
|
* otherwise
|
|
*/
|
|
static enum LineBreakClass get_char_lb_class_lang(
|
|
utf32_t ch,
|
|
const struct LineBreakProperties *lbpLang)
|
|
{
|
|
enum LineBreakClass lbcResult;
|
|
|
|
/* Find the language-specific line breaking class for a character */
|
|
if (lbpLang)
|
|
{
|
|
lbcResult = get_char_lb_class(ch, lbpLang);
|
|
if (lbcResult != LBP_XX)
|
|
return lbcResult;
|
|
}
|
|
|
|
/* Find the generic language-specific line breaking class, if no
|
|
* language context is provided, or language-specific data are not
|
|
* available for the specific character in the specified language */
|
|
return get_char_lb_class_default(ch);
|
|
}
|
|
|
|
/**
|
|
* Resolves the line breaking class for certain ambiguous or complicated
|
|
* characters. They are treated in a simplistic way in this
|
|
* implementation.
|
|
*
|
|
* @param lbc line breaking class to resolve
|
|
* @param lang language of the text
|
|
* @return the resolved line breaking class
|
|
*/
|
|
static enum LineBreakClass resolve_lb_class(
|
|
enum LineBreakClass lbc,
|
|
const char *lang)
|
|
{
|
|
switch (lbc)
|
|
{
|
|
case LBP_AI:
|
|
if (lang != NULL &&
|
|
(strncmp(lang, "zh", 2) == 0 || /* Chinese */
|
|
strncmp(lang, "ja", 2) == 0 || /* Japanese */
|
|
strncmp(lang, "ko", 2) == 0)) /* Korean */
|
|
{
|
|
return LBP_ID;
|
|
}
|
|
else
|
|
{
|
|
return LBP_AL;
|
|
}
|
|
case LBP_CJ:
|
|
/* `Strict' and `normal' line breaking. See
|
|
* <url:http://www.unicode.org/reports/tr14/#CJ>
|
|
* for details. */
|
|
if (ENDS_WITH(lang, "-strict"))
|
|
{
|
|
return LBP_NS;
|
|
}
|
|
else
|
|
{
|
|
return LBP_ID;
|
|
}
|
|
case LBP_SA:
|
|
case LBP_SG:
|
|
case LBP_XX:
|
|
return LBP_AL;
|
|
default:
|
|
return lbc;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Treats specially for the first character in a line.
|
|
*
|
|
* @param[in,out] lbpCtx pointer to the line breaking context
|
|
* @pre \a lbpCtx->lbcCur has a valid line break class
|
|
* @post \a lbpCtx->lbcCur has the updated line break class
|
|
*/
|
|
static void treat_first_char(
|
|
struct LineBreakContext *lbpCtx)
|
|
{
|
|
switch (lbpCtx->lbcCur)
|
|
{
|
|
case LBP_LF:
|
|
case LBP_NL:
|
|
lbpCtx->lbcCur = LBP_BK; /* Rule LB5 */
|
|
break;
|
|
case LBP_SP:
|
|
lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Tries telling the line break opportunity by simple rules.
|
|
*
|
|
* @param[in,out] lbpCtx pointer to the line breaking context
|
|
* @pre \a lbpCtx->lbcCur has the current line break
|
|
* class; and \a lbpCtx->lbcNew has the line
|
|
* break class for the next character
|
|
* @post \a lbpCtx->lbcCur has the updated line break
|
|
* class
|
|
* @return break result, one of #LINEBREAK_MUSTBREAK,
|
|
* #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
|
|
* if identified; or #LINEBREAK_UNDEFINED if
|
|
* table lookup is needed
|
|
*/
|
|
static int get_lb_result_simple(
|
|
struct LineBreakContext *lbpCtx)
|
|
{
|
|
if (lbpCtx->lbcCur == LBP_BK
|
|
|| (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
|
|
{
|
|
return LINEBREAK_MUSTBREAK; /* Rules LB4 and LB5 */
|
|
}
|
|
|
|
switch (lbpCtx->lbcNew)
|
|
{
|
|
case LBP_SP:
|
|
return LINEBREAK_NOBREAK; /* Rule LB7; no change to lbcCur */
|
|
case LBP_BK:
|
|
case LBP_LF:
|
|
case LBP_NL:
|
|
lbpCtx->lbcCur = LBP_BK; /* Mandatory break after */
|
|
return LINEBREAK_NOBREAK; /* Rule LB6 */
|
|
case LBP_CR:
|
|
lbpCtx->lbcCur = LBP_CR;
|
|
return LINEBREAK_NOBREAK; /* Rule LB6 */
|
|
default:
|
|
return LINEBREAK_UNDEFINED; /* Table lookup is needed */
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Tells the line break opportunity by table lookup.
|
|
*
|
|
* @param[in,out] lbpCtx pointer to the line breaking context
|
|
* @pre \a lbpCtx->lbcCur has the current line break
|
|
* class; \a lbpCtx->lbcLast has the line break
|
|
* class for the last character; and \a
|
|
* lbcCur->lbcNew has the line break class for
|
|
* the next character
|
|
* @post \a lbpCtx->lbcCur has the updated line break
|
|
* class
|
|
* @return break result, one of #LINEBREAK_MUSTBREAK,
|
|
* #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
|
|
*/
|
|
static int get_lb_result_lookup(
|
|
struct LineBreakContext *lbpCtx)
|
|
{
|
|
int brk = LINEBREAK_UNDEFINED;
|
|
|
|
assert(lbpCtx->lbcCur <= LBP_CB);
|
|
assert(lbpCtx->lbcNew <= LBP_CB);
|
|
switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1])
|
|
{
|
|
case DIR_BRK:
|
|
brk = LINEBREAK_ALLOWBREAK;
|
|
break;
|
|
case IND_BRK:
|
|
brk = (lbpCtx->lbcLast == LBP_SP)
|
|
? LINEBREAK_ALLOWBREAK
|
|
: LINEBREAK_NOBREAK;
|
|
break;
|
|
case CMI_BRK:
|
|
brk = LINEBREAK_ALLOWBREAK;
|
|
if (lbpCtx->lbcLast != LBP_SP)
|
|
{
|
|
brk = LINEBREAK_NOBREAK;
|
|
return brk; /* Do not update lbcCur */
|
|
}
|
|
break;
|
|
case CMP_BRK:
|
|
brk = LINEBREAK_NOBREAK;
|
|
if (lbpCtx->lbcLast != LBP_SP)
|
|
return brk; /* Do not update lbcCur */
|
|
break;
|
|
case PRH_BRK:
|
|
brk = LINEBREAK_NOBREAK;
|
|
break;
|
|
}
|
|
|
|
/* Special processing due to rule LB8a */
|
|
if (lbpCtx->fLb8aZwj)
|
|
{
|
|
brk = LINEBREAK_NOBREAK;
|
|
}
|
|
|
|
/* Special processing due to rule LB21a */
|
|
if (lbpCtx->fLb21aHebrew &&
|
|
(lbpCtx->lbcCur == LBP_HY || lbpCtx->lbcCur == LBP_BA))
|
|
{
|
|
brk = LINEBREAK_NOBREAK;
|
|
lbpCtx->fLb21aHebrew = false;
|
|
}
|
|
else
|
|
{
|
|
lbpCtx->fLb21aHebrew = (lbpCtx->lbcCur == LBP_HL);
|
|
}
|
|
|
|
/* Special processing due to rule LB30a */
|
|
if (lbpCtx->lbcCur == LBP_RI)
|
|
{
|
|
lbpCtx->cLb30aRI++;
|
|
if (lbpCtx->cLb30aRI == 2 && lbpCtx->lbcNew == LBP_RI)
|
|
{
|
|
brk = LINEBREAK_ALLOWBREAK;
|
|
lbpCtx->cLb30aRI = 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
lbpCtx->cLb30aRI = 0;
|
|
}
|
|
|
|
lbpCtx->lbcCur = lbpCtx->lbcNew;
|
|
return brk;
|
|
}
|
|
|
|
/**
|
|
* Initializes line breaking context for a given language.
|
|
*
|
|
* @param[in,out] lbpCtx pointer to the line breaking context
|
|
* @param[in] ch the first character to process
|
|
* @param[in] lang language of the input
|
|
* @post the line breaking context is initialized
|
|
*/
|
|
void lb_init_break_context(
|
|
struct LineBreakContext *lbpCtx,
|
|
utf32_t ch,
|
|
const char *lang)
|
|
{
|
|
lbpCtx->lang = lang;
|
|
lbpCtx->lbpLang = get_lb_prop_lang(lang);
|
|
lbpCtx->lbcLast = LBP_Undefined;
|
|
lbpCtx->lbcNew = LBP_Undefined;
|
|
lbpCtx->lbcCur = resolve_lb_class(
|
|
get_char_lb_class_lang(ch, lbpCtx->lbpLang),
|
|
lbpCtx->lang);
|
|
lbpCtx->fLb8aZwj =
|
|
(get_char_lb_class_lang(ch, lbpCtx->lbpLang) == LBP_ZWJ);
|
|
lbpCtx->fLb10LeadSpace =
|
|
(get_char_lb_class_lang(ch, lbpCtx->lbpLang) == LBP_SP);
|
|
lbpCtx->fLb21aHebrew = false;
|
|
lbpCtx->cLb30aRI = 0;
|
|
treat_first_char(lbpCtx);
|
|
}
|
|
|
|
/**
|
|
* Updates LineBreakingContext for the next codepoint and returns
|
|
* the detected break.
|
|
*
|
|
* @param[in,out] lbpCtx pointer to the line breaking context
|
|
* @param[in] ch Unicode codepoint
|
|
* @return break result, one of #LINEBREAK_MUSTBREAK,
|
|
* #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
|
|
* @post the line breaking context is updated
|
|
*/
|
|
int lb_process_next_char(
|
|
struct LineBreakContext *lbpCtx,
|
|
utf32_t ch )
|
|
{
|
|
int brk;
|
|
|
|
lbpCtx->lbcLast = lbpCtx->lbcNew;
|
|
lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
|
|
brk = get_lb_result_simple(lbpCtx);
|
|
switch (brk)
|
|
{
|
|
case LINEBREAK_MUSTBREAK:
|
|
lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
|
|
treat_first_char(lbpCtx);
|
|
break;
|
|
case LINEBREAK_UNDEFINED:
|
|
lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
|
|
brk = get_lb_result_lookup(lbpCtx);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
/* Special processing due to rule LB8a */
|
|
if (lbpCtx->lbcNew == LBP_ZWJ)
|
|
{
|
|
lbpCtx->fLb8aZwj = true;
|
|
}
|
|
else
|
|
{
|
|
lbpCtx->fLb8aZwj = false;
|
|
}
|
|
|
|
/* Special processing due to rule LB10 */
|
|
if (lbpCtx->fLb10LeadSpace)
|
|
{
|
|
if (lbpCtx->lbcNew == LBP_CM || lbpCtx->lbcNew == LBP_ZWJ)
|
|
brk = LINEBREAK_ALLOWBREAK;
|
|
lbpCtx->fLb10LeadSpace = false;
|
|
}
|
|
|
|
return brk;
|
|
}
|
|
|
|
/**
|
|
* Sets the line breaking information for a generic input string.
|
|
*
|
|
* Currently, this implementation has customization for the following
|
|
* ISO 639-1 language codes (for \a lang):
|
|
*
|
|
* - de (German)
|
|
* - en (English)
|
|
* - es (Spanish)
|
|
* - fr (French)
|
|
* - ja (Japanese)
|
|
* - ko (Korean)
|
|
* - ru (Russian)
|
|
* - zh (Chinese)
|
|
*
|
|
* In addition, a suffix <code>"-strict"</code> may be added to indicate
|
|
* strict (as versus normal) line-breaking behaviour. See the <a
|
|
* href="http://www.unicode.org/reports/tr14/#CJ">Conditional Japanese
|
|
* Starter section of UAX #14</a> for more details.
|
|
*
|
|
* @param[in] s input string
|
|
* @param[in] len length of the input
|
|
* @param[in] lang language of the input
|
|
* @param[out] brks pointer to the output breaking data,
|
|
* containing #LINEBREAK_MUSTBREAK,
|
|
* #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
|
|
* or #LINEBREAK_INSIDEACHAR
|
|
* @param[in] get_next_char function to get the next UTF-32 character
|
|
*/
|
|
void set_linebreaks(
|
|
const void *s,
|
|
size_t len,
|
|
const char *lang,
|
|
char *brks,
|
|
get_next_char_t get_next_char)
|
|
{
|
|
utf32_t ch;
|
|
struct LineBreakContext lbCtx;
|
|
size_t posCur = 0;
|
|
size_t posLast = 0;
|
|
|
|
--posLast; /* To be ++'d later */
|
|
ch = get_next_char(s, len, &posCur);
|
|
if (ch == EOS)
|
|
return;
|
|
lb_init_break_context(&lbCtx, ch, lang);
|
|
|
|
/* Process a line till an explicit break or end of string */
|
|
for (;;)
|
|
{
|
|
for (++posLast; posLast < posCur - 1; ++posLast)
|
|
{
|
|
brks[posLast] = LINEBREAK_INSIDEACHAR;
|
|
}
|
|
assert(posLast == posCur - 1);
|
|
ch = get_next_char(s, len, &posCur);
|
|
if (ch == EOS)
|
|
break;
|
|
brks[posLast] = lb_process_next_char(&lbCtx, ch);
|
|
}
|
|
|
|
assert(posLast == posCur - 1 && posCur <= len);
|
|
/* Break after the last character */
|
|
brks[posLast] = LINEBREAK_MUSTBREAK;
|
|
/* When the input contains incomplete sequences */
|
|
while (posCur < len)
|
|
{
|
|
brks[posCur++] = LINEBREAK_INSIDEACHAR;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Sets the line breaking information for a UTF-8 input string.
|
|
*
|
|
* @param[in] s input UTF-8 string
|
|
* @param[in] len length of the input
|
|
* @param[in] lang language of the input
|
|
* @param[out] brks pointer to the output breaking data, containing
|
|
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
|
|
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
|
|
* @see #set_linebreaks for a note about \a lang.
|
|
*/
|
|
void set_linebreaks_utf8(
|
|
const utf8_t *s,
|
|
size_t len,
|
|
const char *lang,
|
|
char *brks)
|
|
{
|
|
set_linebreaks(s, len, lang, brks,
|
|
(get_next_char_t)ub_get_next_char_utf8);
|
|
}
|
|
|
|
/**
|
|
* Sets the line breaking information for a UTF-16 input string.
|
|
*
|
|
* @param[in] s input UTF-16 string
|
|
* @param[in] len length of the input
|
|
* @param[in] lang language of the input
|
|
* @param[out] brks pointer to the output breaking data, containing
|
|
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
|
|
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
|
|
* @see #set_linebreaks for a note about \a lang.
|
|
*/
|
|
void set_linebreaks_utf16(
|
|
const utf16_t *s,
|
|
size_t len,
|
|
const char *lang,
|
|
char *brks)
|
|
{
|
|
set_linebreaks(s, len, lang, brks,
|
|
(get_next_char_t)ub_get_next_char_utf16);
|
|
}
|
|
|
|
/**
|
|
* Sets the line breaking information for a UTF-32 input string.
|
|
*
|
|
* @param[in] s input UTF-32 string
|
|
* @param[in] len length of the input
|
|
* @param[in] lang language of the input
|
|
* @param[out] brks pointer to the output breaking data, containing
|
|
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
|
|
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
|
|
* @see #set_linebreaks for a note about \a lang.
|
|
*/
|
|
void set_linebreaks_utf32(
|
|
const utf32_t *s,
|
|
size_t len,
|
|
const char *lang,
|
|
char *brks)
|
|
{
|
|
set_linebreaks(s, len, lang, brks,
|
|
(get_next_char_t)ub_get_next_char_utf32);
|
|
}
|
|
|
|
/**
|
|
* Tells whether a line break can occur between two Unicode characters.
|
|
* This is a wrapper function to expose a simple interface. Generally
|
|
* speaking, it is better to use #set_linebreaks_utf32 instead, since
|
|
* complicated cases involving combining marks, spaces, etc. cannot be
|
|
* correctly processed.
|
|
*
|
|
* @param char1 the first Unicode character
|
|
* @param char2 the second Unicode character
|
|
* @param lang language of the input
|
|
* @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
|
|
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
|
|
*/
|
|
int is_line_breakable(
|
|
utf32_t char1,
|
|
utf32_t char2,
|
|
const char *lang)
|
|
{
|
|
utf32_t s[2];
|
|
char brks[2];
|
|
s[0] = char1;
|
|
s[1] = char2;
|
|
set_linebreaks_utf32(s, 2, lang, brks);
|
|
return brks[0];
|
|
}
|