awtk/3rd/libunibreak/wordbreak.c

/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */

/*
 * Word breaking in a Unicode sequence.  Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2013-2016 Tom Hacohen <tom at stosb dot com>
 * Copyright (C) 2018 Wu Yongwei <wuyongwei at gmail dot com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software.  If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 *
 * The main reference is Unicode Standard Annex 29 (UAX #29):
 *      <URL:http://unicode.org/reports/tr29>
 *
 * When this library was designed, this annex was at Revision 17, for
 * Unicode 6.0.0:
 *      <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
 *
 * This library has been updated according to Revision 33, for
 * Unicode 11.0.0:
 *      <URL:http://www.unicode.org/reports/tr29/tr29-33.html>
 *
 * The Unicode Terms of Use are available at
 *      <URL:http://www.unicode.org/copyright.html>
 */

/**
 * @file    wordbreak.c
 *
 * Implementation of the word breaking algorithm as described in Unicode
 * Standard Annex 29.
 *
 * @author  Tom Hacohen
 */

#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "unibreakdef.h"
#include "wordbreak.h"
#include "wordbreakdata.inc"
#include "emojidef.h"

/**
 * Initializes the wordbreak internals.  It currently does nothing, but
 * it may in the future.
 */
void init_wordbreak(void)
{
}

/**
 * Gets the word breaking class of a character.
 *
 * @param ch   character to check
 * @param wbp  pointer to the wbp breaking properties array
 * @param len  size of the wbp array in number of items
 * @return     the word breaking class if found; \c WBP_Any otherwise
 */
static enum WordBreakClass get_char_wb_class(
        utf32_t ch,
        const struct WordBreakProperties *wbp,
        size_t len)
{
    int min = 0;
    int max = len - 1;
    int mid;

    do
    {
        mid = (min + max) / 2;

        if (ch < wbp[mid].start)
            max = mid - 1;
        else if (ch > wbp[mid].end)
            min = mid + 1;
        else
            return wbp[mid].prop;
    }
    while (min <= max);

    return WBP_Any;
}

/**
 * Sets the word break types to a specific value in a range.
 *
 * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
 * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
 * cells that we really don't want to break after.
 *
 * @param[in]  s             input string
 * @param[out] brks          breaks array to fill
 * @param[in]  posStart      start position
 * @param[in]  posEnd        end position (exclusive)
 * @param[in]  len           length of the string
 * @param[in]  brkType       breaks type to use
 * @param[in] get_next_char  function to get the next UTF-32 character
 */
static void set_brks_to(
        const void *s,
        char *brks,
        size_t posStart,
        size_t posEnd,
        size_t len,
        char brkType,
        get_next_char_t get_next_char)
{
    size_t posNext = posStart;
    while (posNext < posEnd)
    {
        utf32_t ch;
        ch = get_next_char(s, len, &posNext);
        (void)ch;
        assert(ch != EOS);
        for (; posStart < posNext - 1; ++posStart)
            brks[posStart] = WORDBREAK_INSIDEACHAR;
        assert(posStart == posNext - 1);

        /* Only set it if we haven't set it not to break before. */
        if (brks[posStart] != WORDBREAK_NOBREAK)
            brks[posStart] = brkType;
        posStart = posNext;
    }
}

/* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
#define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
                       (cls == WBP_LF))

/**
 * Sets the word breaking information for a generic input string.
 *
 * @param[in]  s             input string
 * @param[in]  len           length of the input
 * @param[in]  lang          language of the input (reserved for future use)
 * @param[out] brks          pointer to the output breaking data, containing
 *                           #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                           #WORDBREAK_INSIDEACHAR
 * @param[in] get_next_char  function to get the next UTF-32 character
 */
static void set_wordbreaks(
        const void *s,
        size_t len,
        const char *lang,
        char *brks,
        get_next_char_t get_next_char)
{
    /* Counter of how many time we cam across RI */
    int riCounter = 0;
    enum WordBreakClass wbcLast = WBP_Undefined;
    /* wbcSeqStart is the class that started the current sequence.
     * WBP_Undefined is a special case that means "sot".
     * This value is the class that is at the start of the current rule
     * matching sequence. For example, in case of Numeric+MidNum+Numeric
     * it'll be Numeric all the way.
     */
    enum WordBreakClass wbcSeqStart = WBP_Undefined;
    utf32_t ch;
    size_t posNext = 0;
    size_t posCur = 0;
    size_t posLast = 0;

    /* TODO: Language-specific specialization. */
    (void) lang;

    /* Init brks. */
    memset(brks, WORDBREAK_BREAK, len);

    ch = get_next_char(s, len, &posNext);

    while (ch != EOS)
    {
        enum WordBreakClass wbcCur;
        wbcCur = get_char_wb_class(ch, wb_prop_default,
                                   ARRAY_LEN(wb_prop_default));

        switch (wbcCur)
        {
        case WBP_CR:
            /* WB3b */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_LF:
            if (wbcSeqStart == WBP_CR) /* WB3 */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
                break;
            }
            /* Fall through */

        case WBP_Newline:
            /* WB3a,3b */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_ZWJ:
        case WBP_Extend:
        case WBP_Format:
            /* WB4 - If not the first char/after a newline (WB3a,3b), skip
             * this class, set it to be the same as the prev, and mark
             * brks not to break before them. */
            if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            else
            {
                /* It's surely not the first */
                brks[posCur - 1] = WORDBREAK_NOBREAK;
                /* WB3c and WB3d precede 4, so no intervening Extend
                 * chars allowed. */
                if (wbcCur != WBP_ZWJ && wbcSeqStart != WBP_ZWJ &&
                    wbcSeqStart != WBP_WSegSpace)
                {
                    /* "inherit" the previous class. */
                    wbcCur = wbcLast;
                }
            }
            break;

        case WBP_Katakana:
            if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Hebrew_Letter:
        case WBP_ALetter:
            if ((wbcSeqStart == WBP_Hebrew_Letter) &&
                    (wbcLast == WBP_Double_Quote)) /* WB7b,c */
            {
               if (wbcCur == WBP_Hebrew_Letter)
                 {
                     set_brks_to(s, brks, posLast, posCur, len,
                             WORDBREAK_NOBREAK, get_next_char);
                 }
               else
                 {
                     set_brks_to(s, brks, posLast, posCur, len,
                             WORDBREAK_BREAK, get_next_char);
                 }
            }
            else if (((wbcSeqStart == WBP_ALetter) ||
                        (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */
                    (wbcLast == WBP_Numeric) || /* WB10 */
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Single_Quote:
            if (wbcLast == WBP_Hebrew_Letter) /* WB7a */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            /* Fall through */

        case WBP_MidNumLet:
            if (((wbcLast == WBP_ALetter) ||
                        (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */
                    (wbcLast == WBP_Numeric)) /* WB11,12 */
            {
                /* Go on */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_MidLetter:
            if ((wbcLast == WBP_ALetter) ||
                    (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */
            {
                /* Go on */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_MidNum:
            if (wbcLast == WBP_Numeric) /* WB11,12 */
            {
                /* Go on */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_Numeric:
            if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
                    ((wbcLast == WBP_ALetter) ||
                     (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_ExtendNumLet:
            /* WB13a,13b */
            if ((wbcSeqStart == wbcLast) &&
                ((wbcLast == WBP_ALetter) ||
                 (wbcLast == WBP_Hebrew_Letter) ||
                 (wbcLast == WBP_Numeric) ||
                 (wbcLast == WBP_Katakana) ||
                 (wbcLast == WBP_ExtendNumLet)))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Regional_Indicator:
            /* WB15,16 */
            if ((wbcSeqStart == WBP_Regional_Indicator) &&
                ((riCounter % 2) == 1))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_NOBREAK, get_next_char);
                riCounter = 0; /* Reset the sequence */
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                riCounter = 1;
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Double_Quote:
            if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */
            {
               /* Go on */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_WSegSpace:
            if (wbcLast == WBP_WSegSpace) /* WB3d */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                posLast = posCur;
                break;
            }
            /* Fall through */

        case WBP_Any:
            /* Check for rule WB3c */
            if (wbcLast == WBP_ZWJ && ub_is_extended_pictographic(ch))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                posLast = posCur;
                break;
            }

            /* Allow breaks and reset */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        default:
            /* Error, should never get here! */
            assert(0);
            break;
        }

        wbcLast = wbcCur;
        posCur = posNext;
        ch = get_next_char(s, len, &posNext);
    }

    /* WB2 */
    set_brks_to(s, brks, posLast, posNext, len,
                WORDBREAK_BREAK, get_next_char);
}

/**
 * Sets the word breaking information for a UTF-8 input string.
 *
 * @param[in]  s     input UTF-8 string
 * @param[in]  len   length of the input
 * @param[in]  lang  language of the input (reserved for future use)
 * @param[out] brks  pointer to the output breaking data, containing
 *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                   #WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf8(
        const utf8_t *s,
        size_t len,
        const char *lang,
        char *brks)
{
    set_wordbreaks(s, len, lang, brks,
                   (get_next_char_t)ub_get_next_char_utf8);
}

/**
 * Sets the word breaking information for a UTF-16 input string.
 *
 * @param[in]  s     input UTF-16 string
 * @param[in]  len   length of the input
 * @param[in]  lang  language of the input (reserved for future use)
 * @param[out] brks  pointer to the output breaking data, containing
 *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                   #WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf16(
        const utf16_t *s,
        size_t len,
        const char *lang,
        char *brks)
{
    set_wordbreaks(s, len, lang, brks,
                   (get_next_char_t)ub_get_next_char_utf16);
}

/**
 * Sets the word breaking information for a UTF-32 input string.
 *
 * @param[in]  s     input UTF-32 string
 * @param[in]  len   length of the input
 * @param[in]  lang  language of the input (reserved for future use)
 * @param[out] brks  pointer to the output breaking data, containing
 *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                   #WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf32(
        const utf32_t *s,
        size_t len,
        const char *lang,
        char *brks)
{
    set_wordbreaks(s, len, lang, brks,
                   (get_next_char_t)ub_get_next_char_utf32);
}
add line break 2018-07-27 10:50:05 +08:00			`/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */`

			`/*`
			`* Word breaking in a Unicode sequence. Designed to be used in a`
			`* generic text renderer.`
			`*`
			`* Copyright (C) 2013-2016 Tom Hacohen <tom at stosb dot com>`
tidy 3rd packages 2018-10-13 18:40:00 +08:00			`* Copyright (C) 2018 Wu Yongwei <wuyongwei at gmail dot com>`
add line break 2018-07-27 10:50:05 +08:00			`*`
			`* This software is provided 'as-is', without any express or implied`
			`* warranty. In no event will the author be held liable for any damages`
			`* arising from the use of this software.`
			`*`
			`* Permission is granted to anyone to use this software for any purpose,`
			`* including commercial applications, and to alter it and redistribute`
			`* it freely, subject to the following restrictions:`
			`*`
			`* 1. The origin of this software must not be misrepresented; you must`
			`* not claim that you wrote the original software. If you use this`
			`* software in a product, an acknowledgement in the product`
			`* documentation would be appreciated but is not required.`
			`* 2. Altered source versions must be plainly marked as such, and must`
			`* not be misrepresented as being the original software.`
			`* 3. This notice may not be removed or altered from any source`
			`* distribution.`
			`*`
			`* The main reference is Unicode Standard Annex 29 (UAX #29):`
			`* <URL:http://unicode.org/reports/tr29>`
			`*`
			`* When this library was designed, this annex was at Revision 17, for`
			`* Unicode 6.0.0:`
			`* <URL:http://www.unicode.org/reports/tr29/tr29-17.html>`
			`*`
tidy 3rd packages 2018-10-13 18:40:00 +08:00			`* This library has been updated according to Revision 33, for`
			`* Unicode 11.0.0:`
			`* <URL:http://www.unicode.org/reports/tr29/tr29-33.html>`
add line break 2018-07-27 10:50:05 +08:00			`*`
			`* The Unicode Terms of Use are available at`
			`* <URL:http://www.unicode.org/copyright.html>`
			`*/`

			`/**`
			`* @file wordbreak.c`
			`*`
			`* Implementation of the word breaking algorithm as described in Unicode`
			`* Standard Annex 29.`
			`*`
			`* @author Tom Hacohen`
			`*/`

			`#include <assert.h>`
			`#include <stddef.h>`
			`#include <string.h>`
			`#include "unibreakdef.h"`
			`#include "wordbreak.h"`
rename files in libunibreak 2018-10-14 07:32:00 +08:00			`#include "wordbreakdata.inc"`
tidy 3rd packages 2018-10-13 18:40:00 +08:00			`#include "emojidef.h"`
add line break 2018-07-27 10:50:05 +08:00
			`/**`
			`* Initializes the wordbreak internals. It currently does nothing, but`
			`* it may in the future.`
			`*/`
			`void init_wordbreak(void)`
			`{`
			`}`

			`/**`
			`* Gets the word breaking class of a character.`
			`*`
			`* @param ch character to check`
			`* @param wbp pointer to the wbp breaking properties array`
			`* @param len size of the wbp array in number of items`
			`* @return the word breaking class if found; \c WBP_Any otherwise`
			`*/`
			`static enum WordBreakClass get_char_wb_class(`
			`utf32_t ch,`
			`const struct WordBreakProperties *wbp,`
			`size_t len)`
			`{`
			`int min = 0;`
			`int max = len - 1;`
			`int mid;`

			`do`
			`{`
			`mid = (min + max) / 2;`

			`if (ch < wbp[mid].start)`
			`max = mid - 1;`
			`else if (ch > wbp[mid].end)`
			`min = mid + 1;`
			`else`
			`return wbp[mid].prop;`
			`}`
			`while (min <= max);`

			`return WBP_Any;`
			`}`

			`/**`
			`* Sets the word break types to a specific value in a range.`
			`*`
			`* It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.`
			`* Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are`
			`* cells that we really don't want to break after.`
			`*`
			`* @param[in] s input string`
			`* @param[out] brks breaks array to fill`
			`* @param[in] posStart start position`
			`* @param[in] posEnd end position (exclusive)`
			`* @param[in] len length of the string`
			`* @param[in] brkType breaks type to use`
			`* @param[in] get_next_char function to get the next UTF-32 character`
			`*/`
			`static void set_brks_to(`
			`const void *s,`
			`char *brks,`
			`size_t posStart,`
			`size_t posEnd,`
			`size_t len,`
			`char brkType,`
			`get_next_char_t get_next_char)`
			`{`
			`size_t posNext = posStart;`
			`while (posNext < posEnd)`
			`{`
			`utf32_t ch;`
			`ch = get_next_char(s, len, &posNext);`
			`(void)ch;`
			`assert(ch != EOS);`
			`for (; posStart < posNext - 1; ++posStart)`
			`brks[posStart] = WORDBREAK_INSIDEACHAR;`
			`assert(posStart == posNext - 1);`

			`/* Only set it if we haven't set it not to break before. */`
			`if (brks[posStart] != WORDBREAK_NOBREAK)`
			`brks[posStart] = brkType;`
			`posStart = posNext;`
			`}`
			`}`

			`/* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */`
			`#define IS_WB3ab(cls) ((cls == WBP_Newline) \|\| (cls == WBP_CR) \|\| \`
			`(cls == WBP_LF))`

			`/**`
			`* Sets the word breaking information for a generic input string.`
			`*`
			`* @param[in] s input string`
			`* @param[in] len length of the input`
			`* @param[in] lang language of the input (reserved for future use)`
			`* @param[out] brks pointer to the output breaking data, containing`
			`* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or`
			`* #WORDBREAK_INSIDEACHAR`
			`* @param[in] get_next_char function to get the next UTF-32 character`
			`*/`
			`static void set_wordbreaks(`
			`const void *s,`
			`size_t len,`
			`const char *lang,`
			`char *brks,`
			`get_next_char_t get_next_char)`
			`{`
			`/* Counter of how many time we cam across RI */`
			`int riCounter = 0;`
			`enum WordBreakClass wbcLast = WBP_Undefined;`
			`/* wbcSeqStart is the class that started the current sequence.`
			`* WBP_Undefined is a special case that means "sot".`
			`* This value is the class that is at the start of the current rule`
			`* matching sequence. For example, in case of Numeric+MidNum+Numeric`
			`* it'll be Numeric all the way.`
			`*/`
			`enum WordBreakClass wbcSeqStart = WBP_Undefined;`
			`utf32_t ch;`
			`size_t posNext = 0;`
			`size_t posCur = 0;`
			`size_t posLast = 0;`

			`/* TODO: Language-specific specialization. */`
			`(void) lang;`

			`/* Init brks. */`
			`memset(brks, WORDBREAK_BREAK, len);`

			`ch = get_next_char(s, len, &posNext);`

			`while (ch != EOS)`
			`{`
			`enum WordBreakClass wbcCur;`
			`wbcCur = get_char_wb_class(ch, wb_prop_default,`
			`ARRAY_LEN(wb_prop_default));`

			`switch (wbcCur)`
			`{`
			`case WBP_CR:`
			`/* WB3b */`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`break;`

			`case WBP_LF:`
			`if (wbcSeqStart == WBP_CR) /* WB3 */`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`break;`
			`}`
tidy 3rd packages 2018-10-13 18:40:00 +08:00			`/* Fall through */`
add line break 2018-07-27 10:50:05 +08:00
			`case WBP_Newline:`
			`/* WB3a,3b */`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`break;`

			`case WBP_ZWJ:`
			`case WBP_Extend:`
			`case WBP_Format:`
			`/* WB4 - If not the first char/after a newline (WB3a,3b), skip`
			`* this class, set it to be the same as the prev, and mark`
			`* brks not to break before them. */`
			`if ((wbcSeqStart == WBP_Undefined) \|\| IS_WB3ab(wbcSeqStart))`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`}`
			`else`
			`{`
			`/* It's surely not the first */`
			`brks[posCur - 1] = WORDBREAK_NOBREAK;`
tidy 3rd packages 2018-10-13 18:40:00 +08:00			`/* WB3c and WB3d precede 4, so no intervening Extend`
			`* chars allowed. */`
			`if (wbcCur != WBP_ZWJ && wbcSeqStart != WBP_ZWJ &&`
			`wbcSeqStart != WBP_WSegSpace)`
add line break 2018-07-27 10:50:05 +08:00			`{`
			`/* "inherit" the previous class. */`
			`wbcCur = wbcLast;`
			`}`
			`}`
			`break;`

			`case WBP_Katakana:`
			`if ((wbcSeqStart == WBP_Katakana) \|\| /* WB13 */`
			`(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`}`
			`/* No rule found, reset */`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`}`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`break;`

			`case WBP_Hebrew_Letter:`
			`case WBP_ALetter:`
			`if ((wbcSeqStart == WBP_Hebrew_Letter) &&`
			`(wbcLast == WBP_Double_Quote)) /* WB7b,c */`
			`{`
			`if (wbcCur == WBP_Hebrew_Letter)`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`}`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`}`
			`}`
			`else if (((wbcSeqStart == WBP_ALetter) \|\|`
			`(wbcSeqStart == WBP_Hebrew_Letter)) \|\| /* WB5,6,7 */`
			`(wbcLast == WBP_Numeric) \|\| /* WB10 */`
			`(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`}`
			`/* No rule found, reset */`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`}`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`break;`

			`case WBP_Single_Quote:`
			`if (wbcLast == WBP_Hebrew_Letter) /* WB7a */`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`}`
tidy 3rd packages 2018-10-13 18:40:00 +08:00			`/* Fall through */`

add line break 2018-07-27 10:50:05 +08:00			`case WBP_MidNumLet:`
			`if (((wbcLast == WBP_ALetter) \|\|`
			`(wbcLast == WBP_Hebrew_Letter)) \|\| /* WB6,7 */`
			`(wbcLast == WBP_Numeric)) /* WB11,12 */`
			`{`
			`/* Go on */`
			`}`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`}`
			`break;`

			`case WBP_MidLetter:`
			`if ((wbcLast == WBP_ALetter) \|\|`
			`(wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */`
			`{`
			`/* Go on */`
			`}`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`}`
			`break;`

			`case WBP_MidNum:`
			`if (wbcLast == WBP_Numeric) /* WB11,12 */`
			`{`
			`/* Go on */`
			`}`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`}`
			`break;`

			`case WBP_Numeric:`
			`if ((wbcSeqStart == WBP_Numeric) \|\| /* WB8,11,12 */`
			`((wbcLast == WBP_ALetter) \|\|`
			`(wbcLast == WBP_Hebrew_Letter)) \|\| /* WB9 */`
			`(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`}`
			`/* No rule found, reset */`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`}`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`break;`

			`case WBP_ExtendNumLet:`
			`/* WB13a,13b */`
			`if ((wbcSeqStart == wbcLast) &&`
			`((wbcLast == WBP_ALetter) \|\|`
			`(wbcLast == WBP_Hebrew_Letter) \|\|`
			`(wbcLast == WBP_Numeric) \|\|`
			`(wbcLast == WBP_Katakana) \|\|`
			`(wbcLast == WBP_ExtendNumLet)))`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`}`
			`/* No rule found, reset */`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`}`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`break;`

			`case WBP_Regional_Indicator:`
			`/* WB15,16 */`
			`if ((wbcSeqStart == WBP_Regional_Indicator) &&`
			`((riCounter % 2) == 1))`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`riCounter = 0; /* Reset the sequence */`
			`}`
			`/* No rule found, reset */`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`riCounter = 1;`
			`}`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`break;`

			`case WBP_Double_Quote:`
			`if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */`
			`{`
			`/* Go on */`
			`}`
			`else`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`}`
			`break;`

tidy 3rd packages 2018-10-13 18:40:00 +08:00			`case WBP_WSegSpace:`
			`if (wbcLast == WBP_WSegSpace) /* WB3d */`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`posLast = posCur;`
			`break;`
			`}`
			`/* Fall through */`

add line break 2018-07-27 10:50:05 +08:00			`case WBP_Any:`
tidy 3rd packages 2018-10-13 18:40:00 +08:00			`/* Check for rule WB3c */`
			`if (wbcLast == WBP_ZWJ && ub_is_extended_pictographic(ch))`
			`{`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_NOBREAK, get_next_char);`
			`posLast = posCur;`
			`break;`
			`}`

add line break 2018-07-27 10:50:05 +08:00			`/* Allow breaks and reset */`
			`set_brks_to(s, brks, posLast, posCur, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`wbcSeqStart = wbcCur;`
			`posLast = posCur;`
			`break;`

			`default:`
			`/* Error, should never get here! */`
			`assert(0);`
			`break;`
			`}`

			`wbcLast = wbcCur;`
			`posCur = posNext;`
			`ch = get_next_char(s, len, &posNext);`
			`}`

			`/* WB2 */`
			`set_brks_to(s, brks, posLast, posNext, len,`
			`WORDBREAK_BREAK, get_next_char);`
			`}`

			`/**`
			`* Sets the word breaking information for a UTF-8 input string.`
			`*`
			`* @param[in] s input UTF-8 string`
			`* @param[in] len length of the input`
			`* @param[in] lang language of the input (reserved for future use)`
			`* @param[out] brks pointer to the output breaking data, containing`
			`* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or`
			`* #WORDBREAK_INSIDEACHAR`
			`*/`
			`void set_wordbreaks_utf8(`
			`const utf8_t *s,`
			`size_t len,`
			`const char *lang,`
			`char *brks)`
			`{`
			`set_wordbreaks(s, len, lang, brks,`
			`(get_next_char_t)ub_get_next_char_utf8);`
			`}`

			`/**`
			`* Sets the word breaking information for a UTF-16 input string.`
			`*`
			`* @param[in] s input UTF-16 string`
			`* @param[in] len length of the input`
			`* @param[in] lang language of the input (reserved for future use)`
			`* @param[out] brks pointer to the output breaking data, containing`
			`* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or`
			`* #WORDBREAK_INSIDEACHAR`
			`*/`
			`void set_wordbreaks_utf16(`
			`const utf16_t *s,`
			`size_t len,`
			`const char *lang,`
			`char *brks)`
			`{`
			`set_wordbreaks(s, len, lang, brks,`
			`(get_next_char_t)ub_get_next_char_utf16);`
			`}`

			`/**`
			`* Sets the word breaking information for a UTF-32 input string.`
			`*`
			`* @param[in] s input UTF-32 string`
			`* @param[in] len length of the input`
			`* @param[in] lang language of the input (reserved for future use)`
			`* @param[out] brks pointer to the output breaking data, containing`
			`* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or`
			`* #WORDBREAK_INSIDEACHAR`
			`*/`
			`void set_wordbreaks_utf32(`
			`const utf32_t *s,`
			`size_t len,`
			`const char *lang,`
			`char *brks)`
			`{`
			`set_wordbreaks(s, len, lang, brks,`
			`(get_next_char_t)ub_get_next_char_utf32);`
			`}`