diff --git a/CHANGES.md b/CHANGES.md index 95e25a6..b9b6bd2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,9 +9,9 @@ friso-1.6.2: 3. friso deb | rmp支持: Debian & Ubuntu: - sudo apt-get install libfriso0 libfriso-dev + sudo apt-get install libfriso0 libfriso-dev CentOS & Fedora: - sudo yum install libfriso libfriso-devel + sudo yum install libfriso libfriso-devel 4. 中文词性标注。 @@ -26,41 +26,41 @@ friso-1.6.2: friso-1.6.1: -1. friso.ini中friso.lex_dir增加相对friso.ini的路径支持 -done +1. friso.ini中friso.lex_dir增加相对friso.ini的路径支持 -done -2. 修复两处内存泄漏bug. -done +2. 修复两处内存泄漏bug. -done -3. 改善中英混合词的识别, 可以识别更多情况, 例如:高3 -done +3. 改善中英混合词的识别, 可以识别更多情况, 例如:高3 -done -4. 词库优化, 加入了一些新词条. -done +4. 词库优化, 加入了一些新词条. -done -5. 修复friso_dic_add & array_list_insert的两处代码bug -done +5. 修复friso_dic_add & array_list_insert的两处代码bug -done -6. 增加检测模式切分, 只返回词库中有的词条 -done +6. 增加检测模式切分, 只返回词库中有的词条 -done -7. 集成了php扩展绑定,完美支持PHP分词 -done +7. 集成了php扩展绑定,完美支持PHP分词 -done friso-1.6.0: 1. friso_stirng.c#utf8_decimal_string初始化bytes = 0, - 去除WinNT的Run-Time Check Failed. -done + 去除WinNT的Run-Time Check Failed. -done -2. 复杂英文和数字组合的二次切分. 例如: QQ2013会被切分成: qq2013, qq, 2013. -done +2. 复杂英文和数字组合的二次切分. 例如: QQ2013会被切分成: qq2013, qq, 2013. -done -3. GBK编码支持. -done +3. GBK编码支持. -done -4. 增加了friso.ini中自定义保留标点, 去除了默认对"^,/,-,'"等标点的保留. -done +4. 增加了friso.ini中自定义保留标点, 去除了默认对"^,/,-,'"等标点的保留. -done -5. 使用掩码操作控制变量来代替了原来的多个控制变量. -done +5. 使用掩码操作控制变量来代替了原来的多个控制变量. -done -6. 切分结果friso_hits_t中增加了对词条类别和词条长度的返回,纠正了offset的误差。 -done +6. 切分结果friso_hits_t中增加了对词条类别和词条长度的返回,纠正了offset的误差。 -done 7. 做了一些优化,例如:同义词的追加(普通/sphinx定义)复杂的判断逻辑, - 改为了使用掩码状态控制,不仅减少了代码量还提高了执行效率。 -done + 改为了使用掩码状态控制,不仅减少了代码量还提高了执行效率。 -done -8. 更多的返回信息,增加了对切分词条的类别,长度,真实长度,词性(待实现)等信息的返回。 -done +8. 更多的返回信息,增加了对切分词条的类别,长度,真实长度,词性(待实现)等信息的返回。 -done 9. 增加了安装中头文件的自动拷贝(usr/include/friso),可以通过include 来引用头文件。 @@ -83,18 +83,18 @@ friso-1.4: 1. 小数+单位无法识别的情况.更改friso_string#utf8_numeric_string()函数. 2. 更改中英混合词的识别(目前可以识别中英任何一种组合). - 英中: 例如: b超, - 英中英: a美1, - 英中英中: a哆啦a梦, - 中英: 卡拉ok, - 中英中: 哆啦a梦, - 中英中英: 中文a美a + 英中: 例如: b超, + 英中英: a美1, + 英中英中: a哆啦a梦, + 中英: 卡拉ok, + 中英中: 哆啦a梦, + 中英中英: 中文a美a 3. 更改了单位组合, 现在可以组合的单位不局限是中文, 例如: ℃,℉ 4. 对于未识别的字符, 给定一个开关选项来决定保留还是过滤. -5. 英文同义词的追加(增加了lex-en.lex词库) +5. 英文同义词的追加(增加了lex-en.lex词库) friso-1.3: @@ -103,7 +103,7 @@ friso-1.3: 2. 部分简易函数使用了宏定义来代替, 减少函数的调用. 3. 保留了英文全半角和中文标点符号的切分.(可以通过过滤停止词来过滤不需要的标点) - 停止词词库中已经加入了全部的保留的标点, 也就是默认全部过滤了. + 停止词词库中已经加入了全部的保留的标点, 也就是默认全部过滤了. 4. 修复friso_string#utf8_en_punctuation()函数一处bug. diff --git a/README.md b/README.md index 4d4bc4c..c529a52 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ Friso是使用c语言开发的一款开源的高性能中文分词器,使用 2。三种切分模式: - (1). 简易模式:FMM算法,适合速度要求场合。 - (2). 复杂模式- MMSEG四种过滤算法,具有较高的岐义去除,分词准确率达到了98.41%。 - (3). (!New)检测模式:只返回词库中已有的词条,很适合某些应用场合。(1.6.1版本开始) + (1). 简易模式:FMM算法,适合速度要求场合。 + (2). 复杂模式- MMSEG四种过滤算法,具有较高的岐义去除,分词准确率达到了98.41%。 + (3). (!New)检测模式:只返回词库中已有的词条,很适合某些应用场合。(1.6.1版本开始) 请参考本算法的原作:http://technology.chtsai.org/mmseg/。 diff --git a/binding/php/config.w32 b/binding/php/config.w32 index 9ade3eb..d47e57a 100644 --- a/binding/php/config.w32 +++ b/binding/php/config.w32 @@ -8,6 +8,6 @@ // ARG_ENABLE("friso", "enable friso support", "no"); if (PHP_FRISO != "no") { - EXTENSION("friso", "friso.c"); + EXTENSION("friso", "friso.c"); } diff --git a/binding/php/demo/friso.fun.php b/binding/php/demo/friso.fun.php index fec17e6..ff074b0 100644 --- a/binding/php/demo/friso.fun.php +++ b/binding/php/demo/friso.fun.php @@ -20,53 +20,53 @@ echo "friso_version(): " , friso_version(), ", friso_charset(): ", friso_charset echo "分词函数:
"; if ( friso_charset() == 'UTF-8' ) { - $_str = "歧义和同义词:研究生命起源,混合词: 做B超检查身体,x射线本质是什么,今天去奇都ktv唱卡拉ok去,哆啦a梦是一个动漫中的主角,单位和全角: 2009年8月6日开始大学之旅,岳阳今天的气温为38.6℃, 也就是101.48℉, 英文数字: bug report chenxin619315@gmail.com or visit http://code.google.com/p/jcseg, we all admire the hacker spirit!特殊数字: ① ⑩ ⑽ ㈩."; - echo "

friso_split(\"" . $_str . "\"):

"; - - //API: - //rb_split(string, Array, [long]) - //1.string: 要被切分的字符串。 - //2.Array: 配置选项,使用NULL来选择默认的配置(friso.ini中的配置)。 - //3.long: 可选参数,自定义切分返回选项,查看下面的$_rargs - - //1.完整的配置: - //array('max_len'=>5, 'r_name'=>0, 'mix_len'=>2, 'lna_len'=>1, 'add_syn'=>1, - // 'clr_stw'=>1, 'keep_urec'=>0, 'spx_out'=>0, 'en_sseg'=> 1, 'st_minl'=>2, 'kpuncs'=>'.+#', 'mode'=>FRISO_COMPLEX); - //1.在不了解friso内核的情况下, 请不要随便更改nthreshold - //2.使用NULL来使用php.ini中指定的friso.ini文件中的配置 - - //2.返回选项: - //词条: FRISO_RET_WORD, 类别:FRISO_RET_TYPE, 长度:FRISO_RET_LENGTH, 真实长度:FRISO_RET_RLEN, 偏移量:FRISO_RET_OFF - //词性:FRISO_RET_POS(待实现) - $_rargs = FRISO_RET_TYPE | FRISO_RET_LEN | FRISO_RET_RLEN | FRISO_RET_OFF | FRISO_RET_POS; - //$_rargs = 0; - - //3.切分类别: - //CJK词条:FRISO_TYP_CJK, 英中混合词(b超):FRISO_TYP_ECM,中英混合词(卡拉ok):FRISO_TYP_CEM, - //英文标点混合词(c++):FRISO_TYP_EPUN,标点:FRISO_TYP_PUN,未知类别:FRISO_TYP_UNK,其他类别(同义词):FRISO_TYP_OTR - $_result = friso_split($_str, array('mode'=>FRISO_COMPLEX), $_rargs); - unset($_str); - foreach ( $_result as $_val ) - { - $_str = $_val['word']; - if ( $_rargs != 0 ) { - $_str .= '['; - if ( ($_rargs & FRISO_RET_TYPE) != 0 ) - $_str .= ', type: '.$_val['type']; //获取词条类别 - if ( ($_rargs & FRISO_RET_LEN) != 0 ) - $_str .= ', len: ' . $_val['len']; //词条长度 - if ( ($_rargs & FRISO_RET_RLEN) != 0 ) - $_str .= ', rlen: ' . $_val['rlen']; //词条真实长度 - if ( ($_rargs & FRISO_RET_OFF) != 0 ) - $_str .= ', off: ' . $_val['off']; //词条偏移量 - if ( ($_rargs & FRISO_RET_POS) != 0 ) - $_str .= ', pos: ' . $_val['pos']; //词条词性 - $_str .= ']'; - } + $_str = "歧义和同义词:研究生命起源,混合词: 做B超检查身体,x射线本质是什么,今天去奇都ktv唱卡拉ok去,哆啦a梦是一个动漫中的主角,单位和全角: 2009年8月6日开始大学之旅,岳阳今天的气温为38.6℃, 也就是101.48℉, 英文数字: bug report chenxin619315@gmail.com or visit http://code.google.com/p/jcseg, we all admire the hacker spirit!特殊数字: ① ⑩ ⑽ ㈩."; + echo "

friso_split(\"" . $_str . "\"):

"; + + //API: + //rb_split(string, Array, [long]) + //1.string: 要被切分的字符串。 + //2.Array: 配置选项,使用NULL来选择默认的配置(friso.ini中的配置)。 + //3.long: 可选参数,自定义切分返回选项,查看下面的$_rargs + + //1.完整的配置: + //array('max_len'=>5, 'r_name'=>0, 'mix_len'=>2, 'lna_len'=>1, 'add_syn'=>1, + // 'clr_stw'=>1, 'keep_urec'=>0, 'spx_out'=>0, 'en_sseg'=> 1, 'st_minl'=>2, 'kpuncs'=>'.+#', 'mode'=>FRISO_COMPLEX); + //1.在不了解friso内核的情况下, 请不要随便更改nthreshold + //2.使用NULL来使用php.ini中指定的friso.ini文件中的配置 + + //2.返回选项: + //词条: FRISO_RET_WORD, 类别:FRISO_RET_TYPE, 长度:FRISO_RET_LENGTH, 真实长度:FRISO_RET_RLEN, 偏移量:FRISO_RET_OFF + //词性:FRISO_RET_POS(待实现) + $_rargs = FRISO_RET_TYPE | FRISO_RET_LEN | FRISO_RET_RLEN | FRISO_RET_OFF | FRISO_RET_POS; + //$_rargs = 0; + + //3.切分类别: + //CJK词条:FRISO_TYP_CJK, 英中混合词(b超):FRISO_TYP_ECM,中英混合词(卡拉ok):FRISO_TYP_CEM, + //英文标点混合词(c++):FRISO_TYP_EPUN,标点:FRISO_TYP_PUN,未知类别:FRISO_TYP_UNK,其他类别(同义词):FRISO_TYP_OTR + $_result = friso_split($_str, array('mode'=>FRISO_COMPLEX), $_rargs); + unset($_str); + foreach ( $_result as $_val ) + { + $_str = $_val['word']; + if ( $_rargs != 0 ) { + $_str .= '['; + if ( ($_rargs & FRISO_RET_TYPE) != 0 ) + $_str .= ', type: '.$_val['type']; //获取词条类别 + if ( ($_rargs & FRISO_RET_LEN) != 0 ) + $_str .= ', len: ' . $_val['len']; //词条长度 + if ( ($_rargs & FRISO_RET_RLEN) != 0 ) + $_str .= ', rlen: ' . $_val['rlen']; //词条真实长度 + if ( ($_rargs & FRISO_RET_OFF) != 0 ) + $_str .= ', off: ' . $_val['off']; //词条偏移量 + if ( ($_rargs & FRISO_RET_POS) != 0 ) + $_str .= ', pos: ' . $_val['pos']; //词条词性 + $_str .= ']'; + } - $_str .= '/   '; - echo $_str; - } + $_str .= '/   '; + echo $_str; + } } else echo "set charset to UTF-8 to test function friso_split."; ?> diff --git a/binding/php/demo/gbk.demo.php b/binding/php/demo/gbk.demo.php index e109f0f..06d77b4 100644 --- a/binding/php/demo/gbk.demo.php +++ b/binding/php/demo/gbk.demo.php @@ -4,10 +4,10 @@ ini_set('magic_quotes_gpc', 0); //check the charset if ( friso_charset() != "GBK" ) { - $_str = "Error: GBK charset required.
"; - $_str .= "1. Modified friso.charset = 1 in your friso.ini .
"; - $_str .= "2. Modified friso.lex_dir = GBK lexicon abusolute path to load your GBK lexicon.
"; - exit($_str); + $_str = "Error: GBK charset required.
"; + $_str .= "1. Modified friso.charset = 1 in your friso.ini .
"; + $_str .= "2. Modified friso.lex_dir = GBK lexicon abusolute path to load your GBK lexicon.
"; + exit($_str); } $text = ''; @@ -15,139 +15,139 @@ $_timer = 0; $_act = ''; $_cfg = array('mode' => FRISO_COMPLEX); if ( isset($_POST['_act']) && ($_act = $_POST['_act']) == 'split' ) { - $text = &$_POST['text']; - $_cfg = &$_POST['config']; - if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0; - if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0; - if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0; - if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0; - if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0; - - $s_time = timer(); - $_ret = friso_split($text, $_cfg); - $_timer = timer() - $s_time; + $text = &$_POST['text']; + $_cfg = &$_POST['config']; + if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0; + if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0; + if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0; + if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0; + if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0; + + $s_time = timer(); + $_ret = friso_split($text, $_cfg); + $_timer = timer() - $s_time; } function timer() { - list($msec, $sec) = explode(' ', microtime()); - return ((float)$msec + (float)$sec); + list($msec, $sec) = explode(' ', microtime()); + return ((float)$msec + (float)$sec); } ?> + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - GBK - robbeִʲԳ - - + GBK - robbeִʲԳ + + -

-
-
ִã
-
-
- - -
-
- - -
-
- - value="1" /> -
-
- - -
-
- - -
-
- - value="1" /> -
-
- - value="1" /> -
-
- - value="1" /> -
-
- - value="1" /> -
-
- - />ģʽ - />ģʽ -
-
- -
ִݣ
-
- - robbeִ -
+
+
+
ִã
+
+
+ + +
+
+ + +
+
+ + value="1" /> +
+
+ + +
+
+ + +
+
+ + value="1" /> +
+
+ + value="1" /> +
+
+ + value="1" /> +
+
+ + value="1" /> +
+
+ + />ģʽ + />ģʽ +
+
+ +
ִݣ
+
+ + robbeִ +
- -
ִʽ
-
- - -
+ +
ִʽ
+
+ + +
diff --git a/binding/php/demo/utf8.demo.php b/binding/php/demo/utf8.demo.php index 8a49cb7..3e31397 100644 --- a/binding/php/demo/utf8.demo.php +++ b/binding/php/demo/utf8.demo.php @@ -4,10 +4,10 @@ ini_set('magic_quotes_gpc', 0); //charset check. if ( friso_charset() != "UTF-8" ) { - $_str = "Error: UTF-8 charset required.
"; - $_str .= "1. Modified friso.charset = 0 in your friso.ini .
"; - $_str .= "2. Modified friso.lex_dir = UTF-8 lexicon abusolute path to load your UTF-8 lexicon.
"; - exit($_str); + $_str = "Error: UTF-8 charset required.
"; + $_str .= "1. Modified friso.charset = 0 in your friso.ini .
"; + $_str .= "2. Modified friso.lex_dir = UTF-8 lexicon abusolute path to load your UTF-8 lexicon.
"; + exit($_str); } $text = ''; @@ -15,139 +15,139 @@ $_timer = 0; $_act = ''; $_cfg = array('mode' => FRISO_COMPLEX); if ( isset($_POST['_act']) && ($_act = $_POST['_act']) == 'split' ) { - $text = &$_POST['text']; - $_cfg = &$_POST['config']; - if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0; - if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0; - if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0; - if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0; - if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0; - - $s_time = timer(); - $_ret = friso_split($text, $_cfg); - $_timer = timer() - $s_time; + $text = &$_POST['text']; + $_cfg = &$_POST['config']; + if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0; + if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0; + if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0; + if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0; + if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0; + + $s_time = timer(); + $_ret = friso_split($text, $_cfg); + $_timer = timer() - $s_time; } function timer() { - list($msec, $sec) = explode(' ', microtime()); - return ((float)$msec + (float)$sec); + list($msec, $sec) = explode(' ', microtime()); + return ((float)$msec + (float)$sec); } ?> + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - UTF8 - robbe分词测试程序 - - + UTF8 - robbe分词测试程序 + + -
-
-
分词配置:
-
-
- - -
-
- - -
-
- - value="1" /> -
-
- - -
-
- - -
-
- - value="1" /> -
-
- - value="1" /> -
-
- - value="1" /> -
-
- - value="1" /> -
-
- - />简易模式 - />复杂模式 -
-
- -
分词内容:
-
- - robbe分词 -
+
+
+
分词配置:
+
+
+ + +
+
+ + +
+
+ + value="1" /> +
+
+ + +
+
+ + +
+
+ + value="1" /> +
+
+ + value="1" /> +
+
+ + value="1" /> +
+
+ + value="1" /> +
+
+ + />简易模式 + />复杂模式 +
+
+ +
分词内容:
+
+ + robbe分词 +
- -
分词结果:
-
- - -
+ +
分词结果:
+
+ + +
diff --git a/binding/php/friso.c b/binding/php/friso.c index ce4cfaa..a062d75 100644 --- a/binding/php/friso.c +++ b/binding/php/friso.c @@ -9,9 +9,9 @@ #include "php_friso.h" #ifdef FRISO_WINNT -# define friso_default_conf_file "c:/windows/friso.ini" +# define friso_default_conf_file "c:/windows/friso.ini" #else -# define friso_default_conf_file "/etc/friso/friso.ini" +# define friso_default_conf_file "/etc/friso/friso.ini" #endif /* If you declare any globals in php_friso.h uncomment this: @@ -27,15 +27,15 @@ static int le_friso = 1; * Every user visible function must have an entry in friso_functions[]. */ const zend_function_entry friso_functions[] = { - PHP_FE(friso_split, NULL) - PHP_FE(friso_version, NULL) - PHP_FE(friso_charset, NULL) - PHP_FE(friso_dic_exist, NULL) - PHP_FE(friso_dic_get, NULL) - PHP_FE(friso_utf8_bytes, NULL) - PHP_FE(friso_utf8_ucode, NULL) - PHP_FE(friso_ucode_utf8, NULL) - {NULL, NULL, NULL} /* Must be the last line in friso_functions[] */ + PHP_FE(friso_split, NULL) + PHP_FE(friso_version, NULL) + PHP_FE(friso_charset, NULL) + PHP_FE(friso_dic_exist, NULL) + PHP_FE(friso_dic_get, NULL) + PHP_FE(friso_utf8_bytes, NULL) + PHP_FE(friso_utf8_ucode, NULL) + PHP_FE(friso_ucode_utf8, NULL) + {NULL, NULL, NULL} /* Must be the last line in friso_functions[] */ }; /* }}} */ @@ -43,19 +43,19 @@ const zend_function_entry friso_functions[] = { */ zend_module_entry friso_module_entry = { #if ZEND_MODULE_API_NO >= 20010901 - STANDARD_MODULE_HEADER, + STANDARD_MODULE_HEADER, #endif - "friso", - friso_functions, - PHP_MINIT(friso), - PHP_MSHUTDOWN(friso), - PHP_RINIT(friso), /* Replace with NULL if there's nothing to do at request start */ - PHP_RSHUTDOWN(friso), /* Replace with NULL if there's nothing to do at request end */ - PHP_MINFO(friso), + "friso", + friso_functions, + PHP_MINIT(friso), + PHP_MSHUTDOWN(friso), + PHP_RINIT(friso), /* Replace with NULL if there's nothing to do at request start */ + PHP_RSHUTDOWN(friso), /* Replace with NULL if there's nothing to do at request end */ + PHP_MINFO(friso), #if ZEND_MODULE_API_NO >= 20010901 - "0.1", /* Replace with version number for your extension */ + "0.1", /* Replace with version number for your extension */ #endif - STANDARD_MODULE_PROPERTIES + STANDARD_MODULE_PROPERTIES }; /* }}} */ @@ -73,72 +73,72 @@ PHP_INI_END() /* {{{ php_robbe_globals_construct */ static void php_friso_globals_construct(zend_friso_globals *friso_globals) { - friso_globals->friso = friso_new(); - friso_globals->config = friso_new_config(); - friso_init_from_ifile(friso_globals->friso, - friso_globals->config, INI_STR("friso.ini_file")); + friso_globals->friso = friso_new(); + friso_globals->config = friso_new_config(); + friso_init_from_ifile(friso_globals->friso, + friso_globals->config, INI_STR("friso.ini_file")); } /* }}} */ /* {{{ php_robbe_globals_destruct*/ static void php_friso_globals_destruct(zend_friso_globals *friso_globals) { - /* - * cause friso_free will free the dictionary - * so here we don't have to call the friso_dic_free to free the - * the robbe_dic global variable. - */ - //friso_dic_free( friso_globals->friso_dic ); - //friso_globals->friso_dic = NULL; - friso_free_config( friso_globals->config ); - friso_free( friso_globals->friso ); + /* + * cause friso_free will free the dictionary + * so here we don't have to call the friso_dic_free to free the + * the robbe_dic global variable. + */ + //friso_dic_free( friso_globals->friso_dic ); + //friso_globals->friso_dic = NULL; + friso_free_config( friso_globals->config ); + friso_free( friso_globals->friso ); } /* }}} */ -#define FRISO_RET_WORD (1 << 0) -#define FRISO_RET_TYPE (1 << 1) -#define FRISO_RET_OFF (1 << 2) -#define FRISO_RET_LEN (1 << 3) -#define FRISO_RET_RLEN (1 << 4) -#define FRISO_RET_POS (1 << 5) +#define FRISO_RET_WORD (1 << 0) +#define FRISO_RET_TYPE (1 << 1) +#define FRISO_RET_OFF (1 << 2) +#define FRISO_RET_LEN (1 << 3) +#define FRISO_RET_RLEN (1 << 4) +#define FRISO_RET_POS (1 << 5) /* {{{ PHP_MINIT_FUNCTION */ PHP_MINIT_FUNCTION(friso) { - /* - * register some contants that robbe may use - * at its following work. - * the constant is case sensitive and persitent. - */ - REGISTER_LONG_CONSTANT("FRISO_SIMPLE", __FRISO_SIMPLE_MODE__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_COMPLEX", __FRISO_COMPLEX_MODE__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_DETECT", __FRISO_DETECT_MODE__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_LEX_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_LEX_STOP", __LEX_STOPWORDS__, CONST_CS | CONST_PERSISTENT); + /* + * register some contants that robbe may use + * at its following work. + * the constant is case sensitive and persitent. + */ + REGISTER_LONG_CONSTANT("FRISO_SIMPLE", __FRISO_SIMPLE_MODE__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_COMPLEX", __FRISO_COMPLEX_MODE__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_DETECT", __FRISO_DETECT_MODE__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_LEX_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_LEX_STOP", __LEX_STOPWORDS__, CONST_CS | CONST_PERSISTENT); - //return parts for rb_split. - REGISTER_LONG_CONSTANT("FRISO_RET_WORD", FRISO_RET_WORD, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_RET_TYPE", FRISO_RET_TYPE, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_RET_OFF", FRISO_RET_OFF, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_RET_LEN", FRISO_RET_LEN, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_RET_RLEN", FRISO_RET_RLEN, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_RET_POS", FRISO_RET_POS, CONST_CS | CONST_PERSISTENT); + //return parts for rb_split. + REGISTER_LONG_CONSTANT("FRISO_RET_WORD", FRISO_RET_WORD, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_RET_TYPE", FRISO_RET_TYPE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_RET_OFF", FRISO_RET_OFF, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_RET_LEN", FRISO_RET_LEN, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_RET_RLEN", FRISO_RET_RLEN, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_RET_POS", FRISO_RET_POS, CONST_CS | CONST_PERSISTENT); - //lex type constants. - REGISTER_LONG_CONSTANT("FRISO_TYP_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_TYP_ECM", __LEX_ECM_WORDS__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_TYP_CEM", __LEX_CEM_WORDS__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_TYP_EPUN", __LEX_ENPUN_WORDS__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_TYP_PUN", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_TYP_UNK", __LEX_UNKNOW_WORDS__, CONST_CS | CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("FRISO_TYP_OTR", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT); + //lex type constants. + REGISTER_LONG_CONSTANT("FRISO_TYP_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_TYP_ECM", __LEX_ECM_WORDS__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_TYP_CEM", __LEX_CEM_WORDS__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_TYP_EPUN", __LEX_ENPUN_WORDS__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_TYP_PUN", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_TYP_UNK", __LEX_UNKNOW_WORDS__, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("FRISO_TYP_OTR", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT); - REGISTER_INI_ENTRIES(); - /*initialize the globals variables.*/ - php_friso_globals_construct( &friso_globals ); + REGISTER_INI_ENTRIES(); + /*initialize the globals variables.*/ + php_friso_globals_construct( &friso_globals ); - return SUCCESS; + return SUCCESS; } /* }}} */ @@ -146,11 +146,11 @@ PHP_MINIT_FUNCTION(friso) */ PHP_MSHUTDOWN_FUNCTION(friso) { - UNREGISTER_INI_ENTRIES(); - /*destruct the globals variables*/ - php_friso_globals_destruct( &friso_globals ); - - return SUCCESS; + UNREGISTER_INI_ENTRIES(); + /*destruct the globals variables*/ + php_friso_globals_destruct( &friso_globals ); + + return SUCCESS; } /* }}} */ @@ -159,7 +159,7 @@ PHP_MSHUTDOWN_FUNCTION(friso) */ PHP_RINIT_FUNCTION(friso) { - return SUCCESS; + return SUCCESS; } /* }}} */ @@ -168,22 +168,22 @@ PHP_RINIT_FUNCTION(friso) */ PHP_RSHUTDOWN_FUNCTION(friso) { - return SUCCESS; + return SUCCESS; } /* }}} */ /* {{{ PHP_MINFO_FUNCTION */ PHP_MINFO_FUNCTION(friso) -{ - php_info_print_table_start(); - php_info_print_table_row(2, "Friso Support", "enabled"); - php_info_print_table_row(2, "Version", FRISO_VERSION); - php_info_print_table_row(2, "Bug Report", "chenxin619315@gmail.com"); - php_info_print_table_row(2, "Home page", "http://code.google.com/p/friso"); - php_info_print_table_end(); +{ + php_info_print_table_start(); + php_info_print_table_row(2, "Friso Support", "enabled"); + php_info_print_table_row(2, "Version", FRISO_VERSION); + php_info_print_table_row(2, "Bug Report", "chenxin619315@gmail.com"); + php_info_print_table_row(2, "Home page", "http://code.google.com/p/friso"); + php_info_print_table_end(); - DISPLAY_INI_ENTRIES(); + DISPLAY_INI_ENTRIES(); } /* }}} */ @@ -192,130 +192,130 @@ PHP_MINFO_FUNCTION(friso) Return a array contains all the split result with a specified mode */ PHP_FUNCTION(friso_split) { - char *_str = NULL, *_key; - int slen, idx, klen, rargs = 0; - int arg_count; + char *_str = NULL, *_key; + int slen, idx, klen, rargs = 0; + int arg_count; - zval *ret, *cfg, **data; - //used for multiple item return. - zval *item; + zval *ret, *cfg, **data; + //used for multiple item return. + zval *item; - HashTable *cfgArr; - HashPosition pointer; + HashTable *cfgArr; + HashPosition pointer; - friso_task_t task; - friso_config_t config = NULL, nconfig = NULL; + friso_task_t task; + friso_config_t config = NULL, nconfig = NULL; - //get the arugments from the php layer. - arg_count = ZEND_NUM_ARGS(); - switch ( arg_count ) - { - case 2: - if ( zend_parse_parameters(arg_count TSRMLS_CC, "sz", - &_str, &slen, &cfg) == FAILURE ) return; - break; - case 3: - if (zend_parse_parameters( arg_count TSRMLS_CC, "szl", - &_str, &slen, &cfg, &rargs) == FAILURE ) return; - break; - default: - WRONG_PARAM_COUNT; - } + //get the arugments from the php layer. + arg_count = ZEND_NUM_ARGS(); + switch ( arg_count ) + { + case 2: + if ( zend_parse_parameters(arg_count TSRMLS_CC, "sz", + &_str, &slen, &cfg) == FAILURE ) return; + break; + case 3: + if (zend_parse_parameters( arg_count TSRMLS_CC, "szl", + &_str, &slen, &cfg, &rargs) == FAILURE ) return; + break; + default: + WRONG_PARAM_COUNT; + } - //make sure the RB_RET_WORD will be returned. - //rargs |= FRISO_RET_WORD; + //make sure the RB_RET_WORD will be returned. + //rargs |= FRISO_RET_WORD; - //check and initialize the friso. - if ( Z_TYPE_P(cfg) != IS_NULL ) - { - nconfig = friso_new_config(); - memcpy(nconfig, friso_globals.config, sizeof(friso_config_entry)); + //check and initialize the friso. + if ( Z_TYPE_P(cfg) != IS_NULL ) + { + nconfig = friso_new_config(); + memcpy(nconfig, friso_globals.config, sizeof(friso_config_entry)); - //check the new setting. - cfgArr = Z_ARRVAL_P(cfg); - //zend_printf("array length: %d", zend_hash_num_elements(cfgArr)); - for ( zend_hash_internal_pointer_reset_ex(cfgArr, &pointer); - zend_hash_get_current_data_ex(cfgArr, (void **)&data, &pointer) == SUCCESS; - zend_hash_move_forward_ex(cfgArr, &pointer) ) - { - zend_hash_get_current_key_ex(cfgArr, &_key, &klen, NULL, 0, &pointer); - //zend_printf("key: %s, value: %d
", _key, (*data)->value.lval); - - if ( strcmp(_key, "kpuncs") == 0 ) - { - memcpy(nconfig->kpuncs, (*data)->value.str.val, (*data)->value.str.len); - nconfig->kpuncs[(*data)->value.str.len] = '\0'; - } - else - { - //convert the data to long. - convert_to_long_ex(data); - if ( strcmp(_key, "max_len") == 0 ) - nconfig->max_len = (ushort_t)(*data)->value.lval; - else if ( strcmp(_key, "r_name") == 0 ) - nconfig->r_name = (ushort_t)(*data)->value.lval; - else if ( strcmp(_key, "mix_len") == 0 ) - nconfig->mix_len = (ushort_t)(*data)->value.lval; - else if ( strcmp(_key, "lna_len") == 0 ) - nconfig->lna_len = (ushort_t)(*data)->value.lval; - else if ( strcmp(_key, "add_syn") == 0 ) - nconfig->add_syn = (ushort_t)(*data)->value.lval; - else if ( strcmp(_key, "clr_stw") == 0 ) - nconfig->clr_stw = (ushort_t)(*data)->value.lval; - else if ( strcmp(_key, "add_syn") == 0 ) - nconfig->add_syn = (ushort_t)(*data)->value.lval; - else if ( strcmp(_key, "keep_urec") == 0 ) - nconfig->keep_urec = (ushort_t)(*data)->value.lval; - else if ( strcmp(_key, "spx_out") == 0 ) - nconfig->spx_out = (ushort_t)(*data)->value.lval; - else if ( strcmp(_key, "nthreshold") == 0 ) - nconfig->nthreshold = (uint_t) (*data)->value.lval; - else if ( strcmp(_key, "mode") == 0 ) - friso_set_mode(nconfig, (friso_mode_t)((*data)->value.lval)); - else if ( strcmp(_key, "en_sseg") == 0 ) - nconfig->en_sseg = (ushort_t) (*data)->value.lval; - else if ( strcmp(_key, "st_minl") == 0 ) - nconfig->st_minl = (ushort_t) (*data)->value.lval; - } - } - } + //check the new setting. + cfgArr = Z_ARRVAL_P(cfg); + //zend_printf("array length: %d", zend_hash_num_elements(cfgArr)); + for ( zend_hash_internal_pointer_reset_ex(cfgArr, &pointer); + zend_hash_get_current_data_ex(cfgArr, (void **)&data, &pointer) == SUCCESS; + zend_hash_move_forward_ex(cfgArr, &pointer) ) + { + zend_hash_get_current_key_ex(cfgArr, &_key, &klen, NULL, 0, &pointer); + //zend_printf("key: %s, value: %d
", _key, (*data)->value.lval); + + if ( strcmp(_key, "kpuncs") == 0 ) + { + memcpy(nconfig->kpuncs, (*data)->value.str.val, (*data)->value.str.len); + nconfig->kpuncs[(*data)->value.str.len] = '\0'; + } + else + { + //convert the data to long. + convert_to_long_ex(data); + if ( strcmp(_key, "max_len") == 0 ) + nconfig->max_len = (ushort_t)(*data)->value.lval; + else if ( strcmp(_key, "r_name") == 0 ) + nconfig->r_name = (ushort_t)(*data)->value.lval; + else if ( strcmp(_key, "mix_len") == 0 ) + nconfig->mix_len = (ushort_t)(*data)->value.lval; + else if ( strcmp(_key, "lna_len") == 0 ) + nconfig->lna_len = (ushort_t)(*data)->value.lval; + else if ( strcmp(_key, "add_syn") == 0 ) + nconfig->add_syn = (ushort_t)(*data)->value.lval; + else if ( strcmp(_key, "clr_stw") == 0 ) + nconfig->clr_stw = (ushort_t)(*data)->value.lval; + else if ( strcmp(_key, "add_syn") == 0 ) + nconfig->add_syn = (ushort_t)(*data)->value.lval; + else if ( strcmp(_key, "keep_urec") == 0 ) + nconfig->keep_urec = (ushort_t)(*data)->value.lval; + else if ( strcmp(_key, "spx_out") == 0 ) + nconfig->spx_out = (ushort_t)(*data)->value.lval; + else if ( strcmp(_key, "nthreshold") == 0 ) + nconfig->nthreshold = (uint_t) (*data)->value.lval; + else if ( strcmp(_key, "mode") == 0 ) + friso_set_mode(nconfig, (friso_mode_t)((*data)->value.lval)); + else if ( strcmp(_key, "en_sseg") == 0 ) + nconfig->en_sseg = (ushort_t) (*data)->value.lval; + else if ( strcmp(_key, "st_minl") == 0 ) + nconfig->st_minl = (ushort_t) (*data)->value.lval; + } + } + } - //initialize the array. - MAKE_STD_ZVAL( ret ); - array_init( ret ); - config = ( nconfig == NULL ) ? friso_globals.config : nconfig; + //initialize the array. + MAKE_STD_ZVAL( ret ); + array_init( ret ); + config = ( nconfig == NULL ) ? friso_globals.config : nconfig; - //create a new friso task. - task = friso_new_task(); - idx = 0; - friso_set_text(task, _str); - while ( config->next_token( friso_globals.friso, config, task ) != NULL ) - { - MAKE_STD_ZVAL(item); - array_init(item); - add_assoc_string(item, "word", task->token->word, 1); - //check the append of type - if ( (rargs & FRISO_RET_TYPE) != 0 ) - add_assoc_long(item, "type", task->token->type); - if ( (rargs & FRISO_RET_LEN) != 0 ) - add_assoc_long(item, "len", task->token->length); - if ( (rargs & FRISO_RET_RLEN) != 0 ) - add_assoc_long(item, "rlen", task->token->rlen); - if ( (rargs & FRISO_RET_OFF) != 0 ) - add_assoc_long(item, "off", task->token->offset); - if ( (rargs & FRISO_RET_POS) != 0 ) - add_assoc_stringl(item, "pos", &task->token->pos, 1, 1); - - //append the sub result. - add_index_zval( ret, idx++, item ); - } + //create a new friso task. + task = friso_new_task(); + idx = 0; + friso_set_text(task, _str); + while ( config->next_token( friso_globals.friso, config, task ) != NULL ) + { + MAKE_STD_ZVAL(item); + array_init(item); + add_assoc_string(item, "word", task->token->word, 1); + //check the append of type + if ( (rargs & FRISO_RET_TYPE) != 0 ) + add_assoc_long(item, "type", task->token->type); + if ( (rargs & FRISO_RET_LEN) != 0 ) + add_assoc_long(item, "len", task->token->length); + if ( (rargs & FRISO_RET_RLEN) != 0 ) + add_assoc_long(item, "rlen", task->token->rlen); + if ( (rargs & FRISO_RET_OFF) != 0 ) + add_assoc_long(item, "off", task->token->offset); + if ( (rargs & FRISO_RET_POS) != 0 ) + add_assoc_stringl(item, "pos", &task->token->pos, 1, 1); + + //append the sub result. + add_index_zval( ret, idx++, item ); + } - //free the friso task. - friso_free_task(task); - if ( nconfig != NULL ) friso_free_config(nconfig); + //free the friso task. + friso_free_task(task); + if ( nconfig != NULL ) friso_free_config(nconfig); - //RETURN_ZVAL( ret, 0, 0); - *( return_value ) = *( ret ); + //RETURN_ZVAL( ret, 0, 0); + *( return_value ) = *( ret ); } /* }}} */ @@ -323,7 +323,7 @@ PHP_FUNCTION(friso_split) Return the current version of Friso. */ PHP_FUNCTION(friso_version) { - RETURN_STRINGL(FRISO_VERSION, strlen(FRISO_VERSION), 1); + RETURN_STRINGL(FRISO_VERSION, strlen(FRISO_VERSION), 1); } /* }}} */ @@ -331,8 +331,8 @@ PHP_FUNCTION(friso_version) Return the current charset of friso. */ PHP_FUNCTION(friso_charset) { - char *charset = friso_globals.friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK"; - RETURN_STRINGL(charset, strlen(charset), 1); + char *charset = friso_globals.friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK"; + RETURN_STRINGL(charset, strlen(charset), 1); } /* }}} */ @@ -340,23 +340,23 @@ PHP_FUNCTION(friso_charset) Return a bool to confirm that the given str is a word in a specified dictionary. */ PHP_FUNCTION(friso_dic_exist) { - char *word = NULL; - int wlen; - long type; + char *word = NULL; + int wlen; + long type; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) { - return; - } + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) { + return; + } - if ( friso_globals.friso->dic == NULL ) - RETURN_BOOL(0); + if ( friso_globals.friso->dic == NULL ) + RETURN_BOOL(0); - if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ ) - type = __LEX_CJK_WORDS__; + if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ ) + type = __LEX_CJK_WORDS__; - wlen = friso_dic_match( friso_globals.friso->dic, type, word ); + wlen = friso_dic_match( friso_globals.friso->dic, type, word ); - RETURN_BOOL(wlen); + RETURN_BOOL(wlen); } /* }}} */ @@ -364,38 +364,38 @@ PHP_FUNCTION(friso_dic_exist) Return a array contains all the information of the given word.*/ PHP_FUNCTION(friso_dic_get) { - char *word = NULL; - int wlen; - long type; - zval *entry; - lex_entry_t e; + char *word = NULL; + int wlen; + long type; + zval *entry; + lex_entry_t e; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) { - return; - } + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) { + return; + } - //check the dictionary - if ( friso_globals.friso->dic == NULL ) - RETURN_BOOL(0); + //check the dictionary + if ( friso_globals.friso->dic == NULL ) + RETURN_BOOL(0); - MAKE_STD_ZVAL( entry ); - array_init( entry ); + MAKE_STD_ZVAL( entry ); + array_init( entry ); - if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ ) - { - type = __LEX_CJK_WORDS__; - } + if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ ) + { + type = __LEX_CJK_WORDS__; + } - e = friso_dic_get( friso_globals.friso->dic, type, word ); - if ( e != NULL ) - { - add_assoc_long( entry, "length", e->length); - add_assoc_long( entry, "freq", e->fre ); - *( return_value ) = * ( entry ); - return; - } + e = friso_dic_get( friso_globals.friso->dic, type, word ); + if ( e != NULL ) + { + add_assoc_long( entry, "length", e->length); + add_assoc_long( entry, "freq", e->fre ); + *( return_value ) = * ( entry ); + return; + } - RETURN_BOOL(0); + RETURN_BOOL(0); } /* }}} */ @@ -403,17 +403,17 @@ PHP_FUNCTION(friso_dic_get) Return the bytes that the utf-8 char takes.*/ PHP_FUNCTION(friso_utf8_bytes) { - char *word = NULL; - int wlen, _bytes; + char *word = NULL; + int wlen, _bytes; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) { - return; - } + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) { + return; + } - if ( word == NULL ) RETURN_LONG(0); - _bytes = get_utf8_bytes( word[0] ); + if ( word == NULL ) RETURN_LONG(0); + _bytes = get_utf8_bytes( word[0] ); - RETURN_LONG(_bytes); + RETURN_LONG(_bytes); } /* }}} */ @@ -421,16 +421,16 @@ PHP_FUNCTION(friso_utf8_bytes) Return the unicode of the given utf-8 char.*/ PHP_FUNCTION(friso_utf8_ucode) { - char *word = NULL; - int wlen, _ucode; + char *word = NULL; + int wlen, _ucode; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) { - return; - } + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) { + return; + } - _ucode = get_utf8_unicode( word ); + _ucode = get_utf8_unicode( word ); - RETURN_LONG(_ucode); + RETURN_LONG(_ucode); } /* }}} */ @@ -438,18 +438,18 @@ PHP_FUNCTION(friso_utf8_ucode) Return char that the a unicode pointed to.*/ PHP_FUNCTION(friso_ucode_utf8) { - unsigned long *ucode = NULL; - int _bytes; - char word[7]; + unsigned long *ucode = NULL; + int _bytes; + char word[7]; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &ucode ) == FAILURE) { - return; - } + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &ucode ) == FAILURE) { + return; + } - _bytes = unicode_to_utf8( ( size_t ) ucode, word ); - word[_bytes] = '\0'; + _bytes = unicode_to_utf8( ( size_t ) ucode, word ); + word[_bytes] = '\0'; - RETURN_STRINGL( word, _bytes, 1 ); + RETURN_STRINGL( word, _bytes, 1 ); } /* }}} */ diff --git a/binding/php/friso.php b/binding/php/friso.php index cb7305c..76cfed4 100644 --- a/binding/php/friso.php +++ b/binding/php/friso.php @@ -2,7 +2,7 @@ $br = (php_sapi_name() == "cli")? "":"
"; if(!extension_loaded('friso')) { - dl('friso.' . PHP_SHLIB_SUFFIX); + dl('friso.' . PHP_SHLIB_SUFFIX); } $module = 'friso'; $functions = get_extension_funcs($module); @@ -13,9 +13,9 @@ foreach($functions as $func) { echo "$br\n"; $function = 'confirm_' . $module . '_compiled'; if (extension_loaded($module)) { - $str = $function($module); + $str = $function($module); } else { - $str = "Module $module is not compiled into PHP"; + $str = "Module $module is not compiled into PHP"; } echo "$str\n"; ?> diff --git a/binding/php/php_friso.h b/binding/php/php_friso.h index 0279fd0..5829336 100644 --- a/binding/php/php_friso.h +++ b/binding/php/php_friso.h @@ -6,11 +6,11 @@ extern zend_module_entry friso_module_entry; #define phpext_friso_ptr &friso_module_entry #ifdef PHP_WIN32 -# define PHP_FRISO_API __declspec(dllexport) +# define PHP_FRISO_API __declspec(dllexport) #elif defined(__GNUC__) && __GNUC__ >= 4 -# define PHP_FRISO_API __attribute__ ((visibility("default"))) +# define PHP_FRISO_API __attribute__ ((visibility("default"))) #else -# define PHP_FRISO_API +# define PHP_FRISO_API #endif #ifdef ZTS @@ -36,12 +36,12 @@ PHP_FUNCTION(friso_utf8_ucode); PHP_FUNCTION(friso_ucode_utf8); /* - Declare any global variables you may need between the BEGIN - and END macros here: + Declare any global variables you may need between the BEGIN + and END macros here: ZEND_BEGIN_MODULE_GLOBALS(friso) - long global_value; - char *global_string; + long global_value; + char *global_string; ZEND_END_MODULE_GLOBALS(friso) */ @@ -66,5 +66,5 @@ typedef struct { #define FRISO_G(v) (friso_globals.v) #endif -#endif /* PHP_FRISO_H */ +#endif /* PHP_FRISO_H */ diff --git a/binding/php/tests/001.phpt b/binding/php/tests/001.phpt index 6b79acf..2fabf38 100644 --- a/binding/php/tests/001.phpt +++ b/binding/php/tests/001.phpt @@ -6,14 +6,14 @@ Check for friso presence diff --git a/src/friso.c b/src/friso.c index a0ca6de..db2aa5c 100644 --- a/src/friso.c +++ b/src/friso.c @@ -1,8 +1,8 @@ /* * friso main file implemented the friso main functions. - * starts with friso_ in the friso header file "friso.h"; + * starts with friso_ in the friso header file "friso.h"; * - * @author chenxin + * @author chenxin */ #include #include @@ -19,15 +19,15 @@ */ FRISO_API friso_t friso_new( void ) { - friso_t e = ( friso_t ) FRISO_MALLOC( sizeof( friso_entry ) ); - if ( e == NULL ) { - ___ALLOCATION_ERROR___ - } + friso_t e = ( friso_t ) FRISO_MALLOC( sizeof( friso_entry ) ); + if ( e == NULL ) { + ___ALLOCATION_ERROR___ + } - e->dic = NULL; - e->charset = FRISO_UTF8; //set default charset UTF8. + e->dic = NULL; + e->charset = FRISO_UTF8; //set default charset UTF8. - return e; + return e; } /* }}} */ @@ -36,163 +36,163 @@ FRISO_API friso_t friso_new( void ) * @return 1 for successfully and 0 for failed. */ FRISO_API int friso_init_from_ifile( - friso_t friso, friso_config_t config, fstring __ifile ) + friso_t friso, friso_config_t config, fstring __ifile ) { - FILE *__stream; - char __chars__[256], __key__[128], *__line__; - char __lexi__[160], lexpath[160]; - uint_t i, t, __hit__ = 0, __length__; + FILE *__stream; + char __chars__[256], __key__[128], *__line__; + char __lexi__[160], lexpath[160]; + uint_t i, t, __hit__ = 0, __length__; - char *slimiter = NULL; - uint_t flen = 0; + char *slimiter = NULL; + uint_t flen = 0; - //get the base part of the path of the __ifile - if ( (slimiter = strrchr(__ifile, '/')) != NULL ) - { - flen = slimiter - __ifile + 1; - } + //get the base part of the path of the __ifile + if ( (slimiter = strrchr(__ifile, '/')) != NULL ) + { + flen = slimiter - __ifile + 1; + } - //yat, start to parse the friso.ini configuration file - if ( ( __stream = fopen( __ifile, "rb" ) ) != NULL ) - { - //initialize the entry with the value from the ifile. - while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) - { - //comments filter. - if ( __line__[0] == '#' ) continue; - if ( __line__[0] == '\t' ) continue; - if ( __line__[0] == ' ' || __line__[0] == '\0' ) continue; + //yat, start to parse the friso.ini configuration file + if ( ( __stream = fopen( __ifile, "rb" ) ) != NULL ) + { + //initialize the entry with the value from the ifile. + while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) + { + //comments filter. + if ( __line__[0] == '#' ) continue; + if ( __line__[0] == '\t' ) continue; + if ( __line__[0] == ' ' || __line__[0] == '\0' ) continue; - __length__ = strlen( __line__ ); - for ( i = 0; i < __length__; i++ ) { - if ( __line__[i] == ' ' - || __line__[i] == '\t' || __line__[i] == '=' ) break; - __key__[i] = __line__[i]; - } - __key__[i] = '\0'; + __length__ = strlen( __line__ ); + for ( i = 0; i < __length__; i++ ) { + if ( __line__[i] == ' ' + || __line__[i] == '\t' || __line__[i] == '=' ) break; + __key__[i] = __line__[i]; + } + __key__[i] = '\0'; - //position the euqals char '='. - if ( __line__[i] == ' ' || __line__[i] == '\t' ) { - for ( i++ ; i < __length__; i++ ) - if ( __line__[i] == '=' ) break; - } + //position the euqals char '='. + if ( __line__[i] == ' ' || __line__[i] == '\t' ) { + for ( i++ ; i < __length__; i++ ) + if ( __line__[i] == '=' ) break; + } - //clear the left whitespace of the value. - for ( i++; i < __length__ - && ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ ); - for ( t = 0; i < __length__; i++, t++ ) { - if ( __line__[i] == ' ' || __line__[i] == '\t' ) break; - __line__[t] = __line__[i]; - } - __line__[t] = '\0'; + //clear the left whitespace of the value. + for ( i++; i < __length__ + && ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ ); + for ( t = 0; i < __length__; i++, t++ ) { + if ( __line__[i] == ' ' || __line__[i] == '\t' ) break; + __line__[t] = __line__[i]; + } + __line__[t] = '\0'; - //printf("key=%s, value=%s\n", __key__, __line__ ); - if ( strcmp( __key__, "friso.lex_dir" ) == 0 ) - { - /* - * here copy the value of the lex_dir. - * cause we need the value of friso.max_len to finish all - * the work when we call function friso_dic_load_from_ifile to - * initiliaze the friso dictionary. - */ - if ( __hit__ == 0 ) - { - __hit__ = t; - for ( t = 0; t < __hit__; t++ ) { - __lexi__[t] = __line__[t]; - } - __lexi__[t] = '\0'; - } - } else if ( strcmp( __key__, "friso.max_len" ) == 0 ) { - config->max_len = ( ushort_t ) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.r_name" ) == 0 ) { - config->r_name = ( ushort_t ) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.mix_len" ) == 0 ) { - config->mix_len = ( ushort_t ) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.lna_len" ) == 0 ) { - config->lna_len = ( ushort_t ) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.add_syn" ) == 0 ) { - config->add_syn = ( ushort_t ) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.clr_stw" ) == 0 ) { - config->clr_stw = ( ushort_t ) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.keep_urec" ) == 0 ) { - config->keep_urec = ( uint_t ) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.spx_out" ) == 0 ) { - config->spx_out = ( ushort_t ) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.nthreshold" ) == 0 ) { - config->nthreshold = atoi( __line__ ); - } else if ( strcmp( __key__, "friso.mode" ) == 0 ) { - //config->mode = ( friso_mode_t ) atoi( __line__ ); - friso_set_mode(config, (friso_mode_t) atoi( __line__ )); - } else if ( strcmp( __key__, "friso.charset" ) == 0 ) { - friso->charset = (friso_charset_t) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.en_sseg") == 0 ) { - config->en_sseg = (ushort_t) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.st_minl") == 0 ) { - config->st_minl = (ushort_t) atoi( __line__ ); - } else if ( strcmp( __key__, "friso.kpuncs") == 0 ) { - //t is the length of the __line__. - memcpy(config->kpuncs, __line__, t); - //printf("friso_init_from_ifile#kpuncs: %s\n", config->kpuncs); - } - } + //printf("key=%s, value=%s\n", __key__, __line__ ); + if ( strcmp( __key__, "friso.lex_dir" ) == 0 ) + { + /* + * here copy the value of the lex_dir. + * cause we need the value of friso.max_len to finish all + * the work when we call function friso_dic_load_from_ifile to + * initiliaze the friso dictionary. + */ + if ( __hit__ == 0 ) + { + __hit__ = t; + for ( t = 0; t < __hit__; t++ ) { + __lexi__[t] = __line__[t]; + } + __lexi__[t] = '\0'; + } + } else if ( strcmp( __key__, "friso.max_len" ) == 0 ) { + config->max_len = ( ushort_t ) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.r_name" ) == 0 ) { + config->r_name = ( ushort_t ) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.mix_len" ) == 0 ) { + config->mix_len = ( ushort_t ) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.lna_len" ) == 0 ) { + config->lna_len = ( ushort_t ) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.add_syn" ) == 0 ) { + config->add_syn = ( ushort_t ) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.clr_stw" ) == 0 ) { + config->clr_stw = ( ushort_t ) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.keep_urec" ) == 0 ) { + config->keep_urec = ( uint_t ) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.spx_out" ) == 0 ) { + config->spx_out = ( ushort_t ) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.nthreshold" ) == 0 ) { + config->nthreshold = atoi( __line__ ); + } else if ( strcmp( __key__, "friso.mode" ) == 0 ) { + //config->mode = ( friso_mode_t ) atoi( __line__ ); + friso_set_mode(config, (friso_mode_t) atoi( __line__ )); + } else if ( strcmp( __key__, "friso.charset" ) == 0 ) { + friso->charset = (friso_charset_t) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.en_sseg") == 0 ) { + config->en_sseg = (ushort_t) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.st_minl") == 0 ) { + config->st_minl = (ushort_t) atoi( __line__ ); + } else if ( strcmp( __key__, "friso.kpuncs") == 0 ) { + //t is the length of the __line__. + memcpy(config->kpuncs, __line__, t); + //printf("friso_init_from_ifile#kpuncs: %s\n", config->kpuncs); + } + } - /* - * intialize the friso dictionary here. - * use the setting from the ifile parse above - * we copied the value in the __lexi__ - */ - if ( __hit__ != 0 ) - { - //add relative path search support - //@added: 2014-05-24 - //convert the relative path to absolute path base on the path of friso.ini - //improved at @date: 2014-10-26 + /* + * intialize the friso dictionary here. + * use the setting from the ifile parse above + * we copied the value in the __lexi__ + */ + if ( __hit__ != 0 ) + { + //add relative path search support + //@added: 2014-05-24 + //convert the relative path to absolute path base on the path of friso.ini + //improved at @date: 2014-10-26 #ifdef FRISO_WINNT - if ( __lexi__[1] != ':' && flen != 0 ) + if ( __lexi__[1] != ':' && flen != 0 ) #else - if ( __lexi__[0] != '/' && flen != 0 ) + if ( __lexi__[0] != '/' && flen != 0 ) #endif - { - if ( (flen + __hit__) > sizeof(lexpath) - 1 ) - { - printf("[Error]: Buffer is not long enough to hold the final lexicon path"); - printf(" with a length of {%d} at function friso.c#friso_init_from_ifile", flen + __hit__); - return 0; - } + { + if ( (flen + __hit__) > sizeof(lexpath) - 1 ) + { + printf("[Error]: Buffer is not long enough to hold the final lexicon path"); + printf(" with a length of {%d} at function friso.c#friso_init_from_ifile", flen + __hit__); + return 0; + } - memcpy(lexpath, __ifile, flen); - memcpy(lexpath + flen, __lexi__, __hit__ - 1); - //count the new length - flen = flen + __hit__ - 1; - if ( lexpath[flen-1] != '/' ) lexpath[flen] = '/'; - lexpath[flen+1] = '\0'; - } - else - { - memcpy(lexpath, __lexi__, __hit__); - lexpath[__hit__] = '\0'; - if ( lexpath[__hit__ - 1] != '/' ) - { - lexpath[__hit__] = '/'; - lexpath[__hit__+1] = '\0'; - } - } + memcpy(lexpath, __ifile, flen); + memcpy(lexpath + flen, __lexi__, __hit__ - 1); + //count the new length + flen = flen + __hit__ - 1; + if ( lexpath[flen-1] != '/' ) lexpath[flen] = '/'; + lexpath[flen+1] = '\0'; + } + else + { + memcpy(lexpath, __lexi__, __hit__); + lexpath[__hit__] = '\0'; + if ( lexpath[__hit__ - 1] != '/' ) + { + lexpath[__hit__] = '/'; + lexpath[__hit__+1] = '\0'; + } + } - //printf("lexpath=%s\n", lexpath); + //printf("lexpath=%s\n", lexpath); - friso->dic = friso_dic_new(); - //add charset check for max word length counting - friso_dic_load_from_ifile( friso, config, - lexpath, config->max_len * (friso->charset == FRISO_UTF8 ? 3 : 2) ); - } + friso->dic = friso_dic_new(); + //add charset check for max word length counting + friso_dic_load_from_ifile( friso, config, + lexpath, config->max_len * (friso->charset == FRISO_UTF8 ? 3 : 2) ); + } - fclose( __stream ); - return 1; - } + fclose( __stream ); + return 1; + } - return 0; + return 0; } /* }}} */ @@ -201,35 +201,35 @@ FRISO_API int friso_init_from_ifile( */ FRISO_API void friso_free( friso_t friso ) { - //free the dictionary - if ( friso->dic != NULL ) { - friso_dic_free( friso->dic ); - } - FRISO_FREE( friso ); + //free the dictionary + if ( friso->dic != NULL ) { + friso_dic_free( friso->dic ); + } + FRISO_FREE( friso ); } /* }}} */ /* {{{ set the current split mode - * view the friso.h#friso_mode_t + * view the friso.h#friso_mode_t */ FRISO_API void friso_set_mode( friso_config_t config, friso_mode_t mode ) { - config->mode = mode; + config->mode = mode; - switch ( config->mode ) - { - case __FRISO_SIMPLE_MODE__: - config->next_token = next_mmseg_token; - config->next_cjk = next_simple_cjk; - break; - case __FRISO_DETECT_MODE__: - config->next_token = next_detect_token; - break; - default: - config->next_token = next_mmseg_token; - config->next_cjk = next_complex_cjk; - break; - } + switch ( config->mode ) + { + case __FRISO_SIMPLE_MODE__: + config->next_token = next_mmseg_token; + config->next_cjk = next_simple_cjk; + break; + case __FRISO_DETECT_MODE__: + config->next_token = next_detect_token; + break; + default: + config->next_token = next_mmseg_token; + config->next_cjk = next_complex_cjk; + break; + } } /* }}} */ @@ -237,39 +237,39 @@ FRISO_API void friso_set_mode( friso_config_t config, friso_mode_t mode ) * it with default value.*/ FRISO_API friso_config_t friso_new_config( void ) { - friso_config_t cfg = (friso_config_t) - FRISO_MALLOC(sizeof(friso_config_entry)); - if ( cfg == NULL ) { - ___ALLOCATION_ERROR___; - } + friso_config_t cfg = (friso_config_t) + FRISO_MALLOC(sizeof(friso_config_entry)); + if ( cfg == NULL ) { + ___ALLOCATION_ERROR___; + } - //initialize the configuration entry. - friso_init_config(cfg); + //initialize the configuration entry. + friso_init_config(cfg); - return cfg; + return cfg; } /* }}} */ /* {{{ initialize the specified friso config entry with default value.*/ FRISO_API void friso_init_config( friso_config_t cfg ) { - cfg->max_len = DEFAULT_SEGMENT_LENGTH; - cfg->r_name = 1; - cfg->mix_len = DEFAULT_MIX_LENGTH; - cfg->lna_len = DEFAULT_LNA_LENGTH; - cfg->add_syn = 1; - cfg->clr_stw = 0; - cfg->keep_urec = 0; - cfg->spx_out = 0; - cfg->en_sseg = 1; //default start the secondary segmentaion. - cfg->st_minl = 1; //min length for secondary split sub token. - cfg->nthreshold = DEFAULT_NTHRESHOLD; - cfg->mode = ( friso_mode_t ) DEFAULT_SEGMENT_MODE; + cfg->max_len = DEFAULT_SEGMENT_LENGTH; + cfg->r_name = 1; + cfg->mix_len = DEFAULT_MIX_LENGTH; + cfg->lna_len = DEFAULT_LNA_LENGTH; + cfg->add_syn = 1; + cfg->clr_stw = 0; + cfg->keep_urec = 0; + cfg->spx_out = 0; + cfg->en_sseg = 1; //default start the secondary segmentaion. + cfg->st_minl = 1; //min length for secondary split sub token. + cfg->nthreshold = DEFAULT_NTHRESHOLD; + cfg->mode = ( friso_mode_t ) DEFAULT_SEGMENT_MODE; - friso_set_mode(cfg, cfg->mode); + friso_set_mode(cfg, cfg->mode); - //Zero fill the kpuncs buffer. - memset(cfg->kpuncs, 0x00, sizeof(cfg->kpuncs)); + //Zero fill the kpuncs buffer. + memset(cfg->kpuncs, 0x00, sizeof(cfg->kpuncs)); } /* }}} */ @@ -277,82 +277,82 @@ FRISO_API void friso_init_config( friso_config_t cfg ) */ FRISO_API friso_task_t friso_new_task() { - friso_task_t task = ( friso_task_t ) FRISO_MALLOC( sizeof( friso_task_entry ) ); - if ( task == NULL ) { - ___ALLOCATION_ERROR___ - } + friso_task_t task = ( friso_task_t ) FRISO_MALLOC( sizeof( friso_task_entry ) ); + if ( task == NULL ) { + ___ALLOCATION_ERROR___ + } - //initliaze the segment. - task->text = NULL; - task->idx = 0; - task->length = 0; - task->bytes = 0; - task->unicode = 0; - task->ctrlMask = 0; - task->pool = new_link_list(); - task->sbuf = new_string_buffer(); - task->token = friso_new_token(); + //initliaze the segment. + task->text = NULL; + task->idx = 0; + task->length = 0; + task->bytes = 0; + task->unicode = 0; + task->ctrlMask = 0; + task->pool = new_link_list(); + task->sbuf = new_string_buffer(); + task->token = friso_new_token(); - return task; + return task; } /* }}} */ /* {{{ free the specified task*/ FRISO_API void friso_free_task( friso_task_t task ) { - //free the allocation of the poll link list. - if ( task->pool != NULL ) { - free_link_list( task->pool ); - } + //free the allocation of the poll link list. + if ( task->pool != NULL ) { + free_link_list( task->pool ); + } - //release the allocation of the sbuff string_buffer_t. - if ( task->sbuf != NULL ) { - free_string_buffer(task->sbuf); - } + //release the allocation of the sbuff string_buffer_t. + if ( task->sbuf != NULL ) { + free_string_buffer(task->sbuf); + } - //free the allocations of the token. - if ( task->token != NULL ) { - friso_free_token( task->token ); - } + //free the allocations of the token. + if ( task->token != NULL ) { + friso_free_token( task->token ); + } - FRISO_FREE( task ); + FRISO_FREE( task ); } /* }}} */ /* {{{ create a new friso token */ FRISO_API friso_token_t friso_new_token( void ) { - friso_token_t token = ( friso_token_t ) - FRISO_MALLOC( sizeof( friso_token_entry ) ); - if ( token == NULL ) { - ___ALLOCATION_ERROR___ - } + friso_token_t token = ( friso_token_t ) + FRISO_MALLOC( sizeof( friso_token_entry ) ); + if ( token == NULL ) { + ___ALLOCATION_ERROR___ + } - //initialize - token->type = (uchar_t) __LEX_OTHER_WORDS__; - token->length = 0; - token->rlen = 0; - token->pos = '\0'; - token->offset = -1; - memset(token->word, 0x00, __HITS_WORD_LENGTH__); + //initialize + token->type = (uchar_t) __LEX_OTHER_WORDS__; + token->length = 0; + token->rlen = 0; + token->pos = '\0'; + token->offset = -1; + memset(token->word, 0x00, __HITS_WORD_LENGTH__); - return token; + return token; } /* }}} */ /* {{{ set the text of the current segmentation. - * that means we could re-use the segment. - * also we have to reset the idx and the length of the segmentation. + * that means we could re-use the segment. + * also we have to reset the idx and the length of the segmentation. * and the most important one - clear the poll link list. */ FRISO_API void friso_set_text( - friso_task_t task, fstring text ) + friso_task_t task, fstring text ) { - task->text = text; - task->idx = 0; //reset the index - task->length = strlen( text ); - task->pool = link_list_clear( task->pool ); //clear the word poll - string_buffer_clear( task->sbuf ); //crear the string buffer. + task->text = text; + task->idx = 0; //reset the index + task->length = strlen( text ); + task->pool = link_list_clear( task->pool ); //clear the word poll + string_buffer_clear( task->sbuf ); //crear the string buffer. } /* }}} */ @@ -360,77 +360,77 @@ FRISO_API void friso_set_text( //friso core part 1: simple mode tokenize handler functions /* {{{ read the next word from the current position. * - * @return int the bytes of the readed word. + * @return int the bytes of the readed word. */ __STATIC_API__ uint_t readNextWord( - friso_t friso, //friso instance - friso_task_t task, //token task - uint_t *idx, //current index. - fstring __word ) //work buffer. + friso_t friso, //friso instance + friso_task_t task, //token task + uint_t *idx, //current index. + fstring __word ) //work buffer. { - if ( friso->charset == FRISO_UTF8 ) - //@reader: task->unicode = get_utf8_unicode(task->buffer) is moved insite - // function utf8_next_word from friso 1.6.0 . - return utf8_next_word( task, idx, __word ); - else if ( friso->charset == FRISO_GBK ) - return gbk_next_word( task, idx, __word ); + if ( friso->charset == FRISO_UTF8 ) + //@reader: task->unicode = get_utf8_unicode(task->buffer) is moved insite + // function utf8_next_word from friso 1.6.0 . + return utf8_next_word( task, idx, __word ); + else if ( friso->charset == FRISO_GBK ) + return gbk_next_word( task, idx, __word ); - return 0; //unknow charset. + return 0; //unknow charset. } /* }}} */ /* {{{ get the next cjk word from the current position, with simple mode. */ FRISO_API lex_entry_t next_simple_cjk( - friso_t friso, - friso_config_t config, - friso_task_t task ) + friso_t friso, + friso_config_t config, + friso_task_t task ) { - uint_t t, idx = task->idx, __length__; - string_buffer_t sb = new_string_buffer_with_string( task->buffer ); - lex_entry_t e = friso_dic_get( friso->dic, - __LEX_CJK_WORDS__, sb->buffer ); + uint_t t, idx = task->idx, __length__; + string_buffer_t sb = new_string_buffer_with_string( task->buffer ); + lex_entry_t e = friso_dic_get( friso->dic, + __LEX_CJK_WORDS__, sb->buffer ); - /* - * here bak the e->length in the task->token->type. - * we will use it to count the task->idx. - * for the sake of use less variable. - */ - __length__ = e->length; + /* + * here bak the e->length in the task->token->type. + * we will use it to count the task->idx. + * for the sake of use less variable. + */ + __length__ = e->length; - for ( t = 1; t < config->max_len - && ( task->bytes = readNextWord( - friso, task, &idx, task->buffer ) ) != 0; t++ ) - { - if ( friso_whitespace( friso->charset, task ) ) break; - if ( ! friso_cn_string( friso->charset, task ) ) break; + for ( t = 1; t < config->max_len + && ( task->bytes = readNextWord( + friso, task, &idx, task->buffer ) ) != 0; t++ ) + { + if ( friso_whitespace( friso->charset, task ) ) break; + if ( ! friso_cn_string( friso->charset, task ) ) break; - string_buffer_append( sb, task->buffer ); + string_buffer_append( sb, task->buffer ); - //check the existence of the word by search the dictionary. - if ( friso_dic_match( friso->dic, - __LEX_CJK_WORDS__, sb->buffer ) ) { - e = friso_dic_get( friso->dic, - __LEX_CJK_WORDS__, sb->buffer ); - } - } + //check the existence of the word by search the dictionary. + if ( friso_dic_match( friso->dic, + __LEX_CJK_WORDS__, sb->buffer ) ) { + e = friso_dic_get( friso->dic, + __LEX_CJK_WORDS__, sb->buffer ); + } + } - //correct the offset of the segment. - task->idx += ( e->length - __length__ ); - free_string_buffer( sb ); //free the buffer + //correct the offset of the segment. + task->idx += ( e->length - __length__ ); + free_string_buffer( sb ); //free the buffer - /* - * check the stopwords dictionary, - * make sure the current tokenzier is not stopwords. - * @warning: friso.clr_stw must be open in friso.ini configuration file. - */ - if ( config->clr_stw - && friso_dic_match( friso->dic, - __LEX_STOPWORDS__, e->word ) ) { - return NULL; - } + /* + * check the stopwords dictionary, + * make sure the current tokenzier is not stopwords. + * @warning: friso.clr_stw must be open in friso.ini configuration file. + */ + if ( config->clr_stw + && friso_dic_match( friso->dic, + __LEX_STOPWORDS__, e->word ) ) { + return NULL; + } - return e; + return e; } /* }}} */ @@ -439,300 +439,300 @@ FRISO_API lex_entry_t next_simple_cjk( /* {{{ basic latin segmentation*/ /*convert full-width char to half-width*/ #define convert_full_to_half( friso, task, convert ) \ - do {\ - if ( friso_fullwidth_en_char( friso->charset, task ) ) { \ - if ( friso->charset == FRISO_UTF8 ) \ - task->unicode -= 65248; \ - else if ( friso->charset == FRISO_GBK ) \ - {\ - task->buffer[0] = ((uchar_t)task->buffer[1]) - 128; \ - task->buffer[1] = '\0'; \ - }\ - convert = 1; \ - } \ - } while (0) + do {\ + if ( friso_fullwidth_en_char( friso->charset, task ) ) { \ + if ( friso->charset == FRISO_UTF8 ) \ + task->unicode -= 65248; \ + else if ( friso->charset == FRISO_GBK ) \ + {\ + task->buffer[0] = ((uchar_t)task->buffer[1]) - 128; \ + task->buffer[1] = '\0'; \ + }\ + convert = 1; \ + } \ + } while (0) /*convert uppercase char to lowercase char*/ #define convert_upper_to_lower( friso, task, convert ) \ - do {\ - if ( friso_uppercase_letter( friso->charset, task ) ) { \ - if ( friso->charset == FRISO_UTF8 ) \ - task->unicode += 32; \ - /* With the above logic(full to half), - * here we just need to check half-width*/ \ - else if ( friso->charset == FRISO_GBK ) \ - task->buffer[0] = task->buffer[0] + 32; \ - convert = 1; \ - } \ - } while (0) + do {\ + if ( friso_uppercase_letter( friso->charset, task ) ) { \ + if ( friso->charset == FRISO_UTF8 ) \ + task->unicode += 32; \ + /* With the above logic(full to half), + * here we just need to check half-width*/ \ + else if ( friso->charset == FRISO_GBK ) \ + task->buffer[0] = task->buffer[0] + 32; \ + convert = 1; \ + } \ + } while (0) /* convert the unicode to utf-8 bytes. (FRISO_UTF8) */ #define convert_work_apply( friso, task, convert ) \ - do {\ - if ( convert == 1 && friso->charset == FRISO_UTF8 ) { \ - memset( task->buffer, 0x00, 7 ); \ - unicode_to_utf8( task->unicode, task->buffer ); \ - convert = 0; \ - } \ - } while ( 0 ) + do {\ + if ( convert == 1 && friso->charset == FRISO_UTF8 ) { \ + memset( task->buffer, 0x00, 7 ); \ + unicode_to_utf8( task->unicode, task->buffer ); \ + convert = 0; \ + } \ + } while ( 0 ) //get the next latin word from the current position. __STATIC_API__ lex_entry_t next_basic_latin( - friso_t friso, - friso_config_t config, - friso_task_t task ) + friso_t friso, + friso_config_t config, + friso_task_t task ) { - int __convert = 0, t = 0, blen = 0; - int chkecm = 0, chkunits = 1, wspace = 0; + int __convert = 0, t = 0, blen = 0; + int chkecm = 0, chkunits = 1, wspace = 0; - /* cause friso will convert full-width numeric and letters - * (Not punctuations) to half-width ones. so, here we need - * wlen to record the real length of the lex_entry_t. - * */ - uint_t wlen = task->bytes; - uint_t idx = task->idx; - string_buffer_t sb, tmp = NULL; - lex_entry_t e = NULL; + /* cause friso will convert full-width numeric and letters + * (Not punctuations) to half-width ones. so, here we need + * wlen to record the real length of the lex_entry_t. + * */ + uint_t wlen = task->bytes; + uint_t idx = task->idx; + string_buffer_t sb, tmp = NULL; + lex_entry_t e = NULL; - //condition controller to start the secondary segmente. - int ssseg = 0; - int fdunits = 0; + //condition controller to start the secondary segmente. + int ssseg = 0; + int fdunits = 0; - //secondray segmente. - int tcount = 1; //number fo different type of char. - friso_enchar_t _ctype, _TYPE; - task_ssseg_close(task); + //secondray segmente. + int tcount = 1; //number fo different type of char. + friso_enchar_t _ctype, _TYPE; + task_ssseg_close(task); - //full-half width and upper-lower case exchange. - convert_full_to_half( friso, task, __convert ); - convert_upper_to_lower( friso, task, __convert ); - convert_work_apply( friso, task, __convert ); + //full-half width and upper-lower case exchange. + convert_full_to_half( friso, task, __convert ); + convert_upper_to_lower( friso, task, __convert ); + convert_work_apply( friso, task, __convert ); - //creat a new fstring buffer and append the task->buffer insite. - sb = new_string_buffer_with_string( task->buffer ); - _TYPE = friso_enchar_type( friso->charset, task ); + //creat a new fstring buffer and append the task->buffer insite. + sb = new_string_buffer_with_string( task->buffer ); + _TYPE = friso_enchar_type( friso->charset, task ); - //segmentation. - while ( ( task->bytes = readNextWord( - friso, task, &idx, task->buffer ) ) != 0 ) - { - //convert full-width to half-width. - convert_full_to_half(friso, task, __convert); - _ctype = friso_enchar_type( friso->charset, task ); + //segmentation. + while ( ( task->bytes = readNextWord( + friso, task, &idx, task->buffer ) ) != 0 ) + { + //convert full-width to half-width. + convert_full_to_half(friso, task, __convert); + _ctype = friso_enchar_type( friso->charset, task ); - if ( _ctype == FRISO_EN_WHITESPACE ) - { - wspace = 1; - break; - } + if ( _ctype == FRISO_EN_WHITESPACE ) + { + wspace = 1; + break; + } - if ( _ctype == FRISO_EN_PUNCTUATION ) - { - //clear the full-width punctuations. - if ( task->bytes > 1 ) break; - if ( ! friso_en_kpunc( config, task->buffer[0] ) ) break; - } + if ( _ctype == FRISO_EN_PUNCTUATION ) + { + //clear the full-width punctuations. + if ( task->bytes > 1 ) break; + if ( ! friso_en_kpunc( config, task->buffer[0] ) ) break; + } - /* check if is an FRISO_EN_NUMERIC, or FRISO_EN_LETTER. - * here just need to make sure it is not FRISO_EN_UNKNOW. - * */ - if ( _ctype == FRISO_EN_UNKNOW ) - { - if ( friso_cn_string( friso->charset, task ) ) chkecm = 1; - break; - } + /* check if is an FRISO_EN_NUMERIC, or FRISO_EN_LETTER. + * here just need to make sure it is not FRISO_EN_UNKNOW. + * */ + if ( _ctype == FRISO_EN_UNKNOW ) + { + if ( friso_cn_string( friso->charset, task ) ) chkecm = 1; + break; + } - //upper-lower case convert - convert_upper_to_lower( friso, task, __convert ); - convert_work_apply( friso, task, __convert ); + //upper-lower case convert + convert_upper_to_lower( friso, task, __convert ); + convert_work_apply( friso, task, __convert ); - //sound a little crazy, i did't limit the length of this - //@Added: 2015-01-16 night - if ( (wlen + task->bytes) >= __HITS_WORD_LENGTH__ ) - { - break; - } + //sound a little crazy, i did't limit the length of this + //@Added: 2015-01-16 night + if ( (wlen + task->bytes) >= __HITS_WORD_LENGTH__ ) + { + break; + } - string_buffer_append( sb, task->buffer ); - wlen += task->bytes; - task->idx += task->bytes; + string_buffer_append( sb, task->buffer ); + wlen += task->bytes; + task->idx += task->bytes; - /* Char type counter. - * make the condition to start the secondary segmentation. - * - * @TODO: 2013-12-22 - * */ - if ( _ctype != _TYPE ) - { - tcount++; - _TYPE = _ctype; - } - } + /* Char type counter. + * make the condition to start the secondary segmentation. + * + * @TODO: 2013-12-22 + * */ + if ( _ctype != _TYPE ) + { + tcount++; + _TYPE = _ctype; + } + } - /* - * 1. clear the useless english punctuation - * from the end of the buffer. - * 2. check the english and punctuation mixed word. - * - * set _ctype to as the status for the existence of punctuation - * at the end of the sb cause we need to plus the tcount - * to avoid the secondary check for work like 'c+', 'chenxin.'. - */ - _ctype = 0; - for ( ; sb->length > 0 - && sb->buffer[ sb->length - 1 ] != '%' - && is_en_punctuation( - friso->charset, sb->buffer[ sb->length - 1 ] ); ) - { - //check the english punctuation mixed word. - if ( friso_dic_match( friso->dic, - __LEX_ENPUN_WORDS__, sb->buffer ) ) { - e = friso_dic_get(friso->dic, - __LEX_ENPUN_WORDS__, sb->buffer); - chkunits = 0; - break; - } + /* + * 1. clear the useless english punctuation + * from the end of the buffer. + * 2. check the english and punctuation mixed word. + * + * set _ctype to as the status for the existence of punctuation + * at the end of the sb cause we need to plus the tcount + * to avoid the secondary check for work like 'c+', 'chenxin.'. + */ + _ctype = 0; + for ( ; sb->length > 0 + && sb->buffer[ sb->length - 1 ] != '%' + && is_en_punctuation( + friso->charset, sb->buffer[ sb->length - 1 ] ); ) + { + //check the english punctuation mixed word. + if ( friso_dic_match( friso->dic, + __LEX_ENPUN_WORDS__, sb->buffer ) ) { + e = friso_dic_get(friso->dic, + __LEX_ENPUN_WORDS__, sb->buffer); + chkunits = 0; + break; + } - //mark the end of the buffer. - sb->buffer[ --sb->length ] = '\0'; - wlen--; - task->idx--; + //mark the end of the buffer. + sb->buffer[ --sb->length ] = '\0'; + wlen--; + task->idx--; - /*check and plus the tcount*/ - if ( _ctype == 0 ) - { - tcount--; - _ctype = 1; - } - } + /*check and plus the tcount*/ + if ( _ctype == 0 ) + { + tcount--; + _ctype = 1; + } + } - //check the condition to start the secondary segmentation. - ssseg = (tcount > 1) && (chkunits == 1); + //check the condition to start the secondary segmentation. + ssseg = (tcount > 1) && (chkunits == 1); - //check the tokenize loop is break by whitespace. - // no need for all the following work if it is. - //@added 2013-11-19 - if ( wspace == 1 || task->idx == task->length ) - { - blen = sb->length; - e = new_lex_entry( string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__ ); - e->rlen = wlen; - //set the secondary mask. - if ( ssseg ) task_ssseg_open(task); - return e; - } + //check the tokenize loop is break by whitespace. + // no need for all the following work if it is. + //@added 2013-11-19 + if ( wspace == 1 || task->idx == task->length ) + { + blen = sb->length; + e = new_lex_entry( string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__ ); + e->rlen = wlen; + //set the secondary mask. + if ( ssseg ) task_ssseg_open(task); + return e; + } - if ( chkecm != 1 ) { - /* - * check the single words unit. - * not only the chinese word but also other kinds of word. - * so we can recongnize the complex unit like '℉,℃'' eg.. - * @date 2013-10-14 - */ - if ( chkunits - && ( friso_numeric_string( friso->charset, sb->buffer ) - || friso_decimal_string( friso->charset, sb->buffer ) ) ) - { - idx = task->idx; - if ( ( task->bytes = readNextWord( - friso, task, &idx, task->buffer ) ) != 0 ) - { - //check the EC dictionary. - if ( friso_dic_match( friso->dic, - __LEX_CJK_UNITS__, task->buffer ) ) - { - fdunits = 1; - string_buffer_append(sb, task->buffer); - wlen += task->bytes; - task->idx += task->bytes; - } - } - } + if ( chkecm != 1 ) { + /* + * check the single words unit. + * not only the chinese word but also other kinds of word. + * so we can recongnize the complex unit like '℉,℃'' eg.. + * @date 2013-10-14 + */ + if ( chkunits + && ( friso_numeric_string( friso->charset, sb->buffer ) + || friso_decimal_string( friso->charset, sb->buffer ) ) ) + { + idx = task->idx; + if ( ( task->bytes = readNextWord( + friso, task, &idx, task->buffer ) ) != 0 ) + { + //check the EC dictionary. + if ( friso_dic_match( friso->dic, + __LEX_CJK_UNITS__, task->buffer ) ) + { + fdunits = 1; + string_buffer_append(sb, task->buffer); + wlen += task->bytes; + task->idx += task->bytes; + } + } + } - //set the START_SS_MASK - if ( fdunits != 1 && ssseg ) task_ssseg_open(task); + //set the START_SS_MASK + if ( fdunits != 1 && ssseg ) task_ssseg_open(task); - //creat the lexicon entry and return it. - blen = sb->length; - e = new_lex_entry( string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__ ); - e->rlen = wlen; + //creat the lexicon entry and return it. + blen = sb->length; + e = new_lex_entry( string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__ ); + e->rlen = wlen; - return e; - } + return e; + } - //Try to find a english chinese mixed word. - tmp = new_string_buffer_with_string( sb->buffer ); - idx = task->idx; - for ( t = 0; t < config->mix_len - && ( task->bytes = readNextWord( - friso, task, &idx, task->buffer ) ) != 0; t++ ) - { - //if ( ! friso_cn_string( friso->charset, task ) ) { - // task->idx -= task->bytes; - // break; - //} - //replace with the whitespace check. - //more complex mixed words could be find here. - // (no only english and chinese mix word) - //@date 2013-10-14 - if ( friso_whitespace( friso->charset, task ) ) break; + //Try to find a english chinese mixed word. + tmp = new_string_buffer_with_string( sb->buffer ); + idx = task->idx; + for ( t = 0; t < config->mix_len + && ( task->bytes = readNextWord( + friso, task, &idx, task->buffer ) ) != 0; t++ ) + { + //if ( ! friso_cn_string( friso->charset, task ) ) { + // task->idx -= task->bytes; + // break; + //} + //replace with the whitespace check. + //more complex mixed words could be find here. + // (no only english and chinese mix word) + //@date 2013-10-14 + if ( friso_whitespace( friso->charset, task ) ) break; - string_buffer_append( tmp, task->buffer ); + string_buffer_append( tmp, task->buffer ); - //check the mixed word dictionary. - if ( friso_dic_match( friso->dic, - __LEX_ECM_WORDS__, tmp->buffer ) ) { - e = friso_dic_get( friso->dic, - __LEX_ECM_WORDS__, tmp->buffer ); - } - } + //check the mixed word dictionary. + if ( friso_dic_match( friso->dic, + __LEX_ECM_WORDS__, tmp->buffer ) ) { + e = friso_dic_get( friso->dic, + __LEX_ECM_WORDS__, tmp->buffer ); + } + } - free_string_buffer( tmp ); + free_string_buffer( tmp ); - /* e is not NULL does't mean it must be EC mixed word. - * it could be an english and punctuation mixed word, like 'c++' - * But we don't need to check and set the START_SS_MASK mask here. - * */ - if ( e != NULL ) - { - task->idx += (e->length - sb->length); - free_string_buffer(sb); - return e; - } + /* e is not NULL does't mean it must be EC mixed word. + * it could be an english and punctuation mixed word, like 'c++' + * But we don't need to check and set the START_SS_MASK mask here. + * */ + if ( e != NULL ) + { + task->idx += (e->length - sb->length); + free_string_buffer(sb); + return e; + } - //no match for mix word, try to find a single unit. - if ( chkunits - && ( friso_numeric_string( friso->charset, sb->buffer ) - || friso_decimal_string( friso->charset, sb->buffer ) ) ) - { - idx = task->idx; - if ( ( task->bytes = readNextWord( - friso, task, &idx, task->buffer ) ) != 0 ) { - //check the single chinese units dictionary. - if ( friso_dic_match( friso->dic, - __LEX_CJK_UNITS__, task->buffer ) ) - { - fdunits = 1; - string_buffer_append( sb, task->buffer ); - wlen += task->bytes; - task->idx += task->bytes; - } - } - } + //no match for mix word, try to find a single unit. + if ( chkunits + && ( friso_numeric_string( friso->charset, sb->buffer ) + || friso_decimal_string( friso->charset, sb->buffer ) ) ) + { + idx = task->idx; + if ( ( task->bytes = readNextWord( + friso, task, &idx, task->buffer ) ) != 0 ) { + //check the single chinese units dictionary. + if ( friso_dic_match( friso->dic, + __LEX_CJK_UNITS__, task->buffer ) ) + { + fdunits = 1; + string_buffer_append( sb, task->buffer ); + wlen += task->bytes; + task->idx += task->bytes; + } + } + } - //set the START_SS_MASK. - if ( fdunits != 1 && ssseg ) task_ssseg_open(task); + //set the START_SS_MASK. + if ( fdunits != 1 && ssseg ) task_ssseg_open(task); - //create the lexicon entry and return it. - blen = sb->length; - e = new_lex_entry( string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__ ); - e->rlen = wlen; + //create the lexicon entry and return it. + blen = sb->length; + e = new_lex_entry( string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__ ); + e->rlen = wlen; - return e; + return e; } /* }}} */ @@ -742,174 +742,174 @@ __STATIC_API__ lex_entry_t next_basic_latin( //mmseg algorithm implemented functions - start /* {{{ get the next match from the current position, - * throught the dictionary this will return all the matchs. + * throught the dictionary this will return all the matchs. * * @return friso_array_t that contains all the matchs. */ __STATIC_API__ friso_array_t get_next_match( - friso_t friso, - friso_config_t config, - friso_task_t task, - uint_t idx ) + friso_t friso, + friso_config_t config, + friso_task_t task, + uint_t idx ) { - register uint_t t; - string_buffer_t sb = - new_string_buffer_with_string( task->buffer ); + register uint_t t; + string_buffer_t sb = + new_string_buffer_with_string( task->buffer ); - //create a match dynamic array. - friso_array_t match = - new_array_list_with_opacity( config->max_len ); - array_list_add( match, friso_dic_get( - friso->dic, __LEX_CJK_WORDS__, task->buffer ) ); + //create a match dynamic array. + friso_array_t match = + new_array_list_with_opacity( config->max_len ); + array_list_add( match, friso_dic_get( + friso->dic, __LEX_CJK_WORDS__, task->buffer ) ); - for ( t = 1; t < config->max_len && ( task->bytes = - readNextWord( friso, task, &idx, task->buffer ) ) != 0; t++ ) - { - if ( friso_whitespace( friso->charset, task ) ) break; - if ( ! friso_cn_string( friso->charset, task ) ) break; + for ( t = 1; t < config->max_len && ( task->bytes = + readNextWord( friso, task, &idx, task->buffer ) ) != 0; t++ ) + { + if ( friso_whitespace( friso->charset, task ) ) break; + if ( ! friso_cn_string( friso->charset, task ) ) break; - //append the task->buffer to the buffer. - string_buffer_append( sb, task->buffer ); + //append the task->buffer to the buffer. + string_buffer_append( sb, task->buffer ); - //check the CJK dictionary. - if ( friso_dic_match( friso->dic, - __LEX_CJK_WORDS__, sb->buffer ) ) { - /* - * add the lex_entry_t insite. - * here is a key point: - * we use friso_dic_get function - * to get the address of the lex_entry_cdt - * that store in the dictionary, - * not create a new lex_entry_cdt. - * so : - * 1.we will not bother to the allocations of - * the newly created lex_entry_cdt. - * 2.more efficient of course. - */ - array_list_add( match, friso_dic_get( - friso->dic, __LEX_CJK_WORDS__, sb->buffer ) ); - } - } + //check the CJK dictionary. + if ( friso_dic_match( friso->dic, + __LEX_CJK_WORDS__, sb->buffer ) ) { + /* + * add the lex_entry_t insite. + * here is a key point: + * we use friso_dic_get function + * to get the address of the lex_entry_cdt + * that store in the dictionary, + * not create a new lex_entry_cdt. + * so : + * 1.we will not bother to the allocations of + * the newly created lex_entry_cdt. + * 2.more efficient of course. + */ + array_list_add( match, friso_dic_get( + friso->dic, __LEX_CJK_WORDS__, sb->buffer ) ); + } + } - /*buffer allocations clear*/ - free_string_buffer( sb ); - //array_list_trim( match ); + /*buffer allocations clear*/ + free_string_buffer( sb ); + //array_list_trim( match ); - return match; + return match; } /* }}} */ /* {{{ chunk for mmseg defines and functions to handle them.*/ typedef struct { - friso_array_t words; - uint_t length; - float average_word_length; - float word_length_variance; - float single_word_dmf; + friso_array_t words; + uint_t length; + float average_word_length; + float word_length_variance; + float single_word_dmf; } friso_chunk_entry; typedef friso_chunk_entry * friso_chunk_t; /* }}} */ /* {{{ create a new chunks*/ __STATIC_API__ friso_chunk_t new_chunk( - friso_array_t words, uint_t length ) + friso_array_t words, uint_t length ) { - friso_chunk_t chunk = ( friso_chunk_t ) - FRISO_MALLOC( sizeof( friso_chunk_entry ) ); - if ( chunk == NULL ) { - ___ALLOCATION_ERROR___ - } + friso_chunk_t chunk = ( friso_chunk_t ) + FRISO_MALLOC( sizeof( friso_chunk_entry ) ); + if ( chunk == NULL ) { + ___ALLOCATION_ERROR___ + } - chunk->words = words; - chunk->length = length; - chunk->average_word_length = -1; - chunk->word_length_variance = -1; - chunk->single_word_dmf = -1; + chunk->words = words; + chunk->length = length; + chunk->average_word_length = -1; + chunk->word_length_variance = -1; + chunk->single_word_dmf = -1; - return chunk; + return chunk; } /* }}} */ /* {{{ free the specified chunk */ __STATIC_API__ void free_chunk( friso_chunk_t chunk ) { - FRISO_FREE( chunk ); + FRISO_FREE( chunk ); } /* }}} */ /* {{{ a static function to count the average word length - * of the given chunk. + * of the given chunk. */ __STATIC_API__ float count_chunk_avl( friso_chunk_t chunk ) { - chunk->average_word_length = - ((float) chunk->length) / chunk->words->length; - return chunk->average_word_length; + chunk->average_word_length = + ((float) chunk->length) / chunk->words->length; + return chunk->average_word_length; } /* }}} */ /* {{{ a static function to count the word length variance - * of the given chunk. + * of the given chunk. */ __STATIC_API__ float count_chunk_var( friso_chunk_t chunk ) { - float var = 0, tmp = 0; //snapshot - register uint_t t; - lex_entry_t e; + float var = 0, tmp = 0; //snapshot + register uint_t t; + lex_entry_t e; - for ( t = 0; t < chunk->words->length; t++ ) { - e = ( lex_entry_t ) chunk->words->items[t]; - tmp = e->length - chunk->average_word_length; - var += tmp * tmp; - } + for ( t = 0; t < chunk->words->length; t++ ) { + e = ( lex_entry_t ) chunk->words->items[t]; + tmp = e->length - chunk->average_word_length; + var += tmp * tmp; + } - chunk->word_length_variance = var / chunk->words->length; + chunk->word_length_variance = var / chunk->words->length; - return chunk->word_length_variance; + return chunk->word_length_variance; } /* }}} */ /* {{{ a static function to count the single word morpheme degree of freedom - * of the given chunk. + * of the given chunk. */ __STATIC_API__ float count_chunk_mdf( friso_chunk_t chunk ) { - float __mdf__ = 0; - register uint_t t; - lex_entry_t e; + float __mdf__ = 0; + register uint_t t; + lex_entry_t e; - for ( t = 0; t < chunk->words->length; t++ ) { - e = ( lex_entry_t ) chunk->words->items[t]; - //single CJK(UTF-8)/chinese(GBK) word. - //better add a charset check here, but this will works find. - //all CJK words will take 3 bytes with UTF-8 encoding. - //all chinese words take 2 bytes with GBK encoding. - if ( e->length == 3 || e->length == 2 ) { - __mdf__ += (float) log( (float)e->fre); - } - } - chunk->single_word_dmf = __mdf__; + for ( t = 0; t < chunk->words->length; t++ ) { + e = ( lex_entry_t ) chunk->words->items[t]; + //single CJK(UTF-8)/chinese(GBK) word. + //better add a charset check here, but this will works find. + //all CJK words will take 3 bytes with UTF-8 encoding. + //all chinese words take 2 bytes with GBK encoding. + if ( e->length == 3 || e->length == 2 ) { + __mdf__ += (float) log( (float)e->fre); + } + } + chunk->single_word_dmf = __mdf__; - return chunk->single_word_dmf; + return chunk->single_word_dmf; } /* }}} */ /* {{{ chunk printer - use for for debug*/ -#define ___CHUNK_PRINTER___( _chunks_ ) \ - for ( t = 0; t < _chunks_->length; t++ ) { \ - __tmp__ = (( friso_chunk_t ) _chunks_->items[t])->words; \ - for ( j = 0; j < __tmp__->length; j++ ) { \ - printf("%s/ ", ( ( lex_entry_t ) __tmp__->items[j] )->word ); \ - } \ - putchar('\n'); \ - } \ -putchar('\n'); \ +#define ___CHUNK_PRINTER___( _chunks_ ) \ + for ( t = 0; t < _chunks_->length; t++ ) { \ + __tmp__ = (( friso_chunk_t ) _chunks_->items[t])->words; \ + for ( j = 0; j < __tmp__->length; j++ ) { \ + printf("%s/ ", ( ( lex_entry_t ) __tmp__->items[j] )->word ); \ + } \ + putchar('\n'); \ + } \ +putchar('\n'); \ /* }}} */ /* {{{ mmseg algorithm core invoke * here, * we use four rules to filter all the chunks to get the best chunk. - * and this is the core of the mmseg alogrithm. + * and this is the core of the mmseg alogrithm. * 1. maximum match word length. * 2. larget average word length. * 3. smallest word length variance. @@ -917,291 +917,291 @@ putchar('\n'); \ */ __STATIC_API__ friso_chunk_t mmseg_core_invoke( friso_array_t chunks ) { - register uint_t t/*, j*/; - float max; - friso_chunk_t e; - friso_array_t __res__, __tmp__; - __res__ = new_array_list_with_opacity( chunks->length ); + register uint_t t/*, j*/; + float max; + friso_chunk_t e; + friso_array_t __res__, __tmp__; + __res__ = new_array_list_with_opacity( chunks->length ); - //1.get the maximum matched chunks. - //count the maximum length - max = ( float ) ( ( friso_chunk_t ) chunks->items[0] )->length; - for ( t = 1; t < chunks->length; t++ ) { - e = ( friso_chunk_t ) chunks->items[t]; - if ( e->length > max ) - max = ( float ) e->length; - } - //get the chunk items that owns the maximum length. - for ( t = 0; t < chunks->length; t++ ) { - e = ( friso_chunk_t ) chunks->items[t]; - if ( e->length >= max ) { - array_list_add( __res__, e ); - } else { - free_array_list( e->words ); - free_chunk( e ); - } - } - //check the left chunks - if ( __res__->length == 1 ) { - e = ( friso_chunk_t ) __res__->items[0]; - free_array_list( __res__ ); - free_array_list( chunks ); - return e; - } else { - __tmp__ = array_list_clear( chunks ); - chunks = __res__; - __res__ = __tmp__; - } + //1.get the maximum matched chunks. + //count the maximum length + max = ( float ) ( ( friso_chunk_t ) chunks->items[0] )->length; + for ( t = 1; t < chunks->length; t++ ) { + e = ( friso_chunk_t ) chunks->items[t]; + if ( e->length > max ) + max = ( float ) e->length; + } + //get the chunk items that owns the maximum length. + for ( t = 0; t < chunks->length; t++ ) { + e = ( friso_chunk_t ) chunks->items[t]; + if ( e->length >= max ) { + array_list_add( __res__, e ); + } else { + free_array_list( e->words ); + free_chunk( e ); + } + } + //check the left chunks + if ( __res__->length == 1 ) { + e = ( friso_chunk_t ) __res__->items[0]; + free_array_list( __res__ ); + free_array_list( chunks ); + return e; + } else { + __tmp__ = array_list_clear( chunks ); + chunks = __res__; + __res__ = __tmp__; + } - //2.get the largest average word length chunks. - //count the maximum average word length. - max = count_chunk_avl( ( friso_chunk_t ) chunks->items[0] ); - for ( t = 1; t < chunks->length; t++ ) { - e = ( friso_chunk_t ) chunks->items[t]; - if ( count_chunk_avl( e ) > max ) { - max = e->average_word_length; - } - } - //get the chunks items that own the largest average word length. - for ( t = 0; t < chunks->length; t++ ) { - e = ( friso_chunk_t ) chunks->items[t]; - if ( e->average_word_length >= max ) { - array_list_add( __res__, e ); - } else { - free_array_list( e->words ); - free_chunk( e ); - } - } - //check the left chunks - if ( __res__->length == 1 ) { - e = ( friso_chunk_t ) __res__->items[0]; - free_array_list( __res__); - free_array_list( chunks ); - return e; - } else { - __tmp__ = array_list_clear( chunks ); - chunks = __res__; - __res__ = __tmp__; - } + //2.get the largest average word length chunks. + //count the maximum average word length. + max = count_chunk_avl( ( friso_chunk_t ) chunks->items[0] ); + for ( t = 1; t < chunks->length; t++ ) { + e = ( friso_chunk_t ) chunks->items[t]; + if ( count_chunk_avl( e ) > max ) { + max = e->average_word_length; + } + } + //get the chunks items that own the largest average word length. + for ( t = 0; t < chunks->length; t++ ) { + e = ( friso_chunk_t ) chunks->items[t]; + if ( e->average_word_length >= max ) { + array_list_add( __res__, e ); + } else { + free_array_list( e->words ); + free_chunk( e ); + } + } + //check the left chunks + if ( __res__->length == 1 ) { + e = ( friso_chunk_t ) __res__->items[0]; + free_array_list( __res__); + free_array_list( chunks ); + return e; + } else { + __tmp__ = array_list_clear( chunks ); + chunks = __res__; + __res__ = __tmp__; + } - //3.get the smallest word length variance chunks - //count the smallest word length variance - max = count_chunk_var( ( friso_chunk_t ) chunks->items[0] ); - for ( t = 1; t < chunks->length; t++ ) { - e = ( friso_chunk_t ) chunks->items[t]; - if ( count_chunk_var( e ) < max ) { - max = e->word_length_variance; - } - } - //get the chunks that own the smallest word length variance. - for ( t = 0; t < chunks->length; t++ ) { - e = ( friso_chunk_t ) chunks->items[t]; - if ( e->word_length_variance <= max ) { - array_list_add( __res__, e ); - } else { - free_array_list( e->words ); - free_chunk( e ); - } - } - //check the left chunks - if ( __res__->length == 1 ) { - e = ( friso_chunk_t ) __res__->items[0]; - free_array_list( chunks ); - free_array_list( __res__ ); - return e; - } else { - __tmp__ = array_list_clear( chunks ); - chunks = __res__; - __res__ = __tmp__; - } + //3.get the smallest word length variance chunks + //count the smallest word length variance + max = count_chunk_var( ( friso_chunk_t ) chunks->items[0] ); + for ( t = 1; t < chunks->length; t++ ) { + e = ( friso_chunk_t ) chunks->items[t]; + if ( count_chunk_var( e ) < max ) { + max = e->word_length_variance; + } + } + //get the chunks that own the smallest word length variance. + for ( t = 0; t < chunks->length; t++ ) { + e = ( friso_chunk_t ) chunks->items[t]; + if ( e->word_length_variance <= max ) { + array_list_add( __res__, e ); + } else { + free_array_list( e->words ); + free_chunk( e ); + } + } + //check the left chunks + if ( __res__->length == 1 ) { + e = ( friso_chunk_t ) __res__->items[0]; + free_array_list( chunks ); + free_array_list( __res__ ); + return e; + } else { + __tmp__ = array_list_clear( chunks ); + chunks = __res__; + __res__ = __tmp__; + } - //4.get the largest single word morpheme degrees of freedom. - //count the maximum single word morpheme degreees of freedom - max = count_chunk_mdf( ( friso_chunk_t ) chunks->items[0] ); - for ( t = 1; t < chunks->length; t++ ) { - e = ( friso_chunk_t ) chunks->items[t]; - if ( count_chunk_mdf( e ) > max ) { - max = e->single_word_dmf; - } - } - //get the chunks that own the largest single word word morpheme degrees of freedom. - for ( t = 0; t < chunks->length; t++ ) { - e = ( friso_chunk_t ) chunks->items[t]; - if ( e->single_word_dmf >= max ) { - array_list_add( __res__, e ); - } else { - free_array_list( e->words ); - free_chunk( e ); - } - } + //4.get the largest single word morpheme degrees of freedom. + //count the maximum single word morpheme degreees of freedom + max = count_chunk_mdf( ( friso_chunk_t ) chunks->items[0] ); + for ( t = 1; t < chunks->length; t++ ) { + e = ( friso_chunk_t ) chunks->items[t]; + if ( count_chunk_mdf( e ) > max ) { + max = e->single_word_dmf; + } + } + //get the chunks that own the largest single word word morpheme degrees of freedom. + for ( t = 0; t < chunks->length; t++ ) { + e = ( friso_chunk_t ) chunks->items[t]; + if ( e->single_word_dmf >= max ) { + array_list_add( __res__, e ); + } else { + free_array_list( e->words ); + free_chunk( e ); + } + } - /* - * there is still more than one chunks? - * well, this rarely happen but still happens. - * here we simple return the first chunk as the final result, - * and we need to free the all the chunks that __res__ - * points to except the 1th one. - * you have to do two things to totaly free a chunk: - * 1. call free_array_list to free the allocations of a chunk's words. - * 2. call free_chunk to the free the allocations of a chunk. - */ - for ( t = 1; t < __res__->length; t++ ) { - e = ( friso_chunk_t ) __res__->items[t]; - free_array_list( e->words ); - free_chunk( e ); - } + /* + * there is still more than one chunks? + * well, this rarely happen but still happens. + * here we simple return the first chunk as the final result, + * and we need to free the all the chunks that __res__ + * points to except the 1th one. + * you have to do two things to totaly free a chunk: + * 1. call free_array_list to free the allocations of a chunk's words. + * 2. call free_chunk to the free the allocations of a chunk. + */ + for ( t = 1; t < __res__->length; t++ ) { + e = ( friso_chunk_t ) __res__->items[t]; + free_array_list( e->words ); + free_chunk( e ); + } - e = ( friso_chunk_t ) __res__->items[0]; - free_array_list( chunks ); - free_array_list( __res__ ); + e = ( friso_chunk_t ) __res__->items[0]; + free_array_list( chunks ); + free_array_list( __res__ ); - return e; + return e; } /* }}} */ /* {{{ get the next cjk word from the current position with complex mode. - * this is the core of the mmseg chinese word segemetation algorithm. - * we use four rules to filter the matched chunks and get the best one - * as the final result. + * this is the core of the mmseg chinese word segemetation algorithm. + * we use four rules to filter the matched chunks and get the best one + * as the final result. * * @see mmseg_core_invoke( chunks ); */ FRISO_API lex_entry_t next_complex_cjk( - friso_t friso, - friso_config_t config, - friso_task_t task ) + friso_t friso, + friso_config_t config, + friso_task_t task ) { - register uint_t x, y, z; - /*bakup the task->bytes here*/ - uint_t __idx__ = task->bytes; - lex_entry_t fe, se, te; - friso_chunk_t e; - friso_array_t words, chunks; - friso_array_t smatch, tmatch, fmatch = - get_next_match( friso, config, task, task->idx ); + register uint_t x, y, z; + /*bakup the task->bytes here*/ + uint_t __idx__ = task->bytes; + lex_entry_t fe, se, te; + friso_chunk_t e; + friso_array_t words, chunks; + friso_array_t smatch, tmatch, fmatch = + get_next_match( friso, config, task, task->idx ); - /* - * here: - * if the length of the fmatch is 1, mean we don't have to - * continue the following work. ( no matter what we get the same result. ) - */ - if ( fmatch->length == 1 ) - { - fe = ( ( lex_entry_t ) fmatch->items[0] ); - free_array_list( fmatch ); + /* + * here: + * if the length of the fmatch is 1, mean we don't have to + * continue the following work. ( no matter what we get the same result. ) + */ + if ( fmatch->length == 1 ) + { + fe = ( ( lex_entry_t ) fmatch->items[0] ); + free_array_list( fmatch ); - /* - * check and clear the stop words . - * @date 2013-06-13 - */ - if ( config->clr_stw && - friso_dic_match( friso->dic, - __LEX_STOPWORDS__, fe->word ) ) { - return NULL; - } + /* + * check and clear the stop words . + * @date 2013-06-13 + */ + if ( config->clr_stw && + friso_dic_match( friso->dic, + __LEX_STOPWORDS__, fe->word ) ) { + return NULL; + } - return fe; - } + return fe; + } - chunks = new_array_list(); - task->idx -= __idx__; + chunks = new_array_list(); + task->idx -= __idx__; - for ( x = 0; x < fmatch->length; x++ ) - { - /*get the word and try the second layer match*/ - fe = ( lex_entry_t ) array_list_get( fmatch, x ); - __idx__ = task->idx + fe->length; - readNextWord( friso, task, &__idx__, task->buffer ); + for ( x = 0; x < fmatch->length; x++ ) + { + /*get the word and try the second layer match*/ + fe = ( lex_entry_t ) array_list_get( fmatch, x ); + __idx__ = task->idx + fe->length; + readNextWord( friso, task, &__idx__, task->buffer ); - if ( task->bytes != 0 - && friso_cn_string( friso->charset, task ) - && friso_dic_match( friso->dic, - __LEX_CJK_WORDS__, task->buffer ) ) - { - //get the next matchs - smatch = get_next_match( friso, config, task, __idx__ ); - for ( y = 0; y < smatch->length; y++ ) - { - /*get the word and try the third layer match*/ - se = ( lex_entry_t ) array_list_get( smatch, y ); - __idx__ = task->idx + fe->length + se->length; - readNextWord( friso, task, &__idx__, task->buffer ); + if ( task->bytes != 0 + && friso_cn_string( friso->charset, task ) + && friso_dic_match( friso->dic, + __LEX_CJK_WORDS__, task->buffer ) ) + { + //get the next matchs + smatch = get_next_match( friso, config, task, __idx__ ); + for ( y = 0; y < smatch->length; y++ ) + { + /*get the word and try the third layer match*/ + se = ( lex_entry_t ) array_list_get( smatch, y ); + __idx__ = task->idx + fe->length + se->length; + readNextWord( friso, task, &__idx__, task->buffer ); - if ( task->bytes != 0 - && friso_cn_string( friso->charset, task ) - && friso_dic_match( friso->dic, - __LEX_CJK_WORDS__, task->buffer ) ) - { - //get the matchs. - tmatch = get_next_match( friso, config, task, __idx__ ); - for ( z = 0; z < tmatch->length; z++ ) - { - te = ( lex_entry_t ) array_list_get( tmatch, z ); - words = new_array_list_with_opacity(3); - array_list_add( words, fe ); - array_list_add( words, se ); - array_list_add( words, te ); - array_list_add( chunks, new_chunk( words, - fe->length + se->length + te->length ) ); - } - //free the third matched array list - free_array_list( tmatch ); - } - else - { - words = new_array_list_with_opacity(2); - array_list_add( words, fe ); - array_list_add( words, se ); - //add the chunk - array_list_add( chunks, - new_chunk( words, fe->length + se->length ) ); - } - } - //free the second match array list - free_array_list( smatch ); - } - else - { - words = new_array_list_with_opacity(1); - array_list_add( words, fe ); - array_list_add( chunks, new_chunk( words, fe->length ) ); - } - } - //free the first match array list - free_array_list( fmatch ); + if ( task->bytes != 0 + && friso_cn_string( friso->charset, task ) + && friso_dic_match( friso->dic, + __LEX_CJK_WORDS__, task->buffer ) ) + { + //get the matchs. + tmatch = get_next_match( friso, config, task, __idx__ ); + for ( z = 0; z < tmatch->length; z++ ) + { + te = ( lex_entry_t ) array_list_get( tmatch, z ); + words = new_array_list_with_opacity(3); + array_list_add( words, fe ); + array_list_add( words, se ); + array_list_add( words, te ); + array_list_add( chunks, new_chunk( words, + fe->length + se->length + te->length ) ); + } + //free the third matched array list + free_array_list( tmatch ); + } + else + { + words = new_array_list_with_opacity(2); + array_list_add( words, fe ); + array_list_add( words, se ); + //add the chunk + array_list_add( chunks, + new_chunk( words, fe->length + se->length ) ); + } + } + //free the second match array list + free_array_list( smatch ); + } + else + { + words = new_array_list_with_opacity(1); + array_list_add( words, fe ); + array_list_add( chunks, new_chunk( words, fe->length ) ); + } + } + //free the first match array list + free_array_list( fmatch ); - /* - * filter the chunks with the four rules of the mmseg algorithm - * and get best chunk as the final result. - * - * @see mmseg_core_invoke( chunks ); - * @date 2012-12-13 - */ - if ( chunks->length > 1 ) { - e = mmseg_core_invoke( chunks ); - } else { - e = ( friso_chunk_t ) chunks->items[0]; - } + /* + * filter the chunks with the four rules of the mmseg algorithm + * and get best chunk as the final result. + * + * @see mmseg_core_invoke( chunks ); + * @date 2012-12-13 + */ + if ( chunks->length > 1 ) { + e = mmseg_core_invoke( chunks ); + } else { + e = ( friso_chunk_t ) chunks->items[0]; + } - fe = ( lex_entry_t ) e->words->items[0]; - task->idx += fe->length; //reset the idx of the task. - free_array_list(e->words); //free the chunks words allocation - free_chunk( e ); + fe = ( lex_entry_t ) e->words->items[0]; + task->idx += fe->length; //reset the idx of the task. + free_array_list(e->words); //free the chunks words allocation + free_chunk( e ); - //clear the stop words - if ( config->clr_stw && - friso_dic_match( friso->dic, - __LEX_STOPWORDS__, fe->word ) ) { - return NULL; - } + //clear the stop words + if ( config->clr_stw && + friso_dic_match( friso->dic, + __LEX_STOPWORDS__, fe->word ) ) { + return NULL; + } - return fe; + return fe; } /* }}} */ //----------------end of mmseg core @@ -1210,652 +1210,652 @@ FRISO_API lex_entry_t next_complex_cjk( //------------------------------------------------------------------------------------- //mmseg core logic controller, output style controller and macro defines /* {{{ A macro function to check and free - * the lex_entry_t with type of __LEX_OTHER_WORDS__. + * the lex_entry_t with type of __LEX_OTHER_WORDS__. */ #define check_free_otlex_entry( lex ) \ - do { \ - if ( lex->type == __LEX_OTHER_WORDS__ ) { \ - FRISO_FREE( lex->word ); \ - free_lex_entry( lex ); \ - }\ - } while (0) + do { \ + if ( lex->type == __LEX_OTHER_WORDS__ ) { \ + FRISO_FREE( lex->word ); \ + free_lex_entry( lex ); \ + }\ + } while (0) /* }}} */ /* {{{ sphinx style output synonyms words append. * - * @param task - * @param lex + * @param task + * @param lex * */ __STATIC_API__ void token_sphinx_output( - friso_task_t task, - lex_entry_t lex ) + friso_task_t task, + lex_entry_t lex ) { - uint_t i, j, len; - fstring _word; - len = lex->length; + uint_t i, j, len; + fstring _word; + len = lex->length; - //append the synoyums words. - for ( i = 0; i < lex->syn->length; i++ ) - { - _word = ( fstring ) lex->syn->items[i]; - j = strlen(_word); - if ( ( len + j + 1 ) >= __HITS_WORD_LENGTH__ ) break; - memcpy(task->token->word + len, "|", 1); - len += 1; - memcpy(task->token->word + len, _word, j); - len += j; - } + //append the synoyums words. + for ( i = 0; i < lex->syn->length; i++ ) + { + _word = ( fstring ) lex->syn->items[i]; + j = strlen(_word); + if ( ( len + j + 1 ) >= __HITS_WORD_LENGTH__ ) break; + memcpy(task->token->word + len, "|", 1); + len += 1; + memcpy(task->token->word + len, _word, j); + len += j; + } - //set the new end of the buffer. - task->token->word[len] = '\0'; + //set the new end of the buffer. + task->token->word[len] = '\0'; } /* }}} */ /* {{{ normal style output synonyms words append. * - * @param task - * @param lex - * @param front 1 for add the synoyum words from the head and - * 0 for append from the tail. + * @param task + * @param lex + * @param front 1 for add the synoyum words from the head and + * 0 for append from the tail. * */ __STATIC_API__ void token_normal_output( - friso_task_t task, - lex_entry_t lex, - int front ) + friso_task_t task, + lex_entry_t lex, + int front ) { - uint_t i; - fstring _word; - lex_entry_t e; + uint_t i; + fstring _word; + lex_entry_t e; - for ( i = 0; i < lex->syn->length; i++ ) - { - _word = ( fstring ) lex->syn->items[i]; - e = new_lex_entry( _word, NULL, 0, - strlen(_word), __LEX_NCSYN_WORDS__ ); - e->offset = lex->offset; - //add to the buffer. - if ( front ) - link_list_add_first( task->pool, e ); - else link_list_add( task->pool, e); - } + for ( i = 0; i < lex->syn->length; i++ ) + { + _word = ( fstring ) lex->syn->items[i]; + e = new_lex_entry( _word, NULL, 0, + strlen(_word), __LEX_NCSYN_WORDS__ ); + e->offset = lex->offset; + //add to the buffer. + if ( front ) + link_list_add_first( task->pool, e ); + else link_list_add( task->pool, e); + } } /* }}} */ /* {{{ do the secondary segmentation of the complex english token. * - * @param friso - * @param config - * @param task - * @param lex - * @param retfw -Wether to return the first word. - * @return lex_entry_t(NULL or the first sub token of the lex) + * @param friso + * @param config + * @param task + * @param lex + * @param retfw -Wether to return the first word. + * @return lex_entry_t(NULL or the first sub token of the lex) */ __STATIC_API__ lex_entry_t en_second_seg( - friso_t friso, - friso_config_t config, - friso_task_t task, - lex_entry_t lex, int retfw ) + friso_t friso, + friso_config_t config, + friso_task_t task, + lex_entry_t lex, int retfw ) { - //printf("sseg: %d\n", (task->ctrlMask & START_SS_MASK)); + //printf("sseg: %d\n", (task->ctrlMask & START_SS_MASK)); - int j, p = 0, start = 0; - fstring str = lex->word; + int j, p = 0, start = 0; + fstring str = lex->word; - lex_entry_t fword = NULL, sword = NULL; + lex_entry_t fword = NULL, sword = NULL; - int _ctype, _TYPE = get_enchar_type(str[0]); - string_buffer_clear(task->sbuf); - string_buffer_append_char(task->sbuf, str[0]); + int _ctype, _TYPE = get_enchar_type(str[0]); + string_buffer_clear(task->sbuf); + string_buffer_append_char(task->sbuf, str[0]); - for ( j = 1; j < lex->length; j++ ) - { - //get the type of the char - _ctype = get_enchar_type(str[j]); - if ( _ctype == FRISO_EN_WHITESPACE ) - { - _TYPE = FRISO_EN_WHITESPACE; - p++; - continue; - } + for ( j = 1; j < lex->length; j++ ) + { + //get the type of the char + _ctype = get_enchar_type(str[j]); + if ( _ctype == FRISO_EN_WHITESPACE ) + { + _TYPE = FRISO_EN_WHITESPACE; + p++; + continue; + } - if ( _ctype == _TYPE ) - string_buffer_append_char(task->sbuf, str[j]); - else - { - start = j - task->sbuf->length - p; + if ( _ctype == _TYPE ) + string_buffer_append_char(task->sbuf, str[j]); + else + { + start = j - task->sbuf->length - p; - /* If the number of chars of current type - * is larger than config->st_minl then we will - * create a new lex_entry_t and append it to the task->wordPool. - * */ - if ( task->sbuf->length >= config->st_minl - && ! ( config->clr_stw && friso_dic_match( friso->dic, - __LEX_STOPWORDS__, task->sbuf->buffer ) ) ) - { - /* the allocation of lex_entry_t and its word - * should be released and the type of the lex_entry_t - * must be __LEX_OTHER_WORDS__. - * */ - sword = new_lex_entry(strdup(task->sbuf->buffer), - NULL, 0, task->sbuf->length, __LEX_OTHER_WORDS__); - sword->offset = lex->offset + start; - if ( retfw && fword == NULL ) fword = sword; - else link_list_add(task->pool, sword); - } + /* If the number of chars of current type + * is larger than config->st_minl then we will + * create a new lex_entry_t and append it to the task->wordPool. + * */ + if ( task->sbuf->length >= config->st_minl + && ! ( config->clr_stw && friso_dic_match( friso->dic, + __LEX_STOPWORDS__, task->sbuf->buffer ) ) ) + { + /* the allocation of lex_entry_t and its word + * should be released and the type of the lex_entry_t + * must be __LEX_OTHER_WORDS__. + * */ + sword = new_lex_entry(strdup(task->sbuf->buffer), + NULL, 0, task->sbuf->length, __LEX_OTHER_WORDS__); + sword->offset = lex->offset + start; + if ( retfw && fword == NULL ) fword = sword; + else link_list_add(task->pool, sword); + } - string_buffer_clear(task->sbuf); - string_buffer_append_char(task->sbuf, str[j]); - p = 0; - _TYPE = _ctype; - } - } + string_buffer_clear(task->sbuf); + string_buffer_append_char(task->sbuf, str[j]); + p = 0; + _TYPE = _ctype; + } + } - //continue to check the last item. - if ( task->sbuf->length >= config->st_minl - && ! ( config->clr_stw && friso_dic_match( friso->dic, - __LEX_STOPWORDS__, task->sbuf->buffer ) ) ) - { - start = j - task->sbuf->length; - sword = new_lex_entry(strdup(task->sbuf->buffer), - NULL, 0, task->sbuf->length, __LEX_OTHER_WORDS__); - sword->offset = j - task->sbuf->length; - if ( retfw && fword == NULL ) fword = sword; - else link_list_add(task->pool, sword); - } + //continue to check the last item. + if ( task->sbuf->length >= config->st_minl + && ! ( config->clr_stw && friso_dic_match( friso->dic, + __LEX_STOPWORDS__, task->sbuf->buffer ) ) ) + { + start = j - task->sbuf->length; + sword = new_lex_entry(strdup(task->sbuf->buffer), + NULL, 0, task->sbuf->length, __LEX_OTHER_WORDS__); + sword->offset = j - task->sbuf->length; + if ( retfw && fword == NULL ) fword = sword; + else link_list_add(task->pool, sword); + } - return fword; + return fword; } /*}}}*/ /* {{{ english synoyums words check and append macro define.*/ #define append_en_syn( lex, tmp, front )\ - do {\ - if ( ( tmp = friso_dic_get(friso->dic, \ - __LEX_EN_WORDS__, lex->word) ) != NULL \ - && (tmp->syn) != NULL ) \ - {\ - if ( config->spx_out == 1 ) \ - token_sphinx_output(task, tmp); \ - else \ - {\ - tmp->offset = lex->offset; \ - token_normal_output(task, tmp, front); \ - }\ - }\ - } while (0) + do {\ + if ( ( tmp = friso_dic_get(friso->dic, \ + __LEX_EN_WORDS__, lex->word) ) != NULL \ + && (tmp->syn) != NULL ) \ + {\ + if ( config->spx_out == 1 ) \ + token_sphinx_output(task, tmp); \ + else \ + {\ + tmp->offset = lex->offset; \ + token_normal_output(task, tmp, front); \ + }\ + }\ + } while (0) /* }}} */ /* {{{ get the next segmentation. - * and also this is the friso enterface function. + * and also this is the friso enterface function. * - * @param friso. - * @param config. - * @return task. + * @param friso. + * @param config. + * @return task. */ FRISO_API friso_token_t next_mmseg_token( - friso_t friso, - friso_config_t config, - friso_task_t task ) + friso_t friso, + friso_config_t config, + friso_task_t task ) { - uint_t j, len = 0; - string_buffer_t sb = NULL; - lex_entry_t lex = NULL, tmp = NULL, sword = NULL; + uint_t j, len = 0; + string_buffer_t sb = NULL; + lex_entry_t lex = NULL, tmp = NULL, sword = NULL; - /* {{{ task word pool check */ - if ( ! link_list_empty( task->pool ) ) { - /* - * load word from the word poll if it is not empty. - * this will make the next word more convenient and efficient. - * often synonyms, newly created word will be stored in the poll. - */ - lex = ( lex_entry_t ) link_list_remove_first( task->pool ); - memcpy(task->token->word, lex->word, lex->length); - task->token->type = lex->type; - task->token->length = lex->length; - task->token->rlen = lex->rlen; - task->token->offset = lex->offset; - task->token->word[lex->length] = '\0'; + /* {{{ task word pool check */ + if ( ! link_list_empty( task->pool ) ) { + /* + * load word from the word poll if it is not empty. + * this will make the next word more convenient and efficient. + * often synonyms, newly created word will be stored in the poll. + */ + lex = ( lex_entry_t ) link_list_remove_first( task->pool ); + memcpy(task->token->word, lex->word, lex->length); + task->token->type = lex->type; + task->token->length = lex->length; + task->token->rlen = lex->rlen; + task->token->offset = lex->offset; + task->token->word[lex->length] = '\0'; - /* check and handle the english synonyms words append mask. - * Also we have to close the mask after finish the operation. - * - * 1. we've check the config->add_syn before open the - * _LEX_APPENSYN_MASK mask. - * 2. we should add the synonyms words of the curren - * lex_entry_t from the head. - * - * @since: 1.6.0 - * */ - if ( lex_appensyn_check(lex) ) - { - lex_appensyn_close(lex); - append_en_syn(lex, tmp, 1); - } + /* check and handle the english synonyms words append mask. + * Also we have to close the mask after finish the operation. + * + * 1. we've check the config->add_syn before open the + * _LEX_APPENSYN_MASK mask. + * 2. we should add the synonyms words of the curren + * lex_entry_t from the head. + * + * @since: 1.6.0 + * */ + if ( lex_appensyn_check(lex) ) + { + lex_appensyn_close(lex); + append_en_syn(lex, tmp, 1); + } - /* - * __LEX_NCSYN_WORDS__: - * these lex_entry_t was created to store the the synonyums words. - * and its word pointed to the lex_entry_t's synonyms word of - * friso->dic, so : - * free the lex_entry_t but not its word here. - * - * __LEX_OTHER_WORDS__: - * newly created lexicon entry, like the chinese and english mixed word. - * during the invoke of function next_basic_latin. - * - * other type: - * they must exist in the dictionary, so just pass them. - */ - switch ( lex->type ) - { - case __LEX_OTHER_WORDS__: - FRISO_FREE( lex->word ); - free_lex_entry( lex ); - break; - case __LEX_NCSYN_WORDS__: - free_lex_entry( lex ); - break; - } + /* + * __LEX_NCSYN_WORDS__: + * these lex_entry_t was created to store the the synonyums words. + * and its word pointed to the lex_entry_t's synonyms word of + * friso->dic, so : + * free the lex_entry_t but not its word here. + * + * __LEX_OTHER_WORDS__: + * newly created lexicon entry, like the chinese and english mixed word. + * during the invoke of function next_basic_latin. + * + * other type: + * they must exist in the dictionary, so just pass them. + */ + switch ( lex->type ) + { + case __LEX_OTHER_WORDS__: + FRISO_FREE( lex->word ); + free_lex_entry( lex ); + break; + case __LEX_NCSYN_WORDS__: + free_lex_entry( lex ); + break; + } - return task->token; - } - /* }}} */ + return task->token; + } + /* }}} */ - while ( task->idx < task->length ) - { - //read the next word from the current position. - task->bytes = readNextWord( friso, task, &task->idx, task->buffer ); - if ( task->bytes == 0 ) break; + while ( task->idx < task->length ) + { + //read the next word from the current position. + task->bytes = readNextWord( friso, task, &task->idx, task->buffer ); + if ( task->bytes == 0 ) break; - //clear up the whitespace. - if ( friso_whitespace( friso->charset, task ) ) continue; + //clear up the whitespace. + if ( friso_whitespace( friso->charset, task ) ) continue; - /* {{{ CJK words recongnize block. */ - if ( friso_cn_string( friso->charset, task ) ) - { - /* check the dictionary. - * and return the unrecognized CJK char as a single word. - * */ - if ( ! friso_dic_match( friso->dic, - __LEX_CJK_WORDS__, task->buffer) ) - { - memcpy(task->token->word, task->buffer, task->bytes ); - task->token->type = __LEX_PUNC_WORDS__; - task->token->length = task->bytes; - task->token->rlen = task->bytes; - task->token->offset = task->idx - task->bytes; - task->token->word[(int)task->bytes] = '\0'; - return task->token; - } + /* {{{ CJK words recongnize block. */ + if ( friso_cn_string( friso->charset, task ) ) + { + /* check the dictionary. + * and return the unrecognized CJK char as a single word. + * */ + if ( ! friso_dic_match( friso->dic, + __LEX_CJK_WORDS__, task->buffer) ) + { + memcpy(task->token->word, task->buffer, task->bytes ); + task->token->type = __LEX_PUNC_WORDS__; + task->token->length = task->bytes; + task->token->rlen = task->bytes; + task->token->offset = task->idx - task->bytes; + task->token->word[(int)task->bytes] = '\0'; + return task->token; + } - //specifield mode split. - //if ( config->mode == __FRISO_COMPLEX_MODE__ ) - // lex = next_complex_cjk( friso, config, task ); - //else lex = next_simple_cjk( friso, config, task ); - lex = config->next_cjk(friso, config, task); + //specifield mode split. + //if ( config->mode == __FRISO_COMPLEX_MODE__ ) + // lex = next_complex_cjk( friso, config, task ); + //else lex = next_simple_cjk( friso, config, task ); + lex = config->next_cjk(friso, config, task); - if ( lex == NULL ) continue; //find a stopwrod. - lex->offset = task->idx - lex->rlen; + if ( lex == NULL ) continue; //find a stopwrod. + lex->offset = task->idx - lex->rlen; - /* - * try to find a chinese and english mixed words, like '卡拉ok' - * keep in mind that is not english and chinese mixed words - * like 'x射线'. - * - * @reader: - * 1. only if the char after the current word is an english char. - * 2. if the first point meet, friso will call next_basic_latin() to - * get the next basic latin. (yeah, you have to handle it). - * 3. if match a CE word, set lex to the newly match CE word. - * 4. if no match a CE word, we will have to append the basic latin - * to the pool, and it should after the append of synonyms words. - * 5. do not use the task->buffer and task->unicode as the check - * condition for the CE word identify. - * 6. Add friso_numeric_letter check so can get work like '高3' - * - * @date 2013-09-02 - */ - if ( ( task->idx < task->length ) - && ((int)task->text[task->idx]) > 0 - && ( friso_en_letter( friso->charset, task ) - || friso_numeric_letter(friso->charset, task) ) ) - { - //create a string buffer - sb = new_string_buffer_with_string(lex->word); + /* + * try to find a chinese and english mixed words, like '卡拉ok' + * keep in mind that is not english and chinese mixed words + * like 'x射线'. + * + * @reader: + * 1. only if the char after the current word is an english char. + * 2. if the first point meet, friso will call next_basic_latin() to + * get the next basic latin. (yeah, you have to handle it). + * 3. if match a CE word, set lex to the newly match CE word. + * 4. if no match a CE word, we will have to append the basic latin + * to the pool, and it should after the append of synonyms words. + * 5. do not use the task->buffer and task->unicode as the check + * condition for the CE word identify. + * 6. Add friso_numeric_letter check so can get work like '高3' + * + * @date 2013-09-02 + */ + if ( ( task->idx < task->length ) + && ((int)task->text[task->idx]) > 0 + && ( friso_en_letter( friso->charset, task ) + || friso_numeric_letter(friso->charset, task) ) ) + { + //create a string buffer + sb = new_string_buffer_with_string(lex->word); - //find the next basic latin. - task->buffer[0] = task->text[task->idx++]; - task->buffer[1] = '\0'; - tmp = next_basic_latin(friso, config, task); - tmp->offset = task->idx - tmp->length; - string_buffer_append( sb, tmp->word ); + //find the next basic latin. + task->buffer[0] = task->text[task->idx++]; + task->buffer[1] = '\0'; + tmp = next_basic_latin(friso, config, task); + tmp->offset = task->idx - tmp->length; + string_buffer_append( sb, tmp->word ); - //check the CE dictionary. - if ( friso_dic_match( friso->dic, - __LEX_CEM_WORDS__, sb->buffer ) ) - { - j = lex->offset; //bakup the offset. - lex = friso_dic_get( friso->dic, - __LEX_CEM_WORDS__, sb->buffer ); - lex->offset = j; - check_free_otlex_entry(tmp); - free_string_buffer(sb); - tmp = NULL; sb = NULL; - } - } + //check the CE dictionary. + if ( friso_dic_match( friso->dic, + __LEX_CEM_WORDS__, sb->buffer ) ) + { + j = lex->offset; //bakup the offset. + lex = friso_dic_get( friso->dic, + __LEX_CEM_WORDS__, sb->buffer ); + lex->offset = j; + check_free_otlex_entry(tmp); + free_string_buffer(sb); + tmp = NULL; sb = NULL; + } + } - /* - * copy the lex_entry to the result token - * - * @reader: (boodly lession, added 2013-08-31): - * don't bother to handle the task->token->offset problem. - * is has been sovled perfectly above. - */ - len = (int) lex->length; - memcpy(task->token->word, lex->word, lex->length); - task->token->type = lex->type; - task->token->length = lex->length; - task->token->rlen = lex->rlen; - task->token->offset = lex->offset; - task->token->word[len] = '\0'; + /* + * copy the lex_entry to the result token + * + * @reader: (boodly lession, added 2013-08-31): + * don't bother to handle the task->token->offset problem. + * is has been sovled perfectly above. + */ + len = (int) lex->length; + memcpy(task->token->word, lex->word, lex->length); + task->token->type = lex->type; + task->token->length = lex->length; + task->token->rlen = lex->rlen; + task->token->offset = lex->offset; + task->token->word[len] = '\0'; - //check and append the synonyms words - if ( config->add_syn && lex->syn != NULL ) - { - if ( config->spx_out == 1 ) - token_sphinx_output(task, lex); - else token_normal_output(task, lex, 0); - } + //check and append the synonyms words + if ( config->add_syn && lex->syn != NULL ) + { + if ( config->spx_out == 1 ) + token_sphinx_output(task, lex); + else token_normal_output(task, lex, 0); + } - /* {{{ here: handle the newly found basic latin created when - * we try to find a CE word. - * - * @reader: - * when tmp is not NULL and sb will not be NULL too - * except a CE word is found. - * - * @TODO: finished append the synonyms words on 2013-12-19. - */ - if ( tmp != NULL && sb != NULL ) - { - //check the secondary split. - if ( config->en_sseg == 1 - && task_ssseg_check(task) ) - en_second_seg(friso, config, task, tmp, 0); + /* {{{ here: handle the newly found basic latin created when + * we try to find a CE word. + * + * @reader: + * when tmp is not NULL and sb will not be NULL too + * except a CE word is found. + * + * @TODO: finished append the synonyms words on 2013-12-19. + */ + if ( tmp != NULL && sb != NULL ) + { + //check the secondary split. + if ( config->en_sseg == 1 + && task_ssseg_check(task) ) + en_second_seg(friso, config, task, tmp, 0); - free_string_buffer( sb ); - link_list_add( task->pool, tmp ); + free_string_buffer( sb ); + link_list_add( task->pool, tmp ); - //check if append synoyums words. - if ( config->add_syn == 1 ) lex_appensyn_open(tmp); + //check if append synoyums words. + if ( config->add_syn == 1 ) lex_appensyn_open(tmp); - } - /* }}} */ + } + /* }}} */ - return task->token; - } - /* }}} */ + return task->token; + } + /* }}} */ - /* {{{ basic english/latin recongnize block. */ - else if ( friso_halfwidth_en_char( friso->charset, task ) - || friso_fullwidth_en_char( friso->charset, task ) ) - { - /* - * handle the english punctuation. - * - * @todo: - * 1. commen all the code of the following if - * and uncomment the continue to clear up the punctuation directly. - * - * @reader: - * 2. keep in mind that ALL the english punctuation will be handled here, - * (when a english punctuation is found during the other process, we will - * reset the task->idx back to it and then back here) - * except the keep punctuation(define in file friso_string.c) - * that will make up a word with the english chars around it. - */ - if ( friso_en_punctuation( friso->charset, task ) ) - { - if ( config->clr_stw - && friso_dic_match(friso->dic, - __LEX_STOPWORDS__, task->buffer) ) - continue; - //count the punctuation in. - task->token->word[0] = task->buffer[0]; - task->token->type = __LEX_PUNC_WORDS__; - task->token->length = task->bytes; - task->token->rlen = task->bytes; - task->token->offset = task->idx - task->bytes; - task->token->word[1] = '\0'; - return task->token; + /* {{{ basic english/latin recongnize block. */ + else if ( friso_halfwidth_en_char( friso->charset, task ) + || friso_fullwidth_en_char( friso->charset, task ) ) + { + /* + * handle the english punctuation. + * + * @todo: + * 1. commen all the code of the following if + * and uncomment the continue to clear up the punctuation directly. + * + * @reader: + * 2. keep in mind that ALL the english punctuation will be handled here, + * (when a english punctuation is found during the other process, we will + * reset the task->idx back to it and then back here) + * except the keep punctuation(define in file friso_string.c) + * that will make up a word with the english chars around it. + */ + if ( friso_en_punctuation( friso->charset, task ) ) + { + if ( config->clr_stw + && friso_dic_match(friso->dic, + __LEX_STOPWORDS__, task->buffer) ) + continue; + //count the punctuation in. + task->token->word[0] = task->buffer[0]; + task->token->type = __LEX_PUNC_WORDS__; + task->token->length = task->bytes; + task->token->rlen = task->bytes; + task->token->offset = task->idx - task->bytes; + task->token->word[1] = '\0'; + return task->token; - //continue - } + //continue + } - //get the next basic latin word. - lex = next_basic_latin( friso, config, task ); - lex->offset = task->idx - lex->rlen; + //get the next basic latin word. + lex = next_basic_latin( friso, config, task ); + lex->offset = task->idx - lex->rlen; - /* @added: 2013-12-22 - * check and do the secondary segmentation work. - * this will split 'qq2013' to 'qq, 2013' - * */ - sword = NULL; - if ( config->en_sseg == 1 - && task_ssseg_check(task) ) - sword = en_second_seg(friso, config, task, lex, 1); + /* @added: 2013-12-22 + * check and do the secondary segmentation work. + * this will split 'qq2013' to 'qq, 2013' + * */ + sword = NULL; + if ( config->en_sseg == 1 + && task_ssseg_check(task) ) + sword = en_second_seg(friso, config, task, lex, 1); - //check if it is a stopword. - if ( config->clr_stw - && friso_dic_match( friso->dic, - __LEX_STOPWORDS__, lex->word ) ) { - //free the newly created lexicon entry. - check_free_otlex_entry( lex ); - if ( sword == NULL ) continue; - lex = sword; - } - else if ( sword != NULL ) - { - if ( config->add_syn == 1 ) lex_appensyn_open(lex); - link_list_add(task->pool, lex); + //check if it is a stopword. + if ( config->clr_stw + && friso_dic_match( friso->dic, + __LEX_STOPWORDS__, lex->word ) ) { + //free the newly created lexicon entry. + check_free_otlex_entry( lex ); + if ( sword == NULL ) continue; + lex = sword; + } + else if ( sword != NULL ) + { + if ( config->add_syn == 1 ) lex_appensyn_open(lex); + link_list_add(task->pool, lex); - /* If the sub token is not NULL: - * add the lex to the task->pool if it is not NULL - * and return the sub token istead of lex so - * the sub tokens will be output ahead of lex. - * */ - lex = sword; - } + /* If the sub token is not NULL: + * add the lex to the task->pool if it is not NULL + * and return the sub token istead of lex so + * the sub tokens will be output ahead of lex. + * */ + lex = sword; + } - //if the token is longer than __HITS_WORD_LENGTH__, drop it - //copy the word to the task token buffer. - //if ( lex->length >= __HITS_WORD_LENGTH__ ) continue; - memcpy(task->token->word, lex->word, lex->length); - task->token->type = lex->type; - task->token->length = lex->length; - task->token->rlen = lex->rlen; - task->token->offset = lex->offset; - task->token->word[lex->length] = '\0'; + //if the token is longer than __HITS_WORD_LENGTH__, drop it + //copy the word to the task token buffer. + //if ( lex->length >= __HITS_WORD_LENGTH__ ) continue; + memcpy(task->token->word, lex->word, lex->length); + task->token->type = lex->type; + task->token->length = lex->length; + task->token->rlen = lex->rlen; + task->token->offset = lex->offset; + task->token->word[lex->length] = '\0'; - /* If sword is NULL, continue to check and append - * tye synoyums words for the current lex_entry_t. - * */ - if ( sword == NULL - && config->add_syn == 1 ) append_en_syn(lex, tmp, 0); + /* If sword is NULL, continue to check and append + * tye synoyums words for the current lex_entry_t. + * */ + if ( sword == NULL + && config->add_syn == 1 ) append_en_syn(lex, tmp, 0); - //free the newly create lex_entry_t - check_free_otlex_entry( lex ); + //free the newly create lex_entry_t + check_free_otlex_entry( lex ); - return task->token; - } - /* }}} */ + return task->token; + } + /* }}} */ - /* {{{ Keep the chinese punctuation. - * @added 2013-08-31) */ - else if ( friso_cn_punctuation( friso->charset, task ) ) - { - if ( config->clr_stw - && friso_dic_match(friso->dic, - __LEX_STOPWORDS__, task->buffer) ) - continue; - //count the punctuation in. - memcpy(task->token->word, task->buffer, task->bytes); - task->token->type = __LEX_PUNC_WORDS__; - task->token->length = task->bytes; - task->token->offset = task->idx - task->bytes; - task->token->word[task->bytes] = '\0'; - return task->token; - } - /* }}} */ - //else if ( friso_letter_number( friso->charset, task ) ) - //{ - //} - //else if ( friso_other_number( friso->charset, task ) ) - //{ - //} + /* {{{ Keep the chinese punctuation. + * @added 2013-08-31) */ + else if ( friso_cn_punctuation( friso->charset, task ) ) + { + if ( config->clr_stw + && friso_dic_match(friso->dic, + __LEX_STOPWORDS__, task->buffer) ) + continue; + //count the punctuation in. + memcpy(task->token->word, task->buffer, task->bytes); + task->token->type = __LEX_PUNC_WORDS__; + task->token->length = task->bytes; + task->token->offset = task->idx - task->bytes; + task->token->word[task->bytes] = '\0'; + return task->token; + } + /* }}} */ + //else if ( friso_letter_number( friso->charset, task ) ) + //{ + //} + //else if ( friso_other_number( friso->charset, task ) ) + //{ + //} - /* {{{ keep the unrecognized words? - //@date 2013-10-14 */ - else if ( config->keep_urec ) - { - memcpy(task->token->word, task->buffer, task->bytes); - task->token->type = __LEX_UNKNOW_WORDS__; - task->token->length = task->bytes; - task->token->offset = task->idx - task->bytes; - task->token->word[task->bytes] = '\0'; - return task->token; - } - /* }}} */ - } + /* {{{ keep the unrecognized words? + //@date 2013-10-14 */ + else if ( config->keep_urec ) + { + memcpy(task->token->word, task->buffer, task->bytes); + task->token->type = __LEX_UNKNOW_WORDS__; + task->token->length = task->bytes; + task->token->offset = task->idx - task->bytes; + task->token->word[task->bytes] = '\0'; + return task->token; + } + /* }}} */ + } - return NULL; + return NULL; } /* }}} */ //---------------------------------------------------------------------- //detect core logic controller: detect tokenize mode handler functions /** {{{ get the next splited token with detect mode - * detect mode will only return the words in the dictionary - * with simple forward maximum matching algorithm + * detect mode will only return the words in the dictionary + * with simple forward maximum matching algorithm */ FRISO_API friso_token_t next_detect_token( - friso_t friso, friso_config_t config, friso_task_t task ) + friso_t friso, friso_config_t config, friso_task_t task ) { - lex_entry_t lex = NULL; - int i, __convert = 0, tbytes, wbytes; + lex_entry_t lex = NULL; + int i, __convert = 0, tbytes, wbytes; - /* {{{ task word pool check */ - if ( ! link_list_empty( task->pool ) ) - { - /* - * load word from the word poll if it is not empty. - * this will make the next word more convenient and efficient. - * often synonyms, newly created word will be stored in the poll. - */ - lex = ( lex_entry_t ) link_list_remove_first( task->pool ); - memcpy(task->token->word, lex->word, lex->length); - task->token->type = lex->type; - task->token->length = lex->length; - task->token->rlen = lex->rlen; - task->token->offset = lex->offset; - task->token->word[lex->length] = '\0'; + /* {{{ task word pool check */ + if ( ! link_list_empty( task->pool ) ) + { + /* + * load word from the word poll if it is not empty. + * this will make the next word more convenient and efficient. + * often synonyms, newly created word will be stored in the poll. + */ + lex = ( lex_entry_t ) link_list_remove_first( task->pool ); + memcpy(task->token->word, lex->word, lex->length); + task->token->type = lex->type; + task->token->length = lex->length; + task->token->rlen = lex->rlen; + task->token->offset = lex->offset; + task->token->word[lex->length] = '\0'; - /* - * __LEX_NCSYN_WORDS__: - * these lex_entry_t was created to store the the synonyums words. - * and its word pointed to the lex_entry_t's synonyms word of - * friso->dic, so : - * free the lex_entry_t but not its word here. - */ - if ( lex->type == __LEX_NCSYN_WORDS__ ) - { - free_lex_entry( lex ); - } + /* + * __LEX_NCSYN_WORDS__: + * these lex_entry_t was created to store the the synonyums words. + * and its word pointed to the lex_entry_t's synonyms word of + * friso->dic, so : + * free the lex_entry_t but not its word here. + */ + if ( lex->type == __LEX_NCSYN_WORDS__ ) + { + free_lex_entry( lex ); + } - return task->token; - } - /* }}} */ + return task->token; + } + /* }}} */ - while ( task->idx < task->length ) - { - lex = NULL; + while ( task->idx < task->length ) + { + lex = NULL; - //read the next word from the current position. - task->bytes = readNextWord( friso, task, &task->idx, task->buffer ); - if ( task->bytes == 0 ) break; + //read the next word from the current position. + task->bytes = readNextWord( friso, task, &task->idx, task->buffer ); + if ( task->bytes == 0 ) break; - //clear up the whitespace. - if ( friso_whitespace( friso->charset, task ) ) continue; + //clear up the whitespace. + if ( friso_whitespace( friso->charset, task ) ) continue; - //convert full-width to half-width - // and uppercase to lowercase for english chars - wbytes = 0; - tbytes = task->bytes; - convert_full_to_half( friso, task, __convert ); - convert_upper_to_lower( friso, task, __convert ); - convert_work_apply( friso, task, __convert ); + //convert full-width to half-width + // and uppercase to lowercase for english chars + wbytes = 0; + tbytes = task->bytes; + convert_full_to_half( friso, task, __convert ); + convert_upper_to_lower( friso, task, __convert ); + convert_work_apply( friso, task, __convert ); - string_buffer_clear(task->sbuf); - string_buffer_append(task->sbuf, task->buffer); - if ( friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer) ) - { - lex = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer); - wbytes = tbytes; - } + string_buffer_clear(task->sbuf); + string_buffer_append(task->sbuf, task->buffer); + if ( friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer) ) + { + lex = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer); + wbytes = tbytes; + } - for ( i = 1; i < config->max_len; i++ ) - { - task->bytes = readNextWord( friso, task, &task->idx, task->buffer ); - if ( task->bytes == 0 ) break; + for ( i = 1; i < config->max_len; i++ ) + { + task->bytes = readNextWord( friso, task, &task->idx, task->buffer ); + if ( task->bytes == 0 ) break; - //convert full-width to half-width - // and uppercase to lowercase for english chars - tbytes += task->bytes; - convert_full_to_half( friso, task, __convert ); - convert_upper_to_lower( friso, task, __convert ); - convert_work_apply( friso, task, __convert ); - string_buffer_append(task->sbuf, task->buffer); + //convert full-width to half-width + // and uppercase to lowercase for english chars + tbytes += task->bytes; + convert_full_to_half( friso, task, __convert ); + convert_upper_to_lower( friso, task, __convert ); + convert_work_apply( friso, task, __convert ); + string_buffer_append(task->sbuf, task->buffer); - if ( friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer) ) - { - lex = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer); - wbytes = tbytes; - } - } + if ( friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer) ) + { + lex = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer); + wbytes = tbytes; + } + } - /* - * matches no word in the dictionary - * reset the task->idx to the correct value - */ - if ( lex == NULL ) - { - task->idx -= (tbytes - 1); - continue; - } + /* + * matches no word in the dictionary + * reset the task->idx to the correct value + */ + if ( lex == NULL ) + { + task->idx -= (tbytes - 1); + continue; + } - //yat, matched a item and tanke it to initialize the returning token - // also we need to push back the none-matched part by reset the task->idx - task->idx -= (tbytes - wbytes); + //yat, matched a item and tanke it to initialize the returning token + // also we need to push back the none-matched part by reset the task->idx + task->idx -= (tbytes - wbytes); - memcpy(task->token->word, lex->word, lex->length); - task->token->type = __LEX_CJK_WORDS__; - task->token->length = lex->length; - task->token->rlen = wbytes; - task->token->offset = task->idx - wbytes; - task->token->word[(int)lex->length] = '\0'; + memcpy(task->token->word, lex->word, lex->length); + task->token->type = __LEX_CJK_WORDS__; + task->token->length = lex->length; + task->token->rlen = wbytes; + task->token->offset = task->idx - wbytes; + task->token->word[(int)lex->length] = '\0'; - //check and append the synonyms words - if ( config->add_syn && lex->syn != NULL ) - { - if ( config->spx_out == 1 ) - token_sphinx_output(task, lex); - else token_normal_output(task, lex, 0); - } + //check and append the synonyms words + if ( config->add_syn && lex->syn != NULL ) + { + if ( config->spx_out == 1 ) + token_sphinx_output(task, lex); + else token_normal_output(task, lex, 0); + } - return task->token; - } + return task->token; + } - return NULL; + return NULL; } /* }}} */ diff --git a/src/friso.h b/src/friso.h index 7711541..ea4cca4 100644 --- a/src/friso.h +++ b/src/friso.h @@ -1,8 +1,8 @@ /* * main interface file for friso - free soul. - * you could modify it and re-release it but never for commercial use. + * you could modify it and re-release it but never for commercial use. * - * @author chenxin + * @author chenxin */ #ifndef _friso_h #define _friso_h @@ -15,11 +15,11 @@ #define friso_version() FRISO_VERSION -#define DEFAULT_SEGMENT_LENGTH 5 -#define DEFAULT_MIX_LENGTH 2 -#define DEFAULT_LNA_LENGTH 1 -#define DEFAULT_NTHRESHOLD 1000000 -#define DEFAULT_SEGMENT_MODE 2 +#define DEFAULT_SEGMENT_LENGTH 5 +#define DEFAULT_MIX_LENGTH 2 +#define DEFAULT_LNA_LENGTH 1 +#define DEFAULT_NTHRESHOLD 1000000 +#define DEFAULT_SEGMENT_MODE 2 /* * Type: friso_lex_t @@ -29,8 +29,8 @@ typedef enum { __LEX_CJK_WORDS__ = 0, __LEX_CJK_UNITS__ = 1, - __LEX_ECM_WORDS__ = 2, //english and chinese mixed words. - __LEX_CEM_WORDS__ = 3, //chinese and english mixed words. + __LEX_ECM_WORDS__ = 2, //english and chinese mixed words. + __LEX_CEM_WORDS__ = 3, //chinese and english mixed words. __LEX_CN_LNAME__ = 4, __LEX_CN_SNAME__ = 5, __LEX_CN_DNAME1__ = 6, @@ -41,8 +41,8 @@ typedef enum { __LEX_EN_WORDS__ = 11, __LEX_OTHER_WORDS__ = 15, __LEX_NCSYN_WORDS__ = 16, - __LEX_PUNC_WORDS__ = 17, //punctuations - __LEX_UNKNOW_WORDS__ = 18 //unrecognized words. + __LEX_PUNC_WORDS__ = 17, //punctuations + __LEX_UNKNOW_WORDS__ = 18 //unrecognized words. } friso_lex_t; typedef friso_hash_t * friso_dic_t; @@ -51,8 +51,8 @@ typedef friso_hash_t * friso_dic_t; //charset that Friso now support. typedef enum { - FRISO_UTF8 = 0, //UTF-8 - FRISO_GBK = 1 //GBK + FRISO_UTF8 = 0, //UTF-8 + FRISO_GBK = 1 //GBK } friso_charset_t; /* @@ -61,15 +61,15 @@ typedef enum { * use to identidy the mode that the friso use. */ typedef enum { - __FRISO_SIMPLE_MODE__ = 1, - __FRISO_COMPLEX_MODE__ = 2, - __FRISO_DETECT_MODE__ = 3 + __FRISO_SIMPLE_MODE__ = 1, + __FRISO_COMPLEX_MODE__ = 2, + __FRISO_DETECT_MODE__ = 3 } friso_mode_t; /* friso entry.*/ typedef struct { - friso_dic_t dic; //friso dictionary - friso_charset_t charset; //project charset. + friso_dic_t dic; //friso dictionary + friso_charset_t charset; //project charset. } friso_entry; typedef friso_entry * friso_t; @@ -80,26 +80,26 @@ typedef friso_entry * friso_t; * ------------------- * This type used to represent the lexicon entry struct. */ -#define _LEX_APPENSYN_MASK (1 << 0) //append synoyums words. -#define lex_appensyn_open(e) e->ctrlMask |= _LEX_APPENSYN_MASK -#define lex_appensyn_close(e) e->ctrlMask &= ~_LEX_APPENSYN_MASK -#define lex_appensyn_check(e) ((e->ctrlMask & _LEX_APPENSYN_MASK) != 0) +#define _LEX_APPENSYN_MASK (1 << 0) //append synoyums words. +#define lex_appensyn_open(e) e->ctrlMask |= _LEX_APPENSYN_MASK +#define lex_appensyn_close(e) e->ctrlMask &= ~_LEX_APPENSYN_MASK +#define lex_appensyn_check(e) ((e->ctrlMask & _LEX_APPENSYN_MASK) != 0) typedef struct { /* * the type of the lexicon item. * available value is all the elements in friso_lex_t enum. - * and if it is __LEX_OTHER_WORDS__, we need to free it after use it. + * and if it is __LEX_OTHER_WORDS__, we need to free it after use it. */ - uchar_t length; //the length of the token.(after the convertor of Friso.) - uchar_t rlen; //the real length of the token.(before any convert) + uchar_t length; //the length of the token.(after the convertor of Friso.) + uchar_t rlen; //the real length of the token.(before any convert) uchar_t type; - uchar_t ctrlMask; //function control mask, like append the synoyums words. - uint_t offset; //offset index. + uchar_t ctrlMask; //function control mask, like append the synoyums words. + uint_t offset; //offset index. fstring word; - //fstring py; //pinyin of the word.(invalid) - friso_array_t syn; //synoyums words. - friso_array_t pos; //part of speech. - uint_t fre; //single word frequency. + //fstring py; //pinyin of the word.(invalid) + friso_array_t syn; //synoyums words. + friso_array_t pos; //part of speech. + uint_t fre; //single word frequency. } lex_entry_cdt; typedef lex_entry_cdt * lex_entry_t; @@ -108,11 +108,11 @@ typedef lex_entry_cdt * lex_entry_t; #define __HITS_WORD_LENGTH__ 64 typedef struct { - uchar_t type; //type of the word. (item of friso_lex_t) - uchar_t length; //length of the token. - uchar_t rlen; //the real length of the token.(in orgin strng) - char pos; //part of speech. - int offset; //start offset of the word. + uchar_t type; //type of the word. (item of friso_lex_t) + uchar_t length; //length of the token. + uchar_t rlen; //the real length of the token.(in orgin strng) + char pos; //part of speech. + int offset; //start offset of the word. char word[__HITS_WORD_LENGTH__]; //char py[0]; } friso_token_entry; @@ -122,25 +122,25 @@ typedef friso_token_entry * friso_token_t; /* * Type: friso_task_entry * This type used to represent the current segmentation content. - * like the text to split, and the current index, token buffer eg.... + * like the text to split, and the current index, token buffer eg.... */ //action control mask for #FRISO_TASK_T#. -#define _TASK_CHECK_CF_MASK (1 << 0) //Wether to check the chinese fraction. -#define _TASK_START_SS_MASK (1 << 1) //Wether to start the secondary segmentation. -#define task_ssseg_open(task) task->ctrlMask |= _TASK_START_SS_MASK -#define task_ssseg_close(task) task->ctrlMask &= ~_TASK_START_SS_MASK -#define task_ssseg_check(task) ((task->ctrlMask & _TASK_START_SS_MASK) != 0) +#define _TASK_CHECK_CF_MASK (1 << 0) //Wether to check the chinese fraction. +#define _TASK_START_SS_MASK (1 << 1) //Wether to start the secondary segmentation. +#define task_ssseg_open(task) task->ctrlMask |= _TASK_START_SS_MASK +#define task_ssseg_close(task) task->ctrlMask &= ~_TASK_START_SS_MASK +#define task_ssseg_check(task) ((task->ctrlMask & _TASK_START_SS_MASK) != 0) typedef struct { - fstring text; //text to tokenize - uint_t idx; //start offset index. - uint_t length; //length of the text. - uint_t bytes; //latest word bytes in C. - uint_t unicode; //latest word unicode number. - uint_t ctrlMask; //action control mask. - friso_link_t pool; //task pool. - string_buffer_t sbuf; //string buffer. - friso_token_t token; //token result token; - char buffer[7]; //word buffer. (1-6 bytes for an utf-8 word in C). + fstring text; //text to tokenize + uint_t idx; //start offset index. + uint_t length; //length of the text. + uint_t bytes; //latest word bytes in C. + uint_t unicode; //latest word unicode number. + uint_t ctrlMask; //action control mask. + friso_link_t pool; //task pool. + string_buffer_t sbuf; //string buffer. + friso_token_t token; //token result token; + char buffer[7]; //word buffer. (1-6 bytes for an utf-8 word in C). } friso_task_entry; typedef friso_task_entry * friso_task_t; @@ -151,23 +151,23 @@ typedef friso_task_entry * friso_task_t; //typedef friso_token_t ( * friso_next_hit_fn ) ( friso_t, void *, friso_task_t ); //typedef lex_entry_t ( * friso_next_lex_fn ) ( friso_t, void *, friso_task_t ); struct friso_config_struct { - ushort_t max_len; //the max match length (4 - 7). - ushort_t r_name; //1 for open chinese name recognition 0 for close it. - ushort_t mix_len; //the max length for the CJK words in a mix string. - ushort_t lna_len; //the max length for the chinese last name adron. - ushort_t add_syn; //append synonyms tokenizer words. - ushort_t clr_stw; //clear the stopwords. - ushort_t keep_urec; //keep the unrecongnized words. - ushort_t spx_out; //use sphinx output customize. - ushort_t en_sseg; //start the secondary segmentation. - ushort_t st_minl; //min length of the secondary segmentation token. - uint_t nthreshold; //the threshold value for a char to make up a chinese name. - friso_mode_t mode; //Complex mode or simple mode + ushort_t max_len; //the max match length (4 - 7). + ushort_t r_name; //1 for open chinese name recognition 0 for close it. + ushort_t mix_len; //the max length for the CJK words in a mix string. + ushort_t lna_len; //the max length for the chinese last name adron. + ushort_t add_syn; //append synonyms tokenizer words. + ushort_t clr_stw; //clear the stopwords. + ushort_t keep_urec; //keep the unrecongnized words. + ushort_t spx_out; //use sphinx output customize. + ushort_t en_sseg; //start the secondary segmentation. + ushort_t st_minl; //min length of the secondary segmentation token. + uint_t nthreshold; //the threshold value for a char to make up a chinese name. + friso_mode_t mode; //Complex mode or simple mode - //pointer to the function to get the next token - friso_token_t (*next_token) (friso_t, struct friso_config_struct *, friso_task_t); - //pointer to the function to get the next cjk lex_entry_t - lex_entry_t (*next_cjk ) (friso_t, struct friso_config_struct *, friso_task_t); + //pointer to the function to get the next token + friso_token_t (*next_token) (friso_t, struct friso_config_struct *, friso_task_t); + //pointer to the function to get the next cjk lex_entry_t + lex_entry_t (*next_cjk ) (friso_t, struct friso_config_struct *, friso_task_t); char kpuncs[_FRISO_KEEP_PUNC_LEN]; //keep punctuations buffer. }; @@ -181,7 +181,7 @@ typedef friso_config_entry * friso_config_t; * Usage: vars = friso_new( void ); * -------------------------------- * This function used to create a new empty friso friso_t; - * with default value. + * with default value. */ FRISO_API friso_t friso_new( void ); @@ -202,7 +202,7 @@ FRISO_API void friso_free( friso_t ); * Usage: dic = friso_set_dic( vars, dic ); * ---------------------------------------- * This function is used to set the dictionary for friso. - * and firso_dic_t is the pointer of a hash table array. + * and firso_dic_t is the pointer of a hash table array. */ //FRISO_API void friso_set_dic( friso_t, friso_dic_t ); #define friso_set_dic(friso, dic)\ @@ -272,7 +272,7 @@ FRISO_API lex_entry_t next_complex_cjk( friso_t, friso_config_t, friso_task_t ); * Usage: word = next_mmseg_token( vars, seg ); * -------------------------------------- * This function is used to get next word that friso segmented - * with a split mode of __FRISO_SIMPLE_MODE__ or __FRISO_COMPLEX_MODE__ + * with a split mode of __FRISO_SIMPLE_MODE__ or __FRISO_COMPLEX_MODE__ */ FRISO_API friso_token_t next_mmseg_token( friso_t, friso_config_t, friso_task_t ); @@ -313,14 +313,14 @@ FRISO_API void free_lex_entry( lex_entry_t ); * Usage: friso_dic_load( friso, friso_lex_t, path, length ); * -------------------------------------------------- * This function is used to load dictionary from a given path. - * no length limit when length less than 0. + * no length limit when length less than 0. */ FRISO_API void friso_dic_load( friso_t, friso_config_t, - friso_lex_t, fstring, uint_t ); + friso_lex_t, fstring, uint_t ); /* * load the lexicon configuration file. - * and load all the valid lexicon from the conf file. + * and load all the valid lexicon from the conf file. */ FRISO_API void friso_dic_load_from_ifile( friso_t, friso_config_t, fstring, uint_t ); diff --git a/src/friso_API.h b/src/friso_API.h index 8315322..a6d04b2 100644 --- a/src/friso_API.h +++ b/src/friso_API.h @@ -16,22 +16,22 @@ //yat, just take it as this way, 99 percent you will find no problem #if ( defined(_WIN32) || defined(_WINDOWS_) || defined(__WINDOWS_) ) -# define FRISO_WINNT +# define FRISO_WINNT #else -# define FRISO_LINUX +# define FRISO_LINUX #endif #ifdef FRISO_WINNT -# define FRISO_API extern __declspec(dllexport) -# define __STATIC_API__ static +# define FRISO_API extern __declspec(dllexport) +# define __STATIC_API__ static #else /*platform shared library statement :: unix*/ -# define FRISO_API extern -# define __STATIC_API__ static inline +# define FRISO_API extern +# define __STATIC_API__ static inline #endif -#define ___ALLOCATION_ERROR___ \ - printf("Unable to do the memory allocation, program will now exit\n" ); \ +#define ___ALLOCATION_ERROR___ \ + printf("Unable to do the memory allocation, program will now exit\n" ); \ exit(1); #define print(str) printf("%s", str ) @@ -39,12 +39,12 @@ exit(1); /* * memory allocation macro definition. - * cause we should use emalloc,ecalloc .ege. in php. + * cause we should use emalloc,ecalloc .ege. in php. * so you could make it better apdat the php environment. */ -#define FRISO_CALLOC(_bytes, _blocks) calloc(_bytes, _blocks) -#define FRISO_MALLOC(_bytes) malloc(_bytes) -#define FRISO_FREE( _ptr ) free( _ptr ) +#define FRISO_CALLOC(_bytes, _blocks) calloc(_bytes, _blocks) +#define FRISO_MALLOC(_bytes) malloc(_bytes) +#define FRISO_FREE( _ptr ) free( _ptr ) typedef unsigned short ushort_t; typedef unsigned char uchar_t; @@ -74,7 +74,7 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str ); /* * this function will copy the chars that the fstring pointed. - * to the buffer. + * to the buffer. * this may cause the resize action of the buffer. */ FRISO_API void string_buffer_append( string_buffer_t, fstring ); @@ -88,21 +88,21 @@ FRISO_API fstring string_buffer_remove( string_buffer_t, uint_t idx, uint_t ); /* * turn the string_buffer to a string. - * or return the buffer of the string_buffer. + * or return the buffer of the string_buffer. */ FRISO_API string_buffer_t string_buffer_trim( string_buffer_t ); /* * free the given fstring buffer. - * and this function will not free the allocations of the - * the string_buffer_t->buffer, we return it to you, if there is - * a necessary you could free it youself by calling free(); + * and this function will not free the allocations of the + * the string_buffer_t->buffer, we return it to you, if there is + * a necessary you could free it youself by calling free(); */ FRISO_API fstring string_buffer_devote( string_buffer_t ); /* * clear the given fstring buffer. - * reset its buffer with 0 and reset its length to 0. + * reset its buffer with 0 and reset its length to 0. */ FRISO_API void string_buffer_clear( string_buffer_t ); @@ -126,8 +126,8 @@ typedef string_split_entry * string_split_t; /** * create a new string_split_entry. * - * @param source - * @return string_split_t; + * @param source + * @return string_split_t; */ FRISO_API string_split_t new_string_split( fstring, fstring ); @@ -141,12 +141,12 @@ FRISO_API void free_string_split( string_split_t ); /** * get the next split fstring, and copy the - * splited fstring into the __dst buffer . + * splited fstring into the __dst buffer . * - * @param string_split_t - * @param __dst - * @return fstring (NULL if reach the end of the source - * or there is no more segmentation) + * @param string_split_t + * @param __dst + * @return fstring (NULL if reach the end of the source + * or there is no more segmentation) */ FRISO_API fstring string_split_next( string_split_t, fstring ); /* }}} */ @@ -175,7 +175,7 @@ FRISO_API friso_array_t new_array_list_with_opacity( uint_t ); /* * free the given friso array. - * and its items, but never where the items's item to pointed to . + * and its items, but never where the items's item to pointed to . */ FRISO_API void free_array_list( friso_array_t ); @@ -190,13 +190,13 @@ FRISO_API void *array_list_get( friso_array_t, uint_t ); /* * set the item at a specified position. - * this will return the old value. + * this will return the old value. */ FRISO_API void *array_list_set( friso_array_t, uint_t, void * ); /* * remove the given item at a specified position. - * this will return the value of the removed item. + * this will return the value of the removed item. */ FRISO_API void *array_list_remove( friso_array_t, uint_t ); @@ -205,9 +205,9 @@ FRISO_API friso_array_t array_list_trim( friso_array_t ); /* * clear the array list. - * this function will free all the allocations that the pointer pointed. - * but will not free the point array allocations, - * and will reset the length of it. + * this function will free all the allocations that the pointer pointed. + * but will not free the point array allocations, + * and will reset the length of it. */ FRISO_API friso_array_t array_list_clear( friso_array_t ); @@ -300,8 +300,8 @@ FRISO_API void link_list_add_first( friso_link_t, void * ); /* {{{ hashtable interface define :: start*/ struct hash_entry { - fstring _key; //the node key - void * _val; //the node value + fstring _key; //the node key + void * _val; //the node value struct hash_entry * _next; }; typedef struct hash_entry friso_hash_entry; @@ -319,8 +319,8 @@ typedef struct { typedef friso_hash_cdt * friso_hash_t; //default value for friso_hash_cdt -#define DEFAULT_LENGTH 31 -#define DEFAULT_FACTOR 0.85f +#define DEFAULT_LENGTH 31 +#define DEFAULT_FACTOR 0.85f /* * Function: new_hash_table @@ -359,7 +359,7 @@ FRISO_API int hash_exist_mapping( friso_hash_t, fstring ); * Usage: value = get_mapping_value( table, key ); * ----------------------------------------------- * this function return the value associated with the given key. - * UNDEFINED will be return if the mapping is not exists. + * UNDEFINED will be return if the mapping is not exists. */ FRISO_API void * hash_get_value( friso_hash_t, fstring ); diff --git a/src/friso_GBK.c b/src/friso_GBK.c index d64e6d6..28ea22e 100644 --- a/src/friso_GBK.c +++ b/src/friso_GBK.c @@ -1,6 +1,6 @@ /** * Friso GBK about function implements source file. - * @package src/friso_GBK.c . + * @package src/friso_GBK.c . * * @author chenxin */ @@ -12,12 +12,12 @@ /* read the next GBK word from the specified position. * - * @return int the bytes of the current readed word. + * @return int the bytes of the current readed word. */ FRISO_API int gbk_next_word( - friso_task_t task, - uint_t *idx, - fstring __word ) + friso_task_t task, + uint_t *idx, + fstring __word ) { int c; if ( *idx >= task->length ) return 0; @@ -41,26 +41,26 @@ FRISO_API int gbk_next_word( //} //check if the given buffer is a gbk word (ANSII string). -// included the simplified and traditional words. +// included the simplified and traditional words. FRISO_API int gbk_cn_string( char *str ) { int c1 = (uchar_t) str[0]; int c2 = (uchar_t) str[1]; //GBK/2: gb2312 chinese word. return ( ((c1 >= 0xb0 && c1 <= 0xf7) - && (c2 >= 0xa1 && c2 <= 0xfe)) + && (c2 >= 0xa1 && c2 <= 0xfe)) //GBK/3: extend chinese words. - || ((c1 >= 0x81 && c1 <= 0xa0) - && ( (c2 >= 0x40 && c2 <= 0x7e) - || (c2 >= 0x80 && c2 <= 0xfe) )) + || ((c1 >= 0x81 && c1 <= 0xa0) + && ( (c2 >= 0x40 && c2 <= 0x7e) + || (c2 >= 0x80 && c2 <= 0xfe) )) //GBK/4: extend chinese words. - || ((c1 >= 0xaa && c1 <= 0xfe) - && ( (c2 >= 0x40 && c2 <= 0xfe) - || (c2 >= 0x80 && c2 <= 0xa0) )) ); + || ((c1 >= 0xaa && c1 <= 0xfe) + && ( (c2 >= 0x40 && c2 <= 0xfe) + || (c2 >= 0x80 && c2 <= 0xa0) )) ); } /*check if the given char is a ASCII letter - * include all the arabic number, letters and english puntuations.*/ + * include all the arabic number, letters and english puntuations.*/ FRISO_API int gbk_halfwidth_en_char( char c ) { int u = (uchar_t) c; @@ -69,58 +69,58 @@ FRISO_API int gbk_halfwidth_en_char( char c ) /* * check if the given char is a full-width latain. - * include the full-width arabic numeber, letters. - * but not the full-width puntuations. + * include the full-width arabic numeber, letters. + * but not the full-width puntuations. */ FRISO_API int gbk_fullwidth_en_char( char *str ) { int c1 = (uchar_t) str[0]; int c2 = (uchar_t) str[1]; return ( (c1 == 0xA3) - && ( (c2 >= 0xB0 && c2 <= 0xB9) //arabic numbers. - || ( c2 >= 0xC1 && c2 <= 0xDA ) //uppercase letters. - || ( c2 >= 0xE1 && c2 <= 0xFA) ) ); //lowercase letters. + && ( (c2 >= 0xB0 && c2 <= 0xB9) //arabic numbers. + || ( c2 >= 0xC1 && c2 <= 0xDA ) //uppercase letters. + || ( c2 >= 0xE1 && c2 <= 0xFA) ) ); //lowercase letters. } //check if the given char is a upper case english letter. -// included the full-width and half-width letters. +// included the full-width and half-width letters. FRISO_API int gbk_uppercase_letter( char *str ) { int c1 = (uchar_t) str[0]; int c2 = (uchar_t) str[1]; - if ( c1 <= 0x80 ) //half-width - return ( c1 >= 65 && c1 <= 90 ); - else //full-width - return ( c1 == 0xa3 && ( c2 >= 0xc1 && c2 <= 0xda ) ); + if ( c1 <= 0x80 ) //half-width + return ( c1 >= 65 && c1 <= 90 ); + else //full-width + return ( c1 == 0xa3 && ( c2 >= 0xc1 && c2 <= 0xda ) ); } //check if the given char is a lower case char. -// included the full-width and half-width letters. +// included the full-width and half-width letters. FRISO_API int gbk_lowercase_letter( char *str ) { int c1 = (uchar_t) str[0]; int c2 = (uchar_t) str[1]; - if ( c1 <= 0x80 ) //half-width - return ( c1 >= 97 && c1 <= 122 ); - else //full-width - return ( c1 == 0xa3 && ( c2 >= 0xe1 && c2 <= 0xfa ) ); + if ( c1 <= 0x80 ) //half-width + return ( c1 >= 97 && c1 <= 122 ); + else //full-width + return ( c1 == 0xa3 && ( c2 >= 0xe1 && c2 <= 0xfa ) ); } //check if the given char is a arabic numeric. -// included the full-width and half-width arabic numeric. +// included the full-width and half-width arabic numeric. FRISO_API int gbk_numeric_letter( char *str ) { int c1 = (uchar_t) str[0]; int c2 = (uchar_t) str[1]; - if ( c1 <= 0x80 ) //half-width - return ( c1 >= 48 && c1 <= 57 ); - else //full-width - return ( ( c1 == 0xa3 ) && ( c2 >= 0xb0 && c2 <= 0xb9 ) ); + if ( c1 <= 0x80 ) //half-width + return ( c1 >= 48 && c1 <= 57 ); + else //full-width + return ( ( c1 == 0xa3 ) && ( c2 >= 0xb0 && c2 <= 0xb9 ) ); } /* * check if the given fstring is make up with numeric chars. - * both full-width,half-width numeric is ok. + * both full-width,half-width numeric is ok. */ FRISO_API int gbk_numeric_string( char *str ) { @@ -130,17 +130,17 @@ FRISO_API int gbk_numeric_string( char *str ) while ( *s != '\0' ) { - c1 = (uchar_t) (*s++); - if ( c1 <= 0x80 ) //half-width - { - if ( c1 < 48 || c2 > 57 ) return 0; - } - else //full-width - { - if ( c1 != 0xa3 ) return 0; - c2 = (uchar_t) (*s++); - if ( c2 < 0xb0 || c2 > 0xb9 ) return 0; - } + c1 = (uchar_t) (*s++); + if ( c1 <= 0x80 ) //half-width + { + if ( c1 < 48 || c2 > 57 ) return 0; + } + else //full-width + { + if ( c1 != 0xa3 ) return 0; + c2 = (uchar_t) (*s++); + if ( c2 < 0xb0 || c2 > 0xb9 ) return 0; + } } return 1; @@ -157,47 +157,47 @@ FRISO_API int gbk_decimal_string( char *str ) for ( i = 0; i < len; ) { - c1 = (uchar_t) str[i++]; - //count the number of the points. - if ( c1 == 46 ) - { - p++; - continue; - } + c1 = (uchar_t) str[i++]; + //count the number of the points. + if ( c1 == 46 ) + { + p++; + continue; + } - if ( c1 <= 0x80 ) //half-width - { - if ( c1 < 48 || c1 > 57 ) return 0; - } - else //full-width - { - if ( c1 != 0xa3 ) return 0; - c2 = (uchar_t) str[i++]; - if ( c2 < 0xb0 || c2 > 0xb9 ) return 0; - } + if ( c1 <= 0x80 ) //half-width + { + if ( c1 < 48 || c1 > 57 ) return 0; + } + else //full-width + { + if ( c1 != 0xa3 ) return 0; + c2 = (uchar_t) str[i++]; + if ( c2 < 0xb0 || c2 > 0xb9 ) return 0; + } } return (p == 1); } //check if the given char is a english(ASCII) letter. -// (full-width and half-width), not the punctuation/arabic of course. +// (full-width and half-width), not the punctuation/arabic of course. FRISO_API int gbk_en_letter( char *str ) { int c1 = (uchar_t) str[0]; int c2 = (uchar_t) str[1]; - if ( c1 <= 0x80 ) //half-width - return ( (c1 >= 65 && c1 <= 90) //lowercase - || (c1 >= 97 && c1 <= 122)); //uppercase + if ( c1 <= 0x80 ) //half-width + return ( (c1 >= 65 && c1 <= 90) //lowercase + || (c1 >= 97 && c1 <= 122)); //uppercase else - return ( (c1 == 0xa3) - && ( ( c2 >= 0xc1 && c2 <= 0xda ) //lowercase - || ( c2 >= 0xe1 && c2 <= 0xfa ) ) ); //uppercase + return ( (c1 == 0xa3) + && ( ( c2 >= 0xc1 && c2 <= 0xda ) //lowercase + || ( c2 >= 0xe1 && c2 <= 0xfa ) ) ); //uppercase return 0; } //check the given char is a whitespace or not. -// included full-width and half-width whitespace. +// included full-width and half-width whitespace. FRISO_API int gbk_whitespace( char *str ) { int c1 = (uchar_t) str[0]; @@ -213,8 +213,8 @@ FRISO_API int gbk_letter_number( char *str ) int c1 = (uchar_t) str[0]; int c2 = (uchar_t) str[1]; return ( (c1 == 0xa2) - && ( ( c2 >= 0xa1 && c2 <= 0xb0 ) //lowercase - || ( c2 >= 0xf0 && c2 <= 0xfe ) ) ); //uppercase + && ( ( c2 >= 0xa1 && c2 <= 0xb0 ) //lowercase + || ( c2 >= 0xf0 && c2 <= 0xfe ) ) ); //uppercase } /* @@ -232,9 +232,9 @@ FRISO_API int gbk_en_punctuation( char c ) { int u = (uchar_t) c; return ( (u > 32 && u < 48) - || ( u > 57 && u < 65 ) - || ( u > 90 && u < 97 ) - || ( u > 122 && u < 127 ) ); + || ( u > 57 && u < 65 ) + || ( u > 90 && u < 97 ) + || ( u > 122 && u < 127 ) ); } //check the given char is a chinese punctuation. @@ -244,16 +244,16 @@ FRISO_API int gbk_cn_punctuation( char *str ) int c2 = (uchar_t) str[1]; //full-width en punctuation. return ( (c1 == 0xa3 && (( c2 >= 0xa1 && c2 <= 0xaf ) - || ( c2 >= 0xba && c2 <= 0xc0 ) - || ( c2 >= 0xdb && c2 <= 0xe0 ) - || ( c2 >= 0xfb && c2 <= 0xfe ) )) + || ( c2 >= 0xba && c2 <= 0xc0 ) + || ( c2 >= 0xdb && c2 <= 0xe0 ) + || ( c2 >= 0xfb && c2 <= 0xfe ) )) //chinese punctuation. - || (c1 == 0xa1 && ( (c2 >= 0xa1 && c2 <= 0xae) - || ( c2 >= 0xb0 && c2 <= 0xbf ) )) + || (c1 == 0xa1 && ( (c2 >= 0xa1 && c2 <= 0xae) + || ( c2 >= 0xb0 && c2 <= 0xbf ) )) //A6 area special punctuations:" " - || (c1 == 0xa6 && (c2 >= 0xf9 && c2 <= 0xfe)) + || (c1 == 0xa6 && (c2 >= 0xf9 && c2 <= 0xfe)) //A8 area special punctuations: " ˊˋ˙–―‥‵℅ " - || (c1 == 0xa8 && (c2 >= 0x40 && c2 <= 0x47)) ); + || (c1 == 0xa8 && (c2 >= 0x40 && c2 <= 0x47)) ); } /* {{{ @@ -269,19 +269,19 @@ FRISO_API int gbk_cn_punctuation( char *str ) //FRISO_API int gbk_keep_punctuation( char *str ) //{ // if ( __keep_punctuations_hash__ == NULL ) { -// __keep_punctuations_hash__ = new_hash_table(); -// hash_put_mapping( __keep_punctuations_hash__, "@", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "$", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "%", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "^", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "&", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "-", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, ":", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, ".", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "/", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "'", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "#", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "+", NULL ); +// __keep_punctuations_hash__ = new_hash_table(); +// hash_put_mapping( __keep_punctuations_hash__, "@", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "$", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "%", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "^", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "&", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "-", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, ":", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, ".", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "/", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "'", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "#", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "+", NULL ); // } // //check the hash. // return hash_exist_mapping( __keep_punctuations_hash__, str ); diff --git a/src/friso_UTF8.c b/src/friso_UTF8.c index 5e5dfb6..5238d84 100644 --- a/src/friso_UTF8.c +++ b/src/friso_UTF8.c @@ -1,6 +1,6 @@ /** * Friso utf8 about function implements source file. - * @package src/friso_UTF8.c . + * @package src/friso_UTF8.c . * * @author chenxin */ @@ -12,12 +12,12 @@ /* read the next utf-8 word from the specified position. * - * @return int the bytes of the current readed word. + * @return int the bytes of the current readed word. */ FRISO_API int utf8_next_word( - friso_task_t task, - uint_t *idx, - fstring __word ) + friso_task_t task, + uint_t *idx, + fstring __word ) { if ( *idx >= task->length ) return 0; @@ -25,7 +25,7 @@ FRISO_API int utf8_next_word( task->bytes = get_utf8_bytes( task->text[ *idx ] ); //for ( t = 0; t < task->bytes; t++ ) { - // __word[t] = task->text[ (*idx)++ ]; + // __word[t] = task->text[ (*idx)++ ]; //} //change the loop to memcpy. @@ -52,31 +52,31 @@ FRISO_API void print_char_binary( char value ) for ( t = 0; t < __CHAR_BYTES__; t++ ) { - if ( ( value & 0x80 ) == 0x80 ) { - printf("1"); - } else { - printf("0"); - } - value <<= 1; + if ( ( value & 0x80 ) == 0x80 ) { + printf("1"); + } else { + printf("0"); + } + value <<= 1; } } /* * get the bytes of a utf-8 char. - * between 1 - 6. + * between 1 - 6. * * @param __char * @return int */ FRISO_API int get_utf8_bytes( char value ) -{ +{ register uint_t t = 0; //one byte ascii char. if ( ( value & 0x80 ) == 0 ) return 1; for ( ; ( value & 0x80 ) != 0; value <<= 1 ) - t++; + t++; return t; } @@ -94,25 +94,25 @@ FRISO_API int get_utf8_unicode( const fstring ch ) register char b1,b2,b3; switch ( bytes ) { - case 1: - *bit = *ch; - break; - case 2: - b1 = *ch; - b2 = *(ch + 1); + case 1: + *bit = *ch; + break; + case 2: + b1 = *ch; + b2 = *(ch + 1); - *bit = (b1 << 6) + (b2 & 0x3F); - *(bit+1) = (b1 >> 2) & 0x07; - break; - case 3: - b1 = *ch; - b2 = *(ch + 1); - b3 = *(ch + 2); + *bit = (b1 << 6) + (b2 & 0x3F); + *(bit+1) = (b1 >> 2) & 0x07; + break; + case 3: + b1 = *ch; + b2 = *(ch + 1); + b3 = *(ch + 2); - *bit = (b2 << 6) + (b3 & 0x3F); - *(bit+1) = (b1 << 4) + ((b2 >> 2) & 0x0F); - break; - //ignore the ones that are larger than 3 bytes; + *bit = (b2 << 6) + (b3 & 0x3F); + *(bit+1) = (b1 << 4) + ((b2 >> 2) & 0x0F); + break; + //ignore the ones that are larger than 3 bytes; } return code; @@ -122,50 +122,50 @@ FRISO_API int get_utf8_unicode( const fstring ch ) FRISO_API int unicode_to_utf8( uint_t u, fstring __word ) { if ( u <= 0x0000007F ) { - //U-00000000 - U-0000007F - //0xxxxxxx - *__word = ( u & 0x7F ); - return 1; + //U-00000000 - U-0000007F + //0xxxxxxx + *__word = ( u & 0x7F ); + return 1; } else if ( u >= 0x00000080 && u <= 0x000007FF ) { - //U-00000080 - U-000007FF - //110xxxxx 10xxxxxx - *( __word + 1 ) = ( u & 0x3F) | 0x80; - *__word = ((u >> 6) & 0x1F) | 0xC0; - return 2; + //U-00000080 - U-000007FF + //110xxxxx 10xxxxxx + *( __word + 1 ) = ( u & 0x3F) | 0x80; + *__word = ((u >> 6) & 0x1F) | 0xC0; + return 2; } else if ( u >= 0x00000800 && u <= 0x0000FFFF ) { - //U-00000800 - U-0000FFFF - //1110xxxx 10xxxxxx 10xxxxxx - *( __word + 2 ) = ( u & 0x3F) | 0x80; - *( __word + 1 ) = ((u >> 6) & 0x3F) | 0x80; - *__word = ((u >> 12) & 0x0F) | 0xE0; - return 3; + //U-00000800 - U-0000FFFF + //1110xxxx 10xxxxxx 10xxxxxx + *( __word + 2 ) = ( u & 0x3F) | 0x80; + *( __word + 1 ) = ((u >> 6) & 0x3F) | 0x80; + *__word = ((u >> 12) & 0x0F) | 0xE0; + return 3; } else if ( u >= 0x00010000 && u <= 0x001FFFFF ) { - //U-00010000 - U-001FFFFF - //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - *( __word + 3 ) = ( u & 0x3F) | 0x80; - *( __word + 2 ) = ((u >> 6) & 0x3F) | 0x80; - *( __word + 1 ) = ((u >> 12) & 0x3F) | 0x80; - *__word = ((u >> 18) & 0x07) | 0xF0; - return 4; + //U-00010000 - U-001FFFFF + //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + *( __word + 3 ) = ( u & 0x3F) | 0x80; + *( __word + 2 ) = ((u >> 6) & 0x3F) | 0x80; + *( __word + 1 ) = ((u >> 12) & 0x3F) | 0x80; + *__word = ((u >> 18) & 0x07) | 0xF0; + return 4; } else if ( u >= 0x00200000 && u <= 0x03FFFFFF ) { - //U-00200000 - U-03FFFFFF - //111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - *( __word + 4 ) = ( u & 0x3F) | 0x80; - *( __word + 3 ) = ((u >> 6) & 0x3F) | 0x80; - *( __word + 2 ) = ((u >> 12) & 0x3F) | 0x80; - *( __word + 1 ) = ((u >> 18) & 0x3F) | 0x80; - *__word = ((u >> 24) & 0x03) | 0xF8; - return 5; + //U-00200000 - U-03FFFFFF + //111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + *( __word + 4 ) = ( u & 0x3F) | 0x80; + *( __word + 3 ) = ((u >> 6) & 0x3F) | 0x80; + *( __word + 2 ) = ((u >> 12) & 0x3F) | 0x80; + *( __word + 1 ) = ((u >> 18) & 0x3F) | 0x80; + *__word = ((u >> 24) & 0x03) | 0xF8; + return 5; } else if ( u >= 0x04000000 && u <= 0x7FFFFFFF ) { - //U-04000000 - U-7FFFFFFF - //1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - *( __word + 5 ) = ( u & 0x3F) | 0x80; - *( __word + 4 ) = ((u >> 6) & 0x3F) | 0x80; - *( __word + 3 ) = ((u >> 12) & 0x3F) | 0x80; - *( __word + 2 ) = ((u >> 18) & 0x3F) | 0x80; - *( __word + 1 ) = ((u >> 24) & 0x3F) | 0x80; - *__word = ((u >> 30) & 0x01) | 0xFC; - return 6; + //U-04000000 - U-7FFFFFFF + //1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + *( __word + 5 ) = ( u & 0x3F) | 0x80; + *( __word + 4 ) = ((u >> 6) & 0x3F) | 0x80; + *( __word + 3 ) = ((u >> 12) & 0x3F) | 0x80; + *( __word + 2 ) = ((u >> 18) & 0x3F) | 0x80; + *( __word + 1 ) = ((u >> 24) & 0x3F) | 0x80; + *__word = ((u >> 30) & 0x01) | 0xFC; + return 6; } return 0; @@ -173,28 +173,28 @@ FRISO_API int unicode_to_utf8( uint_t u, fstring __word ) /* * check the given char is a CJK char or not. - * 2E80-2EFF CJK 部首补充 - * 2F00-2FDF 康熙字典部首 - * 3000-303F CJK 符号和标点 --ignore - * 31C0-31EF CJK 笔画 - * 3200-32FF 封闭式 CJK 文字和月份 --ignore. - * 3300-33FF CJK 兼容 - * 3400-4DBF CJK 统一表意符号扩展 A - * 4DC0-4DFF 易经六十四卦符号 - * 4E00-9FBF CJK 统一表意符号 - * F900-FAFF CJK 兼容象形文字 - * FE30-FE4F CJK 兼容形式 - * FF00-FFEF 全角ASCII、全角标点 --ignore (as basic latin) + * 2E80-2EFF CJK 部首补充 + * 2F00-2FDF 康熙字典部首 + * 3000-303F CJK 符号和标点 --ignore + * 31C0-31EF CJK 笔画 + * 3200-32FF 封闭式 CJK 文字和月份 --ignore. + * 3300-33FF CJK 兼容 + * 3400-4DBF CJK 统一表意符号扩展 A + * 4DC0-4DFF 易经六十四卦符号 + * 4E00-9FBF CJK 统一表意符号 + * F900-FAFF CJK 兼容象形文字 + * FE30-FE4F CJK 兼容形式 + * FF00-FFEF 全角ASCII、全角标点 --ignore (as basic latin) * * Japanese: - * 3040-309F 日本平假名 - * 30A0-30FF 日本片假名 - * 31F0-31FF 日本片假名拼音扩展 + * 3040-309F 日本平假名 + * 30A0-30FF 日本片假名 + * 31F0-31FF 日本片假名拼音扩展 * * Korean: - * AC00-D7AF 韩文拼音 - * 1100-11FF 韩文字母 - * 3130-318F 韩文兼容字母 + * AC00-D7AF 韩文拼音 + * 1100-11FF 韩文字母 + * 3130-318F 韩文兼容字母 * * @param ch :pointer to the char * @return int : 1 for yes and 0 for not. @@ -211,23 +211,23 @@ FRISO_API int utf8_cjk_string( uint_t u ) //Chinese. #ifdef FRISO_CJK_CHK_C c = ( ( u >= 0x4E00 && u <= 0x9FBF ) - || ( u >= 0x2E80 && u <= 0x2EFF ) || ( u >= 0x2F00 && u <= 0x2FDF ) - || ( u >= 0x31C0 && u <= 0x31EF ) //|| ( u >= 0x3200 && u <= 0x32FF ) - || ( u >= 0x3300 && u <= 0x33FF ) //|| ( u >= 0x3400 && u <= 0x4DBF ) - || ( u >= 0x4DC0 && u <= 0x4DFF ) || ( u >= 0xF900 && u <= 0xFAFF ) - || ( u >= 0xFE30 && u <= 0xFE4F ) ); + || ( u >= 0x2E80 && u <= 0x2EFF ) || ( u >= 0x2F00 && u <= 0x2FDF ) + || ( u >= 0x31C0 && u <= 0x31EF ) //|| ( u >= 0x3200 && u <= 0x32FF ) + || ( u >= 0x3300 && u <= 0x33FF ) //|| ( u >= 0x3400 && u <= 0x4DBF ) + || ( u >= 0x4DC0 && u <= 0x4DFF ) || ( u >= 0xF900 && u <= 0xFAFF ) + || ( u >= 0xFE30 && u <= 0xFE4F ) ); #endif //Japanese. #ifdef FRISO_CJK_CHK_J j = ( ( u >= 0x3040 && u <= 0x309F ) - || ( u >= 0x30A0 && u <= 0x30FF ) || ( u >= 0x31F0 && u <= 0x31FF ) ); + || ( u >= 0x30A0 && u <= 0x30FF ) || ( u >= 0x31F0 && u <= 0x31FF ) ); #endif //Korean #ifdef FRISO_CJK_CHK_K k = ( ( u >= 0xAC00 && u <= 0xD7AF ) - || ( u >= 0x1100 && u <= 0x11FF ) || ( u >= 0x3130 && u <= 0x318F ) ); + || ( u >= 0x1100 && u <= 0x11FF ) || ( u >= 0x3130 && u <= 0x318F ) ); #endif return ( c || j || k ); @@ -235,7 +235,7 @@ FRISO_API int utf8_cjk_string( uint_t u ) /* * check the given char is a Basic Latin letter or not. - * include all the letters and english punctuations. + * include all the letters and english punctuations. * * @param c * @return int 1 for yes and 0 for not. @@ -247,21 +247,21 @@ FRISO_API int utf8_halfwidth_en_char( uint_t u ) /* * check the given char is a full-width latain or not. - * include the full-width arabic numeber, letters. - * but not the full-width punctuations. + * include the full-width arabic numeber, letters. + * but not the full-width punctuations. * * @param c * @return int */ FRISO_API int utf8_fullwidth_en_char( uint_t u ) { - return ( (u >= 65296 && u <= 65305 ) //arabic number - || ( u >= 65313 && u <= 65338 ) //upper case letters - || ( u >= 65345 && u <= 65370 ) ); //lower case letters + return ( (u >= 65296 && u <= 65305 ) //arabic number + || ( u >= 65313 && u <= 65338 ) //upper case letters + || ( u >= 65345 && u <= 65370 ) ); //lower case letters } //check the given char is a upper case letters or not. -// included the full-width and half-width letters. +// included the full-width and half-width letters. FRISO_API int utf8_uppercase_letter( uint_t u ) { if ( u > 65280 ) u -= 65248; @@ -269,7 +269,7 @@ FRISO_API int utf8_uppercase_letter( uint_t u ) } //check the given char is a upper case letters or not. -// included the full-width and half-width letters. +// included the full-width and half-width letters. FRISO_API int utf8_lowercase_letter( uint_t u ) { if ( u > 65280 ) u -= 65248; @@ -277,25 +277,25 @@ FRISO_API int utf8_lowercase_letter( uint_t u ) } //check the given char is a numeric -// included the full-width and half-width arabic numeric. +// included the full-width and half-width arabic numeric. FRISO_API int utf8_numeric_letter( uint_t u ) { - if ( u > 65280 ) u -= 65248; //make full-width half-width. + if ( u > 65280 ) u -= 65248; //make full-width half-width. return ( ( u >= 48 && u <= 57 ) ); } //check the given char is a english letter.(included the full-width) -// not the punctuation of course. +// not the punctuation of course. FRISO_API int utf8_en_letter( uint_t u ) { if ( u > 65280 ) u -= 65248; return ( ( u >= 65 && u <= 90 ) - || ( u >= 97 && u <= 122 ) ); + || ( u >= 97 && u <= 122 ) ); } /* * check if the given fstring is make up with numeric. - * both full-width,half-width numeric is ok. + * both full-width,half-width numeric is ok. * * @param str * @return int @@ -317,22 +317,22 @@ FRISO_API int utf8_numeric_string( const fstring str ) while ( *s != '\0' ) { - //if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) { - // return 0; - //} + //if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) { + // return 0; + //} - //new implemention. - //@date 2013-10-14 - bytes = 1; - if ( *s < 0 ) //full-width chars. - { - u = get_utf8_unicode(s); - bytes = get_utf8_bytes(*s); - if ( u < 65296 || u > 65305 ) return 0; - } - else if ( *s < 48 || *s > 57 ) return 0; + //new implemention. + //@date 2013-10-14 + bytes = 1; + if ( *s < 0 ) //full-width chars. + { + u = get_utf8_unicode(s); + bytes = get_utf8_bytes(*s); + if ( u < 65296 || u > 65305 ) return 0; + } + else if ( *s < 48 || *s > 57 ) return 0; - s += bytes; + s += bytes; } return 1; @@ -347,24 +347,24 @@ FRISO_API int utf8_decimal_string( const fstring str ) for ( i = 1; i < len; bytes = 1 ) { - //count the number of char '.' - if ( str[i] == '.' ) - { - i++; - p++; - continue; - } + //count the number of char '.' + if ( str[i] == '.' ) + { + i++; + p++; + continue; + } - //full-width numeric. - else if ( str[i] < 0 ) - { - u = get_utf8_unicode(str+i); - bytes = get_utf8_bytes(str[i]); - if ( u < 65296 || u > 65305 ) return 0; - } - else if ( str[i] < 48 || str[i] > 57 ) return 0; + //full-width numeric. + else if ( str[i] < 0 ) + { + u = get_utf8_unicode(str+i); + bytes = get_utf8_bytes(str[i]); + if ( u < 65296 || u > 65305 ) return 0; + } + else if ( str[i] < 48 || str[i] > 57 ) return 0; - i += bytes; + i += bytes; } return (p == 1); @@ -379,7 +379,7 @@ FRISO_API int utf8_decimal_string( const fstring str ) FRISO_API int utf8_whitespace( uint_t u ) { if ( u == 32 || u == 12288 ) - return 1; + return 1; return 0; } @@ -392,16 +392,16 @@ FRISO_API int utf8_whitespace( uint_t u ) */ FRISO_API int utf8_en_punctuation( uint_t u ) { - //if ( u > 65280 ) u = u - 65248; //make full-width half-width + //if ( u > 65280 ) u = u - 65248; //make full-width half-width return ( (u > 32 && u < 48) - || ( u > 57 && u < 65 ) - || ( u > 90 && u < 97 ) //added @2013-08-31 - || ( u > 122 && u < 127 ) ); + || ( u > 57 && u < 65 ) + || ( u > 90 && u < 97 ) //added @2013-08-31 + || ( u > 122 && u < 127 ) ); } /* * check the given char is a chinese punctuation. - * @date 2013-08-31 added. + * @date 2013-08-31 added. * * @param ch * @return int @@ -409,17 +409,17 @@ FRISO_API int utf8_en_punctuation( uint_t u ) FRISO_API int utf8_cn_punctuation( uint_t u ) { return ( ( u > 65280 && u < 65296 ) - || ( u > 65305 && u < 65312 ) - || ( u > 65338 && u < 65345 ) - || ( u > 65370 && u < 65382 ) - //cjk symbol and punctuation.(added 2013-09-06) - //from http://www.unicode.org/charts/PDF/U3000.pdf - || ( u >= 12289 && u <= 12319) ); + || ( u > 65305 && u < 65312 ) + || ( u > 65338 && u < 65345 ) + || ( u > 65370 && u < 65382 ) + //cjk symbol and punctuation.(added 2013-09-06) + //from http://www.unicode.org/charts/PDF/U3000.pdf + || ( u >= 12289 && u <= 12319) ); } /* * check if the given char is a letter number in unicode. - * like 'ⅠⅡ'. + * like 'ⅠⅡ'. * @param ch * @return int */ @@ -430,7 +430,7 @@ FRISO_API int utf8_letter_number( uint_t u ) /* * check if the given char is a other number in unicode. - * like '①⑩⑽㈩'. + * like '①⑩⑽㈩'. * @param ch * @return int */ @@ -456,19 +456,19 @@ FRISO_API int utf8_other_number( uint_t u ) //{ // if ( __keep_punctuations_hash__ == NULL ) // { -// __keep_punctuations_hash__ = new_hash_table(); -// hash_put_mapping( __keep_punctuations_hash__, "@", NULL ); -// //hash_put_mapping( __keep_punctuations_hash__, "$", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "%", NULL ); -// //hash_put_mapping( __keep_punctuations_hash__, "^", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "&", NULL ); -// //hash_put_mapping( __keep_punctuations_hash__, "-", NULL ); -// //hash_put_mapping( __keep_punctuations_hash__, ":", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, ".", NULL ); -// //hash_put_mapping( __keep_punctuations_hash__, "/", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "'", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "#", NULL ); -// hash_put_mapping( __keep_punctuations_hash__, "+", NULL ); +// __keep_punctuations_hash__ = new_hash_table(); +// hash_put_mapping( __keep_punctuations_hash__, "@", NULL ); +// //hash_put_mapping( __keep_punctuations_hash__, "$", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "%", NULL ); +// //hash_put_mapping( __keep_punctuations_hash__, "^", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "&", NULL ); +// //hash_put_mapping( __keep_punctuations_hash__, "-", NULL ); +// //hash_put_mapping( __keep_punctuations_hash__, ":", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, ".", NULL ); +// //hash_put_mapping( __keep_punctuations_hash__, "/", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "'", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "#", NULL ); +// hash_put_mapping( __keep_punctuations_hash__, "+", NULL ); // } // //check the hash. // return hash_exist_mapping( __keep_punctuations_hash__, str ); @@ -484,7 +484,7 @@ FRISO_API int utf8_other_number( uint_t u ) //FRISO_API int utf8_fullwidth_char( uint_t u ) //{ // if ( u == 12288 ) -// return 1; //full-width space +// return 1; //full-width space // //(32 - 126) ascii code // return (u > 65280 && u <= 65406); //} diff --git a/src/friso_array.c b/src/friso_array.c index f338d96..84724ae 100644 --- a/src/friso_array.c +++ b/src/friso_array.c @@ -1,9 +1,9 @@ /* * friso dynamaic interface implemented functions file - * that defined in header file "friso_API.h". - * never use it for commercial use. + * that defined in header file "friso_API.h". + * never use it for commercial use. * - * @author chenxini + * @author chenxini */ #include "friso_API.h" @@ -14,37 +14,37 @@ **********************************************/ __STATIC_API__ void **create_array_entries( uint_t __blocks ) { - register uint_t t; - void **block = ( void ** ) FRISO_CALLOC( sizeof( void * ), __blocks ); - if ( block == NULL ) { - ___ALLOCATION_ERROR___ - } + register uint_t t; + void **block = ( void ** ) FRISO_CALLOC( sizeof( void * ), __blocks ); + if ( block == NULL ) { + ___ALLOCATION_ERROR___ + } - //initialize - for ( t = 0; t < __blocks; t++ ) { - block[t] = NULL; - } + //initialize + for ( t = 0; t < __blocks; t++ ) { + block[t] = NULL; + } - return block; + return block; } //resize the array. (the opacity should not be smaller than array->length) __STATIC_API__ friso_array_t resize_array_list( - friso_array_t array, - uint_t opacity ) + friso_array_t array, + uint_t opacity ) { - register uint_t t; - void **block = create_array_entries( opacity ); + register uint_t t; + void **block = create_array_entries( opacity ); - for ( t = 0; t < array->length ; t++ ) { - block[t] = array->items[t]; - } + for ( t = 0; t < array->length ; t++ ) { + block[t] = array->items[t]; + } - FRISO_FREE( array->items ); - array->items = block; - array->allocs = opacity; + FRISO_FREE( array->items ); + array->items = block; + array->allocs = opacity; - return array; + return array; } @@ -59,154 +59,154 @@ __STATIC_API__ friso_array_t resize_array_list( //create a new array list with a given opacity. FRISO_API friso_array_t new_array_list_with_opacity( uint_t opacity ) { - friso_array_t array = ( friso_array_t ) - FRISO_MALLOC( sizeof( friso_array_entry ) ); - if ( array == NULL ) { - ___ALLOCATION_ERROR___ - } + friso_array_t array = ( friso_array_t ) + FRISO_MALLOC( sizeof( friso_array_entry ) ); + if ( array == NULL ) { + ___ALLOCATION_ERROR___ + } - //initialize - array->items = create_array_entries( opacity ); - array->allocs = opacity; - array->length = 0; + //initialize + array->items = create_array_entries( opacity ); + array->allocs = opacity; + array->length = 0; - return array; + return array; } /* * free the given friso array. - * and its items, but never where its items item pointed to . + * and its items, but never where its items item pointed to . */ FRISO_API void free_array_list( friso_array_t array ) { - //free the allocation that all the items pointed to - //register int t; - //if ( flag == 1 ) { - // for ( t = 0; t < array->length; t++ ) { - // if ( array->items[t] == NULL ) continue; - // FRISO_FREE( array->items[t] ); - // array->items[t] = NULL; - // } - //} + //free the allocation that all the items pointed to + //register int t; + //if ( flag == 1 ) { + // for ( t = 0; t < array->length; t++ ) { + // if ( array->items[t] == NULL ) continue; + // FRISO_FREE( array->items[t] ); + // array->items[t] = NULL; + // } + //} - FRISO_FREE( array->items ); - FRISO_FREE( array ); + FRISO_FREE( array->items ); + FRISO_FREE( array ); } //add a new item to the array. FRISO_API void array_list_add( friso_array_t array, void *value ) { - //check the condition to resize. - if ( array->length == array->allocs ) { - resize_array_list( array, array->length * 2 + 1 ); - } - array->items[array->length++] = value; + //check the condition to resize. + if ( array->length == array->allocs ) { + resize_array_list( array, array->length * 2 + 1 ); + } + array->items[array->length++] = value; } //insert a new item at a specified position. FRISO_API void array_list_insert( - friso_array_t array, - uint_t idx, - void *value ) + friso_array_t array, + uint_t idx, + void *value ) { - register uint_t t; + register uint_t t; - if ( idx <= array->length ) - { - //check the condition to resize the array. - if ( array->length == array->allocs ) { - resize_array_list( array, array->length * 2 + 1 ); - } + if ( idx <= array->length ) + { + //check the condition to resize the array. + if ( array->length == array->allocs ) { + resize_array_list( array, array->length * 2 + 1 ); + } - //move the elements after idx. - //for ( t = idx; t < array->length; t++ ) { - // array->items[t+1] = array->items[t]; - //} - for ( t = array->length - 1; t >= idx; t-- ) - { - array->items[t+1] = array->items[t]; - } + //move the elements after idx. + //for ( t = idx; t < array->length; t++ ) { + // array->items[t+1] = array->items[t]; + //} + for ( t = array->length - 1; t >= idx; t-- ) + { + array->items[t+1] = array->items[t]; + } - array->items[idx] = value; - array->length++; - } + array->items[idx] = value; + array->length++; + } } //get the item at a specified position. FRISO_API void *array_list_get( friso_array_t array, uint_t idx ) { - if ( idx < array->length ) { - return array->items[idx]; - } - return NULL; + if ( idx < array->length ) { + return array->items[idx]; + } + return NULL; } //set the value of the item at a specified position. //this will return the old value. FRISO_API void * array_list_set( - friso_array_t array, - uint_t idx, - void * value ) + friso_array_t array, + uint_t idx, + void * value ) { - void * oval = NULL; - if ( idx < array->length ) - { - oval = array->items[idx]; - array->items[idx] = value; - } - return oval; + void * oval = NULL; + if ( idx < array->length ) + { + oval = array->items[idx]; + array->items[idx] = value; + } + return oval; } //remove the item at a specified position. //this will return the value of the removed item. FRISO_API void * array_list_remove( - friso_array_t array, uint_t idx ) + friso_array_t array, uint_t idx ) { - register uint_t t; - void *oval = NULL; + register uint_t t; + void *oval = NULL; - if ( idx < array->length ) - { - oval = array->items[idx]; - //move the elements after idx. - for ( t = idx; t < array->length - 1; t++ ) { - array->items[t] = array->items[ t + 1 ]; - } - array->items[array->length - 1] = NULL; - array->length--; - } + if ( idx < array->length ) + { + oval = array->items[idx]; + //move the elements after idx. + for ( t = idx; t < array->length - 1; t++ ) { + array->items[t] = array->items[ t + 1 ]; + } + array->items[array->length - 1] = NULL; + array->length--; + } - return oval; + return oval; } /*trim the array list*/ FRISO_API friso_array_t array_list_trim( friso_array_t array ) { - if ( array->length < array->allocs ) { - return resize_array_list( array, array->length ); - } - return array; + if ( array->length < array->allocs ) { + return resize_array_list( array, array->length ); + } + return array; } /* * clear the array list. - * this function will free all the allocations that the pointer pointed. - * but will not free the point array allocations, - * and will reset the length of it. + * this function will free all the allocations that the pointer pointed. + * but will not free the point array allocations, + * and will reset the length of it. */ FRISO_API friso_array_t array_list_clear( friso_array_t array ) { - register uint_t t; - //free all the allocations that the array->length's pointer pointed. - for ( t = 0; t < array->length; t++ ) { - /*if ( array->items[t] == NULL ) continue; - FRISO_FREE( array->items[t] ); */ - array->items[t] = NULL; - } - //attribute reset. - array->length = 0; + register uint_t t; + //free all the allocations that the array->length's pointer pointed. + for ( t = 0; t < array->length; t++ ) { + /*if ( array->items[t] == NULL ) continue; + FRISO_FREE( array->items[t] ); */ + array->items[t] = NULL; + } + //attribute reset. + array->length = 0; - return array; + return array; } //get the size of the array list. (A macro define has replace this.) diff --git a/src/friso_ctype.c b/src/friso_ctype.c index 3ef3019..9f5d17a 100644 --- a/src/friso_ctype.c +++ b/src/friso_ctype.c @@ -1,7 +1,7 @@ /** * friso string type check function interface, - * like english/CJK, full-wdith/half-width, punctuation or not. - * @ses friso_UTF8.c and friso_GBK.c for detail. + * like english/CJK, full-wdith/half-width, punctuation or not. + * @ses friso_UTF8.c and friso_GBK.c for detail. * * @author chenxin */ @@ -16,25 +16,25 @@ * @return int (true for cn string or false) * */ FRISO_API int friso_cn_string( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { if ( charset == FRISO_UTF8 ) - return utf8_cjk_string(task->unicode); + return utf8_cjk_string(task->unicode); else if ( charset == FRISO_GBK ) - return gbk_cn_string(task->buffer); + return gbk_cn_string(task->buffer); return 0; } //check if the specified word is a whitespace. FRISO_API int friso_whitespace( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { if ( charset == FRISO_UTF8 ) - return utf8_whitespace(task->unicode); + return utf8_whitespace(task->unicode); else if ( charset == FRISO_GBK ) - return gbk_whitespace(task->buffer); + return gbk_whitespace(task->buffer); return 0; } @@ -52,76 +52,76 @@ FRISO_API int friso_numeric_letter( //check if the specified word is aa english letter. FRISO_API int friso_en_letter( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { if ( charset == FRISO_UTF8 ) - return utf8_en_letter( ( uint_t ) task->text[task->idx]); + return utf8_en_letter( ( uint_t ) task->text[task->idx]); else if ( charset == FRISO_GBK ) - return gbk_en_letter( task->text + task->idx ); + return gbk_en_letter( task->text + task->idx ); return 0; } //check if the specified word is a half-width letter. -// punctuations are inclued. +// punctuations are inclued. FRISO_API int friso_halfwidth_en_char( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { if ( charset == FRISO_UTF8 ) - return utf8_halfwidth_en_char(task->unicode); + return utf8_halfwidth_en_char(task->unicode); else if ( charset == FRISO_GBK ) - return gbk_halfwidth_en_char(task->buffer[0]); + return gbk_halfwidth_en_char(task->buffer[0]); return 0; } //check if the specified word is a full-width letter. -// full-width punctuations are not included. +// full-width punctuations are not included. FRISO_API int friso_fullwidth_en_char( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { if ( charset == FRISO_UTF8 ) - return utf8_fullwidth_en_char( task->unicode ); + return utf8_fullwidth_en_char( task->unicode ); else if ( charset == FRISO_GBK ) - return gbk_fullwidth_en_char( task->buffer ); + return gbk_fullwidth_en_char( task->buffer ); return 0; } //check if the specified word is an english punctuations. FRISO_API int friso_en_punctuation( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { if ( charset == FRISO_UTF8 ) - return utf8_en_punctuation( task->unicode ); + return utf8_en_punctuation( task->unicode ); else if ( charset == FRISO_GBK ) - return gbk_en_punctuation( task->buffer[0] ); + return gbk_en_punctuation( task->buffer[0] ); return 0; } //check if the specified word ia sn chinese punctuation. FRISO_API int friso_cn_punctuation( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { if ( charset == FRISO_UTF8 ) - return utf8_cn_punctuation( task->unicode ); + return utf8_cn_punctuation( task->unicode ); else if ( charset == FRISO_GBK ) - return gbk_cn_punctuation( task->buffer ); + return gbk_cn_punctuation( task->buffer ); return 0; } FRISO_API int friso_letter_number( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { return 0; } FRISO_API int friso_other_number( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { return 0; } @@ -129,98 +129,98 @@ FRISO_API int friso_other_number( //check if the word is a keep punctuation. //@Deprecated //FRISO_API int friso_keep_punctuation( -// friso_charset_t charset, -// friso_task_t task ) +// friso_charset_t charset, +// friso_task_t task ) //{ // if ( charset == FRISO_UTF8 ) -// return utf8_keep_punctuation( task->buffer ); +// return utf8_keep_punctuation( task->buffer ); // else if ( charset == FRISO_GBK ) -// return gbk_keep_punctuation( task->buffer ); +// return gbk_keep_punctuation( task->buffer ); // return 0; //} //check if the specified char is en english punctuation. -// this function is the same as friso_en_punctuation. +// this function is the same as friso_en_punctuation. FRISO_API int is_en_punctuation( - friso_charset_t charset, char c ) + friso_charset_t charset, char c ) { if ( charset == FRISO_UTF8 ) - return utf8_en_punctuation( (uint_t) c); + return utf8_en_punctuation( (uint_t) c); else if ( charset == FRISO_GBK ) - return gbk_en_punctuation( c ); + return gbk_en_punctuation( c ); return 0; } //check the specified string is make up with numeric. FRISO_API int friso_numeric_string( - friso_charset_t charset, - char *buffer ) + friso_charset_t charset, + char *buffer ) { if ( charset == FRISO_UTF8 ) - return utf8_numeric_string( buffer ); + return utf8_numeric_string( buffer ); else if ( charset == FRISO_GBK ) - return gbk_numeric_string( buffer ); + return gbk_numeric_string( buffer ); return 0; } //check the specified string is a decimal string. FRISO_API int friso_decimal_string( - friso_charset_t charset, char *buffer ) + friso_charset_t charset, char *buffer ) { if ( charset == FRISO_UTF8 ) - return utf8_decimal_string( buffer ); + return utf8_decimal_string( buffer ); else if ( charset == FRISO_GBK ) - return gbk_decimal_string( buffer ); + return gbk_decimal_string( buffer ); return 0; } //check if the specified char is english uppercase letter. -// included full-width and half-width letters. +// included full-width and half-width letters. FRISO_API int friso_uppercase_letter( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { if ( charset == FRISO_UTF8 ) - return utf8_uppercase_letter( task->unicode ); + return utf8_uppercase_letter( task->unicode ); else if ( charset == FRISO_GBK ) - return gbk_uppercase_letter( task->buffer ); + return gbk_uppercase_letter( task->buffer ); return 0; } /* get the type of the specified char. - * the type will be the constants defined above. + * the type will be the constants defined above. * (include the fullwidth english char.) */ FRISO_API friso_enchar_t friso_enchar_type( - friso_charset_t charset, - friso_task_t task ) + friso_charset_t charset, + friso_task_t task ) { //Unicode or ASCII.(Both UTF-8 and GBK are valid) uint_t u = 0; if ( charset == FRISO_UTF8 ) { - u = task->unicode; - //if ( u >= 65280 ) u = 65280 - 65248; + u = task->unicode; + //if ( u >= 65280 ) u = 65280 - 65248; } else if ( charset == FRISO_GBK ) { - u = (uchar_t)task->buffer[0]; - //if ( u == 0xa3 ) ; //full-width. + u = (uchar_t)task->buffer[0]; + //if ( u == 0xa3 ) ; //full-width. } //range check. - if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW; - if ( u == 32 ) return FRISO_EN_WHITESPACE; - if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC; - if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER; - if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER; + if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW; + if ( u == 32 ) return FRISO_EN_WHITESPACE; + if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC; + if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER; + if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER; return FRISO_EN_PUNCTUATION; } /* get the type of the specified en char. - * the type will be the constants defined above. + * the type will be the constants defined above. * (the char should be half-width english char only) */ FRISO_API friso_enchar_t get_enchar_type( char ch ) @@ -228,11 +228,11 @@ FRISO_API friso_enchar_t get_enchar_type( char ch ) uint_t u = (uchar_t) ch; //range check. - if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW; - if ( u == 32 ) return FRISO_EN_WHITESPACE; - if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC; - if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER; - if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER; + if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW; + if ( u == 32 ) return FRISO_EN_WHITESPACE; + if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC; + if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER; + if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER; return FRISO_EN_PUNCTUATION; } diff --git a/src/friso_ctype.h b/src/friso_ctype.h index 100edbe..d881223 100644 --- a/src/friso_ctype.h +++ b/src/friso_ctype.h @@ -1,9 +1,9 @@ /** * Friso charset about function interface header file. - * @package src/friso_charset.h . + * @package src/friso_charset.h . * Available charset for now: - * 1. UTF8 - function start with utf8 - * 2. GBK - function start with gbk + * 1. UTF8 - function start with utf8 + * 2. GBK - function start with gbk * * @author chenxin */ @@ -33,11 +33,11 @@ FRISO_API int friso_numeric_letter(friso_charset_t, friso_task_t); FRISO_API int friso_en_letter( friso_charset_t, friso_task_t ); //check if the specified word is a half-width letter. -// punctuations are inclued. +// punctuations are inclued. FRISO_API int friso_halfwidth_en_char( friso_charset_t, friso_task_t ); //check if the specified word is a full-width letter. -// full-width punctuations are not included. +// full-width punctuations are not included. FRISO_API int friso_fullwidth_en_char( friso_charset_t, friso_task_t ); //check if the specified word is an english punctuations. @@ -60,32 +60,32 @@ FRISO_API int friso_numeric_string( friso_charset_t, char * ); FRISO_API int friso_decimal_string( friso_charset_t, char * ); //check if the specified char is english uppercase letter. -// included full-width and half-width letters. +// included full-width and half-width letters. FRISO_API int friso_uppercase_letter( friso_charset_t, friso_task_t ); //en char type. -//#define FRISO_EN_LETTER 0 //a-z && A-Z -//#define FRISO_EN_NUMERIC 1 //0-9 -//#define FRISO_EN_PUNCTUATION 2 //english punctuations -//#define FRISO_EN_WHITESPACE 3 //whitespace -//#define FRISO_EN_UNKNOW -1 //beyond 32-122 +//#define FRISO_EN_LETTER 0 //a-z && A-Z +//#define FRISO_EN_NUMERIC 1 //0-9 +//#define FRISO_EN_PUNCTUATION 2 //english punctuations +//#define FRISO_EN_WHITESPACE 3 //whitespace +//#define FRISO_EN_UNKNOW -1 //beyond 32-122 typedef enum { - FRISO_EN_LETTER = 0, //A-Z, a-z - FRISO_EN_NUMERIC = 1, //0-9 - FRISO_EN_PUNCTUATION = 2, //english punctuations - FRISO_EN_WHITESPACE = 3, //whitespace - FRISO_EN_UNKNOW = -1 //unkow(beyond 32-126) + FRISO_EN_LETTER = 0, //A-Z, a-z + FRISO_EN_NUMERIC = 1, //0-9 + FRISO_EN_PUNCTUATION = 2, //english punctuations + FRISO_EN_WHITESPACE = 3, //whitespace + FRISO_EN_UNKNOW = -1 //unkow(beyond 32-126) } friso_enchar_t; /* get the type of the specified char. - * the type will be the constants defined above. + * the type will be the constants defined above. * (include the fullwidth english char.) */ FRISO_API friso_enchar_t friso_enchar_type( friso_charset_t, friso_task_t ); /* get the type of the specified en char. - * the type will be the constants defined above. + * the type will be the constants defined above. * (the char should be half-width english char only) */ FRISO_API friso_enchar_t get_enchar_type( char ); @@ -99,7 +99,7 @@ FRISO_API friso_enchar_t get_enchar_type( char ); /* read the next utf-8 word from the specified position. * - * @return int the bytes of the current readed word. + * @return int the bytes of the current readed word. */ FRISO_API int utf8_next_word( friso_task_t, uint_t *, fstring ); @@ -116,31 +116,31 @@ FRISO_API int unicode_to_utf8( uint_t, fstring ); FRISO_API int utf8_cjk_string( uint_t ) ; /*check the given char is a Basic Latin letter or not. - * include all the letters and english puntuations.*/ + * include all the letters and english puntuations.*/ FRISO_API int utf8_halfwidth_en_char( uint_t ); /* * check the given char is a full-width latain or not. - * include the full-width arabic numeber, letters. - * but not the full-width puntuations. + * include the full-width arabic numeber, letters. + * but not the full-width puntuations. */ FRISO_API int utf8_fullwidth_en_char( uint_t ); //check the given char is a upper case letter or not. -// included all the full-width and half-width letters. +// included all the full-width and half-width letters. FRISO_API int utf8_uppercase_letter( uint_t ); //check the given char is a lower case letter or not. -// included all the full-width and half-width letters. +// included all the full-width and half-width letters. FRISO_API int utf8_lowercase_letter( uint_t ); //check the given char is a numeric. -// included the full-width and half-width arabic numeric. +// included the full-width and half-width arabic numeric. FRISO_API int utf8_numeric_letter( uint_t ); /* * check if the given fstring is make up with numeric chars. - * both full-width,half-width numeric is ok. + * both full-width,half-width numeric is ok. */ FRISO_API int utf8_numeric_string( char * ); @@ -183,7 +183,7 @@ FRISO_API int is_en_punctuation( friso_charset_t, char ); /* read the next GBK word from the specified position. * - * @return int the bytes of the current readed word. + * @return int the bytes of the current readed word. */ FRISO_API int gbk_next_word( friso_task_t, uint_t *, fstring ); @@ -194,31 +194,31 @@ FRISO_API int get_gbk_bytes( char ); FRISO_API int gbk_cn_string( char * ) ; /*check if the given char is a ASCII letter - * include all the letters and english puntuations.*/ + * include all the letters and english puntuations.*/ FRISO_API int gbk_halfwidth_en_char( char ); /* * check if the given char is a full-width latain. - * include the full-width arabic numeber, letters. - * but not the full-width puntuations. + * include the full-width arabic numeber, letters. + * but not the full-width puntuations. */ FRISO_API int gbk_fullwidth_en_char( char * ); //check if the given char is a upper case char. -// included all the full-width and half-width letters. +// included all the full-width and half-width letters. FRISO_API int gbk_uppercase_letter( char * ); //check if the given char is a lower case char. -// included all the full-width and half-width letters. +// included all the full-width and half-width letters. FRISO_API int gbk_lowercase_letter( char * ); //check if the given char is a numeric. -// included the full-width and half-width arabic numeric. +// included the full-width and half-width arabic numeric. FRISO_API int gbk_numeric_letter( char * ); /* * check if the given fstring is make up with numeric chars. - * both full-width,half-width numeric is ok. + * both full-width,half-width numeric is ok. */ FRISO_API int gbk_numeric_string( char * ); @@ -248,7 +248,7 @@ FRISO_API int gbk_en_punctuation( char ) ; FRISO_API int gbk_cn_punctuation( char * ); //cause the logic handle is the same as the utf8. -// here invoke the utf8 interface directly. +// here invoke the utf8 interface directly. //FRISO_API int gbk_keep_punctuation( char * ); //@Deprecated //#define gbk_keep_punctuation( str ) utf8_keep_punctuation(str) @@ -257,4 +257,4 @@ FRISO_API int gbk_cn_punctuation( char * ); //FRISO_API int gbk_fullwidth_char( char * ) ; /* }}}*/ -#endif /*end _friso_charset_h*/ +#endif /*end _friso_charset_h*/ diff --git a/src/friso_hash.c b/src/friso_hash.c index 400487f..b7efd6d 100644 --- a/src/friso_hash.c +++ b/src/friso_hash.c @@ -1,8 +1,8 @@ /* * friso hash table implements functions - * defined in header file "friso_API.h". + * defined in header file "friso_API.h". * - * @author chenxin + * @author chenxin */ #include "friso_API.h" #include @@ -10,7 +10,7 @@ //-166411799L //31 131 1331 13331 133331 .. -//31 131 1313 13131 131313 .. the best +//31 131 1313 13131 131313 .. the best #define HASH_FACTOR 1313131 /* ************************ @@ -22,7 +22,7 @@ __STATIC_API__ uint_t hash( fstring str, uint_t length ) uint_t h = 0; while ( *str != '\0' ) - h = h * HASH_FACTOR + ( *str++ ); + h = h * HASH_FACTOR + ( *str++ ); return (h % length); } @@ -32,13 +32,13 @@ __STATIC_API__ int is_prime( int n ) { int j; if ( n == 2 || n == 3 ) - return 1; + return 1; if ( n == 1 || n % 2 == 0 ) - return 0; + return 0; for ( j = 3; j * j < n; j++ ) - if ( n % j == 0 ) - return 0; + if ( n % j == 0 ) + return 0; return 1; } @@ -47,7 +47,7 @@ __STATIC_API__ int is_prime( int n ) __STATIC_API__ int next_prime( int n ) { if ( n % 2 == 0 ) - n++; + n++; for ( ; ! is_prime( n ); n = n + 2 ) ; return n; @@ -72,14 +72,14 @@ __STATIC_API__ int next_prime( int n ) * static hashtable function area. * ***********************************/ __STATIC_API__ hash_entry_t new_hash_entry( - fstring key, - void * value, - hash_entry_t next ) + fstring key, + void * value, + hash_entry_t next ) { hash_entry_t e = ( hash_entry_t ) - FRISO_MALLOC( sizeof( friso_hash_entry ) ); + FRISO_MALLOC( sizeof( friso_hash_entry ) ); if ( e == NULL ) { - ___ALLOCATION_ERROR___ + ___ALLOCATION_ERROR___ } //e->_key = string_copy( key ); @@ -95,13 +95,13 @@ __STATIC_API__ hash_entry_t * create_hash_entries( uint_t blocks ) { register uint_t t; hash_entry_t *e = ( hash_entry_t * ) - FRISO_CALLOC( sizeof( hash_entry_t ), blocks ); + FRISO_CALLOC( sizeof( hash_entry_t ), blocks ); if ( e == NULL ) { - ___ALLOCATION_ERROR___ + ___ALLOCATION_ERROR___ } for ( t = 0; t < blocks; t++ ) { - e[t] = NULL; + e[t] = NULL; } return e; @@ -114,22 +114,22 @@ __STATIC_API__ void rebuild_hash( friso_hash_t _hash ) //find the next prime as the length of the hashtable. uint_t t, length = next_prime( _hash->length * 2 + 1 ); hash_entry_t e, next, *_src = _hash->table, \ - *table = create_hash_entries( length ); + *table = create_hash_entries( length ); uint_t bucket; //copy the nodes for ( t = 0; t < _hash->length; t++ ) { - e = *( _src + t ); - if ( e != NULL ) { - do { - next = e->_next; - bucket = hash( e->_key, length ); - e->_next = table[bucket]; - table[bucket] = e; - e = next; - } while ( e != NULL ); - } + e = *( _src + t ); + if ( e != NULL ) { + do { + next = e->_next; + bucket = hash( e->_key, length ); + e->_next = table[bucket]; + table[bucket] = e; + e = next; + } while ( e != NULL ); + } } _hash->table = table; @@ -149,35 +149,35 @@ FRISO_API friso_hash_t new_hash_table( void ) { friso_hash_t _hash = ( friso_hash_t ) FRISO_MALLOC( sizeof ( friso_hash_cdt ) ); if ( _hash == NULL ) { - ___ALLOCATION_ERROR___ + ___ALLOCATION_ERROR___ } //initialize the the hashtable - _hash->length = DEFAULT_LENGTH; - _hash->size = 0; - _hash->factor = DEFAULT_FACTOR; - _hash->threshold = ( uint_t ) ( _hash->length * _hash->factor ); - _hash->table = create_hash_entries( _hash->length ); + _hash->length = DEFAULT_LENGTH; + _hash->size = 0; + _hash->factor = DEFAULT_FACTOR; + _hash->threshold = ( uint_t ) ( _hash->length * _hash->factor ); + _hash->table = create_hash_entries( _hash->length ); return _hash; } FRISO_API void free_hash_table( - friso_hash_t _hash, - fhash_callback_fn_t fentry_func ) + friso_hash_t _hash, + fhash_callback_fn_t fentry_func ) { register uint_t j; hash_entry_t e, n; for ( j = 0; j < _hash->length; j++ ) { - e = *( _hash->table + j ); - for ( ; e != NULL ; ) { - n = e->_next; - if ( fentry_func != NULL ) fentry_func(e); - FRISO_FREE( e ); - e = n; - } + e = *( _hash->table + j ); + for ( ; e != NULL ; ) { + n = e->_next; + if ( fentry_func != NULL ) fentry_func(e); + FRISO_FREE( e ); + e = n; + } } //free the pointer array block ( 4 * htable->length continuous bytes ). @@ -189,9 +189,9 @@ FRISO_API void free_hash_table( //put a new mapping insite. //the value cannot be NULL. FRISO_API void *hash_put_mapping( - friso_hash_t _hash, - fstring key, - void * value ) + friso_hash_t _hash, + fstring key, + void * value ) { uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length ); hash_entry_t e = *( _hash->table + bucket ); @@ -200,14 +200,14 @@ FRISO_API void *hash_put_mapping( //check the given key is already exists or not. for ( ; e != NULL; e = e->_next ) { - if ( key == e->_key - || ( key != NULL && e->_key != NULL - && strcmp( key, e->_key ) == 0 ) ) - { + if ( key == e->_key + || ( key != NULL && e->_key != NULL + && strcmp( key, e->_key ) == 0 ) ) + { oval = e->_val; //bak the old value - e->_val = value; - return oval; - } + e->_val = value; + return oval; + } } //put a new mapping into the hashtable. @@ -216,27 +216,27 @@ FRISO_API void *hash_put_mapping( //check the condition to rebuild the hashtable. if ( _hash->size >= _hash->threshold ) - rebuild_hash( _hash ); + rebuild_hash( _hash ); return oval; } //check the existence of the mapping associated with the given key. FRISO_API int hash_exist_mapping( - friso_hash_t _hash, fstring key ) + friso_hash_t _hash, fstring key ) { uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length ); hash_entry_t e; for ( e = *( _hash->table + bucket ); - e != NULL; - e = e->_next ) { - if ( key == e->_key - || ( key != NULL && e->_key != NULL - && strcmp( key, e->_key ) == 0 )) - { - return 1; - } + e != NULL; + e = e->_next ) { + if ( key == e->_key + || ( key != NULL && e->_key != NULL + && strcmp( key, e->_key ) == 0 )) + { + return 1; + } } return 0; @@ -249,14 +249,14 @@ FRISO_API void *hash_get_value( friso_hash_t _hash, fstring key ) hash_entry_t e; for ( e = *( _hash->table + bucket ); - e != NULL; - e = e->_next ) { - if ( key == e->_key - || ( key != NULL && e->_key != NULL - && strcmp( key, e->_key ) == 0 )) - { - return e->_val; - } + e != NULL; + e = e->_next ) { + if ( key == e->_key + || ( key != NULL && e->_key != NULL + && strcmp( key, e->_key ) == 0 )) + { + return e->_val; + } } return NULL; @@ -264,31 +264,31 @@ FRISO_API void *hash_get_value( friso_hash_t _hash, fstring key ) //remove the mapping associated with the given key. FRISO_API hash_entry_t hash_remove_mapping( - friso_hash_t _hash, fstring key ) + friso_hash_t _hash, fstring key ) { uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length ); hash_entry_t e, prev = NULL; hash_entry_t b; for ( e = *( _hash->table + bucket ); - e != NULL; - prev = e, e = e->_next ) { - if ( key == e->_key - || ( key != NULL && e->_key != NULL - && strcmp( key, e->_key ) == 0 ) ) - { - b = e; - //the node located at *( htable->table + bucket ) - if ( prev == NULL ) { - _hash->table[bucket] = e->_next; - } else { - prev->_next = e->_next; - } - //printf("%s was removed\n", b->_key); - _hash->size--; - //FRISO_FREE( b ); - return b; - } + e != NULL; + prev = e, e = e->_next ) { + if ( key == e->_key + || ( key != NULL && e->_key != NULL + && strcmp( key, e->_key ) == 0 ) ) + { + b = e; + //the node located at *( htable->table + bucket ) + if ( prev == NULL ) { + _hash->table[bucket] = e->_next; + } else { + prev->_next = e->_next; + } + //printf("%s was removed\n", b->_key); + _hash->size--; + //FRISO_FREE( b ); + return b; + } } return NULL; diff --git a/src/friso_lexicon.c b/src/friso_lexicon.c index 67e1300..dc726cc 100644 --- a/src/friso_lexicon.c +++ b/src/friso_lexicon.c @@ -1,102 +1,102 @@ /* * friso lexicon implemented functions. - * used to deal with the friso lexicon, like: load,remove,match... + * used to deal with the friso lexicon, like: load,remove,match... * - * @author chenxin + * @author chenxin */ #include #include #include "friso_API.h" #include "friso.h" -#define __SPLIT_MAX_TOKENS__ 5 -#define __LEX_FILE_DELIME__ '#' -#define __FRISO_LEX_IFILE__ "friso.lex.ini" +#define __SPLIT_MAX_TOKENS__ 5 +#define __LEX_FILE_DELIME__ '#' +#define __FRISO_LEX_IFILE__ "friso.lex.ini" //create a new lexicon FRISO_API friso_dic_t friso_dic_new() { - register uint_t t; - friso_dic_t dic = ( friso_dic_t ) FRISO_CALLOC( - sizeof( friso_hash_t ), __FRISO_LEXICON_LENGTH__ ); - if ( dic == NULL ) { - ___ALLOCATION_ERROR___ - } + register uint_t t; + friso_dic_t dic = ( friso_dic_t ) FRISO_CALLOC( + sizeof( friso_hash_t ), __FRISO_LEXICON_LENGTH__ ); + if ( dic == NULL ) { + ___ALLOCATION_ERROR___ + } - for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) { - dic[t] = new_hash_table(); - } + for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) { + dic[t] = new_hash_table(); + } - return dic; + return dic; } /** * default callback function to invoke - * when free the friso dictionary . + * when free the friso dictionary . * * @date 2013-06-12 */ __STATIC_API__ void default_fdic_callback( hash_entry_t e ) { - register uint_t i; - friso_array_t syn; - lex_entry_t lex = ( lex_entry_t ) e->_val; - //free the lex->word - FRISO_FREE( lex->word ); - //free the lex->syn if it is not NULL - if ( lex->syn != NULL ) - { - syn = lex->syn; - for ( i = 0; i < syn->length; i++ ) { - FRISO_FREE( syn->items[i] ); - } - free_array_list( syn ); - } + register uint_t i; + friso_array_t syn; + lex_entry_t lex = ( lex_entry_t ) e->_val; + //free the lex->word + FRISO_FREE( lex->word ); + //free the lex->syn if it is not NULL + if ( lex->syn != NULL ) + { + syn = lex->syn; + for ( i = 0; i < syn->length; i++ ) { + FRISO_FREE( syn->items[i] ); + } + free_array_list( syn ); + } - //free the e->_val - //@date 2014-01-28 posted by mlemay@gmail.com - FRISO_FREE(lex); + //free the e->_val + //@date 2014-01-28 posted by mlemay@gmail.com + FRISO_FREE(lex); } FRISO_API void friso_dic_free( friso_dic_t dic ) { - register uint_t t; - for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) { - //free the hash table - free_hash_table( dic[t], default_fdic_callback ); - } + register uint_t t; + for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) { + //free the hash table + free_hash_table( dic[t], default_fdic_callback ); + } - FRISO_FREE( dic ); + FRISO_FREE( dic ); } //create a new lexicon entry FRISO_API lex_entry_t new_lex_entry( - fstring word, - friso_array_t syn, - uint_t fre, - uint_t length, - uint_t type ) + fstring word, + friso_array_t syn, + uint_t fre, + uint_t length, + uint_t type ) { - lex_entry_t e = ( lex_entry_t ) - FRISO_MALLOC( sizeof( lex_entry_cdt ) ); - if ( e == NULL ) { - ___ALLOCATION_ERROR___ - } + lex_entry_t e = ( lex_entry_t ) + FRISO_MALLOC( sizeof( lex_entry_cdt ) ); + if ( e == NULL ) { + ___ALLOCATION_ERROR___ + } - //initialize. - e->word = word; - e->syn = syn; //synoyum words array list. - e->pos = NULL; //part of speech array list. - //e->py = NULL; //set to NULL first. - e->fre = fre; - e->length = (uchar_t) length; //length - e->rlen = (uchar_t) length; //set to length by default. - e->type = (uchar_t) type; //type - e->ctrlMask = 0; //control mask. - e->offset = -1; + //initialize. + e->word = word; + e->syn = syn; //synoyum words array list. + e->pos = NULL; //part of speech array list. + //e->py = NULL; //set to NULL first. + e->fre = fre; + e->length = (uchar_t) length; //length + e->rlen = (uchar_t) length; //set to length by default. + e->type = (uchar_t) type; //type + e->ctrlMask = 0; //control mask. + e->offset = -1; - return e; + return e; } /** @@ -109,64 +109,64 @@ FRISO_API lex_entry_t new_lex_entry( */ FRISO_API void free_lex_entry( lex_entry_t e ) { - //if ( e->syn != NULL ) { - // if ( flag == 1 ) free_array_list( e->syn); - // else free_array_list( e->syn ); - //} - FRISO_FREE( e ); + //if ( e->syn != NULL ) { + // if ( flag == 1 ) free_array_list( e->syn); + // else free_array_list( e->syn ); + //} + FRISO_FREE( e ); } //add a new entry to the dictionary. FRISO_API void friso_dic_add( - friso_dic_t dic, - friso_lex_t lex, - fstring word, - friso_array_t syn ) + friso_dic_t dic, + friso_lex_t lex, + fstring word, + friso_array_t syn ) { - if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) - { - //printf("lex=%d, word=%s, syn=%s\n", lex, word, syn); - hash_put_mapping( dic[lex], word, - new_lex_entry( word, syn, 0, - (uint_t) strlen(word), (uint_t) lex ) ); - } + if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) + { + //printf("lex=%d, word=%s, syn=%s\n", lex, word, syn); + hash_put_mapping( dic[lex], word, + new_lex_entry( word, syn, 0, + (uint_t) strlen(word), (uint_t) lex ) ); + } } FRISO_API void friso_dic_add_with_fre( - friso_dic_t dic, - friso_lex_t lex, - fstring word, - friso_array_t syn, - uint_t frequency ) + friso_dic_t dic, + friso_lex_t lex, + fstring word, + friso_array_t syn, + uint_t frequency ) { - if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) { - hash_put_mapping( dic[lex], word, - new_lex_entry( word, syn, frequency, - ( uint_t ) strlen(word), ( uint_t ) lex ) ); - } + if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) { + hash_put_mapping( dic[lex], word, + new_lex_entry( word, syn, frequency, + ( uint_t ) strlen(word), ( uint_t ) lex ) ); + } } /* * read a line from a specified stream. - * the newline will be cleared. + * the newline will be cleared. * - * @date 2012-11-24 + * @date 2012-11-24 */ FRISO_API fstring file_get_line( fstring __dst, FILE * _stream ) { - register int c; - fstring cs; + register int c; + fstring cs; - cs = __dst; - while ( ( c = fgetc( _stream ) ) != EOF ) - { - if ( c == '\n' ) break; - *cs++ = c; - } - *cs = '\0'; + cs = __dst; + while ( ( c = fgetc( _stream ) ) != EOF ) + { + if ( c == '\n' ) break; + *cs++ = c; + } + *cs = '\0'; - return ( c == EOF && cs == __dst ) ? NULL : __dst; + return ( c == EOF && cs == __dst ) ? NULL : __dst; } /* @@ -174,373 +174,373 @@ FRISO_API fstring file_get_line( fstring __dst, FILE * _stream ) */ ///instead of memcpy __STATIC_API__ fstring string_copy( - fstring _src, - fstring __dst, - uint_t blocks ) + fstring _src, + fstring __dst, + uint_t blocks ) { - register fstring __src = _src; - register uint_t t; + register fstring __src = _src; + register uint_t t; - for ( t = 0; t < blocks; t++ ) { - if ( *__src == '\0' ) break; - __dst[t] = *__src++; - } - __dst[t] = '\0'; + for ( t = 0; t < blocks; t++ ) { + if ( *__src == '\0' ) break; + __dst[t] = *__src++; + } + __dst[t] = '\0'; - return __dst; + return __dst; } /** * make a heap allocation, and copy the - * source fstring to the new allocation, and - * you should free it after use it . + * source fstring to the new allocation, and + * you should free it after use it . * - * @param _src source fstring - * @param blocks number of bytes to copy + * @param _src source fstring + * @param blocks number of bytes to copy */ __STATIC_API__ fstring string_copy_heap( - fstring _src, uint_t blocks ) + fstring _src, uint_t blocks ) { - register uint_t t; + register uint_t t; - fstring str = ( fstring ) - FRISO_MALLOC( blocks + 1 ); - if ( str == NULL ) { - ___ALLOCATION_ERROR___; - } + fstring str = ( fstring ) + FRISO_MALLOC( blocks + 1 ); + if ( str == NULL ) { + ___ALLOCATION_ERROR___; + } - for ( t = 0; t < blocks; t++ ) { - if ( *_src == '\0' ) break; - str[t] = *_src++; - } + for ( t = 0; t < blocks; t++ ) { + if ( *_src == '\0' ) break; + str[t] = *_src++; + } - str[t] = '\0'; - return str; + str[t] = '\0'; + return str; } /* * find the postion of the first appear of the given char. - * address of the char in the fstring will be return . - * if not found NULL will be return . + * address of the char in the fstring will be return . + * if not found NULL will be return . */ __STATIC_API__ fstring indexOf( fstring __str, char delimiter ) { - uint_t i, __length__; + uint_t i, __length__; - __length__ = strlen( __str ); - for ( i = 0; i < __length__; i++ ) { - if ( __str[i] == delimiter ) - return __str + i; - } + __length__ = strlen( __str ); + for ( i = 0; i < __length__; i++ ) { + if ( __str[i] == delimiter ) + return __str + i; + } - return NULL; + return NULL; } /** * load all the valid wors from a specified lexicon file . * - * @param dic friso dictionary instance (A hash array) - * @param lex the lexicon type - * @param lex_file the path of the lexicon file - * @param length the maximum length of the word item + * @param dic friso dictionary instance (A hash array) + * @param lex the lexicon type + * @param lex_file the path of the lexicon file + * @param length the maximum length of the word item */ FRISO_API void friso_dic_load( - friso_t friso, - friso_config_t config, - friso_lex_t lex, - fstring lex_file, - uint_t length ) + friso_t friso, + friso_config_t config, + friso_lex_t lex, + fstring lex_file, + uint_t length ) { - FILE * _stream; - char __char[1024], _buffer[512]; - fstring _line; - string_split_entry sse; + FILE * _stream; + char __char[1024], _buffer[512]; + fstring _line; + string_split_entry sse; - fstring _word; - char _sbuffer[512]; - fstring _syn; - friso_array_t sywords; - uint_t _fre; + fstring _word; + char _sbuffer[512]; + fstring _syn; + friso_array_t sywords; + uint_t _fre; - if ( ( _stream = fopen( lex_file, "rb" ) ) != NULL ) - { - while ( ( _line = file_get_line( __char, _stream ) ) != NULL ) - { - //clear up the notes - //make sure the length of the line is greater than 1. - //like the single '#' mark in stopwords dictionary. - if ( _line[0] == '#' && strlen(_line) > 1 ) continue; + if ( ( _stream = fopen( lex_file, "rb" ) ) != NULL ) + { + while ( ( _line = file_get_line( __char, _stream ) ) != NULL ) + { + //clear up the notes + //make sure the length of the line is greater than 1. + //like the single '#' mark in stopwords dictionary. + if ( _line[0] == '#' && strlen(_line) > 1 ) continue; - //handle the stopwords. - if ( lex == __LEX_STOPWORDS__ ) - { - //clean the chinese words that its length is greater than max length. - if ( ((int)_line[0]) < 0 && strlen( _line ) > length ) continue; - friso_dic_add( friso->dic, __LEX_STOPWORDS__, - string_copy_heap( _line, strlen(_line) ), NULL ); - continue; - } + //handle the stopwords. + if ( lex == __LEX_STOPWORDS__ ) + { + //clean the chinese words that its length is greater than max length. + if ( ((int)_line[0]) < 0 && strlen( _line ) > length ) continue; + friso_dic_add( friso->dic, __LEX_STOPWORDS__, + string_copy_heap( _line, strlen(_line) ), NULL ); + continue; + } - //split the fstring with '/'. - string_split_reset( &sse, "/", _line); - if ( string_split_next( &sse, _buffer ) == NULL ) continue; + //split the fstring with '/'. + string_split_reset( &sse, "/", _line); + if ( string_split_next( &sse, _buffer ) == NULL ) continue; - //1. get the word. - _word = string_copy_heap( _buffer, strlen(_buffer) ); + //1. get the word. + _word = string_copy_heap( _buffer, strlen(_buffer) ); - if ( string_split_next( &sse, _buffer ) == NULL ) - { - //normal lexicon type, - //add them to the dictionary directly - friso_dic_add( friso->dic, lex, _word, NULL ); - continue; - } + if ( string_split_next( &sse, _buffer ) == NULL ) + { + //normal lexicon type, + //add them to the dictionary directly + friso_dic_add( friso->dic, lex, _word, NULL ); + continue; + } - /* - * filter out the words that its length is larger - * than the specified limit. - * but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__ - * and __LEX_CEM_WORDS__. - */ - if ( ! ( lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__ ) - && strlen( _word ) > length ) - { - FRISO_FREE(_word); - continue; - } + /* + * filter out the words that its length is larger + * than the specified limit. + * but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__ + * and __LEX_CEM_WORDS__. + */ + if ( ! ( lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__ ) + && strlen( _word ) > length ) + { + FRISO_FREE(_word); + continue; + } - //2. get the synonyms words. - _syn = NULL; - if ( strcmp( _buffer, "null" ) != 0 ) - _syn = string_copy( _buffer, _sbuffer, strlen(_buffer) ); + //2. get the synonyms words. + _syn = NULL; + if ( strcmp( _buffer, "null" ) != 0 ) + _syn = string_copy( _buffer, _sbuffer, strlen(_buffer) ); - //3. get the word frequency if it available. - _fre = 0; - if ( string_split_next( &sse, _buffer ) != NULL ) - _fre = atoi( _buffer ); + //3. get the word frequency if it available. + _fre = 0; + if ( string_split_next( &sse, _buffer ) != NULL ) + _fre = atoi( _buffer ); - /** - * Here: - * split the synonyms words with mark "," - * and put them in a array list if the synonyms is not NULL - */ - sywords = NULL; - if ( config->add_syn && _syn != NULL ) - { - string_split_reset( &sse, ",", _sbuffer ); - sywords = new_array_list_with_opacity(5); - while ( string_split_next( &sse, _buffer ) != NULL ) - { - if ( strlen(_buffer) > length ) continue; - array_list_add( sywords, - string_copy_heap(_buffer, strlen(_buffer)) ); - } - sywords = array_list_trim( sywords ); - } + /** + * Here: + * split the synonyms words with mark "," + * and put them in a array list if the synonyms is not NULL + */ + sywords = NULL; + if ( config->add_syn && _syn != NULL ) + { + string_split_reset( &sse, ",", _sbuffer ); + sywords = new_array_list_with_opacity(5); + while ( string_split_next( &sse, _buffer ) != NULL ) + { + if ( strlen(_buffer) > length ) continue; + array_list_add( sywords, + string_copy_heap(_buffer, strlen(_buffer)) ); + } + sywords = array_list_trim( sywords ); + } - //4. add the word item - friso_dic_add_with_fre( - friso->dic, lex, _word, sywords, _fre ); - } + //4. add the word item + friso_dic_add_with_fre( + friso->dic, lex, _word, sywords, _fre ); + } - fclose( _stream ); - } else { - printf("Warning: Fail to open lexicon file %s\n", lex_file); - } + fclose( _stream ); + } else { + printf("Warning: Fail to open lexicon file %s\n", lex_file); + } } /** * get the lexicon type index with the specified - * type keywords . + * type keywords . * - * @see friso.h#friso_lex_t - * @param _key - * @return int + * @see friso.h#friso_lex_t + * @param _key + * @return int */ __STATIC_API__ friso_lex_t get_lexicon_type_with_constant( fstring _key ) { - if ( strcmp( _key, "__LEX_CJK_WORDS__" ) == 0 ) { - return __LEX_CJK_WORDS__; - } - else if ( strcmp( _key, "__LEX_CJK_UNITS__" ) == 0 ) { - return __LEX_CJK_UNITS__; - } - else if ( strcmp( _key, "__LEX_ECM_WORDS__" ) == 0 ) { - return __LEX_ECM_WORDS__; - } - else if ( strcmp( _key, "__LEX_CEM_WORDS__" ) == 0 ) { - return __LEX_CEM_WORDS__; - } - else if ( strcmp( _key, "__LEX_CN_LNAME__" ) == 0 ) { - return __LEX_CN_LNAME__; - } - else if ( strcmp( _key, "__LEX_CN_SNAME__" ) == 0 ) { - return __LEX_CN_SNAME__; - } - else if ( strcmp( _key, "__LEX_CN_DNAME1__" ) == 0 ) { - return __LEX_CN_DNAME1__; - } - else if ( strcmp( _key, "__LEX_CN_DNAME2__" ) == 0 ) { - return __LEX_CN_DNAME2__; - } - else if ( strcmp( _key, "__LEX_CN_LNA__" ) == 0 ) { - return __LEX_CN_LNA__; - } - else if ( strcmp( _key, "__LEX_STOPWORDS__" ) == 0 ) { - return __LEX_STOPWORDS__; - } - else if ( strcmp( _key, "__LEX_ENPUN_WORDS__" ) == 0 ) { - return __LEX_ENPUN_WORDS__; - } - else if ( strcmp( _key, "__LEX_EN_WORDS__" ) == 0 ) { - return __LEX_EN_WORDS__; - } + if ( strcmp( _key, "__LEX_CJK_WORDS__" ) == 0 ) { + return __LEX_CJK_WORDS__; + } + else if ( strcmp( _key, "__LEX_CJK_UNITS__" ) == 0 ) { + return __LEX_CJK_UNITS__; + } + else if ( strcmp( _key, "__LEX_ECM_WORDS__" ) == 0 ) { + return __LEX_ECM_WORDS__; + } + else if ( strcmp( _key, "__LEX_CEM_WORDS__" ) == 0 ) { + return __LEX_CEM_WORDS__; + } + else if ( strcmp( _key, "__LEX_CN_LNAME__" ) == 0 ) { + return __LEX_CN_LNAME__; + } + else if ( strcmp( _key, "__LEX_CN_SNAME__" ) == 0 ) { + return __LEX_CN_SNAME__; + } + else if ( strcmp( _key, "__LEX_CN_DNAME1__" ) == 0 ) { + return __LEX_CN_DNAME1__; + } + else if ( strcmp( _key, "__LEX_CN_DNAME2__" ) == 0 ) { + return __LEX_CN_DNAME2__; + } + else if ( strcmp( _key, "__LEX_CN_LNA__" ) == 0 ) { + return __LEX_CN_LNA__; + } + else if ( strcmp( _key, "__LEX_STOPWORDS__" ) == 0 ) { + return __LEX_STOPWORDS__; + } + else if ( strcmp( _key, "__LEX_ENPUN_WORDS__" ) == 0 ) { + return __LEX_ENPUN_WORDS__; + } + else if ( strcmp( _key, "__LEX_EN_WORDS__" ) == 0 ) { + return __LEX_EN_WORDS__; + } - return -1; + return -1; } /* * load the lexicon configuration file. - * and load all the valid lexicon from the configuration file. + * and load all the valid lexicon from the configuration file. * - * @param friso friso instance - * @param config friso_config instance - * @param _path dictionary directory - * @param _limitts words length limit + * @param friso friso instance + * @param config friso_config instance + * @param _path dictionary directory + * @param _limitts words length limit */ FRISO_API void friso_dic_load_from_ifile( - friso_t friso, - friso_config_t config, - fstring _path, - uint_t _limits ) + friso_t friso, + friso_config_t config, + fstring _path, + uint_t _limits ) { - //1.parse the configuration file. - FILE *__stream; - char __chars__[1024], __key__[30], *__line__; - uint_t __length__, i, t; - friso_lex_t lex_t; - string_buffer_t sb; + //1.parse the configuration file. + FILE *__stream; + char __chars__[1024], __key__[30], *__line__; + uint_t __length__, i, t; + friso_lex_t lex_t; + string_buffer_t sb; - //get the lexicon configruation file path - sb = new_string_buffer(); - string_buffer_append( sb, _path ); - string_buffer_append( sb, __FRISO_LEX_IFILE__ ); - //printf("%s\n", sb->buffer); + //get the lexicon configruation file path + sb = new_string_buffer(); + string_buffer_append( sb, _path ); + string_buffer_append( sb, __FRISO_LEX_IFILE__ ); + //printf("%s\n", sb->buffer); - if ( ( __stream = fopen( sb->buffer, "rb" ) ) != NULL ) - { - while ( ( __line__ = - file_get_line( __chars__, __stream ) ) != NULL ) - { - //comment filter. - if ( __line__[0] == '#' ) continue; - if ( __line__[0] == '\0' ) continue; + if ( ( __stream = fopen( sb->buffer, "rb" ) ) != NULL ) + { + while ( ( __line__ = + file_get_line( __chars__, __stream ) ) != NULL ) + { + //comment filter. + if ( __line__[0] == '#' ) continue; + if ( __line__[0] == '\0' ) continue; - __length__ = strlen( __line__ ); - //item start - if ( __line__[ __length__ - 1 ] == '[' ) - { - //get the type key - for ( i = 0; i < __length__ - && ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ ); - for ( t = 0; i < __length__; i++,t++ ) { - if ( __line__[i] == ' ' - || __line__[i] == '\t' || __line__[i] == ':' ) break; - __key__[t] = __line__[i]; - } - __key__[t] = '\0'; + __length__ = strlen( __line__ ); + //item start + if ( __line__[ __length__ - 1 ] == '[' ) + { + //get the type key + for ( i = 0; i < __length__ + && ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ ); + for ( t = 0; i < __length__; i++,t++ ) { + if ( __line__[i] == ' ' + || __line__[i] == '\t' || __line__[i] == ':' ) break; + __key__[t] = __line__[i]; + } + __key__[t] = '\0'; - //get the lexicon type - lex_t = get_lexicon_type_with_constant(__key__); - if ( lex_t == -1 ) continue; + //get the lexicon type + lex_t = get_lexicon_type_with_constant(__key__); + if ( lex_t == -1 ) continue; - //printf("key=%s, type=%d\n", __key__, lex_t ); - while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) - { - //comments filter. - if ( __line__[0] == '#' ) continue; - if ( __line__[0] == '\0' ) continue; + //printf("key=%s, type=%d\n", __key__, lex_t ); + while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) + { + //comments filter. + if ( __line__[0] == '#' ) continue; + if ( __line__[0] == '\0' ) continue; - __length__ = strlen( __line__ ); - if ( __line__[ __length__ - 1 ] == ']' ) break; + __length__ = strlen( __line__ ); + if ( __line__[ __length__ - 1 ] == ']' ) break; - for ( i = 0; i < __length__ - && ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ ); - for ( t = 0; i < __length__; i++,t++ ) { - if ( __line__[i] == ' ' - || __line__[i] == '\t' || __line__[i] == ';' ) break; - __key__[t] = __line__[i]; - } - __key__[t] = '\0'; + for ( i = 0; i < __length__ + && ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ ); + for ( t = 0; i < __length__; i++,t++ ) { + if ( __line__[i] == ' ' + || __line__[i] == '\t' || __line__[i] == ';' ) break; + __key__[t] = __line__[i]; + } + __key__[t] = '\0'; - //load the lexicon item from the lexicon file. - string_buffer_clear( sb ); - string_buffer_append( sb, _path ); - string_buffer_append( sb, __key__ ); - //printf("key=%s, type=%d\n", __key__, lex_t); - friso_dic_load( friso, config, lex_t, sb->buffer, _limits ); - } + //load the lexicon item from the lexicon file. + string_buffer_clear( sb ); + string_buffer_append( sb, _path ); + string_buffer_append( sb, __key__ ); + //printf("key=%s, type=%d\n", __key__, lex_t); + friso_dic_load( friso, config, lex_t, sb->buffer, _limits ); + } - } + } - } //end while + } //end while - fclose( __stream ); - } else { - printf("Warning: Fail to open the lexicon configuration file %s\n", sb->buffer); - } + fclose( __stream ); + } else { + printf("Warning: Fail to open the lexicon configuration file %s\n", sb->buffer); + } - free_string_buffer(sb); + free_string_buffer(sb); } //match the item. FRISO_API int friso_dic_match( - friso_dic_t dic, - friso_lex_t lex, - fstring word ) + friso_dic_t dic, + friso_lex_t lex, + fstring word ) { - if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) { - return hash_exist_mapping( dic[lex], word ); - } - return 0; + if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) { + return hash_exist_mapping( dic[lex], word ); + } + return 0; } //get the lex_entry_t associated with the word. FRISO_API lex_entry_t friso_dic_get( - friso_dic_t dic, - friso_lex_t lex, - fstring word ) + friso_dic_t dic, + friso_lex_t lex, + fstring word ) { - if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) { - return ( lex_entry_t ) hash_get_value( dic[lex], word ); - } - return NULL; + if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) { + return ( lex_entry_t ) hash_get_value( dic[lex], word ); + } + return NULL; } //get the size of the specified type dictionary. FRISO_API uint_t friso_spec_dic_size( - friso_dic_t dic, - friso_lex_t lex ) + friso_dic_t dic, + friso_lex_t lex ) { - if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) { - return hash_get_size( dic[lex] ); - } - return 0; + if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) { + return hash_get_size( dic[lex] ); + } + return 0; } //get size of the whole dictionary. FRISO_API uint_t friso_all_dic_size( - friso_dic_t dic ) + friso_dic_t dic ) { - register uint_t size = 0, t; + register uint_t size = 0, t; - for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) { - size += hash_get_size( dic[t] ); - } + for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) { + size += hash_get_size( dic[t] ); + } - return size; + return size; } diff --git a/src/friso_link.c b/src/friso_link.c index dd58654..87a6dd2 100644 --- a/src/friso_link.c +++ b/src/friso_link.c @@ -1,29 +1,29 @@ /* * link list implemented functions - * defined in header file "friso_API.h". + * defined in header file "friso_API.h". * when the link_node is being deleted, here we just free - * the allocation of the node, not the allcation of it's value. + * the allocation of the node, not the allcation of it's value. * - * @author chenxin + * @author chenxin */ #include "friso_API.h" #include //create a new link list node. __STATIC_API__ link_node_t new_node_entry( - void * value, - link_node_t prev, - link_node_t next ) + void * value, + link_node_t prev, + link_node_t next ) { link_node_t node = ( link_node_t ) - FRISO_MALLOC( sizeof( link_node_entry ) ); + FRISO_MALLOC( sizeof( link_node_entry ) ); if ( node == NULL ) { - ___ALLOCATION_ERROR___ + ___ALLOCATION_ERROR___ } - node->value = value; - node->prev = prev; - node->next = next; + node->value = value; + node->prev = prev; + node->next = next; return node; } @@ -32,14 +32,14 @@ __STATIC_API__ link_node_t new_node_entry( FRISO_API friso_link_t new_link_list( void ) { friso_link_t e = ( friso_link_t ) - FRISO_MALLOC( sizeof( friso_link_entry ) ); + FRISO_MALLOC( sizeof( friso_link_entry ) ); if ( e == NULL ) { - ___ALLOCATION_ERROR___ + ___ALLOCATION_ERROR___ } //initialize the entry - e->head = new_node_entry( NULL, NULL, NULL ); - e->tail = new_node_entry( NULL, e->head, NULL ); + e->head = new_node_entry( NULL, NULL, NULL ); + e->tail = new_node_entry( NULL, e->head, NULL ); e->head->next = e->tail; e->size = 0; @@ -52,9 +52,9 @@ FRISO_API void free_link_list( friso_link_t link ) link_node_t node, next; for ( node = link->head; node != NULL; ) { - next = node->next; - FRISO_FREE( node ); - node = next; + next = node->next; + FRISO_FREE( node ); + node = next; } FRISO_FREE( link ); @@ -62,16 +62,16 @@ FRISO_API void free_link_list( friso_link_t link ) //clear all nodes in the link list. FRISO_API friso_link_t link_list_clear( - friso_link_t link ) + friso_link_t link ) { link_node_t node, next; //free all the middle nodes. for ( node = link->head->next; - node != link->tail; ) + node != link->tail; ) { - next = node->next; - FRISO_FREE( node ); - node = next; + next = node->next; + FRISO_FREE( node ); + node = next; } link->head->next = link->tail; @@ -97,22 +97,22 @@ FRISO_API friso_link_t link_list_clear( * static */ __STATIC_API__ link_node_t get_node( - friso_link_t link, uint_t idx ) + friso_link_t link, uint_t idx ) { link_node_t p = NULL; register uint_t t; if ( idx >= 0 && idx < link->size ) { - if ( idx < link->size / 2 ) { //find from the head. - p = link->head; - for ( t = 0; t <= idx; t++ ) - p = p->next; - } else { //find from the tail. - p = link->tail; - for ( t = link->size; t > idx; t-- ) - p = p->prev; - } + if ( idx < link->size / 2 ) { //find from the head. + p = link->head; + for ( t = 0; t <= idx; t++ ) + p = p->next; + } else { //find from the tail. + p = link->tail; + for ( t = link->size; t > idx; t-- ) + p = p->prev; + } } return p; @@ -123,9 +123,9 @@ __STATIC_API__ link_node_t get_node( * static */ //__STATIC_API__ void insert_before( -// friso_link_t link, -// link_node_t node, -// void * value ) +// friso_link_t link, +// link_node_t node, +// void * value ) //{ // link_node_t e = new_node_entry( value, node->prev, node ); // e->prev->next = e; @@ -136,10 +136,10 @@ __STATIC_API__ link_node_t get_node( //} #define insert_before( link, node, value ) \ { \ - link_node_t e = new_node_entry( value, node->prev, node ); \ - e->prev->next = e; \ - e->next->prev = e; \ - link->size++; \ + link_node_t e = new_node_entry( value, node->prev, node ); \ + e->prev->next = e; \ + e->next->prev = e; \ + link->size++; \ } /* @@ -150,7 +150,7 @@ __STATIC_API__ link_node_t get_node( * @return the value of the removed node. */ __STATIC_API__ void * remove_node( - friso_link_t link, link_node_t node ) + friso_link_t link, link_node_t node ) { void * _value = node->value; @@ -166,18 +166,18 @@ __STATIC_API__ void * remove_node( //add a new node to the link list.(insert just before the tail) FRISO_API void link_list_add( - friso_link_t link, void * value ) + friso_link_t link, void * value ) { insert_before( link, link->tail, value ); } //add a new node before the given index. FRISO_API void link_list_insert_before( - friso_link_t link, uint_t idx, void * value ) + friso_link_t link, uint_t idx, void * value ) { link_node_t node = get_node( link, idx ); if ( node != NULL ) { - insert_before( link, node, value ); + insert_before( link, node, value ); } } @@ -187,11 +187,11 @@ FRISO_API void link_list_insert_before( * @return the value of the node. */ FRISO_API void * link_list_get( - friso_link_t link, uint_t idx ) + friso_link_t link, uint_t idx ) { link_node_t node = get_node( link, idx ); if ( node != NULL ) { - return node->value; + return node->value; } return NULL; } @@ -199,20 +199,20 @@ FRISO_API void * link_list_get( /* * set the value of the node that located in the specified position. * we did't free the allocation of the old value, we return it to you. - * free it yourself when it is necessary. + * free it yourself when it is necessary. * * @return the old value. */ FRISO_API void *link_list_set( - friso_link_t link, - uint_t idx, void * value ) + friso_link_t link, + uint_t idx, void * value ) { link_node_t node = get_node( link, idx ); void * _value = NULL; if ( node != NULL ) { - _value = node->value; - node->value = value; + _value = node->value; + node->value = value; } return _value; @@ -225,13 +225,13 @@ FRISO_API void *link_list_set( * @return the value of the node removed. */ FRISO_API void *link_list_remove( - friso_link_t link, uint_t idx ) + friso_link_t link, uint_t idx ) { link_node_t node = get_node( link, idx ); if ( node != NULL ) { - //printf("idx=%d, node->value=%s\n", idx, (string) node->value ); - return remove_node( link, node ); + //printf("idx=%d, node->value=%s\n", idx, (string) node->value ); + return remove_node( link, node ); } return NULL; @@ -244,43 +244,43 @@ FRISO_API void *link_list_remove( * @return the value of the node removed. */ FRISO_API void *link_list_remove_node( - friso_link_t link, - link_node_t node ) + friso_link_t link, + link_node_t node ) { return remove_node( link, node ); } //remove the first node after the head FRISO_API void *link_list_remove_first( - friso_link_t link ) + friso_link_t link ) { if ( link->size > 0 ) { - return remove_node( link, link->head->next ); + return remove_node( link, link->head->next ); } return NULL; } //remove the last node just before the tail. FRISO_API void *link_list_remove_last( - friso_link_t link ) + friso_link_t link ) { if ( link->size > 0 ) { - return remove_node( link, link->tail->prev ); + return remove_node( link, link->tail->prev ); } return NULL; } //append a node from the tail. FRISO_API void link_list_add_last( - friso_link_t link, - void *value ) + friso_link_t link, + void *value ) { insert_before( link, link->tail, value ); } //append a note just after the head. FRISO_API void link_list_add_first( - friso_link_t link, void *value ) + friso_link_t link, void *value ) { insert_before( link, link->head->next, value ); } diff --git a/src/friso_string.c b/src/friso_string.c index a9c95af..91e633b 100644 --- a/src/friso_string.c +++ b/src/friso_string.c @@ -1,8 +1,8 @@ /* * utf-8 handle function implements. - * you could modify it or re-release it but never for commercial use. + * you could modify it or re-release it but never for commercial use. * - * @author chenxin + * @author chenxin */ #include "friso_API.h" @@ -11,14 +11,14 @@ #include /* ****************************************** - * fstring buffer functions implements. * + * fstring buffer functions implements. * ********************************************/ /** * create a new buffer * @Note: * 1. it's real length is 1 byte greater than the specifield value * 2. we did not do any optimization for the memory allocation to ... - * avoid the memory defragmentation. + * avoid the memory defragmentation. * * @date: 2014-10-16 */ @@ -26,7 +26,7 @@ __STATIC_API__ fstring create_buffer( uint_t length ) { fstring buffer = ( fstring ) FRISO_MALLOC( length + 1 ); if ( buffer == NULL ) { - ___ALLOCATION_ERROR___ + ___ALLOCATION_ERROR___ } memset( buffer, 0x00, length + 1 ); @@ -36,7 +36,7 @@ __STATIC_API__ fstring create_buffer( uint_t length ) //the __allocs should not be smaller than sb->length __STATIC_API__ string_buffer_t resize_buffer( - string_buffer_t sb, uint_t __allocs ) + string_buffer_t sb, uint_t __allocs ) { //create a new buffer. //if ( __allocs < sb->length ) __allocs = sb->length + 1; @@ -44,7 +44,7 @@ __STATIC_API__ string_buffer_t resize_buffer( //register uint_t t; //for ( t = 0; t < sb->length; t++ ) { - // str[t] = sb->buffer[t]; + // str[t] = sb->buffer[t]; //} memcpy( str, sb->buffer, sb->length ); FRISO_FREE( sb->buffer ); @@ -65,9 +65,9 @@ __STATIC_API__ string_buffer_t resize_buffer( FRISO_API string_buffer_t new_string_buffer_with_opacity( uint_t opacity ) { string_buffer_t sb = ( string_buffer_t ) - FRISO_MALLOC( sizeof( string_buffer_entry ) ); + FRISO_MALLOC( sizeof( string_buffer_entry ) ); if ( sb == NULL ) { - ___ALLOCATION_ERROR___ + ___ALLOCATION_ERROR___ } sb->buffer = create_buffer( opacity ); @@ -82,9 +82,9 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str ) { //buffer allocations. string_buffer_t sb = ( string_buffer_t ) - FRISO_MALLOC( sizeof( string_buffer_entry ) ); + FRISO_MALLOC( sizeof( string_buffer_entry ) ); if ( sb == NULL ) { - ___ALLOCATION_ERROR___ + ___ALLOCATION_ERROR___ } //initialize @@ -95,7 +95,7 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str ) //register uint_t t; //copy the str to the buffer. //for ( t = 0; t < sb->length; t++ ) { - // sb->buffer[t] = str[t]; + // sb->buffer[t] = str[t]; //} memcpy( sb->buffer, str, sb->length ); @@ -103,66 +103,66 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str ) } FRISO_API void string_buffer_append( - string_buffer_t sb, fstring __str ) + string_buffer_t sb, fstring __str ) { register uint_t __len__ = strlen( __str ); //check the necessity to resize the buffer. if ( sb->length + __len__ > sb->allocs ) { - sb = resize_buffer( sb, ( sb->length + __len__ ) * 2 + 1 ); + sb = resize_buffer( sb, ( sb->length + __len__ ) * 2 + 1 ); } //register uint_t t; ////copy the __str to the buffer. //for ( t = 0; t < __len__; t++ ) { - // sb->buffer[ sb->length++ ] = __str[t]; + // sb->buffer[ sb->length++ ] = __str[t]; //} memcpy( sb->buffer + sb->length, __str, __len__ ); sb->length += __len__; } FRISO_API void string_buffer_append_char( - string_buffer_t sb, char ch ) + string_buffer_t sb, char ch ) { //check the necessity to resize the buffer. if ( sb->length + 1 > sb->allocs ) { - sb = resize_buffer( sb, sb->length * 2 + 1 ); + sb = resize_buffer( sb, sb->length * 2 + 1 ); } sb->buffer[sb->length++] = ch; } FRISO_API void string_buffer_insert( - string_buffer_t sb, - uint_t idx, - fstring __str ) + string_buffer_t sb, + uint_t idx, + fstring __str ) { } /* * remove the given bytes from the buffer start from idx. - * this will cause the byte move after the idx+length. + * this will cause the byte move after the idx+length. * * @return the new string. */ FRISO_API fstring string_buffer_remove( - string_buffer_t sb, - uint_t idx, - uint_t length ) + string_buffer_t sb, + uint_t idx, + uint_t length ) { uint_t t; //move the bytes after the idx + length for ( t = idx + length; t < sb->length; t++ ) { - sb->buffer[t - length] = sb->buffer[t]; + sb->buffer[t - length] = sb->buffer[t]; } sb->buffer[t] = '\0'; //memcpy( sb->buffer + idx, - // sb->buffer + idx + length, - // sb->length - idx - length ); + // sb->buffer + idx + length, + // sb->length - idx - length ); t = sb->length - idx; if ( t > 0 ) { - sb->length -= ( t > length ) ? length : t; + sb->length -= ( t > length ) ? length : t; } sb->buffer[sb->length-1] = '\0'; @@ -171,13 +171,13 @@ FRISO_API fstring string_buffer_remove( /* * turn the string_buffer to a string. - * or return the buffer of the string_buffer. + * or return the buffer of the string_buffer. */ FRISO_API string_buffer_t string_buffer_trim( string_buffer_t sb ) { //resize the buffer. if ( sb->length < sb->allocs - 1 ) { - sb = resize_buffer( sb, sb->length + 1 ); + sb = resize_buffer( sb, sb->length + 1 ); } return sb; } @@ -185,8 +185,8 @@ FRISO_API string_buffer_t string_buffer_trim( string_buffer_t sb ) /* * free the given fstring buffer. * and this function will not free the allocations of the - * string_buffer_t->buffer, we return it to you, if there is - * a necessary you could free it youself by calling free(); + * string_buffer_t->buffer, we return it to you, if there is + * a necessary you could free it youself by calling free(); */ FRISO_API fstring string_buffer_devote( string_buffer_t sb ) { @@ -197,7 +197,7 @@ FRISO_API fstring string_buffer_devote( string_buffer_t sb ) /* * clear the given fstring buffer. - * reset its buffer with 0 and reset its length to 0. + * reset its buffer with 0 and reset its length to 0. */ FRISO_API void string_buffer_clear( string_buffer_t sb ) { @@ -216,17 +216,17 @@ FRISO_API void free_string_buffer( string_buffer_t sb ) /** * create a new string_split_entry. * - * @param source - * @return string_split_t; + * @param source + * @return string_split_t; */ FRISO_API string_split_t new_string_split( - fstring delimiter, - fstring source ) + fstring delimiter, + fstring source ) { string_split_t e = ( string_split_t ) - FRISO_MALLOC( sizeof( string_split_entry ) ); + FRISO_MALLOC( sizeof( string_split_entry ) ); if ( e == NULL ) { - ___ALLOCATION_ERROR___; + ___ALLOCATION_ERROR___; } e->delimiter = delimiter; @@ -239,19 +239,19 @@ FRISO_API string_split_t new_string_split( } FRISO_API void string_split_reset( - string_split_t sst, - fstring delimiter, - fstring source ) + string_split_t sst, + fstring delimiter, + fstring source ) { sst->delimiter = delimiter; sst->delLen = strlen(delimiter); sst->source = source; - sst->srcLen = strlen(source); + sst->srcLen = strlen(source); sst->idx = 0; } FRISO_API void string_split_set_source( - string_split_t sst, fstring source ) + string_split_t sst, fstring source ) { sst->source = source; sst->srcLen = strlen(source); @@ -259,7 +259,7 @@ FRISO_API void string_split_set_source( } FRISO_API void string_split_set_delimiter( - string_split_t sst, fstring delimiter ) + string_split_t sst, fstring delimiter ) { sst->delimiter = delimiter; sst->delLen = strlen( delimiter ); @@ -273,15 +273,15 @@ FRISO_API void free_string_split( string_split_t sst ) /** * get the next split fstring, and copy the - * splited fstring into the __dst buffer . + * splited fstring into the __dst buffer . * - * @param string_split_t - * @param __dst - * @return fstring (NULL if reach the end of the source - * or there is no more segmentation) + * @param string_split_t + * @param __dst + * @return fstring (NULL if reach the end of the source + * or there is no more segmentation) */ FRISO_API fstring string_split_next( - string_split_t sst, fstring __dst) + string_split_t sst, fstring __dst) { uint_t i, _ok; fstring _dst = __dst; @@ -291,28 +291,28 @@ FRISO_API fstring string_split_next( while ( 1 ) { - _ok = 1; - for ( i = 0; i < sst->delLen - && (sst->idx + i < sst->srcLen); i++ ) - { - if ( sst->source[sst->idx+i] != sst->delimiter[i] ) - { - _ok = 0; - break; - } - } + _ok = 1; + for ( i = 0; i < sst->delLen + && (sst->idx + i < sst->srcLen); i++ ) + { + if ( sst->source[sst->idx+i] != sst->delimiter[i] ) + { + _ok = 0; + break; + } + } - //find the delimiter here, - //break the loop and self plus the sst->idx, then return the buffer . - if ( _ok == 1 ) { - sst->idx += sst->delLen; - break; - } + //find the delimiter here, + //break the loop and self plus the sst->idx, then return the buffer . + if ( _ok == 1 ) { + sst->idx += sst->delLen; + break; + } - //coy the char to the buffer - *_dst++ = sst->source[sst->idx++]; - //check if reach the end of the fstring - if ( sst->idx >= sst->srcLen ) break; + //coy the char to the buffer + *_dst++ = sst->source[sst->idx++]; + //check if reach the end of the fstring + if ( sst->idx >= sst->srcLen ) break; } *_dst = '\0'; diff --git a/src/tst-array.c b/src/tst-array.c index dfe781e..e42ef12 100644 --- a/src/tst-array.c +++ b/src/tst-array.c @@ -1,8 +1,8 @@ /* * dynamatic array test program. * - * @author chenxin - * @email chenxin619315@gmail.com + * @author chenxin + * @email chenxin619315@gmail.com */ #include "friso_API.h" @@ -10,42 +10,42 @@ #include int main( int argc, char **args ) { - - //create a new array list. - friso_array_t array = new_array_list(); - fstring keys[] = { - "chenmanwen", "yangqinghua", - "chenxin", "luojiangyan", "xiaoyanzi", "bibi", - "zhangrenfang", "yangjian", - "liuxiao", "pankai", - "chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo", - "caizaili", "panpan", "xiaolude", "yintanwen" - }; - int j, idx = 2, len = sizeof( keys ) / sizeof( fstring ); + + //create a new array list. + friso_array_t array = new_array_list(); + fstring keys[] = { + "chenmanwen", "yangqinghua", + "chenxin", "luojiangyan", "xiaoyanzi", "bibi", + "zhangrenfang", "yangjian", + "liuxiao", "pankai", + "chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo", + "caizaili", "panpan", "xiaolude", "yintanwen" + }; + int j, idx = 2, len = sizeof( keys ) / sizeof( fstring ); - for ( j = 0; j < len; j++ ) { - array_list_add( array, keys[j] ); - } + for ( j = 0; j < len; j++ ) { + array_list_add( array, keys[j] ); + } - printf("length=%d, allocations=%d\n", array->length, array->allocs ); - array_list_trim( array ); - printf("after tirm length=%d, allocations=%d\n", array->length, array->allocs ); - printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) ); + printf("length=%d, allocations=%d\n", array->length, array->allocs ); + array_list_trim( array ); + printf("after tirm length=%d, allocations=%d\n", array->length, array->allocs ); + printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) ); - printf("\nAfter set %dth item.\n", idx ); - array_list_set( array, idx, "chenxin__" ); - printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) ); + printf("\nAfter set %dth item.\n", idx ); + array_list_set( array, idx, "chenxin__" ); + printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) ); - printf("\nAfter remove %dth item.\n", idx ); - array_list_remove( array, idx ); - printf("length=%d, allocations=%d\n", array->length, array->allocs ); - printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) ); + printf("\nAfter remove %dth item.\n", idx ); + array_list_remove( array, idx ); + printf("length=%d, allocations=%d\n", array->length, array->allocs ); + printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) ); - printf("\nInsert a item at %dth\n", idx ); - array_list_insert( array, idx, "*chenxin*" ); - printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) ); + printf("\nInsert a item at %dth\n", idx ); + array_list_insert( array, idx, "*chenxin*" ); + printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) ); - free_array_list( array ); + free_array_list( array ); - return 0; + return 0; } diff --git a/src/tst-friso.c b/src/tst-friso.c index 7cce8e4..33cd51d 100644 --- a/src/tst-friso.c +++ b/src/tst-friso.c @@ -1,8 +1,8 @@ /* * Friso test program. - * Of couse you can make it a perfect demo for friso. + * Of couse you can make it a perfect demo for friso. * all threads or proccess share the same friso_t, - * defferent threads/proccess use defferent friso_task_t. + * defferent threads/proccess use defferent friso_task_t. * and you could share the friso_config_t if you wish... * * @author chenxin @@ -17,33 +17,33 @@ #define __LENGTH__ 15 #define __INPUT_LENGTH__ 20480 -#define ___EXIT_INFO___ \ - println("Thanks for trying friso."); \ +#define ___EXIT_INFO___ \ + println("Thanks for trying friso."); \ break; -#define ___ABOUT___ \ - println("+-----------------------------------------------------------+"); \ - println("| friso - a chinese word segmentation writen by c. |"); \ - println("| bug report email - chenxin619315@gmail.com. |"); \ - println("| or: visit http://code.google.com/p/friso. |"); \ - println("| java edition for http://code.google.com/p/jcseg |"); \ - println("| type 'quit' to exit the program. |"); \ +#define ___ABOUT___ \ + println("+-----------------------------------------------------------+"); \ + println("| friso - a chinese word segmentation writen by c. |"); \ + println("| bug report email - chenxin619315@gmail.com. |"); \ + println("| or: visit http://code.google.com/p/friso. |"); \ + println("| java edition for http://code.google.com/p/jcseg |"); \ + println("| type 'quit' to exit the program. |"); \ println("+-----------------------------------------------------------+"); //read a line from a command line. static fstring getLine( FILE *fp, fstring __dst ) { - register int c; - register fstring cs; + register int c; + register fstring cs; - cs = __dst; - while ( ( c = getc( fp ) ) != EOF ) { - if ( c == '\n' ) break; - *cs++ = c; - } - *cs = '\0'; + cs = __dst; + while ( ( c = getc( fp ) ) != EOF ) { + if ( c == '\n' ) break; + *cs++ = c; + } + *cs = '\0'; - return ( c == EOF && cs == __dst ) ? NULL : __dst; + return ( c == EOF && cs == __dst ) ? NULL : __dst; } /*static void printcode( fstring str ) { @@ -59,94 +59,94 @@ static fstring getLine( FILE *fp, fstring __dst ) int main(int argc, char **argv) { - clock_t s_time, e_time; - char line[__INPUT_LENGTH__] = {0}; - int i; - fstring __path__ = NULL, mode = NULL; + clock_t s_time, e_time; + char line[__INPUT_LENGTH__] = {0}; + int i; + fstring __path__ = NULL, mode = NULL; - friso_t friso; - friso_config_t config; - friso_task_t task; + friso_t friso; + friso_config_t config; + friso_task_t task; - //get the lexicon directory - for ( i = 0; i < argc; i++ ) { - if ( strcasecmp( "-init", argv[i] ) == 0 ) { - __path__ = argv[i+1]; - } - } - if ( __path__ == NULL ) { - println("Usage: friso -init lexicon path"); - exit(0); - } + //get the lexicon directory + for ( i = 0; i < argc; i++ ) { + if ( strcasecmp( "-init", argv[i] ) == 0 ) { + __path__ = argv[i+1]; + } + } + if ( __path__ == NULL ) { + println("Usage: friso -init lexicon path"); + exit(0); + } - s_time = clock(); + s_time = clock(); - //initialize - friso = friso_new(); - config = friso_new_config(); - /*friso_dic_t dic = friso_dic_new(); - friso_dic_load_from_ifile( dic, __path__, __LENGTH__ ); - friso_set_dic( friso, dic ); - friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/ - if ( friso_init_from_ifile(friso, config, __path__) != 1 ) { - printf("fail to initialize friso and config."); - goto err; - } + //initialize + friso = friso_new(); + config = friso_new_config(); + /*friso_dic_t dic = friso_dic_new(); + friso_dic_load_from_ifile( dic, __path__, __LENGTH__ ); + friso_set_dic( friso, dic ); + friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/ + if ( friso_init_from_ifile(friso, config, __path__) != 1 ) { + printf("fail to initialize friso and config."); + goto err; + } - switch ( config->mode ) - { - case __FRISO_SIMPLE_MODE__: - mode = "Simple"; - break; - case __FRISO_COMPLEX_MODE__: - mode = "Complex"; - break; - case __FRISO_DETECT_MODE__: - mode = "Detect"; - break; - } + switch ( config->mode ) + { + case __FRISO_SIMPLE_MODE__: + mode = "Simple"; + break; + case __FRISO_COMPLEX_MODE__: + mode = "Complex"; + break; + case __FRISO_DETECT_MODE__: + mode = "Detect"; + break; + } - //friso_set_mode( config, __FRISO_DETECT_MODE__ ); - //printf("clr_stw=%d\n", friso->clr_stw); - //printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" )); - //printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究")); + //friso_set_mode( config, __FRISO_DETECT_MODE__ ); + //printf("clr_stw=%d\n", friso->clr_stw); + //printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" )); + //printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究")); - e_time = clock(); + e_time = clock(); - printf("Initialized in %fsec\n", (double) ( e_time - s_time ) / CLOCKS_PER_SEC ); - printf("Mode: %s\n", mode); - printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK" ); - ___ABOUT___; + printf("Initialized in %fsec\n", (double) ( e_time - s_time ) / CLOCKS_PER_SEC ); + printf("Mode: %s\n", mode); + printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK" ); + ___ABOUT___; - //set the task. - task = friso_new_task(); + //set the task. + task = friso_new_task(); - while ( 1 ) - { - print("friso>> "); - getLine( stdin, line ); - //exit the programe - if ( strcasecmp( line, "quit" ) == 0 ) { - ___EXIT_INFO___ - } + while ( 1 ) + { + print("friso>> "); + getLine( stdin, line ); + //exit the programe + if ( strcasecmp( line, "quit" ) == 0 ) { + ___EXIT_INFO___ + } - //for ( i = 0; i < 1000000; i++ ) { - //set the task text. - friso_set_text( task, line ); - println("分词结果:"); + //for ( i = 0; i < 1000000; i++ ) { + //set the task text. + friso_set_text( task, line ); + println("分词结果:"); - s_time = clock(); - while ( ( config->next_token( friso, config, task ) ) != NULL ) - { - //printf("%s[%d, %d, %d] ", task->token->word, - // task->token->offset, task->token->length, task->token->rlen ); - printf("%s ", task->token->word ); - } - //} - e_time = clock(); - printf("\nDone, cost < %fsec\n", ( (double)(e_time - s_time) ) / CLOCKS_PER_SEC ); + s_time = clock(); + while ( ( config->next_token( friso, config, task ) ) != NULL ) + { + //printf("%s[%d, %d, %d] ", task->token->word, + // task->token->offset, task->token->length, task->token->rlen ); + printf("%s ", task->token->word ); + } + //} + e_time = clock(); + printf("\nDone, cost < %fsec\n", ( (double)(e_time - s_time) ) / CLOCKS_PER_SEC ); - } + } friso_free_task( task ); diff --git a/src/tst-hash.c b/src/tst-hash.c index d824c63..e814977 100644 --- a/src/tst-hash.c +++ b/src/tst-hash.c @@ -1,8 +1,8 @@ /** * File Explain. * - * @author chenxin - * @see http://www.webssky.com + * @author chenxin + * @see http://www.webssky.com */ #include "friso_API.h" @@ -10,28 +10,28 @@ void print_hash_info( friso_hash_t _hash ) { printf("info:length=%d, size=%d, facotr=%f, threshold=%d\n", _hash->length, \ - _hash->size, _hash->factor, _hash->threshold); + _hash->size, _hash->factor, _hash->threshold); } int main(int argc, char **argv) { friso_hash_t _hash = new_hash_table(); char *names[] = { - "陈满文", "阳清华", - "陈鑫", "罗江艳", - "小燕子", "比比", - "张仁芳", "阳建", - "陈配", "李恒", - "张志刚", "张怡少", - "阳江波", "蔡再利", - "阳绘章", "尹唐文", - "谭志鹏", "肖路德", - "潘凯", "刘潇", - "马朝辉", "张强", - "殷美林", "元明清", - "周安", "郭桥安", - "刘敏", "黄广华", - "李胜", "黄海清" + "陈满文", "阳清华", + "陈鑫", "罗江艳", + "小燕子", "比比", + "张仁芳", "阳建", + "陈配", "李恒", + "张志刚", "张怡少", + "阳江波", "蔡再利", + "阳绘章", "尹唐文", + "谭志鹏", "肖路德", + "潘凯", "刘潇", + "马朝辉", "张强", + "殷美林", "元明清", + "周安", "郭桥安", + "刘敏", "黄广华", + "李胜", "黄海清" }; //char *str[] = {"陈鑫", "张仁芳", "比比"}; char **str = names; @@ -39,7 +39,7 @@ int main(int argc, char **argv) print_hash_info( _hash ); for ( j = 0; j < len; j++) { - hash_put_mapping( _hash, names[j], names[j] ); + hash_put_mapping( _hash, names[j], names[j] ); } print_hash_info( _hash ); @@ -49,11 +49,11 @@ int main(int argc, char **argv) //remove mappings for ( j = 0; j < len; j++ ) { - printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] )); - printf("Now, remove %s\n", str[j]); - hash_remove_mapping( _hash, str[j] ); - printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] )); - printf("*********************************\n"); + printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] )); + printf("Now, remove %s\n", str[j]); + hash_remove_mapping( _hash, str[j] ); + printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] )); + printf("*********************************\n"); } printf("Press any key to continue."); diff --git a/src/tst-lex.c b/src/tst-lex.c index f615cca..2c26ef5 100644 --- a/src/tst-lex.c +++ b/src/tst-lex.c @@ -1,8 +1,8 @@ /* * lex functions test program. * - * @author chenxin - * @see http://www.webssky.com + * @author chenxin + * @see http://www.webssky.com */ #include "friso.h" @@ -11,10 +11,10 @@ #include #define __LENGTH__ 15 -#define ___PRINT_HELP_INFO___ \ - printf("1. help print the current menu.\n"); \ -printf("2. #set set the classify of the dictionary.\n"); \ -printf("3. other search the words in the dictionary.\n"); \ +#define ___PRINT_HELP_INFO___ \ + printf("1. help print the current menu.\n"); \ +printf("2. #set set the classify of the dictionary.\n"); \ +printf("3. other search the words in the dictionary.\n"); \ printf("4. quit exit the programe.\n"); int main(int argc, char **argv) @@ -62,30 +62,30 @@ int main(int argc, char **argv) e_time = clock(); printf("Done, cost: %f sec, size=%d\n", ( double ) ( e_time - s_time ) / CLOCKS_PER_SEC, \ - friso_all_dic_size( friso->dic ) ); + friso_all_dic_size( friso->dic ) ); while ( 1 ) { - printf("friso-%d>> ", lex); - scanf("%s", _line); - if ( strcmp( _line, "quit" ) == 0 ) { - break; - } else if ( strcmp( _line, "help" ) == 0 ) { - ___PRINT_HELP_INFO___ - } else if ( strcmp( _line, "#set" ) == 0 ) { - printf("lex_t>> "); - scanf("%d", &lex); - } else { - s_time = clock(); - e = friso_dic_get( friso->dic, lex, _line ); - e_time = clock(); - if ( e != NULL ) { - printf("word=%s, syn=%s, fre=%d, cost:%fsec\n", - e->word, e->syn==NULL? "NULL" : (char *)e->syn->items[0], e->fre, - (double) ( e_time - s_time ) / CLOCKS_PER_SEC ); - } else { - printf("%s was not found.\n", _line); - } - } + printf("friso-%d>> ", lex); + scanf("%s", _line); + if ( strcmp( _line, "quit" ) == 0 ) { + break; + } else if ( strcmp( _line, "help" ) == 0 ) { + ___PRINT_HELP_INFO___ + } else if ( strcmp( _line, "#set" ) == 0 ) { + printf("lex_t>> "); + scanf("%d", &lex); + } else { + s_time = clock(); + e = friso_dic_get( friso->dic, lex, _line ); + e_time = clock(); + if ( e != NULL ) { + printf("word=%s, syn=%s, fre=%d, cost:%fsec\n", + e->word, e->syn==NULL? "NULL" : (char *)e->syn->items[0], e->fre, + (double) ( e_time - s_time ) / CLOCKS_PER_SEC ); + } else { + printf("%s was not found.\n", _line); + } + } } //friso_dic_free( friso->dic ); diff --git a/src/tst-link.c b/src/tst-link.c index 491ea07..8fb16ac 100644 --- a/src/tst-link.c +++ b/src/tst-link.c @@ -1,8 +1,8 @@ /* * link list test programe. * - * @author chenxin - * @email chenxin619315@gmail.com + * @author chenxin + * @email chenxin619315@gmail.com */ #include "friso_API.h" @@ -13,12 +13,12 @@ int main( int argc, char **args ) { friso_link_t link; fstring keys[] = { - "chenmanwen", "yangqinghua", - "chenxin", "luojiangyan", "xiaoyanzi", "bibi", - "zhangrenfang", "yangjian", - "liuxiao", "pankai", - "chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo", - "caizaili", "panpan", "xiaolude", "yintanwen" + "chenmanwen", "yangqinghua", + "chenxin", "luojiangyan", "xiaoyanzi", "bibi", + "zhangrenfang", "yangjian", + "liuxiao", "pankai", + "chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo", + "caizaili", "panpan", "xiaolude", "yintanwen" }; int j, len = sizeof( keys ) / sizeof( fstring ); @@ -28,15 +28,15 @@ int main( int argc, char **args ) { printf("size=%d\n", link->size ); for ( j = 0; j < len; j++ ) { - //link_add( link, keys[j] ); - link_list_add_last( link, keys[j] ); + //link_add( link, keys[j] ); + link_list_add_last( link, keys[j] ); } printf("size=%d\n", link->size ); for ( j = 0; j < len / 2; j++ ) { - //printf("idx=%d, remove %s\n", j, ( fstring ) link_remove( link, 0 ) ); - printf("idx=%d, remove %s\n", j, ( fstring ) link_list_remove_first( link ) ); + //printf("idx=%d, remove %s\n", j, ( fstring ) link_remove( link, 0 ) ); + printf("idx=%d, remove %s\n", j, ( fstring ) link_list_remove_first( link ) ); } printf("size=%d\n", link->size ); diff --git a/src/tst-split.c b/src/tst-split.c index 3e21b35..6815d39 100644 --- a/src/tst-split.c +++ b/src/tst-split.c @@ -11,7 +11,7 @@ int main ( int argc, char **args ) { - fstring source = ",I am a chinese,,my name is chenxin,and i am the author of friso,bug report email chenxin619315@gmail.com,qq:1187582057"; + fstring source = ",I am a chinese,,my name is chenxin,and i am the author of friso,bug report email chenxin619315@gmail.com,qq:1187582057"; char buffer[128]; string_split_t split = new_string_split(",", source ); @@ -20,7 +20,7 @@ int main ( int argc, char **args ) printf("sst->delLen=%d\n", split->delLen); while ( string_split_next(split, buffer) != NULL) { - printf("buffer:%s\n", buffer); + printf("buffer:%s\n", buffer); } free_string_split(split); diff --git a/src/tst-string.c b/src/tst-string.c index 22b1416..373fa71 100644 --- a/src/tst-string.c +++ b/src/tst-string.c @@ -1,7 +1,7 @@ /* * fstring handle mode test program. * - * @author chenxin + * @author chenxin */ #include "friso_API.h" @@ -20,13 +20,13 @@ int main( int argc, char **args ) { for ( t = 0; t < length; t += bytes ) { - bytes = get_utf8_bytes( *(str + t) ); - if ( bytes == 0 ) continue; - for ( j = 0; j < bytes; j++ ) - word[j] = *(str + t + j ); - word[j] = '\0'; - string_buffer_append( sb, word ); - printf("word=%s\n", word ); + bytes = get_utf8_bytes( *(str + t) ); + if ( bytes == 0 ) continue; + for ( j = 0; j < bytes; j++ ) + word[j] = *(str + t + j ); + word[j] = '\0'; + string_buffer_append( sb, word ); + printf("word=%s\n", word ); } printf("length=%d, buffer=%s\n", sb->length, sb->buffer );