mirror of
https://gitee.com/lionsoul/friso.git
synced 2024-11-29 17:57:38 +08:00
code tab to 4 space
This commit is contained in:
parent
e9bf4a2536
commit
a264922721
50
CHANGES.md
50
CHANGES.md
@ -9,9 +9,9 @@ friso-1.6.2:
|
||||
|
||||
3. friso deb | rpm支持:
|
||||
Debian & Ubuntu:
|
||||
sudo apt-get install libfriso0 libfriso-dev
|
||||
sudo apt-get install libfriso0 libfriso-dev
|
||||
CentOS & Fedora:
|
||||
sudo yum install libfriso libfriso-devel
|
||||
sudo yum install libfriso libfriso-devel
|
||||
|
||||
4. 中文词性标注。
|
||||
|
||||
@ -26,41 +26,41 @@ friso-1.6.2:
|
||||
|
||||
friso-1.6.1:
|
||||
|
||||
1. friso.ini中friso.lex_dir增加相对friso.ini的路径支持 -done
|
||||
1. friso.ini中friso.lex_dir增加相对friso.ini的路径支持 -done
|
||||
|
||||
2. 修复两处内存泄漏bug. -done
|
||||
2. 修复两处内存泄漏bug. -done
|
||||
|
||||
3. 改善中英混合词的识别, 可以识别更多情况, 例如:高3 -done
|
||||
3. 改善中英混合词的识别, 可以识别更多情况, 例如:高3 -done
|
||||
|
||||
4. 词库优化, 加入了一些新词条. -done
|
||||
4. 词库优化, 加入了一些新词条. -done
|
||||
|
||||
5. 修复friso_dic_add & array_list_insert的两处代码bug -done
|
||||
5. 修复friso_dic_add & array_list_insert的两处代码bug -done
|
||||
|
||||
6. 增加检测模式切分, 只返回词库中有的词条 -done
|
||||
6. 增加检测模式切分, 只返回词库中有的词条 -done
|
||||
|
||||
7. 集成了php扩展绑定,完美支持PHP分词 -done
|
||||
7. 集成了php扩展绑定,完美支持PHP分词 -done
|
||||
|
||||
|
||||
|
||||
friso-1.6.0:
|
||||
|
||||
1. friso_string.c#utf8_decimal_string初始化bytes = 0,
|
||||
去除WinNT的Run-Time Check Failed. -done
|
||||
去除WinNT的Run-Time Check Failed. -done
|
||||
|
||||
2. 复杂英文和数字组合的二次切分. 例如: QQ2013会被切分成: qq2013, qq, 2013. -done
|
||||
2. 复杂英文和数字组合的二次切分. 例如: QQ2013会被切分成: qq2013, qq, 2013. -done
|
||||
|
||||
3. GBK编码支持. -done
|
||||
3. GBK编码支持. -done
|
||||
|
||||
4. 增加了friso.ini中自定义保留标点, 去除了默认对"^,/,-,'"等标点的保留. -done
|
||||
4. 增加了friso.ini中自定义保留标点, 去除了默认对"^,/,-,'"等标点的保留. -done
|
||||
|
||||
5. 使用掩码操作控制变量来代替了原来的多个控制变量. -done
|
||||
5. 使用掩码操作控制变量来代替了原来的多个控制变量. -done
|
||||
|
||||
6. 切分结果friso_hits_t中增加了对词条类别和词条长度的返回,纠正了offset的误差。 -done
|
||||
6. 切分结果friso_hits_t中增加了对词条类别和词条长度的返回,纠正了offset的误差。 -done
|
||||
|
||||
7. 做了一些优化,例如:同义词的追加(普通/sphinx定义)复杂的判断逻辑,
|
||||
改为了使用掩码状态控制,不仅减少了代码量还提高了执行效率。 -done
|
||||
改为了使用掩码状态控制,不仅减少了代码量还提高了执行效率。 -done
|
||||
|
||||
8. 更多的返回信息,增加了对切分词条的类别,长度,真实长度,词性(待实现)等信息的返回。 -done
|
||||
8. 更多的返回信息,增加了对切分词条的类别,长度,真实长度,词性(待实现)等信息的返回。 -done
|
||||
|
||||
9. 增加了安装中头文件的自动拷贝(usr/include/friso),可以通过include <friso/xx.h>来引用头文件。
|
||||
|
||||
@ -83,18 +83,18 @@ friso-1.4:
|
||||
1. 小数+单位无法识别的情况.更改friso_string#utf8_numeric_string()函数.
|
||||
|
||||
2. 更改中英混合词的识别(目前可以识别中英任何一种组合).
|
||||
英中: 例如: b超,
|
||||
英中英: a美1,
|
||||
英中英中: a哆啦a梦,
|
||||
中英: 卡拉ok,
|
||||
中英中: 哆啦a梦,
|
||||
中英中英: 中文a美a
|
||||
英中: 例如: b超,
|
||||
英中英: a美1,
|
||||
英中英中: a哆啦a梦,
|
||||
中英: 卡拉ok,
|
||||
中英中: 哆啦a梦,
|
||||
中英中英: 中文a美a
|
||||
|
||||
3. 更改了单位组合, 现在可以组合的单位不局限是中文, 例如: ℃,℉
|
||||
|
||||
4. 对于未识别的字符, 给定一个开关选项来决定保留还是过滤.
|
||||
|
||||
5. 英文同义词的追加(增加了lex-en.lex词库)
|
||||
5. 英文同义词的追加(增加了lex-en.lex词库)
|
||||
|
||||
|
||||
friso-1.3:
|
||||
@ -103,7 +103,7 @@ friso-1.3:
|
||||
2. 部分简易函数使用了宏定义来代替, 减少函数的调用.
|
||||
|
||||
3. 保留了英文全半角和中文标点符号的切分.(可以通过过滤停止词来过滤不需要的标点)
|
||||
停止词词库中已经加入了全部的保留的标点, 也就是默认全部过滤了.
|
||||
停止词词库中已经加入了全部的保留的标点, 也就是默认全部过滤了.
|
||||
|
||||
4. 修复friso_string#utf8_en_punctuation()函数一处bug.
|
||||
|
||||
|
@ -6,9 +6,9 @@ Friso是使用c语言开发的一款开源的高性能中文分词器,使用
|
||||
|
||||
2。三种切分模式:
|
||||
|
||||
(1). 简易模式:FMM算法,适合速度要求场合。
|
||||
(2). 复杂模式- MMSEG四种过滤算法,具有较高的歧义去除,分词准确率达到了98.41%。
|
||||
(3). (!New)检测模式:只返回词库中已有的词条,很适合某些应用场合。(1.6.1版本开始)
|
||||
(1). 简易模式:FMM算法,适合速度要求场合。
|
||||
(2). 复杂模式- MMSEG四种过滤算法,具有较高的歧义去除,分词准确率达到了98.41%。
|
||||
(3). (!New)检测模式:只返回词库中已有的词条,很适合某些应用场合。(1.6.1版本开始)
|
||||
|
||||
请参考本算法的原作:http://technology.chtsai.org/mmseg/。
|
||||
|
||||
|
@ -8,6 +8,6 @@
|
||||
// ARG_ENABLE("friso", "enable friso support", "no");
|
||||
|
||||
if (PHP_FRISO != "no") {
|
||||
EXTENSION("friso", "friso.c");
|
||||
EXTENSION("friso", "friso.c");
|
||||
}
|
||||
|
||||
|
@ -20,53 +20,53 @@ echo "friso_version(): " , friso_version(), ", friso_charset(): ", friso_charset
|
||||
echo "分词函数:<br />";
|
||||
if ( friso_charset() == 'UTF-8' )
|
||||
{
|
||||
$_str = "歧义和同义词:研究生命起源,混合词: 做B超检查身体,x射线本质是什么,今天去奇都ktv唱卡拉ok去,哆啦a梦是一个动漫中的主角,单位和全角: 2009年8月6日开始大学之旅,岳阳今天的气温为38.6℃, 也就是101.48℉, 英文数字: bug report chenxin619315@gmail.com or visit http://code.google.com/p/jcseg, we all admire the hacker spirit!特殊数字: ① ⑩ ⑽ ㈩.";
|
||||
echo "<p>friso_split(\"" . $_str . "\"):<p />";
|
||||
|
||||
//API:
|
||||
//rb_split(string, Array, [long])
|
||||
//1.string: 要被切分的字符串。
|
||||
//2.Array: 配置选项,使用NULL来选择默认的配置(friso.ini中的配置)。
|
||||
//3.long: 可选参数,自定义切分返回选项,查看下面的$_rargs
|
||||
|
||||
//1.完整的配置:
|
||||
//array('max_len'=>5, 'r_name'=>0, 'mix_len'=>2, 'lna_len'=>1, 'add_syn'=>1,
|
||||
// 'clr_stw'=>1, 'keep_urec'=>0, 'spx_out'=>0, 'en_sseg'=> 1, 'st_minl'=>2, 'kpuncs'=>'.+#', 'mode'=>FRISO_COMPLEX);
|
||||
//1.在不了解friso内核的情况下, 请不要随便更改nthreshold
|
||||
//2.使用NULL来使用php.ini中指定的friso.ini文件中的配置
|
||||
|
||||
//2.返回选项:
|
||||
//词条: FRISO_RET_WORD, 类别:FRISO_RET_TYPE, 长度:FRISO_RET_LENGTH, 真实长度:FRISO_RET_RLEN, 偏移量:FRISO_RET_OFF
|
||||
//词性:FRISO_RET_POS(待实现)
|
||||
$_rargs = FRISO_RET_TYPE | FRISO_RET_LEN | FRISO_RET_RLEN | FRISO_RET_OFF | FRISO_RET_POS;
|
||||
//$_rargs = 0;
|
||||
|
||||
//3.切分类别:
|
||||
//CJK词条:FRISO_TYP_CJK, 英中混合词(b超):FRISO_TYP_ECM,中英混合词(卡拉ok):FRISO_TYP_CEM,
|
||||
//英文标点混合词(c++):FRISO_TYP_EPUN,标点:FRISO_TYP_PUN,未知类别:FRISO_TYP_UNK,其他类别(同义词):FRISO_TYP_OTR
|
||||
$_result = friso_split($_str, array('mode'=>FRISO_COMPLEX), $_rargs);
|
||||
unset($_str);
|
||||
foreach ( $_result as $_val )
|
||||
{
|
||||
$_str = $_val['word'];
|
||||
if ( $_rargs != 0 ) {
|
||||
$_str .= '[';
|
||||
if ( ($_rargs & FRISO_RET_TYPE) != 0 )
|
||||
$_str .= ', type: '.$_val['type']; //获取词条类别
|
||||
if ( ($_rargs & FRISO_RET_LEN) != 0 )
|
||||
$_str .= ', len: ' . $_val['len']; //词条长度
|
||||
if ( ($_rargs & FRISO_RET_RLEN) != 0 )
|
||||
$_str .= ', rlen: ' . $_val['rlen']; //词条真实长度
|
||||
if ( ($_rargs & FRISO_RET_OFF) != 0 )
|
||||
$_str .= ', off: ' . $_val['off']; //词条偏移量
|
||||
if ( ($_rargs & FRISO_RET_POS) != 0 )
|
||||
$_str .= ', pos: ' . $_val['pos']; //词条词性
|
||||
$_str .= ']';
|
||||
}
|
||||
$_str = "歧义和同义词:研究生命起源,混合词: 做B超检查身体,x射线本质是什么,今天去奇都ktv唱卡拉ok去,哆啦a梦是一个动漫中的主角,单位和全角: 2009年8月6日开始大学之旅,岳阳今天的气温为38.6℃, 也就是101.48℉, 英文数字: bug report chenxin619315@gmail.com or visit http://code.google.com/p/jcseg, we all admire the hacker spirit!特殊数字: ① ⑩ ⑽ ㈩.";
|
||||
echo "<p>friso_split(\"" . $_str . "\"):<p />";
|
||||
|
||||
//API:
|
||||
//rb_split(string, Array, [long])
|
||||
//1.string: 要被切分的字符串。
|
||||
//2.Array: 配置选项,使用NULL来选择默认的配置(friso.ini中的配置)。
|
||||
//3.long: 可选参数,自定义切分返回选项,查看下面的$_rargs
|
||||
|
||||
//1.完整的配置:
|
||||
//array('max_len'=>5, 'r_name'=>0, 'mix_len'=>2, 'lna_len'=>1, 'add_syn'=>1,
|
||||
// 'clr_stw'=>1, 'keep_urec'=>0, 'spx_out'=>0, 'en_sseg'=> 1, 'st_minl'=>2, 'kpuncs'=>'.+#', 'mode'=>FRISO_COMPLEX);
|
||||
//1.在不了解friso内核的情况下, 请不要随便更改nthreshold
|
||||
//2.使用NULL来使用php.ini中指定的friso.ini文件中的配置
|
||||
|
||||
//2.返回选项:
|
||||
//词条: FRISO_RET_WORD, 类别:FRISO_RET_TYPE, 长度:FRISO_RET_LENGTH, 真实长度:FRISO_RET_RLEN, 偏移量:FRISO_RET_OFF
|
||||
//词性:FRISO_RET_POS(待实现)
|
||||
$_rargs = FRISO_RET_TYPE | FRISO_RET_LEN | FRISO_RET_RLEN | FRISO_RET_OFF | FRISO_RET_POS;
|
||||
//$_rargs = 0;
|
||||
|
||||
//3.切分类别:
|
||||
//CJK词条:FRISO_TYP_CJK, 英中混合词(b超):FRISO_TYP_ECM,中英混合词(卡拉ok):FRISO_TYP_CEM,
|
||||
//英文标点混合词(c++):FRISO_TYP_EPUN,标点:FRISO_TYP_PUN,未知类别:FRISO_TYP_UNK,其他类别(同义词):FRISO_TYP_OTR
|
||||
$_result = friso_split($_str, array('mode'=>FRISO_COMPLEX), $_rargs);
|
||||
unset($_str);
|
||||
foreach ( $_result as $_val )
|
||||
{
|
||||
$_str = $_val['word'];
|
||||
if ( $_rargs != 0 ) {
|
||||
$_str .= '[';
|
||||
if ( ($_rargs & FRISO_RET_TYPE) != 0 )
|
||||
$_str .= ', type: '.$_val['type']; //获取词条类别
|
||||
if ( ($_rargs & FRISO_RET_LEN) != 0 )
|
||||
$_str .= ', len: ' . $_val['len']; //词条长度
|
||||
if ( ($_rargs & FRISO_RET_RLEN) != 0 )
|
||||
$_str .= ', rlen: ' . $_val['rlen']; //词条真实长度
|
||||
if ( ($_rargs & FRISO_RET_OFF) != 0 )
|
||||
$_str .= ', off: ' . $_val['off']; //词条偏移量
|
||||
if ( ($_rargs & FRISO_RET_POS) != 0 )
|
||||
$_str .= ', pos: ' . $_val['pos']; //词条词性
|
||||
$_str .= ']';
|
||||
}
|
||||
|
||||
$_str .= '/ ';
|
||||
echo $_str;
|
||||
}
|
||||
$_str .= '/ ';
|
||||
echo $_str;
|
||||
}
|
||||
}
|
||||
else echo "set charset to UTF-8 to test function friso_split.";
|
||||
?>
|
||||
|
@ -4,10 +4,10 @@ ini_set('magic_quotes_gpc', 0);
|
||||
|
||||
//check the charset
|
||||
if ( friso_charset() != "GBK" ) {
|
||||
$_str = "Error: GBK charset required. <br />";
|
||||
$_str .= "1. Modified friso.charset = 1 in your friso.ini .<br />";
|
||||
$_str .= "2. Modified friso.lex_dir = GBK lexicon abusolute path to load your GBK lexicon. <br />";
|
||||
exit($_str);
|
||||
$_str = "Error: GBK charset required. <br />";
|
||||
$_str .= "1. Modified friso.charset = 1 in your friso.ini .<br />";
|
||||
$_str .= "2. Modified friso.lex_dir = GBK lexicon abusolute path to load your GBK lexicon. <br />";
|
||||
exit($_str);
|
||||
}
|
||||
|
||||
$text = '';
|
||||
@ -15,139 +15,139 @@ $_timer = 0;
|
||||
$_act = '';
|
||||
$_cfg = array('mode' => FRISO_COMPLEX);
|
||||
if ( isset($_POST['_act']) && ($_act = $_POST['_act']) == 'split' ) {
|
||||
$text = &$_POST['text'];
|
||||
$_cfg = &$_POST['config'];
|
||||
if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0;
|
||||
if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0;
|
||||
if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0;
|
||||
if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0;
|
||||
if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0;
|
||||
|
||||
$s_time = timer();
|
||||
$_ret = friso_split($text, $_cfg);
|
||||
$_timer = timer() - $s_time;
|
||||
$text = &$_POST['text'];
|
||||
$_cfg = &$_POST['config'];
|
||||
if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0;
|
||||
if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0;
|
||||
if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0;
|
||||
if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0;
|
||||
if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0;
|
||||
|
||||
$s_time = timer();
|
||||
$_ret = friso_split($text, $_cfg);
|
||||
$_timer = timer() - $s_time;
|
||||
}
|
||||
|
||||
function timer() {
|
||||
list($msec, $sec) = explode(' ', microtime());
|
||||
return ((float)$msec + (float)$sec);
|
||||
list($msec, $sec) = explode(' ', microtime());
|
||||
return ((float)$msec + (float)$sec);
|
||||
}
|
||||
?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||
|
||||
<head>
|
||||
<title>GBK - robbe分词测试程序 </title>
|
||||
<meta http-equiv="content-type" content="text/html;charset=GBK" />
|
||||
<style type="text/css">
|
||||
#box {width: 1000px}
|
||||
.input-text {border: 1px solid #CCC;width: 1000px;height: 180px;background-color: #FFF;
|
||||
color: #555;font-size: 14px;}
|
||||
.link-box {overflow: hidden;zoom:1;padding-top:10px;}
|
||||
#submit-link {float:right;width:150px;height: 26px;line-height: 26px;
|
||||
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
|
||||
text-decoration: none;font-size: 14px;}
|
||||
#info-link {float:right;width:300px;height: 26px;line-height: 26px;
|
||||
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
|
||||
text-decoration: none;font-size: 14px;}
|
||||
.link-item {float: left;font-size: 14px;font-weight: bold;
|
||||
height: 26px;line-height: 26px;width: 100px;color: #A50100;}
|
||||
.title-item {height:30px;line-height: 30px;font-size: 14px;font-weight: bold;}
|
||||
|
||||
#cfg-box {margin-bottom: 10px;}
|
||||
#cfg-box div {overflow: hidden;zoom:1;color:#555;font-size:12px;}
|
||||
#cfg-box div label {float: left;width: 160px;height: 26px;line-height:26px;text-align:right;
|
||||
padding-right:10px;font-size:12px;font-weight:bold;color:#555;}
|
||||
.input {border: 1px solid #DDD;height: 18px;line-height: 18px;padding-left: 5px;width: 120px;
|
||||
color:#555; outline: none;}
|
||||
</style>
|
||||
<title>GBK - robbe分词测试程序 </title>
|
||||
<meta http-equiv="content-type" content="text/html;charset=GBK" />
|
||||
<style type="text/css">
|
||||
#box {width: 1000px}
|
||||
.input-text {border: 1px solid #CCC;width: 1000px;height: 180px;background-color: #FFF;
|
||||
color: #555;font-size: 14px;}
|
||||
.link-box {overflow: hidden;zoom:1;padding-top:10px;}
|
||||
#submit-link {float:right;width:150px;height: 26px;line-height: 26px;
|
||||
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
|
||||
text-decoration: none;font-size: 14px;}
|
||||
#info-link {float:right;width:300px;height: 26px;line-height: 26px;
|
||||
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
|
||||
text-decoration: none;font-size: 14px;}
|
||||
.link-item {float: left;font-size: 14px;font-weight: bold;
|
||||
height: 26px;line-height: 26px;width: 100px;color: #A50100;}
|
||||
.title-item {height:30px;line-height: 30px;font-size: 14px;font-weight: bold;}
|
||||
|
||||
#cfg-box {margin-bottom: 10px;}
|
||||
#cfg-box div {overflow: hidden;zoom:1;color:#555;font-size:12px;}
|
||||
#cfg-box div label {float: left;width: 160px;height: 26px;line-height:26px;text-align:right;
|
||||
padding-right:10px;font-size:12px;font-weight:bold;color:#555;}
|
||||
.input {border: 1px solid #DDD;height: 18px;line-height: 18px;padding-left: 5px;width: 120px;
|
||||
color:#555; outline: none;}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="box">
|
||||
<form name="robbe" method="post" action="gbk.demo.php">
|
||||
<div class="title-item">分词配置:</div>
|
||||
<div id="cfg-box">
|
||||
<div>
|
||||
<label>最大词长: </label>
|
||||
<input type="text" name="config[max_len]" value="<?=isset($_cfg['max_len'])?$_cfg['max_len']:5?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>混合词中文词长: </label>
|
||||
<input type="text" name="config[mix_len]" value="<?=isset($_cfg['mix_len'])?$_cfg['mix_len']:2?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>英文二次切分: </label>
|
||||
<input type="checkbox" name="config[en_sseg]" <?=isset($_cfg['en_sseg'])&&$_cfg['en_sseg']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>二次切分子Token最小长度: </label>
|
||||
<input type="text" name="config[st_minl]" value="<?=isset($_cfg['st_minl'])?$_cfg['st_minl']:2?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>英文Token中保留的标点: </label>
|
||||
<input type="text" name="config[kpuncs]" value="<?=isset($_cfg['kpuncs'])?$_cfg['kpuncs']:'@%.#&+'?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>同义词追加: </label>
|
||||
<input type="checkbox" name="config[add_syn]" <?=isset($_cfg['add_syn'])&&$_cfg['add_syn']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>过滤停止词: </label>
|
||||
<input type="checkbox" name="config[clr_stw]" <?=isset($_cfg['clr_stw'])&&$_cfg['clr_stw']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>保留未识别词: </label>
|
||||
<input type="checkbox" name="config[keep_urec]" <?=isset($_cfg['keep_urec'])&&$_cfg['keep_urec']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>sphinx定制输出: </label>
|
||||
<input type="checkbox" name="config[spx_out]" <?=isset($_cfg['spx_out'])&&$_cfg['spx_out']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>分词模式: </label>
|
||||
<input type="radio" name="config[mode]" value="<?=RB_SMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==1?'checked="checked"':''?> />简易模式
|
||||
<input type="radio" name="config[mode]" value="<?=RB_CMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==2?'checked="checked"':''?> />复杂模式
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="title-item">分词内容:</div>
|
||||
<div class="r-item"><textarea name="text" class="input-text" id="text"><?=$text?></textarea></div>
|
||||
<input type="hidden" name="_act" value="split"/>
|
||||
<a href="javascript:;" onclick="do_submit();return false;" id="submit-link">robbe分词</a>
|
||||
</form>
|
||||
<div id="box">
|
||||
<form name="robbe" method="post" action="gbk.demo.php">
|
||||
<div class="title-item">分词配置:</div>
|
||||
<div id="cfg-box">
|
||||
<div>
|
||||
<label>最大词长: </label>
|
||||
<input type="text" name="config[max_len]" value="<?=isset($_cfg['max_len'])?$_cfg['max_len']:5?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>混合词中文词长: </label>
|
||||
<input type="text" name="config[mix_len]" value="<?=isset($_cfg['mix_len'])?$_cfg['mix_len']:2?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>英文二次切分: </label>
|
||||
<input type="checkbox" name="config[en_sseg]" <?=isset($_cfg['en_sseg'])&&$_cfg['en_sseg']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>二次切分子Token最小长度: </label>
|
||||
<input type="text" name="config[st_minl]" value="<?=isset($_cfg['st_minl'])?$_cfg['st_minl']:2?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>英文Token中保留的标点: </label>
|
||||
<input type="text" name="config[kpuncs]" value="<?=isset($_cfg['kpuncs'])?$_cfg['kpuncs']:'@%.#&+'?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>同义词追加: </label>
|
||||
<input type="checkbox" name="config[add_syn]" <?=isset($_cfg['add_syn'])&&$_cfg['add_syn']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>过滤停止词: </label>
|
||||
<input type="checkbox" name="config[clr_stw]" <?=isset($_cfg['clr_stw'])&&$_cfg['clr_stw']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>保留未识别词: </label>
|
||||
<input type="checkbox" name="config[keep_urec]" <?=isset($_cfg['keep_urec'])&&$_cfg['keep_urec']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>sphinx定制输出: </label>
|
||||
<input type="checkbox" name="config[spx_out]" <?=isset($_cfg['spx_out'])&&$_cfg['spx_out']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>分词模式: </label>
|
||||
<input type="radio" name="config[mode]" value="<?=RB_SMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==1?'checked="checked"':''?> />简易模式
|
||||
<input type="radio" name="config[mode]" value="<?=RB_CMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==2?'checked="checked"':''?> />复杂模式
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="title-item">分词内容:</div>
|
||||
<div class="r-item"><textarea name="text" class="input-text" id="text"><?=$text?></textarea></div>
|
||||
<input type="hidden" name="_act" value="split"/>
|
||||
<a href="javascript:;" onclick="do_submit();return false;" id="submit-link">robbe分词</a>
|
||||
</form>
|
||||
|
||||
<?php
|
||||
if ( $_act == 'split' ) {
|
||||
?>
|
||||
<div class="title-item">分词结果:</div>
|
||||
<div><textarea class="input-text"><?php foreach ( $_ret as $_val ) echo $_val['word'].' ';?>
|
||||
</textarea></div>
|
||||
<div class="link-box"><a id="info-link">
|
||||
<?php
|
||||
$len = strlen($text);
|
||||
if ( $len >= 1048576 ) {
|
||||
echo substr(($len/1048576), 0, 6).'MB';
|
||||
} else if ( $len >= 1024 ) {
|
||||
echo substr( ($len / 1024), 0, 6).'KB';
|
||||
} else {
|
||||
echo $len.'B';
|
||||
}
|
||||
?>
|
||||
<?php printf("%.5f", $_timer)?>sec
|
||||
</a></div>
|
||||
<?php
|
||||
}
|
||||
?>
|
||||
</div>
|
||||
<?php
|
||||
if ( $_act == 'split' ) {
|
||||
?>
|
||||
<div class="title-item">分词结果:</div>
|
||||
<div><textarea class="input-text"><?php foreach ( $_ret as $_val ) echo $_val['word'].' ';?>
|
||||
</textarea></div>
|
||||
<div class="link-box"><a id="info-link">
|
||||
<?php
|
||||
$len = strlen($text);
|
||||
if ( $len >= 1048576 ) {
|
||||
echo substr(($len/1048576), 0, 6).'MB';
|
||||
} else if ( $len >= 1024 ) {
|
||||
echo substr( ($len / 1024), 0, 6).'KB';
|
||||
} else {
|
||||
echo $len.'B';
|
||||
}
|
||||
?>
|
||||
<?php printf("%.5f", $_timer)?>sec
|
||||
</a></div>
|
||||
<?php
|
||||
}
|
||||
?>
|
||||
</div>
|
||||
|
||||
<script type="text/javascript">
|
||||
String.prototype.trim = function() {return this.replace(/^\s+|\s+$/g, '');}
|
||||
function do_submit() {
|
||||
var text = document.getElementById('text');
|
||||
if ( text.value.trim() == '' ) return;
|
||||
document.robbe.submit();
|
||||
var text = document.getElementById('text');
|
||||
if ( text.value.trim() == '' ) return;
|
||||
document.robbe.submit();
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
|
@ -4,10 +4,10 @@ ini_set('magic_quotes_gpc', 0);
|
||||
|
||||
//charset check.
|
||||
if ( friso_charset() != "UTF-8" ) {
|
||||
$_str = "Error: UTF-8 charset required. <br />";
|
||||
$_str .= "1. Modified friso.charset = 0 in your friso.ini .<br />";
|
||||
$_str .= "2. Modified friso.lex_dir = UTF-8 lexicon abusolute path to load your UTF-8 lexicon. <br />";
|
||||
exit($_str);
|
||||
$_str = "Error: UTF-8 charset required. <br />";
|
||||
$_str .= "1. Modified friso.charset = 0 in your friso.ini .<br />";
|
||||
$_str .= "2. Modified friso.lex_dir = UTF-8 lexicon abusolute path to load your UTF-8 lexicon. <br />";
|
||||
exit($_str);
|
||||
}
|
||||
|
||||
$text = '';
|
||||
@ -15,139 +15,139 @@ $_timer = 0;
|
||||
$_act = '';
|
||||
$_cfg = array('mode' => FRISO_COMPLEX);
|
||||
if ( isset($_POST['_act']) && ($_act = $_POST['_act']) == 'split' ) {
|
||||
$text = &$_POST['text'];
|
||||
$_cfg = &$_POST['config'];
|
||||
if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0;
|
||||
if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0;
|
||||
if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0;
|
||||
if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0;
|
||||
if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0;
|
||||
|
||||
$s_time = timer();
|
||||
$_ret = friso_split($text, $_cfg);
|
||||
$_timer = timer() - $s_time;
|
||||
$text = &$_POST['text'];
|
||||
$_cfg = &$_POST['config'];
|
||||
if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0;
|
||||
if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0;
|
||||
if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0;
|
||||
if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0;
|
||||
if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0;
|
||||
|
||||
$s_time = timer();
|
||||
$_ret = friso_split($text, $_cfg);
|
||||
$_timer = timer() - $s_time;
|
||||
}
|
||||
|
||||
function timer() {
|
||||
list($msec, $sec) = explode(' ', microtime());
|
||||
return ((float)$msec + (float)$sec);
|
||||
list($msec, $sec) = explode(' ', microtime());
|
||||
return ((float)$msec + (float)$sec);
|
||||
}
|
||||
?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||
|
||||
<head>
|
||||
<title>UTF8 - robbe分词测试程序</title>
|
||||
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
|
||||
<style type="text/css">
|
||||
#box {width: 1000px}
|
||||
.input-text {border: 1px solid #CCC;width: 1000px;height: 180px;background-color: #FFF;
|
||||
color: #555;font-size: 14px;}
|
||||
.link-box {overflow: hidden;zoom:1;padding-top:10px;}
|
||||
#submit-link {float:right;width:150px;height: 26px;line-height: 26px;
|
||||
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
|
||||
text-decoration: none;font-size: 14px;}
|
||||
#info-link {float:right;width:300px;height: 26px;line-height: 26px;
|
||||
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
|
||||
text-decoration: none;font-size: 14px;}
|
||||
.link-item {float: left;font-size: 14px;font-weight: bold;
|
||||
height: 26px;line-height: 26px;width: 100px;color: #A50100;}
|
||||
.title-item {height:30px;line-height: 30px;font-size: 14px;font-weight: bold;}
|
||||
|
||||
#cfg-box {margin-bottom: 10px;}
|
||||
#cfg-box div {overflow: hidden;zoom:1;color:#555;font-size:12px;}
|
||||
#cfg-box div label {float: left;width: 160px;height: 26px;line-height:26px;text-align:right;
|
||||
padding-right:10px;font-size:12px;font-weight:bold;color:#555;}
|
||||
.input {border: 1px solid #DDD;height: 18px;line-height: 18px;padding-left: 5px;width: 120px;
|
||||
color:#555; outline: none;}
|
||||
</style>
|
||||
<title>UTF8 - robbe分词测试程序</title>
|
||||
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
|
||||
<style type="text/css">
|
||||
#box {width: 1000px}
|
||||
.input-text {border: 1px solid #CCC;width: 1000px;height: 180px;background-color: #FFF;
|
||||
color: #555;font-size: 14px;}
|
||||
.link-box {overflow: hidden;zoom:1;padding-top:10px;}
|
||||
#submit-link {float:right;width:150px;height: 26px;line-height: 26px;
|
||||
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
|
||||
text-decoration: none;font-size: 14px;}
|
||||
#info-link {float:right;width:300px;height: 26px;line-height: 26px;
|
||||
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
|
||||
text-decoration: none;font-size: 14px;}
|
||||
.link-item {float: left;font-size: 14px;font-weight: bold;
|
||||
height: 26px;line-height: 26px;width: 100px;color: #A50100;}
|
||||
.title-item {height:30px;line-height: 30px;font-size: 14px;font-weight: bold;}
|
||||
|
||||
#cfg-box {margin-bottom: 10px;}
|
||||
#cfg-box div {overflow: hidden;zoom:1;color:#555;font-size:12px;}
|
||||
#cfg-box div label {float: left;width: 160px;height: 26px;line-height:26px;text-align:right;
|
||||
padding-right:10px;font-size:12px;font-weight:bold;color:#555;}
|
||||
.input {border: 1px solid #DDD;height: 18px;line-height: 18px;padding-left: 5px;width: 120px;
|
||||
color:#555; outline: none;}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="box">
|
||||
<form name="robbe" method="post" action="utf8.demo.php">
|
||||
<div class="title-item">分词配置:</div>
|
||||
<div id="cfg-box">
|
||||
<div>
|
||||
<label>最大词长: </label>
|
||||
<input type="text" name="config[max_len]" value="<?=isset($_cfg['max_len'])?$_cfg['max_len']:5?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>混合词中文词长: </label>
|
||||
<input type="text" name="config[mix_len]" value="<?=isset($_cfg['mix_len'])?$_cfg['mix_len']:2?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>英文二次切分: </label>
|
||||
<input type="checkbox" name="config[en_sseg]" <?=isset($_cfg['en_sseg'])&&$_cfg['en_sseg']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>二次切分子Token最小长度: </label>
|
||||
<input type="text" name="config[st_minl]" value="<?=isset($_cfg['st_minl'])?$_cfg['st_minl']:2?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>英文Token中保留的标点: </label>
|
||||
<input type="text" name="config[kpuncs]" value="<?=isset($_cfg['kpuncs'])?$_cfg['kpuncs']:'@%.#&+'?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>同义词追加: </label>
|
||||
<input type="checkbox" name="config[add_syn]" <?=isset($_cfg['add_syn'])&&$_cfg['add_syn']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>过滤停止词: </label>
|
||||
<input type="checkbox" name="config[clr_stw]" <?=isset($_cfg['clr_stw'])&&$_cfg['clr_stw']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>保留未识别词: </label>
|
||||
<input type="checkbox" name="config[keep_urec]" <?=isset($_cfg['keep_urec'])&&$_cfg['keep_urec']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>sphinx定制输出: </label>
|
||||
<input type="checkbox" name="config[spx_out]" <?=isset($_cfg['spx_out'])&&$_cfg['spx_out']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>分词模式: </label>
|
||||
<input type="radio" name="config[mode]" value="<?=RB_SMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==1?'checked="checked"':''?> />简易模式
|
||||
<input type="radio" name="config[mode]" value="<?=RB_CMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==2?'checked="checked"':''?> />复杂模式
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="title-item">分词内容:</div>
|
||||
<div class="r-item"><textarea name="text" class="input-text" id="text"><?=$text?></textarea></div>
|
||||
<input type="hidden" name="_act" value="split"/>
|
||||
<a href="javascript:;" onclick="do_submit();return false;" id="submit-link">robbe分词</a>
|
||||
</form>
|
||||
<div id="box">
|
||||
<form name="robbe" method="post" action="utf8.demo.php">
|
||||
<div class="title-item">分词配置:</div>
|
||||
<div id="cfg-box">
|
||||
<div>
|
||||
<label>最大词长: </label>
|
||||
<input type="text" name="config[max_len]" value="<?=isset($_cfg['max_len'])?$_cfg['max_len']:5?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>混合词中文词长: </label>
|
||||
<input type="text" name="config[mix_len]" value="<?=isset($_cfg['mix_len'])?$_cfg['mix_len']:2?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>英文二次切分: </label>
|
||||
<input type="checkbox" name="config[en_sseg]" <?=isset($_cfg['en_sseg'])&&$_cfg['en_sseg']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>二次切分子Token最小长度: </label>
|
||||
<input type="text" name="config[st_minl]" value="<?=isset($_cfg['st_minl'])?$_cfg['st_minl']:2?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>英文Token中保留的标点: </label>
|
||||
<input type="text" name="config[kpuncs]" value="<?=isset($_cfg['kpuncs'])?$_cfg['kpuncs']:'@%.#&+'?>" class="input" />
|
||||
</div>
|
||||
<div>
|
||||
<label>同义词追加: </label>
|
||||
<input type="checkbox" name="config[add_syn]" <?=isset($_cfg['add_syn'])&&$_cfg['add_syn']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>过滤停止词: </label>
|
||||
<input type="checkbox" name="config[clr_stw]" <?=isset($_cfg['clr_stw'])&&$_cfg['clr_stw']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>保留未识别词: </label>
|
||||
<input type="checkbox" name="config[keep_urec]" <?=isset($_cfg['keep_urec'])&&$_cfg['keep_urec']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>sphinx定制输出: </label>
|
||||
<input type="checkbox" name="config[spx_out]" <?=isset($_cfg['spx_out'])&&$_cfg['spx_out']==1?'checked="checked"':''?> value="1" />
|
||||
</div>
|
||||
<div>
|
||||
<label>分词模式: </label>
|
||||
<input type="radio" name="config[mode]" value="<?=RB_SMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==1?'checked="checked"':''?> />简易模式
|
||||
<input type="radio" name="config[mode]" value="<?=RB_CMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==2?'checked="checked"':''?> />复杂模式
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="title-item">分词内容:</div>
|
||||
<div class="r-item"><textarea name="text" class="input-text" id="text"><?=$text?></textarea></div>
|
||||
<input type="hidden" name="_act" value="split"/>
|
||||
<a href="javascript:;" onclick="do_submit();return false;" id="submit-link">robbe分词</a>
|
||||
</form>
|
||||
|
||||
<?php
|
||||
if ( $_act == 'split' ) {
|
||||
?>
|
||||
<div class="title-item">分词结果:</div>
|
||||
<div><textarea class="input-text"><?php foreach ( $_ret as $_val ) echo $_val['word'].' ';?>
|
||||
</textarea></div>
|
||||
<div class="link-box"><a id="info-link">
|
||||
<?php
|
||||
$len = strlen($text);
|
||||
if ( $len >= 1048576 ) {
|
||||
echo substr(($len/1048576), 0, 6).'MB';
|
||||
} else if ( $len >= 1024 ) {
|
||||
echo substr( ($len / 1024), 0, 6).'KB';
|
||||
} else {
|
||||
echo $len.'B';
|
||||
}
|
||||
?>
|
||||
<?php printf("%.5f", $_timer)?>sec
|
||||
</a></div>
|
||||
<?php
|
||||
}
|
||||
?>
|
||||
</div>
|
||||
<?php
|
||||
if ( $_act == 'split' ) {
|
||||
?>
|
||||
<div class="title-item">分词结果:</div>
|
||||
<div><textarea class="input-text"><?php foreach ( $_ret as $_val ) echo $_val['word'].' ';?>
|
||||
</textarea></div>
|
||||
<div class="link-box"><a id="info-link">
|
||||
<?php
|
||||
$len = strlen($text);
|
||||
if ( $len >= 1048576 ) {
|
||||
echo substr(($len/1048576), 0, 6).'MB';
|
||||
} else if ( $len >= 1024 ) {
|
||||
echo substr( ($len / 1024), 0, 6).'KB';
|
||||
} else {
|
||||
echo $len.'B';
|
||||
}
|
||||
?>
|
||||
<?php printf("%.5f", $_timer)?>sec
|
||||
</a></div>
|
||||
<?php
|
||||
}
|
||||
?>
|
||||
</div>
|
||||
|
||||
<script type="text/javascript">
|
||||
String.prototype.trim = function() {return this.replace(/^\s+|\s+$/g, '');}
|
||||
/**
 * Submit the "robbe" form unless the text area is empty or holds only
 * whitespace. The body is executed exactly once (no repeated submit).
 */
function do_submit() {
    var text = document.getElementById('text');
    // Nothing to segment: leave the form untouched.
    if ( text.value.trim() == '' ) return;
    document.robbe.submit();
}
|
||||
</script>
|
||||
</body>
|
||||
|
@ -9,9 +9,9 @@
|
||||
#include "php_friso.h"
|
||||
|
||||
/* Platform-specific default path of the friso.ini configuration file. */
#ifdef FRISO_WINNT
#   define friso_default_conf_file "c:/windows/friso.ini"
#else
#   define friso_default_conf_file "/etc/friso/friso.ini"
#endif
|
||||
|
||||
/* If you declare any globals in php_friso.h uncomment this:
|
||||
@ -27,15 +27,15 @@ static int le_friso = 1;
|
||||
* Every user visible function must have an entry in friso_functions[].
|
||||
*/
|
||||
const zend_function_entry friso_functions[] = {
|
||||
PHP_FE(friso_split, NULL)
|
||||
PHP_FE(friso_version, NULL)
|
||||
PHP_FE(friso_charset, NULL)
|
||||
PHP_FE(friso_dic_exist, NULL)
|
||||
PHP_FE(friso_dic_get, NULL)
|
||||
PHP_FE(friso_utf8_bytes, NULL)
|
||||
PHP_FE(friso_utf8_ucode, NULL)
|
||||
PHP_FE(friso_ucode_utf8, NULL)
|
||||
{NULL, NULL, NULL} /* Must be the last line in friso_functions[] */
|
||||
PHP_FE(friso_split, NULL)
|
||||
PHP_FE(friso_version, NULL)
|
||||
PHP_FE(friso_charset, NULL)
|
||||
PHP_FE(friso_dic_exist, NULL)
|
||||
PHP_FE(friso_dic_get, NULL)
|
||||
PHP_FE(friso_utf8_bytes, NULL)
|
||||
PHP_FE(friso_utf8_ucode, NULL)
|
||||
PHP_FE(friso_ucode_utf8, NULL)
|
||||
{NULL, NULL, NULL} /* Must be the last line in friso_functions[] */
|
||||
};
|
||||
/* }}} */
|
||||
|
||||
@ -43,19 +43,19 @@ const zend_function_entry friso_functions[] = {
|
||||
*/
|
||||
zend_module_entry friso_module_entry = {
|
||||
#if ZEND_MODULE_API_NO >= 20010901
|
||||
STANDARD_MODULE_HEADER,
|
||||
STANDARD_MODULE_HEADER,
|
||||
#endif
|
||||
"friso",
|
||||
friso_functions,
|
||||
PHP_MINIT(friso),
|
||||
PHP_MSHUTDOWN(friso),
|
||||
PHP_RINIT(friso), /* Replace with NULL if there's nothing to do at request start */
|
||||
PHP_RSHUTDOWN(friso), /* Replace with NULL if there's nothing to do at request end */
|
||||
PHP_MINFO(friso),
|
||||
"friso",
|
||||
friso_functions,
|
||||
PHP_MINIT(friso),
|
||||
PHP_MSHUTDOWN(friso),
|
||||
PHP_RINIT(friso), /* Replace with NULL if there's nothing to do at request start */
|
||||
PHP_RSHUTDOWN(friso), /* Replace with NULL if there's nothing to do at request end */
|
||||
PHP_MINFO(friso),
|
||||
#if ZEND_MODULE_API_NO >= 20010901
|
||||
"0.1", /* Replace with version number for your extension */
|
||||
"0.1", /* Replace with version number for your extension */
|
||||
#endif
|
||||
STANDARD_MODULE_PROPERTIES
|
||||
STANDARD_MODULE_PROPERTIES
|
||||
};
|
||||
/* }}} */
|
||||
|
||||
@ -73,72 +73,72 @@ PHP_INI_END()
|
||||
/* {{{ php_robbe_globals_construct */
|
||||
/*
 * Create the module-wide friso instance and configuration exactly once and
 * initialize both from the file named by the "friso.ini_file" INI setting.
 *
 * NOTE(review): the result of friso_init_from_ifile() is not checked here;
 * confirm whether a failed load should abort module start-up.
 */
static void php_friso_globals_construct(zend_friso_globals *friso_globals)
{
    friso_globals->friso  = friso_new();
    friso_globals->config = friso_new_config();
    friso_init_from_ifile(friso_globals->friso,
            friso_globals->config, INI_STR("friso.ini_file"));
}
|
||||
/* }}} */
|
||||
|
||||
/* {{{ php_robbe_globals_destruct*/
|
||||
/*
 * Release the module-wide configuration and friso instance.
 *
 * friso_free() also frees the dictionary, so the dictionary must not be
 * released separately here. Each object is freed once and its pointer is
 * cleared afterwards.
 */
static void php_friso_globals_destruct(zend_friso_globals *friso_globals)
{
    friso_free_config( friso_globals->config );
    friso_globals->config = NULL;

    friso_free( friso_globals->friso );
    friso_globals->friso = NULL;
}
|
||||
/* }}} */
|
||||
|
||||
/*
 * Bit flags selecting which fields friso_split() adds to every result item.
 * They are combined with bitwise OR.
 */
#define FRISO_RET_WORD  (1 << 0)
#define FRISO_RET_TYPE  (1 << 1)
#define FRISO_RET_OFF   (1 << 2)
#define FRISO_RET_LEN   (1 << 3)
#define FRISO_RET_RLEN  (1 << 4)
#define FRISO_RET_POS   (1 << 5)
|
||||
|
||||
/* {{{ PHP_MINIT_FUNCTION
|
||||
*/
|
||||
PHP_MINIT_FUNCTION(friso)
|
||||
{
|
||||
/*
|
||||
* register some contants that robbe may use
|
||||
* at its following work.
|
||||
* the constant is case sensitive and persitent.
|
||||
*/
|
||||
REGISTER_LONG_CONSTANT("FRISO_SIMPLE", __FRISO_SIMPLE_MODE__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_COMPLEX", __FRISO_COMPLEX_MODE__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_DETECT", __FRISO_DETECT_MODE__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_LEX_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_LEX_STOP", __LEX_STOPWORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
/*
|
||||
* register some contants that robbe may use
|
||||
* at its following work.
|
||||
* the constant is case sensitive and persitent.
|
||||
*/
|
||||
REGISTER_LONG_CONSTANT("FRISO_SIMPLE", __FRISO_SIMPLE_MODE__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_COMPLEX", __FRISO_COMPLEX_MODE__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_DETECT", __FRISO_DETECT_MODE__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_LEX_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_LEX_STOP", __LEX_STOPWORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
|
||||
//return parts for rb_split.
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_WORD", FRISO_RET_WORD, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_TYPE", FRISO_RET_TYPE, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_OFF", FRISO_RET_OFF, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_LEN", FRISO_RET_LEN, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_RLEN", FRISO_RET_RLEN, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_POS", FRISO_RET_POS, CONST_CS | CONST_PERSISTENT);
|
||||
//return parts for rb_split.
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_WORD", FRISO_RET_WORD, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_TYPE", FRISO_RET_TYPE, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_OFF", FRISO_RET_OFF, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_LEN", FRISO_RET_LEN, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_RLEN", FRISO_RET_RLEN, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_RET_POS", FRISO_RET_POS, CONST_CS | CONST_PERSISTENT);
|
||||
|
||||
//lex type constants.
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_ECM", __LEX_ECM_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_CEM", __LEX_CEM_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_EPUN", __LEX_ENPUN_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_PUN", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_UNK", __LEX_UNKNOW_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_OTR", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
//lex type constants.
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_ECM", __LEX_ECM_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_CEM", __LEX_CEM_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_EPUN", __LEX_ENPUN_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_PUN", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_UNK", __LEX_UNKNOW_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
REGISTER_LONG_CONSTANT("FRISO_TYP_OTR", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT);
|
||||
|
||||
REGISTER_INI_ENTRIES();
|
||||
/*initialize the globals variables.*/
|
||||
php_friso_globals_construct( &friso_globals );
|
||||
REGISTER_INI_ENTRIES();
|
||||
/*initialize the globals variables.*/
|
||||
php_friso_globals_construct( &friso_globals );
|
||||
|
||||
return SUCCESS;
|
||||
return SUCCESS;
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -146,11 +146,11 @@ PHP_MINIT_FUNCTION(friso)
|
||||
*/
|
||||
PHP_MSHUTDOWN_FUNCTION(friso)
|
||||
{
|
||||
UNREGISTER_INI_ENTRIES();
|
||||
/*destruct the globals variables*/
|
||||
php_friso_globals_destruct( &friso_globals );
|
||||
|
||||
return SUCCESS;
|
||||
UNREGISTER_INI_ENTRIES();
|
||||
/*destruct the globals variables*/
|
||||
php_friso_globals_destruct( &friso_globals );
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -159,7 +159,7 @@ PHP_MSHUTDOWN_FUNCTION(friso)
|
||||
*/
|
||||
PHP_RINIT_FUNCTION(friso)
|
||||
{
|
||||
return SUCCESS;
|
||||
return SUCCESS;
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -168,22 +168,22 @@ PHP_RINIT_FUNCTION(friso)
|
||||
*/
|
||||
PHP_RSHUTDOWN_FUNCTION(friso)
|
||||
{
|
||||
return SUCCESS;
|
||||
return SUCCESS;
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
/* {{{ PHP_MINFO_FUNCTION
|
||||
*/
|
||||
PHP_MINFO_FUNCTION(friso)
|
||||
{
|
||||
php_info_print_table_start();
|
||||
php_info_print_table_row(2, "Friso Support", "enabled");
|
||||
php_info_print_table_row(2, "Version", FRISO_VERSION);
|
||||
php_info_print_table_row(2, "Bug Report", "chenxin619315@gmail.com");
|
||||
php_info_print_table_row(2, "Home page", "http://code.google.com/p/friso");
|
||||
php_info_print_table_end();
|
||||
{
|
||||
php_info_print_table_start();
|
||||
php_info_print_table_row(2, "Friso Support", "enabled");
|
||||
php_info_print_table_row(2, "Version", FRISO_VERSION);
|
||||
php_info_print_table_row(2, "Bug Report", "chenxin619315@gmail.com");
|
||||
php_info_print_table_row(2, "Home page", "http://code.google.com/p/friso");
|
||||
php_info_print_table_end();
|
||||
|
||||
DISPLAY_INI_ENTRIES();
|
||||
DISPLAY_INI_ENTRIES();
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -192,130 +192,130 @@ PHP_MINFO_FUNCTION(friso)
|
||||
Return a array contains all the split result with a specified mode */
|
||||
PHP_FUNCTION(friso_split)
|
||||
{
|
||||
char *_str = NULL, *_key;
|
||||
int slen, idx, klen, rargs = 0;
|
||||
int arg_count;
|
||||
char *_str = NULL, *_key;
|
||||
int slen, idx, klen, rargs = 0;
|
||||
int arg_count;
|
||||
|
||||
zval *ret, *cfg, **data;
|
||||
//used for multiple item return.
|
||||
zval *item;
|
||||
zval *ret, *cfg, **data;
|
||||
//used for multiple item return.
|
||||
zval *item;
|
||||
|
||||
HashTable *cfgArr;
|
||||
HashPosition pointer;
|
||||
HashTable *cfgArr;
|
||||
HashPosition pointer;
|
||||
|
||||
friso_task_t task;
|
||||
friso_config_t config = NULL, nconfig = NULL;
|
||||
friso_task_t task;
|
||||
friso_config_t config = NULL, nconfig = NULL;
|
||||
|
||||
//get the arugments from the php layer.
|
||||
arg_count = ZEND_NUM_ARGS();
|
||||
switch ( arg_count )
|
||||
{
|
||||
case 2:
|
||||
if ( zend_parse_parameters(arg_count TSRMLS_CC, "sz",
|
||||
&_str, &slen, &cfg) == FAILURE ) return;
|
||||
break;
|
||||
case 3:
|
||||
if (zend_parse_parameters( arg_count TSRMLS_CC, "szl",
|
||||
&_str, &slen, &cfg, &rargs) == FAILURE ) return;
|
||||
break;
|
||||
default:
|
||||
WRONG_PARAM_COUNT;
|
||||
}
|
||||
//get the arugments from the php layer.
|
||||
arg_count = ZEND_NUM_ARGS();
|
||||
switch ( arg_count )
|
||||
{
|
||||
case 2:
|
||||
if ( zend_parse_parameters(arg_count TSRMLS_CC, "sz",
|
||||
&_str, &slen, &cfg) == FAILURE ) return;
|
||||
break;
|
||||
case 3:
|
||||
if (zend_parse_parameters( arg_count TSRMLS_CC, "szl",
|
||||
&_str, &slen, &cfg, &rargs) == FAILURE ) return;
|
||||
break;
|
||||
default:
|
||||
WRONG_PARAM_COUNT;
|
||||
}
|
||||
|
||||
//make sure the RB_RET_WORD will be returned.
|
||||
//rargs |= FRISO_RET_WORD;
|
||||
//make sure the RB_RET_WORD will be returned.
|
||||
//rargs |= FRISO_RET_WORD;
|
||||
|
||||
//check and initialize the friso.
|
||||
if ( Z_TYPE_P(cfg) != IS_NULL )
|
||||
{
|
||||
nconfig = friso_new_config();
|
||||
memcpy(nconfig, friso_globals.config, sizeof(friso_config_entry));
|
||||
//check and initialize the friso.
|
||||
if ( Z_TYPE_P(cfg) != IS_NULL )
|
||||
{
|
||||
nconfig = friso_new_config();
|
||||
memcpy(nconfig, friso_globals.config, sizeof(friso_config_entry));
|
||||
|
||||
//check the new setting.
|
||||
cfgArr = Z_ARRVAL_P(cfg);
|
||||
//zend_printf("array length: %d", zend_hash_num_elements(cfgArr));
|
||||
for ( zend_hash_internal_pointer_reset_ex(cfgArr, &pointer);
|
||||
zend_hash_get_current_data_ex(cfgArr, (void **)&data, &pointer) == SUCCESS;
|
||||
zend_hash_move_forward_ex(cfgArr, &pointer) )
|
||||
{
|
||||
zend_hash_get_current_key_ex(cfgArr, &_key, &klen, NULL, 0, &pointer);
|
||||
//zend_printf("key: %s, value: %d<br />", _key, (*data)->value.lval);
|
||||
|
||||
if ( strcmp(_key, "kpuncs") == 0 )
|
||||
{
|
||||
memcpy(nconfig->kpuncs, (*data)->value.str.val, (*data)->value.str.len);
|
||||
nconfig->kpuncs[(*data)->value.str.len] = '\0';
|
||||
}
|
||||
else
|
||||
{
|
||||
//convert the data to long.
|
||||
convert_to_long_ex(data);
|
||||
if ( strcmp(_key, "max_len") == 0 )
|
||||
nconfig->max_len = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "r_name") == 0 )
|
||||
nconfig->r_name = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "mix_len") == 0 )
|
||||
nconfig->mix_len = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "lna_len") == 0 )
|
||||
nconfig->lna_len = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "add_syn") == 0 )
|
||||
nconfig->add_syn = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "clr_stw") == 0 )
|
||||
nconfig->clr_stw = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "add_syn") == 0 )
|
||||
nconfig->add_syn = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "keep_urec") == 0 )
|
||||
nconfig->keep_urec = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "spx_out") == 0 )
|
||||
nconfig->spx_out = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "nthreshold") == 0 )
|
||||
nconfig->nthreshold = (uint_t) (*data)->value.lval;
|
||||
else if ( strcmp(_key, "mode") == 0 )
|
||||
friso_set_mode(nconfig, (friso_mode_t)((*data)->value.lval));
|
||||
else if ( strcmp(_key, "en_sseg") == 0 )
|
||||
nconfig->en_sseg = (ushort_t) (*data)->value.lval;
|
||||
else if ( strcmp(_key, "st_minl") == 0 )
|
||||
nconfig->st_minl = (ushort_t) (*data)->value.lval;
|
||||
}
|
||||
}
|
||||
}
|
||||
//check the new setting.
|
||||
cfgArr = Z_ARRVAL_P(cfg);
|
||||
//zend_printf("array length: %d", zend_hash_num_elements(cfgArr));
|
||||
for ( zend_hash_internal_pointer_reset_ex(cfgArr, &pointer);
|
||||
zend_hash_get_current_data_ex(cfgArr, (void **)&data, &pointer) == SUCCESS;
|
||||
zend_hash_move_forward_ex(cfgArr, &pointer) )
|
||||
{
|
||||
zend_hash_get_current_key_ex(cfgArr, &_key, &klen, NULL, 0, &pointer);
|
||||
//zend_printf("key: %s, value: %d<br />", _key, (*data)->value.lval);
|
||||
|
||||
if ( strcmp(_key, "kpuncs") == 0 )
|
||||
{
|
||||
memcpy(nconfig->kpuncs, (*data)->value.str.val, (*data)->value.str.len);
|
||||
nconfig->kpuncs[(*data)->value.str.len] = '\0';
|
||||
}
|
||||
else
|
||||
{
|
||||
//convert the data to long.
|
||||
convert_to_long_ex(data);
|
||||
if ( strcmp(_key, "max_len") == 0 )
|
||||
nconfig->max_len = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "r_name") == 0 )
|
||||
nconfig->r_name = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "mix_len") == 0 )
|
||||
nconfig->mix_len = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "lna_len") == 0 )
|
||||
nconfig->lna_len = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "add_syn") == 0 )
|
||||
nconfig->add_syn = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "clr_stw") == 0 )
|
||||
nconfig->clr_stw = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "add_syn") == 0 )
|
||||
nconfig->add_syn = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "keep_urec") == 0 )
|
||||
nconfig->keep_urec = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "spx_out") == 0 )
|
||||
nconfig->spx_out = (ushort_t)(*data)->value.lval;
|
||||
else if ( strcmp(_key, "nthreshold") == 0 )
|
||||
nconfig->nthreshold = (uint_t) (*data)->value.lval;
|
||||
else if ( strcmp(_key, "mode") == 0 )
|
||||
friso_set_mode(nconfig, (friso_mode_t)((*data)->value.lval));
|
||||
else if ( strcmp(_key, "en_sseg") == 0 )
|
||||
nconfig->en_sseg = (ushort_t) (*data)->value.lval;
|
||||
else if ( strcmp(_key, "st_minl") == 0 )
|
||||
nconfig->st_minl = (ushort_t) (*data)->value.lval;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//initialize the array.
|
||||
MAKE_STD_ZVAL( ret );
|
||||
array_init( ret );
|
||||
config = ( nconfig == NULL ) ? friso_globals.config : nconfig;
|
||||
//initialize the array.
|
||||
MAKE_STD_ZVAL( ret );
|
||||
array_init( ret );
|
||||
config = ( nconfig == NULL ) ? friso_globals.config : nconfig;
|
||||
|
||||
//create a new friso task.
|
||||
task = friso_new_task();
|
||||
idx = 0;
|
||||
friso_set_text(task, _str);
|
||||
while ( config->next_token( friso_globals.friso, config, task ) != NULL )
|
||||
{
|
||||
MAKE_STD_ZVAL(item);
|
||||
array_init(item);
|
||||
add_assoc_string(item, "word", task->token->word, 1);
|
||||
//check the append of type
|
||||
if ( (rargs & FRISO_RET_TYPE) != 0 )
|
||||
add_assoc_long(item, "type", task->token->type);
|
||||
if ( (rargs & FRISO_RET_LEN) != 0 )
|
||||
add_assoc_long(item, "len", task->token->length);
|
||||
if ( (rargs & FRISO_RET_RLEN) != 0 )
|
||||
add_assoc_long(item, "rlen", task->token->rlen);
|
||||
if ( (rargs & FRISO_RET_OFF) != 0 )
|
||||
add_assoc_long(item, "off", task->token->offset);
|
||||
if ( (rargs & FRISO_RET_POS) != 0 )
|
||||
add_assoc_stringl(item, "pos", &task->token->pos, 1, 1);
|
||||
|
||||
//append the sub result.
|
||||
add_index_zval( ret, idx++, item );
|
||||
}
|
||||
//create a new friso task.
|
||||
task = friso_new_task();
|
||||
idx = 0;
|
||||
friso_set_text(task, _str);
|
||||
while ( config->next_token( friso_globals.friso, config, task ) != NULL )
|
||||
{
|
||||
MAKE_STD_ZVAL(item);
|
||||
array_init(item);
|
||||
add_assoc_string(item, "word", task->token->word, 1);
|
||||
//check the append of type
|
||||
if ( (rargs & FRISO_RET_TYPE) != 0 )
|
||||
add_assoc_long(item, "type", task->token->type);
|
||||
if ( (rargs & FRISO_RET_LEN) != 0 )
|
||||
add_assoc_long(item, "len", task->token->length);
|
||||
if ( (rargs & FRISO_RET_RLEN) != 0 )
|
||||
add_assoc_long(item, "rlen", task->token->rlen);
|
||||
if ( (rargs & FRISO_RET_OFF) != 0 )
|
||||
add_assoc_long(item, "off", task->token->offset);
|
||||
if ( (rargs & FRISO_RET_POS) != 0 )
|
||||
add_assoc_stringl(item, "pos", &task->token->pos, 1, 1);
|
||||
|
||||
//append the sub result.
|
||||
add_index_zval( ret, idx++, item );
|
||||
}
|
||||
|
||||
//free the friso task.
|
||||
friso_free_task(task);
|
||||
if ( nconfig != NULL ) friso_free_config(nconfig);
|
||||
//free the friso task.
|
||||
friso_free_task(task);
|
||||
if ( nconfig != NULL ) friso_free_config(nconfig);
|
||||
|
||||
//RETURN_ZVAL( ret, 0, 0);
|
||||
*( return_value ) = *( ret );
|
||||
//RETURN_ZVAL( ret, 0, 0);
|
||||
*( return_value ) = *( ret );
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -323,7 +323,7 @@ PHP_FUNCTION(friso_split)
|
||||
Return the current version of Friso. */
|
||||
PHP_FUNCTION(friso_version)
|
||||
{
|
||||
RETURN_STRINGL(FRISO_VERSION, strlen(FRISO_VERSION), 1);
|
||||
RETURN_STRINGL(FRISO_VERSION, strlen(FRISO_VERSION), 1);
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -331,8 +331,8 @@ PHP_FUNCTION(friso_version)
|
||||
Return the current charset of friso. */
|
||||
PHP_FUNCTION(friso_charset)
|
||||
{
|
||||
char *charset = friso_globals.friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK";
|
||||
RETURN_STRINGL(charset, strlen(charset), 1);
|
||||
char *charset = friso_globals.friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK";
|
||||
RETURN_STRINGL(charset, strlen(charset), 1);
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -340,23 +340,23 @@ PHP_FUNCTION(friso_charset)
|
||||
Return a bool to confirm that the given str is a word in a specified dictionary. */
|
||||
PHP_FUNCTION(friso_dic_exist)
|
||||
{
|
||||
char *word = NULL;
|
||||
int wlen;
|
||||
long type;
|
||||
char *word = NULL;
|
||||
int wlen;
|
||||
long type;
|
||||
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
|
||||
if ( friso_globals.friso->dic == NULL )
|
||||
RETURN_BOOL(0);
|
||||
if ( friso_globals.friso->dic == NULL )
|
||||
RETURN_BOOL(0);
|
||||
|
||||
if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ )
|
||||
type = __LEX_CJK_WORDS__;
|
||||
if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ )
|
||||
type = __LEX_CJK_WORDS__;
|
||||
|
||||
wlen = friso_dic_match( friso_globals.friso->dic, type, word );
|
||||
wlen = friso_dic_match( friso_globals.friso->dic, type, word );
|
||||
|
||||
RETURN_BOOL(wlen);
|
||||
RETURN_BOOL(wlen);
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -364,38 +364,38 @@ PHP_FUNCTION(friso_dic_exist)
|
||||
Return a array contains all the information of the given word.*/
|
||||
PHP_FUNCTION(friso_dic_get)
|
||||
{
|
||||
char *word = NULL;
|
||||
int wlen;
|
||||
long type;
|
||||
zval *entry;
|
||||
lex_entry_t e;
|
||||
char *word = NULL;
|
||||
int wlen;
|
||||
long type;
|
||||
zval *entry;
|
||||
lex_entry_t e;
|
||||
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
|
||||
//check the dictionary
|
||||
if ( friso_globals.friso->dic == NULL )
|
||||
RETURN_BOOL(0);
|
||||
//check the dictionary
|
||||
if ( friso_globals.friso->dic == NULL )
|
||||
RETURN_BOOL(0);
|
||||
|
||||
MAKE_STD_ZVAL( entry );
|
||||
array_init( entry );
|
||||
MAKE_STD_ZVAL( entry );
|
||||
array_init( entry );
|
||||
|
||||
if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ )
|
||||
{
|
||||
type = __LEX_CJK_WORDS__;
|
||||
}
|
||||
if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ )
|
||||
{
|
||||
type = __LEX_CJK_WORDS__;
|
||||
}
|
||||
|
||||
e = friso_dic_get( friso_globals.friso->dic, type, word );
|
||||
if ( e != NULL )
|
||||
{
|
||||
add_assoc_long( entry, "length", e->length);
|
||||
add_assoc_long( entry, "freq", e->fre );
|
||||
*( return_value ) = * ( entry );
|
||||
return;
|
||||
}
|
||||
e = friso_dic_get( friso_globals.friso->dic, type, word );
|
||||
if ( e != NULL )
|
||||
{
|
||||
add_assoc_long( entry, "length", e->length);
|
||||
add_assoc_long( entry, "freq", e->fre );
|
||||
*( return_value ) = * ( entry );
|
||||
return;
|
||||
}
|
||||
|
||||
RETURN_BOOL(0);
|
||||
RETURN_BOOL(0);
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -403,17 +403,17 @@ PHP_FUNCTION(friso_dic_get)
|
||||
Return the bytes that the utf-8 char takes.*/
|
||||
PHP_FUNCTION(friso_utf8_bytes)
|
||||
{
|
||||
char *word = NULL;
|
||||
int wlen, _bytes;
|
||||
char *word = NULL;
|
||||
int wlen, _bytes;
|
||||
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
|
||||
if ( word == NULL ) RETURN_LONG(0);
|
||||
_bytes = get_utf8_bytes( word[0] );
|
||||
if ( word == NULL ) RETURN_LONG(0);
|
||||
_bytes = get_utf8_bytes( word[0] );
|
||||
|
||||
RETURN_LONG(_bytes);
|
||||
RETURN_LONG(_bytes);
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -421,16 +421,16 @@ PHP_FUNCTION(friso_utf8_bytes)
|
||||
Return the unicode of the given utf-8 char.*/
|
||||
PHP_FUNCTION(friso_utf8_ucode)
|
||||
{
|
||||
char *word = NULL;
|
||||
int wlen, _ucode;
|
||||
char *word = NULL;
|
||||
int wlen, _ucode;
|
||||
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
|
||||
_ucode = get_utf8_unicode( word );
|
||||
_ucode = get_utf8_unicode( word );
|
||||
|
||||
RETURN_LONG(_ucode);
|
||||
RETURN_LONG(_ucode);
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
@ -438,18 +438,18 @@ PHP_FUNCTION(friso_utf8_ucode)
|
||||
Return char that the a unicode pointed to.*/
|
||||
PHP_FUNCTION(friso_ucode_utf8)
|
||||
{
|
||||
unsigned long *ucode = NULL;
|
||||
int _bytes;
|
||||
char word[7];
|
||||
unsigned long *ucode = NULL;
|
||||
int _bytes;
|
||||
char word[7];
|
||||
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &ucode ) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &ucode ) == FAILURE) {
|
||||
return;
|
||||
}
|
||||
|
||||
_bytes = unicode_to_utf8( ( size_t ) ucode, word );
|
||||
word[_bytes] = '\0';
|
||||
_bytes = unicode_to_utf8( ( size_t ) ucode, word );
|
||||
word[_bytes] = '\0';
|
||||
|
||||
RETURN_STRINGL( word, _bytes, 1 );
|
||||
RETURN_STRINGL( word, _bytes, 1 );
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
$br = (php_sapi_name() == "cli")? "":"<br>";
|
||||
|
||||
if(!extension_loaded('friso')) {
|
||||
dl('friso.' . PHP_SHLIB_SUFFIX);
|
||||
dl('friso.' . PHP_SHLIB_SUFFIX);
|
||||
}
|
||||
$module = 'friso';
|
||||
$functions = get_extension_funcs($module);
|
||||
@ -13,9 +13,9 @@ foreach($functions as $func) {
|
||||
echo "$br\n";
|
||||
$function = 'confirm_' . $module . '_compiled';
|
||||
if (extension_loaded($module)) {
|
||||
$str = $function($module);
|
||||
$str = $function($module);
|
||||
} else {
|
||||
$str = "Module $module is not compiled into PHP";
|
||||
$str = "Module $module is not compiled into PHP";
|
||||
}
|
||||
echo "$str\n";
|
||||
?>
|
||||
|
@ -6,11 +6,11 @@ extern zend_module_entry friso_module_entry;
|
||||
#define phpext_friso_ptr &friso_module_entry
|
||||
|
||||
/* Export annotation for the extension's public symbols, chosen per compiler. */
#ifdef PHP_WIN32
#   define PHP_FRISO_API __declspec(dllexport)
#elif defined(__GNUC__) && __GNUC__ >= 4
#   define PHP_FRISO_API __attribute__ ((visibility("default")))
#else
#   define PHP_FRISO_API
#endif
|
||||
|
||||
#ifdef ZTS
|
||||
@ -36,12 +36,12 @@ PHP_FUNCTION(friso_utf8_ucode);
|
||||
PHP_FUNCTION(friso_ucode_utf8);
|
||||
|
||||
/*
|
||||
Declare any global variables you may need between the BEGIN
|
||||
and END macros here:
|
||||
Declare any global variables you may need between the BEGIN
|
||||
and END macros here:
|
||||
|
||||
ZEND_BEGIN_MODULE_GLOBALS(friso)
|
||||
long global_value;
|
||||
char *global_string;
|
||||
long global_value;
|
||||
char *global_string;
|
||||
ZEND_END_MODULE_GLOBALS(friso)
|
||||
*/
|
||||
|
||||
@ -66,5 +66,5 @@ typedef struct {
|
||||
#define FRISO_G(v) (friso_globals.v)
|
||||
#endif
|
||||
|
||||
#endif /* PHP_FRISO_H */
|
||||
#endif /* PHP_FRISO_H */
|
||||
|
||||
|
@ -6,14 +6,14 @@ Check for friso presence
|
||||
<?php
|
||||
echo "friso extension is available";
|
||||
/*
|
||||
you can add regression tests for your extension here
|
||||
you can add regression tests for your extension here
|
||||
|
||||
the output of your test code has to be equal to the
|
||||
text in the --EXPECT-- section below for the tests
|
||||
to pass, differences between the output and the
|
||||
expected text are interpreted as failure
|
||||
|
||||
see php5/README.TESTING for further information on
|
||||
see php5/README.TESTING for further information on
|
||||
writing regression tests
|
||||
*/
|
||||
?>
|
||||
|
2872
src/friso.c
2872
src/friso.c
File diff suppressed because it is too large
Load Diff
148
src/friso.h
148
src/friso.h
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* main interface file for friso - free soul.
|
||||
* you could modify it and re-release it but never for commercial use.
|
||||
* you could modify it and re-release it but never for commercial use.
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#ifndef _friso_h
|
||||
#define _friso_h
|
||||
@ -15,11 +15,11 @@
|
||||
#define friso_version() FRISO_VERSION
|
||||
|
||||
|
||||
/* Default values of the segmentation settings. */
#define DEFAULT_SEGMENT_LENGTH  5
#define DEFAULT_MIX_LENGTH      2
#define DEFAULT_LNA_LENGTH      1
#define DEFAULT_NTHRESHOLD      1000000
#define DEFAULT_SEGMENT_MODE    2
|
||||
|
||||
/*
|
||||
* Type: friso_lex_t
|
||||
@ -29,8 +29,8 @@
|
||||
typedef enum {
|
||||
__LEX_CJK_WORDS__ = 0,
|
||||
__LEX_CJK_UNITS__ = 1,
|
||||
__LEX_ECM_WORDS__ = 2, //english and chinese mixed words.
|
||||
__LEX_CEM_WORDS__ = 3, //chinese and english mixed words.
|
||||
__LEX_ECM_WORDS__ = 2, //english and chinese mixed words.
|
||||
__LEX_CEM_WORDS__ = 3, //chinese and english mixed words.
|
||||
__LEX_CN_LNAME__ = 4,
|
||||
__LEX_CN_SNAME__ = 5,
|
||||
__LEX_CN_DNAME1__ = 6,
|
||||
@ -41,8 +41,8 @@ typedef enum {
|
||||
__LEX_EN_WORDS__ = 11,
|
||||
__LEX_OTHER_WORDS__ = 15,
|
||||
__LEX_NCSYN_WORDS__ = 16,
|
||||
__LEX_PUNC_WORDS__ = 17, //punctuations
|
||||
__LEX_UNKNOW_WORDS__ = 18 //unrecognized words.
|
||||
__LEX_PUNC_WORDS__ = 17, //punctuations
|
||||
__LEX_UNKNOW_WORDS__ = 18 //unrecognized words.
|
||||
} friso_lex_t;
|
||||
|
||||
typedef friso_hash_t * friso_dic_t;
|
||||
@ -51,8 +51,8 @@ typedef friso_hash_t * friso_dic_t;
|
||||
|
||||
//charset that Friso now support.
|
||||
typedef enum {
    FRISO_UTF8  = 0,    //UTF-8
    FRISO_GBK   = 1     //GBK
} friso_charset_t;
|
||||
|
||||
/*
|
||||
@ -61,15 +61,15 @@ typedef enum {
|
||||
* use to identidy the mode that the friso use.
|
||||
*/
|
||||
typedef enum {
    __FRISO_SIMPLE_MODE__   = 1,
    __FRISO_COMPLEX_MODE__  = 2,
    __FRISO_DETECT_MODE__   = 3
} friso_mode_t;
|
||||
|
||||
/* friso entry.*/
|
||||
typedef struct {
|
||||
friso_dic_t dic; //friso dictionary
|
||||
friso_charset_t charset; //project charset.
|
||||
friso_dic_t dic; //friso dictionary
|
||||
friso_charset_t charset; //project charset.
|
||||
} friso_entry;
|
||||
typedef friso_entry * friso_t;
|
||||
|
||||
@ -80,26 +80,26 @@ typedef friso_entry * friso_t;
|
||||
* -------------------
|
||||
* This type used to represent the lexicon entry struct.
|
||||
*/
|
||||
//control mask bit: append the synonyms words.
#define _LEX_APPENSYN_MASK      (1 << 0)
//set, clear and test that bit in the ctrlMask field that e points to.
//e and the whole expansion are parenthesized, so any pointer expression
// can be passed and the result can be used inside a larger expression.
#define lex_appensyn_open(e)    ((e)->ctrlMask |= _LEX_APPENSYN_MASK)
#define lex_appensyn_close(e)   ((e)->ctrlMask &= ~_LEX_APPENSYN_MASK)
#define lex_appensyn_check(e)   (((e)->ctrlMask & _LEX_APPENSYN_MASK) != 0)
|
||||
typedef struct {
|
||||
/*
|
||||
* the type of the lexicon item.
|
||||
* available value is all the elements in friso_lex_t enum.
|
||||
* and if it is __LEX_OTHER_WORDS__, we need to free it after use it.
|
||||
* and if it is __LEX_OTHER_WORDS__, we need to free it after use it.
|
||||
*/
|
||||
uchar_t length; //the length of the token.(after the convertor of Friso.)
|
||||
uchar_t rlen; //the real length of the token.(before any convert)
|
||||
uchar_t length; //the length of the token.(after the convertor of Friso.)
|
||||
uchar_t rlen; //the real length of the token.(before any convert)
|
||||
uchar_t type;
|
||||
uchar_t ctrlMask; //function control mask, like append the synoyums words.
|
||||
uint_t offset; //offset index.
|
||||
uchar_t ctrlMask; //function control mask, like append the synoyums words.
|
||||
uint_t offset; //offset index.
|
||||
fstring word;
|
||||
//fstring py; //pinyin of the word.(invalid)
|
||||
friso_array_t syn; //synoyums words.
|
||||
friso_array_t pos; //part of speech.
|
||||
uint_t fre; //single word frequency.
|
||||
//fstring py; //pinyin of the word.(invalid)
|
||||
friso_array_t syn; //synoyums words.
|
||||
friso_array_t pos; //part of speech.
|
||||
uint_t fre; //single word frequency.
|
||||
} lex_entry_cdt;
|
||||
typedef lex_entry_cdt * lex_entry_t;
|
||||
|
||||
@ -108,11 +108,11 @@ typedef lex_entry_cdt * lex_entry_t;
|
||||
#define __HITS_WORD_LENGTH__ 64
|
||||
|
||||
typedef struct {
|
||||
uchar_t type; //type of the word. (item of friso_lex_t)
|
||||
uchar_t length; //length of the token.
|
||||
uchar_t rlen; //the real length of the token.(in orgin strng)
|
||||
char pos; //part of speech.
|
||||
int offset; //start offset of the word.
|
||||
uchar_t type; //type of the word. (item of friso_lex_t)
|
||||
uchar_t length; //length of the token.
|
||||
uchar_t rlen; //the real length of the token.(in orgin strng)
|
||||
char pos; //part of speech.
|
||||
int offset; //start offset of the word.
|
||||
char word[__HITS_WORD_LENGTH__];
|
||||
//char py[0];
|
||||
} friso_token_entry;
|
||||
@ -122,25 +122,25 @@ typedef friso_token_entry * friso_token_t;
|
||||
/*
|
||||
* Type: friso_task_entry
|
||||
* This type is used to represent the current segmentation content.
|
||||
* like the text to split, and the current index, token buffer eg....
|
||||
* like the text to split, and the current index, token buffer eg....
|
||||
*/
|
||||
//action control mask for #FRISO_TASK_T#.
#define _TASK_CHECK_CF_MASK     (1 << 0)    //Whether to check the chinese fraction.
#define _TASK_START_SS_MASK     (1 << 1)    //Whether to start the secondary segmentation.
//set, clear and test the secondary segmentation bit in task->ctrlMask.
//task and the whole expansion are parenthesized, so any pointer expression
// can be passed and the result can be used inside a larger expression.
#define task_ssseg_open(task)   ((task)->ctrlMask |= _TASK_START_SS_MASK)
#define task_ssseg_close(task)  ((task)->ctrlMask &= ~_TASK_START_SS_MASK)
#define task_ssseg_check(task)  (((task)->ctrlMask & _TASK_START_SS_MASK) != 0)
|
||||
typedef struct {
|
||||
fstring text; //text to tokenize
|
||||
uint_t idx; //start offset index.
|
||||
uint_t length; //length of the text.
|
||||
uint_t bytes; //latest word bytes in C.
|
||||
uint_t unicode; //latest word unicode number.
|
||||
uint_t ctrlMask; //action control mask.
|
||||
friso_link_t pool; //task pool.
|
||||
string_buffer_t sbuf; //string buffer.
|
||||
friso_token_t token; //token result token;
|
||||
char buffer[7]; //word buffer. (1-6 bytes for an utf-8 word in C).
|
||||
fstring text; //text to tokenize
|
||||
uint_t idx; //start offset index.
|
||||
uint_t length; //length of the text.
|
||||
uint_t bytes; //latest word bytes in C.
|
||||
uint_t unicode; //latest word unicode number.
|
||||
uint_t ctrlMask; //action control mask.
|
||||
friso_link_t pool; //task pool.
|
||||
string_buffer_t sbuf; //string buffer.
|
||||
friso_token_t token; //token result token;
|
||||
char buffer[7]; //word buffer. (1-6 bytes for an utf-8 word in C).
|
||||
} friso_task_entry;
|
||||
typedef friso_task_entry * friso_task_t;
|
||||
|
||||
@ -151,23 +151,23 @@ typedef friso_task_entry * friso_task_t;
|
||||
//typedef friso_token_t ( * friso_next_hit_fn ) ( friso_t, void *, friso_task_t );
|
||||
//typedef lex_entry_t ( * friso_next_lex_fn ) ( friso_t, void *, friso_task_t );
|
||||
struct friso_config_struct {
|
||||
ushort_t max_len; //the max match length (4 - 7).
|
||||
ushort_t r_name; //1 for open chinese name recognition 0 for close it.
|
||||
ushort_t mix_len; //the max length for the CJK words in a mix string.
|
||||
ushort_t lna_len; //the max length for the chinese last name adron.
|
||||
ushort_t add_syn; //append synonyms tokenizer words.
|
||||
ushort_t clr_stw; //clear the stopwords.
|
||||
ushort_t keep_urec; //keep the unrecongnized words.
|
||||
ushort_t spx_out; //use sphinx output customize.
|
||||
ushort_t en_sseg; //start the secondary segmentation.
|
||||
ushort_t st_minl; //min length of the secondary segmentation token.
|
||||
uint_t nthreshold; //the threshold value for a char to make up a chinese name.
|
||||
friso_mode_t mode; //Complex mode or simple mode
|
||||
ushort_t max_len; //the max match length (4 - 7).
|
||||
ushort_t r_name; //1 for open chinese name recognition 0 for close it.
|
||||
ushort_t mix_len; //the max length for the CJK words in a mix string.
|
||||
ushort_t lna_len; //the max length for the chinese last name adron.
|
||||
ushort_t add_syn; //append synonyms tokenizer words.
|
||||
ushort_t clr_stw; //clear the stopwords.
|
||||
ushort_t keep_urec; //keep the unrecongnized words.
|
||||
ushort_t spx_out; //use sphinx output customize.
|
||||
ushort_t en_sseg; //start the secondary segmentation.
|
||||
ushort_t st_minl; //min length of the secondary segmentation token.
|
||||
uint_t nthreshold; //the threshold value for a char to make up a chinese name.
|
||||
friso_mode_t mode; //Complex mode or simple mode
|
||||
|
||||
//pointer to the function to get the next token
|
||||
friso_token_t (*next_token) (friso_t, struct friso_config_struct *, friso_task_t);
|
||||
//pointer to the function to get the next cjk lex_entry_t
|
||||
lex_entry_t (*next_cjk ) (friso_t, struct friso_config_struct *, friso_task_t);
|
||||
//pointer to the function to get the next token
|
||||
friso_token_t (*next_token) (friso_t, struct friso_config_struct *, friso_task_t);
|
||||
//pointer to the function to get the next cjk lex_entry_t
|
||||
lex_entry_t (*next_cjk ) (friso_t, struct friso_config_struct *, friso_task_t);
|
||||
|
||||
char kpuncs[_FRISO_KEEP_PUNC_LEN]; //keep punctuations buffer.
|
||||
};
|
||||
@ -181,7 +181,7 @@ typedef friso_config_entry * friso_config_t;
|
||||
* Usage: vars = friso_new( void );
|
||||
* --------------------------------
|
||||
* This function used to create a new empty friso friso_t;
|
||||
* with default value.
|
||||
* with default value.
|
||||
*/
|
||||
FRISO_API friso_t friso_new( void );
|
||||
|
||||
@ -202,7 +202,7 @@ FRISO_API void friso_free( friso_t );
|
||||
* Usage: dic = friso_set_dic( vars, dic );
|
||||
* ----------------------------------------
|
||||
* This function is used to set the dictionary for friso.
|
||||
* and firso_dic_t is the pointer of a hash table array.
|
||||
* and firso_dic_t is the pointer of a hash table array.
|
||||
*/
|
||||
//FRISO_API void friso_set_dic( friso_t, friso_dic_t );
|
||||
#define friso_set_dic(friso, dic)\
|
||||
@ -272,7 +272,7 @@ FRISO_API lex_entry_t next_complex_cjk( friso_t, friso_config_t, friso_task_t );
|
||||
* Usage: word = next_mmseg_token( vars, seg );
|
||||
* --------------------------------------
|
||||
* This function is used to get next word that friso segmented
|
||||
* with a split mode of __FRISO_SIMPLE_MODE__ or __FRISO_COMPLEX_MODE__
|
||||
* with a split mode of __FRISO_SIMPLE_MODE__ or __FRISO_COMPLEX_MODE__
|
||||
*/
|
||||
FRISO_API friso_token_t next_mmseg_token( friso_t, friso_config_t, friso_task_t );
|
||||
|
||||
@ -313,14 +313,14 @@ FRISO_API void free_lex_entry( lex_entry_t );
|
||||
* Usage: friso_dic_load( friso, friso_lex_t, path, length );
|
||||
* --------------------------------------------------
|
||||
* This function is used to load dictionary from a given path.
|
||||
* no length limit when length less than 0.
|
||||
* no length limit when length less than 0.
|
||||
*/
|
||||
FRISO_API void friso_dic_load( friso_t, friso_config_t,
|
||||
friso_lex_t, fstring, uint_t );
|
||||
friso_lex_t, fstring, uint_t );
|
||||
|
||||
/*
|
||||
* load the lexicon configuration file.
|
||||
* and load all the valid lexicon from the conf file.
|
||||
* and load all the valid lexicon from the conf file.
|
||||
*/
|
||||
FRISO_API void friso_dic_load_from_ifile( friso_t, friso_config_t, fstring, uint_t );
|
||||
|
||||
|
@ -16,22 +16,22 @@
|
||||
|
||||
//platform detection: anything that does not define one of the windows
// macros below is treated as linux.
#if ( defined(_WIN32) || defined(_WINDOWS_) || defined(__WINDOWS_) )
#    define FRISO_WINNT
#else
#    define FRISO_LINUX
#endif

//linkage macros for the public api and for the file-local helpers.
#ifdef FRISO_WINNT
#    define FRISO_API       extern __declspec(dllexport)
#    define __STATIC_API__  static
#else
/*platform shared library statement :: unix*/
#    define FRISO_API       extern
#    define __STATIC_API__  static inline
#endif
|
||||
|
||||
//report an allocation failure and terminate the program.
//the expansion is two complete statements that already end with ';',
// so use it inside braces: if ( p == NULL ) { ___ALLOCATION_ERROR___ }
#define ___ALLOCATION_ERROR___ \
    printf("Unable to do the memory allocation, program will now exit\n" ); \
    exit(1);

//print a string with printf and no trailing newline.
#define print(str) printf("%s", str )
|
||||
@ -39,12 +39,12 @@ exit(1);
|
||||
|
||||
/*
 * memory allocation macro definition.
 * because emalloc, ecalloc, etc. should be used in php,
 * redefine these to better adapt to the php environment.
 */
#define FRISO_CALLOC(_bytes, _blocks)   calloc(_bytes, _blocks)
#define FRISO_MALLOC(_bytes)            malloc(_bytes)
#define FRISO_FREE( _ptr )              free( _ptr )
|
||||
|
||||
typedef unsigned short ushort_t;
|
||||
typedef unsigned char uchar_t;
|
||||
@ -74,7 +74,7 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str );
|
||||
|
||||
/*
|
||||
* this function will copy the chars that the fstring pointed.
|
||||
* to the buffer.
|
||||
* to the buffer.
|
||||
* this may cause the resize action of the buffer.
|
||||
*/
|
||||
FRISO_API void string_buffer_append( string_buffer_t, fstring );
|
||||
@ -88,21 +88,21 @@ FRISO_API fstring string_buffer_remove( string_buffer_t, uint_t idx, uint_t );
|
||||
|
||||
/*
|
||||
* turn the string_buffer to a string.
|
||||
* or return the buffer of the string_buffer.
|
||||
* or return the buffer of the string_buffer.
|
||||
*/
|
||||
FRISO_API string_buffer_t string_buffer_trim( string_buffer_t );
|
||||
|
||||
/*
|
||||
* free the given fstring buffer.
|
||||
* and this function will not free the allocations of the
|
||||
* the string_buffer_t->buffer, we return it to you, if there is
|
||||
* a necessary you could free it youself by calling free();
|
||||
* and this function will not free the allocations of the
|
||||
* the string_buffer_t->buffer, we return it to you, if there is
|
||||
* a necessary you could free it youself by calling free();
|
||||
*/
|
||||
FRISO_API fstring string_buffer_devote( string_buffer_t );
|
||||
|
||||
/*
|
||||
* clear the given fstring buffer.
|
||||
* reset its buffer with 0 and reset its length to 0.
|
||||
* reset its buffer with 0 and reset its length to 0.
|
||||
*/
|
||||
FRISO_API void string_buffer_clear( string_buffer_t );
|
||||
|
||||
@ -126,8 +126,8 @@ typedef string_split_entry * string_split_t;
|
||||
/**
|
||||
* create a new string_split_entry.
|
||||
*
|
||||
* @param source
|
||||
* @return string_split_t;
|
||||
* @param source
|
||||
* @return string_split_t;
|
||||
*/
|
||||
FRISO_API string_split_t new_string_split( fstring, fstring );
|
||||
|
||||
@ -141,12 +141,12 @@ FRISO_API void free_string_split( string_split_t );
|
||||
|
||||
/**
|
||||
* get the next split fstring, and copy the
|
||||
* splited fstring into the __dst buffer .
|
||||
* splited fstring into the __dst buffer .
|
||||
*
|
||||
* @param string_split_t
|
||||
* @param __dst
|
||||
* @return fstring (NULL if reach the end of the source
|
||||
* or there is no more segmentation)
|
||||
* @param string_split_t
|
||||
* @param __dst
|
||||
* @return fstring (NULL if reach the end of the source
|
||||
* or there is no more segmentation)
|
||||
*/
|
||||
FRISO_API fstring string_split_next( string_split_t, fstring );
|
||||
/* }}} */
|
||||
@ -175,7 +175,7 @@ FRISO_API friso_array_t new_array_list_with_opacity( uint_t );
|
||||
|
||||
/*
|
||||
* free the given friso array.
|
||||
* and its items, but never where the items's item to pointed to .
|
||||
* and its items, but never where the items's item to pointed to .
|
||||
*/
|
||||
FRISO_API void free_array_list( friso_array_t );
|
||||
|
||||
@ -190,13 +190,13 @@ FRISO_API void *array_list_get( friso_array_t, uint_t );
|
||||
|
||||
/*
|
||||
* set the item at a specified position.
|
||||
* this will return the old value.
|
||||
* this will return the old value.
|
||||
*/
|
||||
FRISO_API void *array_list_set( friso_array_t, uint_t, void * );
|
||||
|
||||
/*
|
||||
* remove the given item at a specified position.
|
||||
* this will return the value of the removed item.
|
||||
* this will return the value of the removed item.
|
||||
*/
|
||||
FRISO_API void *array_list_remove( friso_array_t, uint_t );
|
||||
|
||||
@ -205,9 +205,9 @@ FRISO_API friso_array_t array_list_trim( friso_array_t );
|
||||
|
||||
/*
|
||||
* clear the array list.
|
||||
* this function will free all the allocations that the pointer pointed.
|
||||
* but will not free the point array allocations,
|
||||
* and will reset the length of it.
|
||||
* this function will free all the allocations that the pointer pointed.
|
||||
* but will not free the point array allocations,
|
||||
* and will reset the length of it.
|
||||
*/
|
||||
FRISO_API friso_array_t array_list_clear( friso_array_t );
|
||||
|
||||
@ -300,8 +300,8 @@ FRISO_API void link_list_add_first( friso_link_t, void * );
|
||||
|
||||
/* {{{ hashtable interface define :: start*/
|
||||
struct hash_entry {
|
||||
fstring _key; //the node key
|
||||
void * _val; //the node value
|
||||
fstring _key; //the node key
|
||||
void * _val; //the node value
|
||||
struct hash_entry * _next;
|
||||
};
|
||||
typedef struct hash_entry friso_hash_entry;
|
||||
@ -319,8 +319,8 @@ typedef struct {
|
||||
typedef friso_hash_cdt * friso_hash_t;
|
||||
|
||||
//default values for friso_hash_cdt
#define DEFAULT_LENGTH  31
#define DEFAULT_FACTOR  0.85f
|
||||
|
||||
/*
|
||||
* Function: new_hash_table
|
||||
@ -359,7 +359,7 @@ FRISO_API int hash_exist_mapping( friso_hash_t, fstring );
|
||||
* Usage: value = get_mapping_value( table, key );
|
||||
* -----------------------------------------------
|
||||
* this function returns the value associated with the given key.
|
||||
* UNDEFINED will be return if the mapping is not exists.
|
||||
* UNDEFINED will be return if the mapping is not exists.
|
||||
*/
|
||||
FRISO_API void * hash_get_value( friso_hash_t, fstring );
|
||||
|
||||
|
192
src/friso_GBK.c
192
src/friso_GBK.c
@ -1,6 +1,6 @@
|
||||
/**
|
||||
* Friso GBK about function implements source file.
|
||||
* @package src/friso_GBK.c .
|
||||
* @package src/friso_GBK.c .
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
@ -12,12 +12,12 @@
|
||||
|
||||
/* read the next GBK word from the specified position.
|
||||
*
|
||||
* @return int the bytes of the current readed word.
|
||||
* @return int the bytes of the current readed word.
|
||||
*/
|
||||
FRISO_API int gbk_next_word(
|
||||
friso_task_t task,
|
||||
uint_t *idx,
|
||||
fstring __word )
|
||||
friso_task_t task,
|
||||
uint_t *idx,
|
||||
fstring __word )
|
||||
{
|
||||
int c;
|
||||
if ( *idx >= task->length ) return 0;
|
||||
@ -41,26 +41,26 @@ FRISO_API int gbk_next_word(
|
||||
//}
|
||||
|
||||
//check if the given buffer is a gbk word (ANSII string).
|
||||
// included the simplified and traditional words.
|
||||
// included the simplified and traditional words.
|
||||
FRISO_API int gbk_cn_string( char *str )
|
||||
{
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
//GBK/2: gb2312 chinese word.
|
||||
return ( ((c1 >= 0xb0 && c1 <= 0xf7)
|
||||
&& (c2 >= 0xa1 && c2 <= 0xfe))
|
||||
&& (c2 >= 0xa1 && c2 <= 0xfe))
|
||||
//GBK/3: extend chinese words.
|
||||
|| ((c1 >= 0x81 && c1 <= 0xa0)
|
||||
&& ( (c2 >= 0x40 && c2 <= 0x7e)
|
||||
|| (c2 >= 0x80 && c2 <= 0xfe) ))
|
||||
|| ((c1 >= 0x81 && c1 <= 0xa0)
|
||||
&& ( (c2 >= 0x40 && c2 <= 0x7e)
|
||||
|| (c2 >= 0x80 && c2 <= 0xfe) ))
|
||||
//GBK/4: extend chinese words.
|
||||
|| ((c1 >= 0xaa && c1 <= 0xfe)
|
||||
&& ( (c2 >= 0x40 && c2 <= 0xfe)
|
||||
|| (c2 >= 0x80 && c2 <= 0xa0) )) );
|
||||
|| ((c1 >= 0xaa && c1 <= 0xfe)
|
||||
&& ( (c2 >= 0x40 && c2 <= 0xfe)
|
||||
|| (c2 >= 0x80 && c2 <= 0xa0) )) );
|
||||
}
|
||||
|
||||
/*check if the given char is a ASCII letter
|
||||
* include all the arabic number, letters and english puntuations.*/
|
||||
* include all the arabic number, letters and english puntuations.*/
|
||||
FRISO_API int gbk_halfwidth_en_char( char c )
|
||||
{
|
||||
int u = (uchar_t) c;
|
||||
@ -69,58 +69,58 @@ FRISO_API int gbk_halfwidth_en_char( char c )
|
||||
|
||||
/*
|
||||
* check if the given char is a full-width latain.
|
||||
* include the full-width arabic numeber, letters.
|
||||
* but not the full-width puntuations.
|
||||
* include the full-width arabic numeber, letters.
|
||||
* but not the full-width puntuations.
|
||||
*/
|
||||
FRISO_API int gbk_fullwidth_en_char( char *str )
|
||||
{
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
return ( (c1 == 0xA3)
|
||||
&& ( (c2 >= 0xB0 && c2 <= 0xB9) //arabic numbers.
|
||||
|| ( c2 >= 0xC1 && c2 <= 0xDA ) //uppercase letters.
|
||||
|| ( c2 >= 0xE1 && c2 <= 0xFA) ) ); //lowercase letters.
|
||||
&& ( (c2 >= 0xB0 && c2 <= 0xB9) //arabic numbers.
|
||||
|| ( c2 >= 0xC1 && c2 <= 0xDA ) //uppercase letters.
|
||||
|| ( c2 >= 0xE1 && c2 <= 0xFA) ) ); //lowercase letters.
|
||||
}
|
||||
|
||||
//check if the given char is a upper case english letter.
|
||||
// included the full-width and half-width letters.
|
||||
// included the full-width and half-width letters.
|
||||
FRISO_API int gbk_uppercase_letter( char *str )
|
||||
{
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
return ( c1 >= 65 && c1 <= 90 );
|
||||
else //full-width
|
||||
return ( c1 == 0xa3 && ( c2 >= 0xc1 && c2 <= 0xda ) );
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
return ( c1 >= 65 && c1 <= 90 );
|
||||
else //full-width
|
||||
return ( c1 == 0xa3 && ( c2 >= 0xc1 && c2 <= 0xda ) );
|
||||
}
|
||||
|
||||
//check if the given char is a lower case char.
|
||||
// included the full-width and half-width letters.
|
||||
// included the full-width and half-width letters.
|
||||
FRISO_API int gbk_lowercase_letter( char *str )
|
||||
{
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
return ( c1 >= 97 && c1 <= 122 );
|
||||
else //full-width
|
||||
return ( c1 == 0xa3 && ( c2 >= 0xe1 && c2 <= 0xfa ) );
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
return ( c1 >= 97 && c1 <= 122 );
|
||||
else //full-width
|
||||
return ( c1 == 0xa3 && ( c2 >= 0xe1 && c2 <= 0xfa ) );
|
||||
}
|
||||
|
||||
//check if the given char is a arabic numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
FRISO_API int gbk_numeric_letter( char *str )
|
||||
{
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
return ( c1 >= 48 && c1 <= 57 );
|
||||
else //full-width
|
||||
return ( ( c1 == 0xa3 ) && ( c2 >= 0xb0 && c2 <= 0xb9 ) );
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
return ( c1 >= 48 && c1 <= 57 );
|
||||
else //full-width
|
||||
return ( ( c1 == 0xa3 ) && ( c2 >= 0xb0 && c2 <= 0xb9 ) );
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given fstring is made up of numeric chars.
|
||||
* both full-width,half-width numeric is ok.
|
||||
* both full-width,half-width numeric is ok.
|
||||
*/
|
||||
FRISO_API int gbk_numeric_string( char *str )
|
||||
{
|
||||
@ -130,17 +130,17 @@ FRISO_API int gbk_numeric_string( char *str )
|
||||
|
||||
while ( *s != '\0' )
{
    c1 = (uchar_t) (*s++);
    if ( c1 <= 0x80 )   //half-width
    {
        //a half-width digit is one byte, so both bounds are tested on c1;
        // c2 is only assigned on the full-width path below.
        if ( c1 < 48 || c1 > 57 ) return 0;
    }
    else                //full-width
    {
        //a full-width digit is 0xa3 followed by 0xb0 - 0xb9.
        if ( c1 != 0xa3 ) return 0;
        c2 = (uchar_t) (*s++);
        if ( c2 < 0xb0 || c2 > 0xb9 ) return 0;
    }
}
|
||||
|
||||
return 1;
|
||||
@ -157,47 +157,47 @@ FRISO_API int gbk_decimal_string( char *str )
|
||||
|
||||
for ( i = 0; i < len; )
|
||||
{
|
||||
c1 = (uchar_t) str[i++];
|
||||
//count the number of the points.
|
||||
if ( c1 == 46 )
|
||||
{
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
c1 = (uchar_t) str[i++];
|
||||
//count the number of the points.
|
||||
if ( c1 == 46 )
|
||||
{
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
{
|
||||
if ( c1 < 48 || c1 > 57 ) return 0;
|
||||
}
|
||||
else //full-width
|
||||
{
|
||||
if ( c1 != 0xa3 ) return 0;
|
||||
c2 = (uchar_t) str[i++];
|
||||
if ( c2 < 0xb0 || c2 > 0xb9 ) return 0;
|
||||
}
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
{
|
||||
if ( c1 < 48 || c1 > 57 ) return 0;
|
||||
}
|
||||
else //full-width
|
||||
{
|
||||
if ( c1 != 0xa3 ) return 0;
|
||||
c2 = (uchar_t) str[i++];
|
||||
if ( c2 < 0xb0 || c2 > 0xb9 ) return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return (p == 1);
|
||||
}
|
||||
|
||||
//check if the given char is a english(ASCII) letter.
|
||||
// (full-width and half-width), not the punctuation/arabic of course.
|
||||
// (full-width and half-width), not the punctuation/arabic of course.
|
||||
FRISO_API int gbk_en_letter( char *str )
|
||||
{
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
return ( (c1 >= 65 && c1 <= 90) //lowercase
|
||||
|| (c1 >= 97 && c1 <= 122)); //uppercase
|
||||
if ( c1 <= 0x80 ) //half-width
|
||||
return ( (c1 >= 65 && c1 <= 90) //lowercase
|
||||
|| (c1 >= 97 && c1 <= 122)); //uppercase
|
||||
else
|
||||
return ( (c1 == 0xa3)
|
||||
&& ( ( c2 >= 0xc1 && c2 <= 0xda ) //lowercase
|
||||
|| ( c2 >= 0xe1 && c2 <= 0xfa ) ) ); //uppercase
|
||||
return ( (c1 == 0xa3)
|
||||
&& ( ( c2 >= 0xc1 && c2 <= 0xda ) //lowercase
|
||||
|| ( c2 >= 0xe1 && c2 <= 0xfa ) ) ); //uppercase
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check the given char is a whitespace or not.
|
||||
// included full-width and half-width whitespace.
|
||||
// included full-width and half-width whitespace.
|
||||
FRISO_API int gbk_whitespace( char *str )
|
||||
{
|
||||
int c1 = (uchar_t) str[0];
|
||||
@ -213,8 +213,8 @@ FRISO_API int gbk_letter_number( char *str )
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
return ( (c1 == 0xa2)
|
||||
&& ( ( c2 >= 0xa1 && c2 <= 0xb0 ) //lowercase
|
||||
|| ( c2 >= 0xf0 && c2 <= 0xfe ) ) ); //uppercase
|
||||
&& ( ( c2 >= 0xa1 && c2 <= 0xb0 ) //lowercase
|
||||
|| ( c2 >= 0xf0 && c2 <= 0xfe ) ) ); //uppercase
|
||||
}
|
||||
|
||||
/*
|
||||
@ -232,9 +232,9 @@ FRISO_API int gbk_en_punctuation( char c )
|
||||
{
|
||||
int u = (uchar_t) c;
|
||||
return ( (u > 32 && u < 48)
|
||||
|| ( u > 57 && u < 65 )
|
||||
|| ( u > 90 && u < 97 )
|
||||
|| ( u > 122 && u < 127 ) );
|
||||
|| ( u > 57 && u < 65 )
|
||||
|| ( u > 90 && u < 97 )
|
||||
|| ( u > 122 && u < 127 ) );
|
||||
}
|
||||
|
||||
//check the given char is a chinese punctuation.
|
||||
@ -244,16 +244,16 @@ FRISO_API int gbk_cn_punctuation( char *str )
|
||||
int c2 = (uchar_t) str[1];
|
||||
//full-width en punctuation.
|
||||
return ( (c1 == 0xa3 && (( c2 >= 0xa1 && c2 <= 0xaf )
|
||||
|| ( c2 >= 0xba && c2 <= 0xc0 )
|
||||
|| ( c2 >= 0xdb && c2 <= 0xe0 )
|
||||
|| ( c2 >= 0xfb && c2 <= 0xfe ) ))
|
||||
|| ( c2 >= 0xba && c2 <= 0xc0 )
|
||||
|| ( c2 >= 0xdb && c2 <= 0xe0 )
|
||||
|| ( c2 >= 0xfb && c2 <= 0xfe ) ))
|
||||
//chinese punctuation.
|
||||
|| (c1 == 0xa1 && ( (c2 >= 0xa1 && c2 <= 0xae)
|
||||
|| ( c2 >= 0xb0 && c2 <= 0xbf ) ))
|
||||
|| (c1 == 0xa1 && ( (c2 >= 0xa1 && c2 <= 0xae)
|
||||
|| ( c2 >= 0xb0 && c2 <= 0xbf ) ))
|
||||
//A6 area special punctuations:" "
|
||||
|| (c1 == 0xa6 && (c2 >= 0xf9 && c2 <= 0xfe))
|
||||
|| (c1 == 0xa6 && (c2 >= 0xf9 && c2 <= 0xfe))
|
||||
//A8 area special punctuations: " ˊˋ˙–―‥‵℅ "
|
||||
|| (c1 == 0xa8 && (c2 >= 0x40 && c2 <= 0x47)) );
|
||||
|| (c1 == 0xa8 && (c2 >= 0x40 && c2 <= 0x47)) );
|
||||
}
|
||||
|
||||
/* {{{
|
||||
@ -269,19 +269,19 @@ FRISO_API int gbk_cn_punctuation( char *str )
|
||||
//FRISO_API int gbk_keep_punctuation( char *str )
|
||||
//{
|
||||
// if ( __keep_punctuations_hash__ == NULL ) {
|
||||
// __keep_punctuations_hash__ = new_hash_table();
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
|
||||
// __keep_punctuations_hash__ = new_hash_table();
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
|
||||
// }
|
||||
// //check the hash.
|
||||
// return hash_exist_mapping( __keep_punctuations_hash__, str );
|
||||
|
334
src/friso_UTF8.c
334
src/friso_UTF8.c
@ -1,6 +1,6 @@
|
||||
/**
|
||||
* Friso utf8 about function implements source file.
|
||||
* @package src/friso_UTF8.c .
|
||||
* @package src/friso_UTF8.c .
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
@ -12,12 +12,12 @@
|
||||
|
||||
/* read the next utf-8 word from the specified position.
|
||||
*
|
||||
* @return int the bytes of the current readed word.
|
||||
* @return int the bytes of the current readed word.
|
||||
*/
|
||||
FRISO_API int utf8_next_word(
|
||||
friso_task_t task,
|
||||
uint_t *idx,
|
||||
fstring __word )
|
||||
friso_task_t task,
|
||||
uint_t *idx,
|
||||
fstring __word )
|
||||
{
|
||||
if ( *idx >= task->length ) return 0;
|
||||
|
||||
@ -25,7 +25,7 @@ FRISO_API int utf8_next_word(
|
||||
task->bytes = get_utf8_bytes( task->text[ *idx ] );
|
||||
|
||||
//for ( t = 0; t < task->bytes; t++ ) {
|
||||
// __word[t] = task->text[ (*idx)++ ];
|
||||
// __word[t] = task->text[ (*idx)++ ];
|
||||
//}
|
||||
|
||||
//change the loop to memcpy.
|
||||
@ -52,31 +52,31 @@ FRISO_API void print_char_binary( char value )
|
||||
|
||||
for ( t = 0; t < __CHAR_BYTES__; t++ )
|
||||
{
|
||||
if ( ( value & 0x80 ) == 0x80 ) {
|
||||
printf("1");
|
||||
} else {
|
||||
printf("0");
|
||||
}
|
||||
value <<= 1;
|
||||
if ( ( value & 0x80 ) == 0x80 ) {
|
||||
printf("1");
|
||||
} else {
|
||||
printf("0");
|
||||
}
|
||||
value <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* get the bytes of a utf-8 char.
|
||||
* between 1 - 6.
|
||||
* between 1 - 6.
|
||||
*
|
||||
* @param __char
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int get_utf8_bytes( char value )
|
||||
{
|
||||
{
|
||||
register uint_t t = 0;
|
||||
|
||||
//one byte ascii char.
|
||||
if ( ( value & 0x80 ) == 0 ) return 1;
|
||||
|
||||
for ( ; ( value & 0x80 ) != 0; value <<= 1 )
|
||||
t++;
|
||||
t++;
|
||||
|
||||
return t;
|
||||
}
|
||||
@ -94,25 +94,25 @@ FRISO_API int get_utf8_unicode( const fstring ch )
|
||||
register char b1,b2,b3;
|
||||
|
||||
switch ( bytes ) {
|
||||
case 1:
|
||||
*bit = *ch;
|
||||
break;
|
||||
case 2:
|
||||
b1 = *ch;
|
||||
b2 = *(ch + 1);
|
||||
case 1:
|
||||
*bit = *ch;
|
||||
break;
|
||||
case 2:
|
||||
b1 = *ch;
|
||||
b2 = *(ch + 1);
|
||||
|
||||
*bit = (b1 << 6) + (b2 & 0x3F);
|
||||
*(bit+1) = (b1 >> 2) & 0x07;
|
||||
break;
|
||||
case 3:
|
||||
b1 = *ch;
|
||||
b2 = *(ch + 1);
|
||||
b3 = *(ch + 2);
|
||||
*bit = (b1 << 6) + (b2 & 0x3F);
|
||||
*(bit+1) = (b1 >> 2) & 0x07;
|
||||
break;
|
||||
case 3:
|
||||
b1 = *ch;
|
||||
b2 = *(ch + 1);
|
||||
b3 = *(ch + 2);
|
||||
|
||||
*bit = (b2 << 6) + (b3 & 0x3F);
|
||||
*(bit+1) = (b1 << 4) + ((b2 >> 2) & 0x0F);
|
||||
break;
|
||||
//ignore the ones that are larger than 3 bytes;
|
||||
*bit = (b2 << 6) + (b3 & 0x3F);
|
||||
*(bit+1) = (b1 << 4) + ((b2 >> 2) & 0x0F);
|
||||
break;
|
||||
//ignore the ones that are larger than 3 bytes;
|
||||
}
|
||||
|
||||
return code;
|
||||
@ -122,50 +122,50 @@ FRISO_API int get_utf8_unicode( const fstring ch )
|
||||
FRISO_API int unicode_to_utf8( uint_t u, fstring __word )
|
||||
{
|
||||
if ( u <= 0x0000007F ) {
|
||||
//U-00000000 - U-0000007F
|
||||
//0xxxxxxx
|
||||
*__word = ( u & 0x7F );
|
||||
return 1;
|
||||
//U-00000000 - U-0000007F
|
||||
//0xxxxxxx
|
||||
*__word = ( u & 0x7F );
|
||||
return 1;
|
||||
} else if ( u >= 0x00000080 && u <= 0x000007FF ) {
|
||||
//U-00000080 - U-000007FF
|
||||
//110xxxxx 10xxxxxx
|
||||
*( __word + 1 ) = ( u & 0x3F) | 0x80;
|
||||
*__word = ((u >> 6) & 0x1F) | 0xC0;
|
||||
return 2;
|
||||
//U-00000080 - U-000007FF
|
||||
//110xxxxx 10xxxxxx
|
||||
*( __word + 1 ) = ( u & 0x3F) | 0x80;
|
||||
*__word = ((u >> 6) & 0x1F) | 0xC0;
|
||||
return 2;
|
||||
} else if ( u >= 0x00000800 && u <= 0x0000FFFF ) {
|
||||
//U-00000800 - U-0000FFFF
|
||||
//1110xxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 2 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 12) & 0x0F) | 0xE0;
|
||||
return 3;
|
||||
//U-00000800 - U-0000FFFF
|
||||
//1110xxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 2 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 12) & 0x0F) | 0xE0;
|
||||
return 3;
|
||||
} else if ( u >= 0x00010000 && u <= 0x001FFFFF ) {
|
||||
//U-00010000 - U-001FFFFF
|
||||
//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 3 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 2 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 18) & 0x07) | 0xF0;
|
||||
return 4;
|
||||
//U-00010000 - U-001FFFFF
|
||||
//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 3 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 2 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 18) & 0x07) | 0xF0;
|
||||
return 4;
|
||||
} else if ( u >= 0x00200000 && u <= 0x03FFFFFF ) {
|
||||
//U-00200000 - U-03FFFFFF
|
||||
//111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 4 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 3 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*( __word + 2 ) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 18) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 24) & 0x03) | 0xF8;
|
||||
return 5;
|
||||
//U-00200000 - U-03FFFFFF
|
||||
//111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 4 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 3 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*( __word + 2 ) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 18) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 24) & 0x03) | 0xF8;
|
||||
return 5;
|
||||
} else if ( u >= 0x04000000 && u <= 0x7FFFFFFF ) {
|
||||
//U-04000000 - U-7FFFFFFF
|
||||
//1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 5 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 4 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*( __word + 3 ) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*( __word + 2 ) = ((u >> 18) & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 24) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 30) & 0x01) | 0xFC;
|
||||
return 6;
|
||||
//U-04000000 - U-7FFFFFFF
|
||||
//1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 5 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 4 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*( __word + 3 ) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*( __word + 2 ) = ((u >> 18) & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 24) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 30) & 0x01) | 0xFC;
|
||||
return 6;
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -173,28 +173,28 @@ FRISO_API int unicode_to_utf8( uint_t u, fstring __word )
|
||||
|
||||
/*
|
||||
* check the given char is a CJK char or not.
|
||||
* 2E80-2EFF CJK 部首补充
|
||||
* 2F00-2FDF 康熙字典部首
|
||||
* 3000-303F CJK 符号和标点 --ignore
|
||||
* 31C0-31EF CJK 笔画
|
||||
* 3200-32FF 封闭式 CJK 文字和月份 --ignore.
|
||||
* 3300-33FF CJK 兼容
|
||||
* 3400-4DBF CJK 统一表意符号扩展 A
|
||||
* 4DC0-4DFF 易经六十四卦符号
|
||||
* 4E00-9FBF CJK 统一表意符号
|
||||
* F900-FAFF CJK 兼容象形文字
|
||||
* FE30-FE4F CJK 兼容形式
|
||||
* FF00-FFEF 全角ASCII、全角标点 --ignore (as basic latin)
|
||||
* 2E80-2EFF CJK 部首补充
|
||||
* 2F00-2FDF 康熙字典部首
|
||||
* 3000-303F CJK 符号和标点 --ignore
|
||||
* 31C0-31EF CJK 笔画
|
||||
* 3200-32FF 封闭式 CJK 文字和月份 --ignore.
|
||||
* 3300-33FF CJK 兼容
|
||||
* 3400-4DBF CJK 统一表意符号扩展 A
|
||||
* 4DC0-4DFF 易经六十四卦符号
|
||||
* 4E00-9FBF CJK 统一表意符号
|
||||
* F900-FAFF CJK 兼容象形文字
|
||||
* FE30-FE4F CJK 兼容形式
|
||||
* FF00-FFEF 全角ASCII、全角标点 --ignore (as basic latin)
|
||||
*
|
||||
* Japanese:
|
||||
* 3040-309F 日本平假名
|
||||
* 30A0-30FF 日本片假名
|
||||
* 31F0-31FF 日本片假名拼音扩展
|
||||
* 3040-309F 日本平假名
|
||||
* 30A0-30FF 日本片假名
|
||||
* 31F0-31FF 日本片假名拼音扩展
|
||||
*
|
||||
* Korean:
|
||||
* AC00-D7AF 韩文拼音
|
||||
* 1100-11FF 韩文字母
|
||||
* 3130-318F 韩文兼容字母
|
||||
* AC00-D7AF 韩文拼音
|
||||
* 1100-11FF 韩文字母
|
||||
* 3130-318F 韩文兼容字母
|
||||
*
|
||||
* @param ch :pointer to the char
|
||||
* @return int : 1 for yes and 0 for not.
|
||||
@ -211,23 +211,23 @@ FRISO_API int utf8_cjk_string( uint_t u )
|
||||
//Chinese.
|
||||
#ifdef FRISO_CJK_CHK_C
|
||||
c = ( ( u >= 0x4E00 && u <= 0x9FBF )
|
||||
|| ( u >= 0x2E80 && u <= 0x2EFF ) || ( u >= 0x2F00 && u <= 0x2FDF )
|
||||
|| ( u >= 0x31C0 && u <= 0x31EF ) //|| ( u >= 0x3200 && u <= 0x32FF )
|
||||
|| ( u >= 0x3300 && u <= 0x33FF ) //|| ( u >= 0x3400 && u <= 0x4DBF )
|
||||
|| ( u >= 0x4DC0 && u <= 0x4DFF ) || ( u >= 0xF900 && u <= 0xFAFF )
|
||||
|| ( u >= 0xFE30 && u <= 0xFE4F ) );
|
||||
|| ( u >= 0x2E80 && u <= 0x2EFF ) || ( u >= 0x2F00 && u <= 0x2FDF )
|
||||
|| ( u >= 0x31C0 && u <= 0x31EF ) //|| ( u >= 0x3200 && u <= 0x32FF )
|
||||
|| ( u >= 0x3300 && u <= 0x33FF ) //|| ( u >= 0x3400 && u <= 0x4DBF )
|
||||
|| ( u >= 0x4DC0 && u <= 0x4DFF ) || ( u >= 0xF900 && u <= 0xFAFF )
|
||||
|| ( u >= 0xFE30 && u <= 0xFE4F ) );
|
||||
#endif
|
||||
|
||||
//Japanese.
|
||||
#ifdef FRISO_CJK_CHK_J
|
||||
j = ( ( u >= 0x3040 && u <= 0x309F )
|
||||
|| ( u >= 0x30A0 && u <= 0x30FF ) || ( u >= 0x31F0 && u <= 0x31FF ) );
|
||||
|| ( u >= 0x30A0 && u <= 0x30FF ) || ( u >= 0x31F0 && u <= 0x31FF ) );
|
||||
#endif
|
||||
|
||||
//Korean
|
||||
#ifdef FRISO_CJK_CHK_K
|
||||
k = ( ( u >= 0xAC00 && u <= 0xD7AF )
|
||||
|| ( u >= 0x1100 && u <= 0x11FF ) || ( u >= 0x3130 && u <= 0x318F ) );
|
||||
|| ( u >= 0x1100 && u <= 0x11FF ) || ( u >= 0x3130 && u <= 0x318F ) );
|
||||
#endif
|
||||
|
||||
return ( c || j || k );
|
||||
@ -235,7 +235,7 @@ FRISO_API int utf8_cjk_string( uint_t u )
|
||||
|
||||
/*
|
||||
* check the given char is a Basic Latin letter or not.
|
||||
* include all the letters and english punctuations.
|
||||
* include all the letters and english punctuations.
|
||||
*
|
||||
* @param c
|
||||
* @return int 1 for yes and 0 for not.
|
||||
@ -247,21 +247,21 @@ FRISO_API int utf8_halfwidth_en_char( uint_t u )
|
||||
|
||||
/*
|
||||
* check the given char is a full-width latain or not.
|
||||
* include the full-width arabic numeber, letters.
|
||||
* but not the full-width punctuations.
|
||||
* include the full-width arabic numeber, letters.
|
||||
* but not the full-width punctuations.
|
||||
*
|
||||
* @param c
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_fullwidth_en_char( uint_t u )
|
||||
{
|
||||
return ( (u >= 65296 && u <= 65305 ) //arabic number
|
||||
|| ( u >= 65313 && u <= 65338 ) //upper case letters
|
||||
|| ( u >= 65345 && u <= 65370 ) ); //lower case letters
|
||||
return ( (u >= 65296 && u <= 65305 ) //arabic number
|
||||
|| ( u >= 65313 && u <= 65338 ) //upper case letters
|
||||
|| ( u >= 65345 && u <= 65370 ) ); //lower case letters
|
||||
}
|
||||
|
||||
//check the given char is a upper case letters or not.
|
||||
// included the full-width and half-width letters.
|
||||
// included the full-width and half-width letters.
|
||||
FRISO_API int utf8_uppercase_letter( uint_t u )
|
||||
{
|
||||
if ( u > 65280 ) u -= 65248;
|
||||
@ -269,7 +269,7 @@ FRISO_API int utf8_uppercase_letter( uint_t u )
|
||||
}
|
||||
|
||||
//check the given char is a lower case letter or not.
|
||||
// included the full-width and half-width letters.
|
||||
// included the full-width and half-width letters.
|
||||
FRISO_API int utf8_lowercase_letter( uint_t u )
|
||||
{
|
||||
if ( u > 65280 ) u -= 65248;
|
||||
@ -277,25 +277,25 @@ FRISO_API int utf8_lowercase_letter( uint_t u )
|
||||
}
|
||||
|
||||
//check the given char is a numeric
|
||||
// included the full-width and half-width arabic numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
FRISO_API int utf8_numeric_letter( uint_t u )
|
||||
{
|
||||
if ( u > 65280 ) u -= 65248; //make full-width half-width.
|
||||
if ( u > 65280 ) u -= 65248; //make full-width half-width.
|
||||
return ( ( u >= 48 && u <= 57 ) );
|
||||
}
|
||||
|
||||
//check the given char is a english letter.(included the full-width)
|
||||
// not the punctuation of course.
|
||||
// not the punctuation of course.
|
||||
FRISO_API int utf8_en_letter( uint_t u )
|
||||
{
|
||||
if ( u > 65280 ) u -= 65248;
|
||||
return ( ( u >= 65 && u <= 90 )
|
||||
|| ( u >= 97 && u <= 122 ) );
|
||||
|| ( u >= 97 && u <= 122 ) );
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given fstring is make up with numeric.
|
||||
* both full-width,half-width numeric is ok.
|
||||
* both full-width,half-width numeric is ok.
|
||||
*
|
||||
* @param str
|
||||
* @return int
|
||||
@ -317,22 +317,22 @@ FRISO_API int utf8_numeric_string( const fstring str )
|
||||
|
||||
while ( *s != '\0' )
|
||||
{
|
||||
//if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) {
|
||||
// return 0;
|
||||
//}
|
||||
//if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) {
|
||||
// return 0;
|
||||
//}
|
||||
|
||||
//new implemention.
|
||||
//@date 2013-10-14
|
||||
bytes = 1;
|
||||
if ( *s < 0 ) //full-width chars.
|
||||
{
|
||||
u = get_utf8_unicode(s);
|
||||
bytes = get_utf8_bytes(*s);
|
||||
if ( u < 65296 || u > 65305 ) return 0;
|
||||
}
|
||||
else if ( *s < 48 || *s > 57 ) return 0;
|
||||
//new implemention.
|
||||
//@date 2013-10-14
|
||||
bytes = 1;
|
||||
if ( *s < 0 ) //full-width chars.
|
||||
{
|
||||
u = get_utf8_unicode(s);
|
||||
bytes = get_utf8_bytes(*s);
|
||||
if ( u < 65296 || u > 65305 ) return 0;
|
||||
}
|
||||
else if ( *s < 48 || *s > 57 ) return 0;
|
||||
|
||||
s += bytes;
|
||||
s += bytes;
|
||||
}
|
||||
|
||||
return 1;
|
||||
@ -347,24 +347,24 @@ FRISO_API int utf8_decimal_string( const fstring str )
|
||||
|
||||
for ( i = 1; i < len; bytes = 1 )
|
||||
{
|
||||
//count the number of char '.'
|
||||
if ( str[i] == '.' )
|
||||
{
|
||||
i++;
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
//count the number of char '.'
|
||||
if ( str[i] == '.' )
|
||||
{
|
||||
i++;
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
|
||||
//full-width numeric.
|
||||
else if ( str[i] < 0 )
|
||||
{
|
||||
u = get_utf8_unicode(str+i);
|
||||
bytes = get_utf8_bytes(str[i]);
|
||||
if ( u < 65296 || u > 65305 ) return 0;
|
||||
}
|
||||
else if ( str[i] < 48 || str[i] > 57 ) return 0;
|
||||
//full-width numeric.
|
||||
else if ( str[i] < 0 )
|
||||
{
|
||||
u = get_utf8_unicode(str+i);
|
||||
bytes = get_utf8_bytes(str[i]);
|
||||
if ( u < 65296 || u > 65305 ) return 0;
|
||||
}
|
||||
else if ( str[i] < 48 || str[i] > 57 ) return 0;
|
||||
|
||||
i += bytes;
|
||||
i += bytes;
|
||||
}
|
||||
|
||||
return (p == 1);
|
||||
@ -379,7 +379,7 @@ FRISO_API int utf8_decimal_string( const fstring str )
|
||||
FRISO_API int utf8_whitespace( uint_t u )
{
    //0x20 is the ascii space and 0x3000 is the full-width space.
    return ( u == 0x20 || u == 0x3000 ) ? 1 : 0;
}
|
||||
|
||||
@ -392,16 +392,16 @@ FRISO_API int utf8_whitespace( uint_t u )
|
||||
*/
|
||||
FRISO_API int utf8_en_punctuation( uint_t u )
{
    //the full-width forms are not folded to half-width here.
    if ( u >= 33  && u <= 47  ) return 1;   //between the space and '0'
    if ( u >= 58  && u <= 64  ) return 1;   //between '9' and 'A'
    if ( u >= 91  && u <= 96  ) return 1;   //between 'Z' and 'a', added @2013-08-31
    if ( u >= 123 && u <= 126 ) return 1;   //after 'z'
    return 0;
}
|
||||
|
||||
/*
|
||||
* check the given char is a chinese punctuation.
|
||||
* @date 2013-08-31 added.
|
||||
* @date 2013-08-31 added.
|
||||
*
|
||||
* @param ch
|
||||
* @return int
|
||||
@ -409,17 +409,17 @@ FRISO_API int utf8_en_punctuation( uint_t u )
|
||||
FRISO_API int utf8_cn_punctuation( uint_t u )
{
    //the gaps of the full-width block around its numbers and letters.
    if ( u >= 0xFF01 && u <= 0xFF0F ) return 1;
    if ( u >= 0xFF1A && u <= 0xFF1F ) return 1;
    if ( u >= 0xFF3B && u <= 0xFF40 ) return 1;
    if ( u >= 0xFF5B && u <= 0xFF65 ) return 1;

    //cjk symbol and punctuation.(added 2013-09-06)
    //from http://www.unicode.org/charts/PDF/U3000.pdf
    if ( u >= 0x3001 && u <= 0x301F ) return 1;

    return 0;
}
|
||||
|
||||
/*
|
||||
* check if the given char is a letter number in unicode.
|
||||
* like 'ⅠⅡ'.
|
||||
* like 'ⅠⅡ'.
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
@ -430,7 +430,7 @@ FRISO_API int utf8_letter_number( uint_t u )
|
||||
|
||||
/*
|
||||
* check if the given char is a other number in unicode.
|
||||
* like '①⑩⑽㈩'.
|
||||
* like '①⑩⑽㈩'.
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
@ -456,19 +456,19 @@ FRISO_API int utf8_other_number( uint_t u )
|
||||
//{
|
||||
// if ( __keep_punctuations_hash__ == NULL )
|
||||
// {
|
||||
// __keep_punctuations_hash__ = new_hash_table();
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
|
||||
// __keep_punctuations_hash__ = new_hash_table();
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
|
||||
// }
|
||||
// //check the hash.
|
||||
// return hash_exist_mapping( __keep_punctuations_hash__, str );
|
||||
@ -484,7 +484,7 @@ FRISO_API int utf8_other_number( uint_t u )
|
||||
//FRISO_API int utf8_fullwidth_char( uint_t u )
|
||||
//{
|
||||
// if ( u == 12288 )
|
||||
// return 1; //full-width space
|
||||
// return 1; //full-width space
|
||||
// //(32 - 126) ascii code
|
||||
// return (u > 65280 && u <= 65406);
|
||||
//}
|
||||
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* friso dynamic interface implemented functions file
|
||||
* that defined in header file "friso_API.h".
|
||||
* never use it for commercial use.
|
||||
* that defined in header file "friso_API.h".
|
||||
* never use it for commercial use.
|
||||
*
|
||||
* @author chenxini <chenxin619315@gmail.com>
|
||||
* @author chenxini <chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso_API.h"
|
||||
@ -14,37 +14,37 @@
|
||||
**********************************************/
|
||||
//allocate a block of __blocks item slots with every slot set to NULL.
__STATIC_API__ void **create_array_entries( uint_t __blocks )
{
    void **entries, **slot;

    entries = ( void ** ) FRISO_CALLOC( sizeof( void * ), __blocks );
    if ( entries == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //store NULL in each slot explicitly.
    for ( slot = entries; slot < entries + __blocks; slot++ ) {
        *slot = NULL;
    }

    return entries;
}
|
||||
|
||||
//resize the array. (the opacity should not be smaller than array->length)
|
||||
__STATIC_API__ friso_array_t resize_array_list(
|
||||
friso_array_t array,
|
||||
uint_t opacity )
|
||||
friso_array_t array,
|
||||
uint_t opacity )
|
||||
{
|
||||
register uint_t t;
|
||||
void **block = create_array_entries( opacity );
|
||||
register uint_t t;
|
||||
void **block = create_array_entries( opacity );
|
||||
|
||||
for ( t = 0; t < array->length ; t++ ) {
|
||||
block[t] = array->items[t];
|
||||
}
|
||||
for ( t = 0; t < array->length ; t++ ) {
|
||||
block[t] = array->items[t];
|
||||
}
|
||||
|
||||
FRISO_FREE( array->items );
|
||||
array->items = block;
|
||||
array->allocs = opacity;
|
||||
FRISO_FREE( array->items );
|
||||
array->items = block;
|
||||
array->allocs = opacity;
|
||||
|
||||
return array;
|
||||
return array;
|
||||
}
|
||||
|
||||
|
||||
@ -59,154 +59,154 @@ __STATIC_API__ friso_array_t resize_array_list(
|
||||
//create a new array list with a given opacity.
|
||||
FRISO_API friso_array_t new_array_list_with_opacity( uint_t opacity )
|
||||
{
|
||||
friso_array_t array = ( friso_array_t )
|
||||
FRISO_MALLOC( sizeof( friso_array_entry ) );
|
||||
if ( array == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
friso_array_t array = ( friso_array_t )
|
||||
FRISO_MALLOC( sizeof( friso_array_entry ) );
|
||||
if ( array == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize
|
||||
array->items = create_array_entries( opacity );
|
||||
array->allocs = opacity;
|
||||
array->length = 0;
|
||||
//initialize
|
||||
array->items = create_array_entries( opacity );
|
||||
array->allocs = opacity;
|
||||
array->length = 0;
|
||||
|
||||
return array;
|
||||
return array;
|
||||
}
|
||||
|
||||
/*
|
||||
* free the given friso array.
|
||||
* and its items, but never where its items item pointed to .
|
||||
* and its items, but never where its items item pointed to .
|
||||
*/
|
||||
FRISO_API void free_array_list( friso_array_t array )
|
||||
{
|
||||
//free the allocation that all the items pointed to
|
||||
//register int t;
|
||||
//if ( flag == 1 ) {
|
||||
// for ( t = 0; t < array->length; t++ ) {
|
||||
// if ( array->items[t] == NULL ) continue;
|
||||
// FRISO_FREE( array->items[t] );
|
||||
// array->items[t] = NULL;
|
||||
// }
|
||||
//}
|
||||
//free the allocation that all the items pointed to
|
||||
//register int t;
|
||||
//if ( flag == 1 ) {
|
||||
// for ( t = 0; t < array->length; t++ ) {
|
||||
// if ( array->items[t] == NULL ) continue;
|
||||
// FRISO_FREE( array->items[t] );
|
||||
// array->items[t] = NULL;
|
||||
// }
|
||||
//}
|
||||
|
||||
FRISO_FREE( array->items );
|
||||
FRISO_FREE( array );
|
||||
FRISO_FREE( array->items );
|
||||
FRISO_FREE( array );
|
||||
}
|
||||
|
||||
//add a new item to the array.
|
||||
FRISO_API void array_list_add( friso_array_t array, void *value )
|
||||
{
|
||||
//check the condition to resize.
|
||||
if ( array->length == array->allocs ) {
|
||||
resize_array_list( array, array->length * 2 + 1 );
|
||||
}
|
||||
array->items[array->length++] = value;
|
||||
//check the condition to resize.
|
||||
if ( array->length == array->allocs ) {
|
||||
resize_array_list( array, array->length * 2 + 1 );
|
||||
}
|
||||
array->items[array->length++] = value;
|
||||
}
|
||||
|
||||
//insert a new item at a specified position.
|
||||
FRISO_API void array_list_insert(
|
||||
friso_array_t array,
|
||||
uint_t idx,
|
||||
void *value )
|
||||
friso_array_t array,
|
||||
uint_t idx,
|
||||
void *value )
|
||||
{
|
||||
register uint_t t;
|
||||
register uint_t t;
|
||||
|
||||
if ( idx <= array->length )
|
||||
{
|
||||
//check the condition to resize the array.
|
||||
if ( array->length == array->allocs ) {
|
||||
resize_array_list( array, array->length * 2 + 1 );
|
||||
}
|
||||
if ( idx <= array->length )
|
||||
{
|
||||
//check the condition to resize the array.
|
||||
if ( array->length == array->allocs ) {
|
||||
resize_array_list( array, array->length * 2 + 1 );
|
||||
}
|
||||
|
||||
//move the elements after idx.
|
||||
//for ( t = idx; t < array->length; t++ ) {
|
||||
// array->items[t+1] = array->items[t];
|
||||
//}
|
||||
for ( t = array->length - 1; t >= idx; t-- )
|
||||
{
|
||||
array->items[t+1] = array->items[t];
|
||||
}
|
||||
//move the elements after idx.
|
||||
//for ( t = idx; t < array->length; t++ ) {
|
||||
// array->items[t+1] = array->items[t];
|
||||
//}
|
||||
for ( t = array->length - 1; t >= idx; t-- )
|
||||
{
|
||||
array->items[t+1] = array->items[t];
|
||||
}
|
||||
|
||||
array->items[idx] = value;
|
||||
array->length++;
|
||||
}
|
||||
array->items[idx] = value;
|
||||
array->length++;
|
||||
}
|
||||
}
|
||||
|
||||
//get the item at a specified position.
|
||||
FRISO_API void *array_list_get( friso_array_t array, uint_t idx )
|
||||
{
|
||||
if ( idx < array->length ) {
|
||||
return array->items[idx];
|
||||
}
|
||||
return NULL;
|
||||
if ( idx < array->length ) {
|
||||
return array->items[idx];
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//set the value of the item at a specified position.
|
||||
//this will return the old value.
|
||||
FRISO_API void * array_list_set(
|
||||
friso_array_t array,
|
||||
uint_t idx,
|
||||
void * value )
|
||||
friso_array_t array,
|
||||
uint_t idx,
|
||||
void * value )
|
||||
{
|
||||
void * oval = NULL;
|
||||
if ( idx < array->length )
|
||||
{
|
||||
oval = array->items[idx];
|
||||
array->items[idx] = value;
|
||||
}
|
||||
return oval;
|
||||
void * oval = NULL;
|
||||
if ( idx < array->length )
|
||||
{
|
||||
oval = array->items[idx];
|
||||
array->items[idx] = value;
|
||||
}
|
||||
return oval;
|
||||
}
|
||||
|
||||
//remove the item at a specified position.
|
||||
//this will return the value of the removed item.
|
||||
FRISO_API void * array_list_remove(
|
||||
friso_array_t array, uint_t idx )
|
||||
friso_array_t array, uint_t idx )
|
||||
{
|
||||
register uint_t t;
|
||||
void *oval = NULL;
|
||||
register uint_t t;
|
||||
void *oval = NULL;
|
||||
|
||||
if ( idx < array->length )
|
||||
{
|
||||
oval = array->items[idx];
|
||||
//move the elements after idx.
|
||||
for ( t = idx; t < array->length - 1; t++ ) {
|
||||
array->items[t] = array->items[ t + 1 ];
|
||||
}
|
||||
array->items[array->length - 1] = NULL;
|
||||
array->length--;
|
||||
}
|
||||
if ( idx < array->length )
|
||||
{
|
||||
oval = array->items[idx];
|
||||
//move the elements after idx.
|
||||
for ( t = idx; t < array->length - 1; t++ ) {
|
||||
array->items[t] = array->items[ t + 1 ];
|
||||
}
|
||||
array->items[array->length - 1] = NULL;
|
||||
array->length--;
|
||||
}
|
||||
|
||||
return oval;
|
||||
return oval;
|
||||
}
|
||||
|
||||
/*trim the array list*/
|
||||
FRISO_API friso_array_t array_list_trim( friso_array_t array )
|
||||
{
|
||||
if ( array->length < array->allocs ) {
|
||||
return resize_array_list( array, array->length );
|
||||
}
|
||||
return array;
|
||||
if ( array->length < array->allocs ) {
|
||||
return resize_array_list( array, array->length );
|
||||
}
|
||||
return array;
|
||||
}
|
||||
|
||||
/*
|
||||
* clear the array list.
|
||||
* this function will free all the allocations that the pointer pointed.
|
||||
* but will not free the point array allocations,
|
||||
* and will reset the length of it.
|
||||
* this function will free all the allocations that the pointer pointed.
|
||||
* but will not free the point array allocations,
|
||||
* and will reset the length of it.
|
||||
*/
|
||||
FRISO_API friso_array_t array_list_clear( friso_array_t array )
|
||||
{
|
||||
register uint_t t;
|
||||
//free all the allocations that the array->length's pointer pointed.
|
||||
for ( t = 0; t < array->length; t++ ) {
|
||||
/*if ( array->items[t] == NULL ) continue;
|
||||
FRISO_FREE( array->items[t] ); */
|
||||
array->items[t] = NULL;
|
||||
}
|
||||
//attribute reset.
|
||||
array->length = 0;
|
||||
register uint_t t;
|
||||
//free all the allocations that the array->length's pointer pointed.
|
||||
for ( t = 0; t < array->length; t++ ) {
|
||||
/*if ( array->items[t] == NULL ) continue;
|
||||
FRISO_FREE( array->items[t] ); */
|
||||
array->items[t] = NULL;
|
||||
}
|
||||
//attribute reset.
|
||||
array->length = 0;
|
||||
|
||||
return array;
|
||||
return array;
|
||||
}
|
||||
|
||||
//get the size of the array list. (A macro define has replace this.)
|
||||
|
@ -1,7 +1,7 @@
|
||||
/**
|
||||
* friso string type check function interface,
|
||||
* like english/CJK, full-wdith/half-width, punctuation or not.
|
||||
* @ses friso_UTF8.c and friso_GBK.c for detail.
|
||||
* like english/CJK, full-wdith/half-width, punctuation or not.
|
||||
* @ses friso_UTF8.c and friso_GBK.c for detail.
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
@ -16,25 +16,25 @@
|
||||
* @return int (true for cn string or false)
|
||||
* */
|
||||
FRISO_API int friso_cn_string(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_cjk_string(task->unicode);
|
||||
return utf8_cjk_string(task->unicode);
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_cn_string(task->buffer);
|
||||
return gbk_cn_string(task->buffer);
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word is a whitespace.
|
||||
FRISO_API int friso_whitespace(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_whitespace(task->unicode);
|
||||
return utf8_whitespace(task->unicode);
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_whitespace(task->buffer);
|
||||
return gbk_whitespace(task->buffer);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -52,76 +52,76 @@ FRISO_API int friso_numeric_letter(
|
||||
|
||||
//check if the specified word is aa english letter.
|
||||
FRISO_API int friso_en_letter(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_en_letter( ( uint_t ) task->text[task->idx]);
|
||||
return utf8_en_letter( ( uint_t ) task->text[task->idx]);
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_en_letter( task->text + task->idx );
|
||||
return gbk_en_letter( task->text + task->idx );
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word is a half-width letter.
|
||||
// punctuations are inclued.
|
||||
// punctuations are inclued.
|
||||
FRISO_API int friso_halfwidth_en_char(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_halfwidth_en_char(task->unicode);
|
||||
return utf8_halfwidth_en_char(task->unicode);
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_halfwidth_en_char(task->buffer[0]);
|
||||
return gbk_halfwidth_en_char(task->buffer[0]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word is a full-width letter.
|
||||
// full-width punctuations are not included.
|
||||
// full-width punctuations are not included.
|
||||
FRISO_API int friso_fullwidth_en_char(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_fullwidth_en_char( task->unicode );
|
||||
return utf8_fullwidth_en_char( task->unicode );
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_fullwidth_en_char( task->buffer );
|
||||
return gbk_fullwidth_en_char( task->buffer );
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word is an english punctuations.
|
||||
FRISO_API int friso_en_punctuation(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_en_punctuation( task->unicode );
|
||||
return utf8_en_punctuation( task->unicode );
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_en_punctuation( task->buffer[0] );
|
||||
return gbk_en_punctuation( task->buffer[0] );
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word ia sn chinese punctuation.
|
||||
FRISO_API int friso_cn_punctuation(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_cn_punctuation( task->unicode );
|
||||
return utf8_cn_punctuation( task->unicode );
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_cn_punctuation( task->buffer );
|
||||
return gbk_cn_punctuation( task->buffer );
|
||||
return 0;
|
||||
}
|
||||
|
||||
FRISO_API int friso_letter_number(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
FRISO_API int friso_other_number(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@ -129,98 +129,98 @@ FRISO_API int friso_other_number(
|
||||
//check if the word is a keep punctuation.
|
||||
//@Deprecated
|
||||
//FRISO_API int friso_keep_punctuation(
|
||||
// friso_charset_t charset,
|
||||
// friso_task_t task )
|
||||
// friso_charset_t charset,
|
||||
// friso_task_t task )
|
||||
//{
|
||||
// if ( charset == FRISO_UTF8 )
|
||||
// return utf8_keep_punctuation( task->buffer );
|
||||
// return utf8_keep_punctuation( task->buffer );
|
||||
// else if ( charset == FRISO_GBK )
|
||||
// return gbk_keep_punctuation( task->buffer );
|
||||
// return gbk_keep_punctuation( task->buffer );
|
||||
// return 0;
|
||||
//}
|
||||
|
||||
//check if the specified char is en english punctuation.
|
||||
// this function is the same as friso_en_punctuation.
|
||||
// this function is the same as friso_en_punctuation.
|
||||
FRISO_API int is_en_punctuation(
|
||||
friso_charset_t charset, char c )
|
||||
friso_charset_t charset, char c )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_en_punctuation( (uint_t) c);
|
||||
return utf8_en_punctuation( (uint_t) c);
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_en_punctuation( c );
|
||||
return gbk_en_punctuation( c );
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check the specified string is make up with numeric.
|
||||
FRISO_API int friso_numeric_string(
|
||||
friso_charset_t charset,
|
||||
char *buffer )
|
||||
friso_charset_t charset,
|
||||
char *buffer )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_numeric_string( buffer );
|
||||
return utf8_numeric_string( buffer );
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_numeric_string( buffer );
|
||||
return gbk_numeric_string( buffer );
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check the specified string is a decimal string.
|
||||
FRISO_API int friso_decimal_string(
|
||||
friso_charset_t charset, char *buffer )
|
||||
friso_charset_t charset, char *buffer )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_decimal_string( buffer );
|
||||
return utf8_decimal_string( buffer );
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_decimal_string( buffer );
|
||||
return gbk_decimal_string( buffer );
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified char is english uppercase letter.
|
||||
// included full-width and half-width letters.
|
||||
// included full-width and half-width letters.
|
||||
FRISO_API int friso_uppercase_letter(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
if ( charset == FRISO_UTF8 )
|
||||
return utf8_uppercase_letter( task->unicode );
|
||||
return utf8_uppercase_letter( task->unicode );
|
||||
else if ( charset == FRISO_GBK )
|
||||
return gbk_uppercase_letter( task->buffer );
|
||||
return gbk_uppercase_letter( task->buffer );
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* get the type of the specified char.
|
||||
* the type will be the constants defined above.
|
||||
* the type will be the constants defined above.
|
||||
* (include the fullwidth english char.)
|
||||
*/
|
||||
FRISO_API friso_enchar_t friso_enchar_type(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
friso_charset_t charset,
|
||||
friso_task_t task )
|
||||
{
|
||||
//Unicode or ASCII.(Both UTF-8 and GBK are valid)
|
||||
uint_t u = 0;
|
||||
|
||||
if ( charset == FRISO_UTF8 )
|
||||
{
|
||||
u = task->unicode;
|
||||
//if ( u >= 65280 ) u = 65280 - 65248;
|
||||
u = task->unicode;
|
||||
//if ( u >= 65280 ) u = 65280 - 65248;
|
||||
}
|
||||
else if ( charset == FRISO_GBK )
|
||||
{
|
||||
u = (uchar_t)task->buffer[0];
|
||||
//if ( u == 0xa3 ) ; //full-width.
|
||||
u = (uchar_t)task->buffer[0];
|
||||
//if ( u == 0xa3 ) ; //full-width.
|
||||
}
|
||||
|
||||
//range check.
|
||||
if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW;
|
||||
if ( u == 32 ) return FRISO_EN_WHITESPACE;
|
||||
if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC;
|
||||
if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER;
|
||||
if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER;
|
||||
if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW;
|
||||
if ( u == 32 ) return FRISO_EN_WHITESPACE;
|
||||
if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC;
|
||||
if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER;
|
||||
if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER;
|
||||
|
||||
return FRISO_EN_PUNCTUATION;
|
||||
}
|
||||
|
||||
/* get the type of the specified en char.
|
||||
* the type will be the constants defined above.
|
||||
* the type will be the constants defined above.
|
||||
* (the char should be half-width english char only)
|
||||
*/
|
||||
FRISO_API friso_enchar_t get_enchar_type( char ch )
|
||||
@ -228,11 +228,11 @@ FRISO_API friso_enchar_t get_enchar_type( char ch )
|
||||
uint_t u = (uchar_t) ch;
|
||||
|
||||
//range check.
|
||||
if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW;
|
||||
if ( u == 32 ) return FRISO_EN_WHITESPACE;
|
||||
if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC;
|
||||
if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER;
|
||||
if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER;
|
||||
if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW;
|
||||
if ( u == 32 ) return FRISO_EN_WHITESPACE;
|
||||
if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC;
|
||||
if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER;
|
||||
if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER;
|
||||
|
||||
return FRISO_EN_PUNCTUATION;
|
||||
}
|
||||
|
@ -1,9 +1,9 @@
|
||||
/**
|
||||
* Friso charset about function interface header file.
|
||||
* @package src/friso_charset.h .
|
||||
* @package src/friso_charset.h .
|
||||
* Available charset for now:
|
||||
* 1. UTF8 - function start with utf8
|
||||
* 2. GBK - function start with gbk
|
||||
* 1. UTF8 - function start with utf8
|
||||
* 2. GBK - function start with gbk
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
@ -33,11 +33,11 @@ FRISO_API int friso_numeric_letter(friso_charset_t, friso_task_t);
|
||||
FRISO_API int friso_en_letter( friso_charset_t, friso_task_t );
|
||||
|
||||
//check if the specified word is a half-width letter.
|
||||
// punctuations are included.
|
||||
// punctuations are included.
|
||||
FRISO_API int friso_halfwidth_en_char( friso_charset_t, friso_task_t );
|
||||
|
||||
//check if the specified word is a full-width letter.
|
||||
// full-width punctuations are not included.
|
||||
// full-width punctuations are not included.
|
||||
FRISO_API int friso_fullwidth_en_char( friso_charset_t, friso_task_t );
|
||||
|
||||
//check if the specified word is an english punctuation.
|
||||
@ -60,32 +60,32 @@ FRISO_API int friso_numeric_string( friso_charset_t, char * );
|
||||
FRISO_API int friso_decimal_string( friso_charset_t, char * );
|
||||
|
||||
//check if the specified char is english uppercase letter.
|
||||
// included full-width and half-width letters.
|
||||
// included full-width and half-width letters.
|
||||
FRISO_API int friso_uppercase_letter( friso_charset_t, friso_task_t );
|
||||
|
||||
|
||||
//en char type.
|
||||
//#define FRISO_EN_LETTER 0 //a-z && A-Z
|
||||
//#define FRISO_EN_NUMERIC 1 //0-9
|
||||
//#define FRISO_EN_PUNCTUATION 2 //english punctuations
|
||||
//#define FRISO_EN_WHITESPACE 3 //whitespace
|
||||
//#define FRISO_EN_UNKNOW -1 //beyond 32-122
|
||||
//#define FRISO_EN_LETTER 0 //a-z && A-Z
|
||||
//#define FRISO_EN_NUMERIC 1 //0-9
|
||||
//#define FRISO_EN_PUNCTUATION 2 //english punctuations
|
||||
//#define FRISO_EN_WHITESPACE 3 //whitespace
|
||||
//#define FRISO_EN_UNKNOW -1 //beyond 32-122
|
||||
typedef enum {
|
||||
FRISO_EN_LETTER = 0, //A-Z, a-z
|
||||
FRISO_EN_NUMERIC = 1, //0-9
|
||||
FRISO_EN_PUNCTUATION = 2, //english punctuations
|
||||
FRISO_EN_WHITESPACE = 3, //whitespace
|
||||
FRISO_EN_UNKNOW = -1 //unkow(beyond 32-126)
|
||||
FRISO_EN_LETTER = 0, //A-Z, a-z
|
||||
FRISO_EN_NUMERIC = 1, //0-9
|
||||
FRISO_EN_PUNCTUATION = 2, //english punctuations
|
||||
FRISO_EN_WHITESPACE = 3, //whitespace
|
||||
FRISO_EN_UNKNOW = -1 //unkow(beyond 32-126)
|
||||
} friso_enchar_t;
|
||||
|
||||
/* get the type of the specified char.
|
||||
* the type will be the constants defined above.
|
||||
* the type will be the constants defined above.
|
||||
* (include the fullwidth english char.)
|
||||
*/
|
||||
FRISO_API friso_enchar_t friso_enchar_type( friso_charset_t, friso_task_t );
|
||||
|
||||
/* get the type of the specified en char.
|
||||
* the type will be the constants defined above.
|
||||
* the type will be the constants defined above.
|
||||
* (the char should be half-width english char only)
|
||||
*/
|
||||
FRISO_API friso_enchar_t get_enchar_type( char );
|
||||
@ -99,7 +99,7 @@ FRISO_API friso_enchar_t get_enchar_type( char );
|
||||
|
||||
/* read the next utf-8 word from the specified position.
|
||||
*
|
||||
* @return int the number of bytes of the word just read.
|
||||
* @return int the number of bytes of the word just read.
|
||||
*/
|
||||
FRISO_API int utf8_next_word( friso_task_t, uint_t *, fstring );
|
||||
|
||||
@ -116,31 +116,31 @@ FRISO_API int unicode_to_utf8( uint_t, fstring );
|
||||
FRISO_API int utf8_cjk_string( uint_t ) ;
|
||||
|
||||
/*check the given char is a Basic Latin letter or not.
|
||||
* includes all the letters and english punctuations.*/
|
||||
* includes all the letters and english punctuations.*/
|
||||
FRISO_API int utf8_halfwidth_en_char( uint_t );
|
||||
|
||||
/*
|
||||
* check if the given char is a full-width latin char or not.
|
||||
* includes the full-width arabic numbers and letters,
|
||||
* but not the full-width punctuations.
|
||||
* includes the full-width arabic numbers and letters,
|
||||
* but not the full-width punctuations.
|
||||
*/
|
||||
FRISO_API int utf8_fullwidth_en_char( uint_t );
|
||||
|
||||
//check if the given char is an upper case letter or not.
|
||||
// included all the full-width and half-width letters.
|
||||
// included all the full-width and half-width letters.
|
||||
FRISO_API int utf8_uppercase_letter( uint_t );
|
||||
|
||||
//check the given char is a lower case letter or not.
|
||||
// included all the full-width and half-width letters.
|
||||
// included all the full-width and half-width letters.
|
||||
FRISO_API int utf8_lowercase_letter( uint_t );
|
||||
|
||||
//check the given char is a numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
FRISO_API int utf8_numeric_letter( uint_t );
|
||||
|
||||
/*
|
||||
* check if the given fstring is made up of numeric chars.
|
||||
* both full-width and half-width numerics are ok.
|
||||
* both full-width and half-width numerics are ok.
|
||||
*/
|
||||
FRISO_API int utf8_numeric_string( char * );
|
||||
|
||||
@ -183,7 +183,7 @@ FRISO_API int is_en_punctuation( friso_charset_t, char );
|
||||
|
||||
/* read the next GBK word from the specified position.
|
||||
*
|
||||
* @return int the number of bytes of the word just read.
|
||||
* @return int the number of bytes of the word just read.
|
||||
*/
|
||||
FRISO_API int gbk_next_word( friso_task_t, uint_t *, fstring );
|
||||
|
||||
@ -194,31 +194,31 @@ FRISO_API int get_gbk_bytes( char );
|
||||
FRISO_API int gbk_cn_string( char * ) ;
|
||||
|
||||
/*check if the given char is an ASCII letter
|
||||
* includes all the letters and english punctuations.*/
|
||||
* includes all the letters and english punctuations.*/
|
||||
FRISO_API int gbk_halfwidth_en_char( char );
|
||||
|
||||
/*
|
||||
* check if the given char is a full-width latin char.
|
||||
* includes the full-width arabic numbers and letters,
|
||||
* but not the full-width punctuations.
|
||||
* includes the full-width arabic numbers and letters,
|
||||
* but not the full-width punctuations.
|
||||
*/
|
||||
FRISO_API int gbk_fullwidth_en_char( char * );
|
||||
|
||||
//check if the given char is an upper case char.
|
||||
// included all the full-width and half-width letters.
|
||||
// included all the full-width and half-width letters.
|
||||
FRISO_API int gbk_uppercase_letter( char * );
|
||||
|
||||
//check if the given char is a lower case char.
|
||||
// included all the full-width and half-width letters.
|
||||
// included all the full-width and half-width letters.
|
||||
FRISO_API int gbk_lowercase_letter( char * );
|
||||
|
||||
//check if the given char is a numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
FRISO_API int gbk_numeric_letter( char * );
|
||||
|
||||
/*
|
||||
* check if the given fstring is made up of numeric chars.
|
||||
* both full-width and half-width numerics are ok.
|
||||
* both full-width and half-width numerics are ok.
|
||||
*/
|
||||
FRISO_API int gbk_numeric_string( char * );
|
||||
|
||||
@ -248,7 +248,7 @@ FRISO_API int gbk_en_punctuation( char ) ;
|
||||
FRISO_API int gbk_cn_punctuation( char * );
|
||||
|
||||
//because the handling logic is the same as for utf8,
|
||||
// the utf8 interface is invoked directly here.
|
||||
// the utf8 interface is invoked directly here.
|
||||
//FRISO_API int gbk_keep_punctuation( char * );
|
||||
//@Deprecated
|
||||
//#define gbk_keep_punctuation( str ) utf8_keep_punctuation(str)
|
||||
@ -257,4 +257,4 @@ FRISO_API int gbk_cn_punctuation( char * );
|
||||
//FRISO_API int gbk_fullwidth_char( char * ) ;
|
||||
/* }}}*/
|
||||
|
||||
#endif /*end _friso_charset_h*/
|
||||
#endif /*end _friso_charset_h*/
|
||||
|
180
src/friso_hash.c
180
src/friso_hash.c
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* friso hash table implements functions
|
||||
* defined in header file "friso_API.h".
|
||||
* defined in header file "friso_API.h".
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
#include <stdlib.h>
|
||||
@ -10,7 +10,7 @@
|
||||
|
||||
//-166411799L
|
||||
//31 131 1331 13331 133331 ..
|
||||
//31 131 1313 13131 131313 .. the best
|
||||
//31 131 1313 13131 131313 .. the best
|
||||
#define HASH_FACTOR 1313131
|
||||
|
||||
/* ************************
|
||||
@ -22,7 +22,7 @@ __STATIC_API__ uint_t hash( fstring str, uint_t length )
|
||||
uint_t h = 0;
|
||||
|
||||
while ( *str != '\0' )
|
||||
h = h * HASH_FACTOR + ( *str++ );
|
||||
h = h * HASH_FACTOR + ( *str++ );
|
||||
|
||||
return (h % length);
|
||||
}
|
||||
@ -32,13 +32,13 @@ __STATIC_API__ int is_prime( int n )
|
||||
{
|
||||
int j;
|
||||
if ( n == 2 || n == 3 )
|
||||
return 1;
|
||||
return 1;
|
||||
if ( n == 1 || n % 2 == 0 )
|
||||
return 0;
|
||||
return 0;
|
||||
|
||||
for ( j = 3; j * j < n; j++ )
|
||||
if ( n % j == 0 )
|
||||
return 0;
|
||||
if ( n % j == 0 )
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
@ -47,7 +47,7 @@ __STATIC_API__ int is_prime( int n )
|
||||
__STATIC_API__ int next_prime( int n )
|
||||
{
|
||||
if ( n % 2 == 0 )
|
||||
n++;
|
||||
n++;
|
||||
for ( ; ! is_prime( n ); n = n + 2 ) ;
|
||||
|
||||
return n;
|
||||
@ -72,14 +72,14 @@ __STATIC_API__ int next_prime( int n )
|
||||
* static hashtable function area. *
|
||||
***********************************/
|
||||
__STATIC_API__ hash_entry_t new_hash_entry(
|
||||
fstring key,
|
||||
void * value,
|
||||
hash_entry_t next )
|
||||
fstring key,
|
||||
void * value,
|
||||
hash_entry_t next )
|
||||
{
|
||||
hash_entry_t e = ( hash_entry_t )
|
||||
FRISO_MALLOC( sizeof( friso_hash_entry ) );
|
||||
FRISO_MALLOC( sizeof( friso_hash_entry ) );
|
||||
if ( e == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//e->_key = string_copy( key );
|
||||
@ -95,13 +95,13 @@ __STATIC_API__ hash_entry_t * create_hash_entries( uint_t blocks )
|
||||
{
|
||||
register uint_t t;
|
||||
hash_entry_t *e = ( hash_entry_t * )
|
||||
FRISO_CALLOC( sizeof( hash_entry_t ), blocks );
|
||||
FRISO_CALLOC( sizeof( hash_entry_t ), blocks );
|
||||
if ( e == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
for ( t = 0; t < blocks; t++ ) {
|
||||
e[t] = NULL;
|
||||
e[t] = NULL;
|
||||
}
|
||||
|
||||
return e;
|
||||
@ -114,22 +114,22 @@ __STATIC_API__ void rebuild_hash( friso_hash_t _hash )
|
||||
//find the next prime as the length of the hashtable.
|
||||
uint_t t, length = next_prime( _hash->length * 2 + 1 );
|
||||
hash_entry_t e, next, *_src = _hash->table, \
|
||||
*table = create_hash_entries( length );
|
||||
*table = create_hash_entries( length );
|
||||
uint_t bucket;
|
||||
|
||||
//copy the nodes
|
||||
for ( t = 0; t < _hash->length; t++ )
|
||||
{
|
||||
e = *( _src + t );
|
||||
if ( e != NULL ) {
|
||||
do {
|
||||
next = e->_next;
|
||||
bucket = hash( e->_key, length );
|
||||
e->_next = table[bucket];
|
||||
table[bucket] = e;
|
||||
e = next;
|
||||
} while ( e != NULL );
|
||||
}
|
||||
e = *( _src + t );
|
||||
if ( e != NULL ) {
|
||||
do {
|
||||
next = e->_next;
|
||||
bucket = hash( e->_key, length );
|
||||
e->_next = table[bucket];
|
||||
table[bucket] = e;
|
||||
e = next;
|
||||
} while ( e != NULL );
|
||||
}
|
||||
}
|
||||
|
||||
_hash->table = table;
|
||||
@ -149,35 +149,35 @@ FRISO_API friso_hash_t new_hash_table( void )
|
||||
{
|
||||
friso_hash_t _hash = ( friso_hash_t ) FRISO_MALLOC( sizeof ( friso_hash_cdt ) );
|
||||
if ( _hash == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize the the hashtable
|
||||
_hash->length = DEFAULT_LENGTH;
|
||||
_hash->size = 0;
|
||||
_hash->factor = DEFAULT_FACTOR;
|
||||
_hash->threshold = ( uint_t ) ( _hash->length * _hash->factor );
|
||||
_hash->table = create_hash_entries( _hash->length );
|
||||
_hash->length = DEFAULT_LENGTH;
|
||||
_hash->size = 0;
|
||||
_hash->factor = DEFAULT_FACTOR;
|
||||
_hash->threshold = ( uint_t ) ( _hash->length * _hash->factor );
|
||||
_hash->table = create_hash_entries( _hash->length );
|
||||
|
||||
return _hash;
|
||||
}
|
||||
|
||||
FRISO_API void free_hash_table(
|
||||
friso_hash_t _hash,
|
||||
fhash_callback_fn_t fentry_func )
|
||||
friso_hash_t _hash,
|
||||
fhash_callback_fn_t fentry_func )
|
||||
{
|
||||
register uint_t j;
|
||||
hash_entry_t e, n;
|
||||
|
||||
for ( j = 0; j < _hash->length; j++ )
|
||||
{
|
||||
e = *( _hash->table + j );
|
||||
for ( ; e != NULL ; ) {
|
||||
n = e->_next;
|
||||
if ( fentry_func != NULL ) fentry_func(e);
|
||||
FRISO_FREE( e );
|
||||
e = n;
|
||||
}
|
||||
e = *( _hash->table + j );
|
||||
for ( ; e != NULL ; ) {
|
||||
n = e->_next;
|
||||
if ( fentry_func != NULL ) fentry_func(e);
|
||||
FRISO_FREE( e );
|
||||
e = n;
|
||||
}
|
||||
}
|
||||
|
||||
//free the pointer array block ( 4 * htable->length continuous bytes ).
|
||||
@ -189,9 +189,9 @@ FRISO_API void free_hash_table(
|
||||
//put a new mapping inside.
|
||||
//the value cannot be NULL.
|
||||
FRISO_API void *hash_put_mapping(
|
||||
friso_hash_t _hash,
|
||||
fstring key,
|
||||
void * value )
|
||||
friso_hash_t _hash,
|
||||
fstring key,
|
||||
void * value )
|
||||
{
|
||||
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
|
||||
hash_entry_t e = *( _hash->table + bucket );
|
||||
@ -200,14 +200,14 @@ FRISO_API void *hash_put_mapping(
|
||||
//check the given key is already exists or not.
|
||||
for ( ; e != NULL; e = e->_next )
|
||||
{
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ) )
|
||||
{
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ) )
|
||||
{
|
||||
oval = e->_val; //bak the old value
|
||||
e->_val = value;
|
||||
return oval;
|
||||
}
|
||||
e->_val = value;
|
||||
return oval;
|
||||
}
|
||||
}
|
||||
|
||||
//put a new mapping into the hashtable.
|
||||
@ -216,27 +216,27 @@ FRISO_API void *hash_put_mapping(
|
||||
|
||||
//check the condition to rebuild the hashtable.
|
||||
if ( _hash->size >= _hash->threshold )
|
||||
rebuild_hash( _hash );
|
||||
rebuild_hash( _hash );
|
||||
|
||||
return oval;
|
||||
}
|
||||
|
||||
//check the existence of the mapping associated with the given key.
|
||||
FRISO_API int hash_exist_mapping(
|
||||
friso_hash_t _hash, fstring key )
|
||||
friso_hash_t _hash, fstring key )
|
||||
{
|
||||
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
|
||||
hash_entry_t e;
|
||||
|
||||
for ( e = *( _hash->table + bucket );
|
||||
e != NULL;
|
||||
e = e->_next ) {
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ))
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
e != NULL;
|
||||
e = e->_next ) {
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ))
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -249,14 +249,14 @@ FRISO_API void *hash_get_value( friso_hash_t _hash, fstring key )
|
||||
hash_entry_t e;
|
||||
|
||||
for ( e = *( _hash->table + bucket );
|
||||
e != NULL;
|
||||
e = e->_next ) {
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ))
|
||||
{
|
||||
return e->_val;
|
||||
}
|
||||
e != NULL;
|
||||
e = e->_next ) {
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ))
|
||||
{
|
||||
return e->_val;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
@ -264,31 +264,31 @@ FRISO_API void *hash_get_value( friso_hash_t _hash, fstring key )
|
||||
|
||||
//remove the mapping associated with the given key.
|
||||
FRISO_API hash_entry_t hash_remove_mapping(
|
||||
friso_hash_t _hash, fstring key )
|
||||
friso_hash_t _hash, fstring key )
|
||||
{
|
||||
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
|
||||
hash_entry_t e, prev = NULL;
|
||||
hash_entry_t b;
|
||||
|
||||
for ( e = *( _hash->table + bucket );
|
||||
e != NULL;
|
||||
prev = e, e = e->_next ) {
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ) )
|
||||
{
|
||||
b = e;
|
||||
//the node located at *( htable->table + bucket )
|
||||
if ( prev == NULL ) {
|
||||
_hash->table[bucket] = e->_next;
|
||||
} else {
|
||||
prev->_next = e->_next;
|
||||
}
|
||||
//printf("%s was removed\n", b->_key);
|
||||
_hash->size--;
|
||||
//FRISO_FREE( b );
|
||||
return b;
|
||||
}
|
||||
e != NULL;
|
||||
prev = e, e = e->_next ) {
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ) )
|
||||
{
|
||||
b = e;
|
||||
//the node located at *( htable->table + bucket )
|
||||
if ( prev == NULL ) {
|
||||
_hash->table[bucket] = e->_next;
|
||||
} else {
|
||||
prev->_next = e->_next;
|
||||
}
|
||||
//printf("%s was removed\n", b->_key);
|
||||
_hash->size--;
|
||||
//FRISO_FREE( b );
|
||||
return b;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
@ -1,102 +1,102 @@
|
||||
/*
|
||||
* friso lexicon implemented functions.
|
||||
* used to deal with the friso lexicon, like: load,remove,match...
|
||||
* used to deal with the friso lexicon, like: load,remove,match...
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "friso_API.h"
|
||||
#include "friso.h"
|
||||
|
||||
#define __SPLIT_MAX_TOKENS__ 5
|
||||
#define __LEX_FILE_DELIME__ '#'
|
||||
#define __FRISO_LEX_IFILE__ "friso.lex.ini"
|
||||
#define __SPLIT_MAX_TOKENS__ 5
|
||||
#define __LEX_FILE_DELIME__ '#'
|
||||
#define __FRISO_LEX_IFILE__ "friso.lex.ini"
|
||||
|
||||
//create a new lexicon
|
||||
FRISO_API friso_dic_t friso_dic_new()
|
||||
{
|
||||
register uint_t t;
|
||||
friso_dic_t dic = ( friso_dic_t ) FRISO_CALLOC(
|
||||
sizeof( friso_hash_t ), __FRISO_LEXICON_LENGTH__ );
|
||||
if ( dic == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
register uint_t t;
|
||||
friso_dic_t dic = ( friso_dic_t ) FRISO_CALLOC(
|
||||
sizeof( friso_hash_t ), __FRISO_LEXICON_LENGTH__ );
|
||||
if ( dic == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
|
||||
dic[t] = new_hash_table();
|
||||
}
|
||||
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
|
||||
dic[t] = new_hash_table();
|
||||
}
|
||||
|
||||
return dic;
|
||||
return dic;
|
||||
}
|
||||
|
||||
/**
|
||||
* default callback function to invoke
|
||||
* when free the friso dictionary .
|
||||
* when free the friso dictionary .
|
||||
*
|
||||
* @date 2013-06-12
|
||||
*/
|
||||
__STATIC_API__ void default_fdic_callback( hash_entry_t e )
|
||||
{
|
||||
register uint_t i;
|
||||
friso_array_t syn;
|
||||
lex_entry_t lex = ( lex_entry_t ) e->_val;
|
||||
//free the lex->word
|
||||
FRISO_FREE( lex->word );
|
||||
//free the lex->syn if it is not NULL
|
||||
if ( lex->syn != NULL )
|
||||
{
|
||||
syn = lex->syn;
|
||||
for ( i = 0; i < syn->length; i++ ) {
|
||||
FRISO_FREE( syn->items[i] );
|
||||
}
|
||||
free_array_list( syn );
|
||||
}
|
||||
register uint_t i;
|
||||
friso_array_t syn;
|
||||
lex_entry_t lex = ( lex_entry_t ) e->_val;
|
||||
//free the lex->word
|
||||
FRISO_FREE( lex->word );
|
||||
//free the lex->syn if it is not NULL
|
||||
if ( lex->syn != NULL )
|
||||
{
|
||||
syn = lex->syn;
|
||||
for ( i = 0; i < syn->length; i++ ) {
|
||||
FRISO_FREE( syn->items[i] );
|
||||
}
|
||||
free_array_list( syn );
|
||||
}
|
||||
|
||||
//free the e->_val
|
||||
//@date 2014-01-28 posted by mlemay@gmail.com
|
||||
FRISO_FREE(lex);
|
||||
//free the e->_val
|
||||
//@date 2014-01-28 posted by mlemay@gmail.com
|
||||
FRISO_FREE(lex);
|
||||
}
|
||||
|
||||
FRISO_API void friso_dic_free( friso_dic_t dic )
|
||||
{
|
||||
register uint_t t;
|
||||
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
|
||||
//free the hash table
|
||||
free_hash_table( dic[t], default_fdic_callback );
|
||||
}
|
||||
register uint_t t;
|
||||
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
|
||||
//free the hash table
|
||||
free_hash_table( dic[t], default_fdic_callback );
|
||||
}
|
||||
|
||||
FRISO_FREE( dic );
|
||||
FRISO_FREE( dic );
|
||||
}
|
||||
|
||||
|
||||
//create a new lexicon entry
|
||||
FRISO_API lex_entry_t new_lex_entry(
|
||||
fstring word,
|
||||
friso_array_t syn,
|
||||
uint_t fre,
|
||||
uint_t length,
|
||||
uint_t type )
|
||||
fstring word,
|
||||
friso_array_t syn,
|
||||
uint_t fre,
|
||||
uint_t length,
|
||||
uint_t type )
|
||||
{
|
||||
lex_entry_t e = ( lex_entry_t )
|
||||
FRISO_MALLOC( sizeof( lex_entry_cdt ) );
|
||||
if ( e == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
lex_entry_t e = ( lex_entry_t )
|
||||
FRISO_MALLOC( sizeof( lex_entry_cdt ) );
|
||||
if ( e == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize.
|
||||
e->word = word;
|
||||
e->syn = syn; //synoyum words array list.
|
||||
e->pos = NULL; //part of speech array list.
|
||||
//e->py = NULL; //set to NULL first.
|
||||
e->fre = fre;
|
||||
e->length = (uchar_t) length; //length
|
||||
e->rlen = (uchar_t) length; //set to length by default.
|
||||
e->type = (uchar_t) type; //type
|
||||
e->ctrlMask = 0; //control mask.
|
||||
e->offset = -1;
|
||||
//initialize.
|
||||
e->word = word;
|
||||
e->syn = syn; //synoyum words array list.
|
||||
e->pos = NULL; //part of speech array list.
|
||||
//e->py = NULL; //set to NULL first.
|
||||
e->fre = fre;
|
||||
e->length = (uchar_t) length; //length
|
||||
e->rlen = (uchar_t) length; //set to length by default.
|
||||
e->type = (uchar_t) type; //type
|
||||
e->ctrlMask = 0; //control mask.
|
||||
e->offset = -1;
|
||||
|
||||
return e;
|
||||
return e;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -109,64 +109,64 @@ FRISO_API lex_entry_t new_lex_entry(
|
||||
*/
|
||||
FRISO_API void free_lex_entry( lex_entry_t e )
|
||||
{
|
||||
//if ( e->syn != NULL ) {
|
||||
// if ( flag == 1 ) free_array_list( e->syn);
|
||||
// else free_array_list( e->syn );
|
||||
//}
|
||||
FRISO_FREE( e );
|
||||
//if ( e->syn != NULL ) {
|
||||
// if ( flag == 1 ) free_array_list( e->syn);
|
||||
// else free_array_list( e->syn );
|
||||
//}
|
||||
FRISO_FREE( e );
|
||||
}
|
||||
|
||||
|
||||
//add a new entry to the dictionary.
|
||||
FRISO_API void friso_dic_add(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word,
|
||||
friso_array_t syn )
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word,
|
||||
friso_array_t syn )
|
||||
{
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ )
|
||||
{
|
||||
//printf("lex=%d, word=%s, syn=%s\n", lex, word, syn);
|
||||
hash_put_mapping( dic[lex], word,
|
||||
new_lex_entry( word, syn, 0,
|
||||
(uint_t) strlen(word), (uint_t) lex ) );
|
||||
}
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ )
|
||||
{
|
||||
//printf("lex=%d, word=%s, syn=%s\n", lex, word, syn);
|
||||
hash_put_mapping( dic[lex], word,
|
||||
new_lex_entry( word, syn, 0,
|
||||
(uint_t) strlen(word), (uint_t) lex ) );
|
||||
}
|
||||
}
|
||||
|
||||
FRISO_API void friso_dic_add_with_fre(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word,
|
||||
friso_array_t syn,
|
||||
uint_t frequency )
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word,
|
||||
friso_array_t syn,
|
||||
uint_t frequency )
|
||||
{
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
hash_put_mapping( dic[lex], word,
|
||||
new_lex_entry( word, syn, frequency,
|
||||
( uint_t ) strlen(word), ( uint_t ) lex ) );
|
||||
}
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
hash_put_mapping( dic[lex], word,
|
||||
new_lex_entry( word, syn, frequency,
|
||||
( uint_t ) strlen(word), ( uint_t ) lex ) );
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* read a line from a specified stream.
|
||||
* the newline will be cleared.
|
||||
* the newline will be cleared.
|
||||
*
|
||||
* @date 2012-11-24
|
||||
* @date 2012-11-24
|
||||
*/
|
||||
FRISO_API fstring file_get_line( fstring __dst, FILE * _stream )
|
||||
{
|
||||
register int c;
|
||||
fstring cs;
|
||||
register int c;
|
||||
fstring cs;
|
||||
|
||||
cs = __dst;
|
||||
while ( ( c = fgetc( _stream ) ) != EOF )
|
||||
{
|
||||
if ( c == '\n' ) break;
|
||||
*cs++ = c;
|
||||
}
|
||||
*cs = '\0';
|
||||
cs = __dst;
|
||||
while ( ( c = fgetc( _stream ) ) != EOF )
|
||||
{
|
||||
if ( c == '\n' ) break;
|
||||
*cs++ = c;
|
||||
}
|
||||
*cs = '\0';
|
||||
|
||||
return ( c == EOF && cs == __dst ) ? NULL : __dst;
|
||||
return ( c == EOF && cs == __dst ) ? NULL : __dst;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -174,373 +174,373 @@ FRISO_API fstring file_get_line( fstring __dst, FILE * _stream )
|
||||
*/
|
||||
///instead of memcpy
|
||||
__STATIC_API__ fstring string_copy(
|
||||
fstring _src,
|
||||
fstring __dst,
|
||||
uint_t blocks )
|
||||
fstring _src,
|
||||
fstring __dst,
|
||||
uint_t blocks )
|
||||
{
|
||||
|
||||
register fstring __src = _src;
|
||||
register uint_t t;
|
||||
register fstring __src = _src;
|
||||
register uint_t t;
|
||||
|
||||
for ( t = 0; t < blocks; t++ ) {
|
||||
if ( *__src == '\0' ) break;
|
||||
__dst[t] = *__src++;
|
||||
}
|
||||
__dst[t] = '\0';
|
||||
for ( t = 0; t < blocks; t++ ) {
|
||||
if ( *__src == '\0' ) break;
|
||||
__dst[t] = *__src++;
|
||||
}
|
||||
__dst[t] = '\0';
|
||||
|
||||
return __dst;
|
||||
return __dst;
|
||||
}
|
||||
|
||||
/**
|
||||
* make a heap allocation, and copy the
|
||||
* source fstring to the new allocation, and
|
||||
* you should free it after use.
|
||||
* source fstring to the new allocation, and
|
||||
* you should free it after use.
|
||||
*
|
||||
* @param _src source fstring
|
||||
* @param blocks number of bytes to copy
|
||||
* @param _src source fstring
|
||||
* @param blocks number of bytes to copy
|
||||
*/
|
||||
__STATIC_API__ fstring string_copy_heap(
|
||||
fstring _src, uint_t blocks )
|
||||
fstring _src, uint_t blocks )
|
||||
{
|
||||
register uint_t t;
|
||||
register uint_t t;
|
||||
|
||||
fstring str = ( fstring )
|
||||
FRISO_MALLOC( blocks + 1 );
|
||||
if ( str == NULL ) {
|
||||
___ALLOCATION_ERROR___;
|
||||
}
|
||||
fstring str = ( fstring )
|
||||
FRISO_MALLOC( blocks + 1 );
|
||||
if ( str == NULL ) {
|
||||
___ALLOCATION_ERROR___;
|
||||
}
|
||||
|
||||
for ( t = 0; t < blocks; t++ ) {
|
||||
if ( *_src == '\0' ) break;
|
||||
str[t] = *_src++;
|
||||
}
|
||||
for ( t = 0; t < blocks; t++ ) {
|
||||
if ( *_src == '\0' ) break;
|
||||
str[t] = *_src++;
|
||||
}
|
||||
|
||||
str[t] = '\0';
|
||||
return str;
|
||||
str[t] = '\0';
|
||||
return str;
|
||||
}
|
||||
|
||||
/*
|
||||
* find the position of the first occurrence of the given char.
|
||||
* the address of the char in the fstring will be returned.
|
||||
* if not found, NULL will be returned.
|
||||
* the address of the char in the fstring will be returned.
|
||||
* if not found, NULL will be returned.
|
||||
*/
|
||||
__STATIC_API__ fstring indexOf( fstring __str, char delimiter )
|
||||
{
|
||||
uint_t i, __length__;
|
||||
uint_t i, __length__;
|
||||
|
||||
__length__ = strlen( __str );
|
||||
for ( i = 0; i < __length__; i++ ) {
|
||||
if ( __str[i] == delimiter )
|
||||
return __str + i;
|
||||
}
|
||||
__length__ = strlen( __str );
|
||||
for ( i = 0; i < __length__; i++ ) {
|
||||
if ( __str[i] == delimiter )
|
||||
return __str + i;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* load all the valid words from a specified lexicon file.
|
||||
*
|
||||
* @param dic friso dictionary instance (A hash array)
|
||||
* @param lex the lexicon type
|
||||
* @param lex_file the path of the lexicon file
|
||||
* @param length the maximum length of the word item
|
||||
* @param dic friso dictionary instance (A hash array)
|
||||
* @param lex the lexicon type
|
||||
* @param lex_file the path of the lexicon file
|
||||
* @param length the maximum length of the word item
|
||||
*/
|
||||
FRISO_API void friso_dic_load(
|
||||
friso_t friso,
|
||||
friso_config_t config,
|
||||
friso_lex_t lex,
|
||||
fstring lex_file,
|
||||
uint_t length )
|
||||
friso_t friso,
|
||||
friso_config_t config,
|
||||
friso_lex_t lex,
|
||||
fstring lex_file,
|
||||
uint_t length )
|
||||
{
|
||||
|
||||
FILE * _stream;
|
||||
char __char[1024], _buffer[512];
|
||||
fstring _line;
|
||||
string_split_entry sse;
|
||||
FILE * _stream;
|
||||
char __char[1024], _buffer[512];
|
||||
fstring _line;
|
||||
string_split_entry sse;
|
||||
|
||||
fstring _word;
|
||||
char _sbuffer[512];
|
||||
fstring _syn;
|
||||
friso_array_t sywords;
|
||||
uint_t _fre;
|
||||
fstring _word;
|
||||
char _sbuffer[512];
|
||||
fstring _syn;
|
||||
friso_array_t sywords;
|
||||
uint_t _fre;
|
||||
|
||||
if ( ( _stream = fopen( lex_file, "rb" ) ) != NULL )
|
||||
{
|
||||
while ( ( _line = file_get_line( __char, _stream ) ) != NULL )
|
||||
{
|
||||
//clear up the notes
|
||||
//make sure the length of the line is greater than 1.
|
||||
//like the single '#' mark in stopwords dictionary.
|
||||
if ( _line[0] == '#' && strlen(_line) > 1 ) continue;
|
||||
if ( ( _stream = fopen( lex_file, "rb" ) ) != NULL )
|
||||
{
|
||||
while ( ( _line = file_get_line( __char, _stream ) ) != NULL )
|
||||
{
|
||||
//clear up the notes
|
||||
//make sure the length of the line is greater than 1.
|
||||
//like the single '#' mark in stopwords dictionary.
|
||||
if ( _line[0] == '#' && strlen(_line) > 1 ) continue;
|
||||
|
||||
//handle the stopwords.
|
||||
if ( lex == __LEX_STOPWORDS__ )
|
||||
{
|
||||
//clean the chinese words that its length is greater than max length.
|
||||
if ( ((int)_line[0]) < 0 && strlen( _line ) > length ) continue;
|
||||
friso_dic_add( friso->dic, __LEX_STOPWORDS__,
|
||||
string_copy_heap( _line, strlen(_line) ), NULL );
|
||||
continue;
|
||||
}
|
||||
//handle the stopwords.
|
||||
if ( lex == __LEX_STOPWORDS__ )
|
||||
{
|
||||
//clean the chinese words that its length is greater than max length.
|
||||
if ( ((int)_line[0]) < 0 && strlen( _line ) > length ) continue;
|
||||
friso_dic_add( friso->dic, __LEX_STOPWORDS__,
|
||||
string_copy_heap( _line, strlen(_line) ), NULL );
|
||||
continue;
|
||||
}
|
||||
|
||||
//split the fstring with '/'.
|
||||
string_split_reset( &sse, "/", _line);
|
||||
if ( string_split_next( &sse, _buffer ) == NULL ) continue;
|
||||
//split the fstring with '/'.
|
||||
string_split_reset( &sse, "/", _line);
|
||||
if ( string_split_next( &sse, _buffer ) == NULL ) continue;
|
||||
|
||||
//1. get the word.
|
||||
_word = string_copy_heap( _buffer, strlen(_buffer) );
|
||||
//1. get the word.
|
||||
_word = string_copy_heap( _buffer, strlen(_buffer) );
|
||||
|
||||
if ( string_split_next( &sse, _buffer ) == NULL )
|
||||
{
|
||||
//normal lexicon type,
|
||||
//add them to the dictionary directly
|
||||
friso_dic_add( friso->dic, lex, _word, NULL );
|
||||
continue;
|
||||
}
|
||||
if ( string_split_next( &sse, _buffer ) == NULL )
|
||||
{
|
||||
//normal lexicon type,
|
||||
//add them to the dictionary directly
|
||||
friso_dic_add( friso->dic, lex, _word, NULL );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* filter out the words that its length is larger
|
||||
* than the specified limit.
|
||||
* but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__
|
||||
* and __LEX_CEM_WORDS__.
|
||||
*/
|
||||
if ( ! ( lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__ )
|
||||
&& strlen( _word ) > length )
|
||||
{
|
||||
FRISO_FREE(_word);
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* filter out the words that its length is larger
|
||||
* than the specified limit.
|
||||
* but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__
|
||||
* and __LEX_CEM_WORDS__.
|
||||
*/
|
||||
if ( ! ( lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__ )
|
||||
&& strlen( _word ) > length )
|
||||
{
|
||||
FRISO_FREE(_word);
|
||||
continue;
|
||||
}
|
||||
|
||||
//2. get the synonyms words.
|
||||
_syn = NULL;
|
||||
if ( strcmp( _buffer, "null" ) != 0 )
|
||||
_syn = string_copy( _buffer, _sbuffer, strlen(_buffer) );
|
||||
//2. get the synonyms words.
|
||||
_syn = NULL;
|
||||
if ( strcmp( _buffer, "null" ) != 0 )
|
||||
_syn = string_copy( _buffer, _sbuffer, strlen(_buffer) );
|
||||
|
||||
//3. get the word frequency if it available.
|
||||
_fre = 0;
|
||||
if ( string_split_next( &sse, _buffer ) != NULL )
|
||||
_fre = atoi( _buffer );
|
||||
//3. get the word frequency if it available.
|
||||
_fre = 0;
|
||||
if ( string_split_next( &sse, _buffer ) != NULL )
|
||||
_fre = atoi( _buffer );
|
||||
|
||||
/**
|
||||
* Here:
|
||||
* split the synonyms words with mark ","
|
||||
* and put them in a array list if the synonyms is not NULL
|
||||
*/
|
||||
sywords = NULL;
|
||||
if ( config->add_syn && _syn != NULL )
|
||||
{
|
||||
string_split_reset( &sse, ",", _sbuffer );
|
||||
sywords = new_array_list_with_opacity(5);
|
||||
while ( string_split_next( &sse, _buffer ) != NULL )
|
||||
{
|
||||
if ( strlen(_buffer) > length ) continue;
|
||||
array_list_add( sywords,
|
||||
string_copy_heap(_buffer, strlen(_buffer)) );
|
||||
}
|
||||
sywords = array_list_trim( sywords );
|
||||
}
|
||||
/**
|
||||
* Here:
|
||||
* split the synonyms words with mark ","
|
||||
* and put them in a array list if the synonyms is not NULL
|
||||
*/
|
||||
sywords = NULL;
|
||||
if ( config->add_syn && _syn != NULL )
|
||||
{
|
||||
string_split_reset( &sse, ",", _sbuffer );
|
||||
sywords = new_array_list_with_opacity(5);
|
||||
while ( string_split_next( &sse, _buffer ) != NULL )
|
||||
{
|
||||
if ( strlen(_buffer) > length ) continue;
|
||||
array_list_add( sywords,
|
||||
string_copy_heap(_buffer, strlen(_buffer)) );
|
||||
}
|
||||
sywords = array_list_trim( sywords );
|
||||
}
|
||||
|
||||
//4. add the word item
|
||||
friso_dic_add_with_fre(
|
||||
friso->dic, lex, _word, sywords, _fre );
|
||||
}
|
||||
//4. add the word item
|
||||
friso_dic_add_with_fre(
|
||||
friso->dic, lex, _word, sywords, _fre );
|
||||
}
|
||||
|
||||
fclose( _stream );
|
||||
} else {
|
||||
printf("Warning: Fail to open lexicon file %s\n", lex_file);
|
||||
}
|
||||
fclose( _stream );
|
||||
} else {
|
||||
printf("Warning: Fail to open lexicon file %s\n", lex_file);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get the lexicon type index with the specified
|
||||
* type keywords .
|
||||
* type keywords .
|
||||
*
|
||||
* @see friso.h#friso_lex_t
|
||||
* @param _key
|
||||
* @return int
|
||||
* @see friso.h#friso_lex_t
|
||||
* @param _key
|
||||
* @return int
|
||||
*/
|
||||
__STATIC_API__ friso_lex_t get_lexicon_type_with_constant( fstring _key )
|
||||
{
|
||||
if ( strcmp( _key, "__LEX_CJK_WORDS__" ) == 0 ) {
|
||||
return __LEX_CJK_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CJK_UNITS__" ) == 0 ) {
|
||||
return __LEX_CJK_UNITS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_ECM_WORDS__" ) == 0 ) {
|
||||
return __LEX_ECM_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CEM_WORDS__" ) == 0 ) {
|
||||
return __LEX_CEM_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_LNAME__" ) == 0 ) {
|
||||
return __LEX_CN_LNAME__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_SNAME__" ) == 0 ) {
|
||||
return __LEX_CN_SNAME__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_DNAME1__" ) == 0 ) {
|
||||
return __LEX_CN_DNAME1__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_DNAME2__" ) == 0 ) {
|
||||
return __LEX_CN_DNAME2__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_LNA__" ) == 0 ) {
|
||||
return __LEX_CN_LNA__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_STOPWORDS__" ) == 0 ) {
|
||||
return __LEX_STOPWORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_ENPUN_WORDS__" ) == 0 ) {
|
||||
return __LEX_ENPUN_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_EN_WORDS__" ) == 0 ) {
|
||||
return __LEX_EN_WORDS__;
|
||||
}
|
||||
if ( strcmp( _key, "__LEX_CJK_WORDS__" ) == 0 ) {
|
||||
return __LEX_CJK_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CJK_UNITS__" ) == 0 ) {
|
||||
return __LEX_CJK_UNITS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_ECM_WORDS__" ) == 0 ) {
|
||||
return __LEX_ECM_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CEM_WORDS__" ) == 0 ) {
|
||||
return __LEX_CEM_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_LNAME__" ) == 0 ) {
|
||||
return __LEX_CN_LNAME__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_SNAME__" ) == 0 ) {
|
||||
return __LEX_CN_SNAME__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_DNAME1__" ) == 0 ) {
|
||||
return __LEX_CN_DNAME1__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_DNAME2__" ) == 0 ) {
|
||||
return __LEX_CN_DNAME2__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_LNA__" ) == 0 ) {
|
||||
return __LEX_CN_LNA__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_STOPWORDS__" ) == 0 ) {
|
||||
return __LEX_STOPWORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_ENPUN_WORDS__" ) == 0 ) {
|
||||
return __LEX_ENPUN_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_EN_WORDS__" ) == 0 ) {
|
||||
return __LEX_EN_WORDS__;
|
||||
}
|
||||
|
||||
return -1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* load the lexicon configuration file.
|
||||
* and load all the valid lexicon from the configuration file.
|
||||
* and load all the valid lexicon from the configuration file.
|
||||
*
|
||||
* @param friso friso instance
|
||||
* @param config friso_config instance
|
||||
* @param _path dictionary directory
|
||||
* @param _limitts words length limit
|
||||
* @param friso friso instance
|
||||
* @param config friso_config instance
|
||||
* @param _path dictionary directory
|
||||
* @param _limitts words length limit
|
||||
*/
|
||||
FRISO_API void friso_dic_load_from_ifile(
|
||||
friso_t friso,
|
||||
friso_config_t config,
|
||||
fstring _path,
|
||||
uint_t _limits )
|
||||
friso_t friso,
|
||||
friso_config_t config,
|
||||
fstring _path,
|
||||
uint_t _limits )
|
||||
{
|
||||
|
||||
//1.parse the configuration file.
|
||||
FILE *__stream;
|
||||
char __chars__[1024], __key__[30], *__line__;
|
||||
uint_t __length__, i, t;
|
||||
friso_lex_t lex_t;
|
||||
string_buffer_t sb;
|
||||
//1.parse the configuration file.
|
||||
FILE *__stream;
|
||||
char __chars__[1024], __key__[30], *__line__;
|
||||
uint_t __length__, i, t;
|
||||
friso_lex_t lex_t;
|
||||
string_buffer_t sb;
|
||||
|
||||
//get the lexicon configruation file path
|
||||
sb = new_string_buffer();
|
||||
string_buffer_append( sb, _path );
|
||||
string_buffer_append( sb, __FRISO_LEX_IFILE__ );
|
||||
//printf("%s\n", sb->buffer);
|
||||
//get the lexicon configruation file path
|
||||
sb = new_string_buffer();
|
||||
string_buffer_append( sb, _path );
|
||||
string_buffer_append( sb, __FRISO_LEX_IFILE__ );
|
||||
//printf("%s\n", sb->buffer);
|
||||
|
||||
if ( ( __stream = fopen( sb->buffer, "rb" ) ) != NULL )
|
||||
{
|
||||
while ( ( __line__ =
|
||||
file_get_line( __chars__, __stream ) ) != NULL )
|
||||
{
|
||||
//comment filter.
|
||||
if ( __line__[0] == '#' ) continue;
|
||||
if ( __line__[0] == '\0' ) continue;
|
||||
if ( ( __stream = fopen( sb->buffer, "rb" ) ) != NULL )
|
||||
{
|
||||
while ( ( __line__ =
|
||||
file_get_line( __chars__, __stream ) ) != NULL )
|
||||
{
|
||||
//comment filter.
|
||||
if ( __line__[0] == '#' ) continue;
|
||||
if ( __line__[0] == '\0' ) continue;
|
||||
|
||||
__length__ = strlen( __line__ );
|
||||
//item start
|
||||
if ( __line__[ __length__ - 1 ] == '[' )
|
||||
{
|
||||
//get the type key
|
||||
for ( i = 0; i < __length__
|
||||
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
|
||||
for ( t = 0; i < __length__; i++,t++ ) {
|
||||
if ( __line__[i] == ' '
|
||||
|| __line__[i] == '\t' || __line__[i] == ':' ) break;
|
||||
__key__[t] = __line__[i];
|
||||
}
|
||||
__key__[t] = '\0';
|
||||
__length__ = strlen( __line__ );
|
||||
//item start
|
||||
if ( __line__[ __length__ - 1 ] == '[' )
|
||||
{
|
||||
//get the type key
|
||||
for ( i = 0; i < __length__
|
||||
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
|
||||
for ( t = 0; i < __length__; i++,t++ ) {
|
||||
if ( __line__[i] == ' '
|
||||
|| __line__[i] == '\t' || __line__[i] == ':' ) break;
|
||||
__key__[t] = __line__[i];
|
||||
}
|
||||
__key__[t] = '\0';
|
||||
|
||||
//get the lexicon type
|
||||
lex_t = get_lexicon_type_with_constant(__key__);
|
||||
if ( lex_t == -1 ) continue;
|
||||
//get the lexicon type
|
||||
lex_t = get_lexicon_type_with_constant(__key__);
|
||||
if ( lex_t == -1 ) continue;
|
||||
|
||||
//printf("key=%s, type=%d\n", __key__, lex_t );
|
||||
while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL )
|
||||
{
|
||||
//comments filter.
|
||||
if ( __line__[0] == '#' ) continue;
|
||||
if ( __line__[0] == '\0' ) continue;
|
||||
//printf("key=%s, type=%d\n", __key__, lex_t );
|
||||
while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL )
|
||||
{
|
||||
//comments filter.
|
||||
if ( __line__[0] == '#' ) continue;
|
||||
if ( __line__[0] == '\0' ) continue;
|
||||
|
||||
__length__ = strlen( __line__ );
|
||||
if ( __line__[ __length__ - 1 ] == ']' ) break;
|
||||
__length__ = strlen( __line__ );
|
||||
if ( __line__[ __length__ - 1 ] == ']' ) break;
|
||||
|
||||
for ( i = 0; i < __length__
|
||||
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
|
||||
for ( t = 0; i < __length__; i++,t++ ) {
|
||||
if ( __line__[i] == ' '
|
||||
|| __line__[i] == '\t' || __line__[i] == ';' ) break;
|
||||
__key__[t] = __line__[i];
|
||||
}
|
||||
__key__[t] = '\0';
|
||||
for ( i = 0; i < __length__
|
||||
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
|
||||
for ( t = 0; i < __length__; i++,t++ ) {
|
||||
if ( __line__[i] == ' '
|
||||
|| __line__[i] == '\t' || __line__[i] == ';' ) break;
|
||||
__key__[t] = __line__[i];
|
||||
}
|
||||
__key__[t] = '\0';
|
||||
|
||||
//load the lexicon item from the lexicon file.
|
||||
string_buffer_clear( sb );
|
||||
string_buffer_append( sb, _path );
|
||||
string_buffer_append( sb, __key__ );
|
||||
//printf("key=%s, type=%d\n", __key__, lex_t);
|
||||
friso_dic_load( friso, config, lex_t, sb->buffer, _limits );
|
||||
}
|
||||
//load the lexicon item from the lexicon file.
|
||||
string_buffer_clear( sb );
|
||||
string_buffer_append( sb, _path );
|
||||
string_buffer_append( sb, __key__ );
|
||||
//printf("key=%s, type=%d\n", __key__, lex_t);
|
||||
friso_dic_load( friso, config, lex_t, sb->buffer, _limits );
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
} //end while
|
||||
} //end while
|
||||
|
||||
fclose( __stream );
|
||||
} else {
|
||||
printf("Warning: Fail to open the lexicon configuration file %s\n", sb->buffer);
|
||||
}
|
||||
fclose( __stream );
|
||||
} else {
|
||||
printf("Warning: Fail to open the lexicon configuration file %s\n", sb->buffer);
|
||||
}
|
||||
|
||||
free_string_buffer(sb);
|
||||
free_string_buffer(sb);
|
||||
}
|
||||
|
||||
//match the item.
|
||||
FRISO_API int friso_dic_match(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word )
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word )
|
||||
{
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
return hash_exist_mapping( dic[lex], word );
|
||||
}
|
||||
return 0;
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
return hash_exist_mapping( dic[lex], word );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
//get the lex_entry_t associated with the word.
|
||||
FRISO_API lex_entry_t friso_dic_get(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word )
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word )
|
||||
{
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
return ( lex_entry_t ) hash_get_value( dic[lex], word );
|
||||
}
|
||||
return NULL;
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
return ( lex_entry_t ) hash_get_value( dic[lex], word );
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//get the size of the specified type dictionary.
|
||||
FRISO_API uint_t friso_spec_dic_size(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex )
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex )
|
||||
{
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
return hash_get_size( dic[lex] );
|
||||
}
|
||||
return 0;
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
return hash_get_size( dic[lex] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
//get size of the whole dictionary.
|
||||
FRISO_API uint_t friso_all_dic_size(
|
||||
friso_dic_t dic )
|
||||
friso_dic_t dic )
|
||||
{
|
||||
register uint_t size = 0, t;
|
||||
register uint_t size = 0, t;
|
||||
|
||||
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
|
||||
size += hash_get_size( dic[t] );
|
||||
}
|
||||
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
|
||||
size += hash_get_size( dic[t] );
|
||||
}
|
||||
|
||||
return size;
|
||||
return size;
|
||||
}
|
||||
|
126
src/friso_link.c
126
src/friso_link.c
@ -1,29 +1,29 @@
|
||||
/*
|
||||
* link list implemented functions
|
||||
* defined in header file "friso_API.h".
|
||||
* defined in header file "friso_API.h".
|
||||
* when the link_node is being deleted, here we just free
|
||||
* the allocation of the node, not the allcation of it's value.
|
||||
* the allocation of the node, not the allcation of it's value.
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
//create a new link list node.
|
||||
__STATIC_API__ link_node_t new_node_entry(
|
||||
void * value,
|
||||
link_node_t prev,
|
||||
link_node_t next )
|
||||
void * value,
|
||||
link_node_t prev,
|
||||
link_node_t next )
|
||||
{
|
||||
link_node_t node = ( link_node_t )
|
||||
FRISO_MALLOC( sizeof( link_node_entry ) );
|
||||
FRISO_MALLOC( sizeof( link_node_entry ) );
|
||||
if ( node == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
node->value = value;
|
||||
node->prev = prev;
|
||||
node->next = next;
|
||||
node->value = value;
|
||||
node->prev = prev;
|
||||
node->next = next;
|
||||
|
||||
return node;
|
||||
}
|
||||
@ -32,14 +32,14 @@ __STATIC_API__ link_node_t new_node_entry(
|
||||
FRISO_API friso_link_t new_link_list( void )
|
||||
{
|
||||
friso_link_t e = ( friso_link_t )
|
||||
FRISO_MALLOC( sizeof( friso_link_entry ) );
|
||||
FRISO_MALLOC( sizeof( friso_link_entry ) );
|
||||
if ( e == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize the entry
|
||||
e->head = new_node_entry( NULL, NULL, NULL );
|
||||
e->tail = new_node_entry( NULL, e->head, NULL );
|
||||
e->head = new_node_entry( NULL, NULL, NULL );
|
||||
e->tail = new_node_entry( NULL, e->head, NULL );
|
||||
e->head->next = e->tail;
|
||||
e->size = 0;
|
||||
|
||||
@ -52,9 +52,9 @@ FRISO_API void free_link_list( friso_link_t link )
|
||||
link_node_t node, next;
|
||||
for ( node = link->head; node != NULL; )
|
||||
{
|
||||
next = node->next;
|
||||
FRISO_FREE( node );
|
||||
node = next;
|
||||
next = node->next;
|
||||
FRISO_FREE( node );
|
||||
node = next;
|
||||
}
|
||||
|
||||
FRISO_FREE( link );
|
||||
@ -62,16 +62,16 @@ FRISO_API void free_link_list( friso_link_t link )
|
||||
|
||||
//clear all nodes in the link list.
|
||||
FRISO_API friso_link_t link_list_clear(
|
||||
friso_link_t link )
|
||||
friso_link_t link )
|
||||
{
|
||||
link_node_t node, next;
|
||||
//free all the middle nodes.
|
||||
for ( node = link->head->next;
|
||||
node != link->tail; )
|
||||
node != link->tail; )
|
||||
{
|
||||
next = node->next;
|
||||
FRISO_FREE( node );
|
||||
node = next;
|
||||
next = node->next;
|
||||
FRISO_FREE( node );
|
||||
node = next;
|
||||
}
|
||||
|
||||
link->head->next = link->tail;
|
||||
@ -97,22 +97,22 @@ FRISO_API friso_link_t link_list_clear(
|
||||
* static
|
||||
*/
|
||||
__STATIC_API__ link_node_t get_node(
|
||||
friso_link_t link, uint_t idx )
|
||||
friso_link_t link, uint_t idx )
|
||||
{
|
||||
link_node_t p = NULL;
|
||||
register uint_t t;
|
||||
|
||||
if ( idx >= 0 && idx < link->size )
|
||||
{
|
||||
if ( idx < link->size / 2 ) { //find from the head.
|
||||
p = link->head;
|
||||
for ( t = 0; t <= idx; t++ )
|
||||
p = p->next;
|
||||
} else { //find from the tail.
|
||||
p = link->tail;
|
||||
for ( t = link->size; t > idx; t-- )
|
||||
p = p->prev;
|
||||
}
|
||||
if ( idx < link->size / 2 ) { //find from the head.
|
||||
p = link->head;
|
||||
for ( t = 0; t <= idx; t++ )
|
||||
p = p->next;
|
||||
} else { //find from the tail.
|
||||
p = link->tail;
|
||||
for ( t = link->size; t > idx; t-- )
|
||||
p = p->prev;
|
||||
}
|
||||
}
|
||||
|
||||
return p;
|
||||
@ -123,9 +123,9 @@ __STATIC_API__ link_node_t get_node(
|
||||
* static
|
||||
*/
|
||||
//__STATIC_API__ void insert_before(
|
||||
// friso_link_t link,
|
||||
// link_node_t node,
|
||||
// void * value )
|
||||
// friso_link_t link,
|
||||
// link_node_t node,
|
||||
// void * value )
|
||||
//{
|
||||
// link_node_t e = new_node_entry( value, node->prev, node );
|
||||
// e->prev->next = e;
|
||||
@ -136,10 +136,10 @@ __STATIC_API__ link_node_t get_node(
|
||||
//}
|
||||
/*
 * insert a new node that holds the value just before the given node
 *     and increase the size of the link list.
 *
 * it is wrapped in do { } while ( 0 ) so the macro expands to one single
 *     statement (safe for a if/else without braces), the arguments are 
 *     parenthesized and the temporary variables use names that will not 
 *     capture an argument named "e".
 * @Note: always end the call with a ';'.
 */
#define insert_before( link, node, value )                              \
do {                                                                    \
    link_node_t _ib_next_ = ( node );                                   \
    link_node_t _ib_node_ =                                             \
        new_node_entry( ( value ), _ib_next_->prev, _ib_next_ );        \
    _ib_node_->prev->next = _ib_node_;                                  \
    _ib_node_->next->prev = _ib_node_;                                  \
    ( link )->size++;                                                   \
} while ( 0 )
|
||||
|
||||
/*
|
||||
@ -150,7 +150,7 @@ __STATIC_API__ link_node_t get_node(
|
||||
* @return the value of the removed node.
|
||||
*/
|
||||
__STATIC_API__ void * remove_node(
|
||||
friso_link_t link, link_node_t node )
|
||||
friso_link_t link, link_node_t node )
|
||||
{
|
||||
void * _value = node->value;
|
||||
|
||||
@ -166,18 +166,18 @@ __STATIC_API__ void * remove_node(
|
||||
|
||||
//add a new node to the link list.(insert just before the tail)
|
||||
FRISO_API void link_list_add(
|
||||
friso_link_t link, void * value )
|
||||
friso_link_t link, void * value )
|
||||
{
|
||||
insert_before( link, link->tail, value );
|
||||
}
|
||||
|
||||
//add a new node before the given index.
|
||||
FRISO_API void link_list_insert_before(
|
||||
friso_link_t link, uint_t idx, void * value )
|
||||
friso_link_t link, uint_t idx, void * value )
|
||||
{
|
||||
link_node_t node = get_node( link, idx );
|
||||
if ( node != NULL ) {
|
||||
insert_before( link, node, value );
|
||||
insert_before( link, node, value );
|
||||
}
|
||||
}
|
||||
|
||||
@ -187,11 +187,11 @@ FRISO_API void link_list_insert_before(
|
||||
* @return the value of the node.
|
||||
*/
|
||||
FRISO_API void * link_list_get(
|
||||
friso_link_t link, uint_t idx )
|
||||
friso_link_t link, uint_t idx )
|
||||
{
|
||||
link_node_t node = get_node( link, idx );
|
||||
if ( node != NULL ) {
|
||||
return node->value;
|
||||
return node->value;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
@ -199,20 +199,20 @@ FRISO_API void * link_list_get(
|
||||
/*
|
||||
* set the value of the node that located in the specified position.
|
||||
* we didn't free the allocation of the old value, we return it to you.
|
||||
* free it yourself when it is necessary.
|
||||
* free it yourself when it is necessary.
|
||||
*
|
||||
* @return the old value.
|
||||
*/
|
||||
FRISO_API void *link_list_set(
|
||||
friso_link_t link,
|
||||
uint_t idx, void * value )
|
||||
friso_link_t link,
|
||||
uint_t idx, void * value )
|
||||
{
|
||||
link_node_t node = get_node( link, idx );
|
||||
void * _value = NULL;
|
||||
|
||||
if ( node != NULL ) {
|
||||
_value = node->value;
|
||||
node->value = value;
|
||||
_value = node->value;
|
||||
node->value = value;
|
||||
}
|
||||
|
||||
return _value;
|
||||
@ -225,13 +225,13 @@ FRISO_API void *link_list_set(
|
||||
* @return the value of the node removed.
|
||||
*/
|
||||
FRISO_API void *link_list_remove(
|
||||
friso_link_t link, uint_t idx )
|
||||
friso_link_t link, uint_t idx )
|
||||
{
|
||||
link_node_t node = get_node( link, idx );
|
||||
|
||||
if ( node != NULL ) {
|
||||
//printf("idx=%d, node->value=%s\n", idx, (string) node->value );
|
||||
return remove_node( link, node );
|
||||
//printf("idx=%d, node->value=%s\n", idx, (string) node->value );
|
||||
return remove_node( link, node );
|
||||
}
|
||||
|
||||
return NULL;
|
||||
@ -244,43 +244,43 @@ FRISO_API void *link_list_remove(
|
||||
* @return the value of the node removed.
|
||||
*/
|
||||
FRISO_API void *link_list_remove_node(
|
||||
friso_link_t link,
|
||||
link_node_t node )
|
||||
friso_link_t link,
|
||||
link_node_t node )
|
||||
{
|
||||
return remove_node( link, node );
|
||||
}
|
||||
|
||||
//remove the first node after the head
|
||||
FRISO_API void *link_list_remove_first(
|
||||
friso_link_t link )
|
||||
friso_link_t link )
|
||||
{
|
||||
if ( link->size > 0 ) {
|
||||
return remove_node( link, link->head->next );
|
||||
return remove_node( link, link->head->next );
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//remove the last node just before the tail.
|
||||
FRISO_API void *link_list_remove_last(
|
||||
friso_link_t link )
|
||||
friso_link_t link )
|
||||
{
|
||||
if ( link->size > 0 ) {
|
||||
return remove_node( link, link->tail->prev );
|
||||
return remove_node( link, link->tail->prev );
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//append a node from the tail.
|
||||
FRISO_API void link_list_add_last(
|
||||
friso_link_t link,
|
||||
void *value )
|
||||
friso_link_t link,
|
||||
void *value )
|
||||
{
|
||||
insert_before( link, link->tail, value );
|
||||
}
|
||||
|
||||
//append a note just after the head.
|
||||
FRISO_API void link_list_add_first(
|
||||
friso_link_t link, void *value )
|
||||
friso_link_t link, void *value )
|
||||
{
|
||||
insert_before( link, link->head->next, value );
|
||||
}
|
||||
|
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* utf-8 handle function implements.
|
||||
* you could modify it or re-release it but never for commercial use.
|
||||
* you could modify it or re-release it but never for commercial use.
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
@ -11,14 +11,14 @@
|
||||
#include <string.h>
|
||||
|
||||
/* ******************************************
|
||||
* fstring buffer functions implements. *
|
||||
* fstring buffer functions implements. *
|
||||
********************************************/
|
||||
/**
|
||||
* create a new buffer
|
||||
* @Note:
|
||||
* 1. its real length is 1 byte greater than the specified value
|
||||
* 2. we did not do any optimization for the memory allocation to ...
|
||||
* avoid the memory defragmentation.
|
||||
* avoid the memory defragmentation.
|
||||
*
|
||||
* @date: 2014-10-16
|
||||
*/
|
||||
@ -26,7 +26,7 @@ __STATIC_API__ fstring create_buffer( uint_t length )
|
||||
{
|
||||
fstring buffer = ( fstring ) FRISO_MALLOC( length + 1 );
|
||||
if ( buffer == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
memset( buffer, 0x00, length + 1 );
|
||||
@ -36,7 +36,7 @@ __STATIC_API__ fstring create_buffer( uint_t length )
|
||||
|
||||
//the __allocs should not be smaller than sb->length
|
||||
__STATIC_API__ string_buffer_t resize_buffer(
|
||||
string_buffer_t sb, uint_t __allocs )
|
||||
string_buffer_t sb, uint_t __allocs )
|
||||
{
|
||||
//create a new buffer.
|
||||
//if ( __allocs < sb->length ) __allocs = sb->length + 1;
|
||||
@ -44,7 +44,7 @@ __STATIC_API__ string_buffer_t resize_buffer(
|
||||
|
||||
//register uint_t t;
|
||||
//for ( t = 0; t < sb->length; t++ ) {
|
||||
// str[t] = sb->buffer[t];
|
||||
// str[t] = sb->buffer[t];
|
||||
//}
|
||||
memcpy( str, sb->buffer, sb->length );
|
||||
FRISO_FREE( sb->buffer );
|
||||
@ -65,9 +65,9 @@ __STATIC_API__ string_buffer_t resize_buffer(
|
||||
FRISO_API string_buffer_t new_string_buffer_with_opacity( uint_t opacity )
|
||||
{
|
||||
string_buffer_t sb = ( string_buffer_t )
|
||||
FRISO_MALLOC( sizeof( string_buffer_entry ) );
|
||||
FRISO_MALLOC( sizeof( string_buffer_entry ) );
|
||||
if ( sb == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
sb->buffer = create_buffer( opacity );
|
||||
@ -82,9 +82,9 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str )
|
||||
{
|
||||
//buffer allocations.
|
||||
string_buffer_t sb = ( string_buffer_t )
|
||||
FRISO_MALLOC( sizeof( string_buffer_entry ) );
|
||||
FRISO_MALLOC( sizeof( string_buffer_entry ) );
|
||||
if ( sb == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize
|
||||
@ -95,7 +95,7 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str )
|
||||
//register uint_t t;
|
||||
//copy the str to the buffer.
|
||||
//for ( t = 0; t < sb->length; t++ ) {
|
||||
// sb->buffer[t] = str[t];
|
||||
// sb->buffer[t] = str[t];
|
||||
//}
|
||||
memcpy( sb->buffer, str, sb->length );
|
||||
|
||||
@ -103,66 +103,66 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str )
|
||||
}
|
||||
|
||||
FRISO_API void string_buffer_append(
|
||||
string_buffer_t sb, fstring __str )
|
||||
string_buffer_t sb, fstring __str )
|
||||
{
|
||||
register uint_t __len__ = strlen( __str );
|
||||
|
||||
//check the necessity to resize the buffer.
|
||||
if ( sb->length + __len__ > sb->allocs ) {
|
||||
sb = resize_buffer( sb, ( sb->length + __len__ ) * 2 + 1 );
|
||||
sb = resize_buffer( sb, ( sb->length + __len__ ) * 2 + 1 );
|
||||
}
|
||||
|
||||
//register uint_t t;
|
||||
////copy the __str to the buffer.
|
||||
//for ( t = 0; t < __len__; t++ ) {
|
||||
// sb->buffer[ sb->length++ ] = __str[t];
|
||||
// sb->buffer[ sb->length++ ] = __str[t];
|
||||
//}
|
||||
memcpy( sb->buffer + sb->length, __str, __len__ );
|
||||
sb->length += __len__;
|
||||
}
|
||||
|
||||
FRISO_API void string_buffer_append_char(
|
||||
string_buffer_t sb, char ch )
|
||||
string_buffer_t sb, char ch )
|
||||
{
|
||||
//check the necessity to resize the buffer.
|
||||
if ( sb->length + 1 > sb->allocs ) {
|
||||
sb = resize_buffer( sb, sb->length * 2 + 1 );
|
||||
sb = resize_buffer( sb, sb->length * 2 + 1 );
|
||||
}
|
||||
|
||||
sb->buffer[sb->length++] = ch;
|
||||
}
|
||||
|
||||
FRISO_API void string_buffer_insert(
|
||||
string_buffer_t sb,
|
||||
uint_t idx,
|
||||
fstring __str )
|
||||
string_buffer_t sb,
|
||||
uint_t idx,
|
||||
fstring __str )
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* remove the given bytes from the buffer start from idx.
|
||||
* this will cause the byte move after the idx+length.
|
||||
* this will cause the byte move after the idx+length.
|
||||
*
|
||||
* @return the new string.
|
||||
*/
|
||||
FRISO_API fstring string_buffer_remove(
|
||||
string_buffer_t sb,
|
||||
uint_t idx,
|
||||
uint_t length )
|
||||
string_buffer_t sb,
|
||||
uint_t idx,
|
||||
uint_t length )
|
||||
{
|
||||
uint_t t;
|
||||
//move the bytes after the idx + length
|
||||
for ( t = idx + length; t < sb->length; t++ ) {
|
||||
sb->buffer[t - length] = sb->buffer[t];
|
||||
sb->buffer[t - length] = sb->buffer[t];
|
||||
}
|
||||
sb->buffer[t] = '\0';
|
||||
//memcpy( sb->buffer + idx,
|
||||
// sb->buffer + idx + length,
|
||||
// sb->length - idx - length );
|
||||
// sb->buffer + idx + length,
|
||||
// sb->length - idx - length );
|
||||
|
||||
t = sb->length - idx;
|
||||
if ( t > 0 ) {
|
||||
sb->length -= ( t > length ) ? length : t;
|
||||
sb->length -= ( t > length ) ? length : t;
|
||||
}
|
||||
sb->buffer[sb->length-1] = '\0';
|
||||
|
||||
@ -171,13 +171,13 @@ FRISO_API fstring string_buffer_remove(
|
||||
|
||||
/*
|
||||
* turn the string_buffer to a string.
|
||||
* or return the buffer of the string_buffer.
|
||||
* or return the buffer of the string_buffer.
|
||||
*/
|
||||
FRISO_API string_buffer_t string_buffer_trim( string_buffer_t sb )
|
||||
{
|
||||
//resize the buffer.
|
||||
if ( sb->length < sb->allocs - 1 ) {
|
||||
sb = resize_buffer( sb, sb->length + 1 );
|
||||
sb = resize_buffer( sb, sb->length + 1 );
|
||||
}
|
||||
return sb;
|
||||
}
|
||||
@ -185,8 +185,8 @@ FRISO_API string_buffer_t string_buffer_trim( string_buffer_t sb )
|
||||
/*
|
||||
* free the given fstring buffer.
|
||||
* and this function will not free the allocations of the
|
||||
* string_buffer_t->buffer, we return it to you, if there is
|
||||
* a need you could free it yourself by calling free();
|
||||
* string_buffer_t->buffer, we return it to you, if there is
|
||||
* a need you could free it yourself by calling free();
|
||||
*/
|
||||
FRISO_API fstring string_buffer_devote( string_buffer_t sb )
|
||||
{
|
||||
@ -197,7 +197,7 @@ FRISO_API fstring string_buffer_devote( string_buffer_t sb )
|
||||
|
||||
/*
|
||||
* clear the given fstring buffer.
|
||||
* reset its buffer with 0 and reset its length to 0.
|
||||
* reset its buffer with 0 and reset its length to 0.
|
||||
*/
|
||||
FRISO_API void string_buffer_clear( string_buffer_t sb )
|
||||
{
|
||||
@ -216,17 +216,17 @@ FRISO_API void free_string_buffer( string_buffer_t sb )
|
||||
/**
|
||||
* create a new string_split_entry.
|
||||
*
|
||||
* @param source
|
||||
* @return string_split_t;
|
||||
* @param source
|
||||
* @return string_split_t;
|
||||
*/
|
||||
FRISO_API string_split_t new_string_split(
|
||||
fstring delimiter,
|
||||
fstring source )
|
||||
fstring delimiter,
|
||||
fstring source )
|
||||
{
|
||||
string_split_t e = ( string_split_t )
|
||||
FRISO_MALLOC( sizeof( string_split_entry ) );
|
||||
FRISO_MALLOC( sizeof( string_split_entry ) );
|
||||
if ( e == NULL ) {
|
||||
___ALLOCATION_ERROR___;
|
||||
___ALLOCATION_ERROR___;
|
||||
}
|
||||
|
||||
e->delimiter = delimiter;
|
||||
@ -239,19 +239,19 @@ FRISO_API string_split_t new_string_split(
|
||||
}
|
||||
|
||||
FRISO_API void string_split_reset(
|
||||
string_split_t sst,
|
||||
fstring delimiter,
|
||||
fstring source )
|
||||
string_split_t sst,
|
||||
fstring delimiter,
|
||||
fstring source )
|
||||
{
|
||||
sst->delimiter = delimiter;
|
||||
sst->delLen = strlen(delimiter);
|
||||
sst->source = source;
|
||||
sst->srcLen = strlen(source);
|
||||
sst->srcLen = strlen(source);
|
||||
sst->idx = 0;
|
||||
}
|
||||
|
||||
FRISO_API void string_split_set_source(
|
||||
string_split_t sst, fstring source )
|
||||
string_split_t sst, fstring source )
|
||||
{
|
||||
sst->source = source;
|
||||
sst->srcLen = strlen(source);
|
||||
@ -259,7 +259,7 @@ FRISO_API void string_split_set_source(
|
||||
}
|
||||
|
||||
FRISO_API void string_split_set_delimiter(
|
||||
string_split_t sst, fstring delimiter )
|
||||
string_split_t sst, fstring delimiter )
|
||||
{
|
||||
sst->delimiter = delimiter;
|
||||
sst->delLen = strlen( delimiter );
|
||||
@ -273,15 +273,15 @@ FRISO_API void free_string_split( string_split_t sst )
|
||||
|
||||
/**
|
||||
* get the next split fstring, and copy the
|
||||
* splited fstring into the __dst buffer .
|
||||
* splited fstring into the __dst buffer .
|
||||
*
|
||||
* @param string_split_t
|
||||
* @param __dst
|
||||
* @return fstring (NULL if reach the end of the source
|
||||
* or there is no more segmentation)
|
||||
* @param string_split_t
|
||||
* @param __dst
|
||||
* @return fstring (NULL if reach the end of the source
|
||||
* or there is no more segmentation)
|
||||
*/
|
||||
FRISO_API fstring string_split_next(
|
||||
string_split_t sst, fstring __dst)
|
||||
string_split_t sst, fstring __dst)
|
||||
{
|
||||
uint_t i, _ok;
|
||||
fstring _dst = __dst;
|
||||
@ -291,28 +291,28 @@ FRISO_API fstring string_split_next(
|
||||
|
||||
while ( 1 )
|
||||
{
|
||||
_ok = 1;
|
||||
for ( i = 0; i < sst->delLen
|
||||
&& (sst->idx + i < sst->srcLen); i++ )
|
||||
{
|
||||
if ( sst->source[sst->idx+i] != sst->delimiter[i] )
|
||||
{
|
||||
_ok = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ok = 1;
|
||||
for ( i = 0; i < sst->delLen
|
||||
&& (sst->idx + i < sst->srcLen); i++ )
|
||||
{
|
||||
if ( sst->source[sst->idx+i] != sst->delimiter[i] )
|
||||
{
|
||||
_ok = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//find the delimiter here,
|
||||
//break the loop and self plus the sst->idx, then return the buffer .
|
||||
if ( _ok == 1 ) {
|
||||
sst->idx += sst->delLen;
|
||||
break;
|
||||
}
|
||||
//find the delimiter here,
|
||||
//break the loop and self plus the sst->idx, then return the buffer .
|
||||
if ( _ok == 1 ) {
|
||||
sst->idx += sst->delLen;
|
||||
break;
|
||||
}
|
||||
|
||||
//copy the char to the buffer
|
||||
*_dst++ = sst->source[sst->idx++];
|
||||
//check if reach the end of the fstring
|
||||
if ( sst->idx >= sst->srcLen ) break;
|
||||
//copy the char to the buffer
|
||||
*_dst++ = sst->source[sst->idx++];
|
||||
//check if reach the end of the fstring
|
||||
if ( sst->idx >= sst->srcLen ) break;
|
||||
}
|
||||
|
||||
*_dst = '\0';
|
||||
|
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* dynamic array test program.
|
||||
*
|
||||
* @author chenxin
|
||||
* @email chenxin619315@gmail.com
|
||||
* @author chenxin
|
||||
* @email chenxin619315@gmail.com
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
@ -10,42 +10,42 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
int main( int argc, char **args ) {
|
||||
|
||||
//create a new array list.
|
||||
friso_array_t array = new_array_list();
|
||||
fstring keys[] = {
|
||||
"chenmanwen", "yangqinghua",
|
||||
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
|
||||
"zhangrenfang", "yangjian",
|
||||
"liuxiao", "pankai",
|
||||
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
|
||||
"caizaili", "panpan", "xiaolude", "yintanwen"
|
||||
};
|
||||
int j, idx = 2, len = sizeof( keys ) / sizeof( fstring );
|
||||
|
||||
//create a new array list.
|
||||
friso_array_t array = new_array_list();
|
||||
fstring keys[] = {
|
||||
"chenmanwen", "yangqinghua",
|
||||
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
|
||||
"zhangrenfang", "yangjian",
|
||||
"liuxiao", "pankai",
|
||||
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
|
||||
"caizaili", "panpan", "xiaolude", "yintanwen"
|
||||
};
|
||||
int j, idx = 2, len = sizeof( keys ) / sizeof( fstring );
|
||||
|
||||
for ( j = 0; j < len; j++ ) {
|
||||
array_list_add( array, keys[j] );
|
||||
}
|
||||
for ( j = 0; j < len; j++ ) {
|
||||
array_list_add( array, keys[j] );
|
||||
}
|
||||
|
||||
printf("length=%d, allocations=%d\n", array->length, array->allocs );
|
||||
array_list_trim( array );
|
||||
printf("after tirm length=%d, allocations=%d\n", array->length, array->allocs );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
printf("length=%d, allocations=%d\n", array->length, array->allocs );
|
||||
array_list_trim( array );
|
||||
printf("after tirm length=%d, allocations=%d\n", array->length, array->allocs );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
|
||||
printf("\nAfter set %dth item.\n", idx );
|
||||
array_list_set( array, idx, "chenxin__" );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
printf("\nAfter set %dth item.\n", idx );
|
||||
array_list_set( array, idx, "chenxin__" );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
|
||||
printf("\nAfter remove %dth item.\n", idx );
|
||||
array_list_remove( array, idx );
|
||||
printf("length=%d, allocations=%d\n", array->length, array->allocs );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
printf("\nAfter remove %dth item.\n", idx );
|
||||
array_list_remove( array, idx );
|
||||
printf("length=%d, allocations=%d\n", array->length, array->allocs );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
|
||||
printf("\nInsert a item at %dth\n", idx );
|
||||
array_list_insert( array, idx, "*chenxin*" );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
printf("\nInsert a item at %dth\n", idx );
|
||||
array_list_insert( array, idx, "*chenxin*" );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
|
||||
free_array_list( array );
|
||||
free_array_list( array );
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
190
src/tst-friso.c
190
src/tst-friso.c
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* Friso test program.
|
||||
* Of course you can make it a perfect demo for friso.
|
||||
* Of course you can make it a perfect demo for friso.
|
||||
* all threads or processes share the same friso_t,
|
||||
* different threads/processes use different friso_task_t.
|
||||
* different threads/processes use different friso_task_t.
|
||||
* and you could share the friso_config_t if you wish...
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
@ -17,33 +17,33 @@
|
||||
|
||||
#define __LENGTH__ 15
|
||||
#define __INPUT_LENGTH__ 20480
|
||||
#define ___EXIT_INFO___ \
|
||||
println("Thanks for trying friso."); \
|
||||
#define ___EXIT_INFO___ \
|
||||
println("Thanks for trying friso."); \
|
||||
break;
|
||||
|
||||
#define ___ABOUT___ \
|
||||
println("+-----------------------------------------------------------+"); \
|
||||
println("| friso - a chinese word segmentation writen by c. |"); \
|
||||
println("| bug report email - chenxin619315@gmail.com. |"); \
|
||||
println("| or: visit http://code.google.com/p/friso. |"); \
|
||||
println("| java edition for http://code.google.com/p/jcseg |"); \
|
||||
println("| type 'quit' to exit the program. |"); \
|
||||
#define ___ABOUT___ \
|
||||
println("+-----------------------------------------------------------+"); \
|
||||
println("| friso - a chinese word segmentation writen by c. |"); \
|
||||
println("| bug report email - chenxin619315@gmail.com. |"); \
|
||||
println("| or: visit http://code.google.com/p/friso. |"); \
|
||||
println("| java edition for http://code.google.com/p/jcseg |"); \
|
||||
println("| type 'quit' to exit the program. |"); \
|
||||
println("+-----------------------------------------------------------+");
|
||||
|
||||
//read a line from a command line.
|
||||
static fstring getLine( FILE *fp, fstring __dst )
|
||||
{
|
||||
register int c;
|
||||
register fstring cs;
|
||||
register int c;
|
||||
register fstring cs;
|
||||
|
||||
cs = __dst;
|
||||
while ( ( c = getc( fp ) ) != EOF ) {
|
||||
if ( c == '\n' ) break;
|
||||
*cs++ = c;
|
||||
}
|
||||
*cs = '\0';
|
||||
cs = __dst;
|
||||
while ( ( c = getc( fp ) ) != EOF ) {
|
||||
if ( c == '\n' ) break;
|
||||
*cs++ = c;
|
||||
}
|
||||
*cs = '\0';
|
||||
|
||||
return ( c == EOF && cs == __dst ) ? NULL : __dst;
|
||||
return ( c == EOF && cs == __dst ) ? NULL : __dst;
|
||||
}
|
||||
|
||||
/*static void printcode( fstring str ) {
|
||||
@ -59,94 +59,94 @@ static fstring getLine( FILE *fp, fstring __dst )
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
|
||||
clock_t s_time, e_time;
|
||||
char line[__INPUT_LENGTH__] = {0};
|
||||
int i;
|
||||
fstring __path__ = NULL, mode = NULL;
|
||||
clock_t s_time, e_time;
|
||||
char line[__INPUT_LENGTH__] = {0};
|
||||
int i;
|
||||
fstring __path__ = NULL, mode = NULL;
|
||||
|
||||
friso_t friso;
|
||||
friso_config_t config;
|
||||
friso_task_t task;
|
||||
friso_t friso;
|
||||
friso_config_t config;
|
||||
friso_task_t task;
|
||||
|
||||
//get the lexicon directory
|
||||
for ( i = 0; i < argc; i++ ) {
|
||||
if ( strcasecmp( "-init", argv[i] ) == 0 ) {
|
||||
__path__ = argv[i+1];
|
||||
}
|
||||
}
|
||||
if ( __path__ == NULL ) {
|
||||
println("Usage: friso -init lexicon path");
|
||||
exit(0);
|
||||
}
|
||||
//get the lexicon directory
|
||||
for ( i = 0; i < argc; i++ ) {
|
||||
if ( strcasecmp( "-init", argv[i] ) == 0 ) {
|
||||
__path__ = argv[i+1];
|
||||
}
|
||||
}
|
||||
if ( __path__ == NULL ) {
|
||||
println("Usage: friso -init lexicon path");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
s_time = clock();
|
||||
s_time = clock();
|
||||
|
||||
//initialize
|
||||
friso = friso_new();
|
||||
config = friso_new_config();
|
||||
/*friso_dic_t dic = friso_dic_new();
|
||||
friso_dic_load_from_ifile( dic, __path__, __LENGTH__ );
|
||||
friso_set_dic( friso, dic );
|
||||
friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/
|
||||
if ( friso_init_from_ifile(friso, config, __path__) != 1 ) {
|
||||
printf("fail to initialize friso and config.");
|
||||
goto err;
|
||||
}
|
||||
//initialize
|
||||
friso = friso_new();
|
||||
config = friso_new_config();
|
||||
/*friso_dic_t dic = friso_dic_new();
|
||||
friso_dic_load_from_ifile( dic, __path__, __LENGTH__ );
|
||||
friso_set_dic( friso, dic );
|
||||
friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/
|
||||
if ( friso_init_from_ifile(friso, config, __path__) != 1 ) {
|
||||
printf("fail to initialize friso and config.");
|
||||
goto err;
|
||||
}
|
||||
|
||||
switch ( config->mode )
|
||||
{
|
||||
case __FRISO_SIMPLE_MODE__:
|
||||
mode = "Simple";
|
||||
break;
|
||||
case __FRISO_COMPLEX_MODE__:
|
||||
mode = "Complex";
|
||||
break;
|
||||
case __FRISO_DETECT_MODE__:
|
||||
mode = "Detect";
|
||||
break;
|
||||
}
|
||||
switch ( config->mode )
|
||||
{
|
||||
case __FRISO_SIMPLE_MODE__:
|
||||
mode = "Simple";
|
||||
break;
|
||||
case __FRISO_COMPLEX_MODE__:
|
||||
mode = "Complex";
|
||||
break;
|
||||
case __FRISO_DETECT_MODE__:
|
||||
mode = "Detect";
|
||||
break;
|
||||
}
|
||||
|
||||
//friso_set_mode( config, __FRISO_DETECT_MODE__ );
|
||||
//printf("clr_stw=%d\n", friso->clr_stw);
|
||||
//printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" ));
|
||||
//printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究"));
|
||||
//friso_set_mode( config, __FRISO_DETECT_MODE__ );
|
||||
//printf("clr_stw=%d\n", friso->clr_stw);
|
||||
//printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" ));
|
||||
//printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究"));
|
||||
|
||||
e_time = clock();
|
||||
e_time = clock();
|
||||
|
||||
printf("Initialized in %fsec\n", (double) ( e_time - s_time ) / CLOCKS_PER_SEC );
|
||||
printf("Mode: %s\n", mode);
|
||||
printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK" );
|
||||
___ABOUT___;
|
||||
printf("Initialized in %fsec\n", (double) ( e_time - s_time ) / CLOCKS_PER_SEC );
|
||||
printf("Mode: %s\n", mode);
|
||||
printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK" );
|
||||
___ABOUT___;
|
||||
|
||||
//set the task.
|
||||
task = friso_new_task();
|
||||
//set the task.
|
||||
task = friso_new_task();
|
||||
|
||||
while ( 1 )
|
||||
{
|
||||
print("friso>> ");
|
||||
getLine( stdin, line );
|
||||
//exit the program
|
||||
if ( strcasecmp( line, "quit" ) == 0 ) {
|
||||
___EXIT_INFO___
|
||||
}
|
||||
while ( 1 )
|
||||
{
|
||||
print("friso>> ");
|
||||
getLine( stdin, line );
|
||||
//exit the program
|
||||
if ( strcasecmp( line, "quit" ) == 0 ) {
|
||||
___EXIT_INFO___
|
||||
}
|
||||
|
||||
//for ( i = 0; i < 1000000; i++ ) {
|
||||
//set the task text.
|
||||
friso_set_text( task, line );
|
||||
println("分词结果:");
|
||||
//for ( i = 0; i < 1000000; i++ ) {
|
||||
//set the task text.
|
||||
friso_set_text( task, line );
|
||||
println("分词结果:");
|
||||
|
||||
s_time = clock();
|
||||
while ( ( config->next_token( friso, config, task ) ) != NULL )
|
||||
{
|
||||
//printf("%s[%d, %d, %d] ", task->token->word,
|
||||
// task->token->offset, task->token->length, task->token->rlen );
|
||||
printf("%s ", task->token->word );
|
||||
}
|
||||
//}
|
||||
e_time = clock();
|
||||
printf("\nDone, cost < %fsec\n", ( (double)(e_time - s_time) ) / CLOCKS_PER_SEC );
|
||||
s_time = clock();
|
||||
while ( ( config->next_token( friso, config, task ) ) != NULL )
|
||||
{
|
||||
//printf("%s[%d, %d, %d] ", task->token->word,
|
||||
// task->token->offset, task->token->length, task->token->rlen );
|
||||
printf("%s ", task->token->word );
|
||||
}
|
||||
//}
|
||||
e_time = clock();
|
||||
printf("\nDone, cost < %fsec\n", ( (double)(e_time - s_time) ) / CLOCKS_PER_SEC );
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
friso_free_task( task );
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
/**
|
||||
* File Explain.
|
||||
*
|
||||
* @author chenxin
|
||||
* @see http://www.webssky.com
|
||||
* @author chenxin
|
||||
* @see http://www.webssky.com
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
@ -10,28 +10,28 @@
|
||||
|
||||
/*
 * print the length, size, factor and threshold fields
 * of the given hash table on a single line.
 */
void print_hash_info( friso_hash_t _hash )
{
    printf(
        "info:length=%d, size=%d, facotr=%f, threshold=%d\n",
        _hash->length,
        _hash->size,
        _hash->factor,
        _hash->threshold
    );
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
friso_hash_t _hash = new_hash_table();
|
||||
char *names[] = {
|
||||
"陈满文", "阳清华",
|
||||
"陈鑫", "罗江艳",
|
||||
"小燕子", "比比",
|
||||
"张仁芳", "阳建",
|
||||
"陈配", "李恒",
|
||||
"张志刚", "张怡少",
|
||||
"阳江波", "蔡再利",
|
||||
"阳绘章", "尹唐文",
|
||||
"谭志鹏", "肖路德",
|
||||
"潘凯", "刘潇",
|
||||
"马朝辉", "张强",
|
||||
"殷美林", "元明清",
|
||||
"周安", "郭桥安",
|
||||
"刘敏", "黄广华",
|
||||
"李胜", "黄海清"
|
||||
"陈满文", "阳清华",
|
||||
"陈鑫", "罗江艳",
|
||||
"小燕子", "比比",
|
||||
"张仁芳", "阳建",
|
||||
"陈配", "李恒",
|
||||
"张志刚", "张怡少",
|
||||
"阳江波", "蔡再利",
|
||||
"阳绘章", "尹唐文",
|
||||
"谭志鹏", "肖路德",
|
||||
"潘凯", "刘潇",
|
||||
"马朝辉", "张强",
|
||||
"殷美林", "元明清",
|
||||
"周安", "郭桥安",
|
||||
"刘敏", "黄广华",
|
||||
"李胜", "黄海清"
|
||||
};
|
||||
//char *str[] = {"陈鑫", "张仁芳", "比比"};
|
||||
char **str = names;
|
||||
@ -39,7 +39,7 @@ int main(int argc, char **argv)
|
||||
|
||||
print_hash_info( _hash );
|
||||
for ( j = 0; j < len; j++) {
|
||||
hash_put_mapping( _hash, names[j], names[j] );
|
||||
hash_put_mapping( _hash, names[j], names[j] );
|
||||
}
|
||||
|
||||
print_hash_info( _hash );
|
||||
@ -49,11 +49,11 @@ int main(int argc, char **argv)
|
||||
|
||||
//remove mappings
|
||||
for ( j = 0; j < len; j++ ) {
|
||||
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
|
||||
printf("Now, remove %s\n", str[j]);
|
||||
hash_remove_mapping( _hash, str[j] );
|
||||
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
|
||||
printf("*********************************\n");
|
||||
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
|
||||
printf("Now, remove %s\n", str[j]);
|
||||
hash_remove_mapping( _hash, str[j] );
|
||||
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
|
||||
printf("*********************************\n");
|
||||
}
|
||||
|
||||
printf("Press any key to continue.");
|
||||
|
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* lex functions test program.
|
||||
*
|
||||
* @author chenxin
|
||||
* @see http://www.webssky.com
|
||||
* @author chenxin
|
||||
* @see http://www.webssky.com
|
||||
*/
|
||||
#include "friso.h"
|
||||
|
||||
@ -11,10 +11,10 @@
|
||||
#include <string.h>
|
||||
|
||||
#define __LENGTH__ 15
|
||||
#define ___PRINT_HELP_INFO___ \
|
||||
printf("1. help print the current menu.\n"); \
|
||||
printf("2. #set set the classify of the dictionary.\n"); \
|
||||
printf("3. other search the words in the dictionary.\n"); \
|
||||
#define ___PRINT_HELP_INFO___ \
|
||||
printf("1. help print the current menu.\n"); \
|
||||
printf("2. #set set the classify of the dictionary.\n"); \
|
||||
printf("3. other search the words in the dictionary.\n"); \
|
||||
printf("4. quit exit the programe.\n");
|
||||
|
||||
int main(int argc, char **argv)
|
||||
@ -62,30 +62,30 @@ int main(int argc, char **argv)
|
||||
e_time = clock();
|
||||
|
||||
printf("Done, cost: %f sec, size=%d\n", ( double ) ( e_time - s_time ) / CLOCKS_PER_SEC, \
|
||||
friso_all_dic_size( friso->dic ) );
|
||||
friso_all_dic_size( friso->dic ) );
|
||||
|
||||
while ( 1 ) {
|
||||
printf("friso-%d>> ", lex);
|
||||
scanf("%s", _line);
|
||||
if ( strcmp( _line, "quit" ) == 0 ) {
|
||||
break;
|
||||
} else if ( strcmp( _line, "help" ) == 0 ) {
|
||||
___PRINT_HELP_INFO___
|
||||
} else if ( strcmp( _line, "#set" ) == 0 ) {
|
||||
printf("lex_t>> ");
|
||||
scanf("%d", &lex);
|
||||
} else {
|
||||
s_time = clock();
|
||||
e = friso_dic_get( friso->dic, lex, _line );
|
||||
e_time = clock();
|
||||
if ( e != NULL ) {
|
||||
printf("word=%s, syn=%s, fre=%d, cost:%fsec\n",
|
||||
e->word, e->syn==NULL? "NULL" : (char *)e->syn->items[0], e->fre,
|
||||
(double) ( e_time - s_time ) / CLOCKS_PER_SEC );
|
||||
} else {
|
||||
printf("%s was not found.\n", _line);
|
||||
}
|
||||
}
|
||||
printf("friso-%d>> ", lex);
|
||||
scanf("%s", _line);
|
||||
if ( strcmp( _line, "quit" ) == 0 ) {
|
||||
break;
|
||||
} else if ( strcmp( _line, "help" ) == 0 ) {
|
||||
___PRINT_HELP_INFO___
|
||||
} else if ( strcmp( _line, "#set" ) == 0 ) {
|
||||
printf("lex_t>> ");
|
||||
scanf("%d", &lex);
|
||||
} else {
|
||||
s_time = clock();
|
||||
e = friso_dic_get( friso->dic, lex, _line );
|
||||
e_time = clock();
|
||||
if ( e != NULL ) {
|
||||
printf("word=%s, syn=%s, fre=%d, cost:%fsec\n",
|
||||
e->word, e->syn==NULL? "NULL" : (char *)e->syn->items[0], e->fre,
|
||||
(double) ( e_time - s_time ) / CLOCKS_PER_SEC );
|
||||
} else {
|
||||
printf("%s was not found.\n", _line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//friso_dic_free( friso->dic );
|
||||
|
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* linked list test program.
|
||||
*
|
||||
* @author chenxin
|
||||
* @email chenxin619315@gmail.com
|
||||
* @author chenxin
|
||||
* @email chenxin619315@gmail.com
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
@ -13,12 +13,12 @@ int main( int argc, char **args ) {
|
||||
|
||||
friso_link_t link;
|
||||
fstring keys[] = {
|
||||
"chenmanwen", "yangqinghua",
|
||||
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
|
||||
"zhangrenfang", "yangjian",
|
||||
"liuxiao", "pankai",
|
||||
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
|
||||
"caizaili", "panpan", "xiaolude", "yintanwen"
|
||||
"chenmanwen", "yangqinghua",
|
||||
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
|
||||
"zhangrenfang", "yangjian",
|
||||
"liuxiao", "pankai",
|
||||
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
|
||||
"caizaili", "panpan", "xiaolude", "yintanwen"
|
||||
};
|
||||
int j, len = sizeof( keys ) / sizeof( fstring );
|
||||
|
||||
@ -28,15 +28,15 @@ int main( int argc, char **args ) {
|
||||
printf("size=%d\n", link->size );
|
||||
|
||||
for ( j = 0; j < len; j++ ) {
|
||||
//link_add( link, keys[j] );
|
||||
link_list_add_last( link, keys[j] );
|
||||
//link_add( link, keys[j] );
|
||||
link_list_add_last( link, keys[j] );
|
||||
}
|
||||
|
||||
printf("size=%d\n", link->size );
|
||||
|
||||
for ( j = 0; j < len / 2; j++ ) {
|
||||
//printf("idx=%d, remove %s\n", j, ( fstring ) link_remove( link, 0 ) );
|
||||
printf("idx=%d, remove %s\n", j, ( fstring ) link_list_remove_first( link ) );
|
||||
//printf("idx=%d, remove %s\n", j, ( fstring ) link_remove( link, 0 ) );
|
||||
printf("idx=%d, remove %s\n", j, ( fstring ) link_list_remove_first( link ) );
|
||||
}
|
||||
|
||||
printf("size=%d\n", link->size );
|
||||
|
@ -11,7 +11,7 @@
|
||||
|
||||
int main ( int argc, char **args )
|
||||
{
|
||||
fstring source = ",I am a chinese,,my name is chenxin,and i am the author of friso,bug report email chenxin619315@gmail.com,qq:1187582057";
|
||||
fstring source = ",I am a chinese,,my name is chenxin,and i am the author of friso,bug report email chenxin619315@gmail.com,qq:1187582057";
|
||||
char buffer[128];
|
||||
string_split_t split = new_string_split(",", source );
|
||||
|
||||
@ -20,7 +20,7 @@ int main ( int argc, char **args )
|
||||
printf("sst->delLen=%d\n", split->delLen);
|
||||
|
||||
while ( string_split_next(split, buffer) != NULL) {
|
||||
printf("buffer:%s\n", buffer);
|
||||
printf("buffer:%s\n", buffer);
|
||||
}
|
||||
|
||||
free_string_split(split);
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* fstring handle mode test program.
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
@ -20,13 +20,13 @@ int main( int argc, char **args ) {
|
||||
|
||||
|
||||
for ( t = 0; t < length; t += bytes ) {
|
||||
bytes = get_utf8_bytes( *(str + t) );
|
||||
if ( bytes == 0 ) continue;
|
||||
for ( j = 0; j < bytes; j++ )
|
||||
word[j] = *(str + t + j );
|
||||
word[j] = '\0';
|
||||
string_buffer_append( sb, word );
|
||||
printf("word=%s\n", word );
|
||||
bytes = get_utf8_bytes( *(str + t) );
|
||||
if ( bytes == 0 ) continue;
|
||||
for ( j = 0; j < bytes; j++ )
|
||||
word[j] = *(str + t + j );
|
||||
word[j] = '\0';
|
||||
string_buffer_append( sb, word );
|
||||
printf("word=%s\n", word );
|
||||
}
|
||||
|
||||
printf("length=%d, buffer=%s\n", sb->length, sb->buffer );
|
||||
|
Loading…
Reference in New Issue
Block a user