code tab to 4 space

This commit is contained in:
chenxin 2015-12-07 11:42:33 +08:00
parent e9bf4a2536
commit a264922721
29 changed files with 3422 additions and 3422 deletions

View File

@ -9,9 +9,9 @@ friso-1.6.2:
3. friso deb | rpm支持
Debian & Ubuntu:
sudo apt-get install libfriso0 libfriso-dev
sudo apt-get install libfriso0 libfriso-dev
CentOS & Fedora:
sudo yum install libfriso libfriso-devel
sudo yum install libfriso libfriso-devel
4. 中文词性标注。
@ -26,41 +26,41 @@ friso-1.6.2:
friso-1.6.1:
1. friso.ini中friso.lex_dir增加相对friso.ini的路径支持 -done
1. friso.ini中friso.lex_dir增加相对friso.ini的路径支持 -done
2. 修复两处内存泄漏bug. -done
2. 修复两处内存泄漏bug. -done
3. 改善中英混合词的识别, 可以识别更多情况, 例如:高3 -done
3. 改善中英混合词的识别, 可以识别更多情况, 例如:高3 -done
4. 词库优化, 加入了一些新词条. -done
4. 词库优化, 加入了一些新词条. -done
5. 修复friso_dic_add & array_list_insert的两处代码bug -done
5. 修复friso_dic_add & array_list_insert的两处代码bug -done
6. 增加检测模式切分, 只返回词库中有的词条 -done
6. 增加检测模式切分, 只返回词库中有的词条 -done
7. 集成了php扩展绑定完美支持PHP分词 -done
7. 集成了php扩展绑定完美支持PHP分词 -done
friso-1.6.0:
1. friso_string.c#utf8_decimal_string初始化bytes = 0,
去除WinNT的Run-Time Check Failed. -done
去除WinNT的Run-Time Check Failed. -done
2. 复杂英文和数字组合的二次切分. 例如: QQ2013会被切分成: qq2013, qq, 2013. -done
2. 复杂英文和数字组合的二次切分. 例如: QQ2013会被切分成: qq2013, qq, 2013. -done
3. GBK编码支持. -done
3. GBK编码支持. -done
4. 增加了friso.ini中自定义保留标点, 去除了默认对"^,/,-,'"等标点的保留. -done
4. 增加了friso.ini中自定义保留标点, 去除了默认对"^,/,-,'"等标点的保留. -done
5. 使用掩码操作控制变量来代替了原来的多个控制变量. -done
5. 使用掩码操作控制变量来代替了原来的多个控制变量. -done
6. 切分结果friso_hits_t中增加了对词条类别和词条长度的返回纠正了offset的误差。 -done
6. 切分结果friso_hits_t中增加了对词条类别和词条长度的返回纠正了offset的误差。 -done
7. 做了一些优化,例如:同义词的追加(普通/sphinx定义)复杂的判断逻辑,
改为了使用掩码状态控制,不仅减少了代码量还提高了执行效率。 -done
改为了使用掩码状态控制,不仅减少了代码量还提高了执行效率。 -done
8. 更多的返回信息,增加了对切分词条的类别,长度,真实长度,词性(待实现)等信息的返回。 -done
8. 更多的返回信息,增加了对切分词条的类别,长度,真实长度,词性(待实现)等信息的返回。 -done
9. 增加了安装中头文件的自动拷贝usr/include/friso可以通过include <friso/xx.h>来引用头文件。
@ -83,18 +83,18 @@ friso-1.4:
1. 小数+单位无法识别的情况.更改friso_string#utf8_numeric_string()函数.
2. 更改中英混合词的识别(目前可以识别中英任何一种组合).
英中: 例如: b超,
英中英: a美1,
英中英中: a哆啦a梦,
中英: 卡拉ok,
中英中: 哆啦a梦,
中英中英: 中文a美a
英中: 例如: b超,
英中英: a美1,
英中英中: a哆啦a梦,
中英: 卡拉ok,
中英中: 哆啦a梦,
中英中英: 中文a美a
3. 更改了单位组合, 现在可以组合的单位不局限是中文, 例如: ℃,℉
4. 对于未识别的字符, 给定一个开关选项来决定保留还是过滤.
5. 英文同义词的追加(增加了lex-en.lex词库)
5. 英文同义词的追加(增加了lex-en.lex词库)
friso-1.3:
@ -103,7 +103,7 @@ friso-1.3:
2. 部分简易函数使用了宏定义来代替, 减少函数的调用.
3. 保留了英文全半角和中文标点符号的切分.(可以通过过滤停止词来过滤不需要的标点)
停止词词库中已经加入了全部的保留的标点, 也就是默认全部过滤了.
停止词词库中已经加入了全部的保留的标点, 也就是默认全部过滤了.
4. 修复friso_string#utf8_en_punctuation()函数一处bug.

View File

@ -6,9 +6,9 @@ Friso是使用c语言开发的一款开源的高性能中文分词器使用
2。三种切分模式
(1). 简易模式FMM算法适合速度要求场合。
(2). 复杂模式- MMSEG四种过滤算法具有较高的歧义去除分词准确率达到了98.41%。
(3). (!New)检测模式:只返回词库中已有的词条,很适合某些应用场合。(1.6.1版本开始)
(1). 简易模式FMM算法适合速度要求场合。
(2). 复杂模式- MMSEG四种过滤算法具有较高的歧义去除分词准确率达到了98.41%。
(3). (!New)检测模式:只返回词库中已有的词条,很适合某些应用场合。(1.6.1版本开始)
请参考本算法的原作http://technology.chtsai.org/mmseg/。

View File

@ -8,6 +8,6 @@
// ARG_ENABLE("friso", "enable friso support", "no");
if (PHP_FRISO != "no") {
EXTENSION("friso", "friso.c");
EXTENSION("friso", "friso.c");
}

View File

@ -20,53 +20,53 @@ echo "friso_version(): " , friso_version(), ", friso_charset(): ", friso_charset
echo "分词函数:<br />";
if ( friso_charset() == 'UTF-8' )
{
$_str = "歧义和同义词:研究生命起源,混合词: 做B超检查身体x射线本质是什么今天去奇都ktv唱卡拉ok去哆啦a梦是一个动漫中的主角单位和全角: 2009年日开始大学之旅岳阳今天的气温为38.6℃, 也就是101.48℉, 英文数字: bug report chenxin619315@gmail.com or visit http://code.google.com/p/jcseg, we all admire the hacker spirit!特殊数字: ① ⑩ ⑽ ㈩.";
echo "<p>friso_split(\"" . $_str . "\")<p />";
//API:
//rb_split(string, Array, [long])
//1.string: 要被切分的字符串。
//2.Array: 配置选项使用NULL来选择默认的配置(friso.ini中的配置)。
//3.long: 可选参数,自定义切分返回选项,查看下面的$_rargs
//1.完整的配置:
//array('max_len'=>5, 'r_name'=>0, 'mix_len'=>2, 'lna_len'=>1, 'add_syn'=>1,
// 'clr_stw'=>1, 'keep_urec'=>0, 'spx_out'=>0, 'en_sseg'=> 1, 'st_minl'=>2, 'kpuncs'=>'.+#', 'mode'=>FRISO_COMPLEX);
//1.在不了解friso内核的情况下, 请不要随便更改nthreshold
//2.使用NULL来使用php.ini中指定的friso.ini文件中的配置
//2.返回选项:
//词条: FRISO_RET_WORD, 类别FRISO_RET_TYPE, 长度FRISO_RET_LENGTH, 真实长度FRISO_RET_RLEN, 偏移量FRISO_RET_OFF
//词性FRISO_RET_POS(待实现)
$_rargs = FRISO_RET_TYPE | FRISO_RET_LEN | FRISO_RET_RLEN | FRISO_RET_OFF | FRISO_RET_POS;
//$_rargs = 0;
//3.切分类别:
//CJK词条FRISO_TYP_CJK, 英中混合词(b超)FRISO_TYP_ECM中英混合词(卡拉ok)FRISO_TYP_CEM
//英文标点混合词(c++)FRISO_TYP_EPUN标点FRISO_TYP_PUN未知类别FRISO_TYP_UNK其他类别(同义词)FRISO_TYP_OTR
$_result = friso_split($_str, array('mode'=>FRISO_COMPLEX), $_rargs);
unset($_str);
foreach ( $_result as $_val )
{
$_str = $_val['word'];
if ( $_rargs != 0 ) {
$_str .= '[';
if ( ($_rargs & FRISO_RET_TYPE) != 0 )
$_str .= ', type: '.$_val['type']; //获取词条类别
if ( ($_rargs & FRISO_RET_LEN) != 0 )
$_str .= ', len: ' . $_val['len']; //词条长度
if ( ($_rargs & FRISO_RET_RLEN) != 0 )
$_str .= ', rlen: ' . $_val['rlen']; //词条真实长度
if ( ($_rargs & FRISO_RET_OFF) != 0 )
$_str .= ', off: ' . $_val['off']; //词条偏移量
if ( ($_rargs & FRISO_RET_POS) != 0 )
$_str .= ', pos: ' . $_val['pos']; //词条词性
$_str .= ']';
}
$_str = "歧义和同义词:研究生命起源,混合词: 做B超检查身体x射线本质是什么今天去奇都ktv唱卡拉ok去哆啦a梦是一个动漫中的主角单位和全角: 2009年日开始大学之旅岳阳今天的气温为38.6℃, 也就是101.48℉, 英文数字: bug report chenxin619315@gmail.com or visit http://code.google.com/p/jcseg, we all admire the hacker spirit!特殊数字: ① ⑩ ⑽ ㈩.";
echo "<p>friso_split(\"" . $_str . "\")<p />";
//API:
//rb_split(string, Array, [long])
//1.string: 要被切分的字符串。
//2.Array: 配置选项使用NULL来选择默认的配置(friso.ini中的配置)。
//3.long: 可选参数,自定义切分返回选项,查看下面的$_rargs
//1.完整的配置:
//array('max_len'=>5, 'r_name'=>0, 'mix_len'=>2, 'lna_len'=>1, 'add_syn'=>1,
// 'clr_stw'=>1, 'keep_urec'=>0, 'spx_out'=>0, 'en_sseg'=> 1, 'st_minl'=>2, 'kpuncs'=>'.+#', 'mode'=>FRISO_COMPLEX);
//1.在不了解friso内核的情况下, 请不要随便更改nthreshold
//2.使用NULL来使用php.ini中指定的friso.ini文件中的配置
//2.返回选项:
//词条: FRISO_RET_WORD, 类别FRISO_RET_TYPE, 长度FRISO_RET_LENGTH, 真实长度FRISO_RET_RLEN, 偏移量FRISO_RET_OFF
//词性FRISO_RET_POS(待实现)
$_rargs = FRISO_RET_TYPE | FRISO_RET_LEN | FRISO_RET_RLEN | FRISO_RET_OFF | FRISO_RET_POS;
//$_rargs = 0;
//3.切分类别:
//CJK词条FRISO_TYP_CJK, 英中混合词(b超)FRISO_TYP_ECM中英混合词(卡拉ok)FRISO_TYP_CEM
//英文标点混合词(c++)FRISO_TYP_EPUN标点FRISO_TYP_PUN未知类别FRISO_TYP_UNK其他类别(同义词)FRISO_TYP_OTR
$_result = friso_split($_str, array('mode'=>FRISO_COMPLEX), $_rargs);
unset($_str);
foreach ( $_result as $_val )
{
$_str = $_val['word'];
if ( $_rargs != 0 ) {
$_str .= '[';
if ( ($_rargs & FRISO_RET_TYPE) != 0 )
$_str .= ', type: '.$_val['type']; //获取词条类别
if ( ($_rargs & FRISO_RET_LEN) != 0 )
$_str .= ', len: ' . $_val['len']; //词条长度
if ( ($_rargs & FRISO_RET_RLEN) != 0 )
$_str .= ', rlen: ' . $_val['rlen']; //词条真实长度
if ( ($_rargs & FRISO_RET_OFF) != 0 )
$_str .= ', off: ' . $_val['off']; //词条偏移量
if ( ($_rargs & FRISO_RET_POS) != 0 )
$_str .= ', pos: ' . $_val['pos']; //词条词性
$_str .= ']';
}
$_str .= '/&nbsp;&nbsp;&nbsp;';
echo $_str;
}
$_str .= '/&nbsp;&nbsp;&nbsp;';
echo $_str;
}
}
else echo "set charset to UTF-8 to test function friso_split.";
?>

View File

@ -4,10 +4,10 @@ ini_set('magic_quotes_gpc', 0);
//check the charset
if ( friso_charset() != "GBK" ) {
$_str = "Error: GBK charset required. <br />";
$_str .= "1. Modified friso.charset = 1 in your friso.ini .<br />";
$_str .= "2. Modified friso.lex_dir = GBK lexicon abusolute path to load your GBK lexicon. <br />";
exit($_str);
$_str = "Error: GBK charset required. <br />";
$_str .= "1. Modified friso.charset = 1 in your friso.ini .<br />";
$_str .= "2. Modified friso.lex_dir = GBK lexicon abusolute path to load your GBK lexicon. <br />";
exit($_str);
}
$text = '';
@ -15,139 +15,139 @@ $_timer = 0;
$_act = '';
$_cfg = array('mode' => FRISO_COMPLEX);
if ( isset($_POST['_act']) && ($_act = $_POST['_act']) == 'split' ) {
$text = &$_POST['text'];
$_cfg = &$_POST['config'];
if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0;
if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0;
if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0;
if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0;
if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0;
$s_time = timer();
$_ret = friso_split($text, $_cfg);
$_timer = timer() - $s_time;
$text = &$_POST['text'];
$_cfg = &$_POST['config'];
if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0;
if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0;
if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0;
if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0;
if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0;
$s_time = timer();
$_ret = friso_split($text, $_cfg);
$_timer = timer() - $s_time;
}
/**
 * Returns the current time as a float by splitting the string returned by
 * microtime() on its single space and adding the two numeric fields together.
 * The caller above subtracts two readings to time friso_split().
 *
 * NOTE(review): the two statements are listed twice; the second pair follows
 * a return and can never execute. This looks like the old and new side of a
 * whitespace-only diff shown together - confirm against the real file.
 *
 * @return float sum of the two fields of microtime()
 */
function timer() {
list($msec, $sec) = explode(' ', microtime());
return ((float)$msec + (float)$sec);
list($msec, $sec) = explode(' ', microtime());
return ((float)$msec + (float)$sec);
}
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>GBK - robbe分词测试程序 </title>
<meta http-equiv="content-type" content="text/html;charset=GBK" />
<style type="text/css">
#box {width: 1000px}
.input-text {border: 1px solid #CCC;width: 1000px;height: 180px;background-color: #FFF;
color: #555;font-size: 14px;}
.link-box {overflow: hidden;zoom:1;padding-top:10px;}
#submit-link {float:right;width:150px;height: 26px;line-height: 26px;
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
text-decoration: none;font-size: 14px;}
#info-link {float:right;width:300px;height: 26px;line-height: 26px;
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
text-decoration: none;font-size: 14px;}
.link-item {float: left;font-size: 14px;font-weight: bold;
height: 26px;line-height: 26px;width: 100px;color: #A50100;}
.title-item {height:30px;line-height: 30px;font-size: 14px;font-weight: bold;}
#cfg-box {margin-bottom: 10px;}
#cfg-box div {overflow: hidden;zoom:1;color:#555;font-size:12px;}
#cfg-box div label {float: left;width: 160px;height: 26px;line-height:26px;text-align:right;
padding-right:10px;font-size:12px;font-weight:bold;color:#555;}
.input {border: 1px solid #DDD;height: 18px;line-height: 18px;padding-left: 5px;width: 120px;
color:#555; outline: none;}
</style>
<title>GBK - robbe分词测试程序 </title>
<meta http-equiv="content-type" content="text/html;charset=GBK" />
<style type="text/css">
#box {width: 1000px}
.input-text {border: 1px solid #CCC;width: 1000px;height: 180px;background-color: #FFF;
color: #555;font-size: 14px;}
.link-box {overflow: hidden;zoom:1;padding-top:10px;}
#submit-link {float:right;width:150px;height: 26px;line-height: 26px;
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
text-decoration: none;font-size: 14px;}
#info-link {float:right;width:300px;height: 26px;line-height: 26px;
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
text-decoration: none;font-size: 14px;}
.link-item {float: left;font-size: 14px;font-weight: bold;
height: 26px;line-height: 26px;width: 100px;color: #A50100;}
.title-item {height:30px;line-height: 30px;font-size: 14px;font-weight: bold;}
#cfg-box {margin-bottom: 10px;}
#cfg-box div {overflow: hidden;zoom:1;color:#555;font-size:12px;}
#cfg-box div label {float: left;width: 160px;height: 26px;line-height:26px;text-align:right;
padding-right:10px;font-size:12px;font-weight:bold;color:#555;}
.input {border: 1px solid #DDD;height: 18px;line-height: 18px;padding-left: 5px;width: 120px;
color:#555; outline: none;}
</style>
</head>
<body>
<div id="box">
<form name="robbe" method="post" action="gbk.demo.php">
<div class="title-item">分词配置:</div>
<div id="cfg-box">
<div>
<label>最大词长: </label>
<input type="text" name="config[max_len]" value="<?=isset($_cfg['max_len'])?$_cfg['max_len']:5?>" class="input" />
</div>
<div>
<label>混合词中文词长: </label>
<input type="text" name="config[mix_len]" value="<?=isset($_cfg['mix_len'])?$_cfg['mix_len']:2?>" class="input" />
</div>
<div>
<label>英文二次切分: </label>
<input type="checkbox" name="config[en_sseg]" <?=isset($_cfg['en_sseg'])&&$_cfg['en_sseg']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>二次切分子Token最小长度: </label>
<input type="text" name="config[st_minl]" value="<?=isset($_cfg['st_minl'])?$_cfg['st_minl']:2?>" class="input" />
</div>
<div>
<label>英文Token中保留的标点: </label>
<input type="text" name="config[kpuncs]" value="<?=isset($_cfg['kpuncs'])?$_cfg['kpuncs']:'@%.#&+'?>" class="input" />
</div>
<div>
<label>同义词追加: </label>
<input type="checkbox" name="config[add_syn]" <?=isset($_cfg['add_syn'])&&$_cfg['add_syn']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>过滤停止词: </label>
<input type="checkbox" name="config[clr_stw]" <?=isset($_cfg['clr_stw'])&&$_cfg['clr_stw']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>保留未识别词: </label>
<input type="checkbox" name="config[keep_urec]" <?=isset($_cfg['keep_urec'])&&$_cfg['keep_urec']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>sphinx定制输出: </label>
<input type="checkbox" name="config[spx_out]" <?=isset($_cfg['spx_out'])&&$_cfg['spx_out']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>分词模式: </label>
<input type="radio" name="config[mode]" value="<?=RB_SMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==1?'checked="checked"':''?> />简易模式
<input type="radio" name="config[mode]" value="<?=RB_CMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==2?'checked="checked"':''?> />复杂模式
</div>
</div>
<div class="title-item">分词内容:</div>
<div class="r-item"><textarea name="text" class="input-text" id="text"><?=$text?></textarea></div>
<input type="hidden" name="_act" value="split"/>
<a href="javascript:;" onclick="do_submit();return false;" id="submit-link">robbe分词</a>
</form>
<div id="box">
<form name="robbe" method="post" action="gbk.demo.php">
<div class="title-item">分词配置:</div>
<div id="cfg-box">
<div>
<label>最大词长: </label>
<input type="text" name="config[max_len]" value="<?=isset($_cfg['max_len'])?$_cfg['max_len']:5?>" class="input" />
</div>
<div>
<label>混合词中文词长: </label>
<input type="text" name="config[mix_len]" value="<?=isset($_cfg['mix_len'])?$_cfg['mix_len']:2?>" class="input" />
</div>
<div>
<label>英文二次切分: </label>
<input type="checkbox" name="config[en_sseg]" <?=isset($_cfg['en_sseg'])&&$_cfg['en_sseg']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>二次切分子Token最小长度: </label>
<input type="text" name="config[st_minl]" value="<?=isset($_cfg['st_minl'])?$_cfg['st_minl']:2?>" class="input" />
</div>
<div>
<label>英文Token中保留的标点: </label>
<input type="text" name="config[kpuncs]" value="<?=isset($_cfg['kpuncs'])?$_cfg['kpuncs']:'@%.#&+'?>" class="input" />
</div>
<div>
<label>同义词追加: </label>
<input type="checkbox" name="config[add_syn]" <?=isset($_cfg['add_syn'])&&$_cfg['add_syn']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>过滤停止词: </label>
<input type="checkbox" name="config[clr_stw]" <?=isset($_cfg['clr_stw'])&&$_cfg['clr_stw']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>保留未识别词: </label>
<input type="checkbox" name="config[keep_urec]" <?=isset($_cfg['keep_urec'])&&$_cfg['keep_urec']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>sphinx定制输出: </label>
<input type="checkbox" name="config[spx_out]" <?=isset($_cfg['spx_out'])&&$_cfg['spx_out']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>分词模式: </label>
<input type="radio" name="config[mode]" value="<?=RB_SMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==1?'checked="checked"':''?> />简易模式
<input type="radio" name="config[mode]" value="<?=RB_CMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==2?'checked="checked"':''?> />复杂模式
</div>
</div>
<div class="title-item">分词内容:</div>
<div class="r-item"><textarea name="text" class="input-text" id="text"><?=$text?></textarea></div>
<input type="hidden" name="_act" value="split"/>
<a href="javascript:;" onclick="do_submit();return false;" id="submit-link">robbe分词</a>
</form>
<?php
if ( $_act == 'split' ) {
?>
<div class="title-item">分词结果:</div>
<div><textarea class="input-text"><?php foreach ( $_ret as $_val ) echo $_val['word'].' ';?>
</textarea></div>
<div class="link-box"><a id="info-link">
<?php
$len = strlen($text);
if ( $len >= 1048576 ) {
echo substr(($len/1048576), 0, 6).'MB';
} else if ( $len >= 1024 ) {
echo substr( ($len / 1024), 0, 6).'KB';
} else {
echo $len.'B';
}
?>
&nbsp;&nbsp;&nbsp;<?php printf("%.5f", $_timer)?>sec
</a></div>
<?php
}
?>
</div>
<?php
if ( $_act == 'split' ) {
?>
<div class="title-item">分词结果:</div>
<div><textarea class="input-text"><?php foreach ( $_ret as $_val ) echo $_val['word'].' ';?>
</textarea></div>
<div class="link-box"><a id="info-link">
<?php
$len = strlen($text);
if ( $len >= 1048576 ) {
echo substr(($len/1048576), 0, 6).'MB';
} else if ( $len >= 1024 ) {
echo substr( ($len / 1024), 0, 6).'KB';
} else {
echo $len.'B';
}
?>
&nbsp;&nbsp;&nbsp;<?php printf("%.5f", $_timer)?>sec
</a></div>
<?php
}
?>
</div>
<script type="text/javascript">
String.prototype.trim = function() {return this.replace(/^\s+|\s+$/g, '');}
// Click handler for the submit link: submits the form named "robbe", unless
// the element with id "text" holds only whitespace, in which case it returns
// without submitting.
// NOTE(review): the three statements are listed twice, so as written the form
// would be submitted twice. This looks like the old and new side of a
// whitespace-only diff shown together - confirm against the real file.
function do_submit() {
var text = document.getElementById('text');
if ( text.value.trim() == '' ) return;
document.robbe.submit();
var text = document.getElementById('text');
if ( text.value.trim() == '' ) return;
document.robbe.submit();
}
</script>
</body>

View File

@ -4,10 +4,10 @@ ini_set('magic_quotes_gpc', 0);
//charset check.
if ( friso_charset() != "UTF-8" ) {
$_str = "Error: UTF-8 charset required. <br />";
$_str .= "1. Modified friso.charset = 0 in your friso.ini .<br />";
$_str .= "2. Modified friso.lex_dir = UTF-8 lexicon abusolute path to load your UTF-8 lexicon. <br />";
exit($_str);
$_str = "Error: UTF-8 charset required. <br />";
$_str .= "1. Modified friso.charset = 0 in your friso.ini .<br />";
$_str .= "2. Modified friso.lex_dir = UTF-8 lexicon abusolute path to load your UTF-8 lexicon. <br />";
exit($_str);
}
$text = '';
@ -15,139 +15,139 @@ $_timer = 0;
$_act = '';
$_cfg = array('mode' => FRISO_COMPLEX);
if ( isset($_POST['_act']) && ($_act = $_POST['_act']) == 'split' ) {
$text = &$_POST['text'];
$_cfg = &$_POST['config'];
if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0;
if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0;
if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0;
if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0;
if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0;
$s_time = timer();
$_ret = friso_split($text, $_cfg);
$_timer = timer() - $s_time;
$text = &$_POST['text'];
$_cfg = &$_POST['config'];
if ( ! isset($_cfg['add_syn']) ) $_cfg['add_syn'] = 0;
if ( ! isset($_cfg['clr_stw']) ) $_cfg['clr_stw'] = 0;
if ( ! isset($_cfg['keep_urec']) ) $_cfg['keep_urec'] = 0;
if ( ! isset($_cfg['spx_out']) ) $_cfg['spx_out'] = 0;
if ( ! isset($_cfg['en_sseg']) ) $_cfg['en_sseg'] = 0;
$s_time = timer();
$_ret = friso_split($text, $_cfg);
$_timer = timer() - $s_time;
}
/**
 * Returns the current time as a float by splitting the string returned by
 * microtime() on its single space and adding the two numeric fields together.
 * The caller above subtracts two readings to time friso_split().
 *
 * NOTE(review): the two statements are listed twice; the second pair follows
 * a return and can never execute. This looks like the old and new side of a
 * whitespace-only diff shown together - confirm against the real file.
 *
 * @return float sum of the two fields of microtime()
 */
function timer() {
list($msec, $sec) = explode(' ', microtime());
return ((float)$msec + (float)$sec);
list($msec, $sec) = explode(' ', microtime());
return ((float)$msec + (float)$sec);
}
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>UTF8 - robbe分词测试程序</title>
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
<style type="text/css">
#box {width: 1000px}
.input-text {border: 1px solid #CCC;width: 1000px;height: 180px;background-color: #FFF;
color: #555;font-size: 14px;}
.link-box {overflow: hidden;zoom:1;padding-top:10px;}
#submit-link {float:right;width:150px;height: 26px;line-height: 26px;
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
text-decoration: none;font-size: 14px;}
#info-link {float:right;width:300px;height: 26px;line-height: 26px;
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
text-decoration: none;font-size: 14px;}
.link-item {float: left;font-size: 14px;font-weight: bold;
height: 26px;line-height: 26px;width: 100px;color: #A50100;}
.title-item {height:30px;line-height: 30px;font-size: 14px;font-weight: bold;}
#cfg-box {margin-bottom: 10px;}
#cfg-box div {overflow: hidden;zoom:1;color:#555;font-size:12px;}
#cfg-box div label {float: left;width: 160px;height: 26px;line-height:26px;text-align:right;
padding-right:10px;font-size:12px;font-weight:bold;color:#555;}
.input {border: 1px solid #DDD;height: 18px;line-height: 18px;padding-left: 5px;width: 120px;
color:#555; outline: none;}
</style>
<title>UTF8 - robbe分词测试程序</title>
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
<style type="text/css">
#box {width: 1000px}
.input-text {border: 1px solid #CCC;width: 1000px;height: 180px;background-color: #FFF;
color: #555;font-size: 14px;}
.link-box {overflow: hidden;zoom:1;padding-top:10px;}
#submit-link {float:right;width:150px;height: 26px;line-height: 26px;
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
text-decoration: none;font-size: 14px;}
#info-link {float:right;width:300px;height: 26px;line-height: 26px;
background-color: #A50100;color: #FFF;font-weight: bold;text-align: center;
text-decoration: none;font-size: 14px;}
.link-item {float: left;font-size: 14px;font-weight: bold;
height: 26px;line-height: 26px;width: 100px;color: #A50100;}
.title-item {height:30px;line-height: 30px;font-size: 14px;font-weight: bold;}
#cfg-box {margin-bottom: 10px;}
#cfg-box div {overflow: hidden;zoom:1;color:#555;font-size:12px;}
#cfg-box div label {float: left;width: 160px;height: 26px;line-height:26px;text-align:right;
padding-right:10px;font-size:12px;font-weight:bold;color:#555;}
.input {border: 1px solid #DDD;height: 18px;line-height: 18px;padding-left: 5px;width: 120px;
color:#555; outline: none;}
</style>
</head>
<body>
<div id="box">
<form name="robbe" method="post" action="utf8.demo.php">
<div class="title-item">分词配置:</div>
<div id="cfg-box">
<div>
<label>最大词长: </label>
<input type="text" name="config[max_len]" value="<?=isset($_cfg['max_len'])?$_cfg['max_len']:5?>" class="input" />
</div>
<div>
<label>混合词中文词长: </label>
<input type="text" name="config[mix_len]" value="<?=isset($_cfg['mix_len'])?$_cfg['mix_len']:2?>" class="input" />
</div>
<div>
<label>英文二次切分: </label>
<input type="checkbox" name="config[en_sseg]" <?=isset($_cfg['en_sseg'])&&$_cfg['en_sseg']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>二次切分子Token最小长度: </label>
<input type="text" name="config[st_minl]" value="<?=isset($_cfg['st_minl'])?$_cfg['st_minl']:2?>" class="input" />
</div>
<div>
<label>英文Token中保留的标点: </label>
<input type="text" name="config[kpuncs]" value="<?=isset($_cfg['kpuncs'])?$_cfg['kpuncs']:'@%.#&+'?>" class="input" />
</div>
<div>
<label>同义词追加: </label>
<input type="checkbox" name="config[add_syn]" <?=isset($_cfg['add_syn'])&&$_cfg['add_syn']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>过滤停止词: </label>
<input type="checkbox" name="config[clr_stw]" <?=isset($_cfg['clr_stw'])&&$_cfg['clr_stw']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>保留未识别词: </label>
<input type="checkbox" name="config[keep_urec]" <?=isset($_cfg['keep_urec'])&&$_cfg['keep_urec']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>sphinx定制输出: </label>
<input type="checkbox" name="config[spx_out]" <?=isset($_cfg['spx_out'])&&$_cfg['spx_out']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>分词模式: </label>
<input type="radio" name="config[mode]" value="<?=RB_SMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==1?'checked="checked"':''?> />简易模式
<input type="radio" name="config[mode]" value="<?=RB_CMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==2?'checked="checked"':''?> />复杂模式
</div>
</div>
<div class="title-item">分词内容:</div>
<div class="r-item"><textarea name="text" class="input-text" id="text"><?=$text?></textarea></div>
<input type="hidden" name="_act" value="split"/>
<a href="javascript:;" onclick="do_submit();return false;" id="submit-link">robbe分词</a>
</form>
<div id="box">
<form name="robbe" method="post" action="utf8.demo.php">
<div class="title-item">分词配置:</div>
<div id="cfg-box">
<div>
<label>最大词长: </label>
<input type="text" name="config[max_len]" value="<?=isset($_cfg['max_len'])?$_cfg['max_len']:5?>" class="input" />
</div>
<div>
<label>混合词中文词长: </label>
<input type="text" name="config[mix_len]" value="<?=isset($_cfg['mix_len'])?$_cfg['mix_len']:2?>" class="input" />
</div>
<div>
<label>英文二次切分: </label>
<input type="checkbox" name="config[en_sseg]" <?=isset($_cfg['en_sseg'])&&$_cfg['en_sseg']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>二次切分子Token最小长度: </label>
<input type="text" name="config[st_minl]" value="<?=isset($_cfg['st_minl'])?$_cfg['st_minl']:2?>" class="input" />
</div>
<div>
<label>英文Token中保留的标点: </label>
<input type="text" name="config[kpuncs]" value="<?=isset($_cfg['kpuncs'])?$_cfg['kpuncs']:'@%.#&+'?>" class="input" />
</div>
<div>
<label>同义词追加: </label>
<input type="checkbox" name="config[add_syn]" <?=isset($_cfg['add_syn'])&&$_cfg['add_syn']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>过滤停止词: </label>
<input type="checkbox" name="config[clr_stw]" <?=isset($_cfg['clr_stw'])&&$_cfg['clr_stw']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>保留未识别词: </label>
<input type="checkbox" name="config[keep_urec]" <?=isset($_cfg['keep_urec'])&&$_cfg['keep_urec']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>sphinx定制输出: </label>
<input type="checkbox" name="config[spx_out]" <?=isset($_cfg['spx_out'])&&$_cfg['spx_out']==1?'checked="checked"':''?> value="1" />
</div>
<div>
<label>分词模式: </label>
<input type="radio" name="config[mode]" value="<?=RB_SMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==1?'checked="checked"':''?> />简易模式
<input type="radio" name="config[mode]" value="<?=RB_CMODE?>" <?=isset($_cfg['mode'])&&$_cfg['mode']==2?'checked="checked"':''?> />复杂模式
</div>
</div>
<div class="title-item">分词内容:</div>
<div class="r-item"><textarea name="text" class="input-text" id="text"><?=$text?></textarea></div>
<input type="hidden" name="_act" value="split"/>
<a href="javascript:;" onclick="do_submit();return false;" id="submit-link">robbe分词</a>
</form>
<?php
if ( $_act == 'split' ) {
?>
<div class="title-item">分词结果:</div>
<div><textarea class="input-text"><?php foreach ( $_ret as $_val ) echo $_val['word'].' ';?>
</textarea></div>
<div class="link-box"><a id="info-link">
<?php
$len = strlen($text);
if ( $len >= 1048576 ) {
echo substr(($len/1048576), 0, 6).'MB';
} else if ( $len >= 1024 ) {
echo substr( ($len / 1024), 0, 6).'KB';
} else {
echo $len.'B';
}
?>
&nbsp;&nbsp;&nbsp;<?php printf("%.5f", $_timer)?>sec
</a></div>
<?php
}
?>
</div>
<?php
if ( $_act == 'split' ) {
?>
<div class="title-item">分词结果:</div>
<div><textarea class="input-text"><?php foreach ( $_ret as $_val ) echo $_val['word'].' ';?>
</textarea></div>
<div class="link-box"><a id="info-link">
<?php
$len = strlen($text);
if ( $len >= 1048576 ) {
echo substr(($len/1048576), 0, 6).'MB';
} else if ( $len >= 1024 ) {
echo substr( ($len / 1024), 0, 6).'KB';
} else {
echo $len.'B';
}
?>
&nbsp;&nbsp;&nbsp;<?php printf("%.5f", $_timer)?>sec
</a></div>
<?php
}
?>
</div>
<script type="text/javascript">
String.prototype.trim = function() {return this.replace(/^\s+|\s+$/g, '');}
// Click handler for the submit link: submits the form named "robbe", unless
// the element with id "text" holds only whitespace, in which case it returns
// without submitting.
// NOTE(review): the three statements are listed twice, so as written the form
// would be submitted twice. This looks like the old and new side of a
// whitespace-only diff shown together - confirm against the real file.
function do_submit() {
var text = document.getElementById('text');
if ( text.value.trim() == '' ) return;
document.robbe.submit();
var text = document.getElementById('text');
if ( text.value.trim() == '' ) return;
document.robbe.submit();
}
</script>
</body>

View File

@ -9,9 +9,9 @@
#include "php_friso.h"
/*
 * friso_default_conf_file: path of the friso.ini configuration file,
 * chosen at compile time. A Windows path is used when FRISO_WINNT is
 * defined, a path under /etc/friso otherwise.
 * NOTE(review): each #define is listed twice with identical text; this looks
 * like the old and new side of a whitespace-only diff - confirm.
 */
#ifdef FRISO_WINNT
# define friso_default_conf_file "c:/windows/friso.ini"
# define friso_default_conf_file "c:/windows/friso.ini"
#else
# define friso_default_conf_file "/etc/friso/friso.ini"
# define friso_default_conf_file "/etc/friso/friso.ini"
#endif
/* If you declare any globals in php_friso.h uncomment this:
@ -27,15 +27,15 @@ static int le_friso = 1;
* Every user visible function must have an entry in friso_functions[].
*/
/*
 * Function table of the extension: one PHP_FE entry per function made
 * visible to PHP scripts, closed by an all-NULL entry.
 * NOTE(review): the whole list, including the closing all-NULL entry, appears
 * twice. This looks like the old and new side of a whitespace-only diff shown
 * together - confirm against the real file; presumably only one copy belongs.
 */
const zend_function_entry friso_functions[] = {
PHP_FE(friso_split, NULL)
PHP_FE(friso_version, NULL)
PHP_FE(friso_charset, NULL)
PHP_FE(friso_dic_exist, NULL)
PHP_FE(friso_dic_get, NULL)
PHP_FE(friso_utf8_bytes, NULL)
PHP_FE(friso_utf8_ucode, NULL)
PHP_FE(friso_ucode_utf8, NULL)
{NULL, NULL, NULL} /* Must be the last line in friso_functions[] */
PHP_FE(friso_split, NULL)
PHP_FE(friso_version, NULL)
PHP_FE(friso_charset, NULL)
PHP_FE(friso_dic_exist, NULL)
PHP_FE(friso_dic_get, NULL)
PHP_FE(friso_utf8_bytes, NULL)
PHP_FE(friso_utf8_ucode, NULL)
PHP_FE(friso_ucode_utf8, NULL)
{NULL, NULL, NULL} /* Must be the last line in friso_functions[] */
};
/* }}} */
@ -43,19 +43,19 @@ const zend_function_entry friso_functions[] = {
*/
/*
 * Module descriptor for the "friso" extension. It names the module, points
 * at the friso_functions table and wires up the module/request start-up and
 * shutdown hooks plus the info hook. The header and version fields are only
 * emitted when ZEND_MODULE_API_NO >= 20010901.
 * NOTE(review): every initializer line appears twice; this looks like the old
 * and new side of a whitespace-only diff shown together - confirm against the
 * real file.
 */
zend_module_entry friso_module_entry = {
#if ZEND_MODULE_API_NO >= 20010901
STANDARD_MODULE_HEADER,
STANDARD_MODULE_HEADER,
#endif
"friso",
friso_functions,
PHP_MINIT(friso),
PHP_MSHUTDOWN(friso),
PHP_RINIT(friso), /* Replace with NULL if there's nothing to do at request start */
PHP_RSHUTDOWN(friso), /* Replace with NULL if there's nothing to do at request end */
PHP_MINFO(friso),
"friso",
friso_functions,
PHP_MINIT(friso),
PHP_MSHUTDOWN(friso),
PHP_RINIT(friso), /* Replace with NULL if there's nothing to do at request start */
PHP_RSHUTDOWN(friso), /* Replace with NULL if there's nothing to do at request end */
PHP_MINFO(friso),
#if ZEND_MODULE_API_NO >= 20010901
"0.1", /* Replace with version number for your extension */
"0.1", /* Replace with version number for your extension */
#endif
STANDARD_MODULE_PROPERTIES
STANDARD_MODULE_PROPERTIES
};
/* }}} */
@ -73,72 +73,72 @@ PHP_INI_END()
/* {{{ php_friso_globals_construct
 * Fills in the extension globals: creates a friso instance and a config
 * object, then initialises both from the file named by the "friso.ini_file"
 * INI setting.
 * NOTE(review): the return value of friso_init_from_ifile() is not checked
 * here - confirm whether it can fail and how that should be reported.
 * NOTE(review): the body appears twice, line for line. This looks like the
 * old and new side of a whitespace-only diff shown together; if both copies
 * really ran, the first friso/config pair would be overwritten - confirm
 * against the real file.
 */
static void php_friso_globals_construct(zend_friso_globals *friso_globals)
{
friso_globals->friso = friso_new();
friso_globals->config = friso_new_config();
friso_init_from_ifile(friso_globals->friso,
friso_globals->config, INI_STR("friso.ini_file"));
friso_globals->friso = friso_new();
friso_globals->config = friso_new_config();
friso_init_from_ifile(friso_globals->friso,
friso_globals->config, INI_STR("friso.ini_file"));
}
/* }}} */
/* {{{ php_friso_globals_destruct
 * Releases the config object and the friso instance held in the extension
 * globals.
 * NOTE(review): the body appears twice, line for line. This looks like the
 * old and new side of a whitespace-only diff shown together; if both copies
 * really ran, each object would be freed twice - confirm against the real
 * file.
 */
static void php_friso_globals_destruct(zend_friso_globals *friso_globals)
{
/*
* friso_free() also frees the dictionary, so there is no need to call
* friso_dic_free() on the friso_dic global separately.
*/
//friso_dic_free( friso_globals->friso_dic );
//friso_globals->friso_dic = NULL;
friso_free_config( friso_globals->config );
friso_free( friso_globals->friso );
/*
* friso_free() also frees the dictionary, so there is no need to call
* friso_dic_free() on the friso_dic global separately.
*/
//friso_dic_free( friso_globals->friso_dic );
//friso_globals->friso_dic = NULL;
friso_free_config( friso_globals->config );
friso_free( friso_globals->friso );
}
/* }}} */
/*
 * Bit flags, one bit each, naming the parts of a split result that can be
 * returned (word, type, offset, length, real length, part of speech). They
 * are exported to PHP under the same names and are meant to be OR-ed
 * together.
 * NOTE(review): each #define is listed twice with identical text; this looks
 * like the old and new side of a whitespace-only diff - confirm.
 */
#define FRISO_RET_WORD (1 << 0)
#define FRISO_RET_TYPE (1 << 1)
#define FRISO_RET_OFF (1 << 2)
#define FRISO_RET_LEN (1 << 3)
#define FRISO_RET_RLEN (1 << 4)
#define FRISO_RET_POS (1 << 5)
#define FRISO_RET_WORD (1 << 0)
#define FRISO_RET_TYPE (1 << 1)
#define FRISO_RET_OFF (1 << 2)
#define FRISO_RET_LEN (1 << 3)
#define FRISO_RET_RLEN (1 << 4)
#define FRISO_RET_POS (1 << 5)
/* {{{ PHP_MINIT_FUNCTION
*/
PHP_MINIT_FUNCTION(friso)
{
    /*
     * Module start-up: register the constants exposed to PHP scripts,
     * register the ini entries and build the module globals.
     * Every constant is case sensitive and persistent
     * (CONST_CS | CONST_PERSISTENT) and is registered exactly once.
     */

    /* segmentation modes and lexicon selectors. */
    REGISTER_LONG_CONSTANT("FRISO_SIMPLE", __FRISO_SIMPLE_MODE__, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_COMPLEX", __FRISO_COMPLEX_MODE__, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_DETECT", __FRISO_DETECT_MODE__, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_LEX_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_LEX_STOP", __LEX_STOPWORDS__, CONST_CS | CONST_PERSISTENT);

    /* return-field flags for friso_split(). */
    REGISTER_LONG_CONSTANT("FRISO_RET_WORD", FRISO_RET_WORD, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_RET_TYPE", FRISO_RET_TYPE, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_RET_OFF", FRISO_RET_OFF, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_RET_LEN", FRISO_RET_LEN, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_RET_RLEN", FRISO_RET_RLEN, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_RET_POS", FRISO_RET_POS, CONST_CS | CONST_PERSISTENT);

    /* lex type constants. */
    REGISTER_LONG_CONSTANT("FRISO_TYP_CJK", __LEX_CJK_WORDS__, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_TYP_ECM", __LEX_ECM_WORDS__, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_TYP_CEM", __LEX_CEM_WORDS__, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_TYP_EPUN", __LEX_ENPUN_WORDS__, CONST_CS | CONST_PERSISTENT);
    /*
     * NOTE(review): FRISO_TYP_PUN carries the same value as FRISO_TYP_OTR
     * (__LEX_OTHER_WORDS__) although friso_lex_t has __LEX_PUNC_WORDS__.
     * This looks like a copy/paste slip; the value is left untouched here
     * because scripts may depend on it - confirm against the tokenizer.
     */
    REGISTER_LONG_CONSTANT("FRISO_TYP_PUN", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_TYP_UNK", __LEX_UNKNOW_WORDS__, CONST_CS | CONST_PERSISTENT);
    REGISTER_LONG_CONSTANT("FRISO_TYP_OTR", __LEX_OTHER_WORDS__, CONST_CS | CONST_PERSISTENT);

    REGISTER_INI_ENTRIES();

    /* initialize the global variables (once). */
    php_friso_globals_construct( &friso_globals );

    return SUCCESS;
}
/* }}} */
@ -146,11 +146,11 @@ PHP_MINIT_FUNCTION(friso)
*/
PHP_MSHUTDOWN_FUNCTION(friso)
{
    /* Module shutdown: drop the ini entries, then release the globals. */
    UNREGISTER_INI_ENTRIES();

    /* destruct the global variables. */
    php_friso_globals_destruct( &friso_globals );

    return SUCCESS;
}
/* }}} */
@ -159,7 +159,7 @@ PHP_MSHUTDOWN_FUNCTION(friso)
*/
PHP_RINIT_FUNCTION(friso)
{
    /* Nothing to set up per request. */
    return SUCCESS;
}
/* }}} */
@ -168,22 +168,22 @@ PHP_RINIT_FUNCTION(friso)
*/
PHP_RSHUTDOWN_FUNCTION(friso)
{
    /* Nothing to tear down per request. */
    return SUCCESS;
}
/* }}} */
/* {{{ PHP_MINFO_FUNCTION
*/
PHP_MINFO_FUNCTION(friso)
{
    /* Emit the extension's info table, followed by its ini entries. */
    php_info_print_table_start();
    php_info_print_table_row(2, "Friso Support", "enabled");
    php_info_print_table_row(2, "Version", FRISO_VERSION);
    php_info_print_table_row(2, "Bug Report", "chenxin619315@gmail.com");
    php_info_print_table_row(2, "Home page", "http://code.google.com/p/friso");
    php_info_print_table_end();

    DISPLAY_INI_ENTRIES();
}
/* }}} */
@ -192,130 +192,130 @@ PHP_MINFO_FUNCTION(friso)
Return a array contains all the split result with a specified mode */
PHP_FUNCTION(friso_split)
{
char *_str = NULL, *_key;
int slen, idx, klen, rargs = 0;
int arg_count;
char *_str = NULL, *_key;
int slen, idx, klen, rargs = 0;
int arg_count;
zval *ret, *cfg, **data;
//used for multiple item return.
zval *item;
zval *ret, *cfg, **data;
//used for multiple item return.
zval *item;
HashTable *cfgArr;
HashPosition pointer;
HashTable *cfgArr;
HashPosition pointer;
friso_task_t task;
friso_config_t config = NULL, nconfig = NULL;
friso_task_t task;
friso_config_t config = NULL, nconfig = NULL;
//get the arugments from the php layer.
arg_count = ZEND_NUM_ARGS();
switch ( arg_count )
{
case 2:
if ( zend_parse_parameters(arg_count TSRMLS_CC, "sz",
&_str, &slen, &cfg) == FAILURE ) return;
break;
case 3:
if (zend_parse_parameters( arg_count TSRMLS_CC, "szl",
&_str, &slen, &cfg, &rargs) == FAILURE ) return;
break;
default:
WRONG_PARAM_COUNT;
}
//get the arugments from the php layer.
arg_count = ZEND_NUM_ARGS();
switch ( arg_count )
{
case 2:
if ( zend_parse_parameters(arg_count TSRMLS_CC, "sz",
&_str, &slen, &cfg) == FAILURE ) return;
break;
case 3:
if (zend_parse_parameters( arg_count TSRMLS_CC, "szl",
&_str, &slen, &cfg, &rargs) == FAILURE ) return;
break;
default:
WRONG_PARAM_COUNT;
}
//make sure the RB_RET_WORD will be returned.
//rargs |= FRISO_RET_WORD;
//make sure the RB_RET_WORD will be returned.
//rargs |= FRISO_RET_WORD;
//check and initialize the friso.
if ( Z_TYPE_P(cfg) != IS_NULL )
{
nconfig = friso_new_config();
memcpy(nconfig, friso_globals.config, sizeof(friso_config_entry));
//check and initialize the friso.
if ( Z_TYPE_P(cfg) != IS_NULL )
{
nconfig = friso_new_config();
memcpy(nconfig, friso_globals.config, sizeof(friso_config_entry));
//check the new setting.
cfgArr = Z_ARRVAL_P(cfg);
//zend_printf("array length: %d", zend_hash_num_elements(cfgArr));
for ( zend_hash_internal_pointer_reset_ex(cfgArr, &pointer);
zend_hash_get_current_data_ex(cfgArr, (void **)&data, &pointer) == SUCCESS;
zend_hash_move_forward_ex(cfgArr, &pointer) )
{
zend_hash_get_current_key_ex(cfgArr, &_key, &klen, NULL, 0, &pointer);
//zend_printf("key: %s, value: %d<br />", _key, (*data)->value.lval);
if ( strcmp(_key, "kpuncs") == 0 )
{
memcpy(nconfig->kpuncs, (*data)->value.str.val, (*data)->value.str.len);
nconfig->kpuncs[(*data)->value.str.len] = '\0';
}
else
{
//convert the data to long.
convert_to_long_ex(data);
if ( strcmp(_key, "max_len") == 0 )
nconfig->max_len = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "r_name") == 0 )
nconfig->r_name = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "mix_len") == 0 )
nconfig->mix_len = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "lna_len") == 0 )
nconfig->lna_len = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "add_syn") == 0 )
nconfig->add_syn = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "clr_stw") == 0 )
nconfig->clr_stw = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "add_syn") == 0 )
nconfig->add_syn = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "keep_urec") == 0 )
nconfig->keep_urec = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "spx_out") == 0 )
nconfig->spx_out = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "nthreshold") == 0 )
nconfig->nthreshold = (uint_t) (*data)->value.lval;
else if ( strcmp(_key, "mode") == 0 )
friso_set_mode(nconfig, (friso_mode_t)((*data)->value.lval));
else if ( strcmp(_key, "en_sseg") == 0 )
nconfig->en_sseg = (ushort_t) (*data)->value.lval;
else if ( strcmp(_key, "st_minl") == 0 )
nconfig->st_minl = (ushort_t) (*data)->value.lval;
}
}
}
//check the new setting.
cfgArr = Z_ARRVAL_P(cfg);
//zend_printf("array length: %d", zend_hash_num_elements(cfgArr));
for ( zend_hash_internal_pointer_reset_ex(cfgArr, &pointer);
zend_hash_get_current_data_ex(cfgArr, (void **)&data, &pointer) == SUCCESS;
zend_hash_move_forward_ex(cfgArr, &pointer) )
{
zend_hash_get_current_key_ex(cfgArr, &_key, &klen, NULL, 0, &pointer);
//zend_printf("key: %s, value: %d<br />", _key, (*data)->value.lval);
if ( strcmp(_key, "kpuncs") == 0 )
{
memcpy(nconfig->kpuncs, (*data)->value.str.val, (*data)->value.str.len);
nconfig->kpuncs[(*data)->value.str.len] = '\0';
}
else
{
//convert the data to long.
convert_to_long_ex(data);
if ( strcmp(_key, "max_len") == 0 )
nconfig->max_len = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "r_name") == 0 )
nconfig->r_name = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "mix_len") == 0 )
nconfig->mix_len = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "lna_len") == 0 )
nconfig->lna_len = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "add_syn") == 0 )
nconfig->add_syn = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "clr_stw") == 0 )
nconfig->clr_stw = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "add_syn") == 0 )
nconfig->add_syn = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "keep_urec") == 0 )
nconfig->keep_urec = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "spx_out") == 0 )
nconfig->spx_out = (ushort_t)(*data)->value.lval;
else if ( strcmp(_key, "nthreshold") == 0 )
nconfig->nthreshold = (uint_t) (*data)->value.lval;
else if ( strcmp(_key, "mode") == 0 )
friso_set_mode(nconfig, (friso_mode_t)((*data)->value.lval));
else if ( strcmp(_key, "en_sseg") == 0 )
nconfig->en_sseg = (ushort_t) (*data)->value.lval;
else if ( strcmp(_key, "st_minl") == 0 )
nconfig->st_minl = (ushort_t) (*data)->value.lval;
}
}
}
//initialize the array.
MAKE_STD_ZVAL( ret );
array_init( ret );
config = ( nconfig == NULL ) ? friso_globals.config : nconfig;
//initialize the array.
MAKE_STD_ZVAL( ret );
array_init( ret );
config = ( nconfig == NULL ) ? friso_globals.config : nconfig;
//create a new friso task.
task = friso_new_task();
idx = 0;
friso_set_text(task, _str);
while ( config->next_token( friso_globals.friso, config, task ) != NULL )
{
MAKE_STD_ZVAL(item);
array_init(item);
add_assoc_string(item, "word", task->token->word, 1);
//check the append of type
if ( (rargs & FRISO_RET_TYPE) != 0 )
add_assoc_long(item, "type", task->token->type);
if ( (rargs & FRISO_RET_LEN) != 0 )
add_assoc_long(item, "len", task->token->length);
if ( (rargs & FRISO_RET_RLEN) != 0 )
add_assoc_long(item, "rlen", task->token->rlen);
if ( (rargs & FRISO_RET_OFF) != 0 )
add_assoc_long(item, "off", task->token->offset);
if ( (rargs & FRISO_RET_POS) != 0 )
add_assoc_stringl(item, "pos", &task->token->pos, 1, 1);
//append the sub result.
add_index_zval( ret, idx++, item );
}
//create a new friso task.
task = friso_new_task();
idx = 0;
friso_set_text(task, _str);
while ( config->next_token( friso_globals.friso, config, task ) != NULL )
{
MAKE_STD_ZVAL(item);
array_init(item);
add_assoc_string(item, "word", task->token->word, 1);
//check the append of type
if ( (rargs & FRISO_RET_TYPE) != 0 )
add_assoc_long(item, "type", task->token->type);
if ( (rargs & FRISO_RET_LEN) != 0 )
add_assoc_long(item, "len", task->token->length);
if ( (rargs & FRISO_RET_RLEN) != 0 )
add_assoc_long(item, "rlen", task->token->rlen);
if ( (rargs & FRISO_RET_OFF) != 0 )
add_assoc_long(item, "off", task->token->offset);
if ( (rargs & FRISO_RET_POS) != 0 )
add_assoc_stringl(item, "pos", &task->token->pos, 1, 1);
//append the sub result.
add_index_zval( ret, idx++, item );
}
//free the friso task.
friso_free_task(task);
if ( nconfig != NULL ) friso_free_config(nconfig);
//free the friso task.
friso_free_task(task);
if ( nconfig != NULL ) friso_free_config(nconfig);
//RETURN_ZVAL( ret, 0, 0);
*( return_value ) = *( ret );
//RETURN_ZVAL( ret, 0, 0);
*( return_value ) = *( ret );
}
/* }}} */
@ -323,7 +323,7 @@ PHP_FUNCTION(friso_split)
Return the current version of Friso. */
PHP_FUNCTION(friso_version)
{
    /* Returns a copy of the FRISO_VERSION string. */
    RETURN_STRINGL(FRISO_VERSION, strlen(FRISO_VERSION), 1);
}
/* }}} */
@ -331,8 +331,8 @@ PHP_FUNCTION(friso_version)
Return the current charset of friso. */
PHP_FUNCTION(friso_charset)
{
    /* Returns "UTF-8" when the instance charset is FRISO_UTF8, else "GBK". */
    char *charset = friso_globals.friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK";
    RETURN_STRINGL(charset, strlen(charset), 1);
}
/* }}} */
@ -340,23 +340,23 @@ PHP_FUNCTION(friso_charset)
Return a bool to confirm that the given str is a word in a specified dictionary. */
PHP_FUNCTION(friso_dic_exist)
{
    /*
     * friso_dic_exist(long type, string word)
     * Returns the result of friso_dic_match() for word in lexicon type,
     * as a bool; false when no dictionary is loaded.
     */
    char *word = NULL;
    int wlen;
    long type;

    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) {
        return;
    }

    if ( friso_globals.friso->dic == NULL ) {
        RETURN_BOOL(0);
    }

    /* an out-of-range lexicon index falls back to the CJK lexicon. */
    if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ ) {
        type = __LEX_CJK_WORDS__;
    }

    wlen = friso_dic_match( friso_globals.friso->dic, type, word );

    RETURN_BOOL(wlen);
}
/* }}} */
@ -364,38 +364,38 @@ PHP_FUNCTION(friso_dic_exist)
Return a array contains all the information of the given word.*/
PHP_FUNCTION(friso_dic_get)
{
    /*
     * friso_dic_get(long type, string word)
     * Returns array("length" => ..., "freq" => ...) for the lexicon entry
     * of word, or false when no dictionary is loaded or nothing is found.
     */
    char *word = NULL;
    int wlen;
    long type;
    lex_entry_t e;

    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ls", &type, &word, &wlen) == FAILURE) {
        return;
    }

    /* check the dictionary. */
    if ( friso_globals.friso->dic == NULL ) {
        RETURN_BOOL(0);
    }

    /* an out-of-range lexicon index falls back to the CJK lexicon. */
    if ( type < 0 || type >= __FRISO_LEXICON_LENGTH__ ) {
        type = __LEX_CJK_WORDS__;
    }

    e = friso_dic_get( friso_globals.friso->dic, type, word );
    if ( e == NULL ) {
        RETURN_BOOL(0);
    }

    /* the array is created only on a hit, directly in return_value. */
    array_init( return_value );
    add_assoc_long( return_value, "length", e->length );
    add_assoc_long( return_value, "freq", e->fre );
}
/* }}} */
@ -403,17 +403,17 @@ PHP_FUNCTION(friso_dic_get)
Return the bytes that the utf-8 char takes.*/
PHP_FUNCTION(friso_utf8_bytes)
{
    /*
     * friso_utf8_bytes(string word)
     * Returns get_utf8_bytes() applied to the first byte of word.
     */
    char *word = NULL;
    int wlen, _bytes;

    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) {
        return;
    }

    if ( word == NULL ) {
        RETURN_LONG(0);
    }

    _bytes = get_utf8_bytes( word[0] );

    RETURN_LONG(_bytes);
}
/* }}} */
@ -421,16 +421,16 @@ PHP_FUNCTION(friso_utf8_bytes)
Return the unicode of the given utf-8 char.*/
PHP_FUNCTION(friso_utf8_ucode)
{
    /*
     * friso_utf8_ucode(string word)
     * Returns get_utf8_unicode(word) as a long.
     */
    char *word = NULL;
    int wlen, _ucode;

    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &word, &wlen) == FAILURE) {
        return;
    }

    _ucode = get_utf8_unicode( word );

    RETURN_LONG(_ucode);
}
/* }}} */
@ -438,18 +438,18 @@ PHP_FUNCTION(friso_utf8_ucode)
Return char that the a unicode pointed to.*/
PHP_FUNCTION(friso_ucode_utf8)
{
    /*
     * friso_ucode_utf8(long ucode)
     * Encodes ucode with unicode_to_utf8() and returns the bytes written
     * as a string.
     */
    /* "l" stores a long, so receive it in a long rather than in a pointer. */
    long ucode = 0;
    int _bytes;
    /* NOTE(review): assumes unicode_to_utf8() writes at most 6 bytes - confirm. */
    char word[7];

    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &ucode ) == FAILURE) {
        return;
    }

    _bytes = unicode_to_utf8( ( size_t ) ucode, word );
    word[_bytes] = '\0';

    RETURN_STRINGL( word, _bytes, 1 );
}
/* }}} */

View File

@ -2,7 +2,7 @@
$br = (php_sapi_name() == "cli")? "":"<br>";
if(!extension_loaded('friso')) {
dl('friso.' . PHP_SHLIB_SUFFIX);
dl('friso.' . PHP_SHLIB_SUFFIX);
}
$module = 'friso';
$functions = get_extension_funcs($module);
@ -13,9 +13,9 @@ foreach($functions as $func) {
echo "$br\n";
$function = 'confirm_' . $module . '_compiled';
if (extension_loaded($module)) {
$str = $function($module);
$str = $function($module);
} else {
$str = "Module $module is not compiled into PHP";
$str = "Module $module is not compiled into PHP";
}
echo "$str\n";
?>

View File

@ -6,11 +6,11 @@ extern zend_module_entry friso_module_entry;
#define phpext_friso_ptr &friso_module_entry
/*
 * PHP_FRISO_API: export decoration for the extension's public symbols.
 * dllexport on Windows builds, default visibility on GCC >= 4, empty
 * everywhere else.
 */
#ifdef PHP_WIN32
#   define PHP_FRISO_API __declspec(dllexport)
#elif defined(__GNUC__) && __GNUC__ >= 4
#   define PHP_FRISO_API __attribute__ ((visibility("default")))
#else
#   define PHP_FRISO_API
#endif
#ifdef ZTS
@ -36,12 +36,12 @@ PHP_FUNCTION(friso_utf8_ucode);
PHP_FUNCTION(friso_ucode_utf8);
/*
Declare any global variables you may need between the BEGIN
and END macros here:
Declare any global variables you may need between the BEGIN
and END macros here:
ZEND_BEGIN_MODULE_GLOBALS(friso)
long global_value;
char *global_string;
long global_value;
char *global_string;
ZEND_END_MODULE_GLOBALS(friso)
*/
@ -66,5 +66,5 @@ typedef struct {
#define FRISO_G(v) (friso_globals.v)
#endif
#endif /* PHP_FRISO_H */
#endif /* PHP_FRISO_H */

View File

@ -6,14 +6,14 @@ Check for friso presence
<?php
echo "friso extension is available";
/*
you can add regression tests for your extension here
you can add regression tests for your extension here
the output of your test code has to be equal to the
text in the --EXPECT-- section below for the tests
to pass, differences between the output and the
expected text are interpreted as failure
see php5/README.TESTING for further information on
see php5/README.TESTING for further information on
writing regression tests
*/
?>

File diff suppressed because it is too large Load Diff

View File

@ -1,8 +1,8 @@
/*
* main interface file for friso - free soul.
* you could modify it and re-release it but never for commercial use.
* you could modify it and re-release it but never for commercial use.
*
* @author chenxin <chenxin619315@gmail.com>
* @author chenxin <chenxin619315@gmail.com>
*/
#ifndef _friso_h
#define _friso_h
@ -15,11 +15,11 @@
#define friso_version() FRISO_VERSION
/*
 * Built-in defaults. DEFAULT_SEGMENT_MODE (2) equals
 * __FRISO_COMPLEX_MODE__.
 * NOTE(review): by their names the others presumably seed max_len,
 * mix_len, lna_len and nthreshold of the configuration - confirm in
 * friso_new_config().
 */
#define DEFAULT_SEGMENT_LENGTH  5
#define DEFAULT_MIX_LENGTH      2
#define DEFAULT_LNA_LENGTH      1
#define DEFAULT_NTHRESHOLD      1000000
#define DEFAULT_SEGMENT_MODE    2
/*
* Type: friso_lex_t
@ -29,8 +29,8 @@
typedef enum {
__LEX_CJK_WORDS__ = 0,
__LEX_CJK_UNITS__ = 1,
__LEX_ECM_WORDS__ = 2, //english and chinese mixed words.
__LEX_CEM_WORDS__ = 3, //chinese and english mixed words.
__LEX_ECM_WORDS__ = 2, //english and chinese mixed words.
__LEX_CEM_WORDS__ = 3, //chinese and english mixed words.
__LEX_CN_LNAME__ = 4,
__LEX_CN_SNAME__ = 5,
__LEX_CN_DNAME1__ = 6,
@ -41,8 +41,8 @@ typedef enum {
__LEX_EN_WORDS__ = 11,
__LEX_OTHER_WORDS__ = 15,
__LEX_NCSYN_WORDS__ = 16,
__LEX_PUNC_WORDS__ = 17, //punctuations
__LEX_UNKNOW_WORDS__ = 18 //unrecognized words.
__LEX_PUNC_WORDS__ = 17, //punctuations
__LEX_UNKNOW_WORDS__ = 18 //unrecognized words.
} friso_lex_t;
typedef friso_hash_t * friso_dic_t;
@ -51,8 +51,8 @@ typedef friso_hash_t * friso_dic_t;
//charset that Friso now support.
typedef enum {
    FRISO_UTF8  = 0,    //UTF-8
    FRISO_GBK   = 1     //GBK
} friso_charset_t;
/*
@ -61,15 +61,15 @@ typedef enum {
* used to identify the mode that friso uses.
*/
typedef enum {
    __FRISO_SIMPLE_MODE__   = 1,
    __FRISO_COMPLEX_MODE__  = 2,
    __FRISO_DETECT_MODE__   = 3
} friso_mode_t;
/* friso entry.*/
typedef struct {
    friso_dic_t dic;            //friso dictionary
    friso_charset_t charset;    //project charset.
} friso_entry;
typedef friso_entry * friso_t;
@ -80,26 +80,26 @@ typedef friso_entry * friso_t;
* -------------------
* This type used to represent the lexicon entry struct.
*/
/*
 * Control bit kept in lex_entry_cdt.ctrlMask: append synonym words.
 * The helpers take a pointer to the entry; the argument and the whole
 * expansion are parenthesized so any pointer expression can be passed.
 */
#define _LEX_APPENSYN_MASK      (1 << 0)
#define lex_appensyn_open(e)    ((e)->ctrlMask |= _LEX_APPENSYN_MASK)
#define lex_appensyn_close(e)   ((e)->ctrlMask &= ~_LEX_APPENSYN_MASK)
#define lex_appensyn_check(e)   (((e)->ctrlMask & _LEX_APPENSYN_MASK) != 0)
typedef struct {
    uchar_t length;         //the length of the token (after friso's conversion).
    uchar_t rlen;           //the real length of the token (before any conversion).
    /*
     * the type of the lexicon item.
     * available values are the elements of the friso_lex_t enum;
     * if it is __LEX_OTHER_WORDS__, the entry has to be freed after use.
     */
    uchar_t type;
    uchar_t ctrlMask;       //function control mask, e.g. append the synonym words.
    uint_t offset;          //offset index.
    fstring word;
    //fstring py;           //pinyin of the word.(invalid)
    friso_array_t syn;      //synonym words.
    friso_array_t pos;      //part of speech.
    uint_t fre;             //single word frequency.
} lex_entry_cdt;
typedef lex_entry_cdt * lex_entry_t;
@ -108,11 +108,11 @@ typedef lex_entry_cdt * lex_entry_t;
#define __HITS_WORD_LENGTH__ 64
typedef struct {
    uchar_t type;       //type of the word. (item of friso_lex_t)
    uchar_t length;     //length of the token.
    uchar_t rlen;       //the real length of the token (in the original string).
    char pos;           //part of speech.
    int offset;         //start offset of the word.
    char word[__HITS_WORD_LENGTH__];
    //char py[0];
} friso_token_entry;
@ -122,25 +122,25 @@ typedef friso_token_entry * friso_token_t;
/*
* Type: friso_task_entry
* This type used to represent the current segmentation content.
* like the text to split, the current index, the token buffer, etc.
*/
//action control mask for #FRISO_TASK_T#.
/*
 * Control bits kept in friso_task_entry.ctrlMask.
 * The task_ssseg_* helpers take a pointer to the task; the argument and
 * the whole expansion are parenthesized so any pointer expression can
 * be passed.
 */
#define _TASK_CHECK_CF_MASK     (1 << 0)    //Whether to check the chinese fraction.
#define _TASK_START_SS_MASK     (1 << 1)    //Whether to start the secondary segmentation.
#define task_ssseg_open(task)   ((task)->ctrlMask |= _TASK_START_SS_MASK)
#define task_ssseg_close(task)  ((task)->ctrlMask &= ~_TASK_START_SS_MASK)
#define task_ssseg_check(task)  (((task)->ctrlMask & _TASK_START_SS_MASK) != 0)
typedef struct {
    fstring text;           //text to tokenize
    uint_t idx;             //start offset index.
    uint_t length;          //length of the text.
    uint_t bytes;           //latest word bytes in C.
    uint_t unicode;         //latest word unicode number.
    uint_t ctrlMask;        //action control mask.
    friso_link_t pool;      //task pool.
    string_buffer_t sbuf;   //string buffer.
    friso_token_t token;    //token result token;
    char buffer[7];         //word buffer. (1-6 bytes for an utf-8 word in C).
} friso_task_entry;
typedef friso_task_entry * friso_task_t;
@ -151,23 +151,23 @@ typedef friso_task_entry * friso_task_t;
//typedef friso_token_t ( * friso_next_hit_fn ) ( friso_t, void *, friso_task_t );
//typedef lex_entry_t ( * friso_next_lex_fn ) ( friso_t, void *, friso_task_t );
struct friso_config_struct {
    ushort_t max_len;       //the max match length (4 - 7).
    ushort_t r_name;        //1 to open chinese name recognition, 0 to close it.
    ushort_t mix_len;       //the max length for the CJK words in a mix string.
    ushort_t lna_len;       //the max length for the chinese last name adron.
    ushort_t add_syn;       //append synonyms tokenizer words.
    ushort_t clr_stw;       //clear the stopwords.
    ushort_t keep_urec;     //keep the unrecognized words.
    ushort_t spx_out;       //use sphinx output customize.
    ushort_t en_sseg;       //start the secondary segmentation.
    ushort_t st_minl;       //min length of the secondary segmentation token.
    uint_t nthreshold;      //the threshold value for a char to make up a chinese name.
    friso_mode_t mode;      //Complex mode or simple mode

    //pointer to the function to get the next token
    friso_token_t (*next_token) (friso_t, struct friso_config_struct *, friso_task_t);
    //pointer to the function to get the next cjk lex_entry_t
    lex_entry_t (*next_cjk ) (friso_t, struct friso_config_struct *, friso_task_t);

    char kpuncs[_FRISO_KEEP_PUNC_LEN]; //keep punctuations buffer.
};
@ -181,7 +181,7 @@ typedef friso_config_entry * friso_config_t;
* Usage: vars = friso_new( void );
* --------------------------------
* This function used to create a new empty friso friso_t;
* with default value.
* with default value.
*/
FRISO_API friso_t friso_new( void );
@ -202,7 +202,7 @@ FRISO_API void friso_free( friso_t );
* Usage: dic = friso_set_dic( vars, dic );
* ----------------------------------------
* This function is used to set the dictionary for friso.
* and friso_dic_t is the pointer of a hash table array.
*/
//FRISO_API void friso_set_dic( friso_t, friso_dic_t );
#define friso_set_dic(friso, dic)\
@ -272,7 +272,7 @@ FRISO_API lex_entry_t next_complex_cjk( friso_t, friso_config_t, friso_task_t );
* Usage: word = next_mmseg_token( vars, seg );
* --------------------------------------
* This function is used to get next word that friso segmented
* with a split mode of __FRISO_SIMPLE_MODE__ or __FRISO_COMPLEX_MODE__
* with a split mode of __FRISO_SIMPLE_MODE__ or __FRISO_COMPLEX_MODE__
*/
FRISO_API friso_token_t next_mmseg_token( friso_t, friso_config_t, friso_task_t );
@ -313,14 +313,14 @@ FRISO_API void free_lex_entry( lex_entry_t );
* Usage: friso_dic_load( friso, friso_lex_t, path, length );
* --------------------------------------------------
* This function is used to load dictionary from a given path.
* no length limit when length less than 0.
* no length limit when length less than 0.
*/
FRISO_API void friso_dic_load( friso_t, friso_config_t,
friso_lex_t, fstring, uint_t );
friso_lex_t, fstring, uint_t );
/*
* load the lexicon configuration file.
* and load all the valid lexicon from the conf file.
* and load all the valid lexicon from the conf file.
*/
FRISO_API void friso_dic_load_from_ifile( friso_t, friso_config_t, fstring, uint_t );

View File

@ -16,22 +16,22 @@
//yat, just take it as this way, 99 percent you will find no problem
/*
 * Platform selection: FRISO_WINNT when one of the Windows macros is
 * defined, FRISO_LINUX for everything else.
 */
#if ( defined(_WIN32) || defined(_WINDOWS_) || defined(__WINDOWS_) )
#   define FRISO_WINNT
#else
#   define FRISO_LINUX
#endif

/*
 * FRISO_API decorates the public declarations, __STATIC_API__ the
 * file-local helpers.
 */
#ifdef FRISO_WINNT
#   define FRISO_API extern __declspec(dllexport)
#   define __STATIC_API__ static
#else
/*platform shared library statement :: unix*/
#   define FRISO_API extern
#   define __STATIC_API__ static inline
#endif
/*
 * Report an allocation failure and terminate the process.
 * The expansion is two statements and already ends with ';', so use it
 * inside a braced block, never as the body of an unbraced if/else.
 */
#define ___ALLOCATION_ERROR___ \
    printf("Unable to do the memory allocation, program will now exit\n" ); \
    exit(1);
#define print(str) printf("%s", str )
@ -39,12 +39,12 @@ exit(1);
/*
* memory allocation macro definition.
* because code running inside PHP should use emalloc, ecalloc, etc.,
* you can redefine these macros to better fit the PHP environment.
*/
/* Allocation hooks: thin wrappers over calloc, malloc and free. */
#define FRISO_CALLOC(_bytes, _blocks)   calloc(_bytes, _blocks)
#define FRISO_MALLOC(_bytes)            malloc(_bytes)
#define FRISO_FREE( _ptr )              free( _ptr )
typedef unsigned short ushort_t;
typedef unsigned char uchar_t;
@ -74,7 +74,7 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str );
/*
* this function will copy the chars that the fstring pointed.
* to the buffer.
* to the buffer.
* this may cause the resize action of the buffer.
*/
FRISO_API void string_buffer_append( string_buffer_t, fstring );
@ -88,21 +88,21 @@ FRISO_API fstring string_buffer_remove( string_buffer_t, uint_t idx, uint_t );
/*
* turn the string_buffer to a string.
* or return the buffer of the string_buffer.
* or return the buffer of the string_buffer.
*/
FRISO_API string_buffer_t string_buffer_trim( string_buffer_t );
/*
* free the given string buffer.
* this function will not free the allocation of
* string_buffer_t->buffer; it is returned to the caller, who may
* free it by calling free() once it is no longer needed.
*/
FRISO_API fstring string_buffer_devote( string_buffer_t );
/*
* clear the given fstring buffer.
* reset its buffer with 0 and reset its length to 0.
* reset its buffer with 0 and reset its length to 0.
*/
FRISO_API void string_buffer_clear( string_buffer_t );
@ -126,8 +126,8 @@ typedef string_split_entry * string_split_t;
/**
* create a new string_split_entry.
*
* @param source
* @return string_split_t;
* @param source
* @return string_split_t;
*/
FRISO_API string_split_t new_string_split( fstring, fstring );
@ -141,12 +141,12 @@ FRISO_API void free_string_split( string_split_t );
/**
* get the next split fstring, and copy the
* splited fstring into the __dst buffer .
* splited fstring into the __dst buffer .
*
* @param string_split_t
* @param __dst
* @return fstring (NULL if reach the end of the source
* or there is no more segmentation)
* @param string_split_t
* @param __dst
* @return fstring (NULL if reach the end of the source
* or there is no more segmentation)
*/
FRISO_API fstring string_split_next( string_split_t, fstring );
/* }}} */
@ -175,7 +175,7 @@ FRISO_API friso_array_t new_array_list_with_opacity( uint_t );
/*
 * free the given friso array
 * and its items array, but never the memory that the items point to.
 */
FRISO_API void free_array_list( friso_array_t );
@ -190,13 +190,13 @@ FRISO_API void *array_list_get( friso_array_t, uint_t );
/*
* set the item at a specified position.
* this will return the old value.
* this will return the old value.
*/
FRISO_API void *array_list_set( friso_array_t, uint_t, void * );
/*
* remove the given item at a specified position.
* this will return the value of the removed item.
* this will return the value of the removed item.
*/
FRISO_API void *array_list_remove( friso_array_t, uint_t );
@ -205,9 +205,9 @@ FRISO_API friso_array_t array_list_trim( friso_array_t );
/*
* clear the array list.
* this function will free all the allocations that the pointer pointed.
* but will not free the point array allocations,
* and will reset the length of it.
* this function will free all the allocations that the pointer pointed.
* but will not free the point array allocations,
* and will reset the length of it.
*/
FRISO_API friso_array_t array_list_clear( friso_array_t );
@ -300,8 +300,8 @@ FRISO_API void link_list_add_first( friso_link_t, void * );
/* {{{ hashtable interface define :: start*/
/* one key/value node of the hash table; nodes are linked through _next. */
struct hash_entry {
    fstring _key;                   //the node key
    void * _val;                    //the node value
    struct hash_entry * _next;      //link to another entry (NULL when there is none)
};
typedef struct hash_entry friso_hash_entry;
@ -319,8 +319,8 @@ typedef struct {
typedef friso_hash_cdt * friso_hash_t;
//default value for friso_hash_cdt
#define DEFAULT_LENGTH 31
#define DEFAULT_FACTOR 0.85f
#define DEFAULT_LENGTH 31
#define DEFAULT_FACTOR 0.85f
/*
* Function: new_hash_table
@ -359,7 +359,7 @@ FRISO_API int hash_exist_mapping( friso_hash_t, fstring );
* Usage: value = get_mapping_value( table, key );
* -----------------------------------------------
* this function return the value associated with the given key.
* UNDEFINED will be return if the mapping is not exists.
* UNDEFINED will be return if the mapping is not exists.
*/
FRISO_API void * hash_get_value( friso_hash_t, fstring );

View File

@ -1,6 +1,6 @@
/**
* Friso GBK about function implements source file.
* @package src/friso_GBK.c .
* @package src/friso_GBK.c .
*
* @author chenxin <chenxin619315@gmail.com>
*/
@ -12,12 +12,12 @@
/* read the next GBK word from the specified position.
*
* @return int the bytes of the current readed word.
* @return int the bytes of the current readed word.
*/
FRISO_API int gbk_next_word(
friso_task_t task,
uint_t *idx,
fstring __word )
friso_task_t task,
uint_t *idx,
fstring __word )
{
int c;
if ( *idx >= task->length ) return 0;
@ -41,26 +41,26 @@ FRISO_API int gbk_next_word(
//}
//check if the given buffer is a gbk word (ANSI string).
//    included the simplified and traditional words.
//only the first two bytes of str are examined.
//
//@param    str
//@return   int 1 for yes and 0 for not.
FRISO_API int gbk_cn_string( char *str )
{
    int c1 = (uchar_t) str[0];
    int c2 = (uchar_t) str[1];

    //GBK/2: gb2312 chinese word.
    if ( (c1 >= 0xb0 && c1 <= 0xf7)
            && (c2 >= 0xa1 && c2 <= 0xfe) ) {
        return 1;
    }

    //GBK/3: extend chinese words, trail byte 0x40 - 0xfe without 0x7f.
    if ( (c1 >= 0x81 && c1 <= 0xa0)
            && ( (c2 >= 0x40 && c2 <= 0x7e)
                || (c2 >= 0x80 && c2 <= 0xfe) ) ) {
        return 1;
    }

    //GBK/4: extend chinese words, trail byte 0x40 - 0xa0 without 0x7f.
    //the first range used to end at 0xfe, which made the second range
    //dead and let 0x7f and the 0xa1 - 0xfe trail bytes through.
    return ( (c1 >= 0xaa && c1 <= 0xfe)
            && ( (c2 >= 0x40 && c2 <= 0x7e)
                || (c2 >= 0x80 && c2 <= 0xa0) ) );
}
/*check if the given char is a ASCII letter
* include all the arabic number, letters and english puntuations.*/
* include all the arabic number, letters and english puntuations.*/
FRISO_API int gbk_halfwidth_en_char( char c )
{
int u = (uchar_t) c;
@ -69,58 +69,58 @@ FRISO_API int gbk_halfwidth_en_char( char c )
/*
 * check if the given char is a full-width latin.
 *    include the full-width arabic number, letters.
 *    but not the full-width punctuations.
 * only the first two bytes of str are examined.
 *
 * @param   str
 * @return  int 1 for yes and 0 for not.
 */
FRISO_API int gbk_fullwidth_en_char( char *str )
{
    int c1 = (uchar_t) str[0];
    int c2 = (uchar_t) str[1];

    //every full-width latin char shares the lead byte 0xA3.
    if ( c1 != 0xA3 ) return 0;

    return ( (c2 >= 0xB0 && c2 <= 0xB9)         //arabic numbers.
            || (c2 >= 0xC1 && c2 <= 0xDA)       //uppercase letters.
            || (c2 >= 0xE1 && c2 <= 0xFA) );    //lowercase letters.
}
//check if the given char is a upper case english letter.
//    included the full-width and half-width letters.
//
//@param    str
//@return   int 1 for yes and 0 for not.
FRISO_API int gbk_uppercase_letter( char *str )
{
    int c1 = (uchar_t) str[0];
    int c2 = (uchar_t) str[1];

    if ( c1 <= 0x80 ) {     //half-width: 'A' - 'Z'
        return ( c1 >= 65 && c1 <= 90 );
    }

    //full-width: lead byte 0xa3, trail byte 0xc1 - 0xda
    return ( c1 == 0xa3 && ( c2 >= 0xc1 && c2 <= 0xda ) );
}
//check if the given char is a lower case english letter.
//    included the full-width and half-width letters.
//
//@param    str
//@return   int 1 for yes and 0 for not.
FRISO_API int gbk_lowercase_letter( char *str )
{
    int c1 = (uchar_t) str[0];
    int c2 = (uchar_t) str[1];

    if ( c1 <= 0x80 ) {     //half-width: 'a' - 'z'
        return ( c1 >= 97 && c1 <= 122 );
    }

    //full-width: lead byte 0xa3, trail byte 0xe1 - 0xfa
    return ( c1 == 0xa3 && ( c2 >= 0xe1 && c2 <= 0xfa ) );
}
//check if the given char is an arabic numeric.
//    included the full-width and half-width arabic numeric.
//
//@param    str
//@return   int 1 for yes and 0 for not.
FRISO_API int gbk_numeric_letter( char *str )
{
    int c1 = (uchar_t) str[0];
    int c2 = (uchar_t) str[1];

    if ( c1 <= 0x80 ) {     //half-width: '0' - '9'
        return ( c1 >= 48 && c1 <= 57 );
    }

    //full-width: lead byte 0xa3, trail byte 0xb0 - 0xb9
    return ( ( c1 == 0xa3 ) && ( c2 >= 0xb0 && c2 <= 0xb9 ) );
}
/*
* check if the given fstring is make up with numeric chars.
* both full-width,half-width numeric is ok.
* both full-width,half-width numeric is ok.
*/
FRISO_API int gbk_numeric_string( char *str )
{
@ -130,17 +130,17 @@ FRISO_API int gbk_numeric_string( char *str )
//walk the string char by char, any non-numeric char ends the check.
while ( *s != '\0' )
{
    c1 = (uchar_t) (*s++);
    if ( c1 <= 0x80 )   //half-width
    {
        //both bounds have to be checked against c1:
        //c2 is only assigned in the full-width branch below.
        if ( c1 < 48 || c1 > 57 ) return 0;
    }
    else                //full-width
    {
        //a full-width digit is the lead byte 0xa3
        //followed by a trail byte in 0xb0 - 0xb9.
        if ( c1 != 0xa3 ) return 0;
        c2 = (uchar_t) (*s++);
        if ( c2 < 0xb0 || c2 > 0xb9 ) return 0;
    }
}
return 1;
@ -157,47 +157,47 @@ FRISO_API int gbk_decimal_string( char *str )
for ( i = 0; i < len; )
{
c1 = (uchar_t) str[i++];
//count the number of the points.
if ( c1 == 46 )
{
p++;
continue;
}
c1 = (uchar_t) str[i++];
//count the number of the points.
if ( c1 == 46 )
{
p++;
continue;
}
if ( c1 <= 0x80 ) //half-width
{
if ( c1 < 48 || c1 > 57 ) return 0;
}
else //full-width
{
if ( c1 != 0xa3 ) return 0;
c2 = (uchar_t) str[i++];
if ( c2 < 0xb0 || c2 > 0xb9 ) return 0;
}
if ( c1 <= 0x80 ) //half-width
{
if ( c1 < 48 || c1 > 57 ) return 0;
}
else //full-width
{
if ( c1 != 0xa3 ) return 0;
c2 = (uchar_t) str[i++];
if ( c2 < 0xb0 || c2 > 0xb9 ) return 0;
}
}
return (p == 1);
}
//check if the given char is a english(ASCII) letter.
//    (full-width and half-width), not the punctuation/arabic of course.
//
//@param    str
//@return   int 1 for yes and 0 for not.
FRISO_API int gbk_en_letter( char *str )
{
    int c1 = (uchar_t) str[0];
    int c2 = (uchar_t) str[1];

    if ( c1 <= 0x80 ) {     //half-width
        return ( (c1 >= 65 && c1 <= 90)             //uppercase
                || (c1 >= 97 && c1 <= 122) );       //lowercase
    }

    //full-width: lead byte 0xa3
    return ( (c1 == 0xa3)
            && ( ( c2 >= 0xc1 && c2 <= 0xda )       //uppercase
                || ( c2 >= 0xe1 && c2 <= 0xfa ) ) );    //lowercase
}
//check the given char is a whitespace or not.
// included full-width and half-width whitespace.
// included full-width and half-width whitespace.
FRISO_API int gbk_whitespace( char *str )
{
int c1 = (uchar_t) str[0];
@ -213,8 +213,8 @@ FRISO_API int gbk_letter_number( char *str )
int c1 = (uchar_t) str[0];
int c2 = (uchar_t) str[1];
return ( (c1 == 0xa2)
&& ( ( c2 >= 0xa1 && c2 <= 0xb0 ) //lowercase
|| ( c2 >= 0xf0 && c2 <= 0xfe ) ) ); //uppercase
&& ( ( c2 >= 0xa1 && c2 <= 0xb0 ) //lowercase
|| ( c2 >= 0xf0 && c2 <= 0xfe ) ) ); //uppercase
}
/*
@ -232,9 +232,9 @@ FRISO_API int gbk_en_punctuation( char c )
{
int u = (uchar_t) c;
return ( (u > 32 && u < 48)
|| ( u > 57 && u < 65 )
|| ( u > 90 && u < 97 )
|| ( u > 122 && u < 127 ) );
|| ( u > 57 && u < 65 )
|| ( u > 90 && u < 97 )
|| ( u > 122 && u < 127 ) );
}
//check the given char is a chinese punctuation.
@ -244,16 +244,16 @@ FRISO_API int gbk_cn_punctuation( char *str )
int c2 = (uchar_t) str[1];
//full-width en punctuation.
return ( (c1 == 0xa3 && (( c2 >= 0xa1 && c2 <= 0xaf )
|| ( c2 >= 0xba && c2 <= 0xc0 )
|| ( c2 >= 0xdb && c2 <= 0xe0 )
|| ( c2 >= 0xfb && c2 <= 0xfe ) ))
|| ( c2 >= 0xba && c2 <= 0xc0 )
|| ( c2 >= 0xdb && c2 <= 0xe0 )
|| ( c2 >= 0xfb && c2 <= 0xfe ) ))
//chinese punctuation.
|| (c1 == 0xa1 && ( (c2 >= 0xa1 && c2 <= 0xae)
|| ( c2 >= 0xb0 && c2 <= 0xbf ) ))
|| (c1 == 0xa1 && ( (c2 >= 0xa1 && c2 <= 0xae)
|| ( c2 >= 0xb0 && c2 <= 0xbf ) ))
//A6 area special punctuations:" "
|| (c1 == 0xa6 && (c2 >= 0xf9 && c2 <= 0xfe))
|| (c1 == 0xa6 && (c2 >= 0xf9 && c2 <= 0xfe))
//A8 area special punctuations: " ˊˋ˙–―‥‵℅ "
|| (c1 == 0xa8 && (c2 >= 0x40 && c2 <= 0x47)) );
|| (c1 == 0xa8 && (c2 >= 0x40 && c2 <= 0x47)) );
}
/* {{{
@ -269,19 +269,19 @@ FRISO_API int gbk_cn_punctuation( char *str )
//FRISO_API int gbk_keep_punctuation( char *str )
//{
// if ( __keep_punctuations_hash__ == NULL ) {
// __keep_punctuations_hash__ = new_hash_table();
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
// __keep_punctuations_hash__ = new_hash_table();
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
// }
// //check the hash.
// return hash_exist_mapping( __keep_punctuations_hash__, str );

View File

@ -1,6 +1,6 @@
/**
* Friso utf8 about function implements source file.
* @package src/friso_UTF8.c .
* @package src/friso_UTF8.c .
*
* @author chenxin <chenxin619315@gmail.com>
*/
@ -12,12 +12,12 @@
/* read the next utf-8 word from the specified position.
*
* @return int the bytes of the current readed word.
* @return int the bytes of the current readed word.
*/
FRISO_API int utf8_next_word(
friso_task_t task,
uint_t *idx,
fstring __word )
friso_task_t task,
uint_t *idx,
fstring __word )
{
if ( *idx >= task->length ) return 0;
@ -25,7 +25,7 @@ FRISO_API int utf8_next_word(
task->bytes = get_utf8_bytes( task->text[ *idx ] );
//for ( t = 0; t < task->bytes; t++ ) {
// __word[t] = task->text[ (*idx)++ ];
// __word[t] = task->text[ (*idx)++ ];
//}
//change the loop to memcpy.
@ -52,31 +52,31 @@ FRISO_API void print_char_binary( char value )
for ( t = 0; t < __CHAR_BYTES__; t++ )
{
if ( ( value & 0x80 ) == 0x80 ) {
printf("1");
} else {
printf("0");
}
value <<= 1;
if ( ( value & 0x80 ) == 0x80 ) {
printf("1");
} else {
printf("0");
}
value <<= 1;
}
}
/*
 * get the bytes of a utf-8 char from its first byte.
 *    1 is returned when the highest bit is not set (one byte ascii char),
 *    otherwise the number of leading 1 bits of the byte is returned
 *    (2 - 6 for the lead byte of a multi-byte sequence).
 *
 * @param   value   the first byte of the utf-8 char
 * @return  int
 */
FRISO_API int get_utf8_bytes( char value )
{
    //shift an unsigned copy: char may be signed and
    //left shifting a negative value is undefined behavior.
    unsigned char lead = (unsigned char) value;
    int t = 0;

    //one byte ascii char.
    if ( ( lead & 0x80 ) == 0 ) return 1;

    //count the leading 1 bits.
    for ( ; ( lead & 0x80 ) != 0; lead = (unsigned char) (lead << 1) ) {
        t++;
    }

    return t;
}
@ -94,25 +94,25 @@ FRISO_API int get_utf8_unicode( const fstring ch )
register char b1,b2,b3;
switch ( bytes ) {
case 1:
*bit = *ch;
break;
case 2:
b1 = *ch;
b2 = *(ch + 1);
case 1:
*bit = *ch;
break;
case 2:
b1 = *ch;
b2 = *(ch + 1);
*bit = (b1 << 6) + (b2 & 0x3F);
*(bit+1) = (b1 >> 2) & 0x07;
break;
case 3:
b1 = *ch;
b2 = *(ch + 1);
b3 = *(ch + 2);
*bit = (b1 << 6) + (b2 & 0x3F);
*(bit+1) = (b1 >> 2) & 0x07;
break;
case 3:
b1 = *ch;
b2 = *(ch + 1);
b3 = *(ch + 2);
*bit = (b2 << 6) + (b3 & 0x3F);
*(bit+1) = (b1 << 4) + ((b2 >> 2) & 0x0F);
break;
//ignore the ones that are larger than 3 bytes;
*bit = (b2 << 6) + (b3 & 0x3F);
*(bit+1) = (b1 << 4) + ((b2 >> 2) & 0x0F);
break;
//ignore the ones that are larger than 3 bytes;
}
return code;
@ -122,50 +122,50 @@ FRISO_API int get_utf8_unicode( const fstring ch )
FRISO_API int unicode_to_utf8( uint_t u, fstring __word )
{
if ( u <= 0x0000007F ) {
//U-00000000 - U-0000007F
//0xxxxxxx
*__word = ( u & 0x7F );
return 1;
//U-00000000 - U-0000007F
//0xxxxxxx
*__word = ( u & 0x7F );
return 1;
} else if ( u >= 0x00000080 && u <= 0x000007FF ) {
//U-00000080 - U-000007FF
//110xxxxx 10xxxxxx
*( __word + 1 ) = ( u & 0x3F) | 0x80;
*__word = ((u >> 6) & 0x1F) | 0xC0;
return 2;
//U-00000080 - U-000007FF
//110xxxxx 10xxxxxx
*( __word + 1 ) = ( u & 0x3F) | 0x80;
*__word = ((u >> 6) & 0x1F) | 0xC0;
return 2;
} else if ( u >= 0x00000800 && u <= 0x0000FFFF ) {
//U-00000800 - U-0000FFFF
//1110xxxx 10xxxxxx 10xxxxxx
*( __word + 2 ) = ( u & 0x3F) | 0x80;
*( __word + 1 ) = ((u >> 6) & 0x3F) | 0x80;
*__word = ((u >> 12) & 0x0F) | 0xE0;
return 3;
//U-00000800 - U-0000FFFF
//1110xxxx 10xxxxxx 10xxxxxx
*( __word + 2 ) = ( u & 0x3F) | 0x80;
*( __word + 1 ) = ((u >> 6) & 0x3F) | 0x80;
*__word = ((u >> 12) & 0x0F) | 0xE0;
return 3;
} else if ( u >= 0x00010000 && u <= 0x001FFFFF ) {
//U-00010000 - U-001FFFFF
//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*( __word + 3 ) = ( u & 0x3F) | 0x80;
*( __word + 2 ) = ((u >> 6) & 0x3F) | 0x80;
*( __word + 1 ) = ((u >> 12) & 0x3F) | 0x80;
*__word = ((u >> 18) & 0x07) | 0xF0;
return 4;
//U-00010000 - U-001FFFFF
//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*( __word + 3 ) = ( u & 0x3F) | 0x80;
*( __word + 2 ) = ((u >> 6) & 0x3F) | 0x80;
*( __word + 1 ) = ((u >> 12) & 0x3F) | 0x80;
*__word = ((u >> 18) & 0x07) | 0xF0;
return 4;
} else if ( u >= 0x00200000 && u <= 0x03FFFFFF ) {
//U-00200000 - U-03FFFFFF
//111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*( __word + 4 ) = ( u & 0x3F) | 0x80;
*( __word + 3 ) = ((u >> 6) & 0x3F) | 0x80;
*( __word + 2 ) = ((u >> 12) & 0x3F) | 0x80;
*( __word + 1 ) = ((u >> 18) & 0x3F) | 0x80;
*__word = ((u >> 24) & 0x03) | 0xF8;
return 5;
//U-00200000 - U-03FFFFFF
//111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*( __word + 4 ) = ( u & 0x3F) | 0x80;
*( __word + 3 ) = ((u >> 6) & 0x3F) | 0x80;
*( __word + 2 ) = ((u >> 12) & 0x3F) | 0x80;
*( __word + 1 ) = ((u >> 18) & 0x3F) | 0x80;
*__word = ((u >> 24) & 0x03) | 0xF8;
return 5;
} else if ( u >= 0x04000000 && u <= 0x7FFFFFFF ) {
//U-04000000 - U-7FFFFFFF
//1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*( __word + 5 ) = ( u & 0x3F) | 0x80;
*( __word + 4 ) = ((u >> 6) & 0x3F) | 0x80;
*( __word + 3 ) = ((u >> 12) & 0x3F) | 0x80;
*( __word + 2 ) = ((u >> 18) & 0x3F) | 0x80;
*( __word + 1 ) = ((u >> 24) & 0x3F) | 0x80;
*__word = ((u >> 30) & 0x01) | 0xFC;
return 6;
//U-04000000 - U-7FFFFFFF
//1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*( __word + 5 ) = ( u & 0x3F) | 0x80;
*( __word + 4 ) = ((u >> 6) & 0x3F) | 0x80;
*( __word + 3 ) = ((u >> 12) & 0x3F) | 0x80;
*( __word + 2 ) = ((u >> 18) & 0x3F) | 0x80;
*( __word + 1 ) = ((u >> 24) & 0x3F) | 0x80;
*__word = ((u >> 30) & 0x01) | 0xFC;
return 6;
}
return 0;
@ -173,28 +173,28 @@ FRISO_API int unicode_to_utf8( uint_t u, fstring __word )
/*
* check the given char is a CJK char or not.
* 2E80-2EFF CJK
* 2F00-2FDF
* 3000-303F CJK --ignore
* 31C0-31EF CJK
* 3200-32FF CJK --ignore.
* 3300-33FF CJK
* 3400-4DBF CJK A
* 4DC0-4DFF
* 4E00-9FBF CJK
* F900-FAFF CJK
* FE30-FE4F CJK
* FF00-FFEF ASCII --ignore (as basic latin)
* 2E80-2EFF CJK
* 2F00-2FDF
* 3000-303F CJK --ignore
* 31C0-31EF CJK
* 3200-32FF CJK --ignore.
* 3300-33FF CJK
* 3400-4DBF CJK A
* 4DC0-4DFF
* 4E00-9FBF CJK
* F900-FAFF CJK
* FE30-FE4F CJK
* FF00-FFEF ASCII --ignore (as basic latin)
*
* Japanese:
* 3040-309F
* 30A0-30FF
* 31F0-31FF
* 3040-309F
* 30A0-30FF
* 31F0-31FF
*
* Korean:
* AC00-D7AF
* 1100-11FF
* 3130-318F
* AC00-D7AF
* 1100-11FF
* 3130-318F
*
* @param ch :pointer to the char
* @return int : 1 for yes and 0 for not.
@ -211,23 +211,23 @@ FRISO_API int utf8_cjk_string( uint_t u )
//Chinese.
#ifdef FRISO_CJK_CHK_C
c = ( ( u >= 0x4E00 && u <= 0x9FBF )
|| ( u >= 0x2E80 && u <= 0x2EFF ) || ( u >= 0x2F00 && u <= 0x2FDF )
|| ( u >= 0x31C0 && u <= 0x31EF ) //|| ( u >= 0x3200 && u <= 0x32FF )
|| ( u >= 0x3300 && u <= 0x33FF ) //|| ( u >= 0x3400 && u <= 0x4DBF )
|| ( u >= 0x4DC0 && u <= 0x4DFF ) || ( u >= 0xF900 && u <= 0xFAFF )
|| ( u >= 0xFE30 && u <= 0xFE4F ) );
|| ( u >= 0x2E80 && u <= 0x2EFF ) || ( u >= 0x2F00 && u <= 0x2FDF )
|| ( u >= 0x31C0 && u <= 0x31EF ) //|| ( u >= 0x3200 && u <= 0x32FF )
|| ( u >= 0x3300 && u <= 0x33FF ) //|| ( u >= 0x3400 && u <= 0x4DBF )
|| ( u >= 0x4DC0 && u <= 0x4DFF ) || ( u >= 0xF900 && u <= 0xFAFF )
|| ( u >= 0xFE30 && u <= 0xFE4F ) );
#endif
//Japanese.
#ifdef FRISO_CJK_CHK_J
j = ( ( u >= 0x3040 && u <= 0x309F )
|| ( u >= 0x30A0 && u <= 0x30FF ) || ( u >= 0x31F0 && u <= 0x31FF ) );
|| ( u >= 0x30A0 && u <= 0x30FF ) || ( u >= 0x31F0 && u <= 0x31FF ) );
#endif
//Korean
#ifdef FRISO_CJK_CHK_K
k = ( ( u >= 0xAC00 && u <= 0xD7AF )
|| ( u >= 0x1100 && u <= 0x11FF ) || ( u >= 0x3130 && u <= 0x318F ) );
|| ( u >= 0x1100 && u <= 0x11FF ) || ( u >= 0x3130 && u <= 0x318F ) );
#endif
return ( c || j || k );
@ -235,7 +235,7 @@ FRISO_API int utf8_cjk_string( uint_t u )
/*
* check the given char is a Basic Latin letter or not.
* include all the letters and english punctuations.
* include all the letters and english punctuations.
*
* @param c
* @return int 1 for yes and 0 for not.
@ -247,21 +247,21 @@ FRISO_API int utf8_halfwidth_en_char( uint_t u )
/*
 * check the given char is a full-width latin or not.
 *    include the full-width arabic number, letters.
 *    but not the full-width punctuations.
 *
 * @param   u   the unicode code point of the char
 * @return  int 1 for yes and 0 for not.
 */
FRISO_API int utf8_fullwidth_en_char( uint_t u )
{
    if ( u >= 65296 && u <= 65305 ) return 1;   //arabic number
    if ( u >= 65313 && u <= 65338 ) return 1;   //upper case letters
    return ( u >= 65345 && u <= 65370 );        //lower case letters
}
//check the given char is a upper case letters or not.
// included the full-width and half-width letters.
// included the full-width and half-width letters.
FRISO_API int utf8_uppercase_letter( uint_t u )
{
if ( u > 65280 ) u -= 65248;
@ -269,7 +269,7 @@ FRISO_API int utf8_uppercase_letter( uint_t u )
}
//check the given char is a lower case letter or not.
//    included the full-width and half-width letters.
FRISO_API int utf8_lowercase_letter( uint_t u )
{
if ( u > 65280 ) u -= 65248;
@ -277,25 +277,25 @@ FRISO_API int utf8_lowercase_letter( uint_t u )
}
//check the given char is a numeric
//    included the full-width and half-width arabic numeric.
//
//@param    u   the unicode code point of the char
//@return   int 1 for yes and 0 for not.
FRISO_API int utf8_numeric_letter( uint_t u )
{
    //make full-width half-width.
    if ( u > 65280 ) {
        u -= 65248;
    }

    return ( u >= 48 && u <= 57 );
}
//check the given char is a english letter.(included the full-width)
//    not the punctuation of course.
//
//@param    u   the unicode code point of the char
//@return   int 1 for yes and 0 for not.
FRISO_API int utf8_en_letter( uint_t u )
{
    //make full-width half-width.
    if ( u > 65280 ) {
        u -= 65248;
    }

    return ( ( u >= 65 && u <= 90 )         //upper case
            || ( u >= 97 && u <= 122 ) );   //lower case
}
/*
* check if the given fstring is make up with numeric.
* both full-width,half-width numeric is ok.
* both full-width,half-width numeric is ok.
*
* @param str
* @return int
@ -317,22 +317,22 @@ FRISO_API int utf8_numeric_string( const fstring str )
while ( *s != '\0' )
{
//if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) {
// return 0;
//}
//if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) {
// return 0;
//}
//new implemention.
//@date 2013-10-14
bytes = 1;
if ( *s < 0 ) //full-width chars.
{
u = get_utf8_unicode(s);
bytes = get_utf8_bytes(*s);
if ( u < 65296 || u > 65305 ) return 0;
}
else if ( *s < 48 || *s > 57 ) return 0;
//new implemention.
//@date 2013-10-14
bytes = 1;
if ( *s < 0 ) //full-width chars.
{
u = get_utf8_unicode(s);
bytes = get_utf8_bytes(*s);
if ( u < 65296 || u > 65305 ) return 0;
}
else if ( *s < 48 || *s > 57 ) return 0;
s += bytes;
s += bytes;
}
return 1;
@ -347,24 +347,24 @@ FRISO_API int utf8_decimal_string( const fstring str )
for ( i = 1; i < len; bytes = 1 )
{
//count the number of char '.'
if ( str[i] == '.' )
{
i++;
p++;
continue;
}
//count the number of char '.'
if ( str[i] == '.' )
{
i++;
p++;
continue;
}
//full-width numeric.
else if ( str[i] < 0 )
{
u = get_utf8_unicode(str+i);
bytes = get_utf8_bytes(str[i]);
if ( u < 65296 || u > 65305 ) return 0;
}
else if ( str[i] < 48 || str[i] > 57 ) return 0;
//full-width numeric.
else if ( str[i] < 0 )
{
u = get_utf8_unicode(str+i);
bytes = get_utf8_bytes(str[i]);
if ( u < 65296 || u > 65305 ) return 0;
}
else if ( str[i] < 48 || str[i] > 57 ) return 0;
i += bytes;
i += bytes;
}
return (p == 1);
@ -379,7 +379,7 @@ FRISO_API int utf8_decimal_string( const fstring str )
//check the given char is a whitespace or not:
//    32 is the half-width space and 12288 is the full-width space.
//
//@param    u   the unicode code point of the char
//@return   int 1 for yes and 0 for not.
FRISO_API int utf8_whitespace( uint_t u )
{
    return ( u == 32 || u == 12288 ) ? 1 : 0;
}
@ -392,16 +392,16 @@ FRISO_API int utf8_whitespace( uint_t u )
*/
//check the given char is a half-width english punctuation, which means
//    a printable ascii char that is neither a digit nor a letter.
//    full-width punctuation is not folded to half-width here.
//
//@param    u   the unicode code point of the char
//@return   int 1 for yes and 0 for not.
FRISO_API int utf8_en_punctuation( uint_t u )
{
    if ( u > 32 && u < 48 ) return 1;       //between space and '0'
    if ( u > 57 && u < 65 ) return 1;       //between '9' and 'A'
    if ( u > 90 && u < 97 ) return 1;       //between 'Z' and 'a', added @2013-08-31
    return ( u > 122 && u < 127 );          //after 'z'
}
/*
* check the given char is a chinese punctuation.
* @date 2013-08-31 added.
* @date 2013-08-31 added.
*
* @param ch
* @return int
@ -409,17 +409,17 @@ FRISO_API int utf8_en_punctuation( uint_t u )
FRISO_API int utf8_cn_punctuation( uint_t u )
{
    //the full-width ranges below are the gaps around the
    //full-width digits (65296 - 65305), upper case letters
    //(65313 - 65338) and lower case letters (65345 - 65370).
    return ( ( u > 65280 && u < 65296 )
            || ( u > 65305 && u < 65312 )
            || ( u > 65338 && u < 65345 )
            || ( u > 65370 && u < 65382 )
            //cjk symbol and punctuation.(added 2013-09-06)
            //from http://www.unicode.org/charts/PDF/U3000.pdf
            || ( u >= 12289 && u <= 12319 ) );
}
/*
* check if the given char is a letter number in unicode.
* like ''.
* like ''.
* @param ch
* @return int
*/
@ -430,7 +430,7 @@ FRISO_API int utf8_letter_number( uint_t u )
/*
* check if the given char is a other number in unicode.
* like ''.
* like ''.
* @param ch
* @return int
*/
@ -456,19 +456,19 @@ FRISO_API int utf8_other_number( uint_t u )
//{
// if ( __keep_punctuations_hash__ == NULL )
// {
// __keep_punctuations_hash__ = new_hash_table();
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
// __keep_punctuations_hash__ = new_hash_table();
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
// }
// //check the hash.
// return hash_exist_mapping( __keep_punctuations_hash__, str );
@ -484,7 +484,7 @@ FRISO_API int utf8_other_number( uint_t u )
//FRISO_API int utf8_fullwidth_char( uint_t u )
//{
// if ( u == 12288 )
// return 1; //full-width space
// return 1; //full-width space
// //(32 - 126) ascii code
// return (u > 65280 && u <= 65406);
//}

View File

@ -1,9 +1,9 @@
/*
* friso dynamaic interface implemented functions file
* that defined in header file "friso_API.h".
* never use it for commercial use.
* that defined in header file "friso_API.h".
* never use it for commercial use.
*
* @author chenxini <chenxin619315@gmail.com>
* @author chenxini <chenxin619315@gmail.com>
*/
#include "friso_API.h"
@ -14,37 +14,37 @@
**********************************************/
//allocate a block of __blocks item pointers with every slot set to NULL.
//the program exits through ___ALLOCATION_ERROR___ when the
//allocation returns NULL.
__STATIC_API__ void **create_array_entries( uint_t __blocks )
{
    register uint_t t;
    void **block = ( void ** ) FRISO_CALLOC( sizeof( void * ), __blocks );
    if ( block == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //initialize
    for ( t = 0; t < __blocks; t++ ) {
        block[t] = NULL;
    }

    return block;
}
//resize the array. (the opacity should not be smaller than array->length)
//the first array->length items are copied to a new block of opacity
//slots, the old block is freed and the same array is returned.
__STATIC_API__ friso_array_t resize_array_list(
        friso_array_t array,
        uint_t opacity )
{
    register uint_t t;
    void **block = create_array_entries( opacity );

    //copy the stored items, the remaining slots are already NULL.
    for ( t = 0; t < array->length ; t++ ) {
        block[t] = array->items[t];
    }

    FRISO_FREE( array->items );
    array->items  = block;
    array->allocs = opacity;

    return array;
}
@ -59,154 +59,154 @@ __STATIC_API__ friso_array_t resize_array_list(
//create a new empty array list with a given opacity (allocated slots).
//the program exits through ___ALLOCATION_ERROR___ when the
//allocation returns NULL.
FRISO_API friso_array_t new_array_list_with_opacity( uint_t opacity )
{
    friso_array_t array = ( friso_array_t )
        FRISO_MALLOC( sizeof( friso_array_entry ) );
    if ( array == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //initialize
    array->items  = create_array_entries( opacity );
    array->allocs = opacity;
    array->length = 0;

    return array;
}
/*
 * free the given friso array
 *    and its items array, but never the memory that the items point to:
 *    the caller has to release the stored values itself.
 */
FRISO_API void free_array_list( friso_array_t array )
{
    //each allocation has to be released exactly once:
    //first the items array, then the array entry that owns it.
    FRISO_FREE( array->items );
    FRISO_FREE( array );
}
//add a new item to the end of the array.
//the items array grows to (length * 2 + 1) slots when it is full.
FRISO_API void array_list_add( friso_array_t array, void *value )
{
    //check the condition to resize.
    if ( array->length == array->allocs ) {
        resize_array_list( array, array->length * 2 + 1 );
    }

    array->items[array->length] = value;
    array->length++;
}
//insert a new item at a specified position.
//the items from idx on are moved one slot to the right.
//idx == array->length appends the item,
//an idx larger than array->length is ignored.
FRISO_API void array_list_insert(
        friso_array_t array,
        uint_t idx,
        void *value )
{
    register uint_t t;

    if ( idx > array->length ) return;

    //check the condition to resize the array.
    if ( array->length == array->allocs ) {
        resize_array_list( array, array->length * 2 + 1 );
    }

    //move the elements after idx, from the last one backwards.
    //t is unsigned, so it counts down to idx with 't > idx':
    //'t >= idx' is always true for idx == 0 and
    //'length - 1' wraps around for an empty array.
    for ( t = array->length; t > idx; t-- ) {
        array->items[t] = array->items[t-1];
    }

    array->items[idx] = value;
    array->length++;
}
//get the item at a specified position.
//returns NULL when idx is out of range.
FRISO_API void *array_list_get( friso_array_t array, uint_t idx )
{
    if ( idx < array->length ) {
        return array->items[idx];
    }

    return NULL;
}
//set the value of the item at a specified position.
//this will return the old value,
//or NULL (and change nothing) when idx is out of range.
FRISO_API void * array_list_set( 
        friso_array_t array, 
        uint_t idx, 
        void * value )
{
    void * oval = NULL;

    if ( idx < array->length ) {
        oval = array->items[idx];
        array->items[idx] = value;
    }

    return oval;
}
//remove the item at a specified position.
//this will return the value of the removed item,
//or NULL (and change nothing) when idx is out of range.
//the removed item itself is not freed.
FRISO_API void * array_list_remove( 
        friso_array_t array, uint_t idx )
{
    register uint_t t;
    void *oval = NULL;

    if ( idx < array->length ) {
        oval = array->items[idx];

        //move the elements after idx one slot towards the head.
        for ( t = idx; t < array->length - 1; t++ ) {
            array->items[t] = array->items[ t + 1 ];
        }

        //clear the slot that is no longer in use.
        array->items[array->length - 1] = NULL;
        array->length--;
    }

    return oval;
}
/*trim the array list: shrink the allocation to the current length.
 * returns the result of resize_array_list, or the array itself
 * when there is no spare slot to give back.*/
FRISO_API friso_array_t array_list_trim( friso_array_t array ) 
{
    if ( array->length < array->allocs ) {
        return resize_array_list( array, array->length );
    }

    return array;
}
/*
 * clear the array list.
 *     every used slot is set to NULL and the length is reset to 0.
 *     the objects the slots pointed to are NOT freed here, and the
 *     pointer array allocation is kept for reuse.
 *
 * @return the same array.
 */
FRISO_API friso_array_t array_list_clear( friso_array_t array ) 
{
    register uint_t t;

    //drop the references only, the caller still owns the objects.
    for ( t = 0; t < array->length; t++ ) {
        array->items[t] = NULL;
    }

    //attribute reset.
    array->length = 0;

    return array;
}
//get the size of the array list. (A macro definition has replaced this.)

View File

@ -1,7 +1,7 @@
/**
* friso string type check function interface,
* like english/CJK, full-width/half-width, punctuation or not.
* @see friso_UTF8.c and friso_GBK.c for detail.
* like english/CJK, full-width/half-width, punctuation or not.
* @see friso_UTF8.c and friso_GBK.c for detail.
*
* @author chenxin <chenxin619315@gmail.com>
*/
@ -16,25 +16,25 @@
* @return int (true for cn string or false)
* */
FRISO_API int friso_cn_string( 
        friso_charset_t charset, 
        friso_task_t task ) 
{
    //UTF-8 checks the decoded code point, GBK checks the raw bytes.
    if ( charset == FRISO_UTF8 ) {
        return utf8_cjk_string( task->unicode );
    } else if ( charset == FRISO_GBK ) {
        return gbk_cn_string( task->buffer );
    }

    //unknown charset.
    return 0;
}
//check if the specified word is a whitespace.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int friso_whitespace( 
        friso_charset_t charset, 
        friso_task_t task ) 
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_whitespace( task->unicode );
    } else if ( charset == FRISO_GBK ) {
        return gbk_whitespace( task->buffer );
    }

    return 0;
}
@ -52,76 +52,76 @@ FRISO_API int friso_numeric_letter(
//check if the specified word is an english letter.
//unlike the other checks this one looks at the byte at task->idx
//in the source text, not at the decoded word.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int friso_en_letter( 
        friso_charset_t charset, 
        friso_task_t task )
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_en_letter( ( uint_t ) task->text[task->idx] );
    } else if ( charset == FRISO_GBK ) {
        return gbk_en_letter( task->text + task->idx );
    }

    return 0;
}
//check if the specified word is a half-width letter.
//    punctuations are included.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int friso_halfwidth_en_char( 
        friso_charset_t charset, 
        friso_task_t task ) 
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_halfwidth_en_char( task->unicode );
    } else if ( charset == FRISO_GBK ) {
        return gbk_halfwidth_en_char( task->buffer[0] );
    }

    return 0;
}
//check if the specified word is a full-width letter.
//    full-width punctuations are not included.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int friso_fullwidth_en_char( 
        friso_charset_t charset, 
        friso_task_t task )
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_fullwidth_en_char( task->unicode );
    } else if ( charset == FRISO_GBK ) {
        return gbk_fullwidth_en_char( task->buffer );
    }

    return 0;
}
//check if the specified word is an english punctuation.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int friso_en_punctuation( 
        friso_charset_t charset, 
        friso_task_t task )
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_en_punctuation( task->unicode );
    } else if ( charset == FRISO_GBK ) {
        return gbk_en_punctuation( task->buffer[0] );
    }

    return 0;
}
//check if the specified word is a chinese punctuation.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int friso_cn_punctuation( 
        friso_charset_t charset, 
        friso_task_t task )
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_cn_punctuation( task->unicode );
    } else if ( charset == FRISO_GBK ) {
        return gbk_cn_punctuation( task->buffer );
    }

    return 0;
}
//placeholder: no check is implemented, both arguments are ignored
//and 0 is always returned.
FRISO_API int friso_letter_number( 
        friso_charset_t charset, 
        friso_task_t task )
{
    return 0;
}
//placeholder: no check is implemented, both arguments are ignored
//and 0 is always returned.
FRISO_API int friso_other_number( 
        friso_charset_t charset, 
        friso_task_t task )
{
    return 0;
}
@ -129,98 +129,98 @@ FRISO_API int friso_other_number(
//check if the word is a keep punctuation.
//@Deprecated
//FRISO_API int friso_keep_punctuation(
// friso_charset_t charset,
// friso_task_t task )
// friso_charset_t charset,
// friso_task_t task )
//{
// if ( charset == FRISO_UTF8 )
// return utf8_keep_punctuation( task->buffer );
// return utf8_keep_punctuation( task->buffer );
// else if ( charset == FRISO_GBK )
// return gbk_keep_punctuation( task->buffer );
// return gbk_keep_punctuation( task->buffer );
// return 0;
//}
//check if the specified char is an english punctuation.
//    this function is the same as friso_en_punctuation,
//    but takes the char directly instead of a task.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int is_en_punctuation( 
        friso_charset_t charset, char c )
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_en_punctuation( (uint_t) c );
    } else if ( charset == FRISO_GBK ) {
        return gbk_en_punctuation( c );
    }

    return 0;
}
//check whether the specified string is made up of numeric chars.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int friso_numeric_string( 
        friso_charset_t charset, 
        char *buffer )
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_numeric_string( buffer );
    } else if ( charset == FRISO_GBK ) {
        return gbk_numeric_string( buffer );
    }

    return 0;
}
//check whether the specified string is a decimal string.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int friso_decimal_string( 
        friso_charset_t charset, char *buffer )
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_decimal_string( buffer );
    } else if ( charset == FRISO_GBK ) {
        return gbk_decimal_string( buffer );
    }

    return 0;
}
//check if the specified char is an english uppercase letter.
//    included full-width and half-width letters.
//returns 0 for a charset other than UTF-8 or GBK.
FRISO_API int friso_uppercase_letter( 
        friso_charset_t charset, 
        friso_task_t task )
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_uppercase_letter( task->unicode );
    } else if ( charset == FRISO_GBK ) {
        return gbk_uppercase_letter( task->buffer );
    }

    return 0;
}
/* get the type of the specified char.
 *     the type will be one of the friso_enchar_t constants.
 *
 * UTF-8 uses the decoded code point (task->unicode), GBK uses the
 * first byte of task->buffer. only the printable ASCII range 32-126
 * is classified: everything else, full-width english chars included,
 * is reported as FRISO_EN_UNKNOW, as is any other charset.
 */
FRISO_API friso_enchar_t friso_enchar_type( 
        friso_charset_t charset, 
        friso_task_t task )
{
    //Unicode or ASCII.(Both UTF-8 and GBK are valid)
    //stays 0 (=> FRISO_EN_UNKNOW) for an unsupported charset.
    uint_t u = 0;

    if ( charset == FRISO_UTF8 ) {
        u = task->unicode;
    } else if ( charset == FRISO_GBK ) {
        u = (uchar_t)task->buffer[0];
    }

    //range check.
    if ( u > 126 || u < 32 )    return FRISO_EN_UNKNOW;
    if ( u == 32 )              return FRISO_EN_WHITESPACE;
    if ( u >= 48 && u <= 57  )  return FRISO_EN_NUMERIC;
    if ( u >= 65 && u <= 90  )  return FRISO_EN_LETTER;
    if ( u >= 97 && u <= 122 )  return FRISO_EN_LETTER;

    //whatever is left in 33-126 is punctuation.
    return FRISO_EN_PUNCTUATION;
}
/* get the type of the specified en char.
* the type will be the constants defined above.
* the type will be the constants defined above.
* (the char should be half-width english char only)
*/
FRISO_API friso_enchar_t get_enchar_type( char ch )
@ -228,11 +228,11 @@ FRISO_API friso_enchar_t get_enchar_type( char ch )
uint_t u = (uchar_t) ch;
//range check.
if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW;
if ( u == 32 ) return FRISO_EN_WHITESPACE;
if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC;
if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER;
if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER;
if ( u > 126 || u < 32 ) return FRISO_EN_UNKNOW;
if ( u == 32 ) return FRISO_EN_WHITESPACE;
if ( u >= 48 && u <= 57 ) return FRISO_EN_NUMERIC;
if ( u >= 65 && u <= 90 ) return FRISO_EN_LETTER;
if ( u >= 97 && u <= 122 ) return FRISO_EN_LETTER;
return FRISO_EN_PUNCTUATION;
}

View File

@ -1,9 +1,9 @@
/**
* Friso charset about function interface header file.
* @package src/friso_charset.h .
* @package src/friso_charset.h .
* Available charset for now:
* 1. UTF8 - function start with utf8
* 2. GBK - function start with gbk
* 1. UTF8 - function start with utf8
* 2. GBK - function start with gbk
*
* @author chenxin <chenxin619315@gmail.com>
*/
@ -33,11 +33,11 @@ FRISO_API int friso_numeric_letter(friso_charset_t, friso_task_t);
FRISO_API int friso_en_letter( friso_charset_t, friso_task_t );
//check if the specified word is a half-width letter.
// punctuations are included.
// punctuations are included.
FRISO_API int friso_halfwidth_en_char( friso_charset_t, friso_task_t );
//check if the specified word is a full-width letter.
// full-width punctuations are not included.
// full-width punctuations are not included.
FRISO_API int friso_fullwidth_en_char( friso_charset_t, friso_task_t );
//check if the specified word is an english punctuations.
@ -60,32 +60,32 @@ FRISO_API int friso_numeric_string( friso_charset_t, char * );
FRISO_API int friso_decimal_string( friso_charset_t, char * );
//check if the specified char is english uppercase letter.
// included full-width and half-width letters.
// included full-width and half-width letters.
FRISO_API int friso_uppercase_letter( friso_charset_t, friso_task_t );
//en char type.
//#define FRISO_EN_LETTER 0 //a-z && A-Z
//#define FRISO_EN_NUMERIC 1 //0-9
//#define FRISO_EN_PUNCTUATION 2 //english punctuations
//#define FRISO_EN_WHITESPACE 3 //whitespace
//#define FRISO_EN_UNKNOW -1 //beyond 32-122
//#define FRISO_EN_LETTER 0 //a-z && A-Z
//#define FRISO_EN_NUMERIC 1 //0-9
//#define FRISO_EN_PUNCTUATION 2 //english punctuations
//#define FRISO_EN_WHITESPACE 3 //whitespace
//#define FRISO_EN_UNKNOW -1 //beyond 32-122
//classification of a single-byte english char,
//returned by friso_enchar_type and get_enchar_type.
typedef enum {
    FRISO_EN_LETTER         = 0,    //A-Z, a-z
    FRISO_EN_NUMERIC        = 1,    //0-9
    FRISO_EN_PUNCTUATION    = 2,    //english punctuations
    FRISO_EN_WHITESPACE     = 3,    //whitespace
    FRISO_EN_UNKNOW         = -1    //unknown (beyond 32-126)
} friso_enchar_t;
/* get the type of the specified char.
* the type will be the constants defined above.
* the type will be the constants defined above.
* (include the fullwidth english char.)
*/
FRISO_API friso_enchar_t friso_enchar_type( friso_charset_t, friso_task_t );
/* get the type of the specified en char.
* the type will be the constants defined above.
* the type will be the constants defined above.
* (the char should be half-width english char only)
*/
FRISO_API friso_enchar_t get_enchar_type( char );
@ -99,7 +99,7 @@ FRISO_API friso_enchar_t get_enchar_type( char );
/* read the next utf-8 word from the specified position.
*
* @return int the number of bytes of the word just read.
* @return int the number of bytes of the word just read.
*/
FRISO_API int utf8_next_word( friso_task_t, uint_t *, fstring );
@ -116,31 +116,31 @@ FRISO_API int unicode_to_utf8( uint_t, fstring );
FRISO_API int utf8_cjk_string( uint_t ) ;
/*check whether the given char is a Basic Latin letter or not.
* includes all the letters and english punctuations.*/
* includes all the letters and english punctuations.*/
FRISO_API int utf8_halfwidth_en_char( uint_t );
/*
* check whether the given char is a full-width latin char or not.
* includes the full-width arabic numbers and letters,
* but not the full-width punctuations.
* includes the full-width arabic numbers and letters,
* but not the full-width punctuations.
*/
FRISO_API int utf8_fullwidth_en_char( uint_t );
//check the given char is a upper case letter or not.
// included all the full-width and half-width letters.
// included all the full-width and half-width letters.
FRISO_API int utf8_uppercase_letter( uint_t );
//check the given char is a lower case letter or not.
// included all the full-width and half-width letters.
// included all the full-width and half-width letters.
FRISO_API int utf8_lowercase_letter( uint_t );
//check the given char is a numeric.
// included the full-width and half-width arabic numeric.
// included the full-width and half-width arabic numeric.
FRISO_API int utf8_numeric_letter( uint_t );
/*
* check if the given fstring is made up of numeric chars.
* both full-width and half-width numerics are ok.
* both full-width and half-width numerics are ok.
*/
FRISO_API int utf8_numeric_string( char * );
@ -183,7 +183,7 @@ FRISO_API int is_en_punctuation( friso_charset_t, char );
/* read the next GBK word from the specified position.
*
* @return int the number of bytes of the word just read.
* @return int the number of bytes of the word just read.
*/
FRISO_API int gbk_next_word( friso_task_t, uint_t *, fstring );
@ -194,31 +194,31 @@ FRISO_API int get_gbk_bytes( char );
FRISO_API int gbk_cn_string( char * ) ;
/*check if the given char is an ASCII letter.
* includes all the letters and english punctuations.*/
* includes all the letters and english punctuations.*/
FRISO_API int gbk_halfwidth_en_char( char );
/*
* check if the given char is a full-width latin char.
* includes the full-width arabic numbers and letters,
* but not the full-width punctuations.
* includes the full-width arabic numbers and letters,
* but not the full-width punctuations.
*/
FRISO_API int gbk_fullwidth_en_char( char * );
//check if the given char is a upper case char.
// included all the full-width and half-width letters.
// included all the full-width and half-width letters.
FRISO_API int gbk_uppercase_letter( char * );
//check if the given char is a lower case char.
// included all the full-width and half-width letters.
// included all the full-width and half-width letters.
FRISO_API int gbk_lowercase_letter( char * );
//check if the given char is a numeric.
// included the full-width and half-width arabic numeric.
// included the full-width and half-width arabic numeric.
FRISO_API int gbk_numeric_letter( char * );
/*
* check if the given fstring is made up of numeric chars.
* both full-width and half-width numerics are ok.
* both full-width and half-width numerics are ok.
*/
FRISO_API int gbk_numeric_string( char * );
@ -248,7 +248,7 @@ FRISO_API int gbk_en_punctuation( char ) ;
FRISO_API int gbk_cn_punctuation( char * );
//cause the logic handle is the same as the utf8.
// here invoke the utf8 interface directly.
// here invoke the utf8 interface directly.
//FRISO_API int gbk_keep_punctuation( char * );
//@Deprecated
//#define gbk_keep_punctuation( str ) utf8_keep_punctuation(str)
@ -257,4 +257,4 @@ FRISO_API int gbk_cn_punctuation( char * );
//FRISO_API int gbk_fullwidth_char( char * ) ;
/* }}}*/
#endif /*end _friso_charset_h*/
#endif /*end _friso_charset_h*/

View File

@ -1,8 +1,8 @@
/*
* friso hash table implements functions
* defined in header file "friso_API.h".
* defined in header file "friso_API.h".
*
* @author chenxin <chenxin619315@gmail.com>
* @author chenxin <chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdlib.h>
@ -10,7 +10,7 @@
//-166411799L
//31 131 1331 13331 133331 ..
//31 131 1313 13131 131313 .. the best
//31 131 1313 13131 131313 .. the best
#define HASH_FACTOR 1313131
/* ************************
@ -22,7 +22,7 @@ __STATIC_API__ uint_t hash( fstring str, uint_t length )
uint_t h = 0;
while ( *str != '\0' )
h = h * HASH_FACTOR + ( *str++ );
h = h * HASH_FACTOR + ( *str++ );
return (h % length);
}
@ -32,13 +32,13 @@ __STATIC_API__ int is_prime( int n )
{
    int j;

    //nothing below 2 is prime; 2 and 3 are; other even numbers are not.
    if ( n < 2 ) 
        return 0;
    if ( n == 2 || n == 3 ) 
        return 1;
    if ( n % 2 == 0 ) 
        return 0;

    //trial division by the odd candidates up to and INCLUDING sqrt(n):
    //a strict "j * j < n" misses the squares of primes (9, 25, 49 ...).
    //"j <= n / j" is the same bound without the risk of j * j overflowing.
    for ( j = 3; j <= n / j; j += 2 ) {
        if ( n % j == 0 ) 
            return 0;
    }

    return 1;
}
@ -47,7 +47,7 @@ __STATIC_API__ int is_prime( int n )
__STATIC_API__ int next_prime( int n )
{
if ( n % 2 == 0 )
n++;
n++;
for ( ; ! is_prime( n ); n = n + 2 ) ;
return n;
@ -72,14 +72,14 @@ __STATIC_API__ int next_prime( int n )
* static hashtable function area. *
***********************************/
__STATIC_API__ hash_entry_t new_hash_entry(
fstring key,
void * value,
hash_entry_t next )
fstring key,
void * value,
hash_entry_t next )
{
hash_entry_t e = ( hash_entry_t )
FRISO_MALLOC( sizeof( friso_hash_entry ) );
FRISO_MALLOC( sizeof( friso_hash_entry ) );
if ( e == NULL ) {
___ALLOCATION_ERROR___
___ALLOCATION_ERROR___
}
//e->_key = string_copy( key );
@ -95,13 +95,13 @@ __STATIC_API__ hash_entry_t * create_hash_entries( uint_t blocks )
{
register uint_t t;
hash_entry_t *e = ( hash_entry_t * )
FRISO_CALLOC( sizeof( hash_entry_t ), blocks );
FRISO_CALLOC( sizeof( hash_entry_t ), blocks );
if ( e == NULL ) {
___ALLOCATION_ERROR___
___ALLOCATION_ERROR___
}
for ( t = 0; t < blocks; t++ ) {
e[t] = NULL;
e[t] = NULL;
}
return e;
@ -114,22 +114,22 @@ __STATIC_API__ void rebuild_hash( friso_hash_t _hash )
//find the next prime as the length of the hashtable.
uint_t t, length = next_prime( _hash->length * 2 + 1 );
hash_entry_t e, next, *_src = _hash->table, \
*table = create_hash_entries( length );
*table = create_hash_entries( length );
uint_t bucket;
//copy the nodes
for ( t = 0; t < _hash->length; t++ )
{
e = *( _src + t );
if ( e != NULL ) {
do {
next = e->_next;
bucket = hash( e->_key, length );
e->_next = table[bucket];
table[bucket] = e;
e = next;
} while ( e != NULL );
}
e = *( _src + t );
if ( e != NULL ) {
do {
next = e->_next;
bucket = hash( e->_key, length );
e->_next = table[bucket];
table[bucket] = e;
e = next;
} while ( e != NULL );
}
}
_hash->table = table;
@ -149,35 +149,35 @@ FRISO_API friso_hash_t new_hash_table( void )
{
    friso_hash_t _hash = ( friso_hash_t ) FRISO_MALLOC( sizeof ( friso_hash_cdt ) );
    if ( _hash == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //initialize the hashtable with the default length and load factor.
    _hash->length    = DEFAULT_LENGTH;
    _hash->size      = 0;
    _hash->factor    = DEFAULT_FACTOR;
    //number of mappings at which hash_put_mapping rebuilds the table.
    _hash->threshold = ( uint_t ) ( _hash->length * _hash->factor );
    _hash->table     = create_hash_entries( _hash->length );

    return _hash;
}
FRISO_API void free_hash_table(
friso_hash_t _hash,
fhash_callback_fn_t fentry_func )
friso_hash_t _hash,
fhash_callback_fn_t fentry_func )
{
register uint_t j;
hash_entry_t e, n;
for ( j = 0; j < _hash->length; j++ )
{
e = *( _hash->table + j );
for ( ; e != NULL ; ) {
n = e->_next;
if ( fentry_func != NULL ) fentry_func(e);
FRISO_FREE( e );
e = n;
}
e = *( _hash->table + j );
for ( ; e != NULL ; ) {
n = e->_next;
if ( fentry_func != NULL ) fentry_func(e);
FRISO_FREE( e );
e = n;
}
}
//free the pointer array block ( 4 * htable->length continuous bytes ).
@ -189,9 +189,9 @@ FRISO_API void free_hash_table(
//put a new mapping insite.
//the value cannot be NULL.
FRISO_API void *hash_put_mapping(
friso_hash_t _hash,
fstring key,
void * value )
friso_hash_t _hash,
fstring key,
void * value )
{
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
hash_entry_t e = *( _hash->table + bucket );
@ -200,14 +200,14 @@ FRISO_API void *hash_put_mapping(
//check the given key is already exists or not.
for ( ; e != NULL; e = e->_next )
{
if ( key == e->_key
|| ( key != NULL && e->_key != NULL
&& strcmp( key, e->_key ) == 0 ) )
{
if ( key == e->_key
|| ( key != NULL && e->_key != NULL
&& strcmp( key, e->_key ) == 0 ) )
{
oval = e->_val; //bak the old value
e->_val = value;
return oval;
}
e->_val = value;
return oval;
}
}
//put a new mapping into the hashtable.
@ -216,27 +216,27 @@ FRISO_API void *hash_put_mapping(
//check the condition to rebuild the hashtable.
if ( _hash->size >= _hash->threshold )
rebuild_hash( _hash );
rebuild_hash( _hash );
return oval;
}
//check the existence of the mapping associated with the given key.
FRISO_API int hash_exist_mapping(
friso_hash_t _hash, fstring key )
friso_hash_t _hash, fstring key )
{
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
hash_entry_t e;
for ( e = *( _hash->table + bucket );
e != NULL;
e = e->_next ) {
if ( key == e->_key
|| ( key != NULL && e->_key != NULL
&& strcmp( key, e->_key ) == 0 ))
{
return 1;
}
e != NULL;
e = e->_next ) {
if ( key == e->_key
|| ( key != NULL && e->_key != NULL
&& strcmp( key, e->_key ) == 0 ))
{
return 1;
}
}
return 0;
@ -249,14 +249,14 @@ FRISO_API void *hash_get_value( friso_hash_t _hash, fstring key )
hash_entry_t e;
for ( e = *( _hash->table + bucket );
e != NULL;
e = e->_next ) {
if ( key == e->_key
|| ( key != NULL && e->_key != NULL
&& strcmp( key, e->_key ) == 0 ))
{
return e->_val;
}
e != NULL;
e = e->_next ) {
if ( key == e->_key
|| ( key != NULL && e->_key != NULL
&& strcmp( key, e->_key ) == 0 ))
{
return e->_val;
}
}
return NULL;
@ -264,31 +264,31 @@ FRISO_API void *hash_get_value( friso_hash_t _hash, fstring key )
//remove the mapping associated with the given key.
FRISO_API hash_entry_t hash_remove_mapping(
friso_hash_t _hash, fstring key )
friso_hash_t _hash, fstring key )
{
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
hash_entry_t e, prev = NULL;
hash_entry_t b;
for ( e = *( _hash->table + bucket );
e != NULL;
prev = e, e = e->_next ) {
if ( key == e->_key
|| ( key != NULL && e->_key != NULL
&& strcmp( key, e->_key ) == 0 ) )
{
b = e;
//the node located at *( htable->table + bucket )
if ( prev == NULL ) {
_hash->table[bucket] = e->_next;
} else {
prev->_next = e->_next;
}
//printf("%s was removed\n", b->_key);
_hash->size--;
//FRISO_FREE( b );
return b;
}
e != NULL;
prev = e, e = e->_next ) {
if ( key == e->_key
|| ( key != NULL && e->_key != NULL
&& strcmp( key, e->_key ) == 0 ) )
{
b = e;
//the node located at *( htable->table + bucket )
if ( prev == NULL ) {
_hash->table[bucket] = e->_next;
} else {
prev->_next = e->_next;
}
//printf("%s was removed\n", b->_key);
_hash->size--;
//FRISO_FREE( b );
return b;
}
}
return NULL;

View File

@ -1,102 +1,102 @@
/*
* friso lexicon implemented functions.
* used to deal with the friso lexicon, like: load,remove,match...
* used to deal with the friso lexicon, like: load,remove,match...
*
* @author chenxin <chenxin619315@gmail.com>
* @author chenxin <chenxin619315@gmail.com>
*/
#include <stdlib.h>
#include <string.h>
#include "friso_API.h"
#include "friso.h"
#define __SPLIT_MAX_TOKENS__ 5
#define __LEX_FILE_DELIME__ '#'
#define __FRISO_LEX_IFILE__ "friso.lex.ini"
#define __SPLIT_MAX_TOKENS__ 5
#define __LEX_FILE_DELIME__ '#'
#define __FRISO_LEX_IFILE__ "friso.lex.ini"
//create a new lexicon:
//an array of __FRISO_LEXICON_LENGTH__ hash tables, one per lexicon type.
//release it with friso_dic_free.
FRISO_API friso_dic_t friso_dic_new() 
{
    register uint_t t;
    friso_dic_t dic = ( friso_dic_t ) FRISO_CALLOC( 
            sizeof( friso_hash_t ), __FRISO_LEXICON_LENGTH__ );
    if ( dic == NULL ) {
        ___ALLOCATION_ERROR___
    }

    for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
        dic[t] = new_hash_table();
    }

    return dic;
}
/**
 * default callback function to invoke
 *     when free the friso dictionary .
 *
 * it releases the lexicon entry stored as the value of the hash entry:
 * the word, every synonym string together with the synonym list,
 * and the entry itself.
 *
 * @date    2013-06-12
 */
__STATIC_API__ void default_fdic_callback( hash_entry_t e )
{
    register uint_t i;
    friso_array_t syn;
    lex_entry_t lex = ( lex_entry_t ) e->_val;

    //free the lex->word
    FRISO_FREE( lex->word );

    //free the lex->syn if it is not NULL:
    //free_array_list does not free the items, so they go first.
    if ( lex->syn != NULL ) {
        syn = lex->syn;
        for ( i = 0; i < syn->length; i++ ) {
            FRISO_FREE( syn->items[i] );
        }
        free_array_list( syn );
    }

    //free the e->_val
    //@date 2014-01-28 posted by mlemay@gmail.com
    FRISO_FREE( lex );
}
//free the whole lexicon: every hash table with its entries
//(through default_fdic_callback), then the table array itself.
FRISO_API void friso_dic_free( friso_dic_t dic ) 
{
    register uint_t t;

    for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
        //free the hash table
        free_hash_table( dic[t], default_fdic_callback );
    }

    FRISO_FREE( dic );
}
//create a new lexicon entry.
//word and syn are stored as given (no copy is made).
//length and type are narrowed to uchar_t when stored.
FRISO_API lex_entry_t new_lex_entry( 
        fstring word, 
        friso_array_t syn, 
        uint_t fre, 
        uint_t length, 
        uint_t type ) 
{
    lex_entry_t e = ( lex_entry_t ) 
        FRISO_MALLOC( sizeof( lex_entry_cdt ) );
    if ( e == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //initialize.
    e->word     = word;
    e->syn      = syn;              //synonym words array list.
    e->pos      = NULL;             //part of speech array list.
    //e->py     = NULL;             //set to NULL first.
    e->fre      = fre;
    e->length   = (uchar_t) length; //length
    e->rlen     = (uchar_t) length; //set to length by default.
    e->type     = (uchar_t) type;   //type
    e->ctrlMask = 0;                //control mask.
    e->offset   = -1;

    return e;
}
/**
@ -109,64 +109,64 @@ FRISO_API lex_entry_t new_lex_entry(
*/
FRISO_API void free_lex_entry( lex_entry_t e )
{
    //only the entry structure is released:
    //e->word and e->syn are left untouched.
    FRISO_FREE( e );
}
//add a new entry to the dictionary.
//the entry is created with frequency 0 and strlen(word) as its length,
//and stored under word in the hash table of the given lexicon type.
//a lex value out of range is ignored.
FRISO_API void friso_dic_add( 
        friso_dic_t dic, 
        friso_lex_t lex,
        fstring word, 
        friso_array_t syn ) 
{
    if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
        //NOTE(review): hash_put_mapping hands back the value it
        //replaced and it is dropped here - confirm who owns the old
        //entry when the same word is added twice.
        hash_put_mapping( dic[lex], word, 
                new_lex_entry( word, syn, 0, 
                    (uint_t) strlen(word), (uint_t) lex ) );
    }
}
//same as friso_dic_add, but the entry is created
//with the given frequency instead of 0.
//a lex value out of range is ignored.
FRISO_API void friso_dic_add_with_fre( 
        friso_dic_t dic, 
        friso_lex_t lex,
        fstring word, 
        friso_array_t syn, 
        uint_t frequency ) 
{
    if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
        hash_put_mapping( dic[lex], word, 
                new_lex_entry( word, syn, frequency, 
                    ( uint_t ) strlen(word), ( uint_t ) lex ) );
    }
}
/*
 * read a line from a specified stream.
 *     the newline will be cleared.
 *
 * no length is passed in, so __dst must be large enough to hold
 * the longest line of the stream plus the terminating '\0'.
 *
 * @return  __dst, or NULL when the end of the stream was reached
 *          before any char could be read.
 * @date    2012-11-24
 */
FRISO_API fstring file_get_line( fstring __dst, FILE * _stream ) 
{
    register int c;
    fstring cs;

    cs = __dst;
    while ( ( c = fgetc( _stream ) ) != EOF ) {
        if ( c == '\n' ) break;
        *cs++ = c; 
    }
    *cs = '\0';

    //an empty line is still a line: only EOF with nothing read is the end.
    return ( c == EOF && cs == __dst ) ? NULL : __dst;
}
/*
@ -174,373 +174,373 @@ FRISO_API fstring file_get_line( fstring __dst, FILE * _stream )
*/
///instead of memcpy
// Copy at most `blocks` bytes from _src into __dst, stopping early at the
// first '\0' in _src, then write a '\0' right after the copied bytes.
// __dst therefore needs room for up to blocks + 1 bytes.
// Returns __dst.
__STATIC_API__ fstring string_copy(
fstring _src,
fstring __dst,
uint_t blocks )
fstring _src,
fstring __dst,
uint_t blocks )
{
register fstring __src = _src;
register uint_t t;
register fstring __src = _src;
register uint_t t;
for ( t = 0; t < blocks; t++ ) {
if ( *__src == '\0' ) break;
__dst[t] = *__src++;
}
__dst[t] = '\0';
for ( t = 0; t < blocks; t++ ) {
if ( *__src == '\0' ) break;
__dst[t] = *__src++;
}
__dst[t] = '\0';
return __dst;
return __dst;
}
/**
* make a heap allocation of blocks + 1 bytes, copy the
* source fstring into it, and '\0'-terminate the copy.
* the caller should free the result after use.
* source fstring into it, and '\0'-terminate the copy.
* the caller should free the result after use.
*
* copying stops at the first '\0' in _src or after `blocks` bytes,
* whichever comes first.
* a NULL result from FRISO_MALLOC is handed to ___ALLOCATION_ERROR___
* (defined outside this view).
*
* @param _src source fstring
* @param blocks number of bytes to copy
* @param _src source fstring
* @param blocks number of bytes to copy
* @return the newly allocated copy
*/
__STATIC_API__ fstring string_copy_heap(
fstring _src, uint_t blocks )
fstring _src, uint_t blocks )
{
register uint_t t;
register uint_t t;
fstring str = ( fstring )
FRISO_MALLOC( blocks + 1 );
if ( str == NULL ) {
___ALLOCATION_ERROR___;
}
fstring str = ( fstring )
FRISO_MALLOC( blocks + 1 );
if ( str == NULL ) {
___ALLOCATION_ERROR___;
}
for ( t = 0; t < blocks; t++ ) {
if ( *_src == '\0' ) break;
str[t] = *_src++;
}
for ( t = 0; t < blocks; t++ ) {
if ( *_src == '\0' ) break;
str[t] = *_src++;
}
str[t] = '\0';
return str;
str[t] = '\0';
return str;
}
/*
* find the position of the first occurrence of the given char.
* the address of that char inside the fstring is returned.
* if it is not found, NULL is returned.
* the address of that char inside the fstring is returned.
* if it is not found, NULL is returned.
*
* only the strlen( __str ) bytes before the terminator are scanned,
* so the terminating '\0' itself is never matched.
*/
__STATIC_API__ fstring indexOf( fstring __str, char delimiter )
{
uint_t i, __length__;
uint_t i, __length__;
__length__ = strlen( __str );
for ( i = 0; i < __length__; i++ ) {
if ( __str[i] == delimiter )
return __str + i;
}
__length__ = strlen( __str );
for ( i = 0; i < __length__; i++ ) {
if ( __str[i] == delimiter )
return __str + i;
}
return NULL;
return NULL;
}
/**
* load all the valid words from a specified lexicon file .
*
* each line is read with file_get_line() and handled as follows:
* - a line starting with '#' and longer than one char is a comment.
* - for __LEX_STOPWORDS__ the whole line is the word.
* - otherwise the line is split on "/": word, synonyms, frequency.
* a line with only the word field is added directly.
* the synonyms field "null" means no synonyms; otherwise it is a
* ","-separated list that is stored only when config->add_syn is set.
* the frequency field is parsed with atoi() and defaults to 0.
*
* NOTE(review): file_get_line() does no bound check, so a line longer
* than the 1024-byte __char buffer overruns it; _buffer and _sbuffer
* (512 bytes each) are filled without a visible bound check as well.
* NOTE(review): the length filter runs only after the second field was
* read, so a word-only line is added without being checked against
* `length`.
* NOTE(review): the stopword test ((int)_line[0]) < 0 presumably detects
* a non-ASCII lead byte and depends on char being signed - confirm.
*
* @param friso friso instance; entries are added to friso->dic
* @param config friso_config instance; config->add_syn is read
* @param lex the lexicon type
* @param lex_file the path of the lexicon file
* @param length the maximum length of the word item
* @param friso friso instance; entries are added to friso->dic
* @param config friso_config instance; config->add_syn is read
* @param lex the lexicon type
* @param lex_file the path of the lexicon file
* @param length the maximum length of the word item
*/
FRISO_API void friso_dic_load(
friso_t friso,
friso_config_t config,
friso_lex_t lex,
fstring lex_file,
uint_t length )
friso_t friso,
friso_config_t config,
friso_lex_t lex,
fstring lex_file,
uint_t length )
{
FILE * _stream;
char __char[1024], _buffer[512];
fstring _line;
string_split_entry sse;
FILE * _stream;
char __char[1024], _buffer[512];
fstring _line;
string_split_entry sse;
fstring _word;
char _sbuffer[512];
fstring _syn;
friso_array_t sywords;
uint_t _fre;
fstring _word;
char _sbuffer[512];
fstring _syn;
friso_array_t sywords;
uint_t _fre;
if ( ( _stream = fopen( lex_file, "rb" ) ) != NULL )
{
while ( ( _line = file_get_line( __char, _stream ) ) != NULL )
{
//clear up the notes
//make sure the length of the line is greater than 1.
//like the single '#' mark in stopwords dictionary.
if ( _line[0] == '#' && strlen(_line) > 1 ) continue;
if ( ( _stream = fopen( lex_file, "rb" ) ) != NULL )
{
while ( ( _line = file_get_line( __char, _stream ) ) != NULL )
{
//clear up the notes
//make sure the length of the line is greater than 1.
//like the single '#' mark in stopwords dictionary.
if ( _line[0] == '#' && strlen(_line) > 1 ) continue;
//handle the stopwords.
if ( lex == __LEX_STOPWORDS__ )
{
//clean the chinese words that its length is greater than max length.
if ( ((int)_line[0]) < 0 && strlen( _line ) > length ) continue;
friso_dic_add( friso->dic, __LEX_STOPWORDS__,
string_copy_heap( _line, strlen(_line) ), NULL );
continue;
}
//handle the stopwords.
if ( lex == __LEX_STOPWORDS__ )
{
//clean the chinese words that its length is greater than max length.
if ( ((int)_line[0]) < 0 && strlen( _line ) > length ) continue;
friso_dic_add( friso->dic, __LEX_STOPWORDS__,
string_copy_heap( _line, strlen(_line) ), NULL );
continue;
}
//split the fstring with '/'.
string_split_reset( &sse, "/", _line);
if ( string_split_next( &sse, _buffer ) == NULL ) continue;
//split the fstring with '/'.
string_split_reset( &sse, "/", _line);
if ( string_split_next( &sse, _buffer ) == NULL ) continue;
//1. get the word.
_word = string_copy_heap( _buffer, strlen(_buffer) );
//1. get the word.
_word = string_copy_heap( _buffer, strlen(_buffer) );
if ( string_split_next( &sse, _buffer ) == NULL )
{
//normal lexicon type,
//add them to the dictionary directly
friso_dic_add( friso->dic, lex, _word, NULL );
continue;
}
if ( string_split_next( &sse, _buffer ) == NULL )
{
//normal lexicon type,
//add them to the dictionary directly
friso_dic_add( friso->dic, lex, _word, NULL );
continue;
}
/*
* filter out the words that its length is larger
* than the specified limit.
* but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__
* and __LEX_CEM_WORDS__.
*/
if ( ! ( lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__ )
&& strlen( _word ) > length )
{
FRISO_FREE(_word);
continue;
}
/*
* filter out the words that its length is larger
* than the specified limit.
* but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__
* and __LEX_CEM_WORDS__.
*/
if ( ! ( lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__ )
&& strlen( _word ) > length )
{
FRISO_FREE(_word);
continue;
}
//2. get the synonyms words.
_syn = NULL;
if ( strcmp( _buffer, "null" ) != 0 )
_syn = string_copy( _buffer, _sbuffer, strlen(_buffer) );
//2. get the synonyms words.
_syn = NULL;
if ( strcmp( _buffer, "null" ) != 0 )
_syn = string_copy( _buffer, _sbuffer, strlen(_buffer) );
//3. get the word frequency if it available.
_fre = 0;
if ( string_split_next( &sse, _buffer ) != NULL )
_fre = atoi( _buffer );
//3. get the word frequency if it available.
_fre = 0;
if ( string_split_next( &sse, _buffer ) != NULL )
_fre = atoi( _buffer );
/**
* Here:
* split the synonyms words with mark ","
* and put them in a array list if the synonyms is not NULL
*/
sywords = NULL;
if ( config->add_syn && _syn != NULL )
{
string_split_reset( &sse, ",", _sbuffer );
sywords = new_array_list_with_opacity(5);
while ( string_split_next( &sse, _buffer ) != NULL )
{
if ( strlen(_buffer) > length ) continue;
array_list_add( sywords,
string_copy_heap(_buffer, strlen(_buffer)) );
}
sywords = array_list_trim( sywords );
}
/**
* Here:
* split the synonyms words with mark ","
* and put them in a array list if the synonyms is not NULL
*/
sywords = NULL;
if ( config->add_syn && _syn != NULL )
{
string_split_reset( &sse, ",", _sbuffer );
sywords = new_array_list_with_opacity(5);
while ( string_split_next( &sse, _buffer ) != NULL )
{
if ( strlen(_buffer) > length ) continue;
array_list_add( sywords,
string_copy_heap(_buffer, strlen(_buffer)) );
}
sywords = array_list_trim( sywords );
}
//4. add the word item
friso_dic_add_with_fre(
friso->dic, lex, _word, sywords, _fre );
}
//4. add the word item
friso_dic_add_with_fre(
friso->dic, lex, _word, sywords, _fre );
}
fclose( _stream );
} else {
printf("Warning: Fail to open lexicon file %s\n", lex_file);
}
fclose( _stream );
} else {
printf("Warning: Fail to open lexicon file %s\n", lex_file);
}
}
/**
* get the lexicon type index for the specified
* type keyword .
* type keyword .
*
* _key is compared with strcmp() against the literal names of the
* lexicon constants ("__LEX_CJK_WORDS__", "__LEX_CJK_UNITS__", ...).
*
* @see friso.h#friso_lex_t
* @param _key name of the lexicon type constant
* @return the matching lexicon type, or -1 when no name matches
* @see friso.h#friso_lex_t
* @param _key name of the lexicon type constant
* @return the matching lexicon type, or -1 when no name matches
*/
__STATIC_API__ friso_lex_t get_lexicon_type_with_constant( fstring _key )
{
if ( strcmp( _key, "__LEX_CJK_WORDS__" ) == 0 ) {
return __LEX_CJK_WORDS__;
}
else if ( strcmp( _key, "__LEX_CJK_UNITS__" ) == 0 ) {
return __LEX_CJK_UNITS__;
}
else if ( strcmp( _key, "__LEX_ECM_WORDS__" ) == 0 ) {
return __LEX_ECM_WORDS__;
}
else if ( strcmp( _key, "__LEX_CEM_WORDS__" ) == 0 ) {
return __LEX_CEM_WORDS__;
}
else if ( strcmp( _key, "__LEX_CN_LNAME__" ) == 0 ) {
return __LEX_CN_LNAME__;
}
else if ( strcmp( _key, "__LEX_CN_SNAME__" ) == 0 ) {
return __LEX_CN_SNAME__;
}
else if ( strcmp( _key, "__LEX_CN_DNAME1__" ) == 0 ) {
return __LEX_CN_DNAME1__;
}
else if ( strcmp( _key, "__LEX_CN_DNAME2__" ) == 0 ) {
return __LEX_CN_DNAME2__;
}
else if ( strcmp( _key, "__LEX_CN_LNA__" ) == 0 ) {
return __LEX_CN_LNA__;
}
else if ( strcmp( _key, "__LEX_STOPWORDS__" ) == 0 ) {
return __LEX_STOPWORDS__;
}
else if ( strcmp( _key, "__LEX_ENPUN_WORDS__" ) == 0 ) {
return __LEX_ENPUN_WORDS__;
}
else if ( strcmp( _key, "__LEX_EN_WORDS__" ) == 0 ) {
return __LEX_EN_WORDS__;
}
if ( strcmp( _key, "__LEX_CJK_WORDS__" ) == 0 ) {
return __LEX_CJK_WORDS__;
}
else if ( strcmp( _key, "__LEX_CJK_UNITS__" ) == 0 ) {
return __LEX_CJK_UNITS__;
}
else if ( strcmp( _key, "__LEX_ECM_WORDS__" ) == 0 ) {
return __LEX_ECM_WORDS__;
}
else if ( strcmp( _key, "__LEX_CEM_WORDS__" ) == 0 ) {
return __LEX_CEM_WORDS__;
}
else if ( strcmp( _key, "__LEX_CN_LNAME__" ) == 0 ) {
return __LEX_CN_LNAME__;
}
else if ( strcmp( _key, "__LEX_CN_SNAME__" ) == 0 ) {
return __LEX_CN_SNAME__;
}
else if ( strcmp( _key, "__LEX_CN_DNAME1__" ) == 0 ) {
return __LEX_CN_DNAME1__;
}
else if ( strcmp( _key, "__LEX_CN_DNAME2__" ) == 0 ) {
return __LEX_CN_DNAME2__;
}
else if ( strcmp( _key, "__LEX_CN_LNA__" ) == 0 ) {
return __LEX_CN_LNA__;
}
else if ( strcmp( _key, "__LEX_STOPWORDS__" ) == 0 ) {
return __LEX_STOPWORDS__;
}
else if ( strcmp( _key, "__LEX_ENPUN_WORDS__" ) == 0 ) {
return __LEX_ENPUN_WORDS__;
}
else if ( strcmp( _key, "__LEX_EN_WORDS__" ) == 0 ) {
return __LEX_EN_WORDS__;
}
return -1;
return -1;
}
/*
* load the lexicon configuration file.
* and load all the valid lexicons listed in the configuration file.
* and load all the valid lexicons listed in the configuration file.
*
* the configuration file path is _path followed by __FRISO_LEX_IFILE__.
* lines starting with '#' and empty lines are ignored.
* a line ending with '[' opens a section: its first token (ended by a
* space, a tab or ':') names the lexicon type constant. each following
* line up to one ending with ']' names a lexicon file: its first token
* (ended by a space, a tab or ';') is appended to _path and the result
* is loaded with friso_dic_load().
* when the type name is unknown the section header is skipped; the
* outer loop then reads the section's lines and loads nothing for them.
*
* NOTE(review): tokens are copied into __key__[30] with no bound check,
* so a token of 30 chars or more overruns the buffer; lines longer
* than the 1024-byte __chars__ buffer overrun it in file_get_line().
*
* @param friso friso instance
* @param config friso_config instance
* @param _path dictionary directory
* @param _limits words length limit
* @param friso friso instance
* @param config friso_config instance
* @param _path dictionary directory
* @param _limits words length limit
*/
FRISO_API void friso_dic_load_from_ifile(
friso_t friso,
friso_config_t config,
fstring _path,
uint_t _limits )
friso_t friso,
friso_config_t config,
fstring _path,
uint_t _limits )
{
//1.parse the configuration file.
FILE *__stream;
char __chars__[1024], __key__[30], *__line__;
uint_t __length__, i, t;
friso_lex_t lex_t;
string_buffer_t sb;
//1.parse the configuration file.
FILE *__stream;
char __chars__[1024], __key__[30], *__line__;
uint_t __length__, i, t;
friso_lex_t lex_t;
string_buffer_t sb;
//get the lexicon configuration file path
sb = new_string_buffer();
string_buffer_append( sb, _path );
string_buffer_append( sb, __FRISO_LEX_IFILE__ );
//printf("%s\n", sb->buffer);
//get the lexicon configuration file path
sb = new_string_buffer();
string_buffer_append( sb, _path );
string_buffer_append( sb, __FRISO_LEX_IFILE__ );
//printf("%s\n", sb->buffer);
if ( ( __stream = fopen( sb->buffer, "rb" ) ) != NULL )
{
while ( ( __line__ =
file_get_line( __chars__, __stream ) ) != NULL )
{
//comment filter.
if ( __line__[0] == '#' ) continue;
if ( __line__[0] == '\0' ) continue;
if ( ( __stream = fopen( sb->buffer, "rb" ) ) != NULL )
{
while ( ( __line__ =
file_get_line( __chars__, __stream ) ) != NULL )
{
//comment filter.
if ( __line__[0] == '#' ) continue;
if ( __line__[0] == '\0' ) continue;
__length__ = strlen( __line__ );
//item start
if ( __line__[ __length__ - 1 ] == '[' )
{
//get the type key
for ( i = 0; i < __length__
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
for ( t = 0; i < __length__; i++,t++ ) {
if ( __line__[i] == ' '
|| __line__[i] == '\t' || __line__[i] == ':' ) break;
__key__[t] = __line__[i];
}
__key__[t] = '\0';
__length__ = strlen( __line__ );
//item start
if ( __line__[ __length__ - 1 ] == '[' )
{
//get the type key
for ( i = 0; i < __length__
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
for ( t = 0; i < __length__; i++,t++ ) {
if ( __line__[i] == ' '
|| __line__[i] == '\t' || __line__[i] == ':' ) break;
__key__[t] = __line__[i];
}
__key__[t] = '\0';
//get the lexicon type
lex_t = get_lexicon_type_with_constant(__key__);
if ( lex_t == -1 ) continue;
//get the lexicon type
lex_t = get_lexicon_type_with_constant(__key__);
if ( lex_t == -1 ) continue;
//printf("key=%s, type=%d\n", __key__, lex_t );
while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL )
{
//comments filter.
if ( __line__[0] == '#' ) continue;
if ( __line__[0] == '\0' ) continue;
//printf("key=%s, type=%d\n", __key__, lex_t );
while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL )
{
//comments filter.
if ( __line__[0] == '#' ) continue;
if ( __line__[0] == '\0' ) continue;
__length__ = strlen( __line__ );
if ( __line__[ __length__ - 1 ] == ']' ) break;
__length__ = strlen( __line__ );
if ( __line__[ __length__ - 1 ] == ']' ) break;
for ( i = 0; i < __length__
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
for ( t = 0; i < __length__; i++,t++ ) {
if ( __line__[i] == ' '
|| __line__[i] == '\t' || __line__[i] == ';' ) break;
__key__[t] = __line__[i];
}
__key__[t] = '\0';
for ( i = 0; i < __length__
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
for ( t = 0; i < __length__; i++,t++ ) {
if ( __line__[i] == ' '
|| __line__[i] == '\t' || __line__[i] == ';' ) break;
__key__[t] = __line__[i];
}
__key__[t] = '\0';
//load the lexicon item from the lexicon file.
string_buffer_clear( sb );
string_buffer_append( sb, _path );
string_buffer_append( sb, __key__ );
//printf("key=%s, type=%d\n", __key__, lex_t);
friso_dic_load( friso, config, lex_t, sb->buffer, _limits );
}
//load the lexicon item from the lexicon file.
string_buffer_clear( sb );
string_buffer_append( sb, _path );
string_buffer_append( sb, __key__ );
//printf("key=%s, type=%d\n", __key__, lex_t);
friso_dic_load( friso, config, lex_t, sb->buffer, _limits );
}
}
}
} //end while
} //end while
fclose( __stream );
} else {
printf("Warning: Fail to open the lexicon configuration file %s\n", sb->buffer);
}
fclose( __stream );
} else {
printf("Warning: Fail to open the lexicon configuration file %s\n", sb->buffer);
}
free_string_buffer(sb);
free_string_buffer(sb);
}
//match the item.
// Returns the result of hash_exist_mapping( dic[lex], word ) when lex is
// inside [0, __FRISO_LEXICON_LENGTH__), and 0 otherwise.
FRISO_API int friso_dic_match(
friso_dic_t dic,
friso_lex_t lex,
fstring word )
friso_dic_t dic,
friso_lex_t lex,
fstring word )
{
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
return hash_exist_mapping( dic[lex], word );
}
return 0;
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
return hash_exist_mapping( dic[lex], word );
}
return 0;
}
//get the lex_entry_t associated with the word.
// Returns hash_get_value( dic[lex], word ) cast to lex_entry_t when lex is
// inside [0, __FRISO_LEXICON_LENGTH__), and NULL otherwise.
FRISO_API lex_entry_t friso_dic_get(
friso_dic_t dic,
friso_lex_t lex,
fstring word )
friso_dic_t dic,
friso_lex_t lex,
fstring word )
{
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
return ( lex_entry_t ) hash_get_value( dic[lex], word );
}
return NULL;
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
return ( lex_entry_t ) hash_get_value( dic[lex], word );
}
return NULL;
}
//get the size of the specified type dictionary.
// Returns hash_get_size( dic[lex] ) when lex is inside
// [0, __FRISO_LEXICON_LENGTH__), and 0 otherwise.
FRISO_API uint_t friso_spec_dic_size(
friso_dic_t dic,
friso_lex_t lex )
friso_dic_t dic,
friso_lex_t lex )
{
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
return hash_get_size( dic[lex] );
}
return 0;
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
return hash_get_size( dic[lex] );
}
return 0;
}
//get size of the whole dictionary.
// Sums hash_get_size( dic[t] ) over every index t in
// [0, __FRISO_LEXICON_LENGTH__) and returns the total.
FRISO_API uint_t friso_all_dic_size(
friso_dic_t dic )
friso_dic_t dic )
{
register uint_t size = 0, t;
register uint_t size = 0, t;
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
size += hash_get_size( dic[t] );
}
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
size += hash_get_size( dic[t] );
}
return size;
return size;
}

View File

@ -1,29 +1,29 @@
/*
* link list implemented functions
* defined in header file "friso_API.h".
* defined in header file "friso_API.h".
* when the link_node is being deleted, here we just free
* the allocation of the node, not the allocation of its value.
* the allocation of the node, not the allocation of its value.
*
* @author chenxin <chenxin619315@gmail.com>
* @author chenxin <chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdlib.h>
//create a new link list node.
// Allocates a link_node_entry with FRISO_MALLOC and fills in value, prev
// and next exactly as given; the neighbours are not re-linked here.
// A NULL allocation result is handed to ___ALLOCATION_ERROR___ (defined
// outside this view). Returns the new node.
__STATIC_API__ link_node_t new_node_entry(
void * value,
link_node_t prev,
link_node_t next )
void * value,
link_node_t prev,
link_node_t next )
{
link_node_t node = ( link_node_t )
FRISO_MALLOC( sizeof( link_node_entry ) );
FRISO_MALLOC( sizeof( link_node_entry ) );
if ( node == NULL ) {
___ALLOCATION_ERROR___
___ALLOCATION_ERROR___
}
node->value = value;
node->prev = prev;
node->next = next;
node->value = value;
node->prev = prev;
node->next = next;
return node;
}
@ -32,14 +32,14 @@ __STATIC_API__ link_node_t new_node_entry(
FRISO_API friso_link_t new_link_list( void )
{
friso_link_t e = ( friso_link_t )
FRISO_MALLOC( sizeof( friso_link_entry ) );
FRISO_MALLOC( sizeof( friso_link_entry ) );
if ( e == NULL ) {
___ALLOCATION_ERROR___
___ALLOCATION_ERROR___
}
//initialize the entry
e->head = new_node_entry( NULL, NULL, NULL );
e->tail = new_node_entry( NULL, e->head, NULL );
e->head = new_node_entry( NULL, NULL, NULL );
e->tail = new_node_entry( NULL, e->head, NULL );
e->head->next = e->tail;
e->size = 0;
@ -52,9 +52,9 @@ FRISO_API void free_link_list( friso_link_t link )
link_node_t node, next;
for ( node = link->head; node != NULL; )
{
next = node->next;
FRISO_FREE( node );
node = next;
next = node->next;
FRISO_FREE( node );
node = next;
}
FRISO_FREE( link );
@ -62,16 +62,16 @@ FRISO_API void free_link_list( friso_link_t link )
//clear all nodes in the link list.
FRISO_API friso_link_t link_list_clear(
friso_link_t link )
friso_link_t link )
{
link_node_t node, next;
//free all the middle nodes.
for ( node = link->head->next;
node != link->tail; )
node != link->tail; )
{
next = node->next;
FRISO_FREE( node );
node = next;
next = node->next;
FRISO_FREE( node );
node = next;
}
link->head->next = link->tail;
@ -97,22 +97,22 @@ FRISO_API friso_link_t link_list_clear(
* static
*/
__STATIC_API__ link_node_t get_node(
friso_link_t link, uint_t idx )
friso_link_t link, uint_t idx )
{
link_node_t p = NULL;
register uint_t t;
if ( idx >= 0 && idx < link->size )
{
if ( idx < link->size / 2 ) { //find from the head.
p = link->head;
for ( t = 0; t <= idx; t++ )
p = p->next;
} else { //find from the tail.
p = link->tail;
for ( t = link->size; t > idx; t-- )
p = p->prev;
}
if ( idx < link->size / 2 ) { //find from the head.
p = link->head;
for ( t = 0; t <= idx; t++ )
p = p->next;
} else { //find from the tail.
p = link->tail;
for ( t = link->size; t > idx; t-- )
p = p->prev;
}
}
return p;
@ -123,9 +123,9 @@ __STATIC_API__ link_node_t get_node(
* static
*/
//__STATIC_API__ void insert_before(
// friso_link_t link,
// link_node_t node,
// void * value )
// friso_link_t link,
// link_node_t node,
// void * value )
//{
// link_node_t e = new_node_entry( value, node->prev, node );
// e->prev->next = e;
@ -136,10 +136,10 @@ __STATIC_API__ link_node_t get_node(
//}
// Insert a new node holding `value` right before `node` and increase
// link->size by one. Expands to a brace block that declares a local `e`.
// The arguments are not parenthesized and `node` is expanded twice, so
// pass plain side-effect-free expressions only.
#define insert_before( link, node, value ) \
{ \
link_node_t e = new_node_entry( value, node->prev, node ); \
e->prev->next = e; \
e->next->prev = e; \
link->size++; \
link_node_t e = new_node_entry( value, node->prev, node ); \
e->prev->next = e; \
e->next->prev = e; \
link->size++; \
}
/*
@ -150,7 +150,7 @@ __STATIC_API__ link_node_t get_node(
* @return the value of the removed node.
*/
__STATIC_API__ void * remove_node(
friso_link_t link, link_node_t node )
friso_link_t link, link_node_t node )
{
void * _value = node->value;
@ -166,18 +166,18 @@ __STATIC_API__ void * remove_node(
//add a new node to the link list.(insert just before the tail)
// `value` is stored as given; link->size grows by one through the
// insert_before macro.
FRISO_API void link_list_add(
friso_link_t link, void * value )
friso_link_t link, void * value )
{
insert_before( link, link->tail, value );
}
//add a new node before the given index.
// The node currently at `idx` is looked up with get_node(); when that
// returns NULL nothing is inserted.
FRISO_API void link_list_insert_before(
friso_link_t link, uint_t idx, void * value )
friso_link_t link, uint_t idx, void * value )
{
link_node_t node = get_node( link, idx );
if ( node != NULL ) {
insert_before( link, node, value );
insert_before( link, node, value );
}
}
@ -187,11 +187,11 @@ FRISO_API void link_list_insert_before(
* @return the value of the node.
*/
// Look up the node at `idx` with get_node() and return its value,
// or NULL when get_node() returns NULL.
// A stored NULL value cannot be told apart from "no such node".
FRISO_API void * link_list_get(
friso_link_t link, uint_t idx )
friso_link_t link, uint_t idx )
{
link_node_t node = get_node( link, idx );
if ( node != NULL ) {
return node->value;
return node->value;
}
return NULL;
}
@ -199,20 +199,20 @@ FRISO_API void * link_list_get(
/*
* set the value of the node that located in the specified position.
* we don't free the allocation of the old value; we return it to you.
* free it yourself when it is necessary.
* free it yourself when it is necessary.
*
* @return the old value.
*/
FRISO_API void *link_list_set(
friso_link_t link,
uint_t idx, void * value )
friso_link_t link,
uint_t idx, void * value )
{
link_node_t node = get_node( link, idx );
void * _value = NULL;
if ( node != NULL ) {
_value = node->value;
node->value = value;
_value = node->value;
node->value = value;
}
return _value;
@ -225,13 +225,13 @@ FRISO_API void *link_list_set(
* @return the value of the node removed.
*/
FRISO_API void *link_list_remove(
friso_link_t link, uint_t idx )
friso_link_t link, uint_t idx )
{
link_node_t node = get_node( link, idx );
if ( node != NULL ) {
//printf("idx=%d, node->value=%s\n", idx, (string) node->value );
return remove_node( link, node );
//printf("idx=%d, node->value=%s\n", idx, (string) node->value );
return remove_node( link, node );
}
return NULL;
@ -244,43 +244,43 @@ FRISO_API void *link_list_remove(
* @return the value of the node removed.
*/
// Public wrapper around the static remove_node(): forwards link and
// node unchanged and returns whatever remove_node() returns.
// `node` is not checked for NULL here.
FRISO_API void *link_list_remove_node(
friso_link_t link,
link_node_t node )
friso_link_t link,
link_node_t node )
{
return remove_node( link, node );
}
//remove the first node after the head
// Returns the result of remove_node() for link->head->next, or NULL
// when link->size is 0.
FRISO_API void *link_list_remove_first(
friso_link_t link )
friso_link_t link )
{
if ( link->size > 0 ) {
return remove_node( link, link->head->next );
return remove_node( link, link->head->next );
}
return NULL;
}
//remove the last node just before the tail.
// Returns the result of remove_node() for link->tail->prev, or NULL
// when link->size is 0.
FRISO_API void *link_list_remove_last(
friso_link_t link )
friso_link_t link )
{
if ( link->size > 0 ) {
return remove_node( link, link->tail->prev );
return remove_node( link, link->tail->prev );
}
return NULL;
}
//append a node at the tail.
// Expands to the same insert_before( link, link->tail, value ) call
// as link_list_add.
FRISO_API void link_list_add_last(
friso_link_t link,
void *value )
friso_link_t link,
void *value )
{
insert_before( link, link->tail, value );
}
//insert a node just after the head.
// The new node is placed before link->head->next, i.e. it becomes the
// first node after the head.
FRISO_API void link_list_add_first(
friso_link_t link, void *value )
friso_link_t link, void *value )
{
insert_before( link, link->head->next, value );
}

View File

@ -1,8 +1,8 @@
/*
* utf-8 handle function implements.
* you could modify it or re-release it but never for commercial use.
* you could modify it or re-release it but never for commercial use.
*
* @author chenxin <chenxin619315@gmail.com>
* @author chenxin <chenxin619315@gmail.com>
*/
#include "friso_API.h"
@ -11,14 +11,14 @@
#include <string.h>
/* ******************************************
* fstring buffer functions implements. *
* fstring buffer functions implements. *
********************************************/
/**
* create a new buffer
* @Note:
* 1. its real length is 1 byte greater than the specified value
* 2. we did not do any optimization for the memory allocation to ...
* avoid the memory defragmentation.
* avoid the memory defragmentation.
*
* @date: 2014-10-16
*/
@ -26,7 +26,7 @@ __STATIC_API__ fstring create_buffer( uint_t length )
{
fstring buffer = ( fstring ) FRISO_MALLOC( length + 1 );
if ( buffer == NULL ) {
___ALLOCATION_ERROR___
___ALLOCATION_ERROR___
}
memset( buffer, 0x00, length + 1 );
@ -36,7 +36,7 @@ __STATIC_API__ fstring create_buffer( uint_t length )
//the __allocs should not be smaller than sb->length
__STATIC_API__ string_buffer_t resize_buffer(
string_buffer_t sb, uint_t __allocs )
string_buffer_t sb, uint_t __allocs )
{
//create a new buffer.
//if ( __allocs < sb->length ) __allocs = sb->length + 1;
@ -44,7 +44,7 @@ __STATIC_API__ string_buffer_t resize_buffer(
//register uint_t t;
//for ( t = 0; t < sb->length; t++ ) {
// str[t] = sb->buffer[t];
// str[t] = sb->buffer[t];
//}
memcpy( str, sb->buffer, sb->length );
FRISO_FREE( sb->buffer );
@ -65,9 +65,9 @@ __STATIC_API__ string_buffer_t resize_buffer(
FRISO_API string_buffer_t new_string_buffer_with_opacity( uint_t opacity )
{
string_buffer_t sb = ( string_buffer_t )
FRISO_MALLOC( sizeof( string_buffer_entry ) );
FRISO_MALLOC( sizeof( string_buffer_entry ) );
if ( sb == NULL ) {
___ALLOCATION_ERROR___
___ALLOCATION_ERROR___
}
sb->buffer = create_buffer( opacity );
@ -82,9 +82,9 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str )
{
//buffer allocations.
string_buffer_t sb = ( string_buffer_t )
FRISO_MALLOC( sizeof( string_buffer_entry ) );
FRISO_MALLOC( sizeof( string_buffer_entry ) );
if ( sb == NULL ) {
___ALLOCATION_ERROR___
___ALLOCATION_ERROR___
}
//initialize
@ -95,7 +95,7 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str )
//register uint_t t;
//copy the str to the buffer.
//for ( t = 0; t < sb->length; t++ ) {
// sb->buffer[t] = str[t];
// sb->buffer[t] = str[t];
//}
memcpy( sb->buffer, str, sb->length );
@ -103,66 +103,66 @@ FRISO_API string_buffer_t new_string_buffer_with_string( fstring str )
}
// Append the bytes of __str (without its terminator) to the buffer.
// When sb->length + strlen(__str) exceeds sb->allocs the buffer is first
// resized to ( sb->length + strlen(__str) ) * 2 + 1.
// NOTE(review): no '\0' is written after the appended bytes here;
// presumably the buffer is kept zero-filled elsewhere - confirm against
// create_buffer / resize_buffer.
FRISO_API void string_buffer_append(
string_buffer_t sb, fstring __str )
string_buffer_t sb, fstring __str )
{
register uint_t __len__ = strlen( __str );
//check the necessity to resize the buffer.
if ( sb->length + __len__ > sb->allocs ) {
sb = resize_buffer( sb, ( sb->length + __len__ ) * 2 + 1 );
sb = resize_buffer( sb, ( sb->length + __len__ ) * 2 + 1 );
}
//register uint_t t;
////copy the __str to the buffer.
//for ( t = 0; t < __len__; t++ ) {
// sb->buffer[ sb->length++ ] = __str[t];
// sb->buffer[ sb->length++ ] = __str[t];
//}
memcpy( sb->buffer + sb->length, __str, __len__ );
sb->length += __len__;
}
// Append a single char to the buffer.
// When sb->length + 1 exceeds sb->allocs the buffer is first resized to
// sb->length * 2 + 1. No '\0' is written after the char here.
FRISO_API void string_buffer_append_char(
string_buffer_t sb, char ch )
string_buffer_t sb, char ch )
{
//check the necessity to resize the buffer.
if ( sb->length + 1 > sb->allocs ) {
sb = resize_buffer( sb, sb->length * 2 + 1 );
sb = resize_buffer( sb, sb->length * 2 + 1 );
}
sb->buffer[sb->length++] = ch;
}
// Not implemented: the body is empty, so calling it leaves sb unchanged.
FRISO_API void string_buffer_insert(
string_buffer_t sb,
uint_t idx,
fstring __str )
string_buffer_t sb,
uint_t idx,
fstring __str )
{
}
/*
* remove the given bytes from the buffer start from idx.
* this will cause the byte move after the idx+length.
* this will cause the byte move after the idx+length.
*
* @return the new string.
*/
FRISO_API fstring string_buffer_remove(
string_buffer_t sb,
uint_t idx,
uint_t length )
string_buffer_t sb,
uint_t idx,
uint_t length )
{
uint_t t;
//move the bytes after the idx + length
for ( t = idx + length; t < sb->length; t++ ) {
sb->buffer[t - length] = sb->buffer[t];
sb->buffer[t - length] = sb->buffer[t];
}
sb->buffer[t] = '\0';
//memcpy( sb->buffer + idx,
// sb->buffer + idx + length,
// sb->length - idx - length );
// sb->buffer + idx + length,
// sb->length - idx - length );
t = sb->length - idx;
if ( t > 0 ) {
sb->length -= ( t > length ) ? length : t;
sb->length -= ( t > length ) ? length : t;
}
sb->buffer[sb->length-1] = '\0';
@ -171,13 +171,13 @@ FRISO_API fstring string_buffer_remove(
/*
* trim the allocation of the string_buffer.
* when sb->length < sb->allocs - 1 the buffer is resized to
* sb->length + 1 through resize_buffer(); otherwise nothing changes.
*
* @return the string_buffer (the result of resize_buffer() when resized)
*/
FRISO_API string_buffer_t string_buffer_trim( string_buffer_t sb )
{
//resize the buffer.
if ( sb->length < sb->allocs - 1 ) {
sb = resize_buffer( sb, sb->length + 1 );
sb = resize_buffer( sb, sb->length + 1 );
}
return sb;
}
@ -185,8 +185,8 @@ FRISO_API string_buffer_t string_buffer_trim( string_buffer_t sb )
/*
* free the given fstring buffer.
* and this function will not free the allocation of the
* string_buffer_t->buffer; we return it to you, and if
* necessary you can free it yourself by calling free();
* string_buffer_t->buffer; we return it to you, and if
* necessary you can free it yourself by calling free();
*/
FRISO_API fstring string_buffer_devote( string_buffer_t sb )
{
@ -197,7 +197,7 @@ FRISO_API fstring string_buffer_devote( string_buffer_t sb )
/*
* clear the given fstring buffer.
* reset its buffer with 0 and reset its length to 0.
* reset its buffer with 0 and reset its length to 0.
*/
FRISO_API void string_buffer_clear( string_buffer_t sb )
{
@ -216,17 +216,17 @@ FRISO_API void free_string_buffer( string_buffer_t sb )
/**
* create a new string_split_entry.
*
* @param source
* @return string_split_t;
* @param source
* @return string_split_t;
*/
FRISO_API string_split_t new_string_split(
fstring delimiter,
fstring source )
fstring delimiter,
fstring source )
{
string_split_t e = ( string_split_t )
FRISO_MALLOC( sizeof( string_split_entry ) );
FRISO_MALLOC( sizeof( string_split_entry ) );
if ( e == NULL ) {
___ALLOCATION_ERROR___;
___ALLOCATION_ERROR___;
}
e->delimiter = delimiter;
@ -239,19 +239,19 @@ FRISO_API string_split_t new_string_split(
}
// Re-initialize a string_split entry for a new delimiter and source.
// Stores both pointers as given (no copy), caches their strlen() in
// delLen / srcLen and rewinds idx to 0. The delimiter and source must
// therefore stay valid while the entry is in use.
FRISO_API void string_split_reset(
string_split_t sst,
fstring delimiter,
fstring source )
string_split_t sst,
fstring delimiter,
fstring source )
{
sst->delimiter = delimiter;
sst->delLen = strlen(delimiter);
sst->source = source;
sst->srcLen = strlen(source);
sst->srcLen = strlen(source);
sst->idx = 0;
}
FRISO_API void string_split_set_source(
string_split_t sst, fstring source )
string_split_t sst, fstring source )
{
sst->source = source;
sst->srcLen = strlen(source);
@ -259,7 +259,7 @@ FRISO_API void string_split_set_source(
}
FRISO_API void string_split_set_delimiter(
string_split_t sst, fstring delimiter )
string_split_t sst, fstring delimiter )
{
sst->delimiter = delimiter;
sst->delLen = strlen( delimiter );
@ -273,15 +273,15 @@ FRISO_API void free_string_split( string_split_t sst )
/**
* get the next split fstring, and copy the
* split fstring into the __dst buffer .
* split fstring into the __dst buffer .
*
* @param string_split_t
* @param __dst
* @return fstring (NULL if reach the end of the source
* or there is no more segmentation)
* @param string_split_t
* @param __dst
* @return fstring (NULL if reach the end of the source
* or there is no more segmentation)
*/
FRISO_API fstring string_split_next(
string_split_t sst, fstring __dst)
string_split_t sst, fstring __dst)
{
uint_t i, _ok;
fstring _dst = __dst;
@ -291,28 +291,28 @@ FRISO_API fstring string_split_next(
while ( 1 )
{
_ok = 1;
for ( i = 0; i < sst->delLen
&& (sst->idx + i < sst->srcLen); i++ )
{
if ( sst->source[sst->idx+i] != sst->delimiter[i] )
{
_ok = 0;
break;
}
}
_ok = 1;
for ( i = 0; i < sst->delLen
&& (sst->idx + i < sst->srcLen); i++ )
{
if ( sst->source[sst->idx+i] != sst->delimiter[i] )
{
_ok = 0;
break;
}
}
//find the delimiter here,
//break the loop and self plus the sst->idx, then return the buffer .
if ( _ok == 1 ) {
sst->idx += sst->delLen;
break;
}
//find the delimiter here,
//break the loop and self plus the sst->idx, then return the buffer .
if ( _ok == 1 ) {
sst->idx += sst->delLen;
break;
}
//copy the char to the buffer
*_dst++ = sst->source[sst->idx++];
//check if reach the end of the fstring
if ( sst->idx >= sst->srcLen ) break;
//copy the char to the buffer
*_dst++ = sst->source[sst->idx++];
//check if reach the end of the fstring
if ( sst->idx >= sst->srcLen ) break;
}
*_dst = '\0';

View File

@ -1,8 +1,8 @@
/*
* dynamic array test program.
*
* @author chenxin
* @email chenxin619315@gmail.com
* @author chenxin
* @email chenxin619315@gmail.com
*/
#include "friso_API.h"
@ -10,42 +10,42 @@
#include <stdlib.h>
/*
 * Walk-through of the array list API.
 *
 * Fills a new list from the fixed keys table, prints length/allocs,
 * calls array_list_trim and prints them again, then on slot idx (2)
 * performs a set, a remove and an insert in that order, printing the
 * value found at idx after each step. The list is released with
 * free_array_list before returning.
 *
 * @param   argc    unused
 * @param   args    unused
 * @return  always 0
 */
int main( int argc, char **args ) {
//create a new array list.
friso_array_t array = new_array_list();
fstring keys[] = {
"chenmanwen", "yangqinghua",
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
"zhangrenfang", "yangjian",
"liuxiao", "pankai",
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
"caizaili", "panpan", "xiaolude", "yintanwen"
};
int j, idx = 2, len = sizeof( keys ) / sizeof( fstring );
//create a new array list.
friso_array_t array = new_array_list();
fstring keys[] = {
"chenmanwen", "yangqinghua",
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
"zhangrenfang", "yangjian",
"liuxiao", "pankai",
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
"caizaili", "panpan", "xiaolude", "yintanwen"
};
int j, idx = 2, len = sizeof( keys ) / sizeof( fstring );
for ( j = 0; j < len; j++ ) {
array_list_add( array, keys[j] );
}
for ( j = 0; j < len; j++ ) {
array_list_add( array, keys[j] );
}
printf("length=%d, allocations=%d\n", array->length, array->allocs );
array_list_trim( array );
printf("after tirm length=%d, allocations=%d\n", array->length, array->allocs );
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
printf("length=%d, allocations=%d\n", array->length, array->allocs );
array_list_trim( array );
printf("after tirm length=%d, allocations=%d\n", array->length, array->allocs );
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
printf("\nAfter set %dth item.\n", idx );
array_list_set( array, idx, "chenxin__" );
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
printf("\nAfter set %dth item.\n", idx );
array_list_set( array, idx, "chenxin__" );
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
printf("\nAfter remove %dth item.\n", idx );
array_list_remove( array, idx );
printf("length=%d, allocations=%d\n", array->length, array->allocs );
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
printf("\nAfter remove %dth item.\n", idx );
array_list_remove( array, idx );
printf("length=%d, allocations=%d\n", array->length, array->allocs );
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
printf("\nInsert a item at %dth\n", idx );
array_list_insert( array, idx, "*chenxin*" );
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
printf("\nInsert a item at %dth\n", idx );
array_list_insert( array, idx, "*chenxin*" );
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
free_array_list( array );
free_array_list( array );
return 0;
return 0;
}

View File

@ -1,8 +1,8 @@
/*
* Friso test program.
* Of course you can make it a perfect demo for friso.
* Of course you can make it a perfect demo for friso.
* all threads or processes share the same friso_t,
* different threads/processes use different friso_task_t.
* different threads/processes use different friso_task_t.
* and you could share the friso_config_t if you wish...
*
* @author chenxin <chenxin619315@gmail.com>
@ -17,33 +17,33 @@
#define __LENGTH__ 15
#define __INPUT_LENGTH__ 20480
#define ___EXIT_INFO___ \
println("Thanks for trying friso."); \
#define ___EXIT_INFO___ \
println("Thanks for trying friso."); \
break;
#define ___ABOUT___ \
println("+-----------------------------------------------------------+"); \
println("| friso - a chinese word segmentation writen by c. |"); \
println("| bug report email - chenxin619315@gmail.com. |"); \
println("| or: visit http://code.google.com/p/friso. |"); \
println("| java edition for http://code.google.com/p/jcseg |"); \
println("| type 'quit' to exit the program. |"); \
#define ___ABOUT___ \
println("+-----------------------------------------------------------+"); \
println("| friso - a chinese word segmentation writen by c. |"); \
println("| bug report email - chenxin619315@gmail.com. |"); \
println("| or: visit http://code.google.com/p/friso. |"); \
println("| java edition for http://code.google.com/p/jcseg |"); \
println("| type 'quit' to exit the program. |"); \
println("+-----------------------------------------------------------+");
/*
 * Read one line from fp into __dst.
 *
 * Bytes are copied until a '\n' or EOF is read; the newline itself is
 * consumed but not stored, and the result is always NUL-terminated.
 * No limit is applied to the number of bytes written, so the caller
 * must pass a buffer large enough for the longest expected line.
 *
 * @param   fp      stream to read from
 * @param   __dst   destination buffer
 * @return  __dst, or NULL when EOF was reached before any byte was stored
 */
static fstring getLine( FILE *fp, fstring __dst )
{
register int c;
register fstring cs;
register int c;
register fstring cs;
cs = __dst;
while ( ( c = getc( fp ) ) != EOF ) {
if ( c == '\n' ) break;
*cs++ = c;
}
*cs = '\0';
cs = __dst;
while ( ( c = getc( fp ) ) != EOF ) {
if ( c == '\n' ) break;
*cs++ = c;
}
*cs = '\0';
return ( c == EOF && cs == __dst ) ? NULL : __dst;
return ( c == EOF && cs == __dst ) ? NULL : __dst;
}
/*static void printcode( fstring str ) {
@ -59,94 +59,94 @@ static fstring getLine( FILE *fp, fstring __dst )
int main(int argc, char **argv)
{
clock_t s_time, e_time;
char line[__INPUT_LENGTH__] = {0};
int i;
fstring __path__ = NULL, mode = NULL;
clock_t s_time, e_time;
char line[__INPUT_LENGTH__] = {0};
int i;
fstring __path__ = NULL, mode = NULL;
friso_t friso;
friso_config_t config;
friso_task_t task;
friso_t friso;
friso_config_t config;
friso_task_t task;
//get the lexicon directory
for ( i = 0; i < argc; i++ ) {
if ( strcasecmp( "-init", argv[i] ) == 0 ) {
__path__ = argv[i+1];
}
}
if ( __path__ == NULL ) {
println("Usage: friso -init lexicon path");
exit(0);
}
//get the lexicon directory
for ( i = 0; i < argc; i++ ) {
if ( strcasecmp( "-init", argv[i] ) == 0 ) {
__path__ = argv[i+1];
}
}
if ( __path__ == NULL ) {
println("Usage: friso -init lexicon path");
exit(0);
}
s_time = clock();
s_time = clock();
//initialize
friso = friso_new();
config = friso_new_config();
/*friso_dic_t dic = friso_dic_new();
friso_dic_load_from_ifile( dic, __path__, __LENGTH__ );
friso_set_dic( friso, dic );
friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/
if ( friso_init_from_ifile(friso, config, __path__) != 1 ) {
printf("fail to initialize friso and config.");
goto err;
}
//initialize
friso = friso_new();
config = friso_new_config();
/*friso_dic_t dic = friso_dic_new();
friso_dic_load_from_ifile( dic, __path__, __LENGTH__ );
friso_set_dic( friso, dic );
friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/
if ( friso_init_from_ifile(friso, config, __path__) != 1 ) {
printf("fail to initialize friso and config.");
goto err;
}
switch ( config->mode )
{
case __FRISO_SIMPLE_MODE__:
mode = "Simple";
break;
case __FRISO_COMPLEX_MODE__:
mode = "Complex";
break;
case __FRISO_DETECT_MODE__:
mode = "Detect";
break;
}
switch ( config->mode )
{
case __FRISO_SIMPLE_MODE__:
mode = "Simple";
break;
case __FRISO_COMPLEX_MODE__:
mode = "Complex";
break;
case __FRISO_DETECT_MODE__:
mode = "Detect";
break;
}
//friso_set_mode( config, __FRISO_DETECT_MODE__ );
//printf("clr_stw=%d\n", friso->clr_stw);
//printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" ));
//printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究"));
//friso_set_mode( config, __FRISO_DETECT_MODE__ );
//printf("clr_stw=%d\n", friso->clr_stw);
//printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" ));
//printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究"));
e_time = clock();
e_time = clock();
printf("Initialized in %fsec\n", (double) ( e_time - s_time ) / CLOCKS_PER_SEC );
printf("Mode: %s\n", mode);
printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK" );
___ABOUT___;
printf("Initialized in %fsec\n", (double) ( e_time - s_time ) / CLOCKS_PER_SEC );
printf("Mode: %s\n", mode);
printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK" );
___ABOUT___;
//set the task.
task = friso_new_task();
//set the task.
task = friso_new_task();
while ( 1 )
{
print("friso>> ");
getLine( stdin, line );
//exit the programe
if ( strcasecmp( line, "quit" ) == 0 ) {
___EXIT_INFO___
}
while ( 1 )
{
print("friso>> ");
getLine( stdin, line );
//exit the programe
if ( strcasecmp( line, "quit" ) == 0 ) {
___EXIT_INFO___
}
//for ( i = 0; i < 1000000; i++ ) {
//set the task text.
friso_set_text( task, line );
println("分词结果:");
//for ( i = 0; i < 1000000; i++ ) {
//set the task text.
friso_set_text( task, line );
println("分词结果:");
s_time = clock();
while ( ( config->next_token( friso, config, task ) ) != NULL )
{
//printf("%s[%d, %d, %d] ", task->token->word,
// task->token->offset, task->token->length, task->token->rlen );
printf("%s ", task->token->word );
}
//}
e_time = clock();
printf("\nDone, cost < %fsec\n", ( (double)(e_time - s_time) ) / CLOCKS_PER_SEC );
s_time = clock();
while ( ( config->next_token( friso, config, task ) ) != NULL )
{
//printf("%s[%d, %d, %d] ", task->token->word,
// task->token->offset, task->token->length, task->token->rlen );
printf("%s ", task->token->word );
}
//}
e_time = clock();
printf("\nDone, cost < %fsec\n", ( (double)(e_time - s_time) ) / CLOCKS_PER_SEC );
}
}
friso_free_task( task );

View File

@ -1,8 +1,8 @@
/**
* File Explain.
*
* @author chenxin
* @see http://www.webssky.com
* @author chenxin
* @see http://www.webssky.com
*/
#include "friso_API.h"
@ -10,28 +10,28 @@
/*
 * Print the length, size, factor and threshold fields of the given
 * hash table on a single line of standard output.
 *
 * @param   _hash   hash table whose fields are printed
 */
void print_hash_info( friso_hash_t _hash ) {
printf("info:length=%d, size=%d, facotr=%f, threshold=%d\n", _hash->length, \
_hash->size, _hash->factor, _hash->threshold);
_hash->size, _hash->factor, _hash->threshold);
}
int main(int argc, char **argv)
{
friso_hash_t _hash = new_hash_table();
char *names[] = {
"陈满文", "阳清华",
"陈鑫", "罗江艳",
"小燕子", "比比",
"张仁芳", "阳建",
"陈配", "李恒",
"张志刚", "张怡少",
"阳江波", "蔡再利",
"阳绘章", "尹唐文",
"谭志鹏", "肖路德",
"潘凯", "刘潇",
"马朝辉", "张强",
"殷美林", "元明清",
"周安", "郭桥安",
"刘敏", "黄广华",
"李胜", "黄海清"
"陈满文", "阳清华",
"陈鑫", "罗江艳",
"小燕子", "比比",
"张仁芳", "阳建",
"陈配", "李恒",
"张志刚", "张怡少",
"阳江波", "蔡再利",
"阳绘章", "尹唐文",
"谭志鹏", "肖路德",
"潘凯", "刘潇",
"马朝辉", "张强",
"殷美林", "元明清",
"周安", "郭桥安",
"刘敏", "黄广华",
"李胜", "黄海清"
};
//char *str[] = {"陈鑫", "张仁芳", "比比"};
char **str = names;
@ -39,7 +39,7 @@ int main(int argc, char **argv)
print_hash_info( _hash );
for ( j = 0; j < len; j++) {
hash_put_mapping( _hash, names[j], names[j] );
hash_put_mapping( _hash, names[j], names[j] );
}
print_hash_info( _hash );
@ -49,11 +49,11 @@ int main(int argc, char **argv)
//remove mappings
for ( j = 0; j < len; j++ ) {
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
printf("Now, remove %s\n", str[j]);
hash_remove_mapping( _hash, str[j] );
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
printf("*********************************\n");
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
printf("Now, remove %s\n", str[j]);
hash_remove_mapping( _hash, str[j] );
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
printf("*********************************\n");
}
printf("Press any key to continue.");

View File

@ -1,8 +1,8 @@
/*
* lex functions test program.
*
* @author chenxin
* @see http://www.webssky.com
* @author chenxin
* @see http://www.webssky.com
*/
#include "friso.h"
@ -11,10 +11,10 @@
#include <string.h>
#define __LENGTH__ 15
#define ___PRINT_HELP_INFO___ \
printf("1. help print the current menu.\n"); \
printf("2. #set set the classify of the dictionary.\n"); \
printf("3. other search the words in the dictionary.\n"); \
#define ___PRINT_HELP_INFO___ \
printf("1. help print the current menu.\n"); \
printf("2. #set set the classify of the dictionary.\n"); \
printf("3. other search the words in the dictionary.\n"); \
printf("4. quit exit the programe.\n");
int main(int argc, char **argv)
@ -62,30 +62,30 @@ int main(int argc, char **argv)
e_time = clock();
printf("Done, cost: %f sec, size=%d\n", ( double ) ( e_time - s_time ) / CLOCKS_PER_SEC, \
friso_all_dic_size( friso->dic ) );
friso_all_dic_size( friso->dic ) );
while ( 1 ) {
printf("friso-%d>> ", lex);
scanf("%s", _line);
if ( strcmp( _line, "quit" ) == 0 ) {
break;
} else if ( strcmp( _line, "help" ) == 0 ) {
___PRINT_HELP_INFO___
} else if ( strcmp( _line, "#set" ) == 0 ) {
printf("lex_t>> ");
scanf("%d", &lex);
} else {
s_time = clock();
e = friso_dic_get( friso->dic, lex, _line );
e_time = clock();
if ( e != NULL ) {
printf("word=%s, syn=%s, fre=%d, cost:%fsec\n",
e->word, e->syn==NULL? "NULL" : (char *)e->syn->items[0], e->fre,
(double) ( e_time - s_time ) / CLOCKS_PER_SEC );
} else {
printf("%s was not found.\n", _line);
}
}
printf("friso-%d>> ", lex);
scanf("%s", _line);
if ( strcmp( _line, "quit" ) == 0 ) {
break;
} else if ( strcmp( _line, "help" ) == 0 ) {
___PRINT_HELP_INFO___
} else if ( strcmp( _line, "#set" ) == 0 ) {
printf("lex_t>> ");
scanf("%d", &lex);
} else {
s_time = clock();
e = friso_dic_get( friso->dic, lex, _line );
e_time = clock();
if ( e != NULL ) {
printf("word=%s, syn=%s, fre=%d, cost:%fsec\n",
e->word, e->syn==NULL? "NULL" : (char *)e->syn->items[0], e->fre,
(double) ( e_time - s_time ) / CLOCKS_PER_SEC );
} else {
printf("%s was not found.\n", _line);
}
}
}
//friso_dic_free( friso->dic );

View File

@ -1,8 +1,8 @@
/*
* linked list test program.
*
* @author chenxin
* @email chenxin619315@gmail.com
* @author chenxin
* @email chenxin619315@gmail.com
*/
#include "friso_API.h"
@ -13,12 +13,12 @@ int main( int argc, char **args ) {
friso_link_t link;
fstring keys[] = {
"chenmanwen", "yangqinghua",
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
"zhangrenfang", "yangjian",
"liuxiao", "pankai",
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
"caizaili", "panpan", "xiaolude", "yintanwen"
"chenmanwen", "yangqinghua",
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
"zhangrenfang", "yangjian",
"liuxiao", "pankai",
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
"caizaili", "panpan", "xiaolude", "yintanwen"
};
int j, len = sizeof( keys ) / sizeof( fstring );
@ -28,15 +28,15 @@ int main( int argc, char **args ) {
printf("size=%d\n", link->size );
for ( j = 0; j < len; j++ ) {
//link_add( link, keys[j] );
link_list_add_last( link, keys[j] );
//link_add( link, keys[j] );
link_list_add_last( link, keys[j] );
}
printf("size=%d\n", link->size );
for ( j = 0; j < len / 2; j++ ) {
//printf("idx=%d, remove %s\n", j, ( fstring ) link_remove( link, 0 ) );
printf("idx=%d, remove %s\n", j, ( fstring ) link_list_remove_first( link ) );
//printf("idx=%d, remove %s\n", j, ( fstring ) link_remove( link, 0 ) );
printf("idx=%d, remove %s\n", j, ( fstring ) link_list_remove_first( link ) );
}
printf("size=%d\n", link->size );

View File

@ -11,7 +11,7 @@
int main ( int argc, char **args )
{
fstring source = ",I am a chinese,,my name is chenxin,and i am the author of friso,bug report email chenxin619315@gmail.com,qq:1187582057";
fstring source = ",I am a chinese,,my name is chenxin,and i am the author of friso,bug report email chenxin619315@gmail.com,qq:1187582057";
char buffer[128];
string_split_t split = new_string_split(",", source );
@ -20,7 +20,7 @@ int main ( int argc, char **args )
printf("sst->delLen=%d\n", split->delLen);
while ( string_split_next(split, buffer) != NULL) {
printf("buffer:%s\n", buffer);
printf("buffer:%s\n", buffer);
}
free_string_split(split);

View File

@ -1,7 +1,7 @@
/*
* fstring handle mode test program.
*
* @author chenxin <chenxin619315@gmail.com>
* @author chenxin <chenxin619315@gmail.com>
*/
#include "friso_API.h"
@ -20,13 +20,13 @@ int main( int argc, char **args ) {
for ( t = 0; t < length; t += bytes ) {
bytes = get_utf8_bytes( *(str + t) );
if ( bytes == 0 ) continue;
for ( j = 0; j < bytes; j++ )
word[j] = *(str + t + j );
word[j] = '\0';
string_buffer_append( sb, word );
printf("word=%s\n", word );
bytes = get_utf8_bytes( *(str + t) );
if ( bytes == 0 ) continue;
for ( j = 0; j < bytes; j++ )
word[j] = *(str + t + j );
word[j] = '\0';
string_buffer_append( sb, word );
printf("word=%s\n", word );
}
printf("length=%d, buffer=%s\n", sb->length, sb->buffer );