mirror of
https://gitee.com/lionsoul/friso.git
synced 2024-11-29 17:57:38 +08:00
First commit of friso chinese tokenizer
This commit is contained in:
commit
900a1715e5
8
.gitignore
vendored
Normal file
8
.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
help/
|
||||
*.o
|
||||
lib/
|
||||
CHANGES.txt
|
||||
# vim #
|
||||
*.swp
|
||||
*.vim
|
||||
*.viminfo
|
2
DONATE.txt
Normal file
2
DONATE.txt
Normal file
@ -0,0 +1,2 @@
|
||||
如果您愿意, 您可以使用下面的方式来资助作者.
|
||||
1.支付宝: chenxin619315@gmail.com
|
226
LICENSE.md
Normal file
226
LICENSE.md
Normal file
@ -0,0 +1,226 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
==========================================================================
|
||||
The following license applies to the JQuery JavaScript library
|
||||
--------------------------------------------------------------------------
|
||||
Copyright (c) 2010 John Resig, http://jquery.com/
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
28
README.md
Normal file
28
README.md
Normal file
@ -0,0 +1,28 @@
|
||||
一. 关于Friso
|
||||
friso是使用c语言开发的一个中文分词器,使用流行的mmseg算法实现。完全基于模块化设计和实现,可以很方便的植入到其他程序中,例如:MySQL,PHP等。
|
||||
分词准确率高达98.41%。
|
||||
|
||||
详细介绍: https://code.google.com/p/friso/
|
||||
Wiki文档: https://code.google.com/p/friso/w/list
|
||||
开发API: https://code.google.com/p/friso/#四。使用方法
|
||||
|
||||
二. 如何安装:
|
||||
cd src
|
||||
make
|
||||
sudo make install
|
||||
|
||||
三. 如何运行测试程序:
|
||||
friso -lex {friso lex path}
|
||||
|
||||
#for example:
|
||||
#the configuration file is in the $(HOME) directory, named friso.ini
|
||||
#try:
|
||||
friso -init $(HOME)/friso.ini
|
||||
|
||||
四. 如何卸载
|
||||
sudo rm /usr/local/bin/friso
|
||||
|
||||
作者: 狮子的魂
|
||||
email: chenxin619315@gmail.com
|
||||
qq: 1187582057
|
||||
|
64
dict/friso.lex.ini
Normal file
64
dict/friso.lex.ini
Normal file
@ -0,0 +1,64 @@
|
||||
#friso lexicon configuration file.
|
||||
# @email chenxin619315@gmail.com
|
||||
# @date 2012-12-19
|
||||
#main lexicon
|
||||
__LEX_CJK_WORDS__ :[
|
||||
lex-main.lex;
|
||||
lex-admin.lex;
|
||||
lex-chars.lex;
|
||||
lex-cn-mz.lex;
|
||||
lex-cn-place.lex;
|
||||
lex-company.lex;
|
||||
lex-festival.lex;
|
||||
lex-flname.lex;
|
||||
lex-food.lex;
|
||||
lex-lang.lex;
|
||||
lex-nation.lex;
|
||||
lex-net.lex;
|
||||
lex-org.lex;
|
||||
#add more here
|
||||
]
|
||||
#single chinese unit lexicon
|
||||
__LEX_CJK_UNITS__ :[
|
||||
lex-units.lex;
|
||||
]
|
||||
#english and chinese mixed word lexicon (english first, then chinese) like "b超".
|
||||
__LEX_ECM_WORDS__:[
|
||||
lex-ecmix.lex;
|
||||
]
|
||||
#chinese and english mixed word lexicon (chinese first, then english) like "卡拉ok".
|
||||
__LEX_CEM_WORDS__:[
|
||||
lex-cemix.lex;
|
||||
]
|
||||
#chinese last name lexicon.
|
||||
__LEX_CN_LNAME__:[
|
||||
lex-lname.lex;
|
||||
]
|
||||
#single name words lexicon.
|
||||
__LEX_CN_SNAME__:[
|
||||
lex-sname.lex;
|
||||
]
|
||||
#first word of a double chinese name.
|
||||
__LEX_CN_DNAME1__:[
|
||||
lex-dname-1.lex;
|
||||
]
|
||||
#second word of a double chinese name.
|
||||
__LEX_CN_DNAME2__:[
|
||||
lex-dname-2.lex;
|
||||
]
|
||||
#chinese last name decorate word.
|
||||
__LEX_CN_LNA__:[
|
||||
lex-lna.lex;
|
||||
]
|
||||
#stopwords lexicon
|
||||
__LEX_STOPWORDS__:[
|
||||
lex-stopwords.lex;
|
||||
]
|
||||
#english and punctuation mixed words lexicon.
|
||||
__LEX_ENPUN_WORDS__:[
|
||||
lex-en-pun.lex;
|
||||
]
|
||||
#english words(for synonyms words)
|
||||
__LEX_EN_WORDS__:[
|
||||
lex-en.lex;
|
||||
]
|
27
dict/lex-admin.lex
Normal file
27
dict/lex-admin.lex
Normal file
@ -0,0 +1,27 @@
|
||||
人事部/人事管理部门,人事管理部
|
||||
人事管理部/人事管理部门,人事部
|
||||
人事管理部门/人事管理部,人事部
|
||||
信息产业部/null
|
||||
农业部/null
|
||||
医管局/医疗管理部门,医疗管理部
|
||||
医疗管理部/医疗管理部门,医管局
|
||||
医疗管理部门/医管局,医疗管理部
|
||||
发改委/null
|
||||
国土资源部/null
|
||||
国防部/人民武装力量部,军事部,防卫厅
|
||||
军事部/人民武装力量部,防卫厅
|
||||
外交部/国务院,政治部,对外关系部,外务省
|
||||
外交部长/null
|
||||
教育部/null
|
||||
文化部/null
|
||||
民政部/null
|
||||
能源部/null
|
||||
财政部/null
|
||||
铁道部/null
|
||||
防卫厅/null
|
||||
防卫省/null
|
||||
革命委员会/null
|
||||
交通运输部/null
|
||||
对外经济贸易部/null
|
||||
技术部/null
|
||||
总装备部/null
|
6
dict/lex-cemix.lex
Normal file
6
dict/lex-cemix.lex
Normal file
@ -0,0 +1,6 @@
|
||||
#中文英文混合词词库
|
||||
卡拉ok/null
|
||||
漂亮mm/null
|
||||
拳皇ova/拳皇动漫
|
||||
奇都ktv/null
|
||||
哆啦a梦/null
|
12640
dict/lex-chars.lex
Normal file
12640
dict/lex-chars.lex
Normal file
File diff suppressed because it is too large
Load Diff
168
dict/lex-cn-mz.lex
Normal file
168
dict/lex-cn-mz.lex
Normal file
@ -0,0 +1,168 @@
|
||||
汉族/null
|
||||
汉族人/null
|
||||
汉族语/null
|
||||
蒙古族/null
|
||||
蒙古族人/null
|
||||
蒙古族语/null
|
||||
满族/null
|
||||
满族人/null
|
||||
满族语/null
|
||||
朝鲜族/null
|
||||
朝鲜族人/null
|
||||
朝鲜族语/null
|
||||
赫哲族/null
|
||||
赫哲族人/null
|
||||
赫哲族语/null
|
||||
达斡尔族/null
|
||||
达斡尔族人/null
|
||||
达斡尔族语/null
|
||||
鄂温克族/null
|
||||
鄂温克族人/null
|
||||
鄂温克族语/null
|
||||
鄂伦春族/null
|
||||
鄂伦春族人/null
|
||||
鄂伦春族语/null
|
||||
回族/null
|
||||
回族人/null
|
||||
回族语/null
|
||||
东乡族/null
|
||||
东乡族人/null
|
||||
东乡族语/null
|
||||
土族/null
|
||||
土族人/null
|
||||
土族语/null
|
||||
撒拉族/null
|
||||
撒拉族人/null
|
||||
撒拉族语/null
|
||||
保安族/null
|
||||
保安族人/null
|
||||
保安族语/null
|
||||
裕固族/null
|
||||
裕固族人/null
|
||||
裕固族语/null
|
||||
维吾尔族/null
|
||||
维吾尔族人/null
|
||||
维吾尔族语/null
|
||||
哈萨克族/null
|
||||
哈萨克族人/null
|
||||
哈萨克族语/null
|
||||
柯尔克孜族/null
|
||||
柯尔克孜族人/null
|
||||
柯尔克孜族语/null
|
||||
锡伯族/null
|
||||
锡伯族人/null
|
||||
锡伯族语/null
|
||||
塔吉克族/null
|
||||
塔吉克族人/null
|
||||
塔吉克族语/null
|
||||
乌孜别克族/null
|
||||
乌孜别克族人/null
|
||||
乌孜别克族语/null
|
||||
俄罗斯族/null
|
||||
俄罗斯族人/null
|
||||
俄罗斯族语/null
|
||||
塔塔尔族/null
|
||||
塔塔尔族人/null
|
||||
塔塔尔族语/null
|
||||
藏族/null
|
||||
藏族人/null
|
||||
藏族语/null
|
||||
门巴族/null
|
||||
门巴族人/null
|
||||
门巴族语/null
|
||||
珞巴族/null
|
||||
珞巴族人/null
|
||||
珞巴族语/null
|
||||
羌族/null
|
||||
羌族人/null
|
||||
羌族语/null
|
||||
彝族/null
|
||||
彝族人/null
|
||||
彝族语/null
|
||||
白族/null
|
||||
白族人/null
|
||||
白族语/null
|
||||
哈尼族/null
|
||||
哈尼族人/null
|
||||
哈尼族语/null
|
||||
傣族/null
|
||||
傣族人/null
|
||||
傣族语/null
|
||||
僳僳族/null
|
||||
僳僳族人/null
|
||||
僳僳族语/null
|
||||
佤族/null
|
||||
佤族人/null
|
||||
佤族语/null
|
||||
拉祜族/null
|
||||
拉祜族人/null
|
||||
拉祜族语/null
|
||||
纳西族/null
|
||||
纳西族人/null
|
||||
纳西族语/null
|
||||
景颇族/null
|
||||
景颇族人/null
|
||||
景颇族语/null
|
||||
布朗族/null
|
||||
布朗族人/null
|
||||
布朗族语/null
|
||||
阿昌族/null
|
||||
阿昌族人/null
|
||||
阿昌族语/null
|
||||
普米族/null
|
||||
普米族人/null
|
||||
普米族语/null
|
||||
怒族/null
|
||||
怒族人/null
|
||||
怒族语/null
|
||||
德昂族/null
|
||||
德昂族人/null
|
||||
德昂族语/null
|
||||
独龙族/null
|
||||
独龙族人/null
|
||||
独龙族语/null
|
||||
基诺族/null
|
||||
基诺族人/null
|
||||
基诺族语/null
|
||||
苗族/null
|
||||
苗族人/null
|
||||
苗族语/null
|
||||
布依族/null
|
||||
布依族人/null
|
||||
布依族语/null
|
||||
侗族/null
|
||||
侗族人/null
|
||||
侗族语/null
|
||||
水族/null
|
||||
水族人/null
|
||||
水族语/null
|
||||
仡佬族/null
|
||||
仡佬族人/null
|
||||
仡佬族语/null
|
||||
壮族/null
|
||||
壮族人/null
|
||||
壮族语/null
|
||||
瑶族/null
|
||||
瑶族人/null
|
||||
瑶族语/null
|
||||
仫佬族/null
|
||||
仫佬族人/null
|
||||
仫佬族语/null
|
||||
毛南族/null
|
||||
毛南族人/null
|
||||
毛南族语/null
|
||||
京族/null
|
||||
京族人/null
|
||||
京族语/null
|
||||
土家族/null
|
||||
土家族人/null
|
||||
土家族语/null
|
||||
黎族/null
|
||||
黎族人/null
|
||||
黎族语/null
|
||||
畲族/null
|
||||
畲族人/null
|
||||
畲族语/null
|
||||
高山族/null
|
||||
高山族人/null
|
||||
高山族语/null
|
2564
dict/lex-cn-place.lex
Normal file
2564
dict/lex-cn-place.lex
Normal file
File diff suppressed because it is too large
Load Diff
100
dict/lex-company.lex
Normal file
100
dict/lex-company.lex
Normal file
@ -0,0 +1,100 @@
|
||||
央视/null
|
||||
电信/null
|
||||
移动/null
|
||||
网通/null
|
||||
联通/null
|
||||
铁通/null
|
||||
百度/null
|
||||
环球网/null
|
||||
长城网/null
|
||||
新浪/null
|
||||
腾讯/null
|
||||
搜搜/soso
|
||||
谷歌/null
|
||||
雅虎/null
|
||||
微软/null
|
||||
中关村/null
|
||||
搜狐/null
|
||||
网易/null
|
||||
硅谷/null
|
||||
维基百科/null
|
||||
巨人网络/null
|
||||
阿里巴巴/null
|
||||
阿里旺旺/旺旺
|
||||
旺旺/null
|
||||
淘宝/null
|
||||
赶集网/null
|
||||
猪八戒网/null
|
||||
唯你英语/null
|
||||
拉手网/null
|
||||
百贯福泰/null
|
||||
汇划算/null
|
||||
汇划算网/null
|
||||
聚划算/null
|
||||
天猫/null
|
||||
天猫网/null
|
||||
亚马逊/null
|
||||
亚马逊网/null
|
||||
拍拍/null
|
||||
拍拍网/null
|
||||
京东/null
|
||||
京东商城/null
|
||||
返利网/null
|
||||
支付宝/null
|
||||
支付宝担保/null
|
||||
支付宝及时到帐/null
|
||||
支付宝双工能/null
|
||||
财付通/null
|
||||
财付通及时到帐/null
|
||||
网银在线/null
|
||||
苏宁易购/null
|
||||
苏宁电器/null
|
||||
仙童公司/null
|
||||
开源中国/null
|
||||
畅想网络/null
|
||||
快乐大本营/null
|
||||
越策越开心/null
|
||||
超级男声/null
|
||||
超男/null
|
||||
超级女声/null
|
||||
超女/null
|
||||
好声音/null
|
||||
快乐男声/null
|
||||
快男/null
|
||||
快乐女声/null
|
||||
快女/null
|
||||
德克士/null
|
||||
肯德基/null
|
||||
奥利奥/null
|
||||
回头客/null
|
||||
苏波尔/null
|
||||
苏宁/null
|
||||
苏宁电器/null
|
||||
苏宁易购/null
|
||||
中央银行/null
|
||||
人民银行/null
|
||||
工商银行/null
|
||||
农业银行/null
|
||||
中国银行/null
|
||||
建设银行/null
|
||||
交通银行/null
|
||||
华夏银行/null
|
||||
光大银行/null
|
||||
招商银行/null
|
||||
中信银行/null
|
||||
兴业银行/null
|
||||
民生银行/null
|
||||
深圳发展银行/null
|
||||
广东发展银行/null
|
||||
上海浦东发展银行/null
|
||||
恒丰银行/null
|
||||
农业发展银行/null
|
||||
国家进出口信贷银行/null
|
||||
国家开发银行/null
|
||||
北京商业银行/null
|
||||
上海银行/null
|
||||
济南商业银行/null
|
||||
信用社/null
|
||||
农村信用社/null
|
||||
邮政局/null
|
||||
邮政储蓄银行/null
|
210
dict/lex-dname-1.lex
Normal file
210
dict/lex-dname-1.lex
Normal file
@ -0,0 +1,210 @@
|
||||
#双姓名首字词库
|
||||
建
|
||||
小
|
||||
晓
|
||||
文
|
||||
志
|
||||
国
|
||||
玉
|
||||
丽
|
||||
永
|
||||
海
|
||||
春
|
||||
金
|
||||
明
|
||||
新
|
||||
德
|
||||
秀
|
||||
红
|
||||
亚
|
||||
伟
|
||||
雪
|
||||
俊
|
||||
桂
|
||||
爱
|
||||
美
|
||||
世
|
||||
正
|
||||
庆
|
||||
学
|
||||
家
|
||||
立
|
||||
淑
|
||||
振
|
||||
云
|
||||
华
|
||||
光
|
||||
惠
|
||||
兴
|
||||
天
|
||||
长
|
||||
艳
|
||||
慧
|
||||
利
|
||||
宏
|
||||
佳
|
||||
瑞
|
||||
凤
|
||||
荣
|
||||
秋
|
||||
继
|
||||
嘉
|
||||
卫
|
||||
燕
|
||||
思
|
||||
维
|
||||
少
|
||||
福
|
||||
忠
|
||||
宝
|
||||
子
|
||||
成
|
||||
月
|
||||
洪
|
||||
东
|
||||
一
|
||||
泽
|
||||
林
|
||||
大
|
||||
素
|
||||
旭
|
||||
宇
|
||||
智
|
||||
锦
|
||||
冬
|
||||
玲
|
||||
雅
|
||||
伯
|
||||
翠
|
||||
传
|
||||
启
|
||||
剑
|
||||
安
|
||||
树
|
||||
良
|
||||
中
|
||||
梦
|
||||
广
|
||||
昌
|
||||
元
|
||||
万
|
||||
清
|
||||
静
|
||||
友
|
||||
宗
|
||||
兆
|
||||
丹
|
||||
克
|
||||
彩
|
||||
绍
|
||||
喜
|
||||
远
|
||||
朝
|
||||
敏
|
||||
培
|
||||
胜
|
||||
祖
|
||||
先
|
||||
菊
|
||||
士
|
||||
向
|
||||
有
|
||||
连
|
||||
军
|
||||
健
|
||||
巧
|
||||
耀
|
||||
莉
|
||||
英
|
||||
方
|
||||
和
|
||||
仁
|
||||
孝
|
||||
梅
|
||||
汉
|
||||
兰
|
||||
松
|
||||
水
|
||||
江
|
||||
益
|
||||
开
|
||||
景
|
||||
运
|
||||
贵
|
||||
祥
|
||||
青
|
||||
芳
|
||||
碧
|
||||
婷
|
||||
龙
|
||||
鹏
|
||||
自
|
||||
顺
|
||||
双
|
||||
书
|
||||
生
|
||||
义
|
||||
跃
|
||||
银
|
||||
佩
|
||||
雨
|
||||
保
|
||||
贤
|
||||
仲
|
||||
鸿
|
||||
浩
|
||||
加
|
||||
定
|
||||
炳
|
||||
飞
|
||||
锡
|
||||
柏
|
||||
发
|
||||
超
|
||||
道
|
||||
怀
|
||||
进
|
||||
其
|
||||
富
|
||||
平
|
||||
全
|
||||
阳
|
||||
吉
|
||||
茂
|
||||
彦
|
||||
诗
|
||||
洁
|
||||
润
|
||||
承
|
||||
治
|
||||
焕
|
||||
如
|
||||
君
|
||||
增
|
||||
善
|
||||
希
|
||||
根
|
||||
应
|
||||
勇
|
||||
宜
|
||||
守
|
||||
会
|
||||
凯
|
||||
育
|
||||
湘
|
||||
凌
|
||||
本
|
||||
敬
|
||||
博
|
||||
延
|
||||
乐
|
||||
三
|
||||
高
|
||||
熙
|
||||
逸
|
||||
幸
|
||||
灵
|
||||
宣
|
||||
才
|
||||
述
|
||||
化
|
210
dict/lex-dname-2.lex
Normal file
210
dict/lex-dname-2.lex
Normal file
@ -0,0 +1,210 @@
|
||||
#双姓名尾字词库
|
||||
华
|
||||
平
|
||||
明
|
||||
英
|
||||
军
|
||||
林
|
||||
萍
|
||||
芳
|
||||
玲
|
||||
红
|
||||
生
|
||||
霞
|
||||
梅
|
||||
文
|
||||
荣
|
||||
珍
|
||||
兰
|
||||
娟
|
||||
峰
|
||||
琴
|
||||
云
|
||||
辉
|
||||
东
|
||||
龙
|
||||
敏
|
||||
伟
|
||||
强
|
||||
丽
|
||||
春
|
||||
杰
|
||||
燕
|
||||
民
|
||||
君
|
||||
波
|
||||
国
|
||||
芬
|
||||
清
|
||||
祥
|
||||
斌
|
||||
婷
|
||||
飞
|
||||
良
|
||||
忠
|
||||
新
|
||||
凤
|
||||
锋
|
||||
成
|
||||
勇
|
||||
刚
|
||||
玉
|
||||
元
|
||||
宇
|
||||
海
|
||||
兵
|
||||
安
|
||||
庆
|
||||
涛
|
||||
鹏
|
||||
亮
|
||||
青
|
||||
阳
|
||||
艳
|
||||
松
|
||||
江
|
||||
莲
|
||||
娜
|
||||
兴
|
||||
光
|
||||
德
|
||||
武
|
||||
香
|
||||
俊
|
||||
秀
|
||||
慧
|
||||
雄
|
||||
才
|
||||
宏
|
||||
群
|
||||
琼
|
||||
胜
|
||||
超
|
||||
彬
|
||||
莉
|
||||
中
|
||||
山
|
||||
富
|
||||
花
|
||||
宁
|
||||
利
|
||||
贵
|
||||
福
|
||||
发
|
||||
义
|
||||
蓉
|
||||
喜
|
||||
娥
|
||||
昌
|
||||
仁
|
||||
志
|
||||
全
|
||||
宝
|
||||
权
|
||||
美
|
||||
琳
|
||||
建
|
||||
金
|
||||
贤
|
||||
星
|
||||
丹
|
||||
根
|
||||
和
|
||||
珠
|
||||
康
|
||||
菊
|
||||
琪
|
||||
坤
|
||||
泉
|
||||
秋
|
||||
静
|
||||
佳
|
||||
顺
|
||||
源
|
||||
珊
|
||||
达
|
||||
欣
|
||||
如
|
||||
莹
|
||||
章
|
||||
浩
|
||||
勤
|
||||
芹
|
||||
容
|
||||
友
|
||||
芝
|
||||
豪
|
||||
洁
|
||||
鑫
|
||||
惠
|
||||
洪
|
||||
旺
|
||||
虎
|
||||
远
|
||||
妮
|
||||
森
|
||||
妹
|
||||
南
|
||||
雯
|
||||
奇
|
||||
健
|
||||
卿
|
||||
虹
|
||||
娇
|
||||
媛
|
||||
怡
|
||||
铭
|
||||
川
|
||||
进
|
||||
博
|
||||
智
|
||||
来
|
||||
琦
|
||||
学
|
||||
聪
|
||||
洋
|
||||
乐
|
||||
年
|
||||
翔
|
||||
然
|
||||
栋
|
||||
凯
|
||||
颖
|
||||
鸣
|
||||
丰
|
||||
瑞
|
||||
奎
|
||||
立
|
||||
堂
|
||||
威
|
||||
雪
|
||||
鸿
|
||||
晶
|
||||
桂
|
||||
凡
|
||||
娣
|
||||
先
|
||||
洲
|
||||
毅
|
||||
雅
|
||||
月
|
||||
旭
|
||||
田
|
||||
晖
|
||||
方
|
||||
恒
|
||||
亚
|
||||
泽
|
||||
风
|
||||
银
|
||||
高
|
||||
贞
|
||||
九
|
||||
薇
|
||||
钰
|
||||
城
|
||||
宜
|
||||
厚
|
||||
耐
|
||||
声
|
||||
腾
|
124
dict/lex-ecmix.lex
Normal file
124
dict/lex-ecmix.lex
Normal file
@ -0,0 +1,124 @@
|
||||
#中英混合字, 注意英文字符均为小写
|
||||
a咖
|
||||
a片
|
||||
a座
|
||||
a股
|
||||
a型
|
||||
a杯
|
||||
a罩杯
|
||||
a计划
|
||||
aa制
|
||||
ab型
|
||||
ab档案
|
||||
a梦
|
||||
#b
|
||||
b座
|
||||
b股
|
||||
b型
|
||||
b计划
|
||||
b超
|
||||
b杯
|
||||
b罩杯
|
||||
bb机
|
||||
bb仔
|
||||
bp机
|
||||
#c
|
||||
c盘
|
||||
c座
|
||||
c语言
|
||||
c杯
|
||||
c罩杯
|
||||
cd盒
|
||||
cd机
|
||||
call机
|
||||
#d
|
||||
d盘
|
||||
d座
|
||||
d版
|
||||
d杯
|
||||
d罩杯
|
||||
dna鉴定
|
||||
#e
|
||||
e盘
|
||||
e座
|
||||
e化
|
||||
e通
|
||||
e仔
|
||||
e语言
|
||||
e杯
|
||||
e罩杯
|
||||
#f
|
||||
f盘
|
||||
f座
|
||||
f杯
|
||||
f罩杯
|
||||
#g
|
||||
g盘
|
||||
g点
|
||||
g杯
|
||||
g罩杯
|
||||
#h
|
||||
h盘
|
||||
h股
|
||||
h杯
|
||||
h罩杯
|
||||
#i
|
||||
i盘
|
||||
ic卡
|
||||
ip卡
|
||||
ip段
|
||||
ip电话
|
||||
ip地址
|
||||
it行业
|
||||
it民工
|
||||
it男
|
||||
#j
|
||||
j盘
|
||||
#k
|
||||
k仔
|
||||
k盘
|
||||
k党
|
||||
k书
|
||||
k粉
|
||||
k歌
|
||||
k他命
|
||||
k歌之王
|
||||
#n
|
||||
n年
|
||||
#o
|
||||
o型
|
||||
#p
|
||||
pc机
|
||||
ph值
|
||||
#s
|
||||
sim卡
|
||||
#u
|
||||
u盘
|
||||
u形
|
||||
usb手指
|
||||
usb接口
|
||||
usb插口
|
||||
usb记忆棒
|
||||
#v
|
||||
visa卡
|
||||
v沟
|
||||
#z
|
||||
z盘
|
||||
#q
|
||||
q版
|
||||
qq号
|
||||
q立方
|
||||
#r
|
||||
rss订阅
|
||||
#t
|
||||
t盘
|
||||
#x
|
||||
x光
|
||||
x光线
|
||||
x射线
|
||||
γ射线
|
||||
#t
|
||||
t恤衫
|
||||
t恤
|
||||
t字帐
|
||||
t型台
|
4
dict/lex-en-pun.lex
Normal file
4
dict/lex-en-pun.lex
Normal file
@ -0,0 +1,4 @@
|
||||
#英文和标点组合成的词,英文字母统一使用小写。
|
||||
c++
|
||||
g++
|
||||
c#
|
4
dict/lex-en.lex
Normal file
4
dict/lex-en.lex
Normal file
@ -0,0 +1,4 @@
|
||||
#英文词条, 做英文词语同义词追加用
|
||||
decimal/decimals,fraction
|
||||
spirit/mind
|
||||
admire/appreciate,like,love,enjoy
|
186
dict/lex-festival.lex
Normal file
186
dict/lex-festival.lex
Normal file
@ -0,0 +1,186 @@
|
||||
七七纪念日/null
|
||||
七夕/七夕情人节,情人节,中国情人节
|
||||
七夕情人节/七夕,中国情人节,情人节
|
||||
七夕节/七夕,情人节,中国情人节
|
||||
万圣节/鬼节
|
||||
世界人权日/null
|
||||
世界儿歌节/null
|
||||
世界儿童节/null
|
||||
世界动物日/null
|
||||
世界卫生日/null
|
||||
世界地球日/null
|
||||
世界教师日/null
|
||||
世界无烟日/null
|
||||
世界无童工日/null
|
||||
世界林业节/null
|
||||
世界森林日/null
|
||||
世界水日/null
|
||||
世界海洋日/null
|
||||
世界湿地日/null
|
||||
世界献血日/null
|
||||
世界环境日/null
|
||||
世界电视日/null
|
||||
世界睡眠日/null
|
||||
世界粮食日/null
|
||||
世界精神卫生日/null
|
||||
世界红十字日/null
|
||||
世界问候日/null
|
||||
中国人民抗日战争纪念日/null
|
||||
抗日战争纪念日/null
|
||||
中国国耻日/null
|
||||
中国学生营养日/null
|
||||
中国爱牙日/null
|
||||
中国爱耳日/null
|
||||
中国青年志愿者服务日/null
|
||||
中国青年节/null
|
||||
中秋/null
|
||||
中秋节/null
|
||||
人口日/null
|
||||
人权日/null
|
||||
儿歌节/null
|
||||
儿童节/null
|
||||
元宵/null
|
||||
元宵节/null
|
||||
元旦/null
|
||||
元旦节/null
|
||||
党生日/null
|
||||
全国中小学生安全教育日/null
|
||||
全国助残日/null
|
||||
全国爱眼日/null
|
||||
全国爱耳日/null
|
||||
六十亿人口日/null
|
||||
六四纪念日/null
|
||||
冬至/null
|
||||
减轻自然灾害日/null
|
||||
动物日/null
|
||||
助残日/null
|
||||
劳动妇女节/null
|
||||
劳动节/null
|
||||
博物馆日/null
|
||||
卫生日/null
|
||||
和平日/null
|
||||
国庆/null
|
||||
国庆节/null
|
||||
国耻日/null
|
||||
国际儿童节/null
|
||||
国际减轻自然灾害日/null
|
||||
国际劳动妇女节/null
|
||||
国际劳动节/null
|
||||
国际博物馆日/null
|
||||
国际和平日/null
|
||||
国际奥林匹克日/null
|
||||
国际妇女节/null
|
||||
国际容忍日/null
|
||||
国际左撇子日/null
|
||||
国际志愿者日/null
|
||||
国际护士节/null
|
||||
国际无车日/null
|
||||
国际残疾人日/null
|
||||
国际母语日/null
|
||||
国际气象节/null
|
||||
国际消费者权益日/null
|
||||
国际牛奶日/null
|
||||
国际盲人节/null
|
||||
国际禁毒日/null
|
||||
国际老人日/null
|
||||
国际臭氧层保护日/null
|
||||
国际非洲儿童日/null
|
||||
国际音乐日/null
|
||||
国际麻风日/null
|
||||
圣诞节/null
|
||||
地球日/null
|
||||
处暑/null
|
||||
复活节/null
|
||||
夏至/null
|
||||
大寒/null
|
||||
大暑/null
|
||||
大雪/null
|
||||
奥林匹克日/null
|
||||
妇女节/null
|
||||
三八节/null
|
||||
三八妇女节/null
|
||||
学生营养日/null
|
||||
安全教育日/null
|
||||
安全日/null
|
||||
容忍日/null
|
||||
寒露/null
|
||||
小寒/null
|
||||
小年/null
|
||||
小暑/null
|
||||
小满/null
|
||||
小雪/null
|
||||
左撇子日/null
|
||||
平安夜/null
|
||||
建党日/null
|
||||
建军节/null
|
||||
志愿人员日/null
|
||||
志愿者日/null
|
||||
情人节/null
|
||||
惊蛰/null
|
||||
愚人节/null
|
||||
感恩节/null
|
||||
扫房日/null
|
||||
抗日战争纪念日/null
|
||||
抗日纪念日/null
|
||||
护士节/null
|
||||
教师日/null
|
||||
教师节/null
|
||||
文化遗产日/null
|
||||
无烟日/null
|
||||
无童工日/null
|
||||
无车日/null
|
||||
春分/null
|
||||
春节/null
|
||||
植树节/null
|
||||
残疾人日/null
|
||||
母亲节/null
|
||||
母语日/null
|
||||
气象节/null
|
||||
水日/null
|
||||
海洋日/null
|
||||
消费者权益日/null
|
||||
清明/null
|
||||
清明节/null
|
||||
湿地日/null
|
||||
爱牙日/null
|
||||
爱眼日/null
|
||||
爱耳日/null
|
||||
父亲节/null
|
||||
牛奶日/null
|
||||
独立日/null
|
||||
献血日/null
|
||||
环境日/null
|
||||
电视日/null
|
||||
白露/null
|
||||
盲人节/null
|
||||
睡眠日/null
|
||||
秋分/null
|
||||
立冬/null
|
||||
立夏/null
|
||||
立春/null
|
||||
立秋/null
|
||||
端午节/null
|
||||
粮食日/null
|
||||
精神卫生日/null
|
||||
红十字日/null
|
||||
老人日/null
|
||||
联合国日/null
|
||||
腊八节/null
|
||||
腊日/null
|
||||
臭氧保护日/null
|
||||
臭氧层保护日/null
|
||||
芒种/null
|
||||
营养日/null
|
||||
谷雨/null
|
||||
重阳/null
|
||||
重阳节/null
|
||||
问候日/null
|
||||
除夕/null
|
||||
雨水/null
|
||||
霜降/null
|
||||
青年志愿者服务日/null
|
||||
青年节/null
|
||||
非洲儿童日/null
|
||||
音乐日/null
|
||||
麻风日/null
|
||||
龙头节/null
|
10
dict/lex-flname.lex
Normal file
10
dict/lex-flname.lex
Normal file
@ -0,0 +1,10 @@
|
||||
亚历山大/null
|
||||
克林顿/null
|
||||
克里斯汀/null
|
||||
布什/null
|
||||
布莱尔/null
|
||||
科特勒/null
|
||||
约翰/null
|
||||
约翰逊/null
|
||||
蒂娜/null
|
||||
安妮/null
|
12
dict/lex-food.lex
Normal file
12
dict/lex-food.lex
Normal file
@ -0,0 +1,12 @@
|
||||
雪碧/null
|
||||
可口可乐/null
|
||||
冰红茶/null
|
||||
奶茶/null
|
||||
花生奶/null
|
||||
芬达/null
|
||||
珍珠奶茶/null
|
||||
达利源/null
|
||||
肯德鸡/null
|
||||
炸薯条/null
|
||||
麻辣烫/null
|
||||
麻辣干锅/null
|
20
dict/lex-lang.lex
Normal file
20
dict/lex-lang.lex
Normal file
@ -0,0 +1,20 @@
|
||||
中文/国语
|
||||
国语/null
|
||||
台湾话/台语
|
||||
台语/台湾话
|
||||
客家话/null
|
||||
汉字/null
|
||||
汉语/国语,中文
|
||||
法文/法文
|
||||
法语/法语
|
||||
福建话/null
|
||||
粤语/广东话
|
||||
美语/英语,英文
|
||||
英文/英语
|
||||
英语/英文
|
||||
西班牙语/null
|
||||
闽南语/null
|
||||
泰语/null
|
||||
西班牙语/null
|
||||
俄罗斯语/null
|
||||
拉丁语/null
|
4
dict/lex-lna.lex
Normal file
4
dict/lex-lna.lex
Normal file
@ -0,0 +1,4 @@
|
||||
#姓氏修饰,例如:老陈,小陈,中的老,小
|
||||
#如果他已经是姓氏(lex-lname.lex中的词),则无须放在这里。
|
||||
老
|
||||
小
|
513
dict/lex-lname.lex
Normal file
513
dict/lex-lname.lex
Normal file
@ -0,0 +1,513 @@
|
||||
#中文姓氏词库
|
||||
#单姓
|
||||
王
|
||||
李
|
||||
张
|
||||
刘
|
||||
陈
|
||||
杨
|
||||
周
|
||||
黄
|
||||
孙
|
||||
吴
|
||||
徐
|
||||
赵
|
||||
林
|
||||
胡
|
||||
朱
|
||||
梁
|
||||
郭
|
||||
高
|
||||
何
|
||||
马
|
||||
郑
|
||||
罗
|
||||
宋
|
||||
唐
|
||||
谢
|
||||
叶
|
||||
韩
|
||||
任
|
||||
潘
|
||||
于
|
||||
冯
|
||||
蒋
|
||||
董
|
||||
吕
|
||||
邓
|
||||
许
|
||||
曹
|
||||
曾
|
||||
袁
|
||||
汪
|
||||
程
|
||||
田
|
||||
彭
|
||||
钟
|
||||
蔡
|
||||
魏
|
||||
沈
|
||||
方
|
||||
卢
|
||||
余
|
||||
杜
|
||||
丁
|
||||
苏
|
||||
贾
|
||||
姚
|
||||
姜
|
||||
陆
|
||||
戴
|
||||
傅
|
||||
夏
|
||||
廖
|
||||
萧
|
||||
石
|
||||
江
|
||||
范
|
||||
今
|
||||
谭
|
||||
邹
|
||||
崔
|
||||
薛
|
||||
邱
|
||||
康
|
||||
史
|
||||
侯
|
||||
邵
|
||||
熊
|
||||
秦
|
||||
雷
|
||||
孟
|
||||
庞
|
||||
白
|
||||
毛
|
||||
郝
|
||||
钱
|
||||
段
|
||||
俞
|
||||
洪
|
||||
汤
|
||||
顾
|
||||
贺
|
||||
龚
|
||||
尹
|
||||
万
|
||||
龙
|
||||
赖
|
||||
章
|
||||
孔
|
||||
武
|
||||
邢
|
||||
颜
|
||||
梅
|
||||
阮
|
||||
黎
|
||||
常
|
||||
倪
|
||||
施
|
||||
乔
|
||||
樊
|
||||
严
|
||||
齐
|
||||
陶
|
||||
#向
|
||||
温
|
||||
文
|
||||
易
|
||||
兰
|
||||
闫
|
||||
芦
|
||||
牛
|
||||
尚
|
||||
安
|
||||
管
|
||||
殷
|
||||
霍
|
||||
翟
|
||||
佘
|
||||
葛
|
||||
庄
|
||||
伍
|
||||
辛
|
||||
练
|
||||
申
|
||||
付
|
||||
曲
|
||||
焦
|
||||
项
|
||||
代
|
||||
鲁
|
||||
季
|
||||
覃
|
||||
覃
|
||||
毕
|
||||
麦
|
||||
阳
|
||||
耿
|
||||
舒
|
||||
聂
|
||||
盛
|
||||
童
|
||||
祝
|
||||
柳
|
||||
单
|
||||
单
|
||||
岳
|
||||
骆
|
||||
纪
|
||||
欧
|
||||
房
|
||||
左
|
||||
尤
|
||||
凌
|
||||
韦
|
||||
景
|
||||
詹
|
||||
莫
|
||||
郎
|
||||
路
|
||||
宁
|
||||
宁
|
||||
关
|
||||
丛
|
||||
翁
|
||||
容
|
||||
亢
|
||||
柯
|
||||
鲍
|
||||
蒲
|
||||
苗
|
||||
牟
|
||||
谷
|
||||
裴
|
||||
商
|
||||
初
|
||||
屈
|
||||
成
|
||||
包
|
||||
游
|
||||
司
|
||||
祁
|
||||
强
|
||||
靳
|
||||
甘
|
||||
席
|
||||
瞿
|
||||
卜
|
||||
褚
|
||||
解
|
||||
臧
|
||||
时
|
||||
费
|
||||
班
|
||||
华
|
||||
全
|
||||
涂
|
||||
卓
|
||||
党
|
||||
饶
|
||||
应
|
||||
卫
|
||||
丘
|
||||
隋
|
||||
米
|
||||
闵
|
||||
畅
|
||||
喻
|
||||
冉
|
||||
宫
|
||||
甄
|
||||
宣
|
||||
穆
|
||||
谈
|
||||
匡
|
||||
帅
|
||||
车
|
||||
母
|
||||
查
|
||||
戚
|
||||
符
|
||||
缪
|
||||
昌
|
||||
娄
|
||||
滕
|
||||
位
|
||||
奚
|
||||
边
|
||||
卞
|
||||
桂
|
||||
邝
|
||||
苟
|
||||
柏
|
||||
井
|
||||
冀
|
||||
邬
|
||||
吉
|
||||
敖
|
||||
桑
|
||||
池
|
||||
简
|
||||
蔺
|
||||
连
|
||||
艾
|
||||
蓝
|
||||
窦
|
||||
刚
|
||||
封
|
||||
占
|
||||
迟
|
||||
姬
|
||||
刁
|
||||
栾
|
||||
冷
|
||||
杭
|
||||
植
|
||||
郁
|
||||
晋
|
||||
虞
|
||||
佟
|
||||
苑
|
||||
屠
|
||||
藏
|
||||
蒙
|
||||
占
|
||||
辜
|
||||
廉
|
||||
巩
|
||||
麻
|
||||
晏
|
||||
相
|
||||
师
|
||||
鄢
|
||||
泮
|
||||
燕
|
||||
岑
|
||||
官
|
||||
仲
|
||||
羊
|
||||
揭
|
||||
仇
|
||||
邸
|
||||
宗
|
||||
荆
|
||||
盖
|
||||
盖
|
||||
粱
|
||||
原
|
||||
茅
|
||||
荣
|
||||
沙
|
||||
郜
|
||||
巫
|
||||
鞠
|
||||
罡
|
||||
未
|
||||
来
|
||||
劳
|
||||
诸
|
||||
计
|
||||
乐
|
||||
乐
|
||||
双
|
||||
花
|
||||
冼
|
||||
尉
|
||||
木
|
||||
丰
|
||||
寇
|
||||
栗
|
||||
况
|
||||
干
|
||||
楼
|
||||
满
|
||||
桑
|
||||
湛
|
||||
谌
|
||||
储
|
||||
邦
|
||||
皮
|
||||
楚
|
||||
胥
|
||||
明
|
||||
平
|
||||
腾
|
||||
厉
|
||||
仉
|
||||
励
|
||||
竺
|
||||
闻
|
||||
宇
|
||||
支
|
||||
都
|
||||
折
|
||||
旷
|
||||
南
|
||||
战
|
||||
嵇
|
||||
化
|
||||
糜
|
||||
衣
|
||||
国
|
||||
逄
|
||||
门
|
||||
崇
|
||||
裘
|
||||
薄
|
||||
束
|
||||
宿
|
||||
东
|
||||
降
|
||||
逯
|
||||
伊
|
||||
修
|
||||
粟
|
||||
漆
|
||||
阙
|
||||
禹
|
||||
先
|
||||
银
|
||||
台
|
||||
#和
|
||||
祖
|
||||
惠
|
||||
伦
|
||||
候
|
||||
阚
|
||||
慕
|
||||
戈
|
||||
富
|
||||
伏
|
||||
僧
|
||||
习
|
||||
云
|
||||
元
|
||||
狄
|
||||
危
|
||||
雍
|
||||
蔚
|
||||
索
|
||||
居
|
||||
浦
|
||||
权
|
||||
税
|
||||
谯
|
||||
於
|
||||
芮
|
||||
濮
|
||||
基
|
||||
寿
|
||||
凡
|
||||
卿
|
||||
酆
|
||||
苻
|
||||
保
|
||||
郗
|
||||
渠
|
||||
琚
|
||||
淡
|
||||
由
|
||||
豆
|
||||
扈
|
||||
仁
|
||||
呼
|
||||
矫
|
||||
巢
|
||||
盘
|
||||
敬
|
||||
巴
|
||||
茆
|
||||
鱼
|
||||
戎
|
||||
缠
|
||||
区
|
||||
幸
|
||||
海
|
||||
弓
|
||||
阴
|
||||
住
|
||||
晁
|
||||
菅
|
||||
印
|
||||
汝
|
||||
历
|
||||
么
|
||||
乌
|
||||
贡
|
||||
妙
|
||||
禤
|
||||
荀
|
||||
鹿
|
||||
邰
|
||||
随
|
||||
雒
|
||||
贝
|
||||
录
|
||||
鲜
|
||||
茹
|
||||
种
|
||||
农
|
||||
佐
|
||||
赫
|
||||
字
|
||||
油
|
||||
#但
|
||||
綦
|
||||
美
|
||||
利
|
||||
钮
|
||||
信
|
||||
勾
|
||||
火
|
||||
昝
|
||||
圣
|
||||
颉
|
||||
从
|
||||
靖
|
||||
开
|
||||
公
|
||||
那
|
||||
山
|
||||
智
|
||||
补
|
||||
虎
|
||||
才
|
||||
布
|
||||
亓
|
||||
药
|
||||
造
|
||||
普
|
||||
五
|
||||
仝
|
||||
扆
|
||||
暴
|
||||
咸
|
||||
庚
|
||||
奕
|
||||
锺
|
||||
问
|
||||
招
|
||||
贵
|
||||
巨
|
||||
檀
|
||||
厚
|
||||
恽
|
||||
过
|
||||
达
|
||||
邴
|
||||
洛
|
||||
忻
|
||||
展
|
||||
户
|
||||
毋
|
||||
暨
|
||||
#复姓
|
||||
欧阳
|
||||
上官
|
||||
司徒
|
||||
刘付
|
||||
皇甫
|
||||
长孙
|
||||
相里
|
||||
令狐
|
||||
诸葛
|
169450
dict/lex-main.lex
Normal file
169450
dict/lex-main.lex
Normal file
File diff suppressed because it is too large
Load Diff
53
dict/lex-nation.lex
Normal file
53
dict/lex-nation.lex
Normal file
@ -0,0 +1,53 @@
|
||||
东非/null
|
||||
中华/null
|
||||
中华/null
|
||||
中华人民共和国/null
|
||||
中华民国/null
|
||||
中国/null
|
||||
中非/null
|
||||
乌克兰/null
|
||||
也门/null
|
||||
以色列/null
|
||||
伊拉克/null
|
||||
伊朗/null
|
||||
俄罗斯/null
|
||||
分类/null
|
||||
加拿大/null
|
||||
南非/null
|
||||
古巴/null
|
||||
台湾/null
|
||||
埃及/null
|
||||
塞尔维亚/null
|
||||
墨西哥/null
|
||||
威尔士/null
|
||||
尼日利亚/null
|
||||
巴比伦/null
|
||||
希腊/null
|
||||
德国/null
|
||||
德意志/null
|
||||
意大利/null
|
||||
捷克/null
|
||||
日本/null
|
||||
朝鲜/null
|
||||
比利时/null
|
||||
法兰西/null
|
||||
法国/null
|
||||
波兰/null
|
||||
波黑/null
|
||||
瑞典/null
|
||||
瑞士/null
|
||||
白俄罗斯/null
|
||||
缅甸/null
|
||||
美利坚/null
|
||||
美利坚合众国/null
|
||||
美国/null
|
||||
老挝/null
|
||||
苏格兰/null
|
||||
苏联/null
|
||||
英国/null
|
||||
英格兰/null
|
||||
葡萄牙/null
|
||||
蒙古/null
|
||||
西班牙/null
|
||||
越南/null
|
||||
韩国/null
|
18
dict/lex-net.lex
Normal file
18
dict/lex-net.lex
Normal file
@ -0,0 +1,18 @@
|
||||
油条哥/null
|
||||
活雷锋/null
|
||||
夕阳红/null
|
||||
帮扶村/null
|
||||
后援会/null
|
||||
复炸油/null
|
||||
献血哥/null
|
||||
放心姐/null
|
||||
啃老族/null
|
||||
特训班/null
|
||||
平头男/null
|
||||
爆头哥/null
|
||||
楼主/null
|
||||
有两把刷子/null
|
||||
超女/超级女声
|
||||
快男/快乐男生
|
||||
非典/null
|
||||
吊丝/null
|
15
dict/lex-org.lex
Normal file
15
dict/lex-org.lex
Normal file
@ -0,0 +1,15 @@
|
||||
上海合作组织/null
|
||||
世卫/null
|
||||
世界卫生组织/null
|
||||
世界银行/null
|
||||
东盟/null
|
||||
亚太经合组织/null
|
||||
人权理事会/null
|
||||
六方会谈/null
|
||||
北约/null
|
||||
哈马斯/null
|
||||
安全理事会/null
|
||||
安理会/null
|
||||
欧佩克/null
|
||||
红十字会/null
|
||||
联合国/null
|
207
dict/lex-sname.lex
Normal file
207
dict/lex-sname.lex
Normal file
@ -0,0 +1,207 @@
|
||||
#中文单名词库
|
||||
敏
|
||||
伟
|
||||
勇
|
||||
军
|
||||
斌
|
||||
静
|
||||
丽
|
||||
涛
|
||||
芳
|
||||
杰
|
||||
萍
|
||||
强
|
||||
俊
|
||||
明
|
||||
燕
|
||||
磊
|
||||
玲
|
||||
华
|
||||
平
|
||||
鹏
|
||||
健
|
||||
波
|
||||
红
|
||||
丹
|
||||
辉
|
||||
超
|
||||
艳
|
||||
莉
|
||||
刚
|
||||
娟
|
||||
峰
|
||||
婷
|
||||
亮
|
||||
洁
|
||||
颖
|
||||
琳
|
||||
英
|
||||
慧
|
||||
飞
|
||||
霞
|
||||
浩
|
||||
凯
|
||||
宇
|
||||
毅
|
||||
林
|
||||
佳
|
||||
云
|
||||
莹
|
||||
娜
|
||||
晶
|
||||
洋
|
||||
文
|
||||
鑫
|
||||
欣
|
||||
琴
|
||||
宁
|
||||
琼
|
||||
兵
|
||||
青
|
||||
琦
|
||||
翔
|
||||
彬
|
||||
锋
|
||||
阳
|
||||
璐
|
||||
旭
|
||||
蕾
|
||||
剑
|
||||
虹
|
||||
蓉
|
||||
建
|
||||
倩
|
||||
梅
|
||||
宏
|
||||
威
|
||||
博
|
||||
君
|
||||
力
|
||||
龙
|
||||
晨
|
||||
薇
|
||||
雪
|
||||
琪
|
||||
欢
|
||||
荣
|
||||
江
|
||||
炜
|
||||
成
|
||||
庆
|
||||
冰
|
||||
东
|
||||
帆
|
||||
雷
|
||||
楠
|
||||
锐
|
||||
进
|
||||
海
|
||||
凡
|
||||
巍
|
||||
维
|
||||
迪
|
||||
媛
|
||||
玮
|
||||
杨
|
||||
群
|
||||
瑛
|
||||
悦
|
||||
春
|
||||
瑶
|
||||
婧
|
||||
兰
|
||||
茜
|
||||
松
|
||||
爽
|
||||
立
|
||||
瑜
|
||||
睿
|
||||
晖
|
||||
聪
|
||||
帅
|
||||
瑾
|
||||
骏
|
||||
雯
|
||||
晓
|
||||
昊
|
||||
勤
|
||||
新
|
||||
瑞
|
||||
岩
|
||||
星
|
||||
忠
|
||||
志
|
||||
怡
|
||||
坤
|
||||
康
|
||||
航
|
||||
利
|
||||
畅
|
||||
坚
|
||||
雄
|
||||
智
|
||||
萌
|
||||
哲
|
||||
岚
|
||||
洪
|
||||
捷
|
||||
珊
|
||||
恒
|
||||
靖
|
||||
清
|
||||
扬
|
||||
昕
|
||||
乐
|
||||
武
|
||||
玉
|
||||
诚
|
||||
菲
|
||||
锦
|
||||
凤
|
||||
珍
|
||||
晔
|
||||
妍
|
||||
璇
|
||||
胜
|
||||
菁
|
||||
科
|
||||
芬
|
||||
露
|
||||
越
|
||||
彤
|
||||
曦
|
||||
义
|
||||
良
|
||||
鸣
|
||||
芸
|
||||
方
|
||||
月
|
||||
铭
|
||||
光
|
||||
震
|
||||
冬
|
||||
源
|
||||
政
|
||||
虎
|
||||
莎
|
||||
彪
|
||||
蓓
|
||||
钢
|
||||
凌
|
||||
奇
|
||||
卫
|
||||
彦
|
||||
烨
|
||||
可
|
||||
黎
|
||||
川
|
||||
淼
|
||||
惠
|
||||
祥
|
||||
然
|
||||
三
|
||||
逗
|
||||
高
|
||||
潇
|
||||
正
|
||||
硕
|
887
dict/lex-stopwords.lex
Normal file
887
dict/lex-stopwords.lex
Normal file
@ -0,0 +1,887 @@
|
||||
#en-punctuation
|
||||
!
|
||||
"
|
||||
#
|
||||
$
|
||||
%
|
||||
&
|
||||
'
|
||||
(
|
||||
)
|
||||
*
|
||||
+
|
||||
,
|
||||
-
|
||||
.
|
||||
/
|
||||
#0
|
||||
#1
|
||||
#2
|
||||
#3
|
||||
#4
|
||||
#5
|
||||
#6
|
||||
#7
|
||||
#8
|
||||
#9
|
||||
:
|
||||
;
|
||||
<
|
||||
=
|
||||
>
|
||||
?
|
||||
@
|
||||
[
|
||||
\
|
||||
]
|
||||
^
|
||||
_
|
||||
`
|
||||
#a
|
||||
#b
|
||||
#c
|
||||
#d
|
||||
#e
|
||||
#f
|
||||
#g
|
||||
#h
|
||||
#i
|
||||
#j
|
||||
#k
|
||||
#l
|
||||
#m
|
||||
#n
|
||||
#o
|
||||
#p
|
||||
#q
|
||||
#r
|
||||
#s
|
||||
#t
|
||||
#u
|
||||
#v
|
||||
#w
|
||||
#x
|
||||
#y
|
||||
#z
|
||||
{
|
||||
|
|
||||
}
|
||||
~
|
||||
!
|
||||
#fullwidth
|
||||
!
|
||||
"
|
||||
#
|
||||
$
|
||||
%
|
||||
&
|
||||
'
|
||||
(
|
||||
)
|
||||
*
|
||||
+
|
||||
,
|
||||
-
|
||||
.
|
||||
/
|
||||
:
|
||||
;
|
||||
<
|
||||
=
|
||||
>
|
||||
?
|
||||
@
|
||||
[
|
||||
\
|
||||
]
|
||||
^
|
||||
_
|
||||
`
|
||||
{
|
||||
|
|
||||
}
|
||||
~
|
||||
⦅
|
||||
⦆
|
||||
。
|
||||
「
|
||||
」
|
||||
、
|
||||
・
|
||||
#cn-punctuation
|
||||
、
|
||||
。
|
||||
〃
|
||||
〄
|
||||
々
|
||||
〆
|
||||
〇
|
||||
〈
|
||||
〉
|
||||
《
|
||||
》
|
||||
「
|
||||
」
|
||||
『
|
||||
』
|
||||
【
|
||||
】
|
||||
〒
|
||||
〓
|
||||
〔
|
||||
〕
|
||||
〖
|
||||
〗
|
||||
〘
|
||||
〙
|
||||
〚
|
||||
〛
|
||||
〜
|
||||
〝
|
||||
〞
|
||||
〟
|
||||
#中文
|
||||
的
|
||||
啊
|
||||
呀
|
||||
吗
|
||||
不
|
||||
我
|
||||
们
|
||||
起
|
||||
就
|
||||
最
|
||||
在
|
||||
人
|
||||
有
|
||||
是
|
||||
为
|
||||
以
|
||||
于
|
||||
上
|
||||
他
|
||||
而
|
||||
后
|
||||
之
|
||||
来
|
||||
由
|
||||
及
|
||||
了
|
||||
下
|
||||
可
|
||||
到
|
||||
这
|
||||
与
|
||||
也
|
||||
因
|
||||
此
|
||||
但
|
||||
并
|
||||
个
|
||||
其
|
||||
已
|
||||
无
|
||||
小
|
||||
今
|
||||
去
|
||||
再
|
||||
好
|
||||
只
|
||||
又
|
||||
或
|
||||
很
|
||||
亦
|
||||
某
|
||||
把
|
||||
那
|
||||
你
|
||||
乃
|
||||
它
|
||||
吧
|
||||
被
|
||||
比
|
||||
别
|
||||
趁
|
||||
当
|
||||
从
|
||||
到
|
||||
得
|
||||
打
|
||||
凡
|
||||
儿
|
||||
尔
|
||||
该
|
||||
各
|
||||
给
|
||||
跟
|
||||
和
|
||||
何
|
||||
还
|
||||
即
|
||||
几
|
||||
既
|
||||
看
|
||||
据
|
||||
距
|
||||
靠
|
||||
啦
|
||||
了
|
||||
另
|
||||
么
|
||||
每
|
||||
们
|
||||
嘛
|
||||
拿
|
||||
哪
|
||||
那
|
||||
您
|
||||
凭
|
||||
且
|
||||
却
|
||||
让
|
||||
仍
|
||||
啥
|
||||
如
|
||||
若
|
||||
使
|
||||
谁
|
||||
虽
|
||||
随
|
||||
同
|
||||
所
|
||||
她
|
||||
哇
|
||||
嗡
|
||||
往
|
||||
哪
|
||||
些
|
||||
向
|
||||
沿
|
||||
哟
|
||||
用
|
||||
于
|
||||
咱
|
||||
则
|
||||
怎
|
||||
曾
|
||||
至
|
||||
致
|
||||
着
|
||||
诸
|
||||
自
|
||||
会
|
||||
#英文
|
||||
to
|
||||
can
|
||||
could
|
||||
dare
|
||||
do
|
||||
did
|
||||
does
|
||||
may
|
||||
might
|
||||
would
|
||||
should
|
||||
must
|
||||
will
|
||||
ought
|
||||
shall
|
||||
need
|
||||
is
|
||||
a
|
||||
am
|
||||
are
|
||||
about
|
||||
according
|
||||
after
|
||||
against
|
||||
all
|
||||
almost
|
||||
also
|
||||
although
|
||||
among
|
||||
an
|
||||
and
|
||||
another
|
||||
any
|
||||
anything
|
||||
approximately
|
||||
as
|
||||
asked
|
||||
at
|
||||
back
|
||||
because
|
||||
before
|
||||
besides
|
||||
between
|
||||
both
|
||||
but
|
||||
by
|
||||
call
|
||||
called
|
||||
currently
|
||||
despite
|
||||
did
|
||||
do
|
||||
dr
|
||||
during
|
||||
each
|
||||
earlier
|
||||
eight
|
||||
even
|
||||
eventually
|
||||
every
|
||||
everything
|
||||
five
|
||||
for
|
||||
four
|
||||
from
|
||||
he
|
||||
her
|
||||
here
|
||||
his
|
||||
how
|
||||
however
|
||||
i
|
||||
if
|
||||
in
|
||||
indeed
|
||||
instead
|
||||
it
|
||||
its
|
||||
just
|
||||
last
|
||||
like
|
||||
major
|
||||
many
|
||||
may
|
||||
maybe
|
||||
meanwhile
|
||||
more
|
||||
moreover
|
||||
most
|
||||
mr
|
||||
mrs
|
||||
ms
|
||||
much
|
||||
my
|
||||
neither
|
||||
net
|
||||
never
|
||||
nevertheless
|
||||
nine
|
||||
no
|
||||
none
|
||||
not
|
||||
nothing
|
||||
now
|
||||
of
|
||||
on
|
||||
once
|
||||
one
|
||||
only
|
||||
or
|
||||
other
|
||||
our
|
||||
over
|
||||
partly
|
||||
perhaps
|
||||
prior
|
||||
regarding
|
||||
separately
|
||||
seven
|
||||
several
|
||||
she
|
||||
should
|
||||
similarly
|
||||
since
|
||||
six
|
||||
so
|
||||
some
|
||||
somehow
|
||||
still
|
||||
such
|
||||
ten
|
||||
that
|
||||
the
|
||||
their
|
||||
then
|
||||
there
|
||||
therefore
|
||||
these
|
||||
they
|
||||
this
|
||||
those
|
||||
though
|
||||
three
|
||||
to
|
||||
two
|
||||
under
|
||||
unless
|
||||
unlike
|
||||
until
|
||||
volume
|
||||
we
|
||||
what
|
||||
whatever
|
||||
whats
|
||||
when
|
||||
where
|
||||
which
|
||||
while
|
||||
why
|
||||
with
|
||||
without
|
||||
yesterday
|
||||
yet
|
||||
you
|
||||
your
|
||||
aboard
|
||||
about
|
||||
above
|
||||
according to
|
||||
across
|
||||
afore
|
||||
after
|
||||
against
|
||||
agin
|
||||
along
|
||||
alongside
|
||||
amid
|
||||
amidst
|
||||
among
|
||||
amongst
|
||||
anent
|
||||
around
|
||||
as
|
||||
aslant
|
||||
astride
|
||||
at
|
||||
athwart
|
||||
bar
|
||||
because of
|
||||
before
|
||||
behind
|
||||
below
|
||||
beneath
|
||||
beside
|
||||
besides
|
||||
between
|
||||
betwixt
|
||||
beyond
|
||||
but
|
||||
by
|
||||
circa
|
||||
despite
|
||||
down
|
||||
during
|
||||
due to
|
||||
ere
|
||||
except
|
||||
for
|
||||
from
|
||||
in
|
||||
inside
|
||||
into
|
||||
less
|
||||
like
|
||||
mid
|
||||
midst
|
||||
minus
|
||||
near
|
||||
next
|
||||
nigh
|
||||
nigher
|
||||
nighest
|
||||
notwithstanding
|
||||
of
|
||||
off
|
||||
on
|
||||
onto
|
||||
out
|
||||
out of
|
||||
outside
|
||||
over
|
||||
past
|
||||
pending
|
||||
per
|
||||
plus
|
||||
qua
|
||||
re
|
||||
round
|
||||
sans
|
||||
save
|
||||
since
|
||||
through
|
||||
throughout
|
||||
thru
|
||||
till
|
||||
to
|
||||
toward
|
||||
towards
|
||||
under
|
||||
underneath
|
||||
unlike
|
||||
until
|
||||
unto
|
||||
up
|
||||
upon
|
||||
versus
|
||||
via
|
||||
vice
|
||||
with
|
||||
within
|
||||
without
|
||||
he
|
||||
her
|
||||
herself
|
||||
hers
|
||||
him
|
||||
himself
|
||||
his
|
||||
I
|
||||
it
|
||||
its
|
||||
itself
|
||||
me
|
||||
mine
|
||||
my
|
||||
myself
|
||||
ours
|
||||
she
|
||||
their
|
||||
theirs
|
||||
them
|
||||
themselves
|
||||
they
|
||||
us
|
||||
we
|
||||
our
|
||||
ourselves
|
||||
you
|
||||
your
|
||||
yours
|
||||
yourselves
|
||||
yourself
|
||||
this
|
||||
that
|
||||
these
|
||||
those
|
||||
a
|
||||
about
|
||||
above
|
||||
across
|
||||
after
|
||||
afterwards
|
||||
again
|
||||
against
|
||||
all
|
||||
almost
|
||||
alone
|
||||
along
|
||||
already
|
||||
also
|
||||
although
|
||||
always
|
||||
am
|
||||
among
|
||||
amongst
|
||||
amoungst
|
||||
amount
|
||||
an
|
||||
and
|
||||
another
|
||||
any
|
||||
anyhow
|
||||
anyone
|
||||
anything
|
||||
anyway
|
||||
anywhere
|
||||
are
|
||||
around
|
||||
as
|
||||
at
|
||||
back
|
||||
be
|
||||
became
|
||||
because
|
||||
become
|
||||
becomes
|
||||
becoming
|
||||
been
|
||||
before
|
||||
beforehand
|
||||
behind
|
||||
being
|
||||
below
|
||||
beside
|
||||
besides
|
||||
between
|
||||
beyond
|
||||
bill
|
||||
both
|
||||
bottom
|
||||
but
|
||||
by
|
||||
call
|
||||
can
|
||||
cannot
|
||||
cant
|
||||
co
|
||||
computer
|
||||
con
|
||||
could
|
||||
couldnt
|
||||
cry
|
||||
de
|
||||
describe
|
||||
detail
|
||||
do
|
||||
done
|
||||
down
|
||||
due
|
||||
during
|
||||
each
|
||||
eg
|
||||
eight
|
||||
either
|
||||
eleven
|
||||
else
|
||||
elsewhere
|
||||
empty
|
||||
enough
|
||||
etc
|
||||
even
|
||||
ever
|
||||
every
|
||||
everyone
|
||||
everything
|
||||
everywhere
|
||||
except
|
||||
few
|
||||
fifteen
|
||||
fify
|
||||
fill
|
||||
find
|
||||
fire
|
||||
first
|
||||
five
|
||||
for
|
||||
former
|
||||
formerly
|
||||
forty
|
||||
found
|
||||
four
|
||||
from
|
||||
front
|
||||
full
|
||||
further
|
||||
get
|
||||
give
|
||||
go
|
||||
had
|
||||
has
|
||||
hasnt
|
||||
have
|
||||
he
|
||||
hence
|
||||
her
|
||||
here
|
||||
hereafter
|
||||
hereby
|
||||
herein
|
||||
hereupon
|
||||
hers
|
||||
herself
|
||||
him
|
||||
himself
|
||||
his
|
||||
how
|
||||
however
|
||||
hundred
|
||||
i
|
||||
ie
|
||||
if
|
||||
in
|
||||
inc
|
||||
indeed
|
||||
interest
|
||||
into
|
||||
is
|
||||
it
|
||||
its
|
||||
itself
|
||||
keep
|
||||
last
|
||||
latter
|
||||
latterly
|
||||
least
|
||||
less
|
||||
ltd
|
||||
made
|
||||
many
|
||||
may
|
||||
me
|
||||
meanwhile
|
||||
might
|
||||
mill
|
||||
mine
|
||||
more
|
||||
moreover
|
||||
most
|
||||
mostly
|
||||
move
|
||||
much
|
||||
must
|
||||
my
|
||||
myself
|
||||
name
|
||||
namely
|
||||
neither
|
||||
never
|
||||
nevertheless
|
||||
next
|
||||
nine
|
||||
no
|
||||
nobody
|
||||
none
|
||||
noone
|
||||
nor
|
||||
not
|
||||
nothing
|
||||
now
|
||||
nowhere
|
||||
of
|
||||
off
|
||||
often
|
||||
on
|
||||
once
|
||||
one
|
||||
only
|
||||
onto
|
||||
or
|
||||
other
|
||||
others
|
||||
otherwise
|
||||
our
|
||||
ours
|
||||
ourselves
|
||||
out
|
||||
over
|
||||
own
|
||||
part
|
||||
per
|
||||
perhaps
|
||||
please
|
||||
put
|
||||
rather
|
||||
re
|
||||
same
|
||||
see
|
||||
seem
|
||||
seemed
|
||||
seeming
|
||||
seems
|
||||
serious
|
||||
several
|
||||
she
|
||||
should
|
||||
show
|
||||
side
|
||||
since
|
||||
sincere
|
||||
six
|
||||
sixty
|
||||
so
|
||||
some
|
||||
somehow
|
||||
someone
|
||||
something
|
||||
sometime
|
||||
sometimes
|
||||
somewhere
|
||||
still
|
||||
such
|
||||
take
|
||||
ten
|
||||
than
|
||||
that
|
||||
the
|
||||
their
|
||||
them
|
||||
themselves
|
||||
then
|
||||
thence
|
||||
there
|
||||
thereafter
|
||||
thereby
|
||||
therefore
|
||||
therein
|
||||
thereupon
|
||||
these
|
||||
they
|
||||
thick
|
||||
thin
|
||||
third
|
||||
this
|
||||
those
|
||||
though
|
||||
three
|
||||
through
|
||||
throughout
|
||||
thru
|
||||
thus
|
||||
to
|
||||
together
|
||||
too
|
||||
top
|
||||
toward
|
||||
towards
|
||||
twelve
|
||||
twenty
|
||||
two
|
||||
un
|
||||
under
|
||||
until
|
||||
up
|
||||
upon
|
||||
us
|
||||
very
|
||||
via
|
||||
was
|
||||
we
|
||||
well
|
||||
were
|
||||
what
|
||||
whatever
|
||||
when
|
||||
whence
|
||||
whenever
|
||||
where
|
||||
whereafter
|
||||
whereas
|
||||
whereby
|
||||
wherein
|
||||
whereupon
|
||||
wherever
|
||||
whether
|
||||
which
|
||||
while
|
||||
whither
|
||||
who
|
||||
whoever
|
||||
whole
|
||||
whom
|
||||
whose
|
||||
why
|
||||
will
|
||||
with
|
||||
within
|
||||
without
|
||||
would
|
||||
yet
|
||||
you
|
||||
your
|
||||
yours
|
||||
yourself
|
||||
yourselves
|
39
dict/lex-units.lex
Normal file
39
dict/lex-units.lex
Normal file
@ -0,0 +1,39 @@
|
||||
#single chinese unit
|
||||
#长度
|
||||
米
|
||||
寸
|
||||
尺
|
||||
丈
|
||||
里
|
||||
#时间
|
||||
年
|
||||
月
|
||||
日
|
||||
时
|
||||
#分
|
||||
秒
|
||||
#币
|
||||
元
|
||||
角
|
||||
#容量
|
||||
升
|
||||
斗
|
||||
石
|
||||
瓶
|
||||
袋
|
||||
盒
|
||||
#重量
|
||||
吨
|
||||
克
|
||||
斤
|
||||
两
|
||||
担
|
||||
#地积
|
||||
亩
|
||||
顷
|
||||
#其他
|
||||
折
|
||||
件
|
||||
#其他
|
||||
℃
|
||||
℉
|
47
friso.ini
Normal file
47
friso.ini
Normal file
@ -0,0 +1,47 @@
|
||||
#friso configuration file.
|
||||
# do not change the name of the left key.
|
||||
# @email chenxin619315@gmail.com
|
||||
# @date 2012-12-20
|
||||
#
|
||||
|
||||
#lexicon directory absolute path.
|
||||
# the value must end with '/'
|
||||
#this will tell friso how to find friso.lex.ini configuration file and all the lexicon files.
|
||||
friso.lex_dir = /c/products/friso/dict/
|
||||
|
||||
#the maximum matching length.
|
||||
friso.max_len = 5
|
||||
|
||||
#1 to enable chinese name recognition.
|
||||
# and 0 to disable it.
|
||||
friso.r_name = 1
|
||||
|
||||
#the maximum length for the cjk words in a
|
||||
# chinese and english mixed word.
|
||||
friso.mix_len = 2
|
||||
|
||||
#the maximum length for the chinese last name adornment (e.g. the 老 in 老陈).
|
||||
friso.lna_len = 1
|
||||
|
||||
#append the synonyms words
|
||||
friso.add_syn = 1
|
||||
|
||||
#clear the stopwords or not (1 to open it and 0 to close it)
|
||||
#@date 2013-06-13
|
||||
friso.clr_stw = 0
|
||||
|
||||
#keep the unrecognized words or not (1 to open it and 0 to close it)
|
||||
#@date 2013-06-13
|
||||
friso.keep_urec = 1
|
||||
|
||||
#use sphinx output style like 'admire|love|enjoy einsten'
|
||||
#@date 2013-10-25
|
||||
friso.spx_out = 1
|
||||
|
||||
#the threshold value for a char not a part of a chinese name.
|
||||
friso.nthreshold = 2000000
|
||||
|
||||
#default mode for friso.
|
||||
# 1 : simple mode - simple maximum matching algorithm.
|
||||
# 2 : complex mode - four rules of mmseg algorithm.
|
||||
friso.mode = 2
|
BIN
images/friso_linux.tst.jpg
Normal file
BIN
images/friso_linux.tst.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 49 KiB |
BIN
images/friso_win.tst.jpg
Normal file
BIN
images/friso_win.tst.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 68 KiB |
88
src/Makefile
Normal file
88
src/Makefile
Normal file
@ -0,0 +1,88 @@
|
||||
#############################################################
|
||||
# friso chinese word segmentation makefile. #
|
||||
# do not use it for commercial use. #
|
||||
# @author chenxin #
|
||||
# @email chenxin619315@gmail.com #
|
||||
#############################################################
|
||||
|
||||
#compiler
|
||||
CC = gcc
|
||||
#include directory
|
||||
INCLUDE = .
|
||||
#compile flags for development
|
||||
CFLAGS = -g -Wall
|
||||
#compile flags for production
|
||||
FFLAGS = -O2 -Wall -fPIC
|
||||
#extension libs for friso
|
||||
ELIB = m
|
||||
LIB_FILE = libfriso.so
|
||||
#STA_FILE = libfriso.a
|
||||
LIBRARY_DIR = /usr/lib
|
||||
INSTALL_DIR = /usr/local/bin
|
||||
|
||||
|
||||
SOURCE = friso.o friso_array.o friso_hash.o friso_lexicon.o friso_link.o friso_string.o friso_split.o
|
||||
|
||||
all: share friso
|
||||
|
||||
#static: $(SOURCE)
|
||||
# ar -cr $(STA_FILE) $(SOURCE)
|
||||
|
||||
share: $(SOURCE)
|
||||
$(CC) $(FFLAGS) $(SOURCE) -fPIC -shared -l$(ELIB) -o $(LIB_FILE)
|
||||
|
||||
friso: tst-friso.o
|
||||
$(CC) tst-friso.o -o ./friso -L. -lfriso
|
||||
|
||||
tst-friso.o: friso_API.h friso.h tst-friso.c
|
||||
$(CC) $(FFLAGS) -c tst-friso.c
|
||||
|
||||
friso.o: friso.c friso.h friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso.c -l$(ELIB)
|
||||
|
||||
friso_array.o: friso_array.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_array.c
|
||||
|
||||
friso_hash.o: friso_hash.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_hash.c
|
||||
|
||||
friso_lexicon.o: friso_hash.c friso_lexicon.c friso_API.h friso.h
|
||||
$(CC) $(FFLAGS) -c friso_lexicon.c
|
||||
|
||||
friso_link.o: friso_link.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_link.c
|
||||
|
||||
friso_string.o: friso_string.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_string.c
|
||||
|
||||
friso_split.o: friso_split.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_split.c
|
||||
|
||||
#clean all the object files.
|
||||
.PHONY: clean
|
||||
clean:
|
||||
find . -name \*.so | xargs rm -f
|
||||
find . -name \*.o | xargs rm -f
|
||||
@if [ -f friso ];\
|
||||
then\
|
||||
rm -f friso;\
|
||||
fi
|
||||
|
||||
#install friso
|
||||
install: friso
|
||||
@if [ -d $(INSTALL_DIR) ] && [ -d $(LIBRARY_DIR) ];\
|
||||
then\
|
||||
cp friso $(INSTALL_DIR);\
|
||||
chmod a+x $(INSTALL_DIR)/friso;\
|
||||
chmod og-w $(INSTALL_DIR)/friso;\
|
||||
cp $(LIB_FILE) $(LIBRARY_DIR);\
|
||||
chmod a+x $(LIBRARY_DIR)/$(LIB_FILE);\
|
||||
chmod og-w $(LIBRARY_DIR)/$(LIB_FILE);\
|
||||
#cp $(STA_FILE) $(LIBRARY_DIR);\
|
||||
#chmod a+x $(LIBRARY_DIR)/$(STA_FILE);\
|
||||
#chmod og-w $(LIBRARY_DIR)/$(STA_FILE);\
|
||||
echo "install friso successfully.";\
|
||||
echo "Usage: friso -init friso configuration file path.";\
|
||||
else\
|
||||
echo "Sorry, $(INSTALL_DIR) or $(LIBRARY_DIR) does not exits.";\
|
||||
fi
|
62
src/Makefile.cygwin
Normal file
62
src/Makefile.cygwin
Normal file
@ -0,0 +1,62 @@
|
||||
#############################################################
|
||||
# friso chinese word segmentation makefile. #
|
||||
# do not use it for commercial use. #
|
||||
# @author chenxin #
|
||||
# @email chenxin619315@gmail.com #
|
||||
#############################################################
|
||||
|
||||
#compiler
|
||||
CC = gcc
|
||||
#include directory
|
||||
INCLUDE = .
|
||||
#compile flags for development
|
||||
CFLAGS = -g -Wall
|
||||
#compile flags for production
|
||||
FFLAGS = -O2 -Wall
|
||||
#extension libs for friso
|
||||
ELIB = m
|
||||
SH_FILE = friso.dll
|
||||
LB_FILE = friso.a
|
||||
|
||||
|
||||
OBJS = friso.o friso_array.o friso_hash.o friso_lexicon.o friso_link.o friso_string.o friso_split.o
|
||||
SRCS = friso.c friso_array.c friso_hash.c friso_lexicon.c friso_link.c friso_string.c friso_split.c
|
||||
|
||||
all: share friso
|
||||
|
||||
#$(CC) $(FFLAGS) -shared $(SOURCE) -l$(ELIB) --mno-cygwin -o $(SH_FILE)
|
||||
share: $(OBJS)
|
||||
$(CC) $(FFLAGS) -shared $(SRCS) -mno-cygwin -o $(SH_FILE) -Wl,--output-def,friso.def,--out-implib,$(LB_FILE)
|
||||
|
||||
#--mno-cygwin
|
||||
friso: tst-friso.o
|
||||
$(CC) tst-friso.o -o ./friso -L. -lfriso
|
||||
|
||||
tst-friso.o: friso_API.h friso.h tst-friso.c
|
||||
$(CC) $(FFLAGS) -c tst-friso.c
|
||||
|
||||
friso.o: friso.c friso.h friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso.c -l$(ELIB)
|
||||
|
||||
friso_array.o: friso_array.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_array.c
|
||||
|
||||
friso_hash.o: friso_hash.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_hash.c
|
||||
|
||||
friso_lexicon.o: friso_hash.o friso_lexicon.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_lexicon.c
|
||||
|
||||
friso_link.o: friso_link.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_link.c
|
||||
|
||||
friso_string.o: friso_string.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_string.c
|
||||
|
||||
friso_split.o: friso_split.c friso_API.h
|
||||
$(CC) $(FFLAGS) -c friso_split.c
|
||||
|
||||
#clean all the object files.
|
||||
.PHONY: clean
|
||||
clean:
|
||||
-rm friso $(SH_FILE) $(LB_FILE) $(OBJS) tst-friso.o
|
1286
src/friso.c
Normal file
1286
src/friso.c
Normal file
File diff suppressed because it is too large
Load Diff
295
src/friso.h
Normal file
295
src/friso.h
Normal file
@ -0,0 +1,295 @@
|
||||
/*
|
||||
* main interface file for friso - free soul.
|
||||
* you could modify it and re-release it but never for commercial use.
|
||||
*
|
||||
* @author chenxin
|
||||
* @email chenxin619315@gmail.com
|
||||
*/
|
||||
#ifndef _friso_h
|
||||
#define _friso_h
|
||||
|
||||
#include "friso_API.h"
|
||||
#include <stdio.h>
|
||||
|
||||
/* {{{ friso main interface define :: start*/
|
||||
#define DEFAULT_SEGMENT_LENGTH 5 //presumably the fallback for friso.max_len (friso.ini also uses 5) - confirm.
|
||||
#define DEFAULT_MIX_LENGTH 2 //presumably the fallback for friso.mix_len - confirm.
|
||||
#define DEFAULT_LNA_LENGTH 1 //presumably the fallback for friso.lna_len - confirm.
|
||||
#define DEFAULT_NTHRESHOLD 1000000 //NOTE(review): the shipped friso.ini sets friso.nthreshold = 2000000, not this value.
|
||||
#define DEFAULT_SEGMENT_MODE 2 //2 is the value of __FRISO_COMPLEX_MODE__.
|
||||
|
||||
/*
|
||||
* Type: friso_lex_t
|
||||
* -----------
|
||||
* This type is used to represent the type (category) of a lexicon.
|
||||
*/
|
||||
typedef enum {
|
||||
__LEX_CJK_WORDS__ = 0,
|
||||
__LEX_CJK_UNITS__ = 1,
|
||||
__LEX_ECM_WORDS__ = 2, //english and chinese mixed words.
|
||||
__LEX_CEM_WORDS__ = 3, //chinese and english mixed words.
|
||||
__LEX_CN_LNAME__ = 4, //presumably chinese last names (cf. dict/lex-lname.lex) - confirm.
|
||||
__LEX_CN_SNAME__ = 5, //presumably chinese single-char given names (cf. dict/lex-sname.lex) - confirm.
|
||||
__LEX_CN_DNAME1__ = 6,
|
||||
__LEX_CN_DNAME2__ = 7,
|
||||
__LEX_CN_LNA__ = 8, //presumably chinese last name adornments (cf. dict/lex-lna.lex) - confirm.
|
||||
__LEX_STOPWORDS__ = 9,
|
||||
__LEX_ENPUN_WORDS__ = 10,
|
||||
__LEX_EN_WORDS__ = 11,
|
||||
__LEX_OTHER_WORDS__ = 15, //NOTE(review): values 12-14 are unused; 15 and 16 are >= __FRISO_LEXICON_LENGTH__ (12), presumably not dictionary-backed - confirm.
|
||||
__LEX_NCSYN_WORDS__ = 16
|
||||
} friso_lex_t;
|
||||
|
||||
typedef friso_hash_t * friso_dic_t; //a dictionary handle is a pointer to friso_hash_t (used as a hash table array).
|
||||
#define __FRISO_LEXICON_LENGTH__ 12 //presumably the number of lexicon tables (enum values 0..11) - confirm.
|
||||
|
||||
/*
|
||||
* Type: lex_entry_cdt
|
||||
* -------------------
|
||||
* This type is used to represent a single lexicon entry.
|
||||
*/
|
||||
typedef struct {
|
||||
/*
|
||||
* type: the type of the lexicon item.
|
||||
* available values are the elements of the friso_lex_t enum.
|
||||
* and if it is __LEX_OTHER_WORDS__, we need to free it after using it.
|
||||
* NOTE(review): an older remark said char is used as its data type, but the field is declared uint_t.
|
||||
*/
|
||||
uint_t length; //presumably the length of word - confirm the unit (bytes vs. characters).
|
||||
uint_t type; //see the comment above.
|
||||
fstring word; //presumably the text of the lexicon word - confirm.
|
||||
friso_array_t syn; //presumably the synonyms of the word (cf. friso.add_syn) - confirm.
|
||||
uint_t fre; //presumably the word frequency - confirm.
|
||||
} lex_entry_cdt;
|
||||
typedef lex_entry_cdt * lex_entry_t;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Type: friso_mode_t
|
||||
* ------------------
|
||||
* used to identify the segmentation mode that friso uses.
|
||||
*/
|
||||
typedef enum {
|
||||
__FRISO_SIMPLE_MODE__ = 1, //friso.ini describes mode 1 as simple maximum matching.
|
||||
__FRISO_COMPLEX_MODE__ = 2 //friso.ini describes mode 2 as the four rules of the mmseg algorithm.
|
||||
} friso_mode_t;
|
||||
|
||||
/*
|
||||
* Type: friso_entry
|
||||
* -----------------
|
||||
* This type is used to hold the configuration of friso; friso_t is a pointer to it.
|
||||
*/
|
||||
typedef struct {
|
||||
ushort_t max_len; //the max match length (4 - 7).
|
||||
ushort_t r_name; //1 to open chinese name recognition, 0 to close it.
|
||||
ushort_t mix_len; //the max length for the CJK words in a mixed string.
|
||||
ushort_t lna_len; //the max length for the chinese last name adornment.
|
||||
ushort_t add_syn; //append the synonyms words.
|
||||
ushort_t clr_stw; //clear the stopwords.
|
||||
ushort_t keep_urec; //keep the unrecognized words.
|
||||
ushort_t spx_out; //use the sphinx output style.
|
||||
uint_t nthreshold; //the threshold value for a char to make up a chinese name.
|
||||
friso_mode_t mode; //Complex mode or simple mode
|
||||
friso_dic_t dic; //friso dictionary
|
||||
} friso_entry;
|
||||
|
||||
typedef friso_entry * friso_t;
|
||||
|
||||
|
||||
#define __HITS_WORD_LENGTH__ 128 //size in chars of the fixed word buffer in friso_hits_entry.
|
||||
/*the segmentation term (one token result); friso_hits_t is a pointer to it.*/
|
||||
typedef struct {
|
||||
int offset; //presumably the offset of the token in the source text - confirm.
|
||||
char word[__HITS_WORD_LENGTH__]; //fixed-size buffer for the token text.
|
||||
} friso_hits_entry;
|
||||
|
||||
typedef friso_hits_entry * friso_hits_t;
|
||||
|
||||
/*
|
||||
* Type: friso_task_entry
|
||||
* This type is used to represent the current segmentation state,
|
||||
* like the text to split, and the current index. friso_task_t is a pointer to it.
|
||||
*/
|
||||
typedef struct {
|
||||
fstring text; //text to tokenize
|
||||
uint_t idx; //start offset index.
|
||||
uint_t length; //length of the text.
|
||||
uint_t bytes; //latest word bytes in C.
|
||||
uint_t unicode; //latest word unicode number.
|
||||
//uint_t ce_check; //check the CN and EN mixed word if it is 1.
|
||||
friso_link_t pool; //task pool.
|
||||
friso_hits_t hits; //token result hits.
|
||||
char buffer[7]; //word buffer. (1-6 bytes for an utf-8 word in C, plus one more byte - presumably the terminator).
|
||||
} friso_task_entry;
|
||||
|
||||
typedef friso_task_entry * friso_task_t;
|
||||
|
||||
|
||||
/*
|
||||
* Function: friso_new;
|
||||
* Usage: vars = friso_new( void );
|
||||
* --------------------------------
|
||||
* This function is used to create a new empty friso_t
|
||||
* with default value.
|
||||
*/
|
||||
FRISO_API friso_t friso_new( void );
|
||||
|
||||
//create a friso entry with default values from a configuration file.
|
||||
FRISO_API friso_t friso_new_from_ifile( fstring );
|
||||
|
||||
/*
|
||||
* Function: friso_free;
|
||||
* Usage: friso_free( vars );
|
||||
* --------------------------
|
||||
* This function is used to free the allocation of the given vars.
|
||||
*/
|
||||
FRISO_API void friso_free( friso_t );
|
||||
|
||||
/*
 * Function: friso_set_dic
 * Usage: friso_set_dic( friso, dic );
 * ----------------------------------------
 * This macro is used to set the dictionary for friso:
 * it assigns the second argument to the `dic` member of the object
 * the first argument points to. friso_dic_t is the pointer of a
 * hash table array.
 *
 * Conceptual signature:
 *     void friso_set_dic( friso_t, friso_dic_t );
 *
 * The macro parameters are deliberately NOT named `dic`: a parameter
 * with the same name as the member would also replace the member name
 * during expansion (friso_set_dic(f, d) would become f->d = d).
 * Both arguments are parenthesized so any expression can be passed.
 */
#define friso_set_dic(friso_ptr, new_dic) \
    ((friso_ptr)->dic = (new_dic))
|
||||
|
||||
/*
|
||||
* Function: friso_set_mode
|
||||
* Usage: friso_set_mode( vars, mode );
|
||||
* ------------------------------------
|
||||
* This function is used to set the mode(complex or simple) that you want to friso to use.
|
||||
*/
|
||||
//FRISO_API void friso_set_mode( friso_t, friso_mode_t );
|
||||
/*
 * Store segmentation mode m in the friso instance f.
 * The parameter is not called `mode` on purpose: a parameter with the
 * member's name would be substituted into `->mode` as well, breaking
 * every call whose argument is not literally spelled `mode`.
 */
#define friso_set_mode( f, m ) \
    ((f)->mode = (m))
|
||||
|
||||
/*
|
||||
* Function: friso_new_task;
|
||||
* Usage: segment = friso_new_task( void );
|
||||
* ----------------------------------------
|
||||
* This function is used to create a new friso segment type;
|
||||
*/
|
||||
FRISO_API friso_task_t friso_new_task( void );
|
||||
|
||||
/*
|
||||
* Function: friso_free_task;
|
||||
* Usage: friso_free_task( task );
|
||||
* -------------------------------
|
||||
* This function is used to free the allocation made by friso_new_task();
|
||||
*/
|
||||
FRISO_API void friso_free_task( friso_task_t );
|
||||
|
||||
//create a new friso hits
|
||||
FRISO_API friso_hits_t friso_new_hits( void );
|
||||
|
||||
//free the given friso hits
|
||||
//FRISO_API void friso_free_hits( friso_hits_t );
|
||||
#define friso_free_hits(hits) FRISO_FREE(hits)
|
||||
|
||||
/*
|
||||
* Function: friso_set_text
|
||||
* Usage: friso_set_text( task, text );
|
||||
* ------------------------------------
|
||||
* This function is used to set the text that is going to segment.
|
||||
*/
|
||||
FRISO_API void friso_set_text( friso_task_t, fstring );
|
||||
|
||||
/*
|
||||
* Function: friso_next
|
||||
* Usage: hits = friso_next( friso, mode, task );
|
||||
* --------------------------------------
|
||||
* This function is used to get next word that friso segmented.
|
||||
*/
|
||||
FRISO_API friso_hits_t friso_next( friso_t, friso_mode_t, friso_task_t );
|
||||
/* }}} friso main interface define :: end*/
|
||||
|
||||
/* {{{ lexicon interface define :: start*/
|
||||
|
||||
/*
|
||||
* Function: friso_dic_new
|
||||
* Usage: dic = friso_new_dic();
|
||||
* -----------------------------
|
||||
* This function used to create a new dictionary.(memory allocation).
|
||||
*/
|
||||
FRISO_API friso_dic_t friso_dic_new( void );
|
||||
|
||||
FRISO_API fstring file_get_line( fstring, FILE * );
|
||||
|
||||
/*
|
||||
* Function: friso_dic_free
|
||||
* Usage: friso_dic_free( dic );
|
||||
* ------------------------------
|
||||
* This function is used to free all the allocation of friso_dic_new.
|
||||
*/
|
||||
FRISO_API void friso_dic_free( friso_dic_t );
|
||||
|
||||
//create a new lexicon entry.
|
||||
FRISO_API lex_entry_t new_lex_entry( fstring, friso_array_t, uint_t, uint_t, uint_t );
|
||||
|
||||
//free the given lexicon entry.
|
||||
//free all the allocations that its synonyms word's items pointed to
|
||||
//when the second arguments is 1
|
||||
FRISO_API void free_lex_entry( lex_entry_t );
|
||||
|
||||
/*
|
||||
* Function: friso_dic_load
|
||||
* Usage: friso_dic_load( friso, friso_lex_t, path, length );
|
||||
* --------------------------------------------------
|
||||
* This function is used to load dictionary from a given path.
|
||||
* no length limit when length less than 0.
|
||||
*/
|
||||
FRISO_API void friso_dic_load( friso_t, friso_lex_t, fstring, uint_t );
|
||||
|
||||
/*
|
||||
* load the lexicon configuration file.
|
||||
* and load all the valid lexicon from the conf file.
|
||||
*/
|
||||
FRISO_API void friso_dic_load_from_ifile( friso_t, fstring, uint_t );
|
||||
|
||||
/*
|
||||
* Function: friso_dic_add
|
||||
* Usage: friso_dic_add( dic, friso_lex_t, word, syn );
|
||||
* ----------------------------------------------
|
||||
* This function used to put new word into the dictionary.
|
||||
*/
|
||||
FRISO_API void friso_dic_add( friso_dic_t, friso_lex_t, fstring, friso_array_t );
|
||||
|
||||
/*
|
||||
* Function: friso_dic_add_with_fre
|
||||
* Usage: friso_dic_add_with_fre( dic, friso_lex_t, word, value, syn, fre );
|
||||
* -------------------------------------------------------------------
|
||||
* This function is used to put a new word with its frequency into the dictionary.
|
||||
*/
|
||||
FRISO_API void friso_dic_add_with_fre( friso_dic_t, friso_lex_t, fstring, friso_array_t, uint_t );
|
||||
|
||||
/*
|
||||
* Function: friso_dic_match
|
||||
* Usage: result = friso_dic_match( dic, friso_lex_t, word );
|
||||
* ----------------------------------------------------
|
||||
* This function is used to check the given word is in the dictionary or not.
|
||||
*/
|
||||
FRISO_API int friso_dic_match( friso_dic_t, friso_lex_t, fstring );
|
||||
|
||||
/*
|
||||
* Function: friso_dic_get
|
||||
* Usage: friso_dic_get( dic, friso_lex_t, word );
|
||||
* -----------------------------------------
|
||||
* This function is used to search the specified lex_entry_t.
|
||||
*/
|
||||
FRISO_API lex_entry_t friso_dic_get( friso_dic_t, friso_lex_t, fstring );
|
||||
|
||||
/*
|
||||
* Function: friso_spec_dic_size
|
||||
* Usage: friso_spec_dic_size( dic, friso_lex_t )
|
||||
* This function is used to get the size of the dictionary with a specified type.
|
||||
*/
|
||||
FRISO_API uint_t friso_spec_dic_size( friso_dic_t, friso_lex_t );
|
||||
FRISO_API uint_t friso_all_dic_size( friso_dic_t );
|
||||
/* }}} lexicon interface define :: end*/
|
||||
|
||||
|
||||
#endif /*end ifndef*/
|
452
src/friso_API.h
Normal file
452
src/friso_API.h
Normal file
@ -0,0 +1,452 @@
|
||||
/*
|
||||
* main interface file for friso - free soul.
|
||||
* you could modify it and re-release it but never for commercial use.
|
||||
*
|
||||
* @author chenxin
|
||||
* @email chenxin619315@gmail.com
|
||||
*/
|
||||
|
||||
#ifndef _friso_api_h
|
||||
#define _friso_api_h
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/*platform shared library statement :: unix*/
|
||||
#define FRISO_API extern
|
||||
//#define FRISO_API extern __declspec(dllexport)
|
||||
#define __STATIC_API__ static inline
|
||||
|
||||
|
||||
/*
 * Report a failed memory allocation and terminate the process with
 * exit status 1.
 * The two statements are wrapped in do { } while (0) so that the macro
 * is a single statement: with the bare two-statement form, an un-braced
 * `if ( p == NULL ) ___ALLOCATION_ERROR___` would run exit(1)
 * unconditionally. The terminating ';' is part of the macro because the
 * existing call sites invoke it without one.
 */
#define ___ALLOCATION_ERROR___ \
    do { \
        printf("Unable to do the memory allocation, program will now exit\n" ); \
        exit(1); \
    } while (0);
|
||||
|
||||
#define print(str) printf("%s", str )
|
||||
#define println(str) printf("%s\n", str )
|
||||
|
||||
/*
|
||||
* memory allocation macro definition.
|
||||
* PHP extensions must use emalloc, ecalloc, etc. instead,
* so redefine these macros to adapt friso to the PHP environment.
|
||||
*/
|
||||
#define FRISO_CALLOC(_bytes, _blocks) calloc(_bytes, _blocks)
|
||||
#define FRISO_MALLOC(_bytes) malloc(_bytes)
|
||||
#define FRISO_FREE( _ptr ) free( _ptr )
|
||||
|
||||
typedef unsigned short ushort_t;
|
||||
typedef unsigned char uchar_t;
|
||||
typedef unsigned int uint_t;
|
||||
typedef char * fstring;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/* {{{ fstring handle interface define::start. */
|
||||
#define __CHAR_BYTES__ 8
|
||||
#define __BUFFER_DEFAULT_LENGTH__ 16
|
||||
|
||||
typedef struct {
|
||||
fstring buffer;
|
||||
uint_t length;
|
||||
uint_t allocs;
|
||||
} string_buffer_entry;
|
||||
|
||||
typedef string_buffer_entry * string_buffer_t;
|
||||
|
||||
//FRISO_API string_buffer_t new_string_buffer( void );
|
||||
/*
 * Create a string buffer with the default capacity.
 * No trailing ';' in the expansion: the caller supplies it, which lets
 * the macro be used inside larger expressions and argument lists.
 * NOTE(review): the capacity used here is __DEFAULT_ARRAY_LIST_OPACITY__,
 * not __BUFFER_DEFAULT_LENGTH__ - confirm that this is intended.
 */
#define new_string_buffer() new_string_buffer_with_opacity( __DEFAULT_ARRAY_LIST_OPACITY__ )
|
||||
FRISO_API string_buffer_t new_string_buffer_with_opacity( uint_t );
|
||||
FRISO_API string_buffer_t new_string_buffer_with_string( fstring str );
|
||||
|
||||
/*
|
||||
* this function will copy the chars that the fstring pointed.
|
||||
* to the buffer.
|
||||
* this may cause the resize action of the buffer.
|
||||
*/
|
||||
FRISO_API void string_buffer_append( string_buffer_t, fstring );
|
||||
|
||||
//insert the given fstring from the specified position.
|
||||
FRISO_API void string_buffer_insert( string_buffer_t, uint_t idx, fstring );
|
||||
|
||||
//remove the char in the specified position.
|
||||
FRISO_API fstring string_buffer_remove( string_buffer_t, uint_t idx, uint_t );
|
||||
|
||||
/*
|
||||
* turn the string_buffer to a string.
|
||||
* or return the buffer of the string_buffer.
|
||||
*/
|
||||
FRISO_API string_buffer_t string_buffer_trim( string_buffer_t );
|
||||
|
||||
/*
|
||||
* free the given fstring buffer.
|
||||
* and this function will not free the allocations of the
|
||||
* the string_buffer_t->buffer, we return it to you, if there is
|
||||
* a necessary you could free it youself by calling free();
|
||||
*/
|
||||
FRISO_API fstring string_buffer_devote( string_buffer_t );
|
||||
|
||||
/*
|
||||
* clear the given fstring buffer.
|
||||
* reset its buffer with 0 and reset its length to 0.
|
||||
*/
|
||||
FRISO_API void string_buffer_clear( string_buffer_t );
|
||||
|
||||
//free the fstring buffer include the buffer.
|
||||
FRISO_API void free_string_buffer( string_buffer_t );
|
||||
|
||||
/**
|
||||
* fstring specified chars tokenizer functions
|
||||
*
|
||||
* @date 2013-06-08
|
||||
*/
|
||||
typedef struct {
|
||||
fstring source;
|
||||
uint_t srcLen;
|
||||
fstring delimiter;
|
||||
uint_t delLen;
|
||||
uint_t idx;
|
||||
} string_split_entry;
|
||||
typedef string_split_entry * string_split_t;
|
||||
|
||||
/**
|
||||
* create a new string_split_entry.
|
||||
*
|
||||
* @param source
|
||||
* @return string_split_t;
|
||||
*/
|
||||
FRISO_API string_split_t new_string_split( fstring, fstring );
|
||||
|
||||
FRISO_API void string_split_reset( string_split_t, fstring, fstring );
|
||||
|
||||
FRISO_API void string_split_set_source( string_split_t, fstring );
|
||||
|
||||
FRISO_API void string_split_set_delimiter( string_split_t, fstring );
|
||||
|
||||
FRISO_API void free_string_split( string_split_t );
|
||||
|
||||
/**
|
||||
* get the next split fstring, and copy the
|
||||
* splited fstring into the __dst buffer .
|
||||
*
|
||||
* @param string_split_t
|
||||
* @param __dst
|
||||
* @return fstring (NULL if reach the end of the source
|
||||
* or there is no more segmentation)
|
||||
*/
|
||||
FRISO_API fstring string_split_next( string_split_t, fstring );
|
||||
|
||||
/* **********************************************
|
||||
* utf-8 handle functions. *
|
||||
************************************************/
|
||||
|
||||
//print the given integer in a binary style.
|
||||
FRISO_API void print_char_binary( char );
|
||||
|
||||
//get the bytes of a utf-8 char.
|
||||
FRISO_API int get_utf8_bytes( char );
|
||||
|
||||
//return the unicode serial number of a given string.
|
||||
FRISO_API int get_utf8_unicode( const fstring );
|
||||
|
||||
//turn the unicode serial to a utf-8 string.
|
||||
FRISO_API int unicode_to_utf8( uint_t, fstring );
|
||||
|
||||
//check if the given char is a CJK.
|
||||
FRISO_API int utf8_cjk_string( uint_t ) ;
|
||||
|
||||
/*check the given char is a Basic Latin letter or not.
|
||||
* include all the letters and english puntuations.*/
|
||||
FRISO_API int utf8_halfwidth_en_char( uint_t );
|
||||
|
||||
/*
|
||||
* check whether the given char is a full-width Latin char or not.
* this includes the full-width Arabic numerals and letters,
|
||||
* but not the full-width puntuations.
|
||||
*/
|
||||
FRISO_API int utf8_fullwidth_en_char( uint_t );
|
||||
|
||||
//check the given char is a upper case char or not.
|
||||
FRISO_API int utf8_uppercase_letter( uint_t );
|
||||
|
||||
//check the given char is a lower case char or not.
|
||||
FRISO_API int utf8_lowercase_letter( uint_t );
|
||||
|
||||
//check the given char is a numeric
|
||||
FRISO_API int utf8_numeric_letter( uint_t );
|
||||
|
||||
/*
|
||||
* check if the given fstring is make up with numeric chars.
|
||||
both full-width,half-width numeric is ok.
|
||||
*/
|
||||
FRISO_API int utf8_numeric_string( const fstring );
|
||||
|
||||
FRISO_API int utf8_decimal_string( const fstring );
|
||||
|
||||
//check the given char is a english char.
|
||||
//(full-width and half-width)
|
||||
//not the punctuation of course.
|
||||
FRISO_API int utf8_en_letter( uint_t );
|
||||
|
||||
//check the given char is a whitespace or not.
|
||||
FRISO_API int utf8_whitespace( uint_t );
|
||||
|
||||
/* check if the given char is a letter number
|
||||
* like 'ⅠⅡ'
|
||||
*/
|
||||
FRISO_API int utf8_letter_number( uint_t );
|
||||
|
||||
/*
|
||||
* check if the given char is a other number
|
||||
* like '①⑩⑽㈩'
|
||||
*/
|
||||
FRISO_API int utf8_other_number( uint_t );
|
||||
|
||||
//check the given char is a english punctuation.
|
||||
FRISO_API int utf8_en_punctuation( uint_t ) ;
|
||||
|
||||
//check the given char is a chinese punctuation.
|
||||
FRISO_API int utf8_cn_punctuation( uint_t u );
|
||||
|
||||
//FRISO_API int is_en_punctuation( char );
|
||||
/*
 * Check whether the char c is an English punctuation mark.
 * The argument is parenthesized so the cast applies to the whole
 * argument expression, not just to its first operand.
 */
#define is_en_punctuation( c ) utf8_en_punctuation( (uint_t) (c) )
|
||||
|
||||
FRISO_API int utf8_keep_punctuation( fstring );
|
||||
|
||||
//check the given english char is a full-width char or not.
|
||||
FRISO_API int utf8_fullwidth_char( uint_t ) ;
|
||||
/* }}} fstring interface define::end*/
|
||||
|
||||
|
||||
|
||||
/* {{{ dynamaic array interface define::start*/
|
||||
#define __DEFAULT_ARRAY_LIST_OPACITY__ 8
|
||||
|
||||
/*friso array list entry struct*/
|
||||
typedef struct {
|
||||
void **items;
|
||||
uint_t allocs;
|
||||
uint_t length;
|
||||
} friso_array_entry;
|
||||
|
||||
typedef friso_array_entry * friso_array_t;
|
||||
|
||||
//create a new friso dynamic array.
|
||||
//FRISO_API friso_array_t new_array_list( void );
|
||||
#define new_array_list() new_array_list_with_opacity(__DEFAULT_ARRAY_LIST_OPACITY__)
|
||||
|
||||
//create a new friso dynamic array with the given opacity
|
||||
FRISO_API friso_array_t new_array_list_with_opacity( uint_t );
|
||||
|
||||
/*
|
||||
* free the given friso array.
|
||||
* and its items, but never where the items's item to pointed to .
|
||||
*/
|
||||
FRISO_API void free_array_list( friso_array_t );
|
||||
|
||||
//add a new item to the array.
|
||||
FRISO_API void array_list_add( friso_array_t, void * );
|
||||
|
||||
//insert a new item at a specifed position.
|
||||
FRISO_API void array_list_insert( friso_array_t, uint_t, void * );
|
||||
|
||||
//get a item at a specified position.
|
||||
FRISO_API void *array_list_get( friso_array_t, uint_t );
|
||||
|
||||
/*
|
||||
* set the item at a specified position.
|
||||
* this will return the old value.
|
||||
*/
|
||||
FRISO_API void *array_list_set( friso_array_t, uint_t, void * );
|
||||
|
||||
/*
|
||||
* remove the given item at a specified position.
|
||||
* this will return the value of the removed item.
|
||||
*/
|
||||
FRISO_API void *array_list_remove( friso_array_t, uint_t );
|
||||
|
||||
/*trim the array list for final use.*/
|
||||
FRISO_API friso_array_t array_list_trim( friso_array_t );
|
||||
|
||||
/*
|
||||
* clear the array list.
* every used slot is reset to NULL and the length is reset to 0.
* neither the memory the items pointed to nor the slot array itself is freed.
|
||||
*/
|
||||
FRISO_API friso_array_t array_list_clear( friso_array_t );
|
||||
|
||||
//return the size of the array.
|
||||
//FRISO_API uint_t array_list_size( friso_array_t );
|
||||
/* Number of items stored in the array; safe for any pointer expression. */
#define array_list_size( array ) ( (array)->length )
|
||||
|
||||
//return the allocations of the array.
|
||||
//FRISO_API uint_t array_list_allocs( friso_array_t );
|
||||
/* Number of slots currently allocated for the array; safe for any pointer expression. */
#define array_list_allocs( array ) ( (array)->allocs )
|
||||
|
||||
//check if the array is empty.
|
||||
//FRISO_API int array_list_empty( friso_array_t );
|
||||
/* Non-zero when the array holds no items; safe for any pointer expression. */
#define array_list_empty( array ) ( (array)->length == 0 )
|
||||
/* }}} dynamaic array interface define::end*/
|
||||
|
||||
|
||||
|
||||
|
||||
/* {{{ link list interface define::start*/
|
||||
struct friso_link_node {
|
||||
void * value;
|
||||
struct friso_link_node *prev;
|
||||
struct friso_link_node *next;
|
||||
};
|
||||
typedef struct friso_link_node link_node_entry;
|
||||
typedef link_node_entry * link_node_t;
|
||||
|
||||
/*
|
||||
* link list adt
|
||||
*/
|
||||
typedef struct {
|
||||
link_node_t head;
|
||||
link_node_t tail;
|
||||
uint_t size;
|
||||
} friso_link_entry;
|
||||
|
||||
typedef friso_link_entry * friso_link_t;
|
||||
|
||||
//create a new link list
|
||||
FRISO_API friso_link_t new_link_list( void );
|
||||
|
||||
//free the specified link list
|
||||
FRISO_API void free_link_list( friso_link_t );
|
||||
|
||||
//return the size of the current link list.
|
||||
//FRISO_API uint_t link_list_size( friso_link_t );
|
||||
/* Number of nodes in the link list; safe for any pointer expression. */
#define link_list_size( link ) ( (link)->size )
|
||||
|
||||
//check the given link is empty or not.
|
||||
//FRISO_API int link_list_empty( friso_link_t );
|
||||
/* Non-zero when the link list holds no nodes; safe for any pointer expression. */
#define link_list_empty( link ) ( (link)->size == 0 )
|
||||
|
||||
//clear all the nodes in the link list( except the head and the tail ).
|
||||
FRISO_API friso_link_t link_list_clear( friso_link_t link );
|
||||
|
||||
//add a new node to the link list.(append from the tail)
|
||||
FRISO_API void link_list_add( friso_link_t, void * );
|
||||
|
||||
//add a new node before the specified node
|
||||
FRISO_API void link_list_insert_before( friso_link_t, uint_t, void * );
|
||||
|
||||
//get the node in the current index.
|
||||
FRISO_API void *link_list_get( friso_link_t, uint_t );
|
||||
|
||||
//modify the node in the current index.
|
||||
FRISO_API void *link_list_set( friso_link_t, uint_t, void * );
|
||||
|
||||
//remove the specified link node
|
||||
FRISO_API void *link_list_remove( friso_link_t, uint_t );
|
||||
|
||||
//remove the given node
|
||||
FRISO_API void *link_list_remove_node( friso_link_t, link_node_t );
|
||||
|
||||
//remove the node from the frist.
|
||||
FRISO_API void *link_list_remove_first( friso_link_t );
|
||||
|
||||
//remove the last node from the link list
|
||||
FRISO_API void *link_list_remove_last( friso_link_t );
|
||||
|
||||
//append a node from the end.
|
||||
FRISO_API void link_list_add_last( friso_link_t, void * );
|
||||
|
||||
//add a node at the begining of the link list.
|
||||
FRISO_API void link_list_add_first( friso_link_t, void * );
|
||||
/* }}} link list interface define::end*/
|
||||
|
||||
|
||||
|
||||
|
||||
/* {{{ hashtable interface define :: start*/
|
||||
struct hash_entry {
|
||||
fstring _key; //the node key
|
||||
void * _val; //the node value
|
||||
struct hash_entry * _next;
|
||||
};
|
||||
typedef struct hash_entry friso_hash_entry;
|
||||
typedef friso_hash_entry * hash_entry_t;
|
||||
typedef void (*fhash_callback_fn_t)( hash_entry_t );
|
||||
|
||||
typedef struct {
|
||||
uint_t length;
|
||||
uint_t size;
|
||||
float factor;
|
||||
uint_t threshold;
|
||||
hash_entry_t *table;
|
||||
} friso_hash_cdt;
|
||||
|
||||
typedef friso_hash_cdt * friso_hash_t;
|
||||
|
||||
//default value for friso_hash_cdt
|
||||
#define DEFAULT_LENGTH 31
|
||||
#define DEFAULT_FACTOR 0.85f
|
||||
|
||||
/*
|
||||
* Function: new_hash_table
|
||||
* Usage: table = new_hash_table();
|
||||
* --------------------------------
|
||||
* this function allocates a new symbol table with no entries.
|
||||
*/
|
||||
FRISO_API friso_hash_t new_hash_table( void );
|
||||
|
||||
/*
|
||||
* Function: free_hash_table
|
||||
* Usage: free_hash_table( table );
|
||||
* --------------------------------------
|
||||
* this function will free all the allocation for memory.
|
||||
*/
|
||||
FRISO_API void free_hash_table( friso_hash_t, fhash_callback_fn_t );
|
||||
|
||||
/*
|
||||
* Function: put_new_mapping
|
||||
* Usage: put_mapping( table, key, value );
|
||||
* ----------------------------------------
|
||||
* the function associates the specified key with the given value.
|
||||
*/
|
||||
FRISO_API void hash_put_mapping( friso_hash_t, fstring, void * );
|
||||
|
||||
/*
|
||||
* Function: is_mapping_exists
|
||||
* Usage: bool = is_mapping_exists( table, key );
|
||||
* ----------------------------------------------
|
||||
* this function check the given key mapping is exists or not.
|
||||
*/
|
||||
FRISO_API int hash_exist_mapping( friso_hash_t, fstring );
|
||||
|
||||
/*
|
||||
* Function: get_mapping_value
|
||||
* Usage: value = get_mapping_value( table, key );
|
||||
* -----------------------------------------------
|
||||
* this function return the value associated with the given key.
|
||||
* UNDEFINED will be return if the mapping is not exists.
|
||||
*/
|
||||
FRISO_API void * hash_get_value( friso_hash_t, fstring );
|
||||
|
||||
/*
|
||||
* Function: remove_mapping
|
||||
* Usage: remove_mapping( table, key );
|
||||
* ------------------------------------
|
||||
* This function is used to remove the mapping associated with the given key.
|
||||
*/
|
||||
FRISO_API hash_entry_t hash_remove_mapping( friso_hash_t, fstring );
|
||||
|
||||
/*
|
||||
* Function: get_table_size
|
||||
* Usage: size = get_table_size( table );
|
||||
* --------------------------------------
|
||||
* This function is used to count the size of the specified table.
|
||||
*/
|
||||
//FRISO_API uint_t hash_get_size( friso_hash_t );
|
||||
/* Number of mappings stored in the hash table; safe for any pointer expression. */
#define hash_get_size( hash ) ( (hash)->size )
|
||||
/* }}} hashtable interface define :: end*/
|
||||
|
||||
|
||||
#endif /*end ifndef*/
|
||||
|
221
src/friso_array.c
Normal file
221
src/friso_array.c
Normal file
@ -0,0 +1,221 @@
|
||||
/*
|
||||
* friso dynamic array interface implemented functions file
|
||||
* that defined in header file "friso_API.h".
|
||||
* never use it for commercial use.
|
||||
*
|
||||
* @author chenxini <chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso_API.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
/* ********************************************
|
||||
* friso array list static functions block *
|
||||
**********************************************/
|
||||
/*
 * Allocate a block of __blocks item slots, every slot set to NULL.
 *
 * A request for zero slots still allocates one: calloc() with a zero
 * size is allowed to return NULL, which would be mistaken for an
 * out-of-memory condition and terminate the program (array_list_trim()
 * on an empty list asks for exactly that).
 *
 * @param  __blocks number of slots wanted
 * @return the new slot block; never NULL (allocation failure is fatal)
 */
__STATIC_API__ void **create_array_entries( uint_t __blocks )
{
    uint_t slots = ( __blocks == 0 ) ? 1 : __blocks;
    uint_t i;
    void **block = ( void ** ) FRISO_CALLOC( sizeof( void * ), slots );

    if ( block == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //all-bits-zero is not guaranteed to be a null pointer, so assign NULL explicitly.
    for ( i = 0; i < slots; i++ ) {
        block[i] = NULL;
    }

    return block;
}
|
||||
|
||||
//resize the array. (the opacity should not be smaller than array->length)
|
||||
/*
 * Replace the slot block of the array with a new one of `opacity` slots
 * and carry over the array->length stored items.
 * The caller must not pass an opacity smaller than array->length.
 *
 * @return the same array, for chaining
 */
__STATIC_API__ friso_array_t resize_array_list(
        friso_array_t array,
        uint_t opacity )
{
    void **fresh = create_array_entries( opacity );
    void **stale = array->items;
    uint_t i = 0;

    while ( i < array->length ) {
        fresh[i] = stale[i];
        i++;
    }

    FRISO_FREE( stale );
    array->items  = fresh;
    array->allocs = opacity;

    return array;
}
|
||||
|
||||
|
||||
/* ********************************************
|
||||
* friso array list FRISO_API functions block *
|
||||
**********************************************/
|
||||
//create a new array list. (A macro define has replace this.)
|
||||
//FRISO_API friso_array_t new_array_list( void ) {
|
||||
// return new_array_list_with_opacity( __DEFAULT_ARRAY_LIST_OPACITY__ );
|
||||
//}
|
||||
|
||||
//create a new array list with a given opacity.
|
||||
/*
 * Create an empty array list with room for `opacity` items.
 * Allocation failure is fatal.
 *
 * @return the new list; release it with free_array_list()
 */
FRISO_API friso_array_t new_array_list_with_opacity( uint_t opacity )
{
    friso_array_t list = ( friso_array_t )
        FRISO_MALLOC( sizeof( friso_array_entry ) );

    if ( list == NULL ) {
        ___ALLOCATION_ERROR___
    }

    list->length = 0;
    list->allocs = opacity;
    list->items  = create_array_entries( opacity );

    return list;
}
|
||||
|
||||
/*
|
||||
* free the given friso array.
|
||||
* and its items, but never where its items item pointed to .
|
||||
*/
|
||||
/*
 * Release the array list: its slot block and the list header.
 * The objects the slots point to are NOT freed; they stay owned by
 * whoever stored them.
 */
FRISO_API void free_array_list( friso_array_t array )
{
    void **slots = array->items;

    FRISO_FREE( slots );
    FRISO_FREE( array );
}
|
||||
|
||||
//add a new item to the array.
|
||||
/*
 * Append value to the end of the array list.
 * When every slot is in use the list first grows to 2 * length + 1 slots.
 */
FRISO_API void array_list_add( friso_array_t array, void *value )
{
    if ( array->allocs == array->length ) {
        resize_array_list( array, array->length * 2 + 1 );
    }

    array->items[array->length] = value;
    array->length++;
}
|
||||
|
||||
//insert a new item at a specified position.
|
||||
FRISO_API void array_list_insert(
|
||||
friso_array_t array,
|
||||
uint_t idx,
|
||||
void * value )
|
||||
{
|
||||
register uint_t t;
|
||||
|
||||
if ( idx <= array->length )
|
||||
{
|
||||
//check the condition to resize the array.
|
||||
if ( array->length == array->allocs ) {
|
||||
resize_array_list( array, array->length * 2 + 1 );
|
||||
}
|
||||
|
||||
//move the elements after idx.
|
||||
for ( t = idx; t < array->length; t++ ) {
|
||||
array->items[t+1] = array->items[t];
|
||||
}
|
||||
array->items[idx] = value;
|
||||
array->length++;
|
||||
}
|
||||
}
|
||||
|
||||
//get the item at a specified position.
|
||||
/*
 * Fetch the item stored at position idx.
 *
 * @return the item, or NULL when idx is not below array->length
 */
FRISO_API void *array_list_get( friso_array_t array, uint_t idx )
{
    if ( idx >= array->length ) {
        return NULL;
    }

    return array->items[idx];
}
|
||||
|
||||
//set the value of the item at a specified position.
|
||||
//this will return the old value.
|
||||
FRISO_API void * array_list_set(
|
||||
friso_array_t array,
|
||||
uint_t idx,
|
||||
void * value )
|
||||
{
|
||||
void * oval = NULL;
|
||||
if ( idx < array->length )
|
||||
{
|
||||
oval = array->items[idx];
|
||||
array->items[idx] = value;
|
||||
}
|
||||
return oval;
|
||||
}
|
||||
|
||||
//remove the item at a specified position.
|
||||
//this will return the value of the removed item.
|
||||
FRISO_API void * array_list_remove(
|
||||
friso_array_t array, uint_t idx )
|
||||
{
|
||||
register uint_t t;
|
||||
void *oval = NULL;
|
||||
|
||||
if ( idx < array->length )
|
||||
{
|
||||
oval = array->items[idx];
|
||||
//move the elements after idx.
|
||||
for ( t = idx; t < array->length - 1; t++ ) {
|
||||
array->items[t] = array->items[ t + 1 ];
|
||||
}
|
||||
array->items[array->length - 1] = NULL;
|
||||
array->length--;
|
||||
}
|
||||
|
||||
return oval;
|
||||
}
|
||||
|
||||
/*trim the array list*/
|
||||
/*
 * Shrink the slot block so that it holds exactly array->length slots.
 * Nothing happens when the list is already full.
 *
 * @return the same array, for chaining
 */
FRISO_API friso_array_t array_list_trim( friso_array_t array )
{
    if ( array->length >= array->allocs ) {
        return array;
    }

    return resize_array_list( array, array->length );
}
|
||||
|
||||
/*
|
||||
* clear the array list.
* every used slot is reset to NULL and the length is reset to 0.
* neither the memory the items pointed to nor the slot array itself is freed.
|
||||
*/
|
||||
/*
 * Empty the list: every used slot is set to NULL and the length becomes 0.
 * The objects the slots pointed to are not freed, and the slot block
 * keeps its current capacity.
 *
 * @return the same array, for chaining
 */
FRISO_API friso_array_t array_list_clear( friso_array_t array )
{
    uint_t used = array->length;
    uint_t i;

    for ( i = 0; i < used; i++ ) {
        array->items[i] = NULL;
    }

    array->length = 0;

    return array;
}
|
||||
|
||||
//get the size of the array list. (A macro define has replace this.)
|
||||
//FRISO_API uint_t array_list_size( friso_array_t array ) {
|
||||
// return array->length;
|
||||
//}
|
||||
|
||||
//return the allocations of the array list.(A macro define has replace this)
|
||||
//FRISO_API uint_t array_list_allocs( friso_array_t array ) {
|
||||
// return array->allocs;
|
||||
//}
|
||||
|
||||
//check if the array is empty.(A macro define has replace this.)
|
||||
//FRISO_API int array_list_empty( friso_array_t array )
|
||||
//{
|
||||
// return ( array->length == 0 );
|
||||
//}
|
297
src/friso_hash.c
Normal file
297
src/friso_hash.c
Normal file
@ -0,0 +1,297 @@
|
||||
/*
|
||||
* friso hash table implements functions
|
||||
* defined in header file "friso_API.h".
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
//-166411799L
|
||||
//31 131 1331 13331 133331 ..
|
||||
//31 131 1313 13131 131313 .. the best
|
||||
#define HASH_FACTOR 1313131
|
||||
|
||||
/* ************************
|
||||
* mapping function area *
|
||||
**************************/
|
||||
/*
 * Map a NUL-terminated string onto a bucket index.
 * The code is the polynomial sum of the chars of str with multiplier
 * HASH_FACTOR, reduced modulo length.
 *
 * @param  str    the key; must not be NULL
 * @param  length number of buckets; must not be 0
 * @return a bucket index in [0, length)
 */
__STATIC_API__ uint_t hash( fstring str, uint_t length )
{
    uint_t code = 0;
    fstring cursor;

    for ( cursor = str; *cursor != '\0'; cursor++ ) {
        code = code * HASH_FACTOR + ( *cursor );
    }

    return code % length;
}
|
||||
|
||||
/*test if a integer is a prime.*/
|
||||
/*
 * Test whether n is a prime number.
 *
 * Trial division by the odd numbers up to and including the square root
 * of n. The bound is inclusive: with a strict `j * j < n` the squares of
 * primes (9, 25, 49, ...) are never divided by their root and are
 * reported as prime. Comparing j against n / j also avoids overflowing
 * j * j for large n.
 *
 * @return 1 when n is prime, 0 otherwise (including every n below 2)
 */
__STATIC_API__ int is_prime( int n )
{
    int j;

    if ( n < 2 )
        return 0;
    if ( n == 2 || n == 3 )
        return 1;
    if ( n % 2 == 0 )
        return 0;

    //n is odd here, so only odd divisors need to be tried.
    for ( j = 3; j <= n / j; j += 2 )
        if ( n % j == 0 )
            return 0;

    return 1;
}
|
||||
|
||||
/*get the next prime just after the speicified integer.*/
|
||||
/*
 * Find the first value accepted by is_prime() at or after n.
 * An even n is first bumped to n + 1; the search then tests odd
 * candidates only.
 */
__STATIC_API__ int next_prime( int n )
{
    int candidate = ( n % 2 == 0 ) ? n + 1 : n;

    while ( ! is_prime( candidate ) ) {
        candidate += 2;
    }

    return candidate;
}
|
||||
|
||||
//fstring copy, return the pointer of the new string.
|
||||
//static fstring string_copy( fstring _src ) {
|
||||
//int bytes = strlen( _src );
|
||||
//fstring _dst = ( fstring ) FRISO_MALLOC( bytes + 1 );
|
||||
//register int t = 0;
|
||||
|
||||
//do {
|
||||
//_dst[t] = _src[t];
|
||||
//t++;
|
||||
//} while ( _src[t] != '\0' );
|
||||
//_dst[t] = '\0';
|
||||
|
||||
//return _dst;
|
||||
//}
|
||||
|
||||
/* *********************************
|
||||
* static hashtable function area. *
|
||||
***********************************/
|
||||
/*
 * Allocate one chain node for the hash table.
 * The key pointer is stored as given (the string is not copied), so the
 * key must outlive the entry. Allocation failure is fatal.
 *
 * @param  key   key pointer stored in the node
 * @param  value value pointer stored in the node
 * @param  next  node that follows the new one in its bucket chain
 * @return the new node
 */
__STATIC_API__ hash_entry_t new_hash_entry(
        fstring key,
        void * value,
        hash_entry_t next )
{
    hash_entry_t node = ( hash_entry_t )
        FRISO_MALLOC( sizeof( friso_hash_entry ) );

    if ( node == NULL ) {
        ___ALLOCATION_ERROR___
    }

    node->_next = next;
    node->_val  = value;
    node->_key  = key;

    return node;
}
|
||||
|
||||
//create blocks copy of entries.
|
||||
/*
 * Allocate a bucket array of `blocks` chain heads, every head set to NULL.
 * Allocation failure is fatal.
 */
__STATIC_API__ hash_entry_t * create_hash_entries( uint_t blocks )
{
    uint_t i = 0;
    hash_entry_t *buckets = ( hash_entry_t * )
        FRISO_CALLOC( sizeof( hash_entry_t ), blocks );

    if ( buckets == NULL ) {
        ___ALLOCATION_ERROR___
    }

    while ( i < blocks ) {
        buckets[i] = NULL;
        i++;
    }

    return buckets;
}
|
||||
|
||||
//a static function to do the re-hash work.
|
||||
/*
 * Grow the hash table and re-distribute its entries.
 *
 * The new bucket count is the first prime at or after 2 * length + 1.
 * Existing chain nodes are relinked into the new bucket array (no node
 * is allocated or freed), the threshold is recomputed from the load
 * factor, and the old bucket array is released.
 */
__STATIC_API__ void rebuild_hash( friso_hash_t _hash )
{
    uint_t t, bucket;
    uint_t length = next_prime( _hash->length * 2 + 1 );
    hash_entry_t e, next;
    hash_entry_t *_src = _hash->table;
    hash_entry_t *table = create_hash_entries( length );

    //relink every node of every old chain.
    for ( t = 0; t < _hash->length; t++ ) {
        for ( e = _src[t]; e != NULL; e = next ) {
            next = e->_next;

            //hash_put_mapping() files a NULL key under bucket 0 without
            //hashing it; do the same here instead of dereferencing NULL.
            bucket = ( e->_key == NULL ) ? 0 : hash( e->_key, length );

            e->_next = table[bucket];
            table[bucket] = e;
        }
    }

    _hash->table = table;
    _hash->length = length;
    _hash->threshold = ( uint_t ) ( _hash->length * _hash->factor );

    //free the old hash_entry_t blocks allocations.
    FRISO_FREE( _src );
}
|
||||
|
||||
/* ********************************
|
||||
* hashtable interface functions. *
|
||||
* ********************************/
|
||||
|
||||
//create a new hash table.
|
||||
/*
 * Create an empty hash table with DEFAULT_LENGTH buckets and a load
 * factor of DEFAULT_FACTOR. Allocation failure is fatal.
 *
 * @return the new table; release it with free_hash_table()
 */
FRISO_API friso_hash_t new_hash_table( void )
{
    friso_hash_t table = ( friso_hash_t ) FRISO_MALLOC( sizeof ( friso_hash_cdt ) );

    if ( table == NULL ) {
        ___ALLOCATION_ERROR___
    }

    table->size      = 0;
    table->length    = DEFAULT_LENGTH;
    table->factor    = DEFAULT_FACTOR;
    //number of mappings at which the table is rebuilt.
    table->threshold = ( uint_t ) ( table->length * table->factor );
    table->table     = create_hash_entries( table->length );

    return table;
}
|
||||
|
||||
FRISO_API void free_hash_table(
|
||||
friso_hash_t _hash,
|
||||
fhash_callback_fn_t fentry_func )
|
||||
{
|
||||
register uint_t j;
|
||||
hash_entry_t e, n;
|
||||
|
||||
for ( j = 0; j < _hash->length; j++ )
|
||||
{
|
||||
e = *( _hash->table + j );
|
||||
for ( ; e != NULL ; ) {
|
||||
n = e->_next;
|
||||
if ( fentry_func != NULL ) fentry_func(e);
|
||||
FRISO_FREE( e );
|
||||
e = n;
|
||||
}
|
||||
}
|
||||
|
||||
//free the pointer array block ( 4 * htable->length continuous bytes ).
|
||||
FRISO_FREE( _hash->table );
|
||||
FRISO_FREE( _hash );
|
||||
}
|
||||
|
||||
|
||||
//put a new mapping insite.
|
||||
//the value cannot be NULL.
|
||||
FRISO_API void hash_put_mapping(
|
||||
friso_hash_t _hash,
|
||||
fstring key,
|
||||
void * value )
|
||||
{
|
||||
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
|
||||
hash_entry_t e = *( _hash->table + bucket );
|
||||
|
||||
//check the given key is already exists or not.
|
||||
for ( ; e != NULL; e = e->_next )
|
||||
{
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ) )
|
||||
{
|
||||
e->_val = value;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
//put a new mapping into the hashtable.
|
||||
_hash->table[bucket] = new_hash_entry( key, value, _hash->table[bucket] );
|
||||
_hash->size++;
|
||||
|
||||
//check the condition to rebuild the hashtable.
|
||||
if ( _hash->size >= _hash->threshold )
|
||||
rebuild_hash( _hash );
|
||||
|
||||
}
|
||||
|
||||
//check the existence of the mapping associated with the given key.
|
||||
FRISO_API int hash_exist_mapping(
|
||||
friso_hash_t _hash, fstring key )
|
||||
{
|
||||
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
|
||||
hash_entry_t e;
|
||||
|
||||
for ( e = *( _hash->table + bucket );
|
||||
e != NULL;
|
||||
e = e->_next ) {
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ))
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//get the value associated with the given key.
|
||||
FRISO_API void *hash_get_value( friso_hash_t _hash, fstring key )
|
||||
{
|
||||
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
|
||||
hash_entry_t e;
|
||||
|
||||
for ( e = *( _hash->table + bucket );
|
||||
e != NULL;
|
||||
e = e->_next ) {
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ))
|
||||
{
|
||||
return e->_val;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//remove the mapping associated with the given key.
|
||||
FRISO_API hash_entry_t hash_remove_mapping(
|
||||
friso_hash_t _hash, fstring key )
|
||||
{
|
||||
uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
|
||||
hash_entry_t e, prev = NULL;
|
||||
hash_entry_t b;
|
||||
|
||||
for ( e = *( _hash->table + bucket );
|
||||
e != NULL;
|
||||
prev = e, e = e->_next ) {
|
||||
if ( key == e->_key
|
||||
|| ( key != NULL && e->_key != NULL
|
||||
&& strcmp( key, e->_key ) == 0 ) )
|
||||
{
|
||||
b = e;
|
||||
//the node located at *( htable->table + bucket )
|
||||
if ( prev == NULL ) {
|
||||
_hash->table[bucket] = e->_next;
|
||||
} else {
|
||||
prev->_next = e->_next;
|
||||
}
|
||||
//printf("%s was removed\n", b->_key);
|
||||
_hash->size--;
|
||||
//FRISO_FREE( b );
|
||||
return b;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//count the size. (A macro definition has replaced this.)
|
||||
//FRISO_API uint_t hash_get_size( friso_hash_t _hash ) {
|
||||
// return _hash->size;
|
||||
//}
|
529
src/friso_lexicon.c
Normal file
529
src/friso_lexicon.c
Normal file
@ -0,0 +1,529 @@
|
||||
/*
|
||||
* friso lexicon implemented functions.
|
||||
* used to deal with the friso lexicon, like: load,remove,match...
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "friso_API.h"
|
||||
#include "friso.h"
|
||||
|
||||
#define __SPLIT_MAX_TOKENS__ 5
|
||||
#define __LEX_FILE_DELIME__ '#'
|
||||
#define __FRISO_LEX_IFILE__ "friso.lex.ini"
|
||||
|
||||
//create a new lexicon
|
||||
FRISO_API friso_dic_t friso_dic_new()
|
||||
{
|
||||
register uint_t t;
|
||||
friso_dic_t dic = ( friso_dic_t ) FRISO_CALLOC(
|
||||
sizeof( friso_hash_t ), __FRISO_LEXICON_LENGTH__ );
|
||||
if ( dic == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
|
||||
dic[t] = new_hash_table();
|
||||
}
|
||||
|
||||
return dic;
|
||||
}
|
||||
|
||||
/**
|
||||
* default callback function to invoke
|
||||
* when free the friso dictionary .
|
||||
*
|
||||
* @date 2013-06-12
|
||||
*/
|
||||
__STATIC_API__ void default_fdic_callback( hash_entry_t e )
|
||||
{
|
||||
register uint_t i;
|
||||
friso_array_t syn;
|
||||
lex_entry_t lex = ( lex_entry_t ) e->_val;
|
||||
//free the lex->word
|
||||
FRISO_FREE( lex->word );
|
||||
//free the lex->syn if it is not NULL
|
||||
if ( lex->syn != NULL ) {
|
||||
syn = lex->syn;
|
||||
for ( i = 0; i < syn->length; i++ ) {
|
||||
FRISO_FREE( syn->items[i] );
|
||||
}
|
||||
free_array_list( syn );
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Destroy a dictionary: free each per-type hash table (running
 * default_fdic_callback on every entry), then the table array.
 *
 * @param dic  dictionary to destroy
 */
FRISO_API void friso_dic_free( friso_dic_t dic )
{
    register uint_t idx = 0;

    while ( idx < __FRISO_LEXICON_LENGTH__ ) {
        free_hash_table( dic[idx], default_fdic_callback );
        idx++;
    }

    FRISO_FREE( dic );
}
|
||||
|
||||
|
||||
//create a new lexicon entry
|
||||
FRISO_API lex_entry_t new_lex_entry(
|
||||
fstring word,
|
||||
friso_array_t syn,
|
||||
uint_t fre,
|
||||
uint_t length,
|
||||
uint_t type )
|
||||
{
|
||||
lex_entry_t e = ( lex_entry_t )
|
||||
FRISO_MALLOC( sizeof( lex_entry_cdt ) );
|
||||
if ( e == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize.
|
||||
e->word = word;
|
||||
e->syn = syn;
|
||||
e->fre = fre;
|
||||
e->length = length;
|
||||
e->type = type;
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
/**
|
||||
* free the given lexicon entry.
|
||||
* you have to do three thing maybe:
|
||||
* 1. free where its syn items points to. (not implemented)
|
||||
* 2. free its syn. (friso_array_t)
|
||||
* 3. free the lex_entry_t.
|
||||
*/
|
||||
FRISO_API void free_lex_entry( lex_entry_t e )
|
||||
{
|
||||
//if ( e->syn != NULL ) {
|
||||
// if ( flag == 1 ) free_array_list( e->syn);
|
||||
// else free_array_list( e->syn );
|
||||
//}
|
||||
FRISO_FREE( e );
|
||||
}
|
||||
|
||||
|
||||
//add a new entry to the dictionary.
|
||||
FRISO_API void friso_dic_add(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word,
|
||||
friso_array_t syn )
|
||||
{
|
||||
if ( lex >= 0 || lex < __FRISO_LEXICON_LENGTH__ )
|
||||
{
|
||||
//printf("lex=%d, word=%s, syn=%s\n", lex, word, syn);
|
||||
hash_put_mapping( dic[lex], word,
|
||||
new_lex_entry( word, syn, 0,
|
||||
(uint_t) strlen(word), (uint_t) lex ) );
|
||||
}
|
||||
}
|
||||
|
||||
FRISO_API void friso_dic_add_with_fre(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word,
|
||||
friso_array_t syn,
|
||||
uint_t frequency )
|
||||
{
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
hash_put_mapping( dic[lex], word,
|
||||
new_lex_entry( word, syn, frequency,
|
||||
( uint_t ) strlen(word), ( uint_t ) lex ) );
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* read a line from a specified stream.
|
||||
* the newline will be cleared.
|
||||
*
|
||||
* @date 2012-11-24
|
||||
*/
|
||||
FRISO_API fstring file_get_line( fstring __dst, FILE * _stream )
|
||||
{
|
||||
register int c;
|
||||
fstring cs;
|
||||
|
||||
cs = __dst;
|
||||
while ( ( c = fgetc( _stream ) ) != EOF )
|
||||
{
|
||||
if ( c == '\n' ) break;
|
||||
*cs++ = c;
|
||||
}
|
||||
*cs = '\0';
|
||||
|
||||
return ( c == EOF && cs == __dst ) ? NULL : __dst;
|
||||
}
|
||||
|
||||
/*
|
||||
* static function to copy a string.
|
||||
*/
|
||||
///instead of memcpy
|
||||
__STATIC_API__ fstring string_copy(
|
||||
fstring _src,
|
||||
fstring __dst,
|
||||
uint_t blocks )
|
||||
{
|
||||
|
||||
register fstring __src = _src;
|
||||
register uint_t t;
|
||||
|
||||
for ( t = 0; t < blocks; t++ ) {
|
||||
if ( *__src == '\0' ) break;
|
||||
__dst[t] = *__src++;
|
||||
}
|
||||
__dst[t] = '\0';
|
||||
|
||||
return __dst;
|
||||
}
|
||||
|
||||
/**
|
||||
* make a heap allocation, and copy the
|
||||
* source fstring to the new allocation, and
|
||||
* you should free it after use it .
|
||||
*
|
||||
* @param _src source fstring
|
||||
* @param blocks number of bytes to copy
|
||||
*/
|
||||
__STATIC_API__ fstring string_copy_heap(
|
||||
fstring _src, uint_t blocks )
|
||||
{
|
||||
register uint_t t;
|
||||
|
||||
fstring str = ( fstring )
|
||||
FRISO_MALLOC( blocks + 1 );
|
||||
if ( str == NULL ) {
|
||||
___ALLOCATION_ERROR___;
|
||||
}
|
||||
|
||||
for ( t = 0; t < blocks; t++ ) {
|
||||
if ( *_src == '\0' ) break;
|
||||
str[t] = *_src++;
|
||||
}
|
||||
|
||||
str[t] = '\0';
|
||||
return str;
|
||||
}
|
||||
|
||||
/*
|
||||
* find the postion of the first appear of the given char.
|
||||
* address of the char in the fstring will be return .
|
||||
* if not found NULL will be return .
|
||||
*/
|
||||
__STATIC_API__ fstring indexOf( fstring __str, char delimiter )
|
||||
{
|
||||
uint_t i, __length__;
|
||||
|
||||
__length__ = strlen( __str );
|
||||
for ( i = 0; i < __length__; i++ ) {
|
||||
if ( __str[i] == delimiter )
|
||||
return __str + i;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* load all the valid wors from a specified lexicon file .
|
||||
*
|
||||
* @param dic friso dictionary instance (A hash array)
|
||||
* @param lex the lexicon type
|
||||
* @param lex_file the path of the lexicon file
|
||||
* @param length the maximum length of the word item
|
||||
*/
|
||||
FRISO_API void friso_dic_load(
|
||||
friso_t friso,
|
||||
friso_lex_t lex,
|
||||
fstring lex_file,
|
||||
uint_t length )
|
||||
{
|
||||
|
||||
FILE * _stream;
|
||||
char __char[1024], _buffer[512];
|
||||
fstring _line;
|
||||
string_split_entry sse;
|
||||
|
||||
fstring _word;
|
||||
char _sbuffer[512];
|
||||
fstring _syn;
|
||||
friso_array_t sywords;
|
||||
uint_t _fre;
|
||||
|
||||
if ( ( _stream = fopen( lex_file, "rb" ) ) != NULL )
|
||||
{
|
||||
while ( ( _line = file_get_line( __char, _stream ) ) != NULL )
|
||||
{
|
||||
//clear up the notes
|
||||
//make sure the length of the line is greater than 1.
|
||||
//like the single '#' mark in stopwords dictionary.
|
||||
if ( _line[0] == '#' && strlen(_line) > 1 ) continue;
|
||||
|
||||
//handle the stopwords.
|
||||
if ( lex == __LEX_STOPWORDS__ )
|
||||
{
|
||||
if ( get_utf8_bytes( _line[0] ) > 1
|
||||
&& strlen( _line ) > length ) continue;
|
||||
friso_dic_add( friso->dic, __LEX_STOPWORDS__,
|
||||
string_copy_heap( _line, strlen(_line) ), NULL );
|
||||
continue;
|
||||
}
|
||||
|
||||
//split the fstring with '/'.
|
||||
string_split_reset( &sse, "/", _line);
|
||||
if ( string_split_next( &sse, _buffer ) == NULL ) continue;
|
||||
|
||||
//1. get the word.
|
||||
_word = string_copy_heap( _buffer, strlen(_buffer) );
|
||||
|
||||
if ( string_split_next( &sse, _buffer ) == NULL )
|
||||
{
|
||||
//normal lexicon type,
|
||||
//add them to the dictionary directly
|
||||
friso_dic_add( friso->dic, lex, _word, NULL );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* filter out the words that its length is larger
|
||||
* than the specified limit.
|
||||
* but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__
|
||||
* and __LEX_CEM_WORDS__.
|
||||
*/
|
||||
if ( ! ( lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__ )
|
||||
&& strlen( _word ) > length ) continue;
|
||||
|
||||
//2. get the synonyms words.
|
||||
_syn = NULL;
|
||||
if ( strcmp( _buffer, "null" ) != 0 )
|
||||
_syn = string_copy( _buffer, _sbuffer, strlen(_buffer) );
|
||||
|
||||
//3. get the word frequency if it available.
|
||||
_fre = 0;
|
||||
if ( string_split_next( &sse, _buffer ) != NULL )
|
||||
_fre = atoi( _buffer );
|
||||
|
||||
/**
|
||||
* Here:
|
||||
* split the synonyms words with mark ","
|
||||
* and put them in a array list if the synonyms is not NULL
|
||||
*/
|
||||
sywords = NULL;
|
||||
if ( friso->add_syn && _syn != NULL )
|
||||
{
|
||||
string_split_reset( &sse, ",", _sbuffer );
|
||||
sywords = new_array_list_with_opacity(5);
|
||||
while ( string_split_next( &sse, _buffer ) != NULL )
|
||||
{
|
||||
if ( strlen(_buffer) > length ) continue;
|
||||
array_list_add( sywords,
|
||||
string_copy_heap(_buffer, strlen(_buffer)) );
|
||||
}
|
||||
sywords = array_list_trim( sywords );
|
||||
}
|
||||
|
||||
//4. add the word item
|
||||
friso_dic_add_with_fre(
|
||||
friso->dic, lex, _word, sywords, _fre );
|
||||
}
|
||||
|
||||
fclose( _stream );
|
||||
} else {
|
||||
printf("Warning: Fail to open lexicon file %s\n", lex_file);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get the lexicon type index with the specified
|
||||
* type keywords .
|
||||
*
|
||||
* @see friso.h#friso_lex_t
|
||||
* @param _key
|
||||
* @return int
|
||||
*/
|
||||
__STATIC_API__ friso_lex_t get_lexicon_type_with_constant( fstring _key )
|
||||
{
|
||||
if ( strcmp( _key, "__LEX_CJK_WORDS__" ) == 0 ) {
|
||||
return __LEX_CJK_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CJK_UNITS__" ) == 0 ) {
|
||||
return __LEX_CJK_UNITS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_ECM_WORDS__" ) == 0 ) {
|
||||
return __LEX_ECM_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CEM_WORDS__" ) == 0 ) {
|
||||
return __LEX_CEM_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_LNAME__" ) == 0 ) {
|
||||
return __LEX_CN_LNAME__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_SNAME__" ) == 0 ) {
|
||||
return __LEX_CN_SNAME__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_DNAME1__" ) == 0 ) {
|
||||
return __LEX_CN_DNAME1__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_DNAME2__" ) == 0 ) {
|
||||
return __LEX_CN_DNAME2__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_CN_LNA__" ) == 0 ) {
|
||||
return __LEX_CN_LNA__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_STOPWORDS__" ) == 0 ) {
|
||||
return __LEX_STOPWORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_ENPUN_WORDS__" ) == 0 ) {
|
||||
return __LEX_ENPUN_WORDS__;
|
||||
}
|
||||
else if ( strcmp( _key, "__LEX_EN_WORDS__" ) == 0 ) {
|
||||
return __LEX_EN_WORDS__;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* load the lexicon configuration file.
|
||||
* and load all the valid lexicon from the configuration file.
|
||||
*
|
||||
* @param friso friso instance
|
||||
* @param _path dictionary directory
|
||||
* @param _limitts words length limit
|
||||
*/
|
||||
FRISO_API void friso_dic_load_from_ifile(
|
||||
friso_t friso,
|
||||
fstring _path,
|
||||
uint_t _limits )
|
||||
{
|
||||
|
||||
//1.parse the configuration file.
|
||||
FILE * __stream;
|
||||
char __chars__[1024], __key__[30], *__line__;
|
||||
uint_t __length__, i, t;
|
||||
friso_lex_t lex_t;
|
||||
string_buffer_t sb;
|
||||
|
||||
//get the lexicon configruation file path
|
||||
sb = new_string_buffer();
|
||||
string_buffer_append( sb, _path );
|
||||
if ( _path[ strlen(_path) - 1 ] != '/' )
|
||||
string_buffer_append( sb, "/" );
|
||||
string_buffer_append( sb, __FRISO_LEX_IFILE__ );
|
||||
|
||||
if ( ( __stream = fopen( sb->buffer, "rb" ) ) != NULL )
|
||||
{
|
||||
while ( ( __line__ =
|
||||
file_get_line( __chars__, __stream ) ) != NULL )
|
||||
{
|
||||
//comment filter.
|
||||
if ( __line__[0] == '#' ) continue;
|
||||
if ( __line__[0] == '\0' ) continue;
|
||||
|
||||
__length__ = strlen( __line__ );
|
||||
//item start
|
||||
if ( __line__[ __length__ - 1 ] == '[' )
|
||||
{
|
||||
//get the type key
|
||||
for ( i = 0; i < __length__
|
||||
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
|
||||
for ( t = 0; i < __length__; i++,t++ ) {
|
||||
if ( __line__[i] == ' '
|
||||
|| __line__[i] == '\t' || __line__[i] == ':' ) break;
|
||||
__key__[t] = __line__[i];
|
||||
}
|
||||
__key__[t] = '\0';
|
||||
|
||||
//get the lexicon type
|
||||
lex_t = get_lexicon_type_with_constant(__key__);
|
||||
if ( lex_t == -1 ) continue;
|
||||
|
||||
//printf("key=%s, type=%d\n", __key__, lex_t );
|
||||
while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL )
|
||||
{
|
||||
//comments filter.
|
||||
if ( __line__[0] == '#' ) continue;
|
||||
if ( __line__[0] == '\0' ) continue;
|
||||
|
||||
__length__ = strlen( __line__ );
|
||||
if ( __line__[ __length__ - 1 ] == ']' ) break;
|
||||
|
||||
for ( i = 0; i < __length__
|
||||
&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
|
||||
for ( t = 0; i < __length__; i++,t++ ) {
|
||||
if ( __line__[i] == ' '
|
||||
|| __line__[i] == '\t' || __line__[i] == ';' ) break;
|
||||
__key__[t] = __line__[i];
|
||||
}
|
||||
__key__[t] = '\0';
|
||||
|
||||
//load the lexicon item from the lexicon file.
|
||||
string_buffer_clear( sb );
|
||||
string_buffer_append( sb, _path );
|
||||
string_buffer_append( sb, __key__ );
|
||||
//printf("key=%s, type=%d\n", __key__, lex_t);
|
||||
friso_dic_load( friso, lex_t, sb->buffer, _limits );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} //end while
|
||||
|
||||
fclose( __stream );
|
||||
} else {
|
||||
printf("Warning: Fail to open the lexicon configuration file %s\n", sb->buffer);
|
||||
}
|
||||
|
||||
free_string_buffer(sb);
|
||||
}
|
||||
|
||||
//match the item.
|
||||
FRISO_API int friso_dic_match(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word )
|
||||
{
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
return hash_exist_mapping( dic[lex], word );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
//get the lex_entry_t associated with the word.
|
||||
FRISO_API lex_entry_t friso_dic_get(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word )
|
||||
{
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
return ( lex_entry_t ) hash_get_value( dic[lex], word );
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//get the size of the specified type dictionary.
|
||||
FRISO_API uint_t friso_spec_dic_size(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex )
|
||||
{
|
||||
if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) {
|
||||
return hash_get_size( dic[lex] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
//get size of the whole dictionary.
|
||||
FRISO_API uint_t friso_all_dic_size(
|
||||
friso_dic_t dic )
|
||||
{
|
||||
register uint_t size = 0, t;
|
||||
|
||||
for ( t = 0; t < __FRISO_LEXICON_LENGTH__; t++ ) {
|
||||
size += hash_get_size( dic[t] );
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
276
src/friso_link.c
Normal file
276
src/friso_link.c
Normal file
@ -0,0 +1,276 @@
|
||||
/*
|
||||
* link list implemented functions
|
||||
* defined in header file "friso_API.h".
|
||||
* when the link_node is being deleted, here we just free
|
||||
* the allocation of the node, not the allocation of its value.
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
//create a new link list node.
|
||||
__STATIC_API__ link_node_t new_node_entry(
|
||||
void * value,
|
||||
link_node_t prev,
|
||||
link_node_t next )
|
||||
{
|
||||
link_node_t node = ( link_node_t )
|
||||
FRISO_MALLOC( sizeof( link_node_entry ) );
|
||||
if ( node == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
node->value = value;
|
||||
node->prev = prev;
|
||||
node->next = next;
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
//create a new link list
|
||||
FRISO_API friso_link_t new_link_list( void )
|
||||
{
|
||||
friso_link_t e = ( friso_link_t )
|
||||
FRISO_MALLOC( sizeof( friso_link_entry ) );
|
||||
if ( e == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize the entry
|
||||
e->head = new_node_entry( NULL, NULL, NULL );
|
||||
e->tail = new_node_entry( NULL, e->head, NULL );
|
||||
e->head->next = e->tail;
|
||||
e->size = 0;
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
//free the given link list
|
||||
FRISO_API void free_link_list( friso_link_t link )
|
||||
{
|
||||
link_node_t node, next;
|
||||
for ( node = link->head; node != NULL; )
|
||||
{
|
||||
next = node->next;
|
||||
FRISO_FREE( node );
|
||||
node = next;
|
||||
}
|
||||
|
||||
FRISO_FREE( link );
|
||||
}
|
||||
|
||||
//clear all nodes in the link list.
|
||||
FRISO_API friso_link_t link_list_clear( friso_link_t link )
|
||||
{
|
||||
link_node_t node, next;
|
||||
//free all the middle nodes.
|
||||
for ( node = link->head->next;
|
||||
node != link->tail; )
|
||||
{
|
||||
next = node->next;
|
||||
FRISO_FREE( node );
|
||||
node = next;
|
||||
}
|
||||
|
||||
link->head->next = link->tail;
|
||||
link->tail->prev = link->head;
|
||||
link->size = 0;
|
||||
|
||||
return link;
|
||||
}
|
||||
|
||||
//get the size of the link list.
|
||||
//FRISO_API uint_t link_list_size( friso_link_t link ) {
|
||||
// return link->size;
|
||||
//}
|
||||
|
||||
//check if the link list is empty
|
||||
//FRISO_API int link_list_empty( friso_link_t link ) {
|
||||
// return ( link->size == 0 );
|
||||
//}
|
||||
|
||||
|
||||
/*
|
||||
* find the node at a specified position.
|
||||
* static
|
||||
*/
|
||||
__STATIC_API__ link_node_t get_node(
|
||||
friso_link_t link, uint_t idx )
|
||||
{
|
||||
link_node_t p = NULL;
|
||||
register uint_t t;
|
||||
|
||||
if ( idx >= 0 && idx < link->size )
|
||||
{
|
||||
if ( idx < link->size / 2 ) { //find from the head.
|
||||
p = link->head;
|
||||
for ( t = 0; t <= idx; t++ )
|
||||
p = p->next;
|
||||
} else { //find from the tail.
|
||||
p = link->tail;
|
||||
for ( t = link->size; t > idx; t-- )
|
||||
p = p->prev;
|
||||
}
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/*
 * insert a new node holding `value` just before the given node
 * and bump the list size.
 *
 * implemented as a macro; it expands to a single statement
 * (do { ... } while ( 0 )), so it must be followed by ';' and is
 * safe inside an unbraced if/else. `node` is evaluated only once.
 * `node` must have a predecessor (it cannot be the head sentinel).
 */
#define insert_before( link, node, value )                          \
do {                                                                \
    link_node_t _ib_anchor = ( node );                              \
    link_node_t _ib_fresh = new_node_entry( ( value ),              \
            _ib_anchor->prev, _ib_anchor );                         \
    _ib_fresh->prev->next = _ib_fresh;                              \
    _ib_fresh->next->prev = _ib_fresh;                              \
    ( link )->size++;                                               \
} while ( 0 )
|
||||
|
||||
/*
|
||||
* static function:
|
||||
* remove the given node, the allocation of the value will not free,
|
||||
* but we return it to you, you will free it youself when there is a necessary.
|
||||
*
|
||||
* @return the value of the removed node.
|
||||
*/
|
||||
__STATIC_API__ void * remove_node(
|
||||
friso_link_t link, link_node_t node )
|
||||
{
|
||||
void * _value = node->value;
|
||||
|
||||
node->prev->next = node->next;
|
||||
node->next->prev = node->prev;
|
||||
link->size--;
|
||||
|
||||
FRISO_FREE( node );
|
||||
|
||||
return _value;
|
||||
}
|
||||
|
||||
|
||||
//add a new node to the link list.(insert just before the tail)
|
||||
FRISO_API void link_list_add( friso_link_t link, void * value )
|
||||
{
|
||||
insert_before( link, link->tail, value );
|
||||
}
|
||||
|
||||
//add a new node before the given index.
|
||||
FRISO_API void link_list_insert_before(
|
||||
friso_link_t link, uint_t idx, void * value )
|
||||
{
|
||||
link_node_t node = get_node( link, idx );
|
||||
if ( node != NULL ) {
|
||||
insert_before( link, node, value );
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* get the value with the specified node.
|
||||
*
|
||||
* @return the value of the node.
|
||||
*/
|
||||
FRISO_API void * link_list_get( friso_link_t link, uint_t idx )
|
||||
{
|
||||
link_node_t node = get_node( link, idx );
|
||||
if ( node != NULL ) {
|
||||
return node->value;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* set the value of the node that located in the specified position.
|
||||
* we did't free the allocation of the old value, we return it to you.
|
||||
* free it yourself when it is necessary.
|
||||
*
|
||||
* @return the old value.
|
||||
*/
|
||||
FRISO_API void *link_list_set(
|
||||
friso_link_t link, uint_t idx, void * value )
|
||||
{
|
||||
link_node_t node = get_node( link, idx );
|
||||
void * _value = NULL;
|
||||
|
||||
if ( node != NULL ) {
|
||||
_value = node->value;
|
||||
node->value = value;
|
||||
}
|
||||
|
||||
return _value;
|
||||
}
|
||||
|
||||
/*
|
||||
* remove the node located in the specified position.
|
||||
*
|
||||
* @see remove_node
|
||||
* @return the value of the node removed.
|
||||
*/
|
||||
FRISO_API void *link_list_remove( friso_link_t link, uint_t idx )
|
||||
{
|
||||
link_node_t node = get_node( link, idx );
|
||||
|
||||
if ( node != NULL ) {
|
||||
//printf("idx=%d, node->value=%s\n", idx, (string) node->value );
|
||||
return remove_node( link, node );
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* remove the given node from the given link list.
|
||||
*
|
||||
* @see remove_node.
|
||||
* @return the value of the node removed.
|
||||
*/
|
||||
FRISO_API void *link_list_remove_node( friso_link_t link, link_node_t node )
|
||||
{
|
||||
return remove_node( link, node );
|
||||
}
|
||||
|
||||
//remove the first node after the head
|
||||
FRISO_API void *link_list_remove_first( friso_link_t link )
|
||||
{
|
||||
if ( link->size > 0 ) {
|
||||
return remove_node( link, link->head->next );
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//remove the last node just before the tail.
|
||||
FRISO_API void *link_list_remove_last( friso_link_t link )
|
||||
{
|
||||
if ( link->size > 0 ) {
|
||||
return remove_node( link, link->tail->prev );
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//append a node from the tail.
|
||||
FRISO_API void link_list_add_last( friso_link_t link, void * value )
|
||||
{
|
||||
insert_before( link, link->tail, value );
|
||||
}
|
||||
|
||||
//append a note just after the head.
|
||||
FRISO_API void link_list_add_frist( friso_link_t link, void * value )
|
||||
{
|
||||
insert_before( link, link->head->next, value );
|
||||
}
|
116
src/friso_split.c
Normal file
116
src/friso_split.c
Normal file
@ -0,0 +1,116 @@
|
||||
/**
|
||||
* friso fstring split function implementations defined
|
||||
* in header file friso_API.h .
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
* @date 2013-06-08
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "friso_API.h"
|
||||
|
||||
/**
|
||||
* create a new string_split_entry.
|
||||
*
|
||||
* @param source
|
||||
* @return string_split_t;
|
||||
*/
|
||||
FRISO_API string_split_t new_string_split(
|
||||
fstring delimiter,
|
||||
fstring source )
|
||||
{
|
||||
string_split_t e = ( string_split_t )
|
||||
FRISO_MALLOC( sizeof( string_split_entry ) );
|
||||
if ( e == NULL ) {
|
||||
___ALLOCATION_ERROR___;
|
||||
}
|
||||
|
||||
e->delimiter = delimiter;
|
||||
e->delLen = strlen(delimiter);
|
||||
e->source = source;
|
||||
e->srcLen = strlen(source);
|
||||
e->idx = 0;
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
FRISO_API void string_split_reset(
|
||||
string_split_t sst,
|
||||
fstring delimiter,
|
||||
fstring source )
|
||||
{
|
||||
sst->delimiter = delimiter;
|
||||
sst->delLen = strlen(delimiter);
|
||||
sst->source = source;
|
||||
sst->srcLen = strlen(source);
|
||||
sst->idx = 0;
|
||||
}
|
||||
|
||||
FRISO_API void string_split_set_source(
|
||||
string_split_t sst, fstring source )
|
||||
{
|
||||
sst->source = source;
|
||||
sst->srcLen = strlen(source);
|
||||
sst->idx = 0;
|
||||
}
|
||||
|
||||
FRISO_API void string_split_set_delimiter(
|
||||
string_split_t sst, fstring delimiter )
|
||||
{
|
||||
sst->delimiter = delimiter;
|
||||
sst->delLen = strlen( delimiter );
|
||||
sst->idx = 0;
|
||||
}
|
||||
|
||||
/**
 * free a splitter. only the splitter allocation is released;
 * the delimiter and source strings it points to are untouched.
 *
 * @param sst  the splitter to free
 */
FRISO_API void free_string_split( string_split_t sst )
{
    FRISO_FREE( sst );
}
|
||||
|
||||
/**
|
||||
* get the next split fstring, and copy the
|
||||
* splited fstring into the __dst buffer .
|
||||
*
|
||||
* @param string_split_t
|
||||
* @param __dst
|
||||
* @return fstring (NULL if reach the end of the source
|
||||
* or there is no more segmentation)
|
||||
*/
|
||||
FRISO_API fstring string_split_next(
|
||||
string_split_t sst, fstring __dst)
|
||||
{
|
||||
uint_t i, _ok;
|
||||
fstring _dst = __dst;
|
||||
|
||||
//check if reach the end of the fstring
|
||||
if ( sst->idx >= sst->srcLen ) return NULL;
|
||||
|
||||
while ( 1 )
|
||||
{
|
||||
_ok = 1;
|
||||
for ( i = 0; i < sst->delLen
|
||||
&& (sst->idx + i < sst->srcLen); i++ )
|
||||
{
|
||||
if ( sst->source[sst->idx+i] != sst->delimiter[i] )
|
||||
{
|
||||
_ok = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//find the delimiter here,
|
||||
//break the loop and self plus the sst->idx, then return the buffer .
|
||||
if ( _ok == 1 ) {
|
||||
sst->idx += sst->delLen;
|
||||
break;
|
||||
}
|
||||
|
||||
//coy the char to the buffer
|
||||
*_dst++ = sst->source[sst->idx++];
|
||||
//check if reach the end of the fstring
|
||||
if ( sst->idx >= sst->srcLen ) break;
|
||||
}
|
||||
|
||||
*_dst = '\0';
|
||||
return _dst;
|
||||
}
|
636
src/friso_string.c
Normal file
636
src/friso_string.c
Normal file
@ -0,0 +1,636 @@
|
||||
/*
|
||||
* utf-8 handle function implements.
|
||||
* you could modify it or re-release it but never for commercial use.
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* ******************************************
|
||||
* fstring buffer functions implements. *
|
||||
********************************************/
|
||||
/*
 * Allocate a zero filled byte buffer.
 *
 * @param length  number of bytes to allocate
 * @return        the new buffer
 */
__STATIC_API__ fstring create_buffer( uint_t length )
{
    fstring bytes;

    bytes = ( fstring ) FRISO_MALLOC( length );
    if ( bytes == NULL ) {
        ___ALLOCATION_ERROR___
    }

    return ( fstring ) memset( bytes, 0x00, length );
}
|
||||
|
||||
//the __allocs should not be smaller than sb->length
|
||||
__STATIC_API__ string_buffer_t resize_buffer(
|
||||
string_buffer_t sb, uint_t __allocs )
|
||||
{
|
||||
//create a new buffer.
|
||||
//if ( __allocs < sb->length ) __allocs = sb->length + 1;
|
||||
fstring str = create_buffer( __allocs );
|
||||
|
||||
//register uint_t t;
|
||||
//for ( t = 0; t < sb->length; t++ ) {
|
||||
// str[t] = sb->buffer[t];
|
||||
//}
|
||||
memcpy( str, sb->buffer, sb->length );
|
||||
FRISO_FREE( sb->buffer );
|
||||
|
||||
sb->buffer = str;
|
||||
sb->allocs = __allocs;
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
//create a new fstring buffer with a default opacity.
|
||||
//FRISO_API string_buffer_t new_string_buffer( void )
|
||||
//{
|
||||
// return new_string_buffer_with_opacity( __BUFFER_DEFAULT_LENGTH__ );
|
||||
//}
|
||||
|
||||
//create a new fstring buffer with the given opacity.
|
||||
FRISO_API string_buffer_t new_string_buffer_with_opacity( uint_t opacity )
|
||||
{
|
||||
string_buffer_t sb = ( string_buffer_t )
|
||||
FRISO_MALLOC( sizeof( string_buffer_entry ) );
|
||||
if ( sb == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
sb->buffer = create_buffer( opacity );
|
||||
sb->length = 0;
|
||||
sb->allocs = opacity;
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
//create a buffer with the given string.
|
||||
FRISO_API string_buffer_t new_string_buffer_with_string( fstring str )
|
||||
{
|
||||
//buffer allocations.
|
||||
string_buffer_t sb = ( string_buffer_t )
|
||||
FRISO_MALLOC( sizeof( string_buffer_entry ) );
|
||||
if ( sb == NULL ) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize
|
||||
sb->length = strlen( str );
|
||||
sb->buffer = create_buffer( sb->length + __BUFFER_DEFAULT_LENGTH__ );
|
||||
sb->allocs = sb->length + __BUFFER_DEFAULT_LENGTH__;
|
||||
|
||||
//register uint_t t;
|
||||
//copy the str to the buffer.
|
||||
//for ( t = 0; t < sb->length; t++ ) {
|
||||
// sb->buffer[t] = str[t];
|
||||
//}
|
||||
memcpy( sb->buffer, str, sb->length );
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
FRISO_API void string_buffer_append(
|
||||
string_buffer_t sb, fstring __str )
|
||||
{
|
||||
register uint_t __len__ = strlen( __str );
|
||||
|
||||
//check the necessity to resize the buffer.
|
||||
if ( sb->length + __len__ > sb->allocs ) {
|
||||
sb = resize_buffer( sb, ( sb->length + __len__ ) * 2 + 1 );
|
||||
}
|
||||
|
||||
//register uint_t t;
|
||||
////copy the __str to the buffer.
|
||||
//for ( t = 0; t < __len__; t++ ) {
|
||||
// sb->buffer[ sb->length++ ] = __str[t];
|
||||
//}
|
||||
memcpy( sb->buffer + sb->length, __str, __len__ );
|
||||
sb->length += __len__;
|
||||
}
|
||||
|
||||
FRISO_API void string_buffer_insert(
|
||||
string_buffer_t sb,
|
||||
uint_t idx,
|
||||
fstring __str )
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* remove the given bytes from the buffer start from idx.
|
||||
* this will cause the byte move after the idx+length.
|
||||
*
|
||||
* @return the new string.
|
||||
*/
|
||||
FRISO_API fstring string_buffer_remove(
|
||||
string_buffer_t sb,
|
||||
uint_t idx,
|
||||
uint_t length )
|
||||
{
|
||||
uint_t t;
|
||||
//move the bytes after the idx + length
|
||||
for ( t = idx + length; t < sb->length; t++ ) {
|
||||
sb->buffer[t - length] = sb->buffer[t];
|
||||
}
|
||||
sb->buffer[t] = '\0';
|
||||
//memcpy( sb->buffer + idx,
|
||||
// sb->buffer + idx + length,
|
||||
// sb->length - idx - length );
|
||||
|
||||
t = sb->length - idx;
|
||||
if ( t > 0 ) {
|
||||
sb->length -= ( t > length ) ? length : t;
|
||||
}
|
||||
sb->buffer[sb->length-1] = '\0';
|
||||
|
||||
return sb->buffer;
|
||||
}
|
||||
|
||||
/*
|
||||
* turn the string_buffer to a string.
|
||||
* or return the buffer of the string_buffer.
|
||||
*/
|
||||
FRISO_API string_buffer_t string_buffer_trim( string_buffer_t sb )
|
||||
{
|
||||
//resize the buffer.
|
||||
if ( sb->length < sb->allocs - 1 ) {
|
||||
sb = resize_buffer( sb, sb->length + 1 );
|
||||
}
|
||||
return sb;
|
||||
}
|
||||
|
||||
/*
|
||||
* free the given fstring buffer.
|
||||
* and this function will not free the allocations of the
|
||||
* string_buffer_t->buffer, we return it to you, if there is
|
||||
* a necessary you could free it youself by calling free();
|
||||
*/
|
||||
FRISO_API fstring string_buffer_devote( string_buffer_t sb )
|
||||
{
|
||||
fstring buffer = sb->buffer;
|
||||
FRISO_FREE( sb );
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/*
|
||||
* clear the given fstring buffer.
|
||||
* reset its buffer with 0 and reset its length to 0.
|
||||
*/
|
||||
FRISO_API void string_buffer_clear( string_buffer_t sb )
|
||||
{
|
||||
memset( sb->buffer, 0x00, sb->length );
|
||||
sb->length = 0;
|
||||
}
|
||||
|
||||
//free everything of the fstring buffer.
|
||||
FRISO_API void free_string_buffer( string_buffer_t sb )
|
||||
{
|
||||
FRISO_FREE( sb->buffer );
|
||||
FRISO_FREE( sb );
|
||||
}
|
||||
|
||||
|
||||
/* ******************************************
|
||||
* utf-8 handle functions implements *
|
||||
********************************************/
|
||||
|
||||
/*
|
||||
* print a character in a binary style.
|
||||
*
|
||||
* @param int
|
||||
*/
|
||||
FRISO_API void print_char_binary( char value )
|
||||
{
|
||||
register uint_t t;
|
||||
|
||||
for ( t = 0; t < __CHAR_BYTES__; t++ )
|
||||
{
|
||||
if ( ( value & 0x80 ) == 0x80 ) {
|
||||
printf("1");
|
||||
} else {
|
||||
printf("0");
|
||||
}
|
||||
value <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* get the bytes of a utf-8 char.
|
||||
* between 1 - 6.
|
||||
*
|
||||
* @param __char
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int get_utf8_bytes( char value )
|
||||
{
|
||||
register uint_t t = 0;
|
||||
|
||||
//one byte ascii char.
|
||||
if ( ( value & 0x80 ) == 0 ) return 1;
|
||||
|
||||
for ( ; ( value & 0x80 ) != 0; value <<= 1 )
|
||||
t++;
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
/*
|
||||
* get the unicode serial of a utf-8 char.
|
||||
*
|
||||
* @param ch
|
||||
* @return int.
|
||||
*/
|
||||
FRISO_API int get_utf8_unicode( const fstring ch )
|
||||
{
|
||||
int code = 0, bytes = get_utf8_bytes( *ch );
|
||||
register uchar_t *bit = ( uchar_t * ) &code;
|
||||
register char b1,b2,b3;
|
||||
|
||||
switch ( bytes ) {
|
||||
case 1:
|
||||
*bit = *ch;
|
||||
break;
|
||||
case 2:
|
||||
b1 = *ch;
|
||||
b2 = *(ch + 1);
|
||||
|
||||
*bit = (b1 << 6) + (b2 & 0x3F);
|
||||
*(bit+1) = (b1 >> 2) & 0x07;
|
||||
break;
|
||||
case 3:
|
||||
b1 = *ch;
|
||||
b2 = *(ch + 1);
|
||||
b3 = *(ch + 2);
|
||||
|
||||
*bit = (b2 << 6) + (b3 & 0x3F);
|
||||
*(bit+1) = (b1 << 4) + ((b2 >> 2) & 0x0F);
|
||||
break;
|
||||
//ignore the ones that are larger than 3 bytes;
|
||||
}
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
//turn the unicode serial to a utf-8 string.
|
||||
FRISO_API int unicode_to_utf8( uint_t u, fstring __word )
|
||||
{
|
||||
if ( u <= 0x0000007F ) {
|
||||
//U-00000000 - U-0000007F
|
||||
//0xxxxxxx
|
||||
*__word = ( u & 0x7F );
|
||||
return 1;
|
||||
} else if ( u >= 0x00000080 && u <= 0x000007FF ) {
|
||||
//U-00000080 - U-000007FF
|
||||
//110xxxxx 10xxxxxx
|
||||
*( __word + 1 ) = ( u & 0x3F) | 0x80;
|
||||
*__word = ((u >> 6) & 0x1F) | 0xC0;
|
||||
return 2;
|
||||
} else if ( u >= 0x00000800 && u <= 0x0000FFFF ) {
|
||||
//U-00000800 - U-0000FFFF
|
||||
//1110xxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 2 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 12) & 0x0F) | 0xE0;
|
||||
return 3;
|
||||
} else if ( u >= 0x00010000 && u <= 0x001FFFFF ) {
|
||||
//U-00010000 - U-001FFFFF
|
||||
//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 3 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 2 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 18) & 0x07) | 0xF0;
|
||||
return 4;
|
||||
} else if ( u >= 0x00200000 && u <= 0x03FFFFFF ) {
|
||||
//U-00200000 - U-03FFFFFF
|
||||
//111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 4 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 3 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*( __word + 2 ) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 18) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 24) & 0x03) | 0xF8;
|
||||
return 5;
|
||||
} else if ( u >= 0x04000000 && u <= 0x7FFFFFFF ) {
|
||||
//U-04000000 - U-7FFFFFFF
|
||||
//1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*( __word + 5 ) = ( u & 0x3F) | 0x80;
|
||||
*( __word + 4 ) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*( __word + 3 ) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*( __word + 2 ) = ((u >> 18) & 0x3F) | 0x80;
|
||||
*( __word + 1 ) = ((u >> 24) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 30) & 0x01) | 0xFC;
|
||||
return 6;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* check the given char is a CJK char or not.
|
||||
* 2E80-2EFF CJK 部首补充
|
||||
* 2F00-2FDF 康熙字典部首
|
||||
* 3000-303F CJK 符号和标点 --ignore
|
||||
* 31C0-31EF CJK 笔画
|
||||
* 3200-32FF 封闭式 CJK 文字和月份
|
||||
* 3300-33FF CJK 兼容
|
||||
* 3400-4DBF CJK 统一表意符号扩展 A
|
||||
* 4DC0-4DFF 易经六十四卦符号
|
||||
* 4E00-9FBF CJK 统一表意符号
|
||||
* F900-FAFF CJK 兼容象形文字
|
||||
* FE30-FE4F CJK 兼容形式
|
||||
* FF00-FFEF 全角ASCII、全角标点 --ignore (as basic latin)
|
||||
*
|
||||
* Japanese:
|
||||
* 3040-309F 日本平假名
|
||||
* 30A0-30FF 日本片假名
|
||||
* 31F0-31FF 日本片假名拼音扩展
|
||||
*
|
||||
* Korean:
|
||||
* AC00-D7AF 韩文拼音
|
||||
* 1100-11FF 韩文字母
|
||||
* 3130-318F 韩文兼容字母
|
||||
*
|
||||
* @param ch :pointer to the char
|
||||
* @return int : 1 for yes and 0 for not.
|
||||
*/
|
||||
|
||||
//Comment one of the following macro define
|
||||
//to clear the check of the specified language.
|
||||
#define FRISO_CJK_CHK_C
|
||||
#define FRISO_CJK_CHK_J
|
||||
#define FRISO_CJK_CHK_K
|
||||
/*
 * check the given unicode serial against the CJK ranges that are
 * enabled through the FRISO_CJK_CHK_C / _J / _K macros.
 * the range 3400-4DBF is not part of the check.
 *
 * @param u    the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_cjk_string( uint_t u ) 
{
    //Chinese.
#ifdef FRISO_CJK_CHK_C
    if ( u >= 0x4E00 && u <= 0x9FBF ) return 1;
    if ( u >= 0x2E80 && u <= 0x2EFF ) return 1;
    if ( u >= 0x2F00 && u <= 0x2FDF ) return 1;
    if ( u >= 0x31C0 && u <= 0x31EF ) return 1;
    if ( u >= 0x3200 && u <= 0x32FF ) return 1;
    if ( u >= 0x3300 && u <= 0x33FF ) return 1;
    if ( u >= 0x4DC0 && u <= 0x4DFF ) return 1;
    if ( u >= 0xF900 && u <= 0xFAFF ) return 1;
    if ( u >= 0xFE30 && u <= 0xFE4F ) return 1;
#endif

    //Japanese.
#ifdef FRISO_CJK_CHK_J
    if ( u >= 0x3040 && u <= 0x309F ) return 1;
    if ( u >= 0x30A0 && u <= 0x30FF ) return 1;
    if ( u >= 0x31F0 && u <= 0x31FF ) return 1;
#endif

    //Korean
#ifdef FRISO_CJK_CHK_K
    if ( u >= 0xAC00 && u <= 0xD7AF ) return 1;
    if ( u >= 0x1100 && u <= 0x11FF ) return 1;
    if ( u >= 0x3130 && u <= 0x318F ) return 1;
#endif

    return 0;
}
|
||||
|
||||
/*
|
||||
* check the given char is a Basic Latin letter or not.
|
||||
* include all the letters and english punctuations.
|
||||
*
|
||||
* @param c
|
||||
* @return int 1 for yes and 0 for not.
|
||||
*/
|
||||
FRISO_API int utf8_halfwidth_en_char( uint_t u )
|
||||
{
|
||||
return ( u >= 32 && u <= 126 );
|
||||
}
|
||||
|
||||
/*
|
||||
* check the given char is a full-width latain or not.
|
||||
* include the full-width arabic numeber, letters.
|
||||
* but not the full-width punctuations.
|
||||
*
|
||||
* @param c
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_fullwidth_en_char( uint_t u )
|
||||
{
|
||||
return ( (u >= 65296 && u <= 65305 ) //arabic number
|
||||
|| ( u >= 65313 && u <= 65338 ) //upper case letters
|
||||
|| ( u >= 65345 && u <= 65370 ) ); //lower case letters
|
||||
}
|
||||
|
||||
//check the given char is a upper case char or not.
|
||||
FRISO_API int utf8_uppercase_letter( uint_t u )
|
||||
{
|
||||
if ( u > 65280 ) u -= 65248;
|
||||
return ( u >= 65 && u <= 90 );
|
||||
}
|
||||
|
||||
//check the given char is a upper case char or not.
|
||||
FRISO_API int utf8_lowercase_letter( uint_t u )
|
||||
{
|
||||
if ( u > 65280 ) u -= 65248;
|
||||
return ( u >= 97 && u <= 122 );
|
||||
}
|
||||
|
||||
//check the given char is a numeric
|
||||
FRISO_API int utf8_numeric_letter( uint_t u )
|
||||
{
|
||||
if ( u > 65280 ) u -= 65248; //make full-width half-width.
|
||||
return ( ( u >= 48 && u <= 57 ) );
|
||||
}
|
||||
|
||||
//check the given char is a english char.
|
||||
//not the punctuation of course.
|
||||
FRISO_API int utf8_en_letter( uint_t u )
|
||||
{
|
||||
if ( u > 65280 ) u -= 65248;
|
||||
return ( ( u >= 65 && u <= 90 )
|
||||
|| ( u >= 97 && u <= 122 ) );
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given fstring is make up with numeric.
|
||||
* both full-width,half-width numeric is ok.
|
||||
*
|
||||
* @param str
|
||||
* @return int
|
||||
* 65296, 0
|
||||
* 65297, 1
|
||||
* 65298, 2
|
||||
* 65299, 3
|
||||
* 65300, 4
|
||||
* 65301, 5
|
||||
* 65302, 6
|
||||
* 65303, 7
|
||||
* 65304, 8
|
||||
* 65305, 9
|
||||
*/
|
||||
FRISO_API int utf8_numeric_string( const fstring str )
|
||||
{
|
||||
fstring s = str;
|
||||
int bytes, u;
|
||||
|
||||
while ( *s != '\0' ) {
|
||||
//if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) {
|
||||
// return 0;
|
||||
//}
|
||||
|
||||
//new implemention.
|
||||
//@date 2013-10-14
|
||||
bytes = 1;
|
||||
if ( *s < 0 ) {
|
||||
//full-width chars.
|
||||
u = get_utf8_unicode(s);
|
||||
bytes = get_utf8_bytes(*s);
|
||||
if ( u < 65296 || u > 65305 )
|
||||
return 0;
|
||||
}
|
||||
else if ( *s < 48 || *s > 57 )
|
||||
return 0;
|
||||
|
||||
s += bytes;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
FRISO_API int utf8_decimal_string( const fstring str )
|
||||
{
|
||||
int len = strlen(str), i, p = 0;
|
||||
int bytes, u;
|
||||
|
||||
if ( str[0] == '.' || str[len-1] == '.' ) return 0;
|
||||
|
||||
for ( i = 1; i < len; bytes = 1 ) {
|
||||
//count the number of char '.'
|
||||
if ( str[i] == '.' ) p++;
|
||||
//full-width numeric.
|
||||
else if ( str[i] < 0 ) {
|
||||
u = get_utf8_unicode(str+i);
|
||||
bytes = get_utf8_bytes(str[i]);
|
||||
if ( u < 65296 || u > 65305 )
|
||||
return 0;
|
||||
}
|
||||
else if ( str[i] < 48 || str[i] > 57 )
|
||||
return 0;
|
||||
|
||||
i += bytes;
|
||||
}
|
||||
|
||||
return (p == 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* check the given char is a whitespace or not.
|
||||
*
|
||||
* @param ch
|
||||
* @return int 1 for yes and 0 for not.
|
||||
*/
|
||||
FRISO_API int utf8_whitespace( uint_t u )
|
||||
{
|
||||
if ( u == 32 || u == 12288 )
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* check the given char is a english punctuation.
|
||||
*
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_en_punctuation( uint_t u )
|
||||
{
|
||||
//if ( u > 65280 ) u = u - 65248; //make full-width half-width
|
||||
return ( (u > 32 && u < 48)
|
||||
|| ( u > 57 && u < 65 )
|
||||
|| ( u > 90 && u < 97 ) //added @2013-08-31
|
||||
|| ( u > 122 && u < 127 ) );
|
||||
}
|
||||
|
||||
/*
|
||||
* check the given char is a chinese punctuation.
|
||||
* @date 2013-08-31 added.
|
||||
*
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_cn_punctuation( uint_t u )
|
||||
{
|
||||
return ( ( u > 65280 && u < 65296 )
|
||||
|| ( u > 65305 && u < 65312 )
|
||||
|| ( u > 65338 && u < 65345 )
|
||||
|| ( u > 65370 && u < 65382 )
|
||||
//cjk symbol and punctuation.(added 2013-09-06)
|
||||
//from http://www.unicode.org/charts/PDF/U3000.pdf
|
||||
|| ( u >= 12289 && u <= 12319) );
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given char is a letter number in unicode.
|
||||
* like 'ⅠⅡ'.
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_letter_number( uint_t u )
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given char is a other number in unicode.
|
||||
* like '①⑩⑽㈩'.
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_other_number( uint_t u )
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
//A macro define has replace this.
|
||||
//FRISO_API int is_en_punctuation( char c )
|
||||
//{
|
||||
// return utf8_en_punctuation( (uint_t) c );
|
||||
//}
|
||||
|
||||
/* {{{
|
||||
'@', '$','%', '^', '&', '-', ':', '.', '/', '\'', '#', '+'
|
||||
*/
|
||||
static friso_hash_t __keep_punctuations_hash__ = NULL;
|
||||
|
||||
/*check the given char is an english keep char.*/
|
||||
FRISO_API int utf8_keep_punctuation( fstring str )
|
||||
{
|
||||
if ( __keep_punctuations_hash__ == NULL ) {
|
||||
__keep_punctuations_hash__ = new_hash_table();
|
||||
hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
|
||||
hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
|
||||
}
|
||||
//check the hash.
|
||||
return hash_exist_mapping( __keep_punctuations_hash__, str );
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
/*
|
||||
* check the given english char is a full-width char or not.
|
||||
*
|
||||
* @param ch
|
||||
* @return 1 for yes and 0 for not.
|
||||
*/
|
||||
FRISO_API int utf8_fullwidth_char( uint_t u )
|
||||
{
|
||||
if ( u == 12288 )
|
||||
return 1; //full-width space
|
||||
//(32 - 126) ascii code
|
||||
return (u > 65280 && u <= 65406);
|
||||
}
|
51
src/tst-array.c
Normal file
51
src/tst-array.c
Normal file
@ -0,0 +1,51 @@
|
||||
/*
|
||||
* dynamatic array test program.
|
||||
*
|
||||
* @author chenxin
|
||||
* @email chenxin619315@gmail.com
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main( int argc, char **args ) {
|
||||
|
||||
//create a new array list.
|
||||
friso_array_t array = new_array_list();
|
||||
fstring keys[] = {
|
||||
"chenmanwen", "yangqinghua",
|
||||
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
|
||||
"zhangrenfang", "yangjian",
|
||||
"liuxiao", "pankai",
|
||||
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
|
||||
"caizaili", "panpan", "xiaolude", "yintanwen"
|
||||
};
|
||||
int j, idx = 2, len = sizeof( keys ) / sizeof( fstring );
|
||||
|
||||
for ( j = 0; j < len; j++ ) {
|
||||
array_list_add( array, keys[j] );
|
||||
}
|
||||
|
||||
printf("length=%d, allocations=%d\n", array->length, array->allocs );
|
||||
array_list_trim( array );
|
||||
printf("after tirm length=%d, allocations=%d\n", array->length, array->allocs );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
|
||||
printf("\nAfter set %dth item.\n", idx );
|
||||
array_list_set( array, idx, "chenxin__" );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
|
||||
printf("\nAfter remove %dth item.\n", idx );
|
||||
array_list_remove( array, idx );
|
||||
printf("length=%d, allocations=%d\n", array->length, array->allocs );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
|
||||
printf("\nInsert a item at %dth\n", idx );
|
||||
array_list_insert( array, idx, "*chenxin*" );
|
||||
printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
|
||||
|
||||
free_array_list( array );
|
||||
|
||||
return 0;
|
||||
}
|
133
src/tst-friso.c
Normal file
133
src/tst-friso.c
Normal file
@ -0,0 +1,133 @@
|
||||
/*
|
||||
* friso test program.
|
||||
*
|
||||
* @author chenxin
|
||||
* @email chenxin619315@gmail.com
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
#include "friso.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
/**
|
||||
* File Explain.
|
||||
*
|
||||
* @author chenxin<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#define __LENGTH__ 15
|
||||
#define __INPUT_LENGTH__ 20480
|
||||
#define ___EXIT_INFO___ \
|
||||
println("Thanks for trying friso."); \
|
||||
break;
|
||||
|
||||
#define ___ABOUT___ \
|
||||
println("+-----------------------------------------------------------+"); \
|
||||
println("| friso - a chinese word segmentation writen by c. |"); \
|
||||
println("| bug report email - chenxin619315@gmail.com. |"); \
|
||||
println("| or: visit http://code.google.com/p/friso. |"); \
|
||||
println("| java edition for http://code.google.com/p/jcseg |"); \
|
||||
println("| type 'quit' to exit the program. |"); \
|
||||
println("+-----------------------------------------------------------+");
|
||||
|
||||
//read a line from a command line.
|
||||
static fstring getLine( FILE *fp, fstring __dst ) {
|
||||
register int c;
|
||||
register fstring cs;
|
||||
|
||||
cs = __dst;
|
||||
while ( ( c = getc( fp ) ) != EOF ) {
|
||||
if ( c == '\n' ) break;
|
||||
*cs++ = c;
|
||||
}
|
||||
*cs = '\0';
|
||||
|
||||
return ( c == EOF && cs == __dst ) ? NULL : __dst;
|
||||
}
|
||||
|
||||
/*static void printcode( fstring str ) {
|
||||
int i,length;
|
||||
length = strlen( str );
|
||||
printf("str:length=%d\n", length );
|
||||
for ( i = 0; i < length; i++ ) {
|
||||
printf("%d ", str[i] );
|
||||
}
|
||||
putchar('\n');
|
||||
}*/
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
clock_t s_time, e_time;
|
||||
char line[__INPUT_LENGTH__] = {0};
|
||||
int i;
|
||||
fstring __path__ = NULL;
|
||||
|
||||
friso_t friso;
|
||||
friso_task_t task;
|
||||
|
||||
//get the lexicon directory
|
||||
for ( i = 0; i < argc; i++ ) {
|
||||
if ( strcasecmp( "-init", argv[i] ) == 0 ) {
|
||||
__path__ = argv[i+1];
|
||||
}
|
||||
}
|
||||
if ( __path__ == NULL ) {
|
||||
println("Usage: friso -init lexicon path");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
s_time = clock();
|
||||
|
||||
//initialize
|
||||
/* friso_t friso = friso_new();
|
||||
friso_dic_t dic = friso_dic_new();
|
||||
|
||||
friso_dic_load_from_ifile( dic, __path__, __LENGTH__ );
|
||||
friso_set_dic( friso, dic );
|
||||
friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/
|
||||
friso = friso_new_from_ifile(__path__);
|
||||
//friso->mode = __FRISO_SIMPLE_MODE__;
|
||||
//printf("clr_stw=%d\n", friso->clr_stw);
|
||||
//printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" ));
|
||||
//printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究"));
|
||||
|
||||
e_time = clock();
|
||||
|
||||
printf("friso initialized in %fsec\n", (double) ( e_time - s_time ) / CLOCKS_PER_SEC );
|
||||
___ABOUT___;
|
||||
|
||||
//set the task.
|
||||
task = friso_new_task();
|
||||
|
||||
while ( 1 ) {
|
||||
print("friso>> ");
|
||||
getLine( stdin, line );
|
||||
//exit the programe
|
||||
if ( strcasecmp( line, "quit" ) == 0 ) {
|
||||
___EXIT_INFO___
|
||||
}
|
||||
|
||||
//for ( i = 0; i < 1000000; i++ ) {
|
||||
//set the task text.
|
||||
friso_set_text( task, line );
|
||||
println("分词结果:");
|
||||
|
||||
s_time = clock();
|
||||
while ( ( friso_next( friso, friso->mode, task ) ) != NULL ) {
|
||||
//printf("%s[%d]/ ", task->hits->word, task->hits->offset );
|
||||
printf("%s/ ", task->hits->word );
|
||||
}
|
||||
//}
|
||||
e_time = clock();
|
||||
printf("\nDone, cost < %fsec\n", ( (double)(e_time - s_time) ) / CLOCKS_PER_SEC );
|
||||
|
||||
}
|
||||
|
||||
friso_free_task( task );
|
||||
friso_free(friso);
|
||||
|
||||
return 0;
|
||||
}
|
67
src/tst-hash.c
Normal file
67
src/tst-hash.c
Normal file
@ -0,0 +1,67 @@
|
||||
/**
|
||||
* File Explain.
|
||||
*
|
||||
* @author chenxin
|
||||
* @see http://www.webssky.com
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * print the length, size, factor and threshold fields
 * of the given hash table on a single line.
 */
void print_hash_info( friso_hash_t _hash ) {
    printf(
        "info:length=%d, size=%d, facotr=%f, threshold=%d\n", 
        _hash->length, 
        _hash->size, 
        _hash->factor, 
        _hash->threshold
    );
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
friso_hash_t _hash = new_hash_table();
|
||||
char *names[] = {
|
||||
"陈满文", "阳清华",
|
||||
"陈鑫", "罗江艳",
|
||||
"小燕子", "比比",
|
||||
"张仁芳", "阳建",
|
||||
"陈配", "李恒",
|
||||
"张志刚", "张怡少",
|
||||
"阳江波", "蔡再利",
|
||||
"阳绘章", "尹唐文",
|
||||
"谭志鹏", "肖路德",
|
||||
"潘凯", "刘潇",
|
||||
"马朝辉", "张强",
|
||||
"殷美林", "元明清",
|
||||
"周安", "郭桥安",
|
||||
"刘敏", "黄广华",
|
||||
"李胜", "黄海清"
|
||||
};
|
||||
//char *str[] = {"陈鑫", "张仁芳", "比比"};
|
||||
char **str = names;
|
||||
int j, len = 30;
|
||||
|
||||
print_hash_info( _hash );
|
||||
for ( j = 0; j < len; j++) {
|
||||
hash_put_mapping( _hash, names[j], names[j] );
|
||||
}
|
||||
|
||||
print_hash_info( _hash );
|
||||
|
||||
printf("Press any key to continue.");
|
||||
getchar();
|
||||
|
||||
//remove mappings
|
||||
for ( j = 0; j < len; j++ ) {
|
||||
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
|
||||
printf("Now, remove %s\n", str[j]);
|
||||
hash_remove_mapping( _hash, str[j] );
|
||||
printf("Exist %s?%2d\n", str[j], hash_exist_mapping( _hash, str[j] ));
|
||||
printf("*********************************\n");
|
||||
}
|
||||
|
||||
printf("Press any key to continue.");
|
||||
getchar();
|
||||
|
||||
print_hash_info( _hash );
|
||||
//free the table
|
||||
free_hash_table( _hash, 0 );
|
||||
|
||||
return 0;
|
||||
}
|
95
src/tst-lex.c
Normal file
95
src/tst-lex.c
Normal file
@ -0,0 +1,95 @@
|
||||
/*
|
||||
* lex functions test program.
|
||||
*
|
||||
* @author chenxin
|
||||
* @see http://www.webssky.com
|
||||
*/
|
||||
#include "friso.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <string.h>
|
||||
|
||||
#define __LENGTH__ 15
|
||||
#define ___PRINT_HELP_INFO___ \
|
||||
printf("1. help print the current menu.\n"); \
|
||||
printf("2. #set set the classify of the dictionary.\n"); \
|
||||
printf("3. other search the words in the dictionary.\n"); \
|
||||
printf("4. quit exit the programe.\n");
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
lex_entry_t e;
|
||||
int lex = __LEX_CJK_WORDS__;
|
||||
char _line[__LENGTH__];
|
||||
clock_t s_time, e_time;
|
||||
friso_t friso;
|
||||
|
||||
s_time = clock();
|
||||
|
||||
friso = friso_new();
|
||||
friso->dic = friso_dic_new();
|
||||
//__CJK_WORDS__
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-main.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-admin.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-chars.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-cn-mz.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-cn-place.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-company.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-festival.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-flname.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-food.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-lang.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-nation.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-net.lex", __LENGTH__ );
|
||||
friso_dic_load( friso, __LEX_CJK_WORDS__, "../dict/lex-org.lex", __LENGTH__ );
|
||||
|
||||
//__CJK_UNITS__
|
||||
friso_dic_load( friso, __LEX_CJK_UNITS__, "../dict/lex-units.lex", __LENGTH__ );
|
||||
//__MIX_WORDS__
|
||||
friso_dic_load( friso, __LEX_ECM_WORDS__, "../dict/lex-mixed.lex", __LENGTH__ );
|
||||
//__CN_LNAME__
|
||||
friso_dic_load( friso, __LEX_CN_LNAME__, "../dict/lex-lname.lex", __LENGTH__ );
|
||||
//__CN_SNAME__
|
||||
friso_dic_load( friso, __LEX_CN_SNAME__, "../dict/lex-sname.lex", __LENGTH__ );
|
||||
//__CN_DNAME1__
|
||||
friso_dic_load( friso, __LEX_CN_DNAME1__, "../dict/lex-dname-1.lex", __LENGTH__ );
|
||||
//__CN_DNAME2__
|
||||
friso_dic_load( friso, __LEX_CN_DNAME2__, "../dict/lex-dname-2.lex", __LENGTH__ );
|
||||
//__CN_LNA__
|
||||
friso_dic_load( friso, __LEX_CN_LNA__, "../dict/lex-lna.lex", __LENGTH__ );
|
||||
|
||||
e_time = clock();
|
||||
|
||||
printf("Done, cost: %f sec, size=%d\n", ( double ) ( e_time - s_time ) / CLOCKS_PER_SEC, \
|
||||
friso_all_dic_size( friso->dic ) );
|
||||
|
||||
while ( 1 ) {
|
||||
printf("friso-%d>> ", lex);
|
||||
scanf("%s", _line);
|
||||
if ( strcmp( _line, "quit" ) == 0 ) {
|
||||
break;
|
||||
} else if ( strcmp( _line, "help" ) == 0 ) {
|
||||
___PRINT_HELP_INFO___
|
||||
} else if ( strcmp( _line, "#set" ) == 0 ) {
|
||||
printf("lex_t>> ");
|
||||
scanf("%d", &lex);
|
||||
} else {
|
||||
s_time = clock();
|
||||
e = friso_dic_get( friso->dic, lex, _line );
|
||||
e_time = clock();
|
||||
if ( e != NULL ) {
|
||||
printf("word=%s, syn=%s, fre=%d, cost:%fsec\n",
|
||||
e->word, e->syn==NULL? "NULL" : (char *)e->syn->items[0], e->fre,
|
||||
(double) ( e_time - s_time ) / CLOCKS_PER_SEC );
|
||||
} else {
|
||||
printf("%s was not found.\n", _line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//friso_dic_free( friso->dic );
|
||||
friso_free(friso);
|
||||
|
||||
return 0;
|
||||
}
|
51
src/tst-link.c
Normal file
51
src/tst-link.c
Normal file
@ -0,0 +1,51 @@
|
||||
/*
|
||||
* link list test programe.
|
||||
*
|
||||
* @author chenxin
|
||||
* @email chenxin619315@gmail.com
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main( int argc, char **args ) {
|
||||
|
||||
friso_link_t link;
|
||||
fstring keys[] = {
|
||||
"chenmanwen", "yangqinghua",
|
||||
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
|
||||
"zhangrenfang", "yangjian",
|
||||
"liuxiao", "pankai",
|
||||
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
|
||||
"caizaili", "panpan", "xiaolude", "yintanwen"
|
||||
};
|
||||
int j, len = sizeof( keys ) / sizeof( fstring );
|
||||
|
||||
link = new_link_list();
|
||||
|
||||
//print the size of the link
|
||||
printf("size=%d\n", link->size );
|
||||
|
||||
for ( j = 0; j < len; j++ ) {
|
||||
//link_add( link, keys[j] );
|
||||
link_list_add_last( link, keys[j] );
|
||||
}
|
||||
|
||||
printf("size=%d\n", link->size );
|
||||
|
||||
for ( j = 0; j < len / 2; j++ ) {
|
||||
//printf("idx=%d, remove %s\n", j, ( fstring ) link_remove( link, 0 ) );
|
||||
printf("idx=%d, remove %s\n", j, ( fstring ) link_list_remove_first( link ) );
|
||||
}
|
||||
|
||||
printf("size=%d\n", link->size );
|
||||
|
||||
//clear all the nodes
|
||||
link_list_clear( link );
|
||||
printf("size=%d, head->next->value=%s\n", link->size, ( fstring ) link->head->next->value );
|
||||
|
||||
free_link_list( link );
|
||||
|
||||
return 0;
|
||||
}
|
30
src/tst-split.c
Normal file
30
src/tst-split.c
Normal file
@ -0,0 +1,30 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "friso_API.h"
|
||||
|
||||
/**
|
||||
* friso fstring split test program .
|
||||
*
|
||||
* @author chenxin<chenxin619315@gmail.com>
|
||||
* @date 2013-06-09
|
||||
*/
|
||||
|
||||
int main ( int argc, char **args )
|
||||
{
|
||||
fstring source = ",I am a chinese,,my name is chenxin,and i am the author of friso,bug report email chenxin619315@gmail.com,qq:1187582057";
|
||||
char buffer[128];
|
||||
string_split_t split = new_string_split(",", source );
|
||||
|
||||
printf("sst->idx=%d\n", split->idx);
|
||||
printf("sst->srcLen=%d\n", split->srcLen);
|
||||
printf("sst->delLen=%d\n", split->delLen);
|
||||
|
||||
while ( string_split_next(split, buffer) != NULL) {
|
||||
printf("buffer:%s\n", buffer);
|
||||
}
|
||||
|
||||
free_string_split(split);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
46
src/tst-string.c
Normal file
46
src/tst-string.c
Normal file
@ -0,0 +1,46 @@
|
||||
/*
|
||||
* fstring handle mode test program.
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
#include "friso_API.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main( int argc, char **args ) {
|
||||
|
||||
fstring str = "康熙字典部首, 符号和标点, 统一表意符号扩展 A ,CJK㈩兼Ⅱ容形式⑩.";
|
||||
char word[4];
|
||||
int bytes, t, j, length = strlen( str );
|
||||
string_buffer_t sb = new_string_buffer();
|
||||
|
||||
printf("str=%s, length=%d\n", str, length );
|
||||
|
||||
|
||||
for ( t = 0; t < length; t += bytes ) {
|
||||
bytes = get_utf8_bytes( *(str + t) );
|
||||
if ( bytes == 0 ) continue;
|
||||
for ( j = 0; j < bytes; j++ )
|
||||
word[j] = *(str + t + j );
|
||||
word[j] = '\0';
|
||||
string_buffer_append( sb, word );
|
||||
printf("word=%s\n", word );
|
||||
}
|
||||
|
||||
printf("length=%d, buffer=%s\n", sb->length, sb->buffer );
|
||||
string_buffer_remove( sb, 0, 3 );
|
||||
printf("length=%d, buffer=%s\n", sb->length, sb->buffer );
|
||||
string_buffer_remove( sb, 0, 3 );
|
||||
printf("length=%d, buffer=%s\n", sb->length, sb->buffer );
|
||||
string_buffer_remove( sb, sb->length - 3, 6 );
|
||||
sb = string_buffer_trim( sb );
|
||||
printf("length=%d, buffer=%s\n", sb->length, string_buffer_devote( sb ) );
|
||||
|
||||
//00011110 - yuan ma
|
||||
//11100001 - fa ma
|
||||
//11100010 - bu ma
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user