mirror of
https://gitee.com/mymagicpower/AIAS.git
synced 2024-12-02 04:08:21 +08:00
update jieba sdk.
This commit is contained in:
parent
43122ac6b9
commit
b895414094
@ -1,4 +1,4 @@
|
||||
# 文本 - 中文分词SDK
|
||||
### 文本 - 中文分词SDK
|
||||
jieba分词java版本的简化实现。
|
||||
|
||||
![jieba](https://aias-home.oss-cn-beijing.aliyuncs.com/AIAS/nlp_sdks/jieba.png)
|
||||
@ -6,7 +6,7 @@ jieba分词java版本的简化实现。
|
||||
Python版:
|
||||
https://github.com/fxsjy/jieba
|
||||
|
||||
## 运行例子 - JiebaExample
|
||||
#### 运行例子 - JiebaExample
|
||||
运行成功后,命令行应该看到下面的信息:
|
||||
```text
|
||||
...
|
||||
|
114
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/README.org
Normal file
114
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/README.org
Normal file
@ -0,0 +1,114 @@
|
||||
* 结巴分词(java版) jieba-analysis
|
||||
首先感谢jieba分词原作者[[https://github.com/fxsjy][fxsjy]],没有他的无私贡献,我们也不会结识到结巴
|
||||
分词,更不会有现在的java版本。
|
||||
|
||||
结巴分词的原始版本为python编写,目前该项目在github上的关注量为170,
|
||||
打星727次(最新的数据以原仓库为准),Fork238次,可以说已经有一定的用户群。
|
||||
|
||||
结巴分词(java版)只保留了原项目针对搜索引擎分词的功能(cut_for_index、cut_for_search);词性标注、关键词提取尚未实现(今后如用到,可以考虑实现)。
|
||||
|
||||
* 简介
|
||||
** 支持分词模式
|
||||
- Search模式,用于对用户查询词分词
|
||||
- Index模式,用于对索引文档分词
|
||||
|
||||
** 特性
|
||||
- 支持多种分词模式
|
||||
- 全角统一转成半角
|
||||
- 用户词典功能
|
||||
- conf 目录有整理的搜狗细胞词库
|
||||
- 因为性能原因,最新的快照版本去除词性标注,也希望有更好的 Pull Request 可以提供该功能。
|
||||
|
||||
* 如何获取
|
||||
- 当前稳定版本
|
||||
#+BEGIN_SRC xml
|
||||
<dependency>
|
||||
<groupId>com.huaban</groupId>
|
||||
<artifactId>jieba-analysis</artifactId>
|
||||
<version>1.0.2</version>
|
||||
</dependency>
|
||||
#+END_SRC
|
||||
|
||||
- 当前快照版本
|
||||
#+BEGIN_SRC xml
|
||||
<dependency>
|
||||
<groupId>com.huaban</groupId>
|
||||
<artifactId>jieba-analysis</artifactId>
|
||||
<version>1.0.3-SNAPSHOT</version>
|
||||
</dependency>
|
||||
#+END_SRC
|
||||
|
||||
|
||||
* 如何使用
|
||||
- Demo
|
||||
#+BEGIN_SRC java
|
||||
|
||||
@Test
|
||||
public void testDemo() {
|
||||
JiebaSegmenter segmenter = new JiebaSegmenter();
|
||||
String[] sentences =
|
||||
new String[] {"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", "我不喜欢日本和服。", "雷猴回归人间。",
|
||||
"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", "结过婚的和尚未结过婚的"};
|
||||
for (String sentence : sentences) {
|
||||
System.out.println(segmenter.process(sentence, SegMode.INDEX).toString());
|
||||
}
|
||||
}
|
||||
#+END_SRC
|
||||
|
||||
* 算法(wiki补充...)
|
||||
- [ ] 基于 =trie= 树结构实现高效词图扫描
|
||||
- [ ] 生成所有切词可能的有向无环图 =DAG=
|
||||
- [ ] 采用动态规划算法计算最佳切词组合
|
||||
- [ ] 基于 =HMM= 模型,采用 =Viterbi= (维特比)算法实现未登录词识别
|
||||
|
||||
* 性能评估
|
||||
- 测试机配置
|
||||
#+BEGIN_SRC screen
|
||||
Processor 2 Intel(R) Pentium(R) CPU G620 @ 2.60GHz
|
||||
Memory:8GB
|
||||
|
||||
分词测试时机器开了许多应用(eclipse、emacs、chrome...),可能
|
||||
会影响到测试速度
|
||||
#+END_SRC
|
||||
- [[src/test/resources/test.txt][测试文本]]
|
||||
- 测试结果(单线程,对测试文本逐行分词,并循环调用上万次)
|
||||
#+BEGIN_SRC screen
|
||||
循环调用一万次
|
||||
第一次测试结果:
|
||||
time elapsed:12373, rate:2486.986533kb/s, words:917319.94/s
|
||||
第二次测试结果:
|
||||
time elapsed:12284, rate:2505.005241kb/s, words:923966.10/s
|
||||
第三次测试结果:
|
||||
time elapsed:12336, rate:2494.445880kb/s, words:920071.30/s
|
||||
|
||||
循环调用2万次
|
||||
第一次测试结果:
|
||||
time elapsed:22237, rate:2767.593144kb/s, words:1020821.12/s
|
||||
第二次测试结果:
|
||||
time elapsed:22435, rate:2743.167762kb/s, words:1011811.87/s
|
||||
第三次测试结果:
|
||||
time elapsed:22102, rate:2784.497726kb/s, words:1027056.34/s
|
||||
统计结果:词典加载时间1.8s左右,分词效率每秒2Mb多,近100万词。
|
||||
|
||||
2 Processor Intel(R) Core(TM) i3-2100 CPU @ 3.10GHz
|
||||
12G 测试效果
|
||||
time elapsed:19597, rate:3140.428063kb/s, words:1158340.52/s
|
||||
time elapsed:20122, rate:3058.491639kb/s, words:1128118.44/s
|
||||
|
||||
#+END_SRC
|
||||
|
||||
* 使用本库项目
|
||||
- [[https://github.com/sing1ee/analyzer-solr][analyzer-solr]] @sing1ee
|
||||
|
||||
|
||||
* 许可证
|
||||
jieba(python版本)的许可证为 MIT,jieba(java版本)的许可证为 Apache License 2.0
|
||||
#+BEGIN_SRC screen
|
||||
Copyright (C) 2013 Huaban Inc
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
|
||||
#+END_SRC
|
6
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/bin/build.sh
Executable file
6
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/bin/build.sh
Executable file
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
# Builds the jieba-lib module and installs it into the local Maven repository.
# Tests are skipped (-DskipTests) and checksum creation is requested
# (-DcreateChecksum=true).

# Directory containing this script. Quoted so that paths with spaces or glob
# characters are not word-split.
ROOT="$(dirname "$0")"

# The module root (where pom.xml lives) is the parent of bin/. Abort instead of
# running Maven in whatever directory we happen to be in if the cd fails.
cd "$ROOT/.." || exit 1

mvn package install -DcreateChecksum=true -DskipTests
|
||||
|
94640
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/conf/sougou.dict
Normal file
94640
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/conf/sougou.dict
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,8 @@
|
||||
小清新 3
|
||||
百搭 3
|
||||
显瘦 3
|
||||
又拍云 3
|
||||
iphone 3
|
||||
鲜芋仙 3
|
||||
UTF-8 3 nz
|
||||
utf-8 3 nz
|
@ -0,0 +1,27 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
|
||||
<component name="CheckStyle-IDEA-Module">
|
||||
<option name="configuration">
|
||||
<map />
|
||||
</option>
|
||||
</component>
|
||||
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
|
||||
<output url="file://$MODULE_DIR$/target/classes" />
|
||||
<output-test url="file://$MODULE_DIR$/target/test-classes" />
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/target" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-slf4j-impl:2.12.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.25" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" level="project" />
|
||||
<orderEntry type="library" scope="RUNTIME" name="Maven: org.apache.logging.log4j:log4j-core:2.12.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.8" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.12.0" level="project" />
|
||||
</component>
|
||||
</module>
|
47
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/pom.xml
Normal file
47
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/pom.xml
Normal file
@ -0,0 +1,47 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the jieba-lib module (aias:jieba-lib:0.1.0).
  Compiles for Java 8 and depends on the Log4j 2 SLF4J binding, JUnit (test
  scope only) and Apache Commons Lang 3.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>aias</groupId>
    <artifactId>jieba-lib</artifactId>
    <version>0.1.0</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
                <version>3.8.1</version>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <!-- 2.17.2 still runs on Java 8. Log4j 2 releases before 2.15.0,
                 including 2.12.1, are affected by CVE-2021-44228 (Log4Shell). -->
            <version>2.17.2</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.8</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.12.0</version>
        </dependency>
    </dependencies>

</project>
|
@ -1,4 +1,4 @@
|
||||
package me.aias.example;
|
||||
package me.aias;
|
||||
|
||||
import me.aias.jieba.JiebaSegmenter;
|
||||
import me.aias.jieba.SegToken;
|
@ -1,4 +1,4 @@
|
||||
package me.aias.example.jieba;
|
||||
package me.aias.jieba;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package me.aias.example.jieba;
|
||||
package me.aias.jieba;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
@ -1,8 +1,4 @@
|
||||
package me.aias.example.jieba;
|
||||
|
||||
import me.aias.jieba.CharacterUtil;
|
||||
import me.aias.jieba.Node;
|
||||
import me.aias.jieba.Pair;
|
||||
package me.aias.jieba;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
@ -17,6 +13,7 @@ import java.util.Vector;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.Collections;
|
||||
|
||||
|
||||
public class FinalSeg {
|
||||
private static FinalSeg singleInstance;
|
||||
private static final String PROB_EMIT = "/prob_emit.txt";
|
@ -23,7 +23,7 @@
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
*/
|
||||
package me.aias.example.jieba;
|
||||
package me.aias.jieba;
|
||||
|
||||
/**
|
||||
* 表示一次词典匹配的命中
|
@ -1,12 +1,4 @@
|
||||
package me.aias.example.jieba;
|
||||
|
||||
|
||||
import me.aias.example.jieba.CharacterUtil;
|
||||
import me.aias.example.jieba.FinalSeg;
|
||||
import me.aias.example.jieba.Hit;
|
||||
import me.aias.example.jieba.Pair;
|
||||
import me.aias.example.jieba.SegToken;
|
||||
import me.aias.example.jieba.WordDictionary;
|
||||
package me.aias.jieba;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
@ -1,4 +1,4 @@
|
||||
package me.aias.example.jieba;
|
||||
package me.aias.jieba;
|
||||
|
||||
public class Node {
|
||||
public Character value;
|
@ -1,4 +1,4 @@
|
||||
package me.aias.example.jieba;
|
||||
package me.aias.jieba;
|
||||
|
||||
public class Pair<K> {
|
||||
public K key;
|
@ -1,4 +1,4 @@
|
||||
package me.aias.example.jieba;
|
||||
package me.aias.jieba;
|
||||
|
||||
public class SegToken {
|
||||
public String word;
|
@ -1,4 +1,4 @@
|
||||
package me.aias.example.jieba;
|
||||
package me.aias.jieba;
|
||||
|
||||
|
||||
import java.io.BufferedReader;
|
349045
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/src/main/resources/dict.txt
Normal file
349045
2_nlp_sdks/lexical_analysis/jieba_sdk/jieba_lib/src/main/resources/dict.txt
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,27 @@
|
||||
package me.aias;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public final class JiebaExample {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(JiebaExample.class);
|
||||
|
||||
private JiebaExample() {
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
String input = "今天是个好日子";
|
||||
|
||||
|
||||
logger.info("input Sentence: {}", input);
|
||||
Jieba parser = new Jieba();
|
||||
String[] result = parser.cut(input);
|
||||
|
||||
logger.info("Words : " + Arrays.toString(result));
|
||||
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,16 @@
|
||||
package me.aias;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class Test {
|
||||
public static void main(String[] args) {
|
||||
String sentence = "今天是个好日子";
|
||||
Jieba parser = new Jieba();
|
||||
for (int i = 0; i < 100; i++) {
|
||||
String[] words = parser.cut(sentence);
|
||||
System.out.println(Arrays.toString(words));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,66 @@
|
||||
这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。
|
||||
我不喜欢日本和服。
|
||||
雷猴回归人间。
|
||||
工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
|
||||
我需要廉租房
|
||||
永和服装饰品有限公司
|
||||
我爱北京天安门
|
||||
abc
|
||||
隐马尔可夫
|
||||
雷猴是个好网站
|
||||
“,”和“SOFTware(软件)”两部分组成
|
||||
草泥马和欺实马是今年的流行词汇
|
||||
伊藤洋华堂总府店
|
||||
中国科学院计算技术研究所
|
||||
罗密欧与朱丽叶
|
||||
我购买了道具和服装
|
||||
PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍
|
||||
湖北省石首市
|
||||
湖北省十堰市
|
||||
总经理完成了这件事情
|
||||
电脑修好了
|
||||
做好了这件事情就一了百了了
|
||||
人们审美的观点是不同的
|
||||
我们买了一个美的空调
|
||||
线程初始化时我们要注意
|
||||
一个分子是由好多原子组织成的
|
||||
祝你马到功成
|
||||
他掉进了无底洞里
|
||||
中国的首都是北京
|
||||
孙君意
|
||||
外交部发言人马朝旭
|
||||
领导人会议和第四届东亚峰会
|
||||
在过去的这五年
|
||||
还需要很长的路要走
|
||||
60周年首都阅兵
|
||||
你好人们审美的观点是不同的
|
||||
买水果然后来世博园
|
||||
买水果然后去世博园
|
||||
但是后来我才知道你是对的
|
||||
存在即合理
|
||||
的的的的的在的的的的就以和和和
|
||||
I love你,不以为耻,反以为rong
|
||||
因
|
||||
|
||||
hello你好人们审美的观点是不同的
|
||||
很好但主要是基于网页形式
|
||||
hello你好人们审美的观点是不同的
|
||||
为什么我不能拥有想要的生活
|
||||
后来我才
|
||||
此次来中国是为了
|
||||
使用了它就可以解决一些问题
|
||||
使用了它就可以解决一些问题
|
||||
其实使用了它就可以解决一些问题
|
||||
好人使用了它就可以解决一些问题
|
||||
是因为和国家
|
||||
老年搜索还支持
|
||||
干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。
|
||||
大 " "他说的确实在理 "长春市长春节讲话 "结婚的和尚未结婚的 "结合成分子时 "旅游和服务是最好的
|
||||
这件事情的确是我的错 "供大家参考指正 "哈尔滨政府公布塌桥原因 "我在机场入口处 "邢永臣摄影报道
|
||||
BP神经网络如何训练才能在分类时增加区分度? "南京市长江大桥 "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究
|
||||
长春市长春药店 "邓颖超生前最喜欢的衣服 "胡锦涛是热爱世界和平的政治局常委
|
||||
程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪 "一次性交多少钱 "两块五一套,三块八一斤,四块七一本,五块六一条
|
||||
小和尚留了一个像大和尚一样的和尚头 "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站 "张晓梅去人民医院做了个B超然后去买了件T恤
|
||||
AT&T是一件不错的公司,给你发offer了吗? "C++和c#是什么关系?11+122=133,是吗?PI=3.14159
|
||||
你认识那个和主席握手的的哥吗?他开一辆黑色的士。
|
||||
枪杆子中出政权
|
Binary file not shown.
After Width: | Height: | Size: 91 KiB |
@ -23,5 +23,14 @@
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" level="project" />
|
||||
<orderEntry type="library" scope="RUNTIME" name="Maven: org.apache.logging.log4j:log4j-core:2.12.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.12.0" level="project" />
|
||||
<orderEntry type="module-library">
|
||||
<library name="Maven: aias:jieba-lib:0.1">
|
||||
<CLASSES>
|
||||
<root url="jar://$MODULE_DIR$/lib/jieba-lib-0.1.0.jar!/" />
|
||||
</CLASSES>
|
||||
<JAVADOC />
|
||||
<SOURCES />
|
||||
</library>
|
||||
</orderEntry>
|
||||
</component>
|
||||
</module>
|
Binary file not shown.
@ -51,13 +51,22 @@
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-slf4j-impl</artifactId>
|
||||
<version>2.17.2</version>
|
||||
<version>2.12.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
<version>3.12.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>aias</groupId>
|
||||
<artifactId>jieba-lib</artifactId>
|
||||
<version>0.1</version>
|
||||
<scope>system</scope>
|
||||
<systemPath>${project.basedir}/lib/jieba-lib-0.1.0.jar</systemPath>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
@ -0,0 +1,17 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Log4j 2 configuration: everything is written to standard output through one
  console appender using the pattern "[LEVEL] - message".
-->
<Configuration status="INFO">
    <Appenders>
        <Console name="console" target="SYSTEM_OUT">
            <PatternLayout
                    pattern="[%-5level] - %msg%n"/>
        </Console>
    </Appenders>
    <Loggers>
        <Root level="info" additivity="false">
            <AppenderRef ref="console"/>
        </Root>
        <!-- The project's classes live under me.aias, so the logger must be
             named for that package for the level override to take effect. The
             system property keeps its existing name so that current
             -Dme.calvin.logging.level=... invocations continue to work. -->
        <Logger name="me.aias" level="${sys:me.calvin.logging.level:-info}" additivity="false">
            <AppenderRef ref="console"/>
        </Logger>
    </Loggers>
</Configuration>
|
Loading…
Reference in New Issue
Block a user