mirror of
https://gitee.com/mymagicpower/AIAS.git
synced 2024-12-02 12:17:37 +08:00
Add word vector sdk.
This commit is contained in:
parent
d8fa5592e6
commit
13bb3fd0f0
59
nlp_sdks/word_encoder_cn_sdk/README.md
Normal file
59
nlp_sdks/word_encoder_cn_sdk/README.md
Normal file
@ -0,0 +1,59 @@
|
||||
# 词向量SDK【中文】
|
||||
词向量/词嵌入(Word embedding)是自然语言处理(NLP)中语言模型与表征学习技术的统称。
|
||||
概念上而言,它是指把一个维数为所有词的数量的高维空间嵌入到一个维数低得多的连续向量空间中,
|
||||
每个单词或词组被映射为实数域上的向量。
|
||||
|
||||
|
||||
- 词向量
|
||||
![img](https://djl-model.oss-cn-hongkong.aliyuncs.com/AIAS/nlp_sdks/Universal-Sentence-Encoder.png)
|
||||
|
||||
|
||||
句向量应用:
|
||||
- 语义搜索,通过句向量相似性,检索语料库中与query最匹配的文本
|
||||
- 文本聚类,文本转为定长向量,通过聚类模型可无监督聚集相似文本
|
||||
- 文本分类,将文本表示成句向量后,直接用简单分类器即可训练文本分类器
|
||||
|
||||
|
||||
### SDK包含两个模型:
|
||||
- w2v_wiki_dim300 (WordEncoderExample1)
|
||||
基于W2V训练得到的中文Embedding模型,词向量的维度为300,词表大小为352219,
|
||||
训练采用的语料是——Wikipedia_zh 中文维基百科。
|
||||
|
||||
- w2v_weibo_dim300 (WordEncoderExample2)
|
||||
基于W2V训练得到的中文Embedding模型,词向量的维度为300,词表大小为195204,
|
||||
训练采用的语料是——Weibo微博。
|
||||
|
||||
### SDK功能:
|
||||
- 词向量提取
|
||||
- 相似度计算:
|
||||
- 余弦相似度
|
||||
- 内积
|
||||
|
||||
## 运行例子 - WordEncoderExample1
|
||||
运行成功后,命令行应该看到下面的信息:
|
||||
```text
|
||||
...
|
||||
[INFO ] - 中国-特征值: [0.365368, 0.506662, ..., -0.157893, 0.346256]
|
||||
[INFO ] - 美国-特征值: [0.365368, 0.506662, ..., -0.157893, 0.346256]
|
||||
|
||||
[INFO ] - 余弦相似度: 0.41243544
|
||||
[INFO ] - 内积: 11.631776
|
||||
```
|
||||
## 运行例子 - WordEncoderExample2
|
||||
运行成功后,命令行应该看到下面的信息:
|
||||
```text
|
||||
...
|
||||
[INFO ] - 中国-特征值: [-0.186542, 0.153161, ..., -0.344588, 0.269266]
|
||||
[INFO ] - 美国-特征值: [-0.186542, 0.153161, ..., -0.344588, 0.269266]
|
||||
|
||||
[INFO ] - 余弦相似度: 0.30708003
|
||||
[INFO ] - 内积: 6.5972724
|
||||
```
|
||||
|
||||
### 帮助
|
||||
- 添加依赖库:lib/aias-word-encoder-cn-lib-0.1.0.jar
|
||||
- 下载wiki模型特征数据,添加到 src/test/resources/ 路径下:
|
||||
[wiki](https://djl-model.oss-cn-hongkong.aliyuncs.com/models/nlp_models/w2v_wiki_dim300.npy)
|
||||
|
||||
- 下载weibo模型特征数据,添加到 src/test/resources/ 路径下:
|
||||
[weibo](https://djl-model.oss-cn-hongkong.aliyuncs.com/models/nlp_models/w2v_weibo_dim300.npy)
|
BIN
nlp_sdks/word_encoder_cn_sdk/doc/img/word_vector.jpeg
Normal file
BIN
nlp_sdks/word_encoder_cn_sdk/doc/img/word_vector.jpeg
Normal file
Binary file not shown.
After Width: | Height: | Size: 58 KiB |
Binary file not shown.
123
nlp_sdks/word_encoder_cn_sdk/pom.xml
Normal file
123
nlp_sdks/word_encoder_cn_sdk/pom.xml
Normal file
@ -0,0 +1,123 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
~ Licensed to the Apache Software Foundation (ASF) under one
|
||||
~ or more contributor license agreements. See the NOTICE file
|
||||
~ distributed with this work for additional information
|
||||
~ regarding copyright ownership. The ASF licenses this file
|
||||
~ to you under the Apache License, Version 2.0 (the
|
||||
~ "License"); you may not use this file except in compliance
|
||||
~ with the License. You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing,
|
||||
~ software distributed under the License is distributed on an
|
||||
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
~ KIND, either express or implied. See the License for the
|
||||
~ specific language governing permissions and limitations
|
||||
~ under the License.
|
||||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Maven build for the word-encoder example SDK (module directory: word_encoder_cn_sdk). -->
    <!-- NOTE(review): the artifactId below says "sentence-encoder" although this module is the
         word encoder SDK; it is left unchanged so existing coordinates keep working. Confirm
         whether it should be renamed. -->
    <groupId>calvin</groupId>
    <artifactId>sentence-encoder-sdk</artifactId>
    <version>0.1</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <!-- Shared version for all ai.djl / ai.djl.mxnet artifacts below. -->
        <djl.version>0.12.0</djl.version>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.6</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.30</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
        <!-- SLF4J 1.7.x binding for Log4j 2. Raised from 2.12.1: that release line is affected
             by CVE-2021-44228 (Log4Shell) and the follow-up CVEs; 2.17.1 still supports Java 8. -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>2.17.1</version>
        </dependency>

        <!-- DJL core -->
        <dependency>
            <groupId>ai.djl</groupId>
            <artifactId>api</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl</groupId>
            <artifactId>basicdataset</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl</groupId>
            <artifactId>model-zoo</artifactId>
            <version>${djl.version}</version>
        </dependency>

        <!-- MXNet -->
        <dependency>
            <groupId>ai.djl.mxnet</groupId>
            <artifactId>mxnet-model-zoo</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl.mxnet</groupId>
            <artifactId>mxnet-engine</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl.mxnet</groupId>
            <artifactId>mxnet-native-auto</artifactId>
            <version>1.8.0</version>
        </dependency>

        <dependency>
            <groupId>org.nd4j</groupId>
            <artifactId>nd4j-native</artifactId>
            <version>1.0.0-M1.1</version>
        </dependency>

        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.18</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>6.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

</project>
|
@ -0,0 +1,48 @@
|
||||
package me.calvin.example;
|
||||
|
||||
import ai.djl.ModelException;
|
||||
import ai.djl.translate.TranslateException;
|
||||
import me.calvin.aias.WordEncoder;
|
||||
import me.calvin.aias.util.FeatureComparison;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* 词向量提取:
|
||||
* 词向量的纬度为300,词表大小为352219,训练采用的语料是——Wikipedia_zh 中文维基百科。
|
||||
*
|
||||
* @author Calvin
|
||||
* 179209347@qq.com
|
||||
*/
|
||||
public final class WordEncoderExample1 {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(WordEncoderExample1.class);
|
||||
|
||||
private WordEncoderExample1() {}
|
||||
|
||||
public static void main(String[] args) throws IOException, ModelException, TranslateException {
|
||||
Path vocabPath = Paths.get("src/test/resources/w2v_wiki_vocab.txt");
|
||||
Path embeddingPath = Paths.get("src/test/resources/w2v_wiki_dim300.npy");
|
||||
|
||||
WordEncoder encoder = new WordEncoder(vocabPath, embeddingPath);
|
||||
|
||||
// 获取单词的特征值embedding
|
||||
float[] embedding1 = encoder.search("中国");
|
||||
logger.info("中国-特征值: " + Arrays.toString(embedding1));
|
||||
float[] embedding2 = encoder.search("美国");
|
||||
logger.info("美国-特征值: " + Arrays.toString(embedding1));
|
||||
|
||||
// 计算两个词向量的余弦相似度
|
||||
float cosineSim = FeatureComparison.cosineSim(embedding1, embedding2);
|
||||
logger.info("余弦相似度: "+ Float.toString(cosineSim));
|
||||
|
||||
// 计算两个词向量的内积
|
||||
float dot = FeatureComparison.dot(embedding1, embedding2);
|
||||
logger.info("内积: "+ Float.toString(dot));
|
||||
}
|
||||
}
|
@ -0,0 +1,48 @@
|
||||
package me.calvin.example;
|
||||
|
||||
import ai.djl.ModelException;
|
||||
import ai.djl.translate.TranslateException;
|
||||
import me.calvin.aias.WordEncoder;
|
||||
import me.calvin.aias.util.FeatureComparison;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* 词向量提取:
|
||||
* 词向量的纬度为300,词表大小为195204,训练采用的语料是——Weibo 微博。
|
||||
*
|
||||
* @author Calvin
|
||||
* 179209347@qq.com
|
||||
*/
|
||||
public final class WordEncoderExample2 {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(WordEncoderExample2.class);
|
||||
|
||||
private WordEncoderExample2() {}
|
||||
|
||||
public static void main(String[] args) throws IOException, ModelException, TranslateException {
|
||||
Path vocabPath = Paths.get("src/test/resources/w2v_weibo_vocab.txt");
|
||||
Path embeddingPath = Paths.get("src/test/resources/w2v_weibo_dim300.npy");
|
||||
|
||||
WordEncoder encoder = new WordEncoder(vocabPath, embeddingPath);
|
||||
|
||||
// 获取单词的特征值embedding
|
||||
float[] embedding1 = encoder.search("中国");
|
||||
logger.info("中国-特征值: " + Arrays.toString(embedding1));
|
||||
float[] embedding2 = encoder.search("美国");
|
||||
logger.info("美国-特征值: " + Arrays.toString(embedding1));
|
||||
|
||||
// 计算两个词向量的余弦相似度
|
||||
float cosineSim = FeatureComparison.cosineSim(embedding1, embedding2);
|
||||
logger.info("余弦相似度: "+ Float.toString(cosineSim));
|
||||
|
||||
// 计算两个词向量的内积
|
||||
float dot = FeatureComparison.dot(embedding1, embedding2);
|
||||
logger.info("内积: "+ Float.toString(dot));
|
||||
}
|
||||
}
|
17
nlp_sdks/word_encoder_cn_sdk/src/main/resources/log4j2.xml
Normal file
17
nlp_sdks/word_encoder_cn_sdk/src/main/resources/log4j2.xml
Normal file
@ -0,0 +1,17 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration status="INFO">
    <!-- One console appender on stdout; each event is rendered as "[LEVEL] - message". -->
    <Appenders>
        <Console name="console" target="SYSTEM_OUT">
            <PatternLayout pattern="[%-5level] - %msg%n"/>
        </Console>
    </Appenders>
    <Loggers>
        <!-- Level for me.calvin.* is read from the me.calvin.logging.level system property
             and falls back to "info" when the property is not set. -->
        <Logger name="me.calvin" level="${sys:me.calvin.logging.level:-info}" additivity="false">
            <AppenderRef ref="console"/>
        </Logger>
        <!-- Everything else logs at info to the same console appender. -->
        <Root level="info" additivity="false">
            <AppenderRef ref="console"/>
        </Root>
    </Loggers>
</Configuration>
|
195202
nlp_sdks/word_encoder_cn_sdk/src/test/resources/w2v_weibo_vocab.txt
Normal file
195202
nlp_sdks/word_encoder_cn_sdk/src/test/resources/w2v_weibo_vocab.txt
Normal file
File diff suppressed because it is too large
Load Diff
352217
nlp_sdks/word_encoder_cn_sdk/src/test/resources/w2v_wiki_vocab.txt
Normal file
352217
nlp_sdks/word_encoder_cn_sdk/src/test/resources/w2v_wiki_vocab.txt
Normal file
File diff suppressed because it is too large
Load Diff
72
nlp_sdks/word_encoder_cn_sdk/word_encoder_cn_sdk.iml
Normal file
72
nlp_sdks/word_encoder_cn_sdk/word_encoder_cn_sdk.iml
Normal file
@ -0,0 +1,72 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
|
||||
<component name="CheckStyle-IDEA-Module">
|
||||
<option name="configuration">
|
||||
<map />
|
||||
</option>
|
||||
</component>
|
||||
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
|
||||
<output url="file://$MODULE_DIR$/target/classes" />
|
||||
<output-test url="file://$MODULE_DIR$/target/test-classes" />
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/target" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
<orderEntry type="library" name="aias-word-encoder-cn-lib-0.1.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.google.code.gson:gson:2.8.6" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.30" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-cli:commons-cli:1.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-slf4j-impl:2.12.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" level="project" />
|
||||
<orderEntry type="library" scope="RUNTIME" name="Maven: org.apache.logging.log4j:log4j-core:2.12.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: ai.djl:api:0.12.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: net.java.dev.jna:jna:5.8.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.20" level="project" />
|
||||
<orderEntry type="library" name="Maven: ai.djl:basicdataset:0.12.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.8" level="project" />
|
||||
<orderEntry type="library" name="Maven: ai.djl:model-zoo:0.12.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: ai.djl.mxnet:mxnet-model-zoo:0.12.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: ai.djl.mxnet:mxnet-engine:0.12.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: ai.djl.mxnet:mxnet-native-auto:1.8.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native:macosx-x86_64:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.bytedeco:javacpp:1.5.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.bytedeco:javacpp:macosx-x86_64:1.5.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.bytedeco:openblas:0.3.13-1.5.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.bytedeco:openblas:macosx-x86_64:0.3.13-1.5.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:nd4j-api:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.jakewharton.byteunits:byteunits:0.9.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-math3:3.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-collections4:4.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.google.flatbuffers:flatbuffers-java:1.10.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:protobuf:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.github.oshi:oshi-core:3.4.2" level="project" />
|
||||
<orderEntry type="library" name="Maven: net.java.dev.jna:jna-platform:4.3.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.threeten:threetenbp:1.3.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:jackson:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-net:commons-net:3.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: net.ericaro:neoitertools:1.0.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:nd4j-common:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:guava:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.10" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native-api:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native-preset:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native-preset:macosx-x86_64:1.0.0-M1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.bytedeco:mkl:2021.1-1.5.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.bytedeco:mkl:macosx-x86_64:2021.1-1.5.5" level="project" />
|
||||
<orderEntry type="library" scope="PROVIDED" name="Maven: org.projectlombok:lombok:1.18.18" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.testng:testng:6.8.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.10" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.beanshell:bsh:2.0b4" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: com.beust:jcommander:1.27" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Maven: org.yaml:snakeyaml:1.6" level="project" />
|
||||
</component>
|
||||
</module>
|
Loading…
Reference in New Issue
Block a user