Add word vector sdk.

This commit is contained in:
Calvin 2021-08-07 19:13:33 +08:00
parent d8fa5592e6
commit 13bb3fd0f0
10 changed files with 547786 additions and 0 deletions

View File

@ -0,0 +1,59 @@
# 词向量SDK【中文】
词向量/词嵌入(Word embedding)是自然语言处理(NLP)中语言模型与表征学习技术的统称。
概念上而言,它是指把一个维数为所有词的数量的高维空间嵌入到一个维数低得多的连续向量空间中,
每个单词或词组被映射为实数域上的向量。
- 词向量
![img](https://djl-model.oss-cn-hongkong.aliyuncs.com/AIAS/nlp_sdks/Universal-Sentence-Encoder.png)
句向量应用:
- 语义搜索:通过句向量相似性,检索语料库中与query最匹配的文本
- 文本聚类:文本转为定长向量,通过聚类模型可无监督聚集相似文本
- 文本分类:表示成句向量,直接用简单分类器即可训练文本分类器
### SDK包含两个模型
- w2v_wiki_dim300 (WordEncoderExample1)
基于W2V训练得到的中文Embedding模型,词向量的维度为300,词表大小为352219,
训练采用的语料是——Wikipedia_zh 中文维基百科。
- w2v_weibo_dim300 (WordEncoderExample2)
基于W2V训练得到的中文Embedding模型,词向量的维度为300,词表大小为195204,
训练采用的语料是——Weibo 微博。
### SDK功能
- 词向量提取
- 相似度计算:
- 余弦相似度
- 内积
## 运行例子 - WordEncoderExample1
运行成功后,命令行应该看到下面的信息:
```text
...
[INFO ] - 中国-特征值: [0.365368, 0.506662, ..., -0.157893, 0.346256]
[INFO ] - 美国-特征值: [0.365368, 0.506662, ..., -0.157893, 0.346256]
[INFO ] - 余弦相似度: 0.41243544
[INFO ] - 内积: 11.631776
```
## 运行例子 - WordEncoderExample2
运行成功后,命令行应该看到下面的信息:
```text
...
[INFO ] - 中国-特征值: [-0.186542, 0.153161, ..., -0.344588, 0.269266]
[INFO ] - 美国-特征值: [-0.186542, 0.153161, ..., -0.344588, 0.269266]
[INFO ] - 余弦相似度: 0.30708003
[INFO ] - 内积: 6.5972724
```
### 帮助
- 添加依赖库:lib/aias-word-encoder-cn-lib-0.1.0.jar
- 下载wiki模型特征数据,并添加到 src/test/resources/ 路径下:
[wiki](https://djl-model.oss-cn-hongkong.aliyuncs.com/models/nlp_models/w2v_wiki_dim300.npy)
- 下载weibo模型特征数据,并添加到 src/test/resources/ 路径下:
[weibo](https://djl-model.oss-cn-hongkong.aliyuncs.com/models/nlp_models/w2v_weibo_dim300.npy)

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

View File

@ -0,0 +1,123 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Maven coordinates of this example project. -->
    <groupId>calvin</groupId>
    <artifactId>sentence-encoder-sdk</artifactId>
    <version>0.1</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <!-- Single DJL version shared by every ai.djl artifact below. -->
        <djl.version>0.12.0</djl.version>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
                <version>3.8.1</version>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.6</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.30</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
        <!--
          SLF4J 1.7.x binding for Log4j 2. Raised from 2.12.1 to 2.17.1: the
          2.12.1 release transitively pulls in a log4j-core that is affected by
          CVE-2021-44228 (Log4Shell) and its follow-up CVEs. 2.17.1 still
          supports Java 8.
        -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>2.17.1</version>
        </dependency>
        <dependency>
            <groupId>ai.djl</groupId>
            <artifactId>api</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl</groupId>
            <artifactId>basicdataset</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl</groupId>
            <artifactId>model-zoo</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <!-- MXNet -->
        <dependency>
            <groupId>ai.djl.mxnet</groupId>
            <artifactId>mxnet-model-zoo</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl.mxnet</groupId>
            <artifactId>mxnet-engine</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl.mxnet</groupId>
            <artifactId>mxnet-native-auto</artifactId>
            <version>1.8.0</version>
        </dependency>
        <dependency>
            <groupId>org.nd4j</groupId>
            <artifactId>nd4j-native</artifactId>
            <version>1.0.0-M1.1</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.18</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>6.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>

View File

@ -0,0 +1,48 @@
package me.calvin.example;
import ai.djl.ModelException;
import ai.djl.translate.TranslateException;
import me.calvin.aias.WordEncoder;
import me.calvin.aias.util.FeatureComparison;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
/**
 * Word-vector extraction example using the Chinese Wikipedia (Wikipedia_zh) model.
 *
 * <p>According to the project notes, the embeddings are 300-dimensional and the vocabulary
 * holds 352219 entries. The example looks up the vectors of two words, logs them, and then
 * logs their cosine similarity and inner product.
 *
 * @author Calvin
 * 179209347@qq.com
 */
public final class WordEncoderExample1 {
    private static final Logger logger = LoggerFactory.getLogger(WordEncoderExample1.class);

    /** Not instantiable: this class only hosts {@link #main(String[])}. */
    private WordEncoderExample1() {}

    /**
     * Loads the wiki vocabulary and embedding matrix, then compares the vectors of two words.
     *
     * @param args command-line arguments (unused)
     * @throws IOException declared by the original entry point; propagated unchanged
     * @throws ModelException declared by the original entry point; propagated unchanged
     * @throws TranslateException declared by the original entry point; propagated unchanged
     */
    public static void main(String[] args) throws IOException, ModelException, TranslateException {
        Path vocabPath = Paths.get("src/test/resources/w2v_wiki_vocab.txt");
        Path embeddingPath = Paths.get("src/test/resources/w2v_wiki_dim300.npy");
        WordEncoder encoder = new WordEncoder(vocabPath, embeddingPath);

        // Look up the embedding (feature vector) of each word.
        float[] embedding1 = encoder.search("中国");
        logger.info("中国-特征值: {}", Arrays.toString(embedding1));
        float[] embedding2 = encoder.search("美国");
        // Fix: the original logged embedding1 here, so the second vector was never shown.
        logger.info("美国-特征值: {}", Arrays.toString(embedding2));

        // Cosine similarity between the two word vectors.
        float cosineSim = FeatureComparison.cosineSim(embedding1, embedding2);
        logger.info("余弦相似度: {}", cosineSim);

        // Inner (dot) product of the two word vectors.
        float dot = FeatureComparison.dot(embedding1, embedding2);
        logger.info("内积: {}", dot);
    }
}

View File

@ -0,0 +1,48 @@
package me.calvin.example;
import ai.djl.ModelException;
import ai.djl.translate.TranslateException;
import me.calvin.aias.WordEncoder;
import me.calvin.aias.util.FeatureComparison;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
/**
 * Word-vector extraction example using the Weibo model.
 *
 * <p>According to the project notes, the embeddings are 300-dimensional and the vocabulary
 * holds 195204 entries. The example looks up the vectors of two words, logs them, and then
 * logs their cosine similarity and inner product.
 *
 * @author Calvin
 * 179209347@qq.com
 */
public final class WordEncoderExample2 {
    private static final Logger logger = LoggerFactory.getLogger(WordEncoderExample2.class);

    /** Not instantiable: this class only hosts {@link #main(String[])}. */
    private WordEncoderExample2() {}

    /**
     * Loads the Weibo vocabulary and embedding matrix, then compares the vectors of two words.
     *
     * @param args command-line arguments (unused)
     * @throws IOException declared by the original entry point; propagated unchanged
     * @throws ModelException declared by the original entry point; propagated unchanged
     * @throws TranslateException declared by the original entry point; propagated unchanged
     */
    public static void main(String[] args) throws IOException, ModelException, TranslateException {
        Path vocabPath = Paths.get("src/test/resources/w2v_weibo_vocab.txt");
        Path embeddingPath = Paths.get("src/test/resources/w2v_weibo_dim300.npy");
        WordEncoder encoder = new WordEncoder(vocabPath, embeddingPath);

        // Look up the embedding (feature vector) of each word.
        float[] embedding1 = encoder.search("中国");
        logger.info("中国-特征值: {}", Arrays.toString(embedding1));
        float[] embedding2 = encoder.search("美国");
        // Fix: the original logged embedding1 here, so the second vector was never shown.
        logger.info("美国-特征值: {}", Arrays.toString(embedding2));

        // Cosine similarity between the two word vectors.
        float cosineSim = FeatureComparison.cosineSim(embedding1, embedding2);
        logger.info("余弦相似度: {}", cosineSim);

        // Inner (dot) product of the two word vectors.
        float dot = FeatureComparison.dot(embedding1, embedding2);
        logger.info("内积: {}", dot);
    }
}

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Log4j 2 configuration for the examples: everything is written to the console. -->
<Configuration status="INFO">
<Appenders>
<!-- Single console appender on standard output; each line is "[LEVEL] - message". -->
<Console name="console" target="SYSTEM_OUT">
<PatternLayout
pattern="[%-5level] - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!-- Default for all loggers: info level, routed to the console appender. -->
<!-- NOTE(review): additivity on Root looks redundant since Root has no parent logger; confirm before removing. -->
<Root level="info" additivity="false">
<AppenderRef ref="console"/>
</Root>
<!-- Project loggers: level taken from the me.calvin.logging.level system property, falling back to info. -->
<!-- additivity="false" keeps these events from also being forwarded to Root, which would print them twice. -->
<Logger name="me.calvin" level="${sys:me.calvin.logging.level:-info}" additivity="false">
<AppenderRef ref="console"/>
</Logger>
</Loggers>
</Configuration>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,72 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="CheckStyle-IDEA-Module">
<option name="configuration">
<map />
</option>
</component>
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="aias-word-encoder-cn-lib-0.1.0" level="project" />
<orderEntry type="library" name="Maven: com.google.code.gson:gson:2.8.6" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.30" level="project" />
<orderEntry type="library" name="Maven: commons-cli:commons-cli:1.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-slf4j-impl:2.12.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" level="project" />
<orderEntry type="library" scope="RUNTIME" name="Maven: org.apache.logging.log4j:log4j-core:2.12.1" level="project" />
<orderEntry type="library" name="Maven: ai.djl:api:0.12.0" level="project" />
<orderEntry type="library" name="Maven: net.java.dev.jna:jna:5.8.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.20" level="project" />
<orderEntry type="library" name="Maven: ai.djl:basicdataset:0.12.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.8" level="project" />
<orderEntry type="library" name="Maven: ai.djl:model-zoo:0.12.0" level="project" />
<orderEntry type="library" name="Maven: ai.djl.mxnet:mxnet-model-zoo:0.12.0" level="project" />
<orderEntry type="library" name="Maven: ai.djl.mxnet:mxnet-engine:0.12.0" level="project" />
<orderEntry type="library" name="Maven: ai.djl.mxnet:mxnet-native-auto:1.8.0" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native:macosx-x86_64:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: org.bytedeco:javacpp:1.5.5" level="project" />
<orderEntry type="library" name="Maven: org.bytedeco:javacpp:macosx-x86_64:1.5.5" level="project" />
<orderEntry type="library" name="Maven: org.bytedeco:openblas:0.3.13-1.5.5" level="project" />
<orderEntry type="library" name="Maven: org.bytedeco:openblas:macosx-x86_64:0.3.13-1.5.5" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:nd4j-api:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: com.jakewharton.byteunits:byteunits:0.9.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-math3:3.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-collections4:4.1" level="project" />
<orderEntry type="library" name="Maven: com.google.flatbuffers:flatbuffers-java:1.10.0" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:protobuf:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: com.github.oshi:oshi-core:3.4.2" level="project" />
<orderEntry type="library" name="Maven: net.java.dev.jna:jna-platform:4.3.0" level="project" />
<orderEntry type="library" name="Maven: org.threeten:threetenbp:1.3.3" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:jackson:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: commons-net:commons-net:3.1" level="project" />
<orderEntry type="library" name="Maven: net.ericaro:neoitertools:1.0.0" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:nd4j-common:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:guava:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" />
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.10" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native-api:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native-preset:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: org.nd4j:nd4j-native-preset:macosx-x86_64:1.0.0-M1.1" level="project" />
<orderEntry type="library" name="Maven: org.bytedeco:mkl:2021.1-1.5.5" level="project" />
<orderEntry type="library" name="Maven: org.bytedeco:mkl:macosx-x86_64:2021.1-1.5.5" level="project" />
<orderEntry type="library" scope="PROVIDED" name="Maven: org.projectlombok:lombok:1.18.18" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.testng:testng:6.8.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.10" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.beanshell:bsh:2.0b4" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: com.beust:jcommander:1.27" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.yaml:snakeyaml:1.6" level="project" />
</component>
</module>