Add sentence encoder english sdk.

This commit is contained in:
Calvin 2021-08-05 14:37:34 +08:00
parent 5f67904c43
commit a6b7e21cdb
8 changed files with 329 additions and 0 deletions

View File

@ -24,3 +24,6 @@ build/output/result.gif
```
![gif](https://djl-model.oss-cn-hongkong.aliyuncs.com/AIAS/gan_sdks/result.gif)
帮助
添加依赖库lib/aias-first-order-lib-0.1.0.jar

View File

@ -0,0 +1,37 @@
# 句向量SDK【英文】
句向量是指将语句映射至固定维度的实数向量。
将不定长的句子用定长的向量表示,为NLP下游任务提供服务。
句向量应用:
- 语义搜索,通过句向量相似性,检索语料库中与query最匹配的文本
- 文本聚类,文本转为定长向量,通过聚类模型可无监督聚集相似文本
- 文本分类,文本表示成句向量后,直接用简单分类器即可训练文本分类器
### SDK包含两个模型(模型较大,首次运行请耐心等待下载)
- SentenceEncoder 500M
- SentenceEncoderLarge5 1G
### SDK功能
- 句向量提取
- 相似度计算
## 运行例子 - SentenceEncoderExample
- 测试语句:
- I am a sentence for which I would like to get its embedding
- I am a sentence
- I am a sentence for which I would like to get ...
运行成功后,命令行应该看到下面的信息:
```text
...
[INFO ] - length: 512
[INFO ] - [0.050808605, -0.016524296, 0.015737822, -0.042864114, ..., 0.017881196]
[INFO ] - [0.018138515, -0.037706323, 0.04290087, -0.019213252, ..., 0.058189757]
[INFO ] - [0.04447042, -0.07614114, 0.0073701036, -0.045335855, ..., 0.054146692]
[INFO ] - 0.80258906
[INFO ] - 0.90100515
```
### 帮助
添加依赖库lib/aias-sentence-encoder-lib-0.1.0.jar

View File

@ -0,0 +1,124 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Maven build descriptor for the sentence-encoder SDK example project. -->
    <modelVersion>4.0.0</modelVersion>

    <groupId>calvin</groupId>
    <artifactId>sentence-encoder-sdk</artifactId>
    <version>0.1</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <!-- Single version shared by every ai.djl artifact declared below. -->
        <djl.version>0.12.0</djl.version>
        <!--
          Log4j 2.12.1 (the previous value) is affected by CVE-2021-44228 ("Log4Shell").
          2.17.1 still runs on Java 8 and still binds to the SLF4J 1.7.x API used here.
        -->
        <log4j.version>2.17.1</log4j.version>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <!--
      NOTE(review): the example classes import me.calvin.aias.*, which does not appear to be
      provided by any dependency declared here. The README asks users to add
      lib/aias-sentence-encoder-lib-0.1.0.jar manually; confirm whether it should be declared.
    -->
    <dependencies>
        <!-- General utilities -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.6</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>

        <!-- Logging: SLF4J API bound to Log4j 2 -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.30</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>${log4j.version}</version>
        </dependency>

        <!-- DJL core -->
        <dependency>
            <groupId>ai.djl</groupId>
            <artifactId>api</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl</groupId>
            <artifactId>basicdataset</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl</groupId>
            <artifactId>model-zoo</artifactId>
            <version>${djl.version}</version>
        </dependency>

        <!-- Tensorflow -->
        <dependency>
            <groupId>ai.djl.tensorflow</groupId>
            <artifactId>tensorflow-api</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <dependency>
            <groupId>ai.djl.tensorflow</groupId>
            <artifactId>tensorflow-engine</artifactId>
            <version>${djl.version}</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>ai.djl.tensorflow</groupId>
            <artifactId>tensorflow-model-zoo</artifactId>
            <version>${djl.version}</version>
        </dependency>
        <!-- Versioned separately from ${djl.version}: this artifact is pinned to 2.4.1. -->
        <dependency>
            <groupId>ai.djl.tensorflow</groupId>
            <artifactId>tensorflow-native-auto</artifactId>
            <version>2.4.1</version>
            <scope>runtime</scope>
        </dependency>

        <!-- Compile-time only -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.18</version>
            <scope>provided</scope>
        </dependency>

        <!-- Test -->
        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>6.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>

View File

@ -0,0 +1,47 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IntelliJ IDEA module descriptor for this Maven module (Java language level 1.8). -->
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
  <component name="CheckStyle-IDEA-Module">
    <option name="configuration">
      <map />
    </option>
  </component>
  <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
    <!-- Compiler output locations -->
    <output url="file://$MODULE_DIR$/target/classes" />
    <output-test url="file://$MODULE_DIR$/target/test-classes" />

    <!-- Source and resource roots; target/ is excluded -->
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
      <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
      <sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
      <excludeFolder url="file://$MODULE_DIR$/target" />
    </content>

    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />

    <!-- Project-level library that is not a "Maven:" entry -->
    <orderEntry type="library" name="aias-sentence-encoder-lib-0.1.0" level="project" />

    <!-- Libraries listed as "Maven:" entries -->
    <orderEntry type="library" name="Maven: com.google.code.gson:gson:2.8.6" level="project" />
    <orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.30" level="project" />
    <orderEntry type="library" name="Maven: commons-cli:commons-cli:1.4" level="project" />
    <orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-slf4j-impl:2.12.1" level="project" />
    <orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" level="project" />
    <orderEntry type="library" scope="RUNTIME" name="Maven: org.apache.logging.log4j:log4j-core:2.12.1" level="project" />
    <orderEntry type="library" name="Maven: ai.djl:api:0.12.0" level="project" />
    <orderEntry type="library" name="Maven: net.java.dev.jna:jna:5.8.0" level="project" />
    <orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.20" level="project" />
    <orderEntry type="library" name="Maven: ai.djl:basicdataset:0.12.0" level="project" />
    <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.8" level="project" />
    <orderEntry type="library" name="Maven: ai.djl:model-zoo:0.12.0" level="project" />
    <orderEntry type="library" name="Maven: ai.djl.tensorflow:tensorflow-api:0.12.0" level="project" />
    <orderEntry type="library" name="Maven: org.bytedeco:javacpp:1.5.5" level="project" />
    <orderEntry type="library" name="Maven: com.google.protobuf:protobuf-java:3.8.0" level="project" />
    <orderEntry type="library" scope="RUNTIME" name="Maven: ai.djl.tensorflow:tensorflow-engine:0.12.0" level="project" />
    <orderEntry type="library" name="Maven: ai.djl.tensorflow:tensorflow-model-zoo:0.12.0" level="project" />
    <orderEntry type="library" scope="RUNTIME" name="Maven: ai.djl.tensorflow:tensorflow-native-auto:2.4.1" level="project" />
    <orderEntry type="library" scope="PROVIDED" name="Maven: org.projectlombok:lombok:1.18.18" level="project" />

    <!-- Test-scoped libraries -->
    <orderEntry type="library" scope="TEST" name="Maven: org.testng:testng:6.8.1" level="project" />
    <orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.10" level="project" />
    <orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
    <orderEntry type="library" scope="TEST" name="Maven: org.beanshell:bsh:2.0b4" level="project" />
    <orderEntry type="library" scope="TEST" name="Maven: com.beust:jcommander:1.27" level="project" />
    <orderEntry type="library" scope="TEST" name="Maven: org.yaml:snakeyaml:1.6" level="project" />
  </component>
</module>

View File

@ -0,0 +1,51 @@
package me.calvin.example;
import ai.djl.ModelException;
import ai.djl.inference.Predictor;
import ai.djl.repository.zoo.ModelZoo;
import ai.djl.repository.zoo.ZooModel;
import ai.djl.translate.TranslateException;
import me.calvin.aias.SentenceEncoder;
import me.calvin.aias.util.FeatureComparison;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Command-line example that encodes three English sentences with {@link SentenceEncoder}
 * and logs the resulting embeddings together with two similarity scores.
 *
 * <p>The class only hosts {@link #main(String[])} and cannot be instantiated.
 */
public final class SentenceEncoderExample {

    private static final Logger logger = LoggerFactory.getLogger(SentenceEncoderExample.class);

    private SentenceEncoderExample() {}

    /**
     * Loads the model described by {@code SentenceEncoder.criteria()}, runs one batched
     * prediction over the three sample sentences, and logs:
     * the length of the first embedding, the three embeddings, and the values returned by
     * {@code FeatureComparison.calculSimilar} for sentence 1 vs. 2 and sentence 1 vs. 3.
     *
     * @param args command-line arguments; not used
     * @throws IOException declared by the model-loading call
     * @throws ModelException declared by the model-loading call
     * @throws TranslateException declared by the prediction call
     */
    public static void main(String[] args) throws IOException, ModelException, TranslateException {
        List<String> inputs = new ArrayList<>();
        inputs.add("I am a sentence for which I would like to get its embedding");
        inputs.add("I am a sentence");
        inputs.add("I am a sentence for which I would like to get ...");

        SentenceEncoder sentenceEncoder = new SentenceEncoder();

        // Model and predictor are both closed automatically, predictor first.
        try (ZooModel<String[], float[][]> model = ModelZoo.loadModel(sentenceEncoder.criteria());
                Predictor<String[], float[][]> predictor = model.newPredictor()) {

            // One call for the whole batch; row i of the result belongs to inputs.get(i).
            float[][] embeddings = predictor.predict(inputs.toArray(new String[0]));

            float[] feature1 = embeddings[0];
            float[] feature2 = embeddings[1];
            float[] feature3 = embeddings[2];

            // Parameterized message instead of string concatenation (SLF4J idiom).
            logger.info("length: {}", feature1.length);

            logger.info(Arrays.toString(feature1));
            logger.info(Arrays.toString(feature2));
            logger.info(Arrays.toString(feature3));

            // Similarity of the first sentence against each of the other two.
            logger.info(Float.toString(FeatureComparison.calculSimilar(feature1, feature2)));
            logger.info(Float.toString(FeatureComparison.calculSimilar(feature1, feature3)));
        }
    }
}

View File

@ -0,0 +1,50 @@
package me.calvin.example;
import ai.djl.ModelException;
import ai.djl.inference.Predictor;
import ai.djl.repository.zoo.ModelZoo;
import ai.djl.repository.zoo.ZooModel;
import ai.djl.translate.TranslateException;
import me.calvin.aias.SentenceEncoderLarge5;
import me.calvin.aias.util.FeatureComparison;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Command-line example that encodes three English sentences with
 * {@link SentenceEncoderLarge5} and logs the resulting embeddings together with two
 * similarity scores.
 *
 * <p>The class only hosts {@link #main(String[])} and cannot be instantiated.
 */
public final class SentenceEncoderLarge5Example {

    private static final Logger logger = LoggerFactory.getLogger(SentenceEncoderLarge5Example.class);

    private SentenceEncoderLarge5Example() {}

    /**
     * Loads the model described by {@code SentenceEncoderLarge5.criteria()}, runs one batched
     * prediction over the three sample sentences, and logs:
     * the length of the first embedding, the three embeddings, and the values returned by
     * {@code FeatureComparison.calculSimilar} for sentence 1 vs. 2 and sentence 1 vs. 3.
     *
     * @param args command-line arguments; not used
     * @throws IOException declared by the model-loading call
     * @throws ModelException declared by the model-loading call
     * @throws TranslateException declared by the prediction call
     */
    public static void main(String[] args) throws IOException, ModelException, TranslateException {
        List<String> inputs = new ArrayList<>();
        inputs.add("I am a sentence for which I would like to get its embedding");
        inputs.add("I am a sentence");
        inputs.add("I am a sentence for which I would like to get ...");

        SentenceEncoderLarge5 sentenceEncoder = new SentenceEncoderLarge5();

        // Model and predictor are both closed automatically, predictor first.
        try (ZooModel<String[], float[][]> model = ModelZoo.loadModel(sentenceEncoder.criteria());
                Predictor<String[], float[][]> predictor = model.newPredictor()) {

            // One call for the whole batch; row i of the result belongs to inputs.get(i).
            float[][] embeddings = predictor.predict(inputs.toArray(new String[0]));

            float[] feature1 = embeddings[0];
            float[] feature2 = embeddings[1];
            float[] feature3 = embeddings[2];

            // Parameterized message instead of string concatenation (SLF4J idiom).
            logger.info("length: {}", feature1.length);

            logger.info(Arrays.toString(feature1));
            logger.info(Arrays.toString(feature2));
            logger.info(Arrays.toString(feature3));

            // Similarity of the first sentence against each of the other two.
            logger.info(Float.toString(FeatureComparison.calculSimilar(feature1, feature2)));
            logger.info(Float.toString(FeatureComparison.calculSimilar(feature1, feature3)));
        }
    }
}

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Log4j 2 configuration: every logger writes "[LEVEL] - message" lines to standard output. -->
<Configuration status="INFO">
    <Appenders>
        <Console name="console" target="SYSTEM_OUT">
            <PatternLayout pattern="[%-5level] - %msg%n"/>
        </Console>
    </Appenders>

    <Loggers>
        <!-- Default for all loggers without a more specific entry. -->
        <Root level="info" additivity="false">
            <AppenderRef ref="console"/>
        </Root>

        <!--
          Loggers under me.calvin: the level comes from the me.calvin.logging.level system
          property and falls back to "info" when the property is not set.
        -->
        <Logger name="me.calvin" level="${sys:me.calvin.logging.level:-info}" additivity="false">
            <AppenderRef ref="console"/>
        </Logger>
    </Loggers>
</Configuration>