mirror of
https://gitee.com/mymagicpower/AIAS.git
synced 2024-12-02 04:08:21 +08:00
Add sentencepiece sdk.
This commit is contained in:
parent
7ac049c730
commit
a7d05c151b
66
nlp_sdks/sentencepiece_sdk/README.md
Normal file
66
nlp_sdks/sentencepiece_sdk/README.md
Normal file
@ -0,0 +1,66 @@
|
||||
# Sentencepiece分词的Java实现
|
||||
SentencePiece是Google开源的文本Tokenizer工具,其主要原理是利用统计算法,
|
||||
在语料库中生成一个类似分词器的工具,外加可以将词token化的功能。
|
||||
|
||||
![image](https://djl-model.oss-cn-hongkong.aliyuncs.com/AIAS/nlp_sdks/wordpiece.jpeg)
|
||||
|
||||
## 运行例子 - SpTokenizerExample
|
||||
运行成功后,命令行应该看到下面的信息:
|
||||
```text
|
||||
|
||||
#测试token生成,并根据token还原句子
|
||||
[INFO ] - Test Tokenize
|
||||
[INFO ] - Input sentence: Hello World
|
||||
[INFO ] - Tokens: [▁He, ll, o, ▁, W, or, l, d]
|
||||
[INFO ] - Recovered sentence: Hello World
|
||||
|
||||
#测试Encode生成ids,并根据ids还原句子
|
||||
[INFO ] - Test Encode & Decode
|
||||
[INFO ] - Input sentence: Hello World
|
||||
[INFO ] - Ids: [151, 88, 21, 4, 321, 54, 31, 17]
|
||||
[INFO ] - Recovered sentence: Hello World
|
||||
|
||||
#测试GetToken,根据id获取token
|
||||
[INFO ] - Test GetToken
|
||||
[INFO ] - ids: [151, 88, 21, 4, 321, 54, 31, 17]
|
||||
[INFO ] - ▁He
|
||||
[INFO ] - ll
|
||||
[INFO ] - o
|
||||
[INFO ] - ▁
|
||||
[INFO ] - W
|
||||
[INFO ] - or
|
||||
[INFO ] - l
|
||||
[INFO ] - d
|
||||
|
||||
#测试GetId,根据token获取id
|
||||
[INFO ] - Test GetId
|
||||
[INFO ] - tokens: [▁He, ll, o, ▁, W, or, l, d]
|
||||
[INFO ] - 151
|
||||
[INFO ] - 88
|
||||
[INFO ] - 21
|
||||
[INFO ] - 4
|
||||
[INFO ] - 321
|
||||
[INFO ] - 54
|
||||
[INFO ] - 31
|
||||
[INFO ] - 17
|
||||
|
||||
```
|
||||
|
||||
### 如何训练模型?
|
||||
参考:https://github.com/google/sentencepiece/blob/master/README.md
|
||||
### 1. 安装编译sentencepiece:
|
||||
```text
|
||||
% git clone https://github.com/google/sentencepiece.git
|
||||
% cd sentencepiece
|
||||
% mkdir build
|
||||
% cd build
|
||||
% cmake ..
|
||||
% make -j $(nproc)
|
||||
% sudo make install
|
||||
% sudo ldconfig -v
|
||||
|
||||
```
|
||||
### 2. 训练模型:
|
||||
```text
|
||||
% spm_train --input=<input> --model_prefix=<model_name> --vocab_size=8000 --character_coverage=1.0 --model_type=<type>
|
||||
```
|
Binary file not shown.
BIN
nlp_sdks/sentencepiece_sdk/doc/img/wordpiece.jpeg
Normal file
BIN
nlp_sdks/sentencepiece_sdk/doc/img/wordpiece.jpeg
Normal file
Binary file not shown.
After Width: | Height: | Size: 29 KiB |
115
nlp_sdks/sentencepiece_sdk/pom.xml
Normal file
115
nlp_sdks/sentencepiece_sdk/pom.xml
Normal file
@ -0,0 +1,115 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
~ Licensed to the Apache Software Foundation (ASF) under one
|
||||
~ or more contributor license agreements. See the NOTICE file
|
||||
~ distributed with this work for additional information
|
||||
~ regarding copyright ownership. The ASF licenses this file
|
||||
~ to you under the Apache License, Version 2.0 (the
|
||||
~ "License"); you may not use this file except in compliance
|
||||
~ with the License. You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing,
|
||||
~ software distributed under the License is distributed on an
|
||||
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
~ KIND, either express or implied. See the License for the
|
||||
~ specific language governing permissions and limitations
|
||||
~ under the License.
|
||||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>calvin</groupId>
    <artifactId>sentencepiece-sdk</artifactId>
    <version>0.1</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <!-- Single place to set the DJL version; referenced by the sentencepiece dependency below.
             Kept at 0.12.0, the version that dependency was previously hard-coded to. -->
        <djl.version>0.12.0</djl.version>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <!-- For development use -->
    <!-- Include this if you want to try the latest SNAPSHOT version -->
    <repositories>
        <repository>
            <id>oss.sonatype.org-snapshot</id>
            <!-- HTTPS is required: Maven 3.8.1 and later block plain-HTTP repositories by default. -->
            <url>https://oss.sonatype.org/content/repositories/snapshots</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.6</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.30</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <!-- Do not go below 2.17.1: earlier 2.x releases (including the former 2.12.1)
                 pull in a log4j-core affected by CVE-2021-44228 (Log4Shell) and follow-ups. -->
            <version>2.17.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.12.0</version>
        </dependency>

        <dependency>
            <groupId>ai.djl.sentencepiece</groupId>
            <artifactId>sentencepiece</artifactId>
            <version>${djl.version}</version>
        </dependency>

        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.18</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>6.8.1</version>
            <!-- Test framework only; keep it off the compile/runtime classpath of the SDK. -->
            <scope>test</scope>
        </dependency>
    </dependencies>

</project>
|
40
nlp_sdks/sentencepiece_sdk/sentencepiece_sdk.iml
Normal file
40
nlp_sdks/sentencepiece_sdk/sentencepiece_sdk.iml
Normal file
@ -0,0 +1,40 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
|
||||
<component name="CheckStyle-IDEA-Module">
|
||||
<option name="configuration">
|
||||
<map />
|
||||
</option>
|
||||
</component>
|
||||
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
|
||||
<output url="file://$MODULE_DIR$/target/classes" />
|
||||
<output-test url="file://$MODULE_DIR$/target/test-classes" />
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/target" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
<orderEntry type="library" name="aias-translation-zh-en-lib-0.1.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.google.code.gson:gson:2.8.6" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.30" level="project" />
|
||||
<orderEntry type="library" name="Maven: commons-cli:commons-cli:1.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-slf4j-impl:2.12.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" level="project" />
|
||||
<orderEntry type="library" scope="RUNTIME" name="Maven: org.apache.logging.log4j:log4j-core:2.12.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.12.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: ai.djl.sentencepiece:sentencepiece:0.12.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: ai.djl:api:0.12.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: net.java.dev.jna:jna:5.8.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.20" level="project" />
|
||||
<orderEntry type="library" scope="PROVIDED" name="Maven: org.projectlombok:lombok:1.18.18" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.testng:testng:6.8.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.beanshell:bsh:2.0b4" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.beust:jcommander:1.27" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.yaml:snakeyaml:1.6" level="project" />
|
||||
</component>
|
||||
</module>
|
@ -0,0 +1,86 @@
|
||||
package me.calvin.example;
|
||||
|
||||
import me.calvin.example.utils.SpProcessor;
|
||||
import me.calvin.example.utils.SpTokenizer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class SpTokenizerExample {
|
||||
private static final Logger logger = LoggerFactory.getLogger(SpTokenizerExample.class);
|
||||
|
||||
private SpTokenizerExample() {
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
Path modelPath = Paths.get("build/test/models/sententpiece_test_model.model");
|
||||
|
||||
logger.info("Test Tokenize");
|
||||
testTokenize(modelPath);
|
||||
|
||||
logger.info("Test Encode & Decode");
|
||||
testEncodeDecode(modelPath);
|
||||
|
||||
logger.info("Test GetToken");
|
||||
testGetToken(modelPath);
|
||||
|
||||
logger.info("Test GetId");
|
||||
testGetId(modelPath);
|
||||
|
||||
}
|
||||
|
||||
public static void testTokenize(Path modelPath) throws IOException {
|
||||
try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
|
||||
String original = "Hello World";
|
||||
logger.info("Input sentence: " + original);
|
||||
List<String> tokens = tokenizer.tokenize(original);
|
||||
String[] strs = tokens.toArray(new String[]{});
|
||||
logger.info("Tokens: " + Arrays.toString(strs));
|
||||
String recovered = tokenizer.buildSentence(tokens);
|
||||
logger.info("Recovered sentence: " + recovered);
|
||||
}
|
||||
}
|
||||
|
||||
public static void testEncodeDecode(Path modelPath) throws IOException {
|
||||
try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
|
||||
String original = "Hello World";
|
||||
logger.info("Input sentence: " + original);
|
||||
SpProcessor processor = tokenizer.getProcessor();
|
||||
int[] ids = processor.encode(original);
|
||||
logger.info("Ids: " + Arrays.toString(ids));
|
||||
String recovered = processor.decode(ids);
|
||||
logger.info("Recovered sentence: " + recovered);
|
||||
}
|
||||
}
|
||||
|
||||
public static void testGetToken(Path modelPath) throws IOException {
|
||||
try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
|
||||
String original = "Hello World";
|
||||
SpProcessor processor = tokenizer.getProcessor();
|
||||
int[] ids = processor.encode(original);
|
||||
logger.info("ids: " + Arrays.toString(ids));
|
||||
for (int i = 0; i < ids.length; i++) {
|
||||
logger.info(processor.getToken(ids[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void testGetId(Path modelPath) throws IOException {
|
||||
try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
|
||||
String original = "Hello World";
|
||||
List<String> tokens = tokenizer.tokenize(original);
|
||||
String[] strs = tokens.toArray(new String[]{});
|
||||
logger.info("tokens: " + Arrays.toString(strs));
|
||||
SpProcessor processor = tokenizer.getProcessor();
|
||||
for (String token : tokens
|
||||
) {
|
||||
logger.info("" + processor.getId(token));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package me.calvin.example;
|
||||
|
||||
import me.calvin.example.utils.SpTokenizer;
|
||||
import me.calvin.example.utils.SpVocabulary;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
public class SpVocabularyExample {
|
||||
private static final Logger logger = LoggerFactory.getLogger(SpVocabularyExample.class);
|
||||
|
||||
private SpVocabularyExample() {
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
Path modelPath = Paths.get("build/test/models/sententpiece_test_model.model");
|
||||
|
||||
logger.info("Test TokenIdConversion");
|
||||
testTokenIdConversion(modelPath);
|
||||
|
||||
}
|
||||
|
||||
public static void testTokenIdConversion(Path modelPath) throws IOException {
|
||||
try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
|
||||
SpVocabulary vocabulary = SpVocabulary.from(tokenizer);
|
||||
//根据id获取词表中的词
|
||||
logger.info(vocabulary.getToken(1));
|
||||
//获取词表中的词对应的id
|
||||
logger.info("" + vocabulary.getIndex("<s>"));
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,115 @@
|
||||
/*
|
||||
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
|
||||
* with the License. A copy of the License is located at
|
||||
*
|
||||
* http://aws.amazon.com/apache2.0/
|
||||
*
|
||||
* or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
|
||||
* OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*/
|
||||
package me.calvin.example.utils;
|
||||
|
||||
import ai.djl.sentencepiece.jni.LibUtils;
|
||||
import ai.djl.sentencepiece.jni.SentencePieceLibrary;
|
||||
import ai.djl.util.NativeResource;
|
||||
|
||||
/** The processor holder for SentencePiece. */
|
||||
public final class SpProcessor extends NativeResource<Long> {
|
||||
|
||||
private static RuntimeException libraryStatus;
|
||||
|
||||
static {
|
||||
try {
|
||||
LibUtils.loadLibrary();
|
||||
} catch (RuntimeException e) {
|
||||
libraryStatus = e;
|
||||
}
|
||||
}
|
||||
|
||||
private SpProcessor() {
|
||||
super(SentencePieceLibrary.LIB.createSentencePieceProcessor());
|
||||
}
|
||||
|
||||
static SpProcessor newInstance() {
|
||||
if (libraryStatus != null) {
|
||||
throw libraryStatus;
|
||||
}
|
||||
return new SpProcessor();
|
||||
}
|
||||
|
||||
public void loadModel(String path) {
|
||||
SentencePieceLibrary.LIB.loadModel(getHandle(), path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tokenize a sentence into array of tokens.
|
||||
*
|
||||
* @param input sentence
|
||||
* @return tokens
|
||||
*/
|
||||
public String[] tokenize(String input) {
|
||||
return SentencePieceLibrary.LIB.tokenize(getHandle(), input);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build sentence from tokens.
|
||||
*
|
||||
* @param tokens input
|
||||
* @return recovered sentence
|
||||
*/
|
||||
public String buildSentence(String[] tokens) {
|
||||
return SentencePieceLibrary.LIB.detokenize(getHandle(), tokens);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get tokens from ID.
|
||||
*
|
||||
* @param id the index of token
|
||||
* @return recovered token
|
||||
*/
|
||||
public String getToken(int id) {
|
||||
return SentencePieceLibrary.LIB.idToPiece(getHandle(), id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get ID from token.
|
||||
*
|
||||
* @param token token that ready to map
|
||||
* @return id from token
|
||||
*/
|
||||
public int getId(String token) {
|
||||
return SentencePieceLibrary.LIB.pieceToId(getHandle(), token);
|
||||
}
|
||||
|
||||
/**
|
||||
* Encode sentence into indices.
|
||||
*
|
||||
* @param sentence input sentence
|
||||
* @return indices
|
||||
*/
|
||||
public int[] encode(String sentence) {
|
||||
return SentencePieceLibrary.LIB.encode(getHandle(), sentence);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode indices into sentence.
|
||||
*
|
||||
* @param ids the indices
|
||||
* @return recovered sentence
|
||||
*/
|
||||
public String decode(int[] ids) {
|
||||
return SentencePieceLibrary.LIB.decode(getHandle(), ids);
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public void close() {
|
||||
Long pointer = handle.get();
|
||||
if (pointer != null) {
|
||||
SentencePieceLibrary.LIB.deleteSentencePieceProcessor(pointer);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
|
||||
* with the License. A copy of the License is located at
|
||||
*
|
||||
* http://aws.amazon.com/apache2.0/
|
||||
*
|
||||
* or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
|
||||
* OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*/
|
||||
package me.calvin.example.utils;
|
||||
|
||||
import ai.djl.modality.nlp.preprocess.Tokenizer;
|
||||
import me.calvin.example.utils.SpProcessor;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* {@code SpTokenizer} is a SentencePiece implementation of the {@link Tokenizer} interface that
|
||||
* converts sentences into token.
|
||||
*/
|
||||
public class SpTokenizer implements Tokenizer, AutoCloseable {
|
||||
|
||||
private SpProcessor processor;
|
||||
|
||||
/**
|
||||
* Create a SentencePiece Tokenizer from existing models.
|
||||
*
|
||||
* @param modelPath the directory or file path of the model location
|
||||
* @throws IOException when IO operation fails in loading a resource
|
||||
*/
|
||||
public SpTokenizer(Path modelPath) throws IOException {
|
||||
this(modelPath, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a SentencePiece Tokenizer from existing models.
|
||||
*
|
||||
* @param modelPath the directory or file path of the model location
|
||||
* @param prefix the model file name or path prefix
|
||||
* @throws IOException when IO operation fails in loading a resource
|
||||
*/
|
||||
public SpTokenizer(Path modelPath, String prefix) throws IOException {
|
||||
this.processor = SpProcessor.newInstance();
|
||||
loadModel(modelPath, prefix);
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public List<String> tokenize(String sentence) {
|
||||
return Arrays.asList(processor.tokenize(sentence));
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public String buildSentence(List<String> tokens) {
|
||||
return processor.buildSentence(tokens.toArray(new String[0]));
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public void close() {
|
||||
processor.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get SentencePiece processor.
|
||||
*
|
||||
* @return {@link SpProcessor}
|
||||
*/
|
||||
public SpProcessor getProcessor() {
|
||||
return processor;
|
||||
}
|
||||
|
||||
private void loadModel(Path modelPath, String prefix) throws IOException {
|
||||
if (Files.notExists(modelPath)) {
|
||||
throw new FileNotFoundException(
|
||||
"Model path doesn't exist: " + modelPath.toAbsolutePath());
|
||||
}
|
||||
Path modelDir = modelPath.toAbsolutePath();
|
||||
Path modelFile = findModelFile(modelDir, prefix);
|
||||
if (modelFile == null) {
|
||||
// TODO: support proto and IOStream model
|
||||
modelFile = findModelFile(modelDir, modelDir.toFile().getName());
|
||||
if (modelFile == null) {
|
||||
throw new FileNotFoundException("No .model found in : " + modelPath);
|
||||
}
|
||||
}
|
||||
|
||||
String modelFilePath = modelFile.toString();
|
||||
processor.loadModel(modelFilePath);
|
||||
}
|
||||
|
||||
private Path findModelFile(Path modelPath, String prefix) {
|
||||
if (Files.isRegularFile(modelPath)) {
|
||||
return modelPath;
|
||||
}
|
||||
Path modelFile = modelPath.resolve(prefix);
|
||||
if (Files.notExists(modelFile) || !Files.isRegularFile(modelFile)) {
|
||||
if (prefix.endsWith(".model")) {
|
||||
return null;
|
||||
}
|
||||
modelFile = modelPath.resolve(prefix + ".model");
|
||||
if (Files.notExists(modelFile) || !Files.isRegularFile(modelFile)) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return modelFile;
|
||||
}
|
||||
}
|
@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
|
||||
* with the License. A copy of the License is located at
|
||||
*
|
||||
* http://aws.amazon.com/apache2.0/
|
||||
*
|
||||
* or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
|
||||
* OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*/
|
||||
package me.calvin.example.utils;
|
||||
|
||||
import ai.djl.modality.nlp.Vocabulary;
|
||||
|
||||
/** {@link SpVocabulary} is a SentencePiece implementation of {@link Vocabulary}. */
|
||||
public final class SpVocabulary implements Vocabulary {
|
||||
|
||||
private SpProcessor processor;
|
||||
|
||||
// TODO: Support direct Vocabulary loading
|
||||
private SpVocabulary(SpProcessor processor) {
|
||||
this.processor = processor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get Vocabulary from {@link SpTokenizer}.
|
||||
*
|
||||
* @param tokenizer the {@link SpTokenizer}
|
||||
* @return {@link SpVocabulary}
|
||||
*/
|
||||
public static SpVocabulary from(SpTokenizer tokenizer) {
|
||||
return new SpVocabulary(tokenizer.getProcessor());
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public String getToken(long index) {
|
||||
return processor.getToken((int) index);
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public boolean contains(String token) {
|
||||
throw new UnsupportedOperationException("Not supported for Sentence Piece");
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public long getIndex(String token) {
|
||||
return processor.getId(token);
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public long size() {
|
||||
throw new UnsupportedOperationException("Not supported for Sentence Piece");
|
||||
}
|
||||
}
|
17
nlp_sdks/sentencepiece_sdk/src/main/resources/log4j2.xml
Normal file
17
nlp_sdks/sentencepiece_sdk/src/main/resources/log4j2.xml
Normal file
@ -0,0 +1,17 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration status="INFO">
    <Appenders>
        <!-- Writes every event to standard output as "[LEVEL] - message". -->
        <Console name="console" target="SYSTEM_OUT">
            <PatternLayout pattern="[%-5level] - %msg%n"/>
        </Console>
    </Appenders>
    <Loggers>
        <!-- Default for any logger without a more specific entry below. -->
        <Root level="info" additivity="false">
            <AppenderRef ref="console"/>
        </Root>
        <!-- Loggers under me.calvin: the level comes from the me.calvin.logging.level system
             property and falls back to info. Non-additive, so events are not passed on to Root
             and printed a second time. -->
        <Logger name="me.calvin" level="${sys:me.calvin.logging.level:-info}" additivity="false">
            <AppenderRef ref="console"/>
        </Logger>
    </Loggers>
</Configuration>
|
Loading…
Reference in New Issue
Block a user