Add sentencepiece sdk.

This commit is contained in:
Calvin 2021-08-14 23:37:23 +08:00
parent 7ac049c730
commit a7d05c151b
11 changed files with 651 additions and 0 deletions

View File

@ -0,0 +1,66 @@
# Sentencepiece分词的Java实现
Sentencepiece是Google开源的文本Tokenizer工具,其主要原理是利用统计算法,
在语料库中生成一个类似分词器的工具,外加可以将词token化的功能。
![image](https://djl-model.oss-cn-hongkong.aliyuncs.com/AIAS/nlp_sdks/wordpiece.jpeg)
## 运行例子 - SpTokenizerExample
运行成功后,命令行应该看到下面的信息:
```text
#测试token生成并根据token还原句子
[INFO ] - Test Tokenize
[INFO ] - Input sentence: Hello World
[INFO ] - Tokens: [▁He, ll, o, ▁, W, or, l, d]
[INFO ] - Recovered sentence: Hello World
#测试Encode生成ids并根据ids还原句子
[INFO ] - Test Encode & Decode
[INFO ] - Input sentence: Hello World
[INFO ] - Ids: [151, 88, 21, 4, 321, 54, 31, 17]
[INFO ] - Recovered sentence: Hello World
#测试GetToken根据id获取token
[INFO ] - Test GetToken
[INFO ] - ids: [151, 88, 21, 4, 321, 54, 31, 17]
[INFO ] - ▁He
[INFO ] - ll
[INFO ] - o
[INFO ] - ▁
[INFO ] - W
[INFO ] - or
[INFO ] - l
[INFO ] - d
#测试GetId根据token获取id
[INFO ] - Test GetId
[INFO ] - tokens: [▁He, ll, o, ▁, W, or, l, d]
[INFO ] - 151
[INFO ] - 88
[INFO ] - 21
[INFO ] - 4
[INFO ] - 321
[INFO ] - 54
[INFO ] - 31
[INFO ] - 17
```
### 如何训练模型?
参考https://github.com/google/sentencepiece/blob/master/README.md
### 1. 安装编译sentencepiece
```text
% git clone https://github.com/google/sentencepiece.git
% cd sentencepiece
% mkdir build
% cd build
% cmake ..
% make -j $(nproc)
% sudo make install
% sudo ldconfig -v
```
### 2. 训练模型:
```text
% spm_train --input=<input> --model_prefix=<model_name> --vocab_size=8000 --character_coverage=1.0 --model_type=<type>
```

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

View File

@ -0,0 +1,115 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>calvin</groupId>
<artifactId>sentencepiece-sdk</artifactId>
<version>0.1</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<djl.version>0.13.0-SNAPSHOT</djl.version>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
<version>3.8.1</version>
</plugin>
</plugins>
</build>
<!-- For development use -->
<!-- Include this if you want to try the latest SNAPSHOT version -->
<repositories>
<repository>
<id>oss.sonatype.org-snapshot</id>
<!-- Use HTTPS: Maven 3.8.1 and later block plain-HTTP repositories by default,
     and an unencrypted repository allows artifacts to be tampered with in transit. -->
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.6</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.30</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<!-- 2.12.1 pulls in a log4j-core affected by CVE-2021-44228 (Log4Shell);
     2.17.1 contains the fixes and still supports Java 8 and SLF4J 1.7.x. -->
<version>2.17.1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>
<dependency>
<groupId>ai.djl.sentencepiece</groupId>
<artifactId>sentencepiece</artifactId>
<version>0.12.0</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.18</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.8.1</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="CheckStyle-IDEA-Module">
<option name="configuration">
<map />
</option>
</component>
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="aias-translation-zh-en-lib-0.1.0" level="project" />
<orderEntry type="library" name="Maven: com.google.code.gson:gson:2.8.6" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.30" level="project" />
<orderEntry type="library" name="Maven: commons-cli:commons-cli:1.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-slf4j-impl:2.12.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" level="project" />
<orderEntry type="library" scope="RUNTIME" name="Maven: org.apache.logging.log4j:log4j-core:2.12.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.12.0" level="project" />
<orderEntry type="library" name="Maven: ai.djl.sentencepiece:sentencepiece:0.12.0" level="project" />
<orderEntry type="library" name="Maven: ai.djl:api:0.12.0" level="project" />
<orderEntry type="library" name="Maven: net.java.dev.jna:jna:5.8.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.20" level="project" />
<orderEntry type="library" scope="PROVIDED" name="Maven: org.projectlombok:lombok:1.18.18" level="project" />
<orderEntry type="library" name="Maven: org.testng:testng:6.8.1" level="project" />
<orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
<orderEntry type="library" name="Maven: org.beanshell:bsh:2.0b4" level="project" />
<orderEntry type="library" name="Maven: com.beust:jcommander:1.27" level="project" />
<orderEntry type="library" name="Maven: org.yaml:snakeyaml:1.6" level="project" />
</component>
</module>

View File

@ -0,0 +1,86 @@
package me.calvin.example;
import me.calvin.example.utils.SpProcessor;
import me.calvin.example.utils.SpTokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
/**
 * Command-line example for {@link SpTokenizer}: tokenizes a sample sentence, encodes and decodes
 * it, and looks up individual tokens and ids, logging each result.
 */
public class SpTokenizerExample {

    private static final Logger logger = LoggerFactory.getLogger(SpTokenizerExample.class);

    /** Sentence fed to every demo method. */
    private static final String SAMPLE_SENTENCE = "Hello World";

    private SpTokenizerExample() {
    }

    /**
     * Runs all demos against the bundled test model.
     *
     * @param args unused
     * @throws IOException if the model cannot be loaded
     */
    public static void main(String[] args) throws IOException {
        // NOTE(review): the "sententpiece" spelling presumably matches the model file's actual
        // name on disk - verify before "correcting" it.
        Path modelPath = Paths.get("build/test/models/sententpiece_test_model.model");

        logger.info("Test Tokenize");
        testTokenize(modelPath);

        logger.info("Test Encode & Decode");
        testEncodeDecode(modelPath);

        logger.info("Test GetToken");
        testGetToken(modelPath);

        logger.info("Test GetId");
        testGetId(modelPath);
    }

    /**
     * Splits the sample sentence into tokens and rebuilds the sentence from them.
     *
     * @param modelPath the directory or file path of the model location
     * @throws IOException if the model cannot be loaded
     */
    public static void testTokenize(Path modelPath) throws IOException {
        try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
            logger.info("Input sentence: {}", SAMPLE_SENTENCE);

            // List#toString renders as "[a, b, ...]", so no array copy is needed for logging.
            List<String> tokens = tokenizer.tokenize(SAMPLE_SENTENCE);
            logger.info("Tokens: {}", tokens);

            String recovered = tokenizer.buildSentence(tokens);
            logger.info("Recovered sentence: {}", recovered);
        }
    }

    /**
     * Encodes the sample sentence into ids and decodes the ids back into a sentence.
     *
     * @param modelPath the directory or file path of the model location
     * @throws IOException if the model cannot be loaded
     */
    public static void testEncodeDecode(Path modelPath) throws IOException {
        try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
            logger.info("Input sentence: {}", SAMPLE_SENTENCE);

            SpProcessor processor = tokenizer.getProcessor();
            int[] ids = processor.encode(SAMPLE_SENTENCE);
            logger.info("Ids: {}", Arrays.toString(ids));

            String recovered = processor.decode(ids);
            logger.info("Recovered sentence: {}", recovered);
        }
    }

    /**
     * Encodes the sample sentence and logs the token that corresponds to each id.
     *
     * @param modelPath the directory or file path of the model location
     * @throws IOException if the model cannot be loaded
     */
    public static void testGetToken(Path modelPath) throws IOException {
        try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
            SpProcessor processor = tokenizer.getProcessor();
            int[] ids = processor.encode(SAMPLE_SENTENCE);
            logger.info("ids: {}", Arrays.toString(ids));

            for (int id : ids) {
                // Pass the token as an argument so it is never interpreted as a format string.
                logger.info("{}", processor.getToken(id));
            }
        }
    }

    /**
     * Tokenizes the sample sentence and logs the id that corresponds to each token.
     *
     * @param modelPath the directory or file path of the model location
     * @throws IOException if the model cannot be loaded
     */
    public static void testGetId(Path modelPath) throws IOException {
        try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
            List<String> tokens = tokenizer.tokenize(SAMPLE_SENTENCE);
            logger.info("tokens: {}", tokens);

            SpProcessor processor = tokenizer.getProcessor();
            for (String token : tokens) {
                logger.info("{}", processor.getId(token));
            }
        }
    }
}

View File

@ -0,0 +1,35 @@
package me.calvin.example;
import me.calvin.example.utils.SpTokenizer;
import me.calvin.example.utils.SpVocabulary;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
/** Small demo that converts between tokens and ids through {@link SpVocabulary}. */
public class SpVocabularyExample {

    private static final Logger logger = LoggerFactory.getLogger(SpVocabularyExample.class);

    private SpVocabularyExample() {
        // Holder for the static entry points only.
    }

    /**
     * Runs the token/id conversion demo against the bundled test model.
     *
     * @param args unused
     * @throws IOException if the model cannot be loaded
     */
    public static void main(String[] args) throws IOException {
        Path testModel = Paths.get("build/test/models/sententpiece_test_model.model");
        logger.info("Test TokenIdConversion");
        testTokenIdConversion(testModel);
    }

    /**
     * Logs the token stored at id 1 and the id assigned to the {@code <s>} token.
     *
     * @param modelPath the directory or file path of the model location
     * @throws IOException if the model cannot be loaded
     */
    public static void testTokenIdConversion(Path modelPath) throws IOException {
        try (SpTokenizer spTokenizer = new SpTokenizer(modelPath)) {
            SpVocabulary vocab = SpVocabulary.from(spTokenizer);

            // Look up the vocabulary entry for a given id.
            String tokenAtOne = vocab.getToken(1);
            logger.info(tokenAtOne);

            // Look up the id of a given vocabulary entry.
            long startTokenId = vocab.getIndex("<s>");
            logger.info("" + startTokenId);
        }
    }
}

View File

@ -0,0 +1,115 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
* with the License. A copy of the License is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
* OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
* and limitations under the License.
*/
package me.calvin.example.utils;
import ai.djl.sentencepiece.jni.LibUtils;
import ai.djl.sentencepiece.jni.SentencePieceLibrary;
import ai.djl.util.NativeResource;
/**
 * The processor holder for SentencePiece.
 *
 * <p>Wraps a native SentencePiece processor pointer and forwards every operation to {@link
 * SentencePieceLibrary}. Call {@link #close()} to release the native processor.
 */
public final class SpProcessor extends NativeResource<Long> {

    /** Failure raised while loading the native library, or {@code null} if loading succeeded. */
    private static final RuntimeException libraryStatus;

    static {
        RuntimeException failure = null;
        try {
            LibUtils.loadLibrary();
        } catch (RuntimeException e) {
            // Remember the failure; it is rethrown from newInstance() instead of here.
            failure = e;
        }
        libraryStatus = failure;
    }

    private SpProcessor() {
        super(SentencePieceLibrary.LIB.createSentencePieceProcessor());
    }

    /**
     * Creates a new processor.
     *
     * @return a new {@code SpProcessor}
     * @throws RuntimeException the exception captured while loading the native library, if any
     */
    static SpProcessor newInstance() {
        if (libraryStatus != null) {
            throw libraryStatus;
        }
        return new SpProcessor();
    }

    /**
     * Loads a SentencePiece model into this processor.
     *
     * @param path the path of the model file, passed to the native library as-is
     */
    public void loadModel(String path) {
        SentencePieceLibrary.LIB.loadModel(getHandle(), path);
    }

    /**
     * Tokenize a sentence into array of tokens.
     *
     * @param input sentence
     * @return tokens
     */
    public String[] tokenize(String input) {
        return SentencePieceLibrary.LIB.tokenize(getHandle(), input);
    }

    /**
     * Build sentence from tokens.
     *
     * @param tokens input
     * @return recovered sentence
     */
    public String buildSentence(String[] tokens) {
        return SentencePieceLibrary.LIB.detokenize(getHandle(), tokens);
    }

    /**
     * Get tokens from ID.
     *
     * @param id the index of token
     * @return recovered token
     */
    public String getToken(int id) {
        return SentencePieceLibrary.LIB.idToPiece(getHandle(), id);
    }

    /**
     * Get ID from token.
     *
     * @param token token that ready to map
     * @return id from token
     */
    public int getId(String token) {
        return SentencePieceLibrary.LIB.pieceToId(getHandle(), token);
    }

    /**
     * Encode sentence into indices.
     *
     * @param sentence input sentence
     * @return indices
     */
    public int[] encode(String sentence) {
        return SentencePieceLibrary.LIB.encode(getHandle(), sentence);
    }

    /**
     * Decode indices into sentence.
     *
     * @param ids the indices
     * @return recovered sentence
     */
    public String decode(int[] ids) {
        return SentencePieceLibrary.LIB.decode(getHandle(), ids);
    }

    /**
     * {@inheritDoc}
     *
     * <p>Safe to call more than once: only the first call deletes the native processor.
     */
    @Override
    public void close() {
        // Take the pointer and clear the stored handle in one atomic step, so that a repeated
        // close() sees null and cannot delete the same native processor twice.
        Long pointer = handle.getAndSet(null);
        if (pointer != null) {
            SentencePieceLibrary.LIB.deleteSentencePieceProcessor(pointer);
        }
    }
}

View File

@ -0,0 +1,117 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
* with the License. A copy of the License is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
* OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
* and limitations under the License.
*/
package me.calvin.example.utils;
import ai.djl.modality.nlp.preprocess.Tokenizer;
import me.calvin.example.utils.SpProcessor;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
/**
 * {@code SpTokenizer} is a SentencePiece implementation of the {@link Tokenizer} interface that
 * converts sentences into token.
 *
 * <p>The tokenizer owns a native {@link SpProcessor}; close the tokenizer to release it.
 */
public class SpTokenizer implements Tokenizer, AutoCloseable {

    private final SpProcessor processor;

    /**
     * Create a SentencePiece Tokenizer from existing models.
     *
     * @param modelPath the directory or file path of the model location
     * @throws IOException when IO operation fails in loading a resource
     */
    public SpTokenizer(Path modelPath) throws IOException {
        this(modelPath, null);
    }

    /**
     * Create a SentencePiece Tokenizer from existing models.
     *
     * @param modelPath the directory or file path of the model location
     * @param prefix the model file name or path prefix; may be {@code null}, in which case the
     *     directory name is used as the prefix
     * @throws IOException when IO operation fails in loading a resource
     */
    public SpTokenizer(Path modelPath, String prefix) throws IOException {
        this.processor = SpProcessor.newInstance();
        try {
            loadModel(modelPath, prefix);
        } catch (IOException | RuntimeException e) {
            // The caller never receives this instance, so nobody else could release the
            // native processor; close it here before propagating the failure.
            processor.close();
            throw e;
        }
    }

    /** {@inheritDoc} */
    @Override
    public List<String> tokenize(String sentence) {
        return Arrays.asList(processor.tokenize(sentence));
    }

    /** {@inheritDoc} */
    @Override
    public String buildSentence(List<String> tokens) {
        return processor.buildSentence(tokens.toArray(new String[0]));
    }

    /** {@inheritDoc} */
    @Override
    public void close() {
        processor.close();
    }

    /**
     * Get SentencePiece processor.
     *
     * @return {@link SpProcessor}
     */
    public SpProcessor getProcessor() {
        return processor;
    }

    /**
     * Locates the model file and loads it into the processor.
     *
     * <p>The file is looked up first with {@code prefix} and then, if that fails, with the name
     * of the model directory.
     *
     * @param modelPath the directory or file path of the model location
     * @param prefix the model file name or path prefix, may be {@code null}
     * @throws IOException if the path does not exist or no model file can be found
     */
    private void loadModel(Path modelPath, String prefix) throws IOException {
        if (Files.notExists(modelPath)) {
            throw new FileNotFoundException(
                    "Model path doesn't exist: " + modelPath.toAbsolutePath());
        }
        Path modelDir = modelPath.toAbsolutePath();
        Path modelFile = findModelFile(modelDir, prefix);
        if (modelFile == null) {
            // TODO: support proto and IOStream model
            modelFile = findModelFile(modelDir, modelDir.toFile().getName());
            if (modelFile == null) {
                throw new FileNotFoundException("No .model found in : " + modelPath);
            }
        }
        processor.loadModel(modelFile.toString());
    }

    /**
     * Resolves the model file for the given location and prefix.
     *
     * @param modelPath the model file itself, or the directory to search
     * @param prefix the file name to look for, with or without the {@code .model} extension;
     *     may be {@code null}
     * @return {@code modelPath} if it is a regular file, otherwise the matching file inside the
     *     directory, or {@code null} if there is none
     */
    private Path findModelFile(Path modelPath, String prefix) {
        if (Files.isRegularFile(modelPath)) {
            return modelPath;
        }
        if (prefix == null) {
            // No prefix supplied (single-argument constructor): report "not found" so the
            // caller falls back to the directory name instead of failing in Path.resolve(null).
            return null;
        }
        Path candidate = modelPath.resolve(prefix);
        if (Files.isRegularFile(candidate)) {
            return candidate;
        }
        if (prefix.endsWith(".model")) {
            return null;
        }
        candidate = modelPath.resolve(prefix + ".model");
        return Files.isRegularFile(candidate) ? candidate : null;
    }
}

View File

@ -0,0 +1,60 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
* with the License. A copy of the License is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
* OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
* and limitations under the License.
*/
package me.calvin.example.utils;
import ai.djl.modality.nlp.Vocabulary;
/**
 * {@link SpVocabulary} is a SentencePiece implementation of {@link Vocabulary}.
 *
 * <p>Lookups are delegated to the {@link SpProcessor} of the tokenizer the vocabulary was created
 * from; {@link #contains(String)} and {@link #size()} are not supported.
 */
public final class SpVocabulary implements Vocabulary {

    private final SpProcessor processor;

    // TODO: Support direct Vocabulary loading
    private SpVocabulary(SpProcessor processor) {
        this.processor = processor;
    }

    /**
     * Get Vocabulary from {@link SpTokenizer}.
     *
     * @param tokenizer the {@link SpTokenizer}
     * @return {@link SpVocabulary}
     */
    public static SpVocabulary from(SpTokenizer tokenizer) {
        return new SpVocabulary(tokenizer.getProcessor());
    }

    /**
     * {@inheritDoc}
     *
     * @throws ArithmeticException if {@code index} does not fit in an {@code int}
     */
    @Override
    public String getToken(long index) {
        // The processor takes int ids; reject out-of-range values instead of letting a narrowing
        // cast silently map them onto an unrelated id.
        return processor.getToken(Math.toIntExact(index));
    }

    /**
     * {@inheritDoc}
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public boolean contains(String token) {
        throw new UnsupportedOperationException("Not supported for Sentence Piece");
    }

    /** {@inheritDoc} */
    @Override
    public long getIndex(String token) {
        return processor.getId(token);
    }

    /**
     * {@inheritDoc}
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public long size() {
        throw new UnsupportedOperationException("Not supported for Sentence Piece");
    }
}

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<Configuration status="INFO">
<!-- Log4j 2 configuration for the example programs: all output goes to one console appender. -->
<Appenders>
<!-- Writes to standard output; each line is "[LEVEL] - message", with the level padded to 5 characters. -->
<Console name="console" target="SYSTEM_OUT">
<PatternLayout
pattern="[%-5level] - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!-- Root logger: info level, sent to the console appender. -->
<Root level="info" additivity="false">
<AppenderRef ref="console"/>
</Root>
<!-- Loggers under me.calvin: the level is read from the me.calvin.logging.level system property
     and falls back to info when the property is not set. additivity="false" keeps these events
     from also being passed to the root logger. -->
<Logger name="me.calvin" level="${sys:me.calvin.logging.level:-info}" additivity="false">
<AppenderRef ref="console"/>
</Logger>
</Loggers>
</Configuration>