Add sentencepiece sdk.

2024-12-02 04:08:21 +08:00 · 2021-08-14 23:37:23 +08:00 · 2021-08-14 23:37:23 +08:00 · a7d05c151b
commit a7d05c151b
parent 7ac049c730
11 changed files with 651 additions and 0 deletions
--- a/nlp_sdks/sentencepiece_sdk/README.md
+++ b/nlp_sdks/sentencepiece_sdk/README.md
@ -0,0 +1,66 @@
+# Sentencepiece分词的Java实现
+SentencePiece是Google开源的文本Tokenizer工具，其主要原理是利用统计算法，
+在语料库中生成一个类似分词器的工具，外加可以将词token化的功能。
+
+![image](https://djl-model.oss-cn-hongkong.aliyuncs.com/AIAS/nlp_sdks/wordpiece.jpeg)
+
+## 运行例子 - SpTokenizerExample
+运行成功后，命令行应该看到下面的信息:
+```text
+
+#测试token生成，并根据token还原句子
+[INFO ] - Test Tokenize
+[INFO ] - Input sentence: Hello World
+[INFO ] - Tokens: [▁He, ll, o, ▁, W, or, l, d]
+[INFO ] - Recovered sentence: Hello World
+
+#测试Encode生成ids，并根据ids还原句子
+[INFO ] - Test Encode & Decode
+[INFO ] - Input sentence: Hello World
+[INFO ] - Ids: [151, 88, 21, 4, 321, 54, 31, 17]
+[INFO ] - Recovered sentence: Hello World
+
+#测试GetToken，根据id获取token
+[INFO ] - Test GetToken
+[INFO ] - ids: [151, 88, 21, 4, 321, 54, 31, 17]
+[INFO ] - ▁He
+[INFO ] - ll
+[INFO ] - o
+[INFO ] - ▁
+[INFO ] - W
+[INFO ] - or
+[INFO ] - l
+[INFO ] - d
+
+#测试GetId，根据token获取id
+[INFO ] - Test GetId
+[INFO ] - tokens: [▁He, ll, o, ▁, W, or, l, d]
+[INFO ] - 151
+[INFO ] - 88
+[INFO ] - 21
+[INFO ] - 4
+[INFO ] - 321
+[INFO ] - 54
+[INFO ] - 31
+[INFO ] - 17
+
+```
+
+### 如何训练模型？ 
+参考：https://github.com/google/sentencepiece/blob/master/README.md
+### 1. 安装编译sentencepiece：
+```text
+% git clone https://github.com/google/sentencepiece.git 
+% cd sentencepiece
+% mkdir build
+% cd build
+% cmake ..
+% make -j $(nproc)
+% sudo make install
+% sudo ldconfig -v
+
+```
+### 2. 训练模型：
+```text
+% spm_train --input=<input> --model_prefix=<model_name> --vocab_size=8000 --character_coverage=1.0 --model_type=<type>
+```
--- a/nlp_sdks/sentencepiece_sdk/build/test/models/sententpiece_test_model.model
+++ b/nlp_sdks/sentencepiece_sdk/build/test/models/sententpiece_test_model.model
--- a/nlp_sdks/sentencepiece_sdk/doc/img/wordpiece.jpeg
+++ b/nlp_sdks/sentencepiece_sdk/doc/img/wordpiece.jpeg
--- a/nlp_sdks/sentencepiece_sdk/pom.xml
+++ b/nlp_sdks/sentencepiece_sdk/pom.xml
@ -0,0 +1,115 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one
+  ~ or more contributor license agreements.  See the NOTICE file
+  ~ distributed with this work for additional information
+  ~ regarding copyright ownership.  The ASF licenses this file
+  ~ to you under the Apache License, Version 2.0 (the
+  ~ "License"); you may not use this file except in compliance
+  ~ with the License.  You may obtain a copy of the License at
+  ~
+  ~   http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing,
+  ~ software distributed under the License is distributed on an
+  ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  ~ KIND, either express or implied.  See the License for the
+  ~ specific language governing permissions and limitations
+  ~ under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>calvin</groupId>
+    <artifactId>sentencepiece-sdk</artifactId>
+    <version>0.1</version>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <!-- Compile for Java 8 (both source and target level). -->
+        <maven.compiler.source>1.8</maven.compiler.source>
+        <maven.compiler.target>1.8</maven.compiler.target>
+        <!-- NOTE(review): djl.version is not referenced anywhere in this POM; the
+             sentencepiece dependency pins its version directly. Confirm whether
+             this property is still needed. -->
+        <djl.version>0.13.0-SNAPSHOT</djl.version>
+    </properties>
+
+    <build>
+        <plugins>
+            <!-- Explicit compiler plugin configuration; repeats the Java 8 level
+                 already set through the maven.compiler.* properties above. -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <configuration>
+                    <source>8</source>
+                    <target>8</target>
+                </configuration>
+                <version>3.8.1</version>
+            </plugin>
+        </plugins>
+    </build>
+
+    <!--  For development use  -->
+    <!--  Include this if you want to try the latest SNAPSHOT version  -->
+    <!--  The repository is addressed over HTTPS: Maven 3.8.1+ blocks external
+          plain-HTTP repositories by default, and HTTP downloads can be tampered with.  -->
+    <repositories>
+        <repository>
+            <id>oss.sonatype.org-snapshot</id>
+            <url>https://oss.sonatype.org/content/repositories/snapshots</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>true</enabled>
+            </snapshots>
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.google.code.gson</groupId>
+            <artifactId>gson</artifactId>
+            <version>2.8.6</version>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.7.30</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-cli</groupId>
+            <artifactId>commons-cli</artifactId>
+            <version>1.4</version>
+        </dependency>
+        <!-- SLF4J 1.7.x binding for Log4j 2. Kept on a release that contains the
+             fixes for CVE-2021-44228 (Log4Shell) and its follow-up CVEs; releases
+             up to 2.14.1 are affected. -->
+        <dependency>
+            <groupId>org.apache.logging.log4j</groupId>
+            <artifactId>log4j-slf4j-impl</artifactId>
+            <version>2.17.1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.12.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>ai.djl.sentencepiece</groupId>
+            <artifactId>sentencepiece</artifactId>
+            <version>0.12.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <version>1.18.18</version>
+            <scope>provided</scope>
+        </dependency>
+
+        <!-- Test framework only: no main source imports TestNG, so it must not
+             leak onto the compile/runtime classpath of consumers. -->
+        <dependency>
+            <groupId>org.testng</groupId>
+            <artifactId>testng</artifactId>
+            <version>6.8.1</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+</project>
--- a/nlp_sdks/sentencepiece_sdk/sentencepiece_sdk.iml
+++ b/nlp_sdks/sentencepiece_sdk/sentencepiece_sdk.iml
@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
+  <component name="CheckStyle-IDEA-Module">
+    <option name="configuration">
+      <map />
+    </option>
+  </component>
+  <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
+    <output url="file://$MODULE_DIR$/target/classes" />
+    <output-test url="file://$MODULE_DIR$/target/test-classes" />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
+      <excludeFolder url="file://$MODULE_DIR$/target" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="aias-translation-zh-en-lib-0.1.0" level="project" />
+    <orderEntry type="library" name="Maven: com.google.code.gson:gson:2.8.6" level="project" />
+    <orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.30" level="project" />
+    <orderEntry type="library" name="Maven: commons-cli:commons-cli:1.4" level="project" />
+    <orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-slf4j-impl:2.12.1" level="project" />
+    <orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" level="project" />
+    <orderEntry type="library" scope="RUNTIME" name="Maven: org.apache.logging.log4j:log4j-core:2.12.1" level="project" />
+    <orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.12.0" level="project" />
+    <orderEntry type="library" name="Maven: ai.djl.sentencepiece:sentencepiece:0.12.0" level="project" />
+    <orderEntry type="library" name="Maven: ai.djl:api:0.12.0" level="project" />
+    <orderEntry type="library" name="Maven: net.java.dev.jna:jna:5.8.0" level="project" />
+    <orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.20" level="project" />
+    <orderEntry type="library" scope="PROVIDED" name="Maven: org.projectlombok:lombok:1.18.18" level="project" />
+    <orderEntry type="library" name="Maven: org.testng:testng:6.8.1" level="project" />
+    <orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
+    <orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
+    <orderEntry type="library" name="Maven: org.beanshell:bsh:2.0b4" level="project" />
+    <orderEntry type="library" name="Maven: com.beust:jcommander:1.27" level="project" />
+    <orderEntry type="library" name="Maven: org.yaml:snakeyaml:1.6" level="project" />
+  </component>
+</module>
--- a/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/SpTokenizerExample.java
+++ b/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/SpTokenizerExample.java
@ -0,0 +1,86 @@
+package me.calvin.example;
+
+import me.calvin.example.utils.SpProcessor;
+import me.calvin.example.utils.SpTokenizer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Command-line demo of {@link SpTokenizer} and {@link SpProcessor}.
+ *
+ * <p>Each {@code test*} method opens its own tokenizer on the given model path, runs one
+ * operation on the sentence {@code "Hello World"} and logs the result.
+ */
+public class SpTokenizerExample {
+    private static final Logger logger = LoggerFactory.getLogger(SpTokenizerExample.class);
+
+    /** Not instantiable: this class only hosts static entry points. */
+    private SpTokenizerExample() {
+    }
+
+    /**
+     * Runs all four demos in sequence against the bundled test model.
+     *
+     * @param args ignored
+     * @throws IOException if the model cannot be loaded
+     */
+    public static void main(String[] args) throws IOException {
+        Path modelPath = Paths.get("build/test/models/sententpiece_test_model.model");
+
+        logger.info("Test Tokenize");
+        testTokenize(modelPath);
+
+        logger.info("Test Encode & Decode");
+        testEncodeDecode(modelPath);
+
+        logger.info("Test GetToken");
+        testGetToken(modelPath);
+
+        logger.info("Test GetId");
+        testGetId(modelPath);
+    }
+
+    /**
+     * Splits the sample sentence into tokens, logs them, then rebuilds and logs the sentence.
+     *
+     * @param modelPath location of the SentencePiece model
+     * @throws IOException if the model cannot be loaded
+     */
+    public static void testTokenize(Path modelPath) throws IOException {
+        try (SpTokenizer sp = new SpTokenizer(modelPath)) {
+            String sentence = "Hello World";
+            logger.info("Input sentence: " + sentence);
+
+            List<String> pieces = sp.tokenize(sentence);
+            logger.info("Tokens: " + Arrays.toString(pieces.toArray(new String[0])));
+
+            String rebuilt = sp.buildSentence(pieces);
+            logger.info("Recovered sentence: " + rebuilt);
+        }
+    }
+
+    /**
+     * Encodes the sample sentence into ids, logs them, then decodes the ids and logs the result.
+     *
+     * @param modelPath location of the SentencePiece model
+     * @throws IOException if the model cannot be loaded
+     */
+    public static void testEncodeDecode(Path modelPath) throws IOException {
+        try (SpTokenizer sp = new SpTokenizer(modelPath)) {
+            String sentence = "Hello World";
+            logger.info("Input sentence: " + sentence);
+
+            SpProcessor proc = sp.getProcessor();
+            int[] encoded = proc.encode(sentence);
+            logger.info("Ids: " + Arrays.toString(encoded));
+
+            String rebuilt = proc.decode(encoded);
+            logger.info("Recovered sentence: " + rebuilt);
+        }
+    }
+
+    /**
+     * Encodes the sample sentence and logs the token that each resulting id maps to.
+     *
+     * @param modelPath location of the SentencePiece model
+     * @throws IOException if the model cannot be loaded
+     */
+    public static void testGetToken(Path modelPath) throws IOException {
+        try (SpTokenizer sp = new SpTokenizer(modelPath)) {
+            String sentence = "Hello World";
+            SpProcessor proc = sp.getProcessor();
+            int[] encoded = proc.encode(sentence);
+            logger.info("ids: " + Arrays.toString(encoded));
+            for (int id : encoded) {
+                logger.info(proc.getToken(id));
+            }
+        }
+    }
+
+    /**
+     * Tokenizes the sample sentence and logs the id that each token maps to.
+     *
+     * @param modelPath location of the SentencePiece model
+     * @throws IOException if the model cannot be loaded
+     */
+    public static void testGetId(Path modelPath) throws IOException {
+        try (SpTokenizer sp = new SpTokenizer(modelPath)) {
+            String sentence = "Hello World";
+            List<String> pieces = sp.tokenize(sentence);
+            logger.info("tokens: " + Arrays.toString(pieces.toArray(new String[0])));
+
+            SpProcessor proc = sp.getProcessor();
+            for (String piece : pieces) {
+                logger.info("" + proc.getId(piece));
+            }
+        }
+    }
+}
--- a/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/SpVocabularyExample.java
+++ b/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/SpVocabularyExample.java
@ -0,0 +1,35 @@
+package me.calvin.example;
+
+import me.calvin.example.utils.SpTokenizer;
+import me.calvin.example.utils.SpVocabulary;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+public class SpVocabularyExample {
+    private static final Logger logger = LoggerFactory.getLogger(SpVocabularyExample.class);
+
+    private SpVocabularyExample() {
+    }
+
+    public static void main(String[] args) throws IOException {
+        Path modelPath = Paths.get("build/test/models/sententpiece_test_model.model");
+
+        logger.info("Test TokenIdConversion");
+        testTokenIdConversion(modelPath);
+
+    }
+
+    public static void testTokenIdConversion(Path modelPath) throws IOException {
+        try (SpTokenizer tokenizer = new SpTokenizer(modelPath)) {
+            SpVocabulary vocabulary = SpVocabulary.from(tokenizer);
+            //根据id获取词表中的词
+            logger.info(vocabulary.getToken(1));
+            //获取词表中的词对应的id
+            logger.info("" + vocabulary.getIndex("<s>"));
+        }
+    }
+}      
--- a/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/utils/SpProcessor.java
+++ b/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/utils/SpProcessor.java
@ -0,0 +1,115 @@
+/*
+ * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ * with the License. A copy of the License is located at
+ *
+ * http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+ * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ */
+package me.calvin.example.utils;
+
+import ai.djl.sentencepiece.jni.LibUtils;
+import ai.djl.sentencepiece.jni.SentencePieceLibrary;
+import ai.djl.util.NativeResource;
+
+/**
+ * The processor holder for SentencePiece.
+ *
+ * <p>Wraps a native SentencePiece processor handle obtained from {@link SentencePieceLibrary}
+ * and forwards every operation to it. The native object is released by {@link #close()}.
+ */
+public final class SpProcessor extends NativeResource<Long> {
+
+    /** Failure raised while loading the native library, or {@code null} if loading succeeded. */
+    private static RuntimeException libraryStatus;
+
+    static {
+        try {
+            LibUtils.loadLibrary();
+        } catch (RuntimeException e) {
+            // Deferred: rethrown from newInstance() so that class initialization itself succeeds.
+            libraryStatus = e;
+        }
+    }
+
+    private SpProcessor() {
+        super(SentencePieceLibrary.LIB.createSentencePieceProcessor());
+    }
+
+    /**
+     * Creates a new native processor.
+     *
+     * @return a fresh {@code SpProcessor}
+     * @throws RuntimeException the exception recorded when the native library failed to load
+     */
+    static SpProcessor newInstance() {
+        if (libraryStatus != null) {
+            throw libraryStatus;
+        }
+        return new SpProcessor();
+    }
+
+    /**
+     * Loads a SentencePiece model file into this processor.
+     *
+     * @param path file system path of the model file
+     */
+    public void loadModel(String path) {
+        SentencePieceLibrary.LIB.loadModel(getHandle(), path);
+    }
+
+    /**
+     * Tokenize a sentence into array of tokens.
+     *
+     * @param input sentence
+     * @return tokens
+     */
+    public String[] tokenize(String input) {
+        return SentencePieceLibrary.LIB.tokenize(getHandle(), input);
+    }
+
+    /**
+     * Build sentence from tokens.
+     *
+     * @param tokens input
+     * @return recovered sentence
+     */
+    public String buildSentence(String[] tokens) {
+        return SentencePieceLibrary.LIB.detokenize(getHandle(), tokens);
+    }
+
+    /**
+     * Get tokens from ID.
+     *
+     * @param id the index of token
+     * @return recovered token
+     */
+    public String getToken(int id) {
+        return SentencePieceLibrary.LIB.idToPiece(getHandle(), id);
+    }
+
+    /**
+     * Get ID from token.
+     *
+     * @param token token that ready to map
+     * @return id from token
+     */
+    public int getId(String token) {
+        return SentencePieceLibrary.LIB.pieceToId(getHandle(), token);
+    }
+
+    /**
+     * Encode sentence into indices.
+     *
+     * @param sentence input sentence
+     * @return indices
+     */
+    public int[] encode(String sentence) {
+        return SentencePieceLibrary.LIB.encode(getHandle(), sentence);
+    }
+
+    /**
+     * Decode indices into sentence.
+     *
+     * @param ids the indices
+     * @return recovered sentence
+     */
+    public String decode(int[] ids) {
+        return SentencePieceLibrary.LIB.decode(getHandle(), ids);
+    }
+
+    /**
+     * Releases the native processor.
+     *
+     * <p>Safe to call more than once: the handle is cleared atomically, so only the first call
+     * deletes the native object.
+     */
+    @Override
+    public void close() {
+        // getAndSet(null) rather than get(): leaving the pointer in place would make a second
+        // close() free the same native object again.
+        Long pointer = handle.getAndSet(null);
+        if (pointer != null) {
+            SentencePieceLibrary.LIB.deleteSentencePieceProcessor(pointer);
+        }
+    }
+}
--- a/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/utils/SpTokenizer.java
+++ b/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/utils/SpTokenizer.java
@ -0,0 +1,117 @@
+/*
+ * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ * with the License. A copy of the License is located at
+ *
+ * http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+ * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ */
+package me.calvin.example.utils;
+
+import ai.djl.modality.nlp.preprocess.Tokenizer;
+import me.calvin.example.utils.SpProcessor;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * {@code SpTokenizer} is a SentencePiece implementation of the {@link Tokenizer} interface that
+ * converts sentences into token.
+ *
+ * <p>The tokenizer owns a native {@link SpProcessor}; call {@link #close()} (or use
+ * try-with-resources) to release it.
+ */
+public class SpTokenizer implements Tokenizer, AutoCloseable {
+
+    private final SpProcessor processor;
+
+    /**
+     * Create a SentencePiece Tokenizer from existing models.
+     *
+     * @param modelPath the directory or file path of the model location
+     * @throws IOException when IO operation fails in loading a resource
+     */
+    public SpTokenizer(Path modelPath) throws IOException {
+        this(modelPath, null);
+    }
+
+    /**
+     * Create a SentencePiece Tokenizer from existing models.
+     *
+     * @param modelPath the directory or file path of the model location
+     * @param prefix the model file name or path prefix; may be {@code null}, in which case the
+     *     name of the model directory is used as the prefix
+     * @throws IOException when IO operation fails in loading a resource
+     */
+    public SpTokenizer(Path modelPath, String prefix) throws IOException {
+        this.processor = SpProcessor.newInstance();
+        try {
+            loadModel(modelPath, prefix);
+        } catch (IOException | RuntimeException e) {
+            // The caller never receives this instance, so nobody else can release the
+            // native processor that was just created.
+            processor.close();
+            throw e;
+        }
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public List<String> tokenize(String sentence) {
+        return Arrays.asList(processor.tokenize(sentence));
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public String buildSentence(List<String> tokens) {
+        return processor.buildSentence(tokens.toArray(new String[0]));
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public void close() {
+        processor.close();
+    }
+
+    /**
+     * Get SentencePiece processor.
+     *
+     * @return {@link SpProcessor}
+     */
+    public SpProcessor getProcessor() {
+        return processor;
+    }
+
+    /**
+     * Locates the model file and loads it into the processor.
+     *
+     * <p>The file is looked up with {@code prefix} first; if that yields nothing, the name of
+     * the model directory is tried as the prefix.
+     *
+     * @param modelPath the directory or file path of the model location
+     * @param prefix the model file name or prefix, may be {@code null}
+     * @throws FileNotFoundException if {@code modelPath} does not exist or no model file is found
+     */
+    private void loadModel(Path modelPath, String prefix) throws IOException {
+        if (Files.notExists(modelPath)) {
+            throw new FileNotFoundException(
+                    "Model path doesn't exist: " + modelPath.toAbsolutePath());
+        }
+        Path modelDir = modelPath.toAbsolutePath();
+        Path modelFile = findModelFile(modelDir, prefix);
+        if (modelFile == null) {
+            // TODO: support proto and IOStream model
+            modelFile = findModelFile(modelDir, modelDir.toFile().getName());
+            if (modelFile == null) {
+                throw new FileNotFoundException("No .model found in : " + modelPath);
+            }
+        }
+
+        String modelFilePath = modelFile.toString();
+        processor.loadModel(modelFilePath);
+    }
+
+    /**
+     * Resolves the model file for the given location and prefix.
+     *
+     * @param modelPath a model file, or a directory containing one
+     * @param prefix file name to look for inside the directory, with or without the
+     *     {@code .model} extension; may be {@code null}
+     * @return {@code modelPath} itself if it is a regular file; otherwise the matching file in
+     *     the directory, or {@code null} if none exists or {@code prefix} is {@code null}
+     */
+    private Path findModelFile(Path modelPath, String prefix) {
+        if (Files.isRegularFile(modelPath)) {
+            return modelPath;
+        }
+        if (prefix == null) {
+            // Path.resolve(null) would throw NullPointerException; report "not found" so the
+            // caller can fall back to the directory name.
+            return null;
+        }
+        Path modelFile = modelPath.resolve(prefix);
+        if (Files.notExists(modelFile) || !Files.isRegularFile(modelFile)) {
+            if (prefix.endsWith(".model")) {
+                return null;
+            }
+            modelFile = modelPath.resolve(prefix + ".model");
+            if (Files.notExists(modelFile) || !Files.isRegularFile(modelFile)) {
+                return null;
+            }
+        }
+        return modelFile;
+    }
+}
--- a/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/utils/SpVocabulary.java
+++ b/nlp_sdks/sentencepiece_sdk/src/main/java/me/calvin/example/utils/SpVocabulary.java
@ -0,0 +1,60 @@
+/*
+ * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ * with the License. A copy of the License is located at
+ *
+ * http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+ * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ */
+package me.calvin.example.utils;              
+
+import ai.djl.modality.nlp.Vocabulary;
+
+/** {@link SpVocabulary} is a SentencePiece implementation of {@link Vocabulary}. */
+public final class SpVocabulary implements Vocabulary {
+
+    private SpProcessor processor;
+
+    // TODO: Support direct Vocabulary loading
+    private SpVocabulary(SpProcessor processor) {
+        this.processor = processor;
+    }
+
+    /**
+     * Get Vocabulary from {@link SpTokenizer}.
+     *
+     * @param tokenizer the {@link SpTokenizer}
+     * @return {@link SpVocabulary}
+     */
+    public static SpVocabulary from(SpTokenizer tokenizer) {
+        return new SpVocabulary(tokenizer.getProcessor());
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public String getToken(long index) {
+        return processor.getToken((int) index);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean contains(String token) {
+        throw new UnsupportedOperationException("Not supported for Sentence Piece");
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public long getIndex(String token) {
+        return processor.getId(token);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public long size() {
+        throw new UnsupportedOperationException("Not supported for Sentence Piece");
+    }
+}
--- a/nlp_sdks/sentencepiece_sdk/src/main/resources/log4j2.xml
+++ b/nlp_sdks/sentencepiece_sdk/src/main/resources/log4j2.xml
@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Log4j 2 configuration: everything is written to the console. -->
+<Configuration status="INFO">
+  <Appenders>
+    <!-- Single appender writing to standard output. -->
+    <Console name="console" target="SYSTEM_OUT">
+      <!-- Each line is the bracketed level, " - ", then the message. -->
+      <PatternLayout
+          pattern="[%-5level] - %msg%n"/>
+    </Console>
+  </Appenders>
+  <Loggers>
+    <!-- Default for all loggers: info level, console output. -->
+    <Root level="info" additivity="false">
+      <AppenderRef ref="console"/>
+    </Root>
+    <!-- Loggers under me.calvin: the level is read from the system property
+         me.calvin.logging.level and falls back to info when it is not set.
+         additivity="false" keeps these events from also going through Root. -->
+    <Logger name="me.calvin" level="${sys:me.calvin.logging.level:-info}" additivity="false">
+      <AppenderRef ref="console"/>
+    </Logger>
+  </Loggers>
+</Configuration>