feat: add SimpleTokenizeSplitter

This commit is contained in:
Michael Yang 2024-07-03 17:54:10 +08:00
parent c32fcb97ba
commit 0f5faef56d
2 changed files with 21 additions and 3 deletions

View File

@ -105,18 +105,36 @@ public class SimpleTokenizeSplitter implements DocumentSplitter {
while (currentIndex < maxIndex) {
int endIndex = Math.min(currentIndex + chunkSize, maxIndex);
List<Integer> chunkTokens = tokens.subList(currentIndex, endIndex);
currentIndex = currentIndex + chunkSize - overlapSize;
IntArrayList intArrayList = new IntArrayList();
for (Integer chunkToken : chunkTokens) {
intArrayList.add(chunkToken);
}
String chunkText = encoding.decode(intArrayList).trim();
if (chunkText.isEmpty()) {
continue;
}
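// U+FFFD (0xFFFD in hex, 65533 in decimal) is the Unicode replacement character.
// Fix for garbled Chinese text: when the decoded chunk starts or ends with U+FFFD, widen the token window by one token on that side and decode again.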
boolean firstIsReplacement = chunkText.charAt(0) == 65533;
boolean lastIsReplacement = chunkText.charAt(chunkText.length() - 1) == 65533;
if (firstIsReplacement || lastIsReplacement) {
if (firstIsReplacement) currentIndex -= 1;
if (lastIsReplacement) endIndex += 1;
chunkTokens = tokens.subList(currentIndex, endIndex);
intArrayList = new IntArrayList();
for (Integer chunkToken : chunkTokens) {
intArrayList.add(chunkToken);
}
chunkText = encoding.decode(intArrayList).trim();
}
currentIndex = currentIndex + chunkSize - overlapSize;
Document newDocument = new Document();
newDocument.addMetadata(document.getMetadataMap());
newDocument.setContent(chunkText);

View File

@ -40,7 +40,7 @@ public class SimpleTokenizeSplitterTest {
@Test
public void test02() {
SimpleTokenizeSplitter splitter = new SimpleTokenizeSplitter(20, 3);
SimpleTokenizeSplitter splitter = new SimpleTokenizeSplitter(20, 4);
List<Document> chunks = splitter.split(Document.of(text));
for (Document chunk : chunks) {