diff --git a/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleTokenizeSplitter.java b/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleTokenizeSplitter.java index 346675b..05e92e6 100644 --- a/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleTokenizeSplitter.java +++ b/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleTokenizeSplitter.java @@ -105,18 +105,36 @@ public class SimpleTokenizeSplitter implements DocumentSplitter { while (currentIndex < maxIndex) { int endIndex = Math.min(currentIndex + chunkSize, maxIndex); List chunkTokens = tokens.subList(currentIndex, endIndex); - currentIndex = currentIndex + chunkSize - overlapSize; IntArrayList intArrayList = new IntArrayList(); for (Integer chunkToken : chunkTokens) { intArrayList.add(chunkToken); } String chunkText = encoding.decode(intArrayList).trim(); - if (chunkText.isEmpty()) { continue; } + //The Unicode replacement character is U+FFFD (0xFFFD in hex, 65533 in decimal). 
+ //fix: resolve garbled (mojibake) Chinese text in decoded chunks + boolean firstIsReplacement = chunkText.charAt(0) == 65533; + boolean lastIsReplacement = chunkText.charAt(chunkText.length() - 1) == 65533; + + if (firstIsReplacement || lastIsReplacement) { + if (firstIsReplacement) currentIndex -= 1; + if (lastIsReplacement) endIndex += 1; + + chunkTokens = tokens.subList(currentIndex, endIndex); + intArrayList = new IntArrayList(); + for (Integer chunkToken : chunkTokens) { + intArrayList.add(chunkToken); + } + + chunkText = encoding.decode(intArrayList).trim(); + } + + currentIndex = currentIndex + chunkSize - overlapSize; + Document newDocument = new Document(); newDocument.addMetadata(document.getMetadataMap()); newDocument.setContent(chunkText); diff --git a/agents-flex-core/src/test/java/com/agentsflex/core/test/splitter/SimpleTokenizeSplitterTest.java b/agents-flex-core/src/test/java/com/agentsflex/core/test/splitter/SimpleTokenizeSplitterTest.java index 9bde851..3d6c371 100644 --- a/agents-flex-core/src/test/java/com/agentsflex/core/test/splitter/SimpleTokenizeSplitterTest.java +++ b/agents-flex-core/src/test/java/com/agentsflex/core/test/splitter/SimpleTokenizeSplitterTest.java @@ -40,7 +40,7 @@ public class SimpleTokenizeSplitterTest { @Test public void test02() { - SimpleTokenizeSplitter splitter = new SimpleTokenizeSplitter(20, 3); + SimpleTokenizeSplitter splitter = new SimpleTokenizeSplitter(20, 4); List chunks = splitter.split(Document.of(text)); for (Document chunk : chunks) {