diff --git a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs index d8f4a32b4e3c..d2e7453c2564 100644 --- a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs +++ b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs @@ -51,7 +51,6 @@ private sealed class StringListWithTokenCount(TextChunker.TokenCounter? tokenCou /// The number of tokens in the input string. public delegate int TokenCounter(string input); - private static readonly char[] s_spaceChar = [' ']; private static readonly string?[] s_plaintextSplitOptions = ["\n", ".。.", "?!", ";", ":", ",,、", ")]}", " ", "-", null]; private static readonly string?[] s_markdownSplitOptions = [".\u3002\uFF0E", "?!", ";", ":", ",\uFF0C\u3001", ")]}", " ", "-", "\n\r", null]; @@ -192,18 +191,11 @@ private static List ProcessParagraphs(List paragraphs, int adjus if (GetTokenCount(lastParagraph, tokenCounter) < adjustedMaxTokensPerParagraph / 4) { - var lastParagraphTokens = lastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries); - var secondLastParagraphTokens = secondLastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries); + var mergedParagraph = $"{secondLastParagraph} {lastParagraph}"; - var lastParagraphTokensCount = lastParagraphTokens.Length; - var secondLastParagraphTokensCount = secondLastParagraphTokens.Length; - - if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph) + if (GetTokenCount(mergedParagraph, tokenCounter) <= adjustedMaxTokensPerParagraph) { - var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens); - var newLastParagraph = string.Join(" ", lastParagraphTokens); - - paragraphs[paragraphs.Count - 2] = $"{newSecondLastParagraph} {newLastParagraph}"; + paragraphs[paragraphs.Count - 2] = mergedParagraph; paragraphs.RemoveAt(paragraphs.Count - 1); } } diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs index a31f077eef66..3bae6381951b 100644 --- a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs +++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs @@ -819,4 +819,19 @@ public void SplitPlainTextParagraphsSplitsWhenExceedingTokenLimit() Assert.Contains("Second line", combined); Assert.Contains("Third line", combined); } + + [Fact] + public void SplitPlainTextParagraphsDoesNotMergeOrphanChunkBeyondTokenLimit() + { + var lines = new[] + { + new string('a', 45), + new string('b', 10), + }; + + var result = TextChunker.SplitPlainTextParagraphs(lines, 52, tokenCounter: input => input.Length); + + Assert.Equal(2, result.Count); + Assert.All(result, paragraph => Assert.True(paragraph.Length <= 52)); + } }