diff --git a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
index d8f4a32b4e3c..d2e7453c2564 100644
--- a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
+++ b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
@@ -51,7 +51,6 @@ private sealed class StringListWithTokenCount(TextChunker.TokenCounter? tokenCou
/// The number of tokens in the input string.
public delegate int TokenCounter(string input);
- private static readonly char[] s_spaceChar = [' '];
private static readonly string?[] s_plaintextSplitOptions = ["\n", ".。.", "?!", ";", ":", ",,、", ")]}", " ", "-", null];
private static readonly string?[] s_markdownSplitOptions = [".\u3002\uFF0E", "?!", ";", ":", ",\uFF0C\u3001", ")]}", " ", "-", "\n\r", null];
@@ -192,18 +191,11 @@ private static List ProcessParagraphs(List paragraphs, int adjus
if (GetTokenCount(lastParagraph, tokenCounter) < adjustedMaxTokensPerParagraph / 4)
{
- var lastParagraphTokens = lastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
- var secondLastParagraphTokens = secondLastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
+ var mergedParagraph = $"{secondLastParagraph} {lastParagraph}";
- var lastParagraphTokensCount = lastParagraphTokens.Length;
- var secondLastParagraphTokensCount = secondLastParagraphTokens.Length;
-
- if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph)
+ if (GetTokenCount(mergedParagraph, tokenCounter) <= adjustedMaxTokensPerParagraph)
{
- var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens);
- var newLastParagraph = string.Join(" ", lastParagraphTokens);
-
- paragraphs[paragraphs.Count - 2] = $"{newSecondLastParagraph} {newLastParagraph}";
+ paragraphs[paragraphs.Count - 2] = mergedParagraph;
paragraphs.RemoveAt(paragraphs.Count - 1);
}
}
diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
index a31f077eef66..3bae6381951b 100644
--- a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
+++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
@@ -819,4 +819,19 @@ public void SplitPlainTextParagraphsSplitsWhenExceedingTokenLimit()
Assert.Contains("Second line", combined);
Assert.Contains("Third line", combined);
}
+
+    [Fact]
+    public void SplitPlainTextParagraphsDoesNotMergeOrphanChunkBeyondTokenLimit()
+    {
+        // The counter charges one token per character, so joining the 45- and 10-character
+        // lines with a single space would need 56 tokens, which is over the limit of 52.
+        const int MaxTokensPerParagraph = 52;
+        string[] input = { new string('a', 45), new string('b', 10) };
+
+        var chunks = TextChunker.SplitPlainTextParagraphs(input, MaxTokensPerParagraph, tokenCounter: text => text.Length);
+
+        // Both lines must come back as separate paragraphs, each within the limit.
+        Assert.Equal(2, chunks.Count);
+        Assert.All(chunks, chunk => Assert.True(chunk.Length <= MaxTokensPerParagraph));
+    }
}