From d4e2f420ad9295dcba4757a6938178583fb3a64c Mon Sep 17 00:00:00 2001 From: pragnyanramtha Date: Fri, 15 May 2026 19:08:06 +0000 Subject: [PATCH] Fix TextChunker token-counted paragraph merge --- .../SemanticKernel.Core/Text/TextChunker.cs | 8 +++- .../Text/TextChunkerTests.cs | 39 +++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs index d8f4a32b4e3c..2e72066ac8d7 100644 --- a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs +++ b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs @@ -202,9 +202,13 @@ private static List ProcessParagraphs(List paragraphs, int adjus { var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens); var newLastParagraph = string.Join(" ", lastParagraphTokens); + var mergedParagraph = $"{newSecondLastParagraph} {newLastParagraph}"; - paragraphs[paragraphs.Count - 2] = $"{newSecondLastParagraph} {newLastParagraph}"; - paragraphs.RemoveAt(paragraphs.Count - 1); + if (GetTokenCount(mergedParagraph, tokenCounter) <= adjustedMaxTokensPerParagraph) + { + paragraphs[paragraphs.Count - 2] = mergedParagraph; + paragraphs.RemoveAt(paragraphs.Count - 1); + } } } } diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs index a31f077eef66..a5e7e1ee3c0f 100644 --- a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs +++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs @@ -558,6 +558,45 @@ public void CanSplitTextParagraphsWithCustomTokenCounter() Assert.Equal(expected, result); } + [Fact] + public void SplitPlainTextParagraphsWithCustomTokenCounterDoesNotMergePastTokenLimit() + { + List input = + [ + "abcdefghijklmnopqr", + "abc" + ]; + + var expected = new[] + { + "abcdefghijklmnopqr", + "abc" + }; + + var result = TextChunker.SplitPlainTextParagraphs(input, 20, tokenCounter: static (input) => input.Length); + + Assert.Equal(expected, result); + } + + [Fact] + public void SplitPlainTextParagraphsWithCustomTokenCounterStillMergesWhenCandidateFits() + { + List input = + [ + "abcdefghijklmnop", + "abc" + ]; + + var expected = new[] + { + "abcdefghijklmnop abc" + }; + + var result = TextChunker.SplitPlainTextParagraphs(input, 20, tokenCounter: static (input) => input.Length); + + Assert.Equal(expected, result); + } + [Fact] public void CanSplitTextParagraphsWithOverlapAndCustomTokenCounter() {