.Net: Fix TextChunker.SplitPlainTextParagraphs to handle embedded newlines in input strings (#12558)

shethaadit · Adit Sheth · kyle-rader-msft · web-flow · commit 55132da6ec6b · 2025-06-25T14:26:17.000Z
## Description ### Summary Fixes issue #12556 where `TextChunker.SplitPlainTextParagraphs` does not properly handle embedded newlines in input strings. ### Problem The `SplitPlainTextParagraphs` method had two issues: 1. **Incorrect separator**: Used `"\n\r"` (LF+CR) which is not a standard line ending format - should be `"\r\n"` (CR+LF) for Windows or `"\n"` for Unix. 2. **No embedded newline handling**: When input strings contained embedded newlines, they were not split into separate lines for processing. This caused the method to process text with embedded newlines as single units instead of handling each line separately. ### Solution - Modified `s_plaintextSplitOptions` array to use `"\n"` as the separator for proper newline recognition - Modified `SplitPlainTextParagraphs` to pass every input line through `Select` with `Replace` before processing, so embedded newlines reach the splitter in a single form - Added normalization of `\r\n` and `\r` to `\n` to ensure consistent handling - Input lines are not pre-split on newlines; the text is divided only when it exceeds the token limit, as the added tests show (expected behavior) ## Changes - **Modified**: `s_plaintextSplitOptions` array to use correct newline separator - **Modified**: `SplitPlainTextParagraphs` method to normalize embedded newlines before processing - **Preserved**: Existing paragraph grouping behavior based on token limits ## Testing - ✅ Fixes handling of embedded newlines in input strings - ✅ All existing tests continue to pass, including `CanSplitTextParagraphsOnNewlines` - ✅ Maintains backward compatibility for paragraph splitting behavior --------- Co-authored-by: Adit Sheth <adsheth@microsoft.com> Co-authored-by: Kyle Rader <126627085+kyle-rader-msft@users.noreply.github.com> Co-authored-by: westey <164392973+westey-m@users.noreply.github.com> Co-authored-by: Mark Wallace <127216156+markwallace-microsoft@users.noreply.github.com>
diff --git a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
@@ -52,7 +52,7 @@ private sealed class StringListWithTokenCount(TextChunker.TokenCounter? tokenCou
     public delegate int TokenCounter(string input);
 
     private static readonly char[] s_spaceChar = [' '];
-    private static readonly string?[] s_plaintextSplitOptions = ["\n\r", ".。．", "?!", ";", ":", ",，、", ")]}", " ", "-", null];
+    private static readonly string?[] s_plaintextSplitOptions = ["\n", ".。．", "?!", ";", ":", ",，、", ")]}", " ", "-", null];
     private static readonly string?[] s_markdownSplitOptions = [".\u3002\uFF0E", "?!", ";", ":", ",\uFF0C\u3001", ")]}", " ", "-", "\n\r", null];
 
     /// <summary>
@@ -84,8 +84,21 @@ public static List<string> SplitMarkDownLines(string text, int maxTokensPerLine,
     /// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
     /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
     /// <returns>List of paragraphs.</returns>
-    public static List<string> SplitPlainTextParagraphs(IEnumerable<string> lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null) =>
-        InternalSplitTextParagraphs(lines, maxTokensPerParagraph, overlapTokens, chunkHeader, static (text, maxTokens, tokenCounter) => InternalSplitLines(text, maxTokens, trim: false, s_plaintextSplitOptions, tokenCounter), tokenCounter);
+    public static List<string> SplitPlainTextParagraphs(
+        IEnumerable<string> lines,
+        int maxTokensPerParagraph,
+        int overlapTokens = 0,
+        string? chunkHeader = null,
+        TokenCounter? tokenCounter = null) =>
+        InternalSplitTextParagraphs(
+            // Rewrite CRLF and lone CR as LF so "\n" is the only newline form handed to the splitter.
+            lines.Select(static rawLine => rawLine.Replace("\r\n", "\n").Replace('\r', '\n')),
+            maxTokensPerParagraph,
+            overlapTokens,
+            chunkHeader,
+            static (text, maxTokens, counter) =>
+                InternalSplitLines(text, maxTokens, trim: false, s_plaintextSplitOptions, counter),
+            tokenCounter);
 
     /// <summary>
     /// Split markdown text into paragraphs.
diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
@@ -777,4 +777,46 @@ public void CanSplitTextParagraphsWithOverlapAndHeaderAndCustomTokenCounter()
 
         Assert.Equal(expected, result);
     }
+
+    [Fact]
+    public void SplitPlainTextParagraphsHandlesExampleFromIssue()
+    {
+        // One input line carrying embedded newlines, chunked with a 100-token paragraph limit.
+        var lines = new[] { "First line\nSecond line\nThird line" };
+        var result = TextChunker.SplitPlainTextParagraphs(lines, 100);
+        // Assert.Single also fails on any extra paragraph, which indexing result[0] would let through.
+        Assert.Equal("First line\nSecond line\nThird line", Assert.Single(result));
+    }
+
+    [Theory]
+    [InlineData("First line\r\nSecond line\r\nThird line")]
+    [InlineData("First line\nSecond line\nThird line")]
+    [InlineData("First line\rSecond line\rThird line")]
+    public void SplitPlainTextParagraphsNormalizesNewlinesButDoesNotSplit(string input)
+    {
+        // Each newline flavor is passed as a single input line with a 100-token paragraph limit.
+        var chunks = TextChunker.SplitPlainTextParagraphs(new[] { input }, 100);
+
+        // Exactly one paragraph must come back, free of carriage returns and holding all three lines.
+        var paragraph = Assert.Single(chunks);
+        Assert.DoesNotContain('\r', paragraph);
+        Assert.Contains("First line", paragraph);
+        Assert.Contains("Second line", paragraph);
+        Assert.Contains("Third line", paragraph);
+    }
+
+    [Fact]
+    public void SplitPlainTextParagraphsSplitsWhenExceedingTokenLimit()
+    {
+        // The same three-line input, this time with a paragraph limit of only 5 tokens.
+        var input = new[] { "First line\nSecond line\nThird line" };
+        var chunks = TextChunker.SplitPlainTextParagraphs(input, 5);
+
+        Assert.True(chunks.Count > 1);
+        // Re-join the chunks with spaces and confirm every original line is still present.
+        var rejoined = string.Join(" ", chunks);
+        Assert.Contains("First line", rejoined);
+        Assert.Contains("Second line", rejoined);
+        Assert.Contains("Third line", rejoined);
+    }
 }