Skip to content

Commit 55132da

Browse files
shethaaditAdit Shethkyle-rader-msftwestey-mmarkwallace-microsoft
authored
.Net: Fix TextChunker.SplitPlainTextParagraphs to handle embedded newlines in input strings (#12558)
## Description ### Summary Fixes issue #12556 where `TextChunker.SplitPlainTextParagraphs` does not properly handle embedded newlines in input strings. ### Problem The `SplitPlainTextParagraphs` method had two issues: 1. **Incorrect separator**: Used `"\n\r"` (LF+CR) which is not a standard line ending format - should be `"\r\n"` (CR+LF) for Windows or `"\n"` for Unix 2. **No embedded newline handling**: When input strings contained embedded newlines, they were not split into separate lines for processing This caused the method to process text with embedded newlines as single units instead of handling each line separately. ### Solution - Modified `s_plaintextSplitOptions` array to use `"\n"` as the separator for proper newline recognition - Modified `SplitPlainTextParagraphs` to use `SelectMany` with `Split('\n')` to handle embedded newlines - Added normalization of all newline formats (`\r\n`, `\r`, `\n`) to ensure consistent handling - Lines are split before processing but may be recombined based on token limits (expected behavior) ## Changes - **Modified**: `s_plaintextSplitOptions` array to use correct newline separator - **Modified**: `SplitPlainTextParagraphs` method to split embedded newlines before processing - **Preserved**: Existing paragraph grouping behavior based on token limits ## Testing - ✅ Fixes handling of embedded newlines in input strings - ✅ All existing tests continue to pass, including `CanSplitTextParagraphsOnNewlines` - ✅ Maintains backward compatibility for paragraph splitting behavior --------- Co-authored-by: Adit Sheth <adsheth@microsoft.com> Co-authored-by: Kyle Rader <126627085+kyle-rader-msft@users.noreply.github.com> Co-authored-by: westey <164392973+westey-m@users.noreply.github.com> Co-authored-by: Mark Wallace <127216156+markwallace-microsoft@users.noreply.github.com>
1 parent b11c621 commit 55132da

File tree

2 files changed

+58
-3
lines changed

2 files changed

+58
-3
lines changed

dotnet/src/SemanticKernel.Core/Text/TextChunker.cs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ private sealed class StringListWithTokenCount(TextChunker.TokenCounter? tokenCou
5252
public delegate int TokenCounter(string input);
5353

5454
private static readonly char[] s_spaceChar = [' '];
55-
private static readonly string?[] s_plaintextSplitOptions = ["\n\r", ".。.", "?!", ";", ":", ",,、", ")]}", " ", "-", null];
55+
private static readonly string?[] s_plaintextSplitOptions = ["\n", ".。.", "?!", ";", ":", ",,、", ")]}", " ", "-", null];
5656
private static readonly string?[] s_markdownSplitOptions = [".\u3002\uFF0E", "?!", ";", ":", ",\uFF0C\u3001", ")]}", " ", "-", "\n\r", null];
5757

5858
/// <summary>
@@ -84,8 +84,21 @@ public static List<string> SplitMarkDownLines(string text, int maxTokensPerLine,
8484
/// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
8585
/// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
8686
/// <returns>List of paragraphs.</returns>
87-
public static List<string> SplitPlainTextParagraphs(IEnumerable<string> lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null) =>
88-
InternalSplitTextParagraphs(lines, maxTokensPerParagraph, overlapTokens, chunkHeader, static (text, maxTokens, tokenCounter) => InternalSplitLines(text, maxTokens, trim: false, s_plaintextSplitOptions, tokenCounter), tokenCounter);
87+
public static List<string> SplitPlainTextParagraphs(
88+
IEnumerable<string> lines,
89+
int maxTokensPerParagraph,
90+
int overlapTokens = 0,
91+
string? chunkHeader = null,
92+
TokenCounter? tokenCounter = null) =>
93+
InternalSplitTextParagraphs(
94+
lines.Select(line => line
95+
.Replace("\r\n", "\n")
96+
.Replace('\r', '\n')),
97+
maxTokensPerParagraph,
98+
overlapTokens,
99+
chunkHeader,
100+
static (text, maxTokens, tokenCounter) => InternalSplitLines(text, maxTokens, trim: false, s_plaintextSplitOptions, tokenCounter),
101+
tokenCounter);
89102

90103
/// <summary>
91104
/// Split markdown text into paragraphs.

dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,4 +777,46 @@ public void CanSplitTextParagraphsWithOverlapAndHeaderAndCustomTokenCounter()
777777

778778
Assert.Equal(expected, result);
779779
}
780+
781+
[Fact]
782+
public void SplitPlainTextParagraphsHandlesExampleFromIssue()
783+
{
784+
var lines = new[] { "First line\nSecond line\nThird line" };
785+
786+
var result = TextChunker.SplitPlainTextParagraphs(lines, 100);
787+
788+
Assert.Equal("First line\nSecond line\nThird line", result[0]);
789+
}
790+
791+
[Theory]
792+
[InlineData("First line\r\nSecond line\r\nThird line")]
793+
[InlineData("First line\nSecond line\nThird line")]
794+
[InlineData("First line\rSecond line\rThird line")]
795+
public void SplitPlainTextParagraphsNormalizesNewlinesButDoesNotSplit(string input)
796+
{
797+
var lines = new[] { input };
798+
799+
var result = TextChunker.SplitPlainTextParagraphs(lines, 100);
800+
801+
Assert.Single(result);
802+
Assert.DoesNotContain('\r', result[0]);
803+
Assert.Contains("First line", result[0]);
804+
Assert.Contains("Second line", result[0]);
805+
Assert.Contains("Third line", result[0]);
806+
}
807+
808+
[Fact]
809+
public void SplitPlainTextParagraphsSplitsWhenExceedingTokenLimit()
810+
{
811+
var lines = new[] { "First line\nSecond line\nThird line" };
812+
813+
var result = TextChunker.SplitPlainTextParagraphs(lines, 5);
814+
815+
Assert.True(result.Count > 1);
816+
817+
var combined = string.Join(" ", result);
818+
Assert.Contains("First line", combined);
819+
Assert.Contains("Second line", combined);
820+
Assert.Contains("Third line", combined);
821+
}
780822
}

0 commit comments

Comments
 (0)