From 6af8de06203039e7c75d8425129d988e3c7b35b1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:15:28 +0000 Subject: [PATCH 1/4] Initial plan for issue From 23278b9df152b48c8b2c71f8939a1a217a877d1b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:20:18 +0000 Subject: [PATCH 2/4] Add failing test to reproduce TextChunker newline splitting issue Co-authored-by: shethaadit <10839617+shethaadit@users.noreply.github.com> --- dotnet/global.json | 2 +- .../Text/TextChunkerTests.cs | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/dotnet/global.json b/dotnet/global.json index e00bec377198..43376b545c46 100644 --- a/dotnet/global.json +++ b/dotnet/global.json @@ -1,6 +1,6 @@ { "sdk": { - "version": "9.0.300", + "version": "8.0.116", "rollForward": "latestMajor", "allowPrerelease": false } diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs index 807282a2778a..a114c52842eb 100644 --- a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs +++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs @@ -777,4 +777,21 @@ public void CanSplitTextParagraphsWithOverlapAndHeaderAndCustomTokenCounter() Assert.Equal(expected, result); } + + [Theory] + [InlineData("First line\r\nSecond line\r\nThird line")] + [InlineData("First line\nSecond line\nThird line")] + public void ActuallySplitsOnNewLines(string input) + { + var result = TextChunker.SplitPlainTextLines(input, 100); + + var expected = new[] + { + "First line", + "Second line", + "Third line" + }; + + Assert.Equal(expected, result); // Currently fails + } } From 33ad4a8ec0ca46572557e5bcbb69715b928ae85e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:27:04 +0000 Subject: [PATCH 3/4] Fix SplitPlainTextLines to split on newlines when token count is low Co-authored-by: shethaadit <10839617+shethaadit@users.noreply.github.com> --- .../SemanticKernel.Core/Text/TextChunker.cs | 35 +++++++++++++++++-- .../Text/TextChunkerTests.cs | 23 +++++++++++- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs index 333528bf5e50..96342c958a49 100644 --- a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs +++ b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs @@ -52,7 +52,7 @@ private sealed class StringListWithTokenCount(TextChunker.TokenCounter? tokenCou public delegate int TokenCounter(string input); private static readonly char[] s_spaceChar = [' ']; - private static readonly string?[] s_plaintextSplitOptions = ["\n\r", ".。.", "?!", ";", ":", ",,、", ")]}", " ", "-", null]; + private static readonly string?[] s_plaintextSplitOptions = ["\n", ".。.", "?!", ";", ":", ",,、", ")]}", " ", "-", null]; private static readonly string?[] s_markdownSplitOptions = [".\u3002\uFF0E", "?!", ";", ":", ",\uFF0C\u3001", ")]}", " ", "-", "\n\r", null]; /// @@ -62,8 +62,37 @@ private sealed class StringListWithTokenCount(TextChunker.TokenCounter? tokenCou /// Maximum number of tokens per line. /// Function to count tokens in a string. If not supplied, the default counter will be used. /// List of lines. - public static List SplitPlainTextLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null) => - InternalSplitLines(text, maxTokensPerLine, trim: true, s_plaintextSplitOptions, tokenCounter); + public static List SplitPlainTextLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null) + { + if (string.IsNullOrEmpty(text)) + { + return []; + } + + // Normalize line endings first + text = text.Replace("\r\n", "\n"); + + // Check if the text contains newlines and the total token count is within limits + var totalTokenCount = GetTokenCount(text, tokenCounter); + if (text.Contains('\n') && totalTokenCount <= maxTokensPerLine) + { + // Split on newlines regardless of token count, as the method name suggests line-based splitting + var lines = text.Split('\n', StringSplitOptions.RemoveEmptyEntries); + var result = new List(); + foreach (var line in lines) + { + var trimmedLine = line.Trim(); + if (!string.IsNullOrEmpty(trimmedLine)) + { + result.Add(trimmedLine); + } + } + return result; + } + + // Fall back to the original hierarchical splitting for long text or text without newlines + return InternalSplitLines(text, maxTokensPerLine, trim: true, s_plaintextSplitOptions, tokenCounter); + } /// /// Split markdown text into lines. diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs index a114c52842eb..0deb64d379a9 100644 --- a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs +++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs @@ -788,10 +788,31 @@ public void ActuallySplitsOnNewLines(string input) var expected = new[] { "First line", - "Second line", + "Second line", "Third line" }; Assert.Equal(expected, result); // Currently fails } + + [Fact] + public void SplitPlainTextLinesHandlesEdgeCases() + { + // Test with low token limit to ensure hierarchical splitting still works + var result1 = TextChunker.SplitPlainTextLines("This is a very long line without any newlines that should be split using hierarchical approach", 5); + Assert.True(result1.Count > 1, "Long line without newlines should be split hierarchically"); + + // Test with high token limit and newlines - should split on newlines + var result2 = TextChunker.SplitPlainTextLines("Short line\nAnother short line", 100); + Assert.Equal(new[] { "Short line", "Another short line" }, result2); + + // Test empty string + var result3 = TextChunker.SplitPlainTextLines("", 10); + Assert.Empty(result3); + + // Test single line without newlines + var result4 = TextChunker.SplitPlainTextLines("Single line without newlines", 100); + Assert.Single(result4); + Assert.Equal("Single line without newlines", result4[0]); + } } From 7bdd7b6a499823124768786d96f0bc6ec45fb937 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Jun 2025 23:31:59 +0000 Subject: [PATCH 4/4] Complete TextChunker fix with comprehensive tests and backward compatibility Co-authored-by: shethaadit <10839617+shethaadit@users.noreply.github.com> --- dotnet/global.json | 2 +- .../SemanticKernel.Core/Text/TextChunker.cs | 7 ++++--- .../Text/TextChunkerTests.cs | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/dotnet/global.json b/dotnet/global.json index 43376b545c46..e00bec377198 100644 --- a/dotnet/global.json +++ b/dotnet/global.json @@ -1,6 +1,6 @@ { "sdk": { - "version": "8.0.116", + "version": "9.0.300", "rollForward": "latestMajor", "allowPrerelease": false } diff --git a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs index 96342c958a49..90b600584e4d 100644 --- a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs +++ b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs @@ -52,7 +52,7 @@ private sealed class StringListWithTokenCount(TextChunker.TokenCounter? tokenCou public delegate int TokenCounter(string input); private static readonly char[] s_spaceChar = [' ']; - private static readonly string?[] s_plaintextSplitOptions = ["\n", ".。.", "?!", ";", ":", ",,、", ")]}", " ", "-", null]; + private static readonly string?[] s_plaintextSplitOptions = ["\n\r", ".。.", "?!", ";", ":", ",,、", ")]}", " ", "-", null]; private static readonly string?[] s_markdownSplitOptions = [".\u3002\uFF0E", "?!", ";", ":", ",\uFF0C\u3001", ")]}", " ", "-", "\n\r", null]; /// @@ -74,10 +74,11 @@ public static List SplitPlainTextLines(string text, int maxTokensPerLine // Check if the text contains newlines and the total token count is within limits var totalTokenCount = GetTokenCount(text, tokenCounter); - if (text.Contains('\n') && totalTokenCount <= maxTokensPerLine) + if ((text.Contains('\n') || text.Contains('\r')) && totalTokenCount <= maxTokensPerLine) { // Split on newlines regardless of token count, as the method name suggests line-based splitting - var lines = text.Split('\n', StringSplitOptions.RemoveEmptyEntries); + char[] separators = ['\n', '\r']; + var lines = text.Split(separators, StringSplitOptions.RemoveEmptyEntries); var result = new List(); foreach (var line in lines) { diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs index 0deb64d379a9..294bf2be21d2 100644 --- a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs +++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs @@ -2,6 +2,7 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Text; using Microsoft.SemanticKernel.Text; using Xunit; @@ -815,4 +816,22 @@ public void SplitPlainTextLinesHandlesEdgeCases() Assert.Single(result4); Assert.Equal("Single line without newlines", result4[0]); } + + [Fact] + public void SplitPlainTextLinesWorksWithMixedLineEndings() + { + // Test mixed line endings with different combinations + var result1 = TextChunker.SplitPlainTextLines("Line1\r\nLine2\nLine3\rLine4", 100); + Assert.Equal(new[] { "Line1", "Line2", "Line3", "Line4" }, result1); + + // Test that token limits still work when splitting on newlines + var result2 = TextChunker.SplitPlainTextLines("Short\nLine", 2); + Assert.Equal(new[] { "Short", "Line" }, result2); + + // Test that very long text still uses hierarchical splitting + var longText = string.Join("", Enumerable.Repeat("word ", 1000)); + var result3 = TextChunker.SplitPlainTextLines(longText, 10); + Assert.True(result3.Count > 1, "Long text should be split hierarchically"); + Assert.True(result3.All(s => !s.Contains('\n')), "Hierarchical split should not contain newlines"); + } }