Skip to content

Fix TextChunker.SplitPlainTextLines to properly split on newlines regardless of token count #12557

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,38 @@ private sealed class StringListWithTokenCount(TextChunker.TokenCounter? tokenCou
/// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
/// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
/// <returns>List of lines.</returns>
public static List<string> SplitPlainTextLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null) =>
InternalSplitLines(text, maxTokensPerLine, trim: true, s_plaintextSplitOptions, tokenCounter);
public static List<string> SplitPlainTextLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null)
{
if (string.IsNullOrEmpty(text))
{
return [];
}

// Normalize line endings first
text = text.Replace("\r\n", "\n");

// Check if the text contains newlines and the total token count is within limits
var totalTokenCount = GetTokenCount(text, tokenCounter);
if ((text.Contains('\n') || text.Contains('\r')) && totalTokenCount <= maxTokensPerLine)
{
// Split on newlines regardless of token count, as the method name suggests line-based splitting
char[] separators = ['\n', '\r'];
var lines = text.Split(separators, StringSplitOptions.RemoveEmptyEntries);
var result = new List<string>();
foreach (var line in lines)
{
var trimmedLine = line.Trim();
if (!string.IsNullOrEmpty(trimmedLine))
{
result.Add(trimmedLine);
}
}
return result;
}

// Fall back to the original hierarchical splitting for long text or text without newlines
return InternalSplitLines(text, maxTokensPerLine, trim: true, s_plaintextSplitOptions, tokenCounter);
}

/// <summary>
/// Split markdown text into lines.
Expand Down
57 changes: 57 additions & 0 deletions dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.SemanticKernel.Text;
using Xunit;
Expand Down Expand Up @@ -777,4 +778,60 @@ public void CanSplitTextParagraphsWithOverlapAndHeaderAndCustomTokenCounter()

Assert.Equal(expected, result);
}

[Theory]
[InlineData("First line\r\nSecond line\r\nThird line")]
[InlineData("First line\nSecond line\nThird line")]
public void ActuallySplitsOnNewLines(string input)
{
var result = TextChunker.SplitPlainTextLines(input, 100);

var expected = new[]
{
"First line",
"Second line",
"Third line"
};

Assert.Equal(expected, result); // Currently fails
}

[Fact]
public void SplitPlainTextLinesHandlesEdgeCases()
{
// Test with low token limit to ensure hierarchical splitting still works
var result1 = TextChunker.SplitPlainTextLines("This is a very long line without any newlines that should be split using hierarchical approach", 5);
Assert.True(result1.Count > 1, "Long line without newlines should be split hierarchically");

// Test with high token limit and newlines - should split on newlines
var result2 = TextChunker.SplitPlainTextLines("Short line\nAnother short line", 100);
Assert.Equal(new[] { "Short line", "Another short line" }, result2);

// Test empty string
var result3 = TextChunker.SplitPlainTextLines("", 10);
Assert.Empty(result3);

// Test single line without newlines
var result4 = TextChunker.SplitPlainTextLines("Single line without newlines", 100);
Assert.Single(result4);
Assert.Equal("Single line without newlines", result4[0]);
}

[Fact]
public void SplitPlainTextLinesWorksWithMixedLineEndings()
{
// Test mixed line endings with different combinations
var result1 = TextChunker.SplitPlainTextLines("Line1\r\nLine2\nLine3\rLine4", 100);
Assert.Equal(new[] { "Line1", "Line2", "Line3", "Line4" }, result1);

// Test that token limits still work when splitting on newlines
var result2 = TextChunker.SplitPlainTextLines("Short\nLine", 2);
Assert.Equal(new[] { "Short", "Line" }, result2);

// Test that very long text still uses hierarchical splitting
var longText = string.Join("", Enumerable.Repeat("word ", 1000));
var result3 = TextChunker.SplitPlainTextLines(longText, 10);
Assert.True(result3.Count > 1, "Long text should be split hierarchically");
Assert.True(result3.All(s => !s.Contains('\n')), "Hierarchical split should not contain newlines");
}
}