Merged
4 changes: 3 additions & 1 deletion langextract/chunking.py
@@ -451,7 +451,9 @@ def __next__(self) -> TextChunk:
          curr_chunk.start_index, token_index + 1
      )
      if self._tokens_exceed_buffer(test_chunk):
-       if start_of_new_line > 0:
+       # Only break at newline if: 1) newline exists (> 0) and
+       # 2) it's after chunk start (prevents empty intervals)
+       if start_of_new_line > 0 and start_of_new_line > curr_chunk.start_index:
          # Terminate the curr_chunk at the start of the most recent newline.
          curr_chunk = create_token_interval(
              curr_chunk.start_index, start_of_new_line
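
Note: to see what the new guard prevents, here is a minimal sketch (not part of the PR) that drives the same public API the new test uses. That a newline can land exactly at the current chunk's start for this text and buffer size is inferred from the test below, not from library docs:

    # Sketch: exercise the boundary condition the guard handles.
    from langextract import chunking, tokenizer

    text = "First sentence.\nSecond sentence that is longer.\nThird sentence."
    tokenized_text = tokenizer.tokenize(text)

    # Pre-fix, a newline sitting at the current chunk's start made the code
    # call create_token_interval(i, i), which raises ValueError for an empty
    # interval. Post-fix, iteration completes with only non-empty intervals.
    for chunk in chunking.ChunkIterator(tokenized_text, max_char_buffer=20):
      interval = chunk.token_interval
      assert interval.start_index < interval.end_index
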
32 changes: 29 additions & 3 deletions tests/chunking_test.py
@@ -95,7 +95,6 @@ def test_sentence_with_multiple_newlines_and_right_interval(self):
        + "Mr\n\nBond\n\nasks why?"
    )
    tokenized_text = tokenizer.tokenize(text)
-   # To take the whole text
    chunk_interval = tokenizer.TokenInterval(
        start_index=0, end_index=len(tokenized_text.tokens)
    )
@@ -192,6 +191,33 @@ def test_long_token_gets_own_chunk(self):
    with self.assertRaises(StopIteration):
      next(chunk_iter)

+  def test_newline_at_chunk_boundary_does_not_create_empty_interval(self):
+    """Test that newlines at chunk boundaries don't create empty token intervals.
+
+    When a newline occurs exactly at a chunk boundary, the chunking algorithm
+    should not attempt to create an empty interval (where start_index == end_index).
+    This was causing a ValueError in create_token_interval().
+    """
+    text = "First sentence.\nSecond sentence that is longer.\nThird sentence."
+    tokenized_text = tokenizer.tokenize(text)
+
+    chunk_iter = chunking.ChunkIterator(tokenized_text, max_char_buffer=20)
+    chunks = list(chunk_iter)
+
+    for chunk in chunks:
+      self.assertLess(
+          chunk.token_interval.start_index,
+          chunk.token_interval.end_index,
+          "Chunk should have non-empty interval",
+      )
+
+    expected_intervals = [(0, 3), (3, 6), (6, 9), (9, 12)]
+    actual_intervals = [
+        (chunk.token_interval.start_index, chunk.token_interval.end_index)
+        for chunk in chunks
+    ]
+    self.assertEqual(actual_intervals, expected_intervals)
+
  def test_chunk_unicode_text(self):
    text = textwrap.dedent("""\
        Chief Complaint:
@@ -353,7 +379,7 @@ def test_make_batches_of_textchunk(
    self.assertListEqual(
        actual_batches,
        expected_batches,
-       "Batches do not match expected",
+       "Batched chunks should match expected structure",
    )


@@ -410,7 +436,7 @@ def test_multiple_chunks_with_additional_context(self):
    )
    chunks = list(chunk_iter)
    self.assertGreater(
-       len(chunks), 1, "Expected multiple chunks due to max_char_buffer limit"
+       len(chunks), 1, "Should create multiple chunks with small buffer"
    )
    additional_contexts = [chunk.additional_context for chunk in chunks]
    expected_additional_contexts = [self._ADDITIONAL_CONTEXT] * len(chunks)
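Note: for reference, a hedged reading of the expected intervals in test_newline_at_chunk_boundary_does_not_create_empty_interval above; the per-word token split is inferred from the test's assertions, not confirmed against the tokenizer:

    # Inferred 12-token split and grouping under max_char_buffer=20:
    #   tokens 0-2   "First sentence."       -> interval (0, 3)
    #   tokens 3-5   "Second sentence that"  -> interval (3, 6)
    #   tokens 6-8   "is longer."            -> interval (6, 9)
    #   tokens 9-11  "Third sentence."       -> interval (9, 12)
    #
    # The (3, 6) chunk is where the old code failed: the most recent newline
    # sits at token 3, which is also that chunk's start_index, so the old
    # `start_of_new_line > 0` check alone produced an empty (3, 3) interval.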