Merged
4 changes: 3 additions & 1 deletion langextract/chunking.py
@@ -451,7 +451,9 @@ def __next__(self) -> TextChunk:
          curr_chunk.start_index, token_index + 1
      )
      if self._tokens_exceed_buffer(test_chunk):
-       if start_of_new_line > 0:
+       # Only break at newline if: 1) newline exists (> 0) and
+       # 2) it's after chunk start (prevents empty intervals)
+       if start_of_new_line > 0 and start_of_new_line > curr_chunk.start_index:
          # Terminate the curr_chunk at the start of the most recent newline.
          curr_chunk = create_token_interval(
              curr_chunk.start_index, start_of_new_line
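
Note: to see what the new guard prevents, here is a minimal sketch (not part of the PR) that drives the same public API the new test uses. That a newline can land exactly at the current chunk's start for this text and buffer size is inferred from the test below, not from library docs:

    # Sketch: exercise the boundary condition the guard handles.
    from langextract import chunking, tokenizer

    text = "First sentence.\nSecond sentence that is longer.\nThird sentence."
    tokenized_text = tokenizer.tokenize(text)

    # Pre-fix, a newline sitting at the current chunk's start made the code
    # call create_token_interval(i, i), which raises ValueError for an empty
    # interval. Post-fix, iteration completes with only non-empty intervals.
    for chunk in chunking.ChunkIterator(tokenized_text, max_char_buffer=20):
      interval = chunk.token_interval
      assert interval.start_index < interval.end_index
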
32 changes: 29 additions & 3 deletions tests/chunking_test.py
@@ -95,7 +95,6 @@ def test_sentence_with_multiple_newlines_and_right_interval(self):
        + "Mr\n\nBond\n\nasks why?"
    )
    tokenized_text = tokenizer.tokenize(text)
-   # To take the whole text
    chunk_interval = tokenizer.TokenInterval(
        start_index=0, end_index=len(tokenized_text.tokens)
    )
@@ -192,6 +191,33 @@ def test_long_token_gets_own_chunk(self):
    with self.assertRaises(StopIteration):
      next(chunk_iter)

+  def test_newline_at_chunk_boundary_does_not_create_empty_interval(self):
+    """Test that newlines at chunk boundaries don't create empty token intervals.
+
+    When a newline occurs exactly at a chunk boundary, the chunking algorithm
+    should not attempt to create an empty interval (where start_index == end_index).
+    This was causing a ValueError in create_token_interval().
+    """
+    text = "First sentence.\nSecond sentence that is longer.\nThird sentence."
+    tokenized_text = tokenizer.tokenize(text)
+
+    chunk_iter = chunking.ChunkIterator(tokenized_text, max_char_buffer=20)
+    chunks = list(chunk_iter)
+
+    for chunk in chunks:
+      self.assertLess(
+          chunk.token_interval.start_index,
+          chunk.token_interval.end_index,
+          "Chunk should have non-empty interval",
+      )
+
+    expected_intervals = [(0, 3), (3, 6), (6, 9), (9, 12)]
+    actual_intervals = [
+        (chunk.token_interval.start_index, chunk.token_interval.end_index)
+        for chunk in chunks
+    ]
+    self.assertEqual(actual_intervals, expected_intervals)
+
  def test_chunk_unicode_text(self):
    text = textwrap.dedent("""\
        Chief Complaint:
@@ -353,7 +379,7 @@ def test_make_batches_of_textchunk(
    self.assertListEqual(
        actual_batches,
        expected_batches,
-       "Batches do not match expected",
+       "Batched chunks should match expected structure",
    )


@@ -410,7 +436,7 @@ def test_multiple_chunks_with_additional_context(self):
    )
    chunks = list(chunk_iter)
    self.assertGreater(
-       len(chunks), 1, "Expected multiple chunks due to max_char_buffer limit"
+       len(chunks), 1, "Should create multiple chunks with small buffer"
    )
    additional_contexts = [chunk.additional_context for chunk in chunks]
    expected_additional_contexts = [self._ADDITIONAL_CONTEXT] * len(chunks)
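Note: for reference, a hedged reading of the expected intervals in test_newline_at_chunk_boundary_does_not_create_empty_interval above; the per-word token split is inferred from the test's assertions, not confirmed against the tokenizer:

    # Inferred 12-token split and grouping under max_char_buffer=20:
    #   tokens 0-2   "First sentence."       -> interval (0, 3)
    #   tokens 3-5   "Second sentence that"  -> interval (3, 6)
    #   tokens 6-8   "is longer."            -> interval (6, 9)
    #   tokens 9-11  "Third sentence."       -> interval (9, 12)
    #
    # The (3, 6) chunk is where the old code failed: the most recent newline
    # sits at token 3, which is also that chunk's start_index, so the old
    # `start_of_new_line > 0` check alone produced an empty (3, 3) interval.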