fix: properly handle the case when an element's text is None (#3995)

badGarnet · web-flow · commit b814ece39f5a · 2025-05-05T18:08:11.000Z
Some elements, like `Image`, can have `None` as the value of their
`text` attribute. In that case the current chunking logic fails because
it expects the field to always be a string that has a length and can be
split. The fix is to use `element.text or ""` when checking the length
and to skip such elements early so `split` is never called on `None`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.17.6-dev2
+## 0.17.6
 
 ### Enhancements
 
@@ -10,6 +10,7 @@ Two executions of the same code, on the same file, produce different results. Th
 This makes it impossible to write stable unit tests, for example, or to obtain reproducible results.
 - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`)
 - Resolve open CVEs
+- Properly handle the case when an element's `text` attribute is None
 
 
 ## 0.17.5
@@ -48,7 +49,7 @@ This makes it impossible to write stable unit tests, for example, or to obtain r
 ### Features
 
 ### Fixes
-- **Fixes wrong detection of office files** certain office files wrongly identified as .ZIP when office(.docx,.xlsx and .pptx) files containing files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively will now be identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml 
+- **Fixes wrong detection of office files** certain office files wrongly identified as .ZIP when office(.docx,.xlsx and .pptx) files containing files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively will now be identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml
 
 ## 0.17.2
 
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
@@ -31,6 +31,7 @@
     CompositeElement,
     Element,
     ElementMetadata,
+    Image,
     PageBreak,
     Table,
     TableChunk,
@@ -234,6 +235,10 @@ def it_accumulates_elements_added_to_it(self):
         assert builder._text_length == 112
         assert builder._remaining_space == 36
 
+    def it_will_fit_when_element_has_none_as_text(self):
+        builder = PreChunkBuilder(opts=ChunkingOptions())
+        assert builder.will_fit(Image(None))
+
     def it_will_fit_an_oversized_element_when_empty(self):
         builder = PreChunkBuilder(opts=ChunkingOptions())
         assert builder.will_fit(Text("abcd " * 200))
@@ -405,6 +410,12 @@ def and_it_knows_it_is_NOT_equal_to_an_object_that_is_not_a_PreChunk(self):
         pre_chunk = PreChunk([], overlap_prefix="", opts=ChunkingOptions())
         assert pre_chunk != 42
 
+    def it_can_handle_element_with_none_as_text(self):
+        pre_chunk = PreChunk(
+            [Image(None), Text("hello")], overlap_prefix="", opts=ChunkingOptions()
+        )
+        assert pre_chunk._text == "hello"
+
     @pytest.mark.parametrize(
         ("max_characters", "combine_text_under_n_chars", "expected_value"),
         [
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.17.6-dev2"  # pragma: no cover
+__version__ = "0.17.6"  # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
@@ -387,7 +387,7 @@ def will_fit(self, element: Element) -> bool:
         if self._text_length > self._opts.soft_max:
             return False
         # -- don't add an element if it would increase total size beyond the hard-max --
-        return not self._remaining_space < len(element.text)
+        return not self._remaining_space < len(element.text or "")
 
     @property
     def _remaining_space(self) -> int:
@@ -503,6 +503,8 @@ def _iter_text_segments(self) -> Iterator[str]:
         if self._overlap_prefix:
             yield self._overlap_prefix
         for e in self._elements:
+            if e.text is None:
+                continue
             text = " ".join(e.text.strip().split())
             if not text:
                 continue

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.17.6-dev2" # pragma: no cover`
	`1`	`+__version__ = "0.17.6" # pragma: no cover`