From 4c74ef9ee58192ece5b89c854973a351e7ea5834 Mon Sep 17 00:00:00 2001
From: Teddy Wahle
Date: Mon, 12 May 2025 17:42:27 -0700
Subject: [PATCH] Better handling for synced_block feature in Notion connector

---
Review notes (this area is not part of the commit message):
 * SyncBlock.from_dict used to send any payload containing a "synced_from"
   key to OriginalSyncedBlock and everything else to DuplicateSyncedBlock.
   It now sends a non-None "synced_from" to DuplicateSyncedBlock, a payload
   with "children" to OriginalSyncedBlock, and anything else becomes
   OriginalSyncedBlock(children=[]).
 * DuplicateSyncedBlock.from_dict now reads "type" and "block_id" from the
   nested "synced_from" dict instead of unpacking the top-level payload,
   and raises ValueError when that dict is missing or incomplete.
 * OriginalSyncedBlock.from_dict now raises ValueError when "children" is
   absent (the bare subscript would have raised KeyError).

 .../notion/types/blocks/synced_block.py       | 38 +++++++++++++++++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py b/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py
index c1456643f..9347d0969 100644
--- a/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py
+++ b/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py
@@ -18,6 +18,9 @@ def can_have_children() -> bool:
 
     @classmethod
     def from_dict(cls, data: dict):
+        # Original blocks contain children content
+        if "children" not in data:
+            raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
         return cls(children=data["children"])
 
     def get_html(self) -> Optional[HtmlTag]:
@@ -31,27 +34,56 @@ class DuplicateSyncedBlock(BlockBase):
 
     @staticmethod
     def can_have_children() -> bool:
+        # Duplicate blocks themselves don't have children directly fetched here,
+        # but they represent content that does, so Notion API might report has_children=True
+        # on the parent block object. The actual children are fetched from the original block.
         return True
 
     @classmethod
     def from_dict(cls, data: dict):
-        return cls(**data)
+        # Duplicate blocks contain a 'synced_from' reference
+        synced_from_data = data.get("synced_from")
+        if not synced_from_data or not isinstance(synced_from_data, dict):
+            raise ValueError(f"Invalid data structure for DuplicateSyncedBlock: {data}")
+        # Ensure required keys are present in the nested dictionary
+        if "type" not in synced_from_data or "block_id" not in synced_from_data:
+            raise ValueError(
+                f"Missing 'type' or 'block_id' in synced_from data: {synced_from_data}"
+            )
+        return cls(type=synced_from_data["type"], block_id=synced_from_data["block_id"])
 
     def get_html(self) -> Optional[HtmlTag]:
+        # HTML representation might need fetching the original block's content,
+        # which is outside the scope of this simple data class.
         return None
 
 
 class SyncBlock(BlockBase):
     @staticmethod
     def can_have_children() -> bool:
+        # Synced blocks (both original and duplicate) can conceptually have children.
         return True
 
     @classmethod
     def from_dict(cls, data: dict):
-        if "synced_from" in data:
+        # Determine if it's a duplicate (has 'synced_from') or original (has 'children')
+        if data.get("synced_from") is not None:
+            # It's a duplicate block containing a reference
+            return DuplicateSyncedBlock.from_dict(data)
+        elif "children" in data:
+            # It's an original block containing children
             return OriginalSyncedBlock.from_dict(data)
         else:
-            return DuplicateSyncedBlock.from_dict(data)
+            # Handle cases where neither 'synced_from' nor 'children' are present.
+            # Notion API might return this for an empty original synced block.
+            # Let's treat it as an empty OriginalSyncedBlock.
+            # If this assumption is wrong, errors might occur later.
+            # Consider logging a warning here if strictness is needed.
+            return OriginalSyncedBlock(children=[])
+
 
     def get_html(self) -> Optional[HtmlTag]:
+        # The specific instance returned by from_dict (Original or Duplicate)
+        # will handle its own get_html logic.
+        # This method on the base SyncBlock might not be directly called.
         return None