seperman
diff --git a/deepdiff/helper.py b/deepdiff/helper.py
Lines changed: 8 additions & 2 deletions
diff --git a/deepdiff/summarize.py b/deepdiff/summarize.py
Lines changed: 122 additions & 161 deletions
@@ -8,7 +8,7 @@
 import string
 import time
 import enum
-from typing import NamedTuple, Any, List, Optional
+from typing import NamedTuple, Any, List, Optional, TypeAlias
 from ast import literal_eval
 from decimal import Decimal, localcontext, InvalidOperation as InvalidDecimalOperation
 from itertools import repeat
@@ -817,4 +817,10 @@ class FlatDeltaRow(NamedTuple):
     __repr__ = __str__ = named_tuple_repr
 
 
-type JSON = dict[str, "JSON"] | list["JSON"] | str | int | float | bool | None
+# JSON-compatible value: str, int, float, bool, None, or dicts/lists built from them.
+# Declared with typing.TypeAlias instead of the `type` statement (Python 3.12+ syntax).
+JSON: TypeAlias = dict[str, str] | list[str] | list[int] | dict[str, "JSON"] | list["JSON"] | str | int | float | bool | None
+
+
+class SummaryNodeType(EnumBase):
+    """
+    Node kinds used to tag the weighted tree built in deepdiff/summarize.py.
+
+    Each member's value is the same string as its name.
+    """
+    # Member names mirror the Python container types they describe.
+    dict = 'dict'
+    list = 'list'
+    leaf = 'leaf'
@@ -1,8 +1,8 @@
-from typing import Any
+from deepdiff.helper import JSON, SummaryNodeType
 from deepdiff.serialization import json_dumps
 
 
-def _truncate(s, max_len):
+def _truncate(s: str, max_len: int) -> str:
     """
     Truncate string s to max_len characters.
     If possible, keep the first (max_len-5) characters, then '...' then the last 2 characters.
@@ -12,165 +12,126 @@ def _truncate(s, max_len):
     if max_len <= 5:
         return s[:max_len]
     return s[:max_len - 5] + "..." + s[-2:]
+# Weight-based summarization helpers: weigh the tree, then shrink it to fit a budget.
 
-class JSONNode:
-    def __init__(self, data: Any, key=None):
-        """
-        Build a tree node for the JSON data.
-        If this node is a child of a dict, key is its key name.
-        """
-        self.key = key
-        self.children_list: list[JSONNode] = []
-        self.children_dict: list[tuple[Any, JSONNode]] = []
-        self.value: str = ""
-        if isinstance(data, dict):
-            self.type = "dict"
-            # Preserve insertion order: list of (key, child) pairs.
-            for k, v in data.items():
-                child = JSONNode(v, key=k)
-                self.children_dict.append((k, child))
-        elif isinstance(data, list):
-            self.type = "list"
-            self.children_list = [JSONNode(item) for item in data]
+
+# Function to calculate node weights recursively
+def calculate_weights(node):
+    """
+    Recursively weigh a JSON-like value and build a tagged structure tree.
+
+    Returns a ``(weight, structure)`` tuple, where ``structure`` is
+    ``(SummaryNodeType, payload)``:
+
+    - dict: payload maps each key to ``(edge_weight, child_weight, child_structure)``
+      with ``edge_weight = len(key)``.
+    - list: payload is a list of the same triples with ``edge_weight = 0``.
+    - leaf: payload is the original value.
+
+    The weight of a container is the sum of edge and child weights of its
+    children; punctuation such as quotes, braces and commas is not counted.
+    """
+    if isinstance(node, dict):
+        weight = 0
+        children_weights = {}
+        for k, v in node.items():
+            # len() is applied to the key, so keys are assumed to be strings.
+            edge_weight = len(k)
+            child_weight, child_structure = calculate_weights(v)
+            total_weight = edge_weight + child_weight
+            weight += total_weight
+            children_weights[k] = (edge_weight, child_weight, child_structure)
+        return weight, (SummaryNodeType.dict, children_weights)
+
+    elif isinstance(node, list):
+        weight = 0
+        children_weights = []
+        for v in node:
+            edge_weight = 0  # Index weights are zero
+            child_weight, child_structure = calculate_weights(v)
+            total_weight = edge_weight + child_weight
+            weight += total_weight
+            children_weights.append((edge_weight, child_weight, child_structure))
+        return weight, (SummaryNodeType.list, children_weights)
+
+    else:
+        if isinstance(node, str):
+            node_weight = len(node)
+        # bool is a subclass of int, so True/False are weighed as len("True")/len("False").
+        elif isinstance(node, int):
+            node_weight = len(str(node))
+        # Floats are weighed by their text after rounding to 2 decimal places.
+        elif isinstance(node, float):
+            node_weight = len(str(round(node, 2)))
+        elif node is None:
+            node_weight = 1
+        # Any other leaf type contributes no weight.
+        else:
+            node_weight = 0
+        return node_weight, (SummaryNodeType.leaf, node)
+
+# Shrinking with a balance threshold: each branch of a dict or list is limited
+# to a balance_threshold fraction of the weight budget of its parent.
+
+# Balanced algorithm (simplified version):
+def shrink_tree_balanced(node_structure, max_weight: int, balance_threshold: float) -> tuple[JSON, float]:
+    """
+    Shrink a weighted tree so that its weight stays within ``max_weight``.
+
+    ``node_structure`` is a ``(SummaryNodeType, payload)`` tuple as returned by
+    ``calculate_weights``. Returns ``(shrunk_value, weight)``; ``(None, 0)`` is
+    returned when nothing could be kept.
+
+    - leaf: returned unchanged if it fits. Otherwise strings go through
+      ``_truncate``; ints/floats are cut as text and re-parsed (int, then
+      float, else the cut text itself is returned).
+    - dict: children are visited heaviest first. Each child may use at most
+      ``min(max_weight * balance_threshold, remaining budget)``, minus the
+      length of its key; children whose allowance does not exceed the key
+      length are skipped.
+    - list: children are visited heaviest first and a ``"..."`` element is
+      appended once the accumulated weight reaches ``max_weight - 1``.
+    """
+    node_type, node_info = node_structure
+
+    if node_type is SummaryNodeType.leaf:
+        leaf_value = node_info
+        leaf_weight, _ = calculate_weights(leaf_value)
+        if leaf_weight <= max_weight:
+            return leaf_value, leaf_weight
         else:
-            self.type = "primitive"
-            # For primitives, use json.dumps to get a compact representation.
-            try:
-                self.value = json_dumps(data)
-            except Exception:
-                self.value = str(data)
-    
-    def __repr__(self) -> str:
-        if self.children_list:
-            return "List-[" + ",".join([str(i) for i in self.children_list]) + "]"
-        if self.children_dict:
-            return "Dict-[" + ",".join([f"{i}:{v}" for i, v in self.children_dict]) + "]"
-        return self.value
-
-    __str__ = __repr__
-
-    def full_repr(self) -> str:
-        """Return the full minimized JSON representation (without trimming) for this node."""
-        if self.type == "primitive":
-            return self.value
-        elif self.type == "dict":
-            parts = []
-            for k, child in self.children_dict:
-                parts.append(f'"{k}":{child.full_repr()}')
-            return "{" + ",".join(parts) + "}"
-        elif self.type == "list":
-            parts = [child.full_repr() for child in self.children_list]
-            return "[" + ",".join(parts) + "]"
-        return self.value
-    
-    def full_weight(self):
-        """Return the character count of the full representation."""
-        return len(self.full_repr())
-    
-    def _summarize(self, budget) -> str:
-        """
-        Return a summary string for this node that fits within budget characters.
-        The algorithm may drop whole sub-branches (for dicts) or truncate long primitives.
-        """
-        if self.type == "primitive":
-            rep = self.value
-            if len(rep) <= budget:
-                return rep
-            else:
-                return _truncate(rep, budget)
-        elif self.type == "dict":
-            return self._summarize_dict(budget)
-        elif self.type == "list":
-            return self._summarize_list(budget)
-        return str(self.value)
-    
-    def _summarize_dict(self, budget) -> str:
-        # If the dict is empty, return {}
-        if not self.children_dict:
-            return "{}"
-        # Build a list of pairs with fixed parts:
-        # Each pair: key_repr is f'"{key}":'
-        # Also store the full (untrimmed) child representation.
-        pairs = []
-        for k, child in self.children_dict:
-            key_repr = f'"{k}":'
-            child_full = child.full_repr()
-            pair_full = key_repr + child_full
-            pairs.append({
-                "key": k,
-                "child": child,
-                "key_repr": key_repr,
-                "child_full": child_full,
-                "pair_full": pair_full,
-                "full_length": len(pair_full)
-            })
-        n = len(pairs)
-        fixed_overhead = 2 + (n - 1)  # braces plus commas between pairs
-        total_full = sum(p["full_length"] for p in pairs) + fixed_overhead
-        # If full representation fits, return it.
-        if total_full <= budget:
-            parts = [p["key_repr"] + p["child_full"] for p in pairs]
-            return "{" + ",".join(parts) + "}"
-        
-        # Otherwise, try dropping some pairs.
-        kept = pairs.copy()
-        # Heuristic: while the representation is too long, drop the pair whose child_full is longest.
-        while kept:
-            # Sort kept pairs in original insertion order.
-            kept_sorted = sorted(kept, key=lambda p: self.children_dict.index((p["key"], p["child"])))
-            current_n = len(kept_sorted)
-            fixed = sum(len(p["key_repr"]) for p in kept_sorted) + (current_n - 1) + 2
-            remaining_budget = budget - fixed
-            if remaining_budget < 0:
-                # Not enough even for fixed costs; drop one pair.
-                kept.remove(max(kept, key=lambda p: len(p["child_full"])))
+            if isinstance(leaf_value, str):
+                truncated_value = _truncate(leaf_value, max_weight)
+                return truncated_value, len(truncated_value)
+            # Numbers are cut at the text level and re-parsed, so the numeric value itself changes.
+            elif isinstance(leaf_value, (int, float)):
+                leaf_str = str(leaf_value)
+                truncated_str = leaf_str[:max_weight]
+                try:
+                    return int(truncated_str), len(truncated_str)
+                except Exception:
+                    try:
+                        return float(truncated_str), len(truncated_str)
+                    except Exception:
+                        return truncated_str, len(truncated_str)
+            elif leaf_value is None:
+                # Evaluates as (None, (1 if max_weight >= 1 else 0)).
+                return None, 1 if max_weight >= 1 else 0
+
+    elif node_type is SummaryNodeType.dict:
+        shrunk_dict = {}
+        total_weight = 0
+        # Visit the heaviest children (key length + subtree weight) first.
+        sorted_children = sorted(node_info.items(), key=lambda x: x[1][0] + x[1][1], reverse=True)
+
+        for k, (edge_w, _, child_struct) in sorted_children:
+            # Cap each branch at balance_threshold * max_weight, or at whatever budget is left.
+            allowed_branch_weight = min(max_weight * balance_threshold, max_weight - total_weight)
+            if allowed_branch_weight <= edge_w:
                 continue
-            total_child_full = sum(len(p["child_full"]) for p in kept_sorted)
-            # Allocate available budget for each child's summary proportionally.
-            child_summaries = []
-            for p in kept_sorted:
-                ideal = int(remaining_budget * (len(p["child_full"]) / total_child_full)) if total_child_full > 0 else 0
-                summary_child = p["child"]._summarize(ideal)
-                child_summaries.append(summary_child)
-            candidate = "{" + ",".join([p["key_repr"] + s for p, s in zip(kept_sorted, child_summaries)]) + "}"
-            if len(candidate) <= budget:
-                return candidate
-            # If still too long, drop the pair with the largest child_full length.
-            to_drop = max(kept, key=lambda p: len(p["child_full"]))
-            kept.remove(to_drop)
-        # If nothing remains, return a truncated empty object.
-        return _truncate("{}", budget)
-    
-    def _summarize_list(self, budget) -> str:
-        # If the list is empty, return []
-        if not self.children_list:
-            return "[]"
-        full_repr = self.full_repr()
-        if len(full_repr) <= budget:
-            return full_repr
-        # For lists, show only the first element and an omission indicator if more elements exist.
-        suffix = ",..." if len(self.children_list) > 1 else ""
-
-        inner_budget = budget - 2 - len(suffix)  # subtract brackets and suffix
-        budget_per_element: int =  min(inner_budget, max(4, inner_budget // len(self.children_list)))
-        max_element_count: int = inner_budget // budget_per_element
-        element_summaries: list[str] = []
-        for element in self.children_list[:max_element_count]:
-            element_summaries.append(element._summarize(budget_per_element))
-        # first_summary = self.children_list[0]._summarize(budget_per_element)
-        joined_elements = ",".join(element_summaries)
-        joined_elements = joined_elements.rstrip(".")
-        joined_elements = joined_elements[:inner_budget]
-        return f"[{joined_elements}{suffix}]"
-        # if len(candidate) <= budget:
-        #     return candidate
-        # return _truncate(candidate, budget)
-
-
-def summarize(data, max_length=200):
-    """
-    Build a tree for the given JSON-compatible data and return its summary,
-    ensuring the final string is no longer than self.max_length.
-    """
-    root = JSONNode(data)
-    return root._summarize(max_length).replace("{,", "{")
+
+            remaining_weight = int(allowed_branch_weight - edge_w)
+            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, remaining_weight, balance_threshold)
+            # NOTE(review): a child whose value is a genuine None is dropped here as well.
+            if shrunk_child is not None:
+                # edge_w is len(k) (see calculate_weights), so k[:edge_w] is the whole key.
+                shrunk_dict[k[:edge_w]] = shrunk_child
+                total_weight += edge_w + shrunk_weight
+
+            if total_weight >= max_weight:
+                break
+        if not shrunk_dict:
+            return None, 0
+
+        return shrunk_dict, total_weight
+
+    elif node_type is SummaryNodeType.list:
+        shrunk_list = []
+        total_weight = 0
+        # Heaviest elements first; the output order can differ from the input order.
+        sorted_children = sorted(node_info, key=lambda x: x[0] + x[1], reverse=True)
+        for edge_w, _, child_struct in sorted_children:
+            allowed_branch_weight = int(min(max_weight * balance_threshold, max_weight - total_weight))
+            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, allowed_branch_weight, balance_threshold)
+            if shrunk_child is not None:
+                shrunk_list.append(shrunk_child)
+                total_weight += shrunk_weight
+            # Budget is (nearly) used up: append a marker element and stop.
+            if total_weight >= max_weight - 1:
+                shrunk_list.append("...")
+                break
+        if not shrunk_list:
+            return None, 0
+        return shrunk_list, total_weight
+    return None, 0
+
+
+def greedy_tree_summarization_balanced(json_data: JSON, max_weight: int, balance_threshold=0.6) -> JSON:
+    """
+    Return ``json_data`` reduced to roughly ``max_weight`` units of weight.
+
+    The data is weighed with ``calculate_weights``. If it already fits, the
+    input object itself is returned (not a copy); otherwise the shrunk value
+    from ``shrink_tree_balanced`` is returned, which is ``None`` when nothing
+    could be kept.
+    """
+    total_weight, tree_structure = calculate_weights(json_data)
+    if total_weight <= max_weight:
+        return json_data
+    shrunk_tree, _ = shrink_tree_balanced(tree_structure, max_weight, balance_threshold)
+    return shrunk_tree
+
+
+def summarize(data: JSON, max_length:int=200, balance_threshold:float=0.6) -> str:
+    """
+    Summarize JSON-compatible ``data`` and serialize the result with ``json_dumps``.
+
+    ``max_length`` is passed on as the weight budget (key lengths plus leaf
+    lengths). Quotes, brackets and separators are not counted in that weight,
+    so the returned string can be longer than ``max_length``.
+    ``balance_threshold`` is the fraction of a node's budget that a single
+    child branch may use.
+    """
+    return json_dumps(
+        greedy_tree_summarization_balanced(data, max_length, balance_threshold)
+    )