Skip to content

Commit 249fcfb

Browse files
committed
the new summarization algorithm
1 parent 4ae9901 commit 249fcfb

File tree

8 files changed

+160
-611
lines changed

8 files changed

+160
-611
lines changed

deepdiff/helper.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import string
99
import time
1010
import enum
11-
from typing import NamedTuple, Any, List, Optional
11+
from typing import NamedTuple, Any, List, Optional, TypeAlias
1212
from ast import literal_eval
1313
from decimal import Decimal, localcontext, InvalidOperation as InvalidDecimalOperation
1414
from itertools import repeat
@@ -817,4 +817,10 @@ class FlatDeltaRow(NamedTuple):
817817
__repr__ = __str__ = named_tuple_repr
818818

819819

820-
type JSON = dict[str, "JSON"] | list["JSON"] | str | int | float | bool | None
820+
# Recursive alias for JSON-compatible values; "JSON" is quoted because it is a
# forward reference to the alias being defined on this very line.
# NOTE(review): dict[str, str], list[str] and list[int] look redundant with the
# recursive members that follow — presumably kept to help a type checker; confirm.
JSON: TypeAlias = dict[str, str] | list[str] | list[int] | dict[str, "JSON"] | list["JSON"] | str | int | float | bool | None
821+
822+
823+
class SummaryNodeType(EnumBase):
    """
    Tag for the kind of node stored in the weight tree used by summarization.

    deepdiff/summarize.py pairs one of these members with each node's payload
    and compares them by identity (``is``) to pick how a node is shrunk.
    """
    # The names `dict` and `list` shadow the builtins only inside this class body.
    dict = 'dict'  # node built from a dict; payload maps key -> child weights
    list = 'list'  # node built from a list; payload is a list of child weights
    leaf = 'leaf'  # any other value; payload is the value itself

deepdiff/summarize.py

Lines changed: 122 additions & 161 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
from typing import Any
1+
from deepdiff.helper import JSON, SummaryNodeType
22
from deepdiff.serialization import json_dumps
33

44

5-
def _truncate(s, max_len):
5+
def _truncate(s: str, max_len: int) -> str:
66
"""
77
Truncate string s to max_len characters.
88
If possible, keep the first (max_len-5) characters, then '...' then the last 2 characters.
@@ -12,165 +12,126 @@ def _truncate(s, max_len):
1212
if max_len <= 5:
1313
return s[:max_len]
1414
return s[:max_len - 5] + "..." + s[-2:]
15+
# Weight-based tree summarization helpers follow.
1516

16-
class JSONNode:
17-
def __init__(self, data: Any, key=None):
18-
"""
19-
Build a tree node for the JSON data.
20-
If this node is a child of a dict, key is its key name.
21-
"""
22-
self.key = key
23-
self.children_list: list[JSONNode] = []
24-
self.children_dict: list[tuple[Any, JSONNode]] = []
25-
self.value: str = ""
26-
if isinstance(data, dict):
27-
self.type = "dict"
28-
# Preserve insertion order: list of (key, child) pairs.
29-
for k, v in data.items():
30-
child = JSONNode(v, key=k)
31-
self.children_dict.append((k, child))
32-
elif isinstance(data, list):
33-
self.type = "list"
34-
self.children_list = [JSONNode(item) for item in data]
17+
18+
# Function to calculate node weights recursively
19+
def calculate_weights(node):
    """
    Recursively compute the weight of a JSON-like value and its weight tree.

    Weights:
      - str: its length.
      - int (including bool, which is an int subclass): length of ``str(node)``.
      - float: length of ``str(round(node, 2))``.
      - None: 1.
      - any other leaf type: 0.
      - dict: sum over items of (key weight + child weight).
      - list: sum of child weights (indexes weigh nothing).

    Args:
        node: The value to weigh. Only ``dict`` and ``list`` are descended into.

    Returns:
        A ``(weight, structure)`` tuple where ``structure`` is
        ``(SummaryNodeType, payload)``:
          - dict: payload maps each key to ``(edge_weight, child_weight, child_structure)``.
          - list: payload is a list of ``(edge_weight, child_weight, child_structure)``.
          - leaf: payload is the original value.
    """
    if isinstance(node, dict):
        weight = 0
        children_weights = {}
        for k, v in node.items():
            try:
                edge_weight = len(k)
            except TypeError:
                # Keys are not guaranteed to be strings (e.g. int keys have no
                # len()); weigh such keys by the length of their text form.
                edge_weight = len(str(k))
            child_weight, child_structure = calculate_weights(v)
            weight += edge_weight + child_weight
            children_weights[k] = (edge_weight, child_weight, child_structure)
        return weight, (SummaryNodeType.dict, children_weights)

    if isinstance(node, list):
        weight = 0
        children_weights = []
        for v in node:
            edge_weight = 0  # Index weights are zero
            child_weight, child_structure = calculate_weights(v)
            weight += edge_weight + child_weight
            children_weights.append((edge_weight, child_weight, child_structure))
        return weight, (SummaryNodeType.list, children_weights)

    # Leaf values.
    if isinstance(node, str):
        node_weight = len(node)
    elif isinstance(node, int):
        node_weight = len(str(node))
    elif isinstance(node, float):
        # Weigh floats by their two-decimal rounding rather than full precision.
        node_weight = len(str(round(node, 2)))
    elif node is None:
        node_weight = 1
    else:
        node_weight = 0
    return node_weight, (SummaryNodeType.leaf, node)
54+
55+
# Shrinking: prune the weighted tree so that it fits within a weight budget.
56+
# Each branch is capped at balance_threshold of its parent's budget.
57+
58+
# Balanced algorithm (simplified version):
59+
def _shrink_leaf(leaf_value, max_weight):
    """Fit one leaf value into max_weight, truncating its text form if needed."""
    leaf_weight, _ = calculate_weights(leaf_value)
    if leaf_weight <= max_weight:
        return leaf_value, leaf_weight

    if isinstance(leaf_value, str):
        truncated_value = _truncate(leaf_value, max_weight)
        return truncated_value, len(truncated_value)

    if isinstance(leaf_value, (int, float)):
        truncated_str = str(leaf_value)[:max_weight]
        # Prefer keeping a numeric type; when the prefix is not a valid number
        # (e.g. "", "-", "1e") fall back to returning the raw text.
        for cast in (int, float):
            try:
                return cast(truncated_str), len(truncated_str)
            except ValueError:
                continue
        return truncated_str, len(truncated_str)

    # A None that does not fit, or an unsupported leaf type with a negative budget.
    return None, 0


def shrink_tree_balanced(node_structure, max_weight: int, balance_threshold: float) -> tuple[JSON, float]:
    """
    Shrink a weight tree produced by ``calculate_weights`` to fit ``max_weight``.

    Children of dicts and lists are visited heaviest first. Each child may use
    at most ``max_weight * balance_threshold`` (and never more than what is
    left of the budget), so a single heavy branch cannot consume everything.

    Args:
        node_structure: ``(SummaryNodeType, payload)`` as built by ``calculate_weights``.
        max_weight: Weight budget for this node.
        balance_threshold: Fraction of ``max_weight`` any single child may use.

    Returns:
        ``(shrunk_value, weight_used)``. ``(None, 0)`` means nothing could be kept.

    NOTE(review): a child whose shrunk value is ``None`` is skipped by its
    parent, so a ``None`` leaf that does fit is dropped as well — confirm this
    is intended.
    """
    node_type, node_info = node_structure

    if node_type is SummaryNodeType.leaf:
        return _shrink_leaf(node_info, max_weight)

    if node_type is SummaryNodeType.dict:
        shrunk_dict = {}
        total_weight = 0
        # Heaviest (key + subtree) branches first.
        sorted_children = sorted(node_info.items(), key=lambda item: item[1][0] + item[1][1], reverse=True)

        for k, (edge_w, _, child_struct) in sorted_children:
            allowed_branch_weight = min(max_weight * balance_threshold, max_weight - total_weight)
            if allowed_branch_weight <= edge_w:
                # The key alone would use up this branch's share; skip it.
                continue

            remaining_weight = int(allowed_branch_weight - edge_w)
            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, remaining_weight, balance_threshold)
            if shrunk_child is not None:
                # Use the key as-is: slicing it to its own length was a no-op
                # for strings and raised TypeError for non-sliceable keys.
                shrunk_dict[k] = shrunk_child
                total_weight += edge_w + shrunk_weight

            if total_weight >= max_weight:
                break
        if not shrunk_dict:
            return None, 0

        return shrunk_dict, total_weight

    if node_type is SummaryNodeType.list:
        shrunk_list = []
        total_weight = 0
        sorted_children = sorted(node_info, key=lambda child: child[0] + child[1], reverse=True)
        for edge_w, _, child_struct in sorted_children:
            allowed_branch_weight = int(min(max_weight * balance_threshold, max_weight - total_weight))
            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, allowed_branch_weight, balance_threshold)
            if shrunk_child is not None:
                shrunk_list.append(shrunk_child)
                total_weight += shrunk_weight
                if total_weight >= max_weight - 1:
                    # Budget is (nearly) exhausted: mark the omission and stop.
                    shrunk_list.append("...")
                    break
        if not shrunk_list:
            return None, 0
        return shrunk_list, total_weight

    return None, 0
124+
125+
126+
def greedy_tree_summarization_balanced(json_data: JSON, max_weight: int, balance_threshold=0.6) -> JSON:
    """
    Return ``json_data`` unchanged if it fits ``max_weight``, else a shrunk copy.

    The weight is computed by ``calculate_weights``; when it exceeds the
    budget, the weight tree is handed to ``shrink_tree_balanced`` and only the
    shrunk value (not its weight) is returned. That value is ``None`` when
    nothing could be kept.
    """
    weight, structure = calculate_weights(json_data)
    if weight > max_weight:
        return shrink_tree_balanced(structure, max_weight, balance_threshold)[0]
    return json_data
132+
133+
134+
def summarize(data: JSON, max_length: int = 200, balance_threshold: float = 0.6) -> str:
    """
    Summarize ``data`` and serialize the result with ``json_dumps``.

    ``max_length`` is passed on as the weight budget of
    ``greedy_tree_summarization_balanced`` and ``balance_threshold`` as the
    per-branch share of that budget.

    NOTE(review): the budget is measured in weights, not in characters of the
    serialized output — confirm whether callers rely on a hard length limit.
    """
    summary = greedy_tree_summarization_balanced(data, max_length, balance_threshold)
    return json_dumps(summary)

0 commit comments

Comments
 (0)