Skip to content

Commit 4ae9901

Browse files
committed
leaving various implementations of summary in this commit for future
reference
1 parent 0fcaca4 commit 4ae9901

File tree

6 files changed

+450
-7
lines changed

6 files changed

+450
-7
lines changed

deepdiff/helper.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -815,3 +815,6 @@ class FlatDeltaRow(NamedTuple):
815815
t2_to_index: Optional[int] = None
816816

817817
__repr__ = __str__ = named_tuple_repr
818+
819+
820+
# Recursive alias describing a JSON-compatible value: an object with string
# keys, an array, or one of the scalar types (str, int, float, bool, None).
# NOTE(review): the `type` statement is Python 3.12+ syntax (PEP 695) —
# confirm this matches the minimum Python version the package supports.
type JSON = dict[str, "JSON"] | list["JSON"] | str | int | float | bool | None

deepdiff/summarize.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def __init__(self, data: Any, key=None):
2222
self.key = key
2323
self.children_list: list[JSONNode] = []
2424
self.children_dict: list[tuple[Any, JSONNode]] = []
25+
self.value: str = ""
2526
if isinstance(data, dict):
2627
self.type = "dict"
2728
# Preserve insertion order: list of (key, child) pairs.
@@ -39,6 +40,15 @@ def __init__(self, data: Any, key=None):
3940
except Exception:
4041
self.value = str(data)
4142

43+
def __repr__(self) -> str:
44+
if self.children_list:
45+
return "List-[" + ",".join([str(i) for i in self.children_list]) + "]"
46+
if self.children_dict:
47+
return "Dict-[" + ",".join([f"{i}:{v}" for i, v in self.children_dict]) + "]"
48+
return self.value
49+
50+
__str__ = __repr__
51+
4252
def full_repr(self) -> str:
4353
"""Return the full minimized JSON representation (without trimming) for this node."""
4454
if self.type == "primitive":
@@ -72,7 +82,7 @@ def _summarize(self, budget) -> str:
7282
return self._summarize_dict(budget)
7383
elif self.type == "list":
7484
return self._summarize_list(budget)
75-
return self.value
85+
return str(self.value)
7686

7787
def _summarize_dict(self, budget) -> str:
7888
# If the dict is empty, return {}
@@ -140,12 +150,21 @@ def _summarize_list(self, budget) -> str:
140150
return full_repr
141151
# For lists, show only the first element and an omission indicator if more elements exist.
142152
suffix = ",..." if len(self.children_list) > 1 else ""
153+
143154
inner_budget = budget - 2 - len(suffix) # subtract brackets and suffix
144-
first_summary = self.children_list[0]._summarize(inner_budget)
145-
candidate = "[" + first_summary + suffix + "]"
146-
if len(candidate) <= budget:
147-
return candidate
148-
return _truncate(candidate, budget)
155+
budget_per_element: int = min(inner_budget, max(4, inner_budget // len(self.children_list)))
156+
max_element_count: int = inner_budget // budget_per_element
157+
element_summaries: list[str] = []
158+
for element in self.children_list[:max_element_count]:
159+
element_summaries.append(element._summarize(budget_per_element))
160+
# first_summary = self.children_list[0]._summarize(budget_per_element)
161+
joined_elements = ",".join(element_summaries)
162+
joined_elements = joined_elements.rstrip(".")
163+
joined_elements = joined_elements[:inner_budget]
164+
return f"[{joined_elements}{suffix}]"
165+
# if len(candidate) <= budget:
166+
# return candidate
167+
# return _truncate(candidate, budget)
149168

150169

151170
def summarize(data, max_length=200):

deepdiff/summarize2.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
from deepdiff.helper import JSON
2+
from deepdiff.serialization import json_dumps
3+
4+
# type edge_weight_child_structure = tuple[int, int, Any]
5+
6+
# Function to calculate node weights recursively
7+
def calculate_weights(node):
    """Recursively compute the weight of ``node`` and a weighted mirror of it.

    Returns a ``(weight, structure)`` pair where ``structure`` is one of:

    * ``('dict', {key: (edge_weight, child_weight, child_structure)})``
    * ``('list', [(edge_weight, child_weight, child_structure), ...])``
    * ``('leaf', value)``

    The edge weight of a dict entry is the length of its key's text; list
    indexes weigh nothing. A leaf weighs the length of its text (floats are
    rounded to 2 decimals first), ``None`` weighs 1 and any other type
    weighs 0.
    """
    if isinstance(node, dict):
        weight = 0
        children_weights = {}
        for k, v in node.items():
            # Measure the key via str() so that non-string keys (e.g. ints)
            # do not raise TypeError; identical to len(k) for str keys.
            edge_weight = len(str(k))
            child_weight, child_structure = calculate_weights(v)
            weight += edge_weight + child_weight
            children_weights[k] = (edge_weight, child_weight, child_structure)
        return weight, ('dict', children_weights)

    if isinstance(node, list):
        weight = 0
        children_weights = []
        for v in node:
            edge_weight = 0  # list indexes carry no weight
            child_weight, child_structure = calculate_weights(v)
            weight += edge_weight + child_weight
            children_weights.append((edge_weight, child_weight, child_structure))
        return weight, ('list', children_weights)

    if isinstance(node, str):
        node_weight = len(node)
    elif isinstance(node, int):
        node_weight = len(str(node))
    elif isinstance(node, float):
        node_weight = len(str(round(node, 2)))
    elif node is None:
        node_weight = 1
    else:
        node_weight = 0
    return node_weight, ('leaf', node)
42+
43+
44+
def _truncate(s: str, max_len: int) -> str:
45+
"""
46+
Truncate string s to max_len characters.
47+
If possible, keep the first (max_len-5) characters, then '...' then the last 2 characters.
48+
"""
49+
if len(s) <= max_len:
50+
return s
51+
if max_len <= 5:
52+
return s[:max_len]
53+
return s[:max_len - 5] + "..." + s[-2:]
54+
55+
56+
# Greedy algorithm to shrink the tree
57+
def _is_kept_none_leaf(child_struct, shrunk_weight) -> bool:
    """Tell a real ``None`` leaf that fit the budget from an omitted child."""
    kind, value = child_struct
    return kind == 'leaf' and value is None and shrunk_weight > 0


def shrink_tree(node_structure, max_weight: int) -> tuple[JSON, int]:
    """Greedily shrink a weighted tree so it fits within ``max_weight``.

    ``node_structure`` is the ``(kind, info)`` pair produced by
    ``calculate_weights``. Returns ``(shrunk_value, weight_used)``.

    * Leaves that fit are returned as-is. Over-budget strings are shortened
      with ``_truncate``; over-budget numbers are cut as text and converted
      back to ``int``/``float`` when the remaining text still parses.
    * Dict and list children are visited heaviest first. Dict entries whose
      key alone would exhaust the budget are skipped.
    * A list that stops early because the budget ran out gets a trailing
      ``'...'`` marker, but only when elements were actually left out.
    * ``None`` leaves that fit the budget are preserved rather than being
      mistaken for "child omitted".
    """
    node_type, node_info = node_structure

    if node_type == 'leaf':
        leaf_value = node_info
        leaf_weight, _ = calculate_weights(leaf_value)
        if leaf_weight <= max_weight:
            return leaf_value, leaf_weight
        if isinstance(leaf_value, str):
            truncated_value = _truncate(leaf_value, max_weight)
            return truncated_value, len(truncated_value)
        if isinstance(leaf_value, (int, float)):
            truncated_str = str(leaf_value)[:max_weight]
            # Prefer a numeric result; fall back to the raw text if the cut
            # left something that no longer parses (e.g. "-" or "1e").
            for convert in (int, float):
                try:
                    return convert(truncated_str), len(truncated_str)
                except ValueError:
                    continue
            return truncated_str, len(truncated_str)
        if leaf_value is None:
            return None, 1 if max_weight >= 1 else 0
        return None, 1

    if node_type == 'dict':
        shrunk_dict = {}
        total_weight = 0
        # Heaviest entries (key weight + value weight) are processed first.
        sorted_children = sorted(
            node_info.items(),
            key=lambda item: item[1][0] + item[1][1],
            reverse=True,
        )
        for k, (edge_w, _child_w, child_struct) in sorted_children:
            if total_weight + edge_w >= max_weight:
                continue  # the key alone would use up the remaining budget
            remaining_weight = max_weight - total_weight - edge_w
            shrunk_child, shrunk_weight = shrink_tree(child_struct, remaining_weight)
            if shrunk_child is not None or _is_kept_none_leaf(child_struct, shrunk_weight):
                # edge_w is the full key length, so the key is stored whole.
                shrunk_dict[k] = shrunk_child
                total_weight += edge_w + shrunk_weight
            if total_weight >= max_weight:
                break
        return shrunk_dict, total_weight

    if node_type == 'list':
        shrunk_list = []
        total_weight = 0
        sorted_children = sorted(
            node_info,
            key=lambda child: child[0] + child[1],
            reverse=True,
        )
        last_index = len(sorted_children) - 1
        for index, (_edge_w, _child_w, child_struct) in enumerate(sorted_children):
            remaining_weight = max_weight - total_weight
            shrunk_child, shrunk_weight = shrink_tree(child_struct, remaining_weight)
            if shrunk_child is not None or _is_kept_none_leaf(child_struct, shrunk_weight):
                shrunk_list.append(shrunk_child)
                total_weight += shrunk_weight
            if total_weight >= max_weight - 1:
                # Only flag an omission when elements remain unvisited.
                if index < last_index:
                    shrunk_list.append('...')
                break
        return shrunk_list, total_weight

    return None, 1
118+
119+
# Main function to summarize the tree
120+
def summarize_tree(tree: dict | list, max_weight: int) -> JSON:
    """Return ``tree`` reduced to at most ``max_weight`` units of weight.

    The tree is weighed with ``calculate_weights``. When it already fits it
    is returned untouched (the same object, not a copy); otherwise the
    shrunk value produced by ``shrink_tree`` is returned.
    """
    measured_weight, weighted_structure = calculate_weights(tree)
    fits_budget = measured_weight <= max_weight
    if fits_budget:
        return tree
    return shrink_tree(weighted_structure, max_weight)[0]
126+
127+
# Exposed function for user convenience
128+
def summarize(json_data, max_length=200) -> str:
    """Summarize ``json_data`` and serialize the result with ``json_dumps``.

    ``max_length`` is forwarded to ``summarize_tree`` as its weight budget.
    """
    shrunk = summarize_tree(json_data, max_length)
    return json_dumps(shrunk)

deepdiff/summarize3.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
from deepdiff.helper import JSON
2+
from deepdiff.serialization import json_dumps
3+
4+
5+
def _truncate(s: str, max_len: int) -> str:
6+
"""
7+
Truncate string s to max_len characters.
8+
If possible, keep the first (max_len-5) characters, then '...' then the last 2 characters.
9+
"""
10+
if len(s) <= max_len:
11+
return s
12+
if max_len <= 5:
13+
return s[:max_len]
14+
return s[:max_len - 5] + "..." + s[-2:]
15+
# Re-defining the functions due to environment reset
16+
17+
18+
# Function to calculate node weights recursively
19+
def _leaf_weight(value) -> int:
    """Return the weight of a single non-container value."""
    if isinstance(value, str):
        return len(value)
    if isinstance(value, int):
        return len(str(value))
    if isinstance(value, float):
        return len(str(round(value, 2)))
    if value is None:
        return 1
    return 0


def calculate_weights(node):
    """Weigh ``node`` recursively and return ``(weight, structure)``.

    ``structure`` mirrors the input and records per-child weights:

    * dict -> ``('dict', {key: (edge_weight, child_weight, child_structure)})``
    * list -> ``('list', [(edge_weight, child_weight, child_structure), ...])``
    * anything else -> ``('leaf', value)``

    Dict keys weigh the length of their text, list indexes weigh zero, and
    a container weighs the sum of its edges and children.
    """
    if isinstance(node, dict):
        children_weights = {}
        for k, v in node.items():
            # str() keeps non-string keys (ints, etc.) from raising
            # TypeError; for str keys this equals len(k).
            edge_weight = len(str(k))
            child_weight, child_structure = calculate_weights(v)
            children_weights[k] = (edge_weight, child_weight, child_structure)
        weight = sum(edge + child for edge, child, _ in children_weights.values())
        return weight, ('dict', children_weights)

    if isinstance(node, list):
        children_weights = []
        for v in node:
            child_weight, child_structure = calculate_weights(v)
            # Index weights are zero.
            children_weights.append((0, child_weight, child_structure))
        weight = sum(child for _, child, _ in children_weights)
        return weight, ('list', children_weights)

    return _leaf_weight(node), ('leaf', node)
54+
55+
# Include previously defined functions for shrinking with threshold
56+
# (Implementing directly the balanced summarization algorithm as above)
57+
58+
# Balanced algorithm (simplified version):
59+
def _kept_null_leaf(child_struct, shrunk_weight) -> bool:
    """Return True when a child is a genuine ``None`` leaf that fit the budget."""
    kind, value = child_struct
    return kind == 'leaf' and value is None and shrunk_weight > 0


def shrink_tree_balanced(node_structure, max_weight: int, balance_threshold: float):
    """Shrink a weighted tree to ``max_weight`` while capping each branch.

    ``node_structure`` is the ``(kind, info)`` pair produced by
    ``calculate_weights``. Each child of a dict or list may use at most
    ``max_weight * balance_threshold`` of the budget (or whatever is left,
    if that is smaller), so one heavy branch cannot consume everything.

    Returns ``(shrunk_value, weight_used)``. ``(None, 0)`` is returned when
    nothing could be kept (an emptied dict/list, or an unknown node kind).
    ``None`` leaves that fit the budget are preserved, and the ``"..."``
    list marker is added only when elements were actually left out.
    """
    node_type, node_info = node_structure

    if node_type == 'leaf':
        leaf_value = node_info
        leaf_weight, _ = calculate_weights(leaf_value)
        if leaf_weight <= max_weight:
            return leaf_value, leaf_weight
        if isinstance(leaf_value, str):
            truncated_value = _truncate(leaf_value, max_weight)
            return truncated_value, len(truncated_value)
        if isinstance(leaf_value, (int, float)):
            truncated_str = str(leaf_value)[:max_weight]
            # Try to hand back a number; keep the text if it no longer parses.
            for convert in (int, float):
                try:
                    return convert(truncated_str), len(truncated_str)
                except ValueError:
                    continue
            return truncated_str, len(truncated_str)
        if leaf_value is None:
            return None, 1 if max_weight >= 1 else 0
        return None, 0

    if node_type == 'dict':
        shrunk_dict = {}
        total_weight = 0
        sorted_children = sorted(
            node_info.items(),
            key=lambda item: item[1][0] + item[1][1],
            reverse=True,
        )
        for k, (edge_w, _child_w, child_struct) in sorted_children:
            allowed_branch_weight = min(max_weight * balance_threshold, max_weight - total_weight)
            if allowed_branch_weight <= edge_w:
                continue  # no room left for this key plus any value

            remaining_weight = int(allowed_branch_weight - edge_w)
            shrunk_child, shrunk_weight = shrink_tree_balanced(
                child_struct, remaining_weight, balance_threshold
            )
            if shrunk_child is not None or _kept_null_leaf(child_struct, shrunk_weight):
                # edge_w is the full key length, so the key is stored whole.
                shrunk_dict[k] = shrunk_child
                total_weight += edge_w + shrunk_weight

            if total_weight >= max_weight:
                break
        if not shrunk_dict:
            return None, 0
        return shrunk_dict, total_weight

    if node_type == 'list':
        shrunk_list = []
        total_weight = 0
        sorted_children = sorted(
            node_info,
            key=lambda child: child[0] + child[1],
            reverse=True,
        )
        last_index = len(sorted_children) - 1
        for index, (_edge_w, _child_w, child_struct) in enumerate(sorted_children):
            allowed_branch_weight = int(min(max_weight * balance_threshold, max_weight - total_weight))
            shrunk_child, shrunk_weight = shrink_tree_balanced(
                child_struct, allowed_branch_weight, balance_threshold
            )
            if shrunk_child is not None or _kept_null_leaf(child_struct, shrunk_weight):
                shrunk_list.append(shrunk_child)
                total_weight += shrunk_weight
            if total_weight >= max_weight - 1:
                # Flag an omission only when elements remain unvisited.
                if index < last_index:
                    shrunk_list.append("...")
                break
        if not shrunk_list:
            return None, 0
        return shrunk_list, total_weight

    return None, 0
124+
125+
# Main exposed function
126+
def greedy_tree_summarization_balanced(json_data, max_weight: int, balance_threshold=0.6):
    """Return ``json_data`` reduced to ``max_weight`` using balanced shrinking.

    The data is weighed with ``calculate_weights``. Data that already fits is
    returned unchanged (the same object); otherwise the shrunk value from
    ``shrink_tree_balanced`` is returned, with ``balance_threshold`` passed
    through to it.
    """
    measured_weight, weighted_structure = calculate_weights(json_data)
    if measured_weight > max_weight:
        reduced, _used = shrink_tree_balanced(weighted_structure, max_weight, balance_threshold)
        return reduced
    return json_data
132+
133+
134+
# Exposed function for user convenience
135+
def summarize(json_data, max_length=200, balance_threshold=0.6) -> str:
    """Summarize ``json_data`` with the balanced algorithm and serialize it.

    ``max_length`` is used as the weight budget and ``balance_threshold`` is
    forwarded to ``greedy_tree_summarization_balanced``; the result is
    serialized with ``json_dumps``.
    """
    reduced = greedy_tree_summarization_balanced(json_data, max_length, balance_threshold)
    return json_dumps(reduced)

tests/test_summarize.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,10 @@ def test_list_summary(self):
132132
assert "..." not in summary
133133

134134
data2 = list(range(1, 200))
135-
summary2 = summarize(data2)
135+
summary2 = summarize(data2, max_length=14)
136136
assert "..." in summary2
137+
expected = '[1,2,...]'
138+
assert expected == summary2
137139

138140
def test_direct_truncate_function(self):
139141
s = "abcdefghijklmnopqrstuvwxyz"

0 commit comments

Comments
 (0)