Skip to content

Commit 142c260

Browse files
committed
adding summarization
1 parent 5993a42 commit 142c260

File tree

3 files changed

+15074
-0
lines changed

3 files changed

+15074
-0
lines changed

deepdiff/summarize.py

+153
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
from deepdiff.serialization import json_dumps
2+
3+
4+
def _truncate(s, max_len):
5+
"""
6+
Truncate string s to max_len characters.
7+
If possible, keep the first (max_len-5) characters, then '...' then the last 2 characters.
8+
"""
9+
if len(s) <= max_len:
10+
return s
11+
if max_len <= 5:
12+
return s[:max_len]
13+
return s[:max_len - 5] + "..." + s[-2:]
14+
15+
class JSONNode:
    """
    Tree node wrapping JSON-compatible data that can render itself either in
    full or as a length-limited summary.

    Attributes set by the constructor:
        key: the key passed in by the parent (None unless given).
        type: one of "dict", "list" or "primitive".
        children: for "dict" nodes, a list of (key, JSONNode) pairs in
            insertion order; for "list" nodes, a list of JSONNode. Not set
            on primitives.
        value: for "primitive" nodes only, the string form of the data.
    """

    def __init__(self, data, key=None):
        """
        Build a tree node for the JSON data.
        If this node is a child of a dict, key is its key name.
        """
        self.key = key
        if isinstance(data, dict):
            self.type = "dict"
            # Preserve insertion order: list of (key, child) pairs.
            self.children = [(k, JSONNode(v, key=k)) for k, v in data.items()]
        elif isinstance(data, list):
            self.type = "list"
            self.children = [JSONNode(item) for item in data]
        else:
            # Anything that is not a dict or a list is treated as a leaf.
            self.type = "primitive"
            # Prefer the compact serialized form; fall back to str() when
            # the value cannot be serialized.
            try:
                self.value = json_dumps(data)
            except Exception:
                self.value = str(data)

    def full_repr(self):
        """Return the full minimized JSON representation (without trimming) for this node."""
        if self.type == "primitive":
            return self.value
        elif self.type == "dict":
            parts = [f'"{k}":{child.full_repr()}' for k, child in self.children]
            return "{" + ",".join(parts) + "}"
        elif self.type == "list":
            parts = [child.full_repr() for child in self.children]
            return "[" + ",".join(parts) + "]"

    def full_weight(self):
        """Return the character count of the full representation."""
        return len(self.full_repr())

    def summarize(self, budget):
        """
        Return a summary string for this node, aiming to fit within budget
        characters.

        Primitives are truncated, dicts may lose whole key/value pairs, and
        lists are reduced to their first element plus an omission marker.
        Empty dicts and lists are always rendered as "{}" and "[]".
        """
        if self.type == "primitive":
            rep = self.value
            if len(rep) <= budget:
                return rep
            return _truncate(rep, budget)
        elif self.type == "dict":
            return self._summarize_dict(budget)
        elif self.type == "list":
            return self._summarize_list(budget)

    def _summarize_dict(self, budget):
        """
        Summarize a dict node.

        Returns the full representation when it fits. Otherwise repeatedly
        tries to fit the remaining pairs by giving each child a share of the
        budget proportional to its full length, dropping the pair with the
        longest full child representation after every failed attempt.
        """
        # If the dict is empty, return {}
        if not self.children:
            return "{}"

        # One entry per pair, in insertion order:
        # (key_repr, child node, full untrimmed child representation).
        entries = [
            (f'"{k}":', child, child.full_repr())
            for k, child in self.children
        ]

        # Braces plus the commas between pairs.
        fixed_overhead = 2 + (len(entries) - 1)
        total_full = sum(len(kr) + len(cf) for kr, _, cf in entries) + fixed_overhead
        # If full representation fits, return it.
        if total_full <= budget:
            return "{" + ",".join(kr + cf for kr, _, cf in entries) + "}"

        # Otherwise, try dropping some pairs. `kept` starts in insertion
        # order and only ever has elements deleted, so it stays in insertion
        # order without any re-sorting.
        kept = list(entries)
        while kept:
            # Cost that cannot be trimmed: key prefixes, commas and braces.
            fixed = sum(len(kr) for kr, _, _ in kept) + (len(kept) - 1) + 2
            remaining_budget = budget - fixed
            if remaining_budget >= 0:
                total_child_full = sum(len(cf) for _, _, cf in kept)
                # Allocate available budget for each child's summary proportionally.
                parts = []
                for key_repr, child, child_full in kept:
                    if total_child_full > 0:
                        ideal = int(remaining_budget * (len(child_full) / total_child_full))
                    else:
                        ideal = 0
                    parts.append(key_repr + child.summarize(ideal))
                candidate = "{" + ",".join(parts) + "}"
                if len(candidate) <= budget:
                    return candidate
            # Either the fixed costs alone exceed the budget or the candidate
            # was still too long: drop the pair with the largest full child
            # representation (the first such pair on ties).
            longest = max(range(len(kept)), key=lambda i: len(kept[i][2]))
            del kept[longest]
        # If nothing remains, return a truncated empty object.
        return _truncate("{}", budget)

    def _summarize_list(self, budget):
        """
        Summarize a list node.

        Returns the full representation when it fits. Otherwise shows only
        the summarized first element, followed by ",..." when the list has
        more than one element, truncating the result if it is still too long.
        """
        # If the list is empty, return []
        if not self.children:
            return "[]"
        full_repr = self.full_repr()
        if len(full_repr) <= budget:
            return full_repr
        # For lists, show only the first element and an omission indicator if more elements exist.
        suffix = ",..." if len(self.children) > 1 else ""
        inner_budget = budget - 2 - len(suffix)  # subtract brackets and suffix
        first_summary = self.children[0].summarize(inner_budget)
        candidate = "[" + first_summary + suffix + "]"
        if len(candidate) <= budget:
            return candidate
        return _truncate(candidate, budget)
145+
146+
147+
def summarize(data, max_length=200):
    """
    Build a tree for the given JSON-compatible data and return its summary,
    ensuring the final string is no longer than max_length.

    A zero or negative max_length yields an empty string.
    """
    root = JSONNode(data)
    # A nested dict whose summary was cut down to a bare "{" leaves "{," in
    # its parent's output; collapse that sequence.
    # NOTE: str.replace is global, so "{," inside string values is rewritten too.
    summary = root.summarize(max_length).replace("{,", "{")
    # Empty containers are rendered as "{}" / "[]" whatever the budget, and
    # negative budgets are not clamped further down, so enforce the
    # documented upper bound here.
    return summary[:max(max_length, 0)]

0 commit comments

Comments
 (0)