|
| 1 | +from deepdiff.serialization import json_dumps |
| 2 | + |
| 3 | + |
| 4 | +def _truncate(s, max_len): |
| 5 | + """ |
| 6 | + Truncate string s to max_len characters. |
| 7 | + If possible, keep the first (max_len-5) characters, then '...' then the last 2 characters. |
| 8 | + """ |
| 9 | + if len(s) <= max_len: |
| 10 | + return s |
| 11 | + if max_len <= 5: |
| 12 | + return s[:max_len] |
| 13 | + return s[:max_len - 5] + "..." + s[-2:] |
| 14 | + |
class JSONNode:
    """
    One node of a tree built from JSON-compatible data.

    Every node has a ``type`` of "dict", "list" or "primitive" and the ``key``
    it was given at construction. Dict nodes hold ``children`` as a list of
    (key, JSONNode) pairs in insertion order, list nodes hold ``children`` as
    a list of JSONNode items, and primitive nodes hold ``value``, the string
    form of the wrapped data.
    """

    def __init__(self, data, key=None):
        """
        Build a tree node for the JSON data.
        If this node is a child of a dict, key is its key name.
        """
        self.key = key
        # Memoized result of full_repr(). No method of this class modifies a
        # node after construction, so the string only needs building once.
        self._full_repr_cache = None
        if isinstance(data, dict):
            self.type = "dict"
            # Preserve insertion order: list of (key, child) pairs.
            self.children = [(k, JSONNode(v, key=k)) for k, v in data.items()]
        elif isinstance(data, list):
            self.type = "list"
            self.children = [JSONNode(item) for item in data]
        else:
            self.type = "primitive"
            # For primitives, use json_dumps to get a compact representation;
            # fall back to str() as a best effort when that fails.
            try:
                self.value = json_dumps(data)
            except Exception:
                self.value = str(data)

    def full_repr(self):
        """Return the full minimized representation (without trimming) for this node."""
        cached = self._full_repr_cache
        if cached is None:
            if self.type == "primitive":
                cached = self.value
            elif self.type == "dict":
                cached = "{" + ",".join(
                    f'"{k}":{child.full_repr()}' for k, child in self.children
                ) + "}"
            elif self.type == "list":
                cached = "[" + ",".join(
                    child.full_repr() for child in self.children
                ) + "]"
            self._full_repr_cache = cached
        return cached

    def full_weight(self):
        """Return the character count of the full representation."""
        return len(self.full_repr())

    def summarize(self, budget):
        """
        Return a summary string for this node, aiming to fit within budget characters.
        The algorithm may drop whole sub-branches (for dicts) or truncate long primitives.
        """
        if self.type == "primitive":
            rep = self.value
            if len(rep) <= budget:
                return rep
            return _truncate(rep, budget)
        elif self.type == "dict":
            return self._summarize_dict(budget)
        elif self.type == "list":
            return self._summarize_list(budget)

    def _summarize_dict(self, budget):
        """
        Summarize a dict node.

        Returns the full representation when it fits. Otherwise pairs are
        dropped one at a time (largest full child first) until the remaining
        pairs, with each child summarized under a proportional share of the
        budget, fit. Falls back to a truncated "{}" when no pair can be kept.
        """
        # An empty dict is always rendered as {}.
        if not self.children:
            return "{}"
        # One (key_repr, child, child_full) triple per pair, in insertion order.
        pairs = [(f'"{k}":', child, child.full_repr()) for k, child in self.children]
        n = len(pairs)
        fixed_overhead = 2 + (n - 1)  # braces plus commas between pairs
        total_full = sum(len(kr) + len(cf) for kr, _, cf in pairs) + fixed_overhead
        # If the full representation fits, return it.
        if total_full <= budget:
            return "{" + ",".join(kr + cf for kr, _, cf in pairs) + "}"

        # Otherwise, try dropping some pairs. `kept` starts in insertion order
        # and only ever has items deleted, so it stays in insertion order and
        # never needs re-sorting.
        kept = list(pairs)
        while kept:
            # Cost of keys, commas and braces, before any child text.
            fixed = sum(len(kr) for kr, _, _ in kept) + (len(kept) - 1) + 2
            remaining_budget = budget - fixed
            if remaining_budget >= 0:
                total_child_full = sum(len(cf) for _, _, cf in kept)
                # Allocate the available budget to each child proportionally
                # to the length of its full representation.
                parts = []
                for kr, child, cf in kept:
                    ideal = int(remaining_budget * (len(cf) / total_child_full)) if total_child_full > 0 else 0
                    parts.append(kr + child.summarize(ideal))
                candidate = "{" + ",".join(parts) + "}"
                if len(candidate) <= budget:
                    return candidate
            # Either the fixed costs alone do not fit or the candidate is
            # still too long: drop the pair whose full child is longest
            # (the first such pair on ties).
            drop_at = max(range(len(kept)), key=lambda i: len(kept[i][2]))
            del kept[drop_at]
        # If nothing remains, return a truncated empty object.
        return _truncate("{}", budget)

    def _summarize_list(self, budget):
        """
        Summarize a list node.

        Returns the full representation when it fits. Otherwise only the
        first element is shown (summarized), followed by ",..." when more
        elements exist; the result is truncated if it is still too long.
        """
        # An empty list is always rendered as [].
        if not self.children:
            return "[]"
        full_repr = self.full_repr()
        if len(full_repr) <= budget:
            return full_repr
        # Show only the first element and an omission indicator if more elements exist.
        suffix = ",..." if len(self.children) > 1 else ""
        inner_budget = budget - 2 - len(suffix)  # subtract brackets and suffix
        first_summary = self.children[0].summarize(inner_budget)
        candidate = "[" + first_summary + suffix + "]"
        if len(candidate) <= budget:
            return candidate
        return _truncate(candidate, budget)
| 145 | + |
| 146 | + |
def summarize(data, max_length=200):
    """
    Summarize JSON-compatible data as a compact string.

    A JSONNode tree is built from data and asked for a summary using
    max_length as its character budget. Any "{," sequence in that summary is
    then collapsed to "{" before the string is returned.
    """
    tree = JSONNode(data)
    summary = tree.summarize(max_length)
    return summary.replace("{,", "{")
0 commit comments