#!/usr/bin/env python3
"""
Convert an NDJSON file to a prettified JSON file.

This script converts an NDJSON (newline-delimited JSON) file into a standard,
human-readable JSON file: each input line becomes an element of a JSON array,
written with pretty formatting.

Example:
    Input NDJSON file (data.ndjson):
        {"name": "Alice", "age": 30}
        {"name": "Bob", "age": 25}
        {"name": "Charlie", "age": 35}

    Output JSON file (data_prettified.json):
        [
          {
            "age": 30,
            "name": "Alice"
          },
          {
            "age": 25,
            "name": "Bob"
          },
          {
            "age": 35,
            "name": "Charlie"
          }
        ]

Usage:
    python prettify_ndjson.py data.ndjson
    python prettify_ndjson.py --lines 1,3 data.ndjson   # Only process lines 1 and 3
    python prettify_ndjson.py --save-irs logs.ndjson    # Keep all fields for compilation events
"""

import argparse
import json
import sys
from pathlib import Path
from typing import Any, List, Optional

def parse_line_ranges(lines_arg: str) -> set[int]:
    """
    Parse line ranges from a string like "1,2,3,5-10" into a set of line numbers.

    Line numbers use 1-based indexing (the first line is line 1, not 0).

    Args:
        lines_arg: String containing comma-separated line numbers and ranges.
            Examples: "1", "1,2,3", "5-10", "1,3,5-10,15"

    Returns:
        Set of line numbers (1-based indexing, where 1 = first line)

    Raises:
        ValueError: If the format is invalid or contains non-positive numbers
    """
    line_numbers = set()

    if not lines_arg.strip():
        return line_numbers

    parts = lines_arg.split(",")
    for part in parts:
        part = part.strip()
        if not part:
            continue

        if "-" in part:
            # Handle a range like "5-10"
            try:
                start, end = part.split("-", 1)
                start_num = int(start.strip())
                end_num = int(end.strip())
            except ValueError:
                raise ValueError(f"Invalid range format: {part}")
            if start_num <= 0 or end_num <= 0:
                raise ValueError("Line numbers must be positive")
            if start_num > end_num:
                raise ValueError(f"Invalid range: {part} (start > end)")
            line_numbers.update(range(start_num, end_num + 1))
        else:
            # Handle a single number like "1"
            try:
                line_num = int(part)
            except ValueError:
                raise ValueError(f"Invalid line number: {part}")
            if line_num <= 0:
                raise ValueError("Line numbers must be positive")
            line_numbers.add(line_num)

    return line_numbers

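# Illustrative results for parse_line_ranges, derived from the logic above (comments only):
#   parse_line_ranges("1,3,5-8") -> {1, 3, 5, 6, 7, 8}
#   parse_line_ranges("")        -> set()
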
def load_ndjson(
    file_path: Path, save_irs: bool = False, line_filter: Optional[set[int]] = None
) -> List[Any]:
    """
    Load an NDJSON file and return a list of JSON objects.

    Args:
        file_path: Path to the NDJSON file
        save_irs: Whether to keep file_content and python_source for compilation events
        line_filter: Set of line numbers to include (1-based indexing); None means include all

    Returns:
        List of parsed JSON objects

    Raises:
        FileNotFoundError: If the input file doesn't exist
        json.JSONDecodeError: If a line contains invalid JSON
    """
    json_objects = []
    filtered_compilation_events = 0
    total_lines_processed = 0

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            # enumerate(f, 1) starts line numbering from 1 (1-based indexing)
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:  # Skip empty lines
                    continue

                # Skip the line if filtering is enabled and this line is not in the filter.
                # line_num is 1-based (first line = 1, second line = 2, etc.)
                if line_filter is not None and line_num not in line_filter:
                    continue

                total_lines_processed += 1

                try:
                    json_obj = json.loads(line)

                    # Unless save_irs is set, strip file_content and python_source
                    # from compilation events to keep the output small.
                    if not save_irs and isinstance(json_obj, dict):
                        if json_obj.get("event_type") == "compilation":
                            payload = json_obj.get("payload")
                            if isinstance(payload, dict):
                                fields_to_remove = [
                                    field
                                    for field in ("file_content", "python_source")
                                    if field in payload
                                ]
                                if fields_to_remove:
                                    # Copy before deleting to avoid mutating the originals
                                    payload = payload.copy()
                                    for field in fields_to_remove:
                                        del payload[field]
                                    json_obj = json_obj.copy()
                                    json_obj["payload"] = payload
                                    filtered_compilation_events += 1

                    json_objects.append(json_obj)
                except json.JSONDecodeError as e:
                    print(
                        f"Error parsing JSON on line {line_num}: {e}", file=sys.stderr
                    )
                    print(f"Problematic line: {line[:100]}...", file=sys.stderr)
                    raise

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.", file=sys.stderr)
        raise
    except Exception as e:
        print(f"Error reading file '{file_path}': {e}", file=sys.stderr)
        raise

    # Print informational messages
    if line_filter is not None:
        if line_filter:
            print(
                f"Line filtering: processed {total_lines_processed} out of {len(line_filter)} specified lines"
            )
        else:
            print("Line filtering: no valid lines specified")

    # Print a warning if compilation events were stripped of their large fields
    if not save_irs and filtered_compilation_events > 0:
        print(
            f"WARNING: Removed 'file_content' and 'python_source' fields from "
            f"{filtered_compilation_events} compilation events to reduce file size.",
            file=sys.stderr,
        )
        print(
            "         Use --save-irs flag to preserve these fields if needed.",
            file=sys.stderr,
        )

    return json_objects

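# Illustrative example of the save_irs filtering (comments only; the "status" key is
# hypothetical and only shows that other payload fields are preserved):
#   input line:  {"event_type": "compilation", "payload": {"file_content": "...", "status": "ok"}}
#   loaded as:   {"event_type": "compilation", "payload": {"status": "ok"}}   (save_irs=False)
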
def save_prettified_json(json_objects: List[Any], output_path: Path) -> None:
    """
    Save a list of JSON objects to a prettified JSON file.

    Args:
        json_objects: List of JSON objects to save
        output_path: Path to write the prettified JSON file to
    """
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            # sort_keys=True produces the alphabetically ordered keys shown in the module docstring
            json.dump(json_objects, f, indent=2, ensure_ascii=False, sort_keys=True)
        print(f"Successfully converted to prettified JSON: {output_path}")
    except Exception as e:
        print(f"Error writing to file '{output_path}': {e}", file=sys.stderr)
        raise

def main():
    """Main function to handle command line arguments and orchestrate the conversion."""
    parser = argparse.ArgumentParser(
        description="Convert NDJSON file to prettified JSON file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python prettify_ndjson.py data.ndjson
  python prettify_ndjson.py /path/to/logs.ndjson
        """,
    )

    parser.add_argument(
        "ndjson_file", type=str, help="Path to the NDJSON file to convert"
    )

    parser.add_argument(
        "--save-irs",
        action="store_true",
        default=False,
        help="Save file_content and python_source for compilation events (default: False, removes these fields to reduce size)",
    )

    parser.add_argument(
        "--lines",
        type=str,
        help="Specify line numbers to include using 1-based indexing (e.g., '1,2,3,5-10'). "
        "Line 1 is the first line of the file. Only these lines from the original NDJSON will be processed. "
        "Supports individual lines (1,2,3) and ranges (5-10).",
    )

    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="Specify output file path (default: {input_stem}_prettified.json in the same directory as input)",
    )

    args = parser.parse_args()

    # Convert to Path object and validate
    input_path = Path(args.ndjson_file)

    if not input_path.exists():
        print(f"Error: File '{input_path}' does not exist.", file=sys.stderr)
        sys.exit(1)

    if not input_path.is_file():
        print(f"Error: '{input_path}' is not a file.", file=sys.stderr)
        sys.exit(1)

    # Generate output filename
    if args.output:
        output_path = Path(args.output)
    else:
        # Default: {input_stem}_prettified.json in the same directory as the input
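        # e.g. an input of /path/to/data.ndjson is written to /path/to/data_prettified.json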
        output_path = input_path.parent / f"{input_path.stem}_prettified.json"

    try:
        # Parse line filter if provided
        line_filter = None
        if args.lines:
            try:
                line_filter = parse_line_ranges(args.lines)
                print(
                    f"Line filtering enabled: will process {len(line_filter)} specified lines"
                )
            except ValueError as e:
                print(f"Error parsing --lines argument: {e}", file=sys.stderr)
                sys.exit(1)

        # Load NDJSON file
        print(f"Loading NDJSON file: {input_path}")
        if not args.save_irs:
            print(
                "Filtering out file_content and python_source from compilation events to reduce size"
            )
        json_objects = load_ndjson(
            input_path, save_irs=args.save_irs, line_filter=line_filter
        )
        print(f"Loaded {len(json_objects)} JSON objects")

        # Save as prettified JSON
        print(f"Saving prettified JSON to: {output_path}")
        save_prettified_json(json_objects, output_path)

        print("Conversion completed successfully!")

    except Exception as e:
        print(f"Conversion failed: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()