Skip to content

Commit 6ef4072

Browse files
FindHao and facebook-github-bot
authored and committed
Add a tool script to prettify ndjson for debugging
Summary: This script prettifies an NDJSON file and writes a valid, human-friendly JSON file. It has three key features: **field filtering** (the `--save-irs` flag; by default `file_content` and `python_source` are removed from compilation events to reduce file size), **line range filtering** (`--lines "1,2,3,5-10"`, using 1-based indexing, to process only specific lines), and **custom output path** (`-o output.json` to specify the output location). The implementation includes a new `parse_line_ranges()` function for complex line parsing, an enhanced `load_ndjson()` with filtering capabilities, and comprehensive error handling; it maintains full backward compatibility while providing significant file size reduction and selective processing for large NDJSON files. Reviewed By: davidberard98 Differential Revision: D77679362 fbshipit-source-id: f21216aae645a6f26c56f8398e5a644d94a57050
1 parent 153bce5 commit 6ef4072

File tree

4 files changed

+319
-4
lines changed

4 files changed

+319
-4
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "tritonparse"
7-
version = "0.0.4"
7+
version = "0.0.5"
88
dependencies = [
99
"triton",
1010
]

tritonparse/tools/prettify_ndjson.py

Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,315 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Convert an NDJSON file to a prettified JSON file.
4+
5+
This script takes an NDJSON (newline-delimited JSON) file and converts it to a
6+
standard human-readable JSON file where each line becomes an element in a JSON array, with
7+
pretty formatting applied.
8+
9+
Example:
10+
Input NDJSON file (data.ndjson):
11+
{"name": "Alice", "age": 30}
12+
{"name": "Bob", "age": 25}
13+
{"name": "Charlie", "age": 35}
14+
15+
Output JSON file (data_prettified.json):
16+
[
17+
{
18+
"age": 30,
19+
"name": "Alice"
20+
},
21+
{
22+
"age": 25,
23+
"name": "Bob"
24+
},
25+
{
26+
"age": 35,
27+
"name": "Charlie"
28+
}
29+
]
30+
31+
Usage:
32+
python prettify_ndjson.py data.ndjson
33+
python prettify_ndjson.py --lines 1,3 data.ndjson # Only process lines 1 and 3
34+
python prettify_ndjson.py --save-irs logs.ndjson # Keep all fields for compilation events
35+
36+
37+
"""
38+
39+
import argparse
import json
import sys
from pathlib import Path
from typing import Any, List, Optional
44+
45+
46+
def parse_line_ranges(lines_arg: str) -> set[int]:
    """
    Parse line ranges from string like "1,2,3,5-10" into a set of line numbers.

    Line numbers use 1-based indexing (first line is line 1, not 0).
    Whitespace around entries is ignored and empty entries (e.g. "1,,2")
    are skipped.

    Args:
        lines_arg: String containing comma-separated line numbers and ranges
                   Examples: "1", "1,2,3", "5-10", "1,3,5-10,15"

    Returns:
        Set of line numbers (1-based indexing, where 1 = first line).
        An empty or whitespace-only argument yields an empty set.

    Raises:
        ValueError: If the format is invalid or contains non-positive numbers
    """
    line_numbers: set[int] = set()

    for raw_part in lines_arg.split(","):
        part = raw_part.strip()
        if not part:
            continue

        if "-" in part:
            # Handle range like "5-10". Only the int() conversions live in
            # the try block so that the validation errors raised below are
            # never mistaken for (or rewritten as) a format error.
            start_text, end_text = part.split("-", 1)
            try:
                start_num = int(start_text.strip())
                end_num = int(end_text.strip())
            except ValueError as exc:
                raise ValueError(f"Invalid range format: {part}") from exc
            if start_num <= 0 or end_num <= 0:
                raise ValueError("Line numbers must be positive")
            if start_num > end_num:
                raise ValueError(f"Invalid range: {part} (start > end)")
            line_numbers.update(range(start_num, end_num + 1))
        else:
            # Handle single number like "1"
            try:
                line_num = int(part)
            except ValueError as exc:
                raise ValueError(f"Invalid line number: {part}") from exc
            # Checked outside the try so this message reaches the caller
            # instead of being replaced by "Invalid line number".
            if line_num <= 0:
                raise ValueError("Line numbers must be positive")
            line_numbers.add(line_num)

    return line_numbers
99+
100+
101+
# Payload fields dropped from compilation events unless save_irs is set.
_IR_FIELDS = ("file_content", "python_source")


def _strip_ir_fields(json_obj: Any) -> bool:
    """Remove the _IR_FIELDS keys from a compilation event's payload, in place.

    Returns True if at least one field was removed, False otherwise
    (including when json_obj is not a compilation event dict).
    """
    if not isinstance(json_obj, dict) or json_obj.get("event_type") != "compilation":
        return False
    payload = json_obj.get("payload")
    if not isinstance(payload, dict):
        return False

    removed = False
    for field in _IR_FIELDS:
        if field in payload:
            del payload[field]
            removed = True
    return removed


def load_ndjson(
    file_path: Path, save_irs: bool = False, line_filter: Optional[set[int]] = None
) -> List[Any]:
    """
    Load NDJSON file and return list of JSON objects.

    Blank lines are skipped but still count towards line numbering, so
    line numbers always match the physical lines of the file.

    Args:
        file_path: Path to the NDJSON file
        save_irs: Whether to save file_content and python_source for compilation events
        line_filter: Set of line numbers to include (1-based indexing), None means include all

    Returns:
        List of parsed JSON objects

    Raises:
        FileNotFoundError: If the input file doesn't exist
        json.JSONDecodeError: If a line contains invalid JSON
    """
    json_objects: List[Any] = []
    filtered_compilation_events = 0
    total_lines_processed = 0
    # Nothing past the highest requested line can be selected, so reading
    # can stop there instead of scanning the rest of a large file.
    last_wanted_line = max(line_filter) if line_filter else None

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            # enumerate(f, 1) starts line numbering from 1 (1-based indexing)
            for line_num, line in enumerate(f, 1):
                if last_wanted_line is not None and line_num > last_wanted_line:
                    break

                line = line.strip()
                if not line:  # Skip empty lines
                    continue

                # Skip line if line filtering is enabled and this line is not in the filter
                if line_filter is not None and line_num not in line_filter:
                    continue

                total_lines_processed += 1

                try:
                    json_obj = json.loads(line)
                except json.JSONDecodeError as e:
                    print(
                        f"Error parsing JSON on line {line_num}: {e}", file=sys.stderr
                    )
                    print(f"Problematic line: {line[:100]}...", file=sys.stderr)
                    raise

                # json_obj was parsed just above and is not shared with
                # anything else, so it is safe to edit it in place.
                if not save_irs and _strip_ir_fields(json_obj):
                    filtered_compilation_events += 1

                json_objects.append(json_obj)

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.", file=sys.stderr)
        raise
    except (OSError, UnicodeError) as e:
        # Only genuine read/decode failures are reported here; a JSON parse
        # error has already been reported above and must not be reported a
        # second time as a file-read error.
        print(f"Error reading file '{file_path}': {e}", file=sys.stderr)
        raise

    # Print informational messages
    if line_filter is not None:
        if line_filter:
            print(
                f"Line filtering: processed {total_lines_processed} out of {len(line_filter)} specified lines"
            )
        else:
            print("Line filtering: no valid lines specified")

    # Print warning if compilation events were filtered
    if not save_irs and filtered_compilation_events > 0:
        print(
            f"WARNING: Removed 'file_content' and 'python_source' fields from {filtered_compilation_events} compilation events to reduce file size.",
            file=sys.stderr,
        )
        print(
            "         Use --save-irs flag to preserve these fields if needed.",
            file=sys.stderr,
        )

    return json_objects
202+
203+
204+
def save_prettified_json(json_objects: List[Any], output_path: Path) -> None:
    """
    Write the given objects to output_path as one pretty-printed JSON array.

    The output uses a 2-space indent, alphabetically sorted keys and keeps
    non-ASCII characters unescaped. Any failure is reported on stderr and
    then re-raised.

    Args:
        json_objects: List of JSON objects to save
        output_path: Path where to save the prettified JSON file
    """
    try:
        with open(output_path, "w", encoding="utf-8") as out_file:
            json.dump(
                json_objects,
                out_file,
                indent=2,
                ensure_ascii=False,
                sort_keys=True,
            )
        print(f"Successfully converted to prettified JSON: {output_path}")
    except Exception as err:
        print(f"Error writing to file '{output_path}': {err}", file=sys.stderr)
        raise
219+
220+
221+
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command line parser for the NDJSON prettifier."""
    parser = argparse.ArgumentParser(
        description="Convert NDJSON file to prettified JSON file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "Examples:\n"
            "python prettify_ndjson.py data.ndjson\n"
            "python prettify_ndjson.py /path/to/logs.ndjson\n"
        ),
    )
    parser.add_argument(
        "ndjson_file", type=str, help="Path to the NDJSON file to convert"
    )
    parser.add_argument(
        "--save-irs",
        action="store_true",
        default=False,
        help="Save file_content and python_source for compilation events (default: False, removes these fields to reduce size)",
    )
    parser.add_argument(
        "--lines",
        type=str,
        help="Specify line numbers to include using 1-based indexing (e.g., '1,2,3,5-10'). "
        "Line 1 is the first line of the file. Only these lines from the original NDJSON will be processed. "
        "Supports individual lines (1,2,3) and ranges (5-10).",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="Specify output file path (default: {input_stem}_prettified.json in the same directory as input)",
    )
    return parser


def main():
    """Parse the command line, convert the NDJSON input and write the result.

    Exits with status 1 when the input path is missing or is not a regular
    file, when --lines cannot be parsed, or when loading/saving fails.
    """
    args = _build_arg_parser().parse_args()

    # Validate the input before doing any work.
    source = Path(args.ndjson_file)
    if not source.exists():
        print(f"Error: File '{source}' does not exist.", file=sys.stderr)
        sys.exit(1)
    if not source.is_file():
        print(f"Error: '{source}' is not a file.", file=sys.stderr)
        sys.exit(1)

    # An explicit -o/--output wins; otherwise the result is written next to
    # the input as <stem>_prettified.json.
    destination = (
        Path(args.output)
        if args.output
        else source.parent / f"{source.stem}_prettified.json"
    )

    try:
        # Optional line selection
        selected_lines = None
        if args.lines:
            try:
                selected_lines = parse_line_ranges(args.lines)
            except ValueError as err:
                print(f"Error parsing --lines argument: {err}", file=sys.stderr)
                # SystemExit is not an Exception subclass, so it passes
                # through the outer handler untouched.
                sys.exit(1)
            else:
                print(
                    f"Line filtering enabled: will process {len(selected_lines)} specified lines"
                )

        # Load
        print(f"Loading NDJSON file: {source}")
        if not args.save_irs:
            print(
                "Filtering out file_content and python_source from compilation events to reduce size"
            )
        records = load_ndjson(
            source, save_irs=args.save_irs, line_filter=selected_lines
        )
        print(f"Loaded {len(records)} JSON objects")

        # Save
        print(f"Saving prettified JSON to: {destination}")
        save_prettified_json(records, destination)

        print("Conversion completed successfully!")

    except Exception as err:
        print(f"Conversion failed: {err}", file=sys.stderr)
        sys.exit(1)


# Run the converter only when executed as a script, not on import.
if __name__ == "__main__":
    main()

website/package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

website/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "tritonparse-website",
33
"private": true,
4-
"version": "0.0.4",
4+
"version": "0.0.5",
55
"type": "module",
66
"scripts": {
77
"dev": "vite",

0 commit comments

Comments
 (0)