Skip to content

Commit a827352

Browse files
committed
json-based file structure encoding and decoding
1 parent fb4914e commit a827352

File tree

3 files changed

+218
-223
lines changed

3 files changed

+218
-223
lines changed

servicex_analysis_utils/file_peeking.py

Lines changed: 108 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -30,94 +30,88 @@
3030
import uproot
3131
import numpy as np
3232
import awkward as ak
33+
import json
3334

34-
def run_query(input_filenames):
    """
    Sent to ServiceX python transformers.
    Opens a ROOT file and returns a JSON-formatted string describing the structure,
    encoded inside an ak.Array for ServiceX.

    The JSON object maps each TTree name (cycle suffix removed) to a dict of
    {branch name: str(branch.interpretation)}.

    Parameters:
        input_filenames: Path or URL passed straight to uproot.open.

    Returns:
        ak.Array: Length-1 array whose only element is the JSON string.
    """
    # Imports are repeated locally on purpose: presumably the function source is
    # shipped on its own to the ServiceX transformer (TODO confirm).
    import uproot
    import awkward as ak
    import json

    def is_tree(obj):
        """Helper to check if a root file item is TTree."""
        # Different object types expose .classname or .classnames,
        # either as a plain attribute or as a method.
        if hasattr(obj, "classname"):
            cls_attr = obj.classname
            cls_value = cls_attr() if callable(cls_attr) else cls_attr
            return "TTree" in cls_value
        elif hasattr(obj, "classnames"):
            cls_attr = obj.classnames
            cls_values = cls_attr() if callable(cls_attr) else cls_attr
            return any("TTree" in cls for cls in cls_values)
        return False

    tree_dict = {}

    with uproot.open(input_filenames) as file:
        for tree_name in file.keys():
            tree = file[tree_name]

            # Only TTrees are described
            if not is_tree(tree):
                continue

            # Remove only the trailing ";<cycle>" suffix. str.rstrip(";1") strips a
            # *set of characters*, so it also ate real trailing "1"s ("tree1;1" ->
            # "tree") and left other cycles (";2") untouched.
            # Several cycles of one tree now share a single entry.
            tree_name_clean = tree_name.rsplit(";", 1)[0]

            # Branch name -> uproot interpretation, stored as a string
            tree_dict[tree_name_clean] = {
                branch_name: str(branch.interpretation)
                for branch_name, branch in tree.items()
            }

    # Serialize to JSON and wrap the string in an awkward array
    return ak.Array([json.dumps(tree_dict)])
9080

9181
def build_deliver_spec(dataset):
9282
"""
9383
Helper to build the servicex.deliver configuration.
9484
Supports multiple inputs for multiple sample queries.
9585
9686
Parameters:
97-
dataset (str, [str], or dict): Rucio DIDs to be checked by the servicex workers.
87+
dataset (str, [str], or dict): Rucio DIDs to be checked by the servicex workers.
9888
If dict, custom names can be inputed
9989
10090
Returns:
10191
spec_python (dict): The specification for the python function query containing Name, Query, Dataset, NFiles
10292
"""
103-
#Servicex query using the PythonFunction backend
104-
query_PythonFunction = servicex.query.PythonFunction().with_uproot_function(run_query)
105-
106-
#Create a dict with sample name for ServiceX query & datasetID
107-
dataset_dict={}
108-
user_in=type(dataset)
109-
93+
# Servicex query using the PythonFunction backend
94+
query_PythonFunction = servicex.query.PythonFunction().with_uproot_function(
95+
run_query
96+
)
97+
98+
# Create a dict with sample name for ServiceX query & datasetID
99+
dataset_dict = {}
100+
user_in = type(dataset)
101+
110102
if user_in == str:
111-
dataset_dict.update({"Sample":dataset})
103+
dataset_dict.update({"Sample": dataset})
112104
elif user_in == list and type(dataset[0]) is str:
113105
for i in range(len(dataset)):
114-
name="Sample"+str(i+1) #write number for humans
115-
dataset_dict.update({name:dataset[i]})
106+
name = "Sample" + str(i + 1) # write number for humans
107+
dataset_dict.update({name: dataset[i]})
116108
elif user_in == dict:
117-
dataset_dict=dataset
109+
dataset_dict = dataset
118110
else:
119-
raise ValueError(f"Unsupported dataset input type: {user_in}.\nInput must be dict ('sample_name':'dataset_id'), str or list of str")
120-
111+
raise ValueError(
112+
f"Unsupported dataset input type: {user_in}.\nInput must be dict ('sample_name':'dataset_id'), str or list of str"
113+
)
114+
121115
sample_list = [
122116
{
123117
"NFiles": 1,
@@ -129,57 +123,51 @@ def build_deliver_spec(dataset):
129123
]
130124
spec_python = {"Sample": sample_list}
131125

132-
return spec_python
126+
return spec_python
127+
133128

134-
def print_structure_from_str(deliver_dict, filter_branch="", save_to_txt=False, do_print=False):
129+
def print_structure_from_str(
130+
deliver_dict, filter_branch="", save_to_txt=False, do_print=False
131+
):
135132
"""
136-
Re-formats the deliver-retrieve str structure for readability with a filter for branch selection.
137-
The string can be printed, written out or returned
133+
Re-formats the JSON structure string from ServiceX into a readable summary.
138134
139135
Parameters:
140136
deliver_dict (dict): ServiceX deliver output (keys: sample names, values: file paths or URLs).
141-
filter_branch (str): If provided, only branches containing this string are included in the output.
137+
filter_branch (str): If provided, only branches containing this string are included.
142138
save_to_txt (bool): If True, saves output to a text file instead of returning it.
143-
do_print (bool): If True, dumps the ouput to the terminal and returns None. Not called if save_to_txt is True
139+
do_print (bool): If True, prints the output to the terminal and returns None.
144140
145141
Returns:
146142
result_str (str): The formatted file structure.
147143
"""
144+
import uproot
145+
import json
146+
148147
output_lines = []
149-
output_lines.append(f"\nFile structure of all samples with branch filter '{filter_branch}':")
148+
output_lines.append(
149+
f"\nFile structure of all samples with branch filter '{filter_branch}':"
150+
)
150151

151152
for sample_name, path in deliver_dict.items():
152153
output_lines.append(
153154
f"\n---------------------------\n"
154-
f"\U0001F4C1 Sample: {sample_name}\n"
155+
f"\U0001f4c1 Sample: {sample_name}\n"
155156
f"---------------------------"
156157
)
157158

158159
with uproot.open(path[0]) as f:
159-
structure_str = f["servicex"]["branch"].array()[0]
160-
161-
# Trees separated by \n
162-
tree_lines = structure_str.split("\n")
163-
for line in tree_lines:
164-
if not line.strip():
165-
continue # Skip empty lines
166-
167-
#Separate Tree header from branches
168-
parts = line.split(";", 1)
169-
tree_header = parts[0]
170-
output_lines.append(f"\n\U0001F333 {tree_header}")
171-
172-
if len(parts) > 1:
173-
branch_infos = parts[1].split(",") # Branches separated by ,
174-
output_lines.append(" ├── Branches:")
175-
for b in branch_infos:
176-
branch_line = b.strip()
177-
# Removes lines w/o filter str
178-
if filter_branch not in branch_line:
179-
continue
180-
if branch_line.startswith("TBranch:"):
181-
output_lines.append(f" │ ├── {branch_line[8:]}")
182-
160+
json_str = f["servicex"]["branch"].array()[0]
161+
structure_dict = json.loads(json_str)
162+
163+
for tree_name, branches in structure_dict.items():
164+
output_lines.append(f"\n\U0001f333 Tree: {tree_name}")
165+
output_lines.append(" ├── Branches:")
166+
for branch_name, dtype in branches.items():
167+
if filter_branch and filter_branch not in branch_name:
168+
continue
169+
output_lines.append(f" │ ├── {branch_name} ; dtype: {dtype}")
170+
183171
result_str = "\n".join(output_lines)
184172

185173
if save_to_txt:
@@ -188,18 +176,19 @@ def print_structure_from_str(deliver_dict, filter_branch="", save_to_txt=False,
188176
return "File structure saved to 'samples_structure.txt'."
189177
elif do_print:
190178
print(result_str)
191-
return
179+
return
192180
else:
193181
return result_str
194182

183+
195184
def parse_jagged_depth_and_dtype(dtype_str):
196185
"""
197186
Helper to decode the dtype str for each branch.
198187
199188
Parses uproot-style interpretation strings such as:
200189
- "AsJagged(AsJagged(AsDtype('>f4')))"
201190
202-
Returns the number of nested layers and the inner dtype.
191+
Returns the number of nested layers and the inner dtype.
203192
Used in str_to_array to reconstruct the ak.array.
204193
205194
Parameters:
@@ -214,106 +203,83 @@ def parse_jagged_depth_and_dtype(dtype_str):
214203
# Count how many nested AsJagged(...) wrappers exist
215204
while current.startswith("AsJagged("):
216205
depth += 1
217-
current = current[len("AsJagged("):-1].strip() # Strip outermost wrapper, up to -1 to remove )
206+
current = current[
207+
len("AsJagged(") : -1
208+
].strip() # Strip outermost wrapper, up to -1 to remove )
218209

219210
# Extract the base dtype string from AsDtype('<np-format>')
220211
if current.startswith("AsDtype('") and current.endswith("')"):
221-
base_dtype = current[len("AsDtype('"):-2]
212+
base_dtype = current[len("AsDtype('") : -2]
222213
return depth, base_dtype
223214
else:
224215
return depth, None
225216

226-
def str_to_array(encoded_json_str):
    """
    Helper to rebuild a typed skeleton of the file from its JSON structure string.
    Each tree becomes a record field whose sub-fields are the branches, filled
    with a single dummy value of the right dtype and jagged depth.

    Parameters:
        encoded_json_str (str): JSON string from run_query
            ({tree name: {branch name: interpretation string}}).

    Returns:
        The `.type` of the ak.Array built from all trees. Branches whose dtype
        cannot be decoded are stored as None; trees without branches are omitted.
    """
    trees = {}

    for treename, branch_dict in json.loads(encoded_json_str).items():
        fields = {}

        for branch_name, dtype_str in branch_dict.items():
            # Jagged depth and numpy base dtype from the interpretation string
            depth, base_dtype_str = parse_jagged_depth_and_dtype(dtype_str)

            np_dtype = None
            if base_dtype_str is not None:
                try:
                    np_dtype = np.dtype(base_dtype_str)
                except TypeError:
                    np_dtype = None

            # Undecodable interpretation or dtype: keep the field, without a type
            if np_dtype is None:
                fields[branch_name] = None
                continue

            # Typed placeholder value, nested once per jagged layer
            placeholder = np_dtype.type(0)
            for _ in range(depth):
                placeholder = [placeholder]

            # Length-1 array holding the placeholder
            fields[branch_name] = ak.Array([placeholder])

        if fields:
            # One-entry record array: dict of branch arrays
            trees[treename] = ak.Array([fields])

    return ak.Array(trees).type
293258

259+
294260
def get_structure(dataset, array_out=False, **kwargs):
    """
    Utility function.
    Creates and sends the ServiceX request from user-supplied datasets to retrieve the file structure.
    By default calls print_structure_from_str() to dump the structure in a user-friendly format.

    Parameters:
        dataset (dict,str,[str]): The datasets from which to print the file structures.
                                A custom sample name per dataset can be given in a dict form: {'sample_name':'dataset_id'}
        array_out (bool): If truthy, skip the formatted output and return the
                          str_to_array() result for each sample instead.
        kwargs : Arguments to be propagated to print_structure_from_str
                 (unused when array_out is set)

    Returns:
        dict mapping sample name to the str_to_array() result when array_out is set,
        otherwise whatever print_structure_from_str() returns.
    """
    spec_python = build_deliver_spec(dataset)

    output = servicex.deliver(spec_python)

    # Default path: human-readable dump
    if not array_out:
        return print_structure_from_str(output, **kwargs)

    all_arrays = {}
    for sample, path in output.items():
        # Only the first delivered file of each sample is read
        with uproot.open(path[0]) as f:
            structure_str = f["servicex"]["branch"].array()[0]
            all_arrays[sample] = str_to_array(structure_str)
    return all_arrays

0 commit comments

Comments
 (0)