30
30
import uproot
31
31
import numpy as np
32
32
import awkward as ak
33
+ import json
33
34
34
def run_query(input_filenames):
    """
    Describe the TTree/TBranch structure of a ROOT file as a JSON string.

    This function is handed to the ServiceX PythonFunction backend through
    ``with_uproot_function``. The imports are kept inside the body so the
    function is self-contained (presumably because it is executed on the
    ServiceX python transformers rather than in this module -- confirm).

    The JSON document has the shape:
        {"TreeName1": {"BranchName1": "<uproot interpretation>", ...},
         "TreeName2": {...}}

    Parameters:
        input_filenames: File path/URL forwarded unchanged to uproot.open.

    Returns:
        ak.Array: A length-1 array whose only element is the JSON string.
    """
    import uproot
    import awkward as ak
    import json

    def is_tree(obj):
        """Helper to check if a root file item is TTree.

        Different object types expose either .classname or .classnames,
        as an attribute or as a callable.
        """
        if hasattr(obj, "classname"):
            cls_attr = obj.classname
            cls_value = cls_attr() if callable(cls_attr) else cls_attr
            return "TTree" in cls_value
        elif hasattr(obj, "classnames"):
            cls_attr = obj.classnames
            cls_values = cls_attr() if callable(cls_attr) else cls_attr
            return any("TTree" in cls for cls in cls_values)
        return False

    def strip_cycle(key):
        """Helper to drop a trailing ';<cycle number>' suffix from a file key.

        str.rstrip(";1") must not be used here: it removes any run of the
        characters ';' and '1', so "tree1;1" would become "tree" and a key
        ending in ";2" would be left untouched.
        """
        name, sep, cycle = key.rpartition(";")
        return name if sep and cycle.isdigit() else key

    tree_dict = {}

    with uproot.open(input_filenames) as file:
        for tree_name in file.keys():
            tree = file[tree_name]

            # Only TTrees are described; other objects are ignored
            if not is_tree(tree):
                continue

            # Map each branch name to the string form of its uproot interpretation
            tree_dict[strip_cycle(tree_name)] = {
                branch_name: str(branch.interpretation)
                for branch_name, branch in tree.items()
            }

    # Serialize tree_dict to a JSON string and wrap it in an awkward array
    json_str = json.dumps(tree_dict)
    return ak.Array([json_str])
90
80
91
81
def build_deliver_spec (dataset ):
92
82
"""
93
83
Helper to build the servicex.deliver configuration.
94
84
Supports multiple inputs for multiple sample queries.
95
85
96
86
Parameters:
97
- dataset (str, [str], or dict): Rucio DIDs to be checked by the servicex workers.
87
+ dataset (str, [str], or dict): Rucio DIDs to be checked by the servicex workers.
98
88
If dict, custom names can be inputed
99
89
100
90
Returns:
101
91
spec_python (dict): The specification for the python function query containing Name, Query, Dataset, NFiles
102
92
"""
103
- #Servicex query using the PythonFunction backend
104
- query_PythonFunction = servicex .query .PythonFunction ().with_uproot_function (run_query )
105
-
106
- #Create a dict with sample name for ServiceX query & datasetID
107
- dataset_dict = {}
108
- user_in = type (dataset )
109
-
93
+ # Servicex query using the PythonFunction backend
94
+ query_PythonFunction = servicex .query .PythonFunction ().with_uproot_function (
95
+ run_query
96
+ )
97
+
98
+ # Create a dict with sample name for ServiceX query & datasetID
99
+ dataset_dict = {}
100
+ user_in = type (dataset )
101
+
110
102
if user_in == str :
111
- dataset_dict .update ({"Sample" :dataset })
103
+ dataset_dict .update ({"Sample" : dataset })
112
104
elif user_in == list and type (dataset [0 ]) is str :
113
105
for i in range (len (dataset )):
114
- name = "Sample" + str (i + 1 ) # write number for humans
115
- dataset_dict .update ({name :dataset [i ]})
106
+ name = "Sample" + str (i + 1 ) # write number for humans
107
+ dataset_dict .update ({name : dataset [i ]})
116
108
elif user_in == dict :
117
- dataset_dict = dataset
109
+ dataset_dict = dataset
118
110
else :
119
- raise ValueError (f"Unsupported dataset input type: { user_in } .\n Input must be dict ('sample_name':'dataset_id'), str or list of str" )
120
-
111
+ raise ValueError (
112
+ f"Unsupported dataset input type: { user_in } .\n Input must be dict ('sample_name':'dataset_id'), str or list of str"
113
+ )
114
+
121
115
sample_list = [
122
116
{
123
117
"NFiles" : 1 ,
@@ -129,57 +123,51 @@ def build_deliver_spec(dataset):
129
123
]
130
124
spec_python = {"Sample" : sample_list }
131
125
132
- return spec_python
126
+ return spec_python
127
+
133
128
134
- def print_structure_from_str (deliver_dict , filter_branch = "" , save_to_txt = False , do_print = False ):
129
+ def print_structure_from_str (
130
+ deliver_dict , filter_branch = "" , save_to_txt = False , do_print = False
131
+ ):
135
132
"""
136
- Re-formats the deliver-retrieve str structure for readability with a filter for branch selection.
137
- The string can be printed, written out or returned
133
+ Re-formats the JSON structure string from ServiceX into a readable summary.
138
134
139
135
Parameters:
140
136
deliver_dict (dict): ServiceX deliver output (keys: sample names, values: file paths or URLs).
141
- filter_branch (str): If provided, only branches containing this string are included in the output .
137
+ filter_branch (str): If provided, only branches containing this string are included.
142
138
save_to_txt (bool): If True, saves output to a text file instead of returning it.
143
- do_print (bool): If True, dumps the ouput to the terminal and returns None. Not called if save_to_txt is True
139
+ do_print (bool): If True, prints the output to the terminal and returns None.
144
140
145
141
Returns:
146
142
result_str (str): The formatted file structure.
147
143
"""
144
+ import uproot
145
+ import json
146
+
148
147
output_lines = []
149
- output_lines .append (f"\n File structure of all samples with branch filter '{ filter_branch } ':" )
148
+ output_lines .append (
149
+ f"\n File structure of all samples with branch filter '{ filter_branch } ':"
150
+ )
150
151
151
152
for sample_name , path in deliver_dict .items ():
152
153
output_lines .append (
153
154
f"\n ---------------------------\n "
154
- f"\U0001F4C1 Sample: { sample_name } \n "
155
+ f"\U0001f4c1 Sample: { sample_name } \n "
155
156
f"---------------------------"
156
157
)
157
158
158
159
with uproot .open (path [0 ]) as f :
159
- structure_str = f ["servicex" ]["branch" ].array ()[0 ]
160
-
161
- # Trees separated by \n
162
- tree_lines = structure_str .split ("\n " )
163
- for line in tree_lines :
164
- if not line .strip ():
165
- continue # Skip empty lines
166
-
167
- #Separate Tree header from branches
168
- parts = line .split (";" , 1 )
169
- tree_header = parts [0 ]
170
- output_lines .append (f"\n \U0001F333 { tree_header } " )
171
-
172
- if len (parts ) > 1 :
173
- branch_infos = parts [1 ].split ("," ) # Branches separated by ,
174
- output_lines .append (" ├── Branches:" )
175
- for b in branch_infos :
176
- branch_line = b .strip ()
177
- # Removes lines w/o filter str
178
- if filter_branch not in branch_line :
179
- continue
180
- if branch_line .startswith ("TBranch:" ):
181
- output_lines .append (f" │ ├── { branch_line [8 :]} " )
182
-
160
+ json_str = f ["servicex" ]["branch" ].array ()[0 ]
161
+ structure_dict = json .loads (json_str )
162
+
163
+ for tree_name , branches in structure_dict .items ():
164
+ output_lines .append (f"\n \U0001f333 Tree: { tree_name } " )
165
+ output_lines .append (" ├── Branches:" )
166
+ for branch_name , dtype in branches .items ():
167
+ if filter_branch and filter_branch not in branch_name :
168
+ continue
169
+ output_lines .append (f" │ ├── { branch_name } ; dtype: { dtype } " )
170
+
183
171
result_str = "\n " .join (output_lines )
184
172
185
173
if save_to_txt :
@@ -188,18 +176,19 @@ def print_structure_from_str(deliver_dict, filter_branch="", save_to_txt=False,
188
176
return "File structure saved to 'samples_structure.txt'."
189
177
elif do_print :
190
178
print (result_str )
191
- return
179
+ return
192
180
else :
193
181
return result_str
194
182
183
+
195
184
def parse_jagged_depth_and_dtype (dtype_str ):
196
185
"""
197
186
Helper to decode the dtype str for each branch.
198
187
199
188
Parses uproot-style interpretation strings such as:
200
189
- "AsJagged(AsJagged(AsDtype('>f4')))"
201
190
202
- Returns the number of nested layers and the inner dtype.
191
+ Returns the number of nested layers and the inner dtype.
203
192
Used in str_to_array to reconstruct the ak.array.
204
193
205
194
Parameters:
@@ -214,106 +203,83 @@ def parse_jagged_depth_and_dtype(dtype_str):
214
203
# Count how many nested AsJagged(...) wrappers exist
215
204
while current .startswith ("AsJagged(" ):
216
205
depth += 1
217
- current = current [len ("AsJagged(" ):- 1 ].strip () # Strip outermost wrapper, up to -1 to remove )
206
+ current = current [
207
+ len ("AsJagged(" ) : - 1
208
+ ].strip () # Strip outermost wrapper, up to -1 to remove )
218
209
219
210
# Extract the base dtype string from AsDtype('<np-format>')
220
211
if current .startswith ("AsDtype('" ) and current .endswith ("')" ):
221
- base_dtype = current [len ("AsDtype('" ): - 2 ]
212
+ base_dtype = current [len ("AsDtype('" ) : - 2 ]
222
213
return depth , base_dtype
223
214
else :
224
215
return depth , None
225
216
226
def str_to_array(encoded_json_str):
    """
    Helper to rebuild a typed skeleton of a file from its JSON structure string.

    Every branch whose dtype string can be decoded is replaced by a length-1
    ak.Array holding a zero of the matching numpy type, nested in as many
    lists as the jagged depth reported by parse_jagged_depth_and_dtype.
    Branches whose dtype cannot be decoded are stored as None.

    Parameters:
        encoded_json_str (str): JSON string mapping tree names to
            {branch name: dtype string} dictionaries.

    Returns:
        The ``.type`` of the ak.Array assembled from all non-empty trees
        (i.e. the awkward type description, not the array itself).
    """
    trees = {}

    for treename, branch_dict in json.loads(encoded_json_str).items():
        branches = {}

        for branch_name, dtype_str in branch_dict.items():
            # Nesting depth and numpy base dtype decoded from the dtype string
            depth, base_dtype_str = parse_jagged_depth_and_dtype(dtype_str)

            placeholder = None  # kept when the dtype cannot be decoded
            if base_dtype_str is not None:
                try:
                    np_dtype = np.dtype(base_dtype_str)
                except TypeError:
                    pass
                else:
                    # Typed zero, wrapped once per jagged level
                    value = np_dtype.type(0)
                    for _ in range(depth):
                        value = [value]
                    placeholder = ak.Array([value])

            branches[branch_name] = placeholder

        # Trees without any branch are left out of the result
        if branches:
            trees[treename] = ak.Array([branches])

    return ak.Array(trees).type
293
258
259
+
294
260
def get_structure(dataset, array_out=False, **kwargs):
    """
    Utility function.
    Creates and sends the ServiceX request for the user-provided datasets to retrieve the file structure.
    Either reconstructs the structure with str_to_array() or calls
    print_structure_from_str() to dump it in a user-friendly format.

    Parameters:
        dataset (dict,str,[str]): The datasets from which to print the file structures.
            A custom sample name per dataset can be given in a dict form: {'sample_name':'dataset_id'}
        array_out (bool): If true, return the str_to_array() result per sample
            instead of the formatted text.
        kwargs : Arguments to be propagated to print_structure_from_str

    Returns:
        dict: {sample name: str_to_array() result} when array_out is true.
        Otherwise whatever print_structure_from_str(output, **kwargs) returns.
    """
    spec_python = build_deliver_spec(dataset)

    output = servicex.deliver(spec_python)

    # Plain truthiness test instead of comparing against True
    if array_out:
        all_arrays = {}
        for sample, path in output.items():
            # Only the first delivered file of each sample is read
            with uproot.open(path[0]) as f:
                structure_str = f["servicex"]["branch"].array()[0]
                all_arrays[sample] = str_to_array(structure_str)
        return all_arrays

    return print_structure_from_str(output, **kwargs)
0 commit comments