28
28
29
29
import servicex
30
30
import uproot
31
+ import numpy as np
32
+ import awkward as ak
31
33
32
34
def run_query (input_filenames = None ):
33
35
import uproot
@@ -86,9 +88,53 @@ def is_tree(obj):
86
88
# Return str in an array
87
89
return ak .Array ([final_str ])
88
90
91
def build_deliver_spec(dataset):
    """
    Helper to build the servicex.deliver configuration.
    Supports multiple inputs for multiple sample queries.

    Parameters:
        dataset (str, [str], or dict): Rucio DIDs to be checked by the servicex workers.
                                       A single str is named "Sample"; list entries are
                                       named "Sample1", "Sample2", ... in order.
                                       If dict, custom names can be given as
                                       {'sample_name': 'dataset_id'}.

    Returns:
        spec_python (dict): The specification for the python function query containing Name, Query, Dataset, NFiles

    Raises:
        ValueError: If dataset is not a str, a non-empty list of str, or a dict.
    """
    # Validate the input first so a bad argument fails before any ServiceX object is built.
    if isinstance(dataset, str):
        dataset_dict = {"Sample": dataset}
    elif (
        isinstance(dataset, list)
        and dataset  # an empty list carries no dataset to query
        and all(isinstance(did, str) for did in dataset)
    ):
        # Number samples from 1 so the names read naturally for humans.
        dataset_dict = {
            "Sample" + str(index): did for index, did in enumerate(dataset, start=1)
        }
    elif isinstance(dataset, dict):
        dataset_dict = dict(dataset)
    else:
        user_in = type(dataset)
        raise ValueError(f"Unsupported dataset input type: {user_in} .\n Input must be dict ('sample_name':'dataset_id'), str or list of str")

    # ServiceX query using the PythonFunction backend
    query_PythonFunction = servicex.query.PythonFunction().with_uproot_function(run_query)

    # One sample entry per (name, dataset ID); a single file is enough to read the structure.
    sample_list = [
        {
            "NFiles": 1,
            "Name": name,
            "Dataset": servicex.dataset.Rucio(did),
            "Query": query_PythonFunction,
        }
        for name, did in dataset_dict.items()
    ]
    spec_python = {"Sample": sample_list}

    return spec_python
133
+
89
134
def print_structure_from_str (deliver_dict , filter_branch = "" , save_to_txt = False , do_print = False ):
90
135
"""
91
- Converts dataset file structures to a formatted string.
136
+ Re-formats the deliver-retrieve str structure for readability with a filter for branch selection.
137
+ The string can be printed, written out or returned
92
138
93
139
Parameters:
94
140
deliver_dict (dict): ServiceX deliver output (keys: sample names, values: file paths or URLs).
@@ -146,50 +192,106 @@ def print_structure_from_str(deliver_dict, filter_branch="", save_to_txt=False,
146
192
else :
147
193
return result_str
148
194
149
- def build_deliver_spec ( dataset ):
195
def parse_jagged_depth_and_dtype(dtype_str):
    """
    Helper to decode the dtype str for each branch.

    Parses uproot-style interpretation strings such as:
        - "AsDtype('>i8')"
        - "AsJagged(AsJagged(AsDtype('>f4')))"
        - "AsJagged(AsDtype('>f4'), header_bytes=10)"

    Returns the number of nested layers and the inner dtype.
    Used in str_to_array to reconstruct the ak.array.

    Parameters:
        dtype_str (str): The dtype part of a branch info str; from the delivered file structure.

    Returns:
        int, str: jagged_depth, base_numpy_dtype_str or None if not recognized.
    """
    jagged_prefix = "AsJagged("
    dtype_prefix = "AsDtype('"

    depth = 0
    current = dtype_str.strip()

    # Count the nested AsJagged(...) wrappers by peeling prefixes only.
    # The tail is left untouched because a wrapper may carry extra arguments
    # after its content (e.g. ", header_bytes=10"), so it cannot be assumed
    # to be a bare ")".
    while current.startswith(jagged_prefix):
        depth += 1
        current = current[len(jagged_prefix):].strip()

    # The base dtype is the first quoted argument of AsDtype('<np-format>'...).
    if current.startswith(dtype_prefix):
        closing_quote = current.find("'", len(dtype_prefix))
        if closing_quote != -1:
            return depth, current[len(dtype_prefix):closing_quote]

    return depth, None
189
225
190
- return spec_python
226
def _split_branch_entries(branches_str):
    """Split the branch part of a tree line into one string per "TBranch: ..." entry."""
    entries = []
    for piece in branches_str.split(","):
        if not piece.strip():
            continue  # skip the empty piece left by a trailing separator
        if entries and not piece.lstrip().startswith("TBranch:"):
            # Interpretation strings may contain commas themselves
            # (e.g. "AsJagged(AsDtype('>f4'), header_bytes=10)"):
            # glue such a fragment back onto the entry it was cut from.
            entries[-1] += "," + piece
        else:
            entries.append(piece)
    return entries


def str_to_array(encoded_str):
    """
    Helper to reconstruct ak.Arrays from an encoded file-structure string.
    Returned array mimics TTrees, TBranches with correct field names and dtypes.

    Expected layout, one tree per line:
        "Tree: <tree name>; TBranch: <name> ; dtype: <interpretation>, TBranch: ..."

    Parameters:
        encoded_str (str): The encoded string from run_query.

    Returns:
        reconstructed_data (ak.Array): Contains trees and branches with typed dummy values.
                                       Branches whose dtype cannot be decoded are set to None;
                                       trees without any branch entry are left out.
    """
    reconstructed_data = {}

    # Separate trees: one per line. Each section is stripped below, so
    # padding around the newline separator does not matter.
    for tree_section in encoded_str.strip().split("\n"):
        tree_section = tree_section.strip()
        if not tree_section:
            continue  # skip empty lines

        # Tree header and branches are separated by the first ";"
        tree_header, _, branches_str = tree_section.partition(";")

        # Extract tree name
        treename = tree_header.strip()[len("Tree: "):]
        branches = {}

        for branch in _split_branch_entries(branches_str.strip()):
            branch = branch.strip()
            if " ; dtype: " not in branch:
                continue  # not a line with branch info

            name_str, dtype_str = branch.split(" ; dtype: ", 1)
            branch_name = name_str[len("TBranch: "):].strip()
            dtype_str = dtype_str.strip()

            # Get nesting depth and base dtype from interpretation string
            depth, base_dtype_str = parse_jagged_depth_and_dtype(dtype_str)
            if base_dtype_str is None:
                branches[branch_name] = None
                continue

            try:
                # NumPy reports malformed dtype strings as TypeError or ValueError
                np_dtype = np.dtype(base_dtype_str)
                dummy = np_dtype.type(0)  # Typed placeholder value
            except (TypeError, ValueError):
                branches[branch_name] = None
                continue

            # Simulate jagged structure by nesting the value in lists
            for _ in range(depth):
                dummy = [dummy]

            # Wrap dummy in a length-1 ak.Array
            branches[branch_name] = ak.Array([dummy])

        if branches:
            # Each tree becomes a record array with 1 entry (dict of branch arrays)
            reconstructed_data[treename] = ak.Array([branches])

    return ak.Array(reconstructed_data)
293
+
294
+ def get_structure (dataset , array_out = False , ** kwargs ):
193
295
"""
194
296
Utility function.
195
297
Creates and sends the ServiceX request from user inputed datasets to retrieve file stucture.
@@ -204,4 +306,14 @@ def get_structure(dataset, **kwargs):
204
306
205
307
output = servicex .deliver (spec_python )
206
308
207
- return print_structure_from_str (output , ** kwargs )
309
+ if array_out == True :
310
+ all_arrays = {}
311
+ for sample , path in output .items ():
312
+ with uproot .open (path [0 ]) as f :
313
+ structure_str = f ["servicex" ]["branch" ].array ()[0 ]
314
+ sample_array = str_to_array (structure_str )
315
+ all_arrays [sample ]= sample_array
316
+ return all_arrays
317
+
318
+ else :
319
+ return print_structure_from_str (output , ** kwargs )
0 commit comments