Skip to content

Commit 054e4a6

Browse files
committed
Removing decode-raw and implementing it in file_peeking
1 parent e1954db commit 054e4a6

File tree

2 files changed

+149
-156
lines changed

2 files changed

+149
-156
lines changed

servicex_analysis_utils/decode-raw-test.py

Lines changed: 0 additions & 119 deletions
This file was deleted.

servicex_analysis_utils/file_peeking.py

Lines changed: 149 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828

2929
import servicex
3030
import uproot
31+
import numpy as np
32+
import awkward as ak
3133

3234
def run_query(input_filenames=None):
3335
import uproot
@@ -86,9 +88,53 @@ def is_tree(obj):
8688
# Return str in an array
8789
return ak.Array([final_str])
8890

91+
def build_deliver_spec(dataset):
    """
    Helper to build the servicex.deliver configuration.
    Supports multiple inputs for multiple sample queries.

    Parameters:
        dataset (str, [str], or dict): Rucio DIDs to be checked by the servicex workers.
            A str becomes one sample named "Sample"; a list of str becomes samples
            named "Sample1", "Sample2", ...; a dict maps custom sample names to DIDs.

    Returns:
        spec_python (dict): The specification for the python function query containing Name, Query, Dataset, NFiles

    Raises:
        ValueError: If dataset is not a str, a non-empty list made only of str, or a dict.
    """
    # Normalise the user input to {sample_name: dataset_id} first, so that bad
    # input is rejected before any ServiceX object is created.
    if isinstance(dataset, str):
        dataset_dict = {"Sample": dataset}
    elif (
        isinstance(dataset, list)
        and dataset
        and all(isinstance(did, str) for did in dataset)
    ):
        # Sample names are numbered from 1 for humans
        dataset_dict = {
            f"Sample{index}": did for index, did in enumerate(dataset, start=1)
        }
    elif isinstance(dataset, dict):
        dataset_dict = dataset
    else:
        raise ValueError(
            f"Unsupported dataset input type: {type(dataset)}.\n"
            "Input must be dict ('sample_name':'dataset_id'), str or list of str"
        )

    # Servicex query using the PythonFunction backend
    query_PythonFunction = servicex.query.PythonFunction().with_uproot_function(run_query)

    # One sample entry per dataset; a single file per sample is requested
    sample_list = [
        {
            "NFiles": 1,
            "Name": name,
            "Dataset": servicex.dataset.Rucio(did),
            "Query": query_PythonFunction,
        }
        for name, did in dataset_dict.items()
    ]
    spec_python = {"Sample": sample_list}

    return spec_python
133+
89134
def print_structure_from_str(deliver_dict, filter_branch="", save_to_txt=False, do_print=False):
90135
"""
91-
Converts dataset file structures to a formatted string.
136+
Re-formats the deliver-retrieve str structure for readability with a filter for branch selection.
137+
The string can be printed, written out or returned
92138
93139
Parameters:
94140
deliver_dict (dict): ServiceX deliver output (keys: sample names, values: file paths or URLs).
@@ -146,50 +192,106 @@ def print_structure_from_str(deliver_dict, filter_branch="", save_to_txt=False,
146192
else:
147193
return result_str
148194

149-
def build_deliver_spec(dataset):
195+
def parse_jagged_depth_and_dtype(dtype_str):
    """
    Helper to decode the dtype str for each branch.

    Parses uproot-style interpretation strings such as:
        - "AsDtype('>i8')"
        - "AsJagged(AsJagged(AsDtype('>f4')))"
        - "AsJagged(AsDtype('>f4'), header_bytes=10)"

    Returns the number of nested layers and the inner dtype.
    Used in str_to_array to reconstruct the ak.array.

    Parameters:
        dtype_str (str): The dtype part of a branch info str; from the delivered file structure.

    Returns:
        int, str: jagged_depth, base_numpy_dtype_str or None if not recognized.
    """
    jagged_prefix = "AsJagged("
    dtype_prefix = "AsDtype('"

    depth = 0
    current = dtype_str.strip()

    # Count how many nested AsJagged(...) wrappers exist. Only the leading
    # prefix is peeled: whatever trails the innermost interpretation (closing
    # parentheses, extra wrapper arguments) does not affect the depth.
    while current.startswith(jagged_prefix):
        depth += 1
        current = current[len(jagged_prefix):].lstrip()

    # Extract the base dtype string: the first quoted argument of AsDtype('<np-format>')
    if current.startswith(dtype_prefix):
        base_dtype, closing_quote, _ = current[len(dtype_prefix):].partition("'")
        if closing_quote:
            return depth, base_dtype

    return depth, None
189225

190-
return spec_python
226+
def str_to_array(encoded_str):
    """
    Helper to reconstruct ak.Arrays from an encoded file-structure string.
    Returned array mimics TTrees, TBranches with correct field names and dtypes.

    Expected layout, one tree per line:
        "Tree: <name>; TBranch: <name> ; dtype: <interpretation>, TBranch: ..."

    Parameters:
        encoded_str (str): The encoded string from run_query.

    Returns:
        reconstructed_data (ak.Array): Contains trees and branches with typed dummy values.
            Branches whose interpretation or dtype is not recognized are set to None.
    """
    reconstructed_data = {}

    # Separate trees
    for tree_section in encoded_str.strip().split("\n"):
        tree_section = tree_section.strip()
        if not tree_section:
            continue  # skip empty lines

        # Tree and branches separated by the first ;
        tree_header, _, branches_str = tree_section.partition(";")

        # Extract tree name
        treename = tree_header.strip()[len("Tree: "):]
        branches = {}

        # Branches are separated by "," but an interpretation string may itself
        # contain a comma. A fragment that does not start a new "TBranch: "
        # entry is glued back onto the previous entry instead of being dropped.
        branch_infos = []
        for fragment in branches_str.strip().split(","):
            if branch_infos and not fragment.strip().startswith("TBranch: "):
                branch_infos[-1] += "," + fragment
            else:
                branch_infos.append(fragment)

        for branch in branch_infos:
            branch = branch.strip()

            if " ; dtype: " not in branch:
                continue  # not a line with branch info

            name_str, dtype_str = branch.split(" ; dtype: ", 1)
            # Extract name
            branch_name = name_str[len("TBranch: "):].strip()

            # Get nesting depth and base dtype from interpretation string
            depth, base_dtype_str = parse_jagged_depth_and_dtype(dtype_str.strip())
            if base_dtype_str is None:
                branches[branch_name] = None
                continue

            try:
                np_dtype = np.dtype(base_dtype_str)
            except TypeError:
                # Not a dtype numpy understands: keep the field, without a value
                branches[branch_name] = None
                continue

            dummy = np_dtype.type(0)  # Typed placeholder value

            # Simulate jagged structure by nesting the value in lists
            for _ in range(depth):
                dummy = [dummy]

            # Wrap dummy in a length-1 ak.Array
            branches[branch_name] = ak.Array([dummy])

        if branches:
            # Each tree becomes a record array with 1 entry (dict of branch arrays)
            reconstructed_data[treename] = ak.Array([branches])

    return ak.Array(reconstructed_data)
293+
294+
def get_structure(dataset, array_out=False, **kwargs):
193295
"""
194296
Utility function.
195297
Creates and sends the ServiceX request from user-input datasets to retrieve the file structure.
@@ -204,4 +306,14 @@ def get_structure(dataset, **kwargs):
204306

205307
output=servicex.deliver(spec_python)
206308

207-
return print_structure_from_str(output, **kwargs)
309+
if array_out==True:
310+
all_arrays={}
311+
for sample, path in output.items():
312+
with uproot.open(path[0]) as f:
313+
structure_str = f["servicex"]["branch"].array()[0]
314+
sample_array=str_to_array(structure_str)
315+
all_arrays[sample]=sample_array
316+
return all_arrays
317+
318+
else:
319+
return print_structure_from_str(output, **kwargs)

0 commit comments

Comments
 (0)