Skip to content

Commit ae69520

Browse files
committed
adding get_structure utility function and dependencies
1 parent afccc4b commit ae69520

File tree

3 files changed

+181
-0
lines changed

3 files changed

+181
-0
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ readme = "README.md"
1111
license = { text = "BSD-3-Clause" }
1212
requires-python = ">=3.9"
1313
dependencies = [
14+
"servicex",
1415
"uproot>=5.0",
1516
"awkward>=2.6",
1617
"dask-awkward>=2024.12.2",

servicex_analysis_utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2727
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828
from .materialization import to_awk
29+
from .file_peeking import get_structure
2930

3031
__version__ = "1.0.b1"
3132
# Public API of the package: every name imported above for external use
# is listed so that `from servicex_analysis_utils import *` exports it.
__all__ = ['to_awk', 'get_structure']
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
# Copyright (c) 2025, IRIS-HEP
2+
# All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# * Redistributions of source code must retain the above copyright notice, this
8+
# list of conditions and the following disclaimer.
9+
#
10+
# * Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# * Neither the name of the copyright holder nor the names of its
15+
# contributors may be used to endorse or promote products derived from
16+
# this software without specific prior written permission.
17+
#
18+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22+
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23+
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24+
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25+
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26+
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28+
29+
import servicex
30+
import uproot
31+
32+
def run_query(input_filenames=None):
    """
    Helper. Open a file and return one array containing a single string that
    describes the DataSet root file structure.
    Sent to ServiceX python transformers.

    The string will be formatted like:
    "Tree: TreeName1; TBranch: Branchname1 ; dtype: BranchType1, TBranch: Branchname2 ; dtype: BranchType2, ...
    Tree: TreeName2; TBranch: Branchname1 ; dtype: BranchType1, ..."
    (one line per tree, lines separated by a newline character)

    Parameters:
        input_filenames: Passed unchanged to uproot.open.

    Returns:
        ak.Array: One-element array holding the structure string.
    """
    # Imports are kept local to the function body; presumably so that the
    # function stays self-contained when it is shipped to the transformer.
    import uproot
    import awkward as ak

    def is_tree(obj):
        """
        Helper to check if a root file item is TTree.
        Different object types use .classname or .classnames
        """
        # Check for 'classname'
        if hasattr(obj, "classname"):
            cls_attr = obj.classname
            # Call if it's callable
            cls_value = cls_attr() if callable(cls_attr) else cls_attr
            return "TTree" in cls_value
        # Check for 'classnames'
        if hasattr(obj, "classnames"):
            cls_attr = obj.classnames
            cls_values = cls_attr() if callable(cls_attr) else cls_attr
            return any("TTree" in cls for cls in cls_values)
        return False

    def strip_cycle(key):
        """
        Remove a trailing ';<digits>' cycle suffix from a key, if present.

        str.rstrip(";1") must not be used here: it strips a *set of
        characters*, so a name such as 'tree1;1' would lose its final '1'.
        """
        name, sep, cycle = key.rpartition(";")
        return name if sep and cycle.isdigit() else key

    trees_info = []  # list of str info for each tree

    with uproot.open(input_filenames) as file:
        for tree_name in file.keys():
            tree = file[tree_name]

            # Only TTrees
            if not is_tree(tree):
                continue

            # Gather branch info using the uproot type interpretation
            branch_info_list = [
                f"TBranch: {branch_name} ; dtype: {branch.interpretation!s}"
                for branch_name, branch in tree.items()
            ]

            # Join branch info & separate by ,
            tree_info = f"Tree: {strip_cycle(tree_name)}; " + ", ".join(branch_info_list)
            trees_info.append(tree_info)

    # Join all trees & separate by \n
    final_str = "\n".join(trees_info)

    # Return str in an array
    return ak.Array([final_str])
88+
89+
90+
def print_structure_from_str(deliver_dict, filter_branch="", save_to_txt=False):
    """
    Helper. Takes the structure strings for all samples from servicex.deliver output
    and prints them in a friendly formatted view.

    The expected structure string format is:

    Tree: TreeName1; TBranch: Branchname1 ; dtype: BranchType1, TBranch: Branchname2 ; dtype: BranchType2, ...
    Tree: TreeName2; TBranch: Branchname1 ; dtype: BranchType1, ...

    Parameters:
        deliver_dict (dict): The return dictionary of servicex.deliver
                            (keys are sample names, values are file paths or URLs)
        filter_branch (str): Optional. Only branch entries whose text contains it
                            are shown (the match is made on the whole
                            "TBranch: name ; dtype: type" entry).
        save_to_txt (bool): Optional. If True the formatted structure is written to
                            "samples_structure.txt" in the current working directory
                            instead of being printed.
    """
    # Local imports: only this helper needs them.
    import contextlib
    import re

    # Entries are separated by ", " but a dtype description may itself contain
    # commas, so only split on a comma that is followed by the next "TBranch:".
    entry_separator = re.compile(r",\s*(?=TBranch:)")

    sink = (
        open("samples_structure.txt", "w", encoding="utf-8")
        if save_to_txt
        else contextlib.nullcontext()
    )
    # `out` is None when printing: print(file=None) falls back to sys.stdout.
    with sink as out:
        print(f"File structure of all samples with branch filter {filter_branch}:", file=out)

        for sample_name, path in deliver_dict.items():
            # Sample name with icon and bands
            print(
                f"\n---------------------------\n"
                f"\U0001F4C1 Sample: {sample_name}\n"
                f"---------------------------",
                file=out,
            )

            with uproot.open(path[0]) as root_file:
                # Expected position of structure_str from servicex.deliver
                structure_str = root_file["servicex"]["branch"].array()[0]

            # Each new line represents one tree.
            for line in structure_str.split("\n"):
                if not line.strip():
                    continue  # Skip empty lines

                # First part before ';' is the tree header.
                tree_header, sep, branch_info_str = line.partition(";")
                print(f"\n\U0001F333 {tree_header}", file=out)  # Tree header with icon

                # No ';' means there is no branch section for this tree
                if not sep:
                    continue

                print(" ├── Branches:", file=out)
                for entry in entry_separator.split(branch_info_str):
                    branch_line = entry.strip()
                    if filter_branch not in branch_line:
                        continue
                    # Only show entries that start with "TBranch:"
                    if branch_line.startswith("TBranch:"):
                        print(f" │ ├── {branch_line[8:]}", file=out)
145+
146+
147+
def get_structure(dataset_dict, **kwargs):
    """
    Utility function.
    Builds a ServiceX request with one sample per entry of the user input,
    delivers it and passes the result to print_structure_from_str().

    Parameters:
        dataset_dict (dict): Maps each sample name (used for readability) to the
                            dataset identifier given to servicex.dataset.Rucio.
                            TODO: add default sample names and the option to pass
                            just a did or a list of dids.
        kwargs : Arguments forwarded unchanged to print_structure_from_str
    """
    # A single PythonFunction-backend query running run_query, shared by all samples
    structure_query = servicex.query.PythonFunction().with_uproot_function(run_query)

    # One sample description per dataset; a single file per sample is requested
    samples = [
        {
            "NFiles": 1,
            "Name": sample_name,
            "Dataset": servicex.dataset.Rucio(dataset_id),
            "Query": structure_query,
        }
        for sample_name, dataset_id in dataset_dict.items()
    ]

    delivered = servicex.deliver({"Sample": samples})

    print_structure_from_str(delivered, **kwargs)
178+
179+

0 commit comments

Comments
 (0)