Original package commit on personal repo

ArturU043 · ArturU043 · commit a2245616a6ba · 2025-02-05T18:52:19.000+01:00
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,28 @@
+BSD 3-Clause License
+
+Copyright (c) 2025, IRIS-HEP 
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
@@ -0,0 +1,10 @@
+# ServiceX analysis utils
+This repository provides analysis tools to be used with the [ServiceX Client](https://github.com/ssl-hep/ServiceX_frontend/tree/master).
+
+### To install 
+```bash
+pip install servicex-analysis-utils
+```
+
+## Documentation
+The different functions are documented in [ServiceX Documentation](https://servicex-frontend.readthedocs.io)
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,39 @@
+[build-system]
+requires = ["hatchling"] 
+build-backend = "hatchling.build"
+
+[project]
+name = "servicex_analysis_utils"
+version = "1.0.b1"
+description = "A package with analysis tools for ServiceX."
+authors = [{name = "Artur Cordeiro Oudot Choi", email = "acordeir@cern.ch"}]
+readme = "README.md"
+license = { text = "BSD-3-Clause" }
+requires-python = ">=3.9"
+dependencies = [ 
+    "uproot>=5.0",
+    "awkward>=2.6",
+    "dask-awkward>=2024.12.2",
+]
+
+[project.urls]
+"Source Code" = "https://github.com/ArturU043/ServiceX_analysis_utils"
+"Documentation" = "https://servicex.readthedocs.io/"
+"Issue Tracker" = "https://github.com/ArturU043/ServiceX_analysis_utils/issues"
+
+[tool.hatch.build.targets.sdist]
+include = [
+    "/servicex_analysis_utils",
+    "/tests/"
+]
+
+[project.optional-dependencies]
+
+# Developer extras
+test = [
+    "pytest>=7.2.0",
+    "numpy>=1.21", 
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["servicex_analysis_utils"]
diff --git a/servicex_analysis_utils/__init__.py b/servicex_analysis_utils/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2025, IRIS-HEP
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+from .materialization import to_awk 
+
+__version__ = "1.0.b1"
+__all__ = ['to_awk']
+#__version__ = importlib.metadata.version("servicex") 
diff --git a/servicex_analysis_utils/materialization.py b/servicex_analysis_utils/materialization.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2025, IRIS-HEP
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import uproot
+import awkward as ak
+import dask_awkward as dak 
+import logging 
+
+def to_awk(deliver_dict, dask=False, **uproot_kwargs):
+    """
+    Load an awkward array from the deliver() output with uproot or uproot.dask.
+
+    Parameters:
+        deliver_dict (dict): Returned dictionary from servicex.deliver()
+                            (keys are sample names, values are file paths or URLs).
+        dask (bool):        Optional. Flag to load as dask-awkward array. Default is False
+        **uproot_kwargs :   Optional. Additional keyword arguments passed to uproot.dask or uproot.iterate
+
+    
+    Returns:
+        dict: keys are sample names and values are awkward arrays or dask-awkward arrays.
+    """
+  
+    awk_arrays = {}
+
+    for sample, paths in deliver_dict.items():
+        try:
+            if dask:
+                # Use uproot.dask to handle URLs and local paths lazily 
+                awk_arrays[sample] = uproot.dask(paths, library="ak", **uproot_kwargs)
+            else:
+                # Use uproot.iterate to handle URLs and local paths files in chunks
+                tmp_arrays = list(uproot.iterate(paths, library="ak", **uproot_kwargs))
+                # Merge arrays
+                awk_arrays[sample] = ak.concatenate(tmp_arrays) 
+
+        except Exception as e:
+            # Log the exception pointing at the user's code 
+            msg=f"\nError loading sample: {sample}"
+            logging.error(msg, exc_info=True, stacklevel=2)
+            # Mark the sample as failed
+            awk_arrays[sample] = None
+
+    return awk_arrays
diff --git a/tests/test_materialization.py b/tests/test_materialization.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2025, IRIS-HEP
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+import uproot 
+import awkward as ak
+import dask_awkward as dak 
+import logging 
+import os
+import sys
+import numpy as np
+
+# Put the parent directory of this tests folder on sys.path so the package imports without being installed
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from servicex_analysis_utils.materialization import to_awk
+
+def build_test_samples():
+    # example data for two branches
+    tree_data1 = {
+    "branch1": np.ones(100),
+    "branch2": np.zeros(100)
+    }
+    # example data for one branch
+    tree_data2 = {"branch1": np.ones(10)}  
+
+    # Create tmp .root files
+    with uproot.create(test_path1) as file:
+        file["Tree"] = tree_data1
+    
+    with uproot.create(test_path2) as file:
+        file["Tree"] = tree_data2
+
+#Initial test configuration
+@pytest.fixture(scope="function", autouse=True)
+def init(tmp_path):
+    #Setting global variables to be used in the tests and helper function
+    global test_path1, test_path2, \
+           result, result_da, result_filtered 
+
+    test_path1 = tmp_path / "test_file1.root"
+    test_path2 = tmp_path / "test_file2.root"
+
+    #Building dumy test files
+    if not os.path.exists(test_path1) or not os.path.exists(test_path2):
+        build_test_samples()
+
+    #Dict like servicex.deliver() output
+    sx_dict = {"Test-Sample1": test_path1, "Test-Sample2": test_path2}
+
+    #Executing to_awk() and saving results for tests
+    result = to_awk(sx_dict)
+    result_da = to_awk(sx_dict, dask=True, step_size=10) #uproot.dask step_size kwarg
+    result_filtered = to_awk(sx_dict, expressions="branch1") #uproot.iterate expressions kwarg
+
+#Test functions
+def test_to_awk_instances():
+    arr1=result["Test-Sample1"]
+    da_arr1=result_da["Test-Sample1"]
+
+    #Testing returned types
+    assert isinstance(arr1, ak.Array), "to_awk() does not produce an awkward.Array instance"
+    assert isinstance(da_arr1, dak.Array), "to_awk(dask=True) does not produce a dask_awkward.Array instance"
+
+def test_to_awk_collection():
+    arr1=result["Test-Sample1"]
+    arr2=result["Test-Sample2"]
+
+    #Collecting all samples 
+    assert list(result.keys())==["Test-Sample1", "Test-Sample2"]
+
+    #Collecting all branches
+    assert ak.fields(arr1) == ['branch1', 'branch2']
+    assert ak.fields(arr2) == ['branch1']
+
+    #Collecting all elements per branch
+    assert ak.all(arr1['branch2'] == ak.from_numpy(np.zeros(100)))
+    assert ak.all(arr2['branch1'] == ak.from_numpy(np.ones(10)))
+
+def test_to_awk_dask():
+    arr1=result_da["Test-Sample1"]
+    arr2=result_da["Test-Sample2"]
+
+    #Testing if dask.compute() leads to same results
+    assert ak.almost_equal(arr1.compute(), result["Test-Sample1"])
+    assert ak.almost_equal(arr2.compute(), result["Test-Sample2"])
+
+    #Testing partitionning kwarg
+    assert arr1.npartitions == 10
+    assert arr2.npartitions == 1
+
+def test_to_awk_filter():
+    arr1=result_filtered["Test-Sample1"]
+    arr2=result_filtered["Test-Sample2"]
+
+    #Testing if filtering kwargs are passed to uproot.iterate()
+    assert ak.fields(arr1) == ['branch1'] #branch2 should be filtered out
+    assert ak.fields(arr2) == ['branch1'] 
+
+
+
+
+
+    
+