ssl-hep · ponyisi · May 28, 2025 · May 19, 2025 · May 19, 2025 · May 19, 2025
diff --git a/code_generator_raw_uproot/servicex/raw_uproot_code_generator/request_translator.py b/code_generator_raw_uproot/servicex/raw_uproot_code_generator/request_translator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, IRIS-HEP
+# Copyright (c) 2019-2025, IRIS-HEP
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -58,9 +58,8 @@ def run_query(file_path):
 
     rv_arrays_trees = {{}}; rv_arrays_histograms = {{}}
     for subquery in jquery:
-        a, b = run_single_query(file_path, subquery)
-        rv_arrays_trees.update(a); rv_arrays_histograms.update(b)
-    return rv_arrays_trees, rv_arrays_histograms
+        for obj in run_single_query(file_path, subquery):
+            yield obj
 
 def run_single_query(file_path, query):
     import uproot
@@ -141,26 +140,20 @@ def run_single_query(file_path, query):
                         raise
                     else:
                         continue
-                arr = None
+                arrfound = False
                 for subarr in t.iterate(language=lang, **sanitized_args):
-                    if arr is None:
-                        arr = subarr
-                    else:
-                        arr = ak.concatenate([arr, subarr])
-                if arr is not None and len(arr):  # iterate will not give anything if tree empty
-                    rv_arrays_trees[outtreename] = (arr, None)
-                else:  # recent uproot handles zero-length case properly for arrays()
+                    arrfound = True
+                    yield ('tree', outtreename, subarr)
+                if not arrfound:  # need this branch if the original tree has no entries
                     if 'cut' in sanitized_args:
                         sanitized_args.pop('cut')
                     arr = t.arrays(language=lang, entry_stop=0, **sanitized_args)
-                    rv_arrays_trees[outtreename] = (None, arr.layout)
+                    yield ('tree', outtreename, arr)
         else:
             histograms = query['copy_histograms']
             keys = fl.keys(filter_name=histograms, cycle=False)
             for key in keys:
-                rv_arrays_histograms[key] = fl[key]
-
-    return rv_arrays_trees, rv_arrays_histograms
+                yield ('obj', key, fl[key])
 '''
 
         _hash = hashlib.md5(generated_code.encode(), usedforsecurity=False).hexdigest()

diff --git a/code_generator_raw_uproot/servicex/templates/transform_single_file.py b/code_generator_raw_uproot/servicex/templates/transform_single_file.py
@@ -1,14 +1,80 @@
+# Copyright (c) 2019-2025, IRIS-HEP
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import os
 import sys
 import time
 from pathlib import Path
-import generated_transformer
+from generated_transformer import run_query  # noqa
 import awkward as ak
 import pyarrow.parquet as pq
-import pyarrow
+import functools
 instance = os.environ.get('INSTANCE_NAME', 'Unknown')
 
 
+def get_generator_timing(f):
+    """Wrap generator function f so it yields (seconds taken to produce item, item)."""
+    from time import perf_counter
+
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        started = perf_counter()
+        for item in f(*args, **kwargs):
+            yield perf_counter() - started, item
+            started = perf_counter()  # restart so time spent by the consumer is not counted
+    return wrapper
+
+
+def get_direct_timing(f):
+    """Wrap f so that calling it returns (elapsed seconds, f's return value)."""
+    from time import perf_counter
+
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        started = perf_counter()
+        result = f(*args, **kwargs)
+        return perf_counter() - started, result
+    return wrapper
+
+
+def root_write_table_data(output_format, writer, outtreename, data):
+    """Create outtreename in writer from data, or extend it if already present."""
+    as_ttree = output_format == 'root-file'  # any other format is handled as RNTuple
+    exists = outtreename in writer
+    payload = {name: data[name] for name in data.fields} if as_ttree else data
+    if exists:
+        writer[outtreename].extend(payload)
+    elif as_ttree:
+        writer[outtreename] = payload
+    else:
+        writer.mkrntuple(outtreename, data)
+
+
 def transform_single_file(file_path: str, output_path: Path, output_format: str):
     """
     Transform a single file and return some information about output
@@ -18,73 +84,56 @@ def transform_single_file(file_path: str, output_path: Path, output_format: str)
     """
     try:
         stime = time.time()
-
-        awkward_array_dict, histograms = generated_transformer.run_query(file_path)
-        total_events = sum((ak.num(awkward_array[0], axis=0)
-                            for awkward_array in awkward_array_dict.values()
-                            if awkward_array[0] is not None), 0)
-
-        ttime = time.time()
+        total_events = 0
+        ttimedt = 0
+        etimedt = 0
 
         if output_format in ('root-file', 'root-rntuple'):
             import uproot
-            etime = time.time()
             # opening the file with open() is a workaround for a bug handling multiple colons
-            # in the filename in uproot 5.3.9
+            # in the filename in uproot
             with open(output_path, 'b+w') as wfile:
                 with uproot.recreate(wfile, compression=uproot.ZSTD(5)) as writer:
-                    for k, v in awkward_array_dict.items():
-                        if output_format == 'root-file':
-                            if v[0] is not None:
-                                writer[k] = {field: v[0][field] for field in
-                                             v[0].fields}
-                            else:
-                                writer.mktree(k, dict(zip(v[1].form.columns(),
-                                                          v[1].form.column_types())))
-                        else:  # RNTuple
-                            if v[0] is not None:
-                                # Work around a limitation in uproot 5.6.0
-                                # If a cut is specified, we'll get ListArrays which can't be
-                                # written via uproot. Convert them to ListOffsetArrays
-                                # Assume the ListArrays are only at top level
-                                warr = ak.zip({_: v[0][_].layout.to_ListOffsetArray64()
-                                               if isinstance(v[0][_].layout, ak.contents.ListArray)
-                                               else v[0][_]
-                                               for _ in v[0].fields}, depth_limit=1)
-                                writer.mkrntuple(k, warr)
-                            else:
-                                writer.mkrntuple(k, v[1].form)
-                    for k, v in histograms.items():
-                        writer[k] = v
+                    for dt, item in get_generator_timing(run_query)(file_path):
+                        ttimedt += dt
+                        match item:
+                            case ('tree', k, v):
+                                total_events += ak.num(v, axis=0)
+                                root_write_table_data(output_format, writer, k, v)
+                            case ('obj', k, v):
+                                writer[k] = v
             wtime = time.time()
 
-        else:
-            if histograms:
-                raise RuntimeError("Cannot store histograms in a non-ROOT return file format")
-            for treename, subarray in awkward_array_dict.items():
-                subarray['treename'] = treename
-            awkward_array = awkward_array_dict.popitem()[1]
-            for treename, subarray in awkward_array_dict.items():
-                awkward_array = ak.concatenate([awkward_array, subarray])
-
-            arrow = ak.to_arrow_table(awkward_array)
-
-            etime = time.time()
-
-            try:
-                writer = pq.ParquetWriter(output_path, arrow.schema)
-            except pyarrow.lib.ArrowNotImplementedError:
-                raise RuntimeError("Unable to translate output tables to parquet "
-                                   "(probably different queries give different branches?)")
-            writer.write_table(table=arrow)
-            writer.close()
-
+        else:  # parquet
+            awkward_array = None
+            writer = None
+            for dt, item in get_generator_timing(run_query)(file_path):
+                ttimedt += dt
+                match item:
+                    case ('tree', k, awkward_array):
+                        total_events += ak.num(awkward_array, axis=0)
+                        awkward_array['treename'] = k
+                        dt2, arrow = get_direct_timing(ak.to_arrow_table)(awkward_array)
+                        etimedt += dt2
+                        if not writer:
+                            writer = pq.ParquetWriter(output_path, arrow.schema)
+                        try:
+                            writer.write_table(table=arrow)
+                        except ValueError as e:
+                            raise RuntimeError("Unable to translate output tables to parquet "
+                                               "(probably different queries give different "
+                                               f"branches?)\n{e}")
+                    case ('obj', k, v):
+                        raise RuntimeError("Cannot store histograms in a non-ROOT "
+                                           "return file format")
+            if writer:
+                writer.close()
             wtime = time.time()
 
         output_size = os.stat(output_path).st_size
-        print(f'Detailed transformer times. query_time:{round(ttime - stime, 3)} '
-              f'serialization: {round(etime - ttime, 3)} '
-              f'writing: {round(wtime - etime, 3)}')
+        print(f'Detailed transformer times. query_time:{round(ttimedt, 3)} '
+              f'serialization: {round(etimedt, 3)} '
+              f'writing: {round((wtime - stime) - etimedt - ttimedt, 3)}')
 
         print(f"Transform stats: Total Events: {total_events}, resulting file size {output_size}")
     except Exception as error:

diff --git a/code_generator_raw_uproot/tests/test_src.py b/code_generator_raw_uproot/tests/test_src.py
@@ -50,7 +50,7 @@ def test_generate_code():
                              'filter_name': ['lbn']},
                             {'copy_histograms': 'CutBookkeeper*'}
                             ])
-        expected_hash = "5c3235898f268e81080455c92b7c914e"
+        expected_hash = "e95bbb95ff7556f2ffcc8a8c8f09919c"
         result = translator.generate_code(query, tmpdirname)
 
         # is the generated code at least syntactically valid Python?