Skip to content

Commit e0edfd7

Browse files
committed
updates for lgdo 1.12
1 parent a4658ec commit e0edfd7

File tree

9 files changed

+49
-61
lines changed

9 files changed

+49
-61
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ dependencies = [
3636
"h5py>=3.2",
3737
"iminuit",
3838
"legend-daq2lh5>=1.2.1",
39-
"legend-pydataobj>=1.7,<1.12",
39+
"legend-pydataobj>=1.12.0a1",
4040
"pylegendmeta>=0.9",
4141
"matplotlib",
4242
"numba!=0.53.*,!=0.54.*,!=0.57",

src/pygama/evt/build_evt.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ def build_evt_cols(
268268
if Path(datainfo.evt.file).exists():
269269
Path(datainfo.evt.file).unlink()
270270

271-
for tcm_lh5, _, n_rows in lh5.LH5Iterator(
271+
for tcm_lh5 in lh5.LH5Iterator(
272272
datainfo.tcm.file,
273273
datainfo.tcm.group,
274274
buffer_len=buffer_len,
@@ -281,7 +281,7 @@ def build_evt_cols(
281281
)
282282

283283
# get number of events in file (ask the TCM)
284-
table = Table(size=n_rows)
284+
table = Table(size=len(tcm_lh5))
285285

286286
# now loop over operations (columns in evt table)
287287
for field, v in config["operations"].items():
@@ -335,7 +335,7 @@ def build_evt_cols(
335335
channels_skip=channels_skip,
336336
mode=v["aggregation_mode"],
337337
expr=v["expression"],
338-
n_rows=n_rows,
338+
n_rows=len(tcm_lh5),
339339
table=table,
340340
parameters=v.get("parameters", None),
341341
query=v.get("query", None),
@@ -366,7 +366,7 @@ def build_evt_cols(
366366
table.add_field(field, obj)
367367

368368
# might need to re-organize fields in subtables, create a new object for that
369-
nested_tbl = Table(size=n_rows)
369+
nested_tbl = Table(size=len(tcm_lh5))
370370
output_fields = config.get("outputs", table.keys())
371371

372372
for field, obj in table.items():
@@ -389,7 +389,7 @@ def build_evt_cols(
389389

390390
# otherwise, increase nesting
391391
if level not in lvl_ptr:
392-
lvl_ptr.add_field(level, Table(size=n_rows))
392+
lvl_ptr.add_field(level, Table(size=len(tcm_lh5)))
393393
lvl_ptr = lvl_ptr[level]
394394

395395
# write output fields into outfile

src/pygama/evt/tcm.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@ def generate_tcm_cols(
100100
for _ii, it in enumerate(iterators[curr_mask]):
101101
ii = np.where(curr_mask)[0][_ii]
102102
try:
103-
buffer, start, buf_len = it.__next__()
103+
buffer = it.__next__()
104+
buf_len = len(buffer)
105+
start = it.current_i_entry
104106
except StopIteration:
105107
at_end[ii] = True
106108
continue

src/pygama/flow/data_loader.py

Lines changed: 16 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@
1111
from keyword import iskeyword
1212
from typing import Iterator
1313

14+
import lgdo.lh5 as lh5
1415
import numpy as np
1516
import pandas as pd
1617
from awkward_pandas import AwkwardDtype
1718
from dspeed.vis import WaveformBrowser
18-
from lgdo.lh5 import LH5Iterator, LH5Store
19+
from lgdo.lh5 import LH5Iterator
1920
from lgdo.lh5.utils import expand_vars
2021
from lgdo.types import Array, Struct, Table
2122
from lgdo.types.vovutils import build_cl, explode_arrays
@@ -537,8 +538,6 @@ def build_entry_list(
537538
if save_output_columns:
538539
entry_cols += for_output
539540

540-
sto = LH5Store()
541-
542541
if log.getEffectiveLevel() >= logging.INFO:
543542
progress_bar = tqdm(
544543
desc="Building entry list",
@@ -585,7 +584,7 @@ def build_entry_list(
585584

586585
tcm_table_name = self.filedb.get_table_name(tcm_tier, tcm_tb)
587586
try:
588-
tcm_lgdo, _ = sto.read(tcm_table_name, tcm_path)
587+
tcm_lgdo = lh5.read(tcm_table_name, tcm_path)
589588
except KeyError:
590589
log.warning(f"Cannot find table {tcm_table_name} in file {tcm_path}")
591590
continue
@@ -651,7 +650,7 @@ def build_entry_list(
651650
if tb in col_tiers[file]["tables"][tier]:
652651
table_name = self.filedb.get_table_name(tier, tb)
653652
try:
654-
tier_table, _ = sto.read(
653+
tier_table = lh5.read(
655654
table_name,
656655
tier_path,
657656
field_mask=cut_cols[level],
@@ -710,9 +709,9 @@ def build_entry_list(
710709
f_dict = f_entries.to_dict("list")
711710
f_struct = Struct(f_dict)
712711
if self.merge_files:
713-
sto.write(f_struct, "entries", output_file, wo_mode="a")
712+
lh5.write(f_struct, "entries", output_file, wo_mode="a")
714713
else:
715-
sto.write(f_struct, f"entries/{file}", output_file, wo_mode="a")
714+
lh5.write(f_struct, f"entries/{file}", output_file, wo_mode="a")
716715

717716
if log.getEffectiveLevel() >= logging.INFO:
718717
progress_bar.close()
@@ -782,8 +781,6 @@ def build_hit_entries(
782781
log.debug(f"need to load {cut_cols} columns for applying cuts")
783782
col_tiers = self.get_tiers_for_col(cut_cols, merge_files=False)
784783

785-
sto = LH5Store()
786-
787784
if log.getEffectiveLevel() >= logging.INFO:
788785
progress_bar = tqdm(
789786
desc="Building entry list",
@@ -832,7 +829,7 @@ def build_hit_entries(
832829
# now read how many rows are there in the file
833830
table_name = self.filedb.get_table_name(tier, tb)
834831
try:
835-
n_rows = sto.read_n_rows(table_name, tier_path)
832+
n_rows = lh5.read_n_rows(table_name, tier_path)
836833
except KeyError:
837834
log.warning(f"Cannot find {table_name} in file {tier_path}")
838835
continue
@@ -862,7 +859,7 @@ def build_hit_entries(
862859
# load the data from the tier file, just the columns needed for the cut
863860
table_name = self.filedb.get_table_name(tier, tb)
864861
try:
865-
tier_tb, _ = sto.read(
862+
tier_tb = lh5.read(
866863
table_name, tier_path, field_mask=cut_cols
867864
)
868865
except KeyError:
@@ -902,9 +899,9 @@ def build_hit_entries(
902899
f_dict = f_entries.to_dict("list")
903900
f_struct = Struct(f_dict)
904901
if self.merge_files:
905-
sto.write(f_struct, "entries", output_file, wo_mode="a")
902+
lh5.write(f_struct, "entries", output_file, wo_mode="a")
906903
else:
907-
sto.write(f_struct, f"entries/{file}", output_file, wo_mode="a")
904+
lh5.write(f_struct, f"entries/{file}", output_file, wo_mode="a")
908905

909906
if log.getEffectiveLevel() >= logging.INFO:
910907
progress_bar.close()
@@ -1063,8 +1060,6 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table):
10631060
tier_table.update(zip(tier_table.keys(), exp_cols))
10641061
return tier_table
10651062

1066-
sto = LH5Store()
1067-
10681063
if self.merge_files:
10691064
tables = entry_list[f"{parent}_table"].unique()
10701065
field_mask = []
@@ -1115,7 +1110,7 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table):
11151110
for file in files
11161111
]
11171112

1118-
tier_table, _ = sto.read(
1113+
tier_table = lh5.read(
11191114
name=tb_name,
11201115
lh5_file=tier_paths,
11211116
idx=idx_mask,
@@ -1141,7 +1136,7 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table):
11411136
f_table = utils.dict_to_table(col_dict=col_dict, attr_dict=attr_dict)
11421137

11431138
if output_file:
1144-
sto.write(f_table, "merged_data", output_file, wo_mode="o")
1139+
lh5.write(f_table, "merged_data", output_file, wo_mode="o")
11451140
if in_memory:
11461141
if self.output_format == "lgdo.Table":
11471142
return f_table
@@ -1218,7 +1213,7 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table):
12181213
raise FileNotFoundError(tier_path)
12191214

12201215
table_name = self.filedb.get_table_name(tier, tb)
1221-
tier_table, _ = sto.read(
1216+
tier_table = lh5.read(
12221217
table_name,
12231218
tier_path,
12241219
idx=idx_mask,
@@ -1244,7 +1239,7 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table):
12441239
if in_memory:
12451240
load_out.add_field(name=file, obj=f_table)
12461241
if output_file:
1247-
sto.write(f_table, f"{file}", output_file, wo_mode="o")
1242+
lh5.write(f_table, f"{file}", output_file, wo_mode="o")
12481243
# end file loop
12491244

12501245
if log.getEffectiveLevel() >= logging.INFO:
@@ -1278,8 +1273,6 @@ def load_evts(
12781273
child = self.tcms[tcm_level]["child"]
12791274
load_levels = [parent, child]
12801275

1281-
sto = LH5Store()
1282-
12831276
if self.merge_files: # Try to load all information at once
12841277
raise NotImplementedError
12851278
else: # Not merge_files
@@ -1316,7 +1309,7 @@ def load_evts(
13161309
)
13171310
if os.path.exists(tier_path):
13181311
table_name = self.filedb.get_table_name(tier, tb)
1319-
tier_table, _ = sto.read(
1312+
tier_table = lh5.read(
13201313
table_name,
13211314
tier_path,
13221315
idx=idx_mask,
@@ -1330,7 +1323,7 @@ def load_evts(
13301323
if in_memory:
13311324
load_out[file] = f_table
13321325
if output_file:
1333-
sto.write(f_table, f"file{file}", output_file, wo_mode="o")
1326+
lh5.write(f_table, f"file{file}", output_file, wo_mode="o")
13341327
# end file loop
13351328

13361329
if in_memory:

src/pygama/flow/file_db.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
import warnings
1111

1212
import h5py
13+
import lgdo.lh5 as lh5
1314
import numpy as np
1415
import pandas as pd
1516
from lgdo.lh5 import ls
16-
from lgdo.lh5.store import LH5Store
1717
from lgdo.lh5.utils import expand_path, expand_vars
1818
from lgdo.types import Array, Scalar, VectorOfVectors
1919
from parse import parse
@@ -480,8 +480,7 @@ def update_tables_cols(row, tier: str, utc_cache: dict = None) -> pd.Series:
480480
columns_vov = VectorOfVectors(
481481
flattened_data=flattened, cumulative_length=length
482482
)
483-
sto = LH5Store()
484-
sto.write(columns_vov, "unique_columns", to_file)
483+
lh5.write(columns_vov, "unique_columns", to_file)
485484

486485
return self.columns
487486

@@ -509,7 +508,6 @@ def from_disk(self, path: str | list[str]) -> None:
509508
if not paths:
510509
raise FileNotFoundError(path)
511510

512-
sto = LH5Store()
513511
# objects/accumulators that will be used to configure the FileDB at the end
514512
_cfg = None
515513
_df = None
@@ -531,7 +529,7 @@ def _replace_idx(row, trans, tier):
531529

532530
# loop over the files
533531
for p in paths:
534-
cfg, _ = sto.read("config", p)
532+
cfg = lh5.read("config", p)
535533
cfg = json.loads(cfg.value.decode())
536534

537535
# make sure configurations are all the same
@@ -543,7 +541,7 @@ def _replace_idx(row, trans, tier):
543541
)
544542

545543
# read in unique columns
546-
vov, _ = sto.read("columns", p)
544+
vov = lh5.read("columns", p)
547545
# Convert back from VoV of UTF-8 bytestrings to a list of lists of strings
548546
columns = [[v.decode("utf-8") for v in ov] for ov in list(vov)]
549547

@@ -606,8 +604,7 @@ def to_disk(self, filename: str, wo_mode="write_safe") -> None:
606604
"""
607605
log.debug(f"writing database to {filename}")
608606

609-
sto = LH5Store()
610-
sto.write(Scalar(json.dumps(self.config)), "config", filename, wo_mode=wo_mode)
607+
lh5.write(Scalar(json.dumps(self.config)), "config", filename, wo_mode=wo_mode)
611608

612609
if wo_mode in ["write_safe", "w", "overwrite_file", "of"]:
613610
wo_mode = "a"
@@ -624,7 +621,7 @@ def to_disk(self, filename: str, wo_mode="write_safe") -> None:
624621
flattened_data=Array(nda=np.array(flat).astype("S")),
625622
cumulative_length=Array(nda=np.array(cum_l)),
626623
)
627-
sto.write(col_vov, "columns", filename, wo_mode=wo_mode)
624+
lh5.write(col_vov, "columns", filename, wo_mode=wo_mode)
628625

629626
# FIXME: to_hdf() throws this:
630627
#

src/pygama/hit/build_hit.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010
from typing import Iterable, Mapping
1111

1212
import lgdo
13+
import lgdo.lh5 as lh5
1314
import numpy as np
14-
from lgdo.lh5 import LH5Iterator, LH5Store, ls
15+
from lgdo.lh5 import LH5Iterator, ls
1516

1617
from .. import utils
1718

@@ -75,13 +76,12 @@ def build_hit(
7576
n_max
7677
maximum number of rows to process
7778
wo_mode
78-
forwarded to :meth:`lgdo.lh5.store.LH5Store.write`.
79+
forwarded to :meth:`lgdo.lh5.write`.
7980
8081
See Also
8182
--------
8283
lgdo.types.table.Table.eval
8384
"""
84-
store = LH5Store()
8585

8686
if lh5_tables_config is None and hit_config is None:
8787
raise ValueError("either lh5_tables_config or hit_config must be specified")
@@ -134,13 +134,12 @@ def build_hit(
134134
first_done = False
135135
for tbl, cfg in lh5_tables_config.items():
136136
lh5_it = LH5Iterator(infile, tbl, buffer_len=buffer_len)
137-
tot_n_rows = store.read_n_rows(tbl, infile)
138137
write_offset = 0
139138

140139
log.info(f"Processing table '{tbl}' in file {infile}")
141140

142-
for tbl_obj, start_row, n_rows in lh5_it:
143-
n_rows = min(tot_n_rows - start_row, n_rows)
141+
for tbl_obj in lh5_it:
142+
start_row = lh5_it.current_i_entry
144143

145144
# create a new table object that links all the columns in the
146145
# current table (i.e. no copy)
@@ -193,11 +192,11 @@ def build_hit(
193192
if col not in cfg["outputs"]:
194193
outtbl_obj.remove_column(col, delete=True)
195194

196-
store.write(
195+
lh5.write(
197196
obj=outtbl_obj,
198197
name=tbl.replace("/dsp", "/hit"),
199198
lh5_file=outfile,
200-
n_rows=n_rows,
199+
n_rows=len(tbl_obj),
201200
wo_mode=wo_mode if first_done is False else "append",
202201
write_start=write_offset + start_row,
203202
)

src/pygama/pargen/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,14 @@ def load_data(
123123
)
124124
df = pd.DataFrame(columns=list(df_fields))
125125

126-
for table, entry, n_rows in lh5_it:
126+
for table in lh5_it:
127127
# Evaluate all provided expressions and add to table
128128
for outname, info in cal_dict.items():
129129
table[outname] = table.eval(
130130
info["expression"], info.get("parameters", None)
131131
)
132+
entry = lh5_it.current_global_entries[0]
133+
n_rows = len(table)
132134

133135
# Copy params in table into dataframe
134136
for par in df:

0 commit comments

Comments
 (0)