From f0373353f7f3f9bc0eb43f47a2f40dcefa209143 Mon Sep 17 00:00:00 2001 From: jayceslesar Date: Sat, 21 Jun 2025 14:17:17 -0400 Subject: [PATCH 1/3] perf: `table.add_files` and `inspect.files` --- pyiceberg/table/__init__.py | 2 +- pyiceberg/table/inspect.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index f82658a1d2..2eebd0e42a 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -847,7 +847,7 @@ def add_files( import pyarrow.compute as pc expr = pc.field("file_path").isin(file_paths) - referenced_files = [file["file_path"] for file in self._table.inspect.files().filter(expr).to_pylist()] + referenced_files = [file["file_path"] for file in self._table.inspect.data_files().filter(expr).to_pylist()] if referenced_files: raise ValueError(f"Cannot add files that are already referenced by table, files: {', '.join(referenced_files)}") diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py index cce5250ad5..85387e99c1 100644 --- a/pyiceberg/table/inspect.py +++ b/pyiceberg/table/inspect.py @@ -650,11 +650,12 @@ def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[S snapshot = self._get_snapshot(snapshot_id) io = self.tbl.io - files_table: list[pa.Table] = [] - for manifest_list in snapshot.manifests(io): - files_table.append(self._get_files_from_manifest(manifest_list, data_file_filter)) - return pa.concat_tables(files_table) + executor = ExecutorFactory.get_or_create() + results = list( + executor.map(lambda manifest: self._get_files_from_manifest(manifest, data_file_filter), snapshot.manifests(io)) + ) + return pa.concat_tables(results) def files(self, snapshot_id: Optional[int] = None) -> "pa.Table": return self._files(snapshot_id) From 1fff47b45b18953458f4913389149b68efbc270d Mon Sep 17 00:00:00 2001 From: Jayce Slesar <47452474+jayceslesar@users.noreply.github.com> Date: Sun, 22 Jun 2025 13:26:10 -0400 Subject: [PATCH 2/3] Update inspect.py Co-authored-by: Kevin Liu --- pyiceberg/table/inspect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py index 85387e99c1..71b1b8f507 100644 --- a/pyiceberg/table/inspect.py +++ b/pyiceberg/table/inspect.py @@ -653,7 +653,7 @@ def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[S executor = ExecutorFactory.get_or_create() results = list( - executor.map(lambda manifest: self._get_files_from_manifest(manifest, data_file_filter), snapshot.manifests(io)) + executor.map(lambda manifest_list: self._get_files_from_manifest(manifest_list, data_file_filter), snapshot.manifests(io)) ) return pa.concat_tables(results) From 6b6e5f78978c021e4ffafce378a7e346f27063db Mon Sep 17 00:00:00 2001 From: jayceslesar Date: Sun, 22 Jun 2025 13:43:47 -0400 Subject: [PATCH 3/3] lint --- pyiceberg/table/inspect.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py index 71b1b8f507..3bb0268a05 100644 --- a/pyiceberg/table/inspect.py +++ b/pyiceberg/table/inspect.py @@ -653,7 +653,9 @@ def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[S executor = ExecutorFactory.get_or_create() results = list( - executor.map(lambda manifest_list: self._get_files_from_manifest(manifest_list, data_file_filter), snapshot.manifests(io)) + executor.map( + lambda manifest_list: self._get_files_from_manifest(manifest_list, data_file_filter), snapshot.manifests(io) + ) ) return pa.concat_tables(results)