Skip to content

Commit e937f6a

Browse files
perf: optimize table.add_files and inspect.files (#2133)
Should help with #2130 and #2132 Modifies `Table.add_files` to explicitly use `inspect.data_files` and also parallelize `inspect._files` I didn't see anywhere else where looping over manifest entries was parallelized, so seems better to parallelize across manifests than within. No changes here but should be faster. --------- Co-authored-by: Kevin Liu <kevinjqliu@users.noreply.github.com>
1 parent 89e71c3 commit e937f6a

File tree

2 files changed

+8
-5
lines changed

2 files changed

+8
-5
lines changed

pyiceberg/table/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -847,7 +847,7 @@ def add_files(
847847
import pyarrow.compute as pc
848848

849849
expr = pc.field("file_path").isin(file_paths)
850-
referenced_files = [file["file_path"] for file in self._table.inspect.files().filter(expr).to_pylist()]
850+
referenced_files = [file["file_path"] for file in self._table.inspect.data_files().filter(expr).to_pylist()]
851851

852852
if referenced_files:
853853
raise ValueError(f"Cannot add files that are already referenced by table, files: {', '.join(referenced_files)}")

pyiceberg/table/inspect.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -650,11 +650,14 @@ def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[S
650650

651651
snapshot = self._get_snapshot(snapshot_id)
652652
io = self.tbl.io
653-
files_table: list[pa.Table] = []
654-
for manifest_list in snapshot.manifests(io):
655-
files_table.append(self._get_files_from_manifest(manifest_list, data_file_filter))
656653

657-
return pa.concat_tables(files_table)
654+
executor = ExecutorFactory.get_or_create()
655+
results = list(
656+
executor.map(
657+
lambda manifest_list: self._get_files_from_manifest(manifest_list, data_file_filter), snapshot.manifests(io)
658+
)
659+
)
660+
return pa.concat_tables(results)
658661

659662
def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
660663
return self._files(snapshot_id)

0 commit comments

Comments
 (0)