
Commit 2a5fb97

Fokko authored and amitgilad3 committed
Prefer FileIO over the PyArrow FileSystem (apache#2115)
# Rationale for this change

Opening streams directly through the PyArrow `FileSystem` is problematic if you try to implement your own `FileIO`: streams then get opened both through the `FileIO` and through the `FileSystem` directly.

# Are these changes tested?

Yes, existing tests.

# Are there any user-facing changes?

No, but I think removing this complexity also makes the code aesthetically more pleasing.

# Numbers

A while ago I did some inspection of the calls being made to S3, so just to be sure that we don't alter anything, I've collected some stats using a small "benchmark" locally:

```python
import time

from pyarrow import parquet as pq
from pyiceberg.catalog.rest import RestCatalog


def test_fokko(session_catalog: RestCatalog):
    parquet_file = "/Users/fokko.driesprong/Downloads/yellow_tripdata_2024-01.parquet"
    df = pq.read_table(parquet_file)

    try:
        session_catalog.drop_table("default.taxi")
    except Exception:
        pass

    tbl = session_catalog.create_table("default.taxi", schema=df.schema)

    with tbl.update_spec() as tx:
        tx.add_field("tpep_pickup_datetime", "hour")

    tbl.append(df)

    rounds = []
    for _ in range(22):
        start = round(time.time() * 1000)
        assert len(tbl.scan().to_arrow()) == 2964624
        stop = round(time.time() * 1000)
        rounds.append(stop - start)

    print(f"Took: {sum(rounds) / len(rounds)} ms on average")
```

Main: Took: 1715.1818181818182 ms on average

```
> mc admin trace --stats minio
Call                        Count       RPM    Avg Time  Min Time  Max Time  Avg TTFB  Max TTFB  Avg Size     Rate /min    Errors
s3.GetObject                77 (29.2%)  697.9  701µs     153µs     1.6ms     463µs     838µs     ↑159B ↓712K  ↑108K ↓485M  0
s3.HeadObject               73 (27.7%)  661.6  192µs     107µs     735µs     177µs     719µs     ↑153B        ↑99K         0
s3.CompleteMultipartUpload  37 (14.0%)  335.4  8.2ms     1.9ms     17.5ms    8.2ms     17.5ms    ↑397B ↓507B  ↑130K ↓166K  0
s3.NewMultipartUpload       37 (14.0%)  335.4  6.2ms     2.1ms     14.2ms    6.1ms     14.1ms    ↑153B ↓437B  ↑50K ↓143K   0
s3.PutObjectPart            37 (14.0%)  335.4  18.4ms    5.1ms     38.8ms    18.4ms    38.8ms    ↑1.4M        ↑469M        0
s3.PutObject                3 (1.1%)    27.2   5.4ms     3.4ms     8.8ms     5.3ms     8.8ms     ↑2.8K        ↑75K         0
```

Branch: Took: 1723.1818181818182 ms on average

```
> mc admin trace --stats minio
Call                        Count       RPM    Avg Time  Min Time  Max Time  Avg TTFB  Max TTFB  Avg Size     Rate /min    Errors
s3.GetObject                77 (29.2%)  696.3  927µs     171µs     4.5ms     610µs     3.5ms     ↑159B ↓712K  ↑108K ↓484M  0
s3.HeadObject               73 (27.7%)  660.1  222µs     109µs     1.2ms     205µs     1.2ms     ↑153B        ↑99K         0
s3.CompleteMultipartUpload  37 (14.0%)  334.6  4.4ms     1.2ms     14.2ms    4.4ms     14.2ms    ↑397B ↓507B  ↑130K ↓166K  0
s3.NewMultipartUpload       37 (14.0%)  334.6  4.3ms     1.2ms     15ms      4.3ms     15ms      ↑153B ↓437B  ↑50K ↓143K   0
s3.PutObjectPart            37 (14.0%)  334.6  14.5ms    2.6ms     30.7ms    14.5ms    30.7ms    ↑1.4M        ↑468M        0
s3.PutObject                3 (1.1%)    27.1   6.6ms     2.8ms     10.4ms    6.5ms     10.3ms    ↑2.8K        ↑75K         0
```
1 parent d4621fd commit 2a5fb97
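As an editorial aside (not part of the commit): a minimal sketch of the stream-opening path this change standardizes on, assuming `pyiceberg[pyarrow]` is installed and using a hypothetical local Parquet file. Before this change, readers resolved a PyArrow `FileSystem` from the path and opened the stream themselves, bypassing any custom `FileIO`.

```python
# Minimal sketch, not from the commit: with this change, readers always obtain
# streams from the FileIO instead of deriving a PyArrow FileSystem from the
# path, so a custom FileIO sees every read.
from pyiceberg.io.pyarrow import PyArrowFileIO

io = PyArrowFileIO()
path = "/tmp/example.parquet"  # hypothetical local file

# Old pattern (removed): fs = _fs_from_file_path(io, path); fs.open_input_file(...)
# New pattern: the FileIO owns stream creation.
with io.new_input(path).open() as stream:
    magic = stream.read(4)  # Parquet files begin with the magic bytes b"PAR1"
    print(magic)
```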

3 files changed (+13 −57 lines)

pyiceberg/io/__init__.py

Lines changed: 0 additions & 13 deletions
```diff
@@ -27,7 +27,6 @@
 
 import importlib
 import logging
-import os
 import warnings
 from abc import ABC, abstractmethod
 from io import SEEK_SET
@@ -37,7 +36,6 @@
     List,
     Optional,
     Protocol,
-    Tuple,
     Type,
     Union,
     runtime_checkable,
@@ -371,14 +369,3 @@ def load_file_io(properties: Properties = EMPTY_DICT, location: Optional[str] =
         raise ModuleNotFoundError(
             'Could not load a FileIO, please consider installing one: pip3 install "pyiceberg[pyarrow]", for more options refer to the docs.'
         ) from e
-
-
-def _parse_location(location: str) -> Tuple[str, str, str]:
-    """Return the path without the scheme."""
-    uri = urlparse(location)
-    if not uri.scheme:
-        return "file", uri.netloc, os.path.abspath(location)
-    elif uri.scheme in ("hdfs", "viewfs"):
-        return uri.scheme, uri.netloc, uri.path
-    else:
-        return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}"
```

pyiceberg/io/pyarrow.py

Lines changed: 12 additions & 43 deletions
```diff
@@ -69,7 +69,6 @@
     FileInfo,
     FileSystem,
     FileType,
-    FSSpecHandler,
 )
 from sortedcontainers import SortedList
 
@@ -117,7 +116,6 @@
     InputStream,
     OutputFile,
     OutputStream,
-    _parse_location,
 )
 from pyiceberg.manifest import (
     DataFile,
@@ -309,9 +307,7 @@ def open(self, seekable: bool = True) -> InputStream:
                 input_file = self._filesystem.open_input_file(self._path)
             else:
                 input_file = self._filesystem.open_input_stream(self._path, buffer_size=self._buffer_size)
-        except FileNotFoundError:
-            raise
-        except PermissionError:
+        except (FileNotFoundError, PermissionError):
             raise
         except OSError as e:
             if e.errno == 2 or "Path does not exist" in str(e):
@@ -916,27 +912,20 @@ def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.Fi
         raise ValueError(f"Unsupported file format: {file_format}")
 
 
-def _construct_fragment(fs: FileSystem, data_file: DataFile, file_format_kwargs: Dict[str, Any] = EMPTY_DICT) -> ds.Fragment:
-    _, _, path = PyArrowFileIO.parse_location(data_file.file_path)
-    return _get_file_format(data_file.file_format, **file_format_kwargs).make_fragment(path, fs)
-
-
-def _read_deletes(fs: FileSystem, data_file: DataFile) -> Dict[str, pa.ChunkedArray]:
+def _read_deletes(io: FileIO, data_file: DataFile) -> Dict[str, pa.ChunkedArray]:
     if data_file.file_format == FileFormat.PARQUET:
-        delete_fragment = _construct_fragment(
-            fs,
-            data_file,
-            file_format_kwargs={"dictionary_columns": ("file_path",), "pre_buffer": True, "buffer_size": ONE_MEGABYTE},
-        )
-        table = ds.Scanner.from_fragment(fragment=delete_fragment).to_table()
+        with io.new_input(data_file.file_path).open() as fi:
+            delete_fragment = _get_file_format(
+                data_file.file_format, dictionary_columns=("file_path",), pre_buffer=True, buffer_size=ONE_MEGABYTE
+            ).make_fragment(fi)
+            table = ds.Scanner.from_fragment(fragment=delete_fragment).to_table()
         table = table.unify_dictionaries()
         return {
             file.as_py(): table.filter(pc.field("file_path") == file).column("pos")
             for file in table.column("file_path").chunks[0].dictionary
         }
     elif data_file.file_format == FileFormat.PUFFIN:
-        _, _, path = PyArrowFileIO.parse_location(data_file.file_path)
-        with fs.open_input_file(path) as fi:
+        with io.new_input(data_file.file_path).open() as fi:
             payload = fi.read()
 
             return PuffinFile(payload).to_vector()
@@ -1383,7 +1372,7 @@ def _get_column_projection_values(
 
 
 def _task_to_record_batches(
-    fs: FileSystem,
+    io: FileIO,
     task: FileScanTask,
     bound_row_filter: BooleanExpression,
     projected_schema: Schema,
@@ -1393,9 +1382,8 @@ def _task_to_record_batches(
     name_mapping: Optional[NameMapping] = None,
     partition_spec: Optional[PartitionSpec] = None,
 ) -> Iterator[pa.RecordBatch]:
-    _, _, path = _parse_location(task.file.file_path)
     arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
-    with fs.open_input_file(path) as fin:
+    with io.new_input(task.file.file_path).open() as fin:
         fragment = arrow_format.make_fragment(fin)
         physical_schema = fragment.physical_schema
         # In V1 and V2 table formats, we only support Timestamp 'us' in Iceberg Schema
@@ -1479,7 +1467,7 @@ def _read_all_delete_files(io: FileIO, tasks: Iterable[FileScanTask]) -> Dict[st
         executor = ExecutorFactory.get_or_create()
         deletes_per_files: Iterator[Dict[str, ChunkedArray]] = executor.map(
             lambda args: _read_deletes(*args),
-            [(_fs_from_file_path(io, delete_file.file_path), delete_file) for delete_file in unique_deletes],
+            [(io, delete_file) for delete_file in unique_deletes],
         )
         for delete in deletes_per_files:
             for file, arr in delete.items():
@@ -1491,25 +1479,6 @@ def _read_all_delete_files(io: FileIO, tasks: Iterable[FileScanTask]) -> Dict[st
     return deletes_per_file
 
 
-def _fs_from_file_path(io: FileIO, file_path: str) -> FileSystem:
-    scheme, netloc, _ = _parse_location(file_path)
-    if isinstance(io, PyArrowFileIO):
-        return io.fs_by_scheme(scheme, netloc)
-    else:
-        try:
-            from pyiceberg.io.fsspec import FsspecFileIO
-
-            if isinstance(io, FsspecFileIO):
-                from pyarrow.fs import PyFileSystem
-
-                return PyFileSystem(FSSpecHandler(io.get_fs(scheme)))
-            else:
-                raise ValueError(f"Expected PyArrowFileIO or FsspecFileIO, got: {io}")
-        except ModuleNotFoundError as e:
-            # When FsSpec is not installed
-            raise ValueError(f"Expected PyArrowFileIO or FsspecFileIO, got: {io}") from e
-
-
 class ArrowScan:
     _table_metadata: TableMetadata
     _io: FileIO
@@ -1654,7 +1623,7 @@ def _record_batches_from_scan_tasks_and_deletes(
             if self._limit is not None and total_row_count >= self._limit:
                 break
             batches = _task_to_record_batches(
-                _fs_from_file_path(self._io, task.file.file_path),
+                self._io,
                 task,
                 self._bound_row_filter,
                 self._projected_schema,
```
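Because every read now goes through `FileIO.new_input`, a custom `FileIO` can observe or override stream creation. Below is a minimal editorial sketch, assuming a hypothetical `CountingFileIO` wrapper (not part of the codebase) that delegates to `PyArrowFileIO` while recording each location it opens.

```python
# Hypothetical helper, not in the codebase: delegates to PyArrowFileIO but
# records every location passed to new_input(). With this change, data files,
# positional-delete files, and Puffin blobs all show up here, because
# pyarrow.py no longer resolves a PyArrow FileSystem behind the FileIO's back.
from typing import List

from pyiceberg.io.pyarrow import PyArrowFile, PyArrowFileIO
from pyiceberg.typedef import EMPTY_DICT, Properties


class CountingFileIO(PyArrowFileIO):
    def __init__(self, properties: Properties = EMPTY_DICT) -> None:
        super().__init__(properties)
        self.opened: List[str] = []  # locations requested through this FileIO

    def new_input(self, location: str) -> PyArrowFile:
        self.opened.append(location)
        return super().new_input(location)
```

Passing such an instance wherever a `FileIO` is expected (for example, as the first argument of the updated `_read_deletes`) is enough to route all reads through it; before this commit, a PyArrow `FileSystem` was derived from the path and the wrapper would have been bypassed.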

tests/io/test_pyarrow.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1539,7 +1539,7 @@ def deletes_file(tmp_path: str, example_task: FileScanTask) -> str:
 
 
 def test_read_deletes(deletes_file: str, example_task: FileScanTask) -> None:
-    deletes = _read_deletes(LocalFileSystem(), DataFile.from_args(file_path=deletes_file, file_format=FileFormat.PARQUET))
+    deletes = _read_deletes(PyArrowFileIO(), DataFile.from_args(file_path=deletes_file, file_format=FileFormat.PARQUET))
     assert set(deletes.keys()) == {example_task.file.file_path}
     assert list(deletes.values())[0] == pa.chunked_array([[1, 3, 5]])
 
```
