Skip to content

ENH: add support to read and write .gpkg.zip and .shp.zip #527

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
### Improvements

- Capture all errors logged by GDAL when opening a file fails (#495).
- Add support to read and write ".gpkg.zip" (GDAL >= 3.7), ".shp.zip", and ".shz"
files (#527).

### Bug fixes

Expand Down
1 change: 1 addition & 0 deletions pyogrio/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
PANDAS_GE_22 = pandas is not None and Version(pandas.__version__) >= Version("2.2.0")

# Feature flags: each is True when the GDAL version in use is at least the
# version named in the flag.
GDAL_GE_352 = __gdal_version__ >= (3, 5, 2)
# Tests use this flag to skip ".gpkg.zip" cases on older GDAL versions.
GDAL_GE_37 = __gdal_version__ >= (3, 7, 0)
GDAL_GE_38 = __gdal_version__ >= (3, 8, 0)

HAS_GDAL_GEOS = __gdal_geos_version__ is not None
Expand Down
3 changes: 3 additions & 0 deletions pyogrio/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@
".geojsonl": "GeoJSONSeq",
".geojsons": "GeoJSONSeq",
".gpkg": "GPKG",
".gpkg.zip": "GPKG",
".shp": "ESRI Shapefile",
".shp.zip": "ESRI Shapefile",
".shz": "ESRI Shapefile",
}

# mapping of driver name to extension
Expand Down
3 changes: 3 additions & 0 deletions pyogrio/tests/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,9 @@ def test_write_append(request, tmp_path, naturalearth_lowres, ext):
pytest.mark.xfail(reason="Bugs with append when writing Arrow to GeoJSON")
)

if ext == ".gpkg.zip":
pytest.skip("Append is not supported for .gpkg.zip")

meta, table = read_arrow(naturalearth_lowres)

# coerce output layer to generic Geometry to avoid mixed type errors
Expand Down
32 changes: 31 additions & 1 deletion pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@
vsi_listtree,
vsi_unlink,
)
from pyogrio._compat import GDAL_GE_352, HAS_ARROW_WRITE_API, HAS_PYPROJ, PANDAS_GE_15
from pyogrio._compat import (
GDAL_GE_37,
GDAL_GE_352,
HAS_ARROW_WRITE_API,
HAS_PYPROJ,
PANDAS_GE_15,
)
from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError
from pyogrio.geopandas import PANDAS_GE_20, read_dataframe, write_dataframe
from pyogrio.raw import (
Expand Down Expand Up @@ -1580,6 +1586,30 @@ def test_custom_crs_io(tmp_path, naturalearth_lowres_all_ext, use_arrow):
assert df.crs.equals(expected.crs)


@pytest.mark.parametrize("ext", [".gpkg.zip", ".shp.zip", ".shz"])
@pytest.mark.requires_arrow_write_api
def test_write_read_zipped_ext(tmp_path, naturalearth_lowres, ext, use_arrow):
    """Round-trip a GeoDataFrame through each zipped extension.

    The dataset is written to ``test<ext>`` in ``tmp_path``, read back, and
    compared with the original. The ".gpkg.zip" case is skipped when
    ``GDAL_GE_37`` is False.
    """
    is_zipped_gpkg = ext == ".gpkg.zip"
    if is_zipped_gpkg and not GDAL_GE_37:
        pytest.skip(".gpkg.zip support requires GDAL >= 3.7")

    source = read_dataframe(naturalearth_lowres)
    zipped_file = tmp_path / f"test{ext}"
    write_dataframe(source, zipped_file, use_arrow=use_arrow)

    # The writer must have produced a file at exactly the requested path.
    assert zipped_file.exists()

    roundtrip = read_dataframe(zipped_file)
    found_types = roundtrip.geometry.type.unique()

    # Drivers listed in DRIVERS_NO_MIXED_SINGLE_MULTI are expected to come back
    # with MultiPolygon only; all others keep both Polygon and MultiPolygon.
    if DRIVERS[ext] not in DRIVERS_NO_MIXED_SINGLE_MULTI:
        assert set(found_types) == {"MultiPolygon", "Polygon"}
    else:
        assert list(found_types) == ["MultiPolygon"]

    assert_geodataframe_equal(roundtrip, source, check_index_type=False)


def test_write_read_mixed_column_values(tmp_path):
# use_arrow=True is tested separately below
mixed_values = ["test", 1.0, 1, datetime.now(), None, np.nan]
Expand Down
10 changes: 10 additions & 0 deletions pyogrio/tests/test_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,20 @@ def change_cwd(path):
[
# local file paths that should be passed through as is
("data.gpkg", "data.gpkg"),
("data.gpkg.zip", "data.gpkg.zip"),
("data.shp.zip", "data.shp.zip"),
(Path("data.gpkg"), "data.gpkg"),
(Path("data.gpkg.zip"), "data.gpkg.zip"),
(Path("data.shp.zip"), "data.shp.zip"),
("/home/user/data.gpkg", "/home/user/data.gpkg"),
("/home/user/data.gpkg.zip", "/home/user/data.gpkg.zip"),
("/home/user/data.shp.zip", "/home/user/data.shp.zip"),
(r"C:\User\Documents\data.gpkg", r"C:\User\Documents\data.gpkg"),
(r"C:\User\Documents\data.gpkg.zip", r"C:\User\Documents\data.gpkg.zip"),
(r"C:\User\Documents\data.shp.zip", r"C:\User\Documents\data.shp.zip"),
("file:///home/user/data.gpkg", "/home/user/data.gpkg"),
("file:///home/user/data.gpkg.zip", "/home/user/data.gpkg.zip"),
("file:///home/user/data.shp.zip", "/home/user/data.shp.zip"),
("/home/folder # with hash/data.gpkg", "/home/folder # with hash/data.gpkg"),
# cloud URIs
("https://testing/data.gpkg", "/vsicurl/https://testing/data.gpkg"),
Expand Down
8 changes: 6 additions & 2 deletions pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
read_info,
set_gdal_config_options,
)
from pyogrio._compat import HAS_PYARROW, HAS_SHAPELY
from pyogrio._compat import GDAL_GE_37, HAS_PYARROW, HAS_SHAPELY
from pyogrio.errors import DataLayerError, DataSourceError, FeatureError
from pyogrio.raw import open_arrow, read, write
from pyogrio.tests.conftest import (
Expand Down Expand Up @@ -63,9 +63,10 @@ def test_read(naturalearth_lowres):
@pytest.mark.parametrize("ext", DRIVERS)
def test_read_autodetect_driver(tmp_path, naturalearth_lowres, ext):
# Test all supported autodetect drivers
if ext == ".gpkg.zip" and not GDAL_GE_37:
pytest.skip(".gpkg.zip not supported for gdal < 3.7.0")
testfile = prepare_testfile(naturalearth_lowres, dst_dir=tmp_path, ext=ext)

assert testfile.suffix == ext
assert testfile.exists()
meta, _, geometry, fields = read(testfile)

Expand Down Expand Up @@ -703,6 +704,9 @@ def test_write_append(tmp_path, naturalearth_lowres, ext):
if ext in (".geojsonl", ".geojsons") and __gdal_version__ < (3, 6, 0):
pytest.skip("Append to GeoJSONSeq only available for GDAL >= 3.6.0")

if ext == ".gpkg.zip":
pytest.skip("Append to .gpkg.zip is not supported")

meta, _, geometry, field_data = read(naturalearth_lowres)

# coerce output layer to MultiPolygon to avoid mixed type errors
Expand Down
17 changes: 15 additions & 2 deletions pyogrio/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

from pyogrio._vsi import vsimem_rmtree_toplevel as _vsimem_rmtree_toplevel

# Multi-part extensions ending in ".zip" that a GDAL driver supports directly.
# Paths ending in one of these are passed through as is by the path handling
# in this module rather than being treated as generic zip archives. Kept as a
# tuple so it can be passed straight to ``str.endswith``.
MULTI_EXTENSIONS = (".gpkg.zip", ".shp.zip")


def get_vsi_path_or_buffer(path_or_buffer):
"""Get VSI-prefixed path or bytes buffer depending on type of path_or_buffer.
Expand Down Expand Up @@ -68,15 +70,23 @@ def vsi_path(path: Union[str, Path]) -> str:
# Windows drive letters (e.g. "C:\") confuse `urlparse` as they look like
# URL schemes
if sys.platform == "win32" and re.match("^[a-zA-Z]\\:", path):
# If it is not a zip file, or it is a multi-extension zip file that is directly
# supported by a GDAL driver, return the path as is.
if not path.split("!")[0].endswith(".zip"):
return path
if path.split("!")[0].endswith(MULTI_EXTENSIONS):
return path

# prefix then allow to proceed with remaining parsing
path = f"zip://{path}"

path, archive, scheme = _parse_uri(path)

if scheme or archive or path.endswith(".zip"):
if (
scheme
or archive
or (path.endswith(".zip") and not path.endswith(MULTI_EXTENSIONS))
):
return _construct_vsi_path(path, archive, scheme)

return path
Expand Down Expand Up @@ -146,7 +156,10 @@ def _construct_vsi_path(path, archive, scheme) -> str:
suffix = ""
schemes = scheme.split("+")

if "zip" not in schemes and (archive.endswith(".zip") or path.endswith(".zip")):
if "zip" not in schemes and (
archive.endswith(".zip")
or (path.endswith(".zip") and not path.endswith(MULTI_EXTENSIONS))
):
schemes.insert(0, "zip")

if schemes:
Expand Down