
COMPAT: prepare for pandas 3.0 string dtype #493


Merged: 11 commits, Apr 30, 2025
5 changes: 4 additions & 1 deletion .github/workflows/tests-conda.yml
@@ -44,6 +44,7 @@ jobs:
- os: "ubuntu-latest"
python: "3.11"
env: "nightly-deps"
pandas_future_infer_string: "1"

steps:
- name: Checkout repo
@@ -68,5 +69,7 @@ jobs:
run: pip install -e .

- name: Test
env:
PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }}
run: |
-pytest -v --color=yes -r s pyogrio/tests
+pytest -v -s --color=yes -r s pyogrio/tests
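
Note: the PANDAS_FUTURE_INFER_STRING environment variable set above is what pandas 2.x consults to enable its opt-in string-dtype inference. A minimal sketch of the in-process equivalent, assuming a pandas version that has the future.infer_string option (2.1+):

import pandas as pd

# Equivalent to exporting PANDAS_FUTURE_INFER_STRING=1 before starting Python;
# pandas then infers the new string dtype instead of object for string data.
pd.set_option("future.infer_string", True)

s = pd.Series(["a", "b", None])
print(s.dtype)  # the new string dtype rather than object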
5 changes: 5 additions & 0 deletions pyogrio/_compat.py
@@ -33,16 +33,21 @@
HAS_ARROW_WRITE_API = __gdal_version__ >= (3, 8, 0)
HAS_PYARROW = pyarrow is not None
HAS_PYPROJ = pyproj is not None
PYARROW_GE_19 = pyarrow is not None and Version(pyarrow.__version__) >= Version(
"19.0.0"
)

HAS_GEOPANDAS = geopandas is not None

PANDAS_GE_15 = pandas is not None and Version(pandas.__version__) >= Version("1.5.0")
PANDAS_GE_20 = pandas is not None and Version(pandas.__version__) >= Version("2.0.0")
PANDAS_GE_22 = pandas is not None and Version(pandas.__version__) >= Version("2.2.0")
PANDAS_GE_30 = pandas is not None and Version(pandas.__version__) >= Version("3.0.0dev")

GDAL_GE_352 = __gdal_version__ >= (3, 5, 2)
GDAL_GE_37 = __gdal_version__ >= (3, 7, 0)
GDAL_GE_38 = __gdal_version__ >= (3, 8, 0)
GDAL_GE_311 = __gdal_version__ >= (3, 11, 0)

HAS_GDAL_GEOS = __gdal_geos_version__ is not None

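A note on the "3.0.0dev" comparison above: PEP 440 dev builds sort below the final release, so comparing against "3.0.0dev" also matches pandas nightlies. A minimal sketch, assuming Version here is packaging.version.Version as is usual in _compat modules:

from packaging.version import Version

# A nightly such as "3.0.0.dev0+..." compares below the final 3.0.0 release,
# so ">= 3.0.0" would miss it; ">= 3.0.0dev" catches dev builds too.
print(Version("3.0.0.dev0") >= Version("3.0.0"))     # False
print(Version("3.0.0.dev0") >= Version("3.0.0dev"))  # True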
33 changes: 30 additions & 3 deletions pyogrio/geopandas.py
@@ -5,7 +5,14 @@

import numpy as np

-from pyogrio._compat import HAS_GEOPANDAS, PANDAS_GE_15, PANDAS_GE_20, PANDAS_GE_22
+from pyogrio._compat import (
+    HAS_GEOPANDAS,
+    PANDAS_GE_15,
+    PANDAS_GE_20,
+    PANDAS_GE_22,
+    PANDAS_GE_30,
+    PYARROW_GE_19,
+)
from pyogrio.errors import DataSourceError
from pyogrio.raw import (
DRIVERS_NO_MIXED_DIMENSIONS,
@@ -52,13 +59,13 @@ def _try_parse_datetime(ser):
except Exception:
res = ser
# if object dtype, try to parse as utc instead
-if res.dtype == "object":
+if res.dtype in ("object", "string"):
try:
res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
except Exception:
pass

-if res.dtype != "object":
+if res.dtype.kind == "M":

Member: To confirm, is this the dtype code for datetime with millisecond precision? Perhaps add a comment to indicate the full dtype name here.

Member Author: Not specifically millisecond resolution, but just datetime64 in general (the numpy datetime64 dtype's short character name is "M"). But I will add a comment.

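To illustrate the exchange above, a minimal sketch using plain pandas/numpy, nothing pyogrio-specific:

import pandas as pd

ser = pd.to_datetime(pd.Series(["2020-01-01"]))
print(ser.dtype)       # datetime64[ns] (any resolution: [s], [ms], [us], [ns])
print(ser.dtype.kind)  # "M", numpy's kind code for every datetime64 resolution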
# GDAL only supports ms precision, so convert outputs to match.
# Pandas 2.0 supports datetime64[ms] directly; prior versions only support [ns],
# so round the values to [ms] precision instead.
@@ -285,11 +292,31 @@ def read_dataframe(
)

if use_arrow:
import pyarrow as pa

meta, table = result

# split_blocks and self_destruct decrease memory usage, but have the side effect
# that accessing the table afterwards causes a crash, so del the table to avoid that.
kwargs = {"self_destruct": True}
if PANDAS_GE_30:
# starting with pyarrow 19.0, pyarrow handles this correctly itself, so only
# use types_mapper as a workaround for older versions
if not PYARROW_GE_19:
kwargs["types_mapper"] = {
pa.string(): pd.StringDtype(na_value=np.nan),
pa.large_string(): pd.StringDtype(na_value=np.nan),
pa.json_(): pd.StringDtype(na_value=np.nan),
}.get
# TODO: enable the block below when the upstream issue to accept extension
# types is fixed
# else:
#     # for newer pyarrow, still include a mapping for json:
#     # GDAL 3.11 started to emit this extension type, but pyarrow does
#     # not yet support it properly in the conversion to pandas
#     kwargs["types_mapper"] = {
#         pa.json_(): pd.StringDtype(na_value=np.nan),
#     }.get
if arrow_to_pandas_kwargs is not None:
kwargs.update(arrow_to_pandas_kwargs)
df = table.to_pandas(**kwargs)
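
As a standalone illustration of the types_mapper mechanism used above, a sketch with made-up data; the mapping mirrors the diff, and StringDtype(na_value=...) assumes a pandas version that accepts that argument:

import numpy as np
import pandas as pd
import pyarrow as pa

table = pa.table({"name": pa.array(["a", None, "c"], type=pa.large_string())})

# types_mapper is called once per Arrow type; returning a pandas dtype
# overrides the default conversion, returning None keeps the default.
mapper = {
    pa.string(): pd.StringDtype(na_value=np.nan),
    pa.large_string(): pd.StringDtype(na_value=np.nan),
}.get

df = table.to_pandas(types_mapper=mapper)
print(df["name"].dtype)  # the NaN-backed pandas string dtype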
47 changes: 37 additions & 10 deletions pyogrio/tests/test_geopandas_io.py
@@ -18,10 +18,12 @@
)
from pyogrio._compat import (
GDAL_GE_37,
GDAL_GE_311,
GDAL_GE_352,
HAS_ARROW_WRITE_API,
HAS_PYPROJ,
PANDAS_GE_15,
PANDAS_GE_30,
SHAPELY_GE_21,
)
from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError
@@ -256,13 +258,20 @@ def test_read_layer(tmp_path, use_arrow):

# create a multilayer GPKG
expected1 = gp.GeoDataFrame(geometry=[Point(0, 0)], crs="EPSG:4326")
if use_arrow:
# TODO: this needs to be fixed on the geopandas side (to ensure the
# GeoDataFrame() constructor does this); with use_arrow we already
# get a columns Index with string dtype
expected1.columns = expected1.columns.astype("str")
write_dataframe(
expected1,
filename,
layer="layer1",
)

expected2 = gp.GeoDataFrame(geometry=[Point(1, 1)], crs="EPSG:4326")
if use_arrow:
expected2.columns = expected2.columns.astype("str")
write_dataframe(expected2, filename, layer="layer2", append=True)

assert np.array_equal(
@@ -385,7 +394,7 @@ def test_read_null_values(tmp_path, use_arrow):
df = read_dataframe(filename, use_arrow=use_arrow, read_geometry=False)

# make sure that Null values are preserved
-assert np.array_equal(df.col.values, expected.col.values)
+assert df["col"].isna().all()
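
The looser assert reflects, as far as the diff shows, that missing values come back as NaN rather than None under the NaN-backed string dtype, so identity comparisons no longer hold. A minimal sketch (StringDtype(na_value=...) again assumes a recent pandas):

import numpy as np
import pandas as pd

s = pd.Series([None, None], dtype=pd.StringDtype(na_value=np.nan))
assert s.isna().all()    # nulls are preserved ...
assert s[0] is not None  # ... but surface as NaN, not None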


def test_read_fid_as_index(naturalearth_lowres_all_ext, use_arrow):
@@ -699,6 +708,13 @@ def test_read_skip_features(naturalearth_lowres_all_ext, use_arrow, skip_features):
# In .geojsonl the vertices are reordered, so normalize
is_jsons = ext == ".geojsonl"

if skip_features == 200 and not use_arrow:
# result is an empty dataframe, so no proper dtype inference happens
# for the numpy object dtype arrays
df[["continent", "name", "iso_a3"]] = df[
["continent", "name", "iso_a3"]
].astype("str")

assert_geodataframe_equal(
df,
expected,
@@ -1180,6 +1196,10 @@ def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow):
# For older pandas versions, the index is created as Object dtype but read as
# RangeIndex, so don't check the index dtype in that case.
check_index_type = True if PANDAS_GE_20 else False
# with pandas 3+ and reading through arrow, we preserve the string dtype
# (no proper dtype inference happens for the empty numpy object dtype arrays)
if use_arrow and dtype is object:
expected["col_object"] = expected["col_object"].astype("str")
assert_geodataframe_equal(df, expected, check_index_type=check_index_type)


@@ -1214,7 +1234,11 @@ def test_write_None_string_column(tmp_path, use_arrow):
assert filename.exists()

result_gdf = read_dataframe(filename, use_arrow=use_arrow)
-assert result_gdf.object_col.dtype == object
+assert (
+    result_gdf.object_col.dtype == "str" if PANDAS_GE_30 and use_arrow else object
+)
+if PANDAS_GE_30 and use_arrow:
+    gdf["object_col"] = gdf["object_col"].astype("str")
assert_geodataframe_equal(result_gdf, gdf)

Member: Could this assert be folded into the conditional below (with a corresponding else)?
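
One way to fold the assert into the conditional, as the reviewer suggests; a sketch reusing this test's local names (result_gdf, gdf, use_arrow), not a tested patch:

if PANDAS_GE_30 and use_arrow:
    # reading through arrow under pandas 3 yields the new string dtype
    assert result_gdf.object_col.dtype == "str"
    gdf["object_col"] = gdf["object_col"].astype("str")
else:
    assert result_gdf.object_col.dtype == object
assert_geodataframe_equal(result_gdf, gdf)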


@@ -1658,11 +1682,12 @@ def test_write_read_mixed_column_values(tmp_path):
write_dataframe(test_gdf, output_path)
output_gdf = read_dataframe(output_path)
assert len(test_gdf) == len(output_gdf)
-for idx, value in enumerate(mixed_values):
-    if value in (None, np.nan):
-        assert output_gdf["mixed"][idx] is None
-    else:
-        assert output_gdf["mixed"][idx] == str(value)
+# mixed values as object dtype are currently written as strings
+expected = pd.Series(
+    [str(value) if value not in (None, np.nan) else None for value in mixed_values],
+    name="mixed",
+)
+assert_series_equal(output_gdf["mixed"], expected)

Member: Suggested change:
-# mixed values as object dtype are currently written as strings
+# non-null mixed values as object dtype are currently written as strings

Member Author: (also null values are written as string dtype, but preserving the fact that they are null)


@requires_arrow_write_api
@@ -1695,8 +1720,8 @@ def test_write_read_null(tmp_path, use_arrow):
assert pd.isna(result_gdf["float64"][1])
assert pd.isna(result_gdf["float64"][2])
assert result_gdf["object_str"][0] == "test"
-assert result_gdf["object_str"][1] is None
-assert result_gdf["object_str"][2] is None
+assert pd.isna(result_gdf["object_str"][1])
+assert pd.isna(result_gdf["object_str"][2])


@pytest.mark.requires_arrow_write_api
@@ -1927,6 +1952,8 @@ def test_read_dataset_kwargs(nested_geojson_file, use_arrow):
geometry=[shapely.Point(0, 0)],
crs="EPSG:4326",
)
if GDAL_GE_311 and use_arrow:
expected["intermediate_level"] = expected["intermediate_level"].astype(object)

assert_geodataframe_equal(df, expected)

@@ -1972,7 +1999,7 @@ def test_write_nullable_dtypes(tmp_path, use_arrow):
expected["col2"] = expected["col2"].astype("float64")
expected["col3"] = expected["col3"].astype("float32")
expected["col4"] = expected["col4"].astype("float64")
-expected["col5"] = expected["col5"].astype(object)
+expected["col5"] = expected["col5"].astype("str")
expected.loc[1, "col5"] = None # pandas converts to pd.NA on line above
assert_geodataframe_equal(output_gdf, expected)
