COMPAT: prepare for pandas 3.0 string dtype #493

@@ -18,10 +18,12 @@
 )
 from pyogrio._compat import (
     GDAL_GE_37,
+    GDAL_GE_311,
     GDAL_GE_352,
     HAS_ARROW_WRITE_API,
     HAS_PYPROJ,
     PANDAS_GE_15,
+    PANDAS_GE_30,
     SHAPELY_GE_21,
 )
 from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError

@@ -256,13 +258,20 @@ def test_read_layer(tmp_path, use_arrow):

     # create a multilayer GPKG
     expected1 = gp.GeoDataFrame(geometry=[Point(0, 0)], crs="EPSG:4326")
+    if use_arrow:
+        # TODO this needs to be fixed on the geopandas side (to ensure the
+        # GeoDataFrame() constructor does this), when use_arrow we already
+        # get columns Index with string dtype
+        expected1.columns = expected1.columns.astype("str")
     write_dataframe(
         expected1,
         filename,
         layer="layer1",
     )

     expected2 = gp.GeoDataFrame(geometry=[Point(1, 1)], crs="EPSG:4326")
+    if use_arrow:
+        expected2.columns = expected2.columns.astype("str")
     write_dataframe(expected2, filename, layer="layer2", append=True)

     assert np.array_equal(

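The columns workaround above aligns the dtype of the expected frame's columns Index with what the Arrow read path returns under pandas 3. A minimal sketch of the mismatch, assuming pandas >= 3.0 with its default string dtype and the current geopandas behaviour described in the TODO:

    import geopandas as gp
    from shapely.geometry import Point

    gdf = gp.GeoDataFrame(geometry=[Point(0, 0)], crs="EPSG:4326")
    # The constructor-built columns Index is still object dtype, while a frame
    # read back through Arrow carries the new pandas 3 string dtype ("str"),
    # so the expected side is cast before comparing.
    print(gdf.columns.dtype)                # object (per the TODO above)
    print(gdf.columns.astype("str").dtype)  # str (pandas >= 3.0)
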
@@ -385,7 +394,7 @@ def test_read_null_values(tmp_path, use_arrow):
     df = read_dataframe(filename, use_arrow=use_arrow, read_geometry=False)

     # make sure that Null values are preserved
-    assert np.array_equal(df.col.values, expected.col.values)
+    assert df["col"].isna().all()


 def test_read_fid_as_index(naturalearth_lowres_all_ext, use_arrow):

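The assertion change avoids element-wise comparison of null values: np.array_equal treats NaN as unequal to itself, so comparing two all-null columns fails even when both are missing everywhere. A small self-contained illustration:

    import numpy as np
    import pandas as pd

    a = np.array([np.nan, np.nan])
    print(np.array_equal(a, a))                  # False, NaN != NaN
    print(np.array_equal(a, a, equal_nan=True))  # True

    # Checking null-ness directly is independent of how the missing value is
    # represented (None, NaN or pd.NA).
    print(pd.Series([None, np.nan]).isna().all())  # True
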
@@ -699,6 +708,13 @@ def test_read_skip_features(naturalearth_lowres_all_ext, use_arrow, skip_feature
     # In .geojsonl the vertices are reordered, so normalize
     is_jsons = ext == ".geojsonl"

+    if skip_features == 200 and not use_arrow:
+        # result is an empty dataframe, so no proper dtype inference happens
+        # for the numpy object dtype arrays
+        df[["continent", "name", "iso_a3"]] = df[
+            ["continent", "name", "iso_a3"]
+        ].astype("str")
+
     assert_geodataframe_equal(
         df,
         expected,

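The cast above exists because dtype inference has nothing to work with when zero features are read: the column stays a numpy object array, while the expected frame, built from real values, gets the pandas 3 string dtype. A minimal sketch, assuming pandas >= 3.0 (the column name is one of the naturalearth columns used in the test):

    import numpy as np
    import pandas as pd

    non_empty = pd.Series(["Africa"], name="continent")              # str dtype on pandas >= 3.0
    empty = pd.Series(np.array([], dtype=object), name="continent")  # stays object
    print(non_empty.dtype, empty.dtype)
    print(empty.astype("str").dtype)  # str on pandas >= 3.0, making the dtypes comparable
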
@@ -1180,6 +1196,10 @@ def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow):
     # For older pandas versions, the index is created as Object dtype but read as
     # RangeIndex, so don't check the index dtype in that case.
     check_index_type = True if PANDAS_GE_20 else False
+    # with pandas 3+ and reading through arrow, we preserve the string dtype
+    # (no proper dtype inference happens for the empty numpy object dtype arrays)
+    if use_arrow and dtype is object:
+        expected["col_object"] = expected["col_object"].astype("str")
     assert_geodataframe_equal(df, expected, check_index_type=check_index_type)

@@ -1214,7 +1234,11 @@ def test_write_None_string_column(tmp_path, use_arrow):
     assert filename.exists()

     result_gdf = read_dataframe(filename, use_arrow=use_arrow)
-    assert result_gdf.object_col.dtype == object
+    assert (
+        result_gdf.object_col.dtype == "str" if PANDAS_GE_30 and use_arrow else object
+    )
+    if PANDAS_GE_30 and use_arrow:
+        gdf["object_col"] = gdf["object_col"].astype("str")
     assert_geodataframe_equal(result_gdf, gdf)

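One note on the new assert: Python groups the conditional expression as (dtype == "str") if condition else object, so when the condition is false the statement asserts the class object itself, which is always truthy. The snippet below is a self-contained illustration with stand-in names, not PR code; if the else branch is meant to compare against object dtype, computing the expected dtype first exercises both branches.

    cond = False        # stand-in for PANDAS_GE_30 and use_arrow
    dtype = "float64"   # deliberately "wrong" stand-in for result_gdf.object_col.dtype

    # Groups as (dtype == "str") if cond else object: with cond False the assert
    # only checks the truthiness of the class `object`, so it passes anyway.
    assert (dtype == "str" if cond else object)

    # Variant that compares in both branches (hypothetical rewrite):
    expected_dtype = "str" if cond else object
    print(dtype == expected_dtype)  # False, this form would catch the mismatch
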
@@ -1658,11 +1682,12 @@ def test_write_read_mixed_column_values(tmp_path):
     write_dataframe(test_gdf, output_path)
     output_gdf = read_dataframe(output_path)
     assert len(test_gdf) == len(output_gdf)
-    for idx, value in enumerate(mixed_values):
-        if value in (None, np.nan):
-            assert output_gdf["mixed"][idx] is None
-        else:
-            assert output_gdf["mixed"][idx] == str(value)
+    # mixed values as object dtype are currently written as strings
+    expected = pd.Series(
+        [str(value) if value not in (None, np.nan) else None for value in mixed_values],
+        name="mixed",
+    )
+    assert_series_equal(output_gdf["mixed"], expected)


 @requires_arrow_write_api

Review comment on the new comment line: (also null values are written as string dtype, but preserving the fact they are null)

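A short sketch of the expectation built above, showing how mixed object-dtype values round-trip: non-null entries come back as their string representation and nulls stay missing (the mixed_values list here is a hypothetical stand-in for the one defined earlier in the test):

    import numpy as np
    import pandas as pd

    mixed_values = ["a", 1, 2.5, None, np.nan]
    expected = pd.Series(
        [str(v) if v not in (None, np.nan) else None for v in mixed_values],
        name="mixed",
    )
    print(expected[expected.notna()].tolist())  # ['a', '1', '2.5']
    print(int(expected.isna().sum()))           # 2, the nulls stay missing
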
@@ -1695,8 +1720,8 @@ def test_write_read_null(tmp_path, use_arrow):
     assert pd.isna(result_gdf["float64"][1])
     assert pd.isna(result_gdf["float64"][2])
     assert result_gdf["object_str"][0] == "test"
-    assert result_gdf["object_str"][1] is None
-    assert result_gdf["object_str"][2] is None
+    assert pd.isna(result_gdf["object_str"][1])
+    assert pd.isna(result_gdf["object_str"][2])


 @pytest.mark.requires_arrow_write_api

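Switching from "is None" to pd.isna makes the check independent of how the missing value is represented: under the pandas 3 default string dtype a missing entry reads back as NaN rather than None, and pd.isna covers None, NaN and pd.NA alike. A minimal illustration:

    import numpy as np
    import pandas as pd

    print(pd.isna(None), pd.isna(np.nan), pd.isna(pd.NA))  # True True True
    print(np.nan is None)                                  # False, identity check misses NaN
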
@@ -1927,6 +1952,8 @@ def test_read_dataset_kwargs(nested_geojson_file, use_arrow):
         geometry=[shapely.Point(0, 0)],
         crs="EPSG:4326",
     )
+    if GDAL_GE_311 and use_arrow:
+        expected["intermediate_level"] = expected["intermediate_level"].astype(object)

     assert_geodataframe_equal(df, expected)

@@ -1972,7 +1999,7 @@ def test_write_nullable_dtypes(tmp_path, use_arrow):
     expected["col2"] = expected["col2"].astype("float64")
     expected["col3"] = expected["col3"].astype("float32")
     expected["col4"] = expected["col4"].astype("float64")
-    expected["col5"] = expected["col5"].astype(object)
+    expected["col5"] = expected["col5"].astype("str")
     expected.loc[1, "col5"] = None # pandas converts to pd.NA on line above
     assert_geodataframe_equal(output_gdf, expected)

Review comment: To confirm, this is the dtype code for datetime with millisecond precision? Perhaps add a comment to indicate the full dtype name here?

Reply: Not specifically millisecond resolution, but just datetime64 in general (the numpy datetime64 dtype's short character name is "M"). But will add a comment.
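
For reference, a small illustration of the kind code mentioned in the reply: "M" is the numpy kind shared by all datetime64 dtypes regardless of resolution (lowercase "m" is timedelta64):

    import numpy as np

    print(np.dtype("datetime64[ms]").kind)   # 'M'
    print(np.dtype("datetime64[ns]").kind)   # 'M'
    print(np.dtype("timedelta64[ns]").kind)  # 'm'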