diff --git a/CHANGES.md b/CHANGES.md
index c1f44855..97310719 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -5,6 +5,7 @@
 ### Improvements
 
 - Capture all errors logged by gdal when opening a file fails (#495).
+- Improve support for datetime columns (#486).
 - Add support to read and write ".gpkg.zip" (GDAL >= 3.7), ".shp.zip", and ".shz" files (#527).
 - Compatibility with the string dtype in the upcoming pandas 3.0 release (#493).
diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index 9cd5756e..1ab601ea 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -1482,6 +1482,7 @@ def ogr_open_arrow(
     int return_fids=False,
     int batch_size=0,
     use_pyarrow=False,
+    datetime_as_string=False,
 ):
     cdef int err = 0
@@ -1695,6 +1696,12 @@ def ogr_open_arrow(
             "GEOARROW".encode("UTF-8")
         )
 
+    # Read DateTime fields as strings, as the Arrow DateTime column type is
+    # quite limited regarding support for mixed timezones.
+    IF CTE_GDAL_VERSION >= (3, 11, 0):
+        if datetime_as_string:
+            options = CSLSetNameValue(options, "DATETIME_AS_STRING", "YES")
+
     # make sure layer is read from beginning
     OGR_L_ResetReading(ogr_layer)
@@ -1720,6 +1727,7 @@ def ogr_open_arrow(
         "crs": crs,
         "encoding": encoding,
         "fields": fields[:, 2],
+        "dtypes": fields[:, 3],
         "geometry_type": geometry_type,
         "geometry_name": geometry_name,
         "fid_column": fid_column,
diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py
index ce57575b..4ad5ab9c 100644
--- a/pyogrio/geopandas.py
+++ b/pyogrio/geopandas.py
@@ -2,6 +2,7 @@
 import os
 import warnings
+from datetime import datetime
 
 import numpy as np
@@ -46,6 +47,7 @@ def _try_parse_datetime(ser):
         datetime_kwargs = {"format": "ISO8601", "errors": "ignore"}
     else:
         datetime_kwargs = {"yearfirst": True}
+
     with warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
@@ -58,12 +60,6 @@
             res = pd.to_datetime(ser, **datetime_kwargs)
         except Exception:
             res = ser
-        # if object dtype, try parse as utc instead
-        if res.dtype in ("object", "string"):
-            try:
-                res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
-            except Exception:
-                pass
 
     if res.dtype.kind == "M":  # any datetime64
         # GDAL only supports ms precision, convert outputs to match.
@@ -73,6 +69,7 @@
             res = res.dt.as_unit("ms")
         else:
             res = res.dt.round(freq="ms")
+
     return res
@@ -267,11 +264,10 @@ def read_dataframe(
     read_func = read_arrow if use_arrow else read
     gdal_force_2d = False if use_arrow else force_2d
-    if not use_arrow:
-        # For arrow, datetimes are read as is.
-        # For numpy IO, datetimes are read as string values to preserve timezone info
-        # as numpy does not directly support timezones.
-        kwargs["datetime_as_string"] = True
+
+    # Always read datetimes as string values to preserve (mixed) timezone info,
+    # as numpy does not directly support timezones and arrow datetime columns
+    # don't support mixed timezones.
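+    # For example, a column holding both "2023-01-01T11:00:00+01:00" and
+    # "2023-06-01T10:00:00+05:00" has no single common offset, so it can only
+    # round-trip losslessly as strings parsed to object-dtype Timestamps.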
    result = read_func(
         path_or_buffer,
         layer=layer,
@@ -288,6 +284,7 @@
         sql=sql,
         sql_dialect=sql_dialect,
         return_fids=fid_as_index,
+        datetime_as_string=True,
         **kwargs,
     )
@@ -330,6 +327,11 @@
     del table
 
+    # convert datetime columns that were read as string to datetime
+    for dtype, column in zip(meta["dtypes"], meta["fields"]):
+        if dtype is not None and dtype.startswith("datetime"):
+            df[column] = _try_parse_datetime(df[column])
+
     if fid_as_index:
         df = df.set_index(meta["fid_column"])
         df.index.names = ["fid"]
@@ -619,8 +621,33 @@ def write_dataframe(
         df = pd.DataFrame(df, copy=False)
         df[geometry_column] = geometry
 
+    # Convert all datetime columns to isoformat strings, to avoid losing mixed
+    # timezone information.
+    datetime_cols = []
+    for name, dtype in df.dtypes.items():
+        col = df[name]
+        if dtype == "object":
+            # When all non-NA values are Timestamps, treat as datetime column
+            col_na = df[col.notna()][name]
+            if len(col_na) and all(
+                isinstance(x, (pd.Timestamp, datetime)) for x in col_na
+            ):
+                df[name] = col.astype("string")
+                datetime_cols.append(name)
+        elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC":
+            # When it is a datetime column with a timezone other than UTC, it
+            # needs to be converted to string, otherwise the timezone info is lost.
+            df[name] = col.astype("string")
+            datetime_cols.append(name)
+
     table = pa.Table.from_pandas(df, preserve_index=False)
 
+    # Add metadata to datetime columns so GDAL knows they are datetimes.
+    for datetime_col in datetime_cols:
+        table = _add_column_metadata(
+            table, column_metadata={datetime_col: {"GDAL:OGR:type": "DateTime"}}
+        )
+
     # Null arrow columns are not supported by GDAL, so convert to string
     for field_index, field in enumerate(table.schema):
         if field.type == pa.null():
@@ -678,6 +705,8 @@
     gdal_tz_offsets = {}
     for name in fields:
         col = df[name]
+        values = None
+
         if isinstance(col.dtype, pd.DatetimeTZDtype):
             # Deal with datetimes with timezones by passing down timezone separately
             # pass down naive datetime
@@ -692,8 +721,24 @@
             # Convert each row offset to a signed multiple of 15m and add to GMT value
             gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
             gdal_tz_offsets[name] = gdal_offset_representation.values
-        else:
+
+        elif col.dtype == "object":
+            # Column of Timestamp/datetime objects; split into naive datetime and tz.
+            col_na = df[col.notna()][name]
+            if len(col_na) and all(
+                isinstance(x, (pd.Timestamp, datetime)) for x in col_na
+            ):
+                tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset())
+                gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100
+                gdal_tz_offsets[name] = gdal_offset_repr.values
+                naive = col.apply(
+                    lambda x: None if pd.isna(x) else x.replace(tzinfo=None)
+                )
+                values = naive.values
+
+        if values is None:
             values = col.values
+
         if isinstance(values, pd.api.extensions.ExtensionArray):
             from pandas.arrays import BooleanArray, FloatingArray, IntegerArray
@@ -729,3 +774,48 @@
         gdal_tz_offsets=gdal_tz_offsets,
         **kwargs,
     )
+
+
+def _add_column_metadata(table, column_metadata: dict = {}):
+    """Add or update column-level metadata to an arrow table.
+
+    Parameters
+    ----------
+    table : pyarrow.Table
+        The table to add the column metadata to.
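+        The input table itself is not modified; a new table is returned.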
+    column_metadata : dict
+        A dictionary with column metadata in the form
+        {
+            "column_1": {"some": "data"},
+            "column_2": {"more": "stuff"},
+        }
+
+    Returns
+    -------
+    pyarrow.Table
+        Table with the updated column metadata.
+    """
+    import pyarrow as pa
+
+    if not column_metadata:
+        return table
+
+    # Create updated column fields with new metadata
+    fields = []
+    for col in table.schema.names:
+        if col in column_metadata:
+            # Add/update column metadata
+            metadata = table.field(col).metadata or {}
+            for key, value in column_metadata[col].items():
+                metadata[key] = value
+            # Update field with updated metadata
+            fields.append(table.field(col).with_metadata(metadata))
+        else:
+            fields.append(table.field(col))
+
+    # Create new schema with the updated field metadata
+    schema = pa.schema(fields, metadata=table.schema.metadata)
+
+    # Build new table with updated schema (shouldn't copy data)
+    table = table.cast(schema)
+
+    return table
diff --git a/pyogrio/raw.py b/pyogrio/raw.py
index 0f0c3063..09bd5aa2 100644
--- a/pyogrio/raw.py
+++ b/pyogrio/raw.py
@@ -233,6 +233,7 @@ def read_arrow(
     sql=None,
     sql_dialect=None,
     return_fids=False,
+    datetime_as_string=False,
     **kwargs,
 ):
     """Read OGR data source into a pyarrow Table.
@@ -303,6 +304,7 @@
         skip_features=gdal_skip_features,
         batch_size=batch_size,
         use_pyarrow=True,
+        datetime_as_string=datetime_as_string,
         **kwargs,
     ) as source:
         meta, reader = source
@@ -358,6 +360,7 @@ def open_arrow(
     return_fids=False,
     batch_size=65_536,
     use_pyarrow=False,
+    datetime_as_string=False,
     **kwargs,
 ):
     """Open OGR data source as a stream of Arrow record batches.
@@ -386,6 +389,9 @@
         ArrowStream object. In the default case, this stream object needs to be
         passed to another library supporting the Arrow PyCapsule Protocol to
         consume the stream of data.
+    datetime_as_string : bool, optional (default: False)
+        If True, fields detected as datetime by GDAL are returned as strings,
+        as the arrow datetime column type doesn't support e.g. mixed timezones.
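+        A minimal sketch of consuming such a stream (the file name is just a
+        placeholder here)::
+
+            with open_arrow("test.gpkg", datetime_as_string=True) as source:
+                meta, reader = source
+                # datetime columns can be identified via meta["dtypes"] and
+                # parsed from the returned ISO8601 strings afterwards.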
 Examples
 --------
@@ -423,6 +429,7 @@
         Meta is: {
             "crs": "<crs>",
             "fields": <ndarray of field names>,
+            "dtypes": <ndarray of field dtypes>,
             "encoding": "<encoding>",
             "geometry_type": "<geometry type>",
             "geometry_name": "<name of geometry column>",
@@ -453,6 +460,7 @@
         dataset_kwargs=dataset_kwargs,
         batch_size=batch_size,
         use_pyarrow=use_pyarrow,
+        datetime_as_string=datetime_as_string,
     )
diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 8eb7d6bc..88f39d1c 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -48,6 +48,7 @@
     import geopandas as gp
     import pandas as pd
     from geopandas.array import from_wkt
+    from pandas.api.types import is_datetime64_dtype, is_object_dtype
     import shapely  # if geopandas is present, shapely is expected to be present
     from shapely.geometry import Point
@@ -333,77 +334,254 @@ def test_read_datetime(datetime_file, use_arrow):
     assert df.col.dtype.name == "datetime64[ns]"
 
 
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.requires_arrow_write_api
+def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow):
+    """Test writing/reading a column with naive datetimes (no timezone information)."""
+    dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00", None]
+    if PANDAS_GE_20:
+        dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
+    else:
+        dates = pd.to_datetime(dates_raw)
+    df = gp.GeoDataFrame(
+        {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
+    )
+
+    fpath = tmp_path / f"test{ext}"
+    write_dataframe(df, fpath, use_arrow=use_arrow)
+    result = read_dataframe(fpath, use_arrow=use_arrow)
+
+    if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0):
+        # With GDAL < 3.11 with arrow, columns with naive datetimes are written
+        # correctly, but when read they are wrongly interpreted as being in UTC.
+        # The reason is complicated, but more info can be found e.g. here:
+        # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807
+        assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC"))
+        pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow")
+
+    assert is_datetime64_dtype(result.dates.dtype)
+    assert_geodataframe_equal(result, df)
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
 @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ")
 @pytest.mark.requires_arrow_write_api
-def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow):
-    df = read_dataframe(datetime_tz_file)
-    # Make the index non-consecutive to test this case as well. Added for issue
-    # https://github.com/geopandas/pyogrio/issues/324
-    df = df.set_index(np.array([0, 2]))
-    raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"]
+def test_write_read_datetime_tz(tmp_path, ext, use_arrow):
+    """Write and read a file where all datetimes have an equal timezone offset.
+
+    The dates should be read back as a pandas datetime64 dtype column.
+    """
+    if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"):
+        # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime
+        # as well as retaining the timezone.
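+        # (i.e. the offset was effectively applied twice).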
+        # This was fixed in https://github.com/OSGeo/gdal/pull/11049
+        pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow")
+
+    dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", None]
     if PANDAS_GE_20:
-        expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms")
+        dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
     else:
-        expected = pd.to_datetime(raw_expected)
-    expected = pd.Series(expected, name="datetime_col")
-    assert_series_equal(df.datetime_col, expected, check_index=False)
-    # test write and read round trips
-    fpath = tmp_path / "test.gpkg"
+        dates = pd.to_datetime(dates_raw)
+
+    # Make the index non-consecutive to test this case as well. Added for issue
+    # https://github.com/geopandas/pyogrio/issues/324
+    df = gp.GeoDataFrame(
+        {"dates": dates, "geometry": [Point(1, 1)] * 3},
+        index=[0, 2, 3],
+        crs="EPSG:4326",
+    )
+    assert isinstance(df.dates.dtype, pd.DatetimeTZDtype)
+
+    fpath = tmp_path / f"test{ext}"
     write_dataframe(df, fpath, use_arrow=use_arrow)
-    df_read = read_dataframe(fpath, use_arrow=use_arrow)
-    if use_arrow:
-        # with Arrow, the datetimes are always read as UTC
-        expected = expected.dt.tz_convert("UTC")
-    assert_series_equal(df_read.datetime_col, expected)
+    result = read_dataframe(fpath, use_arrow=use_arrow)
+
+    # With some older versions, the offset is represented slightly differently
+    if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"):
+        result.dates = result.dates.astype(df.dates.dtype)
+
+    if use_arrow and ext in (".fgb", ".gpkg") and __gdal_version__ < (3, 11, 0):
+        # With GDAL < 3.11 with arrow, datetime columns are written as string type
+        df_exp = df.copy()
+        df_exp.dates = df_exp[df_exp.dates.notna()].dates.astype(str)
+        assert_series_equal(result.dates, df_exp.dates, check_index=False)
+        pytest.xfail("datetime columns written as string with GDAL < 3.11 via arrow")
+
+    assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
+    assert_series_equal(result.dates, df.dates, check_index=False)
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
 @pytest.mark.filterwarnings(
     "ignore: Non-conformant content for record 1 in column dates"
 )
 @pytest.mark.requires_arrow_write_api
-def test_write_datetime_mixed_offset(tmp_path, use_arrow):
+def test_write_read_datetime_tz_localized_mixed_offset(tmp_path, ext, use_arrow):
+    """Test with localized dates spanning different summer/winter timezone offsets."""
     # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10)
-    dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"]
-    naive_col = pd.Series(pd.to_datetime(dates), name="dates")
-    localised_col = naive_col.dt.tz_localize("Australia/Sydney")
-    utc_col = localised_col.dt.tz_convert("UTC")
-    if PANDAS_GE_20:
-        utc_col = utc_col.dt.as_unit("ms")
+    dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", None]
+    dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates")
+    dates_local = dates_naive.dt.tz_localize("Australia/Sydney")
+    dates_local_offsets_str = dates_local.astype("string").astype("O")
+    dates_exp = dates_local_offsets_str.apply(
+        lambda x: pd.Timestamp(x) if pd.notna(x) else None
+    )
 
     df = gp.GeoDataFrame(
-        {"dates": localised_col, "geometry": [Point(1, 1), Point(1, 1)]},
-        crs="EPSG:4326",
+        {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
     )
-    fpath = tmp_path / "test.gpkg"
+    fpath = tmp_path / f"test{ext}"
     write_dataframe(df, fpath, use_arrow=use_arrow)
     result = read_dataframe(fpath, use_arrow=use_arrow)
+
+    if use_arrow and __gdal_version__ < (3, 11, 0):
+        if ext in (".geojson", ".geojsonl"):
+            # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC
+            # when read, as the arrow datetime column type does not support mixed tz.
+            dates_utc = dates_local.dt.tz_convert("UTC")
+            if PANDAS_GE_20:
+                dates_utc = dates_utc.dt.as_unit("ms")
+            assert_series_equal(result.dates, dates_utc)
+            pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow")
+        elif ext in (".gpkg", ".fgb"):
+            # With GDAL < 3.11 with arrow, datetime columns are written as string type
+            assert_series_equal(result.dates, dates_local_offsets_str)
+            pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow")
+
     # GDAL tz only encodes offsets, not timezones
-    # check multiple offsets are read as utc datetime instead of string values
-    assert_series_equal(result["dates"], utc_col)
+    assert is_object_dtype(result.dates.dtype)
+    assert_series_equal(result.dates, dates_exp)
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.filterwarnings(
+    "ignore: Non-conformant content for record 1 in column dates"
+)
+@pytest.mark.requires_arrow_write_api
+def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, use_arrow):
+    """Test dates with mixed timezone offsets."""
+    # The pandas datetime64 column type doesn't support mixed timezone offsets,
+    # so a list of pandas.Timestamp objects is needed instead.
+    dates = [
+        pd.Timestamp("2023-01-01 11:00:01.111+01:00"),
+        pd.Timestamp("2023-06-01 10:00:01.111+05:00"),
+        None,
+    ]
+
+    df = gp.GeoDataFrame(
+        {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
+    )
+    fpath = tmp_path / f"test{ext}"
+    write_dataframe(df, fpath, use_arrow=use_arrow)
+    result = read_dataframe(fpath, use_arrow=use_arrow)
+
+    if use_arrow and __gdal_version__ < (3, 11, 0):
+        if ext in (".geojson", ".geojsonl"):
+            # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC
+            # when read, as the arrow datetime column type does not support mixed tz.
+            df_exp = df.copy()
+            df_exp.dates = pd.to_datetime(dates, utc=True)
+            if PANDAS_GE_20:
+                df_exp.dates = df_exp.dates.dt.as_unit("ms")
+            assert_geodataframe_equal(result, df_exp)
+            pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow")
+        elif ext in (".gpkg", ".fgb"):
+            # With arrow and GDAL < 3.11, mixed timezone datetimes are written as string
+            # type columns, so no proper roundtrip is possible.
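+            # Reading such a file returns the ISO8601 strings, not Timestamps.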
+            df_exp = df.copy()
+            df_exp.dates = df_exp.dates.astype("string").astype("O")
+            assert_geodataframe_equal(result, df_exp)
+            pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow")
+
+    assert is_object_dtype(result.dates.dtype)
+    assert_geodataframe_equal(result, df)
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.parametrize(
+    "dates_raw",
+    [
+        (
+            pd.Timestamp("2020-01-01T09:00:00.123-05:00"),
+            pd.Timestamp("2020-01-01T10:00:00-05:00"),
+            None,
+        ),
+        (
+            datetime.fromisoformat("2020-01-01T09:00:00.123-05:00"),
+            datetime.fromisoformat("2020-01-01T10:00:00-05:00"),
+            None,
+        ),
+    ],
+)
 @pytest.mark.filterwarnings(
     "ignore: Non-conformant content for record 1 in column dates"
 )
 @pytest.mark.requires_arrow_write_api
-def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow):
-    dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT]
+def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow):
+    """Datetime objects with equal offsets are read as datetime64."""
+    if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"):
+        # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime
+        # as well as retaining the timezone.
+        # This was fixed in https://github.com/OSGeo/gdal/pull/11049
+        pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow")
+
+    dates = pd.Series(dates_raw, dtype="O")
+    df = gp.GeoDataFrame(
+        {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
+    )
+
+    if PANDAS_GE_20:
+        dates_exp = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
+    else:
+        dates_exp = pd.to_datetime(dates_raw)
+    exp_df = df.copy()
+    exp_df.dates = pd.Series(dates_exp, name="dates")
+
+    fpath = tmp_path / f"test{ext}"
+    write_dataframe(df, fpath, use_arrow=use_arrow)
+    result = read_dataframe(fpath, use_arrow=use_arrow)
+
+    # With some older versions, the offset is represented slightly differently
+    if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"):
+        result.dates = result.dates.astype(exp_df.dates.dtype)
+
+    if use_arrow and __gdal_version__ < (3, 11, 0):
+        if ext in (".fgb", ".gpkg"):
+            # With GDAL < 3.11 with arrow, datetime columns are written as string type
+            exp2_df = exp_df.copy()
+            exp2_df.dates = exp2_df.dates.astype("string").astype("O")
+            assert_geodataframe_equal(result, exp2_df)
+            pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow")
+
+    assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
+    assert_geodataframe_equal(result, exp_df)
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.requires_arrow_write_api
+def test_write_read_datetime_utc(tmp_path, ext, use_arrow):
+    """Test writing/reading a column with UTC datetimes."""
+    dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z", None]
     if PANDAS_GE_20:
         dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
     else:
         dates = pd.to_datetime(dates_raw)
     df = gp.GeoDataFrame(
-        {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]},
-        crs="EPSG:4326",
+        {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
     )
-    fpath = tmp_path / "test.gpkg"
+    assert df.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]")
+
+    fpath = tmp_path / f"test{ext}"
     write_dataframe(df, fpath, use_arrow=use_arrow)
     result = read_dataframe(fpath, use_arrow=use_arrow)
-    if use_arrow:
-        # with Arrow, the datetimes are always read as UTC
df["dates"] = df["dates"].dt.tz_convert("UTC") - assert_geodataframe_equal(df, result) + + if use_arrow and ext == ".fgb" and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, timezone information is dropped when reading .fgb + assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) + pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") + + assert result.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]") + assert_geodataframe_equal(result, df) def test_read_null_values(tmp_path, use_arrow):