@@ -337,6 +337,34 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
     return var


+def _validate_datatypes_for_zarr_append(vname, existing_var, new_var):
+    """If variable exists in the store, confirm dtype of the data to append is compatible with
+    existing dtype.
+    """
+    if (
+        np.issubdtype(new_var.dtype, np.number)
+        or np.issubdtype(new_var.dtype, np.datetime64)
+        or np.issubdtype(new_var.dtype, np.bool_)
+        or new_var.dtype == object
+    ):
+        # We can skip dtype equality checks under two conditions: (1) if the var to append is
+        # new to the dataset, because in this case there is no existing var to compare it to;
+        # or (2) if var to append's dtype is known to be easy-to-append, because in this case
+        # we can be confident appending won't cause problems. Examples of dtypes which are not
+        # easy-to-append include length-specified strings of type `|S*` or `<U*` (where * is a
+        # positive integer character length). For these dtypes, appending dissimilar lengths
+        # can result in truncation of appended data. Therefore, variables which already exist
+        # in the dataset, and with dtypes which are not known to be easy-to-append, necessitate
+        # exact dtype equality, as checked below.
+        pass
+    elif not new_var.dtype == existing_var.dtype:
+        raise ValueError(
+            f"Mismatched dtypes for variable {vname} between Zarr store on disk "
+            f"and dataset to append. Store has dtype {existing_var.dtype} but "
+            f"dataset to append has dtype {new_var.dtype}."
+        )
+
+
 def _validate_and_transpose_existing_dims(
     var_name, new_var, existing_var, region, append_dim
 ):
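The long comment above is the core rationale: fixed-width string dtypes (`|S*`, `<U*`) silently truncate when longer values are appended. A minimal NumPy sketch of the hazard this check closes off (array contents are illustrative, not taken from the change itself):

```python
import numpy as np

store = np.array(["foo", "bar"], dtype="<U3")  # store holds 3-character strings
new = np.array(["longer"])                     # incoming data has dtype <U6

# Casting to the store's fixed-width dtype silently drops characters:
appended = np.concatenate([store, new.astype(store.dtype)])
print(appended)  # ['foo' 'bar' 'lon'] -- data loss, and no error raised

# _validate_datatypes_for_zarr_append rejects the mismatch up front instead:
# ValueError: Mismatched dtypes for variable ... Store has dtype <U3 but
# dataset to append has dtype <U6.
```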
@@ -625,26 +653,58 @@ def store(
         import zarr

         existing_keys = tuple(self.zarr_group.array_keys())
+
+        if self._mode == "r+":
+            new_names = [k for k in variables if k not in existing_keys]
+            if new_names:
+                raise ValueError(
+                    f"dataset contains non-pre-existing variables {new_names}, "
+                    "which is not allowed in ``xarray.Dataset.to_zarr()`` with "
+                    "``mode='r+'``. To allow writing new variables, set ``mode='a'``."
+                )
+
+        if self._append_dim is not None and self._append_dim not in existing_keys:
+            # For dimensions without coordinate values, we must parse
+            # the _ARRAY_DIMENSIONS attribute on *all* arrays to check if it
+            # is a valid existing dimension name.
+            # TODO: This `get_dimensions` method also does shape checking
+            # which isn't strictly necessary for our check.
+            existing_dims = self.get_dimensions()
+            if self._append_dim not in existing_dims:
+                raise ValueError(
+                    f"append_dim={self._append_dim!r} does not match any existing "
+                    f"dataset dimensions {existing_dims}"
+                )
+
         existing_variable_names = {
             vn for vn in variables if _encode_variable_name(vn) in existing_keys
         }
-        new_variables = set(variables) - existing_variable_names
-        variables_without_encoding = {vn: variables[vn] for vn in new_variables}
+        new_variable_names = set(variables) - existing_variable_names
         variables_encoded, attributes = self.encode(
-            variables_without_encoding, attributes
+            {vn: variables[vn] for vn in new_variable_names}, attributes
         )

         if existing_variable_names:
-            # Decode variables directly, without going via xarray.Dataset to
-            # avoid needing to load index variables into memory.
-            # TODO: consider making loading indexes lazy again?
+            # We make sure that values to be appended are encoded *exactly*
+            # as the current values in the store.
+            # To do so, we decode variables directly to access the proper encoding,
+            # without going via xarray.Dataset to avoid needing to load
+            # index variables into memory.
             existing_vars, _, _ = conventions.decode_cf_variables(
-                {k: self.open_store_variable(name=k) for k in existing_variable_names},
-                self.get_attrs(),
+                variables={
+                    k: self.open_store_variable(name=k) for k in existing_variable_names
+                },
+                # attributes = {} since we don't care about parsing the global
+                # "coordinates" attribute
+                attributes={},
             )
             # Modified variables must use the same encoding as the store.
             vars_with_encoding = {}
             for vn in existing_variable_names:
+                if self._mode in ["a", "a-", "r+"]:
+                    _validate_datatypes_for_zarr_append(
+                        vn, existing_vars[vn], variables[vn]
+                    )
                 vars_with_encoding[vn] = variables[vn].copy(deep=False)
                 vars_with_encoding[vn].encoding = existing_vars[vn].encoding
             vars_with_encoding, _ = self.encode(vars_with_encoding, {})
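The two guards added at the top of `store` fail fast with actionable errors before any bytes are written, and the reworked append path re-reads each existing variable's on-disk encoding so appended values are encoded exactly like those already in the store. A sketch of the guards as seen from the public `to_zarr` API (store paths and values are illustrative):

```python
import xarray as xr

ds = xr.Dataset({"a": ("x", [1, 2, 3])})
ds.to_zarr("example.zarr", mode="w")

# mode="r+" may only modify pre-existing arrays; a brand-new variable is rejected:
ds_new = xr.Dataset({"b": ("x", [4, 5, 6])})
# ds_new.to_zarr("example.zarr", mode="r+")
#   -> ValueError: dataset contains non-pre-existing variables ['b'] ...
ds_new.to_zarr("example.zarr", mode="a")  # mode="a" adds "b" alongside "a"

# Appending along a dimension the store has never seen also fails early:
# ds.to_zarr("example.zarr", mode="a", append_dim="t")
#   -> ValueError: append_dim='t' does not match any existing dataset dimensions
```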
@@ -709,7 +769,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):

         for vn, v in variables.items():
             name = _encode_variable_name(vn)
-            check = vn in check_encoding_set
             attrs = v.attrs.copy()
             dims = v.dims
             dtype = v.dtype
@@ -725,7 +784,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
             # https://github.com/pydata/xarray/issues/8371 for details.
             encoding = extract_zarr_variable_encoding(
                 v,
-                raise_on_invalid=check,
+                raise_on_invalid=vn in check_encoding_set,
                 name=vn,
                 safe_chunks=self._safe_chunks,
             )
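Inlining the one-use `check` temporary (removed in the previous hunk) keeps the condition next to its only consumer. For context, `check_encoding_set` holds the variables whose encoding the caller supplied explicitly, so only those raise on invalid keys; a sketch of the user-visible difference (error text paraphrased):

```python
import xarray as xr

ds = xr.Dataset({"a": ("x", [1, 2, 3])})

# "a" lands in check_encoding_set because the user passed encoding for it,
# so an unsupported key raises instead of being silently ignored:
ds.to_zarr("enc.zarr", mode="w", encoding={"a": {"chunks": (3,)}})  # ok
# ds.to_zarr("enc.zarr", mode="w", encoding={"a": {"bogus_key": 1}})
#   -> ValueError: unexpected encoding parameters for zarr backend ...
```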
@@ -828,7 +887,7 @@ def _auto_detect_regions(self, ds, region):
             assert variable.dims == (dim,)
             index = pd.Index(variable.data)
             idxs = index.get_indexer(ds[dim].data)
-            if any(idxs == -1):
+            if (idxs == -1).any():
                 raise KeyError(
                     f"Not all values of coordinate '{dim}' in the new array were"
                     " found in the original store. Writing to a zarr region slice"