@@ -261,7 +261,7 @@ def _is_time_like(units):
261
261
262
262
263
263
def _check_fill_values (attrs , name , dtype ):
264
- """ " Check _FillValue and missing_value if available.
264
+ """Check _FillValue and missing_value if available.
265
265
266
266
Return dictionary with raw fill values and set with encoded fill values.
267
267
@@ -298,18 +298,87 @@ def _check_fill_values(attrs, name, dtype):
298
298
return raw_fill_dict , encoded_fill_values
299
299
300
300
301
def _convert_unsigned_fill_value(
    name: T_Name,
    data: Any,
    unsigned: str,
    raw_fill_value: Any,
    encoded_fill_values: set,
) -> Any:
    """Apply an ``_Unsigned`` attribute to integer data and its fill value.

    Signed integer data flagged ``"true"`` is wrapped (via
    ``lazy_elemwise_func``) in a cast to the unsigned dtype of the same item
    size; unsigned integer data flagged ``"false"`` is wrapped in a cast to
    the signed dtype of the same item size. Any other combination of integer
    kind and flag leaves ``data`` unchanged.

    Parameters
    ----------
    name
        Variable name, only used in the warning message.
    data
        Array-like object exposing ``dtype``.
    unsigned
        Value of the ``_Unsigned`` attribute; compared against ``"true"`` and
        ``"false"``.
    raw_fill_value
        Fill value as stored alongside the data, or ``None`` when absent.
    encoded_fill_values
        Set of fill values used for masking. **Mutated in place**: when a
        conversion takes place and ``raw_fill_value`` is not ``None``, that
        entry is removed and its counterpart in the converted dtype is added.

    Returns
    -------
    The (possibly wrapped) data.

    Warns
    -----
    SerializationWarning
        If ``data`` is not of integer kind; the attribute is then ignored.

    Raises
    ------
    KeyError
        If a conversion applies and ``raw_fill_value`` is not ``None`` but is
        missing from ``encoded_fill_values`` (``set.remove`` semantics).
    """
    source_kind = data.dtype.kind

    # _Unsigned is only meaningful for integer data.
    if source_kind not in ("i", "u"):
        warnings.warn(
            f"variable {name!r} has _Unsigned attribute but is not "
            "of integer type. Ignoring attribute.",
            SerializationWarning,
            stacklevel=3,
        )
        return data

    if source_kind == "i" and unsigned == "true":
        target_dtype = np.dtype(f"u{data.dtype.itemsize}")
        caster = partial(np.asarray, dtype=target_dtype)
        if raw_fill_value is not None:
            stored_fill = np.array(raw_fill_value, dtype=data.dtype)
            encoded_fill_values.remove(raw_fill_value)
            # view() reinterprets the stored value in the unsigned dtype;
            # it is used here to prevent an OverflowError
            encoded_fill_values.add(stored_fill.view(target_dtype).item())
        return lazy_elemwise_func(data, caster, target_dtype)

    if source_kind == "u" and unsigned == "false":
        target_dtype = np.dtype(f"i{data.dtype.itemsize}")
        caster = partial(np.asarray, dtype=target_dtype)
        data = lazy_elemwise_func(data, caster, target_dtype)
        if raw_fill_value is not None:
            converted_fill = target_dtype.type(raw_fill_value)
            encoded_fill_values.remove(raw_fill_value)
            encoded_fill_values.add(converted_fill)

    return data
335
+
336
+
337
def _encode_unsigned_fill_value(
    name: T_Name,
    fill_value: Any,
    encoded_dtype: np.dtype,
) -> Any:
    """Express a fill value in the on-disk integer dtype of the variable.

    The value is first converted directly with ``encoded_dtype.type``. If
    that raises ``OverflowError`` a ``SerializationWarning`` is issued and
    the value is instead interpreted in the integer dtype of the opposite
    signedness (same item size) and re-read through a view as
    ``encoded_dtype``.

    Parameters
    ----------
    name
        Variable name, only used in the warning message.
    fill_value
        Fill value to encode; objects exposing ``item()`` are unwrapped
        through it first.
    encoded_dtype
        Integer dtype the variable is stored with.

    Returns
    -------
    ``encoded_dtype.type(fill_value)`` when the value fits, otherwise a
    native Python number obtained from the reinterpreted value.
    """
    try:
        if hasattr(fill_value, "item"):
            # unwrap numpy scalars to native Python numbers so that overflow
            # is detected; otherwise numpy unsigned ints would silently cast
            # to their signed counterpart
            fill_value = fill_value.item()
        # succeeds when the provided fill value fits in the on-disk type
        return encoded_dtype.type(fill_value)
    except OverflowError:
        if encoded_dtype.kind == "i":
            encoded_kind_str, in_memory_kind = "signed", "u"
        else:
            encoded_kind_str, in_memory_kind = "unsigned", "i"
        warnings.warn(
            f"variable {name!r} will be stored as {encoded_kind_str} integers "
            "but _FillValue attribute can't be represented as a "
            f"{encoded_kind_str} integer.",
            SerializationWarning,
            stacklevel=3,
        )
        # the user probably provided the fill in the in-memory dtype;
        # convert it to the on-disk type to match the CF standard
        in_memory_dtype = np.dtype(f"{in_memory_kind}{encoded_dtype.itemsize}")
        # view() is used here to prevent another OverflowError
        in_memory_fill = np.array(fill_value, dtype=in_memory_dtype)
        return in_memory_fill.view(encoded_dtype).item()
365
+
366
+
301
367
class CFMaskCoder (VariableCoder ):
302
368
"""Mask or unmask fill values according to CF conventions."""
303
369
304
370
def encode (self , variable : Variable , name : T_Name = None ):
305
371
dims , data , attrs , encoding = unpack_for_encoding (variable )
306
372
307
373
dtype = np .dtype (encoding .get ("dtype" , data .dtype ))
374
+ # from netCDF best practices
375
+ # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
376
+ # "_Unsigned = "true" to indicate that
377
+ # integer data should be treated as unsigned"
378
+ has_unsigned = encoding .get ("_Unsigned" ) is not None
308
379
fv = encoding .get ("_FillValue" )
309
380
mv = encoding .get ("missing_value" )
310
- # to properly handle _FillValue/missing_value below [a], [b]
311
- # we need to check if unsigned data is written as signed data
312
- unsigned = encoding .get ("_Unsigned" ) is not None
381
+ fill_value = None
313
382
314
383
fv_exists = fv is not None
315
384
mv_exists = mv is not None
@@ -324,23 +393,28 @@ def encode(self, variable: Variable, name: T_Name = None):
324
393
325
394
if fv_exists :
326
395
# Ensure _FillValue is cast to same dtype as data's
327
- # [a] need to skip this if _Unsigned is available
328
- if not unsigned :
329
- encoding ["_FillValue" ] = dtype .type (fv )
396
+ encoding ["_FillValue" ] = (
397
+ _encode_unsigned_fill_value (name , fv , dtype )
398
+ if has_unsigned
399
+ else dtype .type (fv )
400
+ )
330
401
fill_value = pop_to (encoding , attrs , "_FillValue" , name = name )
331
402
332
403
if mv_exists :
333
404
# try to use _FillValue, if it exists to align both values
334
405
# or use missing_value and ensure it's cast to same dtype as data's
335
- # [b] need to provide mv verbatim if _Unsigned is available
336
406
encoding ["missing_value" ] = attrs .get (
337
407
"_FillValue" ,
338
- (dtype .type (mv ) if not unsigned else mv ),
408
+ (
409
+ _encode_unsigned_fill_value (name , mv , dtype )
410
+ if has_unsigned
411
+ else dtype .type (mv )
412
+ ),
339
413
)
340
414
fill_value = pop_to (encoding , attrs , "missing_value" , name = name )
341
415
342
416
# apply fillna
343
- if not pd .isnull (fill_value ):
417
+ if fill_value is not None and not pd .isnull (fill_value ):
344
418
# special case DateTime to properly handle NaT
345
419
if _is_time_like (attrs .get ("units" )) and data .dtype .kind in "iu" :
346
420
data = duck_array_ops .where (
@@ -349,46 +423,63 @@ def encode(self, variable: Variable, name: T_Name = None):
349
423
else :
350
424
data = duck_array_ops .fillna (data , fill_value )
351
425
426
+ if fill_value is not None and has_unsigned :
427
+ pop_to (encoding , attrs , "_Unsigned" )
428
+ # XXX: Is this actually needed? Doesn't the backend handle this?
429
+ data = duck_array_ops .astype (duck_array_ops .around (data ), dtype )
430
+ attrs ["_FillValue" ] = fill_value
431
+
352
432
return Variable (dims , data , attrs , encoding , fastpath = True )
353
433
354
434
def decode(self, variable: Variable, name: T_Name = None):
    """Mask encoded fill values and honour ``_Unsigned`` while decoding.

    The variable is returned unchanged when it carries neither an
    ``_Unsigned`` attribute nor any fill value reported by
    ``_check_fill_values``. Otherwise:

    1. the raw fill attributes are recorded in ``encoding``;
    2. if ``_Unsigned`` is present it is moved to ``encoding`` and the data
       and encoded fill values are converted by
       ``_convert_unsigned_fill_value``;
    3. if there are encoded fill values, the data is wrapped (via
       ``lazy_elemwise_func``) in ``_apply_mask`` with a dtype and decoded
       fill value chosen from the data dtype and attributes.

    Parameters
    ----------
    variable
        Variable to decode.
    name
        Variable name, forwarded to helpers for use in messages.

    Returns
    -------
    Either ``variable`` itself (nothing to do) or a new ``Variable`` built
    from the updated data, attributes and encoding.
    """
    raw_fill_dict, encoded_fill_values = _check_fill_values(
        variable.attrs, name, variable.dtype
    )
    if "_Unsigned" not in variable.attrs and not raw_fill_dict:
        return variable

    dims, data, attrs, encoding = unpack_for_decoding(variable)

    # Even if _Unsigned is used, retain the on-disk _FillValue
    for attr, value in raw_fill_dict.items():
        safe_setitem(encoding, attr, value, name=name)

    if "_Unsigned" in attrs:
        unsigned = pop_to(attrs, encoding, "_Unsigned")
        data = _convert_unsigned_fill_value(
            name,
            data,
            unsigned,
            raw_fill_dict.get("_FillValue"),
            encoded_fill_values,
        )

    if encoded_fill_values:
        dtype: np.typing.DTypeLike
        decoded_fill_value: Any
        if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
            # special case DateTime to properly handle NaT
            dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
        elif "scale_factor" not in attrs and "add_offset" not in attrs:
            dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
        else:
            dtype, decoded_fill_value = (
                _choose_float_dtype(data.dtype, attrs),
                np.nan,
            )

        transform = partial(
            _apply_mask,
            encoded_fill_values=encoded_fill_values,
            decoded_fill_value=decoded_fill_value,
            dtype=dtype,
        )
        data = lazy_elemwise_func(data, transform, dtype)

    return Variable(dims, data, attrs, encoding, fastpath=True)
392
483
393
484
394
485
def _scale_offset_decoding (data , scale_factor , add_offset , dtype : np .typing .DTypeLike ):
@@ -506,74 +597,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
506
597
return variable
507
598
508
599
509
- class UnsignedIntegerCoder (VariableCoder ):
510
- def encode (self , variable : Variable , name : T_Name = None ) -> Variable :
511
- # from netCDF best practices
512
- # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
513
- # "_Unsigned = "true" to indicate that
514
- # integer data should be treated as unsigned"
515
- if variable .encoding .get ("_Unsigned" , "false" ) == "true" :
516
- dims , data , attrs , encoding = unpack_for_encoding (variable )
517
-
518
- pop_to (encoding , attrs , "_Unsigned" )
519
- # we need the on-disk type here
520
- # trying to get it from encoding, resort to an int with the same precision as data.dtype if not available
521
- signed_dtype = np .dtype (encoding .get ("dtype" , f"i{ data .dtype .itemsize } " ))
522
- if "_FillValue" in attrs :
523
- try :
524
- # user provided the on-disk signed fill
525
- new_fill = signed_dtype .type (attrs ["_FillValue" ])
526
- except OverflowError :
527
- # user provided the in-memory unsigned fill, convert to signed type
528
- unsigned_dtype = np .dtype (f"u{ signed_dtype .itemsize } " )
529
- # use view here to prevent OverflowError
530
- new_fill = (
531
- np .array (attrs ["_FillValue" ], dtype = unsigned_dtype )
532
- .view (signed_dtype )
533
- .item ()
534
- )
535
- attrs ["_FillValue" ] = new_fill
536
- data = duck_array_ops .astype (duck_array_ops .around (data ), signed_dtype )
537
-
538
- return Variable (dims , data , attrs , encoding , fastpath = True )
539
- else :
540
- return variable
541
-
542
- def decode (self , variable : Variable , name : T_Name = None ) -> Variable :
543
- if "_Unsigned" in variable .attrs :
544
- dims , data , attrs , encoding = unpack_for_decoding (variable )
545
- unsigned = pop_to (attrs , encoding , "_Unsigned" )
546
-
547
- if data .dtype .kind == "i" :
548
- if unsigned == "true" :
549
- unsigned_dtype = np .dtype (f"u{ data .dtype .itemsize } " )
550
- transform = partial (np .asarray , dtype = unsigned_dtype )
551
- if "_FillValue" in attrs :
552
- new_fill = np .array (attrs ["_FillValue" ], dtype = data .dtype )
553
- # use view here to prevent OverflowError
554
- attrs ["_FillValue" ] = new_fill .view (unsigned_dtype ).item ()
555
- data = lazy_elemwise_func (data , transform , unsigned_dtype )
556
- elif data .dtype .kind == "u" :
557
- if unsigned == "false" :
558
- signed_dtype = np .dtype (f"i{ data .dtype .itemsize } " )
559
- transform = partial (np .asarray , dtype = signed_dtype )
560
- data = lazy_elemwise_func (data , transform , signed_dtype )
561
- if "_FillValue" in attrs :
562
- new_fill = signed_dtype .type (attrs ["_FillValue" ])
563
- attrs ["_FillValue" ] = new_fill
564
- else :
565
- warnings .warn (
566
- f"variable { name !r} has _Unsigned attribute but is not "
567
- "of integer type. Ignoring attribute." ,
568
- SerializationWarning ,
569
- stacklevel = 3 ,
570
- )
571
-
572
- return Variable (dims , data , attrs , encoding , fastpath = True )
573
- else :
574
- return variable
575
-
576
-
577
600
class DefaultFillvalueCoder (VariableCoder ):
578
601
"""Encode default _FillValue if needed."""
579
602
0 commit comments