1
+ import functools
1
2
import numpy as np
2
3
import pandas as pd
3
4
import warnings
@@ -372,6 +373,73 @@ def pop_to(source, dest, key, default=None):
372
373
return value
373
374
374
375
376
+ def _var_as_tuple (var ):
377
+ return var .dims , var .values , var .attrs .copy (), var .encoding .copy ()
378
+
379
+
380
def maybe_encode_datetime(var):
    """Encode a datetime-valued variable through ``encode_cf_datetime``.

    A variable counts as datetime-valued when its dtype is a
    ``np.datetime64`` subtype, or when it has object dtype, is non-empty and
    its first element is a ``datetime`` instance.  Any other variable is
    returned unchanged.

    For a datetime-valued variable, ``'units'`` and ``'calendar'`` are popped
    from a copy of the encoding (``None`` when absent) and passed to
    ``encode_cf_datetime`` together with the values.  The units and calendar
    it returns are stored in the new variable's attributes.

    Parameters
    ----------
    var : Variable
        The variable to inspect.

    Returns
    -------
    out : Variable
        ``var`` itself when no encoding applies, otherwise a new variable
        built from the encoded values.

    Raises
    ------
    ValueError
        If the variable's attributes already contain ``'units'`` or
        ``'calendar'``, which would otherwise be overwritten.
    """
    is_datetime = np.issubdtype(var.dtype, np.datetime64)
    if not is_datetime and var.dtype.kind == 'O':
        candidates = var.values
        # An empty object array has no first element to inspect; treat it as
        # "not datetime" instead of failing with an IndexError.
        is_datetime = (candidates.size > 0
                       and isinstance(candidates.flat[0], datetime))
    if not is_datetime:
        return var

    dims, values, attrs, encoding = _var_as_tuple(var)
    if 'units' in attrs or 'calendar' in attrs:
        raise ValueError(
            "Failed hard to prevent overwriting 'units' or 'calendar'")

    (values, units, calendar) = encode_cf_datetime(
        values, encoding.pop('units', None), encoding.pop('calendar', None))
    attrs['units'] = units
    attrs['calendar'] = calendar
    return Variable(dims, values, attrs, encoding)
396
+
397
+
398
def maybe_encode_offset_and_scale(var, needs_copy=True):
    """Apply ``add_offset`` / ``scale_factor`` encoding to a variable.

    When either key is present in ``var.encoding`` the values are converted
    to float, the offset is subtracted and the result is divided by the scale
    factor.  Each value is obtained via ``pop_to(encoding, attrs, key)``
    (presumably moving the key from the copied encoding into the copied
    attrs -- confirm against ``pop_to``).

    Parameters
    ----------
    var : Variable
        The variable to encode.
    needs_copy : bool, optional
        Whether the values must be copied before they are modified in place.

    Returns
    -------
    out : Variable
        ``var`` itself when neither key is in the encoding, otherwise a new
        variable holding the transformed values.
    needs_copy : bool
        ``False`` once this function has converted the values, otherwise the
        value that was passed in.
    """
    if not any(k in var.encoding for k in ['add_offset', 'scale_factor']):
        return var, needs_copy

    dims, values, attrs, encoding = _var_as_tuple(var)
    if needs_copy:
        values = np.array(values, dtype=float, copy=True)
    else:
        # ``np.array(..., copy=False)`` raises on NumPy >= 2.0 whenever the
        # dtype conversion requires a copy; ``np.asarray`` copies only when
        # it has to, which is the behaviour intended here.
        values = np.asarray(values, dtype=float)
    needs_copy = False
    if 'add_offset' in encoding:
        values -= pop_to(encoding, attrs, 'add_offset')
    if 'scale_factor' in encoding:
        values /= pop_to(encoding, attrs, 'scale_factor')
    return Variable(dims, values, attrs, encoding), needs_copy
409
+
410
+
411
def maybe_encode_fill_value(var, needs_copy=True):
    """Replace missing values with the ``_FillValue`` from the encoding.

    Nothing happens unless ``'_FillValue'`` is a key of ``var.encoding``.
    When it is, the fill value is obtained with
    ``pop_to(encoding, attrs, '_FillValue')`` and, provided it is not itself
    null, every position that ``pd.isnull`` flags as missing is overwritten
    with it.  The values are copied first only if ``needs_copy`` is true and
    there is at least one missing entry.

    Parameters
    ----------
    var : Variable
        The variable to encode.
    needs_copy : bool, optional
        Whether the values must be copied before they are modified in place.

    Returns
    -------
    out : Variable
        ``var`` itself when the encoding has no ``'_FillValue'``, otherwise a
        new variable.
    needs_copy : bool
        ``False`` if this function copied the values, otherwise the value
        that was passed in.
    """
    if '_FillValue' not in var.encoding:
        return var, needs_copy

    dims, data, attrs, encoding = _var_as_tuple(var)
    fill_value = pop_to(encoding, attrs, '_FillValue')
    if not pd.isnull(fill_value):
        is_missing = pd.isnull(data)
        if is_missing.any():
            if needs_copy:
                # Do not write into an array the caller may still own.
                data = data.copy()
                needs_copy = False
            data[is_missing] = fill_value
    encoded = Variable(dims, data, attrs, encoding)
    return encoded, needs_copy
425
+
426
+
427
def maybe_encode_dtype(var, needs_copy=True):
    """Cast a variable's values to the dtype requested in its encoding.

    Nothing happens unless ``'dtype'`` is a key of ``var.encoding``.  When it
    is, the entry is popped from a copy of the encoding and, unless it is an
    object dtype:

    * for integer target dtypes the values are rounded first, so that the
      cast rounds to nearest instead of truncating;
    * for an ``'S1'`` target with non-``'S1'`` values, the values are passed
      through ``string_to_char`` and a ``'string<N>'`` dimension is appended,
      where ``N`` is the size of the resulting last axis;
    * finally the values are converted with ``np.asarray(values, dtype)``.

    Parameters
    ----------
    var : Variable
        The variable to encode.
    needs_copy : bool, optional
        If false, rounding is done in place on the existing values array.

    Returns
    -------
    out : Variable
        ``var`` itself when the encoding has no ``'dtype'``, otherwise a new
        variable.
    """
    if 'dtype' not in var.encoding:
        return var

    dims, values, attrs, encoding = _var_as_tuple(var)
    dtype = np.dtype(encoding.pop('dtype'))
    if dtype.kind != 'O':
        # ``np.integer`` matches every integer width and signedness; the
        # builtin ``int`` only matches the platform default integer type.
        if np.issubdtype(dtype, np.integer):
            if needs_copy:
                # Keep the rounded result: discarding it would make the cast
                # below truncate the original, unrounded values.
                values = np.around(values)
            else:
                values = np.around(values, out=values)
        if dtype == 'S1' and values.dtype != 'S1':
            values = string_to_char(np.asarray(values, 'S'))
            dims = dims + ('string%s' % values.shape[-1],)
        values = np.asarray(values, dtype=dtype)
    return Variable(dims, values, attrs, encoding)
441
+
442
+
375
443
def _infer_dtype (array ):
376
444
"""Given an object array with no missing values, infer its dtype from its
377
445
first element
@@ -390,7 +458,36 @@ def _infer_dtype(array):
390
458
return dtype
391
459
392
460
393
- def encode_cf_variable (var ):
461
def ensure_dtype_not_object(var):
    """Convert an object-dtype variable to a concrete dtype.

    Variables whose dtype is not object are returned unchanged.  Otherwise
    the target dtype is obtained from ``_infer_dtype``:

    * with no missing values, the whole array is converted to the dtype
      inferred from it;
    * with missing values (as flagged by ``pd.isnull``), the dtype is
      inferred from the non-missing entries only.  String dtypes (kind
      ``'S'`` or ``'U'``) get ``''`` written at the missing positions; every
      other dtype is kept if it is a floating-point type, replaced by
      ``float`` otherwise, and gets ``NaN`` at the missing positions.

    Parameters
    ----------
    var : Variable
        The variable to convert.

    Returns
    -------
    out : Variable
        ``var`` itself when its dtype is not object, otherwise a new
        variable.
    """
    # TODO: move this from conventions to backends? (it's not CF related)
    if var.dtype.kind != 'O':
        return var

    dims, values, attrs, encoding = _var_as_tuple(var)
    missing = pd.isnull(values)
    if missing.any():
        non_missing_values = values[~missing]
        inferred_dtype = _infer_dtype(non_missing_values)

        if inferred_dtype.kind in ['S', 'U']:
            # There is no safe bit-pattern for NA in typical binary string
            # formats, so we can't set a fill_value. Unfortunately, this
            # means we won't be able to restore string arrays with missing
            # values.
            fill_value = ''
        else:
            # Insist on using float for numeric values.  ``np.floating``
            # accepts every float width; the builtin ``float`` only matches
            # float64 and would needlessly up-cast narrower float types.
            if not np.issubdtype(inferred_dtype, np.floating):
                inferred_dtype = np.dtype(float)
            fill_value = np.nan

        values = np.array(values, dtype=inferred_dtype, copy=True)
        values[missing] = fill_value
    else:
        values = np.asarray(values, dtype=_infer_dtype(values))
    return Variable(dims, values, attrs, encoding)
488
+
489
+
490
+ def encode_cf_variable (var , needs_copy = True ):
394
491
"""
395
492
Converts an Variable into an Variable which follows some
396
493
of the CF conventions:
@@ -410,86 +507,12 @@ def encode_cf_variable(var):
410
507
out : xray.Variable
411
508
A variable which has been encoded as described above.
412
509
"""
413
- dimensions = var .dims
414
- data = var .values
415
- attributes = var .attrs .copy ()
416
- encoding = var .encoding .copy ()
417
-
418
- # convert datetimes into numbers
419
- if (np .issubdtype (data .dtype , np .datetime64 )
420
- or (data .dtype .kind == 'O'
421
- and isinstance (data .reshape (- 1 )[0 ], datetime ))):
422
- if 'units' in attributes or 'calendar' in attributes :
423
- raise ValueError (
424
- "Failed hard to prevent overwriting 'units' or 'calendar'" )
425
- (data , units , calendar ) = encode_cf_datetime (
426
- data , encoding .pop ('units' , None ), encoding .pop ('calendar' , None ))
427
- attributes ['units' ] = units
428
- attributes ['calendar' ] = calendar
429
-
430
- # unscale/mask
431
- if any (k in encoding for k in ['add_offset' , 'scale_factor' ]):
432
- data = np .array (data , dtype = float , copy = True )
433
- if 'add_offset' in encoding :
434
- data -= pop_to (encoding , attributes , 'add_offset' )
435
- if 'scale_factor' in encoding :
436
- data /= pop_to (encoding , attributes , 'scale_factor' )
437
-
438
- # replace NaN with the fill value
439
- if '_FillValue' in encoding :
440
- fill_value = pop_to (encoding , attributes , '_FillValue' )
441
- if not pd .isnull (fill_value ):
442
- missing = pd .isnull (data )
443
- if missing .any ():
444
- data = data .copy ()
445
- data [missing ] = fill_value
446
-
447
- # replace NaN with the missing_value
448
- if 'missing_value' in encoding :
449
- missing_value = pop_to (encoding , attributes , 'missing_value' )
450
- if not pd .isnull (missing_value ):
451
- missing = pd .isnull (data )
452
- if missing .any ():
453
- data = data .copy ()
454
- data [missing ] = missing_value
455
-
456
- # cast to encoded dtype
457
- if 'dtype' in encoding :
458
- dtype = np .dtype (encoding .pop ('dtype' ))
459
- if dtype .kind != 'O' :
460
- if np .issubdtype (dtype , int ):
461
- data = data .round ()
462
- if dtype == 'S1' and data .dtype != 'S1' :
463
- data = string_to_char (np .asarray (data , 'S' ))
464
- dimensions = dimensions + ('string%s' % data .shape [- 1 ],)
465
- data = np .asarray (data , dtype = dtype )
466
-
467
- # infer a valid dtype if necessary
468
- # TODO: move this from conventions to backends (it's not CF related)
469
- if data .dtype .kind == 'O' :
470
- missing = pd .isnull (data )
471
- if missing .any ():
472
- non_missing_data = data [~ missing ]
473
- inferred_dtype = _infer_dtype (non_missing_data )
474
-
475
- if inferred_dtype .kind in ['S' , 'U' ]:
476
- # There is no safe bit-pattern for NA in typical binary string
477
- # formats, we so can't set a fill_value. Unfortunately, this
478
- # means we won't be able to restore string arrays with missing
479
- # values.
480
- fill_value = ''
481
- else :
482
- # insist on using float for numeric data
483
- if not np .issubdtype (inferred_dtype , float ):
484
- inferred_dtype = np .dtype (float )
485
- fill_value = np .nan
486
-
487
- data = np .array (data , dtype = inferred_dtype , copy = True )
488
- data [missing ] = fill_value
489
- else :
490
- data = np .asarray (data , dtype = _infer_dtype (data ))
491
-
492
- return Variable (dimensions , data , attributes , encoding = encoding )
510
+ var = maybe_encode_datetime (var )
511
+ var , needs_copy = maybe_encode_offset_and_scale (var , needs_copy )
512
+ var , needs_copy = maybe_encode_fill_value (var , needs_copy )
513
+ var = maybe_encode_dtype (var , needs_copy )
514
+ var = ensure_dtype_not_object (var )
515
+ return var
493
516
494
517
495
518
def decode_cf_variable (var , concat_characters = True , mask_and_scale = True ,
@@ -539,15 +562,15 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
539
562
data = CharToStringArray (data )
540
563
541
564
if mask_and_scale :
542
- # missing_value is deprecated, but we still want to support it.
543
- missing_value = pop_to (attributes , encoding , 'missing_value' )
565
+ if 'missing_value' in attributes :
566
+ # missing_value is deprecated, but we still want to support it as
567
+ # an alias for _FillValue.
568
+ assert ('_FillValue' not in attributes
569
+ or utils .equivalent (attributes ['_FillValue' ],
570
+ attributes ['missing_value' ]))
571
+ attributes ['_FillValue' ] = attributes .pop ('missing_value' )
572
+
544
573
fill_value = pop_to (attributes , encoding , '_FillValue' )
545
- # if missing_value is given but not fill_value we use missing_value
546
- if fill_value is None and missing_value is not None :
547
- fill_value = missing_value
548
- # if both were given we make sure they are the same.
549
- if fill_value is not None and missing_value is not None :
550
- assert fill_value == missing_value
551
574
scale_factor = pop_to (attributes , encoding , 'scale_factor' )
552
575
add_offset = pop_to (attributes , encoding , 'add_offset' )
553
576
if ((fill_value is not None and not pd .isnull (fill_value ))
0 commit comments