Support mixed data formats (#416)

aulemahal · web-flow · commit 40fe3a7808c1 · 2021-12-15T12:21:48.000-07:00
diff --git a/intake_esm/cat.py b/intake_esm/cat.py
@@ -42,7 +42,7 @@ class Config:
 
 class Assets(pydantic.BaseModel):
     column_name: pydantic.StrictStr
-    format: DataFormat
+    format: typing.Optional[DataFormat]
     format_column_name: typing.Optional[pydantic.StrictStr]
 
     class Config:
@@ -54,6 +54,8 @@ def _validate_data_format(cls, values):
         data_format, format_column_name = values.get('format'), values.get('format_column_name')
         if data_format is not None and format_column_name is not None:
             raise ValueError('Cannot set both format and format_column_name')
+        elif data_format is None and format_column_name is None:
+            raise ValueError('Must set one of format or format_column_name')
         return values
 
 
diff --git a/intake_esm/core.py b/intake_esm/core.py
@@ -202,6 +202,7 @@ def __getitem__(self, key: str) -> ESMDataSource:
                     variable_column_name=self.esmcat.aggregation_control.variable_column_name,
                     path_column_name=self.esmcat.assets.column_name,
                     data_format=self.esmcat.assets.format,
+                    format_column_name=self.esmcat.assets.format_column_name,
                     aggregations=self.esmcat.aggregation_control.aggregations,
                     intake_kwargs={'metadata': {}},
                 )
diff --git a/intake_esm/source.py b/intake_esm/source.py
@@ -114,7 +114,8 @@ def __init__(
         records: typing.List[typing.Dict[str, typing.Any]],
         variable_column_name: pydantic.StrictStr,
         path_column_name: pydantic.StrictStr,
-        data_format: DataFormat,
+        data_format: typing.Optional[DataFormat],
+        format_column_name: typing.Optional[pydantic.StrictStr],
         *,
         aggregations: typing.Optional[typing.List[Aggregation]] = None,
         requested_variables: typing.List[str] = None,
@@ -162,12 +163,11 @@ def __init__(
         self.storage_options = storage_options or {}
         self.preprocess = preprocess
         self.requested_variables = requested_variables or []
-        self.data_format = data_format.value
         self.path_column_name = path_column_name
         self.variable_column_name = variable_column_name
         self.aggregations = aggregations
         self.df = pd.DataFrame.from_records(records)
-        self.xarray_open_kwargs = _get_xarray_open_kwargs(self.data_format, xarray_open_kwargs)
+        self.xarray_open_kwargs = xarray_open_kwargs
         self.xarray_combine_by_coords_kwargs = dict(combine_attrs='drop_conflicts')
         if xarray_combine_by_coords_kwargs is None:
             xarray_combine_by_coords_kwargs = {}
@@ -177,6 +177,11 @@ def __init__(
         }
         self._ds = None
 
+        if data_format is not None:
+            self.df['_data_format_'] = data_format.value
+        else:
+            self.df = self.df.rename(columns={format_column_name: '_data_format_'})
+
     def __repr__(self) -> str:
         return f'<{type(self).__name__}  (name: {self.key}, asset(s): {len(self.df)})>'
 
@@ -203,7 +208,9 @@ def _open_dataset(self):
                 _open_dataset(
                     record[self.path_column_name],
                     record[self.variable_column_name],
-                    xarray_open_kwargs=self.xarray_open_kwargs,
+                    xarray_open_kwargs=_get_xarray_open_kwargs(
+                        record['_data_format_'], self.xarray_open_kwargs
+                    ),
                     preprocess=self.preprocess,
                     expand_dims={
                         agg.attribute_name: [record[agg.attribute_name]]
diff --git a/tests/sample-collections/cmip6-bcc-mixed-formats.csv b/tests/sample-collections/cmip6-bcc-mixed-formats.csv
@@ -0,0 +1,5 @@
+activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,format,path,time_range,dcpp_init_year
+CMIP,BCC,BCC-ESM1,piControl,r1i1p1f1,Amon,tasmax,gn,netcdf,./tests/sample_data/cmip/CMIP6/CMIP/BCC/BCC-ESM1/piControl/r1i1p1f1/Amon/tasmax/gn/v20181214/tasmax/tasmax_Amon_BCC-ESM1_piControl_r1i1p1f1_gn_185001-230012.nc,185001-230012,
+CMIP,BCC,BCC-ESM1,piControl,r1i1p1f1,Amon,tasmin,gn,zarr,gs://cmip6/CMIP6/CMIP/BCC/BCC-ESM1/piControl/r1i1p1f1/Amon/tasmin/gn/v20181214/,185001-230012,
+CMIP,BCC,BCC-CSM2-MR,abrupt-4xCO2,r1i1p1f1,Amon,tasmax,gn,netcdf,./tests/sample_data/cmip/CMIP6/CMIP/BCC/BCC-CSM2-MR/abrupt-4xCO2/r1i1p1f1/Amon/tasmax/gn/v20181016/tasmax/tasmax_Amon_BCC-CSM2-MR_abrupt-4xCO2_r1i1p1f1_gn_185001-200012.nc,185001-200012,
+CMIP,BCC,BCC-CSM2-MR,abrupt-4xCO2,r1i1p1f1,Amon,tasmin,gn,zarr,gs://cmip6/CMIP6/CMIP/BCC/BCC-CSM2-MR/abrupt-4xCO2/r1i1p1f1/Amon/tasmin/gn/v20181016/,185001-200012,
diff --git a/tests/sample-collections/cmip6-bcc-mixed-formats.json b/tests/sample-collections/cmip6-bcc-mixed-formats.json
@@ -0,0 +1,66 @@
+{
+  "esmcat_version": "0.1.0",
+  "id": "sample-cmip6-mixed",
+  "description": "Sample ESM collection for CMIP6 data that mixes local netCDF files and remote Zarr stores.",
+  "catalog_file": "cmip6-bcc-mixed-formats.csv",
+  "attributes": [
+    {
+      "column_name": "activity_id",
+      "vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_activity_id.json"
+    },
+    {
+      "column_name": "source_id",
+      "vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_source_id.json"
+    },
+    {
+      "column_name": "institution_id",
+      "vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_institution_id.json"
+    },
+    {
+      "column_name": "experiment_id",
+      "vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_experiment_id.json"
+    },
+    { "column_name": "member_id", "vocabulary": "" },
+    {
+      "column_name": "table_id",
+      "vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_table_id.json"
+    },
+    { "column_name": "variable_id", "vocabulary": "" },
+    {
+      "column_name": "grid_label",
+      "vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_grid_label.json"
+    }
+  ],
+  "assets": {
+    "column_name": "path",
+    "format_column_name": "format"
+  },
+
+  "aggregation_control": {
+    "variable_column_name": "variable_id",
+    "groupby_attrs": [
+      "activity_id",
+      "institution_id",
+      "source_id",
+      "experiment_id",
+      "table_id",
+      "grid_label"
+    ],
+    "aggregations": [
+      {
+        "type": "join_new",
+        "attribute_name": "member_id",
+        "options": { "coords": "minimal", "compat": "override" }
+      },
+      {
+        "type": "join_existing",
+        "attribute_name": "time_range",
+        "options": { "dim": "time" }
+      },
+      {
+        "type": "union",
+        "attribute_name": "variable_id"
+      }
+    ]
+  }
+}
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -27,6 +27,7 @@ def funcs(ds):
     cdf_col_sample_cesmle,
     cdf_col_sample_cmip5,
     cdf_col_sample_cmip6,
+    mixed_col_sample_cmip6,
     multi_variable_col,
     sample_df,
     sample_esmcol_data,
@@ -231,6 +232,7 @@ def test_multi_variable_catalog(query):
             dict(source_id=['CNRM-ESM2-1', 'CNRM-CM6-1', 'BCC-ESM1'], variable_id=['tasmax']),
             {'chunks': {'time': 1}},
         ),
+        (mixed_col_sample_cmip6, dict(institution_id='BCC'), {}),
     ],
 )
 def test_to_dataset_dict(path, query, xarray_open_kwargs):
diff --git a/tests/utils.py b/tests/utils.py
@@ -12,6 +12,7 @@
 zarr_col_aws_cesm = (
     'https://raw.githubusercontent.com/NCAR/cesm-lens-aws/master/intake-catalogs/aws-cesm1-le.json'
 )
+mixed_col_sample_cmip6 = os.path.join(here, 'sample-collections/cmip6-bcc-mixed-formats.json')
 
 
 sample_df = pd.DataFrame(

Original file line number	Diff line number	Diff line change
`@@ -202,6 +202,7 @@ def __getitem__(self, key: str) -> ESMDataSource:`
`202`	`202`	`variable_column_name=self.esmcat.aggregation_control.variable_column_name,`
`203`	`203`	`path_column_name=self.esmcat.assets.column_name,`
`204`	`204`	`data_format=self.esmcat.assets.format,`
	`205`	`+ format_column_name=self.esmcat.assets.format_column_name,`
`205`	`206`	`aggregations=self.esmcat.aggregation_control.aggregations,`
`206`	`207`	`intake_kwargs={'metadata': {}},`
`207`	`208`	`)`
Original file line number	Diff line number	Diff line change
`@@ -12,6 +12,7 @@`
`12`	`12`	`zarr_col_aws_cesm = (`
`13`	`13`	`'https://raw.githubusercontent.com/NCAR/cesm-lens-aws/master/intake-catalogs/aws-cesm1-le.json'`
`14`	`14`	`)`
	`15`	`+mixed_col_sample_cmip6 = os.path.join(here, 'sample-collections/cmip6-bcc-mixed-formats.json')`
`15`	`16`
`16`	`17`
`17`	`18`	`sample_df = pd.DataFrame(`