Skip to content

Commit 40fe3a7

Browse files
authored
Support mixed data formats (#416)
1 parent 46498d5 commit 40fe3a7

File tree

7 files changed

+89
-5
lines changed

7 files changed

+89
-5
lines changed

intake_esm/cat.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class Config:
4242

4343
class Assets(pydantic.BaseModel):
4444
column_name: pydantic.StrictStr
45-
format: DataFormat
45+
format: typing.Optional[DataFormat]
4646
format_column_name: typing.Optional[pydantic.StrictStr]
4747

4848
class Config:
@@ -54,6 +54,8 @@ def _validate_data_format(cls, values):
5454
data_format, format_column_name = values.get('format'), values.get('format_column_name')
5555
if data_format is not None and format_column_name is not None:
5656
raise ValueError('Cannot set both format and format_column_name')
57+
elif data_format is None and format_column_name is None:
58+
raise ValueError('Must set one of format or format_column_name')
5759
return values
5860

5961

intake_esm/core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ def __getitem__(self, key: str) -> ESMDataSource:
202202
variable_column_name=self.esmcat.aggregation_control.variable_column_name,
203203
path_column_name=self.esmcat.assets.column_name,
204204
data_format=self.esmcat.assets.format,
205+
format_column_name=self.esmcat.assets.format_column_name,
205206
aggregations=self.esmcat.aggregation_control.aggregations,
206207
intake_kwargs={'metadata': {}},
207208
)

intake_esm/source.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,8 @@ def __init__(
114114
records: typing.List[typing.Dict[str, typing.Any]],
115115
variable_column_name: pydantic.StrictStr,
116116
path_column_name: pydantic.StrictStr,
117-
data_format: DataFormat,
117+
data_format: typing.Optional[DataFormat],
118+
format_column_name: typing.Optional[pydantic.StrictStr],
118119
*,
119120
aggregations: typing.Optional[typing.List[Aggregation]] = None,
120121
requested_variables: typing.List[str] = None,
@@ -162,12 +163,11 @@ def __init__(
162163
self.storage_options = storage_options or {}
163164
self.preprocess = preprocess
164165
self.requested_variables = requested_variables or []
165-
self.data_format = data_format.value
166166
self.path_column_name = path_column_name
167167
self.variable_column_name = variable_column_name
168168
self.aggregations = aggregations
169169
self.df = pd.DataFrame.from_records(records)
170-
self.xarray_open_kwargs = _get_xarray_open_kwargs(self.data_format, xarray_open_kwargs)
170+
self.xarray_open_kwargs = xarray_open_kwargs
171171
self.xarray_combine_by_coords_kwargs = dict(combine_attrs='drop_conflicts')
172172
if xarray_combine_by_coords_kwargs is None:
173173
xarray_combine_by_coords_kwargs = {}
@@ -177,6 +177,11 @@ def __init__(
177177
}
178178
self._ds = None
179179

180+
if data_format is not None:
181+
self.df['_data_format_'] = data_format.value
182+
else:
183+
self.df = self.df.rename(columns={format_column_name: '_data_format_'})
184+
180185
def __repr__(self) -> str:
181186
return f'<{type(self).__name__} (name: {self.key}, asset(s): {len(self.df)})>'
182187

@@ -203,7 +208,9 @@ def _open_dataset(self):
203208
_open_dataset(
204209
record[self.path_column_name],
205210
record[self.variable_column_name],
206-
xarray_open_kwargs=self.xarray_open_kwargs,
211+
xarray_open_kwargs=_get_xarray_open_kwargs(
212+
record['_data_format_'], self.xarray_open_kwargs
213+
),
207214
preprocess=self.preprocess,
208215
expand_dims={
209216
agg.attribute_name: [record[agg.attribute_name]]
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,format,path,time_range,dcpp_init_year
2+
CMIP,BCC,BCC-ESM1,piControl,r1i1p1f1,Amon,tasmax,gn,netcdf,./tests/sample_data/cmip/CMIP6/CMIP/BCC/BCC-ESM1/piControl/r1i1p1f1/Amon/tasmax/gn/v20181214/tasmax/tasmax_Amon_BCC-ESM1_piControl_r1i1p1f1_gn_185001-230012.nc,185001-230012,
3+
CMIP,BCC,BCC-ESM1,piControl,r1i1p1f1,Amon,tasmin,gn,zarr,gs://cmip6/CMIP6/CMIP/BCC/BCC-ESM1/piControl/r1i1p1f1/Amon/tasmin/gn/v20181214/,185001-230012,
4+
CMIP,BCC,BCC-CSM2-MR,abrupt-4xCO2,r1i1p1f1,Amon,tasmax,gn,netcdf,./tests/sample_data/cmip/CMIP6/CMIP/BCC/BCC-CSM2-MR/abrupt-4xCO2/r1i1p1f1/Amon/tasmax/gn/v20181016/tasmax/tasmax_Amon_BCC-CSM2-MR_abrupt-4xCO2_r1i1p1f1_gn_185001-200012.nc,185001-200012,
5+
CMIP,BCC,BCC-CSM2-MR,abrupt-4xCO2,r1i1p1f1,Amon,tasmin,gn,zarr,gs://cmip6/CMIP6/CMIP/BCC/BCC-CSM2-MR/abrupt-4xCO2/r1i1p1f1/Amon/tasmin/gn/v20181016/,185001-200012,
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
{
2+
"esmcat_version": "0.1.0",
3+
"id": "sample-cmip6-mixed",
4+
"description": "This is a sample ESM collection for CMIP6 data in netcdf AND zarr format, local and remote.",
5+
"catalog_file": "cmip6-bcc-mixed-formats.csv",
6+
"attributes": [
7+
{
8+
"column_name": "activity_id",
9+
"vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_activity_id.json"
10+
},
11+
{
12+
"column_name": "source_id",
13+
"vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_source_id.json"
14+
},
15+
{
16+
"column_name": "institution_id",
17+
"vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_institution_id.json"
18+
},
19+
{
20+
"column_name": "experiment_id",
21+
"vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_experiment_id.json"
22+
},
23+
{ "column_name": "member_id", "vocabulary": "" },
24+
{
25+
"column_name": "table_id",
26+
"vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_table_id.json"
27+
},
28+
{ "column_name": "variable_id", "vocabulary": "" },
29+
{
30+
"column_name": "grid_label",
31+
"vocabulary": "https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_grid_label.json"
32+
}
33+
],
34+
"assets": {
35+
"column_name": "path",
36+
"format_column_name": "format"
37+
},
38+
39+
"aggregation_control": {
40+
"variable_column_name": "variable_id",
41+
"groupby_attrs": [
42+
"activity_id",
43+
"institution_id",
44+
"source_id",
45+
"experiment_id",
46+
"table_id",
47+
"grid_label"
48+
],
49+
"aggregations": [
50+
{
51+
"type": "join_new",
52+
"attribute_name": "member_id",
53+
"options": { "coords": "minimal", "compat": "override" }
54+
},
55+
{
56+
"type": "join_existing",
57+
"attribute_name": "time_range",
58+
"options": { "dim": "time" }
59+
},
60+
{
61+
"type": "union",
62+
"attribute_name": "variable_id"
63+
}
64+
]
65+
}
66+
}

tests/test_core.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def funcs(ds):
2727
cdf_col_sample_cesmle,
2828
cdf_col_sample_cmip5,
2929
cdf_col_sample_cmip6,
30+
mixed_col_sample_cmip6,
3031
multi_variable_col,
3132
sample_df,
3233
sample_esmcol_data,
@@ -231,6 +232,7 @@ def test_multi_variable_catalog(query):
231232
dict(source_id=['CNRM-ESM2-1', 'CNRM-CM6-1', 'BCC-ESM1'], variable_id=['tasmax']),
232233
{'chunks': {'time': 1}},
233234
),
235+
(mixed_col_sample_cmip6, dict(institution_id='BCC'), {}),
234236
],
235237
)
236238
def test_to_dataset_dict(path, query, xarray_open_kwargs):

tests/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
zarr_col_aws_cesm = (
1313
'https://raw.githubusercontent.com/NCAR/cesm-lens-aws/master/intake-catalogs/aws-cesm1-le.json'
1414
)
15+
mixed_col_sample_cmip6 = os.path.join(here, 'sample-collections/cmip6-bcc-mixed-formats.json')
1516

1617

1718
sample_df = pd.DataFrame(

0 commit comments

Comments
 (0)