Skip to content

Commit b24082e

Browse files
authored
Allow saving catalog via fsspec protocols (#469)
1 parent fba15d6 commit b24082e

File tree

3 files changed

+28
-17
lines changed

3 files changed

+28
-17
lines changed

intake_esm/cat.py

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ def save(
129129
catalog_type: str = 'dict',
130130
to_csv_kwargs: dict = None,
131131
json_dump_kwargs: dict = None,
132+
storage_options: typing.Dict[str, typing.Any] = None,
132133
) -> None:
133134
"""
134135
Save the catalog to a file.
@@ -138,14 +139,18 @@ def save(
138139
name: str
139140
The name of the file to save the catalog to.
140141
directory: str
141-
The directory to save the catalog to. If None, use the current directory
142+
The directory or cloud storage bucket to save the catalog to.
143+
If None, use the current directory
142144
catalog_type: str
143145
The type of catalog to save. Whether to save the catalog table as a dictionary
144146
in the JSON file or as a separate CSV file. Valid options are 'dict' and 'file'.
145147
to_csv_kwargs : dict, optional
146148
Additional keyword arguments passed through to the :py:meth:`~pandas.DataFrame.to_csv` method.
147149
json_dump_kwargs : dict, optional
148150
Additional keyword arguments passed through to the :py:func:`~json.dump` function.
151+
storage_options: dict
152+
fsspec parameters passed to the backend file-system such as Google Cloud Storage,
153+
Amazon Web Service S3.
149154
150155
Notes
151156
-----
@@ -158,13 +163,12 @@ def save(
158163
raise ValueError(
159164
f'catalog_type must be either "dict" or "file". Received catalog_type={catalog_type}'
160165
)
161-
csv_file_name = pathlib.Path(f'{name}.csv')
162-
json_file_name = pathlib.Path(f'{name}.json')
163-
if directory:
164-
directory = pathlib.Path(directory)
165-
directory.mkdir(parents=True, exist_ok=True)
166-
csv_file_name = directory / csv_file_name
167-
json_file_name = directory / json_file_name
166+
if isinstance(directory, pathlib.Path):
167+
directory = str(directory)
168+
mapper = fsspec.get_mapper(directory or '.', storage_options=storage_options)
169+
fs = mapper.fs
170+
csv_file_name = f'{mapper.fs.protocol}://{mapper.root}/{name}.csv'
171+
json_file_name = f'{mapper.fs.protocol}://{mapper.root}/{name}.json'
168172

169173
data = self.dict().copy()
170174
for key in {'catalog_dict', 'catalog_file'}:
@@ -179,11 +183,13 @@ def save(
179183
extensions = {'gzip': '.gz', 'bz2': '.bz2', 'zip': '.zip', 'xz': '.xz', None: ''}
180184
csv_file_name = f'{csv_file_name}{extensions[compression]}'
181185
data['catalog_file'] = str(csv_file_name)
182-
self.df.to_csv(csv_file_name, **csv_kwargs)
186+
187+
with fs.open(csv_file_name, 'wb') as csv_outfile:
188+
self.df.to_csv(csv_outfile, **csv_kwargs)
183189
else:
184190
data['catalog_dict'] = self.df.to_dict(orient='records')
185191

186-
with open(json_file_name, 'w') as outfile:
192+
with fs.open(json_file_name, 'w') as outfile:
187193
json_kwargs = {'indent': 2}
188194
json_kwargs.update(json_dump_kwargs or {})
189195
json.dump(data, outfile, **json_kwargs)
@@ -350,12 +356,13 @@ def search(
350356
351357
"""
352358

353-
if not isinstance(query, QueryModel):
354-
_query = QueryModel(
359+
_query = (
360+
query
361+
if isinstance(query, QueryModel)
362+
else QueryModel(
355363
query=query, require_all_on=require_all_on, columns=self.df.columns.tolist()
356364
)
357-
else:
358-
_query = query
365+
)
359366

360367
results = search(
361368
df=self.df, query=_query.query, columns_with_iterables=self.columns_with_iterables

intake_esm/core.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def __getitem__(self, key: str) -> ESMDataSource:
182182
# The canonical unique key is the key of a compatible group of assets
183183
try:
184184
return self._entries[key]
185-
except KeyError:
185+
except KeyError as e:
186186
if key in self.keys():
187187
keys_dict = self.esmcat._construct_group_keys(sep=self.sep)
188188
grouped = self.esmcat.grouped
@@ -210,7 +210,7 @@ def __getitem__(self, key: str) -> ESMDataSource:
210210
return self._entries[key]
211211
raise KeyError(
212212
f'key={key} not found in catalog. You can access the list of valid keys via the .keys() method.'
213-
)
213+
) from e
214214

215215
def __contains__(self, key) -> bool:
216216
# Python falls back to iterating over the entire catalog
@@ -381,6 +381,7 @@ def serialize(
381381
catalog_type: str = 'dict',
382382
to_csv_kwargs: typing.Dict[typing.Any, typing.Any] = None,
383383
json_dump_kwargs: typing.Dict[typing.Any, typing.Any] = None,
384+
storage_options: typing.Dict[str, typing.Any] = None,
384385
) -> None:
385386
"""Serialize catalog to corresponding json and csv files.
386387
@@ -396,6 +397,9 @@ def serialize(
396397
Additional keyword arguments passed through to the :py:meth:`~pandas.DataFrame.to_csv` method.
397398
json_dump_kwargs : dict, optional
398399
Additional keyword arguments passed through to the :py:func:`~json.dump` function.
400+
storage_options: dict
401+
fsspec parameters passed to the backend file-system such as Google Cloud Storage,
402+
Amazon Web Service S3.
399403
400404
Notes
401405
-----

tests/test_core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def test_catalog_serialize(tmp_path, catalog_type, to_csv_kwargs, json_dump_kwar
180180
name = 'CMIP6-MRI-ESM2-0'
181181
cat_subset.serialize(
182182
name=name,
183-
directory=local_store,
183+
directory=str(local_store),
184184
catalog_type=catalog_type,
185185
to_csv_kwargs=to_csv_kwargs,
186186
json_dump_kwargs=json_dump_kwargs,

0 commit comments

Comments
 (0)