Allow saving catalog via fsspec protocols (#469)

andersy005 · web-flow · commit b24082ee2349 · 2022-04-07T22:11:15.000-06:00
diff --git a/intake_esm/cat.py b/intake_esm/cat.py
@@ -129,6 +129,7 @@ def save(
         catalog_type: str = 'dict',
         to_csv_kwargs: dict = None,
         json_dump_kwargs: dict = None,
+        storage_options: typing.Dict[str, typing.Any] = None,
     ) -> None:
         """
         Save the catalog to a file.
@@ -138,14 +139,18 @@ def save(
         name: str
             The name of the file to save the catalog to.
         directory: str
-            The directory to save the catalog to. If None, use the current directory
+            The directory or cloud storage bucket to save the catalog to.
+            If None, the current directory is used.
         catalog_type: str
             The type of catalog to save. Whether to save the catalog table as a dictionary
             in the JSON file or as a separate CSV file. Valid options are 'dict' and 'file'.
         to_csv_kwargs : dict, optional
             Additional keyword arguments passed through to the :py:meth:`~pandas.DataFrame.to_csv` method.
         json_dump_kwargs : dict, optional
             Additional keyword arguments passed through to the :py:func:`~json.dump` function.
+        storage_options : dict, optional
+            fsspec parameters passed to the backend file system, such as Google Cloud Storage
+            or Amazon Web Services S3.
 
         Notes
         -----
@@ -158,13 +163,12 @@ def save(
             raise ValueError(
                 f'catalog_type must be either "dict" or "file". Received catalog_type={catalog_type}'
             )
-        csv_file_name = pathlib.Path(f'{name}.csv')
-        json_file_name = pathlib.Path(f'{name}.json')
-        if directory:
-            directory = pathlib.Path(directory)
-            directory.mkdir(parents=True, exist_ok=True)
-            csv_file_name = directory / csv_file_name
-            json_file_name = directory / json_file_name
+        if isinstance(directory, pathlib.Path):
+            directory = str(directory)
+        mapper = fsspec.get_mapper(directory or '.', storage_options=storage_options)
+        fs = mapper.fs
+        csv_file_name = f'{mapper.fs.protocol}://{mapper.root}/{name}.csv'
+        json_file_name = f'{mapper.fs.protocol}://{mapper.root}/{name}.json'
 
         data = self.dict().copy()
         for key in {'catalog_dict', 'catalog_file'}:
@@ -179,11 +183,13 @@ def save(
             extensions = {'gzip': '.gz', 'bz2': '.bz2', 'zip': '.zip', 'xz': '.xz', None: ''}
             csv_file_name = f'{csv_file_name}{extensions[compression]}'
             data['catalog_file'] = str(csv_file_name)
-            self.df.to_csv(csv_file_name, **csv_kwargs)
+
+            with fs.open(csv_file_name, 'wb') as csv_outfile:
+                self.df.to_csv(csv_outfile, **csv_kwargs)
         else:
             data['catalog_dict'] = self.df.to_dict(orient='records')
 
-        with open(json_file_name, 'w') as outfile:
+        with fs.open(json_file_name, 'w') as outfile:
             json_kwargs = {'indent': 2}
             json_kwargs.update(json_dump_kwargs or {})
             json.dump(data, outfile, **json_kwargs)
@@ -350,12 +356,13 @@ def search(
 
         """
 
-        if not isinstance(query, QueryModel):
-            _query = QueryModel(
+        _query = (
+            query
+            if isinstance(query, QueryModel)
+            else QueryModel(
                 query=query, require_all_on=require_all_on, columns=self.df.columns.tolist()
             )
-        else:
-            _query = query
+        )
 
         results = search(
             df=self.df, query=_query.query, columns_with_iterables=self.columns_with_iterables
diff --git a/intake_esm/core.py b/intake_esm/core.py
@@ -182,7 +182,7 @@ def __getitem__(self, key: str) -> ESMDataSource:
         # The canonical unique key is the key of a compatible group of assets
         try:
             return self._entries[key]
-        except KeyError:
+        except KeyError as e:
             if key in self.keys():
                 keys_dict = self.esmcat._construct_group_keys(sep=self.sep)
                 grouped = self.esmcat.grouped
@@ -210,7 +210,7 @@ def __getitem__(self, key: str) -> ESMDataSource:
                 return self._entries[key]
             raise KeyError(
                 f'key={key} not found in catalog. You can access the list of valid keys via the .keys() method.'
-            )
+            ) from e
 
     def __contains__(self, key) -> bool:
         # Python falls back to iterating over the entire catalog
@@ -381,6 +381,7 @@ def serialize(
         catalog_type: str = 'dict',
         to_csv_kwargs: typing.Dict[typing.Any, typing.Any] = None,
         json_dump_kwargs: typing.Dict[typing.Any, typing.Any] = None,
+        storage_options: typing.Dict[str, typing.Any] = None,
     ) -> None:
         """Serialize catalog to corresponding json and csv files.
 
@@ -396,6 +397,9 @@ def serialize(
             Additional keyword arguments passed through to the :py:meth:`~pandas.DataFrame.to_csv` method.
         json_dump_kwargs : dict, optional
             Additional keyword arguments passed through to the :py:func:`~json.dump` function.
+        storage_options : dict, optional
+            fsspec parameters passed to the backend file system, such as Google Cloud Storage
+            or Amazon Web Services S3.
 
         Notes
         -----
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -180,7 +180,7 @@ def test_catalog_serialize(tmp_path, catalog_type, to_csv_kwargs, json_dump_kwar
     name = 'CMIP6-MRI-ESM2-0'
     cat_subset.serialize(
         name=name,
-        directory=local_store,
+        directory=str(local_store),
         catalog_type=catalog_type,
         to_csv_kwargs=to_csv_kwargs,
         json_dump_kwargs=json_dump_kwargs,