Skip to content

Commit 2c906fd

Browse files
authored
Merge pull request #133 from astronomy-commons/sam/fsspec
Sam/fsspec CloudCats 🐱
2 parents 4060450 + a5bcacb commit 2c906fd

57 files changed

Lines changed: 2412 additions & 164 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ venv/
111111
ENV/
112112
env.bak/
113113
venv.bak/
114+
env.sh
114115

115116
# Spyder project settings
116117
.spyderproject
@@ -144,4 +145,4 @@ tmp/
144145

145146
# Airspeed Velocity performance results
146147
_results/
147-
_html/
148+
_html/

cloud_tests/README.md

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# HiPSCat Cloud Tests
2+
3+
### Performing HiPSCat cloud tests
4+
The only currently implemented cloud platform is abfs. In order to run the tests, you will need to export the following environment variables in a command line:
5+
```bash
6+
export ABFS_LINCCDATA_ACCOUNT_NAME=lincc_account_name
7+
export ABFS_LINCCDATA_ACCOUNT_KEY=lincc_account_key
8+
```
9+
Then to run the tests:
10+
```bash
11+
pytest cloud_tests/ --timeout 10 --cloud abfs
12+
```
13+
The timeout needs to be increased to account for latency in contacting cloud buckets and performing heavier i/o computations.
14+
15+
16+
### How are we connecting to the cloud resources?
17+
18+
We have abstracted our entire i/o infrastructure to be read through the python [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html) library. All that needs to be provided is a valid protocol pathway, and storage options for the cloud interface.
19+
20+
21+
### Adding tests for a new cloud interface protocol
22+
23+
There are various steps to have tests run on another cloud bucket provider (like s3 or gcs).
24+
25+
* 1.) You will have to create the container/bucket
26+
* 2.) You will have to edit `cloud_tests/conftest.py` in multiple places:
27+
```python
28+
...
29+
#...line 38...
30+
@pytest.fixture
31+
def example_cloud_path(cloud):
32+
if cloud == "abfs":
33+
return "abfs:///hipscat/pytests/hipscat"
34+
35+
#your new addition
36+
elif cloud == "new_protocol":
37+
return "new_protocol:///path/to/pytest/hipscat"
38+
39+
else:
40+
raise NotImplementedError("Cloud format not implemented for hipscat tests!")
41+
42+
@pytest.fixture
43+
def example_cloud_storage_options(cloud):
44+
if cloud == "abfs":
45+
storage_options = {
46+
"account_key" : os.environ.get("ABFS_LINCCDATA_ACCOUNT_KEY"),
47+
"account_name" : os.environ.get("ABFS_LINCCDATA_ACCOUNT_NAME")
48+
}
49+
return storage_options
50+
51+
#your new addition
52+
elif cloud == "new_protocol":
53+
storage_options = {
54+
"valid_storage_option_param1" : os.environ.get("NEW_PROTOCOL_PARAM1"),
55+
"valid_storage_option_param2" : os.environ.get("NEW_PROTOCOL_PARAM2"),
56+
...
57+
}
58+
59+
return {}
60+
```
61+
62+
* 3.) Finally, you will need to copy the entire `/tests/data/` directory into your newly created bucket. This can be accomplished by running the `copy_data_to_fs.py` script in the `cloud_tests/` directory.
63+
* 4.) Before running the tests, you will need to export your `valid_storage_option_param` into the environment.

cloud_tests/conftest.py

Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,315 @@
1+
import dataclasses
2+
import os
3+
import os.path
4+
from typing import List
5+
6+
import pandas as pd
7+
import pytest
8+
9+
from hipscat.catalog import PartitionInfo
10+
from hipscat.catalog.association_catalog.association_catalog_info import AssociationCatalogInfo
11+
from hipscat.catalog.association_catalog.partition_join_info import PartitionJoinInfo
12+
from hipscat.catalog.catalog_info import CatalogInfo
13+
from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo
14+
from hipscat.inspection.almanac import Almanac
15+
from hipscat.pixel_math import HealpixPixel
16+
17+
DATA_DIR_NAME = "data"
18+
ALMANAC_DIR_NAME = "almanac"
19+
SMALL_SKY_DIR_NAME = "small_sky"
20+
SMALL_SKY_ORDER1_DIR_NAME = "small_sky_order1"
21+
SMALL_SKY_TO_SMALL_SKY_ORDER1_DIR_NAME = "small_sky_to_small_sky_order1"
22+
TEST_DIR = os.path.dirname(__file__)
23+
24+
# pylint: disable=missing-function-docstring, redefined-outer-name
25+
26+
def pytest_addoption(parser):
    """Register the ``--cloud`` command line option with pytest.

    The option stores a single string value and defaults to ``"abfs"``.
    A help string is supplied so the option is described in ``pytest --help``.
    """
    parser.addoption(
        "--cloud",
        action="store",
        default="abfs",
        help="Cloud filesystem protocol to run the cloud tests against (default: abfs).",
    )
28+
29+
30+
def pytest_generate_tests(metafunc):
31+
# This is called for every test. Only get/set command line arguments
32+
# if the argument is specified in the list of test "fixturenames".
33+
option_value = metafunc.config.option.cloud
34+
if 'cloud' in metafunc.fixturenames and option_value is not None:
35+
metafunc.parametrize("cloud", [option_value])
36+
37+
38+
@pytest.fixture
def example_cloud_path(cloud):
    """Return the root URL of the test data for the selected cloud protocol.

    Raises:
        NotImplementedError: if ``cloud`` is anything other than ``"abfs"``.
    """
    if cloud != "abfs":
        raise NotImplementedError("Cloud format not implemented for hipscat tests!")
    return "abfs:///hipscat/pytests/hipscat"
45+
46+
@pytest.fixture
def example_cloud_storage_options(cloud):
    """Return the storage options dictionary for the selected cloud protocol.

    For ``"abfs"`` the account key and name are read from the environment
    (each value is ``None`` when its variable is unset). Any other protocol
    gets an empty dictionary.
    """
    if cloud != "abfs":
        return {}

    return {
        "account_key": os.environ.get("ABFS_LINCCDATA_ACCOUNT_KEY"),
        "account_name": os.environ.get("ABFS_LINCCDATA_ACCOUNT_NAME"),
    }
56+
57+
58+
@pytest.fixture
def tmp_dir_cloud(example_cloud_path):
    """Return the ``tmp`` location underneath the cloud test root."""
    tmp_location = os.path.join(example_cloud_path, "tmp")
    return tmp_location
61+
62+
63+
@pytest.fixture
def test_data_dir_cloud(example_cloud_path):
    """Return the data directory (``DATA_DIR_NAME``) underneath the cloud test root."""
    data_location = os.path.join(example_cloud_path, DATA_DIR_NAME)
    return data_location
66+
67+
68+
@pytest.fixture
def almanac_dir_cloud(test_data_dir_cloud):
    """Return the almanac directory (``ALMANAC_DIR_NAME``) inside the cloud data directory."""
    almanac_location = os.path.join(test_data_dir_cloud, ALMANAC_DIR_NAME)
    return almanac_location
71+
72+
73+
@pytest.fixture
def small_sky_dir_cloud(test_data_dir_cloud):
    """Return the ``SMALL_SKY_DIR_NAME`` directory inside the cloud data directory."""
    small_sky_location = os.path.join(test_data_dir_cloud, SMALL_SKY_DIR_NAME)
    return small_sky_location
76+
77+
78+
@pytest.fixture
def small_sky_order1_dir_cloud(test_data_dir_cloud):
    """Return the ``SMALL_SKY_ORDER1_DIR_NAME`` directory inside the cloud data directory."""
    order1_location = os.path.join(test_data_dir_cloud, SMALL_SKY_ORDER1_DIR_NAME)
    return order1_location
81+
82+
83+
@pytest.fixture
def small_sky_to_small_sky_order1_dir_cloud(test_data_dir_cloud):
    """Return the ``SMALL_SKY_TO_SMALL_SKY_ORDER1_DIR_NAME`` directory inside the cloud data directory."""
    association_location = os.path.join(
        test_data_dir_cloud, SMALL_SKY_TO_SMALL_SKY_ORDER1_DIR_NAME
    )
    return association_location
86+
87+
88+
@pytest.fixture
def catalog_pixels() -> List[HealpixPixel]:
    """Return three sample ``HealpixPixel`` objects built from fixed argument pairs."""
    pixel_args = ((1, 0), (1, 1), (2, 8))
    return [HealpixPixel(first, second) for first, second in pixel_args]
91+
92+
93+
@pytest.fixture
def association_catalog_path_cloud(test_data_dir_cloud) -> str:
    """Return the association catalog directory inside the cloud data directory.

    Uses the module-level ``SMALL_SKY_TO_SMALL_SKY_ORDER1_DIR_NAME`` constant
    rather than repeating the directory name as a string literal, so the
    location is defined in exactly one place.
    """
    return os.path.join(test_data_dir_cloud, SMALL_SKY_TO_SMALL_SKY_ORDER1_DIR_NAME)
96+
97+
98+
@pytest.fixture
def association_catalog_info_file_cloud(association_catalog_path_cloud) -> str:
    """Return the path of ``catalog_info.json`` inside the association catalog directory."""
    info_file = os.path.join(association_catalog_path_cloud, "catalog_info.json")
    return info_file
101+
102+
103+
@pytest.fixture
def index_catalog_info_file_cloud(test_data_dir_cloud) -> str:
    """Return the path of ``index_catalog/catalog_info.json`` in the cloud data directory."""
    path_parts = ("index_catalog", "catalog_info.json")
    return os.path.join(test_data_dir_cloud, *path_parts)
106+
107+
108+
@pytest.fixture
def margin_cache_catalog_info_file_cloud(test_data_dir_cloud) -> str:
    """Return the path of ``margin_cache/catalog_info.json`` in the cloud data directory."""
    path_parts = ("margin_cache", "catalog_info.json")
    return os.path.join(test_data_dir_cloud, *path_parts)
111+
112+
113+
@pytest.fixture
def source_catalog_info_file_cloud(test_data_dir_cloud) -> str:
    """Return the path of ``small_sky_source/catalog_info.json`` in the cloud data directory."""
    path_parts = ("small_sky_source", "catalog_info.json")
    return os.path.join(test_data_dir_cloud, *path_parts)
116+
117+
118+
@pytest.fixture
def association_catalog_info(association_catalog_info_data) -> AssociationCatalogInfo:
    """Build an ``AssociationCatalogInfo`` from the ``association_catalog_info_data`` dictionary."""
    info = AssociationCatalogInfo(**association_catalog_info_data)
    return info
121+
122+
123+
@pytest.fixture
def association_catalog_partition_join_file_cloud(association_catalog_path_cloud) -> str:
    """Return the path of ``partition_join_info.csv`` inside the association catalog directory."""
    join_file = os.path.join(association_catalog_path_cloud, "partition_join_info.csv")
    return join_file
126+
127+
128+
@pytest.fixture
def dataset_path_cloud(test_data_dir_cloud) -> str:
    """Return the ``dataset`` directory inside the cloud data directory."""
    dataset_location = os.path.join(test_data_dir_cloud, "dataset")
    return dataset_location
131+
132+
133+
@pytest.fixture
def base_catalog_info_file_cloud(dataset_path_cloud) -> str:
    """Return the path of ``catalog_info.json`` inside the cloud dataset directory."""
    info_file = os.path.join(dataset_path_cloud, "catalog_info.json")
    return info_file
136+
137+
138+
@pytest.fixture
def base_catalog_info(base_catalog_info_data) -> BaseCatalogInfo:
    """Build a ``BaseCatalogInfo`` from the ``base_catalog_info_data`` dictionary."""
    info = BaseCatalogInfo(**base_catalog_info_data)
    return info
141+
142+
143+
@pytest.fixture
def catalog_path_cloud(test_data_dir_cloud) -> str:
    """Return the ``catalog`` directory inside the cloud data directory."""
    catalog_location = os.path.join(test_data_dir_cloud, "catalog")
    return catalog_location
146+
147+
148+
@pytest.fixture
def catalog_info_file_cloud(catalog_path_cloud) -> str:
    """Return the path of ``catalog_info.json`` inside the cloud catalog directory."""
    info_file = os.path.join(catalog_path_cloud, "catalog_info.json")
    return info_file
151+
152+
@pytest.fixture
def test_data_dir():
    """Return the local data directory (``DATA_DIR_NAME``) located next to this file."""
    local_data_location = os.path.join(TEST_DIR, DATA_DIR_NAME)
    return local_data_location
155+
156+
157+
@pytest.fixture
def small_sky_dir_local(test_data_dir):
    """Return the ``SMALL_SKY_DIR_NAME`` directory inside the local data directory."""
    small_sky_location = os.path.join(test_data_dir, SMALL_SKY_DIR_NAME)
    return small_sky_location
160+
161+
162+
@pytest.fixture
def small_sky_order1_dir_local(test_data_dir):
    """Return the ``SMALL_SKY_ORDER1_DIR_NAME`` directory inside the local data directory."""
    order1_location = os.path.join(test_data_dir, SMALL_SKY_ORDER1_DIR_NAME)
    return order1_location
165+
166+
167+
@pytest.fixture
def assert_catalog_info_matches_dict():
    """Return a helper that compares a catalog info object against a dictionary."""

    def assert_match(catalog_info: BaseCatalogInfo, dictionary: dict):
        """Assert that every key in ``dictionary`` has the same value in
        ``catalog_info``. Fields missing from ``dictionary`` are not checked."""
        actual_fields = dataclasses.asdict(catalog_info)
        for field_name in dictionary:
            assert actual_fields[field_name] == dictionary[field_name]

    return assert_match
177+
178+
179+
@pytest.fixture
def base_catalog_info_data() -> dict:
    """Return the sample keyword arguments used to build a ``BaseCatalogInfo``."""
    info_fields = {
        "catalog_name": "test_name",
        "catalog_type": "object",
        "total_rows": 10,
    }
    return info_fields
186+
187+
188+
@pytest.fixture
def catalog_info_data() -> dict:
    """Return the sample keyword arguments used to build a ``CatalogInfo``."""
    info_fields = {
        "catalog_name": "test_name",
        "catalog_type": "object",
        "total_rows": 10,
        "epoch": "J2000",
        "ra_column": "ra",
        "dec_column": "dec",
    }
    return info_fields
198+
199+
200+
@pytest.fixture
def association_catalog_info_data() -> dict:
    """Return the sample keyword arguments used to build an ``AssociationCatalogInfo``."""
    info_fields = {
        "catalog_name": "test_name",
        "catalog_type": "association",
        "total_rows": 10,
        "primary_catalog": "small_sky",
        "primary_column": "id",
        "join_catalog": "small_sky_order1",
        "join_column": "id",
    }
    return info_fields
211+
212+
213+
@pytest.fixture
def source_catalog_info() -> dict:
    """Return a sample catalog info dictionary whose ``catalog_type`` is ``"source"``."""
    info_fields = {
        "catalog_name": "test_source",
        "catalog_type": "source",
        "total_rows": 100,
        "epoch": "J2000",
        "ra_column": "source_ra",
        "dec_column": "source_dec",
    }
    return info_fields
223+
224+
225+
@pytest.fixture
def source_catalog_info_with_extra() -> dict:
    """Return a sample ``"source"`` catalog info dictionary that also carries
    the primary catalog name and the mjd/band/mag column settings."""
    info_fields = {
        "catalog_name": "test_source",
        "catalog_type": "source",
        "total_rows": 100,
        "epoch": "J2000",
        "ra_column": "source_ra",
        "dec_column": "source_dec",
        "primary_catalog": "test_name",
        "mjd_column": "mjd",
        "band_column": "band",
        "mag_column": "mag",
        "mag_err_column": "",
    }
    return info_fields
240+
241+
242+
@pytest.fixture
def margin_cache_catalog_info() -> dict:
    """Return a sample catalog info dictionary whose ``catalog_type`` is ``"margin"``."""
    info_fields = {
        "catalog_name": "test_margin",
        "catalog_type": "margin",
        "total_rows": 100,
        "primary_catalog": "test_name",
        "margin_threshold": 0.5,
    }
    return info_fields
251+
252+
253+
@pytest.fixture
def index_catalog_info() -> dict:
    """Return a sample catalog info dictionary whose ``catalog_type`` is ``"index"``."""
    info_fields = {
        "catalog_name": "test_index",
        "catalog_type": "index",
        "total_rows": 100,
        "primary_catalog": "test_name",
        "indexing_column": "id",
    }
    return info_fields
262+
263+
264+
@pytest.fixture
def index_catalog_info_with_extra() -> dict:
    """Return a sample ``"index"`` catalog info dictionary that also lists ``extra_columns``."""
    info_fields = {
        "catalog_name": "test_index",
        "catalog_type": "index",
        "total_rows": 100,
        "primary_catalog": "test_name",
        "indexing_column": "id",
        "extra_columns": ["foo", "bar"],
    }
    return info_fields
274+
275+
276+
277+
@pytest.fixture
def catalog_info(catalog_info_data) -> CatalogInfo:
    """Build a ``CatalogInfo`` from the ``catalog_info_data`` dictionary."""
    info = CatalogInfo(**catalog_info_data)
    return info
280+
281+
282+
@pytest.fixture
def catalog_pixels_df() -> pd.DataFrame:
    """Return a three-row data frame keyed by the ``PartitionInfo`` metadata
    order, dir and pixel column names."""
    columns = {
        PartitionInfo.METADATA_ORDER_COLUMN_NAME: [1, 1, 2],
        PartitionInfo.METADATA_DIR_COLUMN_NAME: [0, 0, 0],
        PartitionInfo.METADATA_PIXEL_COLUMN_NAME: [0, 1, 8],
    }
    return pd.DataFrame.from_dict(columns)
291+
292+
293+
@pytest.fixture
def association_catalog_join_pixels() -> pd.DataFrame:
    """Return a four-row data frame keyed by the ``PartitionJoinInfo`` primary
    and join order/pixel column names."""
    columns = {
        PartitionJoinInfo.PRIMARY_ORDER_COLUMN_NAME: [0, 0, 0, 0],
        PartitionJoinInfo.PRIMARY_PIXEL_COLUMN_NAME: [11, 11, 11, 11],
        PartitionJoinInfo.JOIN_ORDER_COLUMN_NAME: [1, 1, 1, 1],
        PartitionJoinInfo.JOIN_PIXEL_COLUMN_NAME: [44, 45, 46, 47],
    }
    return pd.DataFrame.from_dict(columns)
303+
304+
305+
@pytest.fixture
def default_almanac_cloud(example_cloud_path, example_cloud_storage_options):
    """Set up default environment variables and fetch default almanac data.

    ``HIPSCAT_ALMANAC_DIR`` and ``HIPSCAT_DEFAULT_DIR`` are pointed at the
    cloud test data for the duration of the test. Their previous values are
    restored on teardown so the overrides do not leak into other tests.

    Yields:
        An ``Almanac`` constructed with ``example_cloud_storage_options``.
    """
    test_data_dir = os.path.join(example_cloud_path, DATA_DIR_NAME)
    almanac_dir = os.path.join(test_data_dir, ALMANAC_DIR_NAME)

    overrides = {
        "HIPSCAT_ALMANAC_DIR": almanac_dir,
        "HIPSCAT_DEFAULT_DIR": test_data_dir,
    }
    # Remember what was set before (None means "not set") so teardown can undo it.
    previous = {name: os.environ.get(name) for name in overrides}
    os.environ.update(overrides)

    try:
        yield Almanac(storage_options=example_cloud_storage_options)
    finally:
        for name, old_value in previous.items():
            if old_value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = old_value

0 commit comments

Comments
 (0)