Skip to content

Commit 6117b8f

Browse files
authored
Merge 28063ae into f811ab3
2 parents f811ab3 + 28063ae commit 6117b8f

18 files changed

Lines changed: 204 additions & 248 deletions

File tree

src/hipscat/catalog/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Catalog data wrappers"""
22

3-
from .catalog import Catalog
43
from .association_catalog.association_catalog import AssociationCatalog
5-
from .dataset.dataset import Dataset
4+
from .catalog import Catalog
65
from .catalog_type import CatalogType
6+
from .dataset.dataset import Dataset
77
from .partition_info import PartitionInfo

src/hipscat/catalog/association_catalog/association_catalog.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,19 @@
11
from typing import Tuple, Union
2+
23
import pandas as pd
34
from typing_extensions import TypeAlias
45

56
from hipscat.catalog.association_catalog.association_catalog_info import AssociationCatalogInfo
67
from hipscat.catalog.association_catalog.partition_join_info import PartitionJoinInfo
78
from hipscat.catalog.catalog_type import CatalogType
8-
from hipscat.catalog.dataset.dataset import Dataset
9-
from hipscat.io import FilePointer, paths
9+
from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset, PixelInputTypes
10+
from hipscat.io import FilePointer, file_io, paths
1011

1112

12-
class AssociationCatalog(Dataset):
13+
class AssociationCatalog(HealpixDataset):
1314
"""A HiPSCat Catalog for enabling fast joins between two HiPSCat catalogs
1415
15-
Catalogs of this type are partitioned based on the partitioning of both joining catalogs in the
16-
form 'Norder=/Dir=/Npix=/join_Norder=/join_Dir=/join_Npix=.parquet'. Where each partition
17-
contains the matching pair of hipscat indexes from each catalog's respective partitions to join.
16+
Catalogs of this type are partitioned based on the partitioning of the left catalog.
1817
The `partition_join_info` metadata file specifies all pairs of pixels in the Association
1918
Catalog, corresponding to each pair of partitions in each catalog that contain rows to join.
2019
"""
@@ -29,13 +28,14 @@ class AssociationCatalog(Dataset):
2928
def __init__(
3029
self,
3130
catalog_info: CatalogInfoClass,
31+
pixels: PixelInputTypes,
3232
join_pixels: JoinPixelInputTypes,
3333
catalog_path=None,
34-
storage_options: dict=None
34+
storage_options: dict = None,
3535
) -> None:
3636
if not catalog_info.catalog_type == CatalogType.ASSOCIATION:
3737
raise ValueError("Catalog info `catalog_type` must be 'association'")
38-
super().__init__(catalog_info, catalog_path, storage_options=storage_options)
38+
super().__init__(catalog_info, pixels, catalog_path, storage_options=storage_options)
3939
self.join_info = self._get_partition_join_info_from_pixels(join_pixels)
4040

4141
def get_join_pixels(self) -> pd.DataFrame:
@@ -60,11 +60,21 @@ def _get_partition_join_info_from_pixels(
6060
@classmethod
6161
def _read_args(
6262
cls, catalog_base_dir: FilePointer, storage_options: dict = None
63-
) -> Tuple[CatalogInfoClass, JoinPixelInputTypes]: # type: ignore[override]
63+
) -> Tuple[CatalogInfoClass, PixelInputTypes, JoinPixelInputTypes]: # type: ignore[override]
6464
args = super()._read_args(catalog_base_dir, storage_options=storage_options)
6565
partition_join_info_file = paths.get_partition_join_info_pointer(catalog_base_dir)
6666
partition_join_info = PartitionJoinInfo.read_from_file(
67-
partition_join_info_file,
68-
storage_options=storage_options
67+
partition_join_info_file, storage_options=storage_options
6968
)
7069
return args + (partition_join_info,)
70+
71+
@classmethod
72+
def _check_files_exist(cls, catalog_base_dir: FilePointer, storage_options: dict = None):
73+
super()._check_files_exist(catalog_base_dir, storage_options=storage_options)
74+
partition_join_info_file = paths.get_partition_join_info_pointer(catalog_base_dir)
75+
if not file_io.does_file_or_directory_exist(
76+
partition_join_info_file, storage_options=storage_options
77+
):
78+
raise FileNotFoundError(
79+
f"No partition join info found where expected: {str(partition_join_info_file)}"
80+
)

src/hipscat/catalog/catalog.py

Lines changed: 6 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,29 @@
11
"""Container class to hold catalog metadata and partition iteration"""
22
from __future__ import annotations
3-
from typing import List, Tuple, Union
43

54
import dataclasses
5+
from typing import List
66

77
import healpy as hp
88
import numpy as np
9-
import pandas as pd
10-
119
from typing_extensions import TypeAlias
10+
1211
from hipscat.catalog.catalog_info import CatalogInfo
1312
from hipscat.catalog.catalog_type import CatalogType
14-
from hipscat.catalog.dataset.dataset import Dataset
15-
from hipscat.catalog.partition_info import PartitionInfo
16-
from hipscat.io import FilePointer, file_io, paths
13+
from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset, PixelInputTypes
1714
from hipscat.pixel_math import HealpixPixel
1815
from hipscat.pixel_math.cone_filter import filter_pixels_by_cone
1916
from hipscat.pixel_tree.pixel_node_type import PixelNodeType
20-
from hipscat.pixel_tree.pixel_tree import PixelTree
21-
from hipscat.pixel_tree.pixel_tree_builder import PixelTreeBuilder
2217

2318

24-
class Catalog(Dataset):
19+
class Catalog(HealpixDataset):
2520
"""A HiPSCat Catalog with data stored in a HEALPix Hive partitioned structure
2621
2722
Catalogs of this type are partitioned spatially, contain `partition_info` metadata specifying
2823
the pixels in Catalog, and on disk conform to the parquet partitioning structure
2924
`Norder=/Dir=/Npix=.parquet`
3025
"""
3126

32-
PixelInputTypes = Union[pd.DataFrame, PartitionInfo, PixelTree, List[HealpixPixel]]
3327
HIPS_CATALOG_TYPES = [CatalogType.OBJECT, CatalogType.SOURCE, CatalogType.MARGIN]
3428

3529
# Update CatalogInfoClass, used to check if the catalog_info is the correct type, and
@@ -42,7 +36,7 @@ def __init__(
4236
catalog_info: CatalogInfoClass,
4337
pixels: PixelInputTypes,
4438
catalog_path: str = None,
45-
storage_options: dict = None
39+
storage_options: dict = None,
4640
) -> None:
4741
"""Initializes a Catalog
4842
@@ -60,76 +54,7 @@ def __init__(
6054
f"Catalog info `catalog_type` must be one of "
6155
f"{', '.join([t.value for t in self.HIPS_CATALOG_TYPES])}"
6256
)
63-
super().__init__(catalog_info, catalog_path, storage_options)
64-
self.partition_info = self._get_partition_info_from_pixels(pixels)
65-
self.pixel_tree = self._get_pixel_tree_from_pixels(pixels)
66-
67-
@staticmethod
68-
def _get_partition_info_from_pixels(pixels: PixelInputTypes) -> PartitionInfo:
69-
if isinstance(pixels, PartitionInfo):
70-
return pixels
71-
if isinstance(pixels, pd.DataFrame):
72-
return PartitionInfo(pixels)
73-
if isinstance(pixels, PixelTree):
74-
return PartitionInfo.from_healpix(
75-
[
76-
HealpixPixel(node.hp_order, node.hp_pixel)
77-
for node in pixels.root_pixel.get_all_leaf_descendants()
78-
]
79-
)
80-
if pd.api.types.is_list_like(pixels):
81-
return PartitionInfo.from_healpix(pixels)
82-
raise TypeError("Pixels must be of type PartitionInfo, Dataframe, PixelTree, or List[HealpixPixel]")
83-
84-
@staticmethod
85-
def _get_pixel_tree_from_pixels(pixels: PixelInputTypes) -> PixelTree:
86-
if isinstance(pixels, PartitionInfo):
87-
return PixelTreeBuilder.from_partition_info_df(pixels.data_frame)
88-
if isinstance(pixels, pd.DataFrame):
89-
return PixelTreeBuilder.from_partition_info_df(pixels)
90-
if isinstance(pixels, PixelTree):
91-
return pixels
92-
if pd.api.types.is_list_like(pixels):
93-
return PixelTreeBuilder.from_healpix(pixels)
94-
raise TypeError("Pixels must be of type PartitionInfo, Dataframe, PixelTree, or List[HealpixPixel]")
95-
96-
def get_pixels(self):
97-
"""Get all healpix pixels that are contained in the catalog
98-
99-
Returns:
100-
data frame with per-pixel data.
101-
102-
The data frame contains the following columns:
103-
104-
- order: order of the destination pixel
105-
- pixel: pixel number *at the above order*
106-
- num_objects: the number of rows in the pixel's partition
107-
"""
108-
return self.partition_info.data_frame
109-
110-
def get_healpix_pixels(self) -> List[HealpixPixel]:
111-
"""Get healpix pixel objects for all pixels contained in the catalog.
112-
113-
Returns:
114-
List of HealpixPixel
115-
"""
116-
return self.partition_info.get_healpix_pixels()
117-
118-
@classmethod
119-
def _read_args(
120-
cls, catalog_base_dir: FilePointer, storage_options: dict = None
121-
) -> Tuple[CatalogInfoClass, PartitionInfo]:
122-
args = super()._read_args(catalog_base_dir, storage_options=storage_options)
123-
partition_info_file = paths.get_partition_info_pointer(catalog_base_dir)
124-
partition_info = PartitionInfo.read_from_file(partition_info_file, storage_options=storage_options)
125-
return args + (partition_info,)
126-
127-
@classmethod
128-
def _check_files_exist(cls, catalog_base_dir: FilePointer, storage_options: dict = None):
129-
super()._check_files_exist(catalog_base_dir, storage_options=storage_options)
130-
partition_info_file = paths.get_partition_info_pointer(catalog_base_dir)
131-
if not file_io.does_file_or_directory_exist(partition_info_file, storage_options=storage_options):
132-
raise FileNotFoundError(f"No partition info found where expected: {str(partition_info_file)}")
57+
super().__init__(catalog_info, pixels, catalog_path, storage_options)
13358

13459
def filter_by_cone(self, ra: float, dec: float, radius: float) -> Catalog:
13560
"""Filter the pixels in the catalog to only include the pixels that overlap with a cone

src/hipscat/catalog/dataset/dataset.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,7 @@ class Dataset:
1818
CatalogInfoClass: TypeAlias = BaseCatalogInfo
1919

2020
def __init__(
21-
self,
22-
catalog_info: CatalogInfoClass,
23-
catalog_path=None,
24-
storage_options: dict = None
21+
self, catalog_info: CatalogInfoClass, catalog_path=None, storage_options: dict = None
2522
) -> None:
2623
"""Initializes a Dataset
2724

src/hipscat/catalog/healpix_dataset/__init__.py

Whitespace-only changes.
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
from typing import List, Tuple, Union
2+
3+
import pandas as pd
4+
from typing_extensions import TypeAlias
5+
6+
from hipscat.catalog.dataset import BaseCatalogInfo, Dataset
7+
from hipscat.catalog.partition_info import PartitionInfo
8+
from hipscat.io import FilePointer, file_io, paths
9+
from hipscat.pixel_math import HealpixPixel
10+
from hipscat.pixel_tree.pixel_tree import PixelTree
11+
from hipscat.pixel_tree.pixel_tree_builder import PixelTreeBuilder
12+
13+
PixelInputTypes = Union[pd.DataFrame, PartitionInfo, PixelTree, List[HealpixPixel]]
14+
15+
16+
class HealpixDataset(Dataset):
17+
"""A HiPSCat dataset partitioned with a HEALPix partitioning structure.
18+
19+
Catalogs of this type are partitioned based on the ra and dec of the points with each partition
20+
containing points within a given HEALPix pixel. The files are in the form 'Norder=/Dir=/Npix=.parquet'.
21+
"""
22+
23+
CatalogInfoClass: TypeAlias = BaseCatalogInfo
24+
catalog_info: CatalogInfoClass
25+
26+
def __init__(
27+
self,
28+
catalog_info: CatalogInfoClass,
29+
pixels: PixelInputTypes,
30+
catalog_path: str = None,
31+
storage_options: dict = None,
32+
) -> None:
33+
"""Initializes a HealpixDataset
34+
35+
Args:
36+
catalog_info: CatalogInfo object with catalog metadata
37+
pixels: Specifies the pixels contained in the catalog. Can be either a Dataframe with
38+
columns `Norder`, `Dir`, and `Npix` matching a `partition_info.csv` file, a
39+
`PartitionInfo` object, a `PixelTree` object, or a list of `HealpixPixel` objects
40+
catalog_path: If the catalog is stored on disk, specify the location of the catalog
41+
This does not load the catalog from the path; the path is only stored as metadata
42+
storage_options: dictionary that contains abstract filesystem credentials
43+
"""
44+
super().__init__(catalog_info, catalog_path, storage_options)
45+
self.partition_info = self._get_partition_info_from_pixels(pixels)
46+
self.pixel_tree = self._get_pixel_tree_from_pixels(pixels)
47+
48+
def get_pixels(self):
49+
"""Get all healpix pixels that are contained in the catalog
50+
51+
Returns:
52+
data frame with per-pixel data.
53+
54+
The data frame contains the following columns:
55+
56+
- order: order of the destination pixel
57+
- pixel: pixel number *at the above order*
58+
- num_objects: the number of rows in the pixel's partition
59+
"""
60+
return self.partition_info.data_frame
61+
62+
def get_healpix_pixels(self) -> List[HealpixPixel]:
63+
"""Get healpix pixel objects for all pixels contained in the catalog.
64+
65+
Returns:
66+
List of HealpixPixel
67+
"""
68+
return self.partition_info.get_healpix_pixels()
69+
70+
@staticmethod
71+
def _get_partition_info_from_pixels(pixels: PixelInputTypes) -> PartitionInfo:
72+
if isinstance(pixels, PartitionInfo):
73+
return pixels
74+
if isinstance(pixels, pd.DataFrame):
75+
return PartitionInfo(pixels)
76+
if isinstance(pixels, PixelTree):
77+
return PartitionInfo.from_healpix(
78+
[
79+
HealpixPixel(node.hp_order, node.hp_pixel)
80+
for node in pixels.root_pixel.get_all_leaf_descendants()
81+
]
82+
)
83+
if pd.api.types.is_list_like(pixels):
84+
return PartitionInfo.from_healpix(pixels)
85+
raise TypeError("Pixels must be of type PartitionInfo, Dataframe, PixelTree, or List[HealpixPixel]")
86+
87+
@staticmethod
88+
def _get_pixel_tree_from_pixels(pixels: PixelInputTypes) -> PixelTree:
89+
if isinstance(pixels, PartitionInfo):
90+
return PixelTreeBuilder.from_partition_info_df(pixels.data_frame)
91+
if isinstance(pixels, pd.DataFrame):
92+
return PixelTreeBuilder.from_partition_info_df(pixels)
93+
if isinstance(pixels, PixelTree):
94+
return pixels
95+
if pd.api.types.is_list_like(pixels):
96+
return PixelTreeBuilder.from_healpix(pixels)
97+
raise TypeError("Pixels must be of type PartitionInfo, Dataframe, PixelTree, or List[HealpixPixel]")
98+
99+
@classmethod
100+
def _read_args(
101+
cls, catalog_base_dir: FilePointer, storage_options: dict = None
102+
) -> Tuple[CatalogInfoClass, PartitionInfo]:
103+
args = super()._read_args(catalog_base_dir, storage_options=storage_options)
104+
partition_info_file = paths.get_partition_info_pointer(catalog_base_dir)
105+
partition_info = PartitionInfo.read_from_file(partition_info_file, storage_options=storage_options)
106+
return args + (partition_info,)
107+
108+
@classmethod
109+
def _check_files_exist(cls, catalog_base_dir: FilePointer, storage_options: dict = None):
110+
super()._check_files_exist(catalog_base_dir, storage_options=storage_options)
111+
partition_info_file = paths.get_partition_info_pointer(catalog_base_dir)
112+
if not file_io.does_file_or_directory_exist(partition_info_file, storage_options=storage_options):
113+
raise FileNotFoundError(f"No partition info found where expected: {str(partition_info_file)}")

src/hipscat/inspection/almanac.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
from __future__ import annotations
2-
from typing import List
32

43
import os
54
import warnings
5+
from typing import List
6+
67
import pandas as pd
78

89
from hipscat.catalog.catalog import CatalogType
910
from hipscat.catalog.dataset.dataset import Dataset
1011
from hipscat.inspection.almanac_info import AlmanacInfo
1112
from hipscat.io.file_io import file_pointer
1213

14+
1315
class Almanac:
1416
"""Single instance of an almanac, and available catalogs within namespaces
1517

src/hipscat/inspection/almanac_info.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from __future__ import annotations
2-
from typing import List
32

43
import dataclasses
54
import os
65
from dataclasses import dataclass, field
7-
import yaml
6+
from typing import List
87

8+
import yaml
99
from typing_extensions import Self
1010

1111
from hipscat.catalog.dataset import catalog_info_factory

src/hipscat/io/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
get_partition_info_pointer,
1111
get_point_map_file_pointer,
1212
get_provenance_pointer,
13-
pixel_association_directory,
14-
pixel_association_file,
1513
pixel_catalog_file,
1614
pixel_directory,
1715
)

0 commit comments

Comments
 (0)