Skip to content

Commit 64c9622

Browse files
authored
Merge c43e96a into ac210ed
2 parents ac210ed + c43e96a commit 64c9622

File tree

4 files changed

+507
-1
lines changed

4 files changed

+507
-1
lines changed

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies = [
2222
"astropy>=7.0.0",
2323
"cdshealpix>=0.8.0",
2424
"fsspec>=2023.10.0", # Used for abstract filesystems
25+
"jinja2>=3.0.0", # Used for summary file templating
2526
"jproperties>=2.0.0",
2627
"mocpy>=0.19.0",
2728
"nested-pandas>=0.6.7,<0.7.0",
@@ -55,6 +56,7 @@ dev = [
5556
"pytest-cov", # Used to report total code coverage
5657
"pytest-mock", # Used to mock objects in tests
5758
"pytest-timeout", # Used to test for code efficiency
59+
"PyYAML", # Used to test Hugging-Face YAML metadata generation
5860
]
5961
plotting = [
6062
"matplotlib>=3.10.1",

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
git+https://github.com/lincc-frameworks/nested-pandas.git@main
1+
git+https://github.com/lincc-frameworks/nested-pandas.git@main
2+
datasets

src/hats/io/summary_file.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
from pathlib import Path
2+
from typing import Literal
3+
4+
import jinja2
5+
from upath import UPath
6+
7+
from hats.catalog.catalog_collection import CatalogCollection
8+
from hats.io.file_io import get_upath
9+
from hats.loaders.read_hats import read_hats
10+
11+
12+
def write_collection_summary_file(
    collection_path: str | Path | UPath,
    *,
    fmt: Literal["markdown"],
    filename: str | None = None,
    title: str | None = None,
    description: str | None = None,
    huggingface_metadata: bool = False,
    jinja2_template: str | None = None,
) -> UPath:
    """Write a summary readme file for a HATS collection.

    Parameters
    ----------
    collection_path : str | Path | UPath
        The path to the HATS collection. The summary file is written
        inside this directory.
    fmt : str
        The format of the summary file. Currently only "markdown" is supported.
    filename : str | None, default=None
        The name of the summary file. If None, the default depends on `fmt`:
        - "README.md" for "markdown" format.
    title : str | None, default=None
        Title of the summary document. By default, generated based on the
        collection name. This default is subject to frequent changes, do not
        rely on it.
    description : str | None, default=None
        Description of the collection. By default, generated based on the
        collection name. This default is subject to frequent changes, do not
        rely on it.
    huggingface_metadata : bool, default=False
        Whether to include a Hugging Face specific metadata header in
        the Markdown file. Supported only when `fmt="markdown"`.
    jinja2_template : str | None, default=None
        `jinja2` template string to use for generating the summary file.
        If provided, it overrides the default template:
        - `DEFAULT_MD_TEMPLATE` for `fmt="markdown"`.

    Returns
    -------
    UPath
        The path to the written summary file.

    Raises
    ------
    ValueError
        If `fmt` is not a supported format, if `huggingface_metadata=True`
        is combined with a format other than "markdown", or if
        `collection_path` does not point to a HATS collection.

    Notes
    -----

    1. Not all options are supported for all formats.
    2. Default template is subject to frequent changes, do not rely on it.
    """
    # Validate the cheap arguments first so that a bad `fmt` fails before
    # any catalog metadata is read from (possibly remote) storage.
    if fmt != "markdown":
        if huggingface_metadata:
            raise ValueError("`huggingface_metadata=True` is supported only for `fmt='markdown'`")
        raise ValueError(f"Unsupported format: {fmt=}")

    collection_path = get_upath(collection_path)

    collection = read_hats(collection_path)
    if not isinstance(collection, CatalogCollection):
        raise ValueError(
            f"The provided path '{collection_path}' contains a HATS catalog, but not a collection."
        )

    name = collection.collection_properties.name
    if title is None:
        title = f"{name} HATS catalog"
    if description is None:
        # Should be extended in the future to include more details.
        description = f"This is the `{name}` HATS collection."

    # Only "markdown" can reach this point; when a new format is added,
    # dispatch on `fmt` here for both the content and the default filename.
    content = generate_markdown_collection_summary(
        collection=collection,
        title=title,
        description=description,
        huggingface_metadata=huggingface_metadata,
        jinja2_template=jinja2_template,
    )
    if filename is None:
        filename = "README.md"

    output_path = collection_path / filename

    # Explicit UTF-8: titles/descriptions may contain non-ASCII text and the
    # platform default encoding is not guaranteed to be able to encode it.
    with output_path.open("w", encoding="utf-8") as f:
        f.write(content)

    return output_path
102+
103+
104+
# Default jinja2 template for the Markdown summary of a collection.
#
# Variables referenced by the template:
# - huggingface_metadata: when truthy, a YAML front-matter block is emitted
# - primary_table: used as config "default" (data_dir "<primary_table>/dataset")
# - all_margins, all_indexes: iterables; each item becomes its own config
# - title, description: rendered as the top-level heading and body text
#
# Should be extended in the future to include sections like:
# - Load code examples
# - File structure
# - Statistics
# - Column schema
# - Sky maps
# See https://github.com/astronomy-commons/hats/issues/615
DEFAULT_MD_TEMPLATE = """
{%- if huggingface_metadata %}
---
configs:
- config_name: default
  data_dir: {{primary_table}}/dataset
{%- for margin in all_margins %}
- config_name: {{margin}}
  data_dir: {{margin}}/dataset
{%- endfor %}
{%- for index in all_indexes %}
- config_name: {{index}}
  data_dir: {{index}}/dataset
{%- endfor %}
tags:
- astronomy
---
{%- endif %}

# {{title}}

{{description}}
"""
134+
135+
136+
def generate_markdown_collection_summary(
    collection: CatalogCollection,
    *,
    title: str,
    description: str,
    huggingface_metadata: bool,
    jinja2_template: str | None = None,
) -> str:
    """Render the Markdown summary text for a HATS collection.

    Parameters
    ----------
    collection : CatalogCollection
        The collection to summarize. Its `collection_properties` supply the
        primary table, margins and indexes passed to the template.
    title : str
        Title of the Markdown document.
    description : str
        Description of the collection.
    huggingface_metadata : bool
        Whether to include a Hugging Face specific metadata header in
        the Markdown file.
    jinja2_template : str | None, default=None
        `jinja2` template string to render. If None, `DEFAULT_MD_TEMPLATE`
        is used.

    Returns
    -------
    str
        The rendered summary document.

    Notes
    -----
    The template is rendered with the variables `title`, `description`,
    `primary_table`, `all_margins`, `all_indexes` and `huggingface_metadata`.
    The environment uses `jinja2.StrictUndefined`, so a custom template that
    references any other variable fails at render time instead of silently
    rendering an empty string.
    """
    properties = collection.collection_properties
    environment = jinja2.Environment(undefined=jinja2.StrictUndefined)
    source = DEFAULT_MD_TEMPLATE if jinja2_template is None else jinja2_template
    compiled = environment.from_string(source)

    # Missing margins/indexes are normalized to empty sequences so the
    # template loops simply produce nothing. Only the values of the
    # `all_indexes` mapping are exposed to the template.
    context = {
        "title": title,
        "description": description,
        "all_margins": properties.all_margins or [],
        "all_indexes": list((properties.all_indexes or {}).values()),
        "primary_table": properties.hats_primary_table_url,
        "huggingface_metadata": huggingface_metadata,
    }
    return compiled.render(**context)

0 commit comments

Comments
 (0)