Skip to content

Commit 0cd1e6c

Browse files
authored
Merge pull request #7 from lincc-frameworks/downloader-prototype
Downloader prototype and CLI rework
2 parents 2d5c9a5 + e64841a commit 0cd1e6c

File tree

10 files changed

+1707
-72
lines changed

10 files changed

+1707
-72
lines changed

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,11 @@ classifiers = [
1616
dynamic = ["version"]
1717
requires-python = ">=3.9"
1818
dependencies = [
19+
"astropy" # Used to load fits files of sources to query HSC cutout server
1920
]
2021

2122
[project.scripts]
2223
fibad = "fibad_cli.main:main"
23-
fibad-train = "fibad_cli.train:main"
24-
fibad-predict = "fibad_cli.predict:main"
2524

2625
[project.urls]
2726
"Source Code" = "https://github.com/lincc-frameworks/fibad"

src/fibad/download.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
import contextlib
2+
import os
3+
from pathlib import Path
4+
from typing import Union
5+
6+
from astropy.table import Table, hstack
7+
8+
import fibad.downloadCutout.downloadCutout as dC
9+
10+
# Catalog columns whose values are allowed to differ from one sky location
# to the next in the input fits file. Every other value the HSC cutout
# server needs has to be supplied through config.
#
# The order is deliberate: this list doubles as the sort order applied to
# the locations, which is meant to optimize queries to the cutout server.
variable_fields = ["tract", "ra", "dec"]
17+
18+
19+
@contextlib.contextmanager
def working_directory(path: Path):
    """
    Temporarily make ``path`` the current working directory.

    The directory that was current on entry is restored when the ``with``
    block finishes, whether it exits normally or through an exception.

    Needed because downloadCutouts always writes to the cwd.
    """
    original_dir = Path.cwd()
    os.chdir(path)
    try:
        yield
    finally:
        # Runs on every exit path, so the caller's cwd is always put back.
        os.chdir(original_dir)
32+
33+
34+
def run(args, config):
    """
    Entry point of the download command: fetch HSC cutouts for use with fibad.

    ``args`` is accepted but not read by this function. ``config`` is the
    overall configuration mapping; only its "download" section is used, and
    that section must provide "username" and "password".
    """
    download_config = config.get("download", {})

    print("Download command")

    # Load only the catalog columns needed to request cutouts.
    wanted_columns = ["object_id"] + variable_fields
    locations = filterfits(download_config.get("fits_file"), wanted_columns)

    # Ordering by tract, ra, dec is meant to speed up how fast the cutout
    # server can serve us.
    #
    # TODO: See if this sort is performed by downloadCutouts
    # It appears downloadCutouts is doing some sorting prior to download, but
    # unclear if it is the same sort
    locations.sort(variable_fields)

    # TODO slice up the locations
    # For now only the first ten sorted locations are downloaded.
    locations = locations[0:10]

    # One Rect per location, each pre-filled from the config-wide prototype.
    prototype = rect_from_config(download_config)
    rects = create_rects(locations, offset=0, default=prototype)

    # Configure global parameters for the downloader
    max_connections = download_config.get("max_connections", 2)
    dC.set_max_connections(num=max_connections)

    # Hand the rects over to the cutout downloader.
    download_cutout_group(
        rects=rects,
        cutout_dir=download_config.get("cutout_dir"),
        user=download_config["username"],
        password=download_config["password"],
    )

    print(locations)
69+
70+
71+
# TODO add error checking
72+
def filterfits(filename: str, column_names: list[str]) -> Table:
    """
    Load a fits catalog and keep only the columns needed for making cutouts.

    ``filename`` is read with ``Table.read``; each name in ``column_names``
    is looked up in the resulting table, and the selected columns are
    stacked side by side into the astropy table that is returned.

    The easiest way to produce a suitable input file is to select from the
    main HSC catalog.
    """
    catalog = Table.read(filename)

    selected = []
    for name in column_names:
        selected.append(catalog[name])

    # NOTE(review): uniq_col_name/table_names presumably keep every output
    # column named after its source column -- confirm against astropy's hstack.
    return hstack(selected, uniq_col_name="{table_name}", table_names=column_names)
83+
84+
85+
def rect_from_config(config: dict) -> dC.Rect:
    """
    Build the prototypical Rect holding the cutout settings shared by all
    downloads.

    The "sw", "sh", "filter", "rerun" and "type" entries are read from our
    Download config and passed to ``dC.Rect.create``. All five are required;
    a missing entry raises ``KeyError``.
    """
    common_keys = ("sw", "sh", "filter", "rerun", "type")
    common_fields = {key: config[key] for key in common_keys}
    return dC.Rect.create(**common_fields)
97+
98+
99+
def create_rects(locations: Table, offset: int = 0, default: dC.Rect = None) -> list[dC.Rect]:
    """
    Build the list of Rects handed to the downloader, one per sky location.

    Every Rect starts from ``default`` (all fields of the default rect are
    pre-filled) and then receives the per-location values named in
    ``variable_fields`` plus a ``lineno``.

    ``offset`` lets several downloads work on different sections of the
    source list without clobbering each other's file names during the
    download phase. It is intended to be the index at which ``locations``
    starts within some larger fits file.
    """

    def rect_for(position: int, row) -> dC.Rect:
        fields = {name: row[name] for name in variable_fields}
        fields["lineno"] = position + offset
        # The tract is handed to Rect.create as a string.
        fields["tract"] = str(fields["tract"])
        return dC.Rect.create(default=default, **fields)

    return [rect_for(position, row) for position, row in enumerate(locations)]
119+
120+
121+
def download_cutout_group(rects: list[dC.Rect], cutout_dir: Union[str, Path], user, password):
    """
    Download the cutouts described by ``rects`` into ``cutout_dir``.

    The working directory is switched to ``cutout_dir`` for the duration of
    the call, because the downloader writes to the cwd, and is restored
    afterwards. The whole list of rects is passed to a single
    ``downloadCutout.download`` call, authenticated with ``user`` and
    ``password``.
    """
    target_dir = Path(cutout_dir)
    with working_directory(target_dir):
        # NOTE(review): onmemory=False presumably makes the downloader write
        # files rather than return the data in memory -- confirm.
        dC.download(rects, user=user, password=password, onmemory=False)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# HSC Data download tool.
2+
This tool was downloaded from the [ssp-software/data-access-tools](https://hsc-gitlab.mtk.nao.ac.jp/ssp-software/data-access-tools/) GitLab repository.
3+
4+
This directory was initialized with a copy of the `pdr3/downloadCutout` directory at rev b628d6089acda041eea1041d1011ea154ebefc28 committed Feb 14 2024.

src/fibad/downloadCutout/README.md

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
downloadCutout.py
2+
==============================================================================
3+
4+
Download FITS cutouts from the website of HSC data release.
5+
6+
Requirements
7+
------------------------------------------------------------------------------
8+
9+
python >= 3.7
10+
11+
Usage
12+
------------------------------------------------------------------------------
13+
14+
### Download images of all bands at a location
15+
16+
```
17+
python3 downloadCutout.py --ra=222.222 --dec=44.444 --sw=0.5arcmin --sh=0.5arcmin --name="cutout-{filter}"
18+
```
19+
20+
Note that `{filter}` must appear in `--name`.
21+
Otherwise, the five images of the five bands will be written
22+
to a single file over and over.
23+
24+
### Use coordinate list
25+
26+
You can feed a coordinate list that is in nearly the same format as
27+
https://hsc-release.mtk.nao.ac.jp/das_cutout/pdr3/manual.html#list-to-upload
28+
29+
There are a few differences:
30+
31+
- Comments must not appear,
32+
except for the mandatory one on the first line.
33+
34+
- You can use "all" as a value of "filter" field.
35+
36+
- There may be columns with unrecognised names,
37+
which are silently ignored.
38+
39+
It is permissible for the coordinate list to contain only coordinates.
40+
For example:
41+
42+
```
43+
#? ra dec
44+
222.222 44.444
45+
222.223 44.445
46+
222.224 44.446
47+
```
48+
49+
In this case, you have to specify other fields via the command line:
50+
51+
```
52+
python3 downloadCutout.py \
53+
--sw=5arcsec --sh=5arcsec \
54+
--image=true --variance=true --mask=true \
55+
--name="cutout_{tract}_{ra}_{dec}_{filter}" \
56+
--list=coordlist.txt # <- the name of the above list
57+
```
58+
59+
It is more efficient to use a list like the example above
60+
than to use a for-loop to call the script iteratively.
61+
62+
### Stop asking for a password
63+
64+
To stop the script from asking for your password, put the password
65+
into an environment variable. (Default: `HSC_SSP_CAS_PASSWORD`)
66+
67+
```
68+
read -s HSC_SSP_CAS_PASSWORD
69+
export HSC_SSP_CAS_PASSWORD
70+
```
71+
72+
Then, run the script with the `--user` option:
73+
74+
```
75+
python3 downloadCutout.py \
76+
--ra=222.222 --dec=44.444 --sw=0.5arcmin --sh=0.5arcmin \
77+
--name="cutout-{filter}" \
78+
--user=USERNAME
79+
```
80+
81+
If you are using your own personal laptop or desktop,
82+
you may pass your password through the `--password` option.
83+
But you must never do so
84+
if other people use the same computer.
85+
Remember that other people can see your command lines
86+
with, for example, the `top` command.
87+
(If it is GNU's `top`, press `C` key to see others' command lines).
88+
89+
### Synchronize processes
90+
91+
If you run a program in parallel which calls `downloadCutout.py` sporadically
92+
but frequently, the program needs synchronizing---the server refuses
93+
`downloadCutout.py` if many instances of it are run at the same time.
94+
95+
If your program does not have a synchronization mechanism,
96+
you can run `downloadCutout.py` with synchronization options:
97+
98+
```
99+
python3 downloadCutout.py .... \
100+
--semaphore=/home/yourname/semaphore --max-connections=4
101+
```
102+
103+
Because the processes synchronize with each other via the specified semaphore
104+
(this is not a posix semaphore but a hand-made semaphore-like object),
105+
the semaphore must be visible to all the processes.
106+
If the processes are distributed over a network,
107+
the semaphore must be placed in an NFS or any other shared filesystem.
108+
109+
Usage as a python module
110+
------------------------------------------------------------------------------
111+
112+
Here is an example:
113+
114+
```
115+
import downloadCutout
116+
117+
rect = downloadCutout.Rect.create(
118+
ra="11h11m11.111s",
119+
dec="-1d11m11.111s",
120+
sw="1arcmin",
121+
sh="1arcmin",
122+
)
123+
124+
images = downloadCutout.download(rect)
125+
126+
# Multiple images (of various filters) are returned.
127+
# We look into the first one of them.
128+
metadata, data = images[0]
129+
print(metadata)
130+
131+
# `data` is just the binary data of a FITS file.
132+
# You can use, for example, `astropy` to decode it.
133+
import io
134+
import astropy.io.fits
135+
hdus = astropy.io.fits.open(io.BytesIO(data))
136+
print(hdus)
137+
```

0 commit comments

Comments
 (0)