Skip to content

Commit aa044df

Browse files
authored
Merge pull request #3246 from smoors/20240303140713_new_pr_dataset
add generic Dataset easyblock
2 parents 10c5110 + 377dabc commit aa044df

File tree

1 file changed

+103
-0
lines changed

1 file changed

+103
-0
lines changed
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
##
2+
# Copyright 2009-2025 Ghent University
3+
#
4+
# This file is part of EasyBuild,
5+
# originally created by the HPC team of Ghent University (http://ugent.be/hpc/en),
6+
# with support of Ghent University (http://ugent.be/hpc),
7+
# the Flemish Supercomputer Centre (VSC) (https://www.vscentrum.be),
8+
# Flemish Research Foundation (FWO) (http://www.fwo.be/en)
9+
# and the Department of Economy, Science and Innovation (EWI) (http://www.ewi-vlaanderen.be/en).
10+
#
11+
# https://github.com/easybuilders/easybuild
12+
#
13+
# EasyBuild is free software: you can redistribute it and/or modify
14+
# it under the terms of the GNU General Public License as published by
15+
# the Free Software Foundation v2.
16+
#
17+
# EasyBuild is distributed in the hope that it will be useful,
18+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
19+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20+
# GNU General Public License for more details.
21+
#
22+
# You should have received a copy of the GNU General Public License
23+
# along with EasyBuild. If not, see <http://www.gnu.org/licenses/>.
24+
##
25+
"""
26+
EasyBuild support for installing datasets
27+
28+
@author: Samuel Moors (Vrije Universiteit Brussel)
29+
"""
30+
import os
31+
32+
from easybuild.framework.easyblock import EasyBlock
33+
from easybuild.easyblocks.generic.binary import Binary
34+
from easybuild.framework.easyconfig.default import CUSTOM
35+
from easybuild.tools.build_log import EasyBuildError
36+
from easybuild.tools.filetools import compute_checksum, create_index, is_readable, mkdir, move_file, remove_file
37+
from easybuild.tools.filetools import symlink
38+
from easybuild.tools.utilities import trace_msg
39+
40+
41+
class Dataset(Binary):
    """Support for installing datasets.

    Data sources are extracted/copied directly into the installation directory. During
    post-processing every file is moved into a checksum-addressed 'object_storage' directory
    and replaced by a relative symlink, so identical files are stored only once.
    """

    @staticmethod
    def extra_options(extra_vars=None):
        """Extra easyconfig parameters specific to the Dataset easyblock.

        :param extra_vars: extra custom easyconfig parameters, passed through to EasyBlock.extra_options
        :return: result of EasyBlock.extra_options, updated with the Dataset-specific parameters
        """
        # NOTE(review): EasyBlock.extra_options is called here rather than Binary.extra_options,
        # so custom parameters defined by Binary are presumably not inherited on purpose -- confirm
        extra_vars = EasyBlock.extra_options(extra_vars)
        extra_vars.update({
            'extract_sources': [True, "Whether or not to extract data sources", CUSTOM],
            'data_install_path': [None, "Custom installation path for datasets", CUSTOM],
            'cleanup_data_sources': [False, "Whether or not to delete the data sources after installation", CUSTOM]
        })
        return extra_vars

    def __init__(self, *args, **kwargs):
        """Initialize Dataset-specific variables.

        :raises EasyBuildError: if the 'sources' easyconfig parameter is set
        """
        super().__init__(*args, **kwargs)

        if self.cfg['sources']:
            raise EasyBuildError(
                "Easyconfig parameter 'sources' is not supported for this EasyBlock. Use 'data_sources' instead.")

        # a custom installation path, when given, overrides the installation directory
        if self.cfg['data_install_path']:
            self.installdir = self.cfg['data_install_path']

        # extract/copy sources directly into installation directory
        self.build_in_installdir = True

    def install_step(self):
        """No install step, datasets are extracted directly into installdir"""

    def post_processing_step(self):
        """Add files to object_storage, remove duplicates, add symlinks.

        Every file found below the current working directory is stored under its SHA256 checksum
        in the object storage (unless an identical object is already there) and replaced by a
        relative symlink pointing to the stored object.
        """
        trace_msg('adding files to object_storage...')

        # creating object storage at root of software name to reuse identical files in different versions
        # NOTE(review): all paths below are relative to the current working directory, which is
        # presumably the installation directory at this point -- confirm
        object_storage = os.path.join(os.pardir, 'object_storage')
        datafiles = create_index(os.curdir)

        for datafile in datafiles:
            cks = compute_checksum(datafile, checksum_type='sha256')
            # using puppet-style object store, for example this checksum:
            # 00b68cbca8fe75a121e857359191f481d2e1262ce7c9998e9980fdb35c144733
            # is stored at:
            # 0/0/b/6/8/c/b/c/00b68cbca8fe75a121e857359191f481d2e1262ce7c9998e9980fdb35c144733
            objstor_file = os.path.join(object_storage, os.sep.join(cks[:8]), cks)
            mkdir(os.path.dirname(objstor_file), parents=True)
            if is_readable(objstor_file):
                # identical content is already stored, only the symlink is needed
                remove_file(datafile)
            else:
                move_file(datafile, objstor_file)
            # use relative paths for symlinks to easily relocate data installations later on if needed;
            # a relative symlink target is resolved against the directory that holds the symlink
            # (not against the current working directory), so for files in subdirectories
            # the target must be expressed relative to the directory of the data file
            link_target = os.path.relpath(objstor_file, os.path.dirname(datafile) or os.curdir)
            symlink(link_target, datafile, use_abspath_source=False)
            self.log.debug(f"Created symlink {datafile} to {link_target}")

    def cleanup_step(self):
        """Cleanup sources after installation.

        Removes the file at the 'path' of every entry in self.src when 'cleanup_data_sources'
        is enabled, then runs the regular cleanup step.
        """
        if self.cfg['cleanup_data_sources']:
            for src in self.src:
                self.log.info(f"Removing data source {src['name']}")
                remove_file(src['path'])
        super().cleanup_step()

0 commit comments

Comments
 (0)