Skip to content

add generic Dataset easyblock #3246

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions easybuild/easyblocks/generic/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
##
# Copyright 2009-2025 Ghent University
#
# This file is part of EasyBuild,
# originally created by the HPC team of Ghent University (http://ugent.be/hpc/en),
# with support of Ghent University (http://ugent.be/hpc),
# the Flemish Supercomputer Centre (VSC) (https://www.vscentrum.be),
# Flemish Research Foundation (FWO) (http://www.fwo.be/en)
# and the Department of Economy, Science and Innovation (EWI) (http://www.ewi-vlaanderen.be/en).
#
# https://github.com/easybuilders/easybuild
#
# EasyBuild is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation v2.
#
# EasyBuild is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with EasyBuild. If not, see <http://www.gnu.org/licenses/>.
##
"""
EasyBuild support for installing datasets

@author: Samuel Moors (Vrije Universiteit Brussel)
"""
import os

from easybuild.framework.easyblock import EasyBlock
from easybuild.easyblocks.generic.binary import Binary
from easybuild.framework.easyconfig.default import CUSTOM
from easybuild.tools.build_log import EasyBuildError
from easybuild.tools.filetools import compute_checksum, create_index, is_readable, mkdir, move_file, remove_file
from easybuild.tools.filetools import symlink
from easybuild.tools.utilities import trace_msg


class Dataset(Binary):
    """
    Support for installing datasets.

    Data sources are extracted/copied directly into the installation directory. Afterwards every file is
    moved into a checksum-addressed 'object_storage' directory and replaced by a relative symlink,
    so that identical files can be reused by different versions of the same dataset.
    """

    @staticmethod
    def extra_options(extra_vars=None):
        """
        Extra easyconfig parameters specific to the Dataset easyblock.

        :param extra_vars: optional extra easyconfig parameters, passed on to EasyBlock.extra_options
        :return: result of EasyBlock.extra_options, updated with the Dataset-specific parameters
        """
        # NOTE(review): this calls EasyBlock.extra_options rather than Binary.extra_options, so custom
        # parameters defined by Binary are presumably not registered for this easyblock - confirm intended
        extra_vars = EasyBlock.extra_options(extra_vars)
        extra_vars.update({
            'extract_sources': [True, "Whether or not to extract data sources", CUSTOM],
            'data_install_path': [None, "Custom installation path for datasets", CUSTOM],
            'cleanup_data_sources': [False, "Whether or not to delete the data sources after installation", CUSTOM],
        })
        return extra_vars

    def __init__(self, *args, **kwargs):
        """
        Initialize Dataset-specific variables.

        :raises EasyBuildError: if the 'sources' easyconfig parameter is set ('data_sources' must be used instead)
        """
        super().__init__(*args, **kwargs)

        if self.cfg['sources']:
            raise EasyBuildError(
                "Easyconfig parameter 'sources' is not supported for this EasyBlock. Use 'data_sources' instead.")

        # a custom installation path, when specified, replaces the installation directory set by the parent class
        if self.cfg['data_install_path']:
            self.installdir = self.cfg['data_install_path']

        # extract/copy sources directly into installation directory
        self.build_in_installdir = True

    def install_step(self):
        """No install step, datasets are extracted directly into installdir"""

    def post_processing_step(self):
        """
        Add files to object_storage, remove duplicates, add symlinks.

        Every file indexed under the current working directory is moved to a location in the object storage
        that is derived from its SHA256 checksum (or just removed if that location is already populated),
        and a relative symlink to the stored object is created in its place.
        """
        # NOTE(review): the post-processing step of the parent class is not called here - confirm intended
        trace_msg('adding files to object_storage...')

        # creating object storage at root of software name to reuse identical files in different versions
        # NOTE(review): both paths are relative to the current working directory,
        # which is presumably the installation directory at this point - confirm
        object_storage = os.path.join(os.pardir, 'object_storage')
        datafiles = create_index(os.curdir)

        for datafile in datafiles:
            cks = compute_checksum(datafile, checksum_type='sha256')
            # using puppet-style object store, for example this checksum:
            # 00b68cbca8fe75a121e857359191f481d2e1262ce7c9998e9980fdb35c144733
            # is stored at:
            # 0/0/b/6/8/c/b/c/00b68cbca8fe75a121e857359191f481d2e1262ce7c9998e9980fdb35c144733
            objstor_file = os.path.join(object_storage, os.sep.join(cks[:8]), cks)
            mkdir(os.path.dirname(objstor_file), parents=True)
            if is_readable(objstor_file):
                # an object with this checksum is already stored, so this copy is a duplicate
                remove_file(datafile)
            else:
                move_file(datafile, objstor_file)

            # use relative paths for symlinks to easily relocate data installations later on if needed;
            # a relative symlink target is resolved against the directory that contains the symlink
            # (not the current working directory), so the target must be expressed relative to the
            # directory of the data file, otherwise symlinks for files in subdirectories are dangling
            link_target = os.path.relpath(objstor_file, os.path.dirname(datafile) or os.curdir)
            symlink(link_target, datafile, use_abspath_source=False)
            self.log.debug(f"Created symlink {datafile} to {objstor_file}")

    def cleanup_step(self):
        """
        Cleanup sources after installation.

        If 'cleanup_data_sources' is enabled, the file at the 'path' of every entry in self.src is removed
        before the regular cleanup of the parent class is run.
        """
        if self.cfg['cleanup_data_sources']:
            for src in self.src:
                self.log.info(f"Removing data source {src['name']}")
                remove_file(src['path'])
        super().cleanup_step()