Merge pull request #20 from boegel/build_lock

mboisson · web-flow · commit e39976b92684 · 2020-03-30T10:50:25.000-04:00
bug fix, code cleanup + dedicated test for installation lock
diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py
@@ -3039,41 +3039,54 @@ def run_all_steps(self, run_test_cases):
         print_msg("building and installing %s..." % self.full_mod_name, log=self.log, silent=self.silent)
         trace_msg("installation prefix: %s" % self.installdir)
 
-        lockpath = build_option('lockpath') or os.path.join(install_path('software'), '.locks')
-        if not os.path.exists(lockpath):
-            mkdir(lockpath, parents=True)
-        lockfile_name = os.path.join(lockpath, ".%s.lock" % self.installdir.replace('/', '_'))
-        if os.path.exists(lockfile_name):
-            if build_option('wait_on_lock'):
-                while os.path.exists(lockfile_name):
-                    print_msg("Lock file %s exists. Waiting 60 seconds." % lockfile_name, silent=self.silent)
-                    time.sleep(60)
-            else:
-                print_msg("Build aborted. Lock file %s exists." % lockfile_name, silent=self.silent)
-                return False
+        ignore_locks = build_option('ignore_locks')
+
+        if ignore_locks:
+            self.log.info("Ignoring locks...")
         else:
+            locks_dir = build_option('locks_dir') or os.path.join(install_path('software'), '.locks')
+            lock_path = os.path.join(locks_dir, '%s.lock' % self.installdir.replace('/', '_'))
+
+            # if lock already exists, either abort or wait until it disappears
+            if os.path.exists(lock_path):
+                wait_on_lock = build_option('wait_on_lock')
+                if wait_on_lock:
+                    while os.path.exists(lock_path):
+                        print_msg("lock %s exists, waiting %d seconds..." % (lock_path, wait_on_lock),
+                                  silent=self.silent)
+                        time.sleep(wait_on_lock)
+                else:
+                    raise EasyBuildError("Lock %s already exists, aborting!", lock_path)
+
+            # create lock to avoid that another installation running in parallel messes things up;
+            # we use a directory as a lock, since that's atomically created
             try:
-                # create a new lock file
-                print_msg("Creating lock file %s" % lockfile_name, silent=self.silent)
-                f = open(lockfile_name, "w+")
-                f.close()
-
-                for (step_name, descr, step_methods, skippable) in steps:
-                    if self._skip_step(step_name, skippable):
-                        print_msg("%s [skipped]" % descr, log=self.log, silent=self.silent)
+                mkdir(lock_path, parents=True)
+            except EasyBuildError as err:
+                # clean up the error message a bit, get rid of the "Failed to create directory" part + quotes
+                stripped_err = str(err).split(':', 1)[1].strip().replace("'", '').replace('"', '')
+                raise EasyBuildError("Failed to create lock %s: %s", lock_path, stripped_err)
+
+            self.log.info("Lock created: %s", lock_path)
+
+        try:
+            for (step_name, descr, step_methods, skippable) in steps:
+                if self._skip_step(step_name, skippable):
+                    print_msg("%s [skipped]" % descr, log=self.log, silent=self.silent)
+                else:
+                    if self.dry_run:
+                        self.dry_run_msg("%s... [DRY RUN]\n", descr)
                     else:
-                        if self.dry_run:
-                            self.dry_run_msg("%s... [DRY RUN]\n", descr)
-                        else:
-                            print_msg("%s..." % descr, log=self.log, silent=self.silent)
-                        self.current_step = step_name
-                        self.run_step(step_name, step_methods)
-
-            except StopException:
-                pass
-            finally:
-                print_msg("Removing lock file %s" % lockfile_name, silent=self.silent)
-                os.remove(lockfile_name)
+                        print_msg("%s..." % descr, log=self.log, silent=self.silent)
+                    self.current_step = step_name
+                    self.run_step(step_name, step_methods)
+
+        except StopException:
+            pass
+        finally:
+            if not ignore_locks:
+                remove_dir(lock_path)
+                self.log.info("Lock removed: %s", lock_path)
 
         # return True for successfull build (or stopped build)
         return True
diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py
@@ -185,7 +185,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
         'job_output_dir',
         'job_polling_interval',
         'job_target_resource',
-        'lockpath',
+        'locks_dir',
         'modules_footer',
         'modules_header',
         'mpi_cmd_template',
@@ -226,6 +226,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
         'group_writable_installdir',
         'hidden',
         'ignore_checksums',
+        'ignore_locks',
         'install_latest_eb_release',
         'lib64_fallback_sanity_check',
         'logtostdout',
diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py
@@ -255,9 +255,12 @@ def basic_options(self):
             'extended-dry-run-ignore-errors': ("Ignore errors that occur during dry run", None, 'store_true', True),
             'force': ("Force to rebuild software even if it's already installed (i.e. if it can be found as module), "
                       "and skipping check for OS dependencies", None, 'store_true', False, 'f'),
+            'ignore-locks': ("Ignore locks that prevent two identical installations running in parallel",
+                             None, 'store_true', False),
             'job': ("Submit the build as a job", None, 'store_true', False),
             'logtostdout': ("Redirect main log to stdout", None, 'store_true', False, 'l'),
-            'lockpath': ("Specifies which path should be used to store lock files", None, 'store_or_None', None),
+            'locks-dir': ("Directory to store lock files (should be on a shared filesystem); "
+                "None implies .locks subdirectory of software installation directory", None, 'store_or_None', None),
             'missing-modules': ("Print list of missing modules for dependencies of specified easyconfigs",
                                 None, 'store_true', False, 'M'),
             'only-blocks': ("Only build listed blocks", 'strlist', 'extend', None, 'b', {'metavar': 'BLOCKS'}),
@@ -435,8 +438,8 @@ def override_options(self):
                                      None, 'store_true', False),
             'verify-easyconfig-filenames': ("Verify whether filename of specified easyconfigs matches with contents",
                                             None, 'store_true', False),
-            'wait-on-lock': ("Wait until lock file is removed when a lock if found",
-                             None, 'store_true', False),
+            'wait-on-lock': ("Wait interval (in seconds) to use when waiting for existing lock to be removed "
+                "(0: implies no waiting, but exiting with an error)", int, 'store', 0),
             'zip-logs': ("Zip logs that are copied to install directory, using specified command",
                          None, 'store_or_None', 'gzip'),
 
diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py
@@ -34,10 +34,12 @@
 import os
 import re
 import shutil
+import signal
 import stat
 import sys
 import tempfile
 from distutils.version import LooseVersion
+from functools import wraps
 from test.framework.utilities import EnhancedTestCase, TestLoaderFiltered
 from test.framework.package import mock_fpm
 from unittest import TextTestRunner
@@ -1441,7 +1443,7 @@ def test_module_only(self):
         os.remove(toy_core_mod)
 
         # test installing (only) additional module in Lua syntax (if Lmod is available)
-        lmod_abspath = which('lmod')
+        lmod_abspath = os.environ.get('LMOD_CMD') or which('lmod')
         if lmod_abspath is not None:
             args = common_args[:-1] + [
                 '--allow-modules-tool-mismatch',
@@ -2057,7 +2059,7 @@ def test_toy_modaltsoftname(self):
         self.assertTrue(os.path.exists(os.path.join(modules_path, 'yot', yot_name)))
 
         # only subdirectories for software should be created
-        self.assertEqual(sorted(os.listdir(software_path)), sorted(['.locks', 'toy']))
+        self.assertEqual(sorted(os.listdir(software_path)), ['.locks', 'toy'])  # os.listdir order is arbitrary
         self.assertEqual(sorted(os.listdir(os.path.join(software_path, 'toy'))), ['0.0-one', '0.0-two'])
 
         # only subdirectories for modules with alternative names should be created
@@ -2516,6 +2518,88 @@ def test_toy_ghost_installdir(self):
 
         self.assertFalse(os.path.exists(toy_installdir))
 
+    def test_toy_build_lock(self):
+        """Test toy installation with a lock in place (abort, --locks-dir, --ignore-locks, --wait-on-lock)."""
+
+        locks_dir = os.path.join(self.test_installpath, 'software', '.locks')
+        toy_installdir = os.path.join(self.test_installpath, 'software', 'toy', '0.0')
+        toy_lock_fn = toy_installdir.replace(os.path.sep, '_') + '.lock'
+        # put lock (a directory) in place in the default locks dir (.locks subdirectory of software install path)
+        toy_lock_path = os.path.join(locks_dir, toy_lock_fn)
+        mkdir(toy_lock_path, parents=True)
+        # lock exists and --wait-on-lock is not used, so the installation must abort with an error
+        error_pattern = "Lock .*_software_toy_0.0.lock already exists, aborting!"
+        self.assertErrorRegex(EasyBuildError, error_pattern, self.test_toy_build, raise_error=True, verbose=False)
+
+        locks_dir = os.path.join(self.test_prefix, 'locks')
+
+        # no lock in place, so installation proceeds as normal
+        extra_args = ['--locks-dir=%s' % locks_dir]
+        self.test_toy_build(extra_args=extra_args, verify=True, raise_error=True)
+
+        # put lock in place in custom locks dir, try again
+        toy_lock_path = os.path.join(locks_dir, toy_lock_fn)
+        mkdir(toy_lock_path, parents=True)
+        self.assertErrorRegex(EasyBuildError, error_pattern, self.test_toy_build,
+                              extra_args=extra_args, raise_error=True, verbose=False)
+
+        # also test use of --ignore-locks
+        self.test_toy_build(extra_args=extra_args + ['--ignore-locks'], verify=True, raise_error=True)
+
+        # define a context manager that removes a lock after a while, so we can check the use of --wait-on-lock
+        class remove_lock_after:
+            def __init__(self, seconds, lock_fp):
+                self.seconds = seconds
+                self.lock_fp = lock_fp
+
+            def remove_lock(self, *args):
+                remove_dir(self.lock_fp)
+
+            def __enter__(self):
+                signal.signal(signal.SIGALRM, self.remove_lock)
+                signal.alarm(self.seconds)
+
+            def __exit__(self, type, value, traceback):
+                signal.alarm(0)  # cancel the alarm if it is still pending, so it can't fire after this block
+
+        # wait for lock to be removed, with 1 second interval of checking
+        extra_args.append('--wait-on-lock=1')
+
+        wait_regex = re.compile("^== lock .*_software_toy_0.0.lock exists, waiting 1 seconds", re.M)
+        ok_regex = re.compile("^== COMPLETED: Installation ended successfully", re.M)
+
+        self.assertTrue(os.path.exists(toy_lock_path))
+
+        # use context manager to remove lock after 3 seconds
+        with remove_lock_after(3, toy_lock_path):
+            self.mock_stderr(True)
+            self.mock_stdout(True)
+            self.test_toy_build(extra_args=extra_args, verify=False, raise_error=True, testing=False)
+            stderr, stdout = self.get_stderr(), self.get_stdout()
+            self.mock_stderr(False)
+            self.mock_stdout(False)
+
+            self.assertEqual(stderr, '')
+
+            wait_matches = wait_regex.findall(stdout)
+            # we can't rely on an exact number of 'waiting' messages, so let's go with a range...
+            self.assertTrue(len(wait_matches) in range(2, 5))
+
+            self.assertTrue(ok_regex.search(stdout), "Pattern '%s' not found in: %s" % (ok_regex.pattern, stdout))
+
+        # when there is no lock in place, --wait-on-lock has no impact
+        self.assertFalse(os.path.exists(toy_lock_path))
+        self.mock_stderr(True)
+        self.mock_stdout(True)
+        self.test_toy_build(extra_args=extra_args, verify=False, raise_error=True, testing=False)
+        stderr, stdout = self.get_stderr(), self.get_stdout()
+        self.mock_stderr(False)
+        self.mock_stdout(False)
+
+        self.assertEqual(stderr, '')
+        self.assertTrue(ok_regex.search(stdout), "Pattern '%s' not found in: %s" % (ok_regex.pattern, stdout))
+        self.assertFalse(wait_regex.search(stdout), "Pattern '%s' found in: %s" % (wait_regex.pattern, stdout))
+
 
 def suite():
     """ return all the tests in this file """