From 6d39f8dd20895197367ee05b5ec7b38154748060 Mon Sep 17 00:00:00 2001 From: Martin Kourim Date: Thu, 30 Apr 2026 16:58:21 +0200 Subject: [PATCH] feat(runner): lock workdir to prevent concurrent testruns Add acquire_workdir_lock that grabs a non-blocking flock on ${WORKDIR}.lock at the start of regression.sh and node_upgrade.sh. The lock file lives next to the workdir so wiping the workdir does not drop the lock, and the fd is held by the calling shell so the kernel releases the lock automatically on exit. Pull util-linux into the base devShell so flock is available. --- flake.nix | 2 +- runner/node_upgrade.sh | 3 +++ runner/node_upgrade_pytest.sh | 2 ++ runner/regression.sh | 3 +++ scripts/common.sh | 31 +++++++++++++++++++++++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) diff --git a/flake.nix b/flake.nix index 87362a393..e3b5a970b 100644 --- a/flake.nix +++ b/flake.nix @@ -21,7 +21,7 @@ { devShells = rec { base = pkgs.mkShell { - nativeBuildInputs = with pkgs; [ bash coreutils curl git gnugrep gnumake gnutar jq procps xz ]; + nativeBuildInputs = with pkgs; [ bash coreutils curl git gnugrep gnumake gnutar jq procps util-linux xz ]; }; postgres = pkgs.mkShell { nativeBuildInputs = with pkgs; [ glibcLocales postgresql lsof procps ]; diff --git a/runner/node_upgrade.sh b/runner/node_upgrade.sh index e38b78ec2..721fd03df 100755 --- a/runner/node_upgrade.sh +++ b/runner/node_upgrade.sh @@ -37,6 +37,9 @@ if is_venv_active; then exit 1 fi +# Refuse to start if another testrun is already using this workdir. +acquire_workdir_lock "$WORKDIR" || exit 1 + # shellcheck disable=SC1091 . 
# Acquire an exclusive, non-blocking lock tied to the given workdir to prevent
# concurrent testruns from clobbering each other's workdir.
#
# The lock file is the workdir path with a ".lock" suffix, i.e. a sibling of
# the workdir, so that wiping the workdir does not remove the lock file.
# Trailing slashes on the workdir argument are stripped first; otherwise
# "dir/" would place the lock at "dir/.lock" (inside the workdir) and "dir"
# and "dir/" would use two different lock files.
#
# The file descriptor is opened with `exec` in the calling shell and is left
# open on success, so the lock outlives this function and is dropped when the
# descriptor is closed (normally when the calling shell exits).
# NOTE(review): child processes presumably inherit the open descriptor, so a
# long-lived child may keep the lock held after the runner exits - confirm
# this is the intended behavior.
#
# Usage:     acquire_workdir_lock WORKDIR
# Arguments: $1 - path of the workdir to protect (required, non-empty)
# Outputs:   error messages on stderr
# Returns:   0 when the lock was acquired; 1 when flock is missing, the lock
#            file cannot be opened, or another process holds the lock
acquire_workdir_lock() {
  local workdir="${1:?acquire_workdir_lock requires a workdir argument}"
  local lockfile
  local lockfd

  # Normalize "dir/", "dir//", ... to "dir". The pattern requires at least one
  # character before the trailing slash, so a bare "/" is left untouched.
  while [[ "$workdir" == ?*/ ]]; do
    workdir="${workdir%/}"
  done
  lockfile="${workdir}.lock"

  if ! command -v flock >/dev/null 2>&1; then
    echo "Error: 'flock' is required for testrun locking but was not found." >&2
    return 1
  fi

  # Open lock file in the current shell so the lock outlives this function.
  # Bash picks a free descriptor number and stores it in lockfd.
  if ! exec {lockfd}>"$lockfile"; then
    echo "Error: failed to open lock file '$lockfile' for writing." >&2
    return 1
  fi

  if ! flock -n "$lockfd"; then
    echo "Error: another testrun appears to be in progress." >&2
    echo "Lock '$lockfile' is held by another process; refusing to start a new testrun." >&2
    echo "If no testrun is running, simply retry (the lock is released automatically when the holding process exits)." >&2
    # Do not leak the descriptor when the lock could not be taken.
    exec {lockfd}>&-
    return 1
  fi
}