From 9a920e353cc30570d99fabc2cd26cdbcd364d92a Mon Sep 17 00:00:00 2001 From: sunrisepeak Date: Wed, 24 Jun 2026 07:00:46 +0800 Subject: [PATCH] ci: split e2e suite into a parallel workflow + per-test timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The linux job ran build → unit → e2e (~18m) → toolchain matrix → integration all in series (~34m wall-clock). Move the e2e suite to its own workflow (ci-linux-e2e.yml) that runs in PARALLEL on the same warm cache lineage, so the per-PR critical path becomes max(build+matrix, build+e2e) instead of their sum. ci-linux.yml keeps build + unit/integration + toolchain matrix + the xlings integration build; its toolchain steps no longer depend on e2e's warm-ups (GCC uses the mcpp.toml-pinned default already in the sandbox; the musl --target build auto-installs on demand). Also instrument tests/e2e/run_all.sh: each test now prints its elapsed time (portable ms timer — bash 5 EPOCHREALTIME, falling back to whole-second date), and the summary prints a slowest-first table + executed-total, so the long-pole tests surface for future sharding/optimization instead of hiding behind 'OK'. NOTE: the new check 'e2e suite (linux x86_64, self-host)' must be added to main's required-status-checks ruleset to gate merges (admin action). --- .github/workflows/ci-linux-e2e.yml | 114 +++++++++++++++++++++++++++++ .github/workflows/ci-linux.yml | 47 +++--------- tests/e2e/run_all.sh | 42 ++++++++++- 3 files changed, 166 insertions(+), 37 deletions(-) create mode 100644 .github/workflows/ci-linux-e2e.yml diff --git a/.github/workflows/ci-linux-e2e.yml b/.github/workflows/ci-linux-e2e.yml new file mode 100644 index 00000000..8c5b33e2 --- /dev/null +++ b/.github/workflows/ci-linux-e2e.yml @@ -0,0 +1,114 @@ +name: ci-linux-e2e + +# The e2e suite (tests/e2e/run_all.sh, ~18 min) split out of ci-linux.yml so it +# runs in PARALLEL with the build/unit/toolchain-matrix job instead of tacked on +# after it. 
Both workflows share the same cache lineage (mcpp sandbox + xlings + +# target/), so this job restores a warm build and the only added wall-clock vs. +# the inline version is one extra warm `mcpp build`. Net per-PR critical path +# drops from "build + … + e2e" to max(build+matrix, build+e2e). +# +# Paired workflows: ci-linux.yml (build + unit + toolchain matrix + integration), +# ci-macos.yml, ci-windows.yml. + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + e2e: + name: e2e suite (linux x86_64, self-host) + runs-on: ubuntu-24.04 + timeout-minutes: 45 + env: + MCPP_HOME: /home/runner/.mcpp + steps: + - uses: actions/checkout@v4 + + # Same cache lineage as ci-linux.yml so this job lands on a warm + # toolchain/sandbox instead of re-installing it. Both workflows read + # (and may save) the same keys; actions/cache tolerates concurrent + # "already exists" saves. 
+ - name: Cache mcpp sandbox + uses: actions/cache@v4 + with: + path: ~/.mcpp + key: mcpp-sandbox-${{ runner.os }}-ci-${{ hashFiles('mcpp.toml', '.xlings.json') }} + restore-keys: | + mcpp-sandbox-${{ runner.os }}-ci- + + - name: Cache xlings + uses: actions/cache@v4 + with: + path: ~/.xlings + key: xlings-${{ runner.os }}-v2-${{ hashFiles('.xlings.json') }} + restore-keys: | + xlings-${{ runner.os }}-v2- + + - name: Bootstrap mcpp via xlings + env: + XLINGS_NON_INTERACTIVE: '1' + XLINGS_VERSION: '0.4.30' + run: | + tarball="xlings-${XLINGS_VERSION}-linux-x86_64.tar.gz" + curl -fsSL -o "/tmp/${tarball}" \ + "https://github.com/d2learn/xlings/releases/download/v${XLINGS_VERSION}/${tarball}" + tar -xzf "/tmp/${tarball}" -C /tmp + "/tmp/xlings-${XLINGS_VERSION}-linux-x86_64/subos/default/bin/xlings" self install + export PATH="$HOME/.xlings/subos/default/bin:$PATH" + xlings --version + xlings install mcpp -y + MCPP="$HOME/.xlings/subos/default/bin/mcpp" + test -x "$MCPP" + "$MCPP" --version + echo "MCPP=$MCPP" >> "$GITHUB_ENV" + echo "XLINGS_BIN=$HOME/.xlings/subos/default/bin/xlings" >> "$GITHUB_ENV" + + - name: Cache target/ (build artifacts + BMIs) + uses: actions/cache@v4 + with: + path: target + key: mcpp-target-${{ runner.os }}-${{ hashFiles('src/**', 'tests/**', 'mcpp.toml', 'mcpp.lock') }} + restore-keys: | + mcpp-target-${{ runner.os }}- + + - name: Configure mirror + Build mcpp from source (self-host) + run: | + export MCPP_VENDORED_XLINGS="$XLINGS_BIN" + "$XLINGS_BIN" config --mirror GLOBAL 2>/dev/null || true + "$MCPP" self config --mirror GLOBAL 2>/dev/null || true + "$MCPP" build + + - name: E2E suite + # Per-test 600s timeout lives in tests/e2e/run_all.sh and identifies + # WHICH test hung; this caps the whole suite so a hang fails fast. + timeout-minutes: 25 + run: | + # Point the e2e runner at the freshly-built binary, not the + # bootstrap one. Tests cd into mktemp -d, so $MCPP must be + # absolute or the relative path breaks under the temp cwd. 
+ MCPP=$(realpath "$(find target -type f -name mcpp -printf '%T@ %p\n' | sort -rn | head -1 | cut -d' ' -f2)") + test -x "$MCPP" + export MCPP + # Tests that set MCPP_HOME to a fresh tmpdir need an xlings to + # bootstrap from; surface the xlings binary installed above. + export MCPP_VENDORED_XLINGS="$XLINGS_BIN" + test -x "$MCPP_VENDORED_XLINGS" + # GitHub-hosted runners are outside CN; keep CI toolchain downloads on + # the global mirror while mcpp's default remains CN for fresh local + # sandboxes. E2E tests with their own MCPP_HOME read this variable. + export MCPP_E2E_TOOLCHAIN_MIRROR=GLOBAL + "$MCPP" self config --mirror "$MCPP_E2E_TOOLCHAIN_MIRROR" + "$MCPP" self config + # Pin the global default so test 28 (default-toolchain path) gets a + # deterministic GNU answer instead of an auto-install pick. + "$MCPP" toolchain default gcc@16.1.0 + # Warm musl once so fresh-home e2e tests inherit the payload. + "$MCPP" toolchain install gcc 15.1.0-musl + bash tests/e2e/run_all.sh diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 224abd68..f562f9ec 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -5,7 +5,12 @@ name: ci-linux # workflow no longer depends on a previous-release tarball — the # chicken-and-egg now lives upstream in the xlings index. # -# Paired workflows: ci-macos.yml, ci-windows.yml. +# This job covers build + unit/integration tests + the toolchain matrix + +# the xlings integration build. The ~18 min e2e suite is a SEPARATE workflow +# (ci-linux-e2e.yml) that runs in parallel on the same warm caches, so the +# per-PR critical path is max(this, e2e) instead of their sum. +# +# Paired workflows: ci-linux-e2e.yml, ci-macos.yml, ci-windows.yml. 
on: push: @@ -109,40 +114,12 @@ jobs: "$MCPP_FRESH" self config --mirror GLOBAL "$MCPP_FRESH" test - - name: E2E suite - # Step-level guard: a single hung test (historically 10_env_command.sh - # on slow xlings/network) used to eat the full 60-min job budget. - # Cap the suite at 25 min so a hang fails fast and we still have room - # for the downstream toolchain steps. Per-test 600s timeout lives in - # tests/e2e/run_all.sh and identifies WHICH test hung. - timeout-minutes: 25 - run: | - # Point the e2e runner at the freshly-built binary, not the - # bootstrap one. Tests cd into mktemp -d, so $MCPP must be - # absolute or the relative path breaks under the temp cwd. - MCPP=$(realpath "$(find target -type f -name mcpp -printf '%T@ %p\n' | sort -rn | head -1 | cut -d' ' -f2)") - test -x "$MCPP" - export MCPP - # Tests that set MCPP_HOME to a fresh tmpdir need an xlings - # to bootstrap from; surface the xlings binary installed - # above so they don't have to reinstall the sandbox. - export MCPP_VENDORED_XLINGS="$XLINGS_BIN" - test -x "$MCPP_VENDORED_XLINGS" - # GitHub-hosted runners are outside CN; keep CI toolchain downloads on - # the global mirror while mcpp's default remains CN for fresh local - # sandboxes. E2E tests with their own MCPP_HOME read this variable. - export MCPP_E2E_TOOLCHAIN_MIRROR=GLOBAL - "$MCPP" self config --mirror "$MCPP_E2E_TOOLCHAIN_MIRROR" - "$MCPP" self config - # Pin the global default so test 28 (which exercises the - # default-toolchain path) gets a deterministic GNU answer - # instead of whatever auto-install picks on a fresh sandbox. - "$MCPP" toolchain default gcc@16.1.0 - # Warm musl once in the persistent sandbox. Fresh-home e2e tests - # inherit this payload, and the later --target musl job reuses it - # instead of downloading a second copy into another home. 
- "$MCPP" toolchain install gcc 15.1.0-musl - bash tests/e2e/run_all.sh + # NOTE: the e2e suite (tests/e2e/run_all.sh) moved to ci-linux-e2e.yml + # so it runs in parallel with this job. The toolchain matrix below no + # longer relies on e2e's toolchain warm-ups: the GCC build uses the + # mcpp.toml-pinned default (gcc@16.1.0, already in the sandbox from the + # self-host build above), and the musl `--target` build auto-installs + # gcc@15.1.0-musl on demand (cached across runs). - name: Save freshly-built mcpp for toolchain tests run: | diff --git a/tests/e2e/run_all.sh b/tests/e2e/run_all.sh index c3d7d5a9..3facbafa 100755 --- a/tests/e2e/run_all.sh +++ b/tests/e2e/run_all.sh @@ -140,11 +140,34 @@ else echo "Per-test timeout: (no timeout/gtimeout on PATH)" fi +# Wall-clock in milliseconds, portable. bash 5 exposes EPOCHREALTIME +# ("secs.usecs", or "secs,usecs" under a comma-radix locale); older bash (e.g. macOS /bin/bash 3.2) falls back to +# whole-second `date`. Used to time each test so slow ones surface for +# later analysis/optimization instead of hiding behind a bare "OK". +_t_ms() { + if [[ -n "${EPOCHREALTIME:-}" ]]; then + local er=${EPOCHREALTIME} s us + s=${er%[.,]*}; us=${er#*[.,]} + echo $(( 10#$s * 1000 + 10#$us / 1000 )) + else + echo $(( $(date +%s) * 1000 )) + fi +} + +# Human-friendly duration from milliseconds: " " per executed test, for the slowest-first report for test in "$HERE"/[0-9]*.sh; do name="$(basename "$test")" @@ -156,14 +179,18 @@ for test in "$HERE"/[0-9]*.sh; do continue fi echo "=== $name ===" + _start_ms=$(_t_ms) if [[ -n "$TIMEOUT_CMD" ]]; then MCPP="$MCPP" "$TIMEOUT_CMD" "$E2E_TEST_TIMEOUT" bash "$test" else MCPP="$MCPP" bash "$test" fi rc=$? + _dur_ms=$(( $(_t_ms) - _start_ms )) + TIMINGS+=("$_dur_ms $name") + _dur="$(_fmt_ms "$_dur_ms")" if [[ $rc -eq 0 ]]; then - echo "PASS: $name" + echo "PASS: $name (${_dur})" ((PASS++)) elif [[ $rc -eq 124 ]]; then # GNU timeout: 124 = killed after deadline (TERM); 137 = SIGKILL after grace. 
@@ -172,7 +199,7 @@ for test in "$HERE"/[0-9]*.sh; do FAILED_TESTS+=("$name (TIMEOUT)") TIMED_OUT_TESTS+=("$name") else - echo "FAIL: $name (exit $rc)" + echo "FAIL: $name (exit $rc, ${_dur})" ((FAIL++)) FAILED_TESTS+=("$name (exit $rc)") fi @@ -180,6 +207,17 @@ done echo echo "===============================================" +# Timing report (slowest first) — surfaces the long-pole tests so the suite +# can be sharded/optimized. Also prints the executed-test total wall time. +if [[ ${#TIMINGS[@]} -gt 0 ]]; then + total_ms=0 + for t in "${TIMINGS[@]}"; do total_ms=$(( total_ms + ${t%% *} )); done + echo "E2E timing (slowest first; executed total $(_fmt_ms "$total_ms")):" + printf '%s\n' "${TIMINGS[@]}" | sort -rn | head -15 | while read -r ms nm; do + printf ' %8s %s\n' "$(_fmt_ms "$ms")" "$nm" + done + echo "===============================================" +fi echo "E2E Summary: $PASS passed, $FAIL failed, $SKIP skipped" if [[ ${#TIMED_OUT_TESTS[@]} -gt 0 ]]; then echo "Timed out: ${TIMED_OUT_TESTS[*]}"