From 9a920e353cc30570d99fabc2cd26cdbcd364d92a Mon Sep 17 00:00:00 2001
From: sunrisepeak <speakshen@163.com>
Date: Wed, 24 Jun 2026 07:00:46 +0800
Subject: [PATCH] ci: split e2e suite into a parallel workflow + per-test
 timing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The linux job ran build → unit → e2e (~18m) → toolchain matrix → integration
all in series (~34m wall-clock). Move the e2e suite to its own workflow
(ci-linux-e2e.yml) that runs in PARALLEL on the same warm cache lineage, so
the per-PR critical path becomes max(build+matrix, build+e2e) instead of their
sum. ci-linux.yml keeps build + unit/integration + toolchain matrix + the
xlings integration build; its toolchain steps no longer depend on e2e's
warm-ups (GCC uses the mcpp.toml-pinned default already in the sandbox; the
musl --target build auto-installs on demand).

Also instrument tests/e2e/run_all.sh: each test now prints its elapsed time
(portable ms timer — bash 5 EPOCHREALTIME, falling back to whole-second date),
and the summary prints a slowest-first table + executed-total, so the long-pole
tests surface for future sharding/optimization instead of hiding behind 'OK'.

NOTE: the new check 'e2e suite (linux x86_64, self-host)' must be added to
main's required-status-checks ruleset to gate merges (admin action).
---
 .github/workflows/ci-linux-e2e.yml | 114 +++++++++++++++++++++++++++++
 .github/workflows/ci-linux.yml     |  47 +++---------
 tests/e2e/run_all.sh               |  42 ++++++++++-
 3 files changed, 166 insertions(+), 37 deletions(-)
 create mode 100644 .github/workflows/ci-linux-e2e.yml

diff --git a/.github/workflows/ci-linux-e2e.yml b/.github/workflows/ci-linux-e2e.yml
new file mode 100644
index 00000000..8c5b33e2
--- /dev/null
+++ b/.github/workflows/ci-linux-e2e.yml
@@ -0,0 +1,114 @@
+name: ci-linux-e2e
+
+# The e2e suite (tests/e2e/run_all.sh, ~18 min), split out of ci-linux.yml so
+# it runs in PARALLEL with the build/unit/toolchain-matrix job instead of being
+# tacked on after it. Both workflows use the same cache keys (mcpp sandbox +
+# xlings + target/), so this job starts from a warm build; the added cost vs.
+# the inline version is a second xlings bootstrap plus one warm `mcpp build`.
+# Net per-PR critical path: "build + … + e2e" -> max(build+matrix, build+e2e).
+#
+# Paired workflows: ci-linux.yml (build + unit + toolchain matrix + integration),
+# ci-macos.yml, ci-windows.yml.
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true  # a newer run of this workflow on the same ref cancels the older one
+
+jobs:
+  e2e:
+    name: e2e suite (linux x86_64, self-host)  # check name; meant to be a required status check, so renaming it affects merge gating
+    runs-on: ubuntu-24.04
+    timeout-minutes: 45  # whole-job cap; the "E2E suite" step below has its own 25 min cap
+    env:
+      MCPP_HOME: /home/runner/.mcpp
+    steps:
+      - uses: actions/checkout@v4
+
+      # Same cache keys as ci-linux.yml so this job lands on a warm
+      # toolchain/sandbox instead of re-installing it. Both workflows read
+      # (and may save) the same keys; actions/cache tolerates concurrent
+      # "already exists" saves.
+      - name: Cache mcpp sandbox
+        uses: actions/cache@v4
+        with:
+          path: ~/.mcpp
+          key: mcpp-sandbox-${{ runner.os }}-ci-${{ hashFiles('mcpp.toml', '.xlings.json') }}
+          restore-keys: |
+            mcpp-sandbox-${{ runner.os }}-ci-
+
+      - name: Cache xlings
+        uses: actions/cache@v4
+        with:
+          path: ~/.xlings
+          key: xlings-${{ runner.os }}-v2-${{ hashFiles('.xlings.json') }}
+          restore-keys: |
+            xlings-${{ runner.os }}-v2-
+
+      - name: Bootstrap mcpp via xlings
+        env:
+          XLINGS_NON_INTERACTIVE: '1'
+          XLINGS_VERSION: '0.4.30'
+        run: |
+          tarball="xlings-${XLINGS_VERSION}-linux-x86_64.tar.gz"
+          curl -fsSL -o "/tmp/${tarball}" \
+            "https://github.com/d2learn/xlings/releases/download/v${XLINGS_VERSION}/${tarball}"
+          tar -xzf "/tmp/${tarball}" -C /tmp
+          "/tmp/xlings-${XLINGS_VERSION}-linux-x86_64/subos/default/bin/xlings" self install
+          export PATH="$HOME/.xlings/subos/default/bin:$PATH"
+          xlings --version
+          xlings install mcpp -y
+          MCPP="$HOME/.xlings/subos/default/bin/mcpp"
+          test -x "$MCPP"
+          "$MCPP" --version
+          echo "MCPP=$MCPP" >> "$GITHUB_ENV"
+          echo "XLINGS_BIN=$HOME/.xlings/subos/default/bin/xlings" >> "$GITHUB_ENV"
+
+      - name: Cache target/ (build artifacts + BMIs)
+        uses: actions/cache@v4
+        with:
+          path: target
+          key: mcpp-target-${{ runner.os }}-${{ hashFiles('src/**', 'tests/**', 'mcpp.toml', 'mcpp.lock') }}
+          restore-keys: |
+            mcpp-target-${{ runner.os }}-
+
+      - name: Configure mirror + Build mcpp from source (self-host)
+        run: |
+          export MCPP_VENDORED_XLINGS="$XLINGS_BIN"
+          "$XLINGS_BIN" config --mirror GLOBAL 2>/dev/null || true
+          "$MCPP" self config --mirror GLOBAL 2>/dev/null || true
+          "$MCPP" build
+
+      - name: E2E suite
+        # Per-test 600s timeout lives in tests/e2e/run_all.sh and identifies
+        # WHICH test hung; this caps the whole suite so a hang fails fast.
+        timeout-minutes: 25
+        run: |
+          # Point the e2e runner at the freshly-built binary (newest `mcpp`
+          # under target/), not the bootstrap one. Tests cd into mktemp -d, so
+          # $MCPP must be absolute or it breaks under the temp cwd.
+          MCPP=$(realpath "$(find target -type f -name mcpp -printf '%T@ %p\n' | sort -rn | head -1 | cut -d' ' -f2)")
+          test -x "$MCPP"
+          export MCPP
+          # Tests that set MCPP_HOME to a fresh tmpdir need an xlings to
+          # bootstrap from; expose the xlings binary from the bootstrap step.
+          export MCPP_VENDORED_XLINGS="$XLINGS_BIN"
+          test -x "$MCPP_VENDORED_XLINGS"
+          # GitHub-hosted runners are outside CN; keep CI toolchain downloads on
+          # the global mirror while mcpp's default remains CN for fresh local
+          # sandboxes. E2E tests with their own MCPP_HOME read this variable.
+          export MCPP_E2E_TOOLCHAIN_MIRROR=GLOBAL
+          "$MCPP" self config --mirror "$MCPP_E2E_TOOLCHAIN_MIRROR"
+          "$MCPP" self config
+          # Pin the global default so test 28 (default-toolchain path) gets a
+          # deterministic GNU answer instead of an auto-install pick.
+          "$MCPP" toolchain default gcc@16.1.0
+          # Warm musl once so fresh-home e2e tests inherit the payload.
+          "$MCPP" toolchain install gcc 15.1.0-musl
+          bash tests/e2e/run_all.sh
diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index 224abd68..f562f9ec 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -5,7 +5,12 @@ name: ci-linux
 # workflow no longer depends on a previous-release tarball — the
 # chicken-and-egg now lives upstream in the xlings index.
 #
-# Paired workflows: ci-macos.yml, ci-windows.yml.
+# This job covers build + unit/integration tests + the toolchain matrix +
+# the xlings integration build. The ~18 min e2e suite is a SEPARATE workflow
+# (ci-linux-e2e.yml) that runs in parallel on the same warm caches, so the
+# per-PR critical path is max(this, e2e) instead of their sum.
+#
+# Paired workflows: ci-linux-e2e.yml, ci-macos.yml, ci-windows.yml.
 
 on:
   push:
@@ -109,40 +114,12 @@ jobs:
           "$MCPP_FRESH" self config --mirror GLOBAL
           "$MCPP_FRESH" test
 
-      - name: E2E suite
-        # Step-level guard: a single hung test (historically 10_env_command.sh
-        # on slow xlings/network) used to eat the full 60-min job budget.
-        # Cap the suite at 25 min so a hang fails fast and we still have room
-        # for the downstream toolchain steps. Per-test 600s timeout lives in
-        # tests/e2e/run_all.sh and identifies WHICH test hung.
-        timeout-minutes: 25
-        run: |
-          # Point the e2e runner at the freshly-built binary, not the
-          # bootstrap one. Tests cd into mktemp -d, so $MCPP must be
-          # absolute or the relative path breaks under the temp cwd.
-          MCPP=$(realpath "$(find target -type f -name mcpp -printf '%T@ %p\n' | sort -rn | head -1 | cut -d' ' -f2)")
-          test -x "$MCPP"
-          export MCPP
-          # Tests that set MCPP_HOME to a fresh tmpdir need an xlings
-          # to bootstrap from; surface the xlings binary installed
-          # above so they don't have to reinstall the sandbox.
-          export MCPP_VENDORED_XLINGS="$XLINGS_BIN"
-          test -x "$MCPP_VENDORED_XLINGS"
-          # GitHub-hosted runners are outside CN; keep CI toolchain downloads on
-          # the global mirror while mcpp's default remains CN for fresh local
-          # sandboxes. E2E tests with their own MCPP_HOME read this variable.
-          export MCPP_E2E_TOOLCHAIN_MIRROR=GLOBAL
-          "$MCPP" self config --mirror "$MCPP_E2E_TOOLCHAIN_MIRROR"
-          "$MCPP" self config
-          # Pin the global default so test 28 (which exercises the
-          # default-toolchain path) gets a deterministic GNU answer
-          # instead of whatever auto-install picks on a fresh sandbox.
-          "$MCPP" toolchain default gcc@16.1.0
-          # Warm musl once in the persistent sandbox. Fresh-home e2e tests
-          # inherit this payload, and the later --target musl job reuses it
-          # instead of downloading a second copy into another home.
-          "$MCPP" toolchain install gcc 15.1.0-musl
-          bash tests/e2e/run_all.sh
+      # NOTE: the e2e suite (tests/e2e/run_all.sh) moved to ci-linux-e2e.yml
+      # so it runs in parallel with this job. The toolchain matrix below no
+      # longer relies on e2e's toolchain warm-ups: the GCC build uses the
+      # mcpp.toml-pinned default (gcc@16.1.0, already in the sandbox from the
+      # self-host build above), and the musl `--target` build auto-installs
+      # gcc@15.1.0-musl on demand (cached across runs).
 
       - name: Save freshly-built mcpp for toolchain tests
         run: |
diff --git a/tests/e2e/run_all.sh b/tests/e2e/run_all.sh
index c3d7d5a9..3facbafa 100755
--- a/tests/e2e/run_all.sh
+++ b/tests/e2e/run_all.sh
@@ -140,11 +140,34 @@ else
     echo "Per-test timeout: <unavailable> (no timeout/gtimeout on PATH)"
 fi
 
+# Wall-clock in milliseconds, portable. bash 5 exposes EPOCHREALTIME as
+# "secs<sep>usecs", where <sep> is the locale's decimal point ("." or ",");
+# older bash (e.g. macOS /bin/bash 3.2) falls back to whole-second `date`.
+# Used to time each test so slow ones surface instead of hiding behind "OK".
+_t_ms() {
+    if [[ -n "${EPOCHREALTIME:-}" ]]; then
+        local er=${EPOCHREALTIME} s us
+        s=${er%[.,]*}; us=${er#*[.,]}   # accept either separator; 10# keeps zero-padded usecs decimal
+        echo $(( 10#$s * 1000 + 10#$us / 1000 ))
+    else
+        echo $(( $(date +%s) * 1000 ))
+    fi
+}
+
+# Human-friendly duration from milliseconds ($1): "<N>ms" below one second, else seconds with two decimals ("1.23s").
+_fmt_ms() {
+    local ms=$1
+    if (( ms < 1000 )); then echo "${ms}ms"; else
+        printf '%d.%02ds' $(( ms / 1000 )) $(( (ms % 1000) / 10 ))   # centiseconds are truncated, not rounded
+    fi
+}
+
 PASS=0
 FAIL=0
 SKIP=0
 FAILED_TESTS=()
 TIMED_OUT_TESTS=()
+TIMINGS=()   # "<ms> <name>" per executed test, for the slowest-first report
 
 for test in "$HERE"/[0-9]*.sh; do
     name="$(basename "$test")"
@@ -156,14 +179,18 @@ for test in "$HERE"/[0-9]*.sh; do
         continue
     fi
     echo "=== $name ==="
+    _start_ms=$(_t_ms)
     if [[ -n "$TIMEOUT_CMD" ]]; then
         MCPP="$MCPP" "$TIMEOUT_CMD" "$E2E_TEST_TIMEOUT" bash "$test"
     else
         MCPP="$MCPP" bash "$test"
     fi
     rc=$?
+    _dur_ms=$(( $(_t_ms) - _start_ms ))
+    TIMINGS+=("$_dur_ms $name")
+    _dur="$(_fmt_ms "$_dur_ms")"
     if [[ $rc -eq 0 ]]; then
-        echo "PASS: $name"
+        echo "PASS: $name (${_dur})"
         ((PASS++))
     elif [[ $rc -eq 124 ]]; then
         # GNU timeout: 124 = killed after deadline (TERM); 137 = SIGKILL after grace.
@@ -172,7 +199,7 @@ for test in "$HERE"/[0-9]*.sh; do
         FAILED_TESTS+=("$name (TIMEOUT)")
         TIMED_OUT_TESTS+=("$name")
     else
-        echo "FAIL: $name (exit $rc)"
+        echo "FAIL: $name (exit $rc, ${_dur})"
         ((FAIL++))
         FAILED_TESTS+=("$name (exit $rc)")
     fi
@@ -180,6 +207,17 @@ done
 
 echo
 echo "==============================================="
+# Timing report: the 15 slowest executed tests, slowest first, so long-pole
+# tests surface for sharding/optimization; the total covers ALL executed tests.
+if [[ ${#TIMINGS[@]} -gt 0 ]]; then
+    total_ms=0
+    for t in "${TIMINGS[@]}"; do total_ms=$(( total_ms + ${t%% *} )); done   # entry is "<ms> <name>"; keep <ms>
+    echo "E2E timing (slowest first; executed total $(_fmt_ms "$total_ms")):"
+    printf '%s\n' "${TIMINGS[@]}" | sort -rn | head -15 | while read -r ms nm; do
+        printf '  %8s  %s\n' "$(_fmt_ms "$ms")" "$nm"
+    done
+    echo "==============================================="
+fi
 echo "E2E Summary: $PASS passed, $FAIL failed, $SKIP skipped"
 if [[ ${#TIMED_OUT_TESTS[@]} -gt 0 ]]; then
     echo "Timed out: ${TIMED_OUT_TESTS[*]}"