SeismicSystems · samlaf · Jul 2, 2026 · Jul 2, 2026
diff --git a/deploy_tee/bootstrap_cli.py b/deploy_tee/bootstrap_cli.py
@@ -43,7 +43,7 @@ def down(argv: tuple[str, ...]) -> None:
 @app.command(name="configure", context_settings=PASSTHROUGH, add_help_option=False)
 @click.argument("argv", nargs=-1, type=click.UNPROCESSED)
 def configure(argv: tuple[str, ...]) -> None:
-    """Configure the network's genesis node (mints root_key locally)."""
+    """Configure a cohort in parallel: one genesis + N joiners, one command."""
     from deploy_tee import cohort_configure
 
     forward(cohort_configure.main, "seismic-tee-bootstrap configure", argv)

diff --git a/deploy_tee/cohort_configure.py b/deploy_tee/cohort_configure.py
@@ -1,93 +1,301 @@
-"""`seismic-tee-bootstrap configure` — found a network's genesis node.
-
-Configures the one node that *founds* a network: it mints `root_key` locally
-with OsRng (`genesis_node = true`, no peers) instead of fetching it from a
-peer. Designating the genesis node is a founding act, so it lives on the
-internal bootstrap CLI — not the operator `seismic-tee configure`, which only
-ever joins an existing network.
-
-Reuses the shared config-delivery primitive (`deploy_tee.configure`), so the
-POSTed config, the tdx-init contract, and the LUKS-wipe watch are identical to
-a joining node's — only `genesis_node`/`peers` differ.
-
-Today this configures a single genesis node. The parallel cohort founder —
-assign exactly one genesis + N joiners, POST all in parallel, and watch every
-node's disk wipe at once — is the planned extension of this command (so a
-4-node bootstrap is one invocation, not four terminals).
+"""`seismic-tee-bootstrap configure` — found a network in one command.
+
+Configures a whole cohort at once: the one genesis node (`--genesis`, mints
+`root_key` locally) plus every joining node (`--join`, fetches `root_key` from
+genesis via `getWrappedRootKey`). All nodes are POSTed and their first-boot
+LUKS wipes watched **in parallel**, so an N-node bootstrap is one command
+instead of N terminals. Exactly one node is genesis — assigned here, not left
+to a per-node flag — so a double-genesis network split is unrepresentable.
+
+Founding is an internal act, so this lives on the bootstrap CLI; joining an
+already-live network is the operator `seismic-tee configure`. Both go through
+the same `build_config` / `post_config_to_tdx_init` primitives and
+`status.poll_provisioning`, so each node's POSTed config and wipe-watch are
+identical — only the role (`genesis_node`/`peers`) differs.
 """
 
 import argparse
+import logging
+import shutil
+import sys
+import threading
+import time
+from collections import Counter
+from concurrent.futures import Future, ThreadPoolExecutor
+from dataclasses import dataclass, field
 from pathlib import Path
 
-from deploy_tee.configure import deliver_config
+from deploy_tee import manifest as manifest_mod
+from deploy_tee.configure import (
+    ENCLAVE_PEER_PORT,
+    TDX_INIT_PORT,
+    build_config,
+    post_config_to_tdx_init,
+)
+from deploy_tee.descriptor import load_descriptor, require
+from deploy_tee.status import poll_provisioning
 from deploy_tee.utils.logging_setup import setup_logging
 
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Node:
+    """One cohort member, resolved from its descriptor + assigned role."""
+
+    name: str  # short label (the descriptor filename stem)
+    public_ip: str  # descriptor `public_ip`; where the config is POSTed and polled
+    fqdn: str  # descriptor `fqdn`; passed to build_config and shown as the RPC URL
+    genesis: bool  # True only for the cohort's founding node
+    peers: list[str] = field(default_factory=list)  # enclave URLs; [] for genesis
+
+
+def _load_node(descriptor_path: Path, *, genesis: bool, peers: list[str]) -> Node:
+    descriptor = load_descriptor(descriptor_path)
+    return Node(  # role (genesis/peers) comes from the caller, not the file
+        name=descriptor_path.stem,  # label = filename without its extension
+        public_ip=require(descriptor, "public_ip", descriptor_path),
+        fqdn=require(descriptor, "fqdn", descriptor_path),
+        genesis=genesis,
+        peers=peers,  # stored as passed (not copied)
+    )
+
+
+def build_cohort(genesis_path: Path, join_paths: list[Path]) -> list[Node]:
+    """Resolve descriptors into cohort Nodes: the genesis first (no peers — it
+    mints `root_key`), then each joiner with the genesis enclave endpoint
+    (`http://<genesis public_ip>:<ENCLAVE_PEER_PORT>`) as its only peer. Roles
+    are assigned here, not by a per-node flag, so exactly one node is genesis.
+    Raises SystemExit if a node name (descriptor stem) or public_ip repeats.
+    """
+    genesis = _load_node(genesis_path, genesis=True, peers=[])
+    genesis_peer = f"http://{genesis.public_ip}:{ENCLAVE_PEER_PORT}"
+    joiners = [_load_node(p, genesis=False, peers=[genesis_peer]) for p in join_paths]
+    nodes = [genesis, *joiners]
+
+    # A descriptor passed twice (--genesis reused as --join, or a copy-pasted
+    # --join) would race two conflicting POSTs against one node and silently
+    # collide on the name-keyed dashboard/result dicts — refuse instead.
+    for what, counts in (
+        ("name", Counter(n.name for n in nodes)),
+        ("public_ip", Counter(n.public_ip for n in nodes)),
+    ):
+        dupes = sorted(k for k, c in counts.items() if c > 1)
+        if dupes:
+            raise SystemExit(
+                f"duplicate node {what}(s) in cohort: {', '.join(dupes)} — "
+                "was the same descriptor passed more than once?"
+            )
+    return nodes
+
+
+def _configure_node(
+    node: Node,
+    manifest_path: Path,
+    email: str,
+    no_wait: bool,
+    states: dict[str, str],
+    stop: threading.Event,
+) -> bool:
+    """Build + POST one node's config, then (unless `no_wait`) poll its LUKS
+    wipe, writing the latest status line into `states[node.name]` for the
+    dashboard. Returns whether the node reached a ready state. Never raises —
+    a failure is recorded in `states` (as `ERROR: <type>: <message>`) and
+    reflected in the return, so one bad node doesn't abort the rest of the
+    cohort. `stop` (set on ctrl-C) ends the wipe watch early.
+    """
+    try:
+        states[node.name] = "building config…"
+        config = build_config(
+            manifest_path, node.fqdn, email, genesis_node=node.genesis, peers=node.peers
+        )
+        states[node.name] = f"POSTing config to tdx-init :{TDX_INIT_PORT}…"
+        post_config_to_tdx_init(node.public_ip, config)
+        if no_wait:
+            states[node.name] = "config delivered (not waiting)"
+            return True
+        for update in poll_provisioning(node.public_ip, stop=stop):
+            states[node.name] = update.line
+            if update.done:
+                return update.ok
+        return False  # stopped early, or defensive against a silent generator end
+    except Exception as e:  # noqa: BLE001 — surface per node, keep the cohort going
+        states[node.name] = f"ERROR: {type(e).__name__}: {e}"  # type: only clue left
+        return False
+
+
+class _Dashboard:
+    """Live status for N nodes. TTY: an in-place block, one flattened row per
+    node. Otherwise: a line per node, printed only on change (for CI logs).
+    """
+
+    def __init__(self, nodes: list[Node]) -> None:
+        self.isatty = sys.stdout.isatty()
+        self.order = [n.name for n in nodes]
+        self.labels = {
+            n.name: n.name + (" (genesis)" if n.genesis else "") for n in nodes
+        }
+        self._width = max(len(label) for label in self.labels.values())
+        self._painted = False
+        self._last: dict[str, str] = {}
+
+    def render(self, states: dict[str, str]) -> None:
+        if self.isatty:
+            cols = shutil.get_terminal_size((100, 24)).columns
+            if self._painted:
+                sys.stdout.write(f"\033[{len(self.order)}A")  # cursor up N lines
+            for name in self.order:
+                # Flatten to one row: a multi-line status breaks "cursor up N".
+                status = " ".join(states.get(name, "…").split())
+                text = f"{self.labels[name].rjust(self._width)}  {status}"
+                if len(text) >= cols:
+                    text = text[: cols - 1] + "…"
+                sys.stdout.write(f"\033[2K{text}\n")  # clear line + write
+            sys.stdout.flush()
+            self._painted = True
+        else:
+            for name in self.order:
+                line = states.get(name, "…")
+                if self._last.get(name) != line:
+                    print(f"{self.labels[name]}: {line}", flush=True)
+                    self._last[name] = line
+
+
+def _run_cohort(
+    nodes: list[Node], manifest_path: Path, email: str, no_wait: bool
+) -> dict[str, bool]:
+    """Configure every node concurrently (one worker thread per node — the
+    work is blocking HTTP and N is small), repainting the dashboard about once
+    a second until all workers finish. Returns {node name: ok}; ctrl-C → exit 130.
+    """
+    # The shared primitives log at INFO; that would corrupt the in-place
+    # dashboard, and the per-node status lines convey the same progress. Quiet
+    # them for the dashboard's duration (the process exits after, so no restore).
+    logging.getLogger("deploy_tee").setLevel(logging.WARNING)
+
+    states: dict[str, str] = {n.name: "queued…" for n in nodes}
+    dashboard = _Dashboard(nodes)
+    stop = threading.Event()
+    futures: dict[str, Future[bool]] = {}
+    with ThreadPoolExecutor(max_workers=len(nodes)) as pool:
+        for node in nodes:
+            futures[node.name] = pool.submit(
+                _configure_node, node, manifest_path, email, no_wait, states, stop
+            )
+        try:
+            # Refresh while workers block on POST/poll.
+            while not all(f.done() for f in futures.values()):
+                dashboard.render(states)
+                time.sleep(1)
+        except KeyboardInterrupt:
+            # Must set `stop` before the pool's context exit joins the workers
+            # — otherwise a wipe watch blocks that join for up to 1h+. With it,
+            # workers exit within a poll interval (a worker still inside the
+            # POST's listener wait is bounded at ~3min). The POSTs that landed
+            # keep provisioning server-side either way.
+            stop.set()
+            print("\nStopped watching — configured nodes keep provisioning.")
+            raise SystemExit(130) from None
+        dashboard.render(states)  # final paint of terminal states
+    return {name: f.result() for name, f in futures.items()}
+
+
+def _report(nodes: list[Node], results: dict[str, bool]) -> None:
+    print("\n" + "=" * 80)
+    print("COHORT CONFIGURED")  # printed even when some nodes failed (see ✓/✗)
+    print("=" * 80)
+    for node in nodes:
+        ok = results.get(node.name, False)  # no result recorded counts as failed
+        role = "genesis" if node.genesis else "join"
+        print(f"  {'✓' if ok else '✗'} {node.name} ({role}) — https://{node.fqdn}/rpc")
+    print("=" * 80 + "\n")
+
+    genesis_failed = any(n.genesis and not results.get(n.name, False) for n in nodes)
+    if genesis_failed:
+        print(
+            "Genesis node did not come up — joiners cannot fetch root_key until "
+            "it does; they will keep retrying. Fix genesis first."
+        )
+    failed = [n.name for n in nodes if not results.get(n.name, False)]
+    if failed:
+        raise SystemExit(  # non-zero exit; the message names the failed nodes
+            f"{len(failed)}/{len(nodes)} node(s) failed: {', '.join(failed)}"
+        )
+
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         prog="seismic-tee-bootstrap configure",
-        description="Configure a network's genesis node (mints root_key locally).",
+        description="Configure a network cohort in parallel: one genesis + N joiners.",
     )
     parser.add_argument(
         "--genesis",
         type=Path,
         required=True,
         metavar="DESCRIPTOR",
         help=(
-            "Node descriptor JSON for the genesis node (from `up`, or "
-            "`pulumi stack output --json`). Exactly one node per network is "
-            "genesis; every other node joins via `seismic-tee configure`."
+            "Descriptor for the one genesis node (mints root_key locally). "
+            "Exactly one node per network is genesis; assigning it here (not a "
+            "per-node flag) makes a double-genesis split impossible."
+        ),
+    )
+    parser.add_argument(
+        "--join",
+        type=Path,
+        action="append",
+        default=[],
+        metavar="DESCRIPTOR",
+        help=(
+            "Descriptor for a joining node (fetches root_key from genesis via "
+            "getWrappedRootKey). Repeatable; omit for a genesis-only bring-up."
         ),
     )
     parser.add_argument(
         "--manifest",
         type=Path,
         required=True,
         metavar="FILE",
-        help=(
-            "Network manifest JSON (from `manifest assemble`). Merged into the "
-            "POSTed config as [network].manifest_base64; the same bytes every "
-            "node in the network uses."
-        ),
+        help="Network manifest JSON (from `manifest assemble`); → [network].",
     )
     parser.add_argument(
         "--email",
         default="ops@seismic.systems",
-        help=(
-            "Contact email for the node's certbot registration; goes into "
-            "[domain].email. Default: ops@seismic.systems."
-        ),
+        help="certbot contact email → [domain].email (default: ops@seismic.systems).",
     )
     parser.add_argument(
         "--no-wait",
         action="store_true",
         default=False,
-        help=(
-            "Don't watch first-boot LUKS provisioning after POSTing. Use for "
-            "CI/headless runs where there's no TTY and the wipe can take 1h+."
-        ),
+        help="Don't watch first-boot LUKS provisioning after POSTing.",
     )
     args = parser.parse_args()
-    if not args.genesis.is_file():
-        raise SystemExit(f"--genesis descriptor not found: {args.genesis}")
-    if not args.manifest.is_file():
-        raise SystemExit(f"--manifest file not found: {args.manifest}")
+    for path in [args.genesis, *args.join, args.manifest]:
+        if not path.is_file():
+            raise SystemExit(f"file not found: {path}")
     return args
 
 
 def main() -> None:
     setup_logging()
     args = parse_args()
-    # The founding node: genesis_node=True, no peers (it mints, never fetches).
-    deliver_config(
-        args.genesis,
-        args.manifest,
-        args.email,
-        genesis_node=True,
-        peers=[],
-        no_wait=args.no_wait,
+
+    # Validate the shared manifest once, so a bad one fails fast here rather
+    # than as N identical per-worker errors mid-dashboard.
+    try:
+        manifest_mod.validate_manifest_schema(args.manifest.read_bytes())
+    except manifest_mod.ManifestSchemaError as e:
+        raise SystemExit(f"--manifest {args.manifest}: invalid manifest: {e}") from None
+
+    nodes = build_cohort(args.genesis, args.join)
+    joiners = [n.name for n in nodes if not n.genesis]
+    print(
+        f"Configuring {len(nodes)} node(s): genesis={nodes[0].name}"
+        + (f", joining={joiners}" if joiners else " (genesis-only)")
     )
 
+    results = _run_cohort(nodes, args.manifest, args.email, args.no_wait)
+    _report(nodes, results)
+
 
 if __name__ == "__main__":
     main()
diff --git a/deploy_tee/pulumi/seismic_node/Pulumi.dev.yaml b/deploy_tee/pulumi/seismic_node/Pulumi.dev.yaml
@@ -4,28 +4,28 @@
 # (resource_group / vm_name / dns_record_name → <prefix>-<i>). Copy to a sibling
 # Pulumi.<env>.yaml (e.g. testnet) for non-dev cohorts.
 config:
-  # DCedsv6 TDX SKUs with local NVMe are currently available in westus3,
-  # which is useful for local/dev testing.
+  # TDX SKU availability varies by region: DCesv6 (no local disk, used below)
+  # is available in eastus; the DCedsv6 variants with local NVMe live in westus3.
   # See https://techcommunity.microsoft.com/blog/azureconfidentialcomputingblog/announcing-general-availability-of-azure-intel%C2%AE-tdx-confidential-vms/4495693
-  azure-native:location: westus3
+  azure-native:location: eastus
   # Resources that must already exist and are re-used across stacks.
   seismic-tee-deploy:dns_zone_resource_group: devnet2
   seismic-tee-deploy:dns_zone_name: seismicdev.net
   # ARM ID of the storage account hosting the VHD.
   # Required so Azure can authorize the cross-RG blob read for managed disk Import.
-  # Get via: az storage account show -n seismicimageswus3 -g seismic-images --query id -o tsv
-  seismic-tee-deploy:vhd_storage_account_id: /subscriptions/214887ea-51a7-4ca7-9cec-29b3cf3d311c/resourceGroups/seismic-images/providers/Microsoft.Storage/storageAccounts/seismicimageswus3
-  seismic-tee-deploy:vhd_blob_url: https://seismicimageswus3.blob.core.windows.net/dev/seismic-dev_2026-05-29.2bfa8f.vhd
+  # Get via: az storage account show -n seismicimages -g seismic-images --query id -o tsv
+  seismic-tee-deploy:vhd_storage_account_id: /subscriptions/214887ea-51a7-4ca7-9cec-29b3cf3d311c/resourceGroups/seismic-images/providers/Microsoft.Storage/storageAccounts/seismicimages
+  seismic-tee-deploy:vhd_blob_url: https://seismicimages.blob.core.windows.net/dev/seismic-dev_2026-07-02.7b7342.vhd
   # Stack-specific deploy config.
   seismic-tee-deploy:dns_record_name: tee-dev-az-1 # → tee-dev-az-1.seismicdev.net
   seismic-tee-deploy:resource_group: seismic-tee-dev
   seismic-tee-deploy:vm_name: tee-dev-az-1
   # D = general-purpose, C = confidential, e = Intel TDX, d = local disk,
   # s = premium storage capable.
-  seismic-tee-deploy:vm_size: Standard_DC4eds_v6
+  seismic-tee-deploy:vm_size: Standard_DC4es_v6
   # Keep small for dev deploys: first-boot dm-integrity initialization is slow
   # on large disks. See README "Local testing: data-disk formatting time".
-  seismic-tee-deploy:data_disk_size_gb: "32"
+  seismic-tee-deploy:data_disk_size_gb: "8"
   # Source IP (CIDR) allowed to SSH. Use your public IP to lock down :22.
   seismic-tee-deploy:source_ip_cidr: 0.0.0.0/0
 encryptionsalt: v1:cgAomGIsWCk=:v1:/sXIE2a9HDrpnOBC:cctZJQ9Ww80lgIXuoQaJ+RJ8ewzu2A==