From f551a0a977594539cb2430742869acce61cf2311 Mon Sep 17 00:00:00 2001 From: Samuel Laferriere <9342524+samlaf@users.noreply.github.com> Date: Tue, 30 Jun 2026 22:59:43 +0800 Subject: [PATCH] feat(deploy_tee)!: merge network manifest into config at POST time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `seismic-tee configure` gains a required --manifest. It merges the network-wide manifest into the per-node TOML (as [network].manifest_base64) and POSTs the result, replacing the old flow where the operator pre-embedded [network] into each node.toml. The manifest is one shared artifact across the cohort, so keeping it out of per-node tomls avoids copy-paste and drift. Pairs with tdx-init now requiring [network] (a manifest-less POST 400s). build_merged_config rejects a node.toml that already carries [network] (single source of truth) and validates the manifest schema before POSTing, so bad bytes fail locally instead of at the node. node.toml stays manifest-free ([domain] + [enclave]); the merge writes a throwaway temp config. Drop the now-obsolete `manifest embed` subcommand + embed_into_config: it mutated node.toml to carry [network], which configure now rejects — a subcommand whose output the pipeline refuses. configure merges instead, reusing render_network_section / validate_manifest_schema. Docs: update README command sequences + schema section and the seismic_node README (also fix its backwards "tdx-init waits for luks-setup" note); fix stale SeismicSystems/tdx-init references to the enclave crate (tdx-init now lives at enclave/crates/tdx-init). Add test_configure.py (merge / reject-pre-existing-[network] / reject-invalid-manifest); trim the embed test from test_manifest.py. 
--- deploy_tee/README.md | 37 +++++---- deploy_tee/bootstrap_cli.py | 2 +- deploy_tee/configure.py | 99 +++++++++++++++++++----- deploy_tee/manifest.py | 36 +-------- deploy_tee/pulumi/seismic_node/README.md | 24 +++--- deploy_tee/tests/test_configure.py | 74 ++++++++++++++++++ deploy_tee/tests/test_manifest.py | 21 +---- 7 files changed, 196 insertions(+), 97 deletions(-) create mode 100644 deploy_tee/tests/test_configure.py diff --git a/deploy_tee/README.md b/deploy_tee/README.md index b24fe367..d6d3b281 100644 --- a/deploy_tee/README.md +++ b/deploy_tee/README.md @@ -67,7 +67,7 @@ only consumes one to reach an already-running node. | `seismic-tee` | `configure` | any operator, on first boot | POST the node TOML to tdx-init. Reads the node address from `--node `. (Attestation verification is disabled pending attested-tls — see below.) | | `seismic-tee-bootstrap` | `up` / `down` | Seismic, internal | Provision / tear down a cohort of TDX nodes, one independent Pulumi stack each (drives Pulumi via the Automation API). | | `seismic-tee-bootstrap` | `genesis` | Seismic, once per network | Gather the cohort's summit pubkeys, build `genesis.toml`, POST it to each node. | -| `seismic-tee-bootstrap` | `manifest` | Seismic | Assemble / validate / embed the network manifest. | +| `seismic-tee-bootstrap` | `manifest` | Seismic | Assemble / validate the network manifest. | Only `seismic-tee` is cloud-agnostic and **never wraps Pulumi**; `seismic-tee-bootstrap` is the one allowed to. A **node descriptor** is a @@ -110,11 +110,12 @@ cd pulumi/seismic_node pulumi up --stack my-node && pulumi stack output --json --stack my-node > /tmp/node.json cd - -uv run seismic-tee configure --node /tmp/node.json --config ./node.toml +uv run seismic-tee configure --node /tmp/node.json --config ./node.toml --manifest ./network-manifest.json ``` -You never run `seismic-tee-bootstrap` — that's Seismic-internal network -creation. 
+The `--manifest` is the network identity doc you're given for the network +you're joining; it pins the `network_id`. You never run +`seismic-tee-bootstrap` — that's Seismic-internal network creation. ### Creating a new network (genesis ceremony) @@ -125,6 +126,8 @@ creation. # 1. Per node, write a TOML config (see "The TOML config" below). # Node 1: genesis_node = true, peers = [] # Nodes 2..N: genesis_node = false, peers = [] +# Assemble the one shared network manifest (all nodes use the same bytes): +# uv run seismic-tee-bootstrap manifest assemble ... -o network-manifest.json # 2. Provision the cohort — one independent Pulumi stack per node — and # capture each node's descriptor. Shared settings inherit from the dev @@ -135,9 +138,9 @@ creation. uv run seismic-tee-bootstrap up --count 2 --out-dir /tmp # → /tmp/dev-bootstrap-node-1.json, /tmp/dev-bootstrap-node-2.json -# 3. Configure each node: POST its TOML. -uv run seismic-tee configure --node /tmp/dev-bootstrap-node-1.json --config ./node-1.toml -uv run seismic-tee configure --node /tmp/dev-bootstrap-node-2.json --config ./node-2.toml +# 3. Configure each node: merge its TOML with the shared manifest and POST. +uv run seismic-tee configure --node /tmp/dev-bootstrap-node-1.json --config ./node-1.toml --manifest ./network-manifest.json +uv run seismic-tee configure --node /tmp/dev-bootstrap-node-2.json --config ./node-2.toml --manifest ./network-manifest.json # 4. Run the genesis ceremony once: builds genesis.toml from the cohort # and fans it out to every summit. @@ -147,10 +150,13 @@ uv run seismic-tee-bootstrap genesis --node /tmp/dev-bootstrap-node-1.json /tmp/ After step 3 each node is up; after step 4 they produce blocks. RPC is at `https://<fqdn>/rpc` (the `fqdn` from each descriptor). -## The TOML config (`--config`) +## The TOML config (`--config`) and manifest (`--manifest`) -The operator-supplied TOML is the bootstrap input for tdx-init. 
-Schema is owned by `tdx-init` (`#[serde(deny_unknown_fields)]`): +The per-node TOML carries `[domain]` + `[enclave]`. The network-wide +manifest is a separate file passed via `--manifest`; `configure` merges it +into the POST as `[network].manifest_base64`, so the manifest is never +copy-pasted into each node.toml. The combined schema is owned by `tdx-init` +(`#[serde(deny_unknown_fields)]`): ```toml [domain] @@ -160,12 +166,15 @@ email = "ops@seismic.systems" # Let's Encrypt registration [enclave] # optional; defaults applied if absent genesis_node = false # exactly one VM per network sets true peers = ["http://az-1.seismicdev.net:7878"] # required when genesis_node = false + +# [network].manifest_base64 is added by `configure` from --manifest — do NOT +# put it in the node.toml. tdx-init requires it and 400s a POST without it. ``` -Authoritative reference (4 fields total): -[tdx-init/src/config.rs](https://github.com/SeismicSystems/tdx-init/blob/main/src/config.rs). -tdx-init validates server-side; malformed payloads come back as -4xx and the deploy script raises. +Authoritative reference: +[enclave/crates/tdx-init/src/config.rs](https://github.com/SeismicSystems/enclave/blob/seismic/crates/tdx-init/src/config.rs). +tdx-init validates server-side; malformed or manifest-less payloads come +back as 4xx and the deploy script raises. 
`configure` POSTs this body verbatim to the VM's `http://<public_ip>:8080/`, retrying for 180s while the listener comes diff --git a/deploy_tee/bootstrap_cli.py b/deploy_tee/bootstrap_cli.py index 5bb28022..5f67d8a4 100644 --- a/deploy_tee/bootstrap_cli.py +++ b/deploy_tee/bootstrap_cli.py @@ -52,7 +52,7 @@ def genesis(argv: tuple[str, ...]) -> None: @app.command(name="manifest", context_settings=PASSTHROUGH, add_help_option=False) @click.argument("argv", nargs=-1, type=click.UNPROCESSED) def manifest(argv: tuple[str, ...]) -> None: - """Assemble / validate / embed a network manifest.""" + """Assemble / validate a network manifest.""" from deploy_tee import manifest as manifest_mod forward(manifest_mod.main, "seismic-tee-bootstrap manifest", argv) diff --git a/deploy_tee/configure.py b/deploy_tee/configure.py index 2a1f5d5b..9cff165a 100644 --- a/deploy_tee/configure.py +++ b/deploy_tee/configure.py @@ -1,32 +1,31 @@ #!/usr/bin/env python3 """VM configuration (single-operator, per-node). -POST the operator-supplied TOML to a provisioned node's tdx-init HTTP -receiver, then (optionally) verify TDX attestation. Both the node's -address and the expected measurements are *brought by the operator*, not -produced here: +Merge the per-node config (`--config`: domain + enclave) with the +network-wide manifest (`--manifest`) and POST the result to a provisioned +node's tdx-init HTTP receiver. The node address is *brought by the +operator* via a descriptor file (see deploy_tee/descriptor.py), typically +`pulumi stack output --json`. - - the node address comes from a descriptor file (see - deploy_tee/descriptor.py), typically `pulumi stack output --json`; - - the measurements come from a file the operator supplies (published by - seismic-images for the image being attested). 
- -The CLI never provisions infrastructure (Pulumi's job, standalone) and -never computes measurements (no seismic-images checkout, no Lima) — both -are security-relevant inputs that should not depend on implicit local -state. +The node.toml and the manifest stay separate source files; they are +merged only at POST time into a throwaway temp config, so the manifest +(shared across every node) is never copy-pasted into per-node tomls. The +CLI never provisions infrastructure (Pulumi's job, standalone). seismic-tee configure \\ - --node node-1.json --config node-1.toml --measurements node-1.measurements.json + --node node-1.json --config node-1.toml --manifest network-manifest.json """ import argparse import logging +import tempfile import time +import tomllib from pathlib import Path import requests +from deploy_tee import manifest as manifest_mod from deploy_tee.descriptor import load_descriptor, require from deploy_tee.proxy import ProxyClient from deploy_tee.utils.logging_setup import setup_logging @@ -41,6 +40,52 @@ TDX_INIT_RETRY_INTERVAL_SECONDS = 5 +def build_merged_config(config_path: Path, manifest_path: Path) -> Path: + """Merge the per-node config with the network manifest into a throwaway + temp config for POSTing, without mutating either source file. + + The manifest is a network-wide artifact shared by every node; the + node.toml is per-node (domain + enclave). Keeping them separate on disk + and merging only here avoids copy-pasting the manifest into each node's + config (and the drift that invites). enclave-server requires the manifest + at startup, so tdx-init now rejects a config without [network] — this is + where that section gets added. The merge is a text append, so the + operator's node.toml comments survive. 
+ """ + config_text = config_path.read_text(encoding="utf-8") + try: + parsed = tomllib.loads(config_text) + except tomllib.TOMLDecodeError as e: + raise SystemExit(f"--config {config_path}: could not parse TOML: {e}") from None + if "network" in parsed: + raise SystemExit( + f"--config {config_path} already has a [network] section; the " + "manifest must come from --manifest. Remove it from the node.toml." + ) + + manifest_bytes = manifest_path.read_bytes() + try: + # Never POST bytes tdx-init would 400 at the far end. + manifest_mod.validate_manifest_schema(manifest_bytes) + except manifest_mod.ManifestSchemaError as e: + raise SystemExit(f"--manifest {manifest_path}: invalid manifest: {e}") from None + + merged = ( + config_text.rstrip("\n") + + "\n\n" + + manifest_mod.render_network_section(manifest_bytes) + ) + with tempfile.NamedTemporaryFile( + "w", + suffix=".toml", + prefix="seismic-node-config-", + delete=False, + encoding="utf-8", + ) as f: + f.write(merged) + return Path(f.name) + + def post_config_to_tdx_init(ip_address: str, config_path: Path) -> None: """Wait for tdx-init's HTTP listener to come up, then POST the operator-supplied TOML config verbatim. tdx-init validates the @@ -113,9 +158,20 @@ def parse_args() -> argparse.Namespace: type=Path, required=True, help=( - "Path to the operator-supplied node TOML ([domain] + " - "[enclave]). POSTed verbatim to tdx-init at port 8080. See " - "SeismicSystems/tdx-init for the schema." + "Path to the per-node TOML ([domain] + [enclave]). Merged with " + "--manifest at POST time; must not itself carry a [network] " + "section. See SeismicSystems/enclave crates/tdx-init for the schema." + ), + ) + parser.add_argument( + "--manifest", + type=Path, + required=True, + metavar="FILE", + help=( + "Network manifest JSON (from `deploy_tee.manifest assemble`). " + "Merged into the POSTed config as [network].manifest_base64; " + "shared across every node, so it lives outside the node.toml." 
), ) parser.add_argument( @@ -147,6 +203,8 @@ def parse_args() -> argparse.Namespace: raise SystemExit(f"--node descriptor not found: {args.node}") if not args.config.is_file(): raise SystemExit(f"--config file not found: {args.config}") + if not args.manifest.is_file(): + raise SystemExit(f"--manifest file not found: {args.manifest}") return args @@ -164,12 +222,17 @@ def main() -> None: "attested-tls). Re-run without --measurements to POST config only." ) + # Merge node config + manifest before contacting the node, so bad local + # input fails fast (and a manifest-less POST can't crash-loop the node). + merged_config = build_merged_config(args.config, args.manifest) + logger.info(f"Merged {args.config} + {args.manifest} -> {merged_config}") + descriptor = load_descriptor(args.node) public_ip = require(descriptor, "public_ip", args.node) fqdn = descriptor.get("fqdn", public_ip) logger.info(f"Configuring node {fqdn} ({public_ip})...") - post_config_to_tdx_init(public_ip, args.config) + post_config_to_tdx_init(public_ip, merged_config) logger.info("Attestation verification is not yet wired (pending attested-tls).") print("\n" + "=" * 80) diff --git a/deploy_tee/manifest.py b/deploy_tee/manifest.py index e485809c..273fd441 100644 --- a/deploy_tee/manifest.py +++ b/deploy_tee/manifest.py @@ -3,7 +3,8 @@ The deploy tool is the manifest's *sole emitter*: `network_id = SHA-256(file bytes)`, so the file must be rendered deterministically once and then travel as opaque bytes through every hop -(deploy artifact -> node.toml embed -> tdx-init -> /run/seismic/conf/). +(deploy artifact -> `configure` merges it into the POST -> tdx-init -> +/run/seismic/conf/). 
The node-side parser lives in enclave/crates/seismic-attestation/src/manifest.rs; the schema here must stay in lockstep with it (the fixture-vector test in tests/test_manifest.py pins @@ -567,17 +568,6 @@ def render_network_section(manifest_bytes: bytes) -> str: return f'[network]\nmanifest_base64 = "{encoded}"\n' -def embed_into_config(config_path: Path, manifest_bytes: bytes) -> None: - """Append the `[network]` section to an existing node.toml.""" - text = config_path.read_text(encoding="utf-8") - if "network" in tomllib.loads(text): - raise GateError( - f"{config_path} already has a [network] section; remove it before embedding" - ) - section = render_network_section(manifest_bytes) - config_path.write_text(text.rstrip("\n") + "\n\n" + section, encoding="utf-8") - - def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( prog="python -m deploy_tee.manifest", description=__doc__ @@ -645,18 +635,6 @@ def add_common(p: argparse.ArgumentParser) -> None: val.add_argument("--manifest", type=Path, required=True) val.add_argument("--measurement-policy", type=Path, required=True) - emb = sub.add_parser( - "embed", - help="render the node.toml [network] section for tdx-init " - "(appends to --config, or prints to stdout)", - ) - emb.add_argument("--manifest", type=Path, required=True) - emb.add_argument( - "--config", - type=Path, - help="node.toml to append the section to; omit to print it", - ) - return parser.parse_args() @@ -689,16 +667,6 @@ def main() -> None: logger.info("wrote %s", args.out / MANIFEST_FILENAME) logger.info("wrote %s", args.out / POLICY_FILENAME) print(f"network_id: {assembled.network_id}") - elif args.command == "embed": - manifest_bytes = args.manifest.read_bytes() - # Never embed bytes that tdx-init would reject at POST time. 
- validate_manifest_schema(manifest_bytes) - if args.config: - embed_into_config(args.config, manifest_bytes) - logger.info("appended [network] section to %s", args.config) - else: - print(render_network_section(manifest_bytes), end="") - logger.info("network_id: %s", compute_network_id(manifest_bytes)) else: manifest_bytes = args.manifest.read_bytes() manifest = validate_manifest_schema(manifest_bytes) diff --git a/deploy_tee/pulumi/seismic_node/README.md b/deploy_tee/pulumi/seismic_node/README.md index 7cc8b40f..e9e339f5 100644 --- a/deploy_tee/pulumi/seismic_node/README.md +++ b/deploy_tee/pulumi/seismic_node/README.md @@ -74,18 +74,22 @@ tdx-init is reachable: ```bash PUBLIC_IP=$(pulumi stack output public_ip) -# tdx-init listens on :8080 once persistent-luks-setup completes -# (~30-60s after VM creation). Should respond to a connection; -# returns 4xx on an empty body but proves the listener is up. +# tdx-init's config listener (:8080) comes up early — right after +# network-online, before the enclave + LUKS-provisioning steps it gates +# (~30-60s after VM creation). Should respond to a connection; returns +# 4xx on an empty body but proves the listener is up. curl -v http://$PUBLIC_IP:8080/ +``` + +Deliver the node config with `seismic-tee configure`, which merges the +per-node TOML with the network manifest (tdx-init requires `[network]` and +400s a POST without it) and POSTs the result: -# (Optional) Hand-deliver a TOML config to fully boot the node — -# this is what the future post-provision config step will automate. -# For the first boot of a brand-new network, use node.bootstrap.toml. -# For normal restarts / steady state, use node.toml. -curl -X POST http://$PUBLIC_IP:8080/ \ - -H "Content-Type: application/toml" \ - --data-binary @node.bootstrap.toml +```bash +# First boot of a brand-new network: node.bootstrap.toml (genesis_node = true). +# Normal restarts / steady state: node.toml. 
+uv run seismic-tee configure --node \ + --config node.bootstrap.toml --manifest network-manifest.json ``` Once the TOML is accepted (200), the rest of the systemd graph diff --git a/deploy_tee/tests/test_configure.py b/deploy_tee/tests/test_configure.py new file mode 100644 index 00000000..549fe528 --- /dev/null +++ b/deploy_tee/tests/test_configure.py @@ -0,0 +1,74 @@ +"""Tests for deploy_tee.configure (stdlib unittest; no test deps in this repo). + +Run with: + uv run python -m unittest discover -s deploy_tee/tests -v +""" + +import tempfile +import tomllib +import unittest +from pathlib import Path + +from deploy_tee.configure import build_merged_config +from deploy_tee.manifest import render_manifest + +# Reuse the canonical valid manifest from the manifest tests rather than +# duplicate the schema here; build_merged_config validates it before merging. +from deploy_tee.tests.test_manifest import FIXTURE_MANIFEST + +NODE_TOML = """\ +[domain] +name = "node1.example.com" +email = "ops@example.com" + +[enclave] +genesis_node = true +peers = [] +""" + + +def _write(suffix: str, data) -> Path: + payload = data if isinstance(data, bytes) else data.encode() + with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f: + f.write(payload) + return Path(f.name) + + +class BuildMergedConfigTests(unittest.TestCase): + def setUp(self): + self._tmp = [] + self.manifest = _write(".json", render_manifest(FIXTURE_MANIFEST)) + self._tmp.append(self.manifest) + + def tearDown(self): + for p in self._tmp: + p.unlink(missing_ok=True) + + def _node(self, text: str) -> Path: + p = _write(".toml", text) + self._tmp.append(p) + return p + + def test_merges_manifest_into_network_section(self): + out = build_merged_config(self._node(NODE_TOML), self.manifest) + self._tmp.append(out) + merged = tomllib.loads(out.read_text()) + # Manifest landed as [network], and the per-node content is preserved. 
+ self.assertTrue(merged["network"]["manifest_base64"]) + self.assertTrue(merged["enclave"]["genesis_node"]) + self.assertEqual(merged["domain"]["name"], "node1.example.com") + + def test_rejects_config_that_already_has_network(self): + node = self._node(NODE_TOML + '\n[network]\nmanifest_base64 = "ZWU="\n') + with self.assertRaises(SystemExit): + build_merged_config(node, self.manifest) + + def test_rejects_invalid_manifest(self): + bad = _write(".json", "{not json") + self._tmp.append(bad) + with self.assertRaises(SystemExit): + build_merged_config(self._node(NODE_TOML), bad) + + +if __name__ == "__main__": + unittest.main() diff --git a/deploy_tee/tests/test_manifest.py b/deploy_tee/tests/test_manifest.py index 060fe677..011bd924 100644 --- a/deploy_tee/tests/test_manifest.py +++ b/deploy_tee/tests/test_manifest.py @@ -18,7 +18,6 @@ ManifestSchemaError, assemble, compute_network_id, - embed_into_config, promote_measurements, render_manifest, render_network_section, @@ -200,7 +199,7 @@ def test_rejects_malformed_records(self): promote_measurements(raw, None) -class EmbedTests(unittest.TestCase): +class NetworkSectionTests(unittest.TestCase): def test_network_section_round_trips_exact_bytes(self): import base64 import tomllib @@ -210,24 +209,6 @@ def test_network_section_round_trips_exact_bytes(self): decoded = base64.standard_b64decode(section["network"]["manifest_base64"]) self.assertEqual(decoded, manifest_bytes) - def test_embed_appends_and_rejects_duplicate(self): - with tempfile.TemporaryDirectory() as tmp: - config = Path(tmp) / "node.toml" - config.write_text( - '[domain]\nname = "n1.example.com"\nemail = "ops@example.com"\n' - ) - manifest_bytes = render_manifest(FIXTURE_MANIFEST) - embed_into_config(config, manifest_bytes) - - import tomllib - - parsed = tomllib.loads(config.read_text()) - self.assertEqual(parsed["domain"]["name"], "n1.example.com") - self.assertIn("network", parsed) - - with self.assertRaisesRegex(GateError, "already has"): - 
embed_into_config(config, manifest_bytes) - class GateTests(unittest.TestCase): """End-to-end assembly + gates over a synthetic artifact set."""