diff --git a/deploy_tee/README.md b/deploy_tee/README.md index d6d3b28..db3704d 100644 --- a/deploy_tee/README.md +++ b/deploy_tee/README.md @@ -150,25 +150,28 @@ uv run seismic-tee-bootstrap genesis --node /tmp/dev-bootstrap-node-1.json /tmp/ After step 3 each node is up; after step 4 they produce blocks. RPC is at `https://<fqdn>/rpc` (the `fqdn` from each descriptor). -## The TOML config (`--config`) and manifest (`--manifest`) +## The TOML config (`--config`), manifest (`--manifest`), email (`--email`) -The per-node TOML carries `[domain]` + `[enclave]`. The network-wide -manifest is a separate file passed via `--manifest`; `configure` merges it -into the POST as `[network].manifest_base64`, so the manifest is never -copy-pasted into each node.toml. The combined schema is owned by `tdx-init` -(`#[serde(deny_unknown_fields)]`): +The per-node TOML carries **`[enclave]` only**. `configure` assembles the +full POST by appending two tables it owns, so neither is copy-pasted into +each node.toml (and can't drift): +- `[domain]` — `name` from the descriptor's `fqdn` (the cert domain), `email` + from `--email` (default `ops@seismic.systems`). +- `[network].manifest_base64` — from `--manifest`. -```toml -[domain] -name = "az-1.seismicdev.net" # FQDN clients reach this VM at -email = "ops@seismic.systems" # Let's Encrypt registration +`configure` rejects a node.toml that carries `[domain]` or `[network]`. The +combined schema is owned by `tdx-init` (`#[serde(deny_unknown_fields)]`): +```toml +# node.toml — [enclave] only [enclave] # optional; defaults applied if absent genesis_node = false # exactly one VM per network sets true peers = ["http://az-1.seismicdev.net:7878"] # required when genesis_node = false -# [network].manifest_base64 is added by `configure` from --manifest — do NOT -# put it in the node.toml. tdx-init requires it and 400s a POST without it. 
+# configure appends, before POSTing: +# [domain] name = <descriptor fqdn>, email = <--email> +# [network] manifest_base64 = <--manifest> +# tdx-init requires both and 400s a POST missing them. ``` Authoritative reference: diff --git a/deploy_tee/configure.py b/deploy_tee/configure.py index b330938..a78f039 100644 --- a/deploy_tee/configure.py +++ b/deploy_tee/configure.py @@ -17,6 +17,7 @@ """ import argparse +import json import logging import tempfile import time @@ -41,28 +42,34 @@ TDX_INIT_RETRY_INTERVAL_SECONDS = 5 -def build_merged_config(config_path: Path, manifest_path: Path) -> Path: - """Merge the per-node config with the network manifest into a throwaway - temp config for POSTing, without mutating either source file. - - The manifest is a network-wide artifact shared by every node; the - node.toml is per-node (domain + enclave). Keeping them separate on disk - and merging only here avoids copy-pasting the manifest into each node's - config (and the drift that invites). enclave-server requires the manifest - at startup, so tdx-init now rejects a config without [network] — this is - where that section gets added. The merge is a text append, so the - operator's node.toml comments survive. +def build_merged_config( + config_path: Path, manifest_path: Path, fqdn: str, email: str +) -> Path: + """Assemble the config POSTed to tdx-init, mutating no source. The fields + come from four inputs: the operator's per-node node.toml (`[enclave]`), + the descriptor's fqdn (→ `[domain].name`, the cert domain), CLI flags + (`--email` → `[domain].email`; more node fields may become flags later), + and the network manifest (`--manifest` → `[network]`). + + node.toml carries `[enclave]` only; `[domain]` and `[network]` are appended + here as fresh tables, a plain text append that preserves node.toml comments + and never reopens an existing table. 
That is also why node.toml must NOT + carry `[domain]`/`[network]` itself — a second copy of the cert domain or + network identity is exactly the drift that breaks cert issuance / forks the + network. """ config_text = config_path.read_text(encoding="utf-8") try: parsed = tomllib.loads(config_text) except tomllib.TOMLDecodeError as e: raise SystemExit(f"--config {config_path}: could not parse TOML: {e}") from None - if "network" in parsed: - raise SystemExit( - f"--config {config_path} already has a [network] section; the " - "manifest must come from --manifest. Remove it from the node.toml." - ) + for owned in ("domain", "network"): + if owned in parsed: + raise SystemExit( + f"--config {config_path} must not contain [{owned}] — configure " + "supplies it ([domain] from the descriptor fqdn + --email, " + f"[network] from --manifest). Remove [{owned}] from the node.toml." + ) manifest_bytes = manifest_path.read_bytes() try: @@ -71,9 +78,15 @@ def build_merged_config(config_path: Path, manifest_path: Path) -> Path: except manifest_mod.ManifestSchemaError as e: raise SystemExit(f"--manifest {manifest_path}: invalid manifest: {e}") from None + # json.dumps emits valid TOML basic strings for these simple ASCII values. + domain_section = ( + f"[domain]\nname = {json.dumps(fqdn)}\nemail = {json.dumps(email)}\n" + ) merged = ( config_text.rstrip("\n") + "\n\n" + + domain_section + + "\n" + manifest_mod.render_network_section(manifest_bytes) ) with tempfile.NamedTemporaryFile( @@ -159,9 +172,10 @@ def parse_args() -> argparse.Namespace: type=Path, required=True, help=( - "Path to the per-node TOML ([domain] + [enclave]). Merged with " - "--manifest at POST time; must not itself carry a [network] " - "section. See SeismicSystems/enclave crates/tdx-init for the schema." + "Path to the per-node TOML ([enclave] only: genesis_node, peers). " + "configure supplies [domain] (from the descriptor fqdn + --email) " + "and [network] (from --manifest), so the file must not carry them. 
" + "See SeismicSystems/enclave crates/tdx-init for the schema." ), ) parser.add_argument( @@ -175,6 +189,15 @@ def parse_args() -> argparse.Namespace: "shared across every node, so it lives outside the node.toml." ), ) + parser.add_argument( + "--email", + default="ops@seismic.systems", + help=( + "Contact email for the node's Let's Encrypt registration (certbot); " + "goes into [domain].email of the POSTed config. Same across a " + "cohort. Default: ops@seismic.systems." + ), + ) parser.add_argument( "--measurements", type=Path, @@ -233,14 +256,18 @@ def main() -> None: "attested-tls). Re-run without --measurements to POST config only." ) - # Merge node config + manifest before contacting the node, so bad local - # input fails fast (and a manifest-less POST can't crash-loop the node). - merged_config = build_merged_config(args.config, args.manifest) - logger.info(f"Merged {args.config} + {args.manifest} -> {merged_config}") - descriptor = load_descriptor(args.node) public_ip = require(descriptor, "public_ip", args.node) - fqdn = descriptor.get("fqdn", public_ip) + # fqdn is the cert domain (→ [domain].name) and must resolve to this node, + # so it's required (no public_ip fallback): a wrong/absent name fails certbot + # at boot. configure injects it from the descriptor rather than the operator + # retyping it in node.toml, so the cert domain and the DNS record can't drift. + fqdn = require(descriptor, "fqdn", args.node) + + # Assemble the POST config (node.toml [enclave] + injected [domain]/[network]) + # before contacting the node, so bad local input fails fast. 
+ merged_config = build_merged_config(args.config, args.manifest, fqdn, args.email) + logger.info(f"Built config for {fqdn} -> {merged_config}") logger.info(f"Configuring node {fqdn} ({public_ip})...") post_config_to_tdx_init(public_ip, merged_config) diff --git a/deploy_tee/pulumi/seismic_node/node.bootstrap.toml b/deploy_tee/pulumi/seismic_node/node.bootstrap.toml index 0240be3..0908551 100644 --- a/deploy_tee/pulumi/seismic_node/node.bootstrap.toml +++ b/deploy_tee/pulumi/seismic_node/node.bootstrap.toml @@ -1,6 +1,6 @@ -[domain] -name = "tee-dev-az-1.seismicdev.net" -email = "ops@seismic.systems" +# Per-node config POSTed via `seismic-tee configure`, which supplies the rest: +# [domain] (name from the descriptor fqdn + --email) and [network] (from +# --manifest). This file carries [enclave] only. [enclave] # Only use this for the first boot of the first node in a brand-new network. diff --git a/deploy_tee/pulumi/seismic_node/node.toml b/deploy_tee/pulumi/seismic_node/node.toml index 9082898..139aff5 100644 --- a/deploy_tee/pulumi/seismic_node/node.toml +++ b/deploy_tee/pulumi/seismic_node/node.toml @@ -1,6 +1,6 @@ -[domain] -name = "tee-dev-az-1.seismicdev.net" -email = "ops@seismic.systems" +# Per-node config POSTed via `seismic-tee configure`, which supplies the rest: +# [domain] (name from the descriptor fqdn + --email) and [network] (from +# --manifest). This file carries [enclave] only. [enclave] # Normal steady-state config. After the initial bootstrap, nodes should fetch diff --git a/deploy_tee/tests/test_configure.py b/deploy_tee/tests/test_configure.py index 549fe52..10384d7 100644 --- a/deploy_tee/tests/test_configure.py +++ b/deploy_tee/tests/test_configure.py @@ -16,15 +16,14 @@ # duplicate the schema here; build_merged_config validates it before merging. from deploy_tee.tests.test_manifest import FIXTURE_MANIFEST +# node.toml carries [enclave] only; configure supplies [domain] and [network]. 
NODE_TOML = """\ -[domain] -name = "node1.example.com" -email = "ops@example.com" - [enclave] genesis_node = true peers = [] """ +FQDN = "node1.example.com" +EMAIL = "ops@example.com" def _write(suffix: str, data) -> Path: @@ -49,25 +48,32 @@ def _node(self, text: str) -> Path: self._tmp.append(p) return p - def test_merges_manifest_into_network_section(self): - out = build_merged_config(self._node(NODE_TOML), self.manifest) + def test_injects_domain_and_network_keeps_enclave(self): + out = build_merged_config(self._node(NODE_TOML), self.manifest, FQDN, EMAIL) self._tmp.append(out) merged = tomllib.loads(out.read_text()) - # Manifest landed as [network], and the per-node content is preserved. + # configure supplies [domain] (fqdn + email) and [network]; the + # operator's [enclave] survives untouched. + self.assertEqual(merged["domain"]["name"], FQDN) + self.assertEqual(merged["domain"]["email"], EMAIL) self.assertTrue(merged["network"]["manifest_base64"]) self.assertTrue(merged["enclave"]["genesis_node"]) - self.assertEqual(merged["domain"]["name"], "node1.example.com") - def test_rejects_config_that_already_has_network(self): + def test_rejects_config_with_domain(self): + node = self._node(NODE_TOML + '\n[domain]\nname = "x"\nemail = "y"\n') + with self.assertRaises(SystemExit): + build_merged_config(node, self.manifest, FQDN, EMAIL) + + def test_rejects_config_with_network(self): node = self._node(NODE_TOML + '\n[network]\nmanifest_base64 = "ZWU="\n') with self.assertRaises(SystemExit): - build_merged_config(node, self.manifest) + build_merged_config(node, self.manifest, FQDN, EMAIL) def test_rejects_invalid_manifest(self): bad = _write(".json", "{not json") self._tmp.append(bad) with self.assertRaises(SystemExit): - build_merged_config(self._node(NODE_TOML), bad) + build_merged_config(self._node(NODE_TOML), bad, FQDN, EMAIL) if __name__ == "__main__":