From 40ccad6d04628611de731a4268071cc48aefa6fe Mon Sep 17 00:00:00 2001 From: milx Date: Fri, 20 Feb 2026 02:47:10 +0330 Subject: [PATCH 1/3] feat(security): add Vault-managed mTLS certificate lifecycle for compute-agent --- README.md | 27 +- cmd/agent/main.go | 15 + go.mod | 15 + go.sum | 73 +++-- internal/certmanager/vault.go | 467 +++++++++++++++++++++++++++ internal/config/config.go | 64 +++- internal/config/vault_config_test.go | 54 ++++ internal/grpc/server.go | 79 ++++- 8 files changed, 759 insertions(+), 35 deletions(-) create mode 100644 internal/certmanager/vault.go create mode 100644 internal/config/vault_config_test.go diff --git a/README.md b/README.md index a878b9c..fbe8ad3 100644 --- a/README.md +++ b/README.md @@ -101,9 +101,30 @@ The agent is configured via environment variables: | Variable | Default | Description | |----------|---------|-------------| | `PERSYS_TLS_ENABLED` | `true` | Enable mTLS authentication | -| `PERSYS_TLS_CERT` | `/etc/persys/certs/agent.crt` | Server certificate path | -| `PERSYS_TLS_KEY` | `/etc/persys/certs/agent.key` | Server private key path | -| `PERSYS_TLS_CA` | `/etc/persys/certs/ca.crt` | CA certificate path | +| `PERSYS_TLS_CERT` | `/etc/persys/certs/agent/compute-agent.pem` | Server certificate path | +| `PERSYS_TLS_KEY` | `/etc/persys/certs/agent/compute-agent-key.pem` | Server private key path | +| `PERSYS_TLS_CA` | `/etc/persys/certs/agent/ca.pem` | CA certificate path | + +### Vault Certificate Manager Configuration + +When enabled, the agent retrieves and rotates certificates from Vault. +Rotation occurs at 80% of certificate lifetime, and if Vault is unavailable +the agent falls back to manual certificates on disk. 
+ +| Variable | Default | Description | +|----------|---------|-------------| +| `PERSYS_VAULT_ENABLED` | `true` | Enable Vault-backed certificate manager | +| `PERSYS_VAULT_ADDR` | `http://localhost:8200` | Vault API address | +| `PERSYS_VAULT_AUTH_METHOD` | `token` | Auth mode: `token` or `approle` | +| `PERSYS_VAULT_TOKEN` | `` | Vault token for `token` auth | +| `PERSYS_VAULT_APPROLE_ROLE_ID` | `` | AppRole role_id for `approle` auth | +| `PERSYS_VAULT_APPROLE_SECRET_ID` | `` | AppRole secret_id for `approle` auth | +| `PERSYS_VAULT_PKI_MOUNT` | `pki` | Vault PKI mount path | +| `PERSYS_VAULT_PKI_ROLE` | `compute-agent` | Vault PKI role name | +| `PERSYS_VAULT_CERT_TTL` | `24h` | Requested certificate TTL | +| `PERSYS_VAULT_RETRY_INTERVAL` | `1m` | Retry interval when Vault is unavailable | +| `PERSYS_VAULT_SERVICE_NAME` | `compute-agent` | Primary service name added to DNS SANs | +| `PERSYS_VAULT_SERVICE_DOMAIN` | `` | Optional domain appended to service/host SANs | ### State Store Configuration diff --git a/cmd/agent/main.go b/cmd/agent/main.go index c6ff426..a982196 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -8,6 +8,7 @@ import ( "os/signal" "syscall" + "github.com/persys/compute-agent/internal/certmanager" "github.com/persys/compute-agent/internal/config" "github.com/persys/compute-agent/internal/control" "github.com/persys/compute-agent/internal/garbage" @@ -52,6 +53,17 @@ func main() { logger.Infof("Starting Persys Compute Agent v%s", version) logger.Infof("Node ID: %s", cfg.NodeID) + // Initialize certificate manager before starting TLS endpoints. 
+ var certManagerCancel context.CancelFunc + if cfg.TLSEnabled { + certMgr := certmanager.New(cfg, logger) + certCtx, cancel := context.WithCancel(context.Background()) + certManagerCancel = cancel + if err := certMgr.Start(certCtx); err != nil { + logger.Fatalf("Failed to initialize certificate manager: %v", err) + } + } + // Initialize state store logger.Info("Initializing state store...") store, err := state.NewBoltStore(cfg.StateStorePath) @@ -241,6 +253,9 @@ func main() { if cancelControl != nil { cancelControl() } + if certManagerCancel != nil { + certManagerCancel() + } grpcServer.Stop() diff --git a/go.mod b/go.mod index 136d9ab..c345231 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/digitalocean/go-libvirt v0.0.0-20221205150000-2939327a8519 github.com/docker/docker v25.0.5+incompatible github.com/docker/go-connections v0.5.0 + github.com/hashicorp/vault/api v1.16.0 github.com/prometheus/client_golang v1.23.2 github.com/shirou/gopsutil/v3 v3.24.5 github.com/sirupsen/logrus v1.9.3 @@ -20,17 +21,30 @@ require ( require ( github.com/Microsoft/go-winio v0.6.1 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/containerd/log v0.1.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/distribution/reference v0.5.0 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-jose/go-jose/v4 v4.1.3 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.3.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-cleanhttp v0.5.2 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect + github.com/hashicorp/go-retryablehttp v0.7.7 // indirect + github.com/hashicorp/go-rootcerts v1.0.2 // indirect + 
github.com/hashicorp/go-secure-stdlib/parseutil v0.1.6 // indirect + github.com/hashicorp/go-secure-stdlib/strutil v0.1.2 // indirect + github.com/hashicorp/go-sockaddr v1.0.2 // indirect + github.com/hashicorp/hcl v1.0.0 // indirect github.com/lufia/plan9stats v0.0.0-20251013123823-9fd1530e3ec3 // indirect + github.com/mitchellh/go-homedir v1.1.0 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/moby/term v0.5.0 // indirect github.com/morikuni/aec v1.0.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect @@ -42,6 +56,7 @@ require ( github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect + github.com/ryanuber/go-glob v1.0.0 // indirect github.com/shoenig/go-m1cpu v0.1.7 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/tklauser/go-sysconf v0.3.16 // indirect diff --git a/go.sum b/go.sum index 814ba23..b75080d 100644 --- a/go.sum +++ b/go.sum @@ -2,10 +2,12 @@ github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOEl github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= +github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= -github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= 
+github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= @@ -23,28 +25,58 @@ github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= +github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs= +github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= 
github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= +github.com/go-test/deep v1.0.2 h1:onZX1rnHT3Wv6cqNgYyFOOlgVKJrksuCMCRvJStbMYw= +github.com/go-test/deep v1.0.2/go.mod h1:wGDj63lr65AM2AQyKZd/NYHGb0R+1RLqB8NKt3aSFNA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= -github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.0 h1:Wqo399gCIufwto+VfwCSvsnfGpF/w5E9CNxSwbpD6No= github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.0/go.mod h1:qmOFXW2epJhM0qSnUUYpldc7gVz2KMQwJ/QYCDIa7XU= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= +github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= +github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k= +github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= +github.com/hashicorp/go-multierror v1.0.0/go.mod 
h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU= +github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk= +github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= +github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= +github.com/hashicorp/go-secure-stdlib/parseutil v0.1.6 h1:om4Al8Oy7kCm/B86rLCLah4Dt5Aa0Fr5rYBG60OzwHQ= +github.com/hashicorp/go-secure-stdlib/parseutil v0.1.6/go.mod h1:QmrqtbKuxxSWTN3ETMPuB+VtEiBJ/A9XhoYGv8E1uD8= +github.com/hashicorp/go-secure-stdlib/strutil v0.1.1/go.mod h1:gKOamz3EwoIoJq7mlMIRBpVTAUn8qPCrEclOKKWhD3U= +github.com/hashicorp/go-secure-stdlib/strutil v0.1.2 h1:kes8mmyCpxJsI7FTwtzRqEy9CdjCtrXrXGuOpxEA7Ts= +github.com/hashicorp/go-secure-stdlib/strutil v0.1.2/go.mod h1:Gou2R9+il93BqX25LAKCLuM+y9U2T4hlwvT1yprcna4= +github.com/hashicorp/go-sockaddr v1.0.2 h1:ztczhD1jLxIRjVejw8gFomI1BQZOe2WoVOu0SyteCQc= +github.com/hashicorp/go-sockaddr v1.0.2/go.mod h1:rB4wwRAUzs07qva3c5SdrY/NEtAUjGlgmH/UkBUC97A= +github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/hashicorp/vault/api v1.16.0 h1:nbEYGJiAPGzT9U4oWgaaB0g+Rj8E59QuHKyA5LhwQN4= +github.com/hashicorp/vault/api v1.16.0/go.mod h1:KhuUhzOD8lDSk29AtzNjgAu2kxRA9jL9NAbkFlqvkBA= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= @@ -55,10 +87,21 @@ github.com/kr/text 
v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= -github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/lufia/plan9stats v0.0.0-20251013123823-9fd1530e3ec3 h1:PwQumkgq4/acIiZhtifTV5OUqqiP82UAl0h87xj/l9k= github.com/lufia/plan9stats v0.0.0-20251013123823-9fd1530e3ec3/go.mod h1:autxFIvghDt3jPTLoqZ9OZ7s9qTGNAWmYCjVFWPX/zg= +github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= +github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/go-wordwrap v1.0.0/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo= +github.com/mitchellh/mapstructure v1.4.1/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/moby/term v0.5.0 
h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= @@ -73,8 +116,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= -github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= @@ -87,15 +129,15 @@ github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzM github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= +github.com/ryanuber/go-glob v1.0.0 h1:iQh3xXAumdQ+4Ufa5b25cRpC5TYKlno6hsv6Cb3pkBk= +github.com/ryanuber/go-glob v1.0.0/go.mod h1:807d1WSdnB0XRJzKNil9Om6lcp/3a0v4qIHxIXzX/Yc= github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI= 
github.com/shirou/gopsutil/v3 v3.24.5/go.mod h1:bsoOS1aStSs9ErQ1WWfxllSeS1K5D+U30r2NfcubMVk= -github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= -github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= github.com/shoenig/go-m1cpu v0.1.7 h1:C76Yd0ObKR82W4vhfjZiCp0HxcSZ8Nqd84v+HZ0qyI0= github.com/shoenig/go-m1cpu v0.1.7/go.mod h1:KkDOw6m3ZJQAPHbrzkZki4hnx+pDRR1Lo+ldA56wD5w= -github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= -github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= github.com/shoenig/test v1.7.0 h1:eWcHtTXa6QLnBvm0jgEabMRN/uJ4DMV3M8xUGgRkZmk= +github.com/shoenig/test v1.7.0/go.mod h1:UxJ6u/x2v/TNs/LoLxBNJRV9DiwBBKYxXSyczsBHFoI= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -109,12 +151,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= -github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= -github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= -github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= -github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/tklauser/numcpus v0.11.0 
h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -171,6 +209,7 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -182,8 +221,6 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= diff --git a/internal/certmanager/vault.go b/internal/certmanager/vault.go new file mode 100644 index 0000000..a72d65f --- /dev/null +++ 
b/internal/certmanager/vault.go @@ -0,0 +1,467 @@ +package certmanager + +import ( + "context" + "crypto/tls" + "crypto/x509" + "errors" + "fmt" + "net" + "net/url" + "os" + "path/filepath" + "strings" + "sync" + "time" + + vault "github.com/hashicorp/vault/api" + "github.com/persys/compute-agent/internal/config" + "github.com/sirupsen/logrus" +) + +const ( + rotationFractionNumerator = 80 + rotationFractionDenominator = 100 + minRotationWait = 30 * time.Second +) + +// Manager handles Vault-backed certificate issuance and rotation. +type Manager struct { + cfg *config.Config + logger *logrus.Entry + + mu sync.RWMutex + current certMeta +} + +type certMeta struct { + notBefore time.Time + notAfter time.Time +} + +func New(cfg *config.Config, logger *logrus.Logger) *Manager { + return &Manager{ + cfg: cfg, + logger: logger.WithField("component", "vault-cert-manager"), + } +} + +// Start ensures initial certs exist and then rotates at 80% lifetime when Vault is enabled. +func (m *Manager) Start(ctx context.Context) error { + if !m.cfg.TLSEnabled { + return nil + } + if !m.cfg.VaultEnabled { + m.logger.Info("Vault cert manager disabled; using manual certificate files") + return nil + } + if existingMeta, ok := m.loadExistingCertMeta(); ok { + m.mu.Lock() + m.current = existingMeta + m.mu.Unlock() + m.logger.WithFields(logrus.Fields{ + "not_before": existingMeta.notBefore.UTC().Format(time.RFC3339), + "not_after": existingMeta.notAfter.UTC().Format(time.RFC3339), + }).Info("Using existing valid certificate from disk") + go m.rotationLoop(ctx) + return nil + } + + cli, err := m.newVaultClient() + if err != nil { + if m.manualCertAvailable() { + m.logger.WithError(err).Warn("Vault unavailable on startup, falling back to manual certificates") + go m.recoveryLoop(ctx) + return nil + } + return fmt.Errorf("vault unavailable and no manual cert fallback found: %w", err) + } + + if err := m.issueAndPersist(ctx, cli); err != nil { + if m.manualCertAvailable() { + 
m.logger.WithError(err).Warn("Vault certificate issuance failed, using manual certificates") + go m.recoveryLoop(ctx) + return nil + } + return fmt.Errorf("vault issuance failed and no manual cert fallback found: %w", err) + } + + go m.rotationLoop(ctx) + return nil +} + +func (m *Manager) rotationLoop(ctx context.Context) { + for { + renewAt := m.nextRenewAt() + wait := time.Until(renewAt) + if wait < minRotationWait { + wait = minRotationWait + } + + m.logger.WithField("next_rotation", renewAt.UTC().Format(time.RFC3339)).Info("Next certificate rotation scheduled") + + select { + case <-ctx.Done(): + return + case <-time.After(wait): + } + + cli, err := m.newVaultClient() + if err != nil { + m.logger.WithError(err).Warn("Vault not reachable during rotation window; retrying later") + continue + } + if err := m.issueAndPersist(ctx, cli); err != nil { + m.logger.WithError(err).Warn("Certificate rotation failed; retrying later") + continue + } + } +} + +func (m *Manager) recoveryLoop(ctx context.Context) { + ticker := time.NewTicker(m.cfg.VaultRetryInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + cli, err := m.newVaultClient() + if err != nil { + m.logger.WithError(err).Debug("Vault still unavailable while running on fallback certs") + continue + } + if err := m.issueAndPersist(ctx, cli); err != nil { + m.logger.WithError(err).Warn("Vault recovered but certificate issuance still failing") + continue + } + + m.logger.Info("Vault certificate provisioning recovered; enabling rotation loop") + go m.rotationLoop(ctx) + return + } +} + +func (m *Manager) newVaultClient() (*vault.Client, error) { + conf := vault.DefaultConfig() + conf.Address = m.cfg.VaultAddr + + client, err := vault.NewClient(conf) + if err != nil { + return nil, err + } + + switch strings.ToLower(m.cfg.VaultAuthMethod) { + case "token": + client.SetToken(m.cfg.VaultToken) + if _, err := client.Auth().Token().LookupSelf(); err != nil { + return 
nil, fmt.Errorf("token auth validation failed: %w", err) + } + case "approle": + secret, err := client.Logical().Write("auth/approle/login", map[string]interface{}{ + "role_id": m.cfg.VaultAppRoleID, + "secret_id": m.cfg.VaultAppSecretID, + }) + if err != nil { + return nil, fmt.Errorf("approle login failed: %w", err) + } + if secret == nil || secret.Auth == nil || secret.Auth.ClientToken == "" { + return nil, errors.New("approle login returned empty client token") + } + client.SetToken(secret.Auth.ClientToken) + default: + return nil, fmt.Errorf("unsupported vault auth method: %s", m.cfg.VaultAuthMethod) + } + + return client, nil +} + +func (m *Manager) issueAndPersist(ctx context.Context, client *vault.Client) error { + dnsSANs, ipSANs := m.detectSANs() + payload := map[string]interface{}{ + "common_name": m.cfg.VaultServiceName, + "ttl": m.cfg.VaultCertTTL.String(), + } + if len(dnsSANs) > 0 { + payload["alt_names"] = strings.Join(dnsSANs, ",") + } + if len(ipSANs) > 0 { + payload["ip_sans"] = strings.Join(ipSANs, ",") + } + + path := fmt.Sprintf("%s/issue/%s", strings.Trim(m.cfg.VaultPKIMount, "/"), m.cfg.VaultPKIRole) + secret, err := client.Logical().WriteWithContext(ctx, path, payload) + if err != nil { + return err + } + if secret == nil || secret.Data == nil { + return errors.New("empty response from vault issue endpoint") + } + + certPEM := asString(secret.Data["certificate"]) + keyPEM := asString(secret.Data["private_key"]) + issuingCA := asString(secret.Data["issuing_ca"]) + caChain := parseCAChain(secret.Data["ca_chain"]) + + if certPEM == "" || keyPEM == "" { + return errors.New("vault response missing certificate or private key") + } + + combinedCA := combineCA(issuingCA, caChain) + if combinedCA == "" { + return errors.New("vault response missing CA chain") + } + + notBefore, notAfter, err := certValidity(certPEM, keyPEM) + if err != nil { + return err + } + + if err := writeAtomic(m.cfg.TLSCertPath, certPEM, 0o644); err != nil { + return err + } + 
if err := writeAtomic(m.cfg.TLSKeyPath, keyPEM, 0o600); err != nil { + return err + } + if err := writeAtomic(m.cfg.TLSCAPath, combinedCA, 0o644); err != nil { + return err + } + + m.mu.Lock() + m.current = certMeta{ + notBefore: notBefore, + notAfter: notAfter, + } + m.mu.Unlock() + + m.logger.WithFields(logrus.Fields{ + "not_before": notBefore.UTC().Format(time.RFC3339), + "not_after": notAfter.UTC().Format(time.RFC3339), + "dns_sans": strings.Join(dnsSANs, ","), + "ip_sans": strings.Join(ipSANs, ","), + }).Info("Issued and installed certificate from Vault") + + return nil +} + +func (m *Manager) detectSANs() ([]string, []string) { + dnsSet := map[string]struct{}{} + ipSet := map[string]struct{}{} + addDNS := func(s string) { + s = strings.TrimSpace(strings.ToLower(s)) + if s != "" { + dnsSet[s] = struct{}{} + } + } + addIP := func(s string) { + s = strings.TrimSpace(s) + if ip := net.ParseIP(s); ip != nil { + ipSet[ip.String()] = struct{}{} + } + } + + service := strings.TrimSpace(m.cfg.VaultServiceName) + if service == "" { + service = "compute-agent" + } + addDNS(service) + addDNS("localhost") + addIP("127.0.0.1") + addIP("::1") + + if host, err := os.Hostname(); err == nil { + addDNS(host) + } + addDNS(m.cfg.NodeID) + + domain := strings.Trim(strings.ToLower(m.cfg.VaultServiceDomain), ".") + if domain != "" { + addDNS(service + "." + domain) + if host, err := os.Hostname(); err == nil { + short := strings.Split(host, ".")[0] + addDNS(short + "." 
+ domain) + } + } + + if grpcHost := strings.TrimSpace(m.cfg.GRPCAddr); grpcHost != "" && grpcHost != "0.0.0.0" { + if ip := net.ParseIP(grpcHost); ip != nil { + addIP(ip.String()) + } else { + addDNS(grpcHost) + } + } + + if u, err := url.Parse(m.cfg.VaultAddr); err == nil { + host := u.Hostname() + if ip := net.ParseIP(host); ip != nil { + addIP(ip.String()) + } + } + + if addrs, err := net.InterfaceAddrs(); err == nil { + for _, addr := range addrs { + ipNet, ok := addr.(*net.IPNet) + if !ok || ipNet.IP == nil || ipNet.IP.IsLoopback() { + continue + } + addIP(ipNet.IP.String()) + } + } + + dnsSANs := make([]string, 0, len(dnsSet)) + for s := range dnsSet { + dnsSANs = append(dnsSANs, s) + } + ipSANs := make([]string, 0, len(ipSet)) + for s := range ipSet { + ipSANs = append(ipSANs, s) + } + return dnsSANs, ipSANs +} + +func (m *Manager) manualCertAvailable() bool { + if _, err := tls.LoadX509KeyPair(m.cfg.TLSCertPath, m.cfg.TLSKeyPath); err != nil { + return false + } + caPEM, err := os.ReadFile(m.cfg.TLSCAPath) + if err != nil { + return false + } + pool := x509.NewCertPool() + return pool.AppendCertsFromPEM(caPEM) +} + +func (m *Manager) loadExistingCertMeta() (certMeta, bool) { + if !m.manualCertAvailable() { + return certMeta{}, false + } + keyPair, err := tls.LoadX509KeyPair(m.cfg.TLSCertPath, m.cfg.TLSKeyPath) + if err != nil || len(keyPair.Certificate) == 0 { + return certMeta{}, false + } + leaf, err := x509.ParseCertificate(keyPair.Certificate[0]) + if err != nil { + return certMeta{}, false + } + now := time.Now() + if now.Before(leaf.NotBefore) || !now.Before(leaf.NotAfter) { + return certMeta{}, false + } + return certMeta{notBefore: leaf.NotBefore, notAfter: leaf.NotAfter}, true +} + +func (m *Manager) nextRenewAt() time.Time { + m.mu.RLock() + meta := m.current + m.mu.RUnlock() + + if meta.notAfter.IsZero() || meta.notBefore.IsZero() || !meta.notAfter.After(meta.notBefore) { + return time.Now().Add(m.cfg.VaultRetryInterval) + } + + lifetime := 
meta.notAfter.Sub(meta.notBefore) + rotationPoint := meta.notBefore.Add(lifetime * rotationFractionNumerator / rotationFractionDenominator) + if rotationPoint.Before(time.Now()) { + return time.Now().Add(minRotationWait) + } + return rotationPoint +} + +func certValidity(certPEM, keyPEM string) (time.Time, time.Time, error) { + keyPair, err := tls.X509KeyPair([]byte(certPEM), []byte(keyPEM)) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("parse issued keypair: %w", err) + } + if len(keyPair.Certificate) == 0 { + return time.Time{}, time.Time{}, errors.New("issued keypair contains no certificate") + } + leaf, err := x509.ParseCertificate(keyPair.Certificate[0]) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("parse issued leaf certificate: %w", err) + } + return leaf.NotBefore, leaf.NotAfter, nil +} + +func combineCA(issuingCA string, chain []string) string { + parts := make([]string, 0, 1+len(chain)) + if trimmed := strings.TrimSpace(issuingCA); trimmed != "" { + parts = append(parts, trimmed) + } + for _, c := range chain { + if trimmed := strings.TrimSpace(c); trimmed != "" { + parts = append(parts, trimmed) + } + } + return strings.Join(parts, "\n") +} + +func parseCAChain(v interface{}) []string { + switch raw := v.(type) { + case []interface{}: + out := make([]string, 0, len(raw)) + for _, item := range raw { + if s := strings.TrimSpace(asString(item)); s != "" { + out = append(out, s) + } + } + return out + case []string: + out := make([]string, 0, len(raw)) + for _, item := range raw { + if s := strings.TrimSpace(item); s != "" { + out = append(out, s) + } + } + return out + default: + s := strings.TrimSpace(asString(v)) + if s == "" { + return nil + } + return []string{s} + } +} + +func asString(v interface{}) string { + switch t := v.(type) { + case string: + return t + case []byte: + return string(t) + case nil: + return "" + default: + return fmt.Sprintf("%v", t) + } +} + +func writeAtomic(path, contents string, mode 
os.FileMode) error { + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0o755); err != nil { + return err + } + + tmp, err := os.CreateTemp(dir, ".tmp-cert-*") + if err != nil { + return err + } + tmpName := tmp.Name() + defer os.Remove(tmpName) + + if _, err := tmp.WriteString(contents); err != nil { + _ = tmp.Close() + return err + } + if err := tmp.Chmod(mode); err != nil { + _ = tmp.Close() + return err + } + if err := tmp.Close(); err != nil { + return err + } + + return os.Rename(tmpName, path) +} diff --git a/internal/config/config.go b/internal/config/config.go index 5fb0ebc..277235f 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -22,6 +22,20 @@ type Config struct { TLSKeyPath string TLSCAPath string + // Vault certificate manager configuration + VaultEnabled bool + VaultAddr string + VaultAuthMethod string + VaultToken string + VaultAppRoleID string + VaultAppSecretID string + VaultPKIMount string + VaultPKIRole string + VaultCertTTL time.Duration + VaultServiceName string + VaultServiceDomain string + VaultRetryInterval time.Duration + // State store configuration StateStorePath string @@ -62,10 +76,22 @@ func Load() (*Config, error) { GRPCPort: getEnvAsInt("PERSYS_GRPC_PORT", 50051), // TLS defaults - TLSEnabled: getEnvAsBool("PERSYS_TLS_ENABLED", true), - TLSCertPath: getEnv("PERSYS_TLS_CERT", "/etc/persys/certs/agent/compute-agent.pem"), - TLSKeyPath: getEnv("PERSYS_TLS_KEY", "/etc/persys/certs/agent/compute-agent-key.pem"), - TLSCAPath: getEnv("PERSYS_TLS_CA", "/etc/persys/certs/agent/ca.pem"), + TLSEnabled: getEnvAsBool("PERSYS_TLS_ENABLED", true), + TLSCertPath: getEnv("PERSYS_TLS_CERT", "/etc/persys/certs/agent/compute-agent.pem"), + TLSKeyPath: getEnv("PERSYS_TLS_KEY", "/etc/persys/certs/agent/compute-agent-key.pem"), + TLSCAPath: getEnv("PERSYS_TLS_CA", "/etc/persys/certs/agent/ca.pem"), + VaultEnabled: getEnvAsBool("PERSYS_VAULT_ENABLED", false), + VaultAddr: getEnv("PERSYS_VAULT_ADDR", "http://127.0.0.1:8200"), +
VaultAuthMethod: strings.ToLower(getEnv("PERSYS_VAULT_AUTH_METHOD", "token")), + VaultToken: getEnv("PERSYS_VAULT_TOKEN", ""), + VaultAppRoleID: getEnv("PERSYS_VAULT_APPROLE_ROLE_ID", ""), + VaultAppSecretID: getEnv("PERSYS_VAULT_APPROLE_SECRET_ID", ""), + VaultPKIMount: getEnv("PERSYS_VAULT_PKI_MOUNT", "pki"), + VaultPKIRole: getEnv("PERSYS_VAULT_PKI_ROLE", "compute-agent"), + VaultCertTTL: getEnvAsDuration("PERSYS_VAULT_CERT_TTL", 24*time.Hour), + VaultServiceName: getEnv("PERSYS_VAULT_SERVICE_NAME", "compute-agent"), + VaultServiceDomain: getEnv("PERSYS_VAULT_SERVICE_DOMAIN", ""), + VaultRetryInterval: getEnvAsDuration("PERSYS_VAULT_RETRY_INTERVAL", time.Minute), // State store defaults StateStorePath: getEnv("PERSYS_STATE_PATH", "/var/lib/persys/state.db"), @@ -123,6 +149,36 @@ func (c *Config) Validate() error { } } + if c.VaultEnabled { + if !c.TLSEnabled { + return fmt.Errorf("vault cert manager requires TLS to be enabled") + } + if c.VaultAddr == "" { + return fmt.Errorf("vault is enabled but PERSYS_VAULT_ADDR is empty") + } + if c.VaultPKIMount == "" || c.VaultPKIRole == "" { + return fmt.Errorf("vault is enabled but PKI mount/role is not configured") + } + switch c.VaultAuthMethod { + case "token": + if c.VaultToken == "" { + return fmt.Errorf("vault token auth selected but PERSYS_VAULT_TOKEN is empty") + } + case "approle": + if c.VaultAppRoleID == "" || c.VaultAppSecretID == "" { + return fmt.Errorf("vault approle auth selected but role_id/secret_id is missing") + } + default: + return fmt.Errorf("unsupported vault auth method %q (expected token|approle)", c.VaultAuthMethod) + } + if c.VaultCertTTL <= 0 { + return fmt.Errorf("vault cert TTL must be positive") + } + if c.VaultRetryInterval <= 0 { + return fmt.Errorf("vault retry interval must be positive") + } + } + if c.StateStorePath == "" { return fmt.Errorf("state store path cannot be empty") } diff --git a/internal/config/vault_config_test.go b/internal/config/vault_config_test.go new file mode
100644 index 0000000..58382dc --- /dev/null +++ b/internal/config/vault_config_test.go @@ -0,0 +1,54 @@ +package config + +import ( + "testing" + "time" +) + +func TestValidate_VaultTokenAuthRequiresToken(t *testing.T) { + cfg := &Config{ + GRPCPort: 50051, + StateStorePath: "/tmp/state.db", + DockerEnabled: true, + TLSEnabled: true, + TLSCertPath: "/tmp/cert.pem", + TLSKeyPath: "/tmp/key.pem", + TLSCAPath: "/tmp/ca.pem", + + VaultEnabled: true, + VaultAddr: "http://127.0.0.1:8200", + VaultAuthMethod: "token", + VaultPKIMount: "pki", + VaultPKIRole: "compute-agent", + VaultCertTTL: time.Hour, + VaultRetryInterval: time.Minute, + } + + if err := cfg.Validate(); err == nil { + t.Fatal("expected vault token auth validation error") + } +} + +func TestValidate_VaultAppRoleAuthRequiresCredentials(t *testing.T) { + cfg := &Config{ + GRPCPort: 50051, + StateStorePath: "/tmp/state.db", + DockerEnabled: true, + TLSEnabled: true, + TLSCertPath: "/tmp/cert.pem", + TLSKeyPath: "/tmp/key.pem", + TLSCAPath: "/tmp/ca.pem", + + VaultEnabled: true, + VaultAddr: "http://127.0.0.1:8200", + VaultAuthMethod: "approle", + VaultPKIMount: "pki", + VaultPKIRole: "compute-agent", + VaultCertTTL: time.Hour, + VaultRetryInterval: time.Minute, + } + + if err := cfg.Validate(); err == nil { + t.Fatal("expected vault approle auth validation error") + } +} diff --git a/internal/grpc/server.go b/internal/grpc/server.go index 9fd8145..c4706c1 100644 --- a/internal/grpc/server.go +++ b/internal/grpc/server.go @@ -10,6 +10,7 @@ import ( "os" "sort" "strings" + "sync" "time" "github.com/persys/compute-agent/internal/config" @@ -106,31 +107,89 @@ func (s *Server) Stop() { // loadTLSConfig loads TLS certificates for mTLS func (s *Server) loadTLSConfig() (*tls.Config, error) { - // Load server certificate and key - cert, err := tls.LoadX509KeyPair(s.config.TLSCertPath, s.config.TLSKeyPath) + provider := &dynamicTLSProvider{ + certPath: s.config.TLSCertPath, + keyPath: s.config.TLSKeyPath, + caPath: 
s.config.TLSCAPath, + } + // Prime initial load so startup fails fast on invalid/missing files. + if _, err := provider.getConfig(); err != nil { + return nil, err + } + + return &tls.Config{ + MinVersion: tls.VersionTLS12, + GetConfigForClient: func(*tls.ClientHelloInfo) (*tls.Config, error) { + return provider.getConfig() + }, + }, nil +} + +type dynamicTLSProvider struct { + certPath string + keyPath string + caPath string + + mu sync.RWMutex + cached *tls.Config + certModTime time.Time + keyModTime time.Time + caModTime time.Time +} + +func (d *dynamicTLSProvider) getConfig() (*tls.Config, error) { + certInfo, err := os.Stat(d.certPath) if err != nil { - return nil, fmt.Errorf("failed to load server certificate: %w", err) + return nil, fmt.Errorf("failed to stat server certificate: %w", err) + } + keyInfo, err := os.Stat(d.keyPath) + if err != nil { + return nil, fmt.Errorf("failed to stat server key: %w", err) } + caInfo, err := os.Stat(d.caPath) + if err != nil { + return nil, fmt.Errorf("failed to stat CA certificate: %w", err) + } + + d.mu.RLock() + cached := d.cached + certUnchanged := d.certModTime.Equal(certInfo.ModTime()) + keyUnchanged := d.keyModTime.Equal(keyInfo.ModTime()) + caUnchanged := d.caModTime.Equal(caInfo.ModTime()) + d.mu.RUnlock() - // Load CA certificate for client verification - caCert, err := os.ReadFile(s.config.TLSCAPath) + if cached != nil && certUnchanged && keyUnchanged && caUnchanged { + return cached, nil + } + + keyPair, err := tls.LoadX509KeyPair(d.certPath, d.keyPath) + if err != nil { + return nil, fmt.Errorf("failed to load server certificate: %w", err) + } + caCert, err := os.ReadFile(d.caPath) if err != nil { return nil, fmt.Errorf("failed to read CA certificate: %w", err) } - caCertPool := x509.NewCertPool() if !caCertPool.AppendCertsFromPEM(caCert) { return nil, fmt.Errorf("failed to parse CA certificate") } - tlsConfig := &tls.Config{ - Certificates: []tls.Certificate{cert}, + updated := &tls.Config{ + MinVersion: 
tls.VersionTLS12, + Certificates: []tls.Certificate{keyPair}, ClientCAs: caCertPool, ClientAuth: tls.RequireAndVerifyClientCert, - MinVersion: tls.VersionTLS12, } - return tlsConfig, nil + d.mu.Lock() + d.cached = updated + d.certModTime = certInfo.ModTime() + d.keyModTime = keyInfo.ModTime() + d.caModTime = caInfo.ModTime() + d.mu.Unlock() + + return updated, nil } // ApplyWorkload handles workload create/update requests From 80da10e835417244d9030bcfd7b0d70cf36bbf31 Mon Sep 17 00:00:00 2001 From: milx Date: Fri, 20 Feb 2026 02:59:24 +0330 Subject: [PATCH 2/3] Fix: Disable Vault for unit tests to pass in CI --- internal/config/config_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 5ab6c48..4f3636c 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -33,6 +33,7 @@ func TestLoad_NodeLabelsFromEnv(t *testing.T) { t.Setenv("PERSYS_COMPOSE_ENABLED", "false") t.Setenv("PERSYS_VM_ENABLED", "false") t.Setenv("PERSYS_TLS_ENABLED", "false") + t.Setenv("PERSYS_VAULT_ENABLED", "false") t.Setenv("PERSYS_NODE_REGION", "us-east-1") t.Setenv("PERSYS_NODE_ENV", "prod") t.Setenv("PERSYS_NODE_LABELS", "team=platform,zone=use1-az2,invalid,noequal=,=novalue") @@ -78,6 +79,7 @@ func TestLoad_SchedulerAddrFromEnv(t *testing.T) { t.Setenv("PERSYS_COMPOSE_ENABLED", "false") t.Setenv("PERSYS_VM_ENABLED", "false") t.Setenv("PERSYS_TLS_ENABLED", "false") + t.Setenv("PERSYS_VAULT_ENABLED", "false") t.Setenv("PERSYS_SCHEDULER_ADDR", "10.0.0.9:8085") t.Setenv("PERSYS_SCHEDULER_INSECURE", "true") From 957fbee2872d3762fbb4f5c227bb87b47f7f579a Mon Sep 17 00:00:00 2001 From: milx Date: Fri, 20 Feb 2026 03:13:50 +0330 Subject: [PATCH 3/3] Fix: Persist rotated cert and key as one consistent snapshot --- internal/certmanager/vault.go | 30 +++++++++++++++++++++++------- internal/grpc/server.go | 9 +++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git 
a/internal/certmanager/vault.go b/internal/certmanager/vault.go index a72d65f..93fa58a 100644 --- a/internal/certmanager/vault.go +++ b/internal/certmanager/vault.go @@ -220,13 +220,7 @@ func (m *Manager) issueAndPersist(ctx context.Context, client *vault.Client) err return err } - if err := writeAtomic(m.cfg.TLSCertPath, certPEM, 0o644); err != nil { - return err - } - if err := writeAtomic(m.cfg.TLSKeyPath, keyPEM, 0o600); err != nil { - return err - } - if err := writeAtomic(m.cfg.TLSCAPath, combinedCA, 0o644); err != nil { + if err := writeCertBundleAtomic(m.cfg.TLSCertPath, certPEM, m.cfg.TLSKeyPath, keyPEM, m.cfg.TLSCAPath, combinedCA); err != nil { return err } @@ -465,3 +459,25 @@ func writeAtomic(path, contents string, mode os.FileMode) error { return os.Rename(tmpName, path) } + +func writeCertBundleAtomic(certPath, certPEM, keyPath, keyPEM, caPath, caPEM string) error { + // Validate material before changing anything on disk. + if _, err := tls.X509KeyPair([]byte(certPEM), []byte(keyPEM)); err != nil { + return fmt.Errorf("invalid cert/key pair: %w", err) + } + pool := x509.NewCertPool() + if !pool.AppendCertsFromPEM([]byte(caPEM)) { + return errors.New("invalid CA PEM") + } + + if err := writeAtomic(keyPath, keyPEM, 0o600); err != nil { + return err + } + if err := writeAtomic(certPath, certPEM, 0o644); err != nil { + return err + } + if err := writeAtomic(caPath, caPEM, 0o644); err != nil { + return err + } + return nil +} diff --git a/internal/grpc/server.go b/internal/grpc/server.go index c4706c1..9da6600 100644 --- a/internal/grpc/server.go +++ b/internal/grpc/server.go @@ -164,14 +164,23 @@ func (d *dynamicTLSProvider) getConfig() (*tls.Config, error) { keyPair, err := tls.LoadX509KeyPair(d.certPath, d.keyPath) if err != nil { + if cached != nil { + return cached, nil + } return nil, fmt.Errorf("failed to load server certificate: %w", err) } caCert, err := os.ReadFile(d.caPath) if err != nil { + if cached != nil { + return cached, nil + } return 
nil, fmt.Errorf("failed to read CA certificate: %w", err) } caCertPool := x509.NewCertPool() if !caCertPool.AppendCertsFromPEM(caCert) { + if cached != nil { + return cached, nil + } return nil, fmt.Errorf("failed to parse CA certificate") }