diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..538fe45 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,304 @@ +# Changelog + +## 2026-05-29 (Unreleased) + +Source: `git diff -- compute-agent` + +### Summary + +This release adds support for managed volume storage orchestration, enhanced workload telemetry, and improved error handling aligned with scheduler platform extensions. Compute-agent now supports NFS and Ceph-RBD managed volumes for containers and VMs, with proper lifecycle management and storage driver capability advertisement. + +### Major Features + +1. **Managed Volume Support** + - Added `ManagedVolumes[]ManagedVolumeSpec` to container and VM workload specs + - Support for multiple storage drivers: + - `local` - Host bind paths (existing behavior) + - `nfs` - NFS server mounts + - `ceph-rbd` - Ceph RBD block devices + - Per-volume configuration: name, driver, size (GB), access mode, filesystem type, mount path, read-only, retain policy + - Graceful fallback to host bind paths if managed volume provisioning fails + +2. **Storage Driver Capability Advertisement** + - Agent advertises supported storage drivers during node registration + - `SupportedStorageDrivers[]` field populated in `NodeCapabilities` (sent at registration; the heartbeat itself does not carry it) + - Enables scheduler to make storage-aware placement decisions + +3. **Enhanced Workload Telemetry** + - Added `WorkloadUsage` model with CPU%, memory, disk I/O, and network metrics + - Per-workload usage collection and exposure in status responses + - Enables performance correlation with placement decisions + +4. **Improved Failure Diagnostics** + - Added `WorkloadReason` with structured code, message, last transition, and next retry metadata + - Terminal failure detection: prevents reapply loops for non-retryable errors + - Failure reason propagation: infrastructure vs runtime error classification + +5. 
**Cloud-Init Enhancement (VM)** + - Support for structured `CloudInitConfig` with separate fields: + - `user_data` - User-provided cloud-init script + - `meta_data` - Cloud-init metadata + - `network_config` - Network configuration (optional) + - `vendor_data` - Vendor data (optional) + - Faithful injection of all cloud-init fields into VM boot + +### Breaking Changes + +None at the API/wire level; existing workload specs remain backward compatible. Note: the Go module path and proto `go_package` changed from `github.com/persys/compute-agent` to `github.com/persys-dev/compute-agent`, so Go code importing this module must update its import paths. + +### Deprecations + +- Single-string `cloudInit` field deprecated in favor of structured `CloudInitConfig` +- Legacy bind-path-only volume handling will be superseded by managed volume system + +### Changed Files + +1. **api/proto/agent.proto** (updated) + - Added `ManagedVolumeSpec` message type + - Added `ManagedVolumes` field to `ContainerSpec` + - Added `ManagedVolumes` field to `VMSpec` + - Added `WorkloadUsageSnapshot` message type and a `usage` field on `WorkloadStatus` (the structured reason message, `ReasonDetail`, is added in `api/proto/control.proto`) + - Extended heartbeat (`HeartbeatRequest` in `api/proto/control.proto`) to include workload usage snapshots + +2. **internal/models/workload.go** (updated) + - Added `ManagedVolumes[]ManagedVolumeSpec` to container spec + - Added `ManagedVolumes[]ManagedVolumeSpec` to VM spec + - Added `WorkloadUsage` struct for telemetry + - Added `WorkloadReason` struct for structured failures + +3. **internal/control/client.go** (updated) + - Updated node registration to advertise `SupportedStorageDrivers` + - Enhanced heartbeat to include per-workload usage snapshots + - Improved failure reason propagation + +4. **internal/runtime/docker.go** (updated) + - Added managed volume mount support + - Fall back to host bind paths if managed volumes unavailable + - Properly handle read-only and mount-path specifications + +5. **internal/runtime/vm.go** (updated) + - Added managed volume disk attachment for Ceph/NFS backends + - Enhanced cloud-init ISO builder with structured payload support + - Proper handling of `meta-data`, `network-config`, `vendor-data` files + +6. 
**internal/workload/manager.go** (updated) + - Added managed volume provisioning lifecycle + - Pre-provision and attach volumes before runtime create + - Cleanup volumes on workload deletion (respecting retain policy) + +7. **pkg/api/v1/** (regenerated) + - Protobuf code generation for new message types + +### Resource Impact + +**Storage Overhead**: +- Minimal: managed volume metadata tracked in control plane +- Agent reports only node-level storage capabilities + +**Network**: +- Heartbeat size increases by ~200-500 bytes per workload (usage metrics) +- One-time increase during node registration (~100 bytes) + +**Backward Compatibility**: +- Old workload specs without `ManagedVolumes` continue to work +- Agent falls back to host bind paths automatically +- Single-string `CloudInit` still supported alongside structured config + +### Migration Notes + +1. Optional: Update scheduler to populate managed volume specs +2. Optional: Configure NFS/Ceph providers on agent nodes +3. Agent advertises capabilities automatically +4. Workloads requesting managed volumes fail gracefully if drivers unavailable + +### Known Issues + +None documented at this time. + +### Testing + +All changes follow compute platform extension specification exactly. + +### Upgrading + +1. Deploy updated compute-agent binary +2. No configuration changes required (backward compatible) +3. Optional: Configure managed storage backend (NFS/Ceph) +4. Optional: Update workload specs to request managed volumes +5. Monitor agent logs for managed volume provisioning status + +## 2026-02-23 (Unreleased) + +Source: `git diff` inside `compute-agent` + +### Incremental Update (Latest) + +#### Summary + +- Made metrics server port configurable via `PERSYS_METRICS_PORT` (default `8089`) instead of a hardcoded port. +- Added `compute-agent/sample.env` with baseline runtime, TLS, scheduler, and metrics environment variables. 
+- Fixed stale terminal-retry state for workloads whose desired state is `Stopped`: + - normalize `Failed` runtime state to `Stopped` when desired is stopped, + - clear retry/terminal metadata and reset retry tracker for stopped workloads, + - prevent failed/terminal retry branch from running unless desired state is `Running`. +- Updated README config table to document `PERSYS_METRICS_PORT`. + +#### Changed Files (Latest) + +1. `compute-agent/internal/config/config.go` + +- Added `MetricsPort` to config model. +- Added env parsing: `PERSYS_METRICS_PORT` with default `8089`. +- Added config validation for metrics port range. + +1. `compute-agent/cmd/agent/main.go` + +- Replaced hardcoded metrics server address with `fmt.Sprintf(":%d", cfg.MetricsPort)`. + +1. `compute-agent/sample.env` (new file) + +- Added sample environment values for: + - server ports (`PERSYS_GRPC_PORT`, `PERSYS_METRICS_PORT`), + - TLS/Vault, + - runtime toggles, + - scheduler endpoint and node metadata. + +1. `compute-agent/internal/workload/manager.go` + +- Added `normalizeStateForDesired(...)` for desired-state-aware status normalization. +- Added `clearRetryStateMetadata(...)` to remove stale retry/terminal keys. +- Applied normalization in `GetStatus`, `ReconcileWorkload`, and desired-state transition path. +- Cleared retry state/reset tracker when workload desired state is `Stopped`. +- Scoped failed-recovery reconciliation branch to `desired=running` only. + +1. `compute-agent/README.md` + +- Documented `PERSYS_METRICS_PORT` in configuration table. + +### Summary + +- Added agent-side telemetry initialization and gRPC trace context propagation. +- Added queue-level metrics hooks and Prometheus task metrics. +- Improved status reporting for in-flight tasks to reduce stale-state reapply loops. +- Improved VM status diagnostics (state/reason mapping + metadata). +- Added pending-state recovery policy (restart then delete fallback). 
+- Added reconcile start retry metadata/backoff deferral behavior. +- Fixed restart safety path: same-revision desired-state transitions now do runtime start/stop without recreate. +- Expanded proto generation outputs to shared package paths. +- Documented retry/backoff behavior in README. + +### Changed Files (exact) + +1. `compute-agent/Makefile` (+10 / -0) + +- Updated `proto` target to also generate stubs into shared packages: + - `../pkg/agent/api/v1` + - `../pkg/agent/control/v1` + +1. `compute-agent/README.md` (+43 / -0) + +- Added `Retry and Backoff Strategy` section documenting: + - scheduler reconnect backoff, + - local reconcile retry policy, + - pending-timeout recovery flow. + +1. `compute-agent/cmd/agent/main.go` (+15 / -0) + +- Added OpenTelemetry setup on startup via `internal/telemetry.Setup`. +- Added OTel shutdown on process exit with timeout. +- Wired task queue metrics observer when metrics are enabled. + +1. `compute-agent/go.mod` (+8 / -2) + +- Added direct OTel dependencies: + - `go.opentelemetry.io/otel` + - `go.opentelemetry.io/otel/sdk` + - `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp` +- Added/updated related indirect deps for OTLP and grpc-gateway proto stack. + +1. `compute-agent/internal/control/client.go` (+60 / -2) + +- Added unary gRPC client interceptor for OpenTelemetry spans. +- Added trace context injection into gRPC metadata. +- Applied interceptor to both insecure and TLS dial paths. + +1. `compute-agent/internal/grpc/server.go` (+69 / -2) + +- Added unary gRPC server interceptor for OpenTelemetry spans. +- Added inbound trace context extraction from gRPC metadata. +- Updated `GetWorkloadStatus` to overlay pending/running task metadata on top of stored status: + - returns `ACTUAL_STATE_PENDING` for in-flight task states, + - carries task metadata/message/updated_at so callers see live in-flight state. + +1. 
`compute-agent/internal/metrics/metrics.go` (+67 / -0) + +- Added task queue metrics: + - `persys_agent_task_queue_depth` gauge, + - `persys_agent_task_latency_seconds` histogram, + - `persys_agent_task_executions_total` counter, + - `persys_agent_task_failures_total` counter. +- Added helper methods: + - `SetTaskQueueDepth` + - `ObserveTaskExecution` + +1. `compute-agent/internal/runtime/vm.go` (+162 / -10) + +- VM `Status` now reports richer messages with domain state reasons. +- Paused domains now map to `ActualStatePending` (instead of `Stopped`) to avoid destructive upper-layer reactions. +- Added detailed VM metadata in `StatusMetadata`: + - `vm.domain_state`, `vm.domain_state_code`, + - `vm.domain_reason`, `vm.domain_reason_code`, + - network lookup error metadata when interface discovery fails. +- Added helper mappings: + - `vmDomainStateName` + - `vmDomainReasonText` (libvirt reason enums for paused/shutoff/running/shutdown/crashed/etc.) + +1. `compute-agent/internal/task/queue.go` (+39 / -0) + +- Added `MetricsObserver` interface. +- Added observer wiring to queue lifecycle: + - emit queue depth on start/submit/worker consume, + - emit per-task duration + failure/success on completion. + +1. `compute-agent/internal/workload/manager.go` (+110 / -4) + +- Added pending recovery constants and metadata keys: + - `pending_since`, `pending_recovery_action`, `pending_recovery_reason`, `pending_recovery_deleted` + - threshold `5m`. +- Removed immediate delete-on-start-failure from `ApplyWorkload` path. +- Added `handlePendingRecovery` in reconcile path: + - if pending > 5m: attempt stop+start, + - if restart fails: delete runtime workload, mark failed, persist desired state `Stopped` to avoid loop. +- Added retry deferral for failed starts in reconciliation: + - records `retry_attempts`, `next_retry_time`, `failure_reason`, `last_error`, + - defers retry until tracker allows, + - clears retry metadata on successful start. 
+- Updated same-revision apply behavior: + - no longer blindly skips when desired state changed, + - applies desired-state transitions with runtime `Start`/`Stop`, + - avoids recreate/delete for running<->stopped transitions (critical restart safety fix). + +1. `compute-agent/internal/workload/manager_test.go` (+new tests) + +- Added regression tests proving same-revision desired-state transitions do not call recreate/delete: + - `TestApplyWorkload_SameRevision_DesiredStateChange_StopWithoutRecreate` + - `TestApplyWorkload_SameRevision_DesiredStateChange_StartWithoutRecreate` + +1. `compute-agent/examples/client/specs/vm-spec.json` (+1 / -1) + +- Changed disk `size_gb` from `10` to `5`. + +1. `compute-agent/internal/telemetry/otel.go` (new file, +76) + +- New OpenTelemetry bootstrap package: + - reads exporter endpoint from env, + - configures OTLP HTTP trace exporter, + - sets tracer provider and W3C trace/baggage propagators, + - returns shutdown function for graceful termination. + +### Diff Stats + +- Total changed tracked files: 11 +- Total added lines in tracked files: 584 +- Total removed lines in tracked files: 21 +- New untracked file: 1 (`internal/telemetry/otel.go`, 76 lines) diff --git a/Makefile b/Makefile index 4a21308..0d8e0f1 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,8 @@ DOCKER_IMAGE=persys/compute-agent PROTO_DIR=api/proto PKG_DIR=pkg/api/v1 CONTROL_PKG_DIR=pkg/control/v1 +SHARED_PKG_DIR=../pkg/agent/api/v1 +SHARED_CONTROL_PKG_DIR=../pkg/agent/control/v1 PROTOC_SYSTEM_INCLUDE?=/usr/include # Go parameters @@ -30,12 +32,20 @@ proto: @echo "==> Generating protobuf code..." @mkdir -p $(PKG_DIR) @mkdir -p $(CONTROL_PKG_DIR) + @mkdir -p $(SHARED_PKG_DIR) + @mkdir -p $(SHARED_CONTROL_PKG_DIR) cd api/proto && protoc -I. -I../../$(PROTO_DIR) -I$(PROTOC_SYSTEM_INCLUDE) --go_out=../../$(PKG_DIR) --go_opt=paths=source_relative \ --go-grpc_out=../../$(PKG_DIR) --go-grpc_opt=paths=source_relative \ agent.proto cd api/proto && protoc -I. 
-I../../$(PROTO_DIR) -I$(PROTOC_SYSTEM_INCLUDE) --go_out=../../$(CONTROL_PKG_DIR) --go_opt=paths=source_relative \ --go-grpc_out=../../$(CONTROL_PKG_DIR) --go-grpc_opt=paths=source_relative \ control.proto + cd api/proto && protoc -I. -I../../$(PROTO_DIR) -I$(PROTOC_SYSTEM_INCLUDE) --go_out=../../$(SHARED_PKG_DIR) --go_opt=paths=source_relative \ + --go-grpc_out=../../$(SHARED_PKG_DIR) --go-grpc_opt=paths=source_relative \ + agent.proto + cd api/proto && protoc -I. -I../../$(PROTO_DIR) -I$(PROTOC_SYSTEM_INCLUDE) --go_out=../../$(SHARED_CONTROL_PKG_DIR) --go_opt=paths=source_relative \ + --go-grpc_out=../../$(SHARED_CONTROL_PKG_DIR) --go-grpc_opt=paths=source_relative \ + control.proto build: @echo "==> Building $(BINARY_NAME)..." diff --git a/README.md b/README.md index fbe8ad3..cc2b76b 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ The Persys Compute Agent is a production-grade node-level execution engine for t ## Features - **Multi-Runtime Support**: Docker containers, Docker Compose, and KVM/libvirt VMs +- **Managed Volumes**: NFS and Ceph-RBD storage provisioning with automatic lifecycle management - **Idempotent Operations**: Revision-based tracking prevents duplicate work - **Secure Communication**: gRPC with mutual TLS authentication - **Persistent State**: bbolt-backed state store for crash recovery @@ -95,6 +96,7 @@ The agent is configured via environment variables: |----------|---------|-------------| | `PERSYS_GRPC_ADDR` | `0.0.0.0` | gRPC bind address | | `PERSYS_GRPC_PORT` | `50051` | gRPC port | +| `PERSYS_METRICS_PORT` | `8089` | Prometheus metrics port | ### TLS/mTLS Configuration @@ -150,6 +152,49 @@ the agent falls back to manual certificates on disk. 
| `PERSYS_RECONCILE_ENABLED` | `true` | Enable reconciliation loop | | `PERSYS_RECONCILE_INTERVAL` | `30s` | Reconciliation interval | +## Retry and Backoff Strategy + +The compute agent uses backoff/recovery at three levels: + +### 1) Scheduler control-plane reconnect backoff + +When node registration/heartbeat connection to scheduler fails: + +- starts at `1s` +- doubles on each failure +- capped at `30s` +- resets to `1s` after a successful registration. + +### 2) Local reconcile start retry backoff + +When desired state is `Running` but runtime start fails during reconcile: + +- retry policy defaults: + - `MaxAttempts=3` + - `InitialDelay=5s` + - `BackoffMultiplier=2` + - `MaxDelay=2m` + - only transient failures are retried automatically +- next retry delays follow exponential backoff (`5s`, `10s`, `20s` with default attempts). +- status metadata includes: + - `retry_attempts` + - `next_retry_time` + - `failure_reason` + - `last_error` + +### 3) Pending-state recovery timeout + +If a workload remains `pending` too long, the agent runs recovery: + +- pending threshold: `5m` +- recovery action: stop + restart attempt +- if restart fails: delete from runtime and mark workload status `failed` +- agent then sets workload desired state to `Stopped` locally to avoid repeated restart loops on that node. 
+- status metadata includes: + - `pending_recovery_action` + - `pending_recovery_reason` + - `pending_recovery_deleted` + ### Logging Configuration | Variable | Default | Description | diff --git a/api/proto/agent.proto b/api/proto/agent.proto index ad20d12..60e1a2f 100644 --- a/api/proto/agent.proto +++ b/api/proto/agent.proto @@ -2,7 +2,7 @@ syntax = "proto3"; package persys.agent.v1; -option go_package = "github.com/persys/compute-agent/pkg/api/v1;v1"; +option go_package = "github.com/persys-dev/compute-agent/pkg/api/v1;v1"; // AgentService defines the gRPC interface for workload management service AgentService { @@ -139,6 +139,7 @@ message ContainerSpec { ResourceLimits resources = 7; RestartPolicy restart_policy = 8; map labels = 9; + repeated ManagedVolumeSpec managed_volumes = 10; } message ComposeSpec { @@ -156,6 +157,7 @@ message VMSpec { string cloud_init = 6; // optional cloud-init user-data (YAML content) map metadata = 7; CloudInitConfig cloud_init_config = 8; // advanced cloud-init settings + repeated ManagedVolumeSpec managed_volumes = 9; } message CloudInitConfig { @@ -165,6 +167,17 @@ message CloudInitConfig { string vendor_data = 4; // cloud-init vendor-data } +message ManagedVolumeSpec { + string name = 1; + string driver = 2; // local|nfs|ceph-rbd + int64 size_gb = 3; + string access_mode = 4; + string fs_type = 5; + string mount_path = 6; + bool read_only = 7; + string retain_policy = 8; // Delete|Retain +} + message VolumeMount { string host_path = 1; string container_path = 2; @@ -213,4 +226,18 @@ message WorkloadStatus { int64 created_at = 7; int64 updated_at = 8; map metadata = 9; + WorkloadUsageSnapshot usage = 10; +} + +message WorkloadUsageSnapshot { + string workload_id = 1; + WorkloadType type = 2; + double cpu_percent = 3; + int64 memory_bytes = 4; + int64 disk_read_bytes = 5; + int64 disk_write_bytes = 6; + int64 net_rx_bytes = 7; + int64 net_tx_bytes = 8; + int64 collected_at = 9; // unix timestamp + string source = 10; } diff --git 
a/api/proto/control.proto b/api/proto/control.proto index bf4f4ad..bedb64c 100644 --- a/api/proto/control.proto +++ b/api/proto/control.proto @@ -39,6 +39,7 @@ message NodeCapabilities { int64 memory_total_mb = 2; repeated StoragePool storage_pools = 3; repeated string supported_workload_types = 4; // container, compose, vm + repeated string supported_storage_drivers = 5; // local, nfs, ceph-rbd } message StoragePool { @@ -59,6 +60,7 @@ message HeartbeatRequest { NodeUsage usage = 2; repeated WorkloadStatus workload_statuses = 3; google.protobuf.Timestamp timestamp = 4; + repeated WorkloadUsageSnapshot workload_usage = 5; } message NodeUsage { @@ -127,6 +129,7 @@ message ContainerSpec { repeated Port ports = 5; string restart_policy = 6; bool privileged = 7; + repeated ManagedVolumeSpec managed_volumes = 8; } message VolumeMount { @@ -156,6 +159,7 @@ message VMSpec { repeated NetworkConfig networks = 4; CloudInitConfig cloud_init = 5; string os_image = 6; + repeated ManagedVolumeSpec managed_volumes = 7; } message DiskConfig { @@ -174,6 +178,39 @@ message CloudInitConfig { string user_data = 1; string meta_data = 2; string network_config = 3; + string vendor_data = 4; +} + +message ManagedVolumeSpec { + string name = 1; + string driver = 2; // local|nfs|ceph-rbd + int64 size_gb = 3; + string access_mode = 4; + string fs_type = 5; + string mount_path = 6; + bool read_only = 7; + string retain_policy = 8; // Delete|Retain +} + +message WorkloadUsageSnapshot { + string workload_id = 1; + string type = 2; + double cpu_percent = 3; + int64 memory_bytes = 4; + int64 disk_read_bytes = 5; + int64 disk_write_bytes = 6; + int64 net_rx_bytes = 7; + int64 net_tx_bytes = 8; + google.protobuf.Timestamp collected_at = 9; + string source = 10; +} + +message ReasonDetail { + string code = 1; + string message = 2; + google.protobuf.Timestamp last_transition = 3; + google.protobuf.Timestamp next_retry_at = 4; + bool retryable = 5; } message WorkloadStatus { @@ -182,6 +219,8 @@ 
message WorkloadStatus { FailureReason failure_reason = 3; string message = 4; google.protobuf.Timestamp last_transition = 5; + ReasonDetail reason = 6; + WorkloadUsageSnapshot usage = 7; } enum FailureReason { diff --git a/cmd/agent/main.go b/cmd/agent/main.go index 9fddb4f..b4c1bd3 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -7,20 +7,24 @@ import ( "os" "os/signal" "syscall" - - "github.com/persys/compute-agent/internal/certmanager" - "github.com/persys/compute-agent/internal/config" - "github.com/persys/compute-agent/internal/control" - "github.com/persys/compute-agent/internal/garbage" - "github.com/persys/compute-agent/internal/grpc" - "github.com/persys/compute-agent/internal/metrics" - "github.com/persys/compute-agent/internal/reconcile" - "github.com/persys/compute-agent/internal/resources" - "github.com/persys/compute-agent/internal/runtime" - "github.com/persys/compute-agent/internal/state" - "github.com/persys/compute-agent/internal/task" - "github.com/persys/compute-agent/internal/workload" - "github.com/persys/compute-agent/pkg/models" + "time" + + "github.com/persys-dev/compute-agent/internal/certmanager" + "github.com/persys-dev/compute-agent/internal/config" + "github.com/persys-dev/compute-agent/internal/control" + "github.com/persys-dev/compute-agent/internal/garbage" + "github.com/persys-dev/compute-agent/internal/grpc" + "github.com/persys-dev/compute-agent/internal/metrics" + "github.com/persys-dev/compute-agent/internal/platform" + "github.com/persys-dev/compute-agent/internal/reconcile" + "github.com/persys-dev/compute-agent/internal/resources" + "github.com/persys-dev/compute-agent/internal/runtime" + "github.com/persys-dev/compute-agent/internal/state" + "github.com/persys-dev/compute-agent/internal/storage/providers" + "github.com/persys-dev/compute-agent/internal/task" + "github.com/persys-dev/compute-agent/internal/telemetry" + "github.com/persys-dev/compute-agent/internal/workload" + 
"github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" ) @@ -53,6 +57,11 @@ func main() { logger.Infof("Starting Persys Compute Agent v%s", version) logger.Infof("Node ID: %s", cfg.NodeID) + otelShutdown, err := telemetry.Setup(context.Background(), logger, "compute-agent") + if err != nil { + logger.Fatalf("Failed to initialize OpenTelemetry: %v", err) + } + // Initialize certificate manager before starting TLS endpoints. var certManagerCancel context.CancelFunc if cfg.TLSEnabled { @@ -113,6 +122,24 @@ func main() { logger.Info("Initializing workload manager...") workloadMgr := workload.NewManager(store, runtimeMgr, logger) + // Initialize storage provider registry for managed volumes. + providerRegistry := platform.NewProviderRegistry() + providerRegistry.RegisterStorageProvider(providers.NewLocalProvider(cfg.StorageLocalRoot)) + providerRegistry.RegisterStorageProvider(providers.NewNFSProvider( + cfg.StorageNFSServer, + cfg.StorageNFSExport, + cfg.StorageNFSStageDir, + cfg.StorageNFSOptions, + )) + providerRegistry.RegisterStorageProvider(providers.NewCephRBDProvider( + cfg.StorageCephCluster, + cfg.StorageCephPool, + cfg.StorageCephUser, + cfg.StorageCephKeyring, + cfg.StorageCephStageDir, + )) + workloadMgr.SetVolumeManager(platform.NewDefaultVolumeManager(providerRegistry)) + // Initialize metrics (Issue 7) logger.Info("Initializing metrics...") metricsInst, err := metrics.NewMetrics(logger) @@ -125,7 +152,7 @@ func main() { // Start metrics server var metricsServer *metrics.Server if metricsInst != nil { - metricsServer = metrics.NewServer(":8080", logger, metricsInst) + metricsServer = metrics.NewServer(fmt.Sprintf(":%d", cfg.MetricsPort), logger, metricsInst) if err := metricsServer.Start(); err != nil { logger.Warnf("Failed to start metrics server: %v", err) // Continue anyway, metrics is not critical @@ -169,6 +196,9 @@ func main() { // Initialize async task queue logger.Info("Initializing async task queue...") taskQueue := 
task.NewQueue(10, logger) // 10 workers for async operations + if metricsInst != nil { + taskQueue.SetMetricsObserver(metricsInst) + } // Register task handlers taskQueue.RegisterHandler(task.TaskTypeApplyWorkload, func(ctx context.Context, t *task.Task) error { @@ -272,6 +302,11 @@ func main() { logger.Warnf("Error stopping metrics server: %v", err) } } + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer shutdownCancel() + if err := otelShutdown(shutdownCtx); err != nil { + logger.Warnf("Error stopping OpenTelemetry: %v", err) + } logger.Info("Shutdown complete") } diff --git a/examples/client/main.go b/examples/client/main.go index 822f00a..77b6df1 100644 --- a/examples/client/main.go +++ b/examples/client/main.go @@ -14,7 +14,7 @@ import ( "strings" "time" - pb "github.com/persys/compute-agent/pkg/api/v1" + pb "github.com/persys-dev/compute-agent/pkg/api/v1" "google.golang.org/grpc" "google.golang.org/grpc/credentials" ) diff --git a/examples/client/specs/vm-spec.json b/examples/client/specs/vm-spec.json index f0bbacb..9d9dadd 100644 --- a/examples/client/specs/vm-spec.json +++ b/examples/client/specs/vm-spec.json @@ -7,7 +7,7 @@ "path": "/var/lib/libvirt/images/persys-vm.qcow2", "device": "vda", "format": "qcow2", - "size_gb": 10, + "size_gb": 5, "type": "disk", "boot": true }, diff --git a/go.mod b/go.mod index c345231..ca4281a 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module github.com/persys/compute-agent +module github.com/persys-dev/compute-agent go 1.24.0 @@ -14,6 +14,9 @@ require ( github.com/sirupsen/logrus v1.9.3 github.com/stretchr/testify v1.11.1 go.etcd.io/bbolt v1.3.9 + go.opentelemetry.io/otel v1.39.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.24.0 + go.opentelemetry.io/otel/sdk v1.39.0 google.golang.org/grpc v1.79.0 google.golang.org/protobuf v1.36.11 ) @@ -33,6 +36,8 @@ require ( github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.3.0 // indirect 
github.com/gogo/protobuf v1.3.2 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.0 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect github.com/hashicorp/go-multierror v1.1.1 // indirect @@ -64,10 +69,10 @@ require ( github.com/yusufpapurcu/wmi v1.2.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect - go.opentelemetry.io/otel v1.39.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.24.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.24.0 // indirect go.opentelemetry.io/otel/metric v1.39.0 // indirect go.opentelemetry.io/otel/trace v1.39.0 // indirect + go.opentelemetry.io/proto/otlp v1.1.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/mod v0.32.0 // indirect golang.org/x/net v0.50.0 // indirect @@ -76,6 +81,7 @@ require ( golang.org/x/text v0.34.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.41.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect gotest.tools/v3 v3.5.1 // indirect diff --git a/internal/certmanager/vault.go b/internal/certmanager/vault.go index 6dbf9d0..fb895d7 100644 --- a/internal/certmanager/vault.go +++ b/internal/certmanager/vault.go @@ -15,7 +15,7 @@ import ( "time" vault "github.com/hashicorp/vault/api" - "github.com/persys/compute-agent/internal/config" + "github.com/persys-dev/compute-agent/internal/config" "github.com/sirupsen/logrus" ) diff --git a/internal/config/config.go b/internal/config/config.go index 0ba1c74..b1a2c67 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -7,7 +7,7 @@ import ( "strings" "time" - 
"github.com/persys/compute-agent/internal/node" + "github.com/persys-dev/compute-agent/internal/node" ) // Config holds all agent configuration @@ -15,6 +15,8 @@ type Config struct { // Server configuration GRPCAddr string GRPCPort int + // Metrics endpoint configuration + MetricsPort int // TLS/mTLS configuration TLSEnabled bool @@ -47,6 +49,18 @@ type Config struct { VMEnabled bool LibvirtURI string + // Managed storage provider configuration + StorageLocalRoot string + StorageNFSStageDir string + StorageNFSServer string + StorageNFSExport string + StorageNFSOptions string + StorageCephStageDir string + StorageCephCluster string + StorageCephPool string + StorageCephUser string + StorageCephKeyring string + // Reconciliation configuration ReconcileInterval time.Duration ReconcileEnabled bool @@ -72,8 +86,9 @@ type Config struct { func Load() (*Config, error) { cfg := &Config{ // Server defaults - GRPCAddr: getEnv("PERSYS_GRPC_ADDR", "0.0.0.0"), - GRPCPort: getEnvAsInt("PERSYS_GRPC_PORT", 50051), + GRPCAddr: getEnv("PERSYS_GRPC_ADDR", "0.0.0.0"), + GRPCPort: getEnvAsInt("PERSYS_GRPC_PORT", 50051), + MetricsPort: getEnvAsInt("PERSYS_METRICS_PORT", 8089), // TLS defaults TLSEnabled: getEnvAsBool("PERSYS_TLS_ENABLED", true), @@ -104,6 +119,17 @@ func Load() (*Config, error) { VMEnabled: getEnvAsBool("PERSYS_VM_ENABLED", true), LibvirtURI: getEnv("PERSYS_LIBVIRT_URI", "qemu:///system"), + StorageLocalRoot: getEnv("PERSYS_STORAGE_LOCAL_ROOT", "/var/lib/persys/volumes/local"), + StorageNFSStageDir: getEnv("PERSYS_STORAGE_NFS_STAGE_ROOT", "/var/lib/persys/volumes/nfs"), + StorageNFSServer: getEnv("PERSYS_STORAGE_NFS_SERVER", ""), + StorageNFSExport: getEnv("PERSYS_STORAGE_NFS_EXPORT", ""), + StorageNFSOptions: getEnv("PERSYS_STORAGE_NFS_MOUNT_OPTIONS", ""), + StorageCephStageDir: getEnv("PERSYS_STORAGE_CEPH_STAGE_ROOT", "/var/lib/persys/volumes/ceph-rbd"), + StorageCephCluster: getEnv("PERSYS_STORAGE_CEPH_CLUSTER", ""), + StorageCephPool: 
getEnv("PERSYS_STORAGE_CEPH_POOL", ""), + StorageCephUser: getEnv("PERSYS_STORAGE_CEPH_USER", ""), + StorageCephKeyring: getEnv("PERSYS_STORAGE_CEPH_KEYRING", ""), + // Reconciliation defaults ReconcileInterval: getEnvAsDuration("PERSYS_RECONCILE_INTERVAL", 30*time.Second), ReconcileEnabled: getEnvAsBool("PERSYS_RECONCILE_ENABLED", true), @@ -142,6 +168,9 @@ func (c *Config) Validate() error { if c.GRPCPort < 1 || c.GRPCPort > 65535 { return fmt.Errorf("invalid GRPC port: %d", c.GRPCPort) } + if c.MetricsPort < 1 || c.MetricsPort > 65535 { + return fmt.Errorf("invalid metrics port: %d", c.MetricsPort) + } if c.TLSEnabled { if c.TLSCertPath == "" || c.TLSKeyPath == "" || c.TLSCAPath == "" { diff --git a/internal/control/client.go b/internal/control/client.go index 798440c..047678d 100644 --- a/internal/control/client.go +++ b/internal/control/client.go @@ -12,19 +12,23 @@ import ( "strings" "time" - "github.com/persys/compute-agent/internal/config" - "github.com/persys/compute-agent/internal/resources" - "github.com/persys/compute-agent/internal/runtime" - "github.com/persys/compute-agent/internal/workload" - controlv1 "github.com/persys/compute-agent/pkg/control/v1" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/internal/config" + "github.com/persys-dev/compute-agent/internal/resources" + "github.com/persys-dev/compute-agent/internal/runtime" + "github.com/persys-dev/compute-agent/internal/workload" + controlv1 "github.com/persys-dev/compute-agent/pkg/control/v1" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/shirou/gopsutil/v3/cpu" "github.com/shirou/gopsutil/v3/disk" "github.com/shirou/gopsutil/v3/mem" "github.com/sirupsen/logrus" + "go.opentelemetry.io/otel" + otelcodes "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/metadata" 
"google.golang.org/protobuf/types/known/timestamppb" ) @@ -215,6 +219,7 @@ func (c *Client) register(ctx context.Context, client controlv1.AgentControlClie func (c *Client) heartbeat(ctx context.Context, client controlv1.AgentControlClient) (*controlv1.HeartbeatResponse, error) { usage := c.nodeUsage() workloadStatuses := c.workloadStatuses(ctx) + workloadUsage := c.workloadUsage(workloadStatuses) rpcCtx, cancel := context.WithTimeout(ctx, defaultRPCTimeout) defer cancel() @@ -223,6 +228,7 @@ func (c *Client) heartbeat(ctx context.Context, client controlv1.AgentControlCli NodeId: c.cfg.NodeID, Usage: usage, WorkloadStatuses: workloadStatuses, + WorkloadUsage: workloadUsage, Timestamp: timestamppb.Now(), }) } @@ -246,7 +252,10 @@ func (c *Client) dial(ctx context.Context) (*grpc.ClientConn, controlv1.AgentCon func (c *Client) dialOptions() ([]grpc.DialOption, error) { if c.cfg.SchedulerInsecure || !c.cfg.SchedulerTLSEnabled { - return []grpc.DialOption{grpc.WithTransportCredentials(insecure.NewCredentials())}, nil + return []grpc.DialOption{ + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithUnaryInterceptor(otelUnaryClientInterceptor("compute-agent-control")), + }, nil } cert, err := tls.LoadX509KeyPair(c.cfg.TLSCertPath, c.cfg.TLSKeyPath) @@ -270,7 +279,58 @@ func (c *Client) dialOptions() ([]grpc.DialOption, error) { Certificates: []tls.Certificate{cert}, } - return []grpc.DialOption{grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig))}, nil + return []grpc.DialOption{ + grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)), + grpc.WithUnaryInterceptor(otelUnaryClientInterceptor("compute-agent-control")), + }, nil +} + +func otelUnaryClientInterceptor(service string) grpc.UnaryClientInterceptor { + tr := otel.Tracer(service) + return func(ctx context.Context, method string, req interface{}, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error { + ctx, span := tr.Start(ctx, method, 
trace.WithSpanKind(trace.SpanKindClient)) + defer span.End() + + md, ok := metadata.FromOutgoingContext(ctx) + if !ok { + md = metadata.New(nil) + } else { + md = md.Copy() + } + otel.GetTextMapPropagator().Inject(ctx, metadataCarrier(md)) + ctx = metadata.NewOutgoingContext(ctx, md) + + err := invoker(ctx, method, req, reply, cc, opts...) + if err != nil { + span.RecordError(err) + span.SetStatus(otelcodes.Error, err.Error()) + } + return err + } +} + +type metadataCarrier metadata.MD + +func (m metadataCarrier) Get(key string) string { + values := metadata.MD(m).Get(key) + if len(values) == 0 { + return "" + } + return values[0] +} + +func (m metadataCarrier) Set(key string, value string) { + md := metadata.MD(m) + md.Set(strings.ToLower(key), value) +} + +func (m metadataCarrier) Keys() []string { + md := metadata.MD(m) + keys := make([]string, 0, len(md)) + for k := range md { + keys = append(keys, k) + } + return keys } func (c *Client) nodeCapabilities() (*controlv1.NodeCapabilities, error) { @@ -292,10 +352,11 @@ func (c *Client) nodeCapabilities() (*controlv1.NodeCapabilities, error) { } return &controlv1.NodeCapabilities{ - CpuTotalMillicores: int64(cpuCount * 1000), - MemoryTotalMb: int64(memStats.Total / 1024 / 1024), - StoragePools: storagePools, - SupportedWorkloadTypes: c.supportedWorkloadTypes(), + CpuTotalMillicores: int64(cpuCount * 1000), + MemoryTotalMb: int64(memStats.Total / 1024 / 1024), + StoragePools: storagePools, + SupportedWorkloadTypes: c.supportedWorkloadTypes(), + SupportedStorageDrivers: c.supportedStorageDrivers(), }, nil } @@ -359,12 +420,28 @@ func (c *Client) workloadStatuses(ctx context.Context) []*controlv1.WorkloadStat FailureReason: mapFailureReason(status), Message: status.Message, LastTransition: timestamppb.New(nonZeroTime(status.UpdatedAt)), + Reason: reasonDetail(status), + Usage: usageSnapshot(status), }) } return out } +func (c *Client) workloadUsage(statuses []*controlv1.WorkloadStatus) 
[]*controlv1.WorkloadUsageSnapshot { + if len(statuses) == 0 { + return nil + } + out := make([]*controlv1.WorkloadUsageSnapshot, 0, len(statuses)) + for _, status := range statuses { + if status == nil || status.GetUsage() == nil { + continue + } + out = append(out, status.GetUsage()) + } + return out +} + func (c *Client) supportedWorkloadTypes() []string { types := make([]string, 0, 3) if c.runtimeMgr != nil && c.runtimeMgr.IsEnabled(models.WorkloadTypeContainer) { @@ -392,6 +469,40 @@ func (c *Client) supportedWorkloadTypes() []string { return types } +func (c *Client) supportedStorageDrivers() []string { + drivers := []string{"local"} + seen := map[string]struct{}{"local": {}} + if strings.TrimSpace(c.cfg.StorageNFSServer) != "" && strings.TrimSpace(c.cfg.StorageNFSExport) != "" { + drivers = appendUniqueDriver(drivers, seen, "nfs") + } + if strings.TrimSpace(c.cfg.StorageCephPool) != "" { + drivers = appendUniqueDriver(drivers, seen, "ceph-rbd") + } + for _, labelKey := range []string{"storage.nfs", "storage.ceph_rbd", "storage.ceph-rbd"} { + if strings.EqualFold(strings.TrimSpace(c.cfg.NodeLabels[labelKey]), "true") { + switch labelKey { + case "storage.nfs": + drivers = appendUniqueDriver(drivers, seen, "nfs") + default: + drivers = appendUniqueDriver(drivers, seen, "ceph-rbd") + } + } + } + return drivers +} + +func appendUniqueDriver(drivers []string, seen map[string]struct{}, driver string) []string { + normalized := strings.ToLower(strings.TrimSpace(driver)) + if normalized == "" { + return drivers + } + if _, ok := seen[normalized]; ok { + return drivers + } + seen[normalized] = struct{}{} + return append(drivers, normalized) +} + func (c *Client) agentEndpoint() string { if c.cfg.AgentGRPCEndpoint != "" { return c.cfg.AgentGRPCEndpoint @@ -440,6 +551,61 @@ func mapFailureReason(status *models.WorkloadStatus) controlv1.FailureReason { return controlv1.FailureReason_FAILURE_REASON_UNSPECIFIED } +func reasonDetail(status *models.WorkloadStatus) 
*controlv1.ReasonDetail { + if status == nil { + return nil + } + code := "" + message := "" + retryable := false + nextRetry := time.Time{} + if status.Metadata != nil { + code = strings.TrimSpace(status.Metadata["failure_reason"]) + message = strings.TrimSpace(status.Metadata["failure_message"]) + retryable = strings.EqualFold(strings.TrimSpace(status.Metadata["retryable"]), "true") + if rawNext := strings.TrimSpace(status.Metadata["retry_next_at"]); rawNext != "" { + if parsed, err := time.Parse(time.RFC3339, rawNext); err == nil { + nextRetry = parsed + } + } + } + if code == "" && message == "" && nextRetry.IsZero() && !retryable { + return nil + } + reason := &controlv1.ReasonDetail{ + Code: code, + Message: message, + LastTransition: timestamppb.New(nonZeroTime(status.UpdatedAt)), + Retryable: retryable, + } + if !nextRetry.IsZero() { + reason.NextRetryAt = timestamppb.New(nextRetry.UTC()) + } + return reason +} + +func usageSnapshot(status *models.WorkloadStatus) *controlv1.WorkloadUsageSnapshot { + if status == nil || status.Usage == nil { + return nil + } + collected := status.Usage.CollectedAt + if collected.IsZero() { + collected = nonZeroTime(status.UpdatedAt) + } + return &controlv1.WorkloadUsageSnapshot{ + WorkloadId: strings.TrimSpace(status.ID), + Type: strings.TrimSpace(status.Usage.Type), + CpuPercent: status.Usage.CPUPercent, + MemoryBytes: status.Usage.MemoryBytes, + DiskReadBytes: status.Usage.DiskReadBytes, + DiskWriteBytes: status.Usage.DiskWriteBytes, + NetRxBytes: status.Usage.NetRXBytes, + NetTxBytes: status.Usage.NetTXBytes, + CollectedAt: timestamppb.New(collected.UTC()), + Source: strings.TrimSpace(status.Usage.Source), + } +} + func nonZeroTime(t time.Time) time.Time { if t.IsZero() { return time.Now() diff --git a/internal/garbage/collector.go b/internal/garbage/collector.go index 87c58af..5084979 100644 --- a/internal/garbage/collector.go +++ b/internal/garbage/collector.go @@ -4,10 +4,10 @@ import ( "context" "time" - 
"github.com/persys/compute-agent/internal/metrics" - "github.com/persys/compute-agent/internal/runtime" - "github.com/persys/compute-agent/internal/state" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/internal/metrics" + "github.com/persys-dev/compute-agent/internal/runtime" + "github.com/persys-dev/compute-agent/internal/state" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" ) diff --git a/internal/garbage/collector_test.go b/internal/garbage/collector_test.go index 3c67b77..050802f 100644 --- a/internal/garbage/collector_test.go +++ b/internal/garbage/collector_test.go @@ -5,8 +5,8 @@ import ( "testing" "time" - "github.com/persys/compute-agent/internal/runtime" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/internal/runtime" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" ) diff --git a/internal/grpc/server.go b/internal/grpc/server.go index b675491..b2aa85a 100644 --- a/internal/grpc/server.go +++ b/internal/grpc/server.go @@ -13,17 +13,21 @@ import ( "sync" "time" - "github.com/persys/compute-agent/internal/config" - "github.com/persys/compute-agent/internal/metrics" - "github.com/persys/compute-agent/internal/resources" - "github.com/persys/compute-agent/internal/runtime" - "github.com/persys/compute-agent/internal/task" - "github.com/persys/compute-agent/internal/workload" - pb "github.com/persys/compute-agent/pkg/api/v1" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/internal/config" + "github.com/persys-dev/compute-agent/internal/metrics" + "github.com/persys-dev/compute-agent/internal/resources" + "github.com/persys-dev/compute-agent/internal/runtime" + "github.com/persys-dev/compute-agent/internal/task" + "github.com/persys-dev/compute-agent/internal/workload" + pb "github.com/persys-dev/compute-agent/pkg/api/v1" + "github.com/persys-dev/compute-agent/pkg/models" 
"github.com/sirupsen/logrus" + "go.opentelemetry.io/otel" + otelcodes "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "google.golang.org/grpc" "google.golang.org/grpc/credentials" + "google.golang.org/grpc/metadata" "google.golang.org/grpc/peer" ) @@ -51,6 +55,7 @@ func NewServer(cfg *config.Config, manager *workload.Manager, runtimeMgr *runtim } var opts []grpc.ServerOption + opts = append(opts, grpc.UnaryInterceptor(otelUnaryServerInterceptor("compute-agent"))) // Configure mTLS if enabled if cfg.TLSEnabled { @@ -99,6 +104,49 @@ func (s *Server) Start() error { return s.grpcServer.Serve(listener) } +func otelUnaryServerInterceptor(service string) grpc.UnaryServerInterceptor { + tr := otel.Tracer(service) + return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { + md, _ := metadata.FromIncomingContext(ctx) + if md != nil { + ctx = otel.GetTextMapPropagator().Extract(ctx, metadataCarrier(md)) + } + ctx, span := tr.Start(ctx, info.FullMethod, trace.WithSpanKind(trace.SpanKindServer)) + defer span.End() + + resp, err := handler(ctx, req) + if err != nil { + span.RecordError(err) + span.SetStatus(otelcodes.Error, err.Error()) + } + return resp, err + } +} + +type metadataCarrier metadata.MD + +func (m metadataCarrier) Get(key string) string { + values := metadata.MD(m).Get(key) + if len(values) == 0 { + return "" + } + return values[0] +} + +func (m metadataCarrier) Set(key string, value string) { + md := metadata.MD(m) + md.Set(strings.ToLower(key), value) +} + +func (m metadataCarrier) Keys() []string { + md := metadata.MD(m) + keys := make([]string, 0, len(md)) + for k := range md { + keys = append(keys, k) + } + return keys +} + // Stop gracefully stops the gRPC server func (s *Server) Stop() { s.logger.Info("Stopping gRPC server") @@ -315,15 +363,34 @@ func (s *Server) GetWorkloadStatus(ctx context.Context, req *pb.GetWorkloadStatu s.logClientInfo(ctx, "GetWorkloadStatus", 
req.Id) status, err := s.manager.GetStatus(ctx, req.Id) + pending := s.pendingStatusFromTasks(req.Id) if err != nil { - if pending := s.pendingStatusFromTasks(req.Id); pending != nil { + if pending != nil { return &pb.GetWorkloadStatusResponse{Status: pending}, nil } return nil, err } + respStatus := s.statusToProto(status) + // Surface in-flight apply/delete task state even when a persisted status exists. + // Without this, callers can see stale "stopped/running" and repeatedly enqueue apply. + if pending != nil { + taskState := strings.ToLower(strings.TrimSpace(pending.GetMetadata()["task_status"])) + if taskState == string(task.TaskStatusPending) || taskState == string(task.TaskStatusRunning) { + if respStatus.Metadata == nil { + respStatus.Metadata = map[string]string{} + } + for k, v := range pending.GetMetadata() { + respStatus.Metadata[k] = v + } + respStatus.ActualState = pb.ActualState_ACTUAL_STATE_PENDING + respStatus.Message = pending.GetMessage() + respStatus.UpdatedAt = time.Now().Unix() + } + } + return &pb.GetWorkloadStatusResponse{ - Status: s.statusToProto(status), + Status: respStatus, }, nil } @@ -596,6 +663,44 @@ func (s *Server) statusToProto(status *models.WorkloadStatus) *pb.WorkloadStatus CreatedAt: status.CreatedAt.Unix(), UpdatedAt: status.UpdatedAt.Unix(), Metadata: status.Metadata, + Usage: statusUsageToProto(status), + } +} + +func statusUsageToProto(status *models.WorkloadStatus) *pb.WorkloadUsageSnapshot { + if status == nil || status.Usage == nil { + return nil + } + + collectedAt := status.Usage.CollectedAt.Unix() + if collectedAt < 0 { + collectedAt = 0 + } + + return &pb.WorkloadUsageSnapshot{ + WorkloadId: status.Usage.WorkloadID, + Type: sWorkloadType(status.Usage.Type), + CpuPercent: status.Usage.CPUPercent, + MemoryBytes: status.Usage.MemoryBytes, + DiskReadBytes: status.Usage.DiskReadBytes, + DiskWriteBytes: status.Usage.DiskWriteBytes, + NetRxBytes: status.Usage.NetRXBytes, + NetTxBytes: status.Usage.NetTXBytes, + 
CollectedAt: collectedAt, + Source: status.Usage.Source, + } +} + +func sWorkloadType(t string) pb.WorkloadType { + switch strings.ToLower(strings.TrimSpace(t)) { + case "container": + return pb.WorkloadType_WORKLOAD_TYPE_CONTAINER + case "compose": + return pb.WorkloadType_WORKLOAD_TYPE_COMPOSE + case "vm": + return pb.WorkloadType_WORKLOAD_TYPE_VM + default: + return pb.WorkloadType_WORKLOAD_TYPE_UNSPECIFIED } } diff --git a/internal/grpc/server_test.go b/internal/grpc/server_test.go index ffcefae..8728ec5 100644 --- a/internal/grpc/server_test.go +++ b/internal/grpc/server_test.go @@ -5,9 +5,9 @@ import ( "testing" "time" - "github.com/persys/compute-agent/internal/task" - pb "github.com/persys/compute-agent/pkg/api/v1" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/internal/task" + pb "github.com/persys-dev/compute-agent/pkg/api/v1" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" ) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index ffa9455..ca08857 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -43,6 +43,12 @@ type Metrics struct { GCOldFailedWorkloadsFound prometheus.Gauge GCResourcesDeleted *prometheus.CounterVec + // Task queue metrics + TaskQueueDepth prometheus.Gauge + TaskLatency *prometheus.HistogramVec + TaskExecutions *prometheus.CounterVec + TaskFailuresTotal *prometheus.CounterVec + mu sync.RWMutex } @@ -214,6 +220,43 @@ func NewMetrics(logger *logrus.Logger) (*Metrics, error) { }, []string{"resource_type"}, ), + + TaskQueueDepth: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: "persys_agent", + Subsystem: "task", + Name: "queue_depth", + Help: "Current number of tasks waiting in queue channel.", + }, + ), + TaskLatency: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "persys_agent", + Subsystem: "task", + Name: "latency_seconds", + Help: "Task execution latency by task type and status.", + 
Buckets: prometheus.DefBuckets, + }, + []string{"type", "status"}, + ), + TaskExecutions: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "persys_agent", + Subsystem: "task", + Name: "executions_total", + Help: "Total task executions by type and final status.", + }, + []string{"type", "status"}, + ), + TaskFailuresTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "persys_agent", + Subsystem: "task", + Name: "failures_total", + Help: "Total failed task executions by type.", + }, + []string{"type"}, + ), } // Register all metrics @@ -236,6 +279,10 @@ func NewMetrics(logger *logrus.Logger) (*Metrics, error) { m.GCOrphanedResourcesFound, m.GCOldFailedWorkloadsFound, m.GCResourcesDeleted, + m.TaskQueueDepth, + m.TaskLatency, + m.TaskExecutions, + m.TaskFailuresTotal, ) logger.Info("Metrics initialized successfully") @@ -412,3 +459,23 @@ func (m *Metrics) RecordGCResourceDeleted(resourceType string) { defer m.mu.Unlock() m.GCResourcesDeleted.WithLabelValues(resourceType).Inc() } + +// SetTaskQueueDepth updates async task queue depth. +func (m *Metrics) SetTaskQueueDepth(depth int) { + m.mu.Lock() + defer m.mu.Unlock() + m.TaskQueueDepth.Set(float64(depth)) +} + +// ObserveTaskExecution records execution latency and counters. +func (m *Metrics) ObserveTaskExecution(taskType string, duration time.Duration, failed bool) { + m.mu.Lock() + defer m.mu.Unlock() + status := "completed" + if failed { + status = "failed" + m.TaskFailuresTotal.WithLabelValues(taskType).Inc() + } + m.TaskExecutions.WithLabelValues(taskType, status).Inc() + m.TaskLatency.WithLabelValues(taskType, status).Observe(duration.Seconds()) +} diff --git a/internal/platform/network.go b/internal/platform/network.go new file mode 100644 index 0000000..e3ea526 --- /dev/null +++ b/internal/platform/network.go @@ -0,0 +1,20 @@ +package platform + +import "context" + +// NetworkAttachment represents a provider-specific network attachment. 
+type NetworkAttachment struct { + ID string + WorkloadID string + Network string + Interface string + IPAddress string + MAC string +} + +// NetworkProvider is reserved for runtime/network decoupling (phase 3). +type NetworkProvider interface { + Driver() string + Attach(ctx context.Context, workloadID string, spec WorkloadNetSpec) (*NetworkAttachment, error) + Detach(ctx context.Context, attachment *NetworkAttachment) error +} diff --git a/internal/platform/storage.go b/internal/platform/storage.go new file mode 100644 index 0000000..8b16cf2 --- /dev/null +++ b/internal/platform/storage.go @@ -0,0 +1,133 @@ +package platform + +import ( + "context" + "fmt" + "strings" + "sync" +) + +// StorageProvider handles lifecycle for one storage backend (local/nfs/ceph-rbd). +type StorageProvider interface { + Driver() string + Validate(ctx context.Context, spec VolumeSpec) error + Provision(ctx context.Context, spec VolumeSpec) (*VolumeHandle, error) + Delete(ctx context.Context, handle *VolumeHandle) error + Attach(ctx context.Context, handle *VolumeHandle, workloadID string, mountPath string, readOnly bool) (*VolumeAttachment, error) + Detach(ctx context.Context, attachment *VolumeAttachment) error +} + +// VolumeManager orchestrates provider-level volume operations. +type VolumeManager interface { + ResolveProvider(driver string) (StorageProvider, error) + Provision(ctx context.Context, spec VolumeSpec) (*VolumeHandle, error) + Attach(ctx context.Context, spec VolumeSpec, handle *VolumeHandle, workloadID string) (*VolumeAttachment, error) + Detach(ctx context.Context, attachment *VolumeAttachment) error + Delete(ctx context.Context, handle *VolumeHandle) error +} + +// ProviderRegistry stores and resolves storage providers by driver. 
+type ProviderRegistry struct { + mu sync.RWMutex + providers map[string]StorageProvider +} + +func NewProviderRegistry() *ProviderRegistry { + return &ProviderRegistry{ + providers: make(map[string]StorageProvider), + } +} + +func (r *ProviderRegistry) RegisterStorageProvider(provider StorageProvider) { + if provider == nil { + return + } + driver := normalizeDriver(provider.Driver()) + if driver == "" { + return + } + r.mu.Lock() + r.providers[driver] = provider + r.mu.Unlock() +} + +func (r *ProviderRegistry) StorageProvider(driver string) (StorageProvider, bool) { + r.mu.RLock() + defer r.mu.RUnlock() + provider, ok := r.providers[normalizeDriver(driver)] + return provider, ok +} + +type DefaultVolumeManager struct { + registry *ProviderRegistry +} + +func NewDefaultVolumeManager(registry *ProviderRegistry) *DefaultVolumeManager { + return &DefaultVolumeManager{registry: registry} +} + +func (m *DefaultVolumeManager) ResolveProvider(driver string) (StorageProvider, error) { + if m == nil || m.registry == nil { + return nil, fmt.Errorf("storage provider registry is not configured") + } + provider, ok := m.registry.StorageProvider(driver) + if !ok { + return nil, fmt.Errorf("storage driver %q is not registered", driver) + } + return provider, nil +} + +func (m *DefaultVolumeManager) Provision(ctx context.Context, spec VolumeSpec) (*VolumeHandle, error) { + provider, err := m.ResolveProvider(spec.Driver) + if err != nil { + return nil, err + } + if err := provider.Validate(ctx, spec); err != nil { + return nil, err + } + return provider.Provision(ctx, spec) +} + +func (m *DefaultVolumeManager) Attach(ctx context.Context, spec VolumeSpec, handle *VolumeHandle, workloadID string) (*VolumeAttachment, error) { + provider, err := m.ResolveProvider(spec.Driver) + if err != nil { + return nil, err + } + return provider.Attach(ctx, handle, workloadID, spec.MountPath, spec.ReadOnly) +} + +func (m *DefaultVolumeManager) Detach(ctx context.Context, attachment 
*VolumeAttachment) error { + if attachment == nil { + return nil + } + provider, err := m.ResolveProvider(attachmentDriver(attachment)) + if err != nil { + return err + } + return provider.Detach(ctx, attachment) +} + +func (m *DefaultVolumeManager) Delete(ctx context.Context, handle *VolumeHandle) error { + if handle == nil { + return nil + } + provider, err := m.ResolveProvider(handle.Driver) + if err != nil { + return err + } + return provider.Delete(ctx, handle) +} + +func attachmentDriver(attachment *VolumeAttachment) string { + if attachment == nil { + return "" + } + if attachment.Metadata == nil { + return "" + } + return attachment.Metadata["driver"] +} + +func normalizeDriver(driver string) string { + return strings.ToLower(strings.TrimSpace(driver)) +} diff --git a/internal/platform/types.go b/internal/platform/types.go new file mode 100644 index 0000000..b8a4867 --- /dev/null +++ b/internal/platform/types.go @@ -0,0 +1,67 @@ +package platform + +import ( + "strings" + "time" + + "github.com/persys-dev/compute-agent/pkg/models" +) + +// VolumeSpec describes a managed volume request independent of runtime backend. +type VolumeSpec struct { + Name string + Driver string + SizeGB int64 + AccessMode string + FSType string + MountPath string + ReadOnly bool + RetainPolicy string +} + +// VolumeHandle is a provider-managed volume identity and staging metadata. +type VolumeHandle struct { + ID string `json:"id"` + Name string `json:"name"` + Driver string `json:"driver"` + SizeGB int64 `json:"size_gb,omitempty"` + Device string `json:"device,omitempty"` + StagePath string `json:"stage_path,omitempty"` + Metadata map[string]string `json:"metadata,omitempty"` + CreatedAt time.Time `json:"created_at,omitempty"` + UpdatedAt time.Time `json:"updated_at,omitempty"` +} + +// VolumeAttachment is a provider-managed attachment for a workload. 
+type VolumeAttachment struct { + ID string `json:"id"` + VolumeID string `json:"volume_id"` + WorkloadID string `json:"workload_id"` + MountPath string `json:"mount_path"` + ReadOnly bool `json:"read_only,omitempty"` + StagePath string `json:"stage_path,omitempty"` + Metadata map[string]string `json:"metadata,omitempty"` + CreatedAt time.Time `json:"created_at,omitempty"` + UpdatedAt time.Time `json:"updated_at,omitempty"` +} + +// WorkloadNetSpec describes a runtime-agnostic network request. +type WorkloadNetSpec struct { + Network string + MAC string + IPAddress string +} + +// VolumeSpecFromModel converts API/workload managed volume to platform spec. +func VolumeSpecFromModel(in models.ManagedVolumeSpec) VolumeSpec { + return VolumeSpec{ + Name: strings.TrimSpace(in.Name), + Driver: strings.TrimSpace(in.Driver), + SizeGB: in.SizeGB, + AccessMode: strings.TrimSpace(in.AccessMode), + FSType: strings.TrimSpace(in.FSType), + MountPath: strings.TrimSpace(in.MountPath), + ReadOnly: in.ReadOnly, + RetainPolicy: strings.TrimSpace(in.RetainPolicy), + } +} diff --git a/internal/reconcile/loop.go b/internal/reconcile/loop.go index 899e47a..703353e 100644 --- a/internal/reconcile/loop.go +++ b/internal/reconcile/loop.go @@ -6,8 +6,8 @@ import ( "sync" "time" - "github.com/persys/compute-agent/internal/state" - "github.com/persys/compute-agent/internal/workload" + "github.com/persys-dev/compute-agent/internal/state" + "github.com/persys-dev/compute-agent/internal/workload" "github.com/sirupsen/logrus" ) diff --git a/internal/reconcile/loop_test.go b/internal/reconcile/loop_test.go index c9abbea..5dbdb1f 100644 --- a/internal/reconcile/loop_test.go +++ b/internal/reconcile/loop_test.go @@ -5,10 +5,10 @@ import ( "errors" "testing" - "github.com/persys/compute-agent/internal/runtime" - "github.com/persys/compute-agent/internal/state" - "github.com/persys/compute-agent/internal/workload" - "github.com/persys/compute-agent/pkg/models" + 
"github.com/persys-dev/compute-agent/internal/runtime" + "github.com/persys-dev/compute-agent/internal/state" + "github.com/persys-dev/compute-agent/internal/workload" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" ) diff --git a/internal/retry/policy.go b/internal/retry/policy.go index cbdd532..1f37f97 100644 --- a/internal/retry/policy.go +++ b/internal/retry/policy.go @@ -29,6 +29,7 @@ const ( // Specification failures FailureReasonInvalidSpec FailureReason = "INVALID_SPECIFICATION" FailureReasonInvalidConfig FailureReason = "INVALID_CONFIGURATION" + FailureReasonPortConflict FailureReason = "PORT_BIND_CONFLICT" // Runtime failures FailureReasonRuntimeError FailureReason = "RUNTIME_ERROR" @@ -45,7 +46,9 @@ func (f FailureReason) IsTransient() bool { FailureReasonNetworkTimeout, FailureReasonNetworkUnreachable, FailureReasonDNSResolution, - FailureReasonRuntimeUnhealthy: + FailureReasonRuntimeUnhealthy, + FailureReasonRuntimeError, + FailureReasonUnknown: return true default: return false @@ -57,7 +60,8 @@ func (f FailureReason) IsPermanent() bool { switch f { case FailureReasonInvalidImage, FailureReasonInvalidSpec, - FailureReasonInvalidConfig: + FailureReasonInvalidConfig, + FailureReasonPortConflict: return true default: return false @@ -265,6 +269,10 @@ func ClassifyError(err error) FailureReason { return FailureReasonInvalidImage } + if isPortBindConflict(errMsg) { + return FailureReasonPortConflict + } + if isNetworkError(errMsg) { if isTimeoutError(errMsg) { return FailureReasonNetworkTimeout @@ -316,6 +324,12 @@ func isSpecError(msg string) bool { return contains(msg, "invalid", "malformed", "schema", "validation") } +func isPortBindConflict(msg string) bool { + return contains(msg, "bind", "address already in use") || + contains(msg, "port", "already allocated") || + contains(msg, "error starting userland proxy") +} + func contains(str string, keywords ...string) bool { str = strings.ToLower(str) for _, keyword := range 
keywords { diff --git a/internal/retry/policy_test.go b/internal/retry/policy_test.go index 22c4deb..a543a09 100644 --- a/internal/retry/policy_test.go +++ b/internal/retry/policy_test.go @@ -35,3 +35,33 @@ func TestClassifyError_ImagePullTimeout(t *testing.T) { t.Fatalf("expected %s, got %s", FailureReasonImagePullTimeout, got) } } + +func TestClassifyError_PortBindConflict(t *testing.T) { + err := errors.New("Error starting userland proxy: listen tcp4 0.0.0.0:8080: bind: address already in use") + got := ClassifyError(err) + if got != FailureReasonPortConflict { + t.Fatalf("expected %s, got %s", FailureReasonPortConflict, got) + } +} + +func TestShouldRetry_UnknownErrorIsRetryable(t *testing.T) { + tracker := NewRetryTracker(DefaultRetryPolicy()) + result, err := tracker.RecordFailure(FailureReasonUnknown, "unclassified runtime error") + if err != nil { + t.Fatalf("expected unknown error to be retryable, got error: %v", err) + } + if result == nil || !result.Retryable { + t.Fatalf("expected unknown error to be retryable, got %#v", result) + } +} + +func TestShouldRetry_RuntimeErrorIsRetryable(t *testing.T) { + tracker := NewRetryTracker(DefaultRetryPolicy()) + result, err := tracker.RecordFailure(FailureReasonRuntimeError, "container exited unexpectedly") + if err != nil { + t.Fatalf("expected runtime error to be retryable, got error: %v", err) + } + if result == nil || !result.Retryable { + t.Fatalf("expected runtime error to be retryable, got %#v", result) + } +} diff --git a/internal/runtime/compose.go b/internal/runtime/compose.go index 072bdff..703b8ac 100644 --- a/internal/runtime/compose.go +++ b/internal/runtime/compose.go @@ -8,9 +8,10 @@ import ( "os" "os/exec" "path/filepath" + "sort" "strings" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" ) @@ -59,10 +60,13 @@ func (c *ComposeRuntime) Create(ctx context.Context, workload *models.Workload) return fmt.Errorf("failed to 
parse compose spec: %w", err) } - // Decode the compose YAML - composeYAML, err := base64.StdEncoding.DecodeString(spec.ComposeYAML) + // Accept both base64-encoded and inline YAML compose payloads. + composeYAML, payloadType, err := decodeComposeYAMLPayload(spec.ComposeYAML) if err != nil { - return fmt.Errorf("failed to decode compose yaml: %w", err) + return fmt.Errorf("failed to parse compose yaml payload: %w", err) + } + if payloadType == "inline" { + c.logger.Infof("Compose payload for %s is inline YAML; skipping base64 decode", workload.ID) } // Create project directory @@ -149,41 +153,42 @@ func (c *ComposeRuntime) Status(ctx context.Context, id string) (models.ActualSt return models.ActualStateUnknown, "project not found", nil } - // Get project status - cmd := c.buildCommand(ctx, "-p", id, "ps", "-q") - cmd.Dir = projectDir - - output, err := cmd.CombinedOutput() + // Query compose directly, reading stdout only so stderr warnings don't corrupt parsing. + allIDs, err := c.composeContainerIDs(ctx, projectDir, id, "ps", "-q", "--all") if err != nil { - return models.ActualStateUnknown, "", fmt.Errorf("failed to get compose status: %w", err) - } - - // Count running containers - containerIDs := strings.Split(strings.TrimSpace(string(output)), "\n") - runningCount := 0 - - for _, containerID := range containerIDs { - if containerID == "" { - continue + c.logger.Debugf("compose status query failed for %s via compose CLI, falling back to docker labels: %v", id, err) + allIDs, err = c.dockerContainerIDsByComposeProject(ctx, id, true) + if err != nil { + return models.ActualStateUnknown, "", fmt.Errorf("failed to get compose container list via compose and docker fallback: %w", err) } - - // Check if container is running - inspectCmd := exec.CommandContext(ctx, "docker", "inspect", "-f", "{{.State.Status}}", containerID) - inspectOutput, err := inspectCmd.CombinedOutput() + } + runningIDs, err := c.composeContainerIDs(ctx, projectDir, id, "ps", "-q", "--status", 
"running") + if err != nil { + c.logger.Debugf("compose running-status query failed for %s via compose CLI, falling back to docker labels: %v", id, err) + runningIDs, err = c.dockerContainerIDsByComposeProject(ctx, id, false) if err != nil { - continue + return models.ActualStateUnknown, "", fmt.Errorf("failed to get running compose container list via compose and docker fallback: %w", err) } + } - status := strings.TrimSpace(string(inspectOutput)) - if status == "running" { - runningCount++ + // Compose CLI can occasionally return an empty set during project metadata drift; + // fallback to Docker labels to avoid false Stopped when containers are actually up. + if len(allIDs) == 0 { + fallbackAll, fallbackErr := c.dockerContainerIDsByComposeProject(ctx, id, true) + if fallbackErr != nil { + c.logger.Debugf("compose empty status fallback failed for %s: %v", id, fallbackErr) + } else if len(fallbackAll) > 0 { + allIDs = fallbackAll + fallbackRunning, runningErr := c.dockerContainerIDsByComposeProject(ctx, id, false) + if runningErr != nil { + return models.ActualStateUnknown, "", fmt.Errorf("failed to get running compose container list from docker fallback: %w", runningErr) + } + runningIDs = fallbackRunning } } - totalContainers := len(containerIDs) - if totalContainers > 0 && containerIDs[0] == "" { - totalContainers = 0 - } + totalContainers := len(allIDs) + runningCount := len(runningIDs) if totalContainers == 0 { return models.ActualStateStopped, "no containers", nil @@ -256,3 +261,84 @@ func (c *ComposeRuntime) buildEnvFile(env map[string]string) string { } return strings.Join(lines, "\n") } + +func decodeComposeYAMLPayload(payload string) ([]byte, string, error) { + trimmed := strings.TrimSpace(payload) + if trimmed == "" { + return nil, "", fmt.Errorf("compose yaml payload is empty") + } + + decoded, err := base64.StdEncoding.DecodeString(trimmed) + if err == nil { + return decoded, "base64", nil + } + + if looksLikeInlineComposeYAML(trimmed) { + return 
[]byte(trimmed), "inline", nil + } + + return nil, "", fmt.Errorf("invalid base64 payload and does not look like inline compose yaml: %w", err) +} + +func looksLikeInlineComposeYAML(payload string) bool { + trimmed := strings.TrimSpace(payload) + lower := strings.ToLower(trimmed) + return strings.Contains(trimmed, "\n") || + strings.Contains(lower, "services:") || + strings.HasPrefix(lower, "version:") +} + +func (c *ComposeRuntime) composeContainerIDs(ctx context.Context, projectDir, projectName string, args ...string) ([]string, error) { + cmdArgs := append([]string{"-p", projectName}, args...) + cmd := c.buildCommand(ctx, cmdArgs...) + cmd.Dir = projectDir + + output, err := cmd.Output() + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + return nil, fmt.Errorf("%w: %s", err, strings.TrimSpace(string(exitErr.Stderr))) + } + return nil, err + } + + lines := strings.Split(strings.TrimSpace(string(output)), "\n") + ids := make([]string, 0, len(lines)) + for _, line := range lines { + id := strings.TrimSpace(line) + if id == "" { + continue + } + ids = append(ids, id) + } + sort.Strings(ids) + return ids, nil +} + +func (c *ComposeRuntime) dockerContainerIDsByComposeProject(ctx context.Context, projectName string, all bool) ([]string, error) { + filter := fmt.Sprintf("label=com.docker.compose.project=%s", projectName) + args := []string{"ps", "-q", "--filter", filter} + if all { + args = []string{"ps", "-aq", "--filter", filter} + } + + cmd := exec.CommandContext(ctx, "docker", args...) 
+ output, err := cmd.Output() + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + return nil, fmt.Errorf("%w: %s", err, strings.TrimSpace(string(exitErr.Stderr))) + } + return nil, err + } + + lines := strings.Split(strings.TrimSpace(string(output)), "\n") + ids := make([]string, 0, len(lines)) + for _, line := range lines { + id := strings.TrimSpace(line) + if id == "" { + continue + } + ids = append(ids, id) + } + sort.Strings(ids) + return ids, nil +} diff --git a/internal/runtime/compose_test.go b/internal/runtime/compose_test.go new file mode 100644 index 0000000..5cdf5e6 --- /dev/null +++ b/internal/runtime/compose_test.go @@ -0,0 +1,44 @@ +package runtime + +import ( + "encoding/base64" + "strings" + "testing" +) + +func TestDecodeComposeYAMLPayload_Base64(t *testing.T) { + raw := "services:\n web:\n image: nginx:1.27\n" + encoded := base64.StdEncoding.EncodeToString([]byte(raw)) + + decoded, payloadType, err := decodeComposeYAMLPayload(encoded) + if err != nil { + t.Fatalf("decodeComposeYAMLPayload returned error: %v", err) + } + if payloadType != "base64" { + t.Fatalf("expected payload type base64, got %q", payloadType) + } + if string(decoded) != raw { + t.Fatalf("decoded payload mismatch: got %q", string(decoded)) + } +} + +func TestDecodeComposeYAMLPayload_InlineYAML(t *testing.T) { + raw := "version: \"3.8\"\nservices:\n redis:\n image: redis:7-alpine\n" + + decoded, payloadType, err := decodeComposeYAMLPayload(raw) + if err != nil { + t.Fatalf("decodeComposeYAMLPayload returned error: %v", err) + } + if payloadType != "inline" { + t.Fatalf("expected payload type inline, got %q", payloadType) + } + if strings.TrimSpace(string(decoded)) != strings.TrimSpace(raw) { + t.Fatalf("decoded payload mismatch: got %q", string(decoded)) + } +} + +func TestDecodeComposeYAMLPayload_InvalidNonInline(t *testing.T) { + if _, _, err := decodeComposeYAMLPayload("%%%not-valid-base64%%%"); err == nil { + t.Fatalf("expected decodeComposeYAMLPayload to fail 
for invalid non-inline payload") + } +} diff --git a/internal/runtime/docker.go b/internal/runtime/docker.go index ec18598..286ca3e 100644 --- a/internal/runtime/docker.go +++ b/internal/runtime/docker.go @@ -1,6 +1,7 @@ package runtime import ( + "bytes" "context" "encoding/json" "fmt" @@ -13,8 +14,9 @@ import ( "github.com/docker/docker/api/types/filters" "github.com/docker/docker/api/types/mount" "github.com/docker/docker/client" + "github.com/docker/docker/pkg/stdcopy" "github.com/docker/go-connections/nat" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" ) @@ -27,6 +29,7 @@ type DockerRuntime struct { const ( managedLabelKey = "persys.managed" managedWorkloadIDKey = "persys.workload_id" + managedRevisionKey = "persys.revision_id" ) // NewDockerRuntime creates a new Docker runtime @@ -79,6 +82,15 @@ func (d *DockerRuntime) Create(ctx context.Context, workload *models.Workload) e // Force ownership labels so GC/reconciliation only targets agent-managed containers. 
labels[managedLabelKey] = "true" labels[managedWorkloadIDKey] = workload.ID + labels[managedRevisionKey] = strings.TrimSpace(workload.RevisionID) + + reused, err := d.ensureContainerReadyForCreate(ctx, workload) + if err != nil { + return err + } + if reused { + return nil + } containerConfig := &container.Config{ Image: spec.Image, @@ -111,16 +123,25 @@ func (d *DockerRuntime) Create(ctx context.Context, workload *models.Workload) e } // Create container - resp, err := d.client.ContainerCreate( - ctx, - containerConfig, - hostConfig, - nil, - nil, - workload.ID, - ) + resp, err := d.client.ContainerCreate(ctx, containerConfig, hostConfig, nil, nil, workload.ID) if err != nil { - return fmt.Errorf("failed to create container: %w", err) + if !isContainerNameConflict(err) { + return fmt.Errorf("failed to create container: %w", err) + } + + d.logger.Warnf("Container create reported name conflict for %s; inspecting existing container", workload.ID) + reused, conflictErr := d.ensureContainerReadyForCreate(ctx, workload) + if conflictErr != nil { + return conflictErr + } + if reused { + return nil + } + + resp, err = d.client.ContainerCreate(ctx, containerConfig, hostConfig, nil, nil, workload.ID) + if err != nil { + return fmt.Errorf("failed to create container after resolving name conflict: %w", err) + } } d.logger.Infof("Created container: %s (%s)", workload.ID, resp.ID) @@ -224,8 +245,179 @@ func (d *DockerRuntime) Healthy(ctx context.Context) error { return err } +// StatusMetadata returns container-specific status details, including stderr when exited. 
+func (d *DockerRuntime) StatusMetadata(ctx context.Context, id string) (map[string]string, error) { + info, err := d.client.ContainerInspect(ctx, id) + if err != nil { + return nil, fmt.Errorf("failed to inspect container: %w", err) + } + + metadata := map[string]string{ + "container.id": shortContainerID(info.ID), + "container.state": containerStateText(&info), + } + if info.Config != nil { + if image := strings.TrimSpace(info.Config.Image); image != "" { + metadata["container.image"] = image + } + } + if info.State != nil { + metadata["container.exit_code"] = fmt.Sprintf("%d", info.State.ExitCode) + if started := strings.TrimSpace(info.State.StartedAt); started != "" { + metadata["container.started_at"] = started + } + if finished := strings.TrimSpace(info.State.FinishedAt); finished != "" { + metadata["container.finished_at"] = finished + } + if stateErr := strings.TrimSpace(info.State.Error); stateErr != "" { + metadata["container.runtime_error"] = stateErr + } + } + + if info.State != nil && (info.State.ExitCode != 0 || info.State.Dead) { + stderrText, stderrErr := d.getContainerStderr(ctx, id) + if stderrErr != nil { + metadata["container.stderr_error"] = stderrErr.Error() + } else if strings.TrimSpace(stderrText) != "" { + metadata["container.stderr"] = stderrText + } + } + + return metadata, nil +} + +func (d *DockerRuntime) ensureContainerReadyForCreate(ctx context.Context, workload *models.Workload) (bool, error) { + info, err := d.client.ContainerInspect(ctx, workload.ID) + if err != nil { + if client.IsErrNotFound(err) { + return false, nil + } + return false, fmt.Errorf("failed to inspect container %s before create: %w", workload.ID, err) + } + + labels := map[string]string{} + if info.Config != nil && info.Config.Labels != nil { + labels = info.Config.Labels + } + state := containerStateText(&info) + + isManaged := strings.EqualFold(strings.TrimSpace(labels[managedLabelKey]), "true") + managedWorkloadID := 
strings.TrimSpace(labels[managedWorkloadIDKey]) + existingRevision := strings.TrimSpace(labels[managedRevisionKey]) + desiredRevision := strings.TrimSpace(workload.RevisionID) + + if isManaged && managedWorkloadID == workload.ID { + if desiredRevision == "" || existingRevision == "" || existingRevision == desiredRevision { + d.logger.WithFields(logrus.Fields{ + "workload_id": workload.ID, + "container_id": shortContainerID(info.ID), + "status": state, + "existing_revision": existingRevision, + }).Warn("Container already exists for managed workload; reusing existing container") + return true, nil + } + + d.logger.WithFields(logrus.Fields{ + "workload_id": workload.ID, + "container_id": shortContainerID(info.ID), + "status": state, + "existing_revision": existingRevision, + "desired_revision": desiredRevision, + }).Warn("Found stale managed container with revision mismatch; removing before recreate") + + if err := d.client.ContainerRemove(ctx, workload.ID, container.RemoveOptions{Force: true}); err != nil { + return false, fmt.Errorf( + "failed to remove stale managed container %s (id=%s status=%s existing_revision=%s desired_revision=%s): %w", + workload.ID, + shortContainerID(info.ID), + state, + existingRevision, + desiredRevision, + err, + ) + } + + return false, nil + } + + return false, fmt.Errorf( + "container name %q is already in use (id=%s status=%s managed=%t managed_workload_id=%q managed_revision=%q)", + workload.ID, + shortContainerID(info.ID), + state, + isManaged, + managedWorkloadID, + existingRevision, + ) +} + // Helper functions +func isContainerNameConflict(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "is already in use by container") || + strings.Contains(msg, "already exists") +} + +func containerStateText(info *types.ContainerJSON) string { + if info == nil || info.State == nil { + return "unknown" + } + if strings.TrimSpace(info.State.Status) != "" { + return 
strings.TrimSpace(info.State.Status) + } + if info.State.Running { + return "running" + } + if info.State.Paused { + return "paused" + } + if info.State.Restarting { + return "restarting" + } + return "unknown" +} + +func shortContainerID(id string) string { + id = strings.TrimSpace(id) + if len(id) <= 12 { + return id + } + return id[:12] +} + +func (d *DockerRuntime) getContainerStderr(ctx context.Context, id string) (string, error) { + reader, err := d.client.ContainerLogs(ctx, id, container.LogsOptions{ + ShowStdout: false, + ShowStderr: true, + Timestamps: false, + Details: false, + Follow: false, + }) + if err != nil { + return "", fmt.Errorf("failed to read container logs: %w", err) + } + defer reader.Close() + + raw, err := io.ReadAll(reader) + if err != nil { + return "", fmt.Errorf("failed to read container stderr stream: %w", err) + } + if len(raw) == 0 { + return "", nil + } + + var stderr bytes.Buffer + if _, err := stdcopy.StdCopy(io.Discard, &stderr, bytes.NewReader(raw)); err != nil { + // Non-multiplexed stream (e.g. TTY-enabled); treat raw output as stderr. 
+ return strings.TrimSpace(string(raw)), nil + } + return strings.TrimSpace(stderr.String()), nil +} + func (d *DockerRuntime) pullImageWithRetry(ctx context.Context, image string, maxRetries int) error { var lastErr error diff --git a/internal/runtime/runtime.go b/internal/runtime/runtime.go index 08e7fb9..946b957 100644 --- a/internal/runtime/runtime.go +++ b/internal/runtime/runtime.go @@ -4,7 +4,7 @@ import ( "context" "fmt" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/pkg/models" ) // Runtime defines the interface for workload execution diff --git a/internal/runtime/runtime_test.go b/internal/runtime/runtime_test.go index 79055bd..dbbe468 100644 --- a/internal/runtime/runtime_test.go +++ b/internal/runtime/runtime_test.go @@ -5,7 +5,7 @@ import ( "errors" "testing" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/pkg/models" ) type rtStub struct { diff --git a/internal/runtime/vm.go b/internal/runtime/vm.go index 7a1528a..ba3047f 100644 --- a/internal/runtime/vm.go +++ b/internal/runtime/vm.go @@ -2,29 +2,45 @@ package runtime import ( "context" + "crypto/sha256" + "encoding/hex" "encoding/json" "encoding/xml" "fmt" "net" + neturl "net/url" "os" "os/exec" "path/filepath" + "sort" + "strconv" "strings" + "sync" "time" "github.com/digitalocean/go-libvirt" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" ) const managedDiskMarkerSuffix = ".persys-managed" const vmShutdownGracePeriod = 30 * time.Second const vmShutdownPollInterval = 500 * time.Millisecond +const maxCloudInitPayloadBytes = 256 * 1024 // VMRuntime manages KVM virtual machine workloads via libvirt type VMRuntime struct { - conn *libvirt.Libvirt - logger *logrus.Entry + conn *libvirt.Libvirt + logger *logrus.Entry + seedMu sync.RWMutex + seedInfo map[string]cloudInitSeedInfo +} + +type cloudInitSeedInfo struct { + Path string + Checksum string + 
SizeBytes int + PreparedAt time.Time } func NewVMRuntime(uri string, logger *logrus.Logger) (*VMRuntime, error) { @@ -47,8 +63,9 @@ func NewVMRuntime(uri string, logger *logrus.Logger) (*VMRuntime, error) { } return &VMRuntime{ - conn: conn, - logger: logger.WithField("runtime", "vm"), + conn: conn, + logger: logger.WithField("runtime", "vm"), + seedInfo: make(map[string]cloudInitSeedInfo), }, nil } @@ -76,10 +93,13 @@ func (v *VMRuntime) Create(ctx context.Context, workload *models.Workload) error // Create cloud-init ISO if needed if spec.CloudInit != "" || spec.CloudInitConfig != nil { - isoPath, err := v.createCloudInitISO(workload.ID, spec) + isoPath, seedInfo, err := v.createCloudInitISO(workload.ID, spec) if err != nil { return fmt.Errorf("failed to create cloud-init ISO: %w", err) } + if seedInfo != nil { + v.setSeedInfo(workload.ID, *seedInfo) + } if isoPath != "" { // Add ISO to disks - mark as bootable CD-ROM spec.Disks = append(spec.Disks, models.DiskConfig{ @@ -92,10 +112,16 @@ func (v *VMRuntime) Create(ctx context.Context, workload *models.Workload) error }) } } + if spec.CloudInit == "" && spec.CloudInitConfig == nil { + v.clearSeedInfo(workload.ID) + } // Create disk files first for _, diskCfg := range spec.Disks { if diskCfg.Type != models.DiskTypeCDROM { + if isNetworkDiskPath(diskCfg.Path) { + continue + } if err := v.createDisk(&diskCfg); err != nil { return fmt.Errorf("failed to create disk %s: %w", diskCfg.Path, err) } @@ -257,6 +283,7 @@ func (v *VMRuntime) Delete(ctx context.Context, id string) error { v.cleanupDiskArtifacts(id, diskPaths) v.cleanupDeterministicVMArtifacts(id) + v.clearSeedInfo(id) v.logger.Infof("Deleted VM: %s", id) return nil @@ -282,22 +309,23 @@ func (v *VMRuntime) Status(ctx context.Context, id string) (models.ActualState, switch state { case int32(libvirt.DomainRunning): actualState = models.ActualStateRunning - message = "running" + message = fmt.Sprintf("running (reason=%s)", vmDomainReasonText(state, reason)) case 
int32(libvirt.DomainPaused): - actualState = models.ActualStateStopped - message = "paused (runtime frozen)" + // Paused is treated as transitional so upper layers avoid destructive re-apply loops. + actualState = models.ActualStatePending + message = fmt.Sprintf("paused (runtime frozen, reason=%s)", vmDomainReasonText(state, reason)) case int32(libvirt.DomainShutdown), int32(libvirt.DomainShutoff): actualState = models.ActualStateStopped - message = "shutdown" + message = fmt.Sprintf("shutdown (reason=%s)", vmDomainReasonText(state, reason)) case int32(libvirt.DomainCrashed): actualState = models.ActualStateFailed - message = "crashed" + message = fmt.Sprintf("crashed (reason=%s)", vmDomainReasonText(state, reason)) case int32(libvirt.DomainBlocked), int32(libvirt.DomainNostate): actualState = models.ActualStatePending - message = fmt.Sprintf("state: %d, reason: %d", state, reason) + message = fmt.Sprintf("%s (reason=%s)", vmDomainStateName(state), vmDomainReasonText(state, reason)) default: actualState = models.ActualStateUnknown - message = fmt.Sprintf("unknown state: %d", state) + message = fmt.Sprintf("unknown state=%d reason=%d", state, reason) } return actualState, message, nil @@ -309,17 +337,25 @@ func (v *VMRuntime) StatusMetadata(ctx context.Context, id string) (map[string]s if err != nil { return nil, fmt.Errorf("failed to lookup domain: %w", err) } + metadata := make(map[string]string) + + state, reason, err := v.conn.DomainGetState(domain, 0) + if err == nil { + metadata["vm.domain_state"] = vmDomainStateName(state) + metadata["vm.domain_state_code"] = strconv.Itoa(int(state)) + metadata["vm.domain_reason"] = vmDomainReasonText(state, reason) + metadata["vm.domain_reason_code"] = strconv.Itoa(int(reason)) + } // Prefer DHCP lease source first, then guest agent fallback. 
ifaces, err := v.conn.DomainInterfaceAddresses(domain, uint32(libvirt.DomainInterfaceAddressesSrcLease), 0) if err != nil || len(ifaces) == 0 { ifaces, err = v.conn.DomainInterfaceAddresses(domain, uint32(libvirt.DomainInterfaceAddressesSrcAgent), 0) if err != nil { - return nil, fmt.Errorf("failed to get domain interface addresses: %w", err) + metadata["vm.network_lookup_error"] = err.Error() + return metadata, nil } } - - metadata := make(map[string]string) var ips []string var macs []string var ifaceParts []string @@ -357,10 +393,158 @@ func (v *VMRuntime) StatusMetadata(ctx context.Context, id string) (map[string]s if len(ifaceParts) > 0 { metadata["vm.interfaces"] = strings.Join(ifaceParts, ",") } + if seedInfo, ok := v.getSeedInfo(id); ok { + metadata["vm.cloud_init_seed_path"] = seedInfo.Path + metadata["vm.cloud_init_seed_checksum"] = seedInfo.Checksum + metadata["vm.cloud_init_seed_size_bytes"] = strconv.Itoa(seedInfo.SizeBytes) + metadata["vm.cloud_init_seed_prepared_at"] = seedInfo.PreparedAt.UTC().Format(time.RFC3339) + } return metadata, nil } +func vmDomainStateName(state int32) string { + switch libvirt.DomainState(state) { + case libvirt.DomainRunning: + return "running" + case libvirt.DomainBlocked: + return "blocked" + case libvirt.DomainPaused: + return "paused" + case libvirt.DomainShutdown: + return "shutdown" + case libvirt.DomainShutoff: + return "shutoff" + case libvirt.DomainCrashed: + return "crashed" + case libvirt.DomainPmsuspended: + return "pmsuspended" + case libvirt.DomainNostate: + return "nostate" + default: + return "unknown" + } +} + +func vmDomainReasonText(state, reason int32) string { + switch libvirt.DomainState(state) { + case libvirt.DomainPaused: + switch libvirt.DomainPausedReason(reason) { + case libvirt.DomainPausedUser: + return "user" + case libvirt.DomainPausedMigration: + return "migration" + case libvirt.DomainPausedSave: + return "save" + case libvirt.DomainPausedDump: + return "dump" + case 
libvirt.DomainPausedIoerror: + return "io-error" + case libvirt.DomainPausedWatchdog: + return "watchdog" + case libvirt.DomainPausedFromSnapshot: + return "from-snapshot" + case libvirt.DomainPausedShuttingDown: + return "shutting-down" + case libvirt.DomainPausedSnapshot: + return "snapshot" + case libvirt.DomainPausedCrashed: + return "crashed" + case libvirt.DomainPausedStartingUp: + return "starting-up" + case libvirt.DomainPausedPostcopy: + return "postcopy" + case libvirt.DomainPausedPostcopyFailed: + return "postcopy-failed" + default: + return "unknown" + } + case libvirt.DomainShutoff: + switch libvirt.DomainShutoffReason(reason) { + case libvirt.DomainShutoffShutdown: + return "shutdown" + case libvirt.DomainShutoffDestroyed: + return "destroyed" + case libvirt.DomainShutoffCrashed: + return "crashed" + case libvirt.DomainShutoffMigrated: + return "migrated" + case libvirt.DomainShutoffSaved: + return "saved" + case libvirt.DomainShutoffFailed: + return "failed" + case libvirt.DomainShutoffFromSnapshot: + return "from-snapshot" + case libvirt.DomainShutoffDaemon: + return "daemon" + default: + return "unknown" + } + case libvirt.DomainRunning: + switch libvirt.DomainRunningReason(reason) { + case libvirt.DomainRunningBooted: + return "booted" + case libvirt.DomainRunningMigrated: + return "migrated" + case libvirt.DomainRunningRestored: + return "restored" + case libvirt.DomainRunningFromSnapshot: + return "from-snapshot" + case libvirt.DomainRunningUnpaused: + return "unpaused" + case libvirt.DomainRunningMigrationCanceled: + return "migration-canceled" + case libvirt.DomainRunningSaveCanceled: + return "save-canceled" + case libvirt.DomainRunningWakeup: + return "wakeup" + case libvirt.DomainRunningCrashed: + return "crashed" + case libvirt.DomainRunningPostcopy: + return "postcopy" + default: + return "unknown" + } + case libvirt.DomainShutdown: + switch libvirt.DomainShutdownReason(reason) { + case libvirt.DomainShutdownUser: + return "user" + 
default: + return "unknown" + } + case libvirt.DomainCrashed: + switch libvirt.DomainCrashedReason(reason) { + case libvirt.DomainCrashedPanicked: + return "panicked" + default: + return "unknown" + } + case libvirt.DomainPmsuspended: + switch libvirt.DomainPMSuspendedReason(reason) { + case libvirt.DomainPmsuspendedUnknown: + return "unknown" + default: + return "unknown" + } + case libvirt.DomainNostate: + switch libvirt.DomainNostateReason(reason) { + case libvirt.DomainNostateUnknown: + return "unknown" + default: + return "unknown" + } + case libvirt.DomainBlocked: + switch libvirt.DomainBlockedReason(reason) { + case libvirt.DomainBlockedUnknown: + return "unknown" + default: + return "unknown" + } + default: + return "unknown" + } +} + func (v *VMRuntime) List(ctx context.Context) ([]string, error) { domains, _, err := v.conn.ConnectListAllDomains(1, 0) if err != nil { @@ -464,29 +648,35 @@ func isDomainNotFoundErr(err error) bool { } // createCloudInitISO creates a cloud-init ISO for VM configuration -func (v *VMRuntime) createCloudInitISO(vmID string, spec *models.VMSpec) (string, error) { +func (v *VMRuntime) createCloudInitISO(vmID string, spec *models.VMSpec) (string, *cloudInitSeedInfo, error) { // Only create if cloud-init is specified if spec.CloudInit == "" && spec.CloudInitConfig == nil { - return "", nil + return "", nil, nil } // Create temporary directory for cloud-init files tmpDir, err := os.MkdirTemp("", fmt.Sprintf("cloud-init-%s-", vmID)) if err != nil { - return "", fmt.Errorf("failed to create temp directory: %w", err) + return "", nil, fmt.Errorf("failed to create temp directory: %w", err) } defer os.RemoveAll(tmpDir) - // Create meta-data + // Build meta-data, preserving user payload when provided. 
metaData := fmt.Sprintf(`{ "instance-id": "%s", "hostname": "%s", "local-ipv4": "127.0.0.1" }`, vmID, spec.Name) + if spec.CloudInitConfig != nil && strings.TrimSpace(spec.CloudInitConfig.MetaData) != "" { + metaData = spec.CloudInitConfig.MetaData + } + if err := validateCloudInitField("meta-data", metaData); err != nil { + return "", nil, err + } metaDataPath := filepath.Join(tmpDir, "meta-data") if err := os.WriteFile(metaDataPath, []byte(metaData), 0644); err != nil { - return "", fmt.Errorf("failed to write meta-data: %w", err) + return "", nil, fmt.Errorf("failed to write meta-data: %w", err) } // Create user-data @@ -498,10 +688,49 @@ func (v *VMRuntime) createCloudInitISO(vmID string, spec *models.VMSpec) (string } else { userData = "#!/bin/bash\necho 'Cloud-init configured'\n" } + if err := validateCloudInitField("user-data", userData); err != nil { + return "", nil, err + } userDataPath := filepath.Join(tmpDir, "user-data") if err := os.WriteFile(userDataPath, []byte(userData), 0644); err != nil { - return "", fmt.Errorf("failed to write user-data: %w", err) + return "", nil, fmt.Errorf("failed to write user-data: %w", err) + } + + paths := map[string]string{ + "user-data": userDataPath, + "meta-data": metaDataPath, + } + + networkConfig := "" + if spec.CloudInitConfig != nil && strings.TrimSpace(spec.CloudInitConfig.NetworkConfig) != "" { + networkConfig = spec.CloudInitConfig.NetworkConfig + if err := validateCloudInitField("network-config", networkConfig); err != nil { + return "", nil, err + } + networkPath := filepath.Join(tmpDir, "network-config") + if err := os.WriteFile(networkPath, []byte(networkConfig), 0644); err != nil { + return "", nil, fmt.Errorf("failed to write network-config: %w", err) + } + paths["network-config"] = networkPath + } + + vendorData := "" + if spec.CloudInitConfig != nil && strings.TrimSpace(spec.CloudInitConfig.VendorData) != "" { + vendorData = spec.CloudInitConfig.VendorData + if err := 
validateCloudInitField("vendor-data", vendorData); err != nil { + return "", nil, err + } + vendorPath := filepath.Join(tmpDir, "vendor-data") + if err := os.WriteFile(vendorPath, []byte(vendorData), 0644); err != nil { + return "", nil, fmt.Errorf("failed to write vendor-data: %w", err) + } + paths["vendor-data"] = vendorPath + } + + totalSize := len(userData) + len(metaData) + len(networkConfig) + len(vendorData) + if totalSize > maxCloudInitPayloadBytes { + return "", nil, cloudInitInvalidError("payload size %d bytes exceeds limit %d bytes", totalSize, maxCloudInitPayloadBytes) } // Generate ISO filename in a standard location @@ -513,32 +742,164 @@ func (v *VMRuntime) createCloudInitISO(vmID string, spec *models.VMSpec) (string isoPath := filepath.Join(isoDir, fmt.Sprintf("%s-cloud-init.iso", vmID)) // Create ISO using mkisofs or genisoimage - cmd := exec.Command( - "mkisofs", + fileKeys := make([]string, 0, len(paths)) + for key := range paths { + fileKeys = append(fileKeys, key) + } + sort.Strings(fileKeys) + + mkisofsArgs := []string{ "-output", isoPath, "-volid", "cidata", "-joliet", "-rock", "-file-mode", "0644", "-dir-mode", "0755", - userDataPath, - metaDataPath, - ) + "-graft-points", + } + for _, key := range fileKeys { + mkisofsArgs = append(mkisofsArgs, fmt.Sprintf("%s=%s", key, paths[key])) + } + cmd := exec.Command("mkisofs", mkisofsArgs...) 
output, err := cmd.CombinedOutput() if err != nil { v.logger.Errorf("mkisofs output: %s", string(output)) - return "", fmt.Errorf("failed to create cloud-init ISO: %w", err) + return "", nil, fmt.Errorf("failed to create cloud-init ISO: %w", err) + } + + checksum := cloudInitSeedChecksum(fileKeys, map[string]string{ + "user-data": userData, + "meta-data": metaData, + "network-config": networkConfig, + "vendor-data": vendorData, + }) + v.logger.WithFields(logrus.Fields{ + "vm_id": vmID, + "seed_path": isoPath, + "seed_checksum": checksum, + "seed_size_bytes": totalSize, + "seed_files": strings.Join(fileKeys, ","), + }).Info("Created cloud-init seed ISO") + + return isoPath, &cloudInitSeedInfo{ + Path: isoPath, + Checksum: checksum, + SizeBytes: totalSize, + PreparedAt: time.Now().UTC(), + }, nil +} + +func validateCloudInitField(name, value string) error { + if len(value) > maxCloudInitPayloadBytes { + return cloudInitInvalidError("%s size %d bytes exceeds limit %d bytes", name, len(value), maxCloudInitPayloadBytes) + } + if strings.ContainsRune(value, '\x00') { + return cloudInitInvalidError("%s contains null byte", name) } + return nil +} + +func cloudInitInvalidError(format string, args ...interface{}) error { + return fmt.Errorf("cloud-init-invalid: %s", fmt.Sprintf(format, args...)) +} + +func cloudInitSeedChecksum(sortedKeys []string, payload map[string]string) string { + hasher := sha256.New() + for _, key := range sortedKeys { + value := payload[key] + if value == "" { + continue + } + _, _ = hasher.Write([]byte(key)) + _, _ = hasher.Write([]byte{0}) + _, _ = hasher.Write([]byte(value)) + _, _ = hasher.Write([]byte{'\n'}) + } + return hex.EncodeToString(hasher.Sum(nil)) +} + +func (v *VMRuntime) setSeedInfo(vmID string, info cloudInitSeedInfo) { + v.seedMu.Lock() + v.seedInfo[vmID] = info + v.seedMu.Unlock() +} - v.logger.Infof("Created cloud-init ISO: %s", isoPath) - return isoPath, nil +func (v *VMRuntime) clearSeedInfo(vmID string) { + v.seedMu.Lock() + 
delete(v.seedInfo, vmID) + v.seedMu.Unlock() +} + +func (v *VMRuntime) getSeedInfo(vmID string) (cloudInitSeedInfo, bool) { + v.seedMu.RLock() + info, ok := v.seedInfo[vmID] + v.seedMu.RUnlock() + return info, ok } func managedDiskMarkerPath(diskPath string) string { return diskPath + managedDiskMarkerSuffix } +func isNetworkDiskPath(path string) bool { + _, ok := diskSourceFromPath(path) + return ok +} + +func diskSourceFromPath(path string) (DiskSource, bool) { + trimmed := strings.TrimSpace(path) + if trimmed == "" { + return DiskSource{}, false + } + + if strings.HasPrefix(trimmed, "rbd:") { + name := strings.TrimSpace(strings.TrimPrefix(trimmed, "rbd:")) + if name == "" { + return DiskSource{}, false + } + return DiskSource{ + Protocol: "rbd", + Name: name, + }, true + } + + if strings.HasPrefix(trimmed, "nfs://") { + parsed, err := neturl.Parse(trimmed) + if err != nil || parsed.Host == "" || parsed.Path == "" { + return DiskSource{}, false + } + host := parsed.Hostname() + if host == "" { + return DiskSource{}, false + } + source := DiskSource{ + Protocol: "nfs", + Name: parsed.Path, + Hosts: []DiskSourceHost{{Name: host}}, + } + if port := parsed.Port(); port != "" { + source.Hosts[0].Port = port + } + return source, true + } + + // Backward compatibility for existing NFS handles in "server:/export/path" form. 
+ if host, export, ok := strings.Cut(trimmed, ":/"); ok { + host = strings.TrimSpace(host) + export = strings.TrimSpace(export) + if host != "" && export != "" { + return DiskSource{ + Protocol: "nfs", + Name: "/" + export, + Hosts: []DiskSourceHost{{Name: host}}, + }, true + } + } + + return DiskSource{}, false +} + func expectedCloudInitISOPaths(vmID string) []string { return []string{ filepath.Join("/var/lib/libvirt/images", fmt.Sprintf("%s-cloud-init.iso", vmID)), @@ -691,7 +1052,16 @@ type DiskDriver struct { } type DiskSource struct { - File string `xml:"file,attr"` + File string `xml:"file,attr,omitempty"` + Protocol string `xml:"protocol,attr,omitempty"` + Name string `xml:"name,attr,omitempty"` + Hosts []DiskSourceHost `xml:"host,omitempty"` +} + +type DiskSourceHost struct { + Name string `xml:"name,attr,omitempty"` + Port string `xml:"port,attr,omitempty"` + Transport string `xml:"transport,attr,omitempty"` } type DiskTarget struct { @@ -789,11 +1159,18 @@ func (v *VMRuntime) generateDomainXML(name string, spec *models.VMSpec) (string, bus = "sata" } + diskType := "file" + source := DiskSource{File: diskCfg.Path} + if networkSource, ok := diskSourceFromPath(diskCfg.Path); ok { + diskType = "network" + source = networkSource + } + disk := Disk{ - Type: "file", + Type: diskType, Device: device, Driver: DiskDriver{Name: "qemu", Type: diskCfg.Format}, - Source: DiskSource{File: diskCfg.Path}, + Source: source, Target: DiskTarget{Dev: diskCfg.Device, Bus: bus}, } diff --git a/internal/runtime/vm_disk_source_test.go b/internal/runtime/vm_disk_source_test.go new file mode 100644 index 0000000..2e57e86 --- /dev/null +++ b/internal/runtime/vm_disk_source_test.go @@ -0,0 +1,143 @@ +package runtime + +import ( + "encoding/xml" + "strings" + "testing" + + "github.com/persys-dev/compute-agent/pkg/models" +) + +func TestDiskSourceFromPath(t *testing.T) { + tests := []struct { + name string + path string + wantNetwork bool + wantProtocol string + wantSourceName string 
+ wantHost string + }{ + { + name: "local file path", + path: "/var/lib/libvirt/images/root.qcow2", + wantNetwork: false, + }, + { + name: "ceph rbd path", + path: "rbd:rbd/vm-root", + wantNetwork: true, + wantProtocol: "rbd", + wantSourceName: "rbd/vm-root", + }, + { + name: "nfs url path", + path: "nfs://10.0.0.12/exports/vm-root", + wantNetwork: true, + wantProtocol: "nfs", + wantSourceName: "/exports/vm-root", + wantHost: "10.0.0.12", + }, + { + name: "nfs legacy path", + path: "nfs-gw.internal:/exports/vm-root", + wantNetwork: true, + wantProtocol: "nfs", + wantSourceName: "/exports/vm-root", + wantHost: "nfs-gw.internal", + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + source, ok := diskSourceFromPath(tt.path) + if ok != tt.wantNetwork { + t.Fatalf("diskSourceFromPath(%q) network=%v want=%v", tt.path, ok, tt.wantNetwork) + } + if !tt.wantNetwork { + return + } + if source.Protocol != tt.wantProtocol { + t.Fatalf("protocol=%q want=%q", source.Protocol, tt.wantProtocol) + } + if source.Name != tt.wantSourceName { + t.Fatalf("name=%q want=%q", source.Name, tt.wantSourceName) + } + if tt.wantHost != "" { + if len(source.Hosts) != 1 || source.Hosts[0].Name != tt.wantHost { + t.Fatalf("hosts=%v want single host=%q", source.Hosts, tt.wantHost) + } + } + }) + } +} + +func TestGenerateDomainXML_MixesFileAndNetworkDisks(t *testing.T) { + rt := &VMRuntime{} + spec := &models.VMSpec{ + Name: "vm-test", + VCPUs: 2, + MemoryMB: 1024, + Disks: []models.DiskConfig{ + { + Path: "rbd:rbd/vm-test-root", + Device: "vda", + Format: "raw", + Type: models.DiskTypeDisk, + }, + { + Path: "/var/lib/libvirt/images/vm-test-data.qcow2", + Device: "vdb", + Format: "qcow2", + Type: models.DiskTypeDisk, + }, + }, + } + + xmlData, err := rt.generateDomainXML("vm-test", spec) + if err != nil { + t.Fatalf("generateDomainXML returned error: %v", err) + } + + type source struct { + File string `xml:"file,attr"` + Protocol string `xml:"protocol,attr"` + 
Name string `xml:"name,attr"` + } + type disk struct { + Type string `xml:"type,attr"` + Source source `xml:"source"` + } + type domain struct { + Devices struct { + Disks []disk `xml:"disk"` + } `xml:"devices"` + } + + var parsed domain + if err := xml.Unmarshal([]byte(xmlData), &parsed); err != nil { + t.Fatalf("failed to parse generated xml: %v", err) + } + + if len(parsed.Devices.Disks) < 2 { + t.Fatalf("expected at least 2 disks, got %d", len(parsed.Devices.Disks)) + } + + var foundNetwork bool + var foundFile bool + for _, d := range parsed.Devices.Disks { + if d.Type == "network" && d.Source.Protocol == "rbd" && d.Source.Name == "rbd/vm-test-root" { + foundNetwork = true + } + if d.Type == "file" && strings.Contains(d.Source.File, "vm-test-data.qcow2") { + foundFile = true + } + } + + if !foundNetwork { + t.Fatalf("expected generated xml to include rbd network disk") + } + if !foundFile { + t.Fatalf("expected generated xml to include file-backed disk") + } +} diff --git a/internal/state/bolt_volume.go b/internal/state/bolt_volume.go new file mode 100644 index 0000000..540511e --- /dev/null +++ b/internal/state/bolt_volume.go @@ -0,0 +1,145 @@ +package state + +import ( + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/persys-dev/compute-agent/internal/platform" + bolt "go.etcd.io/bbolt" +) + +func (s *boltStore) SaveVolume(handle *platform.VolumeHandle) error { + if handle == nil { + return fmt.Errorf("volume handle is required") + } + if strings.TrimSpace(handle.ID) == "" { + return fmt.Errorf("volume handle id is required") + } + return s.db.Update(func(tx *bolt.Tx) error { + bucket := tx.Bucket([]byte(volumeBucket)) + now := time.Now().UTC() + handle.UpdatedAt = now + if handle.CreatedAt.IsZero() { + handle.CreatedAt = now + } + data, err := json.Marshal(handle) + if err != nil { + return fmt.Errorf("failed to marshal volume handle: %w", err) + } + return bucket.Put([]byte(handle.ID), data) + }) +} + +func (s *boltStore) GetVolume(id string) 
(*platform.VolumeHandle, error) { + var out *platform.VolumeHandle + err := s.db.View(func(tx *bolt.Tx) error { + bucket := tx.Bucket([]byte(volumeBucket)) + data := bucket.Get([]byte(id)) + if data == nil { + return fmt.Errorf("volume not found") + } + out = &platform.VolumeHandle{} + return json.Unmarshal(data, out) + }) + if err != nil { + return nil, err + } + return out, nil +} + +func (s *boltStore) DeleteVolume(id string) error { + return s.db.Update(func(tx *bolt.Tx) error { + return tx.Bucket([]byte(volumeBucket)).Delete([]byte(id)) + }) +} + +func (s *boltStore) ListVolumes() ([]*platform.VolumeHandle, error) { + out := []*platform.VolumeHandle{} + err := s.db.View(func(tx *bolt.Tx) error { + bucket := tx.Bucket([]byte(volumeBucket)) + return bucket.ForEach(func(_, v []byte) error { + var handle platform.VolumeHandle + if err := json.Unmarshal(v, &handle); err != nil { + return err + } + out = append(out, &handle) + return nil + }) + }) + if err != nil { + return nil, err + } + return out, nil +} + +func (s *boltStore) SaveAttachment(attachment *platform.VolumeAttachment) error { + if attachment == nil { + return fmt.Errorf("volume attachment is required") + } + if strings.TrimSpace(attachment.ID) == "" { + return fmt.Errorf("volume attachment id is required") + } + return s.db.Update(func(tx *bolt.Tx) error { + bucket := tx.Bucket([]byte(attachmentBucket)) + now := time.Now().UTC() + attachment.UpdatedAt = now + if attachment.CreatedAt.IsZero() { + attachment.CreatedAt = now + } + data, err := json.Marshal(attachment) + if err != nil { + return fmt.Errorf("failed to marshal volume attachment: %w", err) + } + return bucket.Put([]byte(attachment.ID), data) + }) +} + +func (s *boltStore) GetAttachment(id string) (*platform.VolumeAttachment, error) { + var out *platform.VolumeAttachment + err := s.db.View(func(tx *bolt.Tx) error { + bucket := tx.Bucket([]byte(attachmentBucket)) + data := bucket.Get([]byte(id)) + if data == nil { + return 
fmt.Errorf("attachment not found") + } + out = &platform.VolumeAttachment{} + return json.Unmarshal(data, out) + }) + if err != nil { + return nil, err + } + return out, nil +} + +func (s *boltStore) DeleteAttachment(id string) error { + return s.db.Update(func(tx *bolt.Tx) error { + return tx.Bucket([]byte(attachmentBucket)).Delete([]byte(id)) + }) +} + +func (s *boltStore) ListAttachments(workloadID string) ([]*platform.VolumeAttachment, error) { + filterWorkload := strings.TrimSpace(workloadID) + out := []*platform.VolumeAttachment{} + err := s.db.View(func(tx *bolt.Tx) error { + bucket := tx.Bucket([]byte(attachmentBucket)) + return bucket.ForEach(func(_, v []byte) error { + var attachment platform.VolumeAttachment + if err := json.Unmarshal(v, &attachment); err != nil { + return err + } + if filterWorkload != "" && attachment.WorkloadID != filterWorkload { + return nil + } + out = append(out, &attachment) + return nil + }) + }) + if err != nil { + return nil, err + } + return out, nil +} + +var _ ManagedVolumeStore = (*boltStore)(nil) diff --git a/internal/state/store.go b/internal/state/store.go index 73d2265..77965ba 100644 --- a/internal/state/store.go +++ b/internal/state/store.go @@ -5,13 +5,16 @@ import ( "fmt" "time" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/internal/platform" + "github.com/persys-dev/compute-agent/pkg/models" bolt "go.etcd.io/bbolt" ) const ( - workloadBucket = "workloads" - statusBucket = "status" + workloadBucket = "workloads" + statusBucket = "status" + volumeBucket = "volumes" + attachmentBucket = "attachments" ) // Store manages persistent state using bbolt @@ -31,6 +34,19 @@ type Store interface { Close() error } +// ManagedVolumeStore provides optional persistence for provider-backed volume state. 
+type ManagedVolumeStore interface { + SaveVolume(handle *platform.VolumeHandle) error + GetVolume(id string) (*platform.VolumeHandle, error) + DeleteVolume(id string) error + ListVolumes() ([]*platform.VolumeHandle, error) + + SaveAttachment(attachment *platform.VolumeAttachment) error + GetAttachment(id string) (*platform.VolumeAttachment, error) + DeleteAttachment(id string) error + ListAttachments(workloadID string) ([]*platform.VolumeAttachment, error) +} + type boltStore struct { db *bolt.DB } @@ -52,6 +68,12 @@ func NewBoltStore(path string) (Store, error) { if _, err := tx.CreateBucketIfNotExists([]byte(statusBucket)); err != nil { return err } + if _, err := tx.CreateBucketIfNotExists([]byte(volumeBucket)); err != nil { + return err + } + if _, err := tx.CreateBucketIfNotExists([]byte(attachmentBucket)); err != nil { + return err + } return nil }) if err != nil { @@ -106,12 +128,39 @@ func (s *boltStore) DeleteWorkload(id string) error { return s.db.Update(func(tx *bolt.Tx) error { workloadBucket := tx.Bucket([]byte(workloadBucket)) statusBucket := tx.Bucket([]byte(statusBucket)) + attachmentBucket := tx.Bucket([]byte(attachmentBucket)) // Delete both workload and status if err := workloadBucket.Delete([]byte(id)); err != nil { return err } - return statusBucket.Delete([]byte(id)) + if err := statusBucket.Delete([]byte(id)); err != nil { + return err + } + if attachmentBucket == nil { + return nil + } + keysToDelete := make([][]byte, 0) + if err := attachmentBucket.ForEach(func(k, v []byte) error { + var attachment platform.VolumeAttachment + if err := json.Unmarshal(v, &attachment); err != nil { + return nil + } + if attachment.WorkloadID == id { + keyCopy := make([]byte, len(k)) + copy(keyCopy, k) + keysToDelete = append(keysToDelete, keyCopy) + } + return nil + }); err != nil { + return err + } + for _, key := range keysToDelete { + if err := attachmentBucket.Delete(key); err != nil { + return err + } + } + return nil }) } diff --git 
a/internal/state/store_test.go b/internal/state/store_test.go index caad158..4531c01 100644 --- a/internal/state/store_test.go +++ b/internal/state/store_test.go @@ -4,7 +4,8 @@ import ( "path/filepath" "testing" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/internal/platform" + "github.com/persys-dev/compute-agent/pkg/models" ) func TestBoltStore_WorkloadAndStatusLifecycle(t *testing.T) { @@ -38,3 +39,68 @@ func TestBoltStore_WorkloadAndStatusLifecycle(t *testing.T) { t.Fatal("expected workload to be deleted") } } + +func TestBoltStore_VolumeAndAttachmentLifecycle(t *testing.T) { + dbPath := filepath.Join(t.TempDir(), "state.db") + s, err := NewBoltStore(dbPath) + if err != nil { + t.Fatalf("failed to create store: %v", err) + } + defer s.Close() + + vs, ok := s.(ManagedVolumeStore) + if !ok { + t.Fatalf("store does not implement ManagedVolumeStore") + } + + volume := &platform.VolumeHandle{ + ID: "local:vol-a", + Name: "vol-a", + Driver: "local", + Device: "/var/lib/persys/volumes/local/vol-a", + } + if err := vs.SaveVolume(volume); err != nil { + t.Fatalf("save volume failed: %v", err) + } + if _, err := vs.GetVolume(volume.ID); err != nil { + t.Fatalf("get volume failed: %v", err) + } + listedVolumes, err := vs.ListVolumes() + if err != nil { + t.Fatalf("list volumes failed: %v", err) + } + if len(listedVolumes) != 1 { + t.Fatalf("expected 1 volume, got %d", len(listedVolumes)) + } + + attachment := &platform.VolumeAttachment{ + ID: "local:vol-a:w1", + VolumeID: "local:vol-a", + WorkloadID: "w1", + MountPath: "/data", + } + if err := vs.SaveAttachment(attachment); err != nil { + t.Fatalf("save attachment failed: %v", err) + } + if _, err := vs.GetAttachment(attachment.ID); err != nil { + t.Fatalf("get attachment failed: %v", err) + } + listedAttachments, err := vs.ListAttachments("w1") + if err != nil { + t.Fatalf("list attachments failed: %v", err) + } + if len(listedAttachments) != 1 { + t.Fatalf("expected 1 attachment 
for workload w1, got %d", len(listedAttachments)) + } + + if err := s.DeleteWorkload("w1"); err != nil { + t.Fatalf("delete workload failed: %v", err) + } + listedAttachments, err = vs.ListAttachments("w1") + if err != nil { + t.Fatalf("list attachments after workload delete failed: %v", err) + } + if len(listedAttachments) != 0 { + t.Fatalf("expected attachment cleanup on workload delete, got %d", len(listedAttachments)) + } +} diff --git a/internal/storage/providers/ceph_rbd_provider.go b/internal/storage/providers/ceph_rbd_provider.go new file mode 100644 index 0000000..075c5f3 --- /dev/null +++ b/internal/storage/providers/ceph_rbd_provider.go @@ -0,0 +1,123 @@ +package providers + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/persys-dev/compute-agent/internal/platform" +) + +const ( + defaultCephRBDPool = "rbd" + defaultCephRBDStageRoot = "/var/lib/persys/volumes/ceph-rbd" +) + +type CephRBDProvider struct { + cluster string + pool string + user string + keyring string + stageRoot string +} + +func NewCephRBDProvider(cluster, pool, user, keyring, stageRoot string) *CephRBDProvider { + resolvedPool := strings.TrimSpace(pool) + if resolvedPool == "" { + resolvedPool = defaultCephRBDPool + } + root := strings.TrimSpace(stageRoot) + if root == "" { + root = defaultCephRBDStageRoot + } + return &CephRBDProvider{ + cluster: strings.TrimSpace(cluster), + pool: resolvedPool, + user: strings.TrimSpace(user), + keyring: strings.TrimSpace(keyring), + stageRoot: filepath.Clean(root), + } +} + +func (p *CephRBDProvider) Driver() string { + return "ceph-rbd" +} + +func (p *CephRBDProvider) Validate(_ context.Context, spec platform.VolumeSpec) error { + if strings.TrimSpace(p.pool) == "" { + return fmt.Errorf("ceph pool is required") + } + _, err := normalizeVolumeName(spec.Name) + return err +} + +func (p *CephRBDProvider) Provision(_ context.Context, spec platform.VolumeSpec) (*platform.VolumeHandle, error) { + name, err := 
normalizeVolumeName(spec.Name) + if err != nil { + return nil, err + } + now := time.Now().UTC() + return &platform.VolumeHandle{ + ID: volumeID(p.Driver(), name), + Name: name, + Driver: p.Driver(), + SizeGB: spec.SizeGB, + Device: fmt.Sprintf("rbd:%s/%s", p.pool, name), + Metadata: map[string]string{ + "driver": p.Driver(), + "cluster": p.cluster, + "pool": p.pool, + "user": p.user, + "keyring": p.keyring, + "fs_type": strings.TrimSpace(spec.FSType), + }, + CreatedAt: now, + UpdatedAt: now, + }, nil +} + +func (p *CephRBDProvider) Delete(_ context.Context, _ *platform.VolumeHandle) error { + // RBD image lifecycle hooks are introduced with runtime wiring in a later slice. + return nil +} + +func (p *CephRBDProvider) Attach(_ context.Context, handle *platform.VolumeHandle, workloadID string, mountPath string, readOnly bool) (*platform.VolumeAttachment, error) { + if handle == nil { + return nil, fmt.Errorf("volume handle is required") + } + stageDir, err := safeJoin(p.stageRoot, filepath.Join(strings.TrimSpace(workloadID), strings.ReplaceAll(handle.ID, ":", "_"))) + if err != nil { + return nil, err + } + if err := os.MkdirAll(stageDir, 0o755); err != nil { + return nil, fmt.Errorf("create ceph-rbd stage directory: %w", err) + } + now := time.Now().UTC() + return &platform.VolumeAttachment{ + ID: fmt.Sprintf("%s:%s", handle.ID, strings.TrimSpace(workloadID)), + VolumeID: handle.ID, + WorkloadID: strings.TrimSpace(workloadID), + MountPath: strings.TrimSpace(mountPath), + ReadOnly: readOnly, + StagePath: stageDir, + Metadata: map[string]string{ + "driver": p.Driver(), + "cluster": p.cluster, + "pool": p.pool, + "user": p.user, + "keyring": p.keyring, + }, + CreatedAt: now, + UpdatedAt: now, + }, nil +} + +func (p *CephRBDProvider) Detach(_ context.Context, attachment *platform.VolumeAttachment) error { + if attachment == nil || strings.TrimSpace(attachment.StagePath) == "" { + return nil + } + return os.RemoveAll(attachment.StagePath) +} diff --git 
// normalizeVolumeName turns a user-supplied volume name into an identifier
// that is safe to embed in paths and device strings.
//
// Surrounding whitespace is trimmed, letters are lowercased, digits and the
// characters '-', '_' and '.' are kept, and every other rune becomes '-'.
// Leading and trailing '-' are then stripped.
//
// An error is returned when the input is blank, when nothing remains after
// normalization, or when the result is "." or "..". Those two names are
// path-special: joined onto a storage root they resolve to the root itself or
// to its parent rather than to a child directory.
func normalizeVolumeName(name string) (string, error) {
	trimmed := strings.TrimSpace(name)
	if trimmed == "" {
		return "", fmt.Errorf("managed volume name is required")
	}

	var b strings.Builder
	b.Grow(len(trimmed))
	for _, r := range trimmed {
		switch {
		case unicode.IsLetter(r), unicode.IsDigit(r):
			b.WriteRune(unicode.ToLower(r))
		case r == '-', r == '_', r == '.':
			b.WriteRune(r)
		default:
			b.WriteByte('-')
		}
	}

	out := strings.Trim(b.String(), "-")
	switch out {
	case "", ".", "..":
		return "", fmt.Errorf("managed volume name %q is invalid", name)
	}
	return out, nil
}

// volumeID builds the "<driver>:<name>" identifier used for volume handles.
// Both parts are whitespace-trimmed and the driver is lowercased.
func volumeID(driver, name string) string {
	return strings.ToLower(strings.TrimSpace(driver)) + ":" + strings.TrimSpace(name)
}

// safeJoin joins sub onto base and returns the cleaned result, refusing any
// result that resolves outside base. A sub that resolves to base itself
// (for example "" or ".") yields the cleaned base path.
//
// The escape test compares whole path elements: only a relative path that is
// exactly ".." or begins with "../" counts as an escape, so child names that
// merely start with two dots (such as "..data") are accepted.
func safeJoin(base, sub string) (string, error) {
	if strings.TrimSpace(base) == "" {
		return "", fmt.Errorf("base path is required")
	}
	cleanBase := filepath.Clean(base)
	path := filepath.Clean(filepath.Join(cleanBase, sub))
	if path == cleanBase {
		return path, nil
	}
	rel, err := filepath.Rel(cleanBase, path)
	if err != nil {
		return "", err
	}
	if rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
		return "", fmt.Errorf("resolved path %q escapes base path", path)
	}
	return path, nil
}
NewLocalProvider(basePath string) *LocalProvider { + root := strings.TrimSpace(basePath) + if root == "" { + root = defaultLocalVolumeRoot + } + return &LocalProvider{basePath: filepath.Clean(root)} +} + +func (p *LocalProvider) Driver() string { + return "local" +} + +func (p *LocalProvider) Validate(_ context.Context, spec platform.VolumeSpec) error { + _, err := normalizeVolumeName(spec.Name) + return err +} + +func (p *LocalProvider) Provision(_ context.Context, spec platform.VolumeSpec) (*platform.VolumeHandle, error) { + name, err := normalizeVolumeName(spec.Name) + if err != nil { + return nil, err + } + path, err := safeJoin(p.basePath, name) + if err != nil { + return nil, err + } + if err := os.MkdirAll(path, 0o755); err != nil { + return nil, fmt.Errorf("create local volume directory: %w", err) + } + now := time.Now().UTC() + return &platform.VolumeHandle{ + ID: volumeID(p.Driver(), name), + Name: name, + Driver: p.Driver(), + SizeGB: spec.SizeGB, + Device: path, + StagePath: path, + Metadata: map[string]string{ + "driver": p.Driver(), + "fs_type": strings.TrimSpace(spec.FSType), + }, + CreatedAt: now, + UpdatedAt: now, + }, nil +} + +func (p *LocalProvider) Delete(_ context.Context, handle *platform.VolumeHandle) error { + if handle == nil || strings.TrimSpace(handle.Device) == "" { + return nil + } + cleanBase := filepath.Clean(p.basePath) + cleanPath := filepath.Clean(handle.Device) + rel, err := filepath.Rel(cleanBase, cleanPath) + if err != nil { + return fmt.Errorf("validate local delete path: %w", err) + } + if strings.HasPrefix(rel, "..") { + return fmt.Errorf("refusing to delete path outside local storage root: %q", cleanPath) + } + path := filepath.Join(cleanBase, rel) + if err := os.RemoveAll(path); err != nil { + return fmt.Errorf("delete local volume path: %w", err) + } + return nil +} + +func (p *LocalProvider) Attach(_ context.Context, handle *platform.VolumeHandle, workloadID string, mountPath string, readOnly bool) 
(*platform.VolumeAttachment, error) { + if handle == nil { + return nil, fmt.Errorf("volume handle is required") + } + now := time.Now().UTC() + return &platform.VolumeAttachment{ + ID: fmt.Sprintf("%s:%s", handle.ID, strings.TrimSpace(workloadID)), + VolumeID: handle.ID, + WorkloadID: strings.TrimSpace(workloadID), + MountPath: strings.TrimSpace(mountPath), + ReadOnly: readOnly, + StagePath: handle.Device, + Metadata: map[string]string{ + "driver": p.Driver(), + }, + CreatedAt: now, + UpdatedAt: now, + }, nil +} + +func (p *LocalProvider) Detach(_ context.Context, _ *platform.VolumeAttachment) error { + return nil +} diff --git a/internal/storage/providers/nfs_provider.go b/internal/storage/providers/nfs_provider.go new file mode 100644 index 0000000..452adcb --- /dev/null +++ b/internal/storage/providers/nfs_provider.go @@ -0,0 +1,119 @@ +package providers + +import ( + "context" + "fmt" + "os" + "path" + "path/filepath" + "strings" + "time" + + "github.com/persys-dev/compute-agent/internal/platform" +) + +const defaultNFSStageRoot = "/var/lib/persys/volumes/nfs" + +type NFSProvider struct { + server string + exportPath string + stageRoot string + mountOptions string +} + +func NewNFSProvider(server, exportPath, stageRoot, mountOptions string) *NFSProvider { + root := strings.TrimSpace(stageRoot) + if root == "" { + root = defaultNFSStageRoot + } + return &NFSProvider{ + server: strings.TrimSpace(server), + exportPath: strings.TrimSpace(exportPath), + stageRoot: filepath.Clean(root), + mountOptions: strings.TrimSpace(mountOptions), + } +} + +func (p *NFSProvider) Driver() string { + return "nfs" +} + +func (p *NFSProvider) Validate(_ context.Context, spec platform.VolumeSpec) error { + if strings.TrimSpace(p.server) == "" { + return fmt.Errorf("nfs server is required") + } + if strings.TrimSpace(p.exportPath) == "" { + return fmt.Errorf("nfs export path is required") + } + _, err := normalizeVolumeName(spec.Name) + return err +} + +func (p *NFSProvider) 
Provision(_ context.Context, spec platform.VolumeSpec) (*platform.VolumeHandle, error) { + name, err := normalizeVolumeName(spec.Name) + if err != nil { + return nil, err + } + now := time.Now().UTC() + exportPath := strings.TrimSpace(p.exportPath) + remotePath := path.Join("/", exportPath, name) + return &platform.VolumeHandle{ + ID: volumeID(p.Driver(), name), + Name: name, + Driver: p.Driver(), + SizeGB: spec.SizeGB, + Device: fmt.Sprintf("nfs://%s%s", p.server, remotePath), + Metadata: map[string]string{ + "driver": p.Driver(), + "nfs_server": p.server, + "nfs_export": p.exportPath, + "nfs_path": remotePath, + "mount_options": p.mountOptions, + "fs_type": strings.TrimSpace(spec.FSType), + }, + CreatedAt: now, + UpdatedAt: now, + }, nil +} + +func (p *NFSProvider) Delete(_ context.Context, _ *platform.VolumeHandle) error { + // NFS volumes are assumed externally managed by the export backend for now. + return nil +} + +func (p *NFSProvider) Attach(_ context.Context, handle *platform.VolumeHandle, workloadID string, mountPath string, readOnly bool) (*platform.VolumeAttachment, error) { + if handle == nil { + return nil, fmt.Errorf("volume handle is required") + } + stageDir, err := safeJoin(p.stageRoot, filepath.Join(strings.TrimSpace(workloadID), strings.ReplaceAll(handle.ID, ":", "_"))) + if err != nil { + return nil, err + } + if err := os.MkdirAll(stageDir, 0o755); err != nil { + return nil, fmt.Errorf("create nfs stage directory: %w", err) + } + now := time.Now().UTC() + return &platform.VolumeAttachment{ + ID: fmt.Sprintf("%s:%s", handle.ID, strings.TrimSpace(workloadID)), + VolumeID: handle.ID, + WorkloadID: strings.TrimSpace(workloadID), + MountPath: strings.TrimSpace(mountPath), + ReadOnly: readOnly, + StagePath: stageDir, + Metadata: map[string]string{ + "driver": p.Driver(), + "nfs_server": p.server, + "nfs_export": p.exportPath, + "mount_options": p.mountOptions, + }, + CreatedAt: now, + UpdatedAt: now, + }, nil +} + +func (p *NFSProvider) Detach(_ 
context.Context, attachment *platform.VolumeAttachment) error { + if attachment == nil || strings.TrimSpace(attachment.StagePath) == "" { + return nil + } + return os.RemoveAll(attachment.StagePath) +} diff --git a/internal/storage/providers/providers_test.go b/internal/storage/providers/providers_test.go new file mode 100644 index 0000000..b712c48 --- /dev/null +++ b/internal/storage/providers/providers_test.go @@ -0,0 +1,74 @@ +package providers + +import ( + "context" + "path/filepath" + "testing" + + "github.com/persys-dev/compute-agent/internal/platform" +) + +func TestLocalProvider_ProvisionAndAttach(t *testing.T) { + p := NewLocalProvider(filepath.Join(t.TempDir(), "local")) + spec := platform.VolumeSpec{Name: "cache-data", Driver: "local", MountPath: "/data"} + + handle, err := p.Provision(context.Background(), spec) + if err != nil { + t.Fatalf("provision failed: %v", err) + } + if handle.ID == "" || handle.Device == "" { + t.Fatalf("expected local handle id/device to be set") + } + + attachment, err := p.Attach(context.Background(), handle, "w1", spec.MountPath, false) + if err != nil { + t.Fatalf("attach failed: %v", err) + } + if attachment.StagePath == "" { + t.Fatalf("expected stage path") + } +} + +func TestNFSProvider_ValidateAndAttach(t *testing.T) { + p := NewNFSProvider("10.0.0.10", "/exports/workloads", filepath.Join(t.TempDir(), "nfs"), "vers=4.1") + spec := platform.VolumeSpec{Name: "team-a", Driver: "nfs", MountPath: "/mnt/shared"} + if err := p.Validate(context.Background(), spec); err != nil { + t.Fatalf("validate failed: %v", err) + } + handle, err := p.Provision(context.Background(), spec) + if err != nil { + t.Fatalf("provision failed: %v", err) + } + if handle.Device != "nfs://10.0.0.10/exports/workloads/team-a" { + t.Fatalf("unexpected nfs device URI: %s", handle.Device) + } + if handle.Metadata["nfs_path"] != "/exports/workloads/team-a" { + t.Fatalf("unexpected nfs_path metadata: %s", handle.Metadata["nfs_path"]) + } + attachment, err 
:= p.Attach(context.Background(), handle, "workload-a", spec.MountPath, true) + if err != nil { + t.Fatalf("attach failed: %v", err) + } + if attachment.Metadata["driver"] != "nfs" { + t.Fatalf("expected nfs driver metadata") + } +} + +func TestCephRBDProvider_ValidateAndAttach(t *testing.T) { + p := NewCephRBDProvider("ceph", "rbd", "client.persys", "/etc/ceph/keyring", filepath.Join(t.TempDir(), "ceph")) + spec := platform.VolumeSpec{Name: "vm-root", Driver: "ceph-rbd", MountPath: "/var/lib/vm"} + if err := p.Validate(context.Background(), spec); err != nil { + t.Fatalf("validate failed: %v", err) + } + handle, err := p.Provision(context.Background(), spec) + if err != nil { + t.Fatalf("provision failed: %v", err) + } + attachment, err := p.Attach(context.Background(), handle, "vm-1", spec.MountPath, false) + if err != nil { + t.Fatalf("attach failed: %v", err) + } + if attachment.Metadata["driver"] != "ceph-rbd" { + t.Fatalf("expected ceph-rbd driver metadata") + } +} diff --git a/internal/task/queue.go b/internal/task/queue.go index c43bba1..9565343 100644 --- a/internal/task/queue.go +++ b/internal/task/queue.go @@ -57,6 +57,13 @@ type TaskSnapshot struct { // Handler is a function that executes a task type Handler func(ctx context.Context, task *Task) error +// MetricsObserver captures queue/task telemetry without coupling this package +// to a specific metrics backend. +type MetricsObserver interface { + SetTaskQueueDepth(depth int) + ObserveTaskExecution(taskType string, duration time.Duration, failed bool) +} + // Queue manages async task execution type Queue struct { handlers map[TaskType]Handler @@ -67,6 +74,7 @@ type Queue struct { mu sync.RWMutex maxWorkers int stopOnce sync.Once + observer MetricsObserver } // NewQueue creates a new task queue @@ -91,9 +99,17 @@ func (q *Queue) RegisterHandler(taskType TaskType, handler Handler) { q.handlers[taskType] = handler } +// SetMetricsObserver attaches optional queue/task metrics emission. 
+func (q *Queue) SetMetricsObserver(observer MetricsObserver) { + q.mu.Lock() + defer q.mu.Unlock() + q.observer = observer +} + // Start begins processing tasks func (q *Queue) Start(ctx context.Context) { q.logger.Infof("Starting task queue with %d workers", q.maxWorkers) + q.observeQueueDepth() // Start worker goroutines for i := 0; i < q.maxWorkers; i++ { @@ -126,6 +142,7 @@ func (q *Queue) worker(ctx context.Context, id int) { if task == nil { return } + q.observeQueueDepth() q.executeTask(ctx, task) case <-q.stopCh: @@ -140,6 +157,7 @@ func (q *Queue) worker(ctx context.Context, id int) { // executeTask runs a task with its registered handler func (q *Queue) executeTask(ctx context.Context, task *Task) { + start := time.Now() task.mu.Lock() task.Status = TaskStatusRunning task.StartedAt = time.Now() @@ -157,6 +175,7 @@ func (q *Queue) executeTask(ctx context.Context, task *Task) { task.EndedAt = time.Now() task.mu.Unlock() q.logger.Errorf("Task %s failed: %s", task.ID, task.Error) + q.observeTaskExecution(task.Type, time.Since(start), true) return } @@ -174,6 +193,7 @@ func (q *Queue) executeTask(ctx context.Context, task *Task) { q.logger.Infof("Task %s completed in %s", task.ID, time.Since(task.StartedAt)) } task.mu.Unlock() + q.observeTaskExecution(task.Type, time.Since(start), err != nil) } // Submit enqueues a task for async execution @@ -194,6 +214,7 @@ func (q *Queue) Submit(task *Task) error { select { case q.taskCh <- task: q.logger.Debugf("Task %s submitted (type: %s)", task.ID, task.Type) + q.observeQueueDepth() return nil case <-q.stopCh: q.mu.Lock() @@ -208,6 +229,24 @@ func (q *Queue) Submit(task *Task) error { } } +func (q *Queue) observeQueueDepth() { + q.mu.RLock() + obs := q.observer + q.mu.RUnlock() + if obs != nil { + obs.SetTaskQueueDepth(len(q.taskCh)) + } +} + +func (q *Queue) observeTaskExecution(taskType TaskType, duration time.Duration, failed bool) { + q.mu.RLock() + obs := q.observer + q.mu.RUnlock() + if obs != nil { + 
obs.ObserveTaskExecution(string(taskType), duration, failed) + } +} + // GetTask retrieves a task by ID func (q *Queue) GetTask(id string) *Task { q.mu.RLock() diff --git a/internal/telemetry/otel.go b/internal/telemetry/otel.go new file mode 100644 index 0000000..9887c12 --- /dev/null +++ b/internal/telemetry/otel.go @@ -0,0 +1,76 @@ +package telemetry + +import ( + "context" + "net/url" + "os" + "strings" + + "github.com/sirupsen/logrus" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + tracesdk "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.21.0" +) + +// Setup initializes OpenTelemetry tracing for the compute agent. +func Setup(ctx context.Context, logger *logrus.Logger, defaultServiceName string) (func(context.Context) error, error) { + endpoint := strings.TrimSpace(os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT")) + if endpoint == "" { + endpoint = strings.TrimSpace(os.Getenv("OTEL_EXPORTER_JAEGER_ENDPOINT")) + } + if endpoint == "" { + logger.Info("OpenTelemetry exporter disabled: endpoint not configured") + return func(context.Context) error { return nil }, nil + } + + serviceName := strings.TrimSpace(os.Getenv("OTEL_SERVICE_NAME")) + if serviceName == "" { + serviceName = defaultServiceName + } + + opts := make([]otlptracehttp.Option, 0, 3) + if strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://") { + u, err := url.Parse(endpoint) + if err != nil { + return nil, err + } + opts = append(opts, otlptracehttp.WithEndpoint(u.Host)) + if p := strings.TrimSpace(u.Path); p != "" && p != "/" { + opts = append(opts, otlptracehttp.WithURLPath(p)) + } + if u.Scheme != "https" { + opts = append(opts, otlptracehttp.WithInsecure()) + } + } else { + opts = append(opts, otlptracehttp.WithEndpoint(endpoint), otlptracehttp.WithInsecure()) + } + + exporter, err := otlptracehttp.New(ctx, 
opts...) + if err != nil { + return nil, err + } + + tp := tracesdk.NewTracerProvider( + tracesdk.WithBatcher(exporter), + tracesdk.WithResource(resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceName(serviceName), + )), + ) + + otel.SetTracerProvider(tp) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + + logger.WithFields(logrus.Fields{ + "service": serviceName, + "endpoint": endpoint, + }).Info("OpenTelemetry tracing enabled") + + return tp.Shutdown, nil +} diff --git a/internal/workload/manager.go b/internal/workload/manager.go index 8448861..8f56360 100644 --- a/internal/workload/manager.go +++ b/internal/workload/manager.go @@ -2,25 +2,40 @@ package workload import ( "context" + "encoding/json" stdErrors "errors" "fmt" + "sort" "strings" "sync" "time" - "github.com/persys/compute-agent/internal/errors" - "github.com/persys/compute-agent/internal/metrics" - "github.com/persys/compute-agent/internal/resources" - "github.com/persys/compute-agent/internal/retry" - "github.com/persys/compute-agent/internal/runtime" - "github.com/persys/compute-agent/internal/state" - "github.com/persys/compute-agent/pkg/models" + "github.com/persys-dev/compute-agent/internal/errors" + "github.com/persys-dev/compute-agent/internal/metrics" + "github.com/persys-dev/compute-agent/internal/platform" + "github.com/persys-dev/compute-agent/internal/resources" + "github.com/persys-dev/compute-agent/internal/retry" + "github.com/persys-dev/compute-agent/internal/runtime" + "github.com/persys-dev/compute-agent/internal/state" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" ) +const ( + pendingSinceMetadataKey = "pending_since" + pendingRecoveryActionMetadataKey = "pending_recovery_action" + pendingRecoveryReasonMetadataKey = "pending_recovery_reason" + pendingRecoveryDeletedMetadataKey = "pending_recovery_deleted" + retryTerminalMetadataKey = "retry_terminal" + 
retryTerminalReasonMetadataKey = "retry_terminal_reason" + defaultPendingRecoveryThreshold = 5 * time.Minute +) + // Manager handles workload lifecycle with idempotency type Manager struct { store state.Store + volumeState state.ManagedVolumeStore + volumeMgr platform.VolumeManager runtimeMgr *runtime.Manager logger *logrus.Entry metrics *metrics.Metrics @@ -35,8 +50,13 @@ type Manager struct { // NewManager creates a new workload manager func NewManager(store state.Store, runtimeMgr *runtime.Manager, logger *logrus.Logger) *Manager { + var volumeState state.ManagedVolumeStore + if casted, ok := store.(state.ManagedVolumeStore); ok { + volumeState = casted + } return &Manager{ store: store, + volumeState: volumeState, runtimeMgr: runtimeMgr, logger: logger.WithField("component", "workload-manager"), workloadOpLocks: make(map[string]*sync.Mutex), @@ -45,6 +65,11 @@ func NewManager(store state.Store, runtimeMgr *runtime.Manager, logger *logrus.L } } +// SetVolumeManager enables managed-volume lifecycle orchestration. 
+func (m *Manager) SetVolumeManager(volumeMgr platform.VolumeManager) { + m.volumeMgr = volumeMgr +} + // SetMetrics sets the metrics instance for recording operational metrics func (m *Manager) SetMetrics(metricsInst *metrics.Metrics) { m.metrics = metricsInst @@ -101,6 +126,25 @@ func ensureMetadata(status *models.WorkloadStatus) { } } +func isRetryTerminal(status *models.WorkloadStatus) bool { + if status == nil || status.Metadata == nil { + return false + } + return strings.EqualFold(strings.TrimSpace(status.Metadata[retryTerminalMetadataKey]), "true") +} + +func markRetryTerminal(status *models.WorkloadStatus, tracker *retry.RetryTracker, reason string) { + if status == nil { + return + } + ensureMetadata(status) + status.Metadata[retryTerminalMetadataKey] = "true" + status.Metadata[retryTerminalReasonMetadataKey] = strings.TrimSpace(reason) + if tracker != nil { + status.Metadata["retry_attempts"] = fmt.Sprintf("%d", tracker.GetAttemptCount()) + } +} + func isRuntimeMissing(message string) bool { msg := strings.ToLower(message) return strings.Contains(msg, "not found") || @@ -108,6 +152,33 @@ func isRuntimeMissing(message string) bool { strings.Contains(msg, "does not exist") } +func normalizeStateForDesired(desired models.DesiredState, actual models.ActualState, message string) (models.ActualState, string) { + if desired == models.DesiredStateStopped && actual == models.ActualStateFailed { + msg := strings.TrimSpace(message) + if msg == "" { + return models.ActualStateStopped, "stopped" + } + return models.ActualStateStopped, fmt.Sprintf("stopped (previous runtime error: %s)", msg) + } + return actual, message +} + +func clearRetryStateMetadata(status *models.WorkloadStatus) { + if status == nil || status.Metadata == nil { + return + } + for _, key := range []string{ + "retry_attempts", + "next_retry_time", + "failure_reason", + "last_error", + retryTerminalMetadataKey, + retryTerminalReasonMetadataKey, + } { + delete(status.Metadata, key) + } +} + func (m 
*Manager) persistFailedStatus(workload *models.Workload, message string, err error) { failedStatus, statusErr := m.store.GetStatus(workload.ID) if statusErr != nil || failedStatus == nil { @@ -137,6 +208,63 @@ func (m *Manager) persistFailedStatus(workload *models.Workload, message string, } } +func (m *Manager) persistManagedStorageFailureStatus(workload *models.Workload, code, message string, err error) { + status, statusErr := m.store.GetStatus(workload.ID) + if statusErr != nil || status == nil { + status = &models.WorkloadStatus{ + ID: workload.ID, + Type: workload.Type, + RevisionID: workload.RevisionID, + DesiredState: workload.DesiredState, + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + } + } + status.ActualState = models.ActualStateFailed + status.Message = strings.TrimSpace(message) + status.UpdatedAt = time.Now() + ensureMetadata(status) + status.Metadata["failure_reason"] = strings.TrimSpace(code) + status.Metadata["failure_message"] = strings.TrimSpace(message) + if err != nil { + status.Metadata["last_error"] = err.Error() + } + if saveErr := m.store.SaveStatus(status); saveErr != nil { + m.logger.Warnf("Failed to persist managed-storage failure status for %s: %v", workload.ID, saveErr) + } +} + +func failureReasonCodeFromRuntimeError(err error) string { + if err == nil { + return "" + } + lower := strings.ToLower(err.Error()) + switch { + case strings.Contains(lower, "cloud-init-invalid"): + return "CLOUD_INIT_INVALID" + default: + return "" + } +} + +func (m *Manager) annotateFailureReason(workloadID, code, message string) { + if strings.TrimSpace(workloadID) == "" || strings.TrimSpace(code) == "" { + return + } + status, err := m.store.GetStatus(workloadID) + if err != nil || status == nil { + return + } + ensureMetadata(status) + status.Metadata["failure_reason"] = strings.TrimSpace(code) + if strings.TrimSpace(message) != "" { + status.Metadata["failure_message"] = strings.TrimSpace(message) + } + if saveErr := m.store.SaveStatus(status); 
saveErr != nil { + m.logger.Warnf("Failed to annotate failure reason for %s: %v", workloadID, saveErr) + } +} + func (m *Manager) enrichStatusWithRuntimeMetadata(ctx context.Context, rt runtime.Runtime, workloadID string, status *models.WorkloadStatus) { provider, ok := rt.(runtime.StatusMetadataProvider) if !ok || status == nil { @@ -159,6 +287,355 @@ func (m *Manager) enrichStatusWithRuntimeMetadata(ctx context.Context, rt runtim } } +type managedStorageAllocation struct { + spec models.ManagedVolumeSpec + handle *platform.VolumeHandle + attachment *platform.VolumeAttachment +} + +func (m *Manager) hasManagedStorage(workload *models.Workload) bool { + if workload == nil { + return false + } + if workload.Type == models.WorkloadTypeVM { + vmSpec, err := parseVMSpec(workload.Spec) + return err == nil && len(vmSpec.ManagedVolumes) > 0 + } + containerSpec, err := parseContainerSpec(workload.Spec) + return err == nil && len(containerSpec.ManagedVolumes) > 0 +} + +func parseContainerSpec(specMap map[string]interface{}) (*models.ContainerSpec, error) { + payload, err := json.Marshal(specMap) + if err != nil { + return nil, err + } + var spec models.ContainerSpec + if err := json.Unmarshal(payload, &spec); err != nil { + return nil, err + } + return &spec, nil +} + +func parseVMSpec(specMap map[string]interface{}) (*models.VMSpec, error) { + payload, err := json.Marshal(specMap) + if err != nil { + return nil, err + } + var spec models.VMSpec + if err := json.Unmarshal(payload, &spec); err != nil { + return nil, err + } + return &spec, nil +} + +func encodeSpecToMap(spec interface{}) (map[string]interface{}, error) { + payload, err := json.Marshal(spec) + if err != nil { + return nil, err + } + out := make(map[string]interface{}) + if err := json.Unmarshal(payload, &out); err != nil { + return nil, err + } + return out, nil +} + +func normalizeManagedVolumeSpec(spec models.ManagedVolumeSpec, idx int) (models.ManagedVolumeSpec, error) { + if strings.TrimSpace(spec.Name) 
== "" { + return spec, fmt.Errorf("managed volume at index %d is missing name", idx) + } + if strings.TrimSpace(spec.Driver) == "" { + spec.Driver = "local" + } + if strings.TrimSpace(spec.MountPath) == "" { + return spec, fmt.Errorf("managed volume %q is missing mount_path", spec.Name) + } + if strings.TrimSpace(spec.RetainPolicy) == "" { + spec.RetainPolicy = "Delete" + } + return spec, nil +} + +func nextVMManagedDiskDevice(existing []models.DiskConfig) string { + used := make(map[string]struct{}, len(existing)) + for _, disk := range existing { + device := strings.ToLower(strings.TrimSpace(disk.Device)) + if device == "" { + continue + } + used[device] = struct{}{} + } + for letter := 'b'; letter <= 'z'; letter++ { + candidate := fmt.Sprintf("vd%c", letter) + if _, ok := used[candidate]; !ok { + return candidate + } + } + return fmt.Sprintf("vdx%d", time.Now().UnixNano()%10000) +} + +func volumeHandleIDFromAttachment(attachment *platform.VolumeAttachment) string { + if attachment == nil { + return "" + } + if strings.TrimSpace(attachment.VolumeID) != "" { + return attachment.VolumeID + } + parts := strings.Split(strings.TrimSpace(attachment.ID), ":") + if len(parts) >= 2 { + return parts[0] + ":" + parts[1] + } + return "" +} + +func (m *Manager) saveManagedVolumeHandle(handle *platform.VolumeHandle) { + if m.volumeState == nil || handle == nil { + return + } + if err := m.volumeState.SaveVolume(handle); err != nil { + m.logger.Warnf("Failed to save managed volume handle %s: %v", handle.ID, err) + } +} + +func (m *Manager) saveManagedVolumeAttachment(attachment *platform.VolumeAttachment) { + if m.volumeState == nil || attachment == nil { + return + } + if err := m.volumeState.SaveAttachment(attachment); err != nil { + m.logger.Warnf("Failed to save managed volume attachment %s: %v", attachment.ID, err) + } +} + +func (m *Manager) prepareManagedStorageForContainer(ctx context.Context, workload *models.Workload) ([]managedStorageAllocation, error) { + 
containerSpec, err := parseContainerSpec(workload.Spec) + if err != nil { + return nil, fmt.Errorf("failed to parse container spec for managed volumes: %w", err) + } + if len(containerSpec.ManagedVolumes) == 0 { + return nil, nil + } + if m.volumeMgr == nil { + return nil, fmt.Errorf("managed volume requested but volume manager is not configured") + } + + allocations := make([]managedStorageAllocation, 0, len(containerSpec.ManagedVolumes)) + for idx, rawSpec := range containerSpec.ManagedVolumes { + spec, err := normalizeManagedVolumeSpec(rawSpec, idx) + if err != nil { + return nil, err + } + platformSpec := platform.VolumeSpecFromModel(spec) + handle, err := m.volumeMgr.Provision(ctx, platformSpec) + if err != nil { + return nil, fmt.Errorf("provision volume %q (%s): %w", spec.Name, spec.Driver, err) + } + if handle.Metadata == nil { + handle.Metadata = map[string]string{} + } + handle.Metadata["retain_policy"] = strings.TrimSpace(spec.RetainPolicy) + handle.Metadata["mount_path"] = strings.TrimSpace(spec.MountPath) + m.saveManagedVolumeHandle(handle) + + attachment, err := m.volumeMgr.Attach(ctx, platformSpec, handle, workload.ID) + if err != nil { + return nil, fmt.Errorf("attach volume %q (%s): %w", spec.Name, spec.Driver, err) + } + if attachment.Metadata == nil { + attachment.Metadata = map[string]string{} + } + attachment.Metadata["driver"] = strings.TrimSpace(spec.Driver) + attachment.Metadata["retain_policy"] = strings.TrimSpace(spec.RetainPolicy) + m.saveManagedVolumeAttachment(attachment) + + volumeMount := models.VolumeMount{ + HostPath: attachment.StagePath, + ContainerPath: spec.MountPath, + ReadOnly: spec.ReadOnly, + } + containerSpec.Volumes = append(containerSpec.Volumes, volumeMount) + allocations = append(allocations, managedStorageAllocation{ + spec: spec, + handle: handle, + attachment: attachment, + }) + } + + updatedMap, err := encodeSpecToMap(containerSpec) + if err != nil { + return nil, fmt.Errorf("failed to encode updated container spec: 
%w", err) + } + workload.Spec = updatedMap + return allocations, nil +} + +func (m *Manager) prepareManagedStorageForVM(ctx context.Context, workload *models.Workload) ([]managedStorageAllocation, error) { + vmSpec, err := parseVMSpec(workload.Spec) + if err != nil { + return nil, fmt.Errorf("failed to parse vm spec for managed volumes: %w", err) + } + if len(vmSpec.ManagedVolumes) == 0 { + return nil, nil + } + if m.volumeMgr == nil { + return nil, fmt.Errorf("managed volume requested but volume manager is not configured") + } + + allocations := make([]managedStorageAllocation, 0, len(vmSpec.ManagedVolumes)) + for idx, rawSpec := range vmSpec.ManagedVolumes { + spec, err := normalizeManagedVolumeSpec(rawSpec, idx) + if err != nil { + return nil, err + } + platformSpec := platform.VolumeSpecFromModel(spec) + handle, err := m.volumeMgr.Provision(ctx, platformSpec) + if err != nil { + return nil, fmt.Errorf("provision volume %q (%s): %w", spec.Name, spec.Driver, err) + } + if handle.Metadata == nil { + handle.Metadata = map[string]string{} + } + handle.Metadata["retain_policy"] = strings.TrimSpace(spec.RetainPolicy) + handle.Metadata["mount_path"] = strings.TrimSpace(spec.MountPath) + m.saveManagedVolumeHandle(handle) + + attachment, err := m.volumeMgr.Attach(ctx, platformSpec, handle, workload.ID) + if err != nil { + return nil, fmt.Errorf("attach volume %q (%s): %w", spec.Name, spec.Driver, err) + } + if attachment.Metadata == nil { + attachment.Metadata = map[string]string{} + } + attachment.Metadata["driver"] = strings.TrimSpace(spec.Driver) + attachment.Metadata["retain_policy"] = strings.TrimSpace(spec.RetainPolicy) + m.saveManagedVolumeAttachment(attachment) + + diskPath := strings.TrimSpace(handle.Device) + if diskPath == "" { + diskPath = strings.TrimSpace(attachment.StagePath) + } + if diskPath == "" { + return nil, fmt.Errorf("managed volume %q did not provide attachable disk path", spec.Name) + } + vmSpec.Disks = append(vmSpec.Disks, models.DiskConfig{ + 
Path: diskPath, + Device: nextVMManagedDiskDevice(vmSpec.Disks), + Format: "raw", + SizeGB: spec.SizeGB, + Type: models.DiskTypeDisk, + Boot: false, + }) + allocations = append(allocations, managedStorageAllocation{ + spec: spec, + handle: handle, + attachment: attachment, + }) + } + + updatedMap, err := encodeSpecToMap(vmSpec) + if err != nil { + return nil, fmt.Errorf("failed to encode updated vm spec: %w", err) + } + workload.Spec = updatedMap + return allocations, nil +} + +func (m *Manager) cleanupManagedStorageAllocations(ctx context.Context, allocations []managedStorageAllocation, deleteHandles bool) { + if len(allocations) == 0 || m.volumeMgr == nil { + return + } + for i := len(allocations) - 1; i >= 0; i-- { + allocation := allocations[i] + if allocation.attachment != nil { + if err := m.volumeMgr.Detach(ctx, allocation.attachment); err != nil { + m.logger.Warnf("Failed to rollback attachment %s: %v", allocation.attachment.ID, err) + } + if m.volumeState != nil { + _ = m.volumeState.DeleteAttachment(allocation.attachment.ID) + } + } + if deleteHandles && allocation.handle != nil { + retainPolicy := strings.ToLower(strings.TrimSpace(allocation.spec.RetainPolicy)) + if retainPolicy == "retain" { + continue + } + if err := m.volumeMgr.Delete(ctx, allocation.handle); err != nil { + m.logger.Warnf("Failed to rollback volume %s: %v", allocation.handle.ID, err) + } + if m.volumeState != nil { + _ = m.volumeState.DeleteVolume(allocation.handle.ID) + } + } + } +} + +func (m *Manager) prepareManagedStorage(ctx context.Context, workload *models.Workload) ([]managedStorageAllocation, error) { + if workload == nil { + return nil, nil + } + switch workload.Type { + case models.WorkloadTypeContainer: + return m.prepareManagedStorageForContainer(ctx, workload) + case models.WorkloadTypeVM: + return m.prepareManagedStorageForVM(ctx, workload) + default: + return nil, nil + } +} + +func (m *Manager) releaseManagedStorageForWorkload(ctx context.Context, workloadID string) { 
+ if m.volumeMgr == nil || m.volumeState == nil { + return + } + attachments, err := m.volumeState.ListAttachments(workloadID) + if err != nil { + m.logger.Warnf("Failed to list managed volume attachments for workload %s: %v", workloadID, err) + return + } + if len(attachments) == 0 { + return + } + sort.SliceStable(attachments, func(i, j int) bool { + return attachments[i].CreatedAt.After(attachments[j].CreatedAt) + }) + visitedVolumes := make(map[string]struct{}) + for _, attachment := range attachments { + if attachment == nil { + continue + } + if err := m.volumeMgr.Detach(ctx, attachment); err != nil { + m.logger.Warnf("Failed to detach managed volume attachment %s: %v", attachment.ID, err) + } + _ = m.volumeState.DeleteAttachment(attachment.ID) + + volumeID := volumeHandleIDFromAttachment(attachment) + if volumeID == "" { + continue + } + if _, exists := visitedVolumes[volumeID]; exists { + continue + } + visitedVolumes[volumeID] = struct{}{} + + handle, err := m.volumeState.GetVolume(volumeID) + if err != nil || handle == nil { + continue + } + retainPolicy := "" + if handle.Metadata != nil { + retainPolicy = strings.ToLower(strings.TrimSpace(handle.Metadata["retain_policy"])) + } + if retainPolicy == "retain" { + continue + } + if err := m.volumeMgr.Delete(ctx, handle); err != nil { + m.logger.Warnf("Failed to delete managed volume %s: %v", handle.ID, err) + continue + } + _ = m.volumeState.DeleteVolume(handle.ID) + } +} + // ApplyWorkload applies a workload with revision-based idempotency func (m *Manager) ApplyWorkload(ctx context.Context, workload *models.Workload) (*models.WorkloadStatus, bool, error) { unlock := m.lockWorkloadOp(workload.ID) @@ -169,36 +646,66 @@ func (m *Manager) ApplyWorkload(ctx context.Context, workload *models.Workload) // Check if workload already exists with same revision (idempotency) existing, err := m.store.GetWorkload(workload.ID) if err == nil && existing.RevisionID == workload.RevisionID { - status, err := 
m.store.GetStatus(workload.ID) - if err == nil && status != nil { - missingFromRuntime := status.ActualState == models.ActualStateUnknown && isRuntimeMissing(status.Message) - if status.ActualState != models.ActualStateFailed && !missingFromRuntime { - m.logger.Infof("Workload %s already at revision %s, skipping", workload.ID, workload.RevisionID) + status, statusErr := m.store.GetStatus(workload.ID) + if statusErr != nil || status == nil { + if existing.DesiredState == workload.DesiredState { + m.logger.Warnf( + "Workload %s revision matches but status unavailable, preserving idempotent skip: %v", + workload.ID, + statusErr, + ) + status = &models.WorkloadStatus{ + ID: workload.ID, + Type: workload.Type, + RevisionID: workload.RevisionID, + DesiredState: workload.DesiredState, + ActualState: models.ActualStateUnknown, + Message: "status not found", + } return status, true, nil // skipped=true } - - m.logger.Infof( - "Workload %s at revision %s has non-healthy state (%s), retrying apply", - workload.ID, - workload.RevisionID, - status.ActualState, - ) - } else { - m.logger.Warnf( - "Workload %s revision matches but status unavailable, preserving idempotent skip: %v", - workload.ID, - err, - ) status = &models.WorkloadStatus{ ID: workload.ID, Type: workload.Type, RevisionID: workload.RevisionID, - DesiredState: workload.DesiredState, + DesiredState: existing.DesiredState, ActualState: models.ActualStateUnknown, Message: "status not found", + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + } + } + + missingFromRuntime := status.ActualState == models.ActualStateUnknown && isRuntimeMissing(status.Message) + desiredChanged := existing.DesiredState != workload.DesiredState + desiredNeedsAction := desiredChanged || + (workload.DesiredState == models.DesiredStateRunning && status.ActualState != models.ActualStateRunning && !missingFromRuntime) || + (workload.DesiredState == models.DesiredStateStopped && (status.ActualState == models.ActualStateRunning || 
status.ActualState == models.ActualStatePending)) + + if status.ActualState != models.ActualStateFailed && !missingFromRuntime { + if desiredNeedsAction { + existing.DesiredState = workload.DesiredState + existing.UpdatedAt = time.Now() + if err := m.store.SaveWorkload(existing); err != nil { + return nil, false, fmt.Errorf("failed to persist desired state transition: %w", err) + } + updatedStatus, err := m.applyDesiredStateTransition(ctx, existing, status) + if err != nil { + return nil, false, err + } + return updatedStatus, false, nil } + + m.logger.Infof("Workload %s already at revision %s, skipping", workload.ID, workload.RevisionID) return status, true, nil // skipped=true } + + m.logger.Infof( + "Workload %s at revision %s has non-healthy state (%s), retrying apply", + workload.ID, + workload.RevisionID, + status.ActualState, + ) } // Get runtime for this workload type @@ -275,10 +782,36 @@ func (m *Manager) ApplyWorkload(ctx context.Context, workload *models.Workload) m.logger.Warnf("Failed to save initial status: %v", err) } + // Provision/attach managed volumes before runtime create. 
+ managedAllocations, err := m.prepareManagedStorage(ctx, workload) + if err != nil { + reasonCode := "STORAGE_PROVISION_FAILED" + if strings.Contains(strings.ToLower(err.Error()), "attach") { + reasonCode = "STORAGE_ATTACH_FAILED" + } + m.persistManagedStorageFailureStatus(workload, reasonCode, fmt.Sprintf("managed storage setup failed: %v", err), err) + workloadErr := errors.NewWorkloadError( + errors.ErrCodeVolumeMountFailed, + "Failed to setup managed storage for workload", + workload.ID, + string(workload.Type), + err, + ) + workloadErr.WithDetail("failure_reason", reasonCode) + return nil, false, workloadErr + } + if len(managedAllocations) > 0 { + if err := m.store.SaveWorkload(workload); err != nil { + m.cleanupManagedStorageAllocations(ctx, managedAllocations, true) + return nil, false, fmt.Errorf("failed to persist workload with managed volume attachments: %w", err) + } + } + // Create the workload in the runtime startTime := time.Now() if err := rt.Create(ctx, workload); err != nil { m.logger.Errorf("Failed to create workload %s: %v", workload.ID, err) + m.cleanupManagedStorageAllocations(ctx, managedAllocations, true) // Record failed workload metric if m.metrics != nil { @@ -286,6 +819,7 @@ func (m *Manager) ApplyWorkload(ctx context.Context, workload *models.Workload) } m.persistFailedStatus(workload, fmt.Sprintf("create failed: %v", err), err) + m.annotateFailureReason(workload.ID, failureReasonCodeFromRuntimeError(err), err.Error()) // Create detailed error workloadErr := errors.NewWorkloadError( @@ -310,12 +844,10 @@ func (m *Manager) ApplyWorkload(ctx context.Context, workload *models.Workload) if workload.DesiredState == models.DesiredStateRunning { if err := rt.Start(ctx, workload.ID); err != nil { m.logger.Errorf("Failed to start workload %s: %v, leaving desired state for reconciliation", workload.ID, err) - - if delErr := rt.Delete(ctx, workload.ID); delErr != nil { - m.logger.Warnf("Failed to delete workload during cleanup: %v", delErr) - } + 
m.cleanupManagedStorageAllocations(ctx, managedAllocations, true) m.persistFailedStatus(workload, fmt.Sprintf("start failed: %v", err), err) + m.annotateFailureReason(workload.ID, failureReasonCodeFromRuntimeError(err), err.Error()) workloadErr := errors.NewWorkloadError( errors.ErrCodeStartFailed, @@ -367,6 +899,84 @@ func (m *Manager) ApplyWorkload(ctx context.Context, workload *models.Workload) return status, false, nil } +func (m *Manager) applyDesiredStateTransition(ctx context.Context, workload *models.Workload, status *models.WorkloadStatus) (*models.WorkloadStatus, error) { + rt, err := m.runtimeMgr.GetRuntime(workload.Type) + if err != nil { + return nil, fmt.Errorf("runtime not available: %w", err) + } + + now := time.Now() + if status == nil { + status = &models.WorkloadStatus{ + ID: workload.ID, + Type: workload.Type, + CreatedAt: now, + } + } + status.Type = workload.Type + status.RevisionID = workload.RevisionID + status.DesiredState = workload.DesiredState + + actualState, message, err := rt.Status(ctx, workload.ID) + if err != nil { + status.ActualState = models.ActualStateFailed + status.Message = fmt.Sprintf("runtime status failed: %v", err) + status.UpdatedAt = now + ensureMetadata(status) + status.Metadata["last_error"] = status.Message + _ = m.store.SaveStatus(status) + return nil, fmt.Errorf("runtime status check failed: %w", err) + } + actualState, message = normalizeStateForDesired(workload.DesiredState, actualState, message) + + switch workload.DesiredState { + case models.DesiredStateRunning: + if actualState != models.ActualStateRunning { + if err := rt.Start(ctx, workload.ID); err != nil { + m.persistFailedStatus(workload, fmt.Sprintf("start failed: %v", err), err) + return nil, errors.NewWorkloadError( + errors.ErrCodeStartFailed, + "Failed to start workload", + workload.ID, + string(workload.Type), + err, + ) + } + } + case models.DesiredStateStopped: + if actualState == models.ActualStateRunning || actualState == 
models.ActualStatePending { + if err := rt.Stop(ctx, workload.ID); err != nil { + status.ActualState = models.ActualStateFailed + status.Message = fmt.Sprintf("stop failed: %v", err) + status.UpdatedAt = time.Now() + ensureMetadata(status) + status.Metadata["last_error"] = err.Error() + _ = m.store.SaveStatus(status) + return nil, fmt.Errorf("failed to stop workload: %w", err) + } + } + } + + actualState, message, err = rt.Status(ctx, workload.ID) + if err != nil { + actualState = models.ActualStateUnknown + message = fmt.Sprintf("status check failed: %v", err) + } + actualState, message = normalizeStateForDesired(workload.DesiredState, actualState, message) + status.ActualState = actualState + status.Message = message + status.UpdatedAt = time.Now() + if workload.DesiredState == models.DesiredStateStopped { + clearRetryStateMetadata(status) + } + m.enrichStatusWithRuntimeMetadata(ctx, rt, workload.ID, status) + if err := m.store.SaveStatus(status); err != nil { + return nil, fmt.Errorf("failed to save status: %w", err) + } + m.resetRetryTracker(workload.ID) + return status, nil +} + // DeleteWorkload removes a workload func (m *Manager) DeleteWorkload(ctx context.Context, id string) error { unlock := m.lockWorkloadOp(id) @@ -393,6 +1003,9 @@ func (m *Manager) DeleteWorkload(ctx context.Context, id string) error { m.logger.Warnf("Failed to delete workload from runtime: %v", err) } + // Detach/delete managed storage allocations for this workload. 
+ m.releaseManagedStorageForWorkload(ctx, id) + // Remove from state store if err := m.store.DeleteWorkload(id); err != nil { return fmt.Errorf("failed to delete from state: %w", err) @@ -423,9 +1036,14 @@ func (m *Manager) GetStatus(ctx context.Context, id string) (*models.WorkloadSta if err == nil { actualState, message, err := rt.Status(ctx, id) if err == nil { + actualState, message = normalizeStateForDesired(workload.DesiredState, actualState, message) status.ActualState = actualState status.Message = message status.UpdatedAt = time.Now() + if workload.DesiredState == models.DesiredStateStopped { + m.resetRetryTracker(id) + clearRetryStateMetadata(status) + } m.enrichStatusWithRuntimeMetadata(ctx, rt, id, status) m.store.SaveStatus(status) } @@ -442,18 +1060,29 @@ func (m *Manager) ListWorkloads(ctx context.Context, workloadType *models.Worklo return nil, fmt.Errorf("failed to list statuses: %w", err) } - // Filter by type if specified - if workloadType != nil { - var filtered []*models.WorkloadStatus - for _, status := range statuses { - if status.Type == *workloadType { - filtered = append(filtered, status) - } + if ctx == nil { + ctx = context.Background() + } + + refreshed := make([]*models.WorkloadStatus, 0, len(statuses)) + for _, status := range statuses { + if status == nil { + continue + } + if workloadType != nil && status.Type != *workloadType { + continue } - return filtered, nil + + current, refreshErr := m.GetStatus(ctx, status.ID) + if refreshErr != nil { + m.logger.Warnf("Failed to refresh status for %s during list: %v", status.ID, refreshErr) + refreshed = append(refreshed, status) + continue + } + refreshed = append(refreshed, current) } - return statuses, nil + return refreshed, nil } // ReconcileWorkload ensures workload state matches desired state @@ -485,11 +1114,16 @@ func (m *Manager) ReconcileWorkload(ctx context.Context, id string) error { status.UpdatedAt = time.Now() return m.store.SaveStatus(status) } + actualState, message = 
normalizeStateForDesired(workload.DesiredState, actualState, message) // Update status status.ActualState = actualState status.Message = message status.UpdatedAt = time.Now() + if workload.DesiredState == models.DesiredStateStopped { + m.resetRetryTracker(id) + clearRetryStateMetadata(status) + } // Reconcile state needsAction := false @@ -497,12 +1131,28 @@ func (m *Manager) ReconcileWorkload(ctx context.Context, id string) error { // Don't reconcile if workload is in a transient state (unknown or pending indicates creation/deletion in progress) missingFromRuntime := actualState == models.ActualStateUnknown && isRuntimeMissing(message) if actualState == models.ActualStatePending { + if workload.DesiredState == models.DesiredStateRunning { + if handled, err := m.handlePendingRecovery(ctx, workload, status, rt); handled { + return err + } + } m.logger.Debugf("Skipping reconciliation for %s: workload is in transient state (%s)", id, actualState) needsAction = false } else if actualState == models.ActualStateUnknown && !missingFromRuntime { m.logger.Debugf("Skipping reconciliation for %s: workload state unknown (%s)", id, message) needsAction = false - } else if actualState == models.ActualStateFailed || (missingFromRuntime && workload.DesiredState == models.DesiredStateRunning) { + } else if workload.DesiredState == models.DesiredStateRunning && (actualState == models.ActualStateFailed || missingFromRuntime) { + if isRetryTerminal(status) { + status.ActualState = models.ActualStateFailed + status.UpdatedAt = time.Now() + reason := strings.TrimSpace(status.Metadata[retryTerminalReasonMetadataKey]) + if reason != "" { + status.Message = fmt.Sprintf("retry halted: %s", reason) + } + m.logger.Infof("Skipping retry for %s: terminal retry state set (%s)", id, reason) + return m.store.SaveStatus(status) + } + if missingFromRuntime { m.logger.Infof("Reconciling %s: workload missing from runtime, attempting recreate...", id) status.ActualState = models.ActualStateFailed @@ 
-540,6 +1190,14 @@ func (m *Manager) ReconcileWorkload(ctx context.Context, id string) error { return m.store.SaveStatus(status) } } else { + markRetryTerminal(status, tracker, fmt.Sprintf("non-retryable failure (%s)", reason)) + status.ActualState = models.ActualStateFailed + status.UpdatedAt = time.Now() + if strings.TrimSpace(status.Message) == "" { + status.Message = fmt.Sprintf("retry halted due to non-retryable failure: %s", reason) + } else { + status.Message = fmt.Sprintf("retry halted due to non-retryable failure (%s): %s", reason, status.Message) + } m.logger.Infof("Not retrying failed workload %s (reason: %s)", id, reason) return m.store.SaveStatus(status) } @@ -605,11 +1263,63 @@ func (m *Manager) ReconcileWorkload(ctx context.Context, id string) error { } needsAction = true } else if workload.DesiredState == models.DesiredStateRunning && actualState != models.ActualStateRunning { + tracker := m.getRetryTracker(id) + if isRetryTerminal(status) { + status.ActualState = models.ActualStateFailed + status.UpdatedAt = time.Now() + reason := strings.TrimSpace(status.Metadata[retryTerminalReasonMetadataKey]) + if reason != "" { + status.Message = fmt.Sprintf("retry halted: %s", reason) + } + m.logger.Infof("Skipping start reconcile for %s: terminal retry state set (%s)", id, reason) + return m.store.SaveStatus(status) + } + if tracker.GetAttemptCount() > 0 && !tracker.CanRetryNow() { + ensureMetadata(status) + status.Metadata["retry_attempts"] = fmt.Sprintf("%d", tracker.GetAttemptCount()) + status.Metadata["next_retry_time"] = tracker.GetNextRetryTime().Format(time.RFC3339) + status.Metadata["failure_reason"] = string(retry.ClassifyError(stdErrors.New(status.Message))) + status.UpdatedAt = time.Now() + status.Message = fmt.Sprintf("reconcile start deferred until %s", tracker.GetNextRetryTime().Format(time.RFC3339)) + m.logger.Debugf("Deferring start reconcile for %s until %s", id, tracker.GetNextRetryTime().Format(time.RFC3339)) + return 
m.store.SaveStatus(status) + } + m.logger.Infof("Reconciling %s: desired=running, actual=%s, starting...", id, actualState) if err := rt.Start(ctx, id); err != nil { m.logger.Errorf("Failed to start %s during reconciliation: %v", id, err) + reason := retry.ClassifyError(err) + retryResult, recordErr := tracker.RecordFailure(reason, err.Error()) + if recordErr != nil { + m.logger.Warnf("Failed to record retry metadata for %s: %v", id, recordErr) + } + status.ActualState = models.ActualStateFailed status.Message = fmt.Sprintf("reconcile start failed: %v", err) + status.UpdatedAt = time.Now() + ensureMetadata(status) + status.Metadata["failure_reason"] = string(reason) + status.Metadata["last_error"] = status.Message + status.Metadata["last_runtime_error"] = err.Error() + status.Metadata["retry_attempts"] = fmt.Sprintf("%d", tracker.GetAttemptCount()) + if retryResult != nil && retryResult.Retryable { + status.Metadata["next_retry_time"] = retryResult.NextRetryTime.Format(time.RFC3339) + status.Message = fmt.Sprintf("reconcile start failed; retry scheduled at %s: %v", retryResult.NextRetryTime.Format(time.RFC3339), err) + } else { + markRetryTerminal(status, tracker, fmt.Sprintf("start failure is non-retryable (%s)", reason)) + status.Message = fmt.Sprintf("reconcile start failed; retries halted (%s): %v", reason, err) + delete(status.Metadata, "next_retry_time") + } + return m.store.SaveStatus(status) + } + m.resetRetryTracker(id) + if status.Metadata != nil { + delete(status.Metadata, "retry_attempts") + delete(status.Metadata, "next_retry_time") + delete(status.Metadata, "failure_reason") + delete(status.Metadata, "last_error") + delete(status.Metadata, retryTerminalMetadataKey) + delete(status.Metadata, retryTerminalReasonMetadataKey) } needsAction = true } else if workload.DesiredState == models.DesiredStateStopped && actualState == models.ActualStateRunning { @@ -638,6 +1348,68 @@ func (m *Manager) ReconcileWorkload(ctx context.Context, id string) error { return 
m.store.SaveStatus(status) } +func (m *Manager) handlePendingRecovery(ctx context.Context, workload *models.Workload, status *models.WorkloadStatus, rt runtime.Runtime) (bool, error) { + if workload == nil || status == nil { + return false, nil + } + ensureMetadata(status) + + now := time.Now() + pendingSince := now + if raw := strings.TrimSpace(status.Metadata[pendingSinceMetadataKey]); raw != "" { + if parsed, err := time.Parse(time.RFC3339, raw); err == nil { + pendingSince = parsed + } + } + if status.Metadata[pendingSinceMetadataKey] == "" { + status.Metadata[pendingSinceMetadataKey] = now.UTC().Format(time.RFC3339) + status.Message = "workload pending; waiting for startup completion" + status.UpdatedAt = now + return true, m.store.SaveStatus(status) + } + + if now.Sub(pendingSince) < defaultPendingRecoveryThreshold { + return false, nil + } + + m.logger.Warnf("Workload %s pending for %s, attempting recovery restart", workload.ID, now.Sub(pendingSince).Round(time.Second)) + status.Metadata[pendingRecoveryActionMetadataKey] = "restart" + status.Metadata[pendingRecoveryReasonMetadataKey] = fmt.Sprintf("pending for more than %s", defaultPendingRecoveryThreshold) + + _ = rt.Stop(ctx, workload.ID) + startErr := rt.Start(ctx, workload.ID) + if startErr == nil { + status.Metadata[pendingSinceMetadataKey] = now.UTC().Format(time.RFC3339) + status.Message = "recovered from prolonged pending via restart" + status.UpdatedAt = now + return true, m.store.SaveStatus(status) + } + + m.logger.Errorf("Pending recovery start failed for %s: %v", workload.ID, startErr) + deleteErr := rt.Delete(ctx, workload.ID) + if deleteErr != nil { + m.logger.Errorf("Pending recovery delete failed for %s: %v", workload.ID, deleteErr) + } + + status.ActualState = models.ActualStateFailed + status.UpdatedAt = now + status.Metadata[pendingRecoveryDeletedMetadataKey] = "true" + status.Metadata["last_error"] = fmt.Sprintf("pending recovery restart failed: %v", startErr) + if deleteErr != nil { + 
status.Metadata["last_delete_error"] = deleteErr.Error() + status.Message = fmt.Sprintf("pending timeout exceeded; restart failed (%v); delete also failed (%v); manual intervention required", startErr, deleteErr) + } else { + status.Message = fmt.Sprintf("pending timeout exceeded; restart failed (%v); workload deleted from runtime", startErr) + } + + // Prevent local reconcile loop from repeatedly attempting restart on this node. + workload.DesiredState = models.DesiredStateStopped + if err := m.store.SaveWorkload(workload); err != nil { + m.logger.Warnf("Failed to persist desired-state stop after pending recovery delete for %s: %v", workload.ID, err) + } + return true, m.store.SaveStatus(status) +} + // UpdateWorkloadMetrics updates gauge metrics with current workload counts func (m *Manager) UpdateWorkloadMetrics() error { if m.metrics == nil { diff --git a/internal/workload/manager_test.go b/internal/workload/manager_test.go index 687c4f3..716435e 100644 --- a/internal/workload/manager_test.go +++ b/internal/workload/manager_test.go @@ -9,11 +9,11 @@ import ( "testing" "time" - errors2 "github.com/persys/compute-agent/internal/errors" - "github.com/persys/compute-agent/internal/resources" - "github.com/persys/compute-agent/internal/retry" - "github.com/persys/compute-agent/internal/runtime" - "github.com/persys/compute-agent/pkg/models" + errors2 "github.com/persys-dev/compute-agent/internal/errors" + "github.com/persys-dev/compute-agent/internal/resources" + "github.com/persys-dev/compute-agent/internal/retry" + "github.com/persys-dev/compute-agent/internal/runtime" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -376,6 +376,132 @@ func TestApplyWorkload_SameRevision_Skipped(t *testing.T) { mockStore.AssertExpectations(t) } +func TestApplyWorkload_SameRevision_DesiredStateChange_StopWithoutRecreate(t *testing.T) { + mockStore := new(MockStore) + 
mockRuntime := new(MockRuntime) + + runtimeMgr := runtime.NewManager() + mockRuntime.On("Type").Return(models.WorkloadTypeContainer) + runtimeMgr.Register(mockRuntime) + + logger := logrus.New() + logger.SetLevel(logrus.FatalLevel) + manager := NewManager(mockStore, runtimeMgr, logger) + + existing := &models.Workload{ + ID: "test-workload", + Type: models.WorkloadTypeContainer, + RevisionID: "rev-1", + DesiredState: models.DesiredStateRunning, + Spec: map[string]interface{}{"image": "nginx:latest"}, + } + request := &models.Workload{ + ID: "test-workload", + Type: models.WorkloadTypeContainer, + RevisionID: "rev-1", + DesiredState: models.DesiredStateStopped, + Spec: map[string]interface{}{"image": "nginx:latest"}, + } + status := &models.WorkloadStatus{ + ID: "test-workload", + Type: models.WorkloadTypeContainer, + RevisionID: "rev-1", + DesiredState: models.DesiredStateRunning, + ActualState: models.ActualStateRunning, + Message: "running", + } + + mockStore.On("GetWorkload", "test-workload").Return(existing, nil) + mockStore.On("GetStatus", "test-workload").Return(status, nil) + mockStore.On("SaveWorkload", mock.MatchedBy(func(w *models.Workload) bool { + return w.ID == "test-workload" && w.DesiredState == models.DesiredStateStopped && w.RevisionID == "rev-1" + })).Return(nil).Once() + mockRuntime.On("Status", mock.Anything, "test-workload").Return(models.ActualStateRunning, "running", nil).Once() + mockRuntime.On("Stop", mock.Anything, "test-workload").Return(nil).Once() + mockRuntime.On("Status", mock.Anything, "test-workload").Return(models.ActualStateStopped, "stopped", nil).Once() + mockStore.On("SaveStatus", mock.MatchedBy(func(s *models.WorkloadStatus) bool { + return s.ID == "test-workload" && + s.RevisionID == "rev-1" && + s.DesiredState == models.DesiredStateStopped && + s.ActualState == models.ActualStateStopped + })).Return(nil).Once() + + got, skipped, err := manager.ApplyWorkload(context.Background(), request) + assert.NoError(t, err) + 
assert.False(t, skipped) + assert.Equal(t, models.DesiredStateStopped, got.DesiredState) + assert.Equal(t, models.ActualStateStopped, got.ActualState) + + mockRuntime.AssertNotCalled(t, "Create", mock.Anything, mock.Anything) + mockRuntime.AssertNotCalled(t, "Delete", mock.Anything, mock.Anything) + mockRuntime.AssertNotCalled(t, "Start", mock.Anything, mock.Anything) + mockStore.AssertExpectations(t) + mockRuntime.AssertExpectations(t) +} + +func TestApplyWorkload_SameRevision_DesiredStateChange_StartWithoutRecreate(t *testing.T) { + mockStore := new(MockStore) + mockRuntime := new(MockRuntime) + + runtimeMgr := runtime.NewManager() + mockRuntime.On("Type").Return(models.WorkloadTypeContainer) + runtimeMgr.Register(mockRuntime) + + logger := logrus.New() + logger.SetLevel(logrus.FatalLevel) + manager := NewManager(mockStore, runtimeMgr, logger) + + existing := &models.Workload{ + ID: "test-workload", + Type: models.WorkloadTypeContainer, + RevisionID: "rev-1", + DesiredState: models.DesiredStateStopped, + Spec: map[string]interface{}{"image": "nginx:latest"}, + } + request := &models.Workload{ + ID: "test-workload", + Type: models.WorkloadTypeContainer, + RevisionID: "rev-1", + DesiredState: models.DesiredStateRunning, + Spec: map[string]interface{}{"image": "nginx:latest"}, + } + status := &models.WorkloadStatus{ + ID: "test-workload", + Type: models.WorkloadTypeContainer, + RevisionID: "rev-1", + DesiredState: models.DesiredStateStopped, + ActualState: models.ActualStateStopped, + Message: "stopped", + } + + mockStore.On("GetWorkload", "test-workload").Return(existing, nil) + mockStore.On("GetStatus", "test-workload").Return(status, nil) + mockStore.On("SaveWorkload", mock.MatchedBy(func(w *models.Workload) bool { + return w.ID == "test-workload" && w.DesiredState == models.DesiredStateRunning && w.RevisionID == "rev-1" + })).Return(nil).Once() + mockRuntime.On("Status", mock.Anything, "test-workload").Return(models.ActualStateStopped, "stopped", nil).Once() + 
mockRuntime.On("Start", mock.Anything, "test-workload").Return(nil).Once() + mockRuntime.On("Status", mock.Anything, "test-workload").Return(models.ActualStateRunning, "running", nil).Once() + mockStore.On("SaveStatus", mock.MatchedBy(func(s *models.WorkloadStatus) bool { + return s.ID == "test-workload" && + s.RevisionID == "rev-1" && + s.DesiredState == models.DesiredStateRunning && + s.ActualState == models.ActualStateRunning + })).Return(nil).Once() + + got, skipped, err := manager.ApplyWorkload(context.Background(), request) + assert.NoError(t, err) + assert.False(t, skipped) + assert.Equal(t, models.DesiredStateRunning, got.DesiredState) + assert.Equal(t, models.ActualStateRunning, got.ActualState) + + mockRuntime.AssertNotCalled(t, "Create", mock.Anything, mock.Anything) + mockRuntime.AssertNotCalled(t, "Delete", mock.Anything, mock.Anything) + mockRuntime.AssertNotCalled(t, "Stop", mock.Anything, mock.Anything) + mockStore.AssertExpectations(t) + mockRuntime.AssertExpectations(t) +} + func TestApplyWorkload_SameRevisionFailedStatus_NotSkipped(t *testing.T) { mockStore := new(MockStore) mockRuntime := new(MockRuntime) diff --git a/internal/workload/retry_terminal_test.go b/internal/workload/retry_terminal_test.go new file mode 100644 index 0000000..aafa979 --- /dev/null +++ b/internal/workload/retry_terminal_test.go @@ -0,0 +1,26 @@ +package workload + +import ( + "testing" + + "github.com/persys-dev/compute-agent/internal/retry" + "github.com/persys-dev/compute-agent/pkg/models" +) + +func TestMarkRetryTerminalSetsMetadata(t *testing.T) { + status := &models.WorkloadStatus{} + tracker := retry.NewRetryTracker(retry.DefaultRetryPolicy()) + _, _ = tracker.RecordFailure(retry.FailureReasonRuntimeError, "runtime failed") + + markRetryTerminal(status, tracker, "max retries reached") + + if !isRetryTerminal(status) { + t.Fatalf("expected status to be marked retry terminal") + } + if got := status.Metadata[retryTerminalReasonMetadataKey]; got != "max retries 
reached" { + t.Fatalf("unexpected terminal reason: %q", got) + } + if got := status.Metadata["retry_attempts"]; got == "" { + t.Fatalf("expected retry_attempts metadata to be set") + } +} diff --git a/pkg/api/v1/agent.pb.go b/pkg/api/v1/agent.pb.go index 604e129..c8f6996 100644 --- a/pkg/api/v1/agent.pb.go +++ b/pkg/api/v1/agent.pb.go @@ -1035,18 +1035,19 @@ func (*WorkloadSpec_Compose) isWorkloadSpec_Spec() {} func (*WorkloadSpec_Vm) isWorkloadSpec_Spec() {} type ContainerSpec struct { - state protoimpl.MessageState `protogen:"open.v1"` - Image string `protobuf:"bytes,1,opt,name=image,proto3" json:"image,omitempty"` - Command []string `protobuf:"bytes,2,rep,name=command,proto3" json:"command,omitempty"` - Args []string `protobuf:"bytes,3,rep,name=args,proto3" json:"args,omitempty"` - Env map[string]string `protobuf:"bytes,4,rep,name=env,proto3" json:"env,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` - Volumes []*VolumeMount `protobuf:"bytes,5,rep,name=volumes,proto3" json:"volumes,omitempty"` - Ports []*PortMapping `protobuf:"bytes,6,rep,name=ports,proto3" json:"ports,omitempty"` - Resources *ResourceLimits `protobuf:"bytes,7,opt,name=resources,proto3" json:"resources,omitempty"` - RestartPolicy *RestartPolicy `protobuf:"bytes,8,opt,name=restart_policy,json=restartPolicy,proto3" json:"restart_policy,omitempty"` - Labels map[string]string `protobuf:"bytes,9,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + Image string `protobuf:"bytes,1,opt,name=image,proto3" json:"image,omitempty"` + Command []string `protobuf:"bytes,2,rep,name=command,proto3" json:"command,omitempty"` + Args []string `protobuf:"bytes,3,rep,name=args,proto3" json:"args,omitempty"` + Env map[string]string `protobuf:"bytes,4,rep,name=env,proto3" 
json:"env,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + Volumes []*VolumeMount `protobuf:"bytes,5,rep,name=volumes,proto3" json:"volumes,omitempty"` + Ports []*PortMapping `protobuf:"bytes,6,rep,name=ports,proto3" json:"ports,omitempty"` + Resources *ResourceLimits `protobuf:"bytes,7,opt,name=resources,proto3" json:"resources,omitempty"` + RestartPolicy *RestartPolicy `protobuf:"bytes,8,opt,name=restart_policy,json=restartPolicy,proto3" json:"restart_policy,omitempty"` + Labels map[string]string `protobuf:"bytes,9,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + ManagedVolumes []*ManagedVolumeSpec `protobuf:"bytes,10,rep,name=managed_volumes,json=managedVolumes,proto3" json:"managed_volumes,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *ContainerSpec) Reset() { @@ -1142,6 +1143,13 @@ func (x *ContainerSpec) GetLabels() map[string]string { return nil } +func (x *ContainerSpec) GetManagedVolumes() []*ManagedVolumeSpec { + if x != nil { + return x.ManagedVolumes + } + return nil +} + type ComposeSpec struct { state protoimpl.MessageState `protogen:"open.v1"` ProjectName string `protobuf:"bytes,1,opt,name=project_name,json=projectName,proto3" json:"project_name,omitempty"` @@ -1212,6 +1220,7 @@ type VMSpec struct { CloudInit string `protobuf:"bytes,6,opt,name=cloud_init,json=cloudInit,proto3" json:"cloud_init,omitempty"` // optional cloud-init user-data (YAML content) Metadata map[string]string `protobuf:"bytes,7,rep,name=metadata,proto3" json:"metadata,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` CloudInitConfig *CloudInitConfig `protobuf:"bytes,8,opt,name=cloud_init_config,json=cloudInitConfig,proto3" json:"cloud_init_config,omitempty"` // advanced cloud-init settings + ManagedVolumes []*ManagedVolumeSpec 
`protobuf:"bytes,9,rep,name=managed_volumes,json=managedVolumes,proto3" json:"managed_volumes,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -1302,6 +1311,13 @@ func (x *VMSpec) GetCloudInitConfig() *CloudInitConfig { return nil } +func (x *VMSpec) GetManagedVolumes() []*ManagedVolumeSpec { + if x != nil { + return x.ManagedVolumes + } + return nil +} + type CloudInitConfig struct { state protoimpl.MessageState `protogen:"open.v1"` UserData string `protobuf:"bytes,1,opt,name=user_data,json=userData,proto3" json:"user_data,omitempty"` // cloud-init user-data script @@ -1370,6 +1386,106 @@ func (x *CloudInitConfig) GetVendorData() string { return "" } +type ManagedVolumeSpec struct { + state protoimpl.MessageState `protogen:"open.v1"` + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Driver string `protobuf:"bytes,2,opt,name=driver,proto3" json:"driver,omitempty"` // local|nfs|ceph-rbd + SizeGb int64 `protobuf:"varint,3,opt,name=size_gb,json=sizeGb,proto3" json:"size_gb,omitempty"` + AccessMode string `protobuf:"bytes,4,opt,name=access_mode,json=accessMode,proto3" json:"access_mode,omitempty"` + FsType string `protobuf:"bytes,5,opt,name=fs_type,json=fsType,proto3" json:"fs_type,omitempty"` + MountPath string `protobuf:"bytes,6,opt,name=mount_path,json=mountPath,proto3" json:"mount_path,omitempty"` + ReadOnly bool `protobuf:"varint,7,opt,name=read_only,json=readOnly,proto3" json:"read_only,omitempty"` + RetainPolicy string `protobuf:"bytes,8,opt,name=retain_policy,json=retainPolicy,proto3" json:"retain_policy,omitempty"` // Delete|Retain + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ManagedVolumeSpec) Reset() { + *x = ManagedVolumeSpec{} + mi := &file_agent_proto_msgTypes[18] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ManagedVolumeSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func 
(*ManagedVolumeSpec) ProtoMessage() {} + +func (x *ManagedVolumeSpec) ProtoReflect() protoreflect.Message { + mi := &file_agent_proto_msgTypes[18] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ManagedVolumeSpec.ProtoReflect.Descriptor instead. +func (*ManagedVolumeSpec) Descriptor() ([]byte, []int) { + return file_agent_proto_rawDescGZIP(), []int{18} +} + +func (x *ManagedVolumeSpec) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *ManagedVolumeSpec) GetDriver() string { + if x != nil { + return x.Driver + } + return "" +} + +func (x *ManagedVolumeSpec) GetSizeGb() int64 { + if x != nil { + return x.SizeGb + } + return 0 +} + +func (x *ManagedVolumeSpec) GetAccessMode() string { + if x != nil { + return x.AccessMode + } + return "" +} + +func (x *ManagedVolumeSpec) GetFsType() string { + if x != nil { + return x.FsType + } + return "" +} + +func (x *ManagedVolumeSpec) GetMountPath() string { + if x != nil { + return x.MountPath + } + return "" +} + +func (x *ManagedVolumeSpec) GetReadOnly() bool { + if x != nil { + return x.ReadOnly + } + return false +} + +func (x *ManagedVolumeSpec) GetRetainPolicy() string { + if x != nil { + return x.RetainPolicy + } + return "" +} + type VolumeMount struct { state protoimpl.MessageState `protogen:"open.v1"` HostPath string `protobuf:"bytes,1,opt,name=host_path,json=hostPath,proto3" json:"host_path,omitempty"` @@ -1381,7 +1497,7 @@ type VolumeMount struct { func (x *VolumeMount) Reset() { *x = VolumeMount{} - mi := &file_agent_proto_msgTypes[18] + mi := &file_agent_proto_msgTypes[19] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1393,7 +1509,7 @@ func (x *VolumeMount) String() string { func (*VolumeMount) ProtoMessage() {} func (x *VolumeMount) ProtoReflect() protoreflect.Message { - mi := 
&file_agent_proto_msgTypes[18] + mi := &file_agent_proto_msgTypes[19] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1406,7 +1522,7 @@ func (x *VolumeMount) ProtoReflect() protoreflect.Message { // Deprecated: Use VolumeMount.ProtoReflect.Descriptor instead. func (*VolumeMount) Descriptor() ([]byte, []int) { - return file_agent_proto_rawDescGZIP(), []int{18} + return file_agent_proto_rawDescGZIP(), []int{19} } func (x *VolumeMount) GetHostPath() string { @@ -1441,7 +1557,7 @@ type PortMapping struct { func (x *PortMapping) Reset() { *x = PortMapping{} - mi := &file_agent_proto_msgTypes[19] + mi := &file_agent_proto_msgTypes[20] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1453,7 +1569,7 @@ func (x *PortMapping) String() string { func (*PortMapping) ProtoMessage() {} func (x *PortMapping) ProtoReflect() protoreflect.Message { - mi := &file_agent_proto_msgTypes[19] + mi := &file_agent_proto_msgTypes[20] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1466,7 +1582,7 @@ func (x *PortMapping) ProtoReflect() protoreflect.Message { // Deprecated: Use PortMapping.ProtoReflect.Descriptor instead. 
func (*PortMapping) Descriptor() ([]byte, []int) { - return file_agent_proto_rawDescGZIP(), []int{19} + return file_agent_proto_rawDescGZIP(), []int{20} } func (x *PortMapping) GetHostPort() int32 { @@ -1501,7 +1617,7 @@ type ResourceLimits struct { func (x *ResourceLimits) Reset() { *x = ResourceLimits{} - mi := &file_agent_proto_msgTypes[20] + mi := &file_agent_proto_msgTypes[21] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1513,7 +1629,7 @@ func (x *ResourceLimits) String() string { func (*ResourceLimits) ProtoMessage() {} func (x *ResourceLimits) ProtoReflect() protoreflect.Message { - mi := &file_agent_proto_msgTypes[20] + mi := &file_agent_proto_msgTypes[21] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1526,7 +1642,7 @@ func (x *ResourceLimits) ProtoReflect() protoreflect.Message { // Deprecated: Use ResourceLimits.ProtoReflect.Descriptor instead. func (*ResourceLimits) Descriptor() ([]byte, []int) { - return file_agent_proto_rawDescGZIP(), []int{20} + return file_agent_proto_rawDescGZIP(), []int{21} } func (x *ResourceLimits) GetCpuShares() int64 { @@ -1560,7 +1676,7 @@ type RestartPolicy struct { func (x *RestartPolicy) Reset() { *x = RestartPolicy{} - mi := &file_agent_proto_msgTypes[21] + mi := &file_agent_proto_msgTypes[22] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1572,7 +1688,7 @@ func (x *RestartPolicy) String() string { func (*RestartPolicy) ProtoMessage() {} func (x *RestartPolicy) ProtoReflect() protoreflect.Message { - mi := &file_agent_proto_msgTypes[21] + mi := &file_agent_proto_msgTypes[22] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1585,7 +1701,7 @@ func (x *RestartPolicy) ProtoReflect() protoreflect.Message { // Deprecated: Use RestartPolicy.ProtoReflect.Descriptor instead. 
func (*RestartPolicy) Descriptor() ([]byte, []int) { - return file_agent_proto_rawDescGZIP(), []int{21} + return file_agent_proto_rawDescGZIP(), []int{22} } func (x *RestartPolicy) GetPolicy() string { @@ -1616,7 +1732,7 @@ type DiskConfig struct { func (x *DiskConfig) Reset() { *x = DiskConfig{} - mi := &file_agent_proto_msgTypes[22] + mi := &file_agent_proto_msgTypes[23] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1628,7 +1744,7 @@ func (x *DiskConfig) String() string { func (*DiskConfig) ProtoMessage() {} func (x *DiskConfig) ProtoReflect() protoreflect.Message { - mi := &file_agent_proto_msgTypes[22] + mi := &file_agent_proto_msgTypes[23] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1641,7 +1757,7 @@ func (x *DiskConfig) ProtoReflect() protoreflect.Message { // Deprecated: Use DiskConfig.ProtoReflect.Descriptor instead. func (*DiskConfig) Descriptor() ([]byte, []int) { - return file_agent_proto_rawDescGZIP(), []int{22} + return file_agent_proto_rawDescGZIP(), []int{23} } func (x *DiskConfig) GetPath() string { @@ -1697,7 +1813,7 @@ type NetworkConfig struct { func (x *NetworkConfig) Reset() { *x = NetworkConfig{} - mi := &file_agent_proto_msgTypes[23] + mi := &file_agent_proto_msgTypes[24] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1709,7 +1825,7 @@ func (x *NetworkConfig) String() string { func (*NetworkConfig) ProtoMessage() {} func (x *NetworkConfig) ProtoReflect() protoreflect.Message { - mi := &file_agent_proto_msgTypes[23] + mi := &file_agent_proto_msgTypes[24] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1722,7 +1838,7 @@ func (x *NetworkConfig) ProtoReflect() protoreflect.Message { // Deprecated: Use NetworkConfig.ProtoReflect.Descriptor instead. 
func (*NetworkConfig) Descriptor() ([]byte, []int) { - return file_agent_proto_rawDescGZIP(), []int{23} + return file_agent_proto_rawDescGZIP(), []int{24} } func (x *NetworkConfig) GetNetwork() string { @@ -1757,13 +1873,14 @@ type WorkloadStatus struct { CreatedAt int64 `protobuf:"varint,7,opt,name=created_at,json=createdAt,proto3" json:"created_at,omitempty"` UpdatedAt int64 `protobuf:"varint,8,opt,name=updated_at,json=updatedAt,proto3" json:"updated_at,omitempty"` Metadata map[string]string `protobuf:"bytes,9,rep,name=metadata,proto3" json:"metadata,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + Usage *WorkloadUsageSnapshot `protobuf:"bytes,10,opt,name=usage,proto3" json:"usage,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } func (x *WorkloadStatus) Reset() { *x = WorkloadStatus{} - mi := &file_agent_proto_msgTypes[24] + mi := &file_agent_proto_msgTypes[25] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1775,7 +1892,7 @@ func (x *WorkloadStatus) String() string { func (*WorkloadStatus) ProtoMessage() {} func (x *WorkloadStatus) ProtoReflect() protoreflect.Message { - mi := &file_agent_proto_msgTypes[24] + mi := &file_agent_proto_msgTypes[25] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1788,7 +1905,7 @@ func (x *WorkloadStatus) ProtoReflect() protoreflect.Message { // Deprecated: Use WorkloadStatus.ProtoReflect.Descriptor instead. 
func (*WorkloadStatus) Descriptor() ([]byte, []int) { - return file_agent_proto_rawDescGZIP(), []int{24} + return file_agent_proto_rawDescGZIP(), []int{25} } func (x *WorkloadStatus) GetId() string { @@ -1854,6 +1971,129 @@ func (x *WorkloadStatus) GetMetadata() map[string]string { return nil } +func (x *WorkloadStatus) GetUsage() *WorkloadUsageSnapshot { + if x != nil { + return x.Usage + } + return nil +} + +type WorkloadUsageSnapshot struct { + state protoimpl.MessageState `protogen:"open.v1"` + WorkloadId string `protobuf:"bytes,1,opt,name=workload_id,json=workloadId,proto3" json:"workload_id,omitempty"` + Type WorkloadType `protobuf:"varint,2,opt,name=type,proto3,enum=persys.agent.v1.WorkloadType" json:"type,omitempty"` + CpuPercent float64 `protobuf:"fixed64,3,opt,name=cpu_percent,json=cpuPercent,proto3" json:"cpu_percent,omitempty"` + MemoryBytes int64 `protobuf:"varint,4,opt,name=memory_bytes,json=memoryBytes,proto3" json:"memory_bytes,omitempty"` + DiskReadBytes int64 `protobuf:"varint,5,opt,name=disk_read_bytes,json=diskReadBytes,proto3" json:"disk_read_bytes,omitempty"` + DiskWriteBytes int64 `protobuf:"varint,6,opt,name=disk_write_bytes,json=diskWriteBytes,proto3" json:"disk_write_bytes,omitempty"` + NetRxBytes int64 `protobuf:"varint,7,opt,name=net_rx_bytes,json=netRxBytes,proto3" json:"net_rx_bytes,omitempty"` + NetTxBytes int64 `protobuf:"varint,8,opt,name=net_tx_bytes,json=netTxBytes,proto3" json:"net_tx_bytes,omitempty"` + CollectedAt int64 `protobuf:"varint,9,opt,name=collected_at,json=collectedAt,proto3" json:"collected_at,omitempty"` // unix timestamp + Source string `protobuf:"bytes,10,opt,name=source,proto3" json:"source,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *WorkloadUsageSnapshot) Reset() { + *x = WorkloadUsageSnapshot{} + mi := &file_agent_proto_msgTypes[26] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *WorkloadUsageSnapshot) 
String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*WorkloadUsageSnapshot) ProtoMessage() {} + +func (x *WorkloadUsageSnapshot) ProtoReflect() protoreflect.Message { + mi := &file_agent_proto_msgTypes[26] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use WorkloadUsageSnapshot.ProtoReflect.Descriptor instead. +func (*WorkloadUsageSnapshot) Descriptor() ([]byte, []int) { + return file_agent_proto_rawDescGZIP(), []int{26} +} + +func (x *WorkloadUsageSnapshot) GetWorkloadId() string { + if x != nil { + return x.WorkloadId + } + return "" +} + +func (x *WorkloadUsageSnapshot) GetType() WorkloadType { + if x != nil { + return x.Type + } + return WorkloadType_WORKLOAD_TYPE_UNSPECIFIED +} + +func (x *WorkloadUsageSnapshot) GetCpuPercent() float64 { + if x != nil { + return x.CpuPercent + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetMemoryBytes() int64 { + if x != nil { + return x.MemoryBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetDiskReadBytes() int64 { + if x != nil { + return x.DiskReadBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetDiskWriteBytes() int64 { + if x != nil { + return x.DiskWriteBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetNetRxBytes() int64 { + if x != nil { + return x.NetRxBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetNetTxBytes() int64 { + if x != nil { + return x.NetTxBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetCollectedAt() int64 { + if x != nil { + return x.CollectedAt + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetSource() string { + if x != nil { + return x.Source + } + return "" +} + var File_agent_proto protoreflect.FileDescriptor const file_agent_proto_rawDesc = "" + @@ -1922,7 +2162,7 @@ const file_agent_proto_rawDesc = "" + "\tcontainer\x18\x01 
\x01(\v2\x1e.persys.agent.v1.ContainerSpecH\x00R\tcontainer\x128\n" + "\acompose\x18\x02 \x01(\v2\x1c.persys.agent.v1.ComposeSpecH\x00R\acompose\x12)\n" + "\x02vm\x18\x03 \x01(\v2\x17.persys.agent.v1.VMSpecH\x00R\x02vmB\x06\n" + - "\x04spec\"\xb7\x04\n" + + "\x04spec\"\x84\x05\n" + "\rContainerSpec\x12\x14\n" + "\x05image\x18\x01 \x01(\tR\x05image\x12\x18\n" + "\acommand\x18\x02 \x03(\tR\acommand\x12\x12\n" + @@ -1932,7 +2172,9 @@ const file_agent_proto_rawDesc = "" + "\x05ports\x18\x06 \x03(\v2\x1c.persys.agent.v1.PortMappingR\x05ports\x12=\n" + "\tresources\x18\a \x01(\v2\x1f.persys.agent.v1.ResourceLimitsR\tresources\x12E\n" + "\x0erestart_policy\x18\b \x01(\v2\x1e.persys.agent.v1.RestartPolicyR\rrestartPolicy\x12B\n" + - "\x06labels\x18\t \x03(\v2*.persys.agent.v1.ContainerSpec.LabelsEntryR\x06labels\x1a6\n" + + "\x06labels\x18\t \x03(\v2*.persys.agent.v1.ContainerSpec.LabelsEntryR\x06labels\x12K\n" + + "\x0fmanaged_volumes\x18\n" + + " \x03(\v2\".persys.agent.v1.ManagedVolumeSpecR\x0emanagedVolumes\x1a6\n" + "\bEnvEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\x1a9\n" + @@ -1945,7 +2187,7 @@ const file_agent_proto_rawDesc = "" + "\x03env\x18\x03 \x03(\v2%.persys.agent.v1.ComposeSpec.EnvEntryR\x03env\x1a6\n" + "\bEnvEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + - "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\xab\x03\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\xf8\x03\n" + "\x06VMSpec\x12\x12\n" + "\x04name\x18\x01 \x01(\tR\x04name\x12\x14\n" + "\x05vcpus\x18\x02 \x01(\x05R\x05vcpus\x12\x1b\n" + @@ -1955,7 +2197,8 @@ const file_agent_proto_rawDesc = "" + "\n" + "cloud_init\x18\x06 \x01(\tR\tcloudInit\x12A\n" + "\bmetadata\x18\a \x03(\v2%.persys.agent.v1.VMSpec.MetadataEntryR\bmetadata\x12L\n" + - "\x11cloud_init_config\x18\b \x01(\v2 .persys.agent.v1.CloudInitConfigR\x0fcloudInitConfig\x1a;\n" + + "\x11cloud_init_config\x18\b \x01(\v2 
.persys.agent.v1.CloudInitConfigR\x0fcloudInitConfig\x12K\n" + + "\x0fmanaged_volumes\x18\t \x03(\v2\".persys.agent.v1.ManagedVolumeSpecR\x0emanagedVolumes\x1a;\n" + "\rMetadataEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x93\x01\n" + @@ -1964,7 +2207,18 @@ const file_agent_proto_rawDesc = "" + "\tmeta_data\x18\x02 \x01(\tR\bmetaData\x12%\n" + "\x0enetwork_config\x18\x03 \x01(\tR\rnetworkConfig\x12\x1f\n" + "\vvendor_data\x18\x04 \x01(\tR\n" + - "vendorData\"n\n" + + "vendorData\"\xf3\x01\n" + + "\x11ManagedVolumeSpec\x12\x12\n" + + "\x04name\x18\x01 \x01(\tR\x04name\x12\x16\n" + + "\x06driver\x18\x02 \x01(\tR\x06driver\x12\x17\n" + + "\asize_gb\x18\x03 \x01(\x03R\x06sizeGb\x12\x1f\n" + + "\vaccess_mode\x18\x04 \x01(\tR\n" + + "accessMode\x12\x17\n" + + "\afs_type\x18\x05 \x01(\tR\x06fsType\x12\x1d\n" + + "\n" + + "mount_path\x18\x06 \x01(\tR\tmountPath\x12\x1b\n" + + "\tread_only\x18\a \x01(\bR\breadOnly\x12#\n" + + "\rretain_policy\x18\b \x01(\tR\fretainPolicy\"n\n" + "\vVolumeMount\x12\x1b\n" + "\thost_path\x18\x01 \x01(\tR\bhostPath\x12%\n" + "\x0econtainer_path\x18\x02 \x01(\tR\rcontainerPath\x12\x1b\n" + @@ -1994,7 +2248,7 @@ const file_agent_proto_rawDesc = "" + "\vmac_address\x18\x02 \x01(\tR\n" + "macAddress\x12\x1d\n" + "\n" + - "ip_address\x18\x03 \x01(\tR\tipAddress\"\xd9\x03\n" + + "ip_address\x18\x03 \x01(\tR\tipAddress\"\x97\x04\n" + "\x0eWorkloadStatus\x12\x0e\n" + "\x02id\x18\x01 \x01(\tR\x02id\x121\n" + "\x04type\x18\x02 \x01(\x0e2\x1d.persys.agent.v1.WorkloadTypeR\x04type\x12\x1f\n" + @@ -2007,10 +2261,28 @@ const file_agent_proto_rawDesc = "" + "created_at\x18\a \x01(\x03R\tcreatedAt\x12\x1d\n" + "\n" + "updated_at\x18\b \x01(\x03R\tupdatedAt\x12I\n" + - "\bmetadata\x18\t \x03(\v2-.persys.agent.v1.WorkloadStatus.MetadataEntryR\bmetadata\x1a;\n" + + "\bmetadata\x18\t \x03(\v2-.persys.agent.v1.WorkloadStatus.MetadataEntryR\bmetadata\x12<\n" + + "\x05usage\x18\n" + + " 
\x01(\v2&.persys.agent.v1.WorkloadUsageSnapshotR\x05usage\x1a;\n" + "\rMetadataEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + - "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01*{\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x80\x03\n" + + "\x15WorkloadUsageSnapshot\x12\x1f\n" + + "\vworkload_id\x18\x01 \x01(\tR\n" + + "workloadId\x121\n" + + "\x04type\x18\x02 \x01(\x0e2\x1d.persys.agent.v1.WorkloadTypeR\x04type\x12\x1f\n" + + "\vcpu_percent\x18\x03 \x01(\x01R\n" + + "cpuPercent\x12!\n" + + "\fmemory_bytes\x18\x04 \x01(\x03R\vmemoryBytes\x12&\n" + + "\x0fdisk_read_bytes\x18\x05 \x01(\x03R\rdiskReadBytes\x12(\n" + + "\x10disk_write_bytes\x18\x06 \x01(\x03R\x0ediskWriteBytes\x12 \n" + + "\fnet_rx_bytes\x18\a \x01(\x03R\n" + + "netRxBytes\x12 \n" + + "\fnet_tx_bytes\x18\b \x01(\x03R\n" + + "netTxBytes\x12!\n" + + "\fcollected_at\x18\t \x01(\x03R\vcollectedAt\x12\x16\n" + + "\x06source\x18\n" + + " \x01(\tR\x06source*{\n" + "\fWorkloadType\x12\x1d\n" + "\x19WORKLOAD_TYPE_UNSPECIFIED\x10\x00\x12\x1b\n" + "\x17WORKLOAD_TYPE_CONTAINER\x10\x01\x12\x19\n" + @@ -2033,7 +2305,7 @@ const file_agent_proto_rawDesc = "" + "\x11GetWorkloadStatus\x12).persys.agent.v1.GetWorkloadStatusRequest\x1a*.persys.agent.v1.GetWorkloadStatusResponse\x12^\n" + "\rListWorkloads\x12%.persys.agent.v1.ListWorkloadsRequest\x1a&.persys.agent.v1.ListWorkloadsResponse\x12X\n" + "\vHealthCheck\x12#.persys.agent.v1.HealthCheckRequest\x1a$.persys.agent.v1.HealthCheckResponse\x12X\n" + - "\vListActions\x12#.persys.agent.v1.ListActionsRequest\x1a$.persys.agent.v1.ListActionsResponseB/Z-github.com/persys/compute-agent/pkg/api/v1;v1b\x06proto3" + "\vListActions\x12#.persys.agent.v1.ListActionsRequest\x1a$.persys.agent.v1.ListActionsResponseB3Z1github.com/persys-dev/compute-agent/pkg/api/v1;v1b\x06proto3" var ( file_agent_proto_rawDescOnce sync.Once @@ -2048,7 +2320,7 @@ func file_agent_proto_rawDescGZIP() []byte { } var file_agent_proto_enumTypes = make([]protoimpl.EnumInfo, 3) -var 
file_agent_proto_msgTypes = make([]protoimpl.MessageInfo, 31) +var file_agent_proto_msgTypes = make([]protoimpl.MessageInfo, 33) var file_agent_proto_goTypes = []any{ (WorkloadType)(0), // 0: persys.agent.v1.WorkloadType (DesiredState)(0), // 1: persys.agent.v1.DesiredState @@ -2071,65 +2343,71 @@ var file_agent_proto_goTypes = []any{ (*ComposeSpec)(nil), // 18: persys.agent.v1.ComposeSpec (*VMSpec)(nil), // 19: persys.agent.v1.VMSpec (*CloudInitConfig)(nil), // 20: persys.agent.v1.CloudInitConfig - (*VolumeMount)(nil), // 21: persys.agent.v1.VolumeMount - (*PortMapping)(nil), // 22: persys.agent.v1.PortMapping - (*ResourceLimits)(nil), // 23: persys.agent.v1.ResourceLimits - (*RestartPolicy)(nil), // 24: persys.agent.v1.RestartPolicy - (*DiskConfig)(nil), // 25: persys.agent.v1.DiskConfig - (*NetworkConfig)(nil), // 26: persys.agent.v1.NetworkConfig - (*WorkloadStatus)(nil), // 27: persys.agent.v1.WorkloadStatus - nil, // 28: persys.agent.v1.HealthCheckResponse.RuntimeStatusEntry - nil, // 29: persys.agent.v1.ContainerSpec.EnvEntry - nil, // 30: persys.agent.v1.ContainerSpec.LabelsEntry - nil, // 31: persys.agent.v1.ComposeSpec.EnvEntry - nil, // 32: persys.agent.v1.VMSpec.MetadataEntry - nil, // 33: persys.agent.v1.WorkloadStatus.MetadataEntry + (*ManagedVolumeSpec)(nil), // 21: persys.agent.v1.ManagedVolumeSpec + (*VolumeMount)(nil), // 22: persys.agent.v1.VolumeMount + (*PortMapping)(nil), // 23: persys.agent.v1.PortMapping + (*ResourceLimits)(nil), // 24: persys.agent.v1.ResourceLimits + (*RestartPolicy)(nil), // 25: persys.agent.v1.RestartPolicy + (*DiskConfig)(nil), // 26: persys.agent.v1.DiskConfig + (*NetworkConfig)(nil), // 27: persys.agent.v1.NetworkConfig + (*WorkloadStatus)(nil), // 28: persys.agent.v1.WorkloadStatus + (*WorkloadUsageSnapshot)(nil), // 29: persys.agent.v1.WorkloadUsageSnapshot + nil, // 30: persys.agent.v1.HealthCheckResponse.RuntimeStatusEntry + nil, // 31: persys.agent.v1.ContainerSpec.EnvEntry + nil, // 32: 
persys.agent.v1.ContainerSpec.LabelsEntry + nil, // 33: persys.agent.v1.ComposeSpec.EnvEntry + nil, // 34: persys.agent.v1.VMSpec.MetadataEntry + nil, // 35: persys.agent.v1.WorkloadStatus.MetadataEntry } var file_agent_proto_depIdxs = []int32{ 0, // 0: persys.agent.v1.ApplyWorkloadRequest.type:type_name -> persys.agent.v1.WorkloadType 1, // 1: persys.agent.v1.ApplyWorkloadRequest.desired_state:type_name -> persys.agent.v1.DesiredState 16, // 2: persys.agent.v1.ApplyWorkloadRequest.spec:type_name -> persys.agent.v1.WorkloadSpec - 27, // 3: persys.agent.v1.ApplyWorkloadResponse.status:type_name -> persys.agent.v1.WorkloadStatus - 27, // 4: persys.agent.v1.GetWorkloadStatusResponse.status:type_name -> persys.agent.v1.WorkloadStatus + 28, // 3: persys.agent.v1.ApplyWorkloadResponse.status:type_name -> persys.agent.v1.WorkloadStatus + 28, // 4: persys.agent.v1.GetWorkloadStatusResponse.status:type_name -> persys.agent.v1.WorkloadStatus 0, // 5: persys.agent.v1.ListWorkloadsRequest.type:type_name -> persys.agent.v1.WorkloadType - 27, // 6: persys.agent.v1.ListWorkloadsResponse.workloads:type_name -> persys.agent.v1.WorkloadStatus - 28, // 7: persys.agent.v1.HealthCheckResponse.runtime_status:type_name -> persys.agent.v1.HealthCheckResponse.RuntimeStatusEntry + 28, // 6: persys.agent.v1.ListWorkloadsResponse.workloads:type_name -> persys.agent.v1.WorkloadStatus + 30, // 7: persys.agent.v1.HealthCheckResponse.runtime_status:type_name -> persys.agent.v1.HealthCheckResponse.RuntimeStatusEntry 14, // 8: persys.agent.v1.ListActionsResponse.actions:type_name -> persys.agent.v1.AgentAction 17, // 9: persys.agent.v1.WorkloadSpec.container:type_name -> persys.agent.v1.ContainerSpec 18, // 10: persys.agent.v1.WorkloadSpec.compose:type_name -> persys.agent.v1.ComposeSpec 19, // 11: persys.agent.v1.WorkloadSpec.vm:type_name -> persys.agent.v1.VMSpec - 29, // 12: persys.agent.v1.ContainerSpec.env:type_name -> persys.agent.v1.ContainerSpec.EnvEntry - 21, // 13: 
persys.agent.v1.ContainerSpec.volumes:type_name -> persys.agent.v1.VolumeMount - 22, // 14: persys.agent.v1.ContainerSpec.ports:type_name -> persys.agent.v1.PortMapping - 23, // 15: persys.agent.v1.ContainerSpec.resources:type_name -> persys.agent.v1.ResourceLimits - 24, // 16: persys.agent.v1.ContainerSpec.restart_policy:type_name -> persys.agent.v1.RestartPolicy - 30, // 17: persys.agent.v1.ContainerSpec.labels:type_name -> persys.agent.v1.ContainerSpec.LabelsEntry - 31, // 18: persys.agent.v1.ComposeSpec.env:type_name -> persys.agent.v1.ComposeSpec.EnvEntry - 25, // 19: persys.agent.v1.VMSpec.disks:type_name -> persys.agent.v1.DiskConfig - 26, // 20: persys.agent.v1.VMSpec.networks:type_name -> persys.agent.v1.NetworkConfig - 32, // 21: persys.agent.v1.VMSpec.metadata:type_name -> persys.agent.v1.VMSpec.MetadataEntry - 20, // 22: persys.agent.v1.VMSpec.cloud_init_config:type_name -> persys.agent.v1.CloudInitConfig - 0, // 23: persys.agent.v1.WorkloadStatus.type:type_name -> persys.agent.v1.WorkloadType - 1, // 24: persys.agent.v1.WorkloadStatus.desired_state:type_name -> persys.agent.v1.DesiredState - 2, // 25: persys.agent.v1.WorkloadStatus.actual_state:type_name -> persys.agent.v1.ActualState - 33, // 26: persys.agent.v1.WorkloadStatus.metadata:type_name -> persys.agent.v1.WorkloadStatus.MetadataEntry - 3, // 27: persys.agent.v1.AgentService.ApplyWorkload:input_type -> persys.agent.v1.ApplyWorkloadRequest - 5, // 28: persys.agent.v1.AgentService.DeleteWorkload:input_type -> persys.agent.v1.DeleteWorkloadRequest - 7, // 29: persys.agent.v1.AgentService.GetWorkloadStatus:input_type -> persys.agent.v1.GetWorkloadStatusRequest - 9, // 30: persys.agent.v1.AgentService.ListWorkloads:input_type -> persys.agent.v1.ListWorkloadsRequest - 11, // 31: persys.agent.v1.AgentService.HealthCheck:input_type -> persys.agent.v1.HealthCheckRequest - 13, // 32: persys.agent.v1.AgentService.ListActions:input_type -> persys.agent.v1.ListActionsRequest - 4, // 33: 
persys.agent.v1.AgentService.ApplyWorkload:output_type -> persys.agent.v1.ApplyWorkloadResponse - 6, // 34: persys.agent.v1.AgentService.DeleteWorkload:output_type -> persys.agent.v1.DeleteWorkloadResponse - 8, // 35: persys.agent.v1.AgentService.GetWorkloadStatus:output_type -> persys.agent.v1.GetWorkloadStatusResponse - 10, // 36: persys.agent.v1.AgentService.ListWorkloads:output_type -> persys.agent.v1.ListWorkloadsResponse - 12, // 37: persys.agent.v1.AgentService.HealthCheck:output_type -> persys.agent.v1.HealthCheckResponse - 15, // 38: persys.agent.v1.AgentService.ListActions:output_type -> persys.agent.v1.ListActionsResponse - 33, // [33:39] is the sub-list for method output_type - 27, // [27:33] is the sub-list for method input_type - 27, // [27:27] is the sub-list for extension type_name - 27, // [27:27] is the sub-list for extension extendee - 0, // [0:27] is the sub-list for field type_name + 31, // 12: persys.agent.v1.ContainerSpec.env:type_name -> persys.agent.v1.ContainerSpec.EnvEntry + 22, // 13: persys.agent.v1.ContainerSpec.volumes:type_name -> persys.agent.v1.VolumeMount + 23, // 14: persys.agent.v1.ContainerSpec.ports:type_name -> persys.agent.v1.PortMapping + 24, // 15: persys.agent.v1.ContainerSpec.resources:type_name -> persys.agent.v1.ResourceLimits + 25, // 16: persys.agent.v1.ContainerSpec.restart_policy:type_name -> persys.agent.v1.RestartPolicy + 32, // 17: persys.agent.v1.ContainerSpec.labels:type_name -> persys.agent.v1.ContainerSpec.LabelsEntry + 21, // 18: persys.agent.v1.ContainerSpec.managed_volumes:type_name -> persys.agent.v1.ManagedVolumeSpec + 33, // 19: persys.agent.v1.ComposeSpec.env:type_name -> persys.agent.v1.ComposeSpec.EnvEntry + 26, // 20: persys.agent.v1.VMSpec.disks:type_name -> persys.agent.v1.DiskConfig + 27, // 21: persys.agent.v1.VMSpec.networks:type_name -> persys.agent.v1.NetworkConfig + 34, // 22: persys.agent.v1.VMSpec.metadata:type_name -> persys.agent.v1.VMSpec.MetadataEntry + 20, // 23: 
persys.agent.v1.VMSpec.cloud_init_config:type_name -> persys.agent.v1.CloudInitConfig + 21, // 24: persys.agent.v1.VMSpec.managed_volumes:type_name -> persys.agent.v1.ManagedVolumeSpec + 0, // 25: persys.agent.v1.WorkloadStatus.type:type_name -> persys.agent.v1.WorkloadType + 1, // 26: persys.agent.v1.WorkloadStatus.desired_state:type_name -> persys.agent.v1.DesiredState + 2, // 27: persys.agent.v1.WorkloadStatus.actual_state:type_name -> persys.agent.v1.ActualState + 35, // 28: persys.agent.v1.WorkloadStatus.metadata:type_name -> persys.agent.v1.WorkloadStatus.MetadataEntry + 29, // 29: persys.agent.v1.WorkloadStatus.usage:type_name -> persys.agent.v1.WorkloadUsageSnapshot + 0, // 30: persys.agent.v1.WorkloadUsageSnapshot.type:type_name -> persys.agent.v1.WorkloadType + 3, // 31: persys.agent.v1.AgentService.ApplyWorkload:input_type -> persys.agent.v1.ApplyWorkloadRequest + 5, // 32: persys.agent.v1.AgentService.DeleteWorkload:input_type -> persys.agent.v1.DeleteWorkloadRequest + 7, // 33: persys.agent.v1.AgentService.GetWorkloadStatus:input_type -> persys.agent.v1.GetWorkloadStatusRequest + 9, // 34: persys.agent.v1.AgentService.ListWorkloads:input_type -> persys.agent.v1.ListWorkloadsRequest + 11, // 35: persys.agent.v1.AgentService.HealthCheck:input_type -> persys.agent.v1.HealthCheckRequest + 13, // 36: persys.agent.v1.AgentService.ListActions:input_type -> persys.agent.v1.ListActionsRequest + 4, // 37: persys.agent.v1.AgentService.ApplyWorkload:output_type -> persys.agent.v1.ApplyWorkloadResponse + 6, // 38: persys.agent.v1.AgentService.DeleteWorkload:output_type -> persys.agent.v1.DeleteWorkloadResponse + 8, // 39: persys.agent.v1.AgentService.GetWorkloadStatus:output_type -> persys.agent.v1.GetWorkloadStatusResponse + 10, // 40: persys.agent.v1.AgentService.ListWorkloads:output_type -> persys.agent.v1.ListWorkloadsResponse + 12, // 41: persys.agent.v1.AgentService.HealthCheck:output_type -> persys.agent.v1.HealthCheckResponse + 15, // 42: 
persys.agent.v1.AgentService.ListActions:output_type -> persys.agent.v1.ListActionsResponse + 37, // [37:43] is the sub-list for method output_type + 31, // [31:37] is the sub-list for method input_type + 31, // [31:31] is the sub-list for extension type_name + 31, // [31:31] is the sub-list for extension extendee + 0, // [0:31] is the sub-list for field type_name } func init() { file_agent_proto_init() } @@ -2148,7 +2426,7 @@ func file_agent_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_agent_proto_rawDesc), len(file_agent_proto_rawDesc)), NumEnums: 3, - NumMessages: 31, + NumMessages: 33, NumExtensions: 0, NumServices: 1, }, diff --git a/pkg/control/v1/control.pb.go b/pkg/control/v1/control.pb.go index 7161d91..915138a 100644 --- a/pkg/control/v1/control.pb.go +++ b/pkg/control/v1/control.pb.go @@ -182,13 +182,14 @@ func (x *RegisterNodeRequest) GetTimestamp() *timestamppb.Timestamp { } type NodeCapabilities struct { - state protoimpl.MessageState `protogen:"open.v1"` - CpuTotalMillicores int64 `protobuf:"varint,1,opt,name=cpu_total_millicores,json=cpuTotalMillicores,proto3" json:"cpu_total_millicores,omitempty"` - MemoryTotalMb int64 `protobuf:"varint,2,opt,name=memory_total_mb,json=memoryTotalMb,proto3" json:"memory_total_mb,omitempty"` - StoragePools []*StoragePool `protobuf:"bytes,3,rep,name=storage_pools,json=storagePools,proto3" json:"storage_pools,omitempty"` - SupportedWorkloadTypes []string `protobuf:"bytes,4,rep,name=supported_workload_types,json=supportedWorkloadTypes,proto3" json:"supported_workload_types,omitempty"` // container, compose, vm - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + CpuTotalMillicores int64 `protobuf:"varint,1,opt,name=cpu_total_millicores,json=cpuTotalMillicores,proto3" json:"cpu_total_millicores,omitempty"` + MemoryTotalMb int64 
`protobuf:"varint,2,opt,name=memory_total_mb,json=memoryTotalMb,proto3" json:"memory_total_mb,omitempty"` + StoragePools []*StoragePool `protobuf:"bytes,3,rep,name=storage_pools,json=storagePools,proto3" json:"storage_pools,omitempty"` + SupportedWorkloadTypes []string `protobuf:"bytes,4,rep,name=supported_workload_types,json=supportedWorkloadTypes,proto3" json:"supported_workload_types,omitempty"` // container, compose, vm + SupportedStorageDrivers []string `protobuf:"bytes,5,rep,name=supported_storage_drivers,json=supportedStorageDrivers,proto3" json:"supported_storage_drivers,omitempty"` // local, nfs, ceph-rbd + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *NodeCapabilities) Reset() { @@ -249,6 +250,13 @@ func (x *NodeCapabilities) GetSupportedWorkloadTypes() []string { return nil } +func (x *NodeCapabilities) GetSupportedStorageDrivers() []string { + if x != nil { + return x.SupportedStorageDrivers + } + return nil +} + type StoragePool struct { state protoimpl.MessageState `protogen:"open.v1"` Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` @@ -378,11 +386,12 @@ func (x *RegisterNodeResponse) GetLeaseExpiresAt() *timestamppb.Timestamp { } type HeartbeatRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - NodeId string `protobuf:"bytes,1,opt,name=node_id,json=nodeId,proto3" json:"node_id,omitempty"` - Usage *NodeUsage `protobuf:"bytes,2,opt,name=usage,proto3" json:"usage,omitempty"` - WorkloadStatuses []*WorkloadStatus `protobuf:"bytes,3,rep,name=workload_statuses,json=workloadStatuses,proto3" json:"workload_statuses,omitempty"` - Timestamp *timestamppb.Timestamp `protobuf:"bytes,4,opt,name=timestamp,proto3" json:"timestamp,omitempty"` + state protoimpl.MessageState `protogen:"open.v1"` + NodeId string `protobuf:"bytes,1,opt,name=node_id,json=nodeId,proto3" json:"node_id,omitempty"` + Usage *NodeUsage `protobuf:"bytes,2,opt,name=usage,proto3" json:"usage,omitempty"` + 
WorkloadStatuses []*WorkloadStatus `protobuf:"bytes,3,rep,name=workload_statuses,json=workloadStatuses,proto3" json:"workload_statuses,omitempty"` + Timestamp *timestamppb.Timestamp `protobuf:"bytes,4,opt,name=timestamp,proto3" json:"timestamp,omitempty"` + WorkloadUsage []*WorkloadUsageSnapshot `protobuf:"bytes,5,rep,name=workload_usage,json=workloadUsage,proto3" json:"workload_usage,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -445,6 +454,13 @@ func (x *HeartbeatRequest) GetTimestamp() *timestamppb.Timestamp { return nil } +func (x *HeartbeatRequest) GetWorkloadUsage() []*WorkloadUsageSnapshot { + if x != nil { + return x.WorkloadUsage + } + return nil +} + type NodeUsage struct { state protoimpl.MessageState `protogen:"open.v1"` CpuAllocatedMillicores int64 `protobuf:"varint,1,opt,name=cpu_allocated_millicores,json=cpuAllocatedMillicores,proto3" json:"cpu_allocated_millicores,omitempty"` @@ -997,16 +1013,17 @@ func (x *ResourceRequirements) GetDiskGb() int64 { } type ContainerSpec struct { - state protoimpl.MessageState `protogen:"open.v1"` - Image string `protobuf:"bytes,1,opt,name=image,proto3" json:"image,omitempty"` - Command []string `protobuf:"bytes,2,rep,name=command,proto3" json:"command,omitempty"` - Env map[string]string `protobuf:"bytes,3,rep,name=env,proto3" json:"env,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` - Volumes []*VolumeMount `protobuf:"bytes,4,rep,name=volumes,proto3" json:"volumes,omitempty"` - Ports []*Port `protobuf:"bytes,5,rep,name=ports,proto3" json:"ports,omitempty"` - RestartPolicy string `protobuf:"bytes,6,opt,name=restart_policy,json=restartPolicy,proto3" json:"restart_policy,omitempty"` - Privileged bool `protobuf:"varint,7,opt,name=privileged,proto3" json:"privileged,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + Image string 
`protobuf:"bytes,1,opt,name=image,proto3" json:"image,omitempty"` + Command []string `protobuf:"bytes,2,rep,name=command,proto3" json:"command,omitempty"` + Env map[string]string `protobuf:"bytes,3,rep,name=env,proto3" json:"env,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + Volumes []*VolumeMount `protobuf:"bytes,4,rep,name=volumes,proto3" json:"volumes,omitempty"` + Ports []*Port `protobuf:"bytes,5,rep,name=ports,proto3" json:"ports,omitempty"` + RestartPolicy string `protobuf:"bytes,6,opt,name=restart_policy,json=restartPolicy,proto3" json:"restart_policy,omitempty"` + Privileged bool `protobuf:"varint,7,opt,name=privileged,proto3" json:"privileged,omitempty"` + ManagedVolumes []*ManagedVolumeSpec `protobuf:"bytes,8,rep,name=managed_volumes,json=managedVolumes,proto3" json:"managed_volumes,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *ContainerSpec) Reset() { @@ -1088,6 +1105,13 @@ func (x *ContainerSpec) GetPrivileged() bool { return false } +func (x *ContainerSpec) GetManagedVolumes() []*ManagedVolumeSpec { + if x != nil { + return x.ManagedVolumes + } + return nil +} + type VolumeMount struct { state protoimpl.MessageState `protogen:"open.v1"` HostPath string `protobuf:"bytes,1,opt,name=host_path,json=hostPath,proto3" json:"host_path,omitempty"` @@ -1285,15 +1309,16 @@ func (x *ComposeSpec) GetEnv() map[string]string { } type VMSpec struct { - state protoimpl.MessageState `protogen:"open.v1"` - Vcpus int32 `protobuf:"varint,1,opt,name=vcpus,proto3" json:"vcpus,omitempty"` - MemoryMb int64 `protobuf:"varint,2,opt,name=memory_mb,json=memoryMb,proto3" json:"memory_mb,omitempty"` - Disks []*DiskConfig `protobuf:"bytes,3,rep,name=disks,proto3" json:"disks,omitempty"` - Networks []*NetworkConfig `protobuf:"bytes,4,rep,name=networks,proto3" json:"networks,omitempty"` - CloudInit *CloudInitConfig `protobuf:"bytes,5,opt,name=cloud_init,json=cloudInit,proto3" 
json:"cloud_init,omitempty"` - OsImage string `protobuf:"bytes,6,opt,name=os_image,json=osImage,proto3" json:"os_image,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + Vcpus int32 `protobuf:"varint,1,opt,name=vcpus,proto3" json:"vcpus,omitempty"` + MemoryMb int64 `protobuf:"varint,2,opt,name=memory_mb,json=memoryMb,proto3" json:"memory_mb,omitempty"` + Disks []*DiskConfig `protobuf:"bytes,3,rep,name=disks,proto3" json:"disks,omitempty"` + Networks []*NetworkConfig `protobuf:"bytes,4,rep,name=networks,proto3" json:"networks,omitempty"` + CloudInit *CloudInitConfig `protobuf:"bytes,5,opt,name=cloud_init,json=cloudInit,proto3" json:"cloud_init,omitempty"` + OsImage string `protobuf:"bytes,6,opt,name=os_image,json=osImage,proto3" json:"os_image,omitempty"` + ManagedVolumes []*ManagedVolumeSpec `protobuf:"bytes,7,rep,name=managed_volumes,json=managedVolumes,proto3" json:"managed_volumes,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *VMSpec) Reset() { @@ -1368,6 +1393,13 @@ func (x *VMSpec) GetOsImage() string { return "" } +func (x *VMSpec) GetManagedVolumes() []*ManagedVolumeSpec { + if x != nil { + return x.ManagedVolumes + } + return nil +} + type DiskConfig struct { state protoimpl.MessageState `protogen:"open.v1"` PoolName string `protobuf:"bytes,1,opt,name=pool_name,json=poolName,proto3" json:"pool_name,omitempty"` @@ -1493,6 +1525,7 @@ type CloudInitConfig struct { UserData string `protobuf:"bytes,1,opt,name=user_data,json=userData,proto3" json:"user_data,omitempty"` MetaData string `protobuf:"bytes,2,opt,name=meta_data,json=metaData,proto3" json:"meta_data,omitempty"` NetworkConfig string `protobuf:"bytes,3,opt,name=network_config,json=networkConfig,proto3" json:"network_config,omitempty"` + VendorData string `protobuf:"bytes,4,opt,name=vendor_data,json=vendorData,proto3" json:"vendor_data,omitempty"` unknownFields 
protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -1548,6 +1581,305 @@ func (x *CloudInitConfig) GetNetworkConfig() string { return "" } +func (x *CloudInitConfig) GetVendorData() string { + if x != nil { + return x.VendorData + } + return "" +} + +type ManagedVolumeSpec struct { + state protoimpl.MessageState `protogen:"open.v1"` + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Driver string `protobuf:"bytes,2,opt,name=driver,proto3" json:"driver,omitempty"` // local|nfs|ceph-rbd + SizeGb int64 `protobuf:"varint,3,opt,name=size_gb,json=sizeGb,proto3" json:"size_gb,omitempty"` + AccessMode string `protobuf:"bytes,4,opt,name=access_mode,json=accessMode,proto3" json:"access_mode,omitempty"` + FsType string `protobuf:"bytes,5,opt,name=fs_type,json=fsType,proto3" json:"fs_type,omitempty"` + MountPath string `protobuf:"bytes,6,opt,name=mount_path,json=mountPath,proto3" json:"mount_path,omitempty"` + ReadOnly bool `protobuf:"varint,7,opt,name=read_only,json=readOnly,proto3" json:"read_only,omitempty"` + RetainPolicy string `protobuf:"bytes,8,opt,name=retain_policy,json=retainPolicy,proto3" json:"retain_policy,omitempty"` // Delete|Retain + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ManagedVolumeSpec) Reset() { + *x = ManagedVolumeSpec{} + mi := &file_control_proto_msgTypes[21] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ManagedVolumeSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ManagedVolumeSpec) ProtoMessage() {} + +func (x *ManagedVolumeSpec) ProtoReflect() protoreflect.Message { + mi := &file_control_proto_msgTypes[21] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ManagedVolumeSpec.ProtoReflect.Descriptor instead. 
+func (*ManagedVolumeSpec) Descriptor() ([]byte, []int) { + return file_control_proto_rawDescGZIP(), []int{21} +} + +func (x *ManagedVolumeSpec) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *ManagedVolumeSpec) GetDriver() string { + if x != nil { + return x.Driver + } + return "" +} + +func (x *ManagedVolumeSpec) GetSizeGb() int64 { + if x != nil { + return x.SizeGb + } + return 0 +} + +func (x *ManagedVolumeSpec) GetAccessMode() string { + if x != nil { + return x.AccessMode + } + return "" +} + +func (x *ManagedVolumeSpec) GetFsType() string { + if x != nil { + return x.FsType + } + return "" +} + +func (x *ManagedVolumeSpec) GetMountPath() string { + if x != nil { + return x.MountPath + } + return "" +} + +func (x *ManagedVolumeSpec) GetReadOnly() bool { + if x != nil { + return x.ReadOnly + } + return false +} + +func (x *ManagedVolumeSpec) GetRetainPolicy() string { + if x != nil { + return x.RetainPolicy + } + return "" +} + +type WorkloadUsageSnapshot struct { + state protoimpl.MessageState `protogen:"open.v1"` + WorkloadId string `protobuf:"bytes,1,opt,name=workload_id,json=workloadId,proto3" json:"workload_id,omitempty"` + Type string `protobuf:"bytes,2,opt,name=type,proto3" json:"type,omitempty"` + CpuPercent float64 `protobuf:"fixed64,3,opt,name=cpu_percent,json=cpuPercent,proto3" json:"cpu_percent,omitempty"` + MemoryBytes int64 `protobuf:"varint,4,opt,name=memory_bytes,json=memoryBytes,proto3" json:"memory_bytes,omitempty"` + DiskReadBytes int64 `protobuf:"varint,5,opt,name=disk_read_bytes,json=diskReadBytes,proto3" json:"disk_read_bytes,omitempty"` + DiskWriteBytes int64 `protobuf:"varint,6,opt,name=disk_write_bytes,json=diskWriteBytes,proto3" json:"disk_write_bytes,omitempty"` + NetRxBytes int64 `protobuf:"varint,7,opt,name=net_rx_bytes,json=netRxBytes,proto3" json:"net_rx_bytes,omitempty"` + NetTxBytes int64 `protobuf:"varint,8,opt,name=net_tx_bytes,json=netTxBytes,proto3" json:"net_tx_bytes,omitempty"` + 
CollectedAt *timestamppb.Timestamp `protobuf:"bytes,9,opt,name=collected_at,json=collectedAt,proto3" json:"collected_at,omitempty"` + Source string `protobuf:"bytes,10,opt,name=source,proto3" json:"source,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *WorkloadUsageSnapshot) Reset() { + *x = WorkloadUsageSnapshot{} + mi := &file_control_proto_msgTypes[22] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *WorkloadUsageSnapshot) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*WorkloadUsageSnapshot) ProtoMessage() {} + +func (x *WorkloadUsageSnapshot) ProtoReflect() protoreflect.Message { + mi := &file_control_proto_msgTypes[22] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use WorkloadUsageSnapshot.ProtoReflect.Descriptor instead. 
+func (*WorkloadUsageSnapshot) Descriptor() ([]byte, []int) { + return file_control_proto_rawDescGZIP(), []int{22} +} + +func (x *WorkloadUsageSnapshot) GetWorkloadId() string { + if x != nil { + return x.WorkloadId + } + return "" +} + +func (x *WorkloadUsageSnapshot) GetType() string { + if x != nil { + return x.Type + } + return "" +} + +func (x *WorkloadUsageSnapshot) GetCpuPercent() float64 { + if x != nil { + return x.CpuPercent + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetMemoryBytes() int64 { + if x != nil { + return x.MemoryBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetDiskReadBytes() int64 { + if x != nil { + return x.DiskReadBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetDiskWriteBytes() int64 { + if x != nil { + return x.DiskWriteBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetNetRxBytes() int64 { + if x != nil { + return x.NetRxBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetNetTxBytes() int64 { + if x != nil { + return x.NetTxBytes + } + return 0 +} + +func (x *WorkloadUsageSnapshot) GetCollectedAt() *timestamppb.Timestamp { + if x != nil { + return x.CollectedAt + } + return nil +} + +func (x *WorkloadUsageSnapshot) GetSource() string { + if x != nil { + return x.Source + } + return "" +} + +type ReasonDetail struct { + state protoimpl.MessageState `protogen:"open.v1"` + Code string `protobuf:"bytes,1,opt,name=code,proto3" json:"code,omitempty"` + Message string `protobuf:"bytes,2,opt,name=message,proto3" json:"message,omitempty"` + LastTransition *timestamppb.Timestamp `protobuf:"bytes,3,opt,name=last_transition,json=lastTransition,proto3" json:"last_transition,omitempty"` + NextRetryAt *timestamppb.Timestamp `protobuf:"bytes,4,opt,name=next_retry_at,json=nextRetryAt,proto3" json:"next_retry_at,omitempty"` + Retryable bool `protobuf:"varint,5,opt,name=retryable,proto3" json:"retryable,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x 
*ReasonDetail) Reset() { + *x = ReasonDetail{} + mi := &file_control_proto_msgTypes[23] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ReasonDetail) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ReasonDetail) ProtoMessage() {} + +func (x *ReasonDetail) ProtoReflect() protoreflect.Message { + mi := &file_control_proto_msgTypes[23] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ReasonDetail.ProtoReflect.Descriptor instead. +func (*ReasonDetail) Descriptor() ([]byte, []int) { + return file_control_proto_rawDescGZIP(), []int{23} +} + +func (x *ReasonDetail) GetCode() string { + if x != nil { + return x.Code + } + return "" +} + +func (x *ReasonDetail) GetMessage() string { + if x != nil { + return x.Message + } + return "" +} + +func (x *ReasonDetail) GetLastTransition() *timestamppb.Timestamp { + if x != nil { + return x.LastTransition + } + return nil +} + +func (x *ReasonDetail) GetNextRetryAt() *timestamppb.Timestamp { + if x != nil { + return x.NextRetryAt + } + return nil +} + +func (x *ReasonDetail) GetRetryable() bool { + if x != nil { + return x.Retryable + } + return false +} + type WorkloadStatus struct { state protoimpl.MessageState `protogen:"open.v1"` WorkloadId string `protobuf:"bytes,1,opt,name=workload_id,json=workloadId,proto3" json:"workload_id,omitempty"` @@ -1555,13 +1887,15 @@ type WorkloadStatus struct { FailureReason FailureReason `protobuf:"varint,3,opt,name=failure_reason,json=failureReason,proto3,enum=persys.control.v1.FailureReason" json:"failure_reason,omitempty"` Message string `protobuf:"bytes,4,opt,name=message,proto3" json:"message,omitempty"` LastTransition *timestamppb.Timestamp `protobuf:"bytes,5,opt,name=last_transition,json=lastTransition,proto3" json:"last_transition,omitempty"` + Reason *ReasonDetail 
`protobuf:"bytes,6,opt,name=reason,proto3" json:"reason,omitempty"` + Usage *WorkloadUsageSnapshot `protobuf:"bytes,7,opt,name=usage,proto3" json:"usage,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } func (x *WorkloadStatus) Reset() { *x = WorkloadStatus{} - mi := &file_control_proto_msgTypes[21] + mi := &file_control_proto_msgTypes[24] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1573,7 +1907,7 @@ func (x *WorkloadStatus) String() string { func (*WorkloadStatus) ProtoMessage() {} func (x *WorkloadStatus) ProtoReflect() protoreflect.Message { - mi := &file_control_proto_msgTypes[21] + mi := &file_control_proto_msgTypes[24] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1586,7 +1920,7 @@ func (x *WorkloadStatus) ProtoReflect() protoreflect.Message { // Deprecated: Use WorkloadStatus.ProtoReflect.Descriptor instead. func (*WorkloadStatus) Descriptor() ([]byte, []int) { - return file_control_proto_rawDescGZIP(), []int{21} + return file_control_proto_rawDescGZIP(), []int{24} } func (x *WorkloadStatus) GetWorkloadId() string { @@ -1624,6 +1958,20 @@ func (x *WorkloadStatus) GetLastTransition() *timestamppb.Timestamp { return nil } +func (x *WorkloadStatus) GetReason() *ReasonDetail { + if x != nil { + return x.Reason + } + return nil +} + +func (x *WorkloadStatus) GetUsage() *WorkloadUsageSnapshot { + if x != nil { + return x.Usage + } + return nil +} + type RetryWorkloadRequest struct { state protoimpl.MessageState `protogen:"open.v1"` WorkloadId string `protobuf:"bytes,1,opt,name=workload_id,json=workloadId,proto3" json:"workload_id,omitempty"` @@ -1633,7 +1981,7 @@ type RetryWorkloadRequest struct { func (x *RetryWorkloadRequest) Reset() { *x = RetryWorkloadRequest{} - mi := &file_control_proto_msgTypes[22] + mi := &file_control_proto_msgTypes[25] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ 
-1645,7 +1993,7 @@ func (x *RetryWorkloadRequest) String() string { func (*RetryWorkloadRequest) ProtoMessage() {} func (x *RetryWorkloadRequest) ProtoReflect() protoreflect.Message { - mi := &file_control_proto_msgTypes[22] + mi := &file_control_proto_msgTypes[25] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1658,7 +2006,7 @@ func (x *RetryWorkloadRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use RetryWorkloadRequest.ProtoReflect.Descriptor instead. func (*RetryWorkloadRequest) Descriptor() ([]byte, []int) { - return file_control_proto_rawDescGZIP(), []int{22} + return file_control_proto_rawDescGZIP(), []int{25} } func (x *RetryWorkloadRequest) GetWorkloadId() string { @@ -1677,7 +2025,7 @@ type RetryWorkloadResponse struct { func (x *RetryWorkloadResponse) Reset() { *x = RetryWorkloadResponse{} - mi := &file_control_proto_msgTypes[23] + mi := &file_control_proto_msgTypes[26] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1689,7 +2037,7 @@ func (x *RetryWorkloadResponse) String() string { func (*RetryWorkloadResponse) ProtoMessage() {} func (x *RetryWorkloadResponse) ProtoReflect() protoreflect.Message { - mi := &file_control_proto_msgTypes[23] + mi := &file_control_proto_msgTypes[26] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1702,7 +2050,7 @@ func (x *RetryWorkloadResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use RetryWorkloadResponse.ProtoReflect.Descriptor instead. 
func (*RetryWorkloadResponse) Descriptor() ([]byte, []int) { - return file_control_proto_rawDescGZIP(), []int{23} + return file_control_proto_rawDescGZIP(), []int{26} } func (x *RetryWorkloadResponse) GetAccepted() bool { @@ -1727,7 +2075,7 @@ type ControlMessage struct { func (x *ControlMessage) Reset() { *x = ControlMessage{} - mi := &file_control_proto_msgTypes[24] + mi := &file_control_proto_msgTypes[27] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1739,7 +2087,7 @@ func (x *ControlMessage) String() string { func (*ControlMessage) ProtoMessage() {} func (x *ControlMessage) ProtoReflect() protoreflect.Message { - mi := &file_control_proto_msgTypes[24] + mi := &file_control_proto_msgTypes[27] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1752,7 +2100,7 @@ func (x *ControlMessage) ProtoReflect() protoreflect.Message { // Deprecated: Use ControlMessage.ProtoReflect.Descriptor instead. func (*ControlMessage) Descriptor() ([]byte, []int) { - return file_control_proto_rawDescGZIP(), []int{24} + return file_control_proto_rawDescGZIP(), []int{27} } func (x *ControlMessage) GetMessage() isControlMessage_Message { @@ -1842,12 +2190,13 @@ const file_control_proto_rawDesc = "" + "\ttimestamp\x18\a \x01(\v2\x1a.google.protobuf.TimestampR\ttimestamp\x1a9\n" + "\vLabelsEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + - "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\xeb\x01\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\xa7\x02\n" + "\x10NodeCapabilities\x120\n" + "\x14cpu_total_millicores\x18\x01 \x01(\x03R\x12cpuTotalMillicores\x12&\n" + "\x0fmemory_total_mb\x18\x02 \x01(\x03R\rmemoryTotalMb\x12C\n" + "\rstorage_pools\x18\x03 \x03(\v2\x1e.persys.control.v1.StoragePoolR\fstoragePools\x128\n" + - "\x18supported_workload_types\x18\x04 \x03(\tR\x16supportedWorkloadTypes\"P\n" + + "\x18supported_workload_types\x18\x04 
\x03(\tR\x16supportedWorkloadTypes\x12:\n" + + "\x19supported_storage_drivers\x18\x05 \x03(\tR\x17supportedStorageDrivers\"P\n" + "\vStoragePool\x12\x12\n" + "\x04name\x18\x01 \x01(\tR\x04name\x12\x12\n" + "\x04type\x18\x02 \x01(\tR\x04type\x12\x19\n" + @@ -1856,12 +2205,13 @@ const file_control_proto_rawDesc = "" + "\baccepted\x18\x01 \x01(\bR\baccepted\x12\x16\n" + "\x06reason\x18\x02 \x01(\tR\x06reason\x12<\n" + "\x1aheartbeat_interval_seconds\x18\x03 \x01(\x05R\x18heartbeatIntervalSeconds\x12D\n" + - "\x10lease_expires_at\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\x0eleaseExpiresAt\"\xe9\x01\n" + + "\x10lease_expires_at\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\x0eleaseExpiresAt\"\xba\x02\n" + "\x10HeartbeatRequest\x12\x17\n" + "\anode_id\x18\x01 \x01(\tR\x06nodeId\x122\n" + "\x05usage\x18\x02 \x01(\v2\x1c.persys.control.v1.NodeUsageR\x05usage\x12N\n" + "\x11workload_statuses\x18\x03 \x03(\v2!.persys.control.v1.WorkloadStatusR\x10workloadStatuses\x128\n" + - "\ttimestamp\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\ttimestamp\"\x99\x02\n" + + "\ttimestamp\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\ttimestamp\x12O\n" + + "\x0eworkload_usage\x18\x05 \x03(\v2(.persys.control.v1.WorkloadUsageSnapshotR\rworkloadUsage\"\x99\x02\n" + "\tNodeUsage\x128\n" + "\x18cpu_allocated_millicores\x18\x01 \x01(\x03R\x16cpuAllocatedMillicores\x12.\n" + "\x13cpu_used_millicores\x18\x02 \x01(\x03R\x11cpuUsedMillicores\x12.\n" + @@ -1909,7 +2259,7 @@ const file_control_proto_rawDesc = "" + "\x14ResourceRequirements\x12%\n" + "\x0ecpu_millicores\x18\x01 \x01(\x03R\rcpuMillicores\x12\x1b\n" + "\tmemory_mb\x18\x02 \x01(\x03R\bmemoryMb\x12\x17\n" + - "\adisk_gb\x18\x03 \x01(\x03R\x06diskGb\"\xe4\x02\n" + + "\adisk_gb\x18\x03 \x01(\x03R\x06diskGb\"\xb3\x03\n" + "\rContainerSpec\x12\x14\n" + "\x05image\x18\x01 \x01(\tR\x05image\x12\x18\n" + "\acommand\x18\x02 \x03(\tR\acommand\x12;\n" + @@ -1919,7 +2269,8 @@ const file_control_proto_rawDesc = "" + 
"\x0erestart_policy\x18\x06 \x01(\tR\rrestartPolicy\x12\x1e\n" + "\n" + "privileged\x18\a \x01(\bR\n" + - "privileged\x1a6\n" + + "privileged\x12M\n" + + "\x0fmanaged_volumes\x18\b \x03(\v2$.persys.control.v1.ManagedVolumeSpecR\x0emanagedVolumes\x1a6\n" + "\bEnvEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"n\n" + @@ -1941,7 +2292,7 @@ const file_control_proto_rawDesc = "" + "\x03env\x18\x05 \x03(\v2'.persys.control.v1.ComposeSpec.EnvEntryR\x03env\x1a6\n" + "\bEnvEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + - "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x8c\x02\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\xdb\x02\n" + "\x06VMSpec\x12\x14\n" + "\x05vcpus\x18\x01 \x01(\x05R\x05vcpus\x12\x1b\n" + "\tmemory_mb\x18\x02 \x01(\x03R\bmemoryMb\x123\n" + @@ -1949,7 +2300,8 @@ const file_control_proto_rawDesc = "" + "\bnetworks\x18\x04 \x03(\v2 .persys.control.v1.NetworkConfigR\bnetworks\x12A\n" + "\n" + "cloud_init\x18\x05 \x01(\v2\".persys.control.v1.CloudInitConfigR\tcloudInit\x12\x19\n" + - "\bos_image\x18\x06 \x01(\tR\aosImage\"c\n" + + "\bos_image\x18\x06 \x01(\tR\aosImage\x12M\n" + + "\x0fmanaged_volumes\x18\a \x03(\v2$.persys.control.v1.ManagedVolumeSpecR\x0emanagedVolumes\"c\n" + "\n" + "DiskConfig\x12\x1b\n" + "\tpool_name\x18\x01 \x01(\tR\bpoolName\x12\x17\n" + @@ -1959,18 +2311,55 @@ const file_control_proto_rawDesc = "" + "\rNetworkConfig\x12\x16\n" + "\x06bridge\x18\x01 \x01(\tR\x06bridge\x12\x12\n" + "\x04dhcp\x18\x02 \x01(\bR\x04dhcp\x12\x1b\n" + - "\tstatic_ip\x18\x03 \x01(\tR\bstaticIp\"r\n" + + "\tstatic_ip\x18\x03 \x01(\tR\bstaticIp\"\x93\x01\n" + "\x0fCloudInitConfig\x12\x1b\n" + "\tuser_data\x18\x01 \x01(\tR\buserData\x12\x1b\n" + "\tmeta_data\x18\x02 \x01(\tR\bmetaData\x12%\n" + - "\x0enetwork_config\x18\x03 \x01(\tR\rnetworkConfig\"\xef\x01\n" + + "\x0enetwork_config\x18\x03 \x01(\tR\rnetworkConfig\x12\x1f\n" + + "\vvendor_data\x18\x04 \x01(\tR\n" + + 
"vendorData\"\xf3\x01\n" + + "\x11ManagedVolumeSpec\x12\x12\n" + + "\x04name\x18\x01 \x01(\tR\x04name\x12\x16\n" + + "\x06driver\x18\x02 \x01(\tR\x06driver\x12\x17\n" + + "\asize_gb\x18\x03 \x01(\x03R\x06sizeGb\x12\x1f\n" + + "\vaccess_mode\x18\x04 \x01(\tR\n" + + "accessMode\x12\x17\n" + + "\afs_type\x18\x05 \x01(\tR\x06fsType\x12\x1d\n" + + "\n" + + "mount_path\x18\x06 \x01(\tR\tmountPath\x12\x1b\n" + + "\tread_only\x18\a \x01(\bR\breadOnly\x12#\n" + + "\rretain_policy\x18\b \x01(\tR\fretainPolicy\"\xfd\x02\n" + + "\x15WorkloadUsageSnapshot\x12\x1f\n" + + "\vworkload_id\x18\x01 \x01(\tR\n" + + "workloadId\x12\x12\n" + + "\x04type\x18\x02 \x01(\tR\x04type\x12\x1f\n" + + "\vcpu_percent\x18\x03 \x01(\x01R\n" + + "cpuPercent\x12!\n" + + "\fmemory_bytes\x18\x04 \x01(\x03R\vmemoryBytes\x12&\n" + + "\x0fdisk_read_bytes\x18\x05 \x01(\x03R\rdiskReadBytes\x12(\n" + + "\x10disk_write_bytes\x18\x06 \x01(\x03R\x0ediskWriteBytes\x12 \n" + + "\fnet_rx_bytes\x18\a \x01(\x03R\n" + + "netRxBytes\x12 \n" + + "\fnet_tx_bytes\x18\b \x01(\x03R\n" + + "netTxBytes\x12=\n" + + "\fcollected_at\x18\t \x01(\v2\x1a.google.protobuf.TimestampR\vcollectedAt\x12\x16\n" + + "\x06source\x18\n" + + " \x01(\tR\x06source\"\xdf\x01\n" + + "\fReasonDetail\x12\x12\n" + + "\x04code\x18\x01 \x01(\tR\x04code\x12\x18\n" + + "\amessage\x18\x02 \x01(\tR\amessage\x12C\n" + + "\x0flast_transition\x18\x03 \x01(\v2\x1a.google.protobuf.TimestampR\x0elastTransition\x12>\n" + + "\rnext_retry_at\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\vnextRetryAt\x12\x1c\n" + + "\tretryable\x18\x05 \x01(\bR\tretryable\"\xe8\x02\n" + "\x0eWorkloadStatus\x12\x1f\n" + "\vworkload_id\x18\x01 \x01(\tR\n" + "workloadId\x12\x14\n" + "\x05state\x18\x02 \x01(\tR\x05state\x12G\n" + "\x0efailure_reason\x18\x03 \x01(\x0e2 .persys.control.v1.FailureReasonR\rfailureReason\x12\x18\n" + "\amessage\x18\x04 \x01(\tR\amessage\x12C\n" + - "\x0flast_transition\x18\x05 \x01(\v2\x1a.google.protobuf.TimestampR\x0elastTransition\"7\n" + + 
"\x0flast_transition\x18\x05 \x01(\v2\x1a.google.protobuf.TimestampR\x0elastTransition\x127\n" + + "\x06reason\x18\x06 \x01(\v2\x1f.persys.control.v1.ReasonDetailR\x06reason\x12>\n" + + "\x05usage\x18\a \x01(\v2(.persys.control.v1.WorkloadUsageSnapshotR\x05usage\"7\n" + "\x14RetryWorkloadRequest\x12\x1f\n" + "\vworkload_id\x18\x01 \x01(\tR\n" + "workloadId\"3\n" + @@ -2013,7 +2402,7 @@ func file_control_proto_rawDescGZIP() []byte { } var file_control_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_control_proto_msgTypes = make([]protoimpl.MessageInfo, 29) +var file_control_proto_msgTypes = make([]protoimpl.MessageInfo, 32) var file_control_proto_goTypes = []any{ (FailureReason)(0), // 0: persys.control.v1.FailureReason (*RegisterNodeRequest)(nil), // 1: persys.control.v1.RegisterNodeRequest @@ -2037,63 +2426,74 @@ var file_control_proto_goTypes = []any{ (*DiskConfig)(nil), // 19: persys.control.v1.DiskConfig (*NetworkConfig)(nil), // 20: persys.control.v1.NetworkConfig (*CloudInitConfig)(nil), // 21: persys.control.v1.CloudInitConfig - (*WorkloadStatus)(nil), // 22: persys.control.v1.WorkloadStatus - (*RetryWorkloadRequest)(nil), // 23: persys.control.v1.RetryWorkloadRequest - (*RetryWorkloadResponse)(nil), // 24: persys.control.v1.RetryWorkloadResponse - (*ControlMessage)(nil), // 25: persys.control.v1.ControlMessage - nil, // 26: persys.control.v1.RegisterNodeRequest.LabelsEntry - nil, // 27: persys.control.v1.WorkloadSpec.MetadataEntry - nil, // 28: persys.control.v1.ContainerSpec.EnvEntry - nil, // 29: persys.control.v1.ComposeSpec.EnvEntry - (*timestamppb.Timestamp)(nil), // 30: google.protobuf.Timestamp + (*ManagedVolumeSpec)(nil), // 22: persys.control.v1.ManagedVolumeSpec + (*WorkloadUsageSnapshot)(nil), // 23: persys.control.v1.WorkloadUsageSnapshot + (*ReasonDetail)(nil), // 24: persys.control.v1.ReasonDetail + (*WorkloadStatus)(nil), // 25: persys.control.v1.WorkloadStatus + (*RetryWorkloadRequest)(nil), // 26: 
persys.control.v1.RetryWorkloadRequest + (*RetryWorkloadResponse)(nil), // 27: persys.control.v1.RetryWorkloadResponse + (*ControlMessage)(nil), // 28: persys.control.v1.ControlMessage + nil, // 29: persys.control.v1.RegisterNodeRequest.LabelsEntry + nil, // 30: persys.control.v1.WorkloadSpec.MetadataEntry + nil, // 31: persys.control.v1.ContainerSpec.EnvEntry + nil, // 32: persys.control.v1.ComposeSpec.EnvEntry + (*timestamppb.Timestamp)(nil), // 33: google.protobuf.Timestamp } var file_control_proto_depIdxs = []int32{ 2, // 0: persys.control.v1.RegisterNodeRequest.capabilities:type_name -> persys.control.v1.NodeCapabilities - 26, // 1: persys.control.v1.RegisterNodeRequest.labels:type_name -> persys.control.v1.RegisterNodeRequest.LabelsEntry - 30, // 2: persys.control.v1.RegisterNodeRequest.timestamp:type_name -> google.protobuf.Timestamp + 29, // 1: persys.control.v1.RegisterNodeRequest.labels:type_name -> persys.control.v1.RegisterNodeRequest.LabelsEntry + 33, // 2: persys.control.v1.RegisterNodeRequest.timestamp:type_name -> google.protobuf.Timestamp 3, // 3: persys.control.v1.NodeCapabilities.storage_pools:type_name -> persys.control.v1.StoragePool - 30, // 4: persys.control.v1.RegisterNodeResponse.lease_expires_at:type_name -> google.protobuf.Timestamp + 33, // 4: persys.control.v1.RegisterNodeResponse.lease_expires_at:type_name -> google.protobuf.Timestamp 6, // 5: persys.control.v1.HeartbeatRequest.usage:type_name -> persys.control.v1.NodeUsage - 22, // 6: persys.control.v1.HeartbeatRequest.workload_statuses:type_name -> persys.control.v1.WorkloadStatus - 30, // 7: persys.control.v1.HeartbeatRequest.timestamp:type_name -> google.protobuf.Timestamp - 30, // 8: persys.control.v1.HeartbeatResponse.lease_expires_at:type_name -> google.protobuf.Timestamp - 12, // 9: persys.control.v1.ApplyWorkloadRequest.spec:type_name -> persys.control.v1.WorkloadSpec - 0, // 10: persys.control.v1.ApplyWorkloadResponse.failure_reason:type_name -> 
persys.control.v1.FailureReason - 13, // 11: persys.control.v1.WorkloadSpec.resources:type_name -> persys.control.v1.ResourceRequirements - 14, // 12: persys.control.v1.WorkloadSpec.container:type_name -> persys.control.v1.ContainerSpec - 17, // 13: persys.control.v1.WorkloadSpec.compose:type_name -> persys.control.v1.ComposeSpec - 18, // 14: persys.control.v1.WorkloadSpec.vm:type_name -> persys.control.v1.VMSpec - 27, // 15: persys.control.v1.WorkloadSpec.metadata:type_name -> persys.control.v1.WorkloadSpec.MetadataEntry - 28, // 16: persys.control.v1.ContainerSpec.env:type_name -> persys.control.v1.ContainerSpec.EnvEntry - 15, // 17: persys.control.v1.ContainerSpec.volumes:type_name -> persys.control.v1.VolumeMount - 16, // 18: persys.control.v1.ContainerSpec.ports:type_name -> persys.control.v1.Port - 29, // 19: persys.control.v1.ComposeSpec.env:type_name -> persys.control.v1.ComposeSpec.EnvEntry - 19, // 20: persys.control.v1.VMSpec.disks:type_name -> persys.control.v1.DiskConfig - 20, // 21: persys.control.v1.VMSpec.networks:type_name -> persys.control.v1.NetworkConfig - 21, // 22: persys.control.v1.VMSpec.cloud_init:type_name -> persys.control.v1.CloudInitConfig - 0, // 23: persys.control.v1.WorkloadStatus.failure_reason:type_name -> persys.control.v1.FailureReason - 30, // 24: persys.control.v1.WorkloadStatus.last_transition:type_name -> google.protobuf.Timestamp - 1, // 25: persys.control.v1.ControlMessage.register:type_name -> persys.control.v1.RegisterNodeRequest - 5, // 26: persys.control.v1.ControlMessage.heartbeat:type_name -> persys.control.v1.HeartbeatRequest - 8, // 27: persys.control.v1.ControlMessage.apply:type_name -> persys.control.v1.ApplyWorkloadRequest - 10, // 28: persys.control.v1.ControlMessage.delete:type_name -> persys.control.v1.DeleteWorkloadRequest - 1, // 29: persys.control.v1.AgentControl.RegisterNode:input_type -> persys.control.v1.RegisterNodeRequest - 5, // 30: persys.control.v1.AgentControl.Heartbeat:input_type -> 
persys.control.v1.HeartbeatRequest - 8, // 31: persys.control.v1.AgentControl.ApplyWorkload:input_type -> persys.control.v1.ApplyWorkloadRequest - 10, // 32: persys.control.v1.AgentControl.DeleteWorkload:input_type -> persys.control.v1.DeleteWorkloadRequest - 23, // 33: persys.control.v1.AgentControl.RetryWorkload:input_type -> persys.control.v1.RetryWorkloadRequest - 25, // 34: persys.control.v1.AgentControl.ControlStream:input_type -> persys.control.v1.ControlMessage - 4, // 35: persys.control.v1.AgentControl.RegisterNode:output_type -> persys.control.v1.RegisterNodeResponse - 7, // 36: persys.control.v1.AgentControl.Heartbeat:output_type -> persys.control.v1.HeartbeatResponse - 9, // 37: persys.control.v1.AgentControl.ApplyWorkload:output_type -> persys.control.v1.ApplyWorkloadResponse - 11, // 38: persys.control.v1.AgentControl.DeleteWorkload:output_type -> persys.control.v1.DeleteWorkloadResponse - 24, // 39: persys.control.v1.AgentControl.RetryWorkload:output_type -> persys.control.v1.RetryWorkloadResponse - 25, // 40: persys.control.v1.AgentControl.ControlStream:output_type -> persys.control.v1.ControlMessage - 35, // [35:41] is the sub-list for method output_type - 29, // [29:35] is the sub-list for method input_type - 29, // [29:29] is the sub-list for extension type_name - 29, // [29:29] is the sub-list for extension extendee - 0, // [0:29] is the sub-list for field type_name + 25, // 6: persys.control.v1.HeartbeatRequest.workload_statuses:type_name -> persys.control.v1.WorkloadStatus + 33, // 7: persys.control.v1.HeartbeatRequest.timestamp:type_name -> google.protobuf.Timestamp + 23, // 8: persys.control.v1.HeartbeatRequest.workload_usage:type_name -> persys.control.v1.WorkloadUsageSnapshot + 33, // 9: persys.control.v1.HeartbeatResponse.lease_expires_at:type_name -> google.protobuf.Timestamp + 12, // 10: persys.control.v1.ApplyWorkloadRequest.spec:type_name -> persys.control.v1.WorkloadSpec + 0, // 11: 
persys.control.v1.ApplyWorkloadResponse.failure_reason:type_name -> persys.control.v1.FailureReason + 13, // 12: persys.control.v1.WorkloadSpec.resources:type_name -> persys.control.v1.ResourceRequirements + 14, // 13: persys.control.v1.WorkloadSpec.container:type_name -> persys.control.v1.ContainerSpec + 17, // 14: persys.control.v1.WorkloadSpec.compose:type_name -> persys.control.v1.ComposeSpec + 18, // 15: persys.control.v1.WorkloadSpec.vm:type_name -> persys.control.v1.VMSpec + 30, // 16: persys.control.v1.WorkloadSpec.metadata:type_name -> persys.control.v1.WorkloadSpec.MetadataEntry + 31, // 17: persys.control.v1.ContainerSpec.env:type_name -> persys.control.v1.ContainerSpec.EnvEntry + 15, // 18: persys.control.v1.ContainerSpec.volumes:type_name -> persys.control.v1.VolumeMount + 16, // 19: persys.control.v1.ContainerSpec.ports:type_name -> persys.control.v1.Port + 22, // 20: persys.control.v1.ContainerSpec.managed_volumes:type_name -> persys.control.v1.ManagedVolumeSpec + 32, // 21: persys.control.v1.ComposeSpec.env:type_name -> persys.control.v1.ComposeSpec.EnvEntry + 19, // 22: persys.control.v1.VMSpec.disks:type_name -> persys.control.v1.DiskConfig + 20, // 23: persys.control.v1.VMSpec.networks:type_name -> persys.control.v1.NetworkConfig + 21, // 24: persys.control.v1.VMSpec.cloud_init:type_name -> persys.control.v1.CloudInitConfig + 22, // 25: persys.control.v1.VMSpec.managed_volumes:type_name -> persys.control.v1.ManagedVolumeSpec + 33, // 26: persys.control.v1.WorkloadUsageSnapshot.collected_at:type_name -> google.protobuf.Timestamp + 33, // 27: persys.control.v1.ReasonDetail.last_transition:type_name -> google.protobuf.Timestamp + 33, // 28: persys.control.v1.ReasonDetail.next_retry_at:type_name -> google.protobuf.Timestamp + 0, // 29: persys.control.v1.WorkloadStatus.failure_reason:type_name -> persys.control.v1.FailureReason + 33, // 30: persys.control.v1.WorkloadStatus.last_transition:type_name -> google.protobuf.Timestamp + 24, // 31: 
persys.control.v1.WorkloadStatus.reason:type_name -> persys.control.v1.ReasonDetail + 23, // 32: persys.control.v1.WorkloadStatus.usage:type_name -> persys.control.v1.WorkloadUsageSnapshot + 1, // 33: persys.control.v1.ControlMessage.register:type_name -> persys.control.v1.RegisterNodeRequest + 5, // 34: persys.control.v1.ControlMessage.heartbeat:type_name -> persys.control.v1.HeartbeatRequest + 8, // 35: persys.control.v1.ControlMessage.apply:type_name -> persys.control.v1.ApplyWorkloadRequest + 10, // 36: persys.control.v1.ControlMessage.delete:type_name -> persys.control.v1.DeleteWorkloadRequest + 1, // 37: persys.control.v1.AgentControl.RegisterNode:input_type -> persys.control.v1.RegisterNodeRequest + 5, // 38: persys.control.v1.AgentControl.Heartbeat:input_type -> persys.control.v1.HeartbeatRequest + 8, // 39: persys.control.v1.AgentControl.ApplyWorkload:input_type -> persys.control.v1.ApplyWorkloadRequest + 10, // 40: persys.control.v1.AgentControl.DeleteWorkload:input_type -> persys.control.v1.DeleteWorkloadRequest + 26, // 41: persys.control.v1.AgentControl.RetryWorkload:input_type -> persys.control.v1.RetryWorkloadRequest + 28, // 42: persys.control.v1.AgentControl.ControlStream:input_type -> persys.control.v1.ControlMessage + 4, // 43: persys.control.v1.AgentControl.RegisterNode:output_type -> persys.control.v1.RegisterNodeResponse + 7, // 44: persys.control.v1.AgentControl.Heartbeat:output_type -> persys.control.v1.HeartbeatResponse + 9, // 45: persys.control.v1.AgentControl.ApplyWorkload:output_type -> persys.control.v1.ApplyWorkloadResponse + 11, // 46: persys.control.v1.AgentControl.DeleteWorkload:output_type -> persys.control.v1.DeleteWorkloadResponse + 27, // 47: persys.control.v1.AgentControl.RetryWorkload:output_type -> persys.control.v1.RetryWorkloadResponse + 28, // 48: persys.control.v1.AgentControl.ControlStream:output_type -> persys.control.v1.ControlMessage + 43, // [43:49] is the sub-list for method output_type + 37, // [37:43] is the 
sub-list for method input_type + 37, // [37:37] is the sub-list for extension type_name + 37, // [37:37] is the sub-list for extension extendee + 0, // [0:37] is the sub-list for field type_name } func init() { file_control_proto_init() } @@ -2106,7 +2506,7 @@ func file_control_proto_init() { (*WorkloadSpec_Compose)(nil), (*WorkloadSpec_Vm)(nil), } - file_control_proto_msgTypes[24].OneofWrappers = []any{ + file_control_proto_msgTypes[27].OneofWrappers = []any{ (*ControlMessage_Register)(nil), (*ControlMessage_Heartbeat)(nil), (*ControlMessage_Apply)(nil), @@ -2118,7 +2518,7 @@ func file_control_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_control_proto_rawDesc), len(file_control_proto_rawDesc)), NumEnums: 1, - NumMessages: 29, + NumMessages: 32, NumExtensions: 0, NumServices: 1, }, diff --git a/pkg/models/workload.go b/pkg/models/workload.go index 1a8bb87..f5ab4a7 100644 --- a/pkg/models/workload.go +++ b/pkg/models/workload.go @@ -52,19 +52,21 @@ type WorkloadStatus struct { CreatedAt time.Time `json:"created_at"` UpdatedAt time.Time `json:"updated_at"` Metadata map[string]string `json:"metadata,omitempty"` + Usage *WorkloadUsage `json:"usage,omitempty"` } // ContainerSpec defines Docker container configuration type ContainerSpec struct { - Image string `json:"image"` - Command []string `json:"command,omitempty"` - Args []string `json:"args,omitempty"` - Env map[string]string `json:"env,omitempty"` - Volumes []VolumeMount `json:"volumes,omitempty"` - Ports []PortMapping `json:"ports,omitempty"` - Resources *ResourceLimits `json:"resources,omitempty"` - RestartPolicy *RestartPolicy `json:"restart_policy,omitempty"` - Labels map[string]string `json:"labels,omitempty"` + Image string `json:"image"` + Command []string `json:"command,omitempty"` + Args []string `json:"args,omitempty"` + Env map[string]string `json:"env,omitempty"` + Volumes []VolumeMount `json:"volumes,omitempty"` + ManagedVolumes 
[]ManagedVolumeSpec `json:"managed_volumes,omitempty"` + Ports []PortMapping `json:"ports,omitempty"` + Resources *ResourceLimits `json:"resources,omitempty"` + RestartPolicy *RestartPolicy `json:"restart_policy,omitempty"` + Labels map[string]string `json:"labels,omitempty"` } // ComposeSpec defines Docker Compose configuration @@ -76,14 +78,15 @@ type ComposeSpec struct { // VMSpec defines virtual machine configuration type VMSpec struct { - Name string `json:"name"` - VCPUs int `json:"vcpus"` - MemoryMB int64 `json:"memory_mb"` - Disks []DiskConfig `json:"disks"` - Networks []NetworkConfig `json:"networks"` - CloudInit string `json:"cloud_init,omitempty"` // user-data script - Metadata map[string]string `json:"metadata,omitempty"` - CloudInitConfig *CloudInitConfig `json:"cloud_init_config,omitempty"` + Name string `json:"name"` + VCPUs int `json:"vcpus"` + MemoryMB int64 `json:"memory_mb"` + Disks []DiskConfig `json:"disks"` + Networks []NetworkConfig `json:"networks"` + CloudInit string `json:"cloud_init,omitempty"` // user-data script + Metadata map[string]string `json:"metadata,omitempty"` + CloudInitConfig *CloudInitConfig `json:"cloud_init_config,omitempty"` + ManagedVolumes []ManagedVolumeSpec `json:"managed_volumes,omitempty"` } // CloudInitConfig defines advanced cloud-init settings @@ -94,6 +97,32 @@ type CloudInitConfig struct { VendorData string `json:"vendor_data,omitempty"` } +// ManagedVolumeSpec describes a provider-backed volume request. +type ManagedVolumeSpec struct { + Name string `json:"name"` + Driver string `json:"driver"` + SizeGB int64 `json:"size_gb,omitempty"` + AccessMode string `json:"access_mode,omitempty"` + FSType string `json:"fs_type,omitempty"` + MountPath string `json:"mount_path,omitempty"` + ReadOnly bool `json:"read_only,omitempty"` + RetainPolicy string `json:"retain_policy,omitempty"` +} + +// WorkloadUsage stores the latest per-workload utilization sample. 
+type WorkloadUsage struct { + WorkloadID string `json:"workload_id,omitempty"` + Type string `json:"type,omitempty"` + CPUPercent float64 `json:"cpu_percent,omitempty"` + MemoryBytes int64 `json:"memory_bytes,omitempty"` + DiskReadBytes int64 `json:"disk_read_bytes,omitempty"` + DiskWriteBytes int64 `json:"disk_write_bytes,omitempty"` + NetRXBytes int64 `json:"net_rx_bytes,omitempty"` + NetTXBytes int64 `json:"net_tx_bytes,omitempty"` + CollectedAt time.Time `json:"collected_at,omitempty"` + Source string `json:"source,omitempty"` +} + // VolumeMount represents a volume mount type VolumeMount struct { HostPath string `json:"host_path"` diff --git a/sample.env b/sample.env new file mode 100644 index 0000000..c34ade4 --- /dev/null +++ b/sample.env @@ -0,0 +1,55 @@ +# Persys Compute Agent sample environment variables + +# gRPC server +PERSYS_GRPC_ADDR=0.0.0.0 +PERSYS_GRPC_PORT=50051 + +# Metrics server +PERSYS_METRICS_PORT=8089 + +# TLS/mTLS +PERSYS_TLS_ENABLED=true +PERSYS_TLS_CERT=/etc/persys/certs/agent/compute-agent.pem +PERSYS_TLS_KEY=/etc/persys/certs/agent/compute-agent-key.pem +PERSYS_TLS_CA=/etc/persys/certs/agent/ca.pem + +# Vault certificate manager +PERSYS_VAULT_ENABLED=true +PERSYS_VAULT_ADDR=http://vault:8200 +PERSYS_VAULT_AUTH_METHOD=approle +PERSYS_VAULT_TOKEN= +PERSYS_VAULT_APPROLE_ROLE_ID= +PERSYS_VAULT_APPROLE_SECRET_ID= +PERSYS_VAULT_PKI_MOUNT=pki +PERSYS_VAULT_PKI_ROLE=compute-agent +PERSYS_VAULT_CERT_TTL=24h +PERSYS_VAULT_RETRY_INTERVAL=2m +PERSYS_VAULT_SERVICE_NAME=compute-agent +PERSYS_VAULT_SERVICE_DOMAIN= + +# State store +PERSYS_STATE_PATH=/var/lib/persys/state.db + +# Runtime toggles +PERSYS_DOCKER_ENABLED=true +DOCKER_HOST=unix:///var/run/docker.sock +PERSYS_COMPOSE_ENABLED=true +PERSYS_COMPOSE_BINARY=docker compose +PERSYS_VM_ENABLED=true +PERSYS_LIBVIRT_URI=qemu:///system + +# Reconciliation +PERSYS_RECONCILE_ENABLED=true +PERSYS_RECONCILE_INTERVAL=30s + +# Scheduler control plane +PERSYS_SCHEDULER_ADDR=persys-scheduler:8085 
+PERSYS_SCHEDULER_INSECURE=false +PERSYS_AGENT_GRPC_ENDPOINT= + +# Logging and metadata +PERSYS_LOG_LEVEL=info +PERSYS_VERSION=dev +PERSYS_NODE_REGION= +PERSYS_NODE_ENV= +PERSYS_NODE_LABELS= diff --git a/test/e2e/agent_e2e_test.go b/test/e2e/agent_e2e_test.go index 71a2362..174807f 100644 --- a/test/e2e/agent_e2e_test.go +++ b/test/e2e/agent_e2e_test.go @@ -6,13 +6,13 @@ import ( "path/filepath" "testing" - cfg "github.com/persys/compute-agent/internal/config" - agentgrpc "github.com/persys/compute-agent/internal/grpc" - "github.com/persys/compute-agent/internal/runtime" - "github.com/persys/compute-agent/internal/state" - "github.com/persys/compute-agent/internal/workload" - pb "github.com/persys/compute-agent/pkg/api/v1" - "github.com/persys/compute-agent/pkg/models" + cfg "github.com/persys-dev/compute-agent/internal/config" + agentgrpc "github.com/persys-dev/compute-agent/internal/grpc" + "github.com/persys-dev/compute-agent/internal/runtime" + "github.com/persys-dev/compute-agent/internal/state" + "github.com/persys-dev/compute-agent/internal/workload" + pb "github.com/persys-dev/compute-agent/pkg/api/v1" + "github.com/persys-dev/compute-agent/pkg/models" "github.com/sirupsen/logrus" )