From d6265893a1b915ff4c65d0fc9d4960d1ed795c62 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 07:32:20 -0700 Subject: [PATCH 01/13] refactor(sender): configurable HTTP idle-conn pool with defaults 500/50 The default MaxIdleConnsPerHost of 10 caps the keep-alive pool far below the per-worker goroutine count set by settings.Workers, forcing TCP re-dial on most completions and exhausting ephemeral source ports under load. Bumping the defaults alone still leaves the pool static for larger runs, so parameterize the client via an options pattern: - newHttpClient(opts ...HttpClientOption) with defaults 500 / 50 - WithMaxIdleConns(n) overrides the global pool - WithMaxIdleConnsPerHost(n) overrides the per-host pool The sender Worker shares one client across settings.Workers goroutines targeting a single endpoint, so it sizes both to the worker count. Unit tests cover defaults, each option in isolation, and composition. Co-Authored-By: Claude Opus 4.7 (1M context) --- sender/worker.go | 52 +++++++++++++++++++++++++++++------------ sender/worker_test.go | 54 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 15 deletions(-) create mode 100644 sender/worker_test.go diff --git a/sender/worker.go b/sender/worker.go index d7a17fb..fc23211 100644 --- a/sender/worker.go +++ b/sender/worker.go @@ -39,21 +39,40 @@ type Worker struct { limiter *rate.Limiter // Shared rate limiter for transaction sending } -func newHttpClient() *http.Client { +// HttpClientOption configures the Transport used by newHttpClient. +type HttpClientOption func(*http.Transport) + +// WithMaxIdleConns overrides the global idle-connection pool size. +func WithMaxIdleConns(n int) HttpClientOption { + return func(t *http.Transport) { t.MaxIdleConns = n } +} + +// WithMaxIdleConnsPerHost overrides the per-host idle-connection pool size. 
+// Scale this with the goroutine count sharing the client to avoid forcing +// TCP re-dial (and ephemeral-port exhaustion) on most completions. +func WithMaxIdleConnsPerHost(n int) HttpClientOption { + return func(t *http.Transport) { t.MaxIdleConnsPerHost = n } +} + +func newHttpClient(opts ...HttpClientOption) *http.Client { + t := &http.Transport{ + DialContext: (&net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, + MaxIdleConns: 500, + MaxIdleConnsPerHost: 50, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + DisableKeepAlives: false, + } + for _, opt := range opts { + opt(t) + } return &http.Client{ - Timeout: 30 * time.Second, - Transport: &http.Transport{ - DialContext: (&net.Dialer{ - Timeout: 10 * time.Second, - KeepAlive: 30 * time.Second, - }).DialContext, - MaxIdleConns: 100, - MaxIdleConnsPerHost: 10, - IdleConnTimeout: 90 * time.Second, - TLSHandshakeTimeout: 10 * time.Second, - ExpectContinueTimeout: 1 * time.Second, - DisableKeepAlives: false, - }, + Timeout: 30 * time.Second, + Transport: t, } } @@ -83,7 +102,10 @@ func (w *Worker) SetStatsCollector(collector *stats.Collector, logger *stats.Log func (w *Worker) Run(ctx context.Context) error { return service.Run(ctx, func(ctx context.Context, s service.Scope) error { // Start multiple worker goroutines that share the same channel - client := newHttpClient() + client := newHttpClient( + WithMaxIdleConns(w.workers), + WithMaxIdleConnsPerHost(w.workers), + ) for range w.workers { s.Spawn(func() error { return w.processTransactions(ctx, client) }) } diff --git a/sender/worker_test.go b/sender/worker_test.go new file mode 100644 index 0000000..c5ec58d --- /dev/null +++ b/sender/worker_test.go @@ -0,0 +1,54 @@ +package sender + +import ( + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func transport(t *testing.T, c *http.Client) *http.Transport { + t.Helper() + 
tr, ok := c.Transport.(*http.Transport) + require.True(t, ok, "expected *http.Transport, got %T", c.Transport) + return tr +} + +func TestNewHttpClient_Defaults(t *testing.T) { + c := newHttpClient() + tr := transport(t, c) + + require.Equal(t, 500, tr.MaxIdleConns) + require.Equal(t, 50, tr.MaxIdleConnsPerHost) + require.Equal(t, 30*time.Second, c.Timeout) + require.Equal(t, 90*time.Second, tr.IdleConnTimeout) + require.False(t, tr.DisableKeepAlives) +} + +func TestNewHttpClient_WithMaxIdleConns(t *testing.T) { + c := newHttpClient(WithMaxIdleConns(2048)) + tr := transport(t, c) + + require.Equal(t, 2048, tr.MaxIdleConns) + require.Equal(t, 50, tr.MaxIdleConnsPerHost, "per-host default preserved") +} + +func TestNewHttpClient_WithMaxIdleConnsPerHost(t *testing.T) { + c := newHttpClient(WithMaxIdleConnsPerHost(1024)) + tr := transport(t, c) + + require.Equal(t, 1024, tr.MaxIdleConnsPerHost) + require.Equal(t, 500, tr.MaxIdleConns, "global default preserved") +} + +func TestNewHttpClient_MultipleOptions(t *testing.T) { + c := newHttpClient( + WithMaxIdleConns(4096), + WithMaxIdleConnsPerHost(1024), + ) + tr := transport(t, c) + + require.Equal(t, 4096, tr.MaxIdleConns) + require.Equal(t, 1024, tr.MaxIdleConnsPerHost) +} From bb74d4dcbd5b5ce7cae0cf0c00333fa0abd7f0f4 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 07:36:51 -0700 Subject: [PATCH 02/13] sender: drop implicit worker-derived override, rely on 500/50 defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review: pulling w.workers into the HTTP client at the call site is implicit tuning. Use the raised defaults (500 global / 50 per-host) for now; future follow-up can expose MaxIdleConns[PerHost] as explicit settings.* fields if callers need to tune beyond defaults. Options API is retained — WithMaxIdleConns / WithMaxIdleConnsPerHost remain available for any consumer that wants to opt in from a config path later. 
--- sender/worker.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sender/worker.go b/sender/worker.go index fc23211..99782cd 100644 --- a/sender/worker.go +++ b/sender/worker.go @@ -102,10 +102,7 @@ func (w *Worker) SetStatsCollector(collector *stats.Collector, logger *stats.Log func (w *Worker) Run(ctx context.Context) error { return service.Run(ctx, func(ctx context.Context, s service.Scope) error { // Start multiple worker goroutines that share the same channel - client := newHttpClient( - WithMaxIdleConns(w.workers), - WithMaxIdleConnsPerHost(w.workers), - ) + client := newHttpClient() for range w.workers { s.Spawn(func() error { return w.processTransactions(ctx, client) }) } From 1c9aae4638b90770f9491687770b8411e48767c4 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 10:24:49 -0700 Subject: [PATCH 03/13] feat(observability): OTel instrumentation for platform deployments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the design in sei-protocol/platform#128 (docs/designs/ sei-load-observability.md). Wires OTel run-scope via Resource (not per-sample labels), full tracing with exemplar support, W3C propagation for future cross-process stitching with seid, and a dual-exporter topology (Prometheus scrape + optional OTLP push). New package: observability - Setup(ctx, Config) installs MeterProvider + TracerProvider, Resource from SEILOAD_* env (RunID, ChainID, CommitID, CommitIDShort, Workload, InstanceID), Prometheus reader, optional OTLP gRPC exporter gated on OTEL_EXPORTER_OTLP_ENDPOINT, and the composite W3C TraceContext+Baggage propagator. - RunScopeFromEnv reads SEILOAD_* env vars. - service.instance.id defaults to hostname when SEILOAD_INSTANCE_ID unset — lets cluster-of-seiload deployments disambiguate instances sharing a RunID/ChainID via a stable per-pod attribute. 
- Meter(name) / Tracer(name) are thin wrappers around otel.Meter / otel.Tracer so callers can use sync.OnceValue(...) at package scope for lazy acquisition. Fixes the existing NoOp-capture bug where package-init var meter = otel.Meter(...) grabs the NoOp provider before main runs Setup, silently dropping emissions. Migrations: - sender/metrics.go and stats/metrics.go now build their instrument bundle via sync.OnceValue(...) — no more capture-at-init. - sender/worker.go wraps newHttpClient's Transport with otelhttp.NewTransport so outbound requests inject W3C traceparent and auto-emit http.client.* metrics. newHttpTransport split out so tests can inspect the unwrapped transport directly. - sender.sendTransaction, sender.waitForReceipt, sender.watchTransactions (Dial) now emit spans (sender.send_tx, sender.check_receipt, sender.dial_endpoint). Metrics recorded inside these spans automatically get trace-id exemplars under OTel's trace_based exemplar filter (default since SDK v1.28). - main.go wraps runLoadTest in a seiload.run root span; per-request spans inherit from it so a full run is a single trace. New metrics (registered; producers wired where applicable): - seiload.txs.accepted (counter) — per successful submission - seiload.txs.rejected (counter, reason label) — per failed submission - seiload.http.errors (counter, status_code label) — non-200 responses - seiload.tps.achieved (observable gauge) — registered; sample pump is a follow-up - seiload.run.tps.final (gauge) — emitted once at run end via Collector.EmitRunSummary - seiload.run.duration.seconds (gauge) — emitted once at run end - seiload.run.txs.accepted.total (gauge) — emitted once at run end Run-summary metrics are Gauges intentionally: single emission at run end produces exactly one series per metric per run after the Resource join, which is the right shape for cross-run comparison dashboards in the benchmarks namespace. 
Tests: - observability/setup_test.go covers Resource population from env, hostname fallback for service.instance.id, propagator install, endpoint scheme stripping, and short-commit derivation. - sender/worker_test.go split into TestNewHttpTransport_* (inspects the bare transport factory) and TestNewHttpClient_Smoke (verifies the otelhttp wrap is present without depending on its internals). Co-Authored-By: Claude Opus 4.7 (1M context) --- go.mod | 38 ++++-- go.sum | 84 ++++++++---- main.go | 60 +++++---- observability/meter.go | 31 +++++ observability/setup.go | 252 ++++++++++++++++++++++++++++++++++++ observability/setup_test.go | 152 ++++++++++++++++++++++ sender/metrics.go | 159 +++++++++++++++++------ sender/worker.go | 80 +++++++++++- sender/worker_test.go | 42 +++--- stats/block_collector.go | 6 +- stats/metrics.go | 77 +++++++---- stats/run_summary.go | 28 ++++ 12 files changed, 856 insertions(+), 153 deletions(-) create mode 100644 observability/meter.go create mode 100644 observability/setup.go create mode 100644 observability/setup_test.go create mode 100644 stats/run_summary.go diff --git a/go.mod b/go.mod index 6784e82..239d827 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/sei-protocol/sei-load -go 1.24.5 +go 1.25.0 require ( github.com/ethereum/go-ethereum v1.16.1 @@ -9,14 +9,19 @@ require ( github.com/prometheus/client_golang v1.22.0 github.com/spf13/cobra v1.9.1 github.com/spf13/viper v1.20.1 - github.com/stretchr/testify v1.10.0 - go.opentelemetry.io/otel v1.37.0 + github.com/stretchr/testify v1.11.1 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 + go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 go.opentelemetry.io/otel/exporters/prometheus v0.59.1 - go.opentelemetry.io/otel/metric v1.37.0 - go.opentelemetry.io/otel/sdk/metric v1.37.0 - golang.org/x/sync v0.16.0 + 
go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 + go.opentelemetry.io/otel/trace v1.43.0 + golang.org/x/sync v0.20.0 golang.org/x/time v0.9.0 - google.golang.org/protobuf v1.36.6 + google.golang.org/protobuf v1.36.11 ) require ( @@ -24,6 +29,7 @@ require ( github.com/StackExchange/wmi v1.2.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bits-and-blooms/bitset v1.22.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/consensys/gnark-crypto v0.18.0 // indirect github.com/crate-crypto/go-eth-kzg v1.3.0 // indirect @@ -33,6 +39,7 @@ require ( github.com/decred/dcrd/dcrec/secp256k1/v4 v4.4.0 // indirect github.com/ethereum/c-kzg-4844/v2 v2.1.0 // indirect github.com/ethereum/go-verkle v0.2.2 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.8.0 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect @@ -41,6 +48,7 @@ require ( github.com/google/uuid v1.6.0 // indirect github.com/gorilla/websocket v1.4.2 // indirect github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect github.com/holiman/uint256 v1.3.2 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect @@ -60,13 +68,17 @@ require ( github.com/supranational/blst v0.3.15 // indirect github.com/tklauser/go-sysconf v0.3.12 // indirect github.com/tklauser/numcpus v0.6.1 // indirect - go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/otel/sdk v1.37.0 // indirect - go.opentelemetry.io/otel/trace v1.37.0 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // 
indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/multierr v1.9.0 // indirect - golang.org/x/crypto v0.40.0 // indirect - golang.org/x/sys v0.34.0 // indirect - golang.org/x/text v0.27.0 // indirect + golang.org/x/crypto v0.49.0 // indirect + golang.org/x/net v0.52.0 // indirect + golang.org/x/sys v0.42.0 // indirect + golang.org/x/text v0.35.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/grpc v1.80.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 0852756..3b9c83b 100644 --- a/go.sum +++ b/go.sum @@ -10,6 +10,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/cp v0.1.0 h1:SE+dxFebS7Iik5LK0tsi1k9ZCxEaFX4AjQmoyA+1dJk= github.com/cespare/cp v0.1.0/go.mod h1:SOGHArjBr4JWaSDEVpWpo/hNg6RoKrls6Oh40hiwW+s= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -51,6 +53,8 @@ github.com/ethereum/go-ethereum v1.16.1 h1:7684NfKCb1+IChudzdKyZJ12l1Tq4ybPZOITi github.com/ethereum/go-ethereum v1.16.1/go.mod h1:ngYIvmMAYdo4sGW9cGzLvSsPGhDOOzL0jK5S5iXpj0g= github.com/ethereum/go-verkle v0.2.2 h1:I2W0WjnrFUIzzVPwm8ykY+7pL2d4VhlsePn4j7cnFk8= github.com/ethereum/go-verkle v0.2.2/go.mod h1:M3b90YRnzqKyyzBEWJGqj8Qff4IDeXnzFw0P9bFw3uk= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop 
v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/ferranbt/fastssz v0.1.2 h1:Dky6dXlngF6Qjc+EfDipAkE83N5I5DE68bY6O0VLNPk= github.com/ferranbt/fastssz v0.1.2/go.mod h1:X5UPrE2u1UJjxHA8X54u04SBwdAQjG2sFtWs39YxyWs= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= @@ -77,6 +81,8 @@ github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt/v4 v4.5.1 h1:JdqV9zKUdtaa9gdPlywC3aeoEsR681PlKC+4F5gQgeo= github.com/golang-jwt/jwt/v4 v4.5.1/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.5-0.20220116011046-fa5810519dcb h1:PBC98N2aIaM3XXiurYmW7fx4GZkL8feAMVq7nEjURHk= github.com/golang/snappy v0.0.5-0.20220116011046-fa5810519dcb/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= @@ -91,6 +97,8 @@ github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrR github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk= github.com/graph-gophers/graphql-go v1.3.0 h1:Eb9x/q6MFpCLz7jBCiP/WTxjSDrYLR1QY41SORZyNJ0= github.com/graph-gophers/graphql-go v1.3.0/go.mod h1:9CQHMSxwO4MprSdzoIEobiHpoLtHm77vfxsvsIN5Vuc= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= github.com/hashicorp/go-bexpr v0.1.10 h1:9kuI5PFotCboP3dkDYFr/wi0gg0QVbSNz5oFRpxn4uE= github.com/hashicorp/go-bexpr v0.1.10/go.mod h1:oxlubA2vC/gFVfX1A6JGp7ls7uCDlfJn732ehYYg+g0= github.com/holiman/billy 
v0.0.0-20240216141850-2abb0c79d3c4 h1:X4egAf/gcS1zATw6wn4Ej8vjuVGxeHdan+bRb2ebyv4= @@ -171,8 +179,8 @@ github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7D github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik= github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= @@ -196,8 +204,8 @@ github.com/spf13/viper v1.20.1 h1:ZMi+z/lvLyPSCoNtFCpqjy0S4kPbirhpTMwl8BkW9X4= github.com/spf13/viper v1.20.1/go.mod h1:P9Mdzt1zoHIG8m2eZQinpiBjo6kCmZSKBClNNqjJvu4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= github.com/supranational/blst v0.3.15 
h1:rd9viN6tfARE5wv3KZJ9H8e1cg0jXW8syFCcsbHa76o= @@ -212,44 +220,64 @@ github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w= github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= -go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= -go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= -go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 h1:TT4fX+nBOA/+LUkobKGW1ydGcn+G3vRw9+g5HwCphpk= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0/go.mod h1:L7UH0GbB0p47T4Rri3uHjbpCFYrVrwc1I25QhNPiGK8= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 h1:8UQVDcZxOJLtX6gxtDt3vY2WTgvZqMQRzjsqiIHQdkc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0/go.mod h1:2lmweYCiHYpEjQ/lSJBYhj9jP1zvCvQW4BqL9dnT7FQ= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc= 
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk= go.opentelemetry.io/otel/exporters/prometheus v0.59.1 h1:HcpSkTkJbggT8bjYP+BjyqPWlD17BH9C5CYNKeDzmcA= go.opentelemetry.io/otel/exporters/prometheus v0.59.1/go.mod h1:0FJL+gjuUoM07xzik3KPBaN+nz/CoB15kV6WLMiXZag= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= -go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= -go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= -go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= -go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod 
h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= -golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= -golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= +golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= +golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df h1:UA2aFVmmsIlefxMk29Dp2juaUSth8Pyn3Tq5Y5mJGME= golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= -golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= -golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod 
h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/main.go b/main.go index cf61350..f1df1ca 100644 --- a/main.go +++ b/main.go @@ -16,12 +16,11 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/spf13/cobra" "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/exporters/prometheus" - "go.opentelemetry.io/otel/sdk/metric" "golang.org/x/time/rate" "github.com/sei-protocol/sei-load/config" "github.com/sei-protocol/sei-load/generator" + "github.com/sei-protocol/sei-load/observability" "github.com/sei-protocol/sei-load/sender" "github.com/sei-protocol/sei-load/stats" "github.com/sei-protocol/sei-load/utils" @@ -147,9 +146,40 @@ func runLoadTest(ctx context.Context, cmd *cobra.Command, args []string) error { listenAddr := cmd.Flag("metricsListenAddr").Value.String() log.Printf("serving metrics at %s/metrics", listenAddr) - if err := exportPrometheusMetrics(ctx, listenAddr); err != nil { - return err + // Install OTel providers, Resource (from SEILOAD_* env), Prometheus reader, + // optional OTLP exporter (gated on OTEL_EXPORTER_OTLP_ENDPOINT), and + // composite W3C propagator. See docs/designs/sei-load-observability.md. 
+ obsShutdown, err := observability.Setup(ctx, observability.Config{ + RunScope: observability.RunScopeFromEnv(), + OTLPEndpoint: os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT"), + }) + if err != nil { + return fmt.Errorf("observability setup: %w", err) } + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := obsShutdown(shutdownCtx); err != nil { + log.Printf("observability shutdown: %v", err) + } + }() + + // Serve /metrics for the Prometheus PodMonitor scrape. The exporter + // registered by observability.Setup is attached to the default + // prometheus.DefaultGatherer via the SDK's prometheus reader. + go func() { + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.Handler()) + if err := http.ListenAndServe(listenAddr, mux); err != nil { + log.Printf("failed to serve metrics: %v", err) + } + }() + + // Top-level run span. Wraps the entire load-test invocation so per-request + // spans in the sender package inherit this parent and Grafana's trace view + // shows the full run as a single timeline. + ctx, runSpan := otel.Tracer("github.com/sei-protocol/sei-load").Start(ctx, "seiload.run") + defer runSpan.End() // Create statistics collector and logger collector := stats.NewCollector() @@ -315,28 +345,14 @@ func runLoadTest(ctx context.Context, cmd *cobra.Command, args []string) error { if settings.RampUp && ramper != nil { ramper.LogFinalStats() } + // Emit run-summary gauges (one series per metric per run via Resource + // join) so benchmark dashboards and the AutobakeRunFailed alert have a + // durable record of this run even if the next scrape misses final state. 
+ collector.EmitRunSummary(ctx) log.Printf("👋 Shutdown complete") return err } -func exportPrometheusMetrics(ctx context.Context, listenAddr string) error { - metricsExporter, err := prometheus.New(prometheus.WithNamespace("seiload")) - if err != nil { - return fmt.Errorf("failed to create Prometheus exporter: %w", err) - } - otel.SetMeterProvider(metric.NewMeterProvider(metric.WithReader(metricsExporter))) - go func() { - defer func() { _ = metricsExporter.Shutdown(ctx) }() - http.Handle("/metrics", promhttp.Handler()) - err := http.ListenAndServe(listenAddr, nil) - if err != nil { - log.Printf("failed to serve metrics: %v", err) - return - } - }() - return nil -} - // loadConfig reads and parses the configuration file func loadConfig(filename string) (*config.LoadConfig, error) { data, err := os.ReadFile(filename) diff --git a/observability/meter.go b/observability/meter.go new file mode 100644 index 0000000..aa6da68 --- /dev/null +++ b/observability/meter.go @@ -0,0 +1,31 @@ +package observability + +import ( + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// Meter returns a Meter for the given instrumentation name. The contract is +// identical to otel.Meter; the indirection exists so call sites can wrap with +// sync.OnceValue at package scope for lazy acquisition without capturing the +// NoOp provider at init time: +// +// var senderMeter = sync.OnceValue(func() metric.Meter { +// return observability.Meter("github.com/sei-protocol/sei-load/sender") +// }) +// +// // inside functions: senderMeter().Float64Histogram(...) +// +// Calling observability.Meter (or otel.Meter) at package init is a bug — the +// SDK setup hasn't run yet. The sync.OnceValue wrapper defers acquisition +// until the first call site, by which time main has invoked Setup. +func Meter(name string) metric.Meter { + return otel.Meter(name) +} + +// Tracer returns a Tracer for the given instrumentation name. 
Same lazy- +// acquisition contract as Meter above. +func Tracer(name string) trace.Tracer { + return otel.Tracer(name) +} diff --git a/observability/setup.go b/observability/setup.go new file mode 100644 index 0000000..468167c --- /dev/null +++ b/observability/setup.go @@ -0,0 +1,252 @@ +// Package observability configures OpenTelemetry for sei-load. +// +// Setup wires a MeterProvider + TracerProvider backed by a Resource populated +// from the SEILOAD_* run-scope env vars, exports metrics via a Prometheus +// reader (always) and an OTLP gRPC exporter (when OTEL_EXPORTER_OTLP_ENDPOINT +// is set), and installs the composite TraceContext+Baggage propagator for +// W3C context propagation on outbound HTTP. +// +// See docs/designs/sei-load-observability.md (in the platform repo) for the +// design rationale — run scope rides as Resource attributes (never per-sample +// labels), exemplars are enabled via trace_based filter by default, and the +// package supports future cluster-of-seiload deployments via service.instance.id. +package observability + +import ( + "context" + "errors" + "fmt" + "os" + "strings" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/prometheus" + "go.opentelemetry.io/otel/propagation" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" +) + +// RunScope is the set of attributes that identify a single sei-load process +// invocation. The values travel on the OTel Resource (one series per process) +// rather than on per-sample metric labels — see the design doc for the +// cardinality rationale. 
+type RunScope struct { + // ServiceVersion is sei-load's own build version, separate from CommitID + // below (which names the commit under test). Built via -ldflags or pulled + // from runtime/debug.ReadBuildInfo. Optional. + ServiceVersion string + + // RunID uniquely identifies a single run (e.g., GHA run id for autobake, + // benchmark job id for the benchmarks namespace). + RunID string + + // ChainID names the ephemeral test chain for this run (e.g. + // "autobake-24702101866-1"). + ChainID string + + // CommitID is the full sei-chain commit SHA being exercised. Lives on the + // Resource as seiload.commit_id and (truncated to 8 chars) as + // seiload.commit_id_short for label-friendly uses. + CommitID string + + // Workload names the invocation's purpose: "autobake", "benchmark", + // "loadtest". Used by alert rules to scope matchers. + Workload string + + // InstanceID uniquely identifies a single sei-load process within a run — + // typically the pod name via the downward API. Populated for cluster-of- + // seiload deployments where multiple pods share a RunID/ChainID but each + // emits its own metrics stream. Exported as service.instance.id + // (OTel semconv) so dashboards can aggregate or disaggregate naturally. + InstanceID string +} + +// RunScopeFromEnv reads SEILOAD_RUN_ID / SEILOAD_CHAIN_ID / SEILOAD_COMMIT_ID +// / SEILOAD_WORKLOAD / SEILOAD_INSTANCE_ID and OTEL_SERVICE_VERSION. Missing +// values remain empty strings — the caller decides whether that's fatal. +func RunScopeFromEnv() RunScope { + return RunScope{ + ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"), + RunID: os.Getenv("SEILOAD_RUN_ID"), + ChainID: os.Getenv("SEILOAD_CHAIN_ID"), + CommitID: os.Getenv("SEILOAD_COMMIT_ID"), + Workload: os.Getenv("SEILOAD_WORKLOAD"), + InstanceID: os.Getenv("SEILOAD_INSTANCE_ID"), + } +} + +// shortCommit returns the first 8 chars of a commit SHA for label-friendly +// use (Alertmanager grouping, dashboard variable values). Empty in / empty out. 
+func shortCommit(full string) string { + if len(full) <= 8 { + return full + } + return full[:8] +} + +// buildResource composes the OTel Resource from the run scope plus standard +// service identity. service.instance.id defaults to the hostname when +// InstanceID is empty — this keeps single-pod deployments behaving sensibly +// without requiring explicit wiring. +func buildResource(rs RunScope) (*resource.Resource, error) { + instanceID := rs.InstanceID + if instanceID == "" { + host, err := os.Hostname() + if err == nil { + instanceID = host + } + } + + attrs := []attribute.KeyValue{ + semconv.ServiceName("seiload"), + semconv.ServiceInstanceID(instanceID), + } + if rs.ServiceVersion != "" { + attrs = append(attrs, semconv.ServiceVersion(rs.ServiceVersion)) + } + if rs.RunID != "" { + attrs = append(attrs, attribute.String("seiload.run_id", rs.RunID)) + } + if rs.ChainID != "" { + attrs = append(attrs, attribute.String("seiload.chain_id", rs.ChainID)) + } + if rs.CommitID != "" { + attrs = append(attrs, + attribute.String("seiload.commit_id", rs.CommitID), + attribute.String("seiload.commit_id_short", shortCommit(rs.CommitID)), + ) + } + if rs.Workload != "" { + attrs = append(attrs, attribute.String("seiload.workload", rs.Workload)) + } + + // NewSchemaless lets us merge with resource.Default() (which tracks its + // own schema URL) without the "conflicting Schema URL" error that occurs + // when both sides declare a schema. App-specific attributes are schema- + // agnostic; the SDK Default covers the OS/process identity schema. + return resource.Merge( + resource.Default(), + resource.NewSchemaless(attrs...), + ) +} + +// Config is the knob set for Setup. Zero-value Config is usable: it produces a +// Prometheus-only metrics pipeline, no OTLP, no tracing exporter. Populate +// PrometheusNamespace to set the exporter's metric name prefix. +type Config struct { + // RunScope populates the Resource. Typically built via RunScopeFromEnv(). 
+ RunScope RunScope + + // PrometheusNamespace prefixes all exported metric names. Defaults to + // "seiload" when empty — matches the existing convention. + PrometheusNamespace string + + // OTLPEndpoint, when non-empty, activates the OTLP gRPC exporter for + // BOTH metrics and traces, wired to this endpoint. Use the standard + // OTEL_EXPORTER_OTLP_ENDPOINT env var to populate this — see Setup. + OTLPEndpoint string +} + +// Setup installs the global MeterProvider and TracerProvider, wires the +// Prometheus reader (always) and the OTLP gRPC exporter (when OTLPEndpoint is +// non-empty), attaches the composite W3C TraceContext+Baggage propagator, and +// returns a shutdown function that flushes and stops all providers. +// +// Call once from main, before any code path that emits telemetry. The +// Meter/Tracer accessors in this package respect the configured providers; +// code that still uses otel.Meter / otel.Tracer at package init will silently +// capture the NoOp provider and drop its emissions — migrate those sites to +// the lazy accessors (see observability.Meter / observability.Tracer). 
+func Setup(ctx context.Context, cfg Config) (shutdown func(context.Context) error, err error) { + res, err := buildResource(cfg.RunScope) + if err != nil { + return nil, fmt.Errorf("build resource: %w", err) + } + + ns := cfg.PrometheusNamespace + if ns == "" { + ns = "seiload" + } + promExporter, err := prometheus.New(prometheus.WithNamespace(ns)) + if err != nil { + return nil, fmt.Errorf("prometheus exporter: %w", err) + } + + meterOpts := []sdkmetric.Option{ + sdkmetric.WithResource(res), + sdkmetric.WithReader(promExporter), + } + tracerOpts := []sdktrace.TracerProviderOption{ + sdktrace.WithResource(res), + } + + shutdowns := []func(context.Context) error{ + promExporter.Shutdown, + } + + if cfg.OTLPEndpoint != "" { + metricExp, mErr := otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithEndpoint(stripScheme(cfg.OTLPEndpoint)), + otlpmetricgrpc.WithInsecure(), + ) + if mErr != nil { + return nil, fmt.Errorf("otlp metric exporter: %w", mErr) + } + meterOpts = append(meterOpts, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(metricExp))) + shutdowns = append(shutdowns, metricExp.Shutdown) + + traceExp, tErr := otlptracegrpc.New(ctx, + otlptracegrpc.WithEndpoint(stripScheme(cfg.OTLPEndpoint)), + otlptracegrpc.WithInsecure(), + ) + if tErr != nil { + return nil, fmt.Errorf("otlp trace exporter: %w", tErr) + } + tracerOpts = append(tracerOpts, sdktrace.WithBatcher(traceExp)) + shutdowns = append(shutdowns, traceExp.Shutdown) + } + + mp := sdkmetric.NewMeterProvider(meterOpts...) + otel.SetMeterProvider(mp) + shutdowns = append(shutdowns, mp.Shutdown) + + tp := sdktrace.NewTracerProvider(tracerOpts...) + otel.SetTracerProvider(tp) + shutdowns = append(shutdowns, tp.Shutdown) + + // Composite propagator: traceparent for trace context, baggage for + // arbitrary cross-process K/V. Without this, otelhttp will create spans + // but NOT inject traceparent on outbound requests — silent propagation + // failure, a well-known OTel footgun. 
+ otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + + return func(shutdownCtx context.Context) error { + var errs []error + for _, fn := range shutdowns { + if sErr := fn(shutdownCtx); sErr != nil { + errs = append(errs, sErr) + } + } + return errors.Join(errs...) + }, nil +} + +// stripScheme removes a leading scheme like "http://" or "grpc://" from an +// endpoint — otlpmetricgrpc.WithEndpoint expects host:port without a scheme. +func stripScheme(endpoint string) string { + for _, prefix := range []string{"http://", "https://", "grpc://", "dns:///"} { + if strings.HasPrefix(endpoint, prefix) { + return strings.TrimPrefix(endpoint, prefix) + } + } + return endpoint +} + diff --git a/observability/setup_test.go b/observability/setup_test.go new file mode 100644 index 0000000..aae6199 --- /dev/null +++ b/observability/setup_test.go @@ -0,0 +1,152 @@ +package observability + +import ( + "context" + "os" + "strings" + "testing" + + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/propagation" +) + +func TestShortCommit(t *testing.T) { + require.Equal(t, "", shortCommit("")) + require.Equal(t, "abc", shortCommit("abc")) + require.Equal(t, "abcdef12", shortCommit("abcdef1234567890")) + require.Equal(t, "12345678", shortCommit("12345678")) +} + +func TestRunScopeFromEnv(t *testing.T) { + t.Setenv("SEILOAD_RUN_ID", "run-42") + t.Setenv("SEILOAD_CHAIN_ID", "autobake-42-1") + t.Setenv("SEILOAD_COMMIT_ID", "deadbeefcafef00d") + t.Setenv("SEILOAD_WORKLOAD", "autobake") + t.Setenv("SEILOAD_INSTANCE_ID", "seiload-abc-0") + t.Setenv("OTEL_SERVICE_VERSION", "v1.2.3") + + rs := RunScopeFromEnv() + require.Equal(t, "run-42", rs.RunID) + require.Equal(t, "autobake-42-1", rs.ChainID) + require.Equal(t, "deadbeefcafef00d", rs.CommitID) + require.Equal(t, "autobake", rs.Workload) + require.Equal(t, "seiload-abc-0", 
rs.InstanceID) + require.Equal(t, "v1.2.3", rs.ServiceVersion) +} + +func TestBuildResource_FullScope(t *testing.T) { + rs := RunScope{ + ServiceVersion: "v1.2.3", + RunID: "run-42", + ChainID: "autobake-42-1", + CommitID: "deadbeefcafef00d", + Workload: "autobake", + InstanceID: "seiload-abc-0", + } + res, err := buildResource(rs) + require.NoError(t, err) + + want := map[string]string{ + "service.name": "seiload", + "service.version": "v1.2.3", + "service.instance.id": "seiload-abc-0", + "seiload.run_id": "run-42", + "seiload.chain_id": "autobake-42-1", + "seiload.commit_id": "deadbeefcafef00d", + "seiload.commit_id_short": "deadbeef", + "seiload.workload": "autobake", + } + got := resourceAttrs(res.Attributes()) + for k, v := range want { + require.Equal(t, v, got[k], "attr %q", k) + } +} + +func TestBuildResource_EmptyScope(t *testing.T) { + // Empty RunScope should still produce a Resource with service.name. + // service.instance.id falls back to os.Hostname(); just ensure it's not + // empty so downstream doesn't see a missing label. + res, err := buildResource(RunScope{}) + require.NoError(t, err) + got := resourceAttrs(res.Attributes()) + require.Equal(t, "seiload", got["service.name"]) + require.NotEmpty(t, got["service.instance.id"], "service.instance.id should fall back to hostname") + require.Empty(t, got["seiload.run_id"]) + require.Empty(t, got["seiload.commit_id_short"]) +} + +func TestSetup_PrometheusOnly(t *testing.T) { + // Ensure OTLP is NOT activated when endpoint is empty. + t.Setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "") // belt-and-suspenders + + ctx := context.Background() + shutdown, err := Setup(ctx, Config{ + RunScope: RunScope{RunID: "test-run"}, + PrometheusNamespace: "seiload_test", + }) + require.NoError(t, err) + t.Cleanup(func() { _ = shutdown(ctx) }) + + // Global providers should be populated (not NoOp). 
+ require.NotNil(t, otel.GetMeterProvider()) + require.NotNil(t, otel.GetTracerProvider()) + + // Propagator should include TraceContext + Baggage fields. + fields := otel.GetTextMapPropagator().Fields() + require.Contains(t, fields, "traceparent", "traceparent field must be present for W3C trace propagation") + require.Contains(t, fields, "baggage", "baggage field must be present for cross-process attribute propagation") +} + +func TestSetup_PropagatorInstalled(t *testing.T) { + // After Setup, the global propagator should be a CompositeTextMapPropagator + // with TraceContext + Baggage — without this, otelhttp creates spans but + // never injects traceparent, and cross-process traces silently break. + ctx := context.Background() + shutdown, err := Setup(ctx, Config{}) + require.NoError(t, err) + t.Cleanup(func() { _ = shutdown(ctx) }) + + p := otel.GetTextMapPropagator() + require.NotNil(t, p) + + // TraceContext registers "traceparent" + "tracestate"; Baggage registers "baggage". + fields := p.Fields() + require.Contains(t, fields, "traceparent") + require.Contains(t, fields, "tracestate") + require.Contains(t, fields, "baggage") + + // Sanity check with a sample carrier: injection shouldn't panic. + p.Inject(ctx, propagation.MapCarrier{}) +} + +func TestStripScheme(t *testing.T) { + cases := map[string]string{ + "": "", + "otel-collector:4317": "otel-collector:4317", + "http://host:4317": "host:4317", + "https://host:4317": "host:4317", + "grpc://host:4317": "host:4317", + "dns:///host:4317": "host:4317", + "host:4317/path": "host:4317/path", + "grpc://already/stripped": "already/stripped", + } + for in, want := range cases { + require.Equal(t, want, stripScheme(in), "input=%q", in) + } +} + +// resourceAttrs flattens a Resource's attribute slice into a map for test +// assertions. 
+func resourceAttrs(kvs []attribute.KeyValue) map[string]string { + out := make(map[string]string, len(kvs)) + for _, kv := range kvs { + out[string(kv.Key)] = kv.Value.Emit() + } + return out +} + +// ignore unused-import warnings in case of refactor churn +var _ = os.Getenv +var _ = strings.TrimSpace diff --git a/sender/metrics.go b/sender/metrics.go index 268faa0..bc5a72b 100644 --- a/sender/metrics.go +++ b/sender/metrics.go @@ -4,51 +4,94 @@ import ( "context" "sync" - "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" + + "github.com/sei-protocol/sei-load/observability" ) -var ( - meter = otel.Meter("seiload/sender") +// metricsBundle holds every instrument owned by this package. Acquired lazily +// on first access (see senderMetrics) so package init order can't capture the +// NoOp meter before observability.Setup has installed the real MeterProvider. +type metricsBundle struct { + // --- existing instruments --- + sendLatency metric.Float64Histogram + receiptLatency metric.Float64Histogram + workerQueueLength metric.Int64ObservableGauge - meteredChainWorkers = &chainWorkerObserver{ - workers: make(map[chainWorkerID]*Worker), - } + // --- new instruments (per sei-load-observability design) --- + // tpsAchieved is a gauge updated periodically by the stats package; it + // reflects the sender's most recent TPS sample per endpoint/scenario. + tpsAchieved metric.Float64ObservableGauge + // httpErrors counts failed HTTP request attempts by status code. Omitted + // for non-HTTP errors (e.g., DNS); those land in txsRejected. + httpErrors metric.Int64Counter + // txsAccepted counts successfully-submitted transactions. + txsAccepted metric.Int64Counter + // txsRejected counts transactions rejected by the target (or the local + // client). Reason attribute narrows the failure mode. 
+ txsRejected metric.Int64Counter +} - metrics = struct { - sendLatency metric.Float64Histogram - receiptLatency metric.Float64Histogram - workerQueueLength metric.Int64ObservableGauge - }{ - sendLatency: must(meter.Float64Histogram( - "send_latency", - metric.WithDescription("Latency of sending transactions in seconds"), - metric.WithUnit("s"), - metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0))), - receiptLatency: must(meter.Float64Histogram( - "receipt_latency", - metric.WithDescription("Latency of sending transactions in seconds"), - metric.WithUnit("s"), - metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0))), - workerQueueLength: must(meter.Int64ObservableGauge( - "worker_queue_length", - metric.WithDescription("Length of the worker's queue"), - metric.WithUnit("{count}"), - metric.WithInt64Callback(func(ctx context.Context, observer metric.Int64Observer) error { - meteredChainWorkers.lock.RLock() - defer meteredChainWorkers.lock.RUnlock() - for _, worker := range meteredChainWorkers.workers { - observer.Observe(int64(worker.GetChannelLength()), metric.WithAttributes( - attribute.String("endpoint", worker.GetEndpoint()), - attribute.Int("worker_id", worker.id), - attribute.String("chain_id", worker.seiChainID), - )) - } - return nil - }))), - } -) +var senderMetrics = sync.OnceValue(func() *metricsBundle { + m := observability.Meter("github.com/sei-protocol/sei-load/sender") + b := &metricsBundle{} + + latencyBoundaries := []float64{0.1, 0.2, 0.3, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0} + b.sendLatency = must(m.Float64Histogram( + "send_latency", + metric.WithDescription("Latency of sending transactions in seconds"), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(latencyBoundaries...))) + b.receiptLatency = must(m.Float64Histogram( + "receipt_latency", + metric.WithDescription("Latency of sending transactions in seconds"), + metric.WithUnit("s"), + 
metric.WithExplicitBucketBoundaries(latencyBoundaries...))) + b.workerQueueLength = must(m.Int64ObservableGauge( + "worker_queue_length", + metric.WithDescription("Length of the worker's queue"), + metric.WithUnit("{count}"), + metric.WithInt64Callback(func(ctx context.Context, observer metric.Int64Observer) error { + meteredChainWorkers.lock.RLock() + defer meteredChainWorkers.lock.RUnlock() + for _, worker := range meteredChainWorkers.workers { + observer.Observe(int64(worker.GetChannelLength()), metric.WithAttributes( + attribute.String("endpoint", worker.GetEndpoint()), + attribute.Int("worker_id", worker.id), + attribute.String("chain_id", worker.seiChainID), + )) + } + return nil + }))) + + b.tpsAchieved = must(m.Float64ObservableGauge( + "tps_achieved", + metric.WithDescription("Most recent TPS sample observed by the sender, per endpoint/scenario"), + metric.WithUnit("{transactions}/s"), + metric.WithFloat64Callback(observeTPS))) + b.httpErrors = must(m.Int64Counter( + "http_errors", + metric.WithDescription("HTTP error responses from the target endpoint, by status code"), + metric.WithUnit("{errors}"))) + b.txsAccepted = must(m.Int64Counter( + "txs_accepted", + metric.WithDescription("Transactions successfully submitted to an endpoint"), + metric.WithUnit("{transactions}"))) + b.txsRejected = must(m.Int64Counter( + "txs_rejected", + metric.WithDescription("Transactions rejected by the target or local client, by reason"), + metric.WithUnit("{transactions}"))) + + return b +}) + +// meteredChainWorkers tracks Workers for the queue-length observable. Kept +// as a package-level value because the observable callback needs a stable +// reference; Worker registration is lock-synchronized. 
+var meteredChainWorkers = &chainWorkerObserver{ + workers: make(map[chainWorkerID]*Worker), +} type chainWorkerObserver struct { lock sync.RWMutex @@ -71,6 +114,44 @@ func meterWorkerQueueLength(worker *Worker) { } } +// tpsObserverRegistry holds a per-(endpoint,chain_id,scenario) sample that +// Worker.processTransactions updates; the observable gauge reads it on each +// scrape. Writes are under write-lock; reads inside the callback are under +// read-lock. +var tpsObserverRegistry = struct { + lock sync.RWMutex + samples map[tpsSampleKey]float64 +}{ + samples: make(map[tpsSampleKey]float64), +} + +type tpsSampleKey struct { + endpoint string + chainID string + scenario string +} + +// RecordTPSSample sets the latest TPS sample for a given (endpoint, chain_id, +// scenario) triple. Called from the sender as it rolls its TPS window. +func RecordTPSSample(endpoint, chainID, scenario string, tps float64) { + tpsObserverRegistry.lock.Lock() + defer tpsObserverRegistry.lock.Unlock() + tpsObserverRegistry.samples[tpsSampleKey{endpoint, chainID, scenario}] = tps +} + +func observeTPS(_ context.Context, observer metric.Float64Observer) error { + tpsObserverRegistry.lock.RLock() + defer tpsObserverRegistry.lock.RUnlock() + for k, v := range tpsObserverRegistry.samples { + observer.Observe(v, metric.WithAttributes( + attribute.String("endpoint", k.endpoint), + attribute.String("chain_id", k.chainID), + attribute.String("scenario", k.scenario), + )) + } + return nil +} + func statusAttrFromError(err error) attribute.KeyValue { const key = "status" if err == nil { diff --git a/sender/worker.go b/sender/worker.go index 99782cd..8e237f4 100644 --- a/sender/worker.go +++ b/sender/worker.go @@ -9,20 +9,28 @@ import ( "log" "net" "net/http" + "sync" "time" "github.com/ethereum/go-ethereum" "github.com/ethereum/go-ethereum/ethclient" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" + 
"go.opentelemetry.io/otel/trace" "golang.org/x/time/rate" + "github.com/sei-protocol/sei-load/observability" "github.com/sei-protocol/sei-load/stats" "github.com/sei-protocol/sei-load/types" "github.com/sei-protocol/sei-load/utils" "github.com/sei-protocol/sei-load/utils/service" ) +var senderTracer = sync.OnceValue(func() trace.Tracer { + return observability.Tracer("github.com/sei-protocol/sei-load/sender") +}) + // Worker handles sending transactions to a specific endpoint type Worker struct { id int @@ -54,7 +62,11 @@ func WithMaxIdleConnsPerHost(n int) HttpClientOption { return func(t *http.Transport) { t.MaxIdleConnsPerHost = n } } -func newHttpClient(opts ...HttpClientOption) *http.Client { +// newHttpTransport builds the tunable base *http.Transport. Split out from +// newHttpClient so tests can inspect the transport's fields directly — +// newHttpClient returns the transport already wrapped in otelhttp, whose +// inner transport isn't publicly accessible. +func newHttpTransport(opts ...HttpClientOption) *http.Transport { t := &http.Transport{ DialContext: (&net.Dialer{ Timeout: 10 * time.Second, @@ -70,9 +82,18 @@ func newHttpClient(opts ...HttpClientOption) *http.Client { for _, opt := range opts { opt(t) } + return t +} + +// newHttpClient returns a client whose Transport is wrapped by otelhttp so +// outbound requests inject W3C traceparent + baggage and emit the standard +// http.client.* metrics. Requires the global TextMapPropagator installed by +// observability.Setup — otherwise spans are created but traceparent is not +// injected (silent propagation failure). 
+func newHttpClient(opts ...HttpClientOption) *http.Client { return &http.Client{ Timeout: 30 * time.Second, - Transport: t, + Transport: otelhttp.NewTransport(newHttpTransport(opts...)), } } @@ -134,10 +155,18 @@ func (w *Worker) watchTransactions(ctx context.Context) error { if w.dryRun || !w.trackReceipts { return nil } + _, dialSpan := senderTracer().Start(ctx, "sender.dial_endpoint", trace.WithAttributes( + attribute.String("seiload.endpoint", w.endpoint), + attribute.String("seiload.chain_id", w.seiChainID), + attribute.Int("seiload.worker_id", w.id), + )) eth, err := ethclient.Dial(w.endpoint) if err != nil { + dialSpan.RecordError(err) + dialSpan.End() return fmt.Errorf("ethclient.Dial(%q): %w", w.endpoint, err) } + dialSpan.End() for ctx.Err() == nil { tx, err := utils.Recv(ctx, w.sentTxs) if err != nil { @@ -153,8 +182,20 @@ func (w *Worker) watchTransactions(ctx context.Context) error { } func (w *Worker) waitForReceipt(ctx context.Context, eth *ethclient.Client, tx *types.LoadTx) (_err error) { + ctx, span := senderTracer().Start(ctx, "sender.check_receipt", trace.WithAttributes( + attribute.String("seiload.scenario", tx.Scenario.Name), + attribute.String("seiload.endpoint", w.endpoint), + attribute.Int("seiload.worker_id", w.id), + attribute.String("seiload.chain_id", w.seiChainID), + )) defer func(start time.Time) { - metrics.receiptLatency.Record(ctx, time.Since(start).Seconds(), + if _err != nil { + span.RecordError(_err) + } + span.End() + // Recording inside the span context makes _err's recorded sample + // eligible for an exemplar linking to this trace. 
+ senderMetrics().receiptLatency.Record(ctx, time.Since(start).Seconds(), metric.WithAttributes( attribute.String("scenario", tx.Scenario.Name), attribute.String("endpoint", w.endpoint), @@ -219,8 +260,20 @@ func (w *Worker) processTransactions(ctx context.Context, client *http.Client) e // sendTransaction sends a single transaction to the endpoint func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *types.LoadTx) (_err error) { + ctx, span := senderTracer().Start(ctx, "sender.send_tx", trace.WithAttributes( + attribute.String("seiload.scenario", tx.Scenario.Name), + attribute.String("seiload.endpoint", w.endpoint), + attribute.Int("seiload.worker_id", w.id), + attribute.String("seiload.chain_id", w.seiChainID), + )) defer func(start time.Time) { - metrics.sendLatency.Record(ctx, time.Since(start).Seconds(), + if _err != nil { + span.RecordError(_err) + } + span.End() + // Recording inside the span context makes samples eligible for + // exemplars linking the histogram bucket to this trace. 
+ senderMetrics().sendLatency.Record(ctx, time.Since(start).Seconds(), metric.WithAttributes( attribute.String("scenario", tx.Scenario.Name), attribute.String("endpoint", w.endpoint), @@ -248,6 +301,11 @@ func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *t // Send the request resp, err := client.Do(req) if err != nil { + senderMetrics().txsRejected.Add(ctx, 1, metric.WithAttributes( + attribute.String("endpoint", w.endpoint), + attribute.String("scenario", tx.Scenario.Name), + attribute.String("reason", "transport"), + )) return fmt.Errorf("Worker %d: Failed to send transaction: %w", w.id, err) } defer func() { @@ -266,9 +324,23 @@ func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *t // Check response status if resp.StatusCode != http.StatusOK { + senderMetrics().httpErrors.Add(ctx, 1, metric.WithAttributes( + attribute.Int("status_code", resp.StatusCode), + attribute.String("endpoint", w.endpoint), + )) + senderMetrics().txsRejected.Add(ctx, 1, metric.WithAttributes( + attribute.String("endpoint", w.endpoint), + attribute.String("scenario", tx.Scenario.Name), + attribute.String("reason", "http_status"), + )) return fmt.Errorf("Worker %d: HTTP error %d for transaction to %s", w.id, resp.StatusCode, w.endpoint) } + senderMetrics().txsAccepted.Add(ctx, 1, metric.WithAttributes( + attribute.String("endpoint", w.endpoint), + attribute.String("scenario", tx.Scenario.Name), + )) + // Write to sentTxs channel without blocking select { case w.sentTxs <- tx: diff --git a/sender/worker_test.go b/sender/worker_test.go index c5ec58d..60a252b 100644 --- a/sender/worker_test.go +++ b/sender/worker_test.go @@ -8,47 +8,51 @@ import ( "github.com/stretchr/testify/require" ) -func transport(t *testing.T, c *http.Client) *http.Transport { - t.Helper() - tr, ok := c.Transport.(*http.Transport) - require.True(t, ok, "expected *http.Transport, got %T", c.Transport) - return tr -} +// Tests target newHttpTransport (the inner 
*http.Transport factory) directly +// so they stay independent of the otelhttp wrapping that newHttpClient adds. +// The otelhttp wrap is validated separately in integration contexts because +// otelhttp.Transport does not expose its wrapped RoundTripper. -func TestNewHttpClient_Defaults(t *testing.T) { - c := newHttpClient() - tr := transport(t, c) +func TestNewHttpTransport_Defaults(t *testing.T) { + tr := newHttpTransport() require.Equal(t, 500, tr.MaxIdleConns) require.Equal(t, 50, tr.MaxIdleConnsPerHost) - require.Equal(t, 30*time.Second, c.Timeout) require.Equal(t, 90*time.Second, tr.IdleConnTimeout) require.False(t, tr.DisableKeepAlives) } -func TestNewHttpClient_WithMaxIdleConns(t *testing.T) { - c := newHttpClient(WithMaxIdleConns(2048)) - tr := transport(t, c) +func TestNewHttpTransport_WithMaxIdleConns(t *testing.T) { + tr := newHttpTransport(WithMaxIdleConns(2048)) require.Equal(t, 2048, tr.MaxIdleConns) require.Equal(t, 50, tr.MaxIdleConnsPerHost, "per-host default preserved") } -func TestNewHttpClient_WithMaxIdleConnsPerHost(t *testing.T) { - c := newHttpClient(WithMaxIdleConnsPerHost(1024)) - tr := transport(t, c) +func TestNewHttpTransport_WithMaxIdleConnsPerHost(t *testing.T) { + tr := newHttpTransport(WithMaxIdleConnsPerHost(1024)) require.Equal(t, 1024, tr.MaxIdleConnsPerHost) require.Equal(t, 500, tr.MaxIdleConns, "global default preserved") } -func TestNewHttpClient_MultipleOptions(t *testing.T) { - c := newHttpClient( +func TestNewHttpTransport_MultipleOptions(t *testing.T) { + tr := newHttpTransport( WithMaxIdleConns(4096), WithMaxIdleConnsPerHost(1024), ) - tr := transport(t, c) require.Equal(t, 4096, tr.MaxIdleConns) require.Equal(t, 1024, tr.MaxIdleConnsPerHost) } + +// Smoke-test the full newHttpClient: Timeout is set on the outer client, and +// the Transport exists (opaque otelhttp wrapper). Inner-transport field +// assertions belong in the TestNewHttpTransport_* tests above. 
+func TestNewHttpClient_Smoke(t *testing.T) { + c := newHttpClient() + require.Equal(t, 30*time.Second, c.Timeout) + require.NotNil(t, c.Transport, "Transport must be set") + _, isBareTransport := c.Transport.(*http.Transport) + require.False(t, isBareTransport, "Transport should be wrapped by otelhttp, not bare *http.Transport") +} diff --git a/stats/block_collector.go b/stats/block_collector.go index 9feaa56..03ab078 100644 --- a/stats/block_collector.go +++ b/stats/block_collector.go @@ -94,10 +94,10 @@ func (bc *BlockCollector) processNewBlock(header *types.Header) { now := time.Now() blockNum := header.Number.Uint64() gasUsed := header.GasUsed - metrics.gasUsed.Record(context.Background(), int64(gasUsed), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) + statsMetrics().gasUsed.Record(context.Background(), int64(gasUsed), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) // Update max block number if blockNum > stats.maxBlockNum { - metrics.blockNumber.Record(context.Background(), int64(blockNum), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) + statsMetrics().blockNumber.Record(context.Background(), int64(blockNum), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) stats.maxBlockNum = blockNum } @@ -108,7 +108,7 @@ func (bc *BlockCollector) processNewBlock(header *types.Header) { // Calculate time between blocks if !stats.lastBlockTime.IsZero() { timeBetween := now.Sub(stats.lastBlockTime) - metrics.blockTime.Record(context.Background(), timeBetween.Seconds(), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) + statsMetrics().blockTime.Record(context.Background(), timeBetween.Seconds(), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) stats.allBlockTimes = append(stats.allBlockTimes, timeBetween) stats.windowBlockTimes = append(stats.windowBlockTimes, timeBetween) } diff --git a/stats/metrics.go b/stats/metrics.go index 931231d..16dc07e 100644 --- 
a/stats/metrics.go +++ b/stats/metrics.go @@ -1,35 +1,62 @@ package stats import ( - "go.opentelemetry.io/otel" + "sync" + "go.opentelemetry.io/otel/metric" -) -var ( - meter = otel.Meter("seiload/stats") - - metrics = struct { - gasUsed metric.Int64Histogram - blockNumber metric.Int64Gauge - blockTime metric.Float64Histogram - }{ - gasUsed: must(meter.Int64Histogram( - "gas_used", - metric.WithDescription("Gas used in transactions"), - metric.WithUnit("{gas}"), - metric.WithExplicitBucketBoundaries(1, 1000, 10_000, 50_000, 100_000, 200_000, 300_000, 400_000, 500_000, 600_000, 700_000, 800_000, 1_000_000))), - blockNumber: must(meter.Int64Gauge( - "block_number", - metric.WithDescription("Block number in the chain"), - metric.WithUnit("{height}"))), - blockTime: must(meter.Float64Histogram( - "block_time", - metric.WithDescription("Time taken to produce a block"), - metric.WithUnit("s"), - metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 5.0, 10.0, 20.0))), - } + "github.com/sei-protocol/sei-load/observability" ) +type statsMetricsBundle struct { + gasUsed metric.Int64Histogram + blockNumber metric.Int64Gauge + blockTime metric.Float64Histogram + + // --- run-summary instruments (per sei-load-observability design) --- + // These are gauges so their single-emission-at-run-end semantics produce + // exactly 1 series per run via the Resource join — the right shape for + // comparative benchmark dashboards across commits. 
+ runTPSFinal metric.Float64Gauge + runDurationSeconds metric.Float64Gauge + runTxsAcceptedTotal metric.Int64Gauge +} + +var statsMetrics = sync.OnceValue(func() *statsMetricsBundle { + m := observability.Meter("github.com/sei-protocol/sei-load/stats") + b := &statsMetricsBundle{} + + b.gasUsed = must(m.Int64Histogram( + "gas_used", + metric.WithDescription("Gas used in transactions"), + metric.WithUnit("{gas}"), + metric.WithExplicitBucketBoundaries(1, 1000, 10_000, 50_000, 100_000, 200_000, 300_000, 400_000, 500_000, 600_000, 700_000, 800_000, 1_000_000))) + b.blockNumber = must(m.Int64Gauge( + "block_number", + metric.WithDescription("Block number in the chain"), + metric.WithUnit("{height}"))) + b.blockTime = must(m.Float64Histogram( + "block_time", + metric.WithDescription("Time taken to produce a block"), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 5.0, 10.0, 20.0))) + + b.runTPSFinal = must(m.Float64Gauge( + "run_tps_final", + metric.WithDescription("Final observed TPS for this run (emitted once at run end)"), + metric.WithUnit("{transactions}/s"))) + b.runDurationSeconds = must(m.Float64Gauge( + "run_duration_seconds", + metric.WithDescription("Wall-clock duration of this run (emitted once at run end)"), + metric.WithUnit("s"))) + b.runTxsAcceptedTotal = must(m.Int64Gauge( + "run_txs_accepted_total", + metric.WithDescription("Total transactions accepted by endpoints over this run (emitted once at run end)"), + metric.WithUnit("{transactions}"))) + + return b +}) + // must panics if err is non-nil, otherwise returns v. 
func must[V any](v V, err error) V { if err != nil { diff --git a/stats/run_summary.go b/stats/run_summary.go new file mode 100644 index 0000000..208a440 --- /dev/null +++ b/stats/run_summary.go @@ -0,0 +1,28 @@ +package stats + +import ( + "context" + "time" +) + +// EmitRunSummary records the three run-summary gauges (duration, final TPS, +// accepted-txs total) for this Collector's run. Call once at shutdown, after +// the collector has stopped accepting new samples. +// +// The run-summary metrics carry no per-sample attributes — they rely on the +// Resource attributes installed by observability.Setup (run_id, chain_id, +// commit_id_short, workload) to identify the run. This produces exactly +// one series per metric per run after the Resource join in Prometheus +// (target_info), the shape needed for cross-run benchmark dashboards. +func (c *Collector) EmitRunSummary(ctx context.Context) { + c.mu.RLock() + duration := time.Since(c.startTime) + totalTxs := c.totalTxs + finalTPS := c.overallTpsWindow.maxTPS + c.mu.RUnlock() + + b := statsMetrics() + b.runDurationSeconds.Record(ctx, duration.Seconds()) + b.runTPSFinal.Record(ctx, finalTPS) + b.runTxsAcceptedTotal.Record(ctx, int64(totalTxs)) +} From 7ef439a3bf8d8296980a55a26a22b9b17e16340e Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 13:24:45 -0700 Subject: [PATCH 04/13] observability: enable OpenMetrics + fix shutdown order + trim cardinality Addresses four findings from the tri-specialist review (Kubernetes, Platform, OpenTelemetry) on this PR: 1. Enable OpenMetrics on the /metrics handler. promhttp.Handler()'s default opts leave EnableOpenMetrics off, which silently strips exemplars from responses even when the OTel SDK computed them and the Prometheus server is configured for exemplar-storage. Switch to promhttp.HandlerFor(..., HandlerOpts{EnableOpenMetrics: true}). Pairs with platform's enableFeatures: [exemplar-storage] for the end-to-end exemplar path to actually deliver trace IDs. 2. 
Fix shutdown ordering. Provider shutdown is the only path that flushes the OTLP PeriodicReader and BatchSpanProcessor batches; shutting the exporters down first made that flush fail silently and lost the final batch. New order: MeterProvider and TracerProvider shut down first, cascading their flushes into the still-live exporters. The Prometheus pull-reader is immune but we keep the uniform ordering invariant. 3. Drop worker_id from send_latency / receipt_latency histogram labels. Per-worker fan-out multiplied series by ~50 at realistic autobake scale, bumping against the 50k cardinality guardrail before adding any new dimensions. Worker-level investigation is preserved via the span attributes (worker_id flows through the trace, accessible from any exemplar pivot). 4. Fix defer cancel() inside the receipt loop. Each transaction's context deferred its cancel until watchTransactions returned, accumulating unbounded deferred state across a long run. Cancel explicitly at end-of-iteration. All tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- main.go | 19 ++++++++++++++++--- observability/setup.go | 21 +++++++++++++-------- sender/worker.go | 23 +++++++++++++++-------- 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/main.go b/main.go index f1df1ca..c79d2d7 100644 --- a/main.go +++ b/main.go @@ -13,6 +13,7 @@ import ( "time" "github.com/ethereum/go-ethereum/ethclient" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/spf13/cobra" "go.opentelemetry.io/otel" @@ -165,11 +166,23 @@ func runLoadTest(ctx context.Context, cmd *cobra.Command, args []string) error { }() // Serve /metrics for the Prometheus PodMonitor scrape. The exporter - // registered by observability.Setup is attached to the default - // prometheus.DefaultGatherer via the SDK's prometheus reader. 
+ // registered by observability.Setup attaches to prometheus.DefaultGatherer + // via the SDK's prometheus reader. + // + // EnableOpenMetrics=true is load-bearing: promhttp.Handler()'s default + // opts leave it off, which makes the handler never respond with the + // OpenMetrics content type regardless of the scraper's Accept header. + // The OTel SDK still computes exemplars, but the handler then silently + // strips them. With OpenMetrics negotiation on, Prometheus and Alloy + // receive exemplars alongside each histogram sample. Pair with + // enableFeatures: [exemplar-storage] on the Prometheus server (see + // clusters/harbor/monitoring/prometheus-operator.yaml). go func() { mux := http.NewServeMux() - mux.Handle("/metrics", promhttp.Handler()) + mux.Handle("/metrics", promhttp.HandlerFor( + prometheus.DefaultGatherer, + promhttp.HandlerOpts{EnableOpenMetrics: true}, + )) if err := http.ListenAndServe(listenAddr, mux); err != nil { log.Printf("failed to serve metrics: %v", err) } diff --git a/observability/setup.go b/observability/setup.go index 468167c..4cbd625 100644 --- a/observability/setup.go +++ b/observability/setup.go @@ -185,10 +185,6 @@ func Setup(ctx context.Context, cfg Config) (shutdown func(context.Context) erro sdktrace.WithResource(res), } - shutdowns := []func(context.Context) error{ - promExporter.Shutdown, - } - if cfg.OTLPEndpoint != "" { metricExp, mErr := otlpmetricgrpc.New(ctx, otlpmetricgrpc.WithEndpoint(stripScheme(cfg.OTLPEndpoint)), @@ -198,7 +194,6 @@ func Setup(ctx context.Context, cfg Config) (shutdown func(context.Context) erro return nil, fmt.Errorf("otlp metric exporter: %w", mErr) } meterOpts = append(meterOpts, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(metricExp))) - shutdowns = append(shutdowns, metricExp.Shutdown) traceExp, tErr := otlptracegrpc.New(ctx, otlptracegrpc.WithEndpoint(stripScheme(cfg.OTLPEndpoint)), @@ -208,16 +203,26 @@ func Setup(ctx context.Context, cfg Config) (shutdown func(context.Context) erro 
return nil, fmt.Errorf("otlp trace exporter: %w", tErr) } tracerOpts = append(tracerOpts, sdktrace.WithBatcher(traceExp)) - shutdowns = append(shutdowns, traceExp.Shutdown) } mp := sdkmetric.NewMeterProvider(meterOpts...) otel.SetMeterProvider(mp) - shutdowns = append(shutdowns, mp.Shutdown) tp := sdktrace.NewTracerProvider(tracerOpts...) otel.SetTracerProvider(tp) - shutdowns = append(shutdowns, tp.Shutdown) + + // Shutdown order is load-bearing. Providers must shut down FIRST — they + // cascade a final flush into their readers and exporters, which is the + // only chance to push the last batch of metrics/spans (OTLP periodic + // reader + batch span processor both buffer in-memory). If the exporter + // is shut down ahead of its provider, the provider's flush fails + // silently and the tail data is lost. The Prometheus pull-reader is + // immune (no buffer) but we keep the same rule for both paths so the + // ordering invariant is uniform. + shutdowns := []func(context.Context) error{ + mp.Shutdown, + tp.Shutdown, + } // Composite propagator: traceparent for trace context, baggage for // arbitrary cross-process K/V. Without this, otelhttp will create spans diff --git a/sender/worker.go b/sender/worker.go index 8e237f4..5820b10 100644 --- a/sender/worker.go +++ b/sender/worker.go @@ -172,11 +172,14 @@ func (w *Worker) watchTransactions(ctx context.Context) error { if err != nil { return err } - ctx, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - if err := w.waitForReceipt(ctx, eth, tx); err != nil { + // Per-iteration timeout; cancel explicitly at end-of-iteration to + // avoid accumulating deferred cancels (one per tx × run duration + // grows unboundedly under sustained load). 
+ waitCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + if err := w.waitForReceipt(waitCtx, eth, tx); err != nil { log.Printf("❌ %v", err) } + cancel() } return ctx.Err() } @@ -193,13 +196,16 @@ func (w *Worker) waitForReceipt(ctx context.Context, eth *ethclient.Client, tx * span.RecordError(_err) } span.End() - // Recording inside the span context makes _err's recorded sample - // eligible for an exemplar linking to this trace. + // Recording inside the span context makes this sample eligible for + // an exemplar linking to the trace. worker_id is intentionally NOT + // a histogram label here — per-worker fan-out would multiply series + // count by ~50 and breach the cardinality guardrail. Worker-level + // investigation is preserved via the span attribute above (queried + // via exemplar pivot), not per-sample Prometheus labels. senderMetrics().receiptLatency.Record(ctx, time.Since(start).Seconds(), metric.WithAttributes( attribute.String("scenario", tx.Scenario.Name), attribute.String("endpoint", w.endpoint), - attribute.Int("worker_id", w.id), attribute.String("chain_id", w.seiChainID), statusAttrFromError(_err)), ) @@ -272,12 +278,13 @@ func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *t } span.End() // Recording inside the span context makes samples eligible for - // exemplars linking the histogram bucket to this trace. + // exemplars linking the histogram bucket to this trace. worker_id + // deliberately omitted — see receiptLatency above for the + // cardinality rationale; worker-level attribution lives on the span. 
senderMetrics().sendLatency.Record(ctx, time.Since(start).Seconds(), metric.WithAttributes( attribute.String("scenario", tx.Scenario.Name), attribute.String("endpoint", w.endpoint), - attribute.Int("worker_id", w.id), attribute.String("chain_id", w.seiChainID), statusAttrFromError(_err)), ) From c4ba0c6a9ac13204c60e111b5b8f0f38be6f85b Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 13:39:07 -0700 Subject: [PATCH 05/13] observability: drop bespoke OnceValue wrappers for idiomatic OTel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sync.OnceValue bundles and observability.Meter/Tracer indirections were defensive scaffolding — added on the assumption that package-level var meter = otel.Meter(...) would capture the NoOp provider forever. Not true for OTel Go v1.19+: the internal/global package uses a delegation mechanism where meters and instruments created before SetMeterProvider are transparently rebound when the real provider is installed (internal/global/meter.go setDelegate). Current SDK is v1.43, well past that fix. Revert to the standard OTel Go idiom: package-level var meter + instruments as package vars, acquired with otel.Meter / otel.Tracer directly. Matches every upstream OTel Go tutorial and example. - Delete observability/meter.go (wrappers only). - sender/metrics.go: var meter = otel.Meter(...); instruments as package vars; drop the metricsBundle + senderMetrics OnceValue. - sender/worker.go: var tracer = otel.Tracer(...); drop senderTracer OnceValue. - stats/metrics.go: same simplification. - stats/block_collector.go: rename local gasUsed -> gas to avoid shadowing the package-level metric. - stats/run_summary.go: direct metric refs. observability.Setup remains the single-responsibility bootstrap — it still owns Resource construction, propagator install, exporter configuration, and shutdown orchestration. That's where the value actually is.
Co-Authored-By: Claude Opus 4.7 (1M context) --- observability/meter.go | 31 ------------------ sender/metrics.go | 70 ++++++++++++++++------------------------ sender/worker.go | 25 +++++++------- stats/block_collector.go | 12 +++---- stats/metrics.go | 49 ++++++++++++---------------- stats/run_summary.go | 7 ++-- 6 files changed, 67 insertions(+), 127 deletions(-) delete mode 100644 observability/meter.go diff --git a/observability/meter.go b/observability/meter.go deleted file mode 100644 index aa6da68..0000000 --- a/observability/meter.go +++ /dev/null @@ -1,31 +0,0 @@ -package observability - -import ( - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/metric" - "go.opentelemetry.io/otel/trace" -) - -// Meter returns a Meter for the given instrumentation name. The contract is -// identical to otel.Meter; the indirection exists so call sites can wrap with -// sync.OnceValue at package scope for lazy acquisition without capturing the -// NoOp provider at init time: -// -// var senderMeter = sync.OnceValue(func() metric.Meter { -// return observability.Meter("github.com/sei-protocol/sei-load/sender") -// }) -// -// // inside functions: senderMeter().Float64Histogram(...) -// -// Calling observability.Meter (or otel.Meter) at package init is a bug — the -// SDK setup hasn't run yet. The sync.OnceValue wrapper defers acquisition -// until the first call site, by which time main has invoked Setup. -func Meter(name string) metric.Meter { - return otel.Meter(name) -} - -// Tracer returns a Tracer for the given instrumentation name. Same lazy- -// acquisition contract as Meter above. 
-func Tracer(name string) trace.Tracer { - return otel.Tracer(name) -} diff --git a/sender/metrics.go b/sender/metrics.go index bc5a72b..07990aa 100644 --- a/sender/metrics.go +++ b/sender/metrics.go @@ -4,51 +4,32 @@ import ( "context" "sync" + "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" - - "github.com/sei-protocol/sei-load/observability" ) -// metricsBundle holds every instrument owned by this package. Acquired lazily -// on first access (see senderMetrics) so package init order can't capture the -// NoOp meter before observability.Setup has installed the real MeterProvider. -type metricsBundle struct { - // --- existing instruments --- - sendLatency metric.Float64Histogram - receiptLatency metric.Float64Histogram - workerQueueLength metric.Int64ObservableGauge - - // --- new instruments (per sei-load-observability design) --- - // tpsAchieved is a gauge updated periodically by the stats package; it - // reflects the sender's most recent TPS sample per endpoint/scenario. - tpsAchieved metric.Float64ObservableGauge - // httpErrors counts failed HTTP request attempts by status code. Omitted - // for non-HTTP errors (e.g., DNS); those land in txsRejected. - httpErrors metric.Int64Counter - // txsAccepted counts successfully-submitted transactions. - txsAccepted metric.Int64Counter - // txsRejected counts transactions rejected by the target (or the local - // client). Reason attribute narrows the failure mode. - txsRejected metric.Int64Counter -} +// Package-level instrumentation. The OTel Go global MeterProvider uses a +// delegation mechanism (internal/global) — meters and instruments created +// before observability.Setup runs are rebound to the real provider when it's +// installed, so these package-var declarations are safe. 
-var senderMetrics = sync.OnceValue(func() *metricsBundle { - m := observability.Meter("github.com/sei-protocol/sei-load/sender") - b := &metricsBundle{} +var meter = otel.Meter("github.com/sei-protocol/sei-load/sender") - latencyBoundaries := []float64{0.1, 0.2, 0.3, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0} - b.sendLatency = must(m.Float64Histogram( +var ( + sendLatency = must(meter.Float64Histogram( "send_latency", metric.WithDescription("Latency of sending transactions in seconds"), metric.WithUnit("s"), - metric.WithExplicitBucketBoundaries(latencyBoundaries...))) - b.receiptLatency = must(m.Float64Histogram( + metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0))) + + receiptLatency = must(meter.Float64Histogram( "receipt_latency", metric.WithDescription("Latency of sending transactions in seconds"), metric.WithUnit("s"), - metric.WithExplicitBucketBoundaries(latencyBoundaries...))) - b.workerQueueLength = must(m.Int64ObservableGauge( + metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0))) + + workerQueueLength = must(meter.Int64ObservableGauge( "worker_queue_length", metric.WithDescription("Length of the worker's queue"), metric.WithUnit("{count}"), @@ -65,26 +46,30 @@ var senderMetrics = sync.OnceValue(func() *metricsBundle { return nil }))) - b.tpsAchieved = must(m.Float64ObservableGauge( + // tpsAchieved reads samples from tpsObserverRegistry, populated via + // RecordTPSSample. No producer is wired today; the gauge is registered + // so future producers slot in without adding to the public API. 
+ tpsAchieved = must(meter.Float64ObservableGauge( "tps_achieved", metric.WithDescription("Most recent TPS sample observed by the sender, per endpoint/scenario"), metric.WithUnit("{transactions}/s"), metric.WithFloat64Callback(observeTPS))) - b.httpErrors = must(m.Int64Counter( + + httpErrors = must(meter.Int64Counter( "http_errors", metric.WithDescription("HTTP error responses from the target endpoint, by status code"), metric.WithUnit("{errors}"))) - b.txsAccepted = must(m.Int64Counter( + + txsAccepted = must(meter.Int64Counter( "txs_accepted", metric.WithDescription("Transactions successfully submitted to an endpoint"), metric.WithUnit("{transactions}"))) - b.txsRejected = must(m.Int64Counter( + + txsRejected = must(meter.Int64Counter( "txs_rejected", metric.WithDescription("Transactions rejected by the target or local client, by reason"), metric.WithUnit("{transactions}"))) - - return b -}) +) // meteredChainWorkers tracks Workers for the queue-length observable. Kept // as a package-level value because the observable callback needs a stable @@ -115,9 +100,8 @@ func meterWorkerQueueLength(worker *Worker) { } // tpsObserverRegistry holds a per-(endpoint,chain_id,scenario) sample that -// Worker.processTransactions updates; the observable gauge reads it on each -// scrape. Writes are under write-lock; reads inside the callback are under -// read-lock. +// callers update via RecordTPSSample; the observable gauge reads it on each +// scrape. 
var tpsObserverRegistry = struct { lock sync.RWMutex samples map[tpsSampleKey]float64 diff --git a/sender/worker.go b/sender/worker.go index 5820b10..4f82363 100644 --- a/sender/worker.go +++ b/sender/worker.go @@ -9,27 +9,24 @@ import ( "log" "net" "net/http" - "sync" "time" "github.com/ethereum/go-ethereum" "github.com/ethereum/go-ethereum/ethclient" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" + "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/trace" "golang.org/x/time/rate" - "github.com/sei-protocol/sei-load/observability" "github.com/sei-protocol/sei-load/stats" "github.com/sei-protocol/sei-load/types" "github.com/sei-protocol/sei-load/utils" "github.com/sei-protocol/sei-load/utils/service" ) -var senderTracer = sync.OnceValue(func() trace.Tracer { - return observability.Tracer("github.com/sei-protocol/sei-load/sender") -}) +var tracer = otel.Tracer("github.com/sei-protocol/sei-load/sender") // Worker handles sending transactions to a specific endpoint type Worker struct { @@ -155,7 +152,7 @@ func (w *Worker) watchTransactions(ctx context.Context) error { if w.dryRun || !w.trackReceipts { return nil } - _, dialSpan := senderTracer().Start(ctx, "sender.dial_endpoint", trace.WithAttributes( + _, dialSpan := tracer.Start(ctx, "sender.dial_endpoint", trace.WithAttributes( attribute.String("seiload.endpoint", w.endpoint), attribute.String("seiload.chain_id", w.seiChainID), attribute.Int("seiload.worker_id", w.id), @@ -185,7 +182,7 @@ func (w *Worker) watchTransactions(ctx context.Context) error { } func (w *Worker) waitForReceipt(ctx context.Context, eth *ethclient.Client, tx *types.LoadTx) (_err error) { - ctx, span := senderTracer().Start(ctx, "sender.check_receipt", trace.WithAttributes( + ctx, span := tracer.Start(ctx, "sender.check_receipt", trace.WithAttributes( attribute.String("seiload.scenario", tx.Scenario.Name), attribute.String("seiload.endpoint", w.endpoint), 
attribute.Int("seiload.worker_id", w.id), @@ -202,7 +199,7 @@ func (w *Worker) waitForReceipt(ctx context.Context, eth *ethclient.Client, tx * // count by ~50 and breach the cardinality guardrail. Worker-level // investigation is preserved via the span attribute above (queried // via exemplar pivot), not per-sample Prometheus labels. - senderMetrics().receiptLatency.Record(ctx, time.Since(start).Seconds(), + receiptLatency.Record(ctx, time.Since(start).Seconds(), metric.WithAttributes( attribute.String("scenario", tx.Scenario.Name), attribute.String("endpoint", w.endpoint), @@ -266,7 +263,7 @@ func (w *Worker) processTransactions(ctx context.Context, client *http.Client) e // sendTransaction sends a single transaction to the endpoint func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *types.LoadTx) (_err error) { - ctx, span := senderTracer().Start(ctx, "sender.send_tx", trace.WithAttributes( + ctx, span := tracer.Start(ctx, "sender.send_tx", trace.WithAttributes( attribute.String("seiload.scenario", tx.Scenario.Name), attribute.String("seiload.endpoint", w.endpoint), attribute.Int("seiload.worker_id", w.id), @@ -281,7 +278,7 @@ func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *t // exemplars linking the histogram bucket to this trace. worker_id // deliberately omitted — see receiptLatency above for the // cardinality rationale; worker-level attribution lives on the span. 
- senderMetrics().sendLatency.Record(ctx, time.Since(start).Seconds(), + sendLatency.Record(ctx, time.Since(start).Seconds(), metric.WithAttributes( attribute.String("scenario", tx.Scenario.Name), attribute.String("endpoint", w.endpoint), @@ -308,7 +305,7 @@ func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *t // Send the request resp, err := client.Do(req) if err != nil { - senderMetrics().txsRejected.Add(ctx, 1, metric.WithAttributes( + txsRejected.Add(ctx, 1, metric.WithAttributes( attribute.String("endpoint", w.endpoint), attribute.String("scenario", tx.Scenario.Name), attribute.String("reason", "transport"), @@ -331,11 +328,11 @@ func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *t // Check response status if resp.StatusCode != http.StatusOK { - senderMetrics().httpErrors.Add(ctx, 1, metric.WithAttributes( + httpErrors.Add(ctx, 1, metric.WithAttributes( attribute.Int("status_code", resp.StatusCode), attribute.String("endpoint", w.endpoint), )) - senderMetrics().txsRejected.Add(ctx, 1, metric.WithAttributes( + txsRejected.Add(ctx, 1, metric.WithAttributes( attribute.String("endpoint", w.endpoint), attribute.String("scenario", tx.Scenario.Name), attribute.String("reason", "http_status"), @@ -343,7 +340,7 @@ func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *t return fmt.Errorf("Worker %d: HTTP error %d for transaction to %s", w.id, resp.StatusCode, w.endpoint) } - senderMetrics().txsAccepted.Add(ctx, 1, metric.WithAttributes( + txsAccepted.Add(ctx, 1, metric.WithAttributes( attribute.String("endpoint", w.endpoint), attribute.String("scenario", tx.Scenario.Name), )) diff --git a/stats/block_collector.go b/stats/block_collector.go index 03ab078..6a1794c 100644 --- a/stats/block_collector.go +++ b/stats/block_collector.go @@ -93,22 +93,22 @@ func (bc *BlockCollector) processNewBlock(header *types.Header) { for stats := range bc.stats.Lock() { now := time.Now() blockNum := 
header.Number.Uint64() - gasUsed := header.GasUsed - statsMetrics().gasUsed.Record(context.Background(), int64(gasUsed), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) + gas := header.GasUsed + gasUsed.Record(context.Background(), int64(gas), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) // Update max block number if blockNum > stats.maxBlockNum { - statsMetrics().blockNumber.Record(context.Background(), int64(blockNum), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) + blockNumber.Record(context.Background(), int64(blockNum), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) stats.maxBlockNum = blockNum } // Track gas used - stats.allGasUsed = append(stats.allGasUsed, gasUsed) - stats.windowGasUsed = append(stats.windowGasUsed, gasUsed) + stats.allGasUsed = append(stats.allGasUsed, gas) + stats.windowGasUsed = append(stats.windowGasUsed, gas) // Calculate time between blocks if !stats.lastBlockTime.IsZero() { timeBetween := now.Sub(stats.lastBlockTime) - statsMetrics().blockTime.Record(context.Background(), timeBetween.Seconds(), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) + blockTime.Record(context.Background(), timeBetween.Seconds(), metric.WithAttributes(attribute.String("chain_id", bc.seiChainID))) stats.allBlockTimes = append(stats.allBlockTimes, timeBetween) stats.windowBlockTimes = append(stats.windowBlockTimes, timeBetween) } diff --git a/stats/metrics.go b/stats/metrics.go index 16dc07e..3f193cb 100644 --- a/stats/metrics.go +++ b/stats/metrics.go @@ -1,61 +1,52 @@ package stats import ( - "sync" - + "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/metric" - - "github.com/sei-protocol/sei-load/observability" ) -type statsMetricsBundle struct { - gasUsed metric.Int64Histogram - blockNumber metric.Int64Gauge - blockTime metric.Float64Histogram - - // --- run-summary instruments (per sei-load-observability design) --- - // These are gauges so their 
single-emission-at-run-end semantics produce - // exactly 1 series per run via the Resource join — the right shape for - // comparative benchmark dashboards across commits. - runTPSFinal metric.Float64Gauge - runDurationSeconds metric.Float64Gauge - runTxsAcceptedTotal metric.Int64Gauge -} +// Package-level instrumentation. See note in sender/metrics.go — the OTel Go +// global delegation mechanism makes var declarations at package init safe. -var statsMetrics = sync.OnceValue(func() *statsMetricsBundle { - m := observability.Meter("github.com/sei-protocol/sei-load/stats") - b := &statsMetricsBundle{} +var meter = otel.Meter("github.com/sei-protocol/sei-load/stats") - b.gasUsed = must(m.Int64Histogram( +var ( + gasUsed = must(meter.Int64Histogram( "gas_used", metric.WithDescription("Gas used in transactions"), metric.WithUnit("{gas}"), metric.WithExplicitBucketBoundaries(1, 1000, 10_000, 50_000, 100_000, 200_000, 300_000, 400_000, 500_000, 600_000, 700_000, 800_000, 1_000_000))) - b.blockNumber = must(m.Int64Gauge( + + blockNumber = must(meter.Int64Gauge( "block_number", metric.WithDescription("Block number in the chain"), metric.WithUnit("{height}"))) - b.blockTime = must(m.Float64Histogram( + + blockTime = must(meter.Float64Histogram( "block_time", metric.WithDescription("Time taken to produce a block"), metric.WithUnit("s"), metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 5.0, 10.0, 20.0))) - b.runTPSFinal = must(m.Float64Gauge( + // --- run-summary instruments (per sei-load-observability design) --- + // Gauges, not counters: single emission at run end produces exactly 1 + // series per metric per run after the Resource join — the right shape + // for cross-run comparison dashboards. 
+ runTPSFinal = must(meter.Float64Gauge( "run_tps_final", metric.WithDescription("Final observed TPS for this run (emitted once at run end)"), metric.WithUnit("{transactions}/s"))) - b.runDurationSeconds = must(m.Float64Gauge( + + runDurationSeconds = must(meter.Float64Gauge( "run_duration_seconds", metric.WithDescription("Wall-clock duration of this run (emitted once at run end)"), metric.WithUnit("s"))) - b.runTxsAcceptedTotal = must(m.Int64Gauge( + + runTxsAcceptedTotal = must(meter.Int64Gauge( "run_txs_accepted_total", metric.WithDescription("Total transactions accepted by endpoints over this run (emitted once at run end)"), metric.WithUnit("{transactions}"))) - - return b -}) +) // must panics if err is non-nil, otherwise returns v. func must[V any](v V, err error) V { diff --git a/stats/run_summary.go b/stats/run_summary.go index 208a440..1a49904 100644 --- a/stats/run_summary.go +++ b/stats/run_summary.go @@ -21,8 +21,7 @@ func (c *Collector) EmitRunSummary(ctx context.Context) { finalTPS := c.overallTpsWindow.maxTPS c.mu.RUnlock() - b := statsMetrics() - b.runDurationSeconds.Record(ctx, duration.Seconds()) - b.runTPSFinal.Record(ctx, finalTPS) - b.runTxsAcceptedTotal.Record(ctx, int64(totalTxs)) + runDurationSeconds.Record(ctx, duration.Seconds()) + runTPSFinal.Record(ctx, finalTPS) + runTxsAcceptedTotal.Record(ctx, int64(totalTxs)) } From 057330e96ebec6e3c51138df532dc1460046a7cf Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 13:47:54 -0700 Subject: [PATCH 06/13] observability: bump otelhttp v0.54.0 -> v0.68.0 Picks up the current stable HTTP semconv attribute names (http.request.method, server.address, etc.) on the auto-generated http.client.* metrics and client spans emitted by otelhttp.NewTransport. Our bespoke seiload_* metrics + span attributes don't depend on these names so no query/dashboard impact; cosmetic alignment with current OTel HTTP semconv. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 239d827..aa974a3 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/spf13/cobra v1.9.1 github.com/spf13/viper v1.20.1 github.com/stretchr/testify v1.11.1 - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 go.opentelemetry.io/otel v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 diff --git a/go.sum b/go.sum index 3b9c83b..528de49 100644 --- a/go.sum +++ b/go.sum @@ -222,8 +222,8 @@ github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGC github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 h1:TT4fX+nBOA/+LUkobKGW1ydGcn+G3vRw9+g5HwCphpk= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0/go.mod h1:L7UH0GbB0p47T4Rri3uHjbpCFYrVrwc1I25QhNPiGK8= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 h1:CqXxU8VOmDefoh0+ztfGaymYbhdB/tT3zs79QaZTNGY= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0/go.mod h1:BuhAPThV8PBHBvg8ZzZ/Ok3idOdhWIodywz2xEcRbJo= go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 h1:8UQVDcZxOJLtX6gxtDt3vY2WTgvZqMQRzjsqiIHQdkc= From ef14cd467e36b76821e544e127c5462e0bfe0436 Mon Sep 17 00:00:00 2001 From: 
bdchatham Date: Tue, 21 Apr 2026 14:00:01 -0700 Subject: [PATCH 07/13] observability: trim comments, move invariants to README Per review feedback on comment density. Principle applied: keep only the non-obvious WHY, delete anything that restates the code. Multi-line rationale moves to observability/README.md where it belongs (invariants on run-scope cardinality, shutdown order, exemplar requirements, cluster-of-seiload extensibility). Also drops the stale Setup docstring paragraph that still referenced the long-deleted observability.Meter/Tracer accessors. No behavior change. Net ~80 lines removed from code comments, 33 lines added to README. Co-Authored-By: Claude Opus 4.7 (1M context) --- main.go | 23 +------ observability/README.md | 24 +++++++ observability/setup.go | 121 +++++++----------------------------- observability/setup_test.go | 8 --- sender/metrics.go | 19 +----- sender/worker.go | 35 ++++------- sender/worker_test.go | 8 --- stats/metrics.go | 9 +-- stats/run_summary.go | 10 +-- 9 files changed, 62 insertions(+), 195 deletions(-) create mode 100644 observability/README.md diff --git a/main.go b/main.go index c79d2d7..2923de2 100644 --- a/main.go +++ b/main.go @@ -147,9 +147,6 @@ func runLoadTest(ctx context.Context, cmd *cobra.Command, args []string) error { listenAddr := cmd.Flag("metricsListenAddr").Value.String() log.Printf("serving metrics at %s/metrics", listenAddr) - // Install OTel providers, Resource (from SEILOAD_* env), Prometheus reader, - // optional OTLP exporter (gated on OTEL_EXPORTER_OTLP_ENDPOINT), and - // composite W3C propagator. See docs/designs/sei-load-observability.md. obsShutdown, err := observability.Setup(ctx, observability.Config{ RunScope: observability.RunScopeFromEnv(), OTLPEndpoint: os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT"), @@ -165,18 +162,8 @@ func runLoadTest(ctx context.Context, cmd *cobra.Command, args []string) error { } }() - // Serve /metrics for the Prometheus PodMonitor scrape. 
The exporter - // registered by observability.Setup attaches to prometheus.DefaultGatherer - // via the SDK's prometheus reader. - // - // EnableOpenMetrics=true is load-bearing: promhttp.Handler()'s default - // opts leave it off, which makes the handler never respond with the - // OpenMetrics content type regardless of the scraper's Accept header. - // The OTel SDK still computes exemplars, but the handler then silently - // strips them. With OpenMetrics negotiation on, Prometheus and Alloy - // receive exemplars alongside each histogram sample. Pair with - // enableFeatures: [exemplar-storage] on the Prometheus server (see - // clusters/harbor/monitoring/prometheus-operator.yaml). + // EnableOpenMetrics is load-bearing: the default promhttp.Handler() strips + // exemplars regardless of the scraper's Accept header. go func() { mux := http.NewServeMux() mux.Handle("/metrics", promhttp.HandlerFor( @@ -188,9 +175,6 @@ func runLoadTest(ctx context.Context, cmd *cobra.Command, args []string) error { } }() - // Top-level run span. Wraps the entire load-test invocation so per-request - // spans in the sender package inherit this parent and Grafana's trace view - // shows the full run as a single timeline. ctx, runSpan := otel.Tracer("github.com/sei-protocol/sei-load").Start(ctx, "seiload.run") defer runSpan.End() @@ -358,9 +342,6 @@ func runLoadTest(ctx context.Context, cmd *cobra.Command, args []string) error { if settings.RampUp && ramper != nil { ramper.LogFinalStats() } - // Emit run-summary gauges (one series per metric per run via Resource - // join) so benchmark dashboards and the AutobakeRunFailed alert have a - // durable record of this run even if the next scrape misses final state. 
collector.EmitRunSummary(ctx) log.Printf("👋 Shutdown complete") return err diff --git a/observability/README.md b/observability/README.md new file mode 100644 index 0000000..7c974c2 --- /dev/null +++ b/observability/README.md @@ -0,0 +1,24 @@ +# observability + +OTel SDK bootstrap for sei-load. `Setup` installs MeterProvider + TracerProvider, Resource, Prometheus + optional OTLP exporters, and the W3C propagator. + +Full design: [`docs/designs/sei-load-observability.md`](https://github.com/sei-protocol/platform/blob/main/docs/designs/sei-load-observability.md) in `sei-protocol/platform`. + +## Invariants you won't want to break + +- **Run scope rides on the Resource, never per-sample labels.** `run_id`, `commit_id`, `chain_id`, `workload`, `service.instance.id` are process-lifetime constants. Putting them on every metric sample would multiply cardinality by (runs × endpoints × workers × buckets). Prometheus joins them in via `target_info{run_id=...}`. +- **`newSchemaless` merging with `resource.Default()`.** `resource.NewWithAttributes(semconv.SchemaURL, ...)` collides with `Default()`'s own schema URL and errors. Schemaless app attributes + schema-stamped Default is the supported pattern. +- **Shutdown providers before exporters.** `MeterProvider.Shutdown` / `TracerProvider.Shutdown` cascade a final flush into their readers and exporters. Explicit exporter shutdown before provider shutdown drops the last OTLP batch (PeriodicReader + BatchSpanProcessor buffer in memory). Prometheus pull-readers are immune but we keep the invariant uniform. +- **Composite propagator is not optional.** Without `SetTextMapPropagator(TraceContext + Baggage)`, `otelhttp` creates spans but silently omits `traceparent` on outbound requests. + +## Exemplar requirements + +Trace-ID exemplars on histograms need all three: + +1. OTel SDK ≥ v1.28 (we pin v1.43). +2. 
`promhttp.HandlerFor(DefaultGatherer, HandlerOpts{EnableOpenMetrics: true})` — the default `promhttp.Handler()` never negotiates OpenMetrics and silently strips exemplars regardless of the scraper's `Accept` header. +3. Prometheus server with `enableFeatures: [exemplar-storage]` (set in `clusters/harbor/monitoring/prometheus-operator.yaml`). + +## Cluster-of-seiload + +`service.instance.id` defaults to `os.Hostname()` when `SEILOAD_INSTANCE_ID` is unset, so multi-pod deployments disambiguate by pod name automatically. In Kubernetes, wire the env var via downward API (`fieldRef: metadata.name`) for a human-readable attribute. diff --git a/observability/setup.go b/observability/setup.go index 4cbd625..adbc98f 100644 --- a/observability/setup.go +++ b/observability/setup.go @@ -1,15 +1,5 @@ // Package observability configures OpenTelemetry for sei-load. -// -// Setup wires a MeterProvider + TracerProvider backed by a Resource populated -// from the SEILOAD_* run-scope env vars, exports metrics via a Prometheus -// reader (always) and an OTLP gRPC exporter (when OTEL_EXPORTER_OTLP_ENDPOINT -// is set), and installs the composite TraceContext+Baggage propagator for -// W3C context propagation on outbound HTTP. -// -// See docs/designs/sei-load-observability.md (in the platform repo) for the -// design rationale — run scope rides as Resource attributes (never per-sample -// labels), exemplars are enabled via trace_based filter by default, and the -// package supports future cluster-of-seiload deployments via service.instance.id. +// See README.md for invariants and exemplar requirements. package observability import ( @@ -31,44 +21,18 @@ import ( semconv "go.opentelemetry.io/otel/semconv/v1.26.0" ) -// RunScope is the set of attributes that identify a single sei-load process -// invocation. The values travel on the OTel Resource (one series per process) -// rather than on per-sample metric labels — see the design doc for the -// cardinality rationale. 
+// RunScope identifies a single sei-load invocation. Values ride on the OTel +// Resource, not per-sample metric labels (see README cardinality rationale). type RunScope struct { - // ServiceVersion is sei-load's own build version, separate from CommitID - // below (which names the commit under test). Built via -ldflags or pulled - // from runtime/debug.ReadBuildInfo. Optional. - ServiceVersion string - - // RunID uniquely identifies a single run (e.g., GHA run id for autobake, - // benchmark job id for the benchmarks namespace). - RunID string - - // ChainID names the ephemeral test chain for this run (e.g. - // "autobake-24702101866-1"). - ChainID string - - // CommitID is the full sei-chain commit SHA being exercised. Lives on the - // Resource as seiload.commit_id and (truncated to 8 chars) as - // seiload.commit_id_short for label-friendly uses. - CommitID string - - // Workload names the invocation's purpose: "autobake", "benchmark", - // "loadtest". Used by alert rules to scope matchers. - Workload string - - // InstanceID uniquely identifies a single sei-load process within a run — - // typically the pod name via the downward API. Populated for cluster-of- - // seiload deployments where multiple pods share a RunID/ChainID but each - // emits its own metrics stream. Exported as service.instance.id - // (OTel semconv) so dashboards can aggregate or disaggregate naturally. - InstanceID string + ServiceVersion string // sei-load's own build version (distinct from CommitID, which names what's under test) + RunID string // e.g. GHA run id for autobake, benchmark job id elsewhere + ChainID string // ephemeral test chain for this run + CommitID string // sei-chain commit under test; exported as seiload.commit_id + 8-char seiload.commit_id_short + Workload string // "autobake" | "benchmark" | "loadtest"; alert rules match on this + InstanceID string // unique per process; falls back to hostname. Disambiguates cluster-of-seiload pods. 
} -// RunScopeFromEnv reads SEILOAD_RUN_ID / SEILOAD_CHAIN_ID / SEILOAD_COMMIT_ID -// / SEILOAD_WORKLOAD / SEILOAD_INSTANCE_ID and OTEL_SERVICE_VERSION. Missing -// values remain empty strings — the caller decides whether that's fatal. +// RunScopeFromEnv reads SEILOAD_* and OTEL_SERVICE_VERSION. Missing values stay empty. func RunScopeFromEnv() RunScope { return RunScope{ ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"), @@ -80,8 +44,6 @@ func RunScopeFromEnv() RunScope { } } -// shortCommit returns the first 8 chars of a commit SHA for label-friendly -// use (Alertmanager grouping, dashboard variable values). Empty in / empty out. func shortCommit(full string) string { if len(full) <= 8 { return full @@ -89,10 +51,6 @@ func shortCommit(full string) string { return full[:8] } -// buildResource composes the OTel Resource from the run scope plus standard -// service identity. service.instance.id defaults to the hostname when -// InstanceID is empty — this keeps single-pod deployments behaving sensibly -// without requiring explicit wiring. func buildResource(rs RunScope) (*resource.Resource, error) { instanceID := rs.InstanceID if instanceID == "" { @@ -125,43 +83,22 @@ func buildResource(rs RunScope) (*resource.Resource, error) { attrs = append(attrs, attribute.String("seiload.workload", rs.Workload)) } - // NewSchemaless lets us merge with resource.Default() (which tracks its - // own schema URL) without the "conflicting Schema URL" error that occurs - // when both sides declare a schema. App-specific attributes are schema- - // agnostic; the SDK Default covers the OS/process identity schema. + // NewSchemaless avoids "conflicting Schema URL" on merge with Default(). return resource.Merge( resource.Default(), resource.NewSchemaless(attrs...), ) } -// Config is the knob set for Setup. Zero-value Config is usable: it produces a -// Prometheus-only metrics pipeline, no OTLP, no tracing exporter. 
Populate -// PrometheusNamespace to set the exporter's metric name prefix. +// Config knobs for Setup. Zero value is usable (Prometheus-only, no OTLP). type Config struct { - // RunScope populates the Resource. Typically built via RunScopeFromEnv(). - RunScope RunScope - - // PrometheusNamespace prefixes all exported metric names. Defaults to - // "seiload" when empty — matches the existing convention. - PrometheusNamespace string - - // OTLPEndpoint, when non-empty, activates the OTLP gRPC exporter for - // BOTH metrics and traces, wired to this endpoint. Use the standard - // OTEL_EXPORTER_OTLP_ENDPOINT env var to populate this — see Setup. - OTLPEndpoint string + RunScope RunScope + PrometheusNamespace string // defaults to "seiload" + OTLPEndpoint string // non-empty activates OTLP gRPC for metrics + traces } -// Setup installs the global MeterProvider and TracerProvider, wires the -// Prometheus reader (always) and the OTLP gRPC exporter (when OTLPEndpoint is -// non-empty), attaches the composite W3C TraceContext+Baggage propagator, and -// returns a shutdown function that flushes and stops all providers. -// -// Call once from main, before any code path that emits telemetry. The -// Meter/Tracer accessors in this package respect the configured providers; -// code that still uses otel.Meter / otel.Tracer at package init will silently -// capture the NoOp provider and drop its emissions — migrate those sites to -// the lazy accessors (see observability.Meter / observability.Tracer). +// Setup installs MeterProvider, TracerProvider, W3C propagator, and returns a +// shutdown func. Call once from main before any telemetry emits. func Setup(ctx context.Context, cfg Config) (shutdown func(context.Context) error, err error) { res, err := buildResource(cfg.RunScope) if err != nil { @@ -207,27 +144,12 @@ func Setup(ctx context.Context, cfg Config) (shutdown func(context.Context) erro mp := sdkmetric.NewMeterProvider(meterOpts...) 
otel.SetMeterProvider(mp) - tp := sdktrace.NewTracerProvider(tracerOpts...) otel.SetTracerProvider(tp) - // Shutdown order is load-bearing. Providers must shut down FIRST — they - // cascade a final flush into their readers and exporters, which is the - // only chance to push the last batch of metrics/spans (OTLP periodic - // reader + batch span processor both buffer in-memory). If the exporter - // is shut down ahead of its provider, the provider's flush fails - // silently and the tail data is lost. The Prometheus pull-reader is - // immune (no buffer) but we keep the same rule for both paths so the - // ordering invariant is uniform. - shutdowns := []func(context.Context) error{ - mp.Shutdown, - tp.Shutdown, - } + // Provider.Shutdown cascades flush into exporters; see README. + shutdowns := []func(context.Context) error{mp.Shutdown, tp.Shutdown} - // Composite propagator: traceparent for trace context, baggage for - // arbitrary cross-process K/V. Without this, otelhttp will create spans - // but NOT inject traceparent on outbound requests — silent propagation - // failure, a well-known OTel footgun. otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( propagation.TraceContext{}, propagation.Baggage{}, @@ -244,8 +166,8 @@ func Setup(ctx context.Context, cfg Config) (shutdown func(context.Context) erro }, nil } -// stripScheme removes a leading scheme like "http://" or "grpc://" from an -// endpoint — otlpmetricgrpc.WithEndpoint expects host:port without a scheme. +// stripScheme trims http://, https://, grpc://, dns:/// so the value fits +// otlpmetricgrpc.WithEndpoint (which wants bare host:port). 
func stripScheme(endpoint string) string { for _, prefix := range []string{"http://", "https://", "grpc://", "dns:///"} { if strings.HasPrefix(endpoint, prefix) { @@ -254,4 +176,3 @@ func stripScheme(endpoint string) string { } return endpoint } - diff --git a/observability/setup_test.go b/observability/setup_test.go index aae6199..6fd9fd8 100644 --- a/observability/setup_test.go +++ b/observability/setup_test.go @@ -2,8 +2,6 @@ package observability import ( "context" - "os" - "strings" "testing" "github.com/stretchr/testify/require" @@ -137,8 +135,6 @@ func TestStripScheme(t *testing.T) { } } -// resourceAttrs flattens a Resource's attribute slice into a map for test -// assertions. func resourceAttrs(kvs []attribute.KeyValue) map[string]string { out := make(map[string]string, len(kvs)) for _, kv := range kvs { @@ -146,7 +142,3 @@ func resourceAttrs(kvs []attribute.KeyValue) map[string]string { } return out } - -// ignore unused-import warnings in case of refactor churn -var _ = os.Getenv -var _ = strings.TrimSpace diff --git a/sender/metrics.go b/sender/metrics.go index 07990aa..59f604d 100644 --- a/sender/metrics.go +++ b/sender/metrics.go @@ -9,11 +9,6 @@ import ( "go.opentelemetry.io/otel/metric" ) -// Package-level instrumentation. The OTel Go global MeterProvider uses a -// delegation mechanism (internal/global) — meters and instruments created -// before observability.Setup runs are rebound to the real provider when it's -// installed, so these package-var declarations are safe. - var meter = otel.Meter("github.com/sei-protocol/sei-load/sender") var ( @@ -46,9 +41,6 @@ var ( return nil }))) - // tpsAchieved reads samples from tpsObserverRegistry, populated via - // RecordTPSSample. No producer is wired today; the gauge is registered - // so future producers slot in without adding to the public API. 
tpsAchieved = must(meter.Float64ObservableGauge( "tps_achieved", metric.WithDescription("Most recent TPS sample observed by the sender, per endpoint/scenario"), @@ -71,9 +63,7 @@ var ( metric.WithUnit("{transactions}"))) ) -// meteredChainWorkers tracks Workers for the queue-length observable. Kept -// as a package-level value because the observable callback needs a stable -// reference; Worker registration is lock-synchronized. +// meteredChainWorkers is the registry the worker_queue_length callback reads. var meteredChainWorkers = &chainWorkerObserver{ workers: make(map[chainWorkerID]*Worker), } @@ -99,9 +89,6 @@ func meterWorkerQueueLength(worker *Worker) { } } -// tpsObserverRegistry holds a per-(endpoint,chain_id,scenario) sample that -// callers update via RecordTPSSample; the observable gauge reads it on each -// scrape. var tpsObserverRegistry = struct { lock sync.RWMutex samples map[tpsSampleKey]float64 @@ -115,8 +102,7 @@ type tpsSampleKey struct { scenario string } -// RecordTPSSample sets the latest TPS sample for a given (endpoint, chain_id, -// scenario) triple. Called from the sender as it rolls its TPS window. +// RecordTPSSample publishes the latest TPS sample read by the tps_achieved gauge. func RecordTPSSample(endpoint, chainID, scenario string, tps float64) { tpsObserverRegistry.lock.Lock() defer tpsObserverRegistry.lock.Unlock() @@ -144,7 +130,6 @@ func statusAttrFromError(err error) attribute.KeyValue { return attribute.String(key, "failure") } -// must panics if err is non-nil, otherwise returns v. func must[V any](v V, err error) V { if err != nil { panic(err) diff --git a/sender/worker.go b/sender/worker.go index 4f82363..53d0dea 100644 --- a/sender/worker.go +++ b/sender/worker.go @@ -53,16 +53,14 @@ func WithMaxIdleConns(n int) HttpClientOption { } // WithMaxIdleConnsPerHost overrides the per-host idle-connection pool size. 
-// Scale this with the goroutine count sharing the client to avoid forcing -// TCP re-dial (and ephemeral-port exhaustion) on most completions. +// Scale with goroutine count to avoid TCP re-dial on each completion. func WithMaxIdleConnsPerHost(n int) HttpClientOption { return func(t *http.Transport) { t.MaxIdleConnsPerHost = n } } -// newHttpTransport builds the tunable base *http.Transport. Split out from -// newHttpClient so tests can inspect the transport's fields directly — -// newHttpClient returns the transport already wrapped in otelhttp, whose -// inner transport isn't publicly accessible. +// newHttpTransport is the base transport factory. Exists separately so tests +// can inspect the unwrapped *http.Transport; newHttpClient returns it wrapped +// in otelhttp, whose inner transport isn't publicly accessible. func newHttpTransport(opts ...HttpClientOption) *http.Transport { t := &http.Transport{ DialContext: (&net.Dialer{ @@ -82,11 +80,9 @@ func newHttpTransport(opts ...HttpClientOption) *http.Transport { return t } -// newHttpClient returns a client whose Transport is wrapped by otelhttp so -// outbound requests inject W3C traceparent + baggage and emit the standard -// http.client.* metrics. Requires the global TextMapPropagator installed by -// observability.Setup — otherwise spans are created but traceparent is not -// injected (silent propagation failure). +// newHttpClient returns an otelhttp-wrapped client: injects traceparent on +// outbound, emits http.client.* metrics. Requires observability.Setup to have +// installed the global TextMapPropagator. func newHttpClient(opts ...HttpClientOption) *http.Client { return &http.Client{ Timeout: 30 * time.Second, @@ -169,9 +165,7 @@ func (w *Worker) watchTransactions(ctx context.Context) error { if err != nil { return err } - // Per-iteration timeout; cancel explicitly at end-of-iteration to - // avoid accumulating deferred cancels (one per tx × run duration - // grows unboundedly under sustained load). 
+ // Cancel per-iteration; defer would leak contexts under sustained load. waitCtx, cancel := context.WithTimeout(ctx, 10*time.Second) if err := w.waitForReceipt(waitCtx, eth, tx); err != nil { log.Printf("❌ %v", err) @@ -193,12 +187,8 @@ func (w *Worker) waitForReceipt(ctx context.Context, eth *ethclient.Client, tx * span.RecordError(_err) } span.End() - // Recording inside the span context makes this sample eligible for - // an exemplar linking to the trace. worker_id is intentionally NOT - // a histogram label here — per-worker fan-out would multiply series - // count by ~50 and breach the cardinality guardrail. Worker-level - // investigation is preserved via the span attribute above (queried - // via exemplar pivot), not per-sample Prometheus labels. + // Record inside the span ctx so exemplars link to the trace. + // worker_id stays off the histogram (cardinality); available via span. receiptLatency.Record(ctx, time.Since(start).Seconds(), metric.WithAttributes( attribute.String("scenario", tx.Scenario.Name), @@ -274,10 +264,7 @@ func (w *Worker) sendTransaction(ctx context.Context, client *http.Client, tx *t span.RecordError(_err) } span.End() - // Recording inside the span context makes samples eligible for - // exemplars linking the histogram bucket to this trace. worker_id - // deliberately omitted — see receiptLatency above for the - // cardinality rationale; worker-level attribution lives on the span. + // See receiptLatency above re: span-context recording + no worker_id. 
sendLatency.Record(ctx, time.Since(start).Seconds(), metric.WithAttributes( attribute.String("scenario", tx.Scenario.Name), diff --git a/sender/worker_test.go b/sender/worker_test.go index 60a252b..cc55ec3 100644 --- a/sender/worker_test.go +++ b/sender/worker_test.go @@ -8,11 +8,6 @@ import ( "github.com/stretchr/testify/require" ) -// Tests target newHttpTransport (the inner *http.Transport factory) directly -// so they stay independent of the otelhttp wrapping that newHttpClient adds. -// The otelhttp wrap is validated separately in integration contexts because -// otelhttp.Transport does not expose its wrapped RoundTripper. - func TestNewHttpTransport_Defaults(t *testing.T) { tr := newHttpTransport() @@ -46,9 +41,6 @@ func TestNewHttpTransport_MultipleOptions(t *testing.T) { require.Equal(t, 1024, tr.MaxIdleConnsPerHost) } -// Smoke-test the full newHttpClient: Timeout is set on the outer client, and -// the Transport exists (opaque otelhttp wrapper). Inner-transport field -// assertions belong in the TestNewHttpTransport_* tests above. func TestNewHttpClient_Smoke(t *testing.T) { c := newHttpClient() require.Equal(t, 30*time.Second, c.Timeout) diff --git a/stats/metrics.go b/stats/metrics.go index 3f193cb..f9b31a0 100644 --- a/stats/metrics.go +++ b/stats/metrics.go @@ -5,9 +5,6 @@ import ( "go.opentelemetry.io/otel/metric" ) -// Package-level instrumentation. See note in sender/metrics.go — the OTel Go -// global delegation mechanism makes var declarations at package init safe. - var meter = otel.Meter("github.com/sei-protocol/sei-load/stats") var ( @@ -28,10 +25,7 @@ var ( metric.WithUnit("s"), metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 5.0, 10.0, 20.0))) - // --- run-summary instruments (per sei-load-observability design) --- - // Gauges, not counters: single emission at run end produces exactly 1 - // series per metric per run after the Resource join — the right shape - // for cross-run comparison dashboards. 
+ // Run-summary: gauges emitted once at run end → 1 series/run via Resource join. runTPSFinal = must(meter.Float64Gauge( "run_tps_final", metric.WithDescription("Final observed TPS for this run (emitted once at run end)"), @@ -48,7 +42,6 @@ var ( metric.WithUnit("{transactions}"))) ) -// must panics if err is non-nil, otherwise returns v. func must[V any](v V, err error) V { if err != nil { panic(err) diff --git a/stats/run_summary.go b/stats/run_summary.go index 1a49904..c8480df 100644 --- a/stats/run_summary.go +++ b/stats/run_summary.go @@ -5,15 +5,7 @@ import ( "time" ) -// EmitRunSummary records the three run-summary gauges (duration, final TPS, -// accepted-txs total) for this Collector's run. Call once at shutdown, after -// the collector has stopped accepting new samples. -// -// The run-summary metrics carry no per-sample attributes — they rely on the -// Resource attributes installed by observability.Setup (run_id, chain_id, -// commit_id_short, workload) to identify the run. This produces exactly -// one series per metric per run after the Resource join in Prometheus -// (target_info), the shape needed for cross-run benchmark dashboards. +// EmitRunSummary records the run-summary gauges. Call once at shutdown. func (c *Collector) EmitRunSummary(ctx context.Context) { c.mu.RLock() duration := time.Since(c.startTime) From ca5b8ee11cdcb2848055d924a04aede23c714fb1 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 14:39:31 -0700 Subject: [PATCH 08/13] ci: bump Go to 1.25.1 + golangci-lint to v2.1.6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OTel SDK deps (v1.43+) declare go 1.25.0 in their own go.mod, which bumped this module's go directive via `go mod tidy`. The pinned CI Go 1.24.5 and golangci-lint v1.64.8 (built on Go 1.24) can't load configs targeting Go 1.25, so the lint step fails at startup. 
- .github/workflows/build-and-test.yml: go-version matrix 1.24.5 -> 1.25.1; golangci-lint-action @v3 -> @v6 with v2.1.6 binary (built on Go 1.25). - Dockerfile: golang:1.24.5-bullseye -> golang:1.25.1-bullseye to match. No source changes — pure toolchain bump. --- .github/workflows/build-and-test.yml | 9 +++------ Dockerfile | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index e1b1b8e..c85ed96 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: - go-version: [1.24.5] + go-version: [1.25.1] steps: - name: Checkout code @@ -40,12 +40,9 @@ jobs: run: go mod verify - name: Install golangci-lint - uses: golangci/golangci-lint-action@v3 + uses: golangci/golangci-lint-action@v6 with: - version: latest - skip-cache: false - skip-pkg-cache: false - skip-build-cache: false + version: v2.1.6 - name: Run linting run: make lint diff --git a/Dockerfile b/Dockerfile index cddbeec..1955855 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.24.5-bullseye AS build +FROM golang:1.25.1-bullseye AS build WORKDIR /go/src/sei-load From 2ba94e4ff23428a8f29b2601d2b286f8b525f6e4 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 14:40:30 -0700 Subject: [PATCH 09/13] ci: golang:1.25.1-bookworm (bullseye variant not published for Go 1.25) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 1955855..20500cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.25.1-bullseye AS build +FROM golang:1.25.1-bookworm AS build WORKDIR /go/src/sei-load From 0002e56a5d1eaa67d67f36a952619dbca5fd569d Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 14:41:39 -0700 Subject: [PATCH 10/13] ci: golangci-lint-action @v7 (v6 doesn't support golangci-lint v2.x) --- .github/workflows/build-and-test.yml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index c85ed96..93ac528 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -40,7 +40,7 @@ jobs: run: go mod verify - name: Install golangci-lint - uses: golangci/golangci-lint-action@v6 + uses: golangci/golangci-lint-action@v7 with: version: v2.1.6 From 0b22e5d1cf5dffc7d25efe5f3519e5a11a3f2245 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 14:43:19 -0700 Subject: [PATCH 11/13] ci: golangci-lint version latest (v2.1.6 still built on Go 1.24) --- .github/workflows/build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 93ac528..6716eb1 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -42,7 +42,7 @@ jobs: - name: Install golangci-lint uses: golangci/golangci-lint-action@v7 with: - version: v2.1.6 + version: latest - name: Run linting run: make lint From 16d3027a1987e65b8435c6b84043da4c1a79a494 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 21 Apr 2026 14:47:54 -0700 Subject: [PATCH 12/13] lint: fix issues surfaced by golangci-lint v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three in my changes + two pre-existing on main unmasked by the newer default ruleset. Mine: - Move observable-gauge registration (worker_queue_length, tps_achieved) from package vars into init(). The instrument handles are never read by app code — the callbacks run via OTel on each scrape — so holding them in named vars reads as dead code to the unused analyzer. init() with discarded returns makes the side-effect-registration intent explicit. 
Pre-existing (caught now because we moved from golangci-lint v1.64 to v2.11 whose default enables errcheck + staticcheck ST1005): - stats/logger.go: guard `reportFile.Close()` with `_ =` via deferred closure (errcheck). - sender/ramper.go: lowercase the error-string first letter to match Go idiom (ST1005). --- sender/metrics.go | 42 ++++++++++++++++++++++++------------------ sender/ramper.go | 2 +- stats/logger.go | 2 +- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/sender/metrics.go b/sender/metrics.go index 59f604d..da16221 100644 --- a/sender/metrics.go +++ b/sender/metrics.go @@ -11,6 +11,7 @@ import ( var meter = otel.Meter("github.com/sei-protocol/sei-load/sender") +// Synchronous instruments — read by Record/Add call sites. var ( sendLatency = must(meter.Float64Histogram( "send_latency", @@ -24,7 +25,27 @@ var ( metric.WithUnit("s"), metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0))) - workerQueueLength = must(meter.Int64ObservableGauge( + httpErrors = must(meter.Int64Counter( + "http_errors", + metric.WithDescription("HTTP error responses from the target endpoint, by status code"), + metric.WithUnit("{errors}"))) + + txsAccepted = must(meter.Int64Counter( + "txs_accepted", + metric.WithDescription("Transactions successfully submitted to an endpoint"), + metric.WithUnit("{transactions}"))) + + txsRejected = must(meter.Int64Counter( + "txs_rejected", + metric.WithDescription("Transactions rejected by the target or local client, by reason"), + metric.WithUnit("{transactions}"))) +) + +// Observable instruments — registered in init for their callback side effect. +// Return values are discarded because OTel invokes the callbacks on each +// collection; we never read the instrument handles. 
+func init() { + must(meter.Int64ObservableGauge( "worker_queue_length", metric.WithDescription("Length of the worker's queue"), metric.WithUnit("{count}"), @@ -41,27 +62,12 @@ var ( return nil }))) - tpsAchieved = must(meter.Float64ObservableGauge( + must(meter.Float64ObservableGauge( "tps_achieved", metric.WithDescription("Most recent TPS sample observed by the sender, per endpoint/scenario"), metric.WithUnit("{transactions}/s"), metric.WithFloat64Callback(observeTPS))) - - httpErrors = must(meter.Int64Counter( - "http_errors", - metric.WithDescription("HTTP error responses from the target endpoint, by status code"), - metric.WithUnit("{errors}"))) - - txsAccepted = must(meter.Int64Counter( - "txs_accepted", - metric.WithDescription("Transactions successfully submitted to an endpoint"), - metric.WithUnit("{transactions}"))) - - txsRejected = must(meter.Int64Counter( - "txs_rejected", - metric.WithDescription("Transactions rejected by the target or local client, by reason"), - metric.WithUnit("{transactions}"))) -) +} // meteredChainWorkers is the registry the worker_queue_length callback reads. var meteredChainWorkers = &chainWorkerObserver{ diff --git a/sender/ramper.go b/sender/ramper.go index a207908..209384c 100644 --- a/sender/ramper.go +++ b/sender/ramper.go @@ -18,7 +18,7 @@ import ( // If we successfully pass a given TPS, we will pause for PauseTime, and then start the next step. // If we fail to pass a given TPS, we will stop the loadtest. 
-var ErrRampTestFailedSLO = errors.New("Ramp Test failed SLO") +var ErrRampTestFailedSLO = errors.New("ramp test failed SLO") func (r *Ramper) FormatRampStats() string { return fmt.Sprintf(` diff --git a/stats/logger.go b/stats/logger.go index 207ce47..1f8479b 100644 --- a/stats/logger.go +++ b/stats/logger.go @@ -365,7 +365,7 @@ func (l *Logger) LogFinalStats() { log.Printf("Error creating report file: %v", err) return } - defer reportFile.Close() + defer func() { _ = reportFile.Close() }() _, err = reportFile.WriteString(finalStats.String()) if err != nil { log.Printf("Error writing report file: %v", err) From 95fe37c95f10c72560504f5607f174fab4baf96c Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 22 Apr 2026 11:14:11 -0700 Subject: [PATCH 13/13] review: address amir-deris feedback on PR #28 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename OTEL_SERVICE_VERSION → SEILOAD_SERVICE_VERSION (matches the SEILOAD_* run-scope namespace; OTEL_SERVICE_VERSION was never an OTel spec env var anyway, spec uses OTEL_RESOURCE_ATTRIBUTES). - Distinguish receipt_latency description from send_latency. - Document OTel delegation pattern at package-level meter acquisition so the next reader doesn't re-derive why this works before Setup runs. - ethclient.DialContext(dialCtx, ...) so the dial honors the span ctx. - go.mod 1.25.0 → 1.25.1 to match CI toolchain. - Graceful metrics-server shutdown via http.Server + Shutdown, with a 5s timeout ordered after obsShutdown so the final Prometheus scrape path stays up during provider flush. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- go.mod | 2 +- main.go | 25 +++++++++++++++++++------ observability/setup.go | 4 ++-- observability/setup_test.go | 2 +- sender/metrics.go | 6 +++++- sender/worker.go | 4 ++-- stats/metrics.go | 1 + 7 files changed, 31 insertions(+), 13 deletions(-) diff --git a/go.mod b/go.mod index aa974a3..1678f0c 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/sei-protocol/sei-load -go 1.25.0 +go 1.25.1 require ( github.com/ethereum/go-ethereum v1.16.1 diff --git a/main.go b/main.go index 2923de2..4cf9cd4 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "context" "encoding/json" + "errors" "fmt" "log" "net/http" @@ -164,16 +165,28 @@ func runLoadTest(ctx context.Context, cmd *cobra.Command, args []string) error { // EnableOpenMetrics is load-bearing: the default promhttp.Handler() strips // exemplars regardless of the scraper's Accept header. + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor( + prometheus.DefaultGatherer, + promhttp.HandlerOpts{EnableOpenMetrics: true}, + )) + metricsServer := &http.Server{ + Addr: listenAddr, + Handler: mux, + ReadHeaderTimeout: 5 * time.Second, + } go func() { - mux := http.NewServeMux() - mux.Handle("/metrics", promhttp.HandlerFor( - prometheus.DefaultGatherer, - promhttp.HandlerOpts{EnableOpenMetrics: true}, - )) - if err := http.ListenAndServe(listenAddr, mux); err != nil { + if err := metricsServer.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { log.Printf("failed to serve metrics: %v", err) } }() + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := metricsServer.Shutdown(shutdownCtx); err != nil { + log.Printf("metrics server shutdown: %v", err) + } + }() ctx, runSpan := otel.Tracer("github.com/sei-protocol/sei-load").Start(ctx, "seiload.run") defer runSpan.End() diff --git a/observability/setup.go b/observability/setup.go index 
adbc98f..177368f 100644 --- a/observability/setup.go +++ b/observability/setup.go @@ -32,10 +32,10 @@ type RunScope struct { InstanceID string // unique per process; falls back to hostname. Disambiguates cluster-of-seiload pods. } -// RunScopeFromEnv reads SEILOAD_* and OTEL_SERVICE_VERSION. Missing values stay empty. +// RunScopeFromEnv reads SEILOAD_*. Missing values stay empty. func RunScopeFromEnv() RunScope { return RunScope{ - ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"), + ServiceVersion: os.Getenv("SEILOAD_SERVICE_VERSION"), RunID: os.Getenv("SEILOAD_RUN_ID"), ChainID: os.Getenv("SEILOAD_CHAIN_ID"), CommitID: os.Getenv("SEILOAD_COMMIT_ID"), diff --git a/observability/setup_test.go b/observability/setup_test.go index 6fd9fd8..1c4b768 100644 --- a/observability/setup_test.go +++ b/observability/setup_test.go @@ -23,7 +23,7 @@ func TestRunScopeFromEnv(t *testing.T) { t.Setenv("SEILOAD_COMMIT_ID", "deadbeefcafef00d") t.Setenv("SEILOAD_WORKLOAD", "autobake") t.Setenv("SEILOAD_INSTANCE_ID", "seiload-abc-0") - t.Setenv("OTEL_SERVICE_VERSION", "v1.2.3") + t.Setenv("SEILOAD_SERVICE_VERSION", "v1.2.3") rs := RunScopeFromEnv() require.Equal(t, "run-42", rs.RunID) diff --git a/sender/metrics.go b/sender/metrics.go index da16221..ab9d39d 100644 --- a/sender/metrics.go +++ b/sender/metrics.go @@ -9,6 +9,10 @@ import ( "go.opentelemetry.io/otel/metric" ) +// Acquired at package init, before observability.Setup installs the real +// MeterProvider. Safe because OTel Go's global is a delegating provider: +// meters and instruments created against it forward to the real provider +// once SetMeterProvider is called. See go.opentelemetry.io/otel/internal/global. var meter = otel.Meter("github.com/sei-protocol/sei-load/sender") // Synchronous instruments — read by Record/Add call sites. 
@@ -21,7 +25,7 @@ var ( receiptLatency = must(meter.Float64Histogram( "receipt_latency", - metric.WithDescription("Latency of sending transactions in seconds"), + metric.WithDescription("Latency from transaction submission to receipt confirmation in seconds"), metric.WithUnit("s"), metric.WithExplicitBucketBoundaries(0.1, 0.2, 0.3, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0))) diff --git a/sender/worker.go b/sender/worker.go index 53d0dea..3c15862 100644 --- a/sender/worker.go +++ b/sender/worker.go @@ -148,12 +148,12 @@ func (w *Worker) watchTransactions(ctx context.Context) error { if w.dryRun || !w.trackReceipts { return nil } - _, dialSpan := tracer.Start(ctx, "sender.dial_endpoint", trace.WithAttributes( + dialCtx, dialSpan := tracer.Start(ctx, "sender.dial_endpoint", trace.WithAttributes( attribute.String("seiload.endpoint", w.endpoint), attribute.String("seiload.chain_id", w.seiChainID), attribute.Int("seiload.worker_id", w.id), )) - eth, err := ethclient.Dial(w.endpoint) + eth, err := ethclient.DialContext(dialCtx, w.endpoint) if err != nil { dialSpan.RecordError(err) dialSpan.End() diff --git a/stats/metrics.go b/stats/metrics.go index f9b31a0..984b3bf 100644 --- a/stats/metrics.go +++ b/stats/metrics.go @@ -5,6 +5,7 @@ import ( "go.opentelemetry.io/otel/metric" ) +// See sender/metrics.go for why package-level acquisition is safe before Setup. var meter = otel.Meter("github.com/sei-protocol/sei-load/stats") var (