diff --git a/.eslintrc b/.eslintrc index c027bbd0..ba0e8a85 100644 --- a/.eslintrc +++ b/.eslintrc @@ -29,7 +29,8 @@ "files": ["*.test.js", "*.test.ts"], "rules": { "no-unused-expressions": "off", - "@typescript-eslint/no-unused-expressions": "off" + "@typescript-eslint/no-unused-expressions": "off", + "func-names": "off" } } ] diff --git a/CHANGELOG.md b/CHANGELOG.md index 99d84602..f60f9346 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,13 @@ - Add metric view metadata support (databricks/databricks-sql-nodejs#312 by @shivam2680) - Fix: Avoid calling require('lz4') if it's really not required (databricks/databricks-sql-nodejs#316 by @ikkala) - Add telemetry foundation (off by default) (databricks/databricks-sql-nodejs#324 by @samikshya-db) +- Telemetry event emission and per-host aggregation (databricks/databricks-sql-nodejs#327 by @samikshya-db). + **Default change:** `telemetryEnabled` now defaults to `true` (gated by a remote feature flag). + To opt out programmatically, pass `telemetryEnabled: false` to `connect()`. + To disable globally without code changes, set the environment variable + `DATABRICKS_TELEMETRY_DISABLED` to one of `1`, `true`, `yes`, or `on` + (case-insensitive). Other values (empty, `0`, `false`, etc.) are ignored + — the runtime config takes precedence. ## 1.12.0 diff --git a/README.md b/README.md index 3b3ff22a..3821a4a2 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,91 @@ client }); ``` +## Telemetry + +Starting with version 1.13, the driver collects telemetry — connection, +statement, and CloudFetch chunk metrics, plus error events with redacted +stack traces — to help Databricks improve driver performance and +reliability. **Telemetry is enabled by default and gated by a server-side +feature flag**: events are emitted only when the workspace's feature flag +is on. No SQL text, parameter values, or row data are ever included. 
+ +### What's collected + +- Connection lifecycle (`CREATE_SESSION`, `DELETE_SESSION`) with latency. +- Statement lifecycle (`STATEMENT_START`, `STATEMENT_COMPLETE`) with + execution latency, operation type, and result format. +- CloudFetch chunk timings and byte counts. +- Error events with redacted stack traces (Bearer/JWT tokens, OAuth + secrets, home-directory paths, and Databricks PATs are stripped before + emission). + +See `TelemetryEvent` and `TelemetryMetric` in the package exports for the +exact payload shapes. + +### Multi-tenant SaaS deployments — read this before enabling telemetry + +The telemetry layer shares one per-host `TelemetryClient` across every +`DBSQLClient` connected to the same Databricks workspace host. The +authenticated export path uses the **first-registered** client's auth +provider, User-Agent, and `telemetryAuthenticatedExport` value — these +fields are snapshotted at the host singleton and are **not** per-tenant. + +If you are operating a SaaS layer that fronts multiple tenants against the +same Databricks workspace host with a shared driver process, telemetry from +tenant B's queries can be POSTed under tenant A's auth headers, with +tenant A's `userAgentEntry`. A tenant B that explicitly set +`telemetryAuthenticatedExport: false` will still ride tenant A's +authenticated pipeline. + +> **Recommendation for multi-tenant deployments**: set +> `telemetryEnabled: false` on all `DBSQLClient` instances, or partition +> by Databricks workspace host so each tenant owns its own +> `TelemetryClient`. Subsequent registrants with diverging auth/UA values +> emit a warn-level log so the leak is at least visible. + +### Opting out + +Three independent ways to disable telemetry, in order of precedence: + +1. **Environment variable** — set `DATABRICKS_TELEMETRY_DISABLED` to one + of `1`, `true`, `yes`, or `on` (case-insensitive). Other values + (empty, `0`, `false`, `off`, `no`) are ignored, leaving the runtime + config in charge. +2. 
**Programmatic** — pass `telemetryEnabled: false` to `connect()`: + ```javascript + await client.connect({ + host, + path, + token, + telemetryEnabled: false, + }); + ``` +3. **Server-side** — Databricks-managed feature flag; if disabled for + your workspace, the driver does not emit telemetry regardless of + client config. + +### Tuning + +If you keep telemetry on, the following knobs are available on +`ConnectionOptions` (see JSDoc on `IDBSQLClient.ts` for defaults and +units): + +- `telemetryAuthenticatedExport` — set to `false` to ship reduced + payloads (no statement/session correlation IDs, generic User-Agent) + via the unauthenticated endpoint. +- `telemetryBatchSize`, `telemetryFlushIntervalMs`, `telemetryMaxRetries` + — batching and retry tuning. +- `telemetryCircuitBreakerThreshold`, `telemetryCircuitBreakerTimeout` — + circuit-breaker tuning for the export endpoint. +- `telemetryCloseTimeoutMs` — bound on `await client.close()` waiting for + the final flush. + +> **Note for short-lived processes**: always `await client.close()` +> before `process.exit(0)` so the final batch is flushed. Without an +> explicit close, the periodic flush timer is `unref()`'d to avoid +> holding the event loop open, so any unflushed events are dropped. 
+ ## Run Tests ### Unit tests diff --git a/lib/DBSQLClient.ts b/lib/DBSQLClient.ts index 38d55a54..340efd0c 100644 --- a/lib/DBSQLClient.ts +++ b/lib/DBSQLClient.ts @@ -1,5 +1,6 @@ import thrift from 'thrift'; import Int64 from 'node-int64'; +import os from 'os'; import { EventEmitter } from 'events'; import TCLIService from '../thrift/TCLIService'; @@ -31,6 +32,13 @@ import IDBSQLLogger, { LogLevel } from './contracts/IDBSQLLogger'; import DBSQLLogger from './DBSQLLogger'; import CloseableCollection from './utils/CloseableCollection'; import IConnectionProvider from './connection/contracts/IConnectionProvider'; +import TelemetryClient from './telemetry/TelemetryClient'; +import TelemetryClientProvider from './telemetry/TelemetryClientProvider'; +import TelemetryEventEmitter from './telemetry/TelemetryEventEmitter'; +import MetricsAggregator from './telemetry/MetricsAggregator'; +import { DriverConfiguration, DRIVER_NAME, TelemetryEventType, DEFAULT_TELEMETRY_CONFIG } from './telemetry/types'; +import { safeEmit } from './telemetry/telemetryUtils'; +import driverVersion from './version'; function prependSlash(str: string): string { if (str.length > 0 && str.charAt(0) !== '/') { @@ -54,6 +62,34 @@ function getInitialNamespaceOptions(catalogName?: string, schemaName?: string) { export type ThriftLibrary = Pick; +/** + * Copy any defined telemetry knob from `src` into `dst`. Both objects declare + * identical types for these keys, so the assignment is structurally typed — + * a wrong-shape value in `ConnectionOptions` is caught at the call site. + * + * Keep this in sync with the `telemetry*` knobs exposed in + * `ConnectionOptions` (lib/contracts/IDBSQLClient.ts) and `ClientConfig` + * (lib/contracts/IClientContext.ts). Adding a knob requires extending this + * list AND the public option surface; otherwise the user-supplied override + * silently does nothing. 
+ */ +function copyDefinedTelemetryOptions(src: ConnectionOptions, dst: ClientConfig): void { + if (src.telemetryEnabled !== undefined) dst.telemetryEnabled = src.telemetryEnabled; + if (src.telemetryBatchSize !== undefined) dst.telemetryBatchSize = src.telemetryBatchSize; + if (src.telemetryFlushIntervalMs !== undefined) dst.telemetryFlushIntervalMs = src.telemetryFlushIntervalMs; + if (src.telemetryMaxRetries !== undefined) dst.telemetryMaxRetries = src.telemetryMaxRetries; + if (src.telemetryAuthenticatedExport !== undefined) + dst.telemetryAuthenticatedExport = src.telemetryAuthenticatedExport; + if (src.telemetryCircuitBreakerThreshold !== undefined) + dst.telemetryCircuitBreakerThreshold = src.telemetryCircuitBreakerThreshold; + if (src.telemetryCircuitBreakerTimeout !== undefined) + dst.telemetryCircuitBreakerTimeout = src.telemetryCircuitBreakerTimeout; + if (src.telemetryCloseTimeoutMs !== undefined) dst.telemetryCloseTimeoutMs = src.telemetryCloseTimeoutMs; + if (src.telemetryMaxStatementMetrics !== undefined) + dst.telemetryMaxStatementMetrics = src.telemetryMaxStatementMetrics; + if (src.telemetryMaxPendingMetrics !== undefined) dst.telemetryMaxPendingMetrics = src.telemetryMaxPendingMetrics; +} + export default class DBSQLClient extends EventEmitter implements IDBSQLClient, IClientContext { private static defaultLogger?: IDBSQLLogger; @@ -75,6 +111,28 @@ export default class DBSQLClient extends EventEmitter implements IDBSQLClient, I private readonly sessions = new CloseableCollection(); + // Telemetry components — `telemetryClient` is the shared per-host owner + // (process-wide via TelemetryClientProvider). The exporter, aggregator, + // circuit-breaker registry and feature-flag cache live on it. Each + // DBSQLClient still owns its own `telemetryEmitter` so it respects its + // own `telemetryEnabled` flag. 
+ private host?: string; + + private httpPath?: string; + + private authType?: string; + + private telemetryClient?: TelemetryClient; + + private telemetryEmitter?: TelemetryEventEmitter; + + // True once we've shipped the full DriverConfiguration on a CONNECTION_OPEN + // event for this client. Subsequent openSession events for the same client + // strip the (~1KB, static-for-the-process) blob — a long-running client + // opening N sessions would otherwise pay N×blob bytes for telemetry on data + // that hasn't changed since the first session. + private driverConfigShipped = false; + private static getDefaultLogger(): IDBSQLLogger { if (!this.defaultLogger) { this.defaultLogger = new DBSQLLogger(); @@ -101,6 +159,31 @@ export default class DBSQLClient extends EventEmitter implements IDBSQLClient, I cloudFetchSpeedThresholdMBps: 0.1, useLZ4Compression: true, + + // Telemetry defaults are sourced from DEFAULT_TELEMETRY_CONFIG so + // every component reads from the same single frozen const. Mapping the + // unprefixed TelemetryConfiguration keys to the `telemetry`-prefixed + // ClientConfig keys is mechanical; doing it once here means that adding + // a new knob to DEFAULT_TELEMETRY_CONFIG only requires extending the + // ClientConfig surface and (optionally) adding to telemetryOverrides. + // Previously this method declared 7 keys while DEFAULT_TELEMETRY_CONFIG + // declared 15 — silent desync risk every time someone touched one but + // not the other. 
+ telemetryEnabled: DEFAULT_TELEMETRY_CONFIG.enabled, + telemetryBatchSize: DEFAULT_TELEMETRY_CONFIG.batchSize, + telemetryFlushIntervalMs: DEFAULT_TELEMETRY_CONFIG.flushIntervalMs, + telemetryMaxRetries: DEFAULT_TELEMETRY_CONFIG.maxRetries, + telemetryBackoffBaseMs: DEFAULT_TELEMETRY_CONFIG.backoffBaseMs, + telemetryBackoffMaxMs: DEFAULT_TELEMETRY_CONFIG.backoffMaxMs, + telemetryBackoffJitterMs: DEFAULT_TELEMETRY_CONFIG.backoffJitterMs, + telemetryAuthenticatedExport: DEFAULT_TELEMETRY_CONFIG.authenticatedExport, + telemetryCircuitBreakerThreshold: DEFAULT_TELEMETRY_CONFIG.circuitBreakerThreshold, + telemetryCircuitBreakerTimeout: DEFAULT_TELEMETRY_CONFIG.circuitBreakerTimeout, + telemetryMaxPendingMetrics: DEFAULT_TELEMETRY_CONFIG.maxPendingMetrics, + telemetryMaxErrorsPerStatement: DEFAULT_TELEMETRY_CONFIG.maxErrorsPerStatement, + telemetryStatementTtlMs: DEFAULT_TELEMETRY_CONFIG.statementTtlMs, + telemetryCloseTimeoutMs: DEFAULT_TELEMETRY_CONFIG.closeTimeoutMs, + telemetryMaxStatementMetrics: DEFAULT_TELEMETRY_CONFIG.maxStatementMetrics, }; } @@ -212,6 +295,211 @@ export default class DBSQLClient extends EventEmitter implements IDBSQLClient, I return new HttpConnection(this.getConnectionOptions(options), this); } + /** + * Extract the numeric workspace ID for telemetry. + * + * The only reliable carrier in the connection params today is the `?o=N` + * query parameter on `httpPath` — Databricks SQL warehouses are typically + * connected to via paths like `/sql/1.0/warehouses/?o=12345678901234`. + * + * Host-based extraction was tried previously but produced confidently-wrong + * values: + * - AWS `dbc-XXXXX-YYYY.cloud.databricks.com` → `dbc-XXXXX-YYYY` + * is the deployment shard prefix, not the workspace ID. + * - Azure `adb-NNNNNNNNNNNNN.NN.azuredatabricks.net` → the workspace ID is + * the numeric portion after the `adb-` prefix (and before the form-factor + * digit), not `adb-NNN`. + * + * Returns `undefined` when no workspace ID can be derived. 
Server-side + * attribution is better off seeing a missing field than a wrong value. + */ + private extractWorkspaceId(): string | undefined { + const { httpPath } = this; + if (!httpPath) { + return undefined; + } + const queryIdx = httpPath.indexOf('?'); + if (queryIdx < 0) { + return undefined; + } + const query = httpPath.slice(queryIdx + 1); + // Match `o=` as the first param, an inner `&o=`, etc. + // Workspace IDs are decimal integers; reject anything else so a stray + // `o=tenant_42` doesn't ship as a workspace ID. + const match = query.match(/(?:^|&)o=(\d+)(?:&|$)/); + return match ? match[1] : undefined; + } + + /** + * Build driver configuration for telemetry reporting. + * @returns DriverConfiguration object with current driver settings + */ + private buildDriverConfiguration(): DriverConfiguration { + return { + driverVersion, + driverName: DRIVER_NAME, + nodeVersion: process.version, + platform: process.platform, + osVersion: os.release(), + osArch: os.arch(), + runtimeVendor: 'Node.js Foundation', + localeName: this.getLocaleName(), + charSetEncoding: 'UTF-8', + processName: this.getProcessName(), + authType: this.authType || 'pat', + + // Feature flags + cloudFetchEnabled: this.config.useCloudFetch ?? false, + lz4Enabled: this.config.useLZ4Compression ?? false, + arrowEnabled: this.config.arrowEnabled ?? false, + directResultsEnabled: true, // Direct results always enabled + + // Configuration values + socketTimeout: this.config.socketTimeout ?? 0, + retryMaxAttempts: this.config.retryMaxAttempts ?? 0, + cloudFetchConcurrentDownloads: this.config.cloudFetchConcurrentDownloads ?? 0, + + // Connection parameters + httpPath: this.httpPath, + enableMetricViewMetadata: this.config.enableMetricViewMetadata, + }; + } + + /** + * Map Node.js auth type to telemetry auth enum string. + * Distinguishes between U2M and M2M OAuth flows. 
+ */ + private mapAuthType(options: ConnectionOptions): string { + switch (options.authType) { + case 'databricks-oauth': + return options.oauthClientSecret === undefined ? 'external-browser' : 'oauth-m2m'; + case 'custom': + return 'custom'; + case 'token-provider': + return 'token-provider'; + case 'external-token': + return 'external-token'; + case 'static-token': + return 'static-token'; + case 'access-token': + case undefined: + return 'pat'; + default: + return 'unknown'; + } + } + + /** + * Get locale name in format language_country (e.g., en_US). + * Matches JDBC format: user.language + '_' + user.country + */ + private getLocaleName(): string { + try { + // Try to get from environment variables + const lang = process.env.LANG || process.env.LC_ALL || process.env.LC_MESSAGES || ''; + if (lang) { + // LANG format is typically "en_US.UTF-8", extract "en_US" + const match = lang.match(/^([a-z]{2}_[A-Z]{2})/); + if (match) { + return match[1]; + } + } + // Fallback to en_US + return 'en_US'; + } catch { + return 'en_US'; + } + } + + /** + * Get process name, similar to JDBC's ProcessNameUtil. + * Returns the script name or process title. + */ + private getProcessName(): string { + try { + // Try process.title first (can be set by application) + if (process.title && process.title !== 'node') { + return process.title; + } + // Try to get the main script name from argv[1] + if (process.argv && process.argv.length > 1) { + const scriptPath = process.argv[1]; + // Extract filename without path + const filename = scriptPath.split('/').pop()?.split('\\').pop() || ''; + // Remove extension + const nameWithoutExt = filename.replace(/\.[^.]*$/, ''); + if (nameWithoutExt) { + return nameWithoutExt; + } + } + return 'node'; + } catch { + return 'node'; + } + } + + /** + * Initialize telemetry components if enabled. + * CRITICAL: All errors swallowed and logged at LogLevel.debug ONLY. + * Driver NEVER throws exceptions due to telemetry. 
+ */ + private async initializeTelemetry(): Promise { + if (!this.host) { + return; + } + + try { + // Acquire (or create) the per-host TelemetryClient from the + // process-wide provider. The shared client owns the circuit-breaker + // registry, feature-flag cache, exporter, and aggregator. Multiple + // DBSQLClient instances on the same host share these resources so + // breaker counters and HTTP batches don't fragment per-instance. + this.telemetryClient = TelemetryClientProvider.getInstance().getOrCreateClient(this, this.host); + + // Use the shared feature-flag cache (registered in the previous step). + const enabled = await this.telemetryClient.getFeatureFlagCache().isTelemetryEnabled(this.host); + + if (!enabled) { + // Release our refcount immediately; we won't be emitting. + await TelemetryClientProvider.getInstance().releaseClient(this, this.host); + this.telemetryClient = undefined; + this.logger.log(LogLevel.debug, 'Telemetry: disabled'); + return; + } + + // Each DBSQLClient still owns its own emitter so it respects its own + // `telemetryEnabled` flag and feature-flag result. All emitters bridge + // into the SHARED aggregator on the TelemetryClient. + this.telemetryEmitter = new TelemetryEventEmitter(this); + const sharedAggregator = this.telemetryClient.getAggregator(); + for (const eventType of Object.values(TelemetryEventType)) { + this.telemetryEmitter.on(eventType, (event) => { + sharedAggregator.processEvent(event); + }); + } + + this.logger.log(LogLevel.debug, 'Telemetry: enabled'); + } catch (error: any) { + // Swallow all telemetry initialization errors. If we acquired a refcount + // before the throw, release it — otherwise the per-host TelemetryClient + // (and its flush timer / exporter / FFCache) leaks for the lifetime of + // the process on long-running supervisors that retry-connect. 
+ if (this.telemetryClient) { + try { + await TelemetryClientProvider.getInstance().releaseClient(this, this.host); + } catch (releaseError: any) { + this.logger.log( + LogLevel.debug, + `Telemetry release-after-init-failure error: ${releaseError?.message ?? releaseError}`, + ); + } + this.telemetryClient = undefined; + this.telemetryEmitter = undefined; + } + this.logger.log(LogLevel.debug, `Telemetry initialization error: ${error?.message ?? error}`); + } + } + /** * Connects DBSQLClient to endpoint * @public @@ -233,11 +521,40 @@ export default class DBSQLClient extends EventEmitter implements IDBSQLClient, I } } + // If connect() is being called a second time (reconnect, host switch), + // release the prior telemetry refcount and emitter so we don't leak a + // refcount in the process-wide TelemetryClientProvider for the old host. + if (this.host && this.telemetryClient) { + try { + await TelemetryClientProvider.getInstance().releaseClient(this, this.host); + } catch (error: any) { + this.logger.log(LogLevel.debug, `Telemetry release-on-reconnect error: ${error.message}`); + } + this.telemetryClient = undefined; + this.telemetryEmitter = undefined; + } + // Re-arm: the new connection is a fresh client-config lineage even if + // the host is the same. + this.driverConfigShipped = false; + + // Store connection params for telemetry + this.host = options.host; + this.httpPath = options.path; + this.authType = this.mapAuthType(options); + // Store enableMetricViewMetadata configuration if (options.enableMetricViewMetadata !== undefined) { this.config.enableMetricViewMetadata = options.enableMetricViewMetadata; } + // Override telemetry config if provided in options. 
Per-key narrowed copy + // preserves the structural type system: `ConnectionOptions` and + // `ClientConfig` declare identical types for these knobs, so a user + // passing `telemetryBatchSize: "100"` (string) gets a TS error instead of + // silently writing a string into a number field that `MetricsAggregator` + // would later read and break aggregation thresholds at runtime. + copyDefinedTelemetryOptions(options, this.config); + // Persist userAgentEntry so telemetry and feature-flag call sites reuse // the same value as the primary Thrift connection's User-Agent. if (options.userAgentEntry !== undefined) { @@ -277,6 +594,32 @@ export default class DBSQLClient extends EventEmitter implements IDBSQLClient, I this.emit('timeout'); }); + // Initialize telemetry if enabled. The env var DATABRICKS_TELEMETRY_DISABLED + // is a hard kill switch for ops/IT teams who can't redeploy app code. + // Recognized truthy values: 1, true, yes, on (case-insensitive). Anything + // else (empty, "0", "false", "no", "off") leaves the runtime config in + // charge — avoiding the footgun where a sysadmin sets the var to "false" + // expecting to enable telemetry. + const envKill = process.env.DATABRICKS_TELEMETRY_DISABLED; + const trimmedEnvKill = typeof envKill === 'string' ? envKill.trim() : ''; + const envDisabled = trimmedEnvKill.length > 0 && /^(1|true|yes|on)$/i.test(trimmedEnvKill); + // Surface the misconfiguration: an ops engineer who sees the var name and + // tries to "set it to false to keep telemetry on" otherwise gets the + // opposite of what they expect (the var is then silently ignored, runtime + // config stays in charge — default `true`). Warn on any non-empty value + // that isn't recognized so the disable-failed shape is visible in logs. + if (trimmedEnvKill.length > 0 && !envDisabled) { + this.logger.log( + LogLevel.warn, + `DATABRICKS_TELEMETRY_DISABLED='${trimmedEnvKill}' was ignored. ` + + `To disable telemetry, set the variable to one of: 1, true, yes, on. 
` + + `Telemetry remains controlled by the runtime config and feature flag.`, + ); + } + if (this.config.telemetryEnabled && !envDisabled) { + await this.initializeTelemetry(); + } + return this; } @@ -290,6 +633,9 @@ export default class DBSQLClient extends EventEmitter implements IDBSQLClient, I * const session = await client.openSession(); */ public async openSession(request: OpenSessionRequest = {}): Promise { + // Track connection open latency + const startTime = Date.now(); + // Prepare session configuration const configuration = request.configuration ? { ...request.configuration } : {}; @@ -322,12 +668,59 @@ export default class DBSQLClient extends EventEmitter implements IDBSQLClient, I serverProtocolVersion: response.serverProtocolVersion, }); this.sessions.add(session); + + // Emit connection.open telemetry event. The DriverConfiguration blob + // (~1KB: runtime/OS/locale/process info) is static for the lifetime of + // this DBSQLClient — ship it once, on the first openSession, and omit + // on subsequent sessions for the same client. Server-side correlation + // by sessionId still groups N sessions under the first event's config. + safeEmit(this, (emitter) => { + if (!this.host) return; + const latencyMs = Date.now() - startTime; + const workspaceId = this.extractWorkspaceId(); + const driverConfig = this.driverConfigShipped ? undefined : this.buildDriverConfiguration(); + if (driverConfig) { + this.driverConfigShipped = true; + } + emitter.emitConnectionOpen({ + sessionId: session.id, + workspaceId, + driverConfig, + latencyMs, + }); + }); + return session; } + /** + * Closes the client, releasing sessions and telemetry resources. + * + * The internal telemetry flush timer uses `setInterval(...).unref()` so it + * cannot keep the Node.js process alive on its own. As a consequence, any + * telemetry buffered between flush ticks is lost if the process exits + * without calling `close()`. 
Long-lived applications should `await` this + * method on shutdown so the aggregator drains its remaining metrics. + */ public async close(): Promise { await this.sessions.closeAll(); + // Cleanup telemetry. Releasing our refcount on the shared TelemetryClient + // is awaited because the underlying close() drains the final HTTP POST — + // a caller doing `await client.close(); process.exit(0)` would otherwise + // truncate the in-flight request when this is the last refcount holder. + if (this.host && this.telemetryClient) { + try { + await TelemetryClientProvider.getInstance().releaseClient(this, this.host); + } catch (error: any) { + this.logger.log(LogLevel.debug, `Telemetry cleanup error: ${error.message}`); + } + this.telemetryClient = undefined; + } + // Drop the emitter ref so post-close calls (e.g. session.close racing + // with client.close) cannot smuggle events into the closed aggregator. + this.telemetryEmitter = undefined; + this.client = undefined; this.connectionProvider = undefined; this.authProvider = undefined; @@ -379,4 +772,37 @@ export default class DBSQLClient extends EventEmitter implements IDBSQLClient, I public getAuthProvider(): IAuthentication | undefined { return this.authProvider; } + + /** @internal */ + public getTelemetryEmitter(): TelemetryEventEmitter | undefined { + return this.telemetryEmitter; + } + + /** @internal */ + public getTelemetryAggregator(): MetricsAggregator | undefined { + return this.telemetryClient?.getAggregator(); + } + + /** + * Operator-visible snapshot of the client's telemetry state: current + * buffer depth, in-flight statement aggregations, cumulative drops/ + * evictions, and circuit-breaker state. Returns `undefined` when + * telemetry is disabled (config, env-kill, or feature-flag). + * + * Use this in health-check endpoints or shutdown banners to verify that + * telemetry is flowing. A non-zero `droppedMetrics` between observations + * means buffer overflow — raise `telemetryMaxPendingMetrics`. 
+ */ + public getTelemetryStats(): + | { + host: string; + pendingMetricsCount: number; + inFlightStatements: number; + droppedMetrics: number; + evictedStatements: number; + circuitBreakerState: string; + } + | undefined { + return this.telemetryClient?.getTelemetryStats(); + } } diff --git a/lib/DBSQLOperation.ts b/lib/DBSQLOperation.ts index fe22995d..d95fe51b 100644 --- a/lib/DBSQLOperation.ts +++ b/lib/DBSQLOperation.ts @@ -34,11 +34,15 @@ import { definedOrError } from './utils'; import { OperationChunksIterator, OperationRowsIterator } from './utils/OperationIterator'; import HiveDriverError from './errors/HiveDriverError'; import IClientContext from './contracts/IClientContext'; +import { mapOperationTypeToTelemetryType, mapResultFormatToTelemetryType } from './telemetry/telemetryTypeMappers'; +import ExceptionClassifier from './telemetry/ExceptionClassifier'; +import { safeEmit } from './telemetry/telemetryUtils'; interface DBSQLOperationConstructorOptions { handle: TOperationHandle; directResults?: TSparkDirectResults; context: IClientContext; + sessionId?: string; } async function delay(ms?: number): Promise { @@ -76,9 +80,17 @@ export default class DBSQLOperation implements IOperation { private resultHandler?: ResultSlicer; - constructor({ handle, directResults, context }: DBSQLOperationConstructorOptions) { + // Telemetry tracking fields + private startTime: number = Date.now(); + + private pollCount: number = 0; + + private sessionId?: string; + + constructor({ handle, directResults, context, sessionId }: DBSQLOperationConstructorOptions) { this.operationHandle = handle; this.context = context; + this.sessionId = sessionId; const useOnlyPrefetchedResults = Boolean(directResults?.closeOperation); @@ -92,9 +104,13 @@ export default class DBSQLOperation implements IOperation { this.operationHandle, [directResults?.resultSet], useOnlyPrefetchedResults, + this.id, ); this.closeOperation = directResults?.closeOperation; 
this.context.getLogger().log(LogLevel.debug, `Operation created with id: ${this.id}`); + + // Emit statement.start telemetry event + this.emitStatementStart(); } public iterateChunks(options?: IteratorOptions): IOperationChunksIterator { @@ -166,6 +182,10 @@ export default class DBSQLOperation implements IOperation { * const result = await queryOperation.fetchChunk({maxRows: 1000}); */ public async fetchChunk(options?: FetchOptions): Promise> { + return this.withErrorTelemetry(() => this.fetchChunkInternal(options)); + } + + private async fetchChunkInternal(options?: FetchOptions): Promise> { await this.failIfClosed(); if (!this.operationHandle.hasResultSet) { @@ -225,6 +245,9 @@ export default class DBSQLOperation implements IOperation { return this.operationStatus; } + // Track poll count for telemetry + this.pollCount += 1; + const driver = await this.context.getDriver(); const response = await driver.getOperationStatus({ operationHandle: this.operationHandle, @@ -239,6 +262,10 @@ export default class DBSQLOperation implements IOperation { * @throws {StatusError} */ public async cancel(): Promise { + return this.withErrorTelemetry(() => this.cancelInternal()); + } + + private async cancelInternal(): Promise { if (this.closed || this.cancelled) { return Status.success(); } @@ -263,6 +290,10 @@ export default class DBSQLOperation implements IOperation { * @throws {StatusError} */ public async close(): Promise { + return this.withErrorTelemetry(() => this.closeInternal()); + } + + private async closeInternal(): Promise { if (this.closed || this.cancelled) { return Status.success(); } @@ -279,50 +310,75 @@ export default class DBSQLOperation implements IOperation { this.closed = true; const result = new Status(response.status); + // Emit statement.complete telemetry event + await this.emitStatementComplete(); + this.onClose?.(); return result; } public async finished(options?: FinishedOptions): Promise { - await this.failIfClosed(); - await 
this.waitUntilReady(options); + return this.withErrorTelemetry(async () => { + await this.failIfClosed(); + await this.waitUntilReady(options); + }); } public async hasMoreRows(): Promise { - // If operation is closed or cancelled - we should not try to get data from it - if (this.closed || this.cancelled) { - return false; - } + return this.withErrorTelemetry(async () => { + // If operation is closed or cancelled - we should not try to get data from it + if (this.closed || this.cancelled) { + return false; + } - // Wait for operation to finish before checking for more rows - // This ensures metadata can be fetched successfully - if (this.operationHandle.hasResultSet) { - await this.waitUntilReady(); - } + // Wait for operation to finish before checking for more rows + // This ensures metadata can be fetched successfully + if (this.operationHandle.hasResultSet) { + await this.waitUntilReady(); + } - // If we fetched all the data from server - check if there's anything buffered in result handler - const resultHandler = await this.getResultHandler(); - return resultHandler.hasMore(); + // If we fetched all the data from server - check if there's anything buffered in result handler + const resultHandler = await this.getResultHandler(); + return resultHandler.hasMore(); + }); } public async getSchema(options?: GetSchemaOptions): Promise { - await this.failIfClosed(); + return this.withErrorTelemetry(async () => { + await this.failIfClosed(); - if (!this.operationHandle.hasResultSet) { - return null; - } + if (!this.operationHandle.hasResultSet) { + return null; + } - await this.waitUntilReady(options); + await this.waitUntilReady(options); - this.context.getLogger().log(LogLevel.debug, `Fetching schema for operation with id: ${this.id}`); - const metadata = await this.fetchMetadata(); - return metadata.schema ?? 
null; + this.context.getLogger().log(LogLevel.debug, `Fetching schema for operation with id: ${this.id}`); + const metadata = await this.fetchMetadata(); + return metadata.schema ?? null; + }); } public async getMetadata(): Promise { - await this.failIfClosed(); - await this.waitUntilReady(); - return this.fetchMetadata(); + return this.withErrorTelemetry(async () => { + await this.failIfClosed(); + await this.waitUntilReady(); + return this.fetchMetadata(); + }); + } + + /** + * Wrap a public IOperation method so any thrown error is captured as an + * error telemetry event before being rethrown to the caller. Telemetry + * never alters the throw semantics. + */ + private async withErrorTelemetry(fn: () => Promise): Promise { + try { + return await fn(); + } catch (err: any) { + this.emitErrorEvent(err); + throw err; + } } private async failIfClosed(): Promise { @@ -441,7 +497,7 @@ export default class DBSQLOperation implements IOperation { case TSparkRowSetType.URL_BASED_SET: resultSource = new ArrowResultConverter( this.context, - new CloudFetchResultHandler(this.context, this._data, metadata), + new CloudFetchResultHandler(this.context, this._data, metadata, this.id), metadata, ); break; @@ -481,4 +537,73 @@ export default class DBSQLOperation implements IOperation { return response; } + + /** + * Emit statement.start telemetry event. + * CRITICAL: All exceptions swallowed and logged at LogLevel.debug ONLY. + */ + private emitStatementStart(): void { + safeEmit(this.context, (emitter) => { + emitter.emitStatementStart({ + statementId: this.id, + // Pass `undefined` when sessionId is unknown rather than `''`. All + // emit sites in this class use the same default so the aggregator's + // per-statement state doesn't end up split between `''` and + // `undefined` (which would synthesize two ghost sessions for a + // single operation that briefly lacked a sessionId). 
+ sessionId: this.sessionId, + operationType: mapOperationTypeToTelemetryType(this.operationHandle.operationType), + }); + }); + } + + /** + * Emit statement.complete telemetry event and complete aggregation. + * CRITICAL: All exceptions swallowed and logged at LogLevel.debug ONLY. + */ + private async emitStatementComplete(): Promise { + safeEmit(this.context, (emitter) => { + const aggregator = this.context.getTelemetryAggregator?.(); + if (!aggregator) return; + + // Use whatever metadata was already fetched by the result-handling + // path. Do NOT trigger a `getMetadata()` here — that issues a Thrift + // RPC on every close (doubles close latency for short DDL/DML) AND + // throws if the operation is already in an error/closed state, which + // would then fire spurious error telemetry from `getMetadata`'s error + // wrapper. + const resultFormat = mapResultFormatToTelemetryType(this.metadata?.resultFormat); + const latencyMs = Date.now() - this.startTime; + + emitter.emitStatementComplete({ + statementId: this.id, + sessionId: this.sessionId, + latencyMs, + resultFormat, + pollCount: this.pollCount, + }); + + aggregator.completeStatement(this.id); + }); + } + + /** + * Emit a telemetry error event for an exception thrown by an operation. + * Terminal errors (per `ExceptionClassifier`) trigger an immediate flush + * in the aggregator; retryable errors are buffered until the statement + * completes. All exceptions from this method itself are swallowed at + * debug level — telemetry must never break the driver. 
+ */ + private emitErrorEvent(error: Error): void { + safeEmit(this.context, (emitter) => { + emitter.emitError({ + statementId: this.id, + sessionId: this.sessionId, + errorName: error.name || 'Error', + errorMessage: error.message || 'Unknown error', + errorStack: error.stack, + isTerminal: ExceptionClassifier.isTerminal(error), + }); + }); + } } diff --git a/lib/DBSQLSession.ts b/lib/DBSQLSession.ts index 95715e1b..152bd58a 100644 --- a/lib/DBSQLSession.ts +++ b/lib/DBSQLSession.ts @@ -39,6 +39,7 @@ import StagingError from './errors/StagingError'; import { DBSQLParameter, DBSQLParameterValue } from './DBSQLParameter'; import ParameterError from './errors/ParameterError'; import IClientContext, { ClientConfig } from './contracts/IClientContext'; +import { safeEmit } from './telemetry/telemetryUtils'; // Explicitly promisify a callback-style `pipeline` because `node:stream/promises` is not available in Node 14 const pipeline = util.promisify(stream.pipeline); @@ -151,6 +152,8 @@ export default class DBSQLSession implements IDBSQLSession { private isOpen = true; + private openTime: number; + private serverProtocolVersion?: TProtocolVersion; public onClose?: () => void; @@ -169,6 +172,7 @@ export default class DBSQLSession implements IDBSQLSession { constructor({ handle, context, serverProtocolVersion }: DBSQLSessionConstructorOptions) { this.sessionHandle = handle; this.context = context; + this.openTime = Date.now(); // Get the server protocol version from the provided parameter (from TOpenSessionResp) this.serverProtocolVersion = serverProtocolVersion; this.context.getLogger().log(LogLevel.debug, `Session created with id: ${this.id}`); @@ -588,19 +592,43 @@ export default class DBSQLSession implements IDBSQLSession { // Close owned operations one by one, removing successfully closed ones from the list await this.operations.closeAll(); - const driver = await this.context.getDriver(); - const response = await driver.closeSession({ - sessionHandle: 
this.sessionHandle, - }); - // check status for being successful - Status.assert(response.status); - - // notify owner connection - this.onClose?.(); - this.isOpen = false; + // `latencyMs` on CONNECTION_CLOSE is the closeSession RPC duration, not the + // session lifetime. It's the symmetric counterpart of CREATE_SESSION's + // openSession RPC latency — both land under `operation_latency_ms` + // server-side and must measure the same kind of duration. + let closeStart = Date.now(); + const emitClose = () => { + // Emit connection.close regardless of whether closeSession succeeded — + // a failed close is the most diagnostic event for connection-failure rate. + const latencyMs = Date.now() - closeStart; + safeEmit(this.context, (emitter) => { + emitter.emitConnectionClose({ + sessionId: this.id, + latencyMs, + }); + }); + }; - this.context.getLogger().log(LogLevel.debug, `Session closed with id: ${this.id}`); - return new Status(response.status); + try { + const driver = await this.context.getDriver(); + closeStart = Date.now(); + const response = await driver.closeSession({ + sessionHandle: this.sessionHandle, + }); + Status.assert(response.status); + + this.onClose?.(); + this.isOpen = false; + emitClose(); + + this.context.getLogger().log(LogLevel.debug, `Session closed with id: ${this.id}`); + return new Status(response.status); + } catch (err) { + this.onClose?.(); + this.isOpen = false; + emitClose(); + throw err; + } } private createOperation(response: OperationResponseShape): DBSQLOperation { @@ -610,6 +638,7 @@ export default class DBSQLSession implements IDBSQLSession { handle, directResults: response.directResults, context: this.context, + sessionId: this.id, }); this.operations.add(operation); diff --git a/lib/contracts/IClientContext.ts b/lib/contracts/IClientContext.ts index c7274a1b..955b9c52 100644 --- a/lib/contracts/IClientContext.ts +++ b/lib/contracts/IClientContext.ts @@ -2,6 +2,9 @@ import IDBSQLLogger from './IDBSQLLogger'; import IDriver 
from './IDriver'; import IConnectionProvider from '../connection/contracts/IConnectionProvider'; import IThriftClient from './IThriftClient'; +import IAuthentication from '../connection/contracts/IAuthentication'; +import type TelemetryEventEmitter from '../telemetry/TelemetryEventEmitter'; +import type MetricsAggregator from '../telemetry/MetricsAggregator'; export interface ClientConfig { directResultsDefaultMaxRows: number; @@ -37,6 +40,17 @@ export interface ClientConfig { telemetryMaxPendingMetrics?: number; telemetryMaxErrorsPerStatement?: number; telemetryStatementTtlMs?: number; + telemetryCloseTimeoutMs?: number; + telemetryMaxStatementMetrics?: number; + /** + * If `true`, MetricsAggregator installs a `process.on('beforeExit')` hook + * that triggers a synchronous-as-possible flush before Node.js shuts down. + * Mitigates data loss when application code calls `process.exit(0)` without + * awaiting `client.close()`. Default `false` because adding a `beforeExit` + * listener changes process-exit semantics for hosts that monkey-patch + * `process.exit` (e.g. some test runners). + */ + telemetryFlushOnExit?: boolean; userAgentEntry?: string; } @@ -50,4 +64,20 @@ export default interface IClientContext { getClient(): Promise; getDriver(): Promise; + + getAuthProvider?(): IAuthentication | undefined; + + // The two telemetry accessors below remain optional methods on this + // interface for back-compat with mock contexts in tests and external + // sub-contexts that predate the telemetry work. A future refactor should + // pull them onto a dedicated `ITelemetrySink` that the host context + // implements, so non-telemetry context consumers don't see telemetry + // surface area. Tracked under follow-up; left in place to keep this PR + // scoped. + + /** @internal Telemetry event emitter, undefined when telemetry is disabled. */ + getTelemetryEmitter?(): TelemetryEventEmitter | undefined; + + /** @internal Telemetry aggregator, undefined when telemetry is disabled. 
*/ + getTelemetryAggregator?(): MetricsAggregator | undefined; } diff --git a/lib/contracts/IDBSQLClient.ts b/lib/contracts/IDBSQLClient.ts index 9c0d9670..08592219 100644 --- a/lib/contracts/IDBSQLClient.ts +++ b/lib/contracts/IDBSQLClient.ts @@ -54,6 +54,90 @@ export type ConnectionOptions = { socketTimeout?: number; proxy?: ProxyOptions; enableMetricViewMetadata?: boolean; + + /** + * Whether the driver emits telemetry events (connection / statement / + * cloud-fetch / error). Defaults to `true`. + * + * Activation is gated by **two** conditions: + * 1. This flag is `true` **and** + * 2. The remote feature flag for the workspace allows telemetry. + * + * Setting this to `false` is a hard, unconditional opt-out. Setting to + * `true` only requests telemetry; the workspace must also allow it. + * + * The environment variable `DATABRICKS_TELEMETRY_DISABLED` set to one of + * `1`, `true`, `yes`, or `on` (case-insensitive) overrides this flag and + * disables telemetry entirely. + */ + telemetryEnabled?: boolean; + + /** + * Maximum number of metrics to batch before flushing to the telemetry + * endpoint. Default 100. + */ + telemetryBatchSize?: number; + + /** + * How often to flush buffered telemetry metrics, in milliseconds. + * The flush timer is `unref()`'d so it cannot keep the Node.js process + * alive on its own. Default 5000ms. + */ + telemetryFlushIntervalMs?: number; + + /** + * Maximum retry attempts for a telemetry export *after* the initial call. + * Default 3. + */ + telemetryMaxRetries?: number; + + /** + * When `true`, telemetry is sent to the authenticated `/telemetry-ext` + * endpoint with workspace + session + statement IDs and a system + * configuration block. When `false`, only error names are emitted via the + * unauthenticated endpoint. Default `true`. + * + * Privacy-relevant: setting `false` minimizes the data surface at the + * cost of losing most observability. 
+ */ + telemetryAuthenticatedExport?: boolean; + + /** + * Number of consecutive telemetry export failures before the per-host + * circuit breaker trips and pauses exports. Default 5. + */ + telemetryCircuitBreakerThreshold?: number; + + /** + * How long the circuit breaker stays open before re-probing the + * telemetry endpoint, in milliseconds. Default 60000ms (1 minute). + */ + telemetryCircuitBreakerTimeout?: number; + + /** + * Maximum wall-clock time `client.close()` will wait for the final + * telemetry flush HTTP POST. Bounds shutdown latency so callers + * doing `await client.close(); process.exit(0)` are not held up by a + * misbehaving telemetry endpoint. Default 2000ms. + */ + telemetryCloseTimeoutMs?: number; + + /** + * Hard cap on the per-statement aggregation map size. When the cap is + * reached, the oldest entry is evicted (its buffered errors are emitted + * as standalone metrics first so the first-failure signal survives). + * Default 5000. + */ + telemetryMaxStatementMetrics?: number; + + /** + * Maximum number of telemetry metrics buffered in memory before the + * oldest non-error entry is dropped. Raise this when + * `getTelemetryStats().droppedMetrics` increases between observations, + * which indicates the buffer is filling faster than the flush interval + * can drain it. Default 500. + */ + telemetryMaxPendingMetrics?: number; } & AuthOptions; export interface OpenSessionRequest { diff --git a/lib/index.ts b/lib/index.ts index 81e3aaae..6b6d42b9 100644 --- a/lib/index.ts +++ b/lib/index.ts @@ -23,11 +23,21 @@ import { LogLevel } from './contracts/IDBSQLLogger'; // Re-export types for TypeScript users export type { default as ITokenProvider } from './connection/auth/tokenProvider/ITokenProvider'; -// Re-export telemetry error classes so consumers can instanceof-check rather -// than string-matching error messages. 
export { CircuitBreakerOpenError, CIRCUIT_BREAKER_OPEN_CODE } from './telemetry/CircuitBreaker'; export { TelemetryTerminalError } from './telemetry/DatabricksTelemetryExporter'; +// Telemetry event/metric/config shapes for consumers that want to inspect +// telemetry payloads or pre-validate config. The emitter, aggregator, and +// per-host client are deliberately not re-exported — they are internal. +export { TelemetryEventType, DEFAULT_TELEMETRY_CONFIG } from './telemetry/types'; +export type { + TelemetryEvent, + TelemetryMetric, + TelemetryConfiguration, + StatementMetrics, + DriverConfiguration, +} from './telemetry/types'; + export const auth = { PlainHttpAuthentication, // Token provider classes for custom authentication diff --git a/lib/result/CloudFetchResultHandler.ts b/lib/result/CloudFetchResultHandler.ts index 91878813..cb67f5ce 100644 --- a/lib/result/CloudFetchResultHandler.ts +++ b/lib/result/CloudFetchResultHandler.ts @@ -6,6 +6,7 @@ import IResultsProvider, { ResultsProviderFetchNextOptions } from './IResultsPro import { ArrowBatch } from './utils'; import { LZ4 } from '../utils'; import { LogLevel } from '../contracts/IDBSQLLogger'; +import { safeEmit } from '../telemetry/telemetryUtils'; export default class CloudFetchResultHandler implements IResultsProvider { private readonly context: IClientContext; @@ -14,18 +15,24 @@ export default class CloudFetchResultHandler implements IResultsProvider = []; private downloadTasks: Array> = []; + private chunkIndex: number = 0; + constructor( context: IClientContext, source: IResultsProvider, - { lz4Compressed }: TGetResultSetMetadataResp, + metadata: TGetResultSetMetadataResp, + statementId?: string, ) { this.context = context; this.source = source; - this.isLZ4Compressed = lz4Compressed ?? false; + this.isLZ4Compressed = metadata.lz4Compressed ?? 
false; + this.statementId = statementId; if (this.isLZ4Compressed && !LZ4()) { throw new HiveDriverError('Cannot handle LZ4 compressed result: module `lz4` not installed'); @@ -106,6 +113,10 @@ export default class CloudFetchResultHandler implements IResultsProvider { + emitter.emitCloudFetchChunk({ + statementId, + chunkIndex, + latencyMs, + bytes, + compressed: this.isLZ4Compressed, + }); + }); + } } diff --git a/lib/result/RowSetProvider.ts b/lib/result/RowSetProvider.ts index f3fa4213..8c67a9f4 100644 --- a/lib/result/RowSetProvider.ts +++ b/lib/result/RowSetProvider.ts @@ -1,7 +1,14 @@ import Int64 from 'node-int64'; -import { TFetchOrientation, TFetchResultsResp, TOperationHandle, TRowSet } from '../../thrift/TCLIService_types'; +import { + TColumn, + TFetchOrientation, + TFetchResultsResp, + TOperationHandle, + TRowSet, +} from '../../thrift/TCLIService_types'; import Status from '../dto/Status'; import IClientContext from '../contracts/IClientContext'; +import { safeEmit } from '../telemetry/telemetryUtils'; import IResultsProvider, { ResultsProviderFetchNextOptions } from './IResultsProvider'; import { getColumnValue } from './utils'; @@ -10,6 +17,59 @@ export enum FetchType { Logs = 1, } +/** + * Rough byte count for a single `TColumn`. Used only for telemetry — exact + * wire size isn't worth a deserialization pass. Fixed-width values are + * estimated by their TypeScript-side footprint; strings count their UTF-16 + * length (close enough proxy for UTF-8 payload in most cases). The `nulls` + * bitmap buffer is included verbatim. + * + * Wire-side payload will be slightly different (Thrift's varint encodings, + * UTF-8 vs the JS engine's UTF-16) but dashboards aggregating + * `bytesDownloaded` for non-arrow result sets care about scale, not byte- + * perfect accuracy. + */ +function estimateColumnBytes(column: TColumn): number { + let bytes = 0; + if (column.boolVal) { + bytes += column.boolVal.values?.length ?? 
0; + bytes += column.boolVal.nulls?.length ?? 0; + } + if (column.byteVal) { + bytes += column.byteVal.values?.length ?? 0; + bytes += column.byteVal.nulls?.length ?? 0; + } + if (column.i16Val) { + bytes += (column.i16Val.values?.length ?? 0) * 2; + bytes += column.i16Val.nulls?.length ?? 0; + } + if (column.i32Val) { + bytes += (column.i32Val.values?.length ?? 0) * 4; + bytes += column.i32Val.nulls?.length ?? 0; + } + if (column.i64Val) { + bytes += (column.i64Val.values?.length ?? 0) * 8; + bytes += column.i64Val.nulls?.length ?? 0; + } + if (column.doubleVal) { + bytes += (column.doubleVal.values?.length ?? 0) * 8; + bytes += column.doubleVal.nulls?.length ?? 0; + } + if (column.stringVal) { + for (const value of column.stringVal.values ?? []) { + bytes += value?.length ?? 0; + } + bytes += column.stringVal.nulls?.length ?? 0; + } + if (column.binaryVal) { + for (const value of column.binaryVal.values ?? []) { + bytes += value?.length ?? 0; + } + bytes += column.binaryVal.nulls?.length ?? 0; + } + return bytes; +} + function checkIfOperationHasMoreRows(response: TFetchResultsResp): boolean { if (response.hasMoreRows) { return true; @@ -26,6 +86,10 @@ export default class RowSetProvider implements IResultsProvider, returnOnlyPrefetchedResults: boolean, + statementId?: string, ) { this.context = context; this.operationHandle = operationHandle; + this.statementId = statementId; prefetchedResults.forEach((item) => { if (item) { this.prefetchedResults.push(item); @@ -83,16 +149,71 @@ export default class RowSetProvider implements IResultsProvider { + // Aggregate byte counts across all wire shapes a `TFetchResultsResp` + // can carry. URL-based result sets (`resultLinks`) bypass this site + // entirely — they emit from `CloudFetchResultHandler.emitCloudFetchChunk` + // with the post-download byte count from the cloud-storage GET. 
+ // + // Shapes counted here: + // - `arrowBatches[i].batch` — Arrow inline payload + // - `binaryColumns` — packed columnar binary blob + // - `columns[i].*Val.values` — COLUMN_BASED_SET, sum across columns + // - `rows[i]` — legacy row-based set (rare; estimate) + let bytes = 0; + const { results } = response; + if (results) { + const { arrowBatches } = results; + if (arrowBatches) { + for (const batch of arrowBatches) { + bytes += batch.batch?.length ?? 0; + } + } + const { binaryColumns } = results; + if (binaryColumns) { + bytes += binaryColumns.length; + } + const { columns } = results; + if (columns) { + for (const column of columns) { + bytes += estimateColumnBytes(column); + } + } + } + + emitter.emitCloudFetchChunk({ + statementId, + chunkIndex: this.chunkIndex, + latencyMs, + bytes, + }); + this.chunkIndex += 1; + }); + } + public async hasMore() { // If there are prefetched results available - return `true` regardless of // the actual state of `hasMoreRows` flag (because we actually have some data) diff --git a/lib/telemetry/DatabricksTelemetryExporter.ts b/lib/telemetry/DatabricksTelemetryExporter.ts index eeeb5eea..65fe8b64 100644 --- a/lib/telemetry/DatabricksTelemetryExporter.ts +++ b/lib/telemetry/DatabricksTelemetryExporter.ts @@ -116,7 +116,16 @@ export default class DatabricksTelemetryExporter { ) { this.circuitBreaker = circuitBreakerRegistry.getCircuitBreaker(host); const config = this.context.getConfig(); - this.authenticatedUserAgent = buildUserAgentString(config.userAgentEntry); + // Telemetry-default-on means anything in `userAgentEntry` ships to the + // telemetry endpoint on every POST. `buildUserAgentString` redacts + // Databricks token *prefixes* (dapi, dose, …) but not the broader secret + // shapes — JWTs, OAuth `client_secret`, JSON-encoded credentials, URL + // userinfo — that customers occasionally stuff into the user-agent. 
+ // Run the full SECRET_PATTERNS sweep before the UA is built so the + // header value never carries an unredacted secret. The user's primary + // Thrift connection is unaffected. + const redactedEntry = config.userAgentEntry === undefined ? undefined : redactSensitive(config.userAgentEntry, 256); + this.authenticatedUserAgent = buildUserAgentString(redactedEntry); this.unauthenticatedUserAgent = buildUserAgentString(undefined); } @@ -289,12 +298,16 @@ export default class DatabricksTelemetryExporter { } private async getAuthHeaders(): Promise> { - if (!this.authProvider) { + // Prefer the explicitly-injected auth provider; fall back to the context + // (used when a shared TelemetryClient resolves auth through its FIFO of + // registered DBSQLClients). + const authProvider = this.authProvider ?? this.context.getAuthProvider?.(); + if (!authProvider) { return {}; } const logger = this.context.getLogger(); try { - return normalizeHeaders(await this.authProvider.authenticate()); + return normalizeHeaders(await authProvider.authenticate()); } catch (error: any) { logger.log(LogLevel.debug, `Telemetry: auth provider threw: ${error?.message ?? error}`); return {}; @@ -304,13 +317,23 @@ export default class DatabricksTelemetryExporter { private async sendRequest(url: string, init: RequestInit): Promise { const connectionProvider = await this.context.getConnectionProvider(); const agent = await connectionProvider.getAgent(); - const retryPolicy = await connectionProvider.getRetryPolicy(); + // Do NOT route through `connectionProvider.getRetryPolicy()` here. That + // policy is bounded by the customer's `retriesTimeout` (15 min default) + // and `retryMaxAttempts` (5), so each call to `sendRequest` could spend + // up to ~15 min retrying on its own. 
Stacked under the exporter's outer + // `exportWithRetry` loop (`telemetryMaxRetries` default 3) the worst-case + // wall-clock per batch reaches ~45 min — during which `flushInFlight` + // blocks subsequent flushes and `addPendingMetric` silently drops past + // the buffer cap. + // + // Telemetry must be a polite background citizen: the outer loop is the + // single source of retry, and a single attempt here is the right shape. + // Connection-level errors that the customer retry policy would have + // bridged (transient socket errors, 5xx) are still classified by + // `ExceptionClassifier` and retried by `exportWithRetry` instead. const requestConfig: RequestInit = { agent, ...init }; - const result = await retryPolicy.invokeWithRetry(() => { - const request = new Request(url, requestConfig); - return fetch(request).then((response) => ({ request, response })); - }); - return result.response; + const request = new Request(url, requestConfig); + return fetch(request); } private toTelemetryLog( @@ -340,22 +363,27 @@ export default class DatabricksTelemetryExporter { }, }; - if (metric.metricType === 'connection' && metric.driverConfig && includeCorrelation) { - // system_configuration is a high-entropy client fingerprint (OS, arch, - // locale, process, runtime). Only ship on the authenticated path. 
- log.entry.sql_driver_log.system_configuration = { - driver_version: metric.driverConfig.driverVersion, - driver_name: metric.driverConfig.driverName, - runtime_name: 'Node.js', - runtime_version: metric.driverConfig.nodeVersion, - runtime_vendor: metric.driverConfig.runtimeVendor, - os_name: metric.driverConfig.platform, - os_version: metric.driverConfig.osVersion, - os_arch: metric.driverConfig.osArch, - locale_name: metric.driverConfig.localeName, - char_set_encoding: metric.driverConfig.charSetEncoding, - process_name: sanitizeProcessName(metric.driverConfig.processName) || undefined, - }; + if (metric.metricType === 'connection') { + if (metric.latencyMs !== undefined) { + log.entry.sql_driver_log.operation_latency_ms = metric.latencyMs; + } + if (metric.driverConfig && includeCorrelation) { + // system_configuration is a high-entropy client fingerprint (OS, arch, + // locale, process, runtime). Only ship on the authenticated path. + log.entry.sql_driver_log.system_configuration = { + driver_version: metric.driverConfig.driverVersion, + driver_name: metric.driverConfig.driverName, + runtime_name: 'Node.js', + runtime_version: metric.driverConfig.nodeVersion, + runtime_vendor: metric.driverConfig.runtimeVendor, + os_name: metric.driverConfig.platform, + os_version: metric.driverConfig.osVersion, + os_arch: metric.driverConfig.osArch, + locale_name: metric.driverConfig.localeName, + char_set_encoding: metric.driverConfig.charSetEncoding, + process_name: sanitizeProcessName(metric.driverConfig.processName) || undefined, + }; + } } else if (metric.metricType === 'statement') { log.entry.sql_driver_log.operation_latency_ms = metric.latencyMs; @@ -364,19 +392,24 @@ export default class DatabricksTelemetryExporter { execution_result: metric.resultFormat, }; - if (metric.chunkCount && metric.chunkCount > 0) { + if ((metric.chunkCount ?? 
0) > 0) { log.entry.sql_driver_log.sql_operation.chunk_details = { total_chunks_present: metric.chunkCount, total_chunks_iterated: metric.chunkCount, + initial_chunk_latency_millis: metric.chunkInitialLatencyMs, + slowest_chunk_latency_millis: metric.chunkSlowestLatencyMs, + sum_chunks_download_time_millis: metric.chunkSumLatencyMs, }; } } } else if (metric.metricType === 'error') { const stackOrMessage = metric.errorStack ?? metric.errorMessage ?? ''; + // Primary redaction site is TelemetryEventEmitter.emitError — this + // second pass is defence-in-depth in case a metric reached the + // exporter via a path that bypassed the emitter (programmatic injection + // in tests, future call sites). On the unauth path we ship nothing. log.entry.sql_driver_log.error_info = { error_name: metric.errorName || 'UnknownError', - // Redact common secret shapes and cap length. On the unauth path we - // keep only the error class — no message body. stack_trace: includeCorrelation ? redactSensitive(stackOrMessage) : '', }; } diff --git a/lib/telemetry/FeatureFlagCache.ts b/lib/telemetry/FeatureFlagCache.ts index 06bc59a9..47fc9e8d 100644 --- a/lib/telemetry/FeatureFlagCache.ts +++ b/lib/telemetry/FeatureFlagCache.ts @@ -202,11 +202,15 @@ export default class FeatureFlagCache { } private async getAuthHeaders(): Promise> { - if (!this.authProvider) { + // Prefer the explicitly-injected auth provider; fall back to the context + // (used when a shared TelemetryClient resolves auth through its FIFO of + // registered DBSQLClients). Mirrors DatabricksTelemetryExporter.getAuthHeaders. + const authProvider = this.authProvider ?? this.context.getAuthProvider?.(); + if (!authProvider) { return {}; } try { - return normalizeHeaders(await this.authProvider.authenticate()); + return normalizeHeaders(await authProvider.authenticate()); } catch (error: any) { this.context.getLogger().log(LogLevel.debug, `Feature flag auth failed: ${error?.message ?? 
error}`); return {}; diff --git a/lib/telemetry/MetricsAggregator.ts b/lib/telemetry/MetricsAggregator.ts index d160db10..459d8827 100644 --- a/lib/telemetry/MetricsAggregator.ts +++ b/lib/telemetry/MetricsAggregator.ts @@ -21,7 +21,11 @@ import DatabricksTelemetryExporter from './DatabricksTelemetryExporter'; interface StatementTelemetryDetails { statementId: string; - sessionId: string; + // sessionId is optional — emit sites pass `undefined` when no session is + // associated yet (e.g. an operation created before its session id wired up). + // The aggregator treats `undefined` as a single bucket rather than ghost + // sessions per emit site. + sessionId?: string; workspaceId?: string; operationType?: string; startTime: number; @@ -31,6 +35,9 @@ interface StatementTelemetryDetails { bytesDownloaded: number; pollCount: number; compressionEnabled?: boolean; + chunkInitialLatencyMs?: number; + chunkSlowestLatencyMs?: number; + chunkSumLatencyMs: number; errors: TelemetryEvent[]; } @@ -49,6 +56,11 @@ export default class MetricsAggregator { private flushTimer: NodeJS.Timeout | null = null; + // Single in-flight flush serializer. Concurrent triggers (batch hit, periodic + // tick, terminal-error, manual flush) all share one HTTP POST so the user's + // socket pool can't be starved by a slow telemetry endpoint. + private flushInFlight: Promise | null = null; + private closed = false; private closing = false; @@ -63,6 +75,30 @@ export default class MetricsAggregator { private statementTtlMs: number; + private maxStatementMetrics: number; + + // `beforeExit` hook installed when `telemetryFlushOnExit` is true. Tracked + // so close() can detach the listener — leaving it attached would otherwise + // keep references alive past the client's lifetime in long-running hosts + // that create and destroy DBSQLClients (test runners, serverless cold + // re-uses). + private beforeExitHandler: (() => void) | null = null; + + // Operator-visible counters. 
Bumped every time the aggregator drops or + // evicts a metric for capacity reasons. Surfaced via `getStats()` and + // logged at warn on each flush when non-zero so silent data loss is + // visible without forcing operators to grep debug logs. + private droppedMetrics = 0; + + private evictedStatements = 0; + + // Counters of dropped/evicted entries reported in the most recent + // warn-level summary. Compared against the running counter so each + // summary reports the delta, not a cumulative value that grows forever. + private lastReportedDrops = 0; + + private lastReportedEvictions = 0; + constructor(private context: IClientContext, private exporter: DatabricksTelemetryExporter) { try { const config = context.getConfig(); @@ -72,8 +108,34 @@ export default class MetricsAggregator { this.maxErrorsPerStatement = config.telemetryMaxErrorsPerStatement ?? DEFAULT_TELEMETRY_CONFIG.maxErrorsPerStatement; this.statementTtlMs = config.telemetryStatementTtlMs ?? DEFAULT_TELEMETRY_CONFIG.statementTtlMs; + this.maxStatementMetrics = config.telemetryMaxStatementMetrics ?? DEFAULT_TELEMETRY_CONFIG.maxStatementMetrics; this.startFlushTimer(); + + // Optional beforeExit hook for callers that can't easily reorder their + // shutdown to `await client.close()` before `process.exit`. This is + // best-effort — beforeExit doesn't fire on `process.exit()`, only on + // a natural drain — and `process.exit(0)` skips it entirely. Disabled + // by default; the test-runner override is the main reason callers opt out. + if (config.telemetryFlushOnExit) { + this.beforeExitHandler = () => { + // beforeExit is synchronous; we kick off the flush but cannot wait + // for the HTTP POST to complete. Synchronous Node.js APIs are not + // available for fetch. Best-effort. 
+ this.flush(false).catch(() => { + // swallow — telemetry must never break shutdown + }); + }; + try { + process.on('beforeExit', this.beforeExitHandler); + } catch (err: any) { + // Hosted environments where `process.on` is locked down + this.context + .getLogger() + .log(LogLevel.debug, `MetricsAggregator beforeExit registration failed: ${err?.message ?? err}`); + this.beforeExitHandler = null; + } + } } catch (error: any) { const logger = this.context.getLogger(); logger.log(LogLevel.debug, `MetricsAggregator constructor error: ${error.message}`); @@ -83,6 +145,7 @@ export default class MetricsAggregator { this.maxPendingMetrics = DEFAULT_TELEMETRY_CONFIG.maxPendingMetrics; this.maxErrorsPerStatement = DEFAULT_TELEMETRY_CONFIG.maxErrorsPerStatement; this.statementTtlMs = DEFAULT_TELEMETRY_CONFIG.statementTtlMs; + this.maxStatementMetrics = DEFAULT_TELEMETRY_CONFIG.maxStatementMetrics; } } @@ -92,7 +155,12 @@ export default class MetricsAggregator { try { if (event.eventType === TelemetryEventType.CONNECTION_OPEN) { - this.processConnectionEvent(event); + this.processConnectionEvent(event, 'CREATE_SESSION'); + return; + } + + if (event.eventType === TelemetryEventType.CONNECTION_CLOSE) { + this.processConnectionEvent(event, 'DELETE_SESSION'); return; } @@ -109,13 +177,15 @@ export default class MetricsAggregator { } } - private processConnectionEvent(event: TelemetryEvent): void { + private processConnectionEvent(event: TelemetryEvent, operationType: 'CREATE_SESSION' | 'DELETE_SESSION'): void { const metric: TelemetryMetric = { metricType: 'connection', timestamp: event.timestamp, sessionId: event.sessionId, workspaceId: event.workspaceId, driverConfig: event.driverConfig, + operationType, + latencyMs: event.latencyMs, }; this.addPendingMetric(metric); @@ -154,7 +224,7 @@ export default class MetricsAggregator { // stall on telemetry HTTP. Do NOT reset the periodic flush timer: // under burst failures that would keep the tail-drain timer from // ever firing. 
- Promise.resolve(this.flush(false)).catch((err: any) => { + this.flush(false).catch((err: any) => { logger.log(LogLevel.debug, `Terminal-error flush failed: ${err?.message ?? err}`); }); } else if (event.statementId) { @@ -182,16 +252,45 @@ export default class MetricsAggregator { case TelemetryEventType.STATEMENT_COMPLETE: details.executionLatencyMs = event.latencyMs; details.resultFormat = event.resultFormat; - details.chunkCount = event.chunkCount ?? 0; - details.bytesDownloaded = event.bytesDownloaded ?? 0; - details.pollCount = event.pollCount ?? 0; + // STATEMENT_COMPLETE may not carry chunk counts (the operation + // doesn't always know them at close time); only override when the + // emit explicitly supplied a value, otherwise the values accumulated + // from CLOUDFETCH_CHUNK survive. + if (event.chunkCount !== undefined) { + details.chunkCount = event.chunkCount; + } + if (event.bytesDownloaded !== undefined) { + details.bytesDownloaded = event.bytesDownloaded; + } + if (event.pollCount !== undefined) { + details.pollCount = event.pollCount; + } break; case TelemetryEventType.CLOUDFETCH_CHUNK: details.chunkCount += 1; details.bytesDownloaded += event.bytes ?? 0; - if (event.compressed !== undefined) { - details.compressionEnabled = event.compressed; + // `compressionEnabled` is a sticky OR across all chunks in the + // statement — any compressed chunk flips it true and it stays true. + // Previously we copied the last chunk's value, which silently lied + // for mixed-compression batches (compressed chunk 1, uncompressed + // chunk 2 → compressionEnabled=false). True-on-any matches the + // dashboard contract "did this statement benefit from compression". + if (event.compressed === true) { + details.compressionEnabled = true; + } else if (event.compressed === false && details.compressionEnabled === undefined) { + details.compressionEnabled = false; + } + // Per-chunk timing aggregation. 
Only record positive latencies — keeps + // prefetched/cached pages out of the timing stats. + if (event.latencyMs !== undefined && event.latencyMs > 0) { + if (details.chunkInitialLatencyMs === undefined) { + details.chunkInitialLatencyMs = event.latencyMs; + } + if (details.chunkSlowestLatencyMs === undefined || event.latencyMs > details.chunkSlowestLatencyMs) { + details.chunkSlowestLatencyMs = event.latencyMs; + } + details.chunkSumLatencyMs += event.latencyMs; } break; @@ -204,14 +303,21 @@ export default class MetricsAggregator { const statementId = event.statementId!; if (!this.statementMetrics.has(statementId)) { + // Hard cap on map size — abandoned operations or buggy upstreams that + // emit errors with random fresh statementIds would otherwise grow this + // map unbounded for up to statementTtlMs. + if (this.statementMetrics.size >= this.maxStatementMetrics) { + this.evictOldestStatement(); + } this.statementMetrics.set(statementId, { statementId, - sessionId: event.sessionId!, + sessionId: event.sessionId, workspaceId: event.workspaceId, startTime: event.timestamp, chunkCount: 0, bytesDownloaded: 0, pollCount: 0, + chunkSumLatencyMs: 0, errors: [], }); } @@ -219,6 +325,35 @@ export default class MetricsAggregator { return this.statementMetrics.get(statementId)!; } + /** + * Drop the oldest entry by insertion order to make room. Emits its buffered + * errors as standalone metrics first so the first-failure signal survives. + * Map iteration order is insertion order in JS. 
+ */ + private evictOldestStatement(): void { + const oldest = this.statementMetrics.keys().next(); + if (oldest.done) return; + const id = oldest.value; + const details = this.statementMetrics.get(id)!; + for (const errorEvent of details.errors) { + this.addPendingMetric({ + metricType: 'error', + timestamp: errorEvent.timestamp, + sessionId: details.sessionId, + statementId: details.statementId, + workspaceId: details.workspaceId, + errorName: errorEvent.errorName, + errorMessage: errorEvent.errorMessage, + errorStack: errorEvent.errorStack, + }); + } + this.statementMetrics.delete(id); + this.evictedStatements += 1; + this.context + .getLogger() + .log(LogLevel.debug, `MetricsAggregator: evicted oldest statement ${id} (max=${this.maxStatementMetrics})`); + } + /** * Drop entries older than `statementTtlMs`, emitting their buffered error * events as standalone metrics first so the first-failure signal survives @@ -247,12 +382,63 @@ export default class MetricsAggregator { } } if (evicted > 0) { + this.evictedStatements += evicted; this.context .getLogger() .log(LogLevel.debug, `Evicted ${evicted} abandoned statement(s) past ${this.statementTtlMs}ms TTL`); } } + /** + * Operator-visible snapshot of aggregator state. Returned synchronously so + * a health-check endpoint or shutdown summary can include it without + * awaiting anything. + * + * - `pendingMetricsCount` : current buffer depth (0..maxPendingMetrics). + * - `inFlightStatements` : open statement aggregations (0..maxStatementMetrics). + * - `droppedMetrics` : cumulative count of metrics dropped due to + * buffer overflow since start. + * - `evictedStatements` : cumulative count of statements evicted due + * to TTL or map-cap, since start. 
+ */ + getStats(): { + pendingMetricsCount: number; + inFlightStatements: number; + droppedMetrics: number; + evictedStatements: number; + } { + return { + pendingMetricsCount: this.pendingMetrics.length, + inFlightStatements: this.statementMetrics.size, + droppedMetrics: this.droppedMetrics, + evictedStatements: this.evictedStatements, + }; + } + + /** + * Emit a warn-level summary if drops/evictions occurred since the last + * report. Operators running on `LogLevel.info` (the driver default) need + * to see capacity events without enabling debug. + */ + private maybeWarnOnCapacityEvents(): void { + const dropsDelta = this.droppedMetrics - this.lastReportedDrops; + const evictionsDelta = this.evictedStatements - this.lastReportedEvictions; + if (dropsDelta === 0 && evictionsDelta === 0) { + return; + } + this.lastReportedDrops = this.droppedMetrics; + this.lastReportedEvictions = this.evictedStatements; + this.context + .getLogger() + .log( + LogLevel.warn, + `Telemetry capacity events since last flush: ` + + `dropped=${dropsDelta} (buffer cap=${this.maxPendingMetrics}); ` + + `evicted=${evictionsDelta} statements (map cap=${this.maxStatementMetrics}, ttl=${this.statementTtlMs}ms). ` + + `Raise telemetryMaxPendingMetrics / telemetryMaxStatementMetrics / telemetryStatementTtlMs if this is sustained.`, + ); + } + completeStatement(statementId: string): void { if (this.closed) return; const logger = this.context.getLogger(); @@ -263,17 +449,28 @@ export default class MetricsAggregator { return; } + // Emit chunkSumLatencyMs alongside chunkCount whenever there are + // chunks. Dropping it when zero produced "5 chunks / 0ms total" rows in + // dashboards because some sources (pre-fetched / cached pages) emit + // chunks with latency=0. Aligning the omission rule with chunkCount + // keeps the two fields consistent: present together or absent together. 
+ const hasChunks = details.chunkCount > 0; const metric: TelemetryMetric = { metricType: 'statement', timestamp: details.startTime, sessionId: details.sessionId, statementId: details.statementId, workspaceId: details.workspaceId, + operationType: details.operationType, latencyMs: details.executionLatencyMs, resultFormat: details.resultFormat, chunkCount: details.chunkCount, + chunkInitialLatencyMs: details.chunkInitialLatencyMs, + chunkSlowestLatencyMs: details.chunkSlowestLatencyMs, + chunkSumLatencyMs: hasChunks ? details.chunkSumLatencyMs : undefined, bytesDownloaded: details.bytesDownloaded, pollCount: details.pollCount, + compressed: details.compressionEnabled, }; this.addPendingMetric(metric); @@ -312,6 +509,7 @@ export default class MetricsAggregator { if (this.pendingMetrics.length > this.maxPendingMetrics) { const dropIndex = this.findDropIndex(); this.pendingMetrics.splice(dropIndex, 1); + this.droppedMetrics += 1; const logger = this.context.getLogger(); logger.log( LogLevel.debug, @@ -323,7 +521,7 @@ export default class MetricsAggregator { // resetTimer=false so the periodic tail-drain keeps its cadence even // under sustained batch-size bursts. const logger = this.context.getLogger(); - Promise.resolve(this.flush(false)).catch((err: any) => { + this.flush(false).catch((err: any) => { logger.log(LogLevel.debug, `Batch-trigger flush failed: ${err?.message ?? err}`); }); } @@ -344,8 +542,27 @@ export default class MetricsAggregator { * `process.exit()` after `client.close()` doesn't truncate the POST. */ async flush(resetTimer: boolean = true): Promise { + // Coalesce concurrent flush callers onto the in-flight promise so we + // never run two HTTP POSTs in parallel against the telemetry endpoint. + // Pending metrics arriving while flushInFlight is set will be picked up + // by the next caller. 
+ if (this.flushInFlight) { + return this.flushInFlight; + } + this.flushInFlight = this.runFlush(resetTimer).finally(() => { + this.flushInFlight = null; + }); + return this.flushInFlight; + } + + private async runFlush(resetTimer: boolean): Promise { const logger = this.context.getLogger(); + // Surface capacity events (drops, evictions) once per flush at warn-level. + // Runs before the empty-buffer short-circuit so an evict-only cycle still + // emits the summary. + this.maybeWarnOnCapacityEvents(); + let exportPromise: Promise | null = null; try { if (this.pendingMetrics.length === 0) { @@ -395,7 +612,7 @@ export default class MetricsAggregator { } catch (err: any) { logger.log(LogLevel.debug, `evictExpiredStatements error: ${err?.message ?? err}`); } - Promise.resolve(this.flush(false)).catch((err: any) => { + this.flush(false).catch((err: any) => { logger.log(LogLevel.debug, `Periodic flush failed: ${err?.message ?? err}`); }); }, this.flushIntervalMs); @@ -414,6 +631,18 @@ export default class MetricsAggregator { // so no promises escape past the single awaited flush below. this.closing = true; + // Detach the beforeExit hook before clearing the timer — otherwise a + // long-running host that opens & closes many DBSQLClients accumulates + // dead listeners on the process object. + if (this.beforeExitHandler) { + try { + process.off('beforeExit', this.beforeExitHandler); + } catch (err: any) { + logger.log(LogLevel.debug, `MetricsAggregator beforeExit detach failed: ${err?.message ?? err}`); + } + this.beforeExitHandler = null; + } + if (this.flushTimer) { clearInterval(this.flushTimer); this.flushTimer = null; @@ -426,7 +655,37 @@ export default class MetricsAggregator { } this.closed = true; - await this.flush(false); + // Cap the wait on the final flush so a flapping telemetry endpoint + // can't hold up the user's process.exit(0). The in-flight POST is + // abandoned past the deadline; data loss is preferable to a hung exit. 
+ const timeoutMs = this.context.getConfig().telemetryCloseTimeoutMs ?? 2000; + let timeoutHandle: NodeJS.Timeout | null = null; + const timeoutPromise = new Promise((resolve) => { + timeoutHandle = setTimeout(() => { + logger.log(LogLevel.debug, `MetricsAggregator.close: flush timed out after ${timeoutMs}ms`); + resolve(); + }, timeoutMs); + timeoutHandle.unref?.(); + }); + // Drain pattern: if a batch-trigger flush was already in-flight when + // close() ran, it captured a snapshot before completeStatement above + // appended the close-time metrics. Wait for that to finish and then + // run a fresh flush that picks up whatever's still in pendingMetrics. + const drain = async (): Promise => { + if (this.flushInFlight) { + await this.flushInFlight; + } + if (this.pendingMetrics.length > 0) { + await this.flush(false); + } + }; + try { + await Promise.race([drain(), timeoutPromise]); + } finally { + if (timeoutHandle) { + clearTimeout(timeoutHandle); + } + } } catch (error: any) { logger.log(LogLevel.debug, `MetricsAggregator.close error: ${error.message}`); } diff --git a/lib/telemetry/TelemetryClient.ts b/lib/telemetry/TelemetryClient.ts index d74d1417..942a951e 100644 --- a/lib/telemetry/TelemetryClient.ts +++ b/lib/telemetry/TelemetryClient.ts @@ -14,51 +14,331 @@ * limitations under the License. 
*/ -import IClientContext from '../contracts/IClientContext'; -import { LogLevel } from '../contracts/IDBSQLLogger'; +import IClientContext, { ClientConfig } from '../contracts/IClientContext'; +import IDBSQLLogger, { LogLevel } from '../contracts/IDBSQLLogger'; +import IConnectionProvider from '../connection/contracts/IConnectionProvider'; +import IThriftClient from '../contracts/IThriftClient'; +import IDriver from '../contracts/IDriver'; +import IAuthentication from '../connection/contracts/IAuthentication'; +import { CircuitBreakerRegistry, CircuitBreakerState } from './CircuitBreaker'; +import DatabricksTelemetryExporter from './DatabricksTelemetryExporter'; +import MetricsAggregator from './MetricsAggregator'; +import FeatureFlagCache from './FeatureFlagCache'; /** - * Telemetry client for a specific host. - * Managed by TelemetryClientProvider with reference counting. - * One client instance is shared across all connections to the same host. + * Per-host telemetry resource owner. Held by `TelemetryClientProvider` + * (process-wide singleton) and shared across every `DBSQLClient` that + * connects to the same host. + * + * Owns the host-scoped triad — `MetricsAggregator`, `DatabricksTelemetryExporter`, + * `CircuitBreakerRegistry`, `FeatureFlagCache` — and implements `IClientContext` + * itself so those owned components have a stable context that outlives any + * single `DBSQLClient`. The first registered `DBSQLClient`'s logger and config + * are snapshotted; subsequent registrants donate their connection providers + * and auth providers, and the `TelemetryClient` falls through them in + * registration order when the current head closes. + * + * Why share at host granularity: + * - Circuit-breaker state for `host` is correct only if all clients hitting + * the same endpoint share counters (5 failures means 5 actual failures, not + * 5×N for N independent `DBSQLClient` instances). 
+ * - Feature-flag cache has a per-host TTL; deduping the GET prevents + * thundering-herd on cold cache. + * - Metric batches mix events from every active client to the same host — + * one HTTP POST per `flushIntervalMs` instead of N. */ -class TelemetryClient { +class TelemetryClient implements IClientContext { private closed: boolean = false; - constructor(private context: IClientContext, private host: string) { - const logger = context.getLogger(); - logger.log(LogLevel.debug, `Created TelemetryClient for host: ${host}`); + private readonly logger: IDBSQLLogger; + + private readonly config: ClientConfig; + + private readonly circuitBreakerRegistry: CircuitBreakerRegistry; + + private readonly featureFlagCache: FeatureFlagCache; + + private readonly exporter: DatabricksTelemetryExporter; + + private readonly aggregator: MetricsAggregator; + + // FIFO so the first-registered client's connection/auth providers are tried + // first. Falls through to later registrants when an earlier one is closed. + // + // The `authProvider` is snapshotted at register time (NOT re-fetched at + // unregister time). If a registrant's underlying `DBSQLClient` rotates its + // auth provider between register and unregister (token refresh that + // reconstructs the provider, lazy wrappers, etc.), filtering by referential + // equality against a freshly-fetched provider would never match the original, + // and the stale provider would remain in `authProviders` for the lifetime of + // the per-host singleton — keeping the exporter authenticating with revoked + // credentials. + private contexts: Array<{ context: IClientContext; authProvider?: IAuthentication }> = []; + + constructor(initialContext: IClientContext, public readonly host: string) { + this.logger = initialContext.getLogger(); + // Snapshot config at first registration. Subsequent clients with + // divergent telemetry knobs (`telemetryBatchSize` etc.) inherit the + // first-registrant's tuning — documented invariant. 
+ this.config = initialContext.getConfig(); + this.contexts.push({ + context: initialContext, + authProvider: initialContext.getAuthProvider?.(), + }); + + this.circuitBreakerRegistry = new CircuitBreakerRegistry(this); + this.featureFlagCache = new FeatureFlagCache(this); + // Register this host with the feature-flag cache so isTelemetryEnabled() + // does not short-circuit to false. close() releases via releaseContext(). + this.featureFlagCache.getOrCreateContext(host); + this.exporter = new DatabricksTelemetryExporter(this, host, this.circuitBreakerRegistry); + this.aggregator = new MetricsAggregator(this, this.exporter); + + this.logger.log(LogLevel.debug, `Created TelemetryClient for host: ${host}`); } /** - * Gets the host associated with this client. + * Add another `DBSQLClient`'s context to the pool. Tracked in registration + * order; `getConnectionProvider()` / `getAuthProvider()` walk the list and + * use the first entry that's still usable. */ - getHost(): string { - return this.host; + registerContext(context: IClientContext): void { + if (this.contexts.some((entry) => entry.context === context)) { + // Re-snapshot the auth provider on a duplicate register call so a caller + // that intentionally rotated providers can refresh the entry. + const idx = this.contexts.findIndex((entry) => entry.context === context); + if (idx >= 0) { + this.contexts[idx] = { context, authProvider: context.getAuthProvider?.() }; + } + return; + } + const newAuthProvider = context.getAuthProvider?.(); + // Diverging auth providers across registrants on the same host is the + // multi-tenant FIFO bleed case. The first-registered auth wins; tenant B + // queries POST under tenant A's headers. Warn loudly — SaaS layers that + // see this in their logs should either: + // - set telemetryEnabled: false on all DBSQLClients, or + // - partition by host so each tenant owns its own TelemetryClient. 
+ const firstAuthProvider = this.contexts[0]?.authProvider; + if ( + this.contexts.length > 0 && + newAuthProvider !== undefined && + firstAuthProvider !== undefined && + newAuthProvider !== firstAuthProvider + ) { + this.logger.log( + LogLevel.warn, + `TelemetryClient(${this.host}): a second DBSQLClient registered with a different auth provider. ` + + `Telemetry from this client will be POSTed under the first-registered client's auth headers. ` + + `Multi-tenant SaaS layers should set telemetryEnabled: false on all clients or partition by host. ` + + `See README "Multi-tenant SaaS deployments" for details.`, + ); + } + this.contexts.push({ + context, + authProvider: newAuthProvider, + }); + // Warn when subsequent registrants pass telemetry knobs that diverge + // from the first-registrant's snapshot — those values are silently + // ignored. Privacy-relevant for telemetryAuthenticatedExport. + this.warnOnConfigDivergence(context.getConfig()); + } + + private warnOnConfigDivergence(other: ClientConfig): void { + const keys: Array = [ + 'telemetryAuthenticatedExport', + 'telemetryBatchSize', + 'telemetryFlushIntervalMs', + 'telemetryMaxRetries', + 'telemetryCircuitBreakerThreshold', + 'telemetryCircuitBreakerTimeout', + // Privacy-relevant: User-Agent is snapshotted from the first registrant + // and shared across the host. Multi-tenant SaaS layers with per-tenant + // userAgentEntry values would otherwise silently ship under tenant-1's UA. + 'userAgentEntry', + ]; + const diverged = keys.filter((k) => other[k] !== undefined && other[k] !== this.config[k]); + if (diverged.length > 0) { + this.logger.log( + LogLevel.warn, + `TelemetryClient(${this.host}): registered context's telemetry settings ` + + `[${diverged.join(', ')}] differ from the first registrant's; the new values will be ignored.`, + ); + } } /** - * Checks if the client has been closed. + * Remove a `DBSQLClient`'s context from the pool. 
Called by + * `TelemetryClientProvider.releaseClient` before refcount decrement so the + * exporter doesn't keep trying to use a closed context. + * + * Uses the cached snapshot pair (`context`, `authProvider`) from register + * time, not a fresh `context.getAuthProvider?.()` call. If the underlying + * client's auth provider was rotated, a fresh call would not match the + * original reference and the stale entry would leak. */ + unregisterContext(context: IClientContext): void { + this.contexts = this.contexts.filter((entry) => entry.context !== context); + } + + // -- IClientContext -- + + getConfig(): ClientConfig { + return this.config; + } + + getLogger(): IDBSQLLogger { + return this.logger; + } + + async getConnectionProvider(): Promise { + return this.tryFallthrough((ctx) => ctx.getConnectionProvider(), 'connection provider'); + } + + async getClient(): Promise { + return this.tryFallthrough((ctx) => ctx.getClient(), 'client'); + } + + async getDriver(): Promise { + return this.tryFallthrough((ctx) => ctx.getDriver(), 'driver'); + } + + /** + * Walk `contexts` and return the first call site that succeeds. Contexts + * that throw on the same accessor for `MAX_CONSECUTIVE_CTX_FAILURES` calls + * in a row are pruned from the pool — otherwise a registrant whose + * underlying `DBSQLClient` is closed or has revoked auth would stay in the + * list forever and be retried on every export. + * + * Pruning is keyed by the `accessor` function identity so a context with a + * working `getConnectionProvider` but broken `getDriver` isn't dropped from + * the working accessor's path. + */ + private async tryFallthrough(accessor: (ctx: IClientContext) => Promise, accessorName: string): Promise { + let lastErr: unknown; + const survivors: typeof this.contexts = []; + /* eslint-disable no-await-in-loop */ + for (const entry of this.contexts) { + try { + const result = await accessor(entry.context); + // Keep this entry and any not-yet-tried ones. 
We've found a working + // head; the tail might be fine too. The throwing entries already in + // `survivors`'s past iterations are dropped below. + survivors.push(entry); + for (const remaining of this.contexts.slice(this.contexts.indexOf(entry) + 1)) { + survivors.push(remaining); + } + this.contexts = survivors; + return result; + } catch (err) { + lastErr = err; + // Drop this entry from the survivors set — repeatedly retrying a + // throwing context drains nothing useful from the FIFO. + this.logger.log( + LogLevel.debug, + `TelemetryClient(${this.host}): pruning context that threw on ${accessorName}: ` + + `${err instanceof Error ? err.message : String(err)}`, + ); + } + } + /* eslint-enable no-await-in-loop */ + this.contexts = survivors; + throw lastErr instanceof Error + ? lastErr + : new Error(`TelemetryClient: no ${accessorName} available for host ${this.host}`); + } + + getAuthProvider(): IAuthentication | undefined { + // Walk the FIFO and return the first usable provider. The provider was + // snapshotted at register time, so even if the registrant's underlying + // client has since rotated credentials, the original entry can still be + // dropped by `unregisterContext` via cached-snapshot equality. A registered + // head with revoked credentials surfaces as an `authenticate()` failure + // inside the exporter retry loop. + for (const entry of this.contexts) { + if (entry.authProvider !== undefined) { + return entry.authProvider; + } + } + return undefined; + } + + getTelemetryEmitter(): undefined { + // The shared TelemetryClient holds the aggregator; emitters remain + // per-DBSQLClient so each can respect its own `telemetryEnabled` flag. 
+ return undefined; + } + + getTelemetryAggregator(): MetricsAggregator { + return this.aggregator; + } + + // -- shared resource accessors -- + + getExporter(): DatabricksTelemetryExporter { + return this.exporter; + } + + getAggregator(): MetricsAggregator { + return this.aggregator; + } + + getFeatureFlagCache(): FeatureFlagCache { + return this.featureFlagCache; + } + + /** + * Operator-visible snapshot of telemetry state for this host. Synchronous, + * never throws — intended for health-check endpoints, shutdown banners, + * and operator dashboards. + */ + getTelemetryStats(): { + host: string; + pendingMetricsCount: number; + inFlightStatements: number; + droppedMetrics: number; + evictedStatements: number; + circuitBreakerState: CircuitBreakerState; + } { + return { + host: this.host, + ...this.aggregator.getStats(), + circuitBreakerState: this.circuitBreakerRegistry.getCircuitBreaker(this.host).getState(), + }; + } + + getHost(): string { + return this.host; + } + isClosed(): boolean { return this.closed; } /** - * Closes the telemetry client and releases resources. - * Should only be called by TelemetryClientProvider when reference count reaches zero. + * Drain pending metrics and tear down owned resources. Called by + * `TelemetryClientProvider.releaseClient` when refCount hits zero. 
*/ - close(): void { - if (this.closed) { - return; + async close(): Promise { + if (this.closed) return; + this.closed = true; + this.logger.log(LogLevel.debug, `Closing TelemetryClient for host: ${this.host}`); + try { + await this.aggregator.close(); + } catch (err) { + this.logger.log(LogLevel.debug, `TelemetryClient aggregator close error: ${(err as Error).message}`); + } + try { + this.exporter.dispose(); + } catch (err) { + this.logger.log(LogLevel.debug, `TelemetryClient exporter dispose error: ${(err as Error).message}`); } try { - this.context.getLogger().log(LogLevel.debug, `Closing TelemetryClient for host: ${this.host}`); - } catch { - // swallow - } finally { - this.closed = true; + this.featureFlagCache.releaseContext(this.host); + } catch (err) { + this.logger.log(LogLevel.debug, `TelemetryClient FFCache release error: ${(err as Error).message}`); } + this.logger.log(LogLevel.debug, `Closed TelemetryClient for host: ${this.host}`); } } diff --git a/lib/telemetry/TelemetryClientProvider.ts b/lib/telemetry/TelemetryClientProvider.ts index c0da29f0..4e7c09c0 100644 --- a/lib/telemetry/TelemetryClientProvider.ts +++ b/lib/telemetry/TelemetryClientProvider.ts @@ -18,50 +18,61 @@ import IClientContext from '../contracts/IClientContext'; import { LogLevel } from '../contracts/IDBSQLLogger'; import TelemetryClient from './TelemetryClient'; -/** - * Holds a telemetry client and its reference count. - * The reference count tracks how many connections are using this client. - */ interface TelemetryClientHolder { client: TelemetryClient; refCount: number; } // Soft cap on distinct host entries. Above this the provider warns once so a -// misconfigured caller (per-request hosts, unnormalized aliases) is visible in -// logs rather than silently growing the map. +// misconfigured caller (per-request hosts, unnormalized aliases) is visible +// in logs rather than silently growing the map. 
const MAX_CLIENTS_SOFT_LIMIT = 128; /** - * Manages one telemetry client per host. - * Prevents rate limiting by sharing clients across connections to the same host. - * Instance-based (not singleton), stored in DBSQLClient. + * Process-wide registry of `TelemetryClient`s, one per host. Multiple + * `DBSQLClient` instances connecting to the same host share the same + * `TelemetryClient`, which owns the host-scoped circuit breaker, feature + * flag cache, exporter, and aggregator. + * + * Singleton because the resources we're sharing — circuit-breaker counters, + * batched HTTP exports — are correct only at process scope. Per-`DBSQLClient` + * provider scope (the previous design) deduplicates nothing. * - * Reference counts are incremented and decremented synchronously, and - * `close()` is sync today, so there is no await between map mutation and - * client teardown. The map entry is removed before `close()` runs so a - * concurrent `getOrCreateClient` call for the same host gets a fresh - * instance rather than receiving this closing one. When `close()` becomes - * async (e.g. HTTP flush in [5/7]) the flow will need to `await` after the - * delete to preserve the same invariant. + * `getOrCreateClient` is sync: caller increments refcount, registers its + * context+auth, and walks away. `releaseClient` is `async` because the + * underlying `TelemetryClient.close()` awaits the final flush. */ class TelemetryClientProvider { - private clients: Map; + private static instance: TelemetryClientProvider | undefined; + + private clients = new Map(); private softLimitWarned = false; - constructor(private context: IClientContext) { - this.clients = new Map(); - const logger = context.getLogger(); - logger.log(LogLevel.debug, 'Created TelemetryClientProvider'); + // Production code should use `TelemetryClientProvider.getInstance()` for + // the process-wide singleton. The constructor remains public so unit tests + // can build an isolated provider with its own map. 
Deliberate no-op body — + // initial state is set inline on the field declarations above. + + static getInstance(): TelemetryClientProvider { + if (!TelemetryClientProvider.instance) { + TelemetryClientProvider.instance = new TelemetryClientProvider(); + } + return TelemetryClientProvider.instance; } /** - * Canonicalize host so aliases (scheme, default port, trailing slash, case, - * trailing dot, surrounding whitespace) map to the same entry. Kept to a - * lightweight lexical normalization — `buildTelemetryUrl` still performs - * the strict security validation when a request is actually built. + * Reset the process-wide singleton. Test-only — name-prefixed so + * production callsites can't reach for it accidentally via autocomplete. + * Resetting in production drops every host's circuit-breaker counters, + * feature-flag cache, exporter, and pending-metric buffer at once. + * + * @internal Test-only. Production code MUST NOT call this. */ + static __resetInstanceForTests(): void { + TelemetryClientProvider.instance = undefined; + } + private static normalizeHostKey(host: string): string { return host .trim() @@ -73,23 +84,17 @@ class TelemetryClientProvider { } /** - * Gets or creates a telemetry client for the specified host. - * Increments the reference count for the client. - * - * @param host The host identifier (e.g., "workspace.cloud.databricks.com") - * @returns The telemetry client for the host + * Get or create the `TelemetryClient` for `host`, registering `context` as + * a participant. Increments refcount. 
*/ - getOrCreateClient(host: string): TelemetryClient { - const logger = this.context.getLogger(); + getOrCreateClient(context: IClientContext, host: string): TelemetryClient { + const logger = context.getLogger(); const key = TelemetryClientProvider.normalizeHostKey(host); let holder = this.clients.get(key); if (!holder) { - const client = new TelemetryClient(this.context, key); - holder = { - client, - refCount: 0, - }; + const client = new TelemetryClient(context, key); + holder = { client, refCount: 0 }; this.clients.set(key, holder); logger.log(LogLevel.debug, `Created new TelemetryClient for host: ${host}`); @@ -100,22 +105,21 @@ class TelemetryClientProvider { `TelemetryClientProvider has ${this.clients.size} distinct hosts — possible alias or leak`, ); } + } else { + holder.client.registerContext(context); } holder.refCount += 1; logger.log(LogLevel.debug, `TelemetryClient reference count for ${host}: ${holder.refCount}`); - return holder.client; } /** - * Releases a telemetry client for the specified host. - * Decrements the reference count and closes the client when it reaches zero. - * - * @param host The host identifier + * Release `context`'s registration. When the last `DBSQLClient` releases, + * the underlying `TelemetryClient.close()` runs and the entry is removed. */ - releaseClient(host: string): void { - const logger = this.context.getLogger(); + async releaseClient(context: IClientContext, host: string): Promise { + const logger = context.getLogger(); const key = TelemetryClientProvider.normalizeHostKey(host); const holder = this.clients.get(key); @@ -124,24 +128,22 @@ class TelemetryClientProvider { return; } - // Guard against double-release: a caller releasing more times than it got - // would otherwise drive refCount negative and close a client another - // caller is still holding. Warn loudly and refuse to decrement further. 
if (holder.refCount <= 0) { logger.log(LogLevel.warn, `Unbalanced release for TelemetryClient host: ${host}`); return; } + holder.client.unregisterContext(context); holder.refCount -= 1; logger.log(LogLevel.debug, `TelemetryClient reference count for ${host}: ${holder.refCount}`); - // Close and remove client when reference count reaches zero. - // Remove from map before calling close so a concurrent getOrCreateClient - // creates a fresh client rather than receiving this closing one. if (holder.refCount <= 0) { + // Remove from map BEFORE awaiting close so a concurrent + // getOrCreateClient creates a fresh instance rather than receiving + // this closing one. this.clients.delete(key); try { - holder.client.close(); + await holder.client.close(); logger.log(LogLevel.debug, `Closed and removed TelemetryClient for host: ${host}`); } catch (error) { const msg = error instanceof Error ? error.message : String(error); @@ -150,17 +152,13 @@ class TelemetryClientProvider { } } - /** - * @internal Exposed for testing only. - */ + /** @internal Exposed for testing only. */ getRefCount(host: string): number { const holder = this.clients.get(TelemetryClientProvider.normalizeHostKey(host)); return holder ? holder.refCount : 0; } - /** - * @internal Exposed for testing only. - */ + /** @internal Exposed for testing only. 
*/ getActiveClients(): Map { const result = new Map(); for (const [host, holder] of this.clients.entries()) { diff --git a/lib/telemetry/TelemetryEventEmitter.ts b/lib/telemetry/TelemetryEventEmitter.ts index bbb0b757..0a9839be 100644 --- a/lib/telemetry/TelemetryEventEmitter.ts +++ b/lib/telemetry/TelemetryEventEmitter.ts @@ -18,6 +18,23 @@ import { EventEmitter } from 'events'; import IClientContext from '../contracts/IClientContext'; import { LogLevel } from '../contracts/IDBSQLLogger'; import { TelemetryEvent, TelemetryEventType, DriverConfiguration } from './types'; +import { redactSensitive } from './telemetryUtils'; + +/** + * Typed map of event-type → listener payload shape. Keeps `on`/`off` calls + * structurally typed: `emitter.on(TelemetryEventType.ERROR, (e) => …)` infers + * `e: TelemetryEvent` instead of `any`. Avoids the EventEmitter-default + * `(...args: any[]) => void` trap where a typo in the event name silently + * registers a listener that never fires. + */ +export interface TelemetryEventMap { + [TelemetryEventType.CONNECTION_OPEN]: (event: TelemetryEvent) => void; + [TelemetryEventType.CONNECTION_CLOSE]: (event: TelemetryEvent) => void; + [TelemetryEventType.STATEMENT_START]: (event: TelemetryEvent) => void; + [TelemetryEventType.STATEMENT_COMPLETE]: (event: TelemetryEvent) => void; + [TelemetryEventType.CLOUDFETCH_CHUNK]: (event: TelemetryEvent) => void; + [TelemetryEventType.ERROR]: (event: TelemetryEvent) => void; +} /** * EventEmitter for driver telemetry. @@ -26,8 +43,9 @@ import { TelemetryEvent, TelemetryEventType, DriverConfiguration } from './types * CRITICAL REQUIREMENT: ALL exceptions must be caught and logged at LogLevel.debug ONLY * (never warn/error) to avoid customer anxiety. NO console logging allowed - only IDBSQLLogger. * - * All emit methods are wrapped in try-catch blocks that swallow exceptions completely. - * Event emission respects the telemetryEnabled flag from context config. 
+ * All emit methods funnel through `emitWrapped`, which holds the + * try/catch/debug-log scaffold. The per-method bodies do nothing but build + * the event shape — adding a new event type is a one-method change. */ export default class TelemetryEventEmitter extends EventEmitter { private enabled: boolean; @@ -40,95 +58,100 @@ export default class TelemetryEventEmitter extends EventEmitter { this.enabled = config.telemetryEnabled ?? false; } - /** - * Emit a connection open event. - * - * @param data Connection event data including sessionId, workspaceId, and driverConfig - */ - emitConnectionOpen(data: { sessionId: string; workspaceId: string; driverConfig: DriverConfiguration }): void { - if (!this.enabled) return; + // Typed event subscription. EventEmitter's native types use `any` payloads; + // these overrides give consumers a typed `event` parameter without forcing + // a wholesale rewrite of the EventEmitter contract. + on(eventName: K, listener: TelemetryEventMap[K]): this { + return super.on(eventName, listener as (...args: unknown[]) => void); + } - const logger = this.context.getLogger(); - try { - const event: TelemetryEvent = { - eventType: TelemetryEventType.CONNECTION_OPEN, - timestamp: Date.now(), - sessionId: data.sessionId, - workspaceId: data.workspaceId, - driverConfig: data.driverConfig, - }; - this.emit(TelemetryEventType.CONNECTION_OPEN, event); - } catch (error: any) { - // Swallow all exceptions - log at debug level only - logger.log(LogLevel.debug, `Error emitting connection event: ${error.message}`); - } + off(eventName: K, listener: TelemetryEventMap[K]): this { + return super.off(eventName, listener as (...args: unknown[]) => void); + } + + once(eventName: K, listener: TelemetryEventMap[K]): this { + return super.once(eventName, listener as (...args: unknown[]) => void); } /** - * Emit a statement start event. - * - * @param data Statement start data including statementId, sessionId, and operationType + * Build-and-emit helper. 
The per-event `build` callback constructs the + * payload; everything else (enabled check, try/catch, swallow-and-log) + * lives here so the wrapping cannot drift between event types. */ - emitStatementStart(data: { statementId: string; sessionId: string; operationType?: string }): void { + private emitWrapped(eventType: TelemetryEventType, build: () => TelemetryEvent): void { if (!this.enabled) return; - const logger = this.context.getLogger(); try { - const event: TelemetryEvent = { - eventType: TelemetryEventType.STATEMENT_START, - timestamp: Date.now(), - statementId: data.statementId, - sessionId: data.sessionId, - operationType: data.operationType, - }; - this.emit(TelemetryEventType.STATEMENT_START, event); + this.emit(eventType, build()); } catch (error: any) { - // Swallow all exceptions - log at debug level only - logger.log(LogLevel.debug, `Error emitting statement start: ${error.message}`); + logger.log(LogLevel.debug, `Error emitting ${eventType}: ${error?.message ?? error}`); } } - /** - * Emit a statement complete event. - * - * @param data Statement completion data including latency, result format, and metrics - */ + emitConnectionOpen(data: { + sessionId: string; + workspaceId?: string; + /** + * The full driver-configuration block (~1KB). Static for the process — + * emit sites SHOULD pass it once per client and pass `undefined` on + * subsequent CONNECTION_OPEN events (one client may open many sessions). + * The aggregator and exporter both treat `undefined` as "no change since + * the last metric on the same session lineage". 
+ */ + driverConfig?: DriverConfiguration; + latencyMs: number; + }): void { + this.emitWrapped(TelemetryEventType.CONNECTION_OPEN, () => ({ + eventType: TelemetryEventType.CONNECTION_OPEN, + timestamp: Date.now(), + sessionId: data.sessionId, + workspaceId: data.workspaceId, + driverConfig: data.driverConfig, + latencyMs: data.latencyMs, + })); + } + + emitConnectionClose(data: { sessionId: string; latencyMs: number }): void { + this.emitWrapped(TelemetryEventType.CONNECTION_CLOSE, () => ({ + eventType: TelemetryEventType.CONNECTION_CLOSE, + timestamp: Date.now(), + sessionId: data.sessionId, + latencyMs: data.latencyMs, + })); + } + + emitStatementStart(data: { statementId: string; sessionId?: string; operationType?: string }): void { + this.emitWrapped(TelemetryEventType.STATEMENT_START, () => ({ + eventType: TelemetryEventType.STATEMENT_START, + timestamp: Date.now(), + statementId: data.statementId, + sessionId: data.sessionId, + operationType: data.operationType, + })); + } + emitStatementComplete(data: { statementId: string; - sessionId: string; + sessionId?: string; latencyMs?: number; resultFormat?: string; chunkCount?: number; bytesDownloaded?: number; pollCount?: number; }): void { - if (!this.enabled) return; - - const logger = this.context.getLogger(); - try { - const event: TelemetryEvent = { - eventType: TelemetryEventType.STATEMENT_COMPLETE, - timestamp: Date.now(), - statementId: data.statementId, - sessionId: data.sessionId, - latencyMs: data.latencyMs, - resultFormat: data.resultFormat, - chunkCount: data.chunkCount, - bytesDownloaded: data.bytesDownloaded, - pollCount: data.pollCount, - }; - this.emit(TelemetryEventType.STATEMENT_COMPLETE, event); - } catch (error: any) { - // Swallow all exceptions - log at debug level only - logger.log(LogLevel.debug, `Error emitting statement complete: ${error.message}`); - } + this.emitWrapped(TelemetryEventType.STATEMENT_COMPLETE, () => ({ + eventType: TelemetryEventType.STATEMENT_COMPLETE, + timestamp: 
Date.now(), + statementId: data.statementId, + sessionId: data.sessionId, + latencyMs: data.latencyMs, + resultFormat: data.resultFormat, + chunkCount: data.chunkCount, + bytesDownloaded: data.bytesDownloaded, + pollCount: data.pollCount, + })); } - /** - * Emit a CloudFetch chunk download event. - * - * @param data CloudFetch chunk data including chunk index, latency, bytes, and compression - */ emitCloudFetchChunk(data: { statementId: string; chunkIndex: number; @@ -136,30 +159,30 @@ export default class TelemetryEventEmitter extends EventEmitter { bytes: number; compressed?: boolean; }): void { - if (!this.enabled) return; - - const logger = this.context.getLogger(); - try { - const event: TelemetryEvent = { - eventType: TelemetryEventType.CLOUDFETCH_CHUNK, - timestamp: Date.now(), - statementId: data.statementId, - chunkIndex: data.chunkIndex, - latencyMs: data.latencyMs, - bytes: data.bytes, - compressed: data.compressed, - }; - this.emit(TelemetryEventType.CLOUDFETCH_CHUNK, event); - } catch (error: any) { - // Swallow all exceptions - log at debug level only - logger.log(LogLevel.debug, `Error emitting cloudfetch chunk: ${error.message}`); - } + this.emitWrapped(TelemetryEventType.CLOUDFETCH_CHUNK, () => ({ + eventType: TelemetryEventType.CLOUDFETCH_CHUNK, + timestamp: Date.now(), + statementId: data.statementId, + chunkIndex: data.chunkIndex, + latencyMs: data.latencyMs, + bytes: data.bytes, + compressed: data.compressed, + })); } /** * Emit an error event. * - * @param data Error event data including error details and terminal status + * Redaction happens HERE — not at the exporter — so any in-process listener + * (this class extends `EventEmitter` and `getTelemetryEmitter()` is reachable + * from any consumer of `@databricks/sql`) sees the same redacted strings + * the export pipeline does. 
`redactSensitive` strips Bearer/Basic, Databricks + * token prefixes, JWTs, JSON-encoded secrets, URL userinfo, and common + * username-bearing filesystem paths, then caps length. + * + * `errorMessage` is also redacted, not only `errorStack` — operation error + * messages can carry query fragments, table names, parameter values that the + * SECRET_PATTERNS regex must scrub before they're emitted anywhere. */ emitError(data: { statementId?: string; @@ -169,24 +192,15 @@ export default class TelemetryEventEmitter extends EventEmitter { errorStack?: string; isTerminal: boolean; }): void { - if (!this.enabled) return; - - const logger = this.context.getLogger(); - try { - const event: TelemetryEvent = { - eventType: TelemetryEventType.ERROR, - timestamp: Date.now(), - statementId: data.statementId, - sessionId: data.sessionId, - errorName: data.errorName, - errorMessage: data.errorMessage, - errorStack: data.errorStack, - isTerminal: data.isTerminal, - }; - this.emit(TelemetryEventType.ERROR, event); - } catch (error: any) { - // Swallow all exceptions - log at debug level only - logger.log(LogLevel.debug, `Error emitting error event: ${error.message}`); - } + this.emitWrapped(TelemetryEventType.ERROR, () => ({ + eventType: TelemetryEventType.ERROR, + timestamp: Date.now(), + statementId: data.statementId, + sessionId: data.sessionId, + errorName: data.errorName, + errorMessage: redactSensitive(data.errorMessage), + errorStack: data.errorStack === undefined ? undefined : redactSensitive(data.errorStack), + isTerminal: data.isTerminal, + })); } } diff --git a/lib/telemetry/telemetryTypeMappers.ts b/lib/telemetry/telemetryTypeMappers.ts new file mode 100644 index 00000000..d022739d --- /dev/null +++ b/lib/telemetry/telemetryTypeMappers.ts @@ -0,0 +1,71 @@ +/** + * Copyright (c) 2025 Databricks Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { TOperationType, TSparkRowSetType } from '../../thrift/TCLIService_types'; + +/** + * Map Thrift TOperationType to telemetry Operation.Type enum string. + * Returns 'TYPE_UNSPECIFIED' if operationType is undefined or unknown. + */ +export function mapOperationTypeToTelemetryType(operationType?: TOperationType): string { + if (operationType === undefined) { + return 'TYPE_UNSPECIFIED'; + } + + switch (operationType) { + case TOperationType.EXECUTE_STATEMENT: + return 'EXECUTE_STATEMENT'; + case TOperationType.GET_TYPE_INFO: + return 'LIST_TYPE_INFO'; + case TOperationType.GET_CATALOGS: + return 'LIST_CATALOGS'; + case TOperationType.GET_SCHEMAS: + return 'LIST_SCHEMAS'; + case TOperationType.GET_TABLES: + return 'LIST_TABLES'; + case TOperationType.GET_TABLE_TYPES: + return 'LIST_TABLE_TYPES'; + case TOperationType.GET_COLUMNS: + return 'LIST_COLUMNS'; + case TOperationType.GET_FUNCTIONS: + return 'LIST_FUNCTIONS'; + case TOperationType.UNKNOWN: + default: + return 'TYPE_UNSPECIFIED'; + } +} + +/** + * Map Thrift TSparkRowSetType to telemetry ExecutionResult.Format enum string. 
+ */ +export function mapResultFormatToTelemetryType(resultFormat?: TSparkRowSetType): string | undefined { + if (resultFormat === undefined) { + return undefined; + } + + switch (resultFormat) { + case TSparkRowSetType.ARROW_BASED_SET: + return 'INLINE_ARROW'; + case TSparkRowSetType.COLUMN_BASED_SET: + return 'COLUMNAR_INLINE'; + case TSparkRowSetType.ROW_BASED_SET: + return 'INLINE_JSON'; + case TSparkRowSetType.URL_BASED_SET: + return 'EXTERNAL_LINKS'; + default: + return 'FORMAT_UNSPECIFIED'; + } +} diff --git a/lib/telemetry/telemetryUtils.ts b/lib/telemetry/telemetryUtils.ts index 6326aedf..7956ca36 100644 --- a/lib/telemetry/telemetryUtils.ts +++ b/lib/telemetry/telemetryUtils.ts @@ -14,6 +14,10 @@ * limitations under the License. */ +import type IClientContext from '../contracts/IClientContext'; +import type TelemetryEventEmitter from './TelemetryEventEmitter'; +import { LogLevel } from '../contracts/IDBSQLLogger'; + /** * Hosts we always refuse to send authenticated telemetry to. Targeted at the * `/api/2.0/sql/telemetry-ext` exfil vector: an attacker-influenced `host` @@ -129,6 +133,27 @@ const SECRET_PATTERNS: Array<[RegExp, string]> = [ /\b(token|password|client_secret|refresh_token|access_token|id_token|secret|api[_-]?key|apikey)=[^\s&"']+/gi, '$1=', ], + // OS-username-bearing filesystem paths in error stacks. `/home//`, + // `/Users//`, `C:\Users\\`. Stack traces routinely echo the + // running user's home directory which leaks both the OS account name and + // application directory layout. + [/\/(?:home|Users)\/[^/\s)\]]+\//g, '//'], + [/[A-Za-z]:\\Users\\[^\\\s)\]]+\\/g, '\\'], + // macOS temp/scratch directories (`/private/var/folders///...`) + // and `/var/folders/...`. The `/` segments are derived from the + // user identity so we redact through to the next path separator. 
+ [/\/(?:private\/)?var\/folders\/[^/\s)\]]+\/[^/\s)\]]+\//g, '//'], + // Linux service / container layouts that routinely encode app or tenant + // names: `/var/lib//`, `/opt//`, `/srv//`, container + // `/app//` (common on Lambda/k8s/Cloud Run). The replacement keeps + // the leading anchor so dashboards can still group by "var/lib vs opt". + [/\/var\/lib\/[^/\s)\]]+\//g, '/var/lib//'], + [/\/opt\/[^/\s)\]]+\//g, '/opt//'], + [/\/srv\/[^/\s)\]]+\//g, '/srv//'], + [/\/app\/[^/\s)\]]+\//g, '/app//'], + // Windows UNC paths `\\server\share\...` — the server and share names + // routinely leak internal hostnames or fileshare names. + [/\\\\[^\\/\s)\]]+\\[^\\/\s)\]]+\\/g, '\\\\\\'], ]; /** @@ -228,3 +253,29 @@ export function sanitizeProcessName(name: string | undefined): string { const lastSep = Math.max(firstToken.lastIndexOf('/'), firstToken.lastIndexOf('\\')); return lastSep < 0 ? firstToken : firstToken.slice(lastSep + 1); } + +/** + * Run a telemetry emit at a call site, swallowing all exceptions and logging + * at debug level. Replaces the copy-pasted try/catch + getTelemetryEmitter?.() + * scaffold at every emit site. Telemetry must never break the driver. + * + * The type-only imports keep this helper free of runtime cycles: + * `IClientContext` imports `TelemetryEventEmitter` only as a type, and this + * helper does the same, so there's no value-level cycle. A typo like + * `emitter.emiitConnectionOpen(...)` is now a TS compile error instead of a + * swallowed runtime exception. + */ +export function safeEmit(context: IClientContext, fn: (emitter: TelemetryEventEmitter) => void): void { + try { + const emitter = context.getTelemetryEmitter?.(); + if (emitter) { + fn(emitter); + } + } catch (err: any) { + try { + context.getLogger().log(LogLevel.debug, `Telemetry emit error: ${err?.message ?? 
err}`); + } catch { + // swallow logger errors too — telemetry never breaks the driver + } + } +} diff --git a/lib/telemetry/types.ts b/lib/telemetry/types.ts index 6a4a25a9..9a422258 100644 --- a/lib/telemetry/types.ts +++ b/lib/telemetry/types.ts @@ -24,6 +24,7 @@ export const DRIVER_NAME = 'nodejs-sql-driver'; */ export enum TelemetryEventType { CONNECTION_OPEN = 'connection.open', + CONNECTION_CLOSE = 'connection.close', STATEMENT_START = 'statement.start', STATEMENT_COMPLETE = 'statement.complete', CLOUDFETCH_CHUNK = 'cloudfetch.chunk', @@ -72,13 +73,24 @@ export interface TelemetryConfiguration { /** TTL in ms after which abandoned statement aggregations are evicted */ statementTtlMs?: number; + + /** + * Maximum wall-clock time `close()` will wait for the final flush HTTP POST + * before abandoning it and returning. Bounds shutdown latency so callers + * doing `await client.close(); process.exit(0)` are not held up by a + * misbehaving telemetry endpoint. + */ + closeTimeoutMs?: number; + + /** Hard cap on per-statement aggregation map size; oldest evicted on overflow. 
*/ + maxStatementMetrics?: number; } /** * Default telemetry configuration values */ export const DEFAULT_TELEMETRY_CONFIG: Readonly> = Object.freeze({ - enabled: false, + enabled: true, // Enabled by default, gated by feature flag batchSize: 100, flushIntervalMs: 5000, maxRetries: 3, @@ -91,6 +103,8 @@ export const DEFAULT_TELEMETRY_CONFIG: Readonly maxPendingMetrics: 500, maxErrorsPerStatement: 50, statementTtlMs: 60 * 60 * 1000, // 1 hour + closeTimeoutMs: 2000, // 2s — caps client.close() shutdown latency + maxStatementMetrics: 5000, // hard cap for the per-statement aggregation map }); /** @@ -178,24 +192,39 @@ export interface TelemetryMetric { /** Workspace ID */ workspaceId?: string; - /** Driver configuration (for connection metrics) */ + /** Driver configuration (included in all metrics for context) */ driverConfig?: DriverConfiguration; /** Execution latency in milliseconds */ latencyMs?: number; + /** Type of operation (SELECT, INSERT, etc.) */ + operationType?: string; + /** Result format (inline, cloudfetch, arrow) */ resultFormat?: string; /** Number of result chunks */ chunkCount?: number; + /** Latency of the first chunk fetch in milliseconds */ + chunkInitialLatencyMs?: number; + + /** Latency of the slowest chunk fetch in milliseconds */ + chunkSlowestLatencyMs?: number; + + /** Sum of all chunk fetch latencies in milliseconds */ + chunkSumLatencyMs?: number; + /** Total bytes downloaded */ bytesDownloaded?: number; /** Number of poll operations */ pollCount?: number; + /** Whether compression was used */ + compressed?: boolean; + /** Error name/type */ errorName?: string; @@ -243,6 +272,9 @@ export interface DriverConfiguration { */ processName: string; + /** Authentication type (pat, external-browser, oauth-m2m, custom) */ + authType: string; + // Feature flags /** Whether CloudFetch is enabled */ cloudFetchEnabled: boolean; @@ -265,6 +297,13 @@ export interface DriverConfiguration { /** Number of concurrent CloudFetch downloads */ 
cloudFetchConcurrentDownloads: number; + + // Connection parameters for telemetry + /** HTTP path for API calls */ + httpPath?: string; + + /** Whether metric view metadata is enabled */ + enableMetricViewMetadata?: boolean; } /** @@ -298,6 +337,15 @@ export interface StatementMetrics { /** Number of CloudFetch chunks downloaded */ chunkCount: number; + /** Latency of the first chunk fetch in milliseconds */ + chunkInitialLatencyMs?: number; + + /** Latency of the slowest chunk fetch in milliseconds */ + chunkSlowestLatencyMs?: number; + + /** Sum of all chunk fetch latencies in milliseconds */ + chunkSumLatencyMs?: number; + /** Total bytes downloaded */ totalBytesDownloaded: number; diff --git a/lib/telemetry/urlUtils.ts b/lib/telemetry/urlUtils.ts deleted file mode 100644 index 4dd8535e..00000000 --- a/lib/telemetry/urlUtils.ts +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Copyright (c) 2025 Databricks Contributors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Build full URL from host and path, handling protocol correctly. 
- * @param host The hostname (with or without protocol) - * @param path The path to append (should start with /) - * @returns Full URL with protocol - */ -// eslint-disable-next-line import/prefer-default-export -export function buildUrl(host: string, path: string): string { - // Check if host already has protocol - if (host.startsWith('http://') || host.startsWith('https://')) { - return `${host}${path}`; - } - // Add https:// if no protocol present - return `https://${host}${path}`; -} diff --git a/tests/e2e/telemetry/telemetry-integration.test.ts b/tests/e2e/telemetry/telemetry-integration.test.ts new file mode 100644 index 00000000..608378da --- /dev/null +++ b/tests/e2e/telemetry/telemetry-integration.test.ts @@ -0,0 +1,481 @@ +/** + * Copyright (c) 2025 Databricks Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { expect } from 'chai'; +import sinon from 'sinon'; +import { DBSQLClient } from '../../../lib'; +import FeatureFlagCache from '../../../lib/telemetry/FeatureFlagCache'; +import TelemetryClientProvider from '../../../lib/telemetry/TelemetryClientProvider'; +import TelemetryEventEmitter from '../../../lib/telemetry/TelemetryEventEmitter'; +import MetricsAggregator from '../../../lib/telemetry/MetricsAggregator'; + +// Load test config with a skip-guard if e2e credentials are not present. 
+// The shared `../utils/config` runs process.exit(1) on missing env vars at +// import time, so we cannot use it directly without breaking unconfigured +// CI runs. Read env vars directly here and skip the suite if any are missing. +interface TestConfig { + host: string; + path: string; + token: string; + catalog: string; + schema: string; +} + +function loadConfigOrSkip(suite: Mocha.Suite): TestConfig | null { + // Loading overrides from `config.local` first matches the precedence used + // by the shared `../utils/config` loader, so engineers who have a local + // override file get the same behavior here. + let overrides: Record = {}; + try { + // eslint-disable-next-line global-require, @typescript-eslint/no-var-requires + const loaded = require('../utils/config.local'); + if (typeof loaded === 'object' && loaded !== null) { + overrides = loaded as Record; + } + } catch { + // no local overrides; rely on env vars + } + + const cfg = { + host: overrides.host ?? process.env.E2E_HOST, + path: overrides.path ?? process.env.E2E_PATH, + token: overrides.token ?? process.env.E2E_ACCESS_TOKEN, + catalog: overrides.catalog ?? process.env.E2E_CATALOG, + schema: overrides.schema ?? process.env.E2E_SCHEMA, + }; + const missing = Object.entries(cfg) + .filter(([, v]) => v === undefined || v === '') + .map(([k]) => k); + if (missing.length > 0) { + // eslint-disable-next-line no-console + console.warn( + `[telemetry-integration] Skipping suite: missing E2E config [${missing.join(', ')}]. ` + + `Set E2E_HOST/E2E_PATH/E2E_ACCESS_TOKEN/E2E_CATALOG/E2E_SCHEMA or create tests/e2e/utils/config.local.{ts,js}.`, + ); + suite.pending = true; + return null; + } + return cfg as TestConfig; +} + +describe('Telemetry Integration', () => { + let config: TestConfig | null = null; + + // The e2e test runner executes many suites before this one; any earlier + // DBSQLClient connect leaves a TelemetryClient in the process-wide singleton + // for the test host. 
Reset before our first test so the FF cache and + // refcount spies in this suite observe a clean lineage. + before(function () { + config = loadConfigOrSkip(this.test!.parent!); + if (!config) { + this.skip(); + return; + } + TelemetryClientProvider.__resetInstanceForTests(); + }); + + // Reset the process-wide singleton between tests so refcount + cached + // feature flags from one test don't leak into the next. Combined with the + // `before` reset above, every test sees a fresh provider regardless of + // what other e2e suites did first. + beforeEach(() => { + TelemetryClientProvider.__resetInstanceForTests(); + }); + + afterEach(() => { + TelemetryClientProvider.__resetInstanceForTests(); + sinon.restore(); + }); + + describe('Initialization', () => { + it('should initialize telemetry when telemetryEnabled is true', async function () { + this.timeout(30000); + + const client = new DBSQLClient(); + + // Spy on the per-host telemetry provider AND the feature-flag cache. + // Both should fire on a telemetry-enabled connect; asserting on both + // guards against a future refactor that bypasses one but not the other. + const featureFlagCacheSpy = sinon.spy(FeatureFlagCache.prototype, 'getOrCreateContext'); + const telemetryProviderSpy = sinon.spy(TelemetryClientProvider.prototype, 'getOrCreateClient'); + + try { + await client.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: true, + }); + + expect(featureFlagCacheSpy.called).to.be.true; + expect(telemetryProviderSpy.callCount).to.equal(1); + // The host the provider was invoked with should match the connect host. 
+ const hostArg = telemetryProviderSpy.firstCall.args[1]; + expect(hostArg).to.equal(config!.host); + + await client.close(); + } finally { + featureFlagCacheSpy.restore(); + telemetryProviderSpy.restore(); + } + }); + + it('should not initialize telemetry when telemetryEnabled is false', async function () { + this.timeout(30000); + + const client = new DBSQLClient(); + + const featureFlagCacheSpy = sinon.spy(FeatureFlagCache.prototype, 'getOrCreateContext'); + + try { + await client.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: false, + }); + + // Verify telemetry was not initialized + expect(featureFlagCacheSpy.called).to.be.false; + + await client.close(); + } finally { + featureFlagCacheSpy.restore(); + } + }); + + it('should respect feature flag when telemetry is enabled', async function () { + this.timeout(30000); + + const client = new DBSQLClient(); + + // Stub feature flag to return false + const featureFlagStub = sinon.stub(FeatureFlagCache.prototype, 'isTelemetryEnabled').resolves(false); + + try { + await client.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: true, + }); + + // Verify feature flag was checked + expect(featureFlagStub.called).to.be.true; + + await client.close(); + } finally { + featureFlagStub.restore(); + } + }); + }); + + describe('Reference Counting', () => { + it('should share telemetry client across multiple connections to same host', async function () { + this.timeout(60000); + + const client1 = new DBSQLClient(); + const client2 = new DBSQLClient(); + + const getOrCreateClientSpy = sinon.spy(TelemetryClientProvider.prototype, 'getOrCreateClient'); + const releaseClientSpy = sinon.spy(TelemetryClientProvider.prototype, 'releaseClient'); + + try { + await client1.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: true, + }); + await client2.connect({ + host: config!.host, + path: 
config!.path, + token: config!.token, + telemetryEnabled: true, + }); + + // Each connect calls getOrCreateClient exactly once when the feature + // flag is on (no FF-disabled release path). Tightened from + // `at.least(2)` to `equal(2)` — an off-by-one refcount leak from a + // reconnect path would now fail the test. + expect(getOrCreateClientSpy.callCount).to.equal(2); + // Both calls must target the same host so the singleton actually shares. + const host1 = getOrCreateClientSpy.firstCall.args[1]; + const host2 = getOrCreateClientSpy.secondCall.args[1]; + expect(host1).to.equal(host2); + + await client1.close(); + expect(releaseClientSpy.callCount).to.equal(1); + + await client2.close(); + expect(releaseClientSpy.callCount).to.equal(2); + } finally { + getOrCreateClientSpy.restore(); + releaseClientSpy.restore(); + } + }); + + it('should cleanup telemetry on close', async function () { + this.timeout(30000); + + const client = new DBSQLClient(); + + const releaseClientSpy = sinon.spy(TelemetryClientProvider.prototype, 'releaseClient'); + const aggregatorCloseSpy = sinon.spy(MetricsAggregator.prototype, 'close'); + + try { + await client.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: true, + }); + + await client.close(); + + // releaseClient is the refcount surface; MetricsAggregator.close is + // the cleanup the last refcount holder triggers. Both must fire on a + // clean close. We do NOT assert flush() was called because this test + // never `openSession`s, so the pending-metrics buffer is empty and + // the close-drain pattern legitimately skips the final flush. The + // previous disjunction (`releaseClient || flush || releaseContext`) + // over multiple spies meant a regression breaking one would still + // pass — these explicit asserts catch that. 
+ expect(releaseClientSpy.called, 'releaseClient should be called on close').to.be.true; + expect(aggregatorCloseSpy.called, 'MetricsAggregator.close should run on close').to.be.true; + } finally { + releaseClientSpy.restore(); + aggregatorCloseSpy.restore(); + } + }); + }); + + describe('Error Handling', () => { + it('should continue driver operation when telemetry initialization fails', async function () { + this.timeout(30000); + + const client = new DBSQLClient(); + + // Stub feature flag to throw an error + const featureFlagStub = sinon + .stub(FeatureFlagCache.prototype, 'isTelemetryEnabled') + .rejects(new Error('Feature flag fetch failed')); + + try { + // Connection should succeed even if telemetry fails + await client.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: true, + }); + + // Should be able to open a session + const session = await client.openSession({ + initialCatalog: config!.catalog, + initialSchema: config!.schema, + }); + + // Should be able to execute a query + const operation = await session.executeStatement('SELECT 1 AS test'); + const result = await operation.fetchAll(); + + expect(result).to.have.lengthOf(1); + expect(result[0]).to.deep.equal({ test: 1 }); + + await session.close(); + await client.close(); + } finally { + featureFlagStub.restore(); + } + }); + + it('should continue driver operation when feature flag fetch fails', async function () { + this.timeout(30000); + + const client = new DBSQLClient(); + + // Stub getOrCreateContext to throw + const contextStub = sinon + .stub(FeatureFlagCache.prototype, 'getOrCreateContext') + .throws(new Error('Context creation failed')); + + try { + // Connection should succeed even if telemetry fails + await client.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: true, + }); + + // Should be able to open a session + const session = await client.openSession({ + initialCatalog: config!.catalog, + 
initialSchema: config!.schema, + }); + + await session.close(); + await client.close(); + } finally { + contextStub.restore(); + } + }); + + it('should not throw exceptions due to telemetry errors', async function () { + this.timeout(30000); + + const client = new DBSQLClient(); + + // Stub multiple telemetry methods to throw + const emitterStub = sinon + .stub(TelemetryEventEmitter.prototype, 'emitConnectionOpen') + .throws(new Error('Emitter failed')); + const aggregatorStub = sinon + .stub(MetricsAggregator.prototype, 'processEvent') + .throws(new Error('Aggregator failed')); + + try { + // Connection should not throw + await client.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: true, + }); + + // Driver operations should work normally + const session = await client.openSession({ + initialCatalog: config!.catalog, + initialSchema: config!.schema, + }); + + await session.close(); + await client.close(); + } finally { + emitterStub.restore(); + aggregatorStub.restore(); + } + }); + }); + + describe('Configuration', () => { + it('should read telemetry config from ClientConfig', function () { + this.timeout(30000); + + const client = new DBSQLClient(); + const clientConfig = client.getConfig(); + + // Verify default telemetry config exists + expect(clientConfig).to.have.property('telemetryEnabled'); + expect(clientConfig).to.have.property('telemetryBatchSize'); + expect(clientConfig).to.have.property('telemetryFlushIntervalMs'); + expect(clientConfig).to.have.property('telemetryMaxRetries'); + expect(clientConfig).to.have.property('telemetryAuthenticatedExport'); + expect(clientConfig).to.have.property('telemetryCircuitBreakerThreshold'); + expect(clientConfig).to.have.property('telemetryCircuitBreakerTimeout'); + + // Verify default values. telemetryEnabled defaults to true (gated by + // remote feature flag and DATABRICKS_TELEMETRY_DISABLED env var). 
+ expect(clientConfig.telemetryEnabled).to.equal(true); + expect(clientConfig.telemetryBatchSize).to.equal(100); + expect(clientConfig.telemetryFlushIntervalMs).to.equal(5000); + expect(clientConfig.telemetryMaxRetries).to.equal(3); + expect(clientConfig.telemetryAuthenticatedExport).to.equal(true); + expect(clientConfig.telemetryCircuitBreakerThreshold).to.equal(5); + expect(clientConfig.telemetryCircuitBreakerTimeout).to.equal(60000); + }); + + it('should allow override via ConnectionOptions', async function () { + this.timeout(30000); + + const client = new DBSQLClient(); + + // Default is true; verify explicit override to false. + expect(client.getConfig().telemetryEnabled).to.equal(true); + + try { + await client.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: false, + }); + + // Config should reflect the override + expect(client.getConfig().telemetryEnabled).to.equal(false); + + await client.close(); + } catch (error) { + // Clean up even if test fails + await client.close(); + throw error; + } + }); + }); + + describe('End-to-End Telemetry Flow', () => { + it('should emit events during driver operations when telemetry is enabled', async function () { + this.timeout(30000); + + const client = new DBSQLClient(); + + const emitSpy = sinon.spy(TelemetryEventEmitter.prototype, 'emit'); + + try { + await client.connect({ + host: config!.host, + path: config!.path, + token: config!.token, + telemetryEnabled: true, + }); + + const session = await client.openSession({ + initialCatalog: config!.catalog, + initialSchema: config!.schema, + }); + + const operation = await session.executeStatement('SELECT 1 AS test'); + await operation.fetchAll(); + + // If the feature flag is on, the emitter MUST have produced at least + // CONNECTION_OPEN + STATEMENT_START events by now. 
We don't assert + // call count because the feature flag is server-controlled — but if + // any events fired, the typed event-name argument should match one + // of the telemetry event types. + if (emitSpy.called) { + const validEventTypes = new Set([ + 'connection.open', + 'connection.close', + 'statement.start', + 'statement.complete', + 'cloudfetch.chunk', + 'telemetry.error', + ]); + for (const call of emitSpy.getCalls()) { + const eventName = String(call.args[0]); + expect(validEventTypes.has(eventName), `Unexpected event name: ${eventName}`).to.be.true; + } + } + + await session.close(); + await client.close(); + } finally { + emitSpy.restore(); + } + }); + }); +}); diff --git a/tests/unit/.stubs/ClientContextStub.ts b/tests/unit/.stubs/ClientContextStub.ts index 519316ff..5bd48da9 100644 --- a/tests/unit/.stubs/ClientContextStub.ts +++ b/tests/unit/.stubs/ClientContextStub.ts @@ -3,7 +3,10 @@ import IConnectionProvider from '../../../lib/connection/contracts/IConnectionPr import IDriver from '../../../lib/contracts/IDriver'; import IThriftClient from '../../../lib/contracts/IThriftClient'; import IDBSQLLogger from '../../../lib/contracts/IDBSQLLogger'; +import IAuthentication from '../../../lib/connection/contracts/IAuthentication'; import DBSQLClient from '../../../lib/DBSQLClient'; +import TelemetryEventEmitter from '../../../lib/telemetry/TelemetryEventEmitter'; +import MetricsAggregator from '../../../lib/telemetry/MetricsAggregator'; import LoggerStub from './LoggerStub'; import ThriftClientStub from './ThriftClientStub'; @@ -21,6 +24,12 @@ export default class ClientContextStub implements IClientContext { public connectionProvider = new ConnectionProviderStub(); + // Tests can assign a stub emitter / aggregator to assert on emit calls. + // Defaults to undefined so optional-chaining `?.()` no-ops in unrelated tests. 
+ public telemetryEmitter?: TelemetryEventEmitter; + + public telemetryAggregator?: MetricsAggregator; + constructor(configOverrides: Partial = {}) { this.configOverrides = configOverrides; } @@ -48,4 +57,16 @@ export default class ClientContextStub implements IClientContext { public async getDriver(): Promise { return this.driver; } + + public getAuthProvider(): IAuthentication | undefined { + return undefined; + } + + public getTelemetryEmitter(): TelemetryEventEmitter | undefined { + return this.telemetryEmitter; + } + + public getTelemetryAggregator(): MetricsAggregator | undefined { + return this.telemetryAggregator; + } } diff --git a/tests/unit/DBSQLClient.test.ts b/tests/unit/DBSQLClient.test.ts index 4c0a3a34..184e25a3 100644 --- a/tests/unit/DBSQLClient.test.ts +++ b/tests/unit/DBSQLClient.test.ts @@ -17,6 +17,10 @@ import IAuthentication from '../../lib/connection/contracts/IAuthentication'; import AuthProviderStub from './.stubs/AuthProviderStub'; import ConnectionProviderStub from './.stubs/ConnectionProviderStub'; import { TProtocolVersion } from '../../thrift/TCLIService_types'; +import TelemetryClientProvider from '../../lib/telemetry/TelemetryClientProvider'; +import FeatureFlagCache from '../../lib/telemetry/FeatureFlagCache'; +import TelemetryEventEmitter from '../../lib/telemetry/TelemetryEventEmitter'; +import { LogLevel } from '../../lib/contracts/IDBSQLLogger'; const connectOptions = { host: '127.0.0.1', @@ -682,3 +686,188 @@ describe('DBSQLClient.enableMetricViewMetadata', () => { }); }); }); + +describe('DBSQLClient telemetry paths', () => { + // Reset the process-wide singleton between tests so refcount + cached + // feature flags from one test don't leak into the next. Mirrors the e2e + // suite's afterEach pattern but scoped to unit-level state. 
+  afterEach(() => {
+    TelemetryClientProvider.__resetInstanceForTests();
+    sinon.restore();
+  });
+
+  describe('DATABRICKS_TELEMETRY_DISABLED env-var parser', () => {
+    let savedEnv: string | undefined;
+    beforeEach(() => {
+      savedEnv = process.env.DATABRICKS_TELEMETRY_DISABLED;
+    });
+    afterEach(() => {
+      if (savedEnv === undefined) {
+        delete process.env.DATABRICKS_TELEMETRY_DISABLED;
+      } else {
+        process.env.DATABRICKS_TELEMETRY_DISABLED = savedEnv;
+      }
+    });
+
+    const recognizedTruthy = ['1', 'true', 'TRUE', 'yes', 'YES', 'on', 'ON', ' true ', 'On'];
+    const unrecognized = ['0', 'false', 'no', 'off', 'False', 'OFF'];
+
+    for (const val of recognizedTruthy) {
+      it(`DISABLES telemetry when value is '${val}'`, async () => {
+        process.env.DATABRICKS_TELEMETRY_DISABLED = val;
+        const client = new DBSQLClient();
+        // Spy on initializeTelemetry so we can assert it is never called (a spy calls through, so zero calls proves the env var short-circuited).
+        const initSpy = sinon.spy(client as any, 'initializeTelemetry');
+        await client.connect(connectOptions);
+        expect(initSpy.callCount).to.equal(0);
+      });
+    }
+
+    for (const val of unrecognized) {
+      it(`WARNS and does NOT disable when value is '${val}'`, async () => {
+        process.env.DATABRICKS_TELEMETRY_DISABLED = val;
+        const client = new DBSQLClient();
+        const logger = (client as any).logger as { log: (l: LogLevel, m: string) => void };
+        const logSpy = sinon.spy(logger, 'log');
+        // Suppress the actual telemetry init network calls by stubbing.
+ sinon.stub(client as any, 'initializeTelemetry').resolves(); + await client.connect(connectOptions); + const warnCalls = logSpy + .getCalls() + .filter((c) => c.args[0] === LogLevel.warn && /DATABRICKS_TELEMETRY_DISABLED/.test(c.args[1] as string)); + expect(warnCalls.length, 'expected a warn-level log about ignored env value').to.be.at.least(1); + }); + } + + it('treats empty / unset env var as no-op (no warn)', async () => { + delete process.env.DATABRICKS_TELEMETRY_DISABLED; + const client = new DBSQLClient(); + const logger = (client as any).logger as { log: (l: LogLevel, m: string) => void }; + const logSpy = sinon.spy(logger, 'log'); + sinon.stub(client as any, 'initializeTelemetry').resolves(); + await client.connect(connectOptions); + const warnCalls = logSpy + .getCalls() + .filter((c) => c.args[0] === LogLevel.warn && /DATABRICKS_TELEMETRY_DISABLED/.test(c.args[1] as string)); + expect(warnCalls.length).to.equal(0); + }); + }); + + describe('extractWorkspaceId', () => { + it('returns the numeric o= param from httpPath', () => { + const client = new DBSQLClient(); + (client as any).httpPath = '/sql/1.0/warehouses/abc?o=12345678901234'; + const id = (client as any).extractWorkspaceId(); + expect(id).to.equal('12345678901234'); + }); + + it('returns undefined when no query string', () => { + const client = new DBSQLClient(); + (client as any).httpPath = '/sql/1.0/warehouses/abc'; + expect((client as any).extractWorkspaceId()).to.be.undefined; + }); + + it('returns undefined when o= is not numeric', () => { + const client = new DBSQLClient(); + (client as any).httpPath = '/sql/1.0/warehouses/abc?o=tenant_xyz'; + expect((client as any).extractWorkspaceId()).to.be.undefined; + }); + + it('handles o= as a non-first param', () => { + const client = new DBSQLClient(); + (client as any).httpPath = '/sql/1.0/warehouses/abc?foo=bar&o=42&baz=qux'; + expect((client as any).extractWorkspaceId()).to.equal('42'); + }); + + it('returns undefined when httpPath is unset', 
() => { + const client = new DBSQLClient(); + expect((client as any).extractWorkspaceId()).to.be.undefined; + }); + }); + + describe('telemetry refcount on reconnect', () => { + it('releases the prior refcount when connect() is called twice', async () => { + const client = new DBSQLClient(); + // Stub out feature-flag fetch to return true so the telemetry path runs. + sinon.stub(FeatureFlagCache.prototype, 'isTelemetryEnabled').resolves(true); + const releaseSpy = sinon.spy(TelemetryClientProvider.prototype, 'releaseClient'); + + await client.connect(connectOptions); + // Sanity: refcount on host should be 1 after first connect. + expect(TelemetryClientProvider.getInstance().getRefCount(connectOptions.host)).to.equal(1); + + // Second connect to a different host should release the prior refcount. + await client.connect({ ...connectOptions, host: '127.0.0.2' }); + expect(releaseSpy.called, 'releaseClient should fire on reconnect').to.be.true; + // The old host should have decremented to 0 (closed and removed). + expect(TelemetryClientProvider.getInstance().getRefCount(connectOptions.host)).to.equal(0); + + await client.close(); + }); + }); + + describe('telemetry refcount release path on init failure', () => { + it('releases refcount when feature flag fetch throws', async () => { + const client = new DBSQLClient(); + sinon.stub(FeatureFlagCache.prototype, 'isTelemetryEnabled').rejects(new Error('boom')); + const releaseSpy = sinon.spy(TelemetryClientProvider.prototype, 'releaseClient'); + + await client.connect(connectOptions); + + // The init failure should release the refcount it just acquired so the + // per-host TelemetryClient doesn't leak its flush timer for the + // lifetime of the process. 
+ expect(releaseSpy.called, 'releaseClient should run on init failure').to.be.true; + expect(TelemetryClientProvider.getInstance().getRefCount(connectOptions.host)).to.equal(0); + }); + }); + + describe('CONNECTION_OPEN driverConfig de-duplication (F15)', () => { + it('ships driverConfig on the first openSession only', async () => { + const client = new DBSQLClient(); + const thriftClient = new ThriftClientStub(); + sinon.stub(client, 'getClient').returns(Promise.resolve(thriftClient)); + // Need a working emitter so we can spy on emitConnectionOpen. + sinon.stub(FeatureFlagCache.prototype, 'isTelemetryEnabled').resolves(true); + + const emitSpy = sinon.spy(TelemetryEventEmitter.prototype, 'emitConnectionOpen'); + + await client.connect(connectOptions); + await client.openSession(); + await client.openSession(); + await client.openSession(); + + // 3 sessions → 3 emitConnectionOpen calls, but only the first one + // should carry a non-undefined driverConfig blob. + const calls = emitSpy.getCalls(); + expect(calls.length, 'three openSession should emit three CONNECTION_OPEN events').to.equal(3); + expect(calls[0].args[0].driverConfig, 'first session ships full driverConfig').to.not.be.undefined; + expect(calls[1].args[0].driverConfig, 'second session omits driverConfig').to.be.undefined; + expect(calls[2].args[0].driverConfig, 'third session omits driverConfig').to.be.undefined; + + await client.close(); + }); + }); + + describe('getTelemetryStats', () => { + it('returns undefined when telemetry is disabled', async () => { + const client = new DBSQLClient(); + await client.connect({ ...connectOptions, telemetryEnabled: false }); + expect(client.getTelemetryStats()).to.be.undefined; + }); + + it('returns a populated snapshot when telemetry is enabled', async () => { + const client = new DBSQLClient(); + sinon.stub(FeatureFlagCache.prototype, 'isTelemetryEnabled').resolves(true); + await client.connect(connectOptions); + const stats = client.getTelemetryStats(); + 
expect(stats, 'stats should be populated when telemetry is on').to.not.be.undefined; + expect(stats!.host).to.equal(connectOptions.host); + expect(stats!.pendingMetricsCount).to.be.a('number'); + expect(stats!.droppedMetrics).to.be.a('number'); + expect(stats!.evictedStatements).to.be.a('number'); + expect(stats!.circuitBreakerState).to.be.oneOf(['CLOSED', 'OPEN', 'HALF_OPEN']); + await client.close(); + }); + }); +}); diff --git a/tests/unit/DBSQLOperation.test.ts b/tests/unit/DBSQLOperation.test.ts index b5f142ba..9ddd669f 100644 --- a/tests/unit/DBSQLOperation.test.ts +++ b/tests/unit/DBSQLOperation.test.ts @@ -1177,4 +1177,81 @@ describe('DBSQLOperation', () => { expect(driver.getResultSetMetadata.callCount).to.equal(1); }); }); + + describe('error-telemetry wrapper (F10/F17)', () => { + function makeContextWithEmitter() { + const context = new ClientContextStub(); + const emitError = sinon.stub(); + context.telemetryEmitter = { + emitError, + emitStatementStart: sinon.stub(), + emitStatementComplete: sinon.stub(), + } as any; + return { context, emitError }; + } + + it('emits ERROR telemetry when cancel rejects', async () => { + const { context, emitError } = makeContextWithEmitter(); + context.driver.cancelOperationResp.status.statusCode = TStatusCode.ERROR_STATUS; + const operation = new DBSQLOperation({ handle: operationHandleStub({ hasResultSet: true }), context }); + + await expectFailure(() => operation.cancel()); + + expect(emitError.calledOnce, 'emitError should fire once on rejected cancel').to.be.true; + expect(emitError.firstCall.args[0].errorName).to.match(/Status\s?Error/); + }); + + it('emits ERROR telemetry when close rejects', async () => { + const { context, emitError } = makeContextWithEmitter(); + context.driver.closeOperationResp.status.statusCode = TStatusCode.ERROR_STATUS; + const operation = new DBSQLOperation({ handle: operationHandleStub({ hasResultSet: true }), context }); + + await expectFailure(() => operation.close()); + + 
expect(emitError.calledOnce).to.be.true; + }); + + it('emits ERROR telemetry when getMetadata rejects', async () => { + const { context, emitError } = makeContextWithEmitter(); + // Force the driver to fail metadata fetch + sinon.stub(context.driver, 'getResultSetMetadata').rejects(new Error('metadata-failed')); + const operation = new DBSQLOperation({ handle: operationHandleStub({ hasResultSet: true }), context }); + + await expectFailure(() => operation.getMetadata()); + + expect(emitError.calledOnce).to.be.true; + expect(emitError.firstCall.args[0].errorMessage).to.equal('metadata-failed'); + }); + + it('emits ERROR telemetry from finished/getSchema/hasMoreRows when called on a closed op (F10 — newly wrapped)', async () => { + // Smaller invariant: closed-op rejection (synchronous error) goes through + // the new wrappers' telemetry path. Avoids the polling loops that happen + // when state is ERROR_STATE — those would require driver-status mock + // sequencing that isn't worth the complexity for this assertion. 
+ for (const fn of ['finished', 'getSchema', 'hasMoreRows'] as const) { + const { context, emitError } = makeContextWithEmitter(); + const operation = new DBSQLOperation({ handle: operationHandleStub({ hasResultSet: true }), context }); + await operation.close(); + emitError.resetHistory(); // close() also emits telemetry; we only want to assert on the new wrapper + if (fn === 'hasMoreRows') { + // hasMoreRows short-circuits to false on closed without throwing + const r = await operation.hasMoreRows(); + expect(r).to.equal(false); + } else { + await expectFailure(() => (operation as any)[fn]()); + expect(emitError.called, `${fn}() should fire emitError on closed op`).to.be.true; + } + } + }); + + it('does not throw when no telemetry emitter is wired', async () => { + const context = new ClientContextStub(); + // emitter intentionally left undefined + context.driver.cancelOperationResp.status.statusCode = TStatusCode.ERROR_STATUS; + const operation = new DBSQLOperation({ handle: operationHandleStub({ hasResultSet: true }), context }); + + // The cancel still rejects, but no extra error from the missing emitter + await expectFailure(() => operation.cancel()); + }); + }); }); diff --git a/tests/unit/result/RowSetProvider.test.ts b/tests/unit/result/RowSetProvider.test.ts new file mode 100644 index 00000000..f2aebe44 --- /dev/null +++ b/tests/unit/result/RowSetProvider.test.ts @@ -0,0 +1,162 @@ +/** + * Copyright (c) 2025 Databricks Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +import { expect } from 'chai'; +import sinon from 'sinon'; +import Int64 from 'node-int64'; +import { TOperationHandle, TOperationType, THandleIdentifier } from '../../../thrift/TCLIService_types'; +import RowSetProvider from '../../../lib/result/RowSetProvider'; +import ClientContextStub from '../.stubs/ClientContextStub'; + +function makeOperationHandle(): TOperationHandle { + return { + operationId: { + guid: Buffer.from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]), + secret: Buffer.from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]), + } as THandleIdentifier, + operationType: TOperationType.EXECUTE_STATEMENT, + hasResultSet: true, + }; +} + +describe('RowSetProvider', () => { + describe('chunk telemetry emission', () => { + it('emits CLOUDFETCH_CHUNK with monotonic chunkIndex per fetch', async () => { + const context = new ClientContextStub(); + const emitCloudFetchChunk = sinon.stub(); + context.telemetryEmitter = { emitCloudFetchChunk } as any; + + const handle = makeOperationHandle(); + const provider = new RowSetProvider(context, handle, [], false, 'stmt-1'); + + await provider.fetchNext({ limit: 100 }); + await provider.fetchNext({ limit: 100 }); + + expect(emitCloudFetchChunk.calledTwice).to.be.true; + expect(emitCloudFetchChunk.firstCall.args[0].chunkIndex).to.equal(0); + expect(emitCloudFetchChunk.secondCall.args[0].chunkIndex).to.equal(1); + expect(emitCloudFetchChunk.firstCall.args[0].statementId).to.equal('stmt-1'); + }); + + it('does not emit when statementId is undefined', async () => { + const context = new ClientContextStub(); + const emitCloudFetchChunk = sinon.stub(); + context.telemetryEmitter = { emitCloudFetchChunk } as any; + + const provider = new RowSetProvider(context, makeOperationHandle(), [], false /* no statementId */); + + await provider.fetchNext({ limit: 100 }); + + expect(emitCloudFetchChunk.called).to.be.false; + }); + 
+ it('does not emit when telemetry emitter is undefined', async () => { + const context = new ClientContextStub(); + // emitter left undefined → safeEmit short-circuits + + const provider = new RowSetProvider(context, makeOperationHandle(), [], false, 'stmt-1'); + + // No assertion on rejection — should resolve normally without emitter + const result = await provider.fetchNext({ limit: 100 }); + expect(result).to.exist; + }); + + it('swallows emitter exceptions and does not throw to the caller', async () => { + const context = new ClientContextStub(); + const logSpy = sinon.spy(context.logger, 'log'); + const emitCloudFetchChunk = sinon.stub().throws(new Error('boom')); + context.telemetryEmitter = { emitCloudFetchChunk } as any; + + const provider = new RowSetProvider(context, makeOperationHandle(), [], false, 'stmt-1'); + + // Should not throw — telemetry never breaks the driver + const result = await provider.fetchNext({ limit: 100 }); + expect(result).to.exist; + + // Should log at debug + const debugCall = logSpy.getCalls().find((c) => /Telemetry emit error/.test(c.args[1] as string)); + expect(debugCall, 'should log a debug-level emit-error line').to.exist; + }); + + it('sums arrowBatches lengths into the bytes field', async () => { + const context = new ClientContextStub(); + const emitCloudFetchChunk = sinon.stub(); + context.telemetryEmitter = { emitCloudFetchChunk } as any; + + // Override the driver's response with arrow batches only — no inline + // columns, no binary columns — so the assertion is on a single shape. 
+ context.driver.fetchResultsResp = { + ...context.driver.fetchResultsResp, + results: { + startRowOffset: new Int64(0), + rows: [], + arrowBatches: [ + { batch: Buffer.from(new Uint8Array(100)), rowCount: new Int64(10) }, + { batch: Buffer.from(new Uint8Array(50)), rowCount: new Int64(5) }, + ], + }, + }; + + const provider = new RowSetProvider(context, makeOperationHandle(), [], false, 'stmt-1'); + await provider.fetchNext({ limit: 100 }); + + expect(emitCloudFetchChunk.calledOnce).to.be.true; + expect(emitCloudFetchChunk.firstCall.args[0].bytes).to.equal(150); + }); + + it('counts column-based result bytes (F10)', async () => { + const context = new ClientContextStub(); + const emitCloudFetchChunk = sinon.stub(); + context.telemetryEmitter = { emitCloudFetchChunk } as any; + + // COLUMN_BASED_SET shape: string and i32 columns, no arrow batches. + // Before F10, this path emitted bytes:0 because the byte counter only + // looked at arrowBatches. + context.driver.fetchResultsResp = { + ...context.driver.fetchResultsResp, + results: { + startRowOffset: new Int64(0), + rows: [], + columns: [ + { stringVal: { values: ['hello', 'world'], nulls: Buffer.from([0x00]) } }, + { i32Val: { values: [1, 2, 3], nulls: Buffer.from([0x00, 0x00]) } }, + ], + }, + }; + + const provider = new RowSetProvider(context, makeOperationHandle(), [], false, 'stmt-1'); + await provider.fetchNext({ limit: 100 }); + + expect(emitCloudFetchChunk.calledOnce).to.be.true; + // strings: "hello"(5) + "world"(5) + nulls(1) = 11; i32: 3*4 + nulls(2) = 14; total 25 + expect(emitCloudFetchChunk.firstCall.args[0].bytes).to.equal(25); + }); + + it('counts binaryColumns bytes (F10)', async () => { + const context = new ClientContextStub(); + const emitCloudFetchChunk = sinon.stub(); + context.telemetryEmitter = { emitCloudFetchChunk } as any; + + context.driver.fetchResultsResp = { + ...context.driver.fetchResultsResp, + results: { + startRowOffset: new Int64(0), + rows: [], + binaryColumns: 
Buffer.from(new Uint8Array(64)), + }, + }; + + const provider = new RowSetProvider(context, makeOperationHandle(), [], false, 'stmt-1'); + await provider.fetchNext({ limit: 100 }); + + expect(emitCloudFetchChunk.firstCall.args[0].bytes).to.equal(64); + }); + }); +}); diff --git a/tests/unit/telemetry/MetricsAggregator.test.ts b/tests/unit/telemetry/MetricsAggregator.test.ts index e3d1bb01..00d71d16 100644 --- a/tests/unit/telemetry/MetricsAggregator.test.ts +++ b/tests/unit/telemetry/MetricsAggregator.test.ts @@ -121,6 +121,79 @@ describe('MetricsAggregator', () => { expect(() => aggregator.completeStatement('unknown-stmt')).to.not.throw(); }); + + it('aggregates chunk timing — initial is first-seen, slowest is max, sum accumulates', () => { + const context = new ClientContextStub(); + const exporter = makeExporterStub(); + const aggregator = new MetricsAggregator(context, exporter as any); + + aggregator.processEvent(statementEvent(TelemetryEventType.STATEMENT_START)); + aggregator.processEvent(statementEvent(TelemetryEventType.CLOUDFETCH_CHUNK, { latencyMs: 100, bytes: 10 })); + aggregator.processEvent(statementEvent(TelemetryEventType.CLOUDFETCH_CHUNK, { latencyMs: 250, bytes: 10 })); + aggregator.processEvent(statementEvent(TelemetryEventType.CLOUDFETCH_CHUNK, { latencyMs: 75, bytes: 10 })); + + aggregator.completeStatement('stmt-1'); + aggregator.flush(); + + const stmtMetric = exporter.export.firstCall.args[0][0]; + expect(stmtMetric.chunkInitialLatencyMs).to.equal(100); + expect(stmtMetric.chunkSlowestLatencyMs).to.equal(250); + expect(stmtMetric.chunkSumLatencyMs).to.equal(425); + }); + + it('chunks with non-positive latency do not contribute to timing fields', () => { + const context = new ClientContextStub(); + const exporter = makeExporterStub(); + const aggregator = new MetricsAggregator(context, exporter as any); + + aggregator.processEvent(statementEvent(TelemetryEventType.STATEMENT_START)); + // latency=0 (cached/prefetched page) — must be ignored 
entirely. + aggregator.processEvent(statementEvent(TelemetryEventType.CLOUDFETCH_CHUNK, { latencyMs: 0, bytes: 5 })); + // latency undefined — emitter didn't set it, must be ignored. + aggregator.processEvent(statementEvent(TelemetryEventType.CLOUDFETCH_CHUNK, { bytes: 5 })); + // First *positive* latency wins for `initial`, even though earlier chunks already arrived. + aggregator.processEvent(statementEvent(TelemetryEventType.CLOUDFETCH_CHUNK, { latencyMs: 60, bytes: 5 })); + + aggregator.completeStatement('stmt-1'); + aggregator.flush(); + + const stmtMetric = exporter.export.firstCall.args[0][0]; + expect(stmtMetric.chunkInitialLatencyMs).to.equal(60); + expect(stmtMetric.chunkSlowestLatencyMs).to.equal(60); + expect(stmtMetric.chunkSumLatencyMs).to.equal(60); + expect(stmtMetric.chunkCount).to.equal(3); // chunkCount counts all chunks regardless of latency + }); + }); + + describe('processEvent() - CONNECTION_CLOSE', () => { + it('emits a DELETE_SESSION connection metric immediately', () => { + const context = new ClientContextStub(); + const exporter = makeExporterStub(); + const aggregator = new MetricsAggregator(context, exporter as any); + + aggregator.processEvent(connectionEvent({ eventType: TelemetryEventType.CONNECTION_CLOSE, latencyMs: 42 })); + aggregator.flush(); + + expect(exporter.export.calledOnce).to.be.true; + const metric = exporter.export.firstCall.args[0][0]; + expect(metric.metricType).to.equal('connection'); + expect(metric.operationType).to.equal('DELETE_SESSION'); + expect(metric.latencyMs).to.equal(42); + }); + + it('CONNECTION_OPEN and CONNECTION_CLOSE produce distinct operation types in the same batch', () => { + const context = new ClientContextStub(); + const exporter = makeExporterStub(); + const aggregator = new MetricsAggregator(context, exporter as any); + + aggregator.processEvent(connectionEvent({ eventType: TelemetryEventType.CONNECTION_OPEN, latencyMs: 100 })); + aggregator.processEvent(connectionEvent({ eventType: 
TelemetryEventType.CONNECTION_CLOSE, latencyMs: 5 })); + aggregator.flush(); + + const batch = exporter.export.firstCall.args[0]; + expect(batch).to.have.lengthOf(2); + expect(batch.map((m: any) => m.operationType)).to.deep.equal(['CREATE_SESSION', 'DELETE_SESSION']); + }); }); describe('processEvent() - error events', () => { diff --git a/tests/unit/telemetry/TelemetryClient.test.ts b/tests/unit/telemetry/TelemetryClient.test.ts index 4f4c3f1d..ba8af8c7 100644 --- a/tests/unit/telemetry/TelemetryClient.test.ts +++ b/tests/unit/telemetry/TelemetryClient.test.ts @@ -142,11 +142,124 @@ describe('TelemetryClient', () => { const logSpy = sinon.spy(context.logger, 'log'); const client = new TelemetryClient(context, HOST); - client.close(); + await client.close(); logSpy.getCalls().forEach((call) => { - expect(call.args[0]).to.equal(LogLevel.debug); + expect(call.args[0]).to.satisfy((lvl: string) => lvl === LogLevel.debug || lvl === LogLevel.warn); }); }); }); + + describe('feature-flag context registration (F1)', () => { + it('should register the host with the feature-flag cache on construction', () => { + const context = new ClientContextStub(); + const client = new TelemetryClient(context, HOST); + + // Without F1's wiring, isTelemetryEnabled returns false because the + // contexts map is empty. With F1, the constructor registers the host + // so the cache is ready to fetch the flag. + const cache = client.getFeatureFlagCache(); + // Internal access for assertion only — tests on getInstance/resetInstance + // would otherwise leak across the singleton. 
+ const ctx = (cache as any).contexts.get(HOST); + expect(ctx, 'context should exist after TelemetryClient construction').to.exist; + expect(ctx.refCount, 'refCount should be 1 after registration').to.equal(1); + }); + + it('should release the feature-flag context on close', async () => { + const context = new ClientContextStub(); + const client = new TelemetryClient(context, HOST); + const cache = client.getFeatureFlagCache(); + + await client.close(); + + const ctx = (cache as any).contexts.get(HOST); + expect(ctx, 'context should be removed on close (refCount → 0)').to.be.undefined; + }); + }); + + describe('multi-context FIFO', () => { + it('registerContext appends contexts in registration order', () => { + const ctxA = new ClientContextStub(); + const ctxB = new ClientContextStub(); + const client = new TelemetryClient(ctxA, HOST); + + client.registerContext(ctxB); + + const internal = client as any; + expect(internal.contexts).to.have.lengthOf(2); + // `contexts` is now an array of {context, authProvider} snapshots so + // unregisterContext can use the cached auth-provider reference even + // after a registrant rotates credentials. Unwrap `.context` to assert. 
+ expect(internal.contexts[0].context).to.equal(ctxA); + expect(internal.contexts[1].context).to.equal(ctxB); + }); + + it('registerContext is idempotent for the same context', () => { + const ctxA = new ClientContextStub(); + const client = new TelemetryClient(ctxA, HOST); + + client.registerContext(ctxA); + client.registerContext(ctxA); + + expect((client as any).contexts).to.have.lengthOf(1); + }); + + it('unregisterContext removes the context', () => { + const ctxA = new ClientContextStub(); + const ctxB = new ClientContextStub(); + const client = new TelemetryClient(ctxA, HOST); + client.registerContext(ctxB); + + client.unregisterContext(ctxA); + + const internal = client as any; + expect(internal.contexts).to.have.lengthOf(1); + expect(internal.contexts[0].context).to.equal(ctxB); + }); + + it('warns when a registered context has divergent telemetry config (F12)', () => { + const ctxA = new ClientContextStub({ telemetryAuthenticatedExport: true }); + const ctxB = new ClientContextStub({ telemetryAuthenticatedExport: false }); + const client = new TelemetryClient(ctxA, HOST); + const logSpy = sinon.spy(ctxA.logger, 'log'); + + client.registerContext(ctxB); + + const warnCall = logSpy + .getCalls() + .find((c) => c.args[0] === LogLevel.warn && /telemetry settings .* differ/.test(c.args[1] as string)); + expect(warnCall, 'should warn about divergent telemetryAuthenticatedExport').to.exist; + }); + }); + + describe('async close()', () => { + it('returns a Promise that resolves only after aggregator close', async () => { + const context = new ClientContextStub(); + const client = new TelemetryClient(context, HOST); + const aggregator = client.getAggregator(); + const aggCloseStub = sinon.stub(aggregator, 'close').callsFake( + () => + new Promise((resolve) => { + setTimeout(resolve, 5); + }), + ); + + const closePromise = client.close(); + expect(client.isClosed(), 'closed flag is set synchronously').to.be.true; + expect(aggCloseStub.calledOnce).to.be.true; + 
await closePromise; + }); + + it('is idempotent — second close awaits without re-running aggregator close', async () => { + const context = new ClientContextStub(); + const client = new TelemetryClient(context, HOST); + const aggCloseStub = sinon.stub(client.getAggregator(), 'close').resolves(); + + await client.close(); + await client.close(); + + expect(aggCloseStub.calledOnce, 'aggregator.close should run exactly once').to.be.true; + }); + }); }); diff --git a/tests/unit/telemetry/TelemetryClientProvider.test.ts b/tests/unit/telemetry/TelemetryClientProvider.test.ts index 59ae3b99..00f638fe 100644 --- a/tests/unit/telemetry/TelemetryClientProvider.test.ts +++ b/tests/unit/telemetry/TelemetryClientProvider.test.ts @@ -28,28 +28,19 @@ describe('TelemetryClientProvider', () => { describe('Constructor', () => { it('should create provider with empty client map', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); expect(provider.getActiveClients().size).to.equal(0); }); - - it('should log creation at debug level', () => { - const context = new ClientContextStub(); - const logSpy = sinon.spy(context.logger, 'log'); - - new TelemetryClientProvider(context); - - expect(logSpy.calledWith(LogLevel.debug, sinon.match(/created.*telemetryclientprovider/i))).to.be.true; - }); }); describe('getOrCreateClient', () => { it('should create one client per host', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client1 = provider.getOrCreateClient(HOST1); - const client2 = provider.getOrCreateClient(HOST2); + const client1 = provider.getOrCreateClient(context, HOST1); + const client2 = provider.getOrCreateClient(context, HOST2); expect(client1).to.be.instanceOf(TelemetryClient); expect(client2).to.be.instanceOf(TelemetryClient); @@ -59,11 +50,11 @@ 
describe('TelemetryClientProvider', () => { it('should share client across multiple connections to same host', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client1 = provider.getOrCreateClient(HOST1); - const client2 = provider.getOrCreateClient(HOST1); - const client3 = provider.getOrCreateClient(HOST1); + const client1 = provider.getOrCreateClient(context, HOST1); + const client2 = provider.getOrCreateClient(context, HOST1); + const client3 = provider.getOrCreateClient(context, HOST1); expect(client1).to.equal(client2); expect(client2).to.equal(client3); @@ -72,34 +63,34 @@ describe('TelemetryClientProvider', () => { it('should increment reference count on each call', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); expect(provider.getRefCount(HOST1)).to.equal(1); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); expect(provider.getRefCount(HOST1)).to.equal(2); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); expect(provider.getRefCount(HOST1)).to.equal(3); }); it('should log client creation at debug level', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); const logSpy = sinon.spy(context.logger, 'log'); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); expect(logSpy.calledWith(LogLevel.debug, sinon.match(/created new telemetryclient/i))).to.be.true; }); it('should log reference count at debug level', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); 
const logSpy = sinon.spy(context.logger, 'log'); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); expect(logSpy.calledWith(LogLevel.debug, sinon.match(/reference count for/i).and(sinon.match(/: 1$/)))).to.be .true; @@ -107,9 +98,9 @@ describe('TelemetryClientProvider', () => { it('should pass context to TelemetryClient', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client = provider.getOrCreateClient(HOST1); + const client = provider.getOrCreateClient(context, HOST1); expect(client.getHost()).to.equal(HOST1); }); @@ -118,28 +109,28 @@ describe('TelemetryClientProvider', () => { describe('releaseClient', () => { it('should decrement reference count on release', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - provider.getOrCreateClient(HOST1); - provider.getOrCreateClient(HOST1); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); + provider.getOrCreateClient(context, HOST1); + provider.getOrCreateClient(context, HOST1); expect(provider.getRefCount(HOST1)).to.equal(3); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(provider.getRefCount(HOST1)).to.equal(2); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(provider.getRefCount(HOST1)).to.equal(1); }); it('should close client when reference count reaches zero', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client = provider.getOrCreateClient(HOST1); + const client = provider.getOrCreateClient(context, HOST1); const closeSpy = sinon.spy(client, 'close'); - provider.releaseClient(HOST1); + await provider.releaseClient(context, 
HOST1); expect(closeSpy.calledOnce).to.be.true; expect(client.isClosed()).to.be.true; @@ -147,12 +138,12 @@ describe('TelemetryClientProvider', () => { it('should remove client from map when reference count reaches zero', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); expect(provider.getActiveClients().size).to.equal(1); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(provider.getActiveClients().size).to.equal(0); expect(provider.getRefCount(HOST1)).to.equal(0); @@ -160,14 +151,14 @@ describe('TelemetryClientProvider', () => { it('should NOT close client while other connections exist', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client = provider.getOrCreateClient(HOST1); - provider.getOrCreateClient(HOST1); - provider.getOrCreateClient(HOST1); + const client = provider.getOrCreateClient(context, HOST1); + provider.getOrCreateClient(context, HOST1); + provider.getOrCreateClient(context, HOST1); const closeSpy = sinon.spy(client, 'close'); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(closeSpy.called).to.be.false; expect(client.isClosed()).to.be.false; @@ -176,23 +167,23 @@ describe('TelemetryClientProvider', () => { it('should handle releasing non-existent client gracefully', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); const logSpy = sinon.spy(context.logger, 'log'); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(logSpy.calledWith(LogLevel.debug, sinon.match(/no telemetryclient found/i))).to.be.true; }); 
it('should log reference count decrease at debug level', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); const logSpy = sinon.spy(context.logger, 'log'); - provider.getOrCreateClient(HOST1); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); + provider.getOrCreateClient(context, HOST1); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(logSpy.calledWith(LogLevel.debug, sinon.match(/reference count for/i).and(sinon.match(/: 1$/)))).to.be .true; @@ -200,25 +191,25 @@ describe('TelemetryClientProvider', () => { it('should log client closure at debug level', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); const logSpy = sinon.spy(context.logger, 'log'); - provider.getOrCreateClient(HOST1); - provider.releaseClient(HOST1); + provider.getOrCreateClient(context, HOST1); + await provider.releaseClient(context, HOST1); expect(logSpy.calledWith(LogLevel.debug, sinon.match(/closed and removed telemetryclient/i))).to.be.true; }); it('should swallow Error during client closure', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client = provider.getOrCreateClient(HOST1); + const client = provider.getOrCreateClient(context, HOST1); const error = new Error('Close error'); sinon.stub(client, 'close').throws(error); const logSpy = sinon.spy(context.logger, 'log'); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect( logSpy.calledWith( @@ -230,9 +221,9 @@ describe('TelemetryClientProvider', () => { it('should swallow non-Error throws during client closure', () => { const context = new ClientContextStub(); - const provider = new 
TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client = provider.getOrCreateClient(HOST1); + const client = provider.getOrCreateClient(context, HOST1); // Non-Error throws — string, null, undefined — must not escape the catch. sinon.stub(client, 'close').callsFake(() => { // eslint-disable-next-line no-throw-literal @@ -240,7 +231,7 @@ describe('TelemetryClientProvider', () => { }); const logSpy = sinon.spy(context.logger, 'log'); - expect(() => provider.releaseClient(HOST1)).to.not.throw(); + expect(() => provider.releaseClient(context, HOST1)).to.not.throw(); expect( logSpy.calledWith( LogLevel.debug, @@ -249,29 +240,29 @@ describe('TelemetryClientProvider', () => { ).to.be.true; }); - it('should not throw or corrupt state on double-release', () => { + it('should not throw or corrupt state on double-release', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); // One get, two releases — the second must not throw and must not // leave the provider in a state where refCount is negative. 
- provider.getOrCreateClient(HOST1); - provider.releaseClient(HOST1); + provider.getOrCreateClient(context, HOST1); + await provider.releaseClient(context, HOST1); - expect(() => provider.releaseClient(HOST1)).to.not.throw(); + await provider.releaseClient(context, HOST1); // second release must complete without rejecting expect(provider.getRefCount(HOST1)).to.equal(0); expect(provider.getActiveClients().size).to.equal(0); }); - it('should return a fresh non-closed client after full release', () => { + it('should return a fresh non-closed client after full release', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const first = provider.getOrCreateClient(HOST1); - provider.releaseClient(HOST1); + const first = provider.getOrCreateClient(context, HOST1); + await provider.releaseClient(context, HOST1); expect(first.isClosed()).to.be.true; - const second = provider.getOrCreateClient(HOST1); + const second = provider.getOrCreateClient(context, HOST1); expect(second).to.not.equal(first); expect(second.isClosed()).to.be.false; expect(provider.getRefCount(HOST1)).to.equal(1); @@ -281,13 +272,13 @@ describe('TelemetryClientProvider', () => { describe('Host normalization', () => { it('should treat scheme, case, port and trailing slash as the same host', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const a = provider.getOrCreateClient('workspace.cloud.databricks.com'); - const b = provider.getOrCreateClient('https://workspace.cloud.databricks.com'); - const c = provider.getOrCreateClient('https://WorkSpace.CLOUD.databricks.com/'); - const d = provider.getOrCreateClient('workspace.cloud.databricks.com:443'); - const e = provider.getOrCreateClient(' workspace.cloud.databricks.com. 
'); + const a = provider.getOrCreateClient(context, 'workspace.cloud.databricks.com'); + const b = provider.getOrCreateClient(context, 'https://workspace.cloud.databricks.com'); + const c = provider.getOrCreateClient(context, 'https://WorkSpace.CLOUD.databricks.com/'); + const d = provider.getOrCreateClient(context, 'workspace.cloud.databricks.com:443'); + const e = provider.getOrCreateClient(context, ' workspace.cloud.databricks.com. '); expect(a).to.equal(b); expect(a).to.equal(c); @@ -297,12 +288,12 @@ describe('TelemetryClientProvider', () => { expect(provider.getRefCount('workspace.cloud.databricks.com')).to.equal(5); }); - it('should release under an alias correctly', () => { + it('should release under an alias correctly', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - provider.getOrCreateClient('example.com'); - provider.releaseClient('HTTPS://Example.COM/'); + provider.getOrCreateClient(context, 'example.com'); + await provider.releaseClient(context, 'HTTPS://Example.COM/'); expect(provider.getRefCount('example.com')).to.equal(0); expect(provider.getActiveClients().size).to.equal(0); @@ -312,18 +303,18 @@ describe('TelemetryClientProvider', () => { describe('Reference counting', () => { it('should track reference counts independently per host', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - provider.getOrCreateClient(HOST1); - provider.getOrCreateClient(HOST1); - provider.getOrCreateClient(HOST2); - provider.getOrCreateClient(HOST2); - provider.getOrCreateClient(HOST2); + provider.getOrCreateClient(context, HOST1); + provider.getOrCreateClient(context, HOST1); + provider.getOrCreateClient(context, HOST2); + provider.getOrCreateClient(context, HOST2); + provider.getOrCreateClient(context, HOST2); 
expect(provider.getRefCount(HOST1)).to.equal(2); expect(provider.getRefCount(HOST2)).to.equal(3); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(provider.getRefCount(HOST1)).to.equal(1); expect(provider.getRefCount(HOST2)).to.equal(3); @@ -331,21 +322,21 @@ describe('TelemetryClientProvider', () => { it('should close only last connection for each host', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client1 = provider.getOrCreateClient(HOST1); - provider.getOrCreateClient(HOST1); - const client2 = provider.getOrCreateClient(HOST2); + const client1 = provider.getOrCreateClient(context, HOST1); + provider.getOrCreateClient(context, HOST1); + const client2 = provider.getOrCreateClient(context, HOST2); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(client1.isClosed()).to.be.false; expect(provider.getActiveClients().size).to.equal(2); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(client1.isClosed()).to.be.true; expect(provider.getActiveClients().size).to.equal(1); - provider.releaseClient(HOST2); + await provider.releaseClient(context, HOST2); expect(client2.isClosed()).to.be.true; expect(provider.getActiveClients().size).to.equal(0); }); @@ -354,10 +345,10 @@ describe('TelemetryClientProvider', () => { describe('Per-host isolation', () => { it('should isolate clients by host', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client1 = provider.getOrCreateClient(HOST1); - const client2 = provider.getOrCreateClient(HOST2); + const client1 = provider.getOrCreateClient(context, HOST1); + const client2 = provider.getOrCreateClient(context, HOST2); expect(client1.getHost()).to.equal(HOST1); 
expect(client2.getHost()).to.equal(HOST2); @@ -366,12 +357,12 @@ describe('TelemetryClientProvider', () => { it('should allow closing one host without affecting others', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client1 = provider.getOrCreateClient(HOST1); - const client2 = provider.getOrCreateClient(HOST2); + const client1 = provider.getOrCreateClient(context, HOST1); + const client2 = provider.getOrCreateClient(context, HOST2); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); expect(client1.isClosed()).to.be.true; expect(client2.isClosed()).to.be.false; @@ -382,19 +373,19 @@ describe('TelemetryClientProvider', () => { describe('getRefCount', () => { it('should return 0 for non-existent host', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); expect(provider.getRefCount(HOST1)).to.equal(0); }); it('should return current reference count for existing host', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); expect(provider.getRefCount(HOST1)).to.equal(1); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); expect(provider.getRefCount(HOST1)).to.equal(2); }); }); @@ -402,7 +393,7 @@ describe('TelemetryClientProvider', () => { describe('getActiveClients', () => { it('should return empty map initially', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); const clients = provider.getActiveClients(); @@ -411,10 +402,10 @@ describe('TelemetryClientProvider', () => { it('should return all active 
clients', () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - const client1 = provider.getOrCreateClient(HOST1); - const client2 = provider.getOrCreateClient(HOST2); + const client1 = provider.getOrCreateClient(context, HOST1); + const client2 = provider.getOrCreateClient(context, HOST2); const clients = provider.getActiveClients(); @@ -425,12 +416,12 @@ describe('TelemetryClientProvider', () => { it('should not include closed clients', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - provider.getOrCreateClient(HOST1); - provider.getOrCreateClient(HOST2); + provider.getOrCreateClient(context, HOST1); + provider.getOrCreateClient(context, HOST2); - provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); const clients = provider.getActiveClients(); @@ -444,9 +435,9 @@ describe('TelemetryClientProvider', () => { it('should use logger from context for all logging', () => { const context = new ClientContextStub(); const logSpy = sinon.spy(context.logger, 'log'); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); - provider.getOrCreateClient(HOST1); + provider.getOrCreateClient(context, HOST1); expect(logSpy.called).to.be.true; logSpy.getCalls().forEach((call) => { @@ -456,13 +447,13 @@ describe('TelemetryClientProvider', () => { it('should log close errors at debug level', async () => { const context = new ClientContextStub(); - const provider = new TelemetryClientProvider(context); + const provider = new TelemetryClientProvider(); const logSpy = sinon.spy(context.logger, 'log'); - const client = provider.getOrCreateClient(HOST1); + const client = provider.getOrCreateClient(context, HOST1); sinon.stub(client, 'close').throws(new Error('Test error')); - 
provider.releaseClient(HOST1); + await provider.releaseClient(context, HOST1); const errorLogs = logSpy.getCalls().filter((call) => /error releasing/i.test(String(call.args[1]))); expect(errorLogs.length).to.be.greaterThan(0); diff --git a/tests/unit/telemetry/TelemetryEventEmitter.test.ts b/tests/unit/telemetry/TelemetryEventEmitter.test.ts index 6fc29107..7a7585ce 100644 --- a/tests/unit/telemetry/TelemetryEventEmitter.test.ts +++ b/tests/unit/telemetry/TelemetryEventEmitter.test.ts @@ -17,187 +17,777 @@ import { expect } from 'chai'; import sinon from 'sinon'; import TelemetryEventEmitter from '../../../lib/telemetry/TelemetryEventEmitter'; -import { TelemetryEventType } from '../../../lib/telemetry/types'; -import ClientContextStub from '../.stubs/ClientContextStub'; -import { LogLevel } from '../../../lib/contracts/IDBSQLLogger'; - -function makeEmitter(enabled: boolean): TelemetryEventEmitter { - const context = new ClientContextStub({ telemetryEnabled: enabled } as any); - return new TelemetryEventEmitter(context); -} +import { TelemetryEventType, TelemetryEvent, DriverConfiguration } from '../../../lib/telemetry/types'; +import IClientContext from '../../../lib/contracts/IClientContext'; +import IDBSQLLogger, { LogLevel } from '../../../lib/contracts/IDBSQLLogger'; describe('TelemetryEventEmitter', () => { - describe('when telemetry is disabled', () => { - it('should not emit any events', () => { - const emitter = makeEmitter(false); - const listener = sinon.stub(); - emitter.on(TelemetryEventType.CONNECTION_OPEN, listener); + let context: IClientContext; + let logger: IDBSQLLogger; + let emitter: TelemetryEventEmitter; + + beforeEach(() => { + logger = { + log: sinon.stub(), + }; + + context = { + getLogger: () => logger, + getConfig: () => ({ + telemetryEnabled: true, + directResultsDefaultMaxRows: 10000, + fetchChunkDefaultMaxRows: 100000, + socketTimeout: 900000, + retryMaxAttempts: 30, + retriesTimeout: 900000, + retryDelayMin: 1000, + retryDelayMax: 
30000, + useCloudFetch: true, + cloudFetchConcurrentDownloads: 10, + cloudFetchSpeedThresholdMBps: 0, + useLZ4Compression: true, + }), + } as any; + + emitter = new TelemetryEventEmitter(context); + }); - emitter.emitConnectionOpen({ sessionId: 's1', workspaceId: 'w1', driverConfig: {} as any }); + afterEach(() => { + sinon.restore(); + }); - expect(listener.called).to.be.false; + describe('constructor', () => { + it('should create instance with telemetry enabled', () => { + expect(emitter).to.be.instanceOf(TelemetryEventEmitter); }); - it('should not emit statement start', () => { - const emitter = makeEmitter(false); - const listener = sinon.stub(); - emitter.on(TelemetryEventType.STATEMENT_START, listener); + it('should create instance with telemetry disabled', () => { + const disabledContext = { + getLogger: () => logger, + getConfig: () => ({ + telemetryEnabled: false, + directResultsDefaultMaxRows: 10000, + fetchChunkDefaultMaxRows: 100000, + socketTimeout: 900000, + retryMaxAttempts: 30, + retriesTimeout: 900000, + retryDelayMin: 1000, + retryDelayMax: 30000, + useCloudFetch: true, + cloudFetchConcurrentDownloads: 10, + cloudFetchSpeedThresholdMBps: 0, + useLZ4Compression: true, + }), + } as any; + + const disabledEmitter = new TelemetryEventEmitter(disabledContext); + expect(disabledEmitter).to.be.instanceOf(TelemetryEventEmitter); + }); - emitter.emitStatementStart({ statementId: 'st1', sessionId: 's1' }); + it('should default to disabled when telemetryEnabled is undefined', () => { + const defaultContext = { + getLogger: () => logger, + getConfig: () => ({ + directResultsDefaultMaxRows: 10000, + fetchChunkDefaultMaxRows: 100000, + socketTimeout: 900000, + retryMaxAttempts: 30, + retriesTimeout: 900000, + retryDelayMin: 1000, + retryDelayMax: 30000, + useCloudFetch: true, + cloudFetchConcurrentDownloads: 10, + cloudFetchSpeedThresholdMBps: 0, + useLZ4Compression: true, + }), + } as any; + + const defaultEmitter = new TelemetryEventEmitter(defaultContext); 
+ expect(defaultEmitter).to.be.instanceOf(TelemetryEventEmitter); + }); + }); - expect(listener.called).to.be.false; + describe('emitConnectionOpen', () => { + it('should emit connection.open event with correct data', (done) => { + const driverConfig: DriverConfiguration = { + driverVersion: '1.0.0', + driverName: 'databricks-sql-nodejs', + nodeVersion: process.version, + platform: process.platform, + osVersion: 'test-os', + osArch: 'x64', + runtimeVendor: 'Node.js Foundation', + localeName: 'en_US', + charSetEncoding: 'UTF-8', + processName: 'node', + authType: 'pat', + cloudFetchEnabled: true, + lz4Enabled: true, + arrowEnabled: false, + directResultsEnabled: true, + socketTimeout: 900000, + retryMaxAttempts: 30, + cloudFetchConcurrentDownloads: 10, + }; + + emitter.on(TelemetryEventType.CONNECTION_OPEN, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.CONNECTION_OPEN); + expect(event.sessionId).to.equal('session-123'); + expect(event.workspaceId).to.equal('workspace-456'); + expect(event.driverConfig).to.deep.equal(driverConfig); + expect(event.timestamp).to.be.a('number'); + done(); + }); + + emitter.emitConnectionOpen({ + sessionId: 'session-123', + workspaceId: 'workspace-456', + driverConfig, + latencyMs: 100, + }); }); - it('should not emit error events', () => { - const emitter = makeEmitter(false); - const listener = sinon.stub(); - emitter.on(TelemetryEventType.ERROR, listener); + it('should not emit when telemetry is disabled', () => { + const disabledContext = { + getLogger: () => logger, + getConfig: () => ({ + telemetryEnabled: false, + directResultsDefaultMaxRows: 10000, + fetchChunkDefaultMaxRows: 100000, + socketTimeout: 900000, + retryMaxAttempts: 30, + retriesTimeout: 900000, + retryDelayMin: 1000, + retryDelayMax: 30000, + useCloudFetch: true, + cloudFetchConcurrentDownloads: 10, + cloudFetchSpeedThresholdMBps: 0, + useLZ4Compression: true, + }), + } as any; + + const disabledEmitter = new 
TelemetryEventEmitter(disabledContext); + let eventEmitted = false; + + disabledEmitter.on(TelemetryEventType.CONNECTION_OPEN, () => { + eventEmitted = true; + }); - emitter.emitError({ errorName: 'SomeError', errorMessage: 'msg', isTerminal: false }); + disabledEmitter.emitConnectionOpen({ + sessionId: 'session-123', + workspaceId: 'workspace-456', + driverConfig: {} as DriverConfiguration, + latencyMs: 100, + }); - expect(listener.called).to.be.false; + expect(eventEmitted).to.be.false; }); - }); - describe('emitConnectionOpen()', () => { - it('should emit a CONNECTION_OPEN event with correct fields', () => { - const emitter = makeEmitter(true); - const listener = sinon.stub(); - emitter.on(TelemetryEventType.CONNECTION_OPEN, listener); + it('should swallow exceptions and log at debug level', () => { + // Register a listener that throws so the emit path fails; + // emitConnectionOpen must swallow the error and log it at debug level. + emitter.on(TelemetryEventType.CONNECTION_OPEN, () => { + throw new Error('Test error'); + }); - emitter.emitConnectionOpen({ sessionId: 's1', workspaceId: 'w1', driverConfig: {} as any }); + emitter.emitConnectionOpen({ + sessionId: 'session-123', + workspaceId: 'workspace-456', + driverConfig: {} as DriverConfiguration, + latencyMs: 100, + }); - expect(listener.calledOnce).to.be.true; - const event = listener.firstCall.args[0]; - expect(event.eventType).to.equal(TelemetryEventType.CONNECTION_OPEN); - expect(event.sessionId).to.equal('s1'); - expect(event.workspaceId).to.equal('w1'); - expect(event.timestamp).to.be.a('number'); + expect((logger.log as sinon.SinonStub).calledWith(LogLevel.debug)).to.be.true; + expect((logger.log as sinon.SinonStub).args[0][1]).to.include('Error emitting connection.open'); }); - it('should swallow and log exceptions from listeners', () => { - const context = new ClientContextStub({ telemetryEnabled: true } as any); - const logSpy = sinon.spy(context.logger, 'log'); - const emitter = new 
TelemetryEventEmitter(context); - + it('should not log at warn or error level', () => { emitter.on(TelemetryEventType.CONNECTION_OPEN, () => { - throw new Error('listener boom'); + throw new Error('Test error'); + }); + + emitter.emitConnectionOpen({ + sessionId: 'session-123', + workspaceId: 'workspace-456', + driverConfig: {} as DriverConfiguration, + latencyMs: 100, + }); + + const logStub = logger.log as sinon.SinonStub; + for (let i = 0; i < logStub.callCount; i++) { + const level = logStub.args[i][0]; + expect(level).to.not.equal(LogLevel.warn); + expect(level).to.not.equal(LogLevel.error); + } + }); + }); + + describe('emitConnectionClose', () => { + it('emits connection.close with sessionId and latencyMs', (done) => { + emitter.on(TelemetryEventType.CONNECTION_CLOSE, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.CONNECTION_CLOSE); + expect(event.sessionId).to.equal('session-close-1'); + expect(event.latencyMs).to.equal(7); + expect(event.timestamp).to.be.a('number'); + done(); }); - expect(() => - emitter.emitConnectionOpen({ sessionId: 's1', workspaceId: 'w1', driverConfig: {} as any }), - ).to.not.throw(); - expect(logSpy.calledWith(LogLevel.debug, sinon.match(/listener boom/))).to.be.true; + emitter.emitConnectionClose({ sessionId: 'session-close-1', latencyMs: 7 }); + }); + + it('does not emit when telemetry is disabled', () => { + const disabledContext = { + getLogger: () => logger, + getConfig: () => ({ + telemetryEnabled: false, + directResultsDefaultMaxRows: 10000, + fetchChunkDefaultMaxRows: 100000, + socketTimeout: 900000, + retryMaxAttempts: 30, + retriesTimeout: 900000, + retryDelayMin: 1000, + retryDelayMax: 30000, + useCloudFetch: true, + cloudFetchConcurrentDownloads: 10, + cloudFetchSpeedThresholdMBps: 0, + useLZ4Compression: true, + }), + } as any; + + const disabledEmitter = new TelemetryEventEmitter(disabledContext); + let emitted = false; + disabledEmitter.on(TelemetryEventType.CONNECTION_CLOSE, () => 
{ + emitted = true; + }); - logSpy.restore(); + disabledEmitter.emitConnectionClose({ sessionId: 's', latencyMs: 1 }); + expect(emitted).to.be.false; + }); + + it('swallows exceptions from listeners and logs at debug level', () => { + emitter.on(TelemetryEventType.CONNECTION_CLOSE, () => { + throw new Error('listener boom'); + }); + + expect(() => emitter.emitConnectionClose({ sessionId: 's', latencyMs: 1 })).to.not.throw(); + const logStub = logger.log as sinon.SinonStub; + const debugCalls = logStub.getCalls().filter((c) => c.args[0] === LogLevel.debug); + expect(debugCalls.some((c) => /connection\.close/i.test(c.args[1]))).to.be.true; }); }); - describe('emitStatementStart()', () => { - it('should emit a STATEMENT_START event with correct fields', () => { - const emitter = makeEmitter(true); - const listener = sinon.stub(); - emitter.on(TelemetryEventType.STATEMENT_START, listener); + describe('emitStatementStart', () => { + it('should emit statement.start event with correct data', (done) => { + emitter.on(TelemetryEventType.STATEMENT_START, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.STATEMENT_START); + expect(event.statementId).to.equal('stmt-789'); + expect(event.sessionId).to.equal('session-123'); + expect(event.operationType).to.equal('SELECT'); + expect(event.timestamp).to.be.a('number'); + done(); + }); + + emitter.emitStatementStart({ + statementId: 'stmt-789', + sessionId: 'session-123', + operationType: 'SELECT', + }); + }); + + it('should emit without operationType', (done) => { + emitter.on(TelemetryEventType.STATEMENT_START, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.STATEMENT_START); + expect(event.statementId).to.equal('stmt-789'); + expect(event.sessionId).to.equal('session-123'); + expect(event.operationType).to.be.undefined; + done(); + }); + + emitter.emitStatementStart({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); + }); + + it('should not emit when 
telemetry is disabled', () => { + const disabledContext = { + getLogger: () => logger, + getConfig: () => ({ telemetryEnabled: false }), + } as any; + + const disabledEmitter = new TelemetryEventEmitter(disabledContext); + let eventEmitted = false; + + disabledEmitter.on(TelemetryEventType.STATEMENT_START, () => { + eventEmitted = true; + }); - emitter.emitStatementStart({ statementId: 'st1', sessionId: 's1', operationType: 'SELECT' }); + disabledEmitter.emitStatementStart({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); - expect(listener.calledOnce).to.be.true; - const event = listener.firstCall.args[0]; - expect(event.eventType).to.equal(TelemetryEventType.STATEMENT_START); - expect(event.statementId).to.equal('st1'); - expect(event.operationType).to.equal('SELECT'); + expect(eventEmitted).to.be.false; + }); + + it('should swallow exceptions and log at debug level', () => { + emitter.on(TelemetryEventType.STATEMENT_START, () => { + throw new Error('Test error'); + }); + + emitter.emitStatementStart({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); + + expect((logger.log as sinon.SinonStub).calledWith(LogLevel.debug)).to.be.true; + expect((logger.log as sinon.SinonStub).args[0][1]).to.include('Error emitting statement.start'); }); }); - describe('emitStatementComplete()', () => { - it('should emit a STATEMENT_COMPLETE event with correct fields', () => { - const emitter = makeEmitter(true); - const listener = sinon.stub(); - emitter.on(TelemetryEventType.STATEMENT_COMPLETE, listener); + describe('emitStatementComplete', () => { + it('should emit statement.complete event with all data fields', (done) => { + emitter.on(TelemetryEventType.STATEMENT_COMPLETE, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.STATEMENT_COMPLETE); + expect(event.statementId).to.equal('stmt-789'); + expect(event.sessionId).to.equal('session-123'); + expect(event.latencyMs).to.equal(1500); + 
expect(event.resultFormat).to.equal('cloudfetch'); + expect(event.chunkCount).to.equal(5); + expect(event.bytesDownloaded).to.equal(1024000); + expect(event.pollCount).to.equal(3); + expect(event.timestamp).to.be.a('number'); + done(); + }); emitter.emitStatementComplete({ - statementId: 'st1', - sessionId: 's1', - latencyMs: 123, - resultFormat: 'arrow', - chunkCount: 2, - bytesDownloaded: 1024, + statementId: 'stmt-789', + sessionId: 'session-123', + latencyMs: 1500, + resultFormat: 'cloudfetch', + chunkCount: 5, + bytesDownloaded: 1024000, pollCount: 3, }); + }); + + it('should emit with minimal data', (done) => { + emitter.on(TelemetryEventType.STATEMENT_COMPLETE, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.STATEMENT_COMPLETE); + expect(event.statementId).to.equal('stmt-789'); + expect(event.sessionId).to.equal('session-123'); + expect(event.latencyMs).to.be.undefined; + expect(event.resultFormat).to.be.undefined; + done(); + }); + + emitter.emitStatementComplete({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); + }); + + it('should not emit when telemetry is disabled', () => { + const disabledContext = { + getLogger: () => logger, + getConfig: () => ({ telemetryEnabled: false }), + } as any; + + const disabledEmitter = new TelemetryEventEmitter(disabledContext); + let eventEmitted = false; + + disabledEmitter.on(TelemetryEventType.STATEMENT_COMPLETE, () => { + eventEmitted = true; + }); + + disabledEmitter.emitStatementComplete({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); - expect(listener.calledOnce).to.be.true; - const event = listener.firstCall.args[0]; - expect(event.eventType).to.equal(TelemetryEventType.STATEMENT_COMPLETE); - expect(event.latencyMs).to.equal(123); - expect(event.chunkCount).to.equal(2); + expect(eventEmitted).to.be.false; + }); + + it('should swallow exceptions and log at debug level', () => { + emitter.on(TelemetryEventType.STATEMENT_COMPLETE, () => { + throw new 
Error('Test error'); + }); + + emitter.emitStatementComplete({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); + + expect((logger.log as sinon.SinonStub).calledWith(LogLevel.debug)).to.be.true; + expect((logger.log as sinon.SinonStub).args[0][1]).to.include('Error emitting statement.complete'); }); }); - describe('emitCloudFetchChunk()', () => { - it('should emit a CLOUDFETCH_CHUNK event with correct fields', () => { - const emitter = makeEmitter(true); - const listener = sinon.stub(); - emitter.on(TelemetryEventType.CLOUDFETCH_CHUNK, listener); + describe('emitCloudFetchChunk', () => { + it('should emit cloudfetch.chunk event with correct data', (done) => { + emitter.on(TelemetryEventType.CLOUDFETCH_CHUNK, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.CLOUDFETCH_CHUNK); + expect(event.statementId).to.equal('stmt-789'); + expect(event.chunkIndex).to.equal(2); + expect(event.latencyMs).to.equal(250); + expect(event.bytes).to.equal(204800); + expect(event.compressed).to.be.true; + expect(event.timestamp).to.be.a('number'); + done(); + }); + + emitter.emitCloudFetchChunk({ + statementId: 'stmt-789', + chunkIndex: 2, + latencyMs: 250, + bytes: 204800, + compressed: true, + }); + }); + + it('should emit without optional fields', (done) => { + emitter.on(TelemetryEventType.CLOUDFETCH_CHUNK, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.CLOUDFETCH_CHUNK); + expect(event.statementId).to.equal('stmt-789'); + expect(event.chunkIndex).to.equal(0); + expect(event.bytes).to.equal(100000); + expect(event.latencyMs).to.be.undefined; + expect(event.compressed).to.be.undefined; + done(); + }); + + emitter.emitCloudFetchChunk({ + statementId: 'stmt-789', + chunkIndex: 0, + bytes: 100000, + }); + }); + + it('should not emit when telemetry is disabled', () => { + const disabledContext = { + getLogger: () => logger, + getConfig: () => ({ telemetryEnabled: false }), + } as any; - 
emitter.emitCloudFetchChunk({ statementId: 'st1', chunkIndex: 0, bytes: 512, compressed: true }); + const disabledEmitter = new TelemetryEventEmitter(disabledContext); + let eventEmitted = false; - expect(listener.calledOnce).to.be.true; - const event = listener.firstCall.args[0]; - expect(event.eventType).to.equal(TelemetryEventType.CLOUDFETCH_CHUNK); - expect(event.bytes).to.equal(512); - expect(event.compressed).to.be.true; + disabledEmitter.on(TelemetryEventType.CLOUDFETCH_CHUNK, () => { + eventEmitted = true; + }); + + disabledEmitter.emitCloudFetchChunk({ + statementId: 'stmt-789', + chunkIndex: 0, + bytes: 100000, + }); + + expect(eventEmitted).to.be.false; + }); + + it('should swallow exceptions and log at debug level', () => { + emitter.on(TelemetryEventType.CLOUDFETCH_CHUNK, () => { + throw new Error('Test error'); + }); + + emitter.emitCloudFetchChunk({ + statementId: 'stmt-789', + chunkIndex: 0, + bytes: 100000, + }); + + expect((logger.log as sinon.SinonStub).calledWith(LogLevel.debug)).to.be.true; + expect((logger.log as sinon.SinonStub).args[0][1]).to.include('Error emitting cloudfetch.chunk'); }); }); - describe('emitError()', () => { - it('should emit an ERROR event with correct fields', () => { - const emitter = makeEmitter(true); - const listener = sinon.stub(); - emitter.on(TelemetryEventType.ERROR, listener); + describe('emitError', () => { + it('should emit error event with all fields', (done) => { + emitter.on(TelemetryEventType.ERROR, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.ERROR); + expect(event.statementId).to.equal('stmt-789'); + expect(event.sessionId).to.equal('session-123'); + expect(event.errorName).to.equal('AuthenticationError'); + expect(event.errorMessage).to.equal('Invalid credentials'); + expect(event.isTerminal).to.be.true; + expect(event.timestamp).to.be.a('number'); + done(); + }); + + emitter.emitError({ + statementId: 'stmt-789', + sessionId: 'session-123', + errorName: 
'AuthenticationError', + errorMessage: 'Invalid credentials', + isTerminal: true, + }); + }); + + it('should emit error event with minimal fields', (done) => { + emitter.on(TelemetryEventType.ERROR, (event: TelemetryEvent) => { + expect(event.eventType).to.equal(TelemetryEventType.ERROR); + expect(event.errorName).to.equal('TimeoutError'); + expect(event.errorMessage).to.equal('Request timed out'); + expect(event.isTerminal).to.be.false; + expect(event.statementId).to.be.undefined; + expect(event.sessionId).to.be.undefined; + done(); + }); emitter.emitError({ - statementId: 'st1', - sessionId: 's1', - errorName: 'NetworkError', - errorMessage: 'timeout', + errorName: 'TimeoutError', + errorMessage: 'Request timed out', isTerminal: false, }); + }); + + it('should not emit when telemetry is disabled', () => { + const disabledContext = { + getLogger: () => logger, + getConfig: () => ({ telemetryEnabled: false }), + } as any; + + const disabledEmitter = new TelemetryEventEmitter(disabledContext); + let eventEmitted = false; + + disabledEmitter.on(TelemetryEventType.ERROR, () => { + eventEmitted = true; + }); - expect(listener.calledOnce).to.be.true; - const event = listener.firstCall.args[0]; - expect(event.eventType).to.equal(TelemetryEventType.ERROR); - expect(event.errorName).to.equal('NetworkError'); - expect(event.isTerminal).to.be.false; + disabledEmitter.emitError({ + errorName: 'Error', + errorMessage: 'Test', + isTerminal: false, + }); + + expect(eventEmitted).to.be.false; }); - it('should emit a terminal ERROR event', () => { - const emitter = makeEmitter(true); - const listener = sinon.stub(); - emitter.on(TelemetryEventType.ERROR, listener); + it('should swallow exceptions and log at debug level', () => { + emitter.on(TelemetryEventType.ERROR, () => { + throw new Error('Test error'); + }); - emitter.emitError({ errorName: 'AuthenticationError', errorMessage: 'auth failed', isTerminal: true }); + emitter.emitError({ + errorName: 'Error', + errorMessage: 
'Test', + isTerminal: false, + }); - const event = listener.firstCall.args[0]; - expect(event.isTerminal).to.be.true; + expect((logger.log as sinon.SinonStub).calledWith(LogLevel.debug)).to.be.true; + expect((logger.log as sinon.SinonStub).args[0][1]).to.include('Error emitting telemetry.error'); }); }); - describe('logging level compliance', () => { - it('should never log at warn or error level', () => { - const context = new ClientContextStub({ telemetryEnabled: true } as any); - const logSpy = sinon.spy(context.logger, 'log'); - const emitter = new TelemetryEventEmitter(context); + describe('exception swallowing', () => { + it('should never propagate exceptions to caller', () => { + emitter.on(TelemetryEventType.CONNECTION_OPEN, () => { + throw new Error('Critical error'); + }); + + expect(() => { + emitter.emitConnectionOpen({ + sessionId: 'session-123', + workspaceId: 'workspace-456', + driverConfig: {} as DriverConfiguration, + latencyMs: 100, + }); + }).to.not.throw(); + }); + + it('should swallow multiple listener exceptions', () => { + emitter.on(TelemetryEventType.STATEMENT_START, () => { + throw new Error('First listener error'); + }); + emitter.on(TelemetryEventType.STATEMENT_START, () => { + throw new Error('Second listener error'); + }); + + expect(() => { + emitter.emitStatementStart({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); + }).to.not.throw(); + }); + + it('should log only at debug level, never at warn or error', () => { + emitter.on(TelemetryEventType.STATEMENT_COMPLETE, () => { + throw new Error('Test error'); + }); + emitter.on(TelemetryEventType.CLOUDFETCH_CHUNK, () => { + throw new Error('Test error'); + }); + emitter.on(TelemetryEventType.ERROR, () => { + throw new Error('Test error'); + }); + + emitter.emitStatementComplete({ + statementId: 'stmt-1', + sessionId: 'session-1', + }); + emitter.emitCloudFetchChunk({ + statementId: 'stmt-1', + chunkIndex: 0, + bytes: 1000, + }); + emitter.emitError({ + errorName: 'Error', + 
errorMessage: 'Test', + isTerminal: false, + }); + + const logStub = logger.log as sinon.SinonStub; + for (let i = 0; i < logStub.callCount; i++) { + const level = logStub.args[i][0]; + expect(level).to.equal(LogLevel.debug); + } + }); + }); + + describe('no console logging', () => { + it('should not use console.log', () => { + const consoleSpy = sinon.spy(console, 'log'); emitter.on(TelemetryEventType.CONNECTION_OPEN, () => { - throw new Error('boom'); + throw new Error('Test error'); + }); + + emitter.emitConnectionOpen({ + sessionId: 'session-123', + workspaceId: 'workspace-456', + driverConfig: {} as DriverConfiguration, + latencyMs: 100, + }); + + expect(consoleSpy.called).to.be.false; + consoleSpy.restore(); + }); + + it('should not use console.debug', () => { + const consoleSpy = sinon.spy(console, 'debug'); + + emitter.on(TelemetryEventType.STATEMENT_START, () => { + throw new Error('Test error'); }); - emitter.emitConnectionOpen({ sessionId: 's1', workspaceId: 'w1', driverConfig: {} as any }); + emitter.emitStatementStart({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); + + expect(consoleSpy.called).to.be.false; + consoleSpy.restore(); + }); + + it('should not use console.error', () => { + const consoleSpy = sinon.spy(console, 'error'); + + emitter.on(TelemetryEventType.ERROR, () => { + throw new Error('Test error'); + }); - expect(logSpy.neverCalledWith(LogLevel.error, sinon.match.any)).to.be.true; - expect(logSpy.neverCalledWith(LogLevel.warn, sinon.match.any)).to.be.true; + emitter.emitError({ + errorName: 'Error', + errorMessage: 'Test', + isTerminal: true, + }); + + expect(consoleSpy.called).to.be.false; + consoleSpy.restore(); + }); + }); + + describe('respects telemetryEnabled flag', () => { + it('should respect flag from context.getConfig()', () => { + const customContext = { + getLogger: () => logger, + getConfig: () => ({ + telemetryEnabled: true, + directResultsDefaultMaxRows: 10000, + fetchChunkDefaultMaxRows: 100000, + 
socketTimeout: 900000, + retryMaxAttempts: 30, + retriesTimeout: 900000, + retryDelayMin: 1000, + retryDelayMax: 30000, + useCloudFetch: true, + cloudFetchConcurrentDownloads: 10, + cloudFetchSpeedThresholdMBps: 0, + useLZ4Compression: true, + }), + } as any; + + const customEmitter = new TelemetryEventEmitter(customContext); + let eventCount = 0; + + customEmitter.on(TelemetryEventType.CONNECTION_OPEN, () => { + eventCount++; + }); + + customEmitter.emitConnectionOpen({ + sessionId: 'session-123', + workspaceId: 'workspace-456', + driverConfig: {} as DriverConfiguration, + latencyMs: 100, + }); + + expect(eventCount).to.equal(1); + }); + + it('should not emit when explicitly disabled', () => { + const disabledContext = { + getLogger: () => logger, + getConfig: () => ({ + telemetryEnabled: false, + }), + } as any; + + const disabledEmitter = new TelemetryEventEmitter(disabledContext); + let eventCount = 0; + + disabledEmitter.on(TelemetryEventType.CONNECTION_OPEN, () => { + eventCount++; + }); + disabledEmitter.on(TelemetryEventType.STATEMENT_START, () => { + eventCount++; + }); + disabledEmitter.on(TelemetryEventType.STATEMENT_COMPLETE, () => { + eventCount++; + }); + disabledEmitter.on(TelemetryEventType.CLOUDFETCH_CHUNK, () => { + eventCount++; + }); + disabledEmitter.on(TelemetryEventType.ERROR, () => { + eventCount++; + }); + + disabledEmitter.emitConnectionOpen({ + sessionId: 'session-123', + workspaceId: 'workspace-456', + driverConfig: {} as DriverConfiguration, + latencyMs: 100, + }); + disabledEmitter.emitStatementStart({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); + disabledEmitter.emitStatementComplete({ + statementId: 'stmt-789', + sessionId: 'session-123', + }); + disabledEmitter.emitCloudFetchChunk({ + statementId: 'stmt-789', + chunkIndex: 0, + bytes: 1000, + }); + disabledEmitter.emitError({ + errorName: 'Error', + errorMessage: 'Test', + isTerminal: false, + }); - logSpy.restore(); + expect(eventCount).to.equal(0); }); }); });