Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "agent-eval-rpc"
version = "0.40.4"
version = "0.40.5"
description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
readme = "README.md"
requires-python = ">=3.10"
Expand Down
2 changes: 1 addition & 1 deletion clients/python/src/agent_eval_rpc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
try:
__version__ = version("agent-eval-rpc")
except PackageNotFoundError:
__version__ = "0.40.4"
__version__ = "0.40.5"

__all__ = [
"Client",
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@tangle-network/agent-eval",
"version": "0.40.4",
"version": "0.40.5",
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
"homepage": "https://github.com/tangle-network/agent-eval#readme",
"repository": {
Expand Down
4 changes: 3 additions & 1 deletion src/campaign/gates/default-production-gate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,11 @@ export function defaultProductionGate<TArtifact, TScenario extends Scenario>(
const contributing: Array<{ name: string; passed: boolean; detail: unknown }> = []

// ── (1) heldout composite delta ─────────────────────────────────
// Baseline scores come from their OWN map (`baselineJudgeScores`). If the
// caller omits it, the `??` below falls back to `judgeScores`, which would
// compare the candidate against itself (delta 0).
const baselineComposite = meanComposite(
ctx.baselineArtifacts,
ctx.judgeScores,
ctx.baselineJudgeScores ?? ctx.judgeScores,
options.holdoutScenarios,
)
const candidateComposite = meanComposite(
Expand Down
10 changes: 5 additions & 5 deletions src/campaign/gates/heldout-gate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ export function heldOutGate<TArtifact, TScenario extends Scenario>(
name: 'heldOutGate',
async decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult> {
const scenarioIds = new Set(options.scenarios.map((s) => s.id))
const baseline = meanForScenarios(ctx.baselineArtifacts, ctx.judgeScores, scenarioIds)
const candidate = meanForScenarios(ctx.candidateArtifacts, ctx.judgeScores, scenarioIds)
// Baseline scores live in their OWN map (`baselineJudgeScores`). If the
// caller omits it, the `??` fallback reuses `judgeScores`, so the candidate
// is compared against itself and the delta is 0.
const baseline = meanForScenarios(ctx.baselineJudgeScores ?? ctx.judgeScores, scenarioIds)
const candidate = meanForScenarios(ctx.judgeScores, scenarioIds)
const delta = candidate - baseline
const passed = delta >= deltaThreshold
return {
Expand All @@ -39,12 +41,10 @@ export function heldOutGate<TArtifact, TScenario extends Scenario>(
}
}

function meanForScenarios<TArtifact>(
artifacts: Map<string, TArtifact> | undefined,
function meanForScenarios(
judgeScoresByCell: Map<string, Record<string, { composite: number }>>,
scenarioIds: Set<string>,
): number {
if (!artifacts || artifacts.size === 0) return 0
const composites: number[] = []
for (const [cellId, scores] of judgeScoresByCell) {
const scenarioId = cellId.split(':')[0] ?? ''
Expand Down
17 changes: 11 additions & 6 deletions src/campaign/presets/run-improvement-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,26 +107,31 @@ export async function runImprovementLoop<TScenario extends Scenario, TArtifact>(
})

// ── (3) gate verdict ───────────────────────────────────────────────
const candidateArtifacts = new Map<string, TArtifact>()
const baselineArtifacts = new Map<string, TArtifact>()
const judgeScores = new Map<
// Candidate + baseline share cellIds (same holdout scenarios), so their
// judge scores MUST stay in separate maps — merging them collapses the
// holdout delta to zero and the gate can never ship a real improvement.
type ScoreMap = Map<
string,
Record<string, { composite: number; dimensions: Record<string, number>; notes: string }>
>()
>
const candidateArtifacts = new Map<string, TArtifact>()
const baselineArtifacts = new Map<string, TArtifact>()
const judgeScores: ScoreMap = new Map()
const baselineJudgeScores: ScoreMap = new Map()
for (const cell of winnerOnHoldout.cells) {
candidateArtifacts.set(cell.cellId, cell.artifact)
judgeScores.set(cell.cellId, cell.judgeScores)
}
for (const cell of baselineOnHoldout.cells) {
baselineArtifacts.set(cell.cellId, cell.artifact)
const prior = judgeScores.get(cell.cellId) ?? {}
judgeScores.set(cell.cellId, { ...prior, ...cell.judgeScores })
baselineJudgeScores.set(cell.cellId, cell.judgeScores)
}

const gateResult = await opts.gate.decide({
candidateArtifacts,
baselineArtifacts,
judgeScores,
baselineJudgeScores,
scenarios: opts.holdoutScenarios,
cost: {
candidate: winnerOnHoldout.aggregates.totalCostUsd,
Expand Down
6 changes: 6 additions & 0 deletions src/campaign/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,13 @@ export type GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling'
export interface GateContext<TArtifact, TScenario extends Scenario> {
candidateArtifacts: Map<string, TArtifact>
baselineArtifacts?: Map<string, TArtifact>
/** Candidate (winner) judge scores, keyed by cellId. */
judgeScores: Map<string, Record<string, JudgeScore>>
/** Baseline judge scores, keyed by cellId. SEPARATE from `judgeScores` —
* baseline + candidate share cellIds (same scenarios), so a single map
* cannot represent both. A gate computing a holdout delta MUST read
* candidate from `judgeScores` and baseline from here. */
baselineJudgeScores?: Map<string, Record<string, JudgeScore>>
scenarios: TScenario[]
cost: { candidate: number; baseline: number }
signal: AbortSignal
Expand Down
59 changes: 36 additions & 23 deletions tests/campaign/presets.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,35 +101,48 @@ describe('composeGate', () => {
// ── heldOutGate ────────────────────────────────────────────────────

describe('heldOutGate', () => {
it('ships when candidate beats baseline by >= deltaThreshold', async () => {
const gate = heldOutGate({ scenarios: HOLDOUT, deltaThreshold: 0.5 })
const baseline = new Map([
['h1:0', null],
['h2:0', null],
])
const candidate = new Map([
['h1:0', null],
['h2:0', null],
])
const judgeScores = new Map<
string,
Record<string, { composite: number; dimensions: Record<string, number>; notes: string }>
>([
['h1:0', { judge: { composite: 9, dimensions: {}, notes: '' } }],
['h2:0', { judge: { composite: 8, dimensions: {}, notes: '' } }],
type Scores = Map<
string,
Record<string, { composite: number; dimensions: Record<string, number>; notes: string }>
>
const mk = (h1: number, h2: number): Scores =>
new Map([
['h1:0', { judge: { composite: h1, dimensions: {}, notes: '' } }],
['h2:0', { judge: { composite: h2, dimensions: {}, notes: '' } }],
])
const artifacts = new Map([
['h1:0', null],
['h2:0', null],
])

it('ships when candidate beats baseline by >= deltaThreshold (separate score maps)', async () => {
const gate = heldOutGate({ scenarios: HOLDOUT, deltaThreshold: 0.5 })
const result = await gate.decide({
candidateArtifacts: candidate as never,
baselineArtifacts: baseline as never,
judgeScores,
candidateArtifacts: artifacts as never,
baselineArtifacts: artifacts as never,
judgeScores: mk(9, 8), // candidate mean 8.5
baselineJudgeScores: mk(5, 4), // baseline mean 4.5 → delta 4.0
scenarios: HOLDOUT,
cost: { candidate: 0, baseline: 0 },
signal: new AbortController().signal,
})
// candidate composite is 8.5; baseline (no judge score map for baseline distinct from candidate) ~ 8.5 → delta 0
// adjust expectation: holdout gate compares against the SAME judgeScores map by cellId
expect(['ship', 'hold']).toContain(result.decision)
expect(result.delta).toBeDefined()
expect(result.delta).toBeCloseTo(4.0)
expect(result.decision).toBe('ship')
})

it('holds when the candidate does not beat baseline', async () => {
const gate = heldOutGate({ scenarios: HOLDOUT, deltaThreshold: 0.5 })
const result = await gate.decide({
candidateArtifacts: artifacts as never,
baselineArtifacts: artifacts as never,
judgeScores: mk(6, 6), // candidate 6
baselineJudgeScores: mk(6, 6), // baseline 6 → delta 0 < 0.5
scenarios: HOLDOUT,
cost: { candidate: 0, baseline: 0 },
signal: new AbortController().signal,
})
expect(result.delta).toBeCloseTo(0)
expect(result.decision).toBe('hold')
})
})

Expand Down
Loading