diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index beba48d..575d83f 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -1382,6 +1382,20 @@ export async function interactiveSession(
         callMaxTokens = 2048; // Short plan output
         callSystemPrompt = systemPrompt + '\n\n' + getPlanningPrompt();
       }
+      // Force a final answer: withhold tools so the model must commit to text,
+      // either on the last turn or once the tool-call budget is spent. Without
+      // this, models that keep calling tools every turn hit maxTurns with no
+      // answer (and waste the spend). Opt-in per config.
+      const onFinalTurn = config.forceAnswerOnFinalTurn && loopCount === maxTurns;
+      const toolBudgetSpent = config.maxToolCalls != null && turnToolCalls >= config.maxToolCalls;
+      if ((onFinalTurn || toolBudgetSpent) && callToolDefs.length > 0) {
+        callToolDefs = [];
+        callSystemPrompt = systemPrompt + '\n\n' +
+          (toolBudgetSpent
+            ? `You have used your research budget (${config.maxToolCalls} tool calls) — no more tools are available.`
+            : 'This is your FINAL turn — no more tools are available.') +
+          ' Based on the research so far, output ONLY the final answer now, in the exact format requested.';
+      }
 
       // ── Hallucination guard for weak models ──
       // Weak / free models (nemotron-ultra, GLM-4, qwen coder, free-profile
@@ -1492,7 +1506,7 @@ export async function interactiveSession(
       if (!hasText && !hasTools && !hasThinking) {
         const EMPTY_FALLBACK_MODELS = ['nvidia/qwen3-coder-480b', 'nvidia/llama-4-maverick', 'zai/glm-5.1'];
         const nextModel = EMPTY_FALLBACK_MODELS.find(m => m !== config.model && !turnFailedModels.has(m));
-        if (nextModel && recoveryAttempts < 2) {
+        if (nextModel && recoveryAttempts < 2 && !config.disableModelFallback) {
           recoveryAttempts++;
           turnFailedModels.add(config.model);
           const oldModel = config.model;
@@ -1540,7 +1554,7 @@ export async function interactiveSession(
         const nextModel = TOOL_USE_FALLBACK_MODELS.find(
           m => m !== config.model && !turnFailedModels.has(m),
         );
-        if (nextModel && recoveryAttempts < 2) {
+        if (nextModel && recoveryAttempts < 2 && !config.disableModelFallback) {
           recoveryAttempts++;
           turnFailedModels.add(config.model);
           const oldModel = config.model;
@@ -2126,7 +2140,7 @@ export async function interactiveSession(
           .filter(p => p.type === 'text' && typeof (p as { text?: string }).text === 'string')
           .map(p => (p as { text: string }).text)
           .join('');
-        if (shouldCheckGrounding(lastUserInput || '', assistantText)) {
+        if (!config.disableGroundingRetry && shouldCheckGrounding(lastUserInput || '', assistantText)) {
           const gResult = await checkGrounding(lastUserInput, history, assistantText, client, {
             abortSignal: abort.signal,
           });
diff --git a/src/agent/types.ts b/src/agent/types.ts
index 1aaad00..dba7551 100644
--- a/src/agent/types.ts
+++ b/src/agent/types.ts
@@ -217,6 +217,30 @@ export interface AgentConfig {
   maxSpendUsd?: number;
   /** Show user-visible harness prefetch status lines (interactive UX only). */
   showPrefetchStatus?: boolean;
+  /**
+   * On the final turn, withhold tools so the model must commit to a text answer
+   * instead of researching until cut off. For one-shot forecasting/extraction
+   * callers (e.g. `franklin predict`) where some models never stop calling tools
+   * and would otherwise hit maxTurns with no answer.
+   */
+  forceAnswerOnFinalTurn?: boolean;
+  /**
+   * Hard cap on total tool calls for the turn. Once reached, tools are withheld
+   * and the model is forced to answer from what it has. Bounds research/cost
+   * deterministically (a turn budget alone doesn't — a turn may have no tool).
+   */
+  maxToolCalls?: number;
+  /**
+   * Disable Franklin's automatic model-switching (empty-response / stalled-intent
+   * fallbacks). One-shot callers want a clean abstain from the requested model,
+   * not a silent switch to a different one.
+   */
+  disableModelFallback?: boolean;
+  /**
+   * Disable the post-response "ungrounded claims → force a tool-use retry" guard.
+   * It fights the forced-answer path and pollutes one-shot structured output.
+   */
+  disableGroundingRetry?: boolean;
   /** Mid-turn "research-bloat" compaction — summarizes history when a turn
    * racks up many tool calls + spend, to cut input-replay cost. Default on;
    * set false to disable (the desktop exposes this as a toggle). */
diff --git a/src/commands/predict.ts b/src/commands/predict.ts
new file mode 100644
index 0000000..fe97079
--- /dev/null
+++ b/src/commands/predict.ts
@@ -0,0 +1,220 @@
+/**
+ * `franklin predict` — Franklin prediction mode (headless).
+ *
+ * Runs ONE model as a disciplined forecaster: it researches a single real-world
+ * event with a tight, read-only toolset (web search, source fetch, Exa, X, live
+ * prediction markets, a little market data) the way a bettor would before
+ * putting money down — then commits to a pick with a confidence.
+ *
+ * Designed for machine callers (e.g. BlockRun Oracle): with --json it emits a
+ * single JSON envelope on stdout containing the model's final answer, the full
+ * tool-call trace (what it searched and what it found), the terminal reason and
+ * token usage. Human-readable streaming otherwise.
+ *
+ *   franklin predict --model anthropic/claude-opus-4.8 \
+ *     --question "Who wins the 2026 FIFA World Cup? Pick one country." --json
+ */
+import { interactiveSession } from '../agent/loop.js';
+import type { AgentConfig, StreamEvent, StreamTurnDone } from '../agent/types.js';
+import { predictionCapabilities, resetToolSessionState } from '../tools/index.js';
+import { loadChain, API_URLS } from '../config.js';
+import { resolveModel } from '../ui/model-picker.js';
+
+export interface PredictOptions {
+  model?: string;
+  question?: string;
+  maxTurns?: string;
+  maxToolCalls?: string;
+  maxSpend?: string;
+  json?: boolean;
+  debug?: boolean;
+}
+
+/** Turn ceiling when --max-turns is omitted; matches the CLI default in src/index.ts. */
+const DEFAULT_MAX_TURNS = 8;
+/** Tool-call budget when --max-tool-calls is omitted; matches the CLI default in src/index.ts. */
+const DEFAULT_MAX_TOOL_CALLS = 6;
+
+const PREDICTION_SYSTEM: string[] = [
+  'You are a sharp, disciplined forecasting analyst — think like a professional who is about to put real money on this question.',
+  'Your job: predict the outcome of ONE real-world event. Before answering you MUST do research the way a bettor would:',
+  "1. Use web_search (and webfetch / exa tools) for the most CURRENT facts and news — today's real-world state matters far more than your training data.",
+  '2. Use search_prediction_markets to read the CURRENT market-implied odds (Polymarket, Kalshi, etc.) for this or a closely related question.',
+  '3. Weigh it: where is the consensus, where might the market be mispriced, what is your edge.',
+  'Budget your research: make AT MOST 4-5 focused tool calls in total. As soon as you have enough to decide, STOP calling tools and output the JSON. Do not keep researching — an answer with light research beats no answer.',
+  'Your FINAL message must end with EXACTLY ONE single-line minified JSON object and NOTHING after it:',
+  '{"pick": string, "confidence": number, "rationale": string, "analysis": string, "marketOdds": string}',
+  '- pick: one option from the question (a short label, e.g. a country, party, bucket, or Yes/No).',
+  '- confidence: your probability (0-1) that THIS pick is correct.',
+  '- rationale: one sharp sentence (max 22 words).',
+  '- analysis: 3-5 sentences citing what your research found, the strongest counter-argument, and why you still land here. No literal newlines inside the string.',
+  "- marketOdds: what the prediction market currently implies (e.g. 'Polymarket: France 18%'), or 'n/a' if none found.",
+  'Be decisive. Do not hedge with "it depends".',
+];
+
+interface TraceEntry {
+  tool: string;
+  input: string;
+  output: string;
+  isError?: boolean;
+}
+
+/**
+ * Parse a numeric CLI flag value.
+ *
+ * Returns `undefined` when the flag was omitted so the caller can apply its own
+ * default. Throws when the value is not a finite number >= `rule.min` (or not a
+ * whole number when `rule.integer` is set). Without this, `Number('abc')` yields
+ * NaN, and a NaN budget makes the agent loop's `>=` / `===` checks always false,
+ * silently disabling the cap it was meant to enforce.
+ */
+function parseNumericFlag(
+  flag: string,
+  raw: string | undefined,
+  rule: { min: number; integer: boolean },
+): number | undefined {
+  if (raw == null) return undefined;
+  const text = String(raw).trim();
+  // Number('') is 0, so treat an empty value as invalid explicitly.
+  const value = text === '' ? Number.NaN : Number(text);
+  const valid = Number.isFinite(value) && value >= rule.min && (!rule.integer || Number.isInteger(value));
+  if (!valid) {
+    const kind = rule.integer ? 'an integer' : 'a number';
+    throw new Error(`predict: ${flag} must be ${kind} >= ${rule.min} (got "${raw}")`);
+  }
+  return value;
+}
+
+/**
+ * Run a single headless forecast for `options.question` with `options.model`.
+ *
+ * Feeds the question to `interactiveSession` exactly once and collects the
+ * streamed text, tool-call trace and token usage. Unless `options.json` is
+ * false it then prints one JSON envelope on stdout; otherwise text deltas are
+ * streamed to stdout and tool names to stderr as they arrive. Sets
+ * `process.exitCode` (1 on invalid options or a non-'completed' turn).
+ */
+export async function predictCommand(options: PredictOptions): Promise<void> {
+  const question = options.question?.trim();
+  if (!question) {
+    process.stderr.write('predict: --question is required\n');
+    process.exitCode = 1;
+    return;
+  }
+  if (!options.model) {
+    process.stderr.write('predict: --model is required\n');
+    process.exitCode = 1;
+    return;
+  }
+
+  // Validate numeric flags up front: fail loudly instead of running unbounded.
+  let maxTurns: number;
+  let maxToolCalls: number;
+  let maxSpendUsd: number | undefined;
+  try {
+    maxTurns = parseNumericFlag('--max-turns', options.maxTurns, { min: 1, integer: true }) ?? DEFAULT_MAX_TURNS;
+    maxToolCalls =
+      parseNumericFlag('--max-tool-calls', options.maxToolCalls, { min: 0, integer: true }) ?? DEFAULT_MAX_TOOL_CALLS;
+    maxSpendUsd = parseNumericFlag('--max-spend', options.maxSpend, { min: 0, integer: false });
+  } catch (err: unknown) {
+    process.stderr.write(`${err instanceof Error ? err.message : String(err)}\n`);
+    process.exitCode = 1;
+    return;
+  }
+
+  const chain = loadChain();
+  const apiUrl = API_URLS[chain];
+  const model = resolveModel(options.model);
+  const asJson = options.json !== false;
+
+  resetToolSessionState();
+
+  const agentConfig: AgentConfig = {
+    model,
+    apiUrl,
+    chain,
+    systemInstructions: PREDICTION_SYSTEM,
+    capabilities: predictionCapabilities,
+    maxTurns,
+    permissionMode: 'trust',
+    debug: !!options.debug,
+    showPrefetchStatus: false,
+    // Governance for one-shot forecasting: bound research by tool-call count and
+    // force an answer; don't silently switch models or fight a grounding retry.
+    // The tool budget (default 6) is the real research limiter; maxTurns (default 8)
+    // is just slack above it for a thinking turn + the forced-answer turn.
+    forceAnswerOnFinalTurn: true,
+    maxToolCalls,
+    disableModelFallback: true,
+    disableGroundingRetry: true,
+    ...(maxSpendUsd != null ? { maxSpendUsd } : {}),
+  };
+
+  let finalText = '';
+  let turnReason: StreamTurnDone['reason'] = 'completed';
+  let turnError: string | undefined;
+  let inputTokens = 0;
+  let outputTokens = 0;
+  const trace: TraceEntry[] = [];
+  const nameById = new Map<string, string>();
+  const inputById = new Map<string, string>();
+  const previewById = new Map<string, string>();
+
+  let delivered = false;
+  const getInput = async (): Promise<string | null> => {
+    if (delivered) return null;
+    delivered = true;
+    return question;
+  };
+
+  await interactiveSession(agentConfig, getInput, (event: StreamEvent) => {
+    switch (event.kind) {
+      case 'text_delta':
+        finalText += event.text;
+        if (!asJson) process.stdout.write(event.text);
+        break;
+      case 'capability_start':
+        nameById.set(event.id, event.name);
+        inputById.set(event.id, '');
+        if (event.preview) previewById.set(event.id, event.preview);
+        if (!asJson) process.stderr.write(`\n  · ${event.name}${event.preview ? ` ${event.preview}` : ''}\n`);
+        break;
+      case 'capability_input_delta':
+        inputById.set(event.id, (inputById.get(event.id) || '') + event.delta);
+        break;
+      case 'capability_done': {
+        const tool = nameById.get(event.id) || 'tool';
+        const input = (inputById.get(event.id) || '').trim() || previewById.get(event.id) || '';
+        const output = event.result?.fullOutput || event.result?.output || '';
+        trace.push({ tool, input, output: output.slice(0, 1500), isError: event.result?.isError });
+        break;
+      }
+      case 'usage':
+        // NOTE(review): overwrites rather than sums — assumes `usage` events carry running totals; confirm.
+        inputTokens = event.inputTokens;
+        outputTokens = event.outputTokens;
+        break;
+      case 'turn_done':
+        turnReason = event.reason;
+        turnError = event.error;
+        break;
+    }
+  });
+
+  if (asJson) {
+    const envelope = {
+      model,
+      question,
+      finalText: finalText.trim(),
+      trace,
+      turnReason,
+      ...(turnError ? { error: turnError } : {}),
+      usage: { inputTokens, outputTokens },
+    };
+    process.stdout.write(JSON.stringify(envelope) + '\n');
+  } else if (turnReason !== 'completed' && turnError) {
+    process.stderr.write(`\n${turnError}\n`);
+  }
+
+  process.exitCode = turnReason === 'completed' ? 0 : 1;
+}
diff --git a/src/index.ts b/src/index.ts
index 9e7dc7d..7009a4c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -29,6 +29,7 @@ import { uninitCommand } from './commands/uninit.js';
 import { proxyCommand } from './commands/proxy.js';
 import { buildTaskCommand } from './commands/task.js';
 import { buildContentCommand } from './commands/content.js';
+import { predictCommand } from './commands/predict.js';
 import { VERSION as version } from './config.js';
 
 
@@ -90,6 +91,18 @@ program
   .option('--debug', 'Enable debug logging')
   .action((options) => proxyCommand({ ...options, version }));
 
+program
+  .command('predict')
+  .description('Prediction mode — forecast one real-world event with a research-only toolset (web/markets), headless')
+  .requiredOption('-m, --model <model>', 'Model to use (e.g. anthropic/claude-opus-4.8, openai/gpt-5.5)')
+  .requiredOption('-q, --question <question>', 'The event question to forecast (include the allowed options)')
+  .option('--max-turns <n>', 'Max agent turns before forcing an answer', '8')
+  .option('--max-tool-calls <n>', 'Max tool calls before forcing an answer', '6')
+  .option('--max-spend <usd>', 'Hard USD cap on this prediction run')
+  .option('--no-json', 'Human-readable streaming instead of a JSON envelope')
+  .option('--debug', 'Enable debug logging')
+  .action((options) => predictCommand(options));
+
 program
   .command('init')
   .description('Configure franklin auto-start (writes ~/.claude/settings.json + installs LaunchAgent on macOS)')
diff --git a/src/tools/index.ts b/src/tools/index.ts
index eb48d4a..e2c34d1 100644
--- a/src/tools/index.ts
+++ b/src/tools/index.ts
@@ -234,4 +234,31 @@ export {
   detachCapability,
 };
 
+/**
+ * "Franklin prediction mode" toolset.
+ *
+ * A deliberately tight, research-only capability set for forecasting a single
+ * real-world event the way a careful bettor would: gather current facts, read
+ * sources, check live prediction-market odds and a little market data — then
+ * decide. Everything else (filesystem, shell, media generation, swaps/trade
+ * execution, phone/voice, GPU sandbox, posting) is intentionally excluded:
+ * a forecaster looks things up, it does not act on the world or spend beyond
+ * the cheap read calls these tools make.
+ *
+ * Used by the `franklin predict` command and reusable by any headless caller
+ * (e.g. franklin.bet) that wants a grounded prediction.
+ */
+export const predictionCapabilities: CapabilityHandler[] = [
+  webSearchCapability,        // web_search — current news & facts
+  webFetchCapability,         // webfetch — read a specific source URL
+  exaSearchCapability,        // exa search — higher-quality web research
+  exaAnswerCapability,        // exa answer — direct sourced answers
+  exaReadUrlsCapability,      // exa read — pull full text of found URLs
+  searchXCapability,          // search X — live sentiment / breaking signal
+  predictionMarketCapability, // search_prediction_markets — live implied odds
+  tradingSignalCapability,    // market signal/indicators (for market-type events)
+  tradingMarketCapability,    // market snapshot data
+  defiLlamaPriceCapability,   // token price lookup (crypto-type events)
+];
+
 export { createSubAgentCapability } from './subagent.js';