From 4a61412cd1e0a6953c424f1da6fb71836335768b Mon Sep 17 00:00:00 2001 From: t Date: Mon, 8 Jun 2026 13:25:07 +0800 Subject: [PATCH 1/3] feat(voice): /voice setup check + whisper.cpp detection (slice 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surface the existing core whisper.cpp engine via a `/voice` slash command and add the settings schema for it. No mic capture yet — this is the safe, self-contained foundation per docs/VOICE_INPUT.md. Core: - Add VoiceConfig (provider | binPath | modelPath) to settings types, re-exported from @deepcode/core (the JSON schema already had the block). - New detectVoice() (voice/detect.ts): resolves the whisper binary (settings.binPath, else whisper-cli/whisper on PATH) and the model (settings.modelPath, else ~/.deepcode/models/whisper-base.en.bin), never throws — missing pieces become `problems`. Injectable probes for deterministic tests. - validateSettingsShallow now flags an unknown voice.provider. CLI: - /voice reports readiness or prints actionable setup steps (+ per-issue detail); `/voice setup` always shows install instructions. - SessionContext gains an optional `home` (honors --home) for the default model-path probe; wired in the REPL. Tests: 9 core detection cases, 1 schema case, 3 CLI messaging cases. Updates the /voice BEHAVIOR_PARITY row (✗ → ✓, 🔄 → 🟡). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/cli/src/commands.ts | 55 ++++++++++ apps/cli/src/repl.ts | 1 + apps/cli/src/voice-cmd.test.ts | 85 +++++++++++++++ docs/BEHAVIOR_PARITY.md | 98 ++++++++--------- packages/core/src/config/index.ts | 1 + packages/core/src/config/schema.test.ts | 6 ++ packages/core/src/config/schema.ts | 6 ++ packages/core/src/config/types.ts | 15 +++ packages/core/src/index.ts | 6 +- packages/core/src/voice/detect.test.ts | 105 ++++++++++++++++++ packages/core/src/voice/detect.ts | 137 ++++++++++++++++++++++++ packages/core/src/voice/index.ts | 13 +++ 12 files changed, 478 insertions(+), 50 deletions(-) create mode 100644 apps/cli/src/voice-cmd.test.ts create mode 100644 packages/core/src/voice/detect.test.ts create mode 100644 packages/core/src/voice/detect.ts diff --git a/apps/cli/src/commands.ts b/apps/cli/src/commands.ts index af8a5e1..b7aab71 100644 --- a/apps/cli/src/commands.ts +++ b/apps/cli/src/commands.ts @@ -128,6 +128,9 @@ export interface SessionContext { credsStore?: CredentialsStore; /** User settings.json path (REPL-injected, honors --home) — backs /config set. */ userSettingsPath?: string; + /** Home dir override (REPL-injected from --home) — backs default-path lookups + * like /voice's `~/.deepcode/models/...` model probe. Defaults to os.homedir(). */ + home?: string; sessionId: string; sessions: SessionManager; usage: { @@ -1169,6 +1172,57 @@ export const TasksCommand: SlashCommand = { }, }; +export const VoiceCommand: SlashCommand = { + name: '/voice', + description: 'Check local voice-input (whisper.cpp) setup; `/voice setup` shows install steps.', + async run(args, ctx) { + const { detectVoice } = await import('@deepcode/core'); + const status = await detectVoice(ctx.settings.voice, { home: ctx.home }); + const forceSetup = (args[0] ?? 
'').toLowerCase() === 'setup'; + + if (status.ready && !forceSetup) { + return [ + '🎙 Voice input is ready — whisper.cpp, fully local (no audio leaves your machine).', + ` binary: ${status.binPath}`, + ` model: ${status.modelPath}`, + '', + 'Dictate from the REPL with the voice key (default Ctrl+V; remap in keybindings.json).', + 'Note: live mic capture lands in a follow-up — this step ships setup + detection.', + ]; + } + + const lines: string[] = [ + status.ready + ? '🎙 Voice input is ready. Setup reference below.' + : '🎙 Voice input is not set up yet. Enable local dictation (whisper.cpp — no cloud):', + '', + 'Detected:', + ` ${status.binPath ? '✓' : '✗'} whisper binary ${status.binPath ?? '(not found)'}`, + ` ${status.modelPath ? '✓' : '✗'} model ${status.modelPath ?? '(not found)'}`, + ]; + if (status.problems.length) { + lines.push('', 'Issues:'); + for (const p of status.problems) lines.push(` • ${p}`); + } + lines.push( + '', + 'Setup:', + ' 1. Install whisper.cpp', + ' macOS: brew install whisper-cpp', + ' Linux: build https://github.com/ggerganov/whisper.cpp, put `whisper` on PATH', + ' 2. Download a model (base.en ≈ 140 MB is a good default) and save it:', + ' mkdir -p ~/.deepcode/models', + ' cp ggml-base.en.bin ~/.deepcode/models/whisper-base.en.bin', + ' 3. 
(optional) Point DeepCode at custom paths in ~/.deepcode/settings.json:', + ' { "voice": { "binPath": "/opt/homebrew/bin/whisper-cli",', + ' "modelPath": "~/.deepcode/models/whisper-base.en.bin" } }', + '', + 'Full guide: docs/VOICE_INPUT.md', + ); + return lines; + }, +}; + export const BackgroundCommand: SlashCommand = { name: '/background', aliases: ['/bg'], @@ -1229,6 +1283,7 @@ export const BUILTIN_COMMANDS: SlashCommand[] = [ BtwCommand, TasksCommand, BackgroundCommand, + VoiceCommand, ]; // ────────────────────────────────────────────────────────────────────────── diff --git a/apps/cli/src/repl.ts b/apps/cli/src/repl.ts index 3e49860..5b109c5 100644 --- a/apps/cli/src/repl.ts +++ b/apps/cli/src/repl.ts @@ -437,6 +437,7 @@ export async function startRepl(opts: ReplOpts): Promise { creds, credsStore, userSettingsPath: settingsPaths({ cwd, home: opts.home }).userPath, + home: opts.home, sessionId: session.id, sessions, usage: { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 }, diff --git a/apps/cli/src/voice-cmd.test.ts b/apps/cli/src/voice-cmd.test.ts new file mode 100644 index 0000000..bbdac47 --- /dev/null +++ b/apps/cli/src/voice-cmd.test.ts @@ -0,0 +1,85 @@ +// Tests for the /voice slash command messaging. Detection logic itself is +// unit-tested in core (voice/detect.test.ts); here we drive the command end to +// end with real temp files so the "ready" path is deterministic, and bogus +// configured paths so the "not set up" path never depends on the host's PATH. 
+ +import { afterEach, describe, expect, it } from 'vitest'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { SessionManager } from '@deepcode/core'; +import { CommandRegistry, type SessionContext } from './commands.js'; + +const reg = new CommandRegistry(); +const tmps: string[] = []; +async function tmpDir(): Promise { + const d = await mkdtemp(join(tmpdir(), 'dc-voice-')); + tmps.push(d); + return d; +} +afterEach(async () => { + await Promise.all(tmps.splice(0).map((d) => rm(d, { recursive: true, force: true }))); +}); + +function ctx(overrides: Partial = {}): SessionContext { + return { + cwd: '/tmp/x', + model: 'deepseek-chat', + mode: 'default', + effort: 'medium', + settings: {}, + creds: { apiKey: 'sk-test' }, + sessionId: 's1', + sessions: new SessionManager({ root: '/tmp/x' }), + usage: { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 }, + ...overrides, + }; +} + +const run = (args: string[], c: SessionContext) => reg.match('/voice')!.cmd.run(args, c); + +describe('/voice', () => { + it('reports ready when configured binary + model both exist', async () => { + const dir = await tmpDir(); + const binPath = join(dir, 'whisper-cli'); + const modelPath = join(dir, 'model.bin'); + await writeFile(binPath, '#!/bin/sh\n'); + await writeFile(modelPath, 'GGML'); + const out = (await run([], ctx({ settings: { voice: { binPath, modelPath } } }))).join('\n'); + expect(out).toMatch(/ready/i); + expect(out).toContain(binPath); + expect(out).toContain(modelPath); + expect(out).toMatch(/Ctrl\+V/); + }); + + it('prints setup steps + issues when configured paths are missing', async () => { + const out = ( + await run( + [], + ctx({ settings: { voice: { binPath: '/no/such/whisper', modelPath: '/no/such/m.bin' } } }), + ) + ).join('\n'); + expect(out).toMatch(/not set up yet/i); + expect(out).toMatch(/brew install whisper-cpp/); + 
expect(out).toMatch(/docs\/VOICE_INPUT\.md/); + // The specific configured-but-missing problems surface under "Issues:". + expect(out).toMatch(/Issues:/); + expect(out).toContain('Configured voice.binPath not found: /no/such/whisper'); + expect(out).toContain('Configured voice.modelPath not found: /no/such/m.bin'); + }); + + it('`/voice setup` always shows install steps, even when ready', async () => { + const dir = await tmpDir(); + const binPath = join(dir, 'whisper-cli'); + const modelPath = join(dir, 'model.bin'); + await writeFile(binPath, ''); + await writeFile(modelPath, ''); + const out = (await run(['setup'], ctx({ settings: { voice: { binPath, modelPath } } }))).join( + '\n', + ); + expect(out).toMatch(/Setup:/); + expect(out).toMatch(/brew install whisper-cpp/); + // Still acknowledges it's already ready. + expect(out).toMatch(/ready/i); + }); +}); diff --git a/docs/BEHAVIOR_PARITY.md b/docs/BEHAVIOR_PARITY.md index eb5f547..c5c3821 100644 --- a/docs/BEHAVIOR_PARITY.md +++ b/docs/BEHAVIOR_PARITY.md @@ -21,55 +21,55 @@ Legend: `✅` matches · `🟡` matches with caveats · `🔄` deferred · `⚠ ## Slash commands (30+ in Claude Code, ~32 shipped in DeepCode) -| Command | Claude Code | DeepCode | Status | -| -------------------------- | ----------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------ | -| `/help` | ✓ | ✓ | ✅ | -| `/clear` | ✓ | ✓ | ✅ | -| `/exit` / `/quit` | ✓ | ✓ | ✅ | -| `/status` / `/doctor` | ✓ | ✓ | ✅ | -| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | -| `/mode` | ✓ | ✓ | ✅ | -| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | -| `/cost` / `/usage` | ✓ | ✓ | ✅ | -| `/context` | ✓ | ✓ | ✅ | -| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes 
user settings; no full arrow-key editor | -| `/resume` | ✓ | ✓ | ✅ — lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | -| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | -| `/mcp` | ✓ | ✓ | ✅ | -| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | -| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | -| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | -| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | -| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | -| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | -| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | -| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | -| `/voice` | ✓ | ✗ | 🔄 M8 | -| `/teleport` | ✓ | ✗ | 🔄 M8 | -| `/desktop` | ✓ | ✗ | 🔄 M6 | -| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | -| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | -| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | -| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | -| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | -| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | -| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | -| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | -| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` 
comments for the current branch's PR | -| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | -| `/loop` | ✓ | ✗ (skill avail) | 🟡 | -| `/terminal-setup` | ✓ | ✗ | 🔄 | -| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | -| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | -| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | -| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | -| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | -| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | -| `/privacy-settings` | ✓ | ✓ | ✅ — summarizes local data locations + what's sent to the DeepSeek API (read-only) | -| `/migrate-installer` | ✓ | ✗ | 🔄 | -| `/release-notes` | ✓ | ✓ | ✅ — prints the latest `CHANGELOG.md` entry | +| Command | Claude Code | DeepCode | Status | +| -------------------------- | ----------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `/help` | ✓ | ✓ | ✅ | +| `/clear` | ✓ | ✓ | ✅ | +| `/exit` / `/quit` | ✓ | ✓ | ✅ | +| `/status` / `/doctor` | ✓ | ✓ | ✅ | +| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | +| `/mode` | ✓ | ✓ | ✅ | +| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | +| `/cost` / `/usage` | ✓ | ✓ | ✅ | +| `/context` | ✓ | ✓ | ✅ | +| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes user settings; no full arrow-key editor | +| `/resume` | ✓ | ✓ | ✅ — 
lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | +| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | +| `/mcp` | ✓ | ✓ | ✅ | +| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | +| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | +| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | +| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | +| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | +| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | +| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | +| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | +| `/voice` | ✓ | ✓ | 🟡 — `/voice` detects whisper.cpp + a model and prints setup steps (docs/VOICE_INPUT.md); core `WhisperCppProvider` is wired; live mic capture lands in a follow-up slice | +| `/teleport` | ✓ | ✗ | 🔄 M8 | +| `/desktop` | ✓ | ✗ | 🔄 M6 | +| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | +| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | +| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | +| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | +| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | +| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | +| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | +| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g 
deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | +| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` comments for the current branch's PR | +| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | +| `/loop` | ✓ | ✗ (skill avail) | 🟡 | +| `/terminal-setup` | ✓ | ✗ | 🔄 | +| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | +| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | +| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | +| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | +| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | +| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | +| `/privacy-settings` | ✓ | ✓ | ✅ — summarizes local data locations + what's sent to the DeepSeek API (read-only) | +| `/migrate-installer` | ✓ | ✗ | 🔄 | +| `/release-notes` | ✓ | ✓ | ✅ — prints the latest `CHANGELOG.md` entry | --- diff --git a/packages/core/src/config/index.ts b/packages/core/src/config/index.ts index 02500ee..32bf474 100644 --- a/packages/core/src/config/index.ts +++ b/packages/core/src/config/index.ts @@ -15,6 +15,7 @@ export type { UpdateConfig, WorktreeConfig, AutoModeConfig, + VoiceConfig, } from './types.js'; export { diff --git a/packages/core/src/config/schema.test.ts b/packages/core/src/config/schema.test.ts index 3ee0774..a84fd80 100644 --- a/packages/core/src/config/schema.test.ts +++ b/packages/core/src/config/schema.test.ts @@ -51,6 +51,12 @@ describe('validateSettingsShallow', () => { expect(errs[0]).toMatch(/OnEverything/); }); + it('flags unknown voice provider but accepts whisper.cpp', () => { + expect(validateSettingsShallow({ voice: { provider: 'whisper.cpp' } })).toEqual([]); + const errs = 
validateSettingsShallow({ voice: { provider: 'azure' } }); + expect(errs[0]).toMatch(/voice\.provider "azure"/); + }); + it('returns no errors on empty config', () => { expect(validateSettingsShallow({})).toEqual([]); }); diff --git a/packages/core/src/config/schema.ts b/packages/core/src/config/schema.ts index 3c7d233..d648032 100644 --- a/packages/core/src/config/schema.ts +++ b/packages/core/src/config/schema.ts @@ -89,5 +89,11 @@ export function validateSettingsShallow(settings: Record): stri } } + const voiceProviderEnum = ['whisper.cpp', 'stub']; + const voice = settings['voice'] as { provider?: string } | undefined; + if (voice?.provider && !voiceProviderEnum.includes(voice.provider)) { + errors.push(`voice.provider "${voice.provider}" not in ${voiceProviderEnum.join(' | ')}`); + } + return errors; } diff --git a/packages/core/src/config/types.ts b/packages/core/src/config/types.ts index 6fa1a47..f77af1e 100644 --- a/packages/core/src/config/types.ts +++ b/packages/core/src/config/types.ts @@ -108,6 +108,18 @@ export interface AutoModeConfig { fallback?: 'ask' | 'deny'; } +export interface VoiceConfig { + /** + * Speech-to-text engine. Only 'whisper.cpp' is a real local engine; 'stub' + * returns an empty transcript (tests / "disabled"). Spec: docs/VOICE_INPUT.md. + */ + provider?: 'whisper.cpp' | 'stub'; + /** Path to the whisper CLI binary. Defaults to `whisper-cli`/`whisper` on PATH. */ + binPath?: string; + /** Path to the ggml model file (e.g. ~/.deepcode/models/whisper-base.en.bin). 
*/ + modelPath?: string; +} + export interface DeepCodeSettings { // Identity model?: string; @@ -166,6 +178,9 @@ export interface DeepCodeSettings { // Worktree worktree?: WorktreeConfig; + // Voice input (M8 — local whisper.cpp ASR; see docs/VOICE_INPUT.md) + voice?: VoiceConfig; + // Plugins (M5) plugins?: { globalEnabled?: boolean; diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 344b3d7..757713a 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -104,6 +104,7 @@ export { type UpdateConfig, type WorktreeConfig, type AutoModeConfig, + type VoiceConfig, } from './config/index.js'; // Credentials (M2; M3c adds ApiKeyHelperRefresher) @@ -334,15 +335,18 @@ export { type AgentStreamEvent, } from './ipc/protocol.js'; -// Voice input (M8 — whisper.cpp wrapper + stub provider) +// Voice input (M8 — whisper.cpp wrapper + stub provider + setup detection) export { WhisperCppProvider, StubVoiceProvider, parseWhisperOutput, + detectVoice, type VoiceProvider, type VoiceTranscript, type TranscribeOpts, type WhisperCppOpts, + type VoiceProbe, + type VoiceStatus, } from './voice/index.js'; // Auto-mode classifier (M3c-rest — LLM-judged tool gate when mode === 'auto') diff --git a/packages/core/src/voice/detect.test.ts b/packages/core/src/voice/detect.test.ts new file mode 100644 index 0000000..321da90 --- /dev/null +++ b/packages/core/src/voice/detect.test.ts @@ -0,0 +1,105 @@ +import { describe, expect, it } from 'vitest'; +import { join } from 'node:path'; +import { detectVoice, expandHome, type VoiceProbe } from './detect.js'; +import type { VoiceConfig } from '../config/types.js'; + +const HOME = '/home/u'; + +/** Build a fake probe where `present` is the set of paths/bins that "exist". */ +function probe( + present: Iterable, + overrides: Partial = {}, +): Partial { + const set = new Set(present); + return { + home: HOME, + fileExists: async (p) => set.has(p), + which: async (name) => (set.has(name) ? 
`/usr/bin/${name}` : null), + ...overrides, + }; +} + +describe('expandHome', () => { + it('expands ~ and ~/path, leaves others alone', () => { + expect(expandHome('~', HOME)).toBe(HOME); + expect(expandHome('~/m/x.bin', HOME)).toBe(join(HOME, 'm/x.bin')); + expect(expandHome('/abs/x.bin', HOME)).toBe('/abs/x.bin'); + expect(expandHome('rel/x.bin', HOME)).toBe('rel/x.bin'); + }); +}); + +describe('detectVoice', () => { + it('is ready when configured binPath + modelPath both exist', async () => { + const voice: VoiceConfig = { binPath: '/opt/whisper-cli', modelPath: '/models/base.bin' }; + const s = await detectVoice(voice, probe(['/opt/whisper-cli', '/models/base.bin'])); + expect(s.ready).toBe(true); + expect(s.binPath).toBe('/opt/whisper-cli'); + expect(s.modelPath).toBe('/models/base.bin'); + expect(s.problems).toEqual([]); + }); + + it('finds the binary on PATH when binPath is unset', async () => { + // 'whisper-cli' is the first candidate; PATH has it. + const def = join(HOME, '.deepcode', 'models', 'whisper-base.en.bin'); + const s = await detectVoice({ modelPath: def }, probe(['whisper-cli', def])); + expect(s.ready).toBe(true); + expect(s.binPath).toBe('/usr/bin/whisper-cli'); + }); + + it('falls back to the second PATH candidate (whisper)', async () => { + const s = await detectVoice( + { modelPath: '/m.bin' }, + probe(['whisper', '/m.bin']), // no whisper-cli, but whisper exists + ); + expect(s.binPath).toBe('/usr/bin/whisper'); + expect(s.ready).toBe(true); + }); + + it('uses the default ~/.deepcode model path when modelPath is unset', async () => { + const def = join(HOME, '.deepcode', 'models', 'whisper-base.en.bin'); + const s = await detectVoice({ binPath: '/b' }, probe(['/b', def])); + expect(s.ready).toBe(true); + expect(s.modelPath).toBe(def); + }); + + it('reports both missing pieces when nothing is installed', async () => { + const s = await detectVoice(undefined, probe([])); // empty PATH + fs + expect(s.ready).toBe(false); + 
expect(s.binPath).toBeUndefined(); + expect(s.modelPath).toBeUndefined(); + expect(s.problems.join('\n')).toMatch(/binary not found on PATH/); + expect(s.problems.join('\n')).toMatch(/no model at the default/); + }); + + it('flags a configured binPath / modelPath that does not exist', async () => { + const s = await detectVoice( + { binPath: '/nope/whisper', modelPath: '/nope/model.bin' }, + probe([]), + ); + expect(s.ready).toBe(false); + expect(s.problems).toContain('Configured voice.binPath not found: /nope/whisper'); + expect(s.problems).toContain('Configured voice.modelPath not found: /nope/model.bin'); + }); + + it('expands ~ in configured paths against the probe home', async () => { + const bin = join(HOME, 'bin', 'whisper'); + const model = join(HOME, 'm', 'x.bin'); + const s = await detectVoice( + { binPath: '~/bin/whisper', modelPath: '~/m/x.bin' }, + probe([bin, model]), + ); + expect(s.ready).toBe(true); + expect(s.binPath).toBe(bin); + expect(s.modelPath).toBe(model); + }); + + it('is not ready with an unknown provider even if bin + model resolve', async () => { + const s = await detectVoice( + { provider: 'azure' as unknown as VoiceConfig['provider'], binPath: '/b', modelPath: '/m' }, + probe(['/b', '/m']), + ); + expect(s.ready).toBe(false); + expect(s.provider).toBe('azure'); + expect(s.problems.join('\n')).toMatch(/Unknown voice provider/); + }); +}); diff --git a/packages/core/src/voice/detect.ts b/packages/core/src/voice/detect.ts new file mode 100644 index 0000000..d70d5ef --- /dev/null +++ b/packages/core/src/voice/detect.ts @@ -0,0 +1,137 @@ +// Voice setup detection — resolves the whisper.cpp binary + model so the +// `/voice` command (and, later, the desktop client) can report readiness and +// print actionable setup steps. Pure logic over injectable probes so it is +// unit-testable without touching the real PATH / filesystem. 
+// Spec: docs/VOICE_INPUT.md + +import { access, stat } from 'node:fs/promises'; +import { constants as FS } from 'node:fs'; +import { homedir } from 'node:os'; +import { delimiter, join } from 'node:path'; +import type { VoiceConfig } from '../config/types.js'; + +/** Binary names searched on PATH when `voice.binPath` is unset, in order. */ +export const WHISPER_BIN_CANDIDATES = ['whisper-cli', 'whisper'] as const; + +/** Default model location probed when `voice.modelPath` is unset (under home). */ +export const DEFAULT_MODEL_RELPATH = ['.deepcode', 'models', 'whisper-base.en.bin'] as const; + +/** Filesystem / PATH probes — injectable so detection is deterministic in tests. */ +export interface VoiceProbe { + /** Resolve an executable `name` on PATH to an absolute path, or null. */ + which(name: string): Promise; + /** True if a readable regular file exists at `path`. */ + fileExists(path: string): Promise; + /** Home dir, for ~ expansion + the default model path. */ + home: string; +} + +export interface VoiceStatus { + /** True iff a supported provider, a binary, and a model were all resolved. */ + ready: boolean; + /** Resolved provider name (defaults to 'whisper.cpp'). */ + provider: string; + /** Resolved whisper binary (absolute path), if found. */ + binPath?: string; + /** Resolved model file (absolute path), if found. */ + modelPath?: string; + /** Human-readable reasons it is not ready (empty when ready). */ + problems: string[]; +} + +/** Expand a leading `~` / `~/` to the home dir. Other paths pass through. */ +export function expandHome(p: string, home: string): string { + if (p === '~') return home; + if (p.startsWith('~/')) return join(home, p.slice(2)); + return p; +} + +/** Real PATH lookup — first dir in $PATH holding an executable `name`. */ +async function whichOnPath(name: string): Promise { + const dirs = (process.env['PATH'] ?? 
'').split(delimiter).filter(Boolean); + for (const dir of dirs) { + const candidate = join(dir, name); + try { + await access(candidate, FS.X_OK); + return candidate; + } catch { + /* not here, or not executable */ + } + } + return null; +} + +/** Real existence check — true only for a regular file. */ +async function isFile(path: string): Promise { + try { + return (await stat(path)).isFile(); + } catch { + return false; + } +} + +/** + * Detect whether local voice input (whisper.cpp) is ready to use. + * + * Resolution order: + * - binary: `voice.binPath` (if set) else the first of + * {@link WHISPER_BIN_CANDIDATES} found on PATH. + * - model: `voice.modelPath` (if set) else the documented default + * `~/.deepcode/models/whisper-base.en.bin`. + * + * Never throws — every missing/invalid piece becomes a `problems` entry. + */ +export async function detectVoice( + voice: VoiceConfig | undefined, + probe?: Partial, +): Promise { + const home = probe?.home ?? homedir(); + const which = probe?.which ?? whichOnPath; + const fileExists = probe?.fileExists ?? isFile; + + const provider = voice?.provider ?? 'whisper.cpp'; + const problems: string[] = []; + + if (provider !== 'whisper.cpp' && provider !== 'stub') { + problems.push(`Unknown voice provider "${provider}" — expected "whisper.cpp".`); + } + + // Resolve the binary. + let binPath: string | undefined; + if (voice?.binPath) { + const p = expandHome(voice.binPath, home); + if (await fileExists(p)) binPath = p; + else problems.push(`Configured voice.binPath not found: ${voice.binPath}`); + } else { + for (const name of WHISPER_BIN_CANDIDATES) { + const found = await which(name); + if (found) { + binPath = found; + break; + } + } + if (!binPath) { + problems.push( + `whisper.cpp binary not found on PATH (looked for ${WHISPER_BIN_CANDIDATES.join(', ')}).`, + ); + } + } + + // Resolve the model. 
+ let modelPath: string | undefined; + if (voice?.modelPath) { + const p = expandHome(voice.modelPath, home); + if (await fileExists(p)) modelPath = p; + else problems.push(`Configured voice.modelPath not found: ${voice.modelPath}`); + } else { + const def = join(home, ...DEFAULT_MODEL_RELPATH); + if (await fileExists(def)) modelPath = def; + else + problems.push( + `No voice.modelPath set, and no model at the default ~/${DEFAULT_MODEL_RELPATH.join('/')}.`, + ); + } + + const ready = problems.length === 0 && !!binPath && !!modelPath; + return { ready, provider, binPath, modelPath, problems }; +} diff --git a/packages/core/src/voice/index.ts b/packages/core/src/voice/index.ts index 793a1d3..6efe668 100644 --- a/packages/core/src/voice/index.ts +++ b/packages/core/src/voice/index.ts @@ -137,3 +137,16 @@ export class StubVoiceProvider implements VoiceProvider { return { text: '', latencyMs: 0 }; } } + +// ────────────────────────────────────────────────────────────────────────── +// Setup detection — is whisper.cpp + a model installed and configured? +// ────────────────────────────────────────────────────────────────────────── + +export { + detectVoice, + expandHome, + WHISPER_BIN_CANDIDATES, + DEFAULT_MODEL_RELPATH, + type VoiceProbe, + type VoiceStatus, +} from './detect.js'; From 434869d0c485cbc05bf45e85daa672497dc9aabb Mon Sep 17 00:00:00 2001 From: t Date: Mon, 8 Jun 2026 13:41:58 +0800 Subject: [PATCH 2/3] feat(voice): CLI mic capture + transcribe via /voice (slice 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Type /voice in the REPL to dictate: record from the mic, transcribe locally with whisper.cpp, and pre-fill the input line with the transcript to edit before sending. Builds on slice 1's detection. Spec: docs/VOICE_INPUT.md. 
Core: - voice/record.ts: detectRecorder() finds ffmpeg / rec / sox on PATH; buildRecordArgs() builds the 16 kHz mono WAV command per tool + OS (avfoundation on macOS, alsa on Linux; rec/sox use the default device); recordToWav() spawns it and stops on an AbortSignal (SIGINT so the WAV trailer flushes — a non-zero exit after abort is expected, a non-zero exit without one rejects, e.g. no mic). Injectable which/spawn for tests. - VoiceConfig gains optional inputDevice (ffmpeg override); schema updated. CLI: - voice-capture.ts: orchestrates detect → record (Enter to stop) → WhisperCppProvider.transcribe → delete the temp WAV (+ .txt side-file) → return transcript + status lines. Handles not-ready / no-recorder / no-speech / failures gracefully. - /voice now triggers capture when the REPL wires ctx.voiceCapture; falls back to readiness/setup output otherwise. `/voice setup` still forces the install steps. Setup lines extracted to pure, reused helpers. - REPL wires voiceCapture and pre-fills the next prompt via rl.write() once the transcript is ready (ctx.prefillInput). Docs: VOICE_INPUT.md usage now describes the /voice flow (was Ctrl+V) + a recorder-install section; BEHAVIOR_PARITY /voice row updated for CLI capture. Tests: 9 core recorder cases (detect/buildArgs/record orchestration) + 3 new CLI cases (capture callback, cancel/empty, setup bypass). Real-mic end-to-end is manual (no audio hardware in CI). core 661 / cli 151, typecheck + lint + format all clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/cli/src/commands.ts | 108 ++++++++----- apps/cli/src/repl.ts | 19 ++- apps/cli/src/voice-capture.ts | 108 +++++++++++++ apps/cli/src/voice-cmd.test.ts | 35 ++++- docs/BEHAVIOR_PARITY.md | 98 ++++++------ docs/VOICE_INPUT.md | 37 +++-- packages/core/schemas/settings.schema.json | 3 +- packages/core/src/config/types.ts | 6 + packages/core/src/index.ts | 6 + packages/core/src/voice/index.ts | 15 ++ packages/core/src/voice/record.test.ts | 103 +++++++++++++ packages/core/src/voice/record.ts | 169 +++++++++++++++++++++ 12 files changed, 605 insertions(+), 102 deletions(-) create mode 100644 apps/cli/src/voice-capture.ts create mode 100644 packages/core/src/voice/record.test.ts create mode 100644 packages/core/src/voice/record.ts diff --git a/apps/cli/src/commands.ts b/apps/cli/src/commands.ts index b7aab71..a405449 100644 --- a/apps/cli/src/commands.ts +++ b/apps/cli/src/commands.ts @@ -11,6 +11,7 @@ import type { SessionMeta, StoredMessage, TaskManager, + VoiceStatus, } from '@deepcode/core'; import { contextWindowFor, @@ -131,6 +132,14 @@ export interface SessionContext { /** Home dir override (REPL-injected from --home) — backs default-path lookups * like /voice's `~/.deepcode/models/...` model probe. Defaults to os.homedir(). */ home?: string; + /** + * Interactive voice capture, wired by the REPL (it owns readline + the mic): + * record → press Enter to stop → transcribe → return the text + display lines. + * `transcript` is null on cancel / not-ready / error. Absent in headless mode. + */ + voiceCapture?: () => Promise<{ transcript: string | null; lines: string[] }>; + /** Set by /voice — the REPL pre-fills the next input line with this text. */ + prefillInput?: string; sessionId: string; sessions: SessionManager; usage: { @@ -1172,54 +1181,71 @@ export const TasksCommand: SlashCommand = { }, }; +/** "Ready" status lines for /voice (non-interactive / headless fallback). 
*/ +export function voiceReadyLines(status: VoiceStatus): string[] { + return [ + '🎙 Voice input is ready — whisper.cpp, fully local (no audio leaves your machine).', + ` binary: ${status.binPath}`, + ` model: ${status.modelPath}`, + '', + 'Type /voice in the interactive REPL to dictate (record → Enter to stop → transcribe).', + ]; +} + +/** Setup/troubleshooting instructions for /voice, driven by a detection result. */ +export function voiceSetupLines(status: VoiceStatus): string[] { + const lines: string[] = [ + status.ready + ? '🎙 Voice input is ready. Setup reference below.' + : '🎙 Voice input is not set up yet. Enable local dictation (whisper.cpp — no cloud):', + '', + 'Detected:', + ` ${status.binPath ? '✓' : '✗'} whisper binary ${status.binPath ?? '(not found)'}`, + ` ${status.modelPath ? '✓' : '✗'} model ${status.modelPath ?? '(not found)'}`, + ]; + if (status.problems.length) { + lines.push('', 'Issues:'); + for (const p of status.problems) lines.push(` • ${p}`); + } + lines.push( + '', + 'Setup:', + ' 1. Install whisper.cpp', + ' macOS: brew install whisper-cpp', + ' Linux: build https://github.com/ggerganov/whisper.cpp, put `whisper` on PATH', + ' 2. Download a model (base.en ≈ 140 MB is a good default) and save it:', + ' mkdir -p ~/.deepcode/models', + ' cp ggml-base.en.bin ~/.deepcode/models/whisper-base.en.bin', + ' 3. Install a mic recorder (either): brew install ffmpeg · brew install sox', + ' 4. 
(optional) Point DeepCode at custom paths in ~/.deepcode/settings.json:', + ' { "voice": { "binPath": "/opt/homebrew/bin/whisper-cli",', + ' "modelPath": "~/.deepcode/models/whisper-base.en.bin" } }', + '', + 'Full guide: docs/VOICE_INPUT.md', + ); + return lines; +} + export const VoiceCommand: SlashCommand = { name: '/voice', - description: 'Check local voice-input (whisper.cpp) setup; `/voice setup` shows install steps.', + description: + 'Dictate via local whisper.cpp (record → Enter → transcribe); `/voice setup` for steps.', async run(args, ctx) { - const { detectVoice } = await import('@deepcode/core'); - const status = await detectVoice(ctx.settings.voice, { home: ctx.home }); const forceSetup = (args[0] ?? '').toLowerCase() === 'setup'; - if (status.ready && !forceSetup) { - return [ - '🎙 Voice input is ready — whisper.cpp, fully local (no audio leaves your machine).', - ` binary: ${status.binPath}`, - ` model: ${status.modelPath}`, - '', - 'Dictate from the REPL with the voice key (default Ctrl+V; remap in keybindings.json).', - 'Note: live mic capture lands in a follow-up — this step ships setup + detection.', - ]; + // Interactive REPL: record + transcribe via the wired callback, then let the + // REPL pre-fill the input line with the transcript for the user to edit. + if (!forceSetup && ctx.voiceCapture) { + const r = await ctx.voiceCapture(); + if (r.transcript) ctx.prefillInput = r.transcript; + return r.lines; } - const lines: string[] = [ - status.ready - ? '🎙 Voice input is ready. Setup reference below.' - : '🎙 Voice input is not set up yet. Enable local dictation (whisper.cpp — no cloud):', - '', - 'Detected:', - ` ${status.binPath ? '✓' : '✗'} whisper binary ${status.binPath ?? '(not found)'}`, - ` ${status.modelPath ? '✓' : '✗'} model ${status.modelPath ?? '(not found)'}`, - ]; - if (status.problems.length) { - lines.push('', 'Issues:'); - for (const p of status.problems) lines.push(` • ${p}`); - } - lines.push( - '', - 'Setup:', - ' 1. 
Install whisper.cpp', - ' macOS: brew install whisper-cpp', - ' Linux: build https://github.com/ggerganov/whisper.cpp, put `whisper` on PATH', - ' 2. Download a model (base.en ≈ 140 MB is a good default) and save it:', - ' mkdir -p ~/.deepcode/models', - ' cp ggml-base.en.bin ~/.deepcode/models/whisper-base.en.bin', - ' 3. (optional) Point DeepCode at custom paths in ~/.deepcode/settings.json:', - ' { "voice": { "binPath": "/opt/homebrew/bin/whisper-cli",', - ' "modelPath": "~/.deepcode/models/whisper-base.en.bin" } }', - '', - 'Full guide: docs/VOICE_INPUT.md', - ); - return lines; + // Headless / `/voice setup`: report readiness or print setup instructions. + const { detectVoice } = await import('@deepcode/core'); + const status = await detectVoice(ctx.settings.voice, { home: ctx.home }); + if (status.ready && !forceSetup) return voiceReadyLines(status); + return voiceSetupLines(status); }, }; diff --git a/apps/cli/src/repl.ts b/apps/cli/src/repl.ts index 5b109c5..150854d 100644 --- a/apps/cli/src/repl.ts +++ b/apps/cli/src/repl.ts @@ -52,6 +52,7 @@ import { import { createInterface } from 'node:readline/promises'; import type { Readable, Writable } from 'node:stream'; import { CommandRegistry, type SessionContext } from './commands.js'; +import { captureVoice } from './voice-capture.js'; import { resolveEffort } from './parse-args.js'; import { TrustStore } from './trust.js'; import { resolveBuiltinSkillsDir } from './builtin-skills.js'; @@ -453,6 +454,9 @@ export async function startRepl(opts: ReplOpts): Promise { ...(pluginsWire?.spawnFailures.map((n) => `${n}: failed to start`) ?? []), ], initFlow: () => runInitFlow({ cwd, output, rl, provider, model, maxTokens, temperature }), + // M8: /voice records from the mic + transcribes via whisper.cpp, then the + // loop pre-fills the next input line with the transcript (rl is created below). 
+ voiceCapture: () => captureVoice({ rl, output, settings, home: opts.home }), // M7: /rewind needs access to history + provider. provider, history, @@ -549,10 +553,19 @@ export async function startRepl(opts: ReplOpts): Promise { }; await fireLifecycle('SessionStart', { sessionId: session.id, source: 'cli' }); + // Text to inject into the next prompt's input buffer (e.g. a /voice transcript + // the user can edit before submitting). Written right after the prompt renders. + let pendingPrefill: string | undefined; + while (true) { let userInput: string; try { - userInput = await rl.question('› '); + const question = rl.question('› '); + if (pendingPrefill !== undefined) { + rl.write(pendingPrefill); + pendingPrefill = undefined; + } + userInput = await question; } catch { break; } @@ -590,6 +603,10 @@ export async function startRepl(opts: ReplOpts): Promise { history = ctx.newHistory; ctx.newHistory = undefined; } + if (ctx.prefillInput) { + pendingPrefill = ctx.prefillInput; + ctx.prefillInput = undefined; + } if (ctx.exitRequested) break; continue; } diff --git a/apps/cli/src/voice-capture.ts b/apps/cli/src/voice-capture.ts new file mode 100644 index 0000000..e032eec --- /dev/null +++ b/apps/cli/src/voice-capture.ts @@ -0,0 +1,108 @@ +// Interactive voice capture for the REPL: detect whisper.cpp + a recorder, +// record from the mic until the user presses Enter, transcribe locally, and +// return the text so the REPL can pre-fill the input line. The audio file is +// written to $TMPDIR and deleted right after transcription (see VOICE_INPUT.md). 
+ +import { randomUUID } from 'node:crypto'; +import { rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import type { Interface as ReadlineInterface } from 'node:readline/promises'; +import type { Writable } from 'node:stream'; +import { + detectRecorder, + detectVoice, + recordToWav, + WhisperCppProvider, + type DeepCodeSettings, +} from '@deepcode/core'; +import { voiceSetupLines } from './commands.js'; + +export interface VoiceCaptureDeps { + rl: ReadlineInterface; + output: Writable; + settings: DeepCodeSettings; + /** Home override (honors --home), for the default model-path probe. */ + home?: string; +} + +export interface VoiceCaptureResult { + /** Transcribed text, or null on cancel / not-ready / empty / error. */ + transcript: string | null; + /** Lines for the REPL to print (status, errors, or setup steps). */ + lines: string[]; +} + +export async function captureVoice(deps: VoiceCaptureDeps): Promise { + const { rl, output, settings, home } = deps; + + const status = await detectVoice(settings.voice, { home }); + if (!status.ready) return { transcript: null, lines: voiceSetupLines(status) }; + + const rec = await detectRecorder(); + if (!rec.found || !rec.bin || !rec.binPath) { + return { + transcript: null, + lines: [ + '🎙 whisper.cpp is ready, but no microphone recorder was found.', + ` • ${rec.problems[0] ?? 'no recorder on PATH'}`, + ' Install one: brew install ffmpeg · brew install sox', + ], + }; + } + + const wav = join(tmpdir(), `deepcode-voice-${randomUUID()}.wav`); + const cleanup = async (): Promise => { + await rm(wav, { force: true }); + await rm(`${wav}.txt`, { force: true }); // whisper --output-txt side-file + }; + + // Record until the user presses Enter (abort → SIGINT → recorder flushes WAV). 
+ const ac = new AbortController(); + let recErr: Error | undefined; + const recording = recordToWav({ + outPath: wav, + bin: rec.bin, + binPath: rec.binPath, + signal: ac.signal, + device: settings.voice?.inputDevice, + }).catch((e: unknown) => { + recErr = e as Error; + }); + + output.write(` 🎙 Recording with ${rec.bin}… press Enter to stop.\n`); + await rl.question(''); + ac.abort(); + await recording; + + if (recErr) { + await cleanup(); + return { + transcript: null, + lines: [` ⚠ Recording failed: ${recErr.message}`, ' Run `/voice setup` for help.'], + }; + } + + try { + output.write(' … transcribing locally\n'); + const provider = new WhisperCppProvider({ + binPath: status.binPath, + modelPath: status.modelPath!, + }); + const { text } = await provider.transcribe(wav); + await cleanup(); + const transcript = text.trim(); + if (!transcript) { + return { transcript: null, lines: [' (No speech detected — nothing inserted.)'] }; + } + return { + transcript, + lines: [ + ` 🎙 Transcribed (${transcript.length} chars) — review the input line, edit, then press Enter.`, + ], + }; + } catch (e) { + await cleanup(); + return { transcript: null, lines: [` ⚠ Transcription failed: ${(e as Error).message}`] }; + } +} diff --git a/apps/cli/src/voice-cmd.test.ts b/apps/cli/src/voice-cmd.test.ts index bbdac47..55c5ae5 100644 --- a/apps/cli/src/voice-cmd.test.ts +++ b/apps/cli/src/voice-cmd.test.ts @@ -45,11 +45,12 @@ describe('/voice', () => { const modelPath = join(dir, 'model.bin'); await writeFile(binPath, '#!/bin/sh\n'); await writeFile(modelPath, 'GGML'); + // No voiceCapture wired (headless / non-interactive) → report readiness. 
const out = (await run([], ctx({ settings: { voice: { binPath, modelPath } } }))).join('\n'); expect(out).toMatch(/ready/i); expect(out).toContain(binPath); expect(out).toContain(modelPath); - expect(out).toMatch(/Ctrl\+V/); + expect(out).toMatch(/type \/voice/i); }); it('prints setup steps + issues when configured paths are missing', async () => { @@ -82,4 +83,36 @@ describe('/voice', () => { // Still acknowledges it's already ready. expect(out).toMatch(/ready/i); }); + + it('runs the wired capture callback and pre-fills the transcript', async () => { + const c = ctx({ + voiceCapture: async () => ({ transcript: 'refactor the parser', lines: ['🎙 Transcribed'] }), + }); + const out = (await run([], c)).join('\n'); + expect(out).toContain('Transcribed'); + expect(c.prefillInput).toBe('refactor the parser'); // REPL will inject this + }); + + it('does not pre-fill when capture is cancelled / empty', async () => { + const c = ctx({ + voiceCapture: async () => ({ transcript: null, lines: ['(No speech detected)'] }), + }); + const out = (await run([], c)).join('\n'); + expect(out).toMatch(/no speech/i); + expect(c.prefillInput).toBeUndefined(); + }); + + it('`/voice setup` bypasses capture even when a callback is wired', async () => { + let called = false; + const c = ctx({ + settings: { voice: { binPath: '/no/such', modelPath: '/no/such' } }, + voiceCapture: async () => { + called = true; + return { transcript: 'x', lines: [] }; + }, + }); + const out = (await run(['setup'], c)).join('\n'); + expect(called).toBe(false); + expect(out).toMatch(/Setup:/); + }); }); diff --git a/docs/BEHAVIOR_PARITY.md b/docs/BEHAVIOR_PARITY.md index c5c3821..db0003e 100644 --- a/docs/BEHAVIOR_PARITY.md +++ b/docs/BEHAVIOR_PARITY.md @@ -21,55 +21,55 @@ Legend: `✅` matches · `🟡` matches with caveats · `🔄` deferred · `⚠ ## Slash commands (30+ in Claude Code, ~32 shipped in DeepCode) -| Command | Claude Code | DeepCode | Status | -| -------------------------- | ----------- | ------------------ 
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `/help` | ✓ | ✓ | ✅ | -| `/clear` | ✓ | ✓ | ✅ | -| `/exit` / `/quit` | ✓ | ✓ | ✅ | -| `/status` / `/doctor` | ✓ | ✓ | ✅ | -| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | -| `/mode` | ✓ | ✓ | ✅ | -| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | -| `/cost` / `/usage` | ✓ | ✓ | ✅ | -| `/context` | ✓ | ✓ | ✅ | -| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes user settings; no full arrow-key editor | -| `/resume` | ✓ | ✓ | ✅ — lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | -| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | -| `/mcp` | ✓ | ✓ | ✅ | -| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | -| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | -| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | -| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | -| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | -| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | -| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | -| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | -| `/voice` | ✓ | ✓ | 🟡 — `/voice` detects whisper.cpp + a model and prints setup steps (docs/VOICE_INPUT.md); core `WhisperCppProvider` is wired; live mic capture lands in a follow-up slice | -| `/teleport` | ✓ | ✗ | 🔄 
M8 | -| `/desktop` | ✓ | ✗ | 🔄 M6 | -| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | -| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | -| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | -| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | -| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | -| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | -| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | -| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | -| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` comments for the current branch's PR | -| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | -| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | -| `/loop` | ✓ | ✗ (skill avail) | 🟡 | -| `/terminal-setup` | ✓ | ✗ | 🔄 | -| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | -| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | -| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | -| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | -| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | -| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | -| `/privacy-settings` | ✓ | ✓ | ✅ — summarizes local data locations + what's sent to the DeepSeek API (read-only) | -| `/migrate-installer` | ✓ | ✗ | 🔄 | -| `/release-notes` | ✓ | ✓ | ✅ — prints the latest `CHANGELOG.md` entry | +| Command | Claude 
Code | DeepCode | Status | +| -------------------------- | ----------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `/help` | ✓ | ✓ | ✅ | +| `/clear` | ✓ | ✓ | ✅ | +| `/exit` / `/quit` | ✓ | ✓ | ✅ | +| `/status` / `/doctor` | ✓ | ✓ | ✅ | +| `/model` | ✓ | ✓ | ✅ DeepCode constrains to deepseek-\* (model picker doesn't show foreign providers) | +| `/mode` | ✓ | ✓ | ✅ | +| `/effort` | ✓ | ✓ | 🟡 — CLI prints the tier table (numbers from `EFFORT_PARAMS` SSOT); switch via `/effort `; arrow-key selector is GUI-only (M6) | +| `/cost` / `/usage` | ✓ | ✓ | ✅ | +| `/context` | ✓ | ✓ | ✅ | +| `/config` | ✓ | ✓ | 🟡 — dumps merged settings + `/config set ` (dotted keys, JSON values) writes user settings; no full arrow-key editor | +| `/resume` | ✓ | ✓ | ✅ — lists recent sessions; `/resume ` switches the live session in-REPL; `--resume ` / `-r` at launch | +| `/init` | ✓ | ✓ | ✅ — interactive 3-phase REPL flow (scan → draft → approve-write `AGENTS.md`) | +| `/mcp` | ✓ | ✓ | ✅ | +| `/add-dir` | ✓ | ✓ (records intent) | 🟡 — M3 will enforce | +| `/todos` | ✓ | ✓ | ✅ — reads `/todos.json` written by TodoWrite tool | +| `/plugins` | ✓ | ✓ | ✅ — lists wired plugins + contributed hook events + warnings (M5.2) | +| `/compact` | ✓ | ✓ | ✅ — manual `/compact` + automatic threshold trigger in the agent loop | +| `/diff` | ✓ | ✓ | ✅ — git diff + untracked files in the working tree (PR #150) | +| `/btw` | ✓ | ✓ | 🟡 — queues a "by the way" context note the agent sees with your next message (no turn fired); exact Claude Code behavior may differ | +| `/recap` | ✓ | ✓ | ✅ — provider-summarized recap of the session so far | +| `/rewind` | ✓ | ✓ | ✅ — 5 ops (code/conversation/both/summarize-from/up-to); `Esc Esc` bound | +| `/voice` | ✓ | ✓ | 🟡 — CLI: `/voice` records via ffmpeg/sox → whisper.cpp → pre-fills the input line (`/voice setup` for steps; fully 
local). Desktop 🎙 button is a follow-up slice | +| `/teleport` | ✓ | ✗ | 🔄 M8 | +| `/desktop` | ✓ | ✗ | 🔄 M6 | +| `/background` | ✓ | ✓ | ✅ — runs a prompt as a background sub-agent via the session TaskManager (alias `/bg`); agent-started TaskCreate tasks appear too | +| `/batch` | ✓ | ✗ | 🔄 — batch-of-prompts not yet wired (use `/background` per prompt) | +| `/tasks` | ✓ | ✓ | ✅ — lists this session's background tasks; `/tasks ` shows one's status + output | +| `/plan` | ✓ | ✗ | 🔄 — set via `/mode plan` in DeepCode | +| `/login` / `/logout` | ✓ | ✓ | ✅ — /logout clears creds + exits; /login stores a new key (next launch) | +| `/export` | ✓ | ✓ | ✅ — writes the conversation to a markdown file | +| `/bug` (alias `/feedback`) | ✓ | ✓ | ✅ — prints a prefilled GitHub issue link (model/mode/effort in the body) | +| `/upgrade` | ✓ | ✓ | ✅ — prints version + `npm i -g deepcode-cli@latest` (also the `deepcode upgrade` subcommand) | +| `/pr_comments` | ✓ | ✓ | ✅ — `gh pr view` comments for the current branch's PR | +| `/review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/security-review` | ✓ | ✗ (skill avail) | 🟡 — via Skill tool | +| `/schedule` | ✓ | ✗ (skill avail) | 🟡 | +| `/loop` | ✓ | ✗ (skill avail) | 🟡 | +| `/terminal-setup` | ✓ | ✗ | 🔄 | +| `/vim` | ✓ | ✓ | ✅ — toggles Vim mode (persists to `~/.deepcode/keybindings.json`) | +| `/keybindings` | ✓ | ✓ (read-only) | 🟡 — Claude Code opens/creates the keybindings config; ours lists bindings (edit `~/.deepcode/keybindings.json` manually) | +| `/agents` | ✓ | ✓ | ✅ — lists sub-agents from `.deepcode/agents/` | +| `/hooks` | ✓ | ✓ | ✅ — lists hooks configured in settings.json | +| `/skills` | ✓ | ✓ | ✅ — lists built-in + user + project skills | +| `/permissions` | ✓ | ✓ (read-only) | 🟡 — shows rules + default mode (interactive editor deferred) | +| `/privacy-settings` | ✓ | ✓ | ✅ — summarizes local data locations + what's sent to the DeepSeek API (read-only) | +| `/migrate-installer` | ✓ | ✗ | 🔄 | +| `/release-notes` 
| ✓ | ✓ | ✅ — prints the latest `CHANGELOG.md` entry | --- diff --git a/docs/VOICE_INPUT.md b/docs/VOICE_INPUT.md index 58d12de..0112235 100644 --- a/docs/VOICE_INPUT.md +++ b/docs/VOICE_INPUT.md @@ -63,6 +63,19 @@ mkdir -p ~/.deepcode/models cp models/ggml-base.en.bin ~/.deepcode/models/whisper-base.en.bin ``` +## Install a mic recorder + +DeepCode records your microphone with whichever recorder it finds on PATH — +`ffmpeg` is tried first, then sox's `rec` / `sox`: + +```bash +# macOS +brew install ffmpeg # or: brew install sox + +# Linux (Debian/Ubuntu) +sudo apt install ffmpeg # or: sudo apt install sox +``` + ## Configure DeepCode In `~/.deepcode/settings.json`: @@ -77,20 +90,26 @@ In `~/.deepcode/settings.json`: } ``` -(The `binPath` defaults to `whisper` on PATH if you omit it.) +(The `binPath` defaults to `whisper-cli` / `whisper` on PATH if you omit it.) +If ffmpeg captures from the wrong input, set `voice.inputDevice` — e.g. +`":1"` for avfoundation (macOS) or `"hw:1"` for ALSA (Linux). sox/rec always +use the system default device. ## Usage -In the CLI REPL, press the voice toggle key (default `Ctrl+V`; remap in -`~/.deepcode/keybindings.json`). DeepCode: +In the CLI REPL, type `/voice` and press Enter. DeepCode: + +1. Records audio from your default mic (via ffmpeg or sox) into a temp + `.wav` file. +2. Stops when you press Enter again (or after a 60 s safety cap). +3. Spawns whisper.cpp to transcribe the `.wav` locally. +4. Pre-fills the input line with the transcript — edit it if needed, then + press Enter to send. -1. Records audio from your default mic into a temp `.wav` file. -2. Stops recording on the next key press OR after 60 s of silence. -3. Spawns whisper.cpp to transcribe the .wav. -4. Inserts the transcribed text into the input box (you can edit before - submitting). +Run `/voice setup` any time to print install steps and what's detected. -In the Mac client (M6-rest), the same flow appears as a 🎙 button. 
+In the Mac client (M6-rest), the same flow appears as a 🎙 button in the +composer. ## Privacy diff --git a/packages/core/schemas/settings.schema.json b/packages/core/schemas/settings.schema.json index 030f1ae..35b073e 100644 --- a/packages/core/schemas/settings.schema.json +++ b/packages/core/schemas/settings.schema.json @@ -133,7 +133,8 @@ "properties": { "provider": { "type": "string", "enum": ["whisper.cpp", "stub"] }, "binPath": { "type": "string" }, - "modelPath": { "type": "string" } + "modelPath": { "type": "string" }, + "inputDevice": { "type": "string" } } } }, diff --git a/packages/core/src/config/types.ts b/packages/core/src/config/types.ts index f77af1e..81554aa 100644 --- a/packages/core/src/config/types.ts +++ b/packages/core/src/config/types.ts @@ -118,6 +118,12 @@ export interface VoiceConfig { binPath?: string; /** Path to the ggml model file (e.g. ~/.deepcode/models/whisper-base.en.bin). */ modelPath?: string; + /** + * Override the mic input device passed to ffmpeg (e.g. ':1' for avfoundation, + * 'hw:0' for alsa). Default: ':default' (macOS) / 'default' (Linux). sox/rec + * ignore this and use the system default device. 
+ */ + inputDevice?: string; } export interface DeepCodeSettings { diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 757713a..a685970 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -341,12 +341,18 @@ export { StubVoiceProvider, parseWhisperOutput, detectVoice, + detectRecorder, + recordToWav, + buildRecordArgs, type VoiceProvider, type VoiceTranscript, type TranscribeOpts, type WhisperCppOpts, type VoiceProbe, type VoiceStatus, + type RecorderBin, + type RecorderStatus, + type RecordToWavOpts, } from './voice/index.js'; // Auto-mode classifier (M3c-rest — LLM-judged tool gate when mode === 'auto') diff --git a/packages/core/src/voice/index.ts b/packages/core/src/voice/index.ts index 6efe668..401b679 100644 --- a/packages/core/src/voice/index.ts +++ b/packages/core/src/voice/index.ts @@ -150,3 +150,18 @@ export { type VoiceProbe, type VoiceStatus, } from './detect.js'; + +// ────────────────────────────────────────────────────────────────────────── +// Microphone capture — record a WAV for whisper.cpp to transcribe. +// ────────────────────────────────────────────────────────────────────────── + +export { + detectRecorder, + recordToWav, + buildRecordArgs, + RECORDER_CANDIDATES, + type RecorderBin, + type RecorderStatus, + type RecordToWavOpts, + type RecordArgsOpts, +} from './record.js'; diff --git a/packages/core/src/voice/record.test.ts b/packages/core/src/voice/record.test.ts new file mode 100644 index 0000000..7bd035c --- /dev/null +++ b/packages/core/src/voice/record.test.ts @@ -0,0 +1,103 @@ +import { EventEmitter } from 'node:events'; +import type { ChildProcess } from 'node:child_process'; +import { describe, expect, it } from 'vitest'; +import { buildRecordArgs, detectRecorder, recordToWav } from './record.js'; + +describe('detectRecorder', () => { + it('prefers ffmpeg when present', async () => { + const r = await detectRecorder(async (n) => (n === 'ffmpeg' ? 
`/usr/bin/${n}` : null)); + expect(r.found).toBe(true); + expect(r.bin).toBe('ffmpeg'); + expect(r.binPath).toBe('/usr/bin/ffmpeg'); + }); + + it('falls back to rec, then sox', async () => { + const recOnly = await detectRecorder(async (n) => (n === 'rec' ? '/usr/bin/rec' : null)); + expect(recOnly.bin).toBe('rec'); + const soxOnly = await detectRecorder(async (n) => (n === 'sox' ? '/usr/bin/sox' : null)); + expect(soxOnly.bin).toBe('sox'); + }); + + it('reports a problem when nothing is installed', async () => { + const r = await detectRecorder(async () => null); + expect(r.found).toBe(false); + expect(r.problems.join('\n')).toMatch(/No microphone recorder/); + }); +}); + +describe('buildRecordArgs', () => { + it('ffmpeg uses avfoundation on macOS and 16k mono', () => { + const a = buildRecordArgs('ffmpeg', '/t/o.wav', { platform: 'darwin', maxSeconds: 60 }); + expect(a).toEqual( + expect.arrayContaining(['-f', 'avfoundation', '-i', ':default', '-ar', '16000', '-ac', '1']), + ); + expect(a).toContain('-t'); + expect(a[a.length - 1]).toBe('/t/o.wav'); + }); + + it('ffmpeg uses alsa on Linux and honors a custom device', () => { + const a = buildRecordArgs('ffmpeg', '/t/o.wav', { platform: 'linux', device: 'hw:1' }); + expect(a).toEqual(expect.arrayContaining(['-f', 'alsa', '-i', 'hw:1'])); + }); + + it('ffmpeg throws on an unsupported platform without a device', () => { + expect(() => buildRecordArgs('ffmpeg', '/t/o.wav', { platform: 'win32' })).toThrow( + /inputDevice/, + ); + }); + + it('rec records the default device (no -d); sox adds -d', () => { + const rec = buildRecordArgs('rec', '/t/o.wav', { maxSeconds: 30 }); + expect(rec).not.toContain('-d'); + expect(rec).toEqual( + expect.arrayContaining(['-r', '16000', '-c', '1', '/t/o.wav', 'trim', '0', '30']), + ); + const sox = buildRecordArgs('sox', '/t/o.wav'); + expect(sox).toContain('-d'); + }); +}); + +/** Fake ChildProcess whose stderr emits `err` then close(code) on next tick. 
*/ +function fakeChild(code: number, err = ''): ChildProcess { + const ee = new EventEmitter() as unknown as ChildProcess; + const stderr = new EventEmitter() as unknown as NodeJS.ReadableStream; + Object.defineProperty(ee, 'stderr', { value: stderr }); + let killed = false; + (ee as unknown as { kill: (s?: string) => boolean }).kill = () => { + killed = true; + // Emulate ffmpeg/sox finalizing + exiting on SIGINT. + setImmediate(() => ee.emit('close', code)); + return true; + }; + setImmediate(() => { + if (err) (stderr as unknown as EventEmitter).emit('data', Buffer.from(err)); + if (!killed) ee.emit('close', code); // self-exit path (no abort) + }); + return ee; +} + +describe('recordToWav', () => { + it('resolves when stopped via the abort signal (non-zero exit is expected)', async () => { + const ac = new AbortController(); + const exec = (() => fakeChild(255)) as unknown as RecordExec; + const p = recordToWav({ + outPath: '/t/o.wav', + bin: 'ffmpeg', + binPath: '/usr/bin/ffmpeg', + platform: 'darwin', + signal: ac.signal, + exec, + }); + ac.abort(); + await expect(p).resolves.toBeUndefined(); + }); + + it('rejects on a non-zero exit when not aborted (e.g. no mic)', async () => { + const exec = (() => fakeChild(1, 'No such audio device')) as unknown as RecordExec; + await expect( + recordToWav({ outPath: '/t/o.wav', bin: 'rec', binPath: '/usr/bin/rec', exec }), + ).rejects.toThrow(/rec exited 1: No such audio device/); + }); +}); + +type RecordExec = NonNullable[0]['exec']>; diff --git a/packages/core/src/voice/record.ts b/packages/core/src/voice/record.ts new file mode 100644 index 0000000..1dbc8c6 --- /dev/null +++ b/packages/core/src/voice/record.ts @@ -0,0 +1,169 @@ +// Microphone capture — spawns a local recorder (ffmpeg or sox) to write a +// 16 kHz mono WAV that whisper.cpp can transcribe. Like the whisper binary, +// the recorder is a user-installed external tool we detect on PATH and, if +// absent, print setup steps for. Spec: docs/VOICE_INPUT.md. 
import { spawn, type ChildProcess } from 'node:child_process';
import { access } from 'node:fs/promises';
import { constants as FS } from 'node:fs';
import { delimiter, join } from 'node:path';

/**
 * Recorder front-ends we look for, in preference order. `ffmpeg` is the most
 * universally installed; `rec` / `sox` are whisper.cpp tutorial favorites and
 * pick the default input device automatically.
 */
export const RECORDER_CANDIDATES = ['ffmpeg', 'rec', 'sox'] as const;
export type RecorderBin = (typeof RECORDER_CANDIDATES)[number];

export interface RecorderStatus {
  /** True if a usable recorder was found on PATH. */
  found: boolean;
  /** Which front-end was selected. */
  bin?: RecorderBin;
  /** Absolute path to the recorder binary. */
  binPath?: string;
  /** Human-readable reason none was found (empty when found). */
  problems: string[];
}

/**
 * PATH/`which` probe — injectable for tests. Resolves to the path of the
 * executable, or `null` when `name` is not available.
 */
export type WhichFn = (name: string) => Promise<string | null>;

/** Upper bound on buffered recorder stderr; only the first 300 chars are ever reported. */
const STDERR_BUFFER_LIMIT = 4096;

/**
 * Default {@link WhichFn}: walks the directories in `process.env.PATH` in
 * order and returns the first `<dir>/<name>` that passes an `X_OK` access
 * check, or `null` when no directory has one.
 */
async function whichOnPath(name: string): Promise<string | null> {
  const dirs = (process.env['PATH'] ?? '').split(delimiter).filter(Boolean);
  for (const dir of dirs) {
    const candidate = join(dir, name);
    try {
      await access(candidate, FS.X_OK);
      return candidate;
    } catch {
      /* not here, or not executable */
    }
  }
  return null;
}

/**
 * Find the first available recorder front-end on PATH. Never throws: a probe
 * that rejects is treated the same as "not found" for that candidate.
 *
 * @param which - Probe used to resolve a binary name; defaults to a PATH scan.
 * @returns `found: true` with `bin`/`binPath` for the first candidate that
 *   resolves, otherwise `found: false` with one entry in `problems`.
 */
export async function detectRecorder(which: WhichFn = whichOnPath): Promise<RecorderStatus> {
  for (const bin of RECORDER_CANDIDATES) {
    let binPath: string | null = null;
    try {
      binPath = await which(bin);
    } catch {
      /* a failing probe must not break detection — try the next candidate */
    }
    if (binPath) return { found: true, bin, binPath, problems: [] };
  }
  return {
    found: false,
    problems: [
      `No microphone recorder found on PATH (looked for ${RECORDER_CANDIDATES.join(', ')}).`,
    ],
  };
}

export interface RecordArgsOpts {
  /** Platform, for ffmpeg's OS-specific input format. Defaults to process.platform. */
  platform?: NodeJS.Platform;
  /**
   * Override the input device (ffmpeg only). Default: ':default' (mac) /
   * 'default' (linux). On Windows there is no default: pass the DirectShow
   * device name (with or without the `audio=` prefix).
   */
  device?: string;
  /** Hard cap on recording length in seconds (safety net). Ignored unless finite and > 0. */
  maxSeconds?: number;
}

/**
 * ffmpeg input-selection flags for `platform`.
 *
 * @throws Error when the platform has no known default device and no explicit
 *   `device` was supplied.
 */
function ffmpegInputArgs(platform: NodeJS.Platform, device: string | undefined): string[] {
  switch (platform) {
    case 'darwin':
      return ['-f', 'avfoundation', '-i', device ?? ':default'];
    case 'linux':
      return ['-f', 'alsa', '-i', device ?? 'default'];
    case 'win32':
      // DirectShow has no "default" alias, so an explicit device is mandatory.
      if (device) {
        return ['-f', 'dshow', '-i', device.startsWith('audio=') ? device : `audio=${device}`];
      }
      break;
    default:
      break;
  }
  throw new Error(
    `ffmpeg mic capture on ${platform} needs an explicit voice.inputDevice; install sox (rec) or set one.`,
  );
}

/**
 * Build the recorder argv for `bin` writing 16 kHz mono WAV to `outPath`.
 * Pure + exported so the per-platform/per-tool command is unit-testable.
 *
 * - ffmpeg: needs an OS-specific input (avfoundation on macOS, alsa on Linux,
 *   dshow on Windows when a device is given).
 * - rec / sox: capture the system default input device directly.
 *
 * @param bin - Which recorder front-end the argv is for.
 * @param outPath - Destination WAV file.
 * @param opts - Platform / device / duration overrides.
 * @returns The argument vector (without the binary itself).
 * @throws Error for ffmpeg on a platform with no usable input selection.
 */
export function buildRecordArgs(
  bin: RecorderBin,
  outPath: string,
  opts: RecordArgsOpts = {},
): string[] {
  const platform = opts.platform ?? process.platform;
  const max = opts.maxSeconds;
  // Only a finite, positive cap is meaningful to `-t` / `trim`.
  const hasCap = typeof max === 'number' && Number.isFinite(max) && max > 0;

  if (bin === 'ffmpeg') {
    const input = ffmpegInputArgs(platform, opts.device);
    const dur = hasCap ? ['-t', String(max)] : [];
    // -y overwrite, quiet logs, 16 kHz mono PCM WAV (what whisper.cpp expects).
    return [
      '-hide_banner',
      '-loglevel',
      'error',
      '-y',
      ...input,
      ...dur,
      '-ar',
      '16000',
      '-ac',
      '1',
      outPath,
    ];
  }

  // sox family. `rec OUT` == `sox -d OUT`; both grab the default input device.
  const head = bin === 'rec' ? ['-q'] : ['-q', '-d'];
  const trim = hasCap ? ['trim', '0', String(max)] : [];
  return [...head, '-r', '16000', '-c', '1', outPath, ...trim];
}

export interface RecordToWavOpts {
  outPath: string;
  bin: RecorderBin;
  binPath: string;
  /** Abort to stop recording (the normal "user pressed Enter" path). */
  signal?: AbortSignal;
  /** Override spawn for tests. */
  exec?: typeof spawn;
  platform?: NodeJS.Platform;
  device?: string;
  maxSeconds?: number;
}

/**
 * Record from the default mic into `outPath` until `signal` aborts (or the
 * recorder exits / hits `maxSeconds`, default 60). Aborting sends SIGINT so
 * ffmpeg/sox flush a valid WAV trailer; a non-zero exit *after* an abort is
 * expected and resolves cleanly. A non-zero exit *without* an abort (e.g. no
 * microphone) rejects with the recorder's stderr.
 *
 * Every failure — including an unsupported platform/device combination
 * detected while building the argv — surfaces as a rejection, never as a
 * synchronous throw.
 *
 * @param opts - Output path, recorder selection, and optional abort signal.
 * @returns Resolves once the recorder process has closed.
 */
export function recordToWav(opts: RecordToWavOpts): Promise<void> {
  const spawnFn = opts.exec ?? spawn;
  return new Promise<void>((resolve, reject) => {
    // Built inside the executor so a throw becomes a rejection.
    const args = buildRecordArgs(opts.bin, opts.outPath, {
      platform: opts.platform,
      device: opts.device,
      maxSeconds: opts.maxSeconds ?? 60,
    });
    const child: ChildProcess = spawnFn(opts.binPath, args);
    let stderr = '';
    let aborted = false;

    const stop = (): void => {
      aborted = true;
      try {
        child.kill('SIGINT'); // ffmpeg/sox finalize the WAV on SIGINT
      } catch {
        /* already exited */
      }
    };
    // Drop the abort listener once the process is gone so a long-lived signal
    // does not retain this closure or signal an already-exited child.
    const detach = (): void => opts.signal?.removeEventListener('abort', stop);

    child.stderr?.on('data', (c: Buffer) => {
      if (stderr.length < STDERR_BUFFER_LIMIT) stderr += c.toString();
    });
    child.on('error', (err) => {
      detach();
      reject(err);
    });
    child.on('close', (code) => {
      detach();
      if (aborted || code === 0) resolve();
      else reject(new Error(`${opts.bin} exited ${code}: ${stderr.slice(0, 300).trim()}`));
    });

    if (opts.signal) {
      if (opts.signal.aborted) stop();
      else opts.signal.addEventListener('abort', stop, { once: true });
    }
  });
}

// --- end of packages/core/src/voice/record.ts ---
// Next message in this patch series (mbox separator preserved from the original line):
// From 22d709f4f932bc248c45fa8d4f30dadec5bf21e7 Mon Sep 17 00:00:00 2001 From: t Date: Mon, 8 Jun 2026 14:16:27 +0800 Subject: [PATCH 3/3] ci: trigger checks (PR retargeted to main)