3 changes: 1 addition & 2 deletions pyproject.toml
@@ -9,8 +9,7 @@ description = "Simple voice AI assistant built with LiveKit Agents for Python"
 requires-python = ">=3.10, <3.15"
 
 dependencies = [
-    "livekit-agents[silero,turn-detector]~=1.5",
-    "livekit-plugins-ai-coustics~=0.2",
+    "livekit-agents[openai,deepgram,elevenlabs,silero,turn-detector]~=1.5",
     "python-dotenv",
 ]
 
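Dropping the hosted inference gateway in favor of direct provider plugins means each plugin now authenticates on its own. A minimal sketch of a startup check, assuming the usual per-plugin environment variable names (verify against each plugin's docs):

```python
# Hypothetical sanity check for the keys the direct plugins typically expect.
# Variable names follow common LiveKit plugin conventions; confirm per plugin.
import os

from dotenv import load_dotenv  # python-dotenv is already in the dependencies

load_dotenv()
for key in ("OPENAI_API_KEY", "DEEPGRAM_API_KEY", "ELEVEN_API_KEY"):
    assert os.getenv(key), f"missing {key}, required by a direct provider plugin"
```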
21 changes: 6 additions & 15 deletions src/agent.py
@@ -9,10 +9,9 @@
     JobContext,
     JobProcess,
     cli,
-    inference,
     room_io,
 )
-from livekit.plugins import ai_coustics, silero
+from livekit.plugins import deepgram, elevenlabs, openai, silero
 from livekit.plugins.turn_detector.multilingual import MultilingualModel
 
 logger = logging.getLogger("agent")
@@ -25,7 +24,7 @@ def __init__(self) -> None:
         super().__init__(
             # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
             # See all available models at https://docs.livekit.io/agents/models/llm/
-            llm=inference.LLM(model="openai/gpt-5.2-chat-latest"),
+            llm=openai.LLM(model="gpt-5.4"),
             # To use a realtime model instead of a voice pipeline, replace the LLM
             # with a RealtimeModel and remove the STT/TTS from the AgentSession
             # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/)
@@ -107,36 +106,28 @@ async def my_agent(ctx: JobContext):
         "room": ctx.room.name,
     }
 
-    # Set up a voice AI pipeline using OpenAI, Cartesia, Deepgram, and the LiveKit turn detector
+    # Set up a voice AI pipeline using OpenAI, ElevenLabs, Deepgram, and the LiveKit turn detector
     session = AgentSession(
         # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
         # See all available models at https://docs.livekit.io/agents/models/stt/
-        stt=inference.STT(model="deepgram/nova-3", language="multi"),
+        stt=deepgram.STT(model="nova-3", language="multi"),
         # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
         # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
-        tts=inference.TTS(
-            model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
-        ),
+        tts=elevenlabs.TTS(),
         # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
         # See more at https://docs.livekit.io/agents/build/turns
         turn_detection=MultilingualModel(),
         vad=ctx.proc.userdata["vad"],
         # allow the LLM to generate a response while waiting for the end of turn
         # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
         preemptive_generation=True,
+        turn_handling={"interruption": {"mode": "vad"}},
     )
 
     # Start the session, which initializes the voice pipeline and warms up the models
     await session.start(
         agent=Assistant(),
         room=ctx.room,
-        room_options=room_io.RoomOptions(
-            audio_input=room_io.AudioInputOptions(
-                noise_cancellation=ai_coustics.audio_enhancement(
-                    model=ai_coustics.EnhancerModel.QUAIL_VF_S
-                ),
-            ),
-        ),
     )
 
     # # Add a virtual avatar to the session, if desired
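The inline comments point to swapping the STT/LLM/TTS pipeline for a realtime model. A sketch of that alternative using the `openai` plugin this PR imports; the `voice` value is an illustrative assumption, not part of this change:

```python
from livekit.agents import AgentSession
from livekit.plugins import openai

# Sketch only: a realtime model handles audio in and audio out itself,
# so the separate stt/tts (and typically vad/turn_detection) entries go away.
session = AgentSession(
    llm=openai.realtime.RealtimeModel(voice="marin"),  # voice name assumed for illustration
)
```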
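Since `elevenlabs.TTS()` is constructed with no arguments, voice and model fall back to plugin defaults, which can shift across plugin upgrades. A sketch of pinning them explicitly, assuming the plugin accepts `voice_id` and `model` keyword arguments (check the installed plugin's signature); both values are placeholders:

```python
from livekit.plugins import elevenlabs

# Placeholder IDs, not recommendations; pick real ones from your ElevenLabs
# account and verify the keyword arguments against the plugin's docs.
tts = elevenlabs.TTS(
    voice_id="YOUR_VOICE_ID",
    model="eleven_turbo_v2_5",
)
```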