Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 84 additions & 31 deletions app/agents/voice/automatic/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
import asyncio
import os
import random
import argparse
from dotenv import load_dotenv
import wave
from datetime import datetime
from zoneinfo import ZoneInfo
from pydub import AudioSegment
import audioop

from dotenv import load_dotenv
Copy link

Copilot AI Sep 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicate import of load_dotenv - it's imported again on line 13 and was already imported earlier. Remove the duplicate import.

Suggested change
from dotenv import load_dotenv

Copilot uses AI. Check for mistakes.
from langfuse import get_client
Expand All @@ -20,7 +24,12 @@
LLMFullResponseEndFrame,
OutputAudioRawFrame,
TTSSpeakFrame,
OutputAudioRawFrame,
LLMFullResponseStartFrame,
FunctionCallInProgressFrame,
FunctionCallResultFrame
)

from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
Expand Down Expand Up @@ -54,10 +63,23 @@

from .processors import LLMSpyProcessor
from .processors.ptt_vad_filter import PTTVADFilter
from .processors.user_speaking_audio import UserSpeakingAudioProcessor
from .prompts import get_system_prompt
from .stt import get_stt_service
from .tools import initialize_tools
from .tts import get_tts_service
from .stt import get_stt_service
from .audio.audio_manager import initialize_audio_manager, get_audio_manager
from app.agents.voice.automatic.processors.llm_spy import handle_confirmation_response
from app.agents.voice.automatic.types import (
TTSProvider,
Mode,
decode_tts_provider,
decode_voice_name,
decode_mode,
)
from opentelemetry import trace
from langfuse import get_client
from .types import (
Mode,
TTSProvider,
Expand Down Expand Up @@ -212,6 +234,9 @@ async def main():
enable_chart_text_filter=config.ENABLE_CHARTS,
)

# Initialize audio manager for looping waiting audio
audio_manager = initialize_audio_manager(tts)

llm = LLMServiceWrapper(
AzureLLMService(
api_key=config.AZURE_OPENAI_API_KEY,
Expand Down Expand Up @@ -273,39 +298,61 @@ async def main():

rtvi = RTVIProcessor(config=RTVIConfig(config=[]))

# Simplified event handler for TTS feedback
# LLM response started handler - DO NOT start audio here
# Audio should only start when user stops speaking, not when LLM processes
@llm.event_handler("on_llm_response_started")
async def on_llm_response_started(service,function_calls):
Copy link

Copilot AI Sep 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space after comma in function parameter list. Should be service, function_calls.

Suggested change
async def on_llm_response_started(service,function_calls):
async def on_llm_response_started(service, function_calls):

Copilot uses AI. Check for mistakes.
logger.info(f"Function calls started event triggered with {len(function_calls)} calls")

# commented out older waiting audio start code - audio is now managed by user speech events
# # Just log that LLM started - audio is managed by user speech events
# if tts_provider == TTSProvider.GOOGLE:
# for function_call in function_calls:
# # Skip "checking" message for instant functions and chart tools
# if function_call.function_name not in [
# "get_current_time",
# "generate_bar_chart",
# "generate_line_chart",
# "generate_donut_chart",
# "generate_single_stat_card",
# ]:
# phrases = [
# "Let me check on that.",
# "Give me a moment to do that.",
# "I'll get right on that.",
# "Working on that for you.",
# "One moment — I'm on it",
# "One second, boss.",
# "On it, boss!",
# "Just a second, captain.",
# ]
# await tts.queue_frame(TTSSpeakFrame(random.choice(phrases)))
# break

# Keep function call handler for debugging
@llm.event_handler("on_function_calls_started")
async def on_function_calls_started(service, function_calls):
# Only play the "checking" message if using Google TTS
if tts_provider == TTSProvider.GOOGLE:
for function_call in function_calls:
# Skip "checking" message for instant functions and chart tools
instant_functions = [
"get_current_time",
"utility__getCurrentTime", # NeuroLink equivalent
"utility__generateTimestamp", # NeuroLink timestamp tool
"generate_bar_chart",
"generate_line_chart",
"generate_donut_chart",
"generate_single_stat_card",
]
if function_call.function_name not in instant_functions:
# Play tool call sound if enabled, otherwise use phrases
if tool_call_sound:
await transport.send_audio(tool_call_sound)
else:
phrases = [
"Let me check on that.",
"Give me a moment to do that.",
"I'll get right on that.",
"Working on that for you.",
"One moment — I'm on it",
"One second, boss.",
"On it, boss!",
"Just a second, captain.",
]
await tts.queue_frame(TTSSpeakFrame(random.choice(phrases)))
break
logger.info(f"Function calls started event triggered with {len(function_calls)} calls")
# Audio already started by LLM response started, just log
for function_call in function_calls:
logger.debug(f"Function call: {function_call.function_name}")


# Remove duplicate LLM response started handler - moved above

# Only stop audio when function calls complete if audio is actually playing
@llm.event_handler("on_function_calls_finished")
async def on_function_calls_finished(service):
    """Halt the looping waiting audio the moment every function call has finished.

    Fired by the LLM service once all in-flight tool calls complete; the model's
    spoken response is imminent, so any "please wait" audio must stop right away.
    """
    # Guard clause: if no audio is active and the user queued no input, just log.
    audio_active = audio_manager.user_has_input or audio_manager.is_playing
    if not audio_active:
        logger.info("Function calls finished - audio not playing, no action needed")
        return

    logger.info("🔧 Function calls finished - IMMEDIATELY stopping audio")
    # Single simplified call both stops playback and disables further audio.
    await audio_manager.stop_and_disable_audio()
    logger.info("🛑 Audio IMMEDIATELY stopped - function calls finished")

messages = [
{"role": "system", "content": system_prompt},
Expand Down Expand Up @@ -334,6 +381,12 @@ async def on_function_calls_started(service, function_calls):
ptt_vad_filter = PTTVADFilter("PTTVADFilter")
pipeline_components.append(ptt_vad_filter) # Filter VAD frames after STT

# Add user speaking audio processor - manages audio based on user speech
logger.info("🔧 Creating UserSpeakingAudioProcessor...")
user_speaking_processor = UserSpeakingAudioProcessor("UserSpeakingAudioProcessor")
pipeline_components.append(user_speaking_processor) # Add after STT/PTT, before RTVI
logger.info(f"✅ UserSpeakingAudioProcessor added to pipeline at position {len(pipeline_components)-1}")

pipeline_components.extend([rtvi, context_aggregator.user()])
if (
config.MEM0_ENABLED
Expand Down
Loading