Skip to content

Commit 6071b34

Browse files
hangfei authored and copybara-github committed
feat: Implement Activity Start and End signals in LiveRequestQueue and BaseLLMConnection
This change adds activity start and end signals to the LiveRequestQueue, allowing clients to manually control the start and end of user input in streaming sessions when automatic voice activity detection is disabled. The LiveRequestQueue allows users to send messages to the model with the following semantics: - `content`: sends turn-by-turn content. - `blob`: sends a media blob for realtime streaming (e.g., audio). - `activity_start`: indicates the beginning of an activity. - `activity_end`: indicates the end of an activity. - `close`: closes the connection. GeminiLLMConnection has been updated to send the new activity signals to the backend. This change is a necessary to support clients (e.g. voice assistants) that do not want to use automatic voice activity detection. In this case, the client will be responsible to send the `activity_start` signal when the user starts talking, and `activity_end` when the user finishes talking. To test the change: run_config = RunConfig( realtime_input_config=types.RealtimeInputConfig( automatic_activity_detection=types.AutomaticActivityDetection( disabled=True, ), ) ) import threading # Add this import def thread_target(): # Define the async operations to run in the background. async def background_task(): live_request_queue.send_activity_start() # live_request_queue.send_content( # content=types.Content( # role='user', # parts=[types.Part.from_text(text="hi, what's the time?")], # ) # ) await asyncio.sleep(3) live_request_queue.send_activity_end() PiperOrigin-RevId: 783882447
1 parent b89aac9 commit 6071b34

File tree

4 files changed

+41
-7
lines changed

4 files changed

+41
-7
lines changed

src/google/adk/agents/live_request_queue.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,15 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from __future__ import annotations
16+
1517
import asyncio
1618
from typing import Optional
1719

1820
from google.genai import types
1921
from pydantic import BaseModel
2022
from pydantic import ConfigDict
23+
from pydantic import field_validator
2124

2225

2326
class LiveRequest(BaseModel):
@@ -30,6 +33,10 @@ class LiveRequest(BaseModel):
3033
"""If set, send the content to the model in turn-by-turn mode."""
3134
blob: Optional[types.Blob] = None
3235
"""If set, send the blob to the model in realtime mode."""
36+
activity_start: Optional[types.ActivityStart] = None
37+
"""If set, signal the start of user activity to the model."""
38+
activity_end: Optional[types.ActivityEnd] = None
39+
"""If set, signal the end of user activity to the model."""
3340
close: bool = False
3441
"""If set, close the queue. queue.shutdown() is only supported in Python 3.13+."""
3542

@@ -58,6 +65,14 @@ def send_content(self, content: types.Content):
5865
def send_realtime(self, blob: types.Blob):
5966
self._queue.put_nowait(LiveRequest(blob=blob))
6067

68+
def send_activity_start(self):
69+
"""Sends an activity start signal to mark the beginning of user input."""
70+
self._queue.put_nowait(LiveRequest(activity_start=types.ActivityStart()))
71+
72+
def send_activity_end(self):
73+
"""Sends an activity end signal to mark the end of user input."""
74+
self._queue.put_nowait(LiveRequest(activity_end=types.ActivityEnd()))
75+
6176
def send(self, req: LiveRequest):
6277
self._queue.put_nowait(req)
6378

src/google/adk/flows/llm_flows/base_llm_flow.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,12 @@ async def _send_to_model(
194194
if live_request.close:
195195
await llm_connection.close()
196196
return
197-
if live_request.blob:
197+
198+
if live_request.activity_start:
199+
await llm_connection.send_realtime(types.ActivityStart())
200+
elif live_request.activity_end:
201+
await llm_connection.send_realtime(types.ActivityEnd())
202+
elif live_request.blob:
198203
# Cache audio data here for transcription
199204
if not invocation_context.transcription_cache:
200205
invocation_context.transcription_cache = []
@@ -205,6 +210,7 @@ async def _send_to_model(
205210
TranscriptionEntry(role='user', data=live_request.blob)
206211
)
207212
await llm_connection.send_realtime(live_request.blob)
213+
208214
if live_request.content:
209215
await llm_connection.send_content(live_request.content)
210216

src/google/adk/models/base_llm_connection.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from __future__ import annotations
16+
1517
from abc import abstractmethod
1618
from typing import AsyncGenerator
1719

src/google/adk/models/gemini_llm_connection.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import logging
1818
from typing import AsyncGenerator
19+
from typing import Union
1920

2021
from google.genai import live
2122
from google.genai import types
@@ -25,6 +26,8 @@
2526

2627
logger = logging.getLogger('google_adk.' + __name__)
2728

29+
RealtimeInput = Union[types.Blob, types.ActivityStart, types.ActivityEnd]
30+
2831

2932
class GeminiLlmConnection(BaseLlmConnection):
3033
"""The Gemini model connection."""
@@ -93,16 +96,24 @@ async def send_content(self, content: types.Content):
9396
)
9497
)
9598

96-
async def send_realtime(self, blob: types.Blob):
99+
async def send_realtime(self, input: RealtimeInput):
97100
"""Sends a chunk of audio or a frame of video to the model in realtime.
98101
99102
Args:
100-
blob: The blob to send to the model.
103+
input: The input to send to the model.
101104
"""
102-
103-
input_blob = blob.model_dump()
104-
logger.debug('Sending LLM Blob: %s', input_blob)
105-
await self._gemini_session.send(input=input_blob)
105+
if isinstance(input, types.Blob):
106+
input_blob = input.model_dump()
107+
logger.debug('Sending LLM Blob: %s', input_blob)
108+
await self._gemini_session.send(input=input_blob)
109+
elif isinstance(input, types.ActivityStart):
110+
logger.debug('Sending LLM activity start signal')
111+
await self._gemini_session.send_realtime_input(activity_start=input)
112+
elif isinstance(input, types.ActivityEnd):
113+
logger.debug('Sending LLM activity end signal')
114+
await self._gemini_session.send_realtime_input(activity_end=input)
115+
else:
116+
raise ValueError('Unsupported input type: %s' % type(input))
106117

107118
def __build_full_text_response(self, text: str):
108119
"""Builds a full text response.

0 commit comments

Comments
 (0)