diff --git a/minitap/mobile_use/agents/contextor/contextor.py b/minitap/mobile_use/agents/contextor/contextor.py index aff31a10..c3a4421a 100644 --- a/minitap/mobile_use/agents/contextor/contextor.py +++ b/minitap/mobile_use/agents/contextor/contextor.py @@ -1,4 +1,3 @@ -from minitap.mobile_use.agents.executor.utils import is_last_tool_message_take_screenshot from minitap.mobile_use.context import MobileUseContext from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data from minitap.mobile_use.controllers.platform_specific_commands_controller import ( @@ -26,16 +25,9 @@ async def __call__(self, state: State): focused_app_info = get_focused_app_info(self.ctx) device_date = get_device_date(self.ctx) - should_add_screenshot_context = is_last_tool_message_take_screenshot( - list(state.executor_messages) - ) - return await state.asanitize_update( ctx=self.ctx, update={ - "latest_screenshot_base64": device_data.base64 - if should_add_screenshot_context - else None, "latest_ui_hierarchy": device_data.elements, "focused_app_info": focused_app_info, "screen_size": (device_data.width, device_data.height), diff --git a/minitap/mobile_use/agents/cortex/cortex.py b/minitap/mobile_use/agents/cortex/cortex.py index 256d9201..8b546590 100644 --- a/minitap/mobile_use/agents/cortex/cortex.py +++ b/minitap/mobile_use/agents/cortex/cortex.py @@ -18,7 +18,6 @@ from minitap.mobile_use.graph.state import State from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list -from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm from minitap.mobile_use.utils.decorators import wrap_with_callbacks from minitap.mobile_use.utils.logger import get_logger @@ -62,10 +61,6 @@ async def __call__(self, state: State): for thought in state.agents_thoughts: messages.append(AIMessage(content=thought)) - if 
state.latest_screenshot_base64: - messages.append(get_screenshot_message_for_llm(state.latest_screenshot_base64)) - logger.info("Added screenshot to context") - if state.latest_ui_hierarchy: ui_hierarchy_dict: list[dict] = state.latest_ui_hierarchy ui_hierarchy_str = json.dumps(ui_hierarchy_dict, indent=2, ensure_ascii=False) @@ -121,7 +116,6 @@ async def __call__(self, state: State): "structured_decisions": response.decisions, "complete_subgoals_by_ids": response.complete_subgoals_by_ids, "screen_analysis_prompt": response.screen_analysis_prompt, - "latest_screenshot_base64": None, "latest_ui_hierarchy": None, "focused_app_info": None, "device_date": None, diff --git a/minitap/mobile_use/config.py b/minitap/mobile_use/config.py index 0955a4ef..4c03b331 100644 --- a/minitap/mobile_use/config.py +++ b/minitap/mobile_use/config.py @@ -244,10 +244,19 @@ def get_default_minitap_llm_config() -> LLMConfig | None: def deep_merge_llm_config(default: LLMConfig, override: dict) -> LLMConfig: - def _deep_merge_dict(base: dict, extra: dict): + def _deep_merge_dict(base: dict, extra: dict, path: str = ""): for key, value in extra.items(): - if isinstance(value, dict): - _deep_merge_dict(base[key], value) + current_path = f"{path}.{key}" if path else key + + if key not in base: + logger.warning( + f"Unsupported config key '{current_path}' found in override config. " + f"Ignoring this key." 
def wait_for_delay(time_in_ms: int):
    """Block the calling thread for ``time_in_ms`` milliseconds.

    Args:
        time_in_ms: Delay to wait, in milliseconds. Must be non-negative.

    Returns:
        Always ``None``.

    Raises:
        ValueError: If ``time_in_ms`` is negative.
    """
    # time.sleep() already rejects negative durations with ValueError, but its
    # message is expressed in seconds. Validate up front so the error names the
    # millisecond value the caller actually passed (same exception type).
    if time_in_ms < 0:
        raise ValueError(f"time_in_ms must be non-negative, got {time_in_ms}")

    # time.sleep() takes seconds, so convert from milliseconds.
    time.sleep(time_in_ms / 1000)
    return None
State(AgentStatePydantic): subgoal_plan: Annotated[list[Subgoal], "The current plan, made of subgoals"] # contextor related keys - latest_screenshot_base64: Annotated[str | None, "Latest screenshot of the device", take_last] latest_ui_hierarchy: Annotated[ list[dict] | None, "Latest UI hierarchy of the device", take_last ] diff --git a/minitap/mobile_use/sdk/agent.py b/minitap/mobile_use/sdk/agent.py index ed6827e0..90e4ddf8 100644 --- a/minitap/mobile_use/sdk/agent.py +++ b/minitap/mobile_use/sdk/agent.py @@ -559,7 +559,6 @@ def _get_graph_state(self, task: Task): initial_goal=task.request.goal, subgoal_plan=[], latest_ui_hierarchy=None, - latest_screenshot_base64=None, focused_app_info=None, device_date=None, structured_decisions=None, diff --git a/minitap/mobile_use/tools/index.py b/minitap/mobile_use/tools/index.py index 8e624c50..ff8c7a6f 100644 --- a/minitap/mobile_use/tools/index.py +++ b/minitap/mobile_use/tools/index.py @@ -12,8 +12,8 @@ from minitap.mobile_use.tools.mobile.stop_app import stop_app_wrapper from minitap.mobile_use.tools.mobile.swipe import swipe_wrapper from minitap.mobile_use.tools.mobile.tap import tap_wrapper -from minitap.mobile_use.tools.mobile.wait_for_animation_to_end import ( - wait_for_animation_to_end_wrapper, +from minitap.mobile_use.tools.mobile.wait_for_delay import ( + wait_for_delay_wrapper, ) from minitap.mobile_use.tools.tool_wrapper import CompositeToolWrapper, ToolWrapper @@ -29,7 +29,7 @@ stop_app_wrapper, clear_text_wrapper, press_key_wrapper, - wait_for_animation_to_end_wrapper, + wait_for_delay_wrapper, ] diff --git a/minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py b/minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py deleted file mode 100644 index 68ff91e0..00000000 --- a/minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +++ /dev/null @@ -1,64 +0,0 @@ -from langchain_core.messages import ToolMessage -from langchain_core.tools import tool -from langchain_core.tools.base import 
InjectedToolCallId -from langgraph.prebuilt import InjectedState -from langgraph.types import Command -from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY -from minitap.mobile_use.context import MobileUseContext -from minitap.mobile_use.controllers.mobile_command_controller import WaitTimeout -from minitap.mobile_use.controllers.mobile_command_controller import ( - wait_for_animation_to_end as wait_for_animation_to_end_controller, -) -from minitap.mobile_use.graph.state import State -from minitap.mobile_use.tools.tool_wrapper import ToolWrapper -from typing import Annotated - - -def get_wait_for_animation_to_end_tool(ctx: MobileUseContext): - @tool - async def wait_for_animation_to_end( - tool_call_id: Annotated[str, InjectedToolCallId], - state: Annotated[State, InjectedState], - agent_thought: str, - timeout: WaitTimeout | None, - ) -> Command: - """ - Waits for ongoing animations or videos to finish before continuing. - - If a `timeout` (in milliseconds) is set, the command proceeds after the timeout even if - the animation hasn't ended. - The flow continues immediately once the animation is detected as complete. 
def get_wait_for_delay_tool(ctx: MobileUseContext):
    # Factory: builds the `wait_for_delay` tool with `ctx` captured by closure,
    # so the tool can sanitize its state update against that context.

    @tool
    async def wait_for_delay(
        tool_call_id: Annotated[str, InjectedToolCallId],
        state: Annotated[State, InjectedState],
        agent_thought: str,
        time_in_ms: int,
    ) -> Command:
        """
        Wait for a delay in milliseconds.

        This tool pauses execution for a specified number of milliseconds.
        Use this when you need to introduce a controlled delay to allow the UI
        to update after an action, regardless of whether an animation is playing.

        Args:
            time_in_ms: The number of milliseconds to wait. (capped at 60 seconds)


        Example:
            - wait_for_delay with time_in_ms=1000 (waits 1 second)
            - wait_for_delay with time_in_ms=500 (waits 0.5 seconds)
        """
        # Normalize the request: a negative value falls back to one second,
        # and anything above MAX_DELAY_MS is capped to it.
        requested_ms = 1000 if time_in_ms < 0 else time_in_ms
        effective_ms = min(requested_ms, MAX_DELAY_MS)

        # The controller sleeps synchronously, so run it in a worker thread to
        # keep the event loop free. `error_text` stays None unless it raised.
        error_text = None
        try:
            await asyncio.to_thread(wait_for_delay_controller, effective_ms)
        except Exception as exc:
            error_text = str(exc)

        if error_text is None:
            agent_outcome = wait_for_delay_wrapper.on_success_fn(effective_ms)
            message_status = "success"
            extra_kwargs = {}
        else:
            agent_outcome = wait_for_delay_wrapper.on_failure_fn()
            message_status = "error"
            extra_kwargs = {"error": error_text}

        tool_message = ToolMessage(
            tool_call_id=tool_call_id,
            content=agent_outcome,
            additional_kwargs=extra_kwargs,
            status=message_status,
        )

        # Record both the agent's reasoning and the outcome in its thoughts,
        # and append the tool message to the executor's message list.
        sanitized_update = await state.asanitize_update(
            ctx=ctx,
            update={
                "agents_thoughts": [agent_thought, agent_outcome],
                EXECUTOR_MESSAGES_KEY: [tool_message],
            },
            agent="executor",
        )
        return Command(update=sanitized_update)

    return wait_for_delay