Skip to content
Merged
8 changes: 0 additions & 8 deletions minitap/mobile_use/agents/contextor/contextor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from minitap.mobile_use.agents.executor.utils import is_last_tool_message_take_screenshot
from minitap.mobile_use.context import MobileUseContext
from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
from minitap.mobile_use.controllers.platform_specific_commands_controller import (
Expand Down Expand Up @@ -26,16 +25,9 @@ async def __call__(self, state: State):
focused_app_info = get_focused_app_info(self.ctx)
device_date = get_device_date(self.ctx)

should_add_screenshot_context = is_last_tool_message_take_screenshot(
list(state.executor_messages)
)

return await state.asanitize_update(
ctx=self.ctx,
update={
"latest_screenshot_base64": device_data.base64
if should_add_screenshot_context
else None,
"latest_ui_hierarchy": device_data.elements,
"focused_app_info": focused_app_info,
"screen_size": (device_data.width, device_data.height),
Expand Down
6 changes: 0 additions & 6 deletions minitap/mobile_use/agents/cortex/cortex.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from minitap.mobile_use.graph.state import State
from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
from minitap.mobile_use.utils.logger import get_logger

Expand Down Expand Up @@ -62,10 +61,6 @@ async def __call__(self, state: State):
for thought in state.agents_thoughts:
messages.append(AIMessage(content=thought))

if state.latest_screenshot_base64:
messages.append(get_screenshot_message_for_llm(state.latest_screenshot_base64))
logger.info("Added screenshot to context")

if state.latest_ui_hierarchy:
ui_hierarchy_dict: list[dict] = state.latest_ui_hierarchy
ui_hierarchy_str = json.dumps(ui_hierarchy_dict, indent=2, ensure_ascii=False)
Expand Down Expand Up @@ -121,7 +116,6 @@ async def __call__(self, state: State):
"structured_decisions": response.decisions,
"complete_subgoals_by_ids": response.complete_subgoals_by_ids,
"screen_analysis_prompt": response.screen_analysis_prompt,
"latest_screenshot_base64": None,
"latest_ui_hierarchy": None,
"focused_app_info": None,
"device_date": None,
Expand Down
15 changes: 12 additions & 3 deletions minitap/mobile_use/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,10 +244,19 @@ def get_default_minitap_llm_config() -> LLMConfig | None:


def deep_merge_llm_config(default: LLMConfig, override: dict) -> LLMConfig:
def _deep_merge_dict(base: dict, extra: dict):
def _deep_merge_dict(base: dict, extra: dict, path: str = ""):
for key, value in extra.items():
if isinstance(value, dict):
_deep_merge_dict(base[key], value)
current_path = f"{path}.{key}" if path else key

if key not in base:
logger.warning(
f"Unsupported config key '{current_path}' found in override config. "
f"Ignoring this key."
)
continue

if isinstance(value, dict) and isinstance(base[key], dict):
_deep_merge_dict(base[key], value, current_path)
else:
base[key] = value

Expand Down
20 changes: 6 additions & 14 deletions minitap/mobile_use/controllers/mobile_command_controller.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import time
import uuid
from enum import Enum

Expand Down Expand Up @@ -554,18 +555,10 @@ def press_key(ctx: MobileUseContext, key: Key, dry_run: bool = False):
#### Other commands ####


class WaitTimeout(Enum):
SHORT = "500"
MEDIUM = "1000"
LONG = "5000"


def wait_for_animation_to_end(
ctx: MobileUseContext, timeout: WaitTimeout | None = None, dry_run: bool = False
):
if timeout is None:
return run_flow(ctx, ["waitForAnimationToEnd"], dry_run=dry_run)
return run_flow(ctx, [{"waitForAnimationToEnd": {"timeout": timeout.value}}], dry_run=dry_run)
def wait_for_delay(time_in_ms: int):
    """Block the calling thread for ``time_in_ms`` milliseconds.

    The millisecond value is converted to seconds and passed to
    ``time.sleep``. Nothing is returned (the result is always ``None``).
    """
    seconds = time_in_ms / 1000
    time.sleep(seconds)


def run_flow_with_wait_for_animation_to_end(
Expand All @@ -575,7 +568,7 @@ def run_flow_with_wait_for_animation_to_end(
wait_for_animation_to_end: bool = False,
):
if wait_for_animation_to_end:
base_flow.append({"waitForAnimationToEnd": {"timeout": int(WaitTimeout.SHORT.value)}})
base_flow.append({"waitForAnimationToEnd": {"timeout": 500}})
return run_flow(ctx, base_flow, dry_run=dry_run)


Expand Down Expand Up @@ -603,7 +596,6 @@ def run_flow_with_wait_for_animation_to_end(
messages=[],
initial_goal="",
subgoal_plan=[],
latest_screenshot_base64=screen_data.base64,
focused_app_info=None,
device_date="",
structured_decisions=None,
Expand Down
1 change: 0 additions & 1 deletion minitap/mobile_use/graph/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ class State(AgentStatePydantic):
subgoal_plan: Annotated[list[Subgoal], "The current plan, made of subgoals"]

# contextor related keys
latest_screenshot_base64: Annotated[str | None, "Latest screenshot of the device", take_last]
latest_ui_hierarchy: Annotated[
list[dict] | None, "Latest UI hierarchy of the device", take_last
]
Expand Down
1 change: 0 additions & 1 deletion minitap/mobile_use/sdk/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,6 @@ def _get_graph_state(self, task: Task):
initial_goal=task.request.goal,
subgoal_plan=[],
latest_ui_hierarchy=None,
latest_screenshot_base64=None,
focused_app_info=None,
device_date=None,
structured_decisions=None,
Expand Down
6 changes: 3 additions & 3 deletions minitap/mobile_use/tools/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from minitap.mobile_use.tools.mobile.stop_app import stop_app_wrapper
from minitap.mobile_use.tools.mobile.swipe import swipe_wrapper
from minitap.mobile_use.tools.mobile.tap import tap_wrapper
from minitap.mobile_use.tools.mobile.wait_for_animation_to_end import (
wait_for_animation_to_end_wrapper,
from minitap.mobile_use.tools.mobile.wait_for_delay import (
wait_for_delay_wrapper,
)
from minitap.mobile_use.tools.tool_wrapper import CompositeToolWrapper, ToolWrapper

Expand All @@ -29,7 +29,7 @@
stop_app_wrapper,
clear_text_wrapper,
press_key_wrapper,
wait_for_animation_to_end_wrapper,
wait_for_delay_wrapper,
]


Expand Down
64 changes: 0 additions & 64 deletions minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py

This file was deleted.

84 changes: 84 additions & 0 deletions minitap/mobile_use/tools/mobile/wait_for_delay.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import asyncio
from typing import Annotated

from langchain_core.messages import ToolMessage
from langchain_core.tools import tool
from langchain_core.tools.base import InjectedToolCallId
from langgraph.prebuilt import InjectedState
from langgraph.types import Command

from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
from minitap.mobile_use.context import MobileUseContext
from minitap.mobile_use.controllers.mobile_command_controller import (
wait_for_delay as wait_for_delay_controller,
)
from minitap.mobile_use.graph.state import State
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper

# Upper bound, in milliseconds, applied to any delay requested through the
# wait_for_delay tool below (60000 ms = 60 seconds).
MAX_DELAY_MS = 60000


def get_wait_for_delay_tool(ctx: MobileUseContext):
    """Build the ``wait_for_delay`` tool bound to the given context.

    The returned coroutine is wrapped by ``@tool``. It sleeps (in a worker
    thread) for the requested number of milliseconds, then reports the outcome
    through a ``ToolMessage`` and a sanitized state update attributed to the
    ``"executor"`` agent.
    """

    @tool
    async def wait_for_delay(
        tool_call_id: Annotated[str, InjectedToolCallId],
        state: Annotated[State, InjectedState],
        agent_thought: str,
        time_in_ms: int,
    ) -> Command:
        """
        Wait for a delay in milliseconds.

        This tool pauses execution for a specified number of milliseconds.
        Use this when you need to introduce a controlled delay to allow the UI
        to update after an action, regardless of whether an animation is playing.

        Args:
            time_in_ms: The number of milliseconds to wait. (capped at 60 seconds)


        Example:
            - wait_for_delay with time_in_ms=1000 (waits 1 second)
            - wait_for_delay with time_in_ms=500 (waits 0.5 seconds)
        """
        # Negative requests fall back to one second; the result is then capped
        # at MAX_DELAY_MS.
        requested_ms = 1000 if time_in_ms < 0 else time_in_ms
        effective_ms = min(requested_ms, MAX_DELAY_MS)

        error_text = None
        try:
            # The controller sleeps synchronously, so run it off the event loop.
            await asyncio.to_thread(wait_for_delay_controller, effective_ms)
        except Exception as exc:
            error_text = str(exc)
            failed = True
        else:
            failed = False

        if failed:
            agent_outcome = wait_for_delay_wrapper.on_failure_fn()
            message_kwargs = {"error": error_text}
            message_status = "error"
        else:
            agent_outcome = wait_for_delay_wrapper.on_success_fn(effective_ms)
            message_kwargs = {}
            message_status = "success"

        tool_message = ToolMessage(
            tool_call_id=tool_call_id,
            content=agent_outcome,
            additional_kwargs=message_kwargs,
            status=message_status,
        )

        sanitized_update = await state.asanitize_update(
            ctx=ctx,
            update={
                "agents_thoughts": [agent_thought, agent_outcome],
                EXECUTOR_MESSAGES_KEY: [tool_message],
            },
            agent="executor",
        )
        return Command(update=sanitized_update)

    return wait_for_delay


# Wrapper pairing the wait_for_delay tool factory with its outcome messages.
# on_success_fn is called with the delay (in milliseconds) that was actually
# waited, after clamping; on_failure_fn takes no arguments.
wait_for_delay_wrapper = ToolWrapper(
tool_fn_getter=get_wait_for_delay_tool,
on_success_fn=lambda delay: f"Successfully waited for {delay} milliseconds.",
on_failure_fn=lambda: "Failed to wait for delay.",
)