|
| 1 | +# Copyright 2025 Google LLC |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +from __future__ import annotations |
| 16 | + |
| 17 | +import abc |
| 18 | +from typing import Literal |
| 19 | + |
| 20 | +from google.genai.types import Environment |
| 21 | +import pydantic |
| 22 | + |
| 23 | +from ...utils.feature_decorator import experimental |
| 24 | + |
| 25 | + |
@experimental
class EnvironmentState(pydantic.BaseModel):
    """Snapshot of the computer environment at a single point in time.

    Attributes:
        screenshot: Raw bytes of a PNG-encoded screenshot.
        url: URL of the webpage currently displayed. It must contain at
            least one non-whitespace character.
    """

    screenshot: bytes = pydantic.Field(
        ..., description="Screenshot in PNG format"
    )
    url: str = pydantic.Field(..., description="Current webpage URL")

    @pydantic.field_validator("url")
    @classmethod
    def validate_url(cls, v: str) -> str:
        """Reject URLs that are empty or consist only of whitespace.

        Args:
            v: The candidate URL value.

        Returns:
            The URL exactly as supplied; it is not stripped or normalized.

        Raises:
            ValueError: If the URL has no non-whitespace characters.
        """
        # Success path first: anything left after stripping is acceptable.
        if v.strip():
            return v
        raise ValueError("URL cannot be empty")
| 47 | + |
| 48 | + |
@experimental
class Computer(abc.ABC):
    """Async interface for driving a computer environment.

    Concrete subclasses implement the abstract coroutines below to control
    an environment such as a web browser or another interactive system.
    Apart from `screen_size`, every abstract coroutine returns an
    `EnvironmentState`.
    """

    @abc.abstractmethod
    async def screen_size(self) -> tuple[int, int]:
        """Reports the dimensions of the environment's screen.

        Returns:
            A (width, height) tuple, measured in pixels.
        """

    @abc.abstractmethod
    async def open_web_browser(self) -> EnvironmentState:
        """Opens the web browser.

        Returns:
            The environment state once the browser has been opened.
        """

    @abc.abstractmethod
    async def click_at(self, x: int, y: int) -> EnvironmentState:
        """Clicks on the webpage at the given x, y coordinate.

        `x` and `y` are absolute values, scaled to the width and height of
        the screen.

        Args:
            x: Horizontal coordinate of the click.
            y: Vertical coordinate of the click.

        Returns:
            The environment state after the click.
        """

    @abc.abstractmethod
    async def hover_at(self, x: int, y: int) -> EnvironmentState:
        """Hovers over the webpage at the given x, y coordinate.

        Useful for revealing sub-menus that only appear on hover. `x` and
        `y` are absolute values, scaled to the width and height of the
        screen.

        Args:
            x: Horizontal coordinate to hover over.
            y: Vertical coordinate to hover over.

        Returns:
            The environment state after hovering.
        """

    @abc.abstractmethod
    async def type_text_at(
        self,
        x: int,
        y: int,
        text: str,
        press_enter: bool = True,
        clear_before_typing: bool = True,
    ) -> EnvironmentState:
        """Types text at the given x, y coordinate.

        By default ENTER is pressed once the text has been typed; pass
        `press_enter=False` to suppress that. By default any existing
        content is cleared before `text` is typed; pass
        `clear_before_typing=False` to keep it. `x` and `y` are absolute
        values, scaled to the width and height of the screen.

        Args:
            x: Horizontal coordinate to type at.
            y: Vertical coordinate to type at.
            text: The text to enter.
            press_enter: Whether ENTER is pressed after typing.
            clear_before_typing: Whether existing content is cleared first.

        Returns:
            The environment state after typing.
        """

    @abc.abstractmethod
    async def scroll_document(
        self, direction: Literal["up", "down", "left", "right"]
    ) -> EnvironmentState:
        """Scrolls the whole webpage in the requested direction.

        Args:
            direction: One of "up", "down", "left" or "right".

        Returns:
            The environment state after scrolling.
        """

    @abc.abstractmethod
    async def scroll_at(
        self,
        x: int,
        y: int,
        direction: Literal["up", "down", "left", "right"],
        magnitude: int,
    ) -> EnvironmentState:
        """Scrolls at the given x, y coordinate by `magnitude`.

        `x` and `y` are absolute values, scaled to the width and height of
        the screen.

        Args:
            x: Horizontal coordinate to scroll at.
            y: Vertical coordinate to scroll at.
            direction: One of "up", "down", "left" or "right".
            magnitude: How far to scroll.

        Returns:
            The environment state after scrolling.
        """

    @abc.abstractmethod
    async def wait_5_seconds(self) -> EnvironmentState:
        """Waits 5 seconds so unfinished webpage processes can complete.

        Returns:
            The environment state after the wait.
        """

    @abc.abstractmethod
    async def go_back(self) -> EnvironmentState:
        """Goes back to the previous webpage in the browser history.

        Returns:
            The environment state after navigating back.
        """

    @abc.abstractmethod
    async def go_forward(self) -> EnvironmentState:
        """Goes forward to the next webpage in the browser history.

        Returns:
            The environment state after navigating forward.
        """

    @abc.abstractmethod
    async def search(self) -> EnvironmentState:
        """Jumps straight to a search engine home page.

        Intended for cases where work should begin with a search, for
        example when the current website lacks the needed information or a
        new task is starting.

        Returns:
            The environment state after navigating to search.
        """

    @abc.abstractmethod
    async def navigate(self, url: str) -> EnvironmentState:
        """Goes directly to the given URL.

        Args:
            url: The address to navigate to.

        Returns:
            The environment state after navigation.
        """

    @abc.abstractmethod
    async def key_combination(self, keys: list[str]) -> EnvironmentState:
        """Presses a keyboard key or key combination.

        Examples of what may be pressed are "control+c" or "enter".

        Args:
            keys: The keys to press in combination.

        Returns:
            The environment state after the key press.
        """

    @abc.abstractmethod
    async def drag_and_drop(
        self, x: int, y: int, destination_x: int, destination_y: int
    ) -> EnvironmentState:
        """Drags an element from one coordinate and drops it at another.

        The drag starts at (`x`, `y`) and ends at (`destination_x`,
        `destination_y`). All four values are absolute values, scaled to
        the width and height of the screen.

        Args:
            x: Horizontal coordinate where the drag starts.
            y: Vertical coordinate where the drag starts.
            destination_x: Horizontal coordinate of the drop.
            destination_y: Vertical coordinate of the drop.

        Returns:
            The environment state after the drag and drop.
        """

    @abc.abstractmethod
    async def current_state(self) -> EnvironmentState:
        """Reports the state of the current webpage.

        Returns:
            The current environment state.
        """

    async def initialize(self) -> None:
        """Initializes the computer; this base implementation does nothing."""

    async def close(self) -> None:
        """Cleans up the computer's resources; does nothing by default."""

    async def environment(self) -> Environment:
        """Reports which kind of environment this computer provides.

        Returns:
            The environment type. This base implementation returns
            `Environment.ENVIRONMENT_BROWSER`.
        """
        return Environment.ENVIRONMENT_BROWSER
0 commit comments