Skip to content

Commit 69ce8fb

Browse files
seanzhougoogle and copybara-github
authored and committed
feat: Add ComputerUseToolset
PiperOrigin-RevId: 783607978
1 parent 3f9f773 commit 69ce8fb

File tree

8 files changed

+1655
-0
lines changed

8 files changed

+1655
-0
lines changed
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import abc
18+
from typing import Literal
19+
20+
from google.genai.types import Environment
21+
import pydantic
22+
23+
from ...utils.feature_decorator import experimental
24+
25+
26+
@experimental
class EnvironmentState(pydantic.BaseModel):
    """Snapshot of a computer environment at a single point in time.

    Attributes:
        screenshot: PNG-encoded screenshot, as raw bytes.
        url: URL of the webpage currently being displayed.
    """

    # Both fields are required: `...` marks them as having no default.
    screenshot: bytes = pydantic.Field(..., description="Screenshot in PNG format")
    url: str = pydantic.Field(
        ...,
        description="Current webpage URL",
    )

    @pydantic.field_validator("url")
    @classmethod
    def validate_url(cls, v: str) -> str:
        """Return the URL unchanged, rejecting empty or whitespace-only values.

        Raises:
            ValueError: If nothing remains after stripping whitespace.
        """
        if v.strip():
            return v
        raise ValueError("URL cannot be empty")
47+
48+
49+
@experimental
class Computer(abc.ABC):
    """Async interface for driving a computer environment.

    Concrete subclasses implement the abstract coroutines below to control
    an interactive environment such as a web browser. `initialize`, `close`
    and `environment` come with default implementations and may be
    overridden when needed.
    """

    @abc.abstractmethod
    async def screen_size(self) -> tuple[int, int]:
        """Reports the size of the environment's screen.

        Returns:
            A `(width, height)` tuple, in pixels.
        """

    @abc.abstractmethod
    async def open_web_browser(self) -> EnvironmentState:
        """Opens the web browser.

        Returns:
            The environment state once the browser has been opened.
        """

    @abc.abstractmethod
    async def click_at(self, x: int, y: int) -> EnvironmentState:
        """Clicks on the webpage at the given x, y coordinate.

        'x' and 'y' are absolute values, scaled to the width and height of
        the screen.

        Args:
            x: The x-coordinate of the click.
            y: The y-coordinate of the click.

        Returns:
            The environment state after the click.
        """

    @abc.abstractmethod
    async def hover_at(self, x: int, y: int) -> EnvironmentState:
        """Hovers over the webpage at the given x, y coordinate.

        Useful for exploring sub-menus that only appear on hover. 'x' and
        'y' are absolute values, scaled to the width and height of the
        screen.

        Args:
            x: The x-coordinate to hover over.
            y: The y-coordinate to hover over.

        Returns:
            The environment state after hovering.
        """

    @abc.abstractmethod
    async def type_text_at(
        self,
        x: int,
        y: int,
        text: str,
        press_enter: bool = True,
        clear_before_typing: bool = True,
    ) -> EnvironmentState:
        """Types text at the given x, y coordinate.

        By default ENTER is pressed after typing; pass `press_enter=False`
        to disable that. By default any existing content is cleared before
        `text` is typed; pass `clear_before_typing=False` to disable that.
        'x' and 'y' are absolute values, scaled to the width and height of
        the screen.

        Args:
            x: The x-coordinate to type at.
            y: The y-coordinate to type at.
            text: The text to type.
            press_enter: Whether ENTER is pressed after typing.
            clear_before_typing: Whether existing content is cleared first.

        Returns:
            The environment state after typing.
        """

    @abc.abstractmethod
    async def scroll_document(
        self,
        direction: Literal["up", "down", "left", "right"],
    ) -> EnvironmentState:
        """Scrolls the whole webpage in the given direction.

        Args:
            direction: One of "up", "down", "left" or "right".

        Returns:
            The environment state after scrolling.
        """

    @abc.abstractmethod
    async def scroll_at(
        self,
        x: int,
        y: int,
        direction: Literal["up", "down", "left", "right"],
        magnitude: int,
    ) -> EnvironmentState:
        """Scrolls at the given x, y coordinate by `magnitude`.

        'x' and 'y' are absolute values, scaled to the width and height of
        the screen.

        Args:
            x: The x-coordinate to scroll at.
            y: The y-coordinate to scroll at.
            direction: One of "up", "down", "left" or "right".
            magnitude: The amount to scroll.

        Returns:
            The environment state after scrolling.
        """

    @abc.abstractmethod
    async def wait_5_seconds(self) -> EnvironmentState:
        """Waits 5 seconds so unfinished webpage processes can complete.

        Returns:
            The environment state after the wait.
        """

    @abc.abstractmethod
    async def go_back(self) -> EnvironmentState:
        """Goes back to the previous webpage in the browser history.

        Returns:
            The environment state after navigating back.
        """

    @abc.abstractmethod
    async def go_forward(self) -> EnvironmentState:
        """Goes forward to the next webpage in the browser history.

        Returns:
            The environment state after navigating forward.
        """

    @abc.abstractmethod
    async def search(self) -> EnvironmentState:
        """Jumps straight to a search engine home page.

        Intended for starting from a search, for example when the current
        website lacks the needed information or a new task is beginning.

        Returns:
            The environment state after navigating to search.
        """

    @abc.abstractmethod
    async def navigate(self, url: str) -> EnvironmentState:
        """Navigates directly to the given URL.

        Args:
            url: The URL to navigate to.

        Returns:
            The environment state after navigation.
        """

    @abc.abstractmethod
    async def key_combination(self, keys: list[str]) -> EnvironmentState:
        """Presses keyboard keys or combinations, e.g. "control+c" or "enter".

        Args:
            keys: The keys to press together as one combination.

        Returns:
            The environment state after the key press.
        """

    @abc.abstractmethod
    async def drag_and_drop(
        self,
        x: int,
        y: int,
        destination_x: int,
        destination_y: int,
    ) -> EnvironmentState:
        """Drags an element from (x, y) and drops it at the destination.

        'x', 'y', 'destination_x' and 'destination_y' are absolute values,
        scaled to the width and height of the screen.

        Args:
            x: The x-coordinate where the drag starts.
            y: The y-coordinate where the drag starts.
            destination_x: The x-coordinate of the drop point.
            destination_y: The y-coordinate of the drop point.

        Returns:
            The environment state after the drag and drop.
        """

    @abc.abstractmethod
    async def current_state(self) -> EnvironmentState:
        """Reports the state of the current webpage.

        Returns:
            The current environment state.
        """

    async def initialize(self) -> None:
        """Initializes the computer; the base implementation does nothing."""

    async def close(self) -> None:
        """Cleans up the computer's resources; the base implementation does nothing."""

    async def environment(self) -> Environment:
        """Reports which kind of environment this computer is.

        Returns:
            The environment type; the base implementation returns
            `Environment.ENVIRONMENT_BROWSER`.
        """
        return Environment.ENVIRONMENT_BROWSER

0 commit comments

Comments
 (0)