46 commits
76cc39b
allow free modelclientoptions
sameelarif Aug 29, 2025
28f4b6e
Merge branch 'main' into sameel/stg-692-azurebedrock-api-integration-…
sameelarif Sep 3, 2025
1a415bb
Merge branch 'main' into sameel/stg-692-azurebedrock-api-integration-…
sameelarif Sep 9, 2025
04fb315
change zod ver to working build
sameelarif Sep 10, 2025
9daa584
add playwright arguments to agent (#1066)
tkattkat Sep 10, 2025
f6f05b0
[docs] add info on not needing project id in browserbase session para…
chrisreadsf Sep 11, 2025
c886544
Export aisdk (#1058)
chrisreadsf Sep 15, 2025
87505a3
docs: update fingerprint settings to reflect the new session create c…
Kylejeong2 Sep 15, 2025
8eccd56
send client options on every request
sameelarif Sep 15, 2025
3c39a05
[docs] export aisdk (#1074)
chrisreadsf Sep 16, 2025
bf2d0e7
Fix zod peer dependency support (#1032)
miguelg719 Sep 16, 2025
7f38b3a
add stagehand agent to api (#1077)
tkattkat Sep 16, 2025
3a0dc58
add playwright screenshot option for browserbase env (#1070)
derekmeegan Sep 17, 2025
b7be89e
add webbench, chrome-based OS world, and ground truth to web voyager …
filip-michalsky Sep 18, 2025
df76f7a
Fix python installation instructions (#1087)
rsbryan Sep 19, 2025
b9c8102
update xpath in `observe_vantechjournal` (#1088)
seanmcguire12 Sep 20, 2025
536f366
Fix session create logs on api (#1089)
miguelg719 Sep 21, 2025
8ff5c5a
Improve failed act logs (#1090)
miguelg719 Sep 21, 2025
569e444
[docs] add aisdk workaround before npm release + add versions to work…
chrisreadsf Sep 22, 2025
8c0fd01
pass stagehand, instead of stagehandPage to agent (#1082)
tkattkat Sep 22, 2025
dc2d420
img diff algo for screenshots (#1072)
filip-michalsky Sep 23, 2025
c6a752d
test bedrock file
filip-michalsky Sep 23, 2025
2931804
add azure test file
filip-michalsky Sep 23, 2025
f89b13e
Eval metadata (#1092)
miguelg719 Sep 23, 2025
2f3b8b9
fix bedrock test
sameelarif Sep 24, 2025
be8b7a4
Merge branch 'main' into sameel/stg-692-azurebedrock-api-integration-…
sameelarif Sep 25, 2025
27c722c
Update pnpm-lock.yaml
sameelarif Sep 25, 2025
108de3c
update evals cli docs (#1096)
miguelg719 Sep 26, 2025
69c3d93
better modelclientoption api handling
sameelarif Sep 26, 2025
467dade
dont override region
sameelarif Sep 26, 2025
0735ca3
fix bedrock example
sameelarif Sep 26, 2025
e0e6b30
adding support for new claude 4.5 sonnet agent model (#1099)
Kylejeong2 Sep 29, 2025
76b44ae
lint
sameelarif Sep 30, 2025
18937ee
read aws creds from client options obj
sameelarif Sep 30, 2025
889cb6c
properly convert custom / mcp tools to anthropic cua format (#1103)
tkattkat Oct 1, 2025
a99aa48
Add current date and page url to agent context (#1102)
miguelg719 Oct 1, 2025
a1ad06c
Additional agent logging (#1104)
miguelg719 Oct 1, 2025
0af4acf
update evals cli docs (#1096)
miguelg719 Sep 26, 2025
c762944
adding support for new claude 4.5 sonnet agent model (#1099)
Kylejeong2 Sep 29, 2025
4bd7412
properly convert custom / mcp tools to anthropic cua format (#1103)
tkattkat Oct 1, 2025
ce07cfa
Add current date and page url to agent context (#1102)
miguelg719 Oct 1, 2025
06ae0e6
Additional agent logging (#1104)
miguelg719 Oct 1, 2025
9fe40fd
fix system prompt
miguelg719 Oct 2, 2025
938b51c
remove dup log
miguelg719 Oct 2, 2025
607b4c3
pass modelClientOptions for stagehand agent
miguelg719 Oct 2, 2025
adec13c
Merge branch 'main' into sameel/stg-692-azurebedrock-api-integration-…
sameelarif Oct 3, 2025
5 changes: 5 additions & 0 deletions .changeset/curly-boats-push.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand-evals": patch
---

improve evals screenshot service - add image-hash diffing for screenshots and switch to intercepting screenshots from the agent
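The image-hash diffing this changeset describes can be sketched roughly as follows. This is a hypothetical illustration, not the actual Stagehand evals code: `average_hash`, `hamming`, and `dedupe` are invented names, and real screenshots would be decoded images rather than raw pixel grids. The idea is to keep a screenshot only when its perceptual hash differs enough from the last kept one.

```python
# Hypothetical sketch of image-hash-based screenshot deduplication.
# Pixel grids stand in for decoded grayscale screenshots.

def average_hash(pixels: list[list[int]]) -> int:
    """Hash a grayscale grid: one bit per pixel, set if above the mean."""
    flat = [p for row in pixels for p in row]
    avg = sum(flat) / len(flat)
    bits = 0
    for p in flat:
        bits = (bits << 1) | (1 if p > avg else 0)
    return bits

def hamming(a: int, b: int) -> int:
    """Count differing bits between two hashes."""
    return bin(a ^ b).count("1")

def dedupe(frames: list[list[list[int]]], threshold: int = 2) -> list[int]:
    """Return indices of frames whose hash moved past the threshold."""
    kept: list[int] = []
    last_hash = None
    for i, frame in enumerate(frames):
        h = average_hash(frame)
        if last_hash is None or hamming(h, last_hash) > threshold:
            kept.append(i)
            last_hash = h
    return kept
```

For two identical frames followed by a visibly different one, only the first and third survive — repeated captures of an unchanged page are dropped.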
5 changes: 5 additions & 0 deletions .changeset/dark-crabs-repair.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand-evals": minor
---

added web voyager ground truth (optional), added web bench, and subset of OSWorld evals which run on a browser
5 changes: 5 additions & 0 deletions .changeset/few-frogs-smoke.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Pass stagehand object to agent instead of stagehand page
5 changes: 5 additions & 0 deletions .changeset/fifty-windows-throw.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Fix logging for stagehand agent
5 changes: 5 additions & 0 deletions .changeset/icy-toes-obey.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Add playwright arguments to agent execute response
5 changes: 5 additions & 0 deletions .changeset/loud-waves-think.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

adds support for stagehand agent in the api
5 changes: 5 additions & 0 deletions .changeset/many-rats-punch.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Fix for zod peer dependency support
5 changes: 5 additions & 0 deletions .changeset/purple-squids-know.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Fixed info logs on api session create
5 changes: 5 additions & 0 deletions .changeset/short-mirrors-switch.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

patch custom tool support in anthropic cua client
5 changes: 5 additions & 0 deletions .changeset/tasty-candles-retire.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Improve failed act error logs
5 changes: 5 additions & 0 deletions .changeset/upset-ghosts-shout.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Add current page and date context to agent
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
CLAUDE.md
node_modules/
/test-results/
/playwright-report/
3 changes: 0 additions & 3 deletions CHANGELOG.md
@@ -233,15 +233,13 @@
We're thrilled to announce the release of Stagehand 2.0, bringing significant improvements to make browser automation more powerful, faster, and easier to use than ever before.

### 🚀 New Features

- **Introducing `stagehand.agent`**: A powerful new way to integrate SOTA Computer use models or Browserbase's [Open Operator](https://operator.browserbase.com) into Stagehand with one line of code! Perfect for multi-step workflows and complex interactions. [Learn more](https://docs.stagehand.dev/concepts/agent)
- **Lightning-fast `act` and `extract`**: Major performance improvements to make your automations run significantly faster.
- **Enhanced Logging**: Better visibility into what's happening during automation with improved logging and debugging capabilities.
- **Comprehensive Documentation**: A completely revamped documentation site with better examples, guides, and best practices.
- **Improved Error Handling**: More descriptive errors and better error recovery to help you debug issues faster.

### 🛠️ Developer Experience

- **Better TypeScript Support**: Enhanced type definitions and better IDE integration
- **Better Error Messages**: Clearer, more actionable error messages to help you debug faster
- **Improved Caching**: More reliable action caching for better performance
@@ -502,7 +500,6 @@
- [#316](https://github.com/browserbase/stagehand/pull/316) [`902e633`](https://github.com/browserbase/stagehand/commit/902e633e126a58b80b757ea0ecada01a7675a473) Thanks [@kamath](https://github.com/kamath)! - rename browserbaseResumeSessionID -> browserbaseSessionID

- [#296](https://github.com/browserbase/stagehand/pull/296) [`f11da27`](https://github.com/browserbase/stagehand/commit/f11da27a20409c240ceeea2003d520f676def61a) Thanks [@kamath](https://github.com/kamath)! - - Deprecate fields in `init` in favor of constructor options

- Deprecate `initFromPage` in favor of `browserbaseResumeSessionID` in constructor
- Rename `browserBaseSessionCreateParams` -> `browserbaseSessionCreateParams`

20 changes: 4 additions & 16 deletions docs/configuration/browser.mdx
@@ -114,7 +114,7 @@ stagehand = Stagehand(
apiKey: process.env.BROWSERBASE_API_KEY,
projectId: process.env.BROWSERBASE_PROJECT_ID,
browserbaseSessionCreateParams: {
projectId: process.env.BROWSERBASE_PROJECT_ID!, // Optional: automatically set if given in environment variable or by Stagehand parameter
proxies: true,
region: "us-west-2",
timeout: 3600, // 1 hour session timeout
@@ -124,17 +124,11 @@
blockAds: true,
solveCaptchas: true,
recordSession: false,
os: "windows", // Valid: "windows" | "mac" | "linux" | "mobile" | "tablet"
viewport: {
width: 1920,
height: 1080,
},
fingerprint: {
browsers: ["chrome", "edge"],
devices: ["desktop"],
operatingSystems: ["windows", "macos"],
locales: ["en-US", "en-GB"],
httpVersion: 2,
},
},
userMetadata: {
userId: "automation-user-123",
@@ -149,7 +143,7 @@ stagehand = Stagehand(
api_key=os.getenv("BROWSERBASE_API_KEY"),
project_id=os.getenv("BROWSERBASE_PROJECT_ID"),
browserbase_session_create_params={
"project_id": os.getenv("BROWSERBASE_PROJECT_ID"), # Optional: automatically set if given in environment or by Stagehand parameter
"proxies": True,
"region": "us-west-2",
"timeout": 3600, # 1 hour session timeout
@@ -159,17 +153,11 @@
"block_ads": True,
"solve_captchas": True,
"record_session": False,
"os": "windows", # "windows" | "mac" | "linux" | "mobile" | "tablet"
"viewport": {
"width": 1920,
"height": 1080,
},
"fingerprint": {
"browsers": ["chrome", "edge"],
"devices": ["desktop"],
"operating_systems": ["windows", "macos"],
"locales": ["en-US", "en-GB"],
"http_version": 2,
},
},
"user_metadata": {
"user_id": "automation-user-123",
116 changes: 100 additions & 16 deletions docs/configuration/evals.mdx
@@ -25,33 +25,114 @@ Evaluations help you understand how well your automation performs, which models

Evaluations help you systematically test and improve your automation workflows. Stagehand provides both built-in evaluations and tools to create your own.


We have two types of evals:
1. **Deterministic Evals** - These include unit tests, integration tests, and E2E tests that can be run without any LLM inference.
2. **LLM-based Evals** - These are evals that test the underlying functionality of Stagehand's AI primitives.


### Evals CLI
![Evals CLI](/media/evals-cli.png)

<Tip>
To run evals, you'll need to clone the [Stagehand repo](https://github.com/browserbase/stagehand) and set up the CLI.

We recommend using [Braintrust](https://www.braintrust.dev/docs/) to help visualize evals results and metrics.
</Tip>

The Stagehand CLI provides a powerful interface for running evaluations. You can run specific evals, categories, or external benchmarks with customizable settings.

Evals are grouped into:
1. **Act Evals** - These are evals that test the functionality of the `act` method.
2. **Extract Evals** - These are evals that test the functionality of the `extract` method.
3. **Observe Evals** - These are evals that test the functionality of the `observe` method.
4. **Combination Evals** - These are evals that test the functionality of the `act`, `extract`, and `observe` methods together.
5. **Experimental Evals** - These are experimental custom evals that test the functionality of the stagehand primitives.
6. **Agent Evals** - These are evals that test the functionality of `agent`.
7. **(NEW) External Benchmarks** - Run external benchmarks like WebBench, GAIA, WebVoyager, OnlineMind2Web, and OSWorld.

#### Installation

<Steps>
<Step title="Install Dependencies">
```bash
# From the stagehand root directory
pnpm install
```
</Step>

<Step title="Build the CLI">
```bash
pnpm run build:cli
```
</Step>

<Step title="Verify Installation">
```bash
evals help
```
</Step>
</Steps>

#### CLI Commands and Options

##### Basic Commands

```bash
# Run all evals
evals run all

# Run specific category
evals run act
evals run extract
evals run observe
evals run agent

# Run specific eval
evals run extract/extract_text

# List available evals
evals list
evals list --detailed

# Configure defaults
evals config
evals config set env browserbase
evals config set trials 5
```

##### Command Options

- **`-e, --env`**: Environment (`local` or `browserbase`)
- **`-t, --trials`**: Number of trials per eval (default: 3)
- **`-c, --concurrency`**: Max parallel sessions (default: 10)
- **`-m, --model`**: Model override
- **`-p, --provider`**: Provider override
- **`--api`**: Use Stagehand API instead of SDK
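The `-t`/`--trials` and `-c`/`--concurrency` options above interact in a standard way: the runner fans out evals × trials runs while capping how many sessions execute in parallel. A minimal sketch of that scheduling, assuming a semaphore-bounded asyncio runner (names like `run_eval` are illustrative, not the real CLI internals):

```python
import asyncio

# Hypothetical sketch of trials x concurrency scheduling for an eval runner.

async def run_eval(name: str, trial: int) -> tuple[str, int]:
    # Stand-in for launching a browser session and running one eval trial.
    await asyncio.sleep(0)
    return (name, trial)

async def run_all(evals: list[str], trials: int, concurrency: int):
    sem = asyncio.Semaphore(concurrency)  # -c / --concurrency cap

    async def bounded(name: str, trial: int):
        async with sem:
            return await run_eval(name, trial)

    # -t / --trials: every eval runs `trials` times
    tasks = [bounded(e, t) for e in evals for t in range(trials)]
    return await asyncio.gather(*tasks)

results = asyncio.run(run_all(["act/click", "extract/text"], trials=3, concurrency=10))
```

With 2 evals and 3 trials, 6 runs are scheduled, never more than 10 at once.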

##### Running External Benchmarks

The CLI supports several industry-standard benchmarks:

```bash
# WebBench with filters
evals run benchmark:webbench -l 10 -f difficulty=easy -f category=READ

# GAIA benchmark
evals run b:gaia -s 100 -l 25 -f level=1

# WebVoyager
evals run b:webvoyager -l 50

# OnlineMind2Web
evals run b:onlineMind2Web

# OSWorld
evals run b:osworld -f source=Mind2Web
```
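The repeated `-f key=value` flags in the benchmark commands above suggest a simple accumulation model: each flag adds one entry to a filter map. A hedged sketch of that parsing (illustrative only — `parse_filters` is not the actual CLI code):

```python
# Hypothetical sketch of parsing repeated -f key=value benchmark filters.

def parse_filters(args: list[str]) -> dict[str, str]:
    """Collect every -f key=value pair into a dict; other flags are ignored."""
    filters: dict[str, str] = {}
    it = iter(args)
    for arg in it:
        if arg == "-f":
            key, _, value = next(it).partition("=")
            filters[key] = value
    return filters

parse_filters(["-l", "10", "-f", "difficulty=easy", "-f", "category=READ"])
```

For the WebBench invocation above, this would yield `difficulty` and `category` entries while leaving `-l` to a separate limit option.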

#### Configuration Files

You can view the specific evals in [`evals/tasks`](https://github.com/browserbase/stagehand/tree/main/evals/tasks). Each eval is grouped into eval categories based on [`evals/evals.config.json`](https://github.com/browserbase/stagehand/blob/main/evals/evals.config.json).


#### Viewing eval results
@@ -65,7 +146,7 @@ You can use the Braintrust UI to filter by model/eval and aggregate results acro

### Deterministic Evals

To run deterministic evals, you can run `npm run e2e` from within the Stagehand repo. This will test the functionality of Playwright within Stagehand to make sure it's working as expected.

These tests are in [`evals/deterministic`](https://github.com/browserbase/stagehand/tree/main/evals/deterministic) and test on both Browserbase browsers and local headless Chromium browsers.

@@ -139,10 +220,13 @@ Update `evals/evals.config.json`:
<Step title="Run Your Evaluation">
```bash
# Test your custom evaluation
evals run custom_task_name

# Run the entire custom category
evals run custom

# Run with specific settings
evals run custom_task_name -e browserbase -t 5 -m gpt-4o
```
</Step>
</Steps>
43 changes: 33 additions & 10 deletions docs/configuration/models.mdx
@@ -156,47 +156,70 @@ stagehand = Stagehand(
## Custom LLM Integration

<Note>
Only [LiteLLM compatible providers](https://docs.litellm.ai/docs/providers) are available in Python. Some may require extra setup.
</Note>

Integrate any LLM with Stagehand using custom clients. The only requirement is **structured output support** for consistent automation behavior.

### Vercel AI SDK
The [Vercel AI SDK](https://sdk.vercel.ai/providers/ai-sdk-providers) is a popular library for interacting with LLMs. You can use any of the providers supported by the Vercel AI SDK to create a client for your model, **as long as they support structured outputs**.

Vercel AI SDK supports providers for OpenAI, Anthropic, and Google, along with support for **Amazon Bedrock** and **Azure OpenAI**. For a full list, see the [Vercel AI SDK providers page](https://sdk.vercel.ai/providers/ai-sdk-providers).

To get started, you'll need to install the `ai` package (version 4) and the provider you want to use (version 1 - both need to be compatible with LanguageModelV1). For example, to use Amazon Bedrock, you'll need to install the `@ai-sdk/amazon-bedrock` package.

You'll also need to import the [Vercel AI SDK external client](https://github.com/browserbase/stagehand/blob/main/lib/llm/aisdk.ts) through Stagehand to create a client for your model.

<Tabs>
<Tab title="npm">
```bash
npm install ai@4 @ai-sdk/amazon-bedrock@1
```
</Tab>

<Tab title="pnpm">
```bash
pnpm install ai@4 @ai-sdk/amazon-bedrock@1
```
</Tab>

<Tab title="yarn">
```bash
yarn add ai@4 @ai-sdk/amazon-bedrock@1
```
</Tab>
</Tabs>

<Note>
The `AISdkClient` is not yet available via the Stagehand npm package. For now, install Stagehand from its git repository to access the `AISdkClient` (it will be included in the npm package in an upcoming release).
</Note>

<Tabs>
<Tab title="npm">
```bash
npm install @browserbasehq/stagehand@git+https://github.com/browserbase/stagehand.git
```
</Tab>

<Tab title="pnpm">
```bash
pnpm install @browserbasehq/stagehand@git+https://github.com/browserbase/stagehand.git
```
</Tab>

<Tab title="yarn">
```bash
yarn add @browserbasehq/stagehand@git+https://github.com/browserbase/stagehand.git
```
</Tab>
</Tabs>

```ts
// Install/import the provider you want to use.
// For example, to use Azure OpenAI, import { createAzure } from '@ai-sdk/azure';
import { bedrock } from "@ai-sdk/amazon-bedrock";
// @ts-ignore
import { AISdkClient } from "@browserbasehq/stagehand";

const stagehand = new Stagehand({
llmClient: new AISdkClient({