Skip to content

Commit 0031871

Browse files
navidkprkamath
andauthored
fix gray outline missing + action completion attr creation (#179)
* fix gray outline missing * fix the action not finishing error + add more llm logging * further improve the prompt * fix standard xpath sometimes not working * completion - aim for high false positive * changeset --------- Co-authored-by: Anirudh Kamath <github@kamath.io>
1 parent 4e52dd8 commit 0031871

File tree

8 files changed

+82
-26
lines changed

8 files changed

+82
-26
lines changed

.changeset/little-weeks-worry.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
---
2+
"@browserbasehq/stagehand": minor
3+
---
4+
5+
Fixes:
6+
7+
The last big change we pushed out introduced a small regression. As a result, the gray outline showing the elements Stagehand is looking at was missing. This commit fixes that. We now process selectorMap properly (using the updated type Record<number, string[]>).
8+
9+
Improved the action prompt:
10+
11+
Improved the structure
12+
Made it more straightforward
13+
Improved wording for completed arg and prioritized precision over recall

lib/dom/debug.ts

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,26 @@
11
async function debugDom() {
22
window.chunkNumber = 0;
33

4-
const { selectorMap, outputString } = await window.processElements(
5-
window.chunkNumber,
6-
);
4+
const { selectorMap: multiSelectorMap, outputString } =
5+
await window.processElements(window.chunkNumber);
6+
7+
const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);
78

89
drawChunk(selectorMap);
910
setupChunkNav();
1011
}
1112

13+
function multiSelectorMapToSelectorMap(
14+
multiSelectorMap: Record<number, string[]>,
15+
) {
16+
return Object.fromEntries(
17+
Object.entries(multiSelectorMap).map(([key, selectors]) => [
18+
Number(key),
19+
selectors[0],
20+
]),
21+
);
22+
}
23+
1224
function drawChunk(selectorMap: Record<number, string>) {
1325
cleanupMarkers();
1426
Object.entries(selectorMap).forEach(([_index, selector]) => {
@@ -90,7 +102,12 @@ function setupChunkNav() {
90102
window.chunkNumber -= 1;
91103
window.scrollTo(0, window.chunkNumber * window.innerHeight);
92104
await window.waitForDomSettle();
93-
const { selectorMap } = await processElements(window.chunkNumber);
105+
const { selectorMap: multiSelectorMap } = await window.processElements(
106+
window.chunkNumber,
107+
);
108+
109+
const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);
110+
94111
drawChunk(selectorMap);
95112
setupChunkNav();
96113
};
@@ -113,7 +130,10 @@ function setupChunkNav() {
113130
window.scrollTo(0, window.chunkNumber * window.innerHeight);
114131
await window.waitForDomSettle();
115132

116-
const { selectorMap } = await processElements(window.chunkNumber);
133+
const { selectorMap: multiSelectorMap } = await window.processElements(
134+
window.chunkNumber,
135+
);
136+
const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);
117137
drawChunk(selectorMap);
118138
setupChunkNav();
119139
};

lib/dom/process.ts

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,7 @@ export function isTextNode(node: Node): node is Text {
1010

1111
export async function processDom(chunksSeen: Array<number>) {
1212
const { chunk, chunksArray } = await pickChunk(chunksSeen);
13-
const { outputString, selectorMap } = await processElements(
14-
chunk,
15-
undefined,
16-
undefined,
17-
);
13+
const { outputString, selectorMap } = await processElements(chunk);
1814

1915
console.log(
2016
`Stagehand (Browser Process): Extracted dom elements:\n${outputString}`,

lib/dom/xpathUtils.ts

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ export async function generateXPathsForElement(
114114
// This should return in order from most accurate on current page to most cachable.
115115
// Do not change the order if you are not sure what you are doing.
116116
// Contact Navid if you need help understanding it.
117-
return [...(idBasedXPath ? [idBasedXPath] : []), standardXPath, complexXPath];
117+
return [standardXPath, ...(idBasedXPath ? [idBasedXPath] : []), complexXPath];
118118
}
119119

120120
async function generateComplexXPath(element: ChildNode): Promise<string> {
@@ -212,34 +212,28 @@ async function generateStandardXPath(element: ChildNode): Promise<string> {
212212
const siblings = element.parentElement
213213
? Array.from(element.parentElement.childNodes)
214214
: [];
215-
216215
for (let i = 0; i < siblings.length; i++) {
217216
const sibling = siblings[i];
218-
219217
if (
220218
sibling.nodeType === element.nodeType &&
221219
sibling.nodeName === element.nodeName
222220
) {
223221
index = index + 1;
224222
hasSameTypeSiblings = true;
225-
226223
if (sibling.isSameNode(element)) {
227224
break;
228225
}
229226
}
230227
}
231-
232228
// text "nodes" are selected differently than elements with xPaths
233229
if (element.nodeName !== "#text") {
234230
const tagName = element.nodeName.toLowerCase();
235231
const pathIndex = hasSameTypeSiblings ? `[${index}]` : "";
236232
parts.unshift(`${tagName}${pathIndex}`);
237233
}
238-
239234
element = element.parentElement as HTMLElement;
240235
}
241-
242-
return parts.length ? `//${parts.join("//")}` : "";
236+
return parts.length ? `/${parts.join("/")}` : "";
243237
}
244238

245239
async function generatedIdBasedXPath(

lib/index.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import { AvailableModel, LLMProvider } from "./llm/LLMProvider";
77
import path from "path";
88
import { ScreenshotService } from "./vision";
99
import { modelsWithVision } from "./llm/LLMClient";
10-
import { ActionCache } from "./cache/ActionCache";
1110
import { StagehandActHandler } from "./handlers/actHandler";
1211
import { generateId } from "./utils";
1312

lib/llm/AnthropicClient.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,16 @@ export class AnthropicClient implements LLMClient {
3636
async createChatCompletion(
3737
options: ChatCompletionOptions & { retries?: number },
3838
) {
39+
const { image: _, ...optionsWithoutImage } = options;
40+
this.logger({
41+
category: "Anthropic",
42+
message: `Creating chat completion with options: ${JSON.stringify(
43+
optionsWithoutImage,
44+
null,
45+
2,
46+
)}`,
47+
level: 1,
48+
});
3949
// Try to get cached response
4050
const cacheOptions = {
4151
model: options.model,
@@ -145,6 +155,12 @@ export class AnthropicClient implements LLMClient {
145155
temperature: options.temperature,
146156
});
147157

158+
this.logger({
159+
category: "Anthropic",
160+
message: `Response: ${JSON.stringify(response, null, 2)}`,
161+
level: 1,
162+
});
163+
148164
// Parse the response here
149165
const transformedResponse = {
150166
id: response.id,

lib/llm/OpenAIClient.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,16 @@ export class OpenAIClient implements LLMClient {
3232
}
3333

3434
async createChatCompletion(options: ChatCompletionOptions) {
35+
const { image: _, ...optionsWithoutImage } = options;
36+
this.logger({
37+
category: "OpenAI",
38+
message: `Creating chat completion with options: ${JSON.stringify(
39+
optionsWithoutImage,
40+
null,
41+
2,
42+
)}`,
43+
level: 1,
44+
});
3545
const cacheOptions = {
3646
model: options.model,
3747
messages: options.messages,
@@ -95,6 +105,12 @@ export class OpenAIClient implements LLMClient {
95105
response_format: responseFormat,
96106
});
97107

108+
this.logger({
109+
category: "OpenAI",
110+
message: `Response: ${JSON.stringify(response, null, 2)}`,
111+
level: 1,
112+
});
113+
98114
if (response_model) {
99115
const extractedData = response.choices[0].message.content;
100116
const parsedData = JSON.parse(extractedData);

lib/prompt.ts

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,23 @@ import { ChatMessage } from "./llm/LLMClient";
44
// act
55
const actSystemPrompt = `
66
# Instructions
7-
You are a browser automation assistant. Your job is to accomplish the user's goal across multiple model calls.
7+
You are a browser automation assistant. Your job is to accomplish the user's goal across multiple model calls by running playwright commands.
88
9-
You are given:
9+
## Input
10+
You will receive:
1011
1. the user's overall goal
1112
2. the steps that you've taken so far
1213
3. a list of active DOM elements in this chunk to consider to get closer to the goal.
1314
4. Optionally, a list of variable names that the user has provided that you may use to accomplish the goal. To use the variables, you must use the special <|VARIABLE_NAME|> syntax.
1415
15-
You have 2 tools that you can call: doAction, and skipSection. Do action only performs Playwright actions. Do not perform any other actions.
1616
17-
Note: If there is a popup on the page for cookies or advertising that has nothing to do with the goal, try to close it first before proceeding. As this can block the goal from being completed.
17+
## Your Goal / Specification
18+
You have 2 tools that you can call: doAction, and skipSection. Do action only performs Playwright actions. Do exactly what the user's goal is. Do not perform any other actions or exceed the scope of the goal.
19+
If the user's goal will be accomplished after running the playwright action, set completed to true. Better to have completed set to true if your are not sure.
1820
19-
Also, verify if the goal has been accomplished already. Do this by checking if the goal has been accomplished based on the previous steps completed, the current page DOM elements and the current page URL / starting page URL. If it has, set completed to true and finish the task.
21+
Note: If there is a popup on the page for cookies or advertising that has nothing to do with the goal, try to close it first before proceeding. As this can block the goal from being completed.
2022
21-
Do exactly what the user's goal is. Do not exceed the scope of the goal.
23+
Again, if the user's goal will be accomplished after running the playwright action, set completed to true.
2224
`;
2325

2426
const verifyActCompletionSystemPrompt = `
@@ -117,7 +119,7 @@ ${steps}
117119
${domElements}
118120
`;
119121

120-
if (variables) {
122+
if (variables && Object.keys(variables).length > 0) {
121123
actUserPrompt += `
122124
# Variables
123125
${Object.entries(variables)

0 commit comments

Comments
 (0)