fix gray outline missing + action completion attr creation (#179)

navidkpr · kamath · web-flow · commit 0031871d5a6d · 2024-11-18T17:38:16.000-08:00
* fix gray outline missing

* fix the action not finishing error + add more llm logging

* further improve the prompt

* fix standard xpath sometimes not working

* completion - aim for high false positive

* changeset

---------

Co-authored-by: Anirudh Kamath <github@kamath.io>
diff --git a/.changeset/little-weeks-worry.md b/.changeset/little-weeks-worry.md
@@ -0,0 +1,13 @@
+---
+"@browserbasehq/stagehand": minor
+---
+
+Fixes:
+
+The last big change we pushed out introduced a small regression. As a result, the gray outline showing the elements Stagehand is looking at was missing. This commit fixes that. We now process selectorMap properly (using the updated type `Record<number, string[]>`).
+
+Improved the action prompt:
+
+Improved the structure
+Made it more straightforward
+Improved wording for completed arg and prioritized precision over recall
diff --git a/lib/dom/debug.ts b/lib/dom/debug.ts
@@ -1,14 +1,26 @@
 async function debugDom() {
   window.chunkNumber = 0;
 
-  const { selectorMap, outputString } = await window.processElements(
-    window.chunkNumber,
-  );
+  const { selectorMap: multiSelectorMap, outputString } =
+    await window.processElements(window.chunkNumber);
+
+  const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);
 
   drawChunk(selectorMap);
   setupChunkNav();
 }
 
+function multiSelectorMapToSelectorMap(
+  multiSelectorMap: Record<number, string[]>,
+) {
+  return Object.fromEntries(
+    Object.entries(multiSelectorMap).map(([key, selectors]) => [
+      Number(key),
+      selectors[0],
+    ]),
+  );
+}
+
 function drawChunk(selectorMap: Record<number, string>) {
   cleanupMarkers();
   Object.entries(selectorMap).forEach(([_index, selector]) => {
@@ -90,7 +102,12 @@ function setupChunkNav() {
       window.chunkNumber -= 1;
       window.scrollTo(0, window.chunkNumber * window.innerHeight);
       await window.waitForDomSettle();
-      const { selectorMap } = await processElements(window.chunkNumber);
+      const { selectorMap: multiSelectorMap } = await window.processElements(
+        window.chunkNumber,
+      );
+
+      const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);
+
       drawChunk(selectorMap);
       setupChunkNav();
     };
@@ -113,7 +130,10 @@ function setupChunkNav() {
       window.scrollTo(0, window.chunkNumber * window.innerHeight);
       await window.waitForDomSettle();
 
-      const { selectorMap } = await processElements(window.chunkNumber);
+      const { selectorMap: multiSelectorMap } = await window.processElements(
+        window.chunkNumber,
+      );
+      const selectorMap = multiSelectorMapToSelectorMap(multiSelectorMap);
       drawChunk(selectorMap);
       setupChunkNav();
     };
diff --git a/lib/dom/process.ts b/lib/dom/process.ts
@@ -10,11 +10,7 @@ export function isTextNode(node: Node): node is Text {
 
 export async function processDom(chunksSeen: Array<number>) {
   const { chunk, chunksArray } = await pickChunk(chunksSeen);
-  const { outputString, selectorMap } = await processElements(
-    chunk,
-    undefined,
-    undefined,
-  );
+  const { outputString, selectorMap } = await processElements(chunk);
 
   console.log(
     `Stagehand (Browser Process): Extracted dom elements:\n${outputString}`,
diff --git a/lib/dom/xpathUtils.ts b/lib/dom/xpathUtils.ts
@@ -114,7 +114,7 @@ export async function generateXPathsForElement(
   // This should return in order from most accurate on current page to most cachable.
   // Do not change the order if you are not sure what you are doing.
   // Contact Navid if you need help understanding it.
-  return [...(idBasedXPath ? [idBasedXPath] : []), standardXPath, complexXPath];
+  return [standardXPath, ...(idBasedXPath ? [idBasedXPath] : []), complexXPath];
 }
 
 async function generateComplexXPath(element: ChildNode): Promise<string> {
@@ -212,34 +212,28 @@ async function generateStandardXPath(element: ChildNode): Promise<string> {
     const siblings = element.parentElement
       ? Array.from(element.parentElement.childNodes)
       : [];
-
     for (let i = 0; i < siblings.length; i++) {
       const sibling = siblings[i];
-
       if (
         sibling.nodeType === element.nodeType &&
         sibling.nodeName === element.nodeName
       ) {
         index = index + 1;
         hasSameTypeSiblings = true;
-
         if (sibling.isSameNode(element)) {
           break;
         }
       }
     }
-
     // text "nodes" are selected differently than elements with xPaths
     if (element.nodeName !== "#text") {
       const tagName = element.nodeName.toLowerCase();
       const pathIndex = hasSameTypeSiblings ? `[${index}]` : "";
       parts.unshift(`${tagName}${pathIndex}`);
     }
-
     element = element.parentElement as HTMLElement;
   }
-
-  return parts.length ? `//${parts.join("//")}` : "";
+  return parts.length ? `/${parts.join("/")}` : "";
 }
 
 async function generatedIdBasedXPath(
diff --git a/lib/index.ts b/lib/index.ts
@@ -7,7 +7,6 @@ import { AvailableModel, LLMProvider } from "./llm/LLMProvider";
 import path from "path";
 import { ScreenshotService } from "./vision";
 import { modelsWithVision } from "./llm/LLMClient";
-import { ActionCache } from "./cache/ActionCache";
 import { StagehandActHandler } from "./handlers/actHandler";
 import { generateId } from "./utils";
 
diff --git a/lib/llm/AnthropicClient.ts b/lib/llm/AnthropicClient.ts
@@ -36,6 +36,16 @@ export class AnthropicClient implements LLMClient {
   async createChatCompletion(
     options: ChatCompletionOptions & { retries?: number },
   ) {
+    const { image: _, ...optionsWithoutImage } = options;
+    this.logger({
+      category: "Anthropic",
+      message: `Creating chat completion with options: ${JSON.stringify(
+        optionsWithoutImage,
+        null,
+        2,
+      )}`,
+      level: 1,
+    });
     // Try to get cached response
     const cacheOptions = {
       model: options.model,
@@ -145,6 +155,12 @@ export class AnthropicClient implements LLMClient {
       temperature: options.temperature,
     });
 
+    this.logger({
+      category: "Anthropic",
+      message: `Response: ${JSON.stringify(response, null, 2)}`,
+      level: 1,
+    });
+
     // Parse the response here
     const transformedResponse = {
       id: response.id,
diff --git a/lib/llm/OpenAIClient.ts b/lib/llm/OpenAIClient.ts
@@ -32,6 +32,16 @@ export class OpenAIClient implements LLMClient {
   }
 
   async createChatCompletion(options: ChatCompletionOptions) {
+    const { image: _, ...optionsWithoutImage } = options;
+    this.logger({
+      category: "OpenAI",
+      message: `Creating chat completion with options: ${JSON.stringify(
+        optionsWithoutImage,
+        null,
+        2,
+      )}`,
+      level: 1,
+    });
     const cacheOptions = {
       model: options.model,
       messages: options.messages,
@@ -95,6 +105,12 @@ export class OpenAIClient implements LLMClient {
       response_format: responseFormat,
     });
 
+    this.logger({
+      category: "OpenAI",
+      message: `Response: ${JSON.stringify(response, null, 2)}`,
+      level: 1,
+    });
+
     if (response_model) {
       const extractedData = response.choices[0].message.content;
       const parsedData = JSON.parse(extractedData);
diff --git a/lib/prompt.ts b/lib/prompt.ts
@@ -4,21 +4,23 @@ import { ChatMessage } from "./llm/LLMClient";
 // act
 const actSystemPrompt = `
 # Instructions
-You are a browser automation assistant. Your job is to accomplish the user's goal across multiple model calls.
+You are a browser automation assistant. Your job is to accomplish the user's goal across multiple model calls by running playwright commands.
 
-You are given:
+## Input
+You will receive:
 1. the user's overall goal
 2. the steps that you've taken so far
 3. a list of active DOM elements in this chunk to consider to get closer to the goal. 
 4. Optionally, a list of variable names that the user has provided that you may use to accomplish the goal. To use the variables, you must use the special <|VARIABLE_NAME|> syntax.
 
-You have 2 tools that you can call: doAction, and skipSection. Do action only performs Playwright actions. Do not perform any other actions.
 
-Note: If there is a popup on the page for cookies or advertising that has nothing to do with the goal, try to close it first before proceeding. As this can block the goal from being completed.
+## Your Goal / Specification
+You have 2 tools that you can call: doAction, and skipSection. Do action only performs Playwright actions. Do exactly what the user's goal is. Do not perform any other actions or exceed the scope of the goal.
+If the user's goal will be accomplished after running the playwright action, set completed to true. Better to have completed set to true if your are not sure.
 
-Also, verify if the goal has been accomplished already. Do this by checking if the goal has been accomplished based on the previous steps completed, the current page DOM elements and the current page URL / starting page URL. If it has, set completed to true and finish the task.
+Note: If there is a popup on the page for cookies or advertising that has nothing to do with the goal, try to close it first before proceeding. As this can block the goal from being completed.
 
-Do exactly what the user's goal is. Do not exceed the scope of the goal.
+Again, if the user's goal will be accomplished after running the playwright action, set completed to true.
 `;
 
 const verifyActCompletionSystemPrompt = `
@@ -117,7 +119,7 @@ ${steps}
 ${domElements}
 `;
 
-  if (variables) {
+  if (variables && Object.keys(variables).length > 0) {
     actUserPrompt += `
 # Variables
 ${Object.entries(variables)