diff --git a/src/core/tools/browserActionTool.ts b/src/core/tools/browserActionTool.ts index 13cb9b0ec266..3b035661383f 100644 --- a/src/core/tools/browserActionTool.ts +++ b/src/core/tools/browserActionTool.ts @@ -148,6 +148,32 @@ export async function browserActionTool( } } + // Clean up old browser screenshots from API conversation history to prevent memory accumulation + // Only keep the latest screenshot - old ones are no longer needed by the model + if (browserActionResult?.screenshot) { + const apiHistory = cline.apiConversationHistory + for (let i = apiHistory.length - 1; i >= 0; i--) { + const message = apiHistory[i] + if (Array.isArray(message.content)) { + // Filter out old screenshot image blocks + message.content = message.content.filter((block) => { + // Remove base64 image blocks (browser screenshots) + // Keep text blocks and other content + if ( + block.type === "image" && + "source" in block && + block.source.type === "base64" && + (block.source.media_type === "image/webp" || block.source.media_type === "image/png") + ) { + // This is likely an old browser screenshot - remove it + return false + } + return true + }) + } + } + } + switch (action) { case "launch": case "click": diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index c23b79f568a3..b32abd19243a 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -2,7 +2,6 @@ import React, { memo, useEffect, useMemo, useRef, useState } from "react" import { useSize } from "react-use" import deepEqual from "fast-deep-equal" import { useTranslation } from "react-i18next" -import { VSCodeButton } from "@vscode/webview-ui-toolkit/react" import type { ClineMessage } from "@roo-code/types" @@ -12,7 +11,6 @@ import { vscode } from "@src/utils/vscode" import { useExtensionState } from "@src/context/ExtensionStateContext" import CodeBlock, { CODE_BLOCK_BG_COLOR } from "../common/CodeBlock" -import { ChatRowContent } from "./ChatRow" import { ProgressIndicator } from "./ProgressIndicator" import { Globe, Pointer, SquareTerminal } from "lucide-react" @@ -30,7 +28,6 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { const { messages, isLast, onHeightChange, lastModifiedMessage } = props const { t } = useTranslation() const prevHeightRef = useRef(0) - const [maxActionHeight, setMaxActionHeight] = useState(0) const [consoleLogsExpanded, setConsoleLogsExpanded] = useState(false) const { browserViewportSize = "900x600" } = useExtensionState() @@ -55,182 +52,62 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { }, [messages, lastModifiedMessage, isLast]) const isBrowsing = useMemo(() => { - return isLast && messages.some((m) => m.say === "browser_action_result") && !isLastApiReqInterrupted // after user approves, browser_action_result with "" is sent to indicate that the session has started + return isLast && messages.some((m) => m.say === "browser_action_result") && !isLastApiReqInterrupted }, [isLast, messages, isLastApiReqInterrupted]) - // Organize messages into pages with current state and next action - const pages = useMemo(() => { - const result: { - currentState: { - url?: string - screenshot?: string - mousePosition?: string - consoleLogs?: string - messages: ClineMessage[] // messages up to and including the result - } - nextAction?: { - messages: ClineMessage[] // messages leading to next result - } - }[] = [] - - let currentStateMessages: ClineMessage[] = [] - let nextActionMessages: ClineMessage[] = [] - - messages.forEach((message) => { - if (message.ask === "browser_action_launch") { - // Start first page - currentStateMessages = [message] - } else if (message.say === "browser_action_result") { - if (message.text === "") { - // first browser_action_result is an empty string that signals that session has started - return - } - // Complete current state - currentStateMessages.push(message) - const resultData = JSON.parse(message.text || "{}") as BrowserActionResult - - // Add page with current state and previous next actions - result.push({ - currentState: { - url: resultData.currentUrl, - screenshot: resultData.screenshot, - mousePosition: resultData.currentMousePosition, - consoleLogs: resultData.logs, - messages: [...currentStateMessages], - }, - nextAction: - nextActionMessages.length > 0 - ? { - messages: [...nextActionMessages], - } - : undefined, - }) - - // Reset for next page - currentStateMessages = [] - nextActionMessages = [] - } else if ( - message.say === "api_req_started" || - message.say === "text" || - message.say === "browser_action" - ) { - // These messages lead to the next result, so they should always go in nextActionMessages - nextActionMessages.push(message) - } else { - // Any other message types - currentStateMessages.push(message) - } - }) - - // Add incomplete page if exists - if (currentStateMessages.length > 0 || nextActionMessages.length > 0) { - result.push({ - currentState: { - messages: [...currentStateMessages], - }, - nextAction: - nextActionMessages.length > 0 - ? { - messages: [...nextActionMessages], - } - : undefined, - }) - } - - return result - }, [messages]) - - // Auto-advance to latest page - const [currentPageIndex, setCurrentPageIndex] = useState(0) - useEffect(() => { - setCurrentPageIndex(pages.length - 1) - }, [pages.length]) - // Get initial URL from launch message const initialUrl = useMemo(() => { const launchMessage = messages.find((m) => m.ask === "browser_action_launch") return launchMessage?.text || "" }, [messages]) - // Find the latest available URL and screenshot + // Find the LATEST browser action result only (no history needed) const latestState = useMemo(() => { - for (let i = pages.length - 1; i >= 0; i--) { - const page = pages[i] - if (page.currentState.url || page.currentState.screenshot) { + // Search backwards to find the most recent browser_action_result + for (let i = messages.length - 1; i >= 0; i--) { + const message = messages[i] + if (message.say === "browser_action_result" && message.text && message.text !== "") { + const resultData = JSON.parse(message.text) as BrowserActionResult return { - url: page.currentState.url, - mousePosition: page.currentState.mousePosition, - consoleLogs: page.currentState.consoleLogs, - screenshot: page.currentState.screenshot, + url: resultData.currentUrl, + screenshot: resultData.screenshot, + mousePosition: resultData.currentMousePosition, + consoleLogs: resultData.logs, } } } - return { url: undefined, mousePosition: undefined, consoleLogs: undefined, screenshot: undefined } - }, [pages]) + return { url: undefined, screenshot: undefined, mousePosition: undefined, consoleLogs: undefined } + }, [messages]) - const currentPage = pages[currentPageIndex] - const isLastPage = currentPageIndex === pages.length - 1 + // Display state is simply the latest state or defaults + const displayState = { + url: latestState.url || initialUrl, + screenshot: latestState.screenshot, + mousePosition: latestState.mousePosition || defaultMousePosition, + consoleLogs: latestState.consoleLogs, + } - // Use latest state if we're on the last page and don't have a state yet - const displayState = isLastPage - ? { - url: currentPage?.currentState.url || latestState.url || initialUrl, - mousePosition: - currentPage?.currentState.mousePosition || latestState.mousePosition || defaultMousePosition, - consoleLogs: currentPage?.currentState.consoleLogs, - screenshot: currentPage?.currentState.screenshot || latestState.screenshot, - } - : { - url: currentPage?.currentState.url || initialUrl, - mousePosition: currentPage?.currentState.mousePosition || defaultMousePosition, - consoleLogs: currentPage?.currentState.consoleLogs, - screenshot: currentPage?.currentState.screenshot, + // Find latest browser action and click position + const latestBrowserAction = useMemo(() => { + // Look through messages backwards for the latest browser_action + for (let i = messages.length - 1; i >= 0; i--) { + const message = messages[i] + if (message.say === "browser_action" && message.text) { + const browserAction = JSON.parse(message.text) as ClineSayBrowserAction + return browserAction } - - const [actionContent, { height: actionHeight }] = useSize( -