diff --git a/src/core/tools/browserActionTool.ts b/src/core/tools/browserActionTool.ts index 13cb9b0ec266..3b035661383f 100644 --- a/src/core/tools/browserActionTool.ts +++ b/src/core/tools/browserActionTool.ts @@ -148,6 +148,32 @@ export async function browserActionTool( } } + // Clean up old browser screenshots from API conversation history to prevent memory accumulation + // Only keep the latest screenshot - old ones are no longer needed by the model + if (browserActionResult?.screenshot) { + const apiHistory = cline.apiConversationHistory + for (let i = apiHistory.length - 1; i >= 0; i--) { + const message = apiHistory[i] + if (Array.isArray(message.content)) { + // Filter out old screenshot image blocks + message.content = message.content.filter((block) => { + // Remove base64 image blocks (browser screenshots) + // Keep text blocks and other content + if ( + block.type === "image" && + "source" in block && + block.source.type === "base64" && + (block.source.media_type === "image/webp" || block.source.media_type === "image/png") + ) { + // This is likely an old browser screenshot - remove it + return false + } + return true + }) + } + } + } + switch (action) { case "launch": case "click": diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index c23b79f568a3..b32abd19243a 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -2,7 +2,6 @@ import React, { memo, useEffect, useMemo, useRef, useState } from "react" import { useSize } from "react-use" import deepEqual from "fast-deep-equal" import { useTranslation } from "react-i18next" -import { VSCodeButton } from "@vscode/webview-ui-toolkit/react" import type { ClineMessage } from "@roo-code/types" @@ -12,7 +11,6 @@ import { vscode } from "@src/utils/vscode" import { useExtensionState } from "@src/context/ExtensionStateContext" import CodeBlock, { CODE_BLOCK_BG_COLOR } from "../common/CodeBlock" -import { ChatRowContent } from "./ChatRow" import { ProgressIndicator } from "./ProgressIndicator" import { Globe, Pointer, SquareTerminal } from "lucide-react" @@ -30,7 +28,6 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { const { messages, isLast, onHeightChange, lastModifiedMessage } = props const { t } = useTranslation() const prevHeightRef = useRef(0) - const [maxActionHeight, setMaxActionHeight] = useState(0) const [consoleLogsExpanded, setConsoleLogsExpanded] = useState(false) const { browserViewportSize = "900x600" } = useExtensionState() @@ -55,182 +52,62 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { }, [messages, lastModifiedMessage, isLast]) const isBrowsing = useMemo(() => { - return isLast && messages.some((m) => m.say === "browser_action_result") && !isLastApiReqInterrupted // after user approves, browser_action_result with "" is sent to indicate that the session has started + return isLast && messages.some((m) => m.say === "browser_action_result") && !isLastApiReqInterrupted }, [isLast, messages, isLastApiReqInterrupted]) - // Organize messages into pages with current state and next action - const pages = useMemo(() => { - const result: { - currentState: { - url?: string - screenshot?: string - mousePosition?: string - consoleLogs?: string - messages: ClineMessage[] // messages up to and including the result - } - nextAction?: { - messages: ClineMessage[] // messages leading to next result - } - }[] = [] - - let currentStateMessages: ClineMessage[] = [] - let nextActionMessages: ClineMessage[] = [] - - messages.forEach((message) => { - if (message.ask === "browser_action_launch") { - // Start first page - currentStateMessages = [message] - } else if (message.say === "browser_action_result") { - if (message.text === "") { - // first browser_action_result is an empty string that signals that session has started - return - } - // Complete current state - currentStateMessages.push(message) - const resultData = JSON.parse(message.text || "{}") as BrowserActionResult - - // Add page with current state and previous next actions - result.push({ - currentState: { - url: resultData.currentUrl, - screenshot: resultData.screenshot, - mousePosition: resultData.currentMousePosition, - consoleLogs: resultData.logs, - messages: [...currentStateMessages], - }, - nextAction: - nextActionMessages.length > 0 - ? { - messages: [...nextActionMessages], - } - : undefined, - }) - - // Reset for next page - currentStateMessages = [] - nextActionMessages = [] - } else if ( - message.say === "api_req_started" || - message.say === "text" || - message.say === "browser_action" - ) { - // These messages lead to the next result, so they should always go in nextActionMessages - nextActionMessages.push(message) - } else { - // Any other message types - currentStateMessages.push(message) - } - }) - - // Add incomplete page if exists - if (currentStateMessages.length > 0 || nextActionMessages.length > 0) { - result.push({ - currentState: { - messages: [...currentStateMessages], - }, - nextAction: - nextActionMessages.length > 0 - ? { - messages: [...nextActionMessages], - } - : undefined, - }) - } - - return result - }, [messages]) - - // Auto-advance to latest page - const [currentPageIndex, setCurrentPageIndex] = useState(0) - useEffect(() => { - setCurrentPageIndex(pages.length - 1) - }, [pages.length]) - // Get initial URL from launch message const initialUrl = useMemo(() => { const launchMessage = messages.find((m) => m.ask === "browser_action_launch") return launchMessage?.text || "" }, [messages]) - // Find the latest available URL and screenshot + // Find the LATEST browser action result only (no history needed) const latestState = useMemo(() => { - for (let i = pages.length - 1; i >= 0; i--) { - const page = pages[i] - if (page.currentState.url || page.currentState.screenshot) { + // Search backwards to find the most recent browser_action_result + for (let i = messages.length - 1; i >= 0; i--) { + const message = messages[i] + if (message.say === "browser_action_result" && message.text && message.text !== "") { + const resultData = JSON.parse(message.text) as BrowserActionResult return { - url: page.currentState.url, - mousePosition: page.currentState.mousePosition, - consoleLogs: page.currentState.consoleLogs, - screenshot: page.currentState.screenshot, + url: resultData.currentUrl, + screenshot: resultData.screenshot, + mousePosition: resultData.currentMousePosition, + consoleLogs: resultData.logs, } } } - return { url: undefined, mousePosition: undefined, consoleLogs: undefined, screenshot: undefined } - }, [pages]) + return { url: undefined, screenshot: undefined, mousePosition: undefined, consoleLogs: undefined } + }, [messages]) - const currentPage = pages[currentPageIndex] - const isLastPage = currentPageIndex === pages.length - 1 + // Display state is simply the latest state or defaults + const displayState = { + url: latestState.url || initialUrl, + screenshot: latestState.screenshot, + mousePosition: latestState.mousePosition || defaultMousePosition, + consoleLogs: latestState.consoleLogs, + } - // Use latest state if we're on the last page and don't have a state yet - const displayState = isLastPage - ? { - url: currentPage?.currentState.url || latestState.url || initialUrl, - mousePosition: - currentPage?.currentState.mousePosition || latestState.mousePosition || defaultMousePosition, - consoleLogs: currentPage?.currentState.consoleLogs, - screenshot: currentPage?.currentState.screenshot || latestState.screenshot, - } - : { - url: currentPage?.currentState.url || initialUrl, - mousePosition: currentPage?.currentState.mousePosition || defaultMousePosition, - consoleLogs: currentPage?.currentState.consoleLogs, - screenshot: currentPage?.currentState.screenshot, + // Find latest browser action and click position + const latestBrowserAction = useMemo(() => { + // Look through messages backwards for the latest browser_action + for (let i = messages.length - 1; i >= 0; i--) { + const message = messages[i] + if (message.say === "browser_action" && message.text) { + const browserAction = JSON.parse(message.text) as ClineSayBrowserAction + return browserAction } - - const [actionContent, { height: actionHeight }] = useSize( -
- {currentPage?.nextAction?.messages.map((message) => ( - - ))} - {!isBrowsing && messages.some((m) => m.say === "browser_action_result") && currentPageIndex === 0 && ( - - )} -
, - ) - - useEffect(() => { - if (actionHeight === 0 || actionHeight === Infinity) { - return - } - if (actionHeight > maxActionHeight) { - setMaxActionHeight(actionHeight) } - }, [actionHeight, maxActionHeight]) + return undefined + }, [messages]) - // Track latest click coordinate const latestClickPosition = useMemo(() => { - if (!isBrowsing) return undefined - - // Look through current page's next actions for the latest browser_action - const actions = currentPage?.nextAction?.messages || [] - for (let i = actions.length - 1; i >= 0; i--) { - const message = actions[i] - if (message.say === "browser_action") { - const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction - if (browserAction.action === "click" && browserAction.coordinate) { - return browserAction.coordinate - } - } + if (!isBrowsing || !latestBrowserAction) return undefined + if (latestBrowserAction.action === "click" && latestBrowserAction.coordinate) { + return latestBrowserAction.coordinate } return undefined - }, [isBrowsing, currentPage?.nextAction?.messages]) + }, [isBrowsing, latestBrowserAction]) - // Use latest click position while browsing, otherwise use display state const mousePosition = isBrowsing ? latestClickPosition || displayState.mousePosition : displayState.mousePosition || defaultMousePosition @@ -354,36 +231,13 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { )} - {/* Action content with min height */} -
{actionContent}
- - {/* Pagination moved to bottom */} - {pages.length > 1 && ( -
-
- {t("chat:browser.navigation.step", { current: currentPageIndex + 1, total: pages.length })} -
-
- setCurrentPageIndex((i) => i - 1)}> - {t("chat:browser.navigation.previous")} - - setCurrentPageIndex((i) => i + 1)}> - {t("chat:browser.navigation.next")} - -
-
+ {/* Show latest browser action */} + {latestBrowserAction && ( + )} , ) @@ -402,93 +256,6 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { return browserSessionRow }, deepEqual) -interface BrowserSessionRowContentProps extends Omit { - message: ClineMessage - setMaxActionHeight: (height: number) => void - isStreaming: boolean -} - -const BrowserSessionRowContent = ({ - message, - isExpanded, - onToggleExpand, - lastModifiedMessage, - isLast, - setMaxActionHeight, - isStreaming, -}: BrowserSessionRowContentProps) => { - const { t } = useTranslation() - const headerStyle: React.CSSProperties = { - display: "flex", - alignItems: "center", - gap: "10px", - marginBottom: "10px", - wordBreak: "break-word", - } - - switch (message.type) { - case "say": - switch (message.say) { - case "api_req_started": - case "text": - return ( -
- { - if (message.say === "api_req_started") { - setMaxActionHeight(0) - } - onToggleExpand(message.ts) - }} - lastModifiedMessage={lastModifiedMessage} - isLast={isLast} - isStreaming={isStreaming} - /> -
- ) - - case "browser_action": - const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction - return ( - - ) - - default: - return null - } - - case "ask": - switch (message.ask) { - case "browser_action_launch": - return ( - <> -
- {t("chat:browser.sessionStarted")} -
-
- -
- - ) - - default: - return null - } - } -} - const BrowserActionBox = ({ action, coordinate, @@ -518,7 +285,7 @@ const BrowserActionBox = ({ } } return ( -
+