From c3bf7fb96417674b1a91d9dfc053733633c14753 Mon Sep 17 00:00:00 2001 From: daniel-lxs Date: Sat, 25 Oct 2025 21:25:03 -0500 Subject: [PATCH 1/2] fix: prevent browser screenshot memory accumulation - Remove old screenshots from apiConversationHistory after each browser action - Simplify BrowserSessionRow to display only current state without pagination - Remove screenshot history and pagination controls This prevents memory issues where screenshots accumulated indefinitely, consuming significant memory and wasting tokens by sending all historical screenshots to the model on every request. Now only the latest screenshot is kept in memory (~100-200KB) instead of potentially MBs of accumulated screenshots. --- src/core/tools/browserActionTool.ts | 26 ++ .../src/components/chat/BrowserSessionRow.tsx | 300 ++---------------- 2 files changed, 53 insertions(+), 273 deletions(-) diff --git a/src/core/tools/browserActionTool.ts b/src/core/tools/browserActionTool.ts index 13cb9b0ec266..3b035661383f 100644 --- a/src/core/tools/browserActionTool.ts +++ b/src/core/tools/browserActionTool.ts @@ -148,6 +148,32 @@ export async function browserActionTool( } } + // Clean up old browser screenshots from API conversation history to prevent memory accumulation + // Only keep the latest screenshot - old ones are no longer needed by the model + if (browserActionResult?.screenshot) { + const apiHistory = cline.apiConversationHistory + for (let i = apiHistory.length - 1; i >= 0; i--) { + const message = apiHistory[i] + if (Array.isArray(message.content)) { + // Filter out old screenshot image blocks + message.content = message.content.filter((block) => { + // Remove base64 image blocks (browser screenshots) + // Keep text blocks and other content + if ( + block.type === "image" && + "source" in block && + block.source.type === "base64" && + (block.source.media_type === "image/webp" || block.source.media_type === "image/png") + ) { + // This is likely an old browser screenshot - remove it + return false + } + return true + }) + } + } + } + switch (action) { case "launch": case "click": diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index c23b79f568a3..5964b4d8a051 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -2,7 +2,6 @@ import React, { memo, useEffect, useMemo, useRef, useState } from "react" import { useSize } from "react-use" import deepEqual from "fast-deep-equal" import { useTranslation } from "react-i18next" -import { VSCodeButton } from "@vscode/webview-ui-toolkit/react" import type { ClineMessage } from "@roo-code/types" @@ -12,7 +11,6 @@ import { vscode } from "@src/utils/vscode" import { useExtensionState } from "@src/context/ExtensionStateContext" import CodeBlock, { CODE_BLOCK_BG_COLOR } from "../common/CodeBlock" -import { ChatRowContent } from "./ChatRow" import { ProgressIndicator } from "./ProgressIndicator" import { Globe, Pointer, SquareTerminal } from "lucide-react" @@ -30,7 +28,6 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { const { messages, isLast, onHeightChange, lastModifiedMessage } = props const { t } = useTranslation() const prevHeightRef = useRef(0) - const [maxActionHeight, setMaxActionHeight] = useState(0) const [consoleLogsExpanded, setConsoleLogsExpanded] = useState(false) const { browserViewportSize = "900x600" } = useExtensionState() @@ -55,182 +52,58 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { }, [messages, lastModifiedMessage, isLast]) const isBrowsing = useMemo(() => { - return isLast && messages.some((m) => m.say === "browser_action_result") && !isLastApiReqInterrupted // after user approves, browser_action_result with "" is sent to indicate that the session has started + return isLast && messages.some((m) => m.say === "browser_action_result") && !isLastApiReqInterrupted }, [isLast, messages, isLastApiReqInterrupted]) - // Organize messages into pages with current state and next action - const pages = useMemo(() => { - const result: { - currentState: { - url?: string - screenshot?: string - mousePosition?: string - consoleLogs?: string - messages: ClineMessage[] // messages up to and including the result - } - nextAction?: { - messages: ClineMessage[] // messages leading to next result - } - }[] = [] - - let currentStateMessages: ClineMessage[] = [] - let nextActionMessages: ClineMessage[] = [] - - messages.forEach((message) => { - if (message.ask === "browser_action_launch") { - // Start first page - currentStateMessages = [message] - } else if (message.say === "browser_action_result") { - if (message.text === "") { - // first browser_action_result is an empty string that signals that session has started - return - } - // Complete current state - currentStateMessages.push(message) - const resultData = JSON.parse(message.text || "{}") as BrowserActionResult - - // Add page with current state and previous next actions - result.push({ - currentState: { - url: resultData.currentUrl, - screenshot: resultData.screenshot, - mousePosition: resultData.currentMousePosition, - consoleLogs: resultData.logs, - messages: [...currentStateMessages], - }, - nextAction: - nextActionMessages.length > 0 - ? { - messages: [...nextActionMessages], - } - : undefined, - }) - - // Reset for next page - currentStateMessages = [] - nextActionMessages = [] - } else if ( - message.say === "api_req_started" || - message.say === "text" || - message.say === "browser_action" - ) { - // These messages lead to the next result, so they should always go in nextActionMessages - nextActionMessages.push(message) - } else { - // Any other message types - currentStateMessages.push(message) - } - }) - - // Add incomplete page if exists - if (currentStateMessages.length > 0 || nextActionMessages.length > 0) { - result.push({ - currentState: { - messages: [...currentStateMessages], - }, - nextAction: - nextActionMessages.length > 0 - ? { - messages: [...nextActionMessages], - } - : undefined, - }) - } - - return result - }, [messages]) - - // Auto-advance to latest page - const [currentPageIndex, setCurrentPageIndex] = useState(0) - useEffect(() => { - setCurrentPageIndex(pages.length - 1) - }, [pages.length]) - // Get initial URL from launch message const initialUrl = useMemo(() => { const launchMessage = messages.find((m) => m.ask === "browser_action_launch") return launchMessage?.text || "" }, [messages]) - // Find the latest available URL and screenshot + // Find the LATEST browser action result only (no history needed) const latestState = useMemo(() => { - for (let i = pages.length - 1; i >= 0; i--) { - const page = pages[i] - if (page.currentState.url || page.currentState.screenshot) { + // Search backwards to find the most recent browser_action_result + for (let i = messages.length - 1; i >= 0; i--) { + const message = messages[i] + if (message.say === "browser_action_result" && message.text && message.text !== "") { + const resultData = JSON.parse(message.text) as BrowserActionResult return { - url: page.currentState.url, - mousePosition: page.currentState.mousePosition, - consoleLogs: page.currentState.consoleLogs, - screenshot: page.currentState.screenshot, + url: resultData.currentUrl, + screenshot: resultData.screenshot, + mousePosition: resultData.currentMousePosition, + consoleLogs: resultData.logs, } } } - return { url: undefined, mousePosition: undefined, consoleLogs: undefined, screenshot: undefined } - }, [pages]) - - const currentPage = pages[currentPageIndex] - const isLastPage = currentPageIndex === pages.length - 1 - - // Use latest state if we're on the last page and don't have a state yet - const displayState = isLastPage - ? { - url: currentPage?.currentState.url || latestState.url || initialUrl, - mousePosition: - currentPage?.currentState.mousePosition || latestState.mousePosition || defaultMousePosition, - consoleLogs: currentPage?.currentState.consoleLogs, - screenshot: currentPage?.currentState.screenshot || latestState.screenshot, - } - : { - url: currentPage?.currentState.url || initialUrl, - mousePosition: currentPage?.currentState.mousePosition || defaultMousePosition, - consoleLogs: currentPage?.currentState.consoleLogs, - screenshot: currentPage?.currentState.screenshot, - } - - const [actionContent, { height: actionHeight }] = useSize( -
- {currentPage?.nextAction?.messages.map((message) => ( - - ))} - {!isBrowsing && messages.some((m) => m.say === "browser_action_result") && currentPageIndex === 0 && ( - - )} -
, - ) + return { url: undefined, screenshot: undefined, mousePosition: undefined, consoleLogs: undefined } + }, [messages]) - useEffect(() => { - if (actionHeight === 0 || actionHeight === Infinity) { - return - } - if (actionHeight > maxActionHeight) { - setMaxActionHeight(actionHeight) - } - }, [actionHeight, maxActionHeight]) + // Display state is simply the latest state or defaults + const displayState = { + url: latestState.url || initialUrl, + screenshot: latestState.screenshot, + mousePosition: latestState.mousePosition || defaultMousePosition, + consoleLogs: latestState.consoleLogs, + } - // Track latest click coordinate + // Find latest click position for cursor display const latestClickPosition = useMemo(() => { if (!isBrowsing) return undefined - // Look through current page's next actions for the latest browser_action - const actions = currentPage?.nextAction?.messages || [] - for (let i = actions.length - 1; i >= 0; i--) { - const message = actions[i] - if (message.say === "browser_action") { - const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction + // Look through messages backwards for the latest browser_action with click + for (let i = messages.length - 1; i >= 0; i--) { + const message = messages[i] + if (message.say === "browser_action" && message.text) { + const browserAction = JSON.parse(message.text) as ClineSayBrowserAction if (browserAction.action === "click" && browserAction.coordinate) { return browserAction.coordinate } } } return undefined - }, [isBrowsing, currentPage?.nextAction?.messages]) + }, [isBrowsing, messages]) - // Use latest click position while browsing, otherwise use display state const mousePosition = isBrowsing ? latestClickPosition || displayState.mousePosition : displayState.mousePosition || defaultMousePosition @@ -353,38 +226,6 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { )} - - {/* Action content with min height */} -
{actionContent}
- - {/* Pagination moved to bottom */} - {pages.length > 1 && ( -
-
- {t("chat:browser.navigation.step", { current: currentPageIndex + 1, total: pages.length })} -
-
- setCurrentPageIndex((i) => i - 1)}> - {t("chat:browser.navigation.previous")} - - setCurrentPageIndex((i) => i + 1)}> - {t("chat:browser.navigation.next")} - -
-
- )} , ) @@ -402,93 +243,6 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { return browserSessionRow }, deepEqual) -interface BrowserSessionRowContentProps extends Omit { - message: ClineMessage - setMaxActionHeight: (height: number) => void - isStreaming: boolean -} - -const BrowserSessionRowContent = ({ - message, - isExpanded, - onToggleExpand, - lastModifiedMessage, - isLast, - setMaxActionHeight, - isStreaming, -}: BrowserSessionRowContentProps) => { - const { t } = useTranslation() - const headerStyle: React.CSSProperties = { - display: "flex", - alignItems: "center", - gap: "10px", - marginBottom: "10px", - wordBreak: "break-word", - } - - switch (message.type) { - case "say": - switch (message.say) { - case "api_req_started": - case "text": - return ( -
- { - if (message.say === "api_req_started") { - setMaxActionHeight(0) - } - onToggleExpand(message.ts) - }} - lastModifiedMessage={lastModifiedMessage} - isLast={isLast} - isStreaming={isStreaming} - /> -
- ) - - case "browser_action": - const browserAction = JSON.parse(message.text || "{}") as ClineSayBrowserAction - return ( - - ) - - default: - return null - } - - case "ask": - switch (message.ask) { - case "browser_action_launch": - return ( - <> -
- {t("chat:browser.sessionStarted")} -
-
- -
- - ) - - default: - return null - } - } -} - const BrowserActionBox = ({ action, coordinate, From 74219c566273728c1dacd6eb4fa192db9cf4a08d Mon Sep 17 00:00:00 2001 From: daniel-lxs Date: Sat, 25 Oct 2025 21:26:12 -0500 Subject: [PATCH 2/2] feat: display latest browser action below screenshot --- .../src/components/chat/BrowserSessionRow.tsx | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/webview-ui/src/components/chat/BrowserSessionRow.tsx b/webview-ui/src/components/chat/BrowserSessionRow.tsx index 5964b4d8a051..b32abd19243a 100644 --- a/webview-ui/src/components/chat/BrowserSessionRow.tsx +++ b/webview-ui/src/components/chat/BrowserSessionRow.tsx @@ -87,22 +87,26 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { consoleLogs: latestState.consoleLogs, } - // Find latest click position for cursor display - const latestClickPosition = useMemo(() => { - if (!isBrowsing) return undefined - - // Look through messages backwards for the latest browser_action with click + // Find latest browser action and click position + const latestBrowserAction = useMemo(() => { + // Look through messages backwards for the latest browser_action for (let i = messages.length - 1; i >= 0; i--) { const message = messages[i] if (message.say === "browser_action" && message.text) { const browserAction = JSON.parse(message.text) as ClineSayBrowserAction - if (browserAction.action === "click" && browserAction.coordinate) { - return browserAction.coordinate - } + return browserAction } } return undefined - }, [isBrowsing, messages]) + }, [messages]) + + const latestClickPosition = useMemo(() => { + if (!isBrowsing || !latestBrowserAction) return undefined + if (latestBrowserAction.action === "click" && latestBrowserAction.coordinate) { + return latestBrowserAction.coordinate + } + return undefined + }, [isBrowsing, latestBrowserAction]) const mousePosition = isBrowsing ? latestClickPosition || displayState.mousePosition @@ -226,6 +230,15 @@ const BrowserSessionRow = memo((props: BrowserSessionRowProps) => { )} + + {/* Show latest browser action */} + {latestBrowserAction && ( + + )} , ) @@ -272,7 +285,7 @@ const BrowserActionBox = ({ } } return ( -
+