From 0a99134e156949f18839de3493dd6b558c90c93d Mon Sep 17 00:00:00 2001
From: Roo Code
Date: Fri, 24 Oct 2025 10:26:33 +0000
Subject: [PATCH] feat: add image count limit handling for context window

- Add maxImages property to ModelInfo type and Anthropic models
- Implement automatic image trimming when limit is exceeded
- Add warnings when images are trimmed from conversation
- Add comprehensive tests for image limit handling

Fixes #8800
---
 packages/types/src/model.ts                |   1 +
 packages/types/src/providers/anthropic.ts  |  10 +
 .../__tests__/image-limit-handler.spec.ts  | 334 ++++++++++++++++++
 src/core/context/image-limit-handler.ts    | 132 +++++++
 src/core/task/Task.ts                      |  40 +++
 5 files changed, 517 insertions(+)
 create mode 100644 src/core/context/__tests__/image-limit-handler.spec.ts
 create mode 100644 src/core/context/image-limit-handler.ts

diff --git a/packages/types/src/model.ts b/packages/types/src/model.ts
index 8d67d3977f0d..a90084ef3ccf 100644
--- a/packages/types/src/model.ts
+++ b/packages/types/src/model.ts
@@ -57,6 +57,7 @@ export const modelInfoSchema = z.object({
 	maxThinkingTokens: z.number().nullish(),
 	contextWindow: z.number(),
 	supportsImages: z.boolean().optional(),
+	maxImages: z.number().optional(),
 	supportsPromptCache: z.boolean(),
 	// Capability flag to indicate whether the model supports an output verbosity parameter
 	supportsVerbosity: z.boolean().optional(),
diff --git a/packages/types/src/providers/anthropic.ts b/packages/types/src/providers/anthropic.ts
index 5fbf62d50782..999f320ca114 100644
--- a/packages/types/src/providers/anthropic.ts
+++ b/packages/types/src/providers/anthropic.ts
@@ -10,6 +10,7 @@ export const anthropicModels = {
 		maxTokens: 64_000, // Overridden to 8k if `enableReasoningEffort` is false.
 		contextWindow: 200_000, // Default 200K, extendable to 1M with beta flag 'context-1m-2025-08-07'
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens (≤200K context)
 		outputPrice: 15.0, // $15 per million output tokens (≤200K context)
@@ -31,6 +32,7 @@ export const anthropicModels = {
 		maxTokens: 64_000, // Overridden to 8k if `enableReasoningEffort` is false.
 		contextWindow: 200_000, // Default 200K, extendable to 1M with beta flag 'context-1m-2025-08-07'
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens (≤200K context)
 		outputPrice: 15.0, // $15 per million output tokens (≤200K context)
@@ -52,6 +54,7 @@ export const anthropicModels = {
 		maxTokens: 8192,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 15.0, // $15 per million input tokens
 		outputPrice: 75.0, // $75 per million output tokens
@@ -63,6 +66,7 @@ export const anthropicModels = {
 		maxTokens: 32_000, // Overridden to 8k if `enableReasoningEffort` is false.
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 15.0, // $15 per million input tokens
 		outputPrice: 75.0, // $75 per million output tokens
@@ -74,6 +78,7 @@ export const anthropicModels = {
 		maxTokens: 128_000, // Unlocked by passing `beta` flag to the model. Otherwise, it's 64k.
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens
 		outputPrice: 15.0, // $15 per million output tokens
@@ -86,6 +91,7 @@ export const anthropicModels = {
 		maxTokens: 8192, // Since we already have a `:thinking` virtual model we aren't setting `supportsReasoningBudget: true` here.
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens
 		outputPrice: 15.0, // $15 per million output tokens
@@ -96,6 +102,7 @@ export const anthropicModels = {
 		maxTokens: 8192,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens
 		outputPrice: 15.0, // $15 per million output tokens
@@ -116,6 +123,7 @@ export const anthropicModels = {
 		maxTokens: 4096,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 15.0,
 		outputPrice: 75.0,
@@ -126,6 +134,7 @@ export const anthropicModels = {
 		maxTokens: 4096,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 0.25,
 		outputPrice: 1.25,
@@ -136,6 +145,7 @@ export const anthropicModels = {
 		maxTokens: 64_000,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 1.0,
 		outputPrice: 5.0,
diff --git a/src/core/context/__tests__/image-limit-handler.spec.ts b/src/core/context/__tests__/image-limit-handler.spec.ts
new file mode 100644
index 000000000000..519088f7e67d
--- /dev/null
+++ b/src/core/context/__tests__/image-limit-handler.spec.ts
@@ -0,0 +1,334 @@
+import { describe, it, expect } from "vitest"
+import { Anthropic } from "@anthropic-ai/sdk"
+import type { ModelInfo } from "@roo-code/types"
+import type { ApiMessage } from "../../task-persistence"
+import { countImagesInConversation, trimImagesFromConversation, wouldExceedImageLimit } from "../image-limit-handler"
+
+describe("image-limit-handler", () => {
+	describe("countImagesInConversation", () => {
+		it("should count images in conversation correctly", () => {
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "text", text: "Hello" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+					],
+					ts: Date.now(),
+				},
+				{
+					role: "assistant",
+					content: [{ type: "text", text: "I see your image" }],
+					ts: Date.now(),
+				},
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data3" } },
+						{ type: "text", text: "Here are two more images" },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const count = countImagesInConversation(messages)
+			expect(count).toBe(3)
+		})
+
+		it("should return 0 when no images in conversation", () => {
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [{ type: "text", text: "Hello" }],
+					ts: Date.now(),
+				},
+				{
+					role: "assistant",
+					content: [{ type: "text", text: "Hi there" }],
+					ts: Date.now(),
+				},
+			]
+
+			const count = countImagesInConversation(messages)
+			expect(count).toBe(0)
+		})
+
+		it("should handle messages with string content", () => {
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: "Simple text message",
+					ts: Date.now(),
+				},
+				{
+					role: "assistant",
+					content: "Response",
+					ts: Date.now(),
+				},
+			]
+
+			const count = countImagesInConversation(messages)
+			expect(count).toBe(0)
+		})
+	})
+
+	describe("trimImagesFromConversation", () => {
+		it("should trim oldest images when exceeding limit", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 2,
+				supportsPromptCache: true,
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "text", text: "First message" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+					],
+					ts: Date.now(),
+				},
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+						{ type: "text", text: "Second message" },
+					],
+					ts: Date.now(),
+				},
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data3" } },
+						{ type: "text", text: "Third message" },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(1)
+			expect(result.warningMessage).toContain("Removed 1 image(s)")
+			expect(countImagesInConversation(result.messages)).toBe(2)
+
+			// Check that the first image was replaced with a placeholder
+			const firstMessage = result.messages[0]
+			if (Array.isArray(firstMessage.content)) {
+				const hasPlaceholder = firstMessage.content.some(
+					(block) => block.type === "text" && block.text.includes("[Image removed"),
+				)
+				expect(hasPlaceholder).toBe(true)
+			}
+		})
+
+		it("should not trim when within limit", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 5,
+				supportsPromptCache: true,
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "text", text: "Message" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(0)
+			expect(result.warningMessage).toBeUndefined()
+			expect(result.messages).toEqual(messages)
+		})
+
+		it("should handle model without image support", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: false,
+				supportsPromptCache: true,
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } }],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(0)
+			expect(result.messages).toEqual(messages)
+		})
+
+		it("should handle model without maxImages defined", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				supportsPromptCache: true,
+				// maxImages not defined
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(0)
+			expect(result.messages).toEqual(messages)
+		})
+
+		it("should trim multiple images and preserve text content", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 1,
+				supportsPromptCache: true,
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "text", text: "Here are images:" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+						{ type: "text", text: "What do you see?" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data3" } },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(2)
+			expect(countImagesInConversation(result.messages)).toBe(1)
+
+			// Check that text content is preserved
+			const firstMessage = result.messages[0]
+			if (Array.isArray(firstMessage.content)) {
+				const textBlocks = firstMessage.content.filter((block) => block.type === "text")
+				const hasOriginalText = textBlocks.some((block) => block.text.includes("Here are images:"))
+				const hasQuestionText = textBlocks.some((block) => block.text.includes("What do you see?"))
+				expect(hasOriginalText).toBe(true)
+				expect(hasQuestionText).toBe(true)
+			}
+		})
+	})
+
+	describe("wouldExceedImageLimit", () => {
+		it("should detect when adding content would exceed limit", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 3,
+				supportsPromptCache: true,
+			}
+
+			const currentMessages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const newContent: Anthropic.Messages.ContentBlockParam[] = [
+				{
+					type: "image" as const,
+					source: { type: "base64" as const, media_type: "image/png" as const, data: "data3" },
+				},
+				{
+					type: "image" as const,
+					source: { type: "base64" as const, media_type: "image/png" as const, data: "data4" },
+				},
+			]
+
+			const result = wouldExceedImageLimit(currentMessages, newContent, modelInfo)
+			expect(result).toBe(true)
+		})
+
+		it("should return false when within limit", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 5,
+				supportsPromptCache: true,
+			}
+
+			const currentMessages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } }],
+					ts: Date.now(),
+				},
+			]
+
+			const newContent: Anthropic.Messages.ContentBlockParam[] = [
+				{
+					type: "image" as const,
+					source: { type: "base64" as const, media_type: "image/png" as const, data: "data2" },
+				},
+			]
+
+			const result = wouldExceedImageLimit(currentMessages, newContent, modelInfo)
+			expect(result).toBe(false)
+		})
+
+		it("should handle text-only content", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 1,
+				supportsPromptCache: true,
+			}
+
+			const currentMessages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } }],
+					ts: Date.now(),
+				},
+			]
+
+			const newContent: Anthropic.Messages.ContentBlockParam[] = [
+				{ type: "text" as const, text: "Just text, no images" },
+			]
+
+			const result = wouldExceedImageLimit(currentMessages, newContent, modelInfo)
+			expect(result).toBe(false)
+		})
+	})
+})
diff --git a/src/core/context/image-limit-handler.ts b/src/core/context/image-limit-handler.ts
new file mode 100644
index 000000000000..d87a3e9130b4
--- /dev/null
+++ b/src/core/context/image-limit-handler.ts
@@ -0,0 +1,132 @@
+import { Anthropic } from "@anthropic-ai/sdk"
+import type { ModelInfo } from "@roo-code/types"
+import type { ApiMessage } from "../task-persistence"
+
+export interface ImageTrimResult {
+	messages: ApiMessage[]
+	trimmedCount: number
+	warningMessage?: string
+}
+
+/**
+ * Count the total number of images in the conversation history
+ */
+export function countImagesInConversation(messages: ApiMessage[]): number {
+	let imageCount = 0
+
+	for (const message of messages) {
+		if (Array.isArray(message.content)) {
+			for (const block of message.content) {
+				if (block.type === "image") {
+					imageCount++
+				}
+			}
+		}
+	}
+
+	return imageCount
+}
+
+/**
+ * Trim the oldest images from the conversation history to stay within the model's image limit
+ */
+export function trimImagesFromConversation(messages: ApiMessage[], modelInfo: ModelInfo): ImageTrimResult {
+	// If model doesn't support images or doesn't have a limit, return as-is
+	if (!modelInfo.supportsImages || !modelInfo.maxImages) {
+		return {
+			messages,
+			trimmedCount: 0,
+		}
+	}
+
+	const maxImages = modelInfo.maxImages
+	const currentImageCount = countImagesInConversation(messages)
+
+	// If within limit, no trimming needed
+	if (currentImageCount <= maxImages) {
+		return {
+			messages,
+			trimmedCount: 0,
+		}
+	}
+
+	// Calculate how many images to trim
+	const imagesToTrim = currentImageCount - maxImages
+	let trimmedCount = 0
+
+	// Create a deep copy of messages to avoid modifying the original
+	const trimmedMessages: ApiMessage[] = JSON.parse(JSON.stringify(messages))
+
+	// Iterate through messages from oldest to newest and remove images
+	for (let i = 0; i < trimmedMessages.length && trimmedCount < imagesToTrim; i++) {
+		const message = trimmedMessages[i]
+
+		if (Array.isArray(message.content)) {
+			const newContent: Anthropic.Messages.ContentBlockParam[] = []
+			let addedPlaceholder = false
+
+			for (const block of message.content) {
+				if (block.type === "image" && trimmedCount < imagesToTrim) {
+					// Replace the first trimmed image with a placeholder text
+					if (!addedPlaceholder) {
+						newContent.push({
+							type: "text",
+							text: "[Image removed to stay within model's image limit]",
+						})
+						addedPlaceholder = true
+					}
+					trimmedCount++
+				} else {
+					newContent.push(block)
+				}
+			}
+
+			// Update the message content
+			message.content = newContent
+
+			// If all content was removed, ensure at least one text block remains
+			if (newContent.length === 0) {
+				message.content = [
+					{
+						type: "text",
+						text: "[Content removed to stay within model's image limit]",
+					},
+				]
+			}
+		}
+	}
+
+	const warningMessage = `⚠️ Removed ${trimmedCount} image(s) from conversation history to stay within the model's limit of ${maxImages} images. The oldest images were removed first.`
+
+	return {
+		messages: trimmedMessages,
+		trimmedCount,
+		warningMessage,
+	}
+}
+
+/**
+ * Check if adding new content would exceed the image limit
+ */
+export function wouldExceedImageLimit(
+	currentMessages: ApiMessage[],
+	newContent: Anthropic.Messages.ContentBlockParam[],
+	modelInfo: ModelInfo,
+): boolean {
+	if (!modelInfo.supportsImages || !modelInfo.maxImages) {
+		return false
+	}
+
+	const currentImageCount = countImagesInConversation(currentMessages)
+	let newImageCount = 0
+
+	if (Array.isArray(newContent)) {
+		for (const block of newContent) {
+			if (block.type === "image") {
+				newImageCount++
+			}
+		}
+	}
+
+	return currentImageCount + newImageCount > modelInfo.maxImages
+}
diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts
index 74cbd2a11005..0686a1ccac71 100644
--- a/src/core/task/Task.ts
+++ b/src/core/task/Task.ts
@@ -103,6 +103,7 @@ import {
 } from "../task-persistence"
 import { getEnvironmentDetails } from "../environment/getEnvironmentDetails"
 import { checkContextWindowExceededError } from "../context/context-management/context-error-handling"
+import { trimImagesFromConversation, countImagesInConversation } from "../context/image-limit-handler"
 import {
 	type CheckpointDiffOptions,
 	type CheckpointRestoreOptions,
@@ -601,6 +602,26 @@ export class Task extends EventEmitter implements TaskLike {
 	private async addToApiConversationHistory(message: Anthropic.MessageParam) {
 		const messageWithTs = { ...message, ts: Date.now() }
 		this.apiConversationHistory.push(messageWithTs)
+
+		// Check if we need to trim images after adding the new message
+		const modelInfo = this.api.getModel().info
+		if (modelInfo.supportsImages && modelInfo.maxImages) {
+			const imageCount = countImagesInConversation(this.apiConversationHistory)
+			if (imageCount > modelInfo.maxImages) {
+				const trimResult = trimImagesFromConversation(this.apiConversationHistory, modelInfo)
+
+				// Update conversation history with trimmed messages
+				this.apiConversationHistory = trimResult.messages
+
+				// Notify user about trimmed images
+				if (trimResult.warningMessage) {
+					await this.say("text", trimResult.warningMessage, undefined, false, undefined, undefined, {
+						isNonInteractive: true,
+					})
+				}
+			}
+		}
+
 		await this.saveApiConversationHistory()
 	}
 
@@ -2569,6 +2590,25 @@ export class Task extends EventEmitter implements TaskLike {
 			profileThresholds = {},
 		} = state ?? {}
 
+		// Check and trim images if needed
+		const modelInfo = this.api.getModel().info
+		if (modelInfo.supportsImages && modelInfo.maxImages) {
+			const imageCount = countImagesInConversation(this.apiConversationHistory)
+			if (imageCount > modelInfo.maxImages) {
+				const trimResult = trimImagesFromConversation(this.apiConversationHistory, modelInfo)
+
+				// Update conversation history with trimmed messages
+				await this.overwriteApiConversationHistory(trimResult.messages)
+
+				// Notify user about trimmed images
+				if (trimResult.warningMessage) {
+					await this.say("text", trimResult.warningMessage, undefined, false, undefined, undefined, {
+						isNonInteractive: true,
+					})
+				}
+			}
+		}
+
 		// Get condensing configuration for automatic triggers.
 		const customCondensingPrompt = state?.customCondensingPrompt
 		const condensingApiConfigId = state?.condensingApiConfigId
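---
Usage sketch (not part of the diff above): how the three exported helpers are intended to compose at a call site, mirroring the Task.ts integration. This assumes the `ApiMessage` shape used in the tests (`Anthropic.MessageParam` plus a `ts` timestamp); the `appendWithImageBudget` wrapper is hypothetical and exists only to illustrate the pre-check/trim flow, not code from this PR.

import { Anthropic } from "@anthropic-ai/sdk"
import type { ModelInfo } from "@roo-code/types"
import type { ApiMessage } from "../task-persistence"
import {
	countImagesInConversation,
	trimImagesFromConversation,
	wouldExceedImageLimit,
} from "./image-limit-handler"

// Hypothetical call site: append a user turn, then trim the oldest images if
// the model's maxImages budget would be exceeded.
function appendWithImageBudget(
	history: ApiMessage[],
	content: Anthropic.Messages.ContentBlockParam[],
	modelInfo: ModelInfo,
): { history: ApiMessage[]; warning?: string } {
	const next: ApiMessage[] = [...history, { role: "user", content, ts: Date.now() }]

	// Cheap pre-check: skip the deep copy inside the trimmer when no trim is needed.
	if (!wouldExceedImageLimit(history, content, modelInfo)) {
		return { history: next }
	}

	// Oldest images become "[Image removed ...]" text placeholders; text blocks survive.
	const { messages, warningMessage } = trimImagesFromConversation(next, modelInfo)
	console.assert(countImagesInConversation(messages) <= (modelInfo.maxImages ?? Infinity))
	return { history: messages, warning: warningMessage }
}

In the patch itself the same flow runs inside `addToApiConversationHistory` and before condensation, with the returned `warningMessage` surfaced to the user through `say(...)` as a non-interactive notice.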