From 0a99134e156949f18839de3493dd6b558c90c93d Mon Sep 17 00:00:00 2001
From: Roo Code
Date: Fri, 24 Oct 2025 10:26:33 +0000
Subject: [PATCH] feat: add image count limit handling for context window

- Add maxImages property to ModelInfo type and Anthropic models
- Implement automatic image trimming when limit is exceeded
- Add warnings when images are trimmed from conversation
- Add comprehensive tests for image limit handling

Fixes #8800
---
 packages/types/src/model.ts                |   1 +
 packages/types/src/providers/anthropic.ts  |  10 +
 .../__tests__/image-limit-handler.spec.ts  | 334 ++++++++++++++++++
 src/core/context/image-limit-handler.ts    | 132 +++++++
 src/core/task/Task.ts                      |  40 +++
 5 files changed, 517 insertions(+)
 create mode 100644 src/core/context/__tests__/image-limit-handler.spec.ts
 create mode 100644 src/core/context/image-limit-handler.ts

diff --git a/packages/types/src/model.ts b/packages/types/src/model.ts
index 8d67d3977f0d..a90084ef3ccf 100644
--- a/packages/types/src/model.ts
+++ b/packages/types/src/model.ts
@@ -57,6 +57,7 @@ export const modelInfoSchema = z.object({
 	maxThinkingTokens: z.number().nullish(),
 	contextWindow: z.number(),
 	supportsImages: z.boolean().optional(),
+	maxImages: z.number().optional(),
 	supportsPromptCache: z.boolean(),
 	// Capability flag to indicate whether the model supports an output verbosity parameter
 	supportsVerbosity: z.boolean().optional(),
diff --git a/packages/types/src/providers/anthropic.ts b/packages/types/src/providers/anthropic.ts
index 5fbf62d50782..999f320ca114 100644
--- a/packages/types/src/providers/anthropic.ts
+++ b/packages/types/src/providers/anthropic.ts
@@ -10,6 +10,7 @@ export const anthropicModels = {
 		maxTokens: 64_000, // Overridden to 8k if `enableReasoningEffort` is false.
 		contextWindow: 200_000, // Default 200K, extendable to 1M with beta flag 'context-1m-2025-08-07'
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens (≤200K context)
 		outputPrice: 15.0, // $15 per million output tokens (≤200K context)
@@ -31,6 +32,7 @@ export const anthropicModels = {
 		maxTokens: 64_000, // Overridden to 8k if `enableReasoningEffort` is false.
 		contextWindow: 200_000, // Default 200K, extendable to 1M with beta flag 'context-1m-2025-08-07'
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens (≤200K context)
 		outputPrice: 15.0, // $15 per million output tokens (≤200K context)
@@ -52,6 +54,7 @@ export const anthropicModels = {
 		maxTokens: 8192,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 15.0, // $15 per million input tokens
 		outputPrice: 75.0, // $75 per million output tokens
@@ -63,6 +66,7 @@ export const anthropicModels = {
 		maxTokens: 32_000, // Overridden to 8k if `enableReasoningEffort` is false.
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 15.0, // $15 per million input tokens
 		outputPrice: 75.0, // $75 per million output tokens
@@ -74,6 +78,7 @@ export const anthropicModels = {
 		maxTokens: 128_000, // Unlocked by passing `beta` flag to the model. Otherwise, it's 64k.
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens
 		outputPrice: 15.0, // $15 per million output tokens
@@ -86,6 +91,7 @@ export const anthropicModels = {
 		maxTokens: 8192, // Since we already have a `:thinking` virtual model we aren't setting `supportsReasoningBudget: true` here.
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens
 		outputPrice: 15.0, // $15 per million output tokens
@@ -96,6 +102,7 @@ export const anthropicModels = {
 		maxTokens: 8192,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 3.0, // $3 per million input tokens
 		outputPrice: 15.0, // $15 per million output tokens
@@ -116,6 +123,7 @@ export const anthropicModels = {
 		maxTokens: 4096,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 15.0,
 		outputPrice: 75.0,
@@ -126,6 +134,7 @@ export const anthropicModels = {
 		maxTokens: 4096,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 0.25,
 		outputPrice: 1.25,
@@ -136,6 +145,7 @@ export const anthropicModels = {
 		maxTokens: 64_000,
 		contextWindow: 200_000,
 		supportsImages: true,
+		maxImages: 20,
 		supportsPromptCache: true,
 		inputPrice: 1.0,
 		outputPrice: 5.0,
diff --git a/src/core/context/__tests__/image-limit-handler.spec.ts b/src/core/context/__tests__/image-limit-handler.spec.ts
new file mode 100644
index 000000000000..519088f7e67d
--- /dev/null
+++ b/src/core/context/__tests__/image-limit-handler.spec.ts
@@ -0,0 +1,334 @@
+import { describe, it, expect } from "vitest"
+import { Anthropic } from "@anthropic-ai/sdk"
+import type { ModelInfo } from "@roo-code/types"
+import type { ApiMessage } from "../../task-persistence"
+import { countImagesInConversation, trimImagesFromConversation, wouldExceedImageLimit } from "../image-limit-handler"
+
+describe("image-limit-handler", () => {
+	describe("countImagesInConversation", () => {
+		it("should count images in conversation correctly", () => {
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "text", text: "Hello" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+					],
+					ts: Date.now(),
+				},
+				{
+					role: "assistant",
+					content: [{ type: "text", text: "I see your image" }],
+					ts: Date.now(),
+				},
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data3" } },
+						{ type: "text", text: "Here are two more images" },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const count = countImagesInConversation(messages)
+			expect(count).toBe(3)
+		})
+
+		it("should return 0 when no images in conversation", () => {
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [{ type: "text", text: "Hello" }],
+					ts: Date.now(),
+				},
+				{
+					role: "assistant",
+					content: [{ type: "text", text: "Hi there" }],
+					ts: Date.now(),
+				},
+			]
+
+			const count = countImagesInConversation(messages)
+			expect(count).toBe(0)
+		})
+
+		it("should handle messages with string content", () => {
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: "Simple text message",
+					ts: Date.now(),
+				},
+				{
+					role: "assistant",
+					content: "Response",
+					ts: Date.now(),
+				},
+			]
+
+			const count = countImagesInConversation(messages)
+			expect(count).toBe(0)
+		})
+	})
+
+	describe("trimImagesFromConversation", () => {
+		it("should trim oldest images when exceeding limit", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 2,
+				supportsPromptCache: true,
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "text", text: "First message" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+					],
+					ts: Date.now(),
+				},
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+						{ type: "text", text: "Second message" },
+					],
+					ts: Date.now(),
+				},
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data3" } },
+						{ type: "text", text: "Third message" },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(1)
+			expect(result.warningMessage).toContain("Removed 1 image(s)")
+			expect(countImagesInConversation(result.messages)).toBe(2)
+
+			// Check that the first image was replaced with a placeholder
+			const firstMessage = result.messages[0]
+			if (Array.isArray(firstMessage.content)) {
+				const hasPlaceholder = firstMessage.content.some(
+					(block) => block.type === "text" && block.text.includes("[Image removed"),
+				)
+				expect(hasPlaceholder).toBe(true)
+			}
+		})
+
+		it("should not trim when within limit", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 5,
+				supportsPromptCache: true,
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "text", text: "Message" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(0)
+			expect(result.warningMessage).toBeUndefined()
+			expect(result.messages).toEqual(messages)
+		})
+
+		it("should handle model without image support", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: false,
+				supportsPromptCache: true,
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } }],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(0)
+			expect(result.messages).toEqual(messages)
+		})
+
+		it("should handle model without maxImages defined", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				supportsPromptCache: true,
+				// maxImages not defined
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(0)
+			expect(result.messages).toEqual(messages)
+		})
+
+		it("should trim multiple images and preserve text content", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 1,
+				supportsPromptCache: true,
+			}
+
+			const messages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "text", text: "Here are images:" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+						{ type: "text", text: "What do you see?" },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data3" } },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const result = trimImagesFromConversation(messages, modelInfo)
+
+			expect(result.trimmedCount).toBe(2)
+			expect(countImagesInConversation(result.messages)).toBe(1)
+
+			// Check that text content is preserved
+			const firstMessage = result.messages[0]
+			if (Array.isArray(firstMessage.content)) {
+				const textBlocks = firstMessage.content.filter((block) => block.type === "text")
+				const hasOriginalText = textBlocks.some((block) => block.text.includes("Here are images:"))
+				const hasQuestionText = textBlocks.some((block) => block.text.includes("What do you see?"))
+				expect(hasOriginalText).toBe(true)
+				expect(hasQuestionText).toBe(true)
+			}
+		})
+	})
+
+	describe("wouldExceedImageLimit", () => {
+		it("should detect when adding content would exceed limit", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 3,
+				supportsPromptCache: true,
+			}
+
+			const currentMessages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } },
+						{ type: "image", source: { type: "base64", media_type: "image/png", data: "data2" } },
+					],
+					ts: Date.now(),
+				},
+			]
+
+			const newContent: Anthropic.Messages.ContentBlockParam[] = [
+				{
+					type: "image" as const,
+					source: { type: "base64" as const, media_type: "image/png" as const, data: "data3" },
+				},
+				{
+					type: "image" as const,
+					source: { type: "base64" as const, media_type: "image/png" as const, data: "data4" },
+				},
+			]
+
+			const result = wouldExceedImageLimit(currentMessages, newContent, modelInfo)
+			expect(result).toBe(true)
+		})
+
+		it("should return false when within limit", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 5,
+				supportsPromptCache: true,
+			}
+
+			const currentMessages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } }],
+					ts: Date.now(),
+				},
+			]
+
+			const newContent: Anthropic.Messages.ContentBlockParam[] = [
+				{
+					type: "image" as const,
+					source: { type: "base64" as const, media_type: "image/png" as const, data: "data2" },
+				},
+			]
+
+			const result = wouldExceedImageLimit(currentMessages, newContent, modelInfo)
+			expect(result).toBe(false)
+		})
+
+		it("should handle text-only content", () => {
+			const modelInfo: ModelInfo = {
+				maxTokens: 8192,
+				contextWindow: 200000,
+				supportsImages: true,
+				maxImages: 1,
+				supportsPromptCache: true,
+			}
+
+			const currentMessages: ApiMessage[] = [
+				{
+					role: "user",
+					content: [{ type: "image", source: { type: "base64", media_type: "image/png", data: "data1" } }],
+					ts: Date.now(),
+				},
+			]
+
+			const newContent: Anthropic.Messages.ContentBlockParam[] = [
+				{ type: "text" as const, text: "Just text, no images" },
+			]
+
+			const result = wouldExceedImageLimit(currentMessages, newContent, modelInfo)
+			expect(result).toBe(false)
+		})
+	})
+})
diff --git a/src/core/context/image-limit-handler.ts b/src/core/context/image-limit-handler.ts
new file mode 100644
index 000000000000..d87a3e9130b4
--- /dev/null
+++ b/src/core/context/image-limit-handler.ts
@@ -0,0 +1,132 @@
+import { Anthropic } from "@anthropic-ai/sdk"
+import type { ModelInfo } from "@roo-code/types"
+import type { ApiMessage } from "../task-persistence"
+
+export interface ImageTrimResult {
+	messages: ApiMessage[]
+	trimmedCount: number
+	warningMessage?: string
+}
+
+/**
+ * Count the total number of images in the conversation history
+ */
+export function countImagesInConversation(messages: ApiMessage[]): number {
+	let imageCount = 0
+
+	for (const message of messages) {
+		if (Array.isArray(message.content)) {
+			for (const block of message.content) {
+				if (block.type === "image") {
+					imageCount++
+				}
+			}
+		}
+	}
+
+	return imageCount
+}
+
+/**
+ * Trim the oldest images from the conversation history to stay within the model's image limit
+ */
+export function trimImagesFromConversation(messages: ApiMessage[], modelInfo: ModelInfo): ImageTrimResult {
+	// If model doesn't support images or doesn't have a limit, return as-is
+	if (!modelInfo.supportsImages || !modelInfo.maxImages) {
+		return {
+			messages,
+			trimmedCount: 0,
+		}
+	}
+
+	const maxImages = modelInfo.maxImages
+	const currentImageCount = countImagesInConversation(messages)
+
+	// If within limit, no trimming needed
+	if (currentImageCount <= maxImages) {
+		return {
+			messages,
+			trimmedCount: 0,
+		}
+	}
+
+	// Calculate how many images to trim
+	const imagesToTrim = currentImageCount - maxImages
+	let trimmedCount = 0
+
+	// Create a deep copy of messages to avoid modifying the original
+	const trimmedMessages: ApiMessage[] = JSON.parse(JSON.stringify(messages))
+
+	// Iterate through messages from oldest to newest and remove images
+	for (let i = 0; i < trimmedMessages.length && trimmedCount < imagesToTrim; i++) {
+		const message = trimmedMessages[i]
+
+		if (Array.isArray(message.content)) {
+			const newContent: Anthropic.Messages.ContentBlockParam[] = []
+			let addedPlaceholder = false
+
+			for (const block of message.content) {
+				if (block.type === "image" && trimmedCount < imagesToTrim) {
+					// Replace the first trimmed image with a placeholder text
+					if (!addedPlaceholder) {
+						newContent.push({
+							type: "text",
+							text: "[Image removed to stay within model's image limit]",
+						})
+						addedPlaceholder = true
+					}
+					trimmedCount++
+				} else {
+					newContent.push(block)
+				}
+			}
+
+			// Update the message content
+			message.content = newContent
+
+			// If all content was removed, ensure at least one text block remains
+			if (newContent.length === 0) {
+				message.content = [
+					{
+						type: "text",
+						text: "[Content removed to stay within model's image limit]",
+					},
+				]
+			}
+		}
+	}
+
+	const warningMessage = `⚠️ Removed ${trimmedCount} image(s) from conversation history to stay within the model's limit of ${maxImages} images. The oldest images were removed first.`
+
+	return {
+		messages: trimmedMessages,
+		trimmedCount,
+		warningMessage,
+	}
+}
+
+/**
+ * Check if adding new content would exceed the image limit
+ */
+export function wouldExceedImageLimit(
+	currentMessages: ApiMessage[],
+	newContent: Anthropic.Messages.ContentBlockParam[],
+	modelInfo: ModelInfo,
+): boolean {
+	if (!modelInfo.supportsImages || !modelInfo.maxImages) {
+		return false
+	}
+
+	const currentImageCount = countImagesInConversation(currentMessages)
+	let newImageCount = 0
+
+	if (Array.isArray(newContent)) {
+		for (const block of newContent) {
+			if (block.type === "image") {
+				newImageCount++
+			}
+		}
+	}
+
+	return currentImageCount + newImageCount > modelInfo.maxImages
+}
diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts
index 74cbd2a11005..0686a1ccac71 100644
--- a/src/core/task/Task.ts
+++ b/src/core/task/Task.ts
@@ -103,6 +103,7 @@ import {
 } from "../task-persistence"
 import { getEnvironmentDetails } from "../environment/getEnvironmentDetails"
 import { checkContextWindowExceededError } from "../context/context-management/context-error-handling"
+import { trimImagesFromConversation, countImagesInConversation } from "../context/image-limit-handler"
 import {
 	type CheckpointDiffOptions,
 	type CheckpointRestoreOptions,
@@ -601,6 +602,26 @@ export class Task extends EventEmitter implements TaskLike {
 	private async addToApiConversationHistory(message: Anthropic.MessageParam) {
 		const messageWithTs = { ...message, ts: Date.now() }
 		this.apiConversationHistory.push(messageWithTs)
+
+		// Check if we need to trim images after adding the new message
+		const modelInfo = this.api.getModel().info
+		if (modelInfo.supportsImages && modelInfo.maxImages) {
+			const imageCount = countImagesInConversation(this.apiConversationHistory)
+			if (imageCount > modelInfo.maxImages) {
+				const trimResult = trimImagesFromConversation(this.apiConversationHistory, modelInfo)
+
+				// Update conversation history with trimmed messages
+				this.apiConversationHistory = trimResult.messages
+
+				// Notify user about trimmed images
+				if (trimResult.warningMessage) {
+					await this.say("text", trimResult.warningMessage, undefined, false, undefined, undefined, {
+						isNonInteractive: true,
+					})
+				}
+			}
+		}
+
 		await this.saveApiConversationHistory()
 	}
 
@@ -2569,6 +2590,25 @@ export class Task extends EventEmitter implements TaskLike {
 			profileThresholds = {},
 		} = state ?? {}
 
+		// Check and trim images if needed
+		const modelInfo = this.api.getModel().info
+		if (modelInfo.supportsImages && modelInfo.maxImages) {
+			const imageCount = countImagesInConversation(this.apiConversationHistory)
+			if (imageCount > modelInfo.maxImages) {
+				const trimResult = trimImagesFromConversation(this.apiConversationHistory, modelInfo)
+
+				// Update conversation history with trimmed messages
+				await this.overwriteApiConversationHistory(trimResult.messages)
+
+				// Notify user about trimmed images
+				if (trimResult.warningMessage) {
+					await this.say("text", trimResult.warningMessage, undefined, false, undefined, undefined, {
+						isNonInteractive: true,
+					})
+				}
+			}
+		}
+
 		// Get condensing configuration for automatic triggers.
 		const customCondensingPrompt = state?.customCondensingPrompt
 		const condensingApiConfigId = state?.condensingApiConfigId
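---
Usage sketch (not part of the diff above): how the three exported helpers are intended to compose at a call site, mirroring the Task.ts integration. This assumes the `ApiMessage` shape used in the tests (`Anthropic.MessageParam` plus a `ts` timestamp); the `appendWithImageBudget` wrapper is hypothetical and exists only to illustrate the pre-check/trim flow, not code from this PR.

import { Anthropic } from "@anthropic-ai/sdk"
import type { ModelInfo } from "@roo-code/types"
import type { ApiMessage } from "../task-persistence"
import {
	countImagesInConversation,
	trimImagesFromConversation,
	wouldExceedImageLimit,
} from "./image-limit-handler"

// Hypothetical call site: append a user turn, then trim the oldest images if
// the model's maxImages budget would be exceeded.
function appendWithImageBudget(
	history: ApiMessage[],
	content: Anthropic.Messages.ContentBlockParam[],
	modelInfo: ModelInfo,
): { history: ApiMessage[]; warning?: string } {
	const next: ApiMessage[] = [...history, { role: "user", content, ts: Date.now() }]

	// Cheap pre-check: skip the deep copy inside the trimmer when no trim is needed.
	if (!wouldExceedImageLimit(history, content, modelInfo)) {
		return { history: next }
	}

	// Oldest images become "[Image removed ...]" text placeholders; text blocks survive.
	const { messages, warningMessage } = trimImagesFromConversation(next, modelInfo)
	console.assert(countImagesInConversation(messages) <= (modelInfo.maxImages ?? Infinity))
	return { history: messages, warning: warningMessage }
}

In the patch itself the same flow runs inside `addToApiConversationHistory` and before condensation, with the returned `warningMessage` surfaced to the user through `say(...)` as a non-interactive notice.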