Skip to content

feat: enhance MCP tool response handling to support image responses #5185

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions src/core/prompts/responses.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,16 @@ Otherwise, if you have not completed the task and do not need additional informa
images?: string[],
): string | Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam> => {
if (images && images.length > 0) {
const textBlock: Anthropic.TextBlockParam = { type: "text", text }
const imageBlocks: Anthropic.ImageBlockParam[] = formatImagesIntoBlocks(images)
// Placing images after text leads to better results
return [textBlock, ...imageBlocks]

if (text.trim()) {
const textBlock: Anthropic.TextBlockParam = { type: "text", text }
// Placing images after text leads to better results
return [textBlock, ...imageBlocks]
} else {
// For image-only responses, return only image blocks
return imageBlocks
}
} else {
return text
}
Expand Down
110 changes: 108 additions & 2 deletions src/core/tools/__tests__/useMcpToolTool.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@ import { ToolUse } from "../../../shared/tools"
// Mock dependencies
vi.mock("../../prompts/responses", () => ({
formatResponse: {
toolResult: vi.fn((result: string) => `Tool result: ${result}`),
toolResult: vi.fn((result: string, images?: string[]) => {
if (images && images.length > 0) {
return `Tool result: ${result} (with ${images.length} images)`
}
return `Tool result: ${result}`
}),
toolError: vi.fn((error: string) => `Tool error: ${error}`),
invalidMcpToolArgumentError: vi.fn((server: string, tool: string) => `Invalid args for ${server}:${tool}`),
},
Expand Down Expand Up @@ -208,10 +213,111 @@ describe("useMcpToolTool", () => {
expect(mockTask.consecutiveMistakeCount).toBe(0)
expect(mockAskApproval).toHaveBeenCalled()
expect(mockTask.say).toHaveBeenCalledWith("mcp_server_request_started")
expect(mockTask.say).toHaveBeenCalledWith("mcp_server_response", "Tool executed successfully")
expect(mockTask.say).toHaveBeenCalledWith("mcp_server_response", "Tool executed successfully", [])
expect(mockPushToolResult).toHaveBeenCalledWith("Tool result: Tool executed successfully")
})

it("should handle tool result with text and images", async () => {
const block: ToolUse = {
type: "tool_use",
name: "use_mcp_tool",
params: {
server_name: "test_server",
tool_name: "test_tool",
arguments: '{"param": "value"}',
},
partial: false,
}

mockAskApproval.mockResolvedValue(true)

const mockToolResult = {
content: [
{ type: "text", text: "Generated image:" },
{
type: "image",
data: "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU",
mimeType: "image/png",
},
],
isError: false,
}

mockProviderRef.deref.mockReturnValue({
getMcpHub: () => ({
callTool: vi.fn().mockResolvedValue(mockToolResult),
}),
postMessageToWebview: vi.fn(),
})

await useMcpToolTool(
mockTask as Task,
block,
mockAskApproval,
mockHandleError,
mockPushToolResult,
mockRemoveClosingTag,
)

expect(mockTask.consecutiveMistakeCount).toBe(0)
expect(mockAskApproval).toHaveBeenCalled()
expect(mockTask.say).toHaveBeenCalledWith("mcp_server_request_started")
expect(mockTask.say).toHaveBeenCalledWith("mcp_server_response", "Generated image:", [
"",
])
expect(mockPushToolResult).toHaveBeenCalledWith("Tool result: Generated image: (with 1 images)")
})

it("should handle tool result with only images (no text)", async () => {
const block: ToolUse = {
type: "tool_use",
name: "use_mcp_tool",
params: {
server_name: "test_server",
tool_name: "test_tool",
arguments: '{"param": "value"}',
},
partial: false,
}

mockAskApproval.mockResolvedValue(true)

const mockToolResult = {
content: [
{
type: "image",
data: "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU",
mimeType: "image/png",
},
],
isError: false,
}

mockProviderRef.deref.mockReturnValue({
getMcpHub: () => ({
callTool: vi.fn().mockResolvedValue(mockToolResult),
}),
postMessageToWebview: vi.fn(),
})

await useMcpToolTool(
mockTask as Task,
block,
mockAskApproval,
mockHandleError,
mockPushToolResult,
mockRemoveClosingTag,
)

expect(mockTask.consecutiveMistakeCount).toBe(0)
expect(mockAskApproval).toHaveBeenCalled()
expect(mockTask.say).toHaveBeenCalledWith("mcp_server_request_started")
expect(mockTask.say).toHaveBeenCalledWith("mcp_server_response", "", [
"",
])
expect(mockPushToolResult).toHaveBeenCalledWith("Tool result: (with 1 images)")
})

it("should handle user rejection", async () => {
const block: ToolUse = {
type: "tool_use",
Expand Down
53 changes: 35 additions & 18 deletions src/core/tools/useMcpToolTool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,24 +89,39 @@ async function sendExecutionStatus(cline: Task, status: McpExecutionStatus): Pro
})
}

function processToolContent(toolResult: any): string {
function processToolContent(toolResult: any): { text: string; images: string[] } {
if (!toolResult?.content || toolResult.content.length === 0) {
return ""
return { text: "", images: [] }
}

return toolResult.content
.map((item: any) => {
if (item.type === "text") {
return item.text
const textParts: string[] = []
const images: string[] = []

toolResult.content.forEach((item: any) => {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Images are validated sequentially in the forEach loop. For responses with many images, this could be slow.

Consider batching validation or using Promise.all for parallel processing:

const validatedImages = await Promise.all(
  toolResult.content
    .filter((item: any) => item.type === "image")
    .map(async (item: any) => validateAndProcessImage(item))
);
images.push(...validatedImages.filter(Boolean));

This would improve performance for MCP tools that return multiple images.

if (item.type === "text") {
textParts.push(item.text)
} else if (item.type === "image") {
if (item.data && item.mimeType) {
const validImageTypes = ["image/png", "image/jpeg", "image/gif", "image/webp"]
if (validImageTypes.includes(item.mimeType)) {
const dataUrl = `data:${item.mimeType};base64,${item.data}`
images.push(dataUrl)
} else {
console.warn(`Unsupported image MIME type: ${item.mimeType}`)
}
} else {
console.warn("Invalid MCP ImageContent: missing data or mimeType")
}
if (item.type === "resource") {
const { blob: _, ...rest } = item.resource
return JSON.stringify(rest, null, 2)
}
return ""
})
.filter(Boolean)
.join("\n\n")
} else if (item.type === "resource") {
const { blob: _, ...rest } = item.resource
textParts.push(JSON.stringify(rest, null, 2))
}
})

return {
text: textParts.filter(Boolean).join("\n\n"),
images,
}
}

async function executeToolAndProcessResult(
Expand All @@ -130,11 +145,13 @@ async function executeToolAndProcessResult(
const toolResult = await cline.providerRef.deref()?.getMcpHub()?.callTool(serverName, toolName, parsedArguments)

let toolResultPretty = "(No response)"
let images: string[] = []

if (toolResult) {
const outputText = processToolContent(toolResult)
const { text: outputText, images: outputImages } = processToolContent(toolResult)
images = outputImages

if (outputText) {
if (outputText || images.length > 0) {
await sendExecutionStatus(cline, {
executionId,
status: "output",
Expand All @@ -160,8 +177,8 @@ async function executeToolAndProcessResult(
})
}

await cline.say("mcp_server_response", toolResultPretty)
pushToolResult(formatResponse.toolResult(toolResultPretty))
await cline.say("mcp_server_response", toolResultPretty, images)
pushToolResult(formatResponse.toolResult(toolResultPretty, images))
}

export async function useMcpToolTool(
Expand Down
42 changes: 42 additions & 0 deletions src/shared/__tests__/combineCommandSequences.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,48 @@ describe("combineCommandSequences", () => {
})
})

it("should preserve images from mcp_server_response messages", () => {
const messages: ClineMessage[] = [
{
type: "ask",
ask: "use_mcp_server",
text: JSON.stringify({
serverName: "test-server",
toolName: "test-tool",
arguments: { param: "value" },
}),
ts: 1625097600000,
},
{
type: "say",
say: "mcp_server_response",
text: "Generated 1 image",
images: [
"",
],
ts: 1625097601000,
},
]

const result = combineCommandSequences(messages)

expect(result).toHaveLength(1)
expect(result[0]).toEqual({
type: "ask",
ask: "use_mcp_server",
text: JSON.stringify({
serverName: "test-server",
toolName: "test-tool",
arguments: { param: "value" },
response: "Generated 1 image",
}),
images: [
"",
],
ts: 1625097600000,
})
})

it("should handle multiple MCP server requests", () => {
const messages: ClineMessage[] = [
{
Expand Down
20 changes: 17 additions & 3 deletions src/shared/combineCommandSequences.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,16 @@ export function combineCommandSequences(messages: ClineMessage[]): ClineMessage[
if (msg.type === "ask" && msg.ask === "use_mcp_server") {
// Look ahead for MCP responses
let responses: string[] = []
let allImages: string[] = []
let j = i + 1

while (j < messages.length) {
if (messages[j].say === "mcp_server_response") {
responses.push(messages[j].text || "")
// Collect images from MCP server responses
if (messages[j].images && Array.isArray(messages[j].images) && messages[j].images!.length > 0) {
allImages.push(...messages[j].images!)
}
processedIndices.add(j)
j++
} else if (messages[j].type === "ask" && messages[j].ask === "use_mcp_server") {
Expand All @@ -56,13 +61,22 @@ export function combineCommandSequences(messages: ClineMessage[]): ClineMessage[
// Parse the JSON from the message text
const jsonObj = safeJsonParse<any>(msg.text || "{}", {})

// Add the response to the JSON object
jsonObj.response = responses.join("\n")
// Only add non-empty responses
const nonEmptyResponses = responses.filter((response) => response.trim())
if (nonEmptyResponses.length > 0) {
jsonObj.response = nonEmptyResponses.join("\n")
}

// Stringify the updated JSON object
const combinedText = JSON.stringify(jsonObj)

combinedMessages.set(msg.ts, { ...msg, text: combinedText })
// Preserve images in the combined message
const combinedMessage = { ...msg, text: combinedText }
if (allImages.length > 0) {
combinedMessage.images = allImages
}

combinedMessages.set(msg.ts, combinedMessage)
} else {
// If there's no response, just keep the original message
combinedMessages.set(msg.ts, { ...msg })
Expand Down
1 change: 1 addition & 0 deletions webview-ui/src/components/chat/ChatRow.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1176,6 +1176,7 @@ export const ChatRowContent = ({
server={server}
useMcpServer={useMcpServer}
alwaysAllowMcp={alwaysAllowMcp}
images={message.images}
/>
)}
</div>
Expand Down
Loading