Skip to content

feat: add file deduplication to reduce token usage #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion packages/types/src/experiment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import type { Keys, Equals, AssertEqual } from "./type-fu.js"
* ExperimentId
*/

export const experimentIds = ["powerSteering", "multiFileApplyDiff"] as const
export const experimentIds = ["powerSteering", "multiFileApplyDiff", "contextDeduplication"] as const

export const experimentIdsSchema = z.enum(experimentIds)

Expand All @@ -19,6 +19,7 @@ export type ExperimentId = z.infer<typeof experimentIdsSchema>
export const experimentsSchema = z.object({
powerSteering: z.boolean().optional(),
multiFileApplyDiff: z.boolean().optional(),
contextDeduplication: z.boolean().optional(),
})

export type Experiments = z.infer<typeof experimentsSchema>
Expand Down
191 changes: 191 additions & 0 deletions src/core/task/Task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,187 @@ export class Task extends EventEmitter<ClineEvents> {
}
}

// Context deduplication methods
private deduplicateReadFileHistory(): { removedCount: number; tokensSaved: number } {
const seenFiles = new Map<
string,
{
messageIndex: number
contentIndex: number
blockIndex?: number
ranges?: Array<{ start: number; end: number }>
isFullRead: boolean
}
>()

let removedCount = 0
let tokensSaved = 0

// Scan forwards to keep first occurrence and remove later duplicates
for (let i = 0; i < this.apiConversationHistory.length; i++) {
const message = this.apiConversationHistory[i]
if (message.role !== "user" || typeof message.content === "string") continue

for (let j = 0; j < message.content.length; j++) {
const content = message.content[j]

// Handle tool_result blocks
if (content.type === "tool_result" && content.content) {
const toolContent = Array.isArray(content.content) ? content.content : [content.content]

for (let k = 0; k < toolContent.length; k++) {
const block = toolContent[k]
if (typeof block === "object" && block.type === "text") {
const fileReads = this.parseFileReads(block.text)

for (const fileRead of fileReads) {
const existing = seenFiles.get(fileRead.path)

if (!existing) {
// First occurrence - keep it
seenFiles.set(fileRead.path, {
messageIndex: i,
contentIndex: j,
blockIndex: k,
ranges: fileRead.ranges,
isFullRead: fileRead.isFullRead,
})
} else if (this.shouldRemoveContent(fileRead, existing)) {
// Remove this duplicate occurrence
const oldContent = typeof block === "object" && "text" in block ? block.text : ""
const estimatedTokens = Math.ceil(oldContent.length / 4) // Rough token estimate
tokensSaved += estimatedTokens

// Replace with deduplication notice
if (Array.isArray(content.content)) {
content.content[k] = {
type: "text",
text: `[File content removed - already read ${fileRead.path}]`,
}
}
removedCount++
}
}
}
}
}
// Handle text blocks with file_content tags (from @mentions)
else if (content.type === "text") {
const fileContentMatches = Array.from(
content.text.matchAll(/<file_content\s+path="([^"]+)"[^>]*>([\s\S]*?)<\/file_content>/g),
)

for (const match of fileContentMatches) {
const [fullMatch, filePath, fileContent] = match
const existing = seenFiles.get(filePath)

if (!existing) {
seenFiles.set(filePath, {
messageIndex: i,
contentIndex: j,
isFullRead: true,
})
} else {
// Remove duplicate file_content
const estimatedTokens = Math.ceil(fileContent.length / 4)
tokensSaved += estimatedTokens

content.text = content.text.replace(
fullMatch,
`<file_content path="${filePath}">[Content removed - already included]</file_content>`,
)
removedCount++
}
}
}
}
}

return { removedCount, tokensSaved }
}

private parseFileReads(text: string): Array<{
path: string
ranges?: Array<{ start: number; end: number }>
isFullRead: boolean
}> {
const results: Array<{
path: string
ranges?: Array<{ start: number; end: number }>
isFullRead: boolean
}> = []

// Match read_file results in the format from readFileTool
// Pattern for: Result:<files><file><path>filepath</path>
const filePathPattern = /<file><path>([^<]+)<\/path>/g
let match

while ((match = filePathPattern.exec(text)) !== null) {
const path = match[1]
const fileResult = {
path,
ranges: [] as Array<{ start: number; end: number }>,
isFullRead: true,
}

// Check for line ranges in the same file block
const fileBlockMatch = text.match(
new RegExp(`<file><path>${path.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}<\/path>[\\s\\S]*?<\/file>`, "s"),
)
if (fileBlockMatch) {
const fileBlock = fileBlockMatch[0]
// Look for content with line ranges
const rangeMatches = fileBlock.matchAll(/<content\s+lines="(\d+)-(\d+)">/g)

for (const rangeMatch of rangeMatches) {
const [, start, end] = rangeMatch
fileResult.ranges?.push({
start: parseInt(start, 10),
end: parseInt(end, 10),
})
fileResult.isFullRead = false
}
}

results.push(fileResult)
}

return results
}

private shouldRemoveContent(
current: { ranges?: Array<{ start: number; end: number }>; isFullRead: boolean },
existing: { ranges?: Array<{ start: number; end: number }>; isFullRead: boolean },
): boolean {
// If existing is full read, remove all later content
if (existing.isFullRead) return true

// If current is full read but existing is partial, keep current (don't remove)
if (current.isFullRead && !existing.isFullRead) return false

// Check for range overlap
if (existing.ranges && current.ranges && existing.ranges.length > 0 && current.ranges.length > 0) {
return this.hasOverlap(existing.ranges, current.ranges)
}

// Default to removing if we can't determine overlap
return true
}

private hasOverlap(
rangesA: Array<{ start: number; end: number }>,
rangesB: Array<{ start: number; end: number }>,
): boolean {
for (const a of rangesA) {
for (const b of rangesB) {
if (a.start <= b.end && b.start <= a.end) {
return true
}
}
}
return false
}

// Cline Messages

private async getSavedClineMessages(): Promise<ClineMessage[]> {
Expand Down Expand Up @@ -1724,6 +1905,16 @@ export class Task extends EventEmitter<ClineEvents> {
state?.listApiConfigMeta.find((profile) => profile.name === state?.currentApiConfigName)?.id ??
"default"

// Apply context deduplication if enabled
if (state?.experiments?.contextDeduplication) {
const { removedCount, tokensSaved } = this.deduplicateReadFileHistory()
if (removedCount > 0) {
console.log(
`Context deduplication: removed ${removedCount} duplicate file reads, saved ~${tokensSaved} tokens`,
)
}
}

const truncateResult = await truncateConversationIfNeeded({
messages: this.apiConversationHistory,
totalTokens: contextTokens,
Expand Down
Loading
Loading