
Commit ca385ed

feat: implement prompt caching for Anthropic
This is a backport of #15731 for Theia 1.61.x. Mark system prompts, tools, and messages for caching in the Anthropic LanguageModel. For typical agent workflows such as "@coder" this reduces token costs by a large factor and eases pressure on the rate limit. Caching is currently enabled by default for all Anthropic models. Also updates the token usage tracking to be aware of cached tokens and updates the Anthropic SDK.
1 parent d51a501 commit ca385ed

11 files changed: +204 -34 lines
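
A rough sketch of the request shape this change produces, for orientation only: the system prompt, the last tool definition, and the last conversation message each carry a cache_control: { type: 'ephemeral' } marker, so the Anthropic API caches the prompt prefix up to that point and reuses it on the next request. The prompt text, tool schema, and model name below are invented example values, not code from this commit.

// Sketch only (TypeScript, @anthropic-ai/sdk ^0.52.0) - illustrates the markers added by this commit.
import { Anthropic } from '@anthropic-ai/sdk';

async function cachedRequestExample(): Promise<void> {
    const client = new Anthropic(); // reads ANTHROPIC_API_KEY from the environment
    const response = await client.messages.create({
        model: 'claude-3-5-sonnet-latest',
        max_tokens: 4096,
        // System prompt marked for caching (transformToAnthropicParams in the diffs below)
        system: [{ type: 'text', text: 'You are the @coder agent ...', cache_control: { type: 'ephemeral' } }],
        // The last tool in the list is marked for caching (createTools in the diffs below)
        tools: [{
            name: 'readFile',
            description: 'Read a file from the workspace',
            input_schema: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] },
            cache_control: { type: 'ephemeral' }
        }],
        // The last message is marked for caching so the conversation prefix is cached incrementally
        // (addCacheControlToLastMessage in the diffs below)
        messages: [{
            role: 'user',
            content: [{ type: 'text', text: 'Please refactor foo.ts', cache_control: { type: 'ephemeral' } }]
        }]
    });
    // Cache activity is reported in the usage block next to the regular token counts.
    console.log(response.usage.cache_creation_input_tokens, response.usage.cache_read_input_tokens);
}

cachedRequestExample().catch(console.error);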

package-lock.json

Lines changed: 11 additions & 16 deletions
Some generated files are not rendered by default.

packages/ai-anthropic/package.json

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
   "version": "1.61.0",
   "description": "Theia - Anthropic Integration",
   "dependencies": {
-    "@anthropic-ai/sdk": "^0.39.0",
+    "@anthropic-ai/sdk": "^0.52.0",
     "@theia/ai-core": "1.61.0",
     "@theia/core": "1.61.0"
   },

packages/ai-anthropic/src/browser/anthropic-frontend-application-contribution.ts

Lines changed: 2 additions & 1 deletion
@@ -79,7 +79,8 @@ export class AnthropicFrontendApplicationContribution implements FrontendApplica
             id: id,
             model: modelId,
             apiKey: true,
-            enableStreaming: true
+            enableStreaming: true,
+            useCaching: true
         };

         if (maxTokens !== undefined) {

packages/ai-anthropic/src/common/anthropic-language-models-manager.ts

Lines changed: 4 additions & 0 deletions
@@ -32,6 +32,10 @@ export interface AnthropicModelDescription {
     * Indicate whether the streaming API shall be used.
     */
    enableStreaming: boolean;
+    /**
+     * Indicate whether the model supports prompt caching.
+     */
+    useCaching: boolean;
    /**
     * Maximum number of tokens to generate. Default is 4096.
     */

packages/ai-anthropic/src/node/anthropic-language-model.ts

Lines changed: 64 additions & 9 deletions
@@ -28,7 +28,7 @@ import {
 } from '@theia/ai-core';
 import { CancellationToken, isArray } from '@theia/core';
 import { Anthropic } from '@anthropic-ai/sdk';
-import { Message, MessageParam } from '@anthropic-ai/sdk/resources';
+import type { Message, MessageParam } from '@anthropic-ai/sdk/resources';

 export const DEFAULT_MAX_TOKENS = 4096;

@@ -41,7 +41,7 @@ interface ToolCallback {

 const createMessageContent = (message: LanguageModelMessage): MessageParam['content'] => {
     if (LanguageModelMessage.isTextMessage(message)) {
-        return message.text;
+        return [{ type: 'text', text: message.text }];
     } else if (LanguageModelMessage.isThinkingMessage(message)) {
         return [{ signature: message.signature, thinking: message.thinking, type: 'thinking' }];
     } else if (LanguageModelMessage.isToolUseMessage(message)) {
@@ -52,17 +52,27 @@ const createMessageContent = (message: LanguageModelMessage): MessageParam['cont
     throw new Error(`Unknown message type:'${JSON.stringify(message)}'`);
 };

+type NonThinkingParam = Exclude<Anthropic.Messages.ContentBlockParam, Anthropic.Messages.ThinkingBlockParam | Anthropic.Messages.RedactedThinkingBlockParam>;
+function isNonThinkingParam(
+    content: Anthropic.Messages.ContentBlockParam
+): content is NonThinkingParam {
+    return content.type !== 'thinking' && content.type !== 'redacted_thinking';
+}
+
 /**
  * Transforms Theia language model messages to Anthropic API format
  * @param messages Array of LanguageModelRequestMessage to transform
  * @returns Object containing transformed messages and optional system message
  */
 function transformToAnthropicParams(
-    messages: readonly LanguageModelMessage[]
-): { messages: MessageParam[]; systemMessage?: string } {
+    messages: readonly LanguageModelMessage[],
+    addCacheControl: boolean = true
+): { messages: MessageParam[]; systemMessage?: Anthropic.Messages.TextBlockParam[] } {
     // Extract the system message (if any), as it is a separate parameter in the Anthropic API.
     const systemMessageObj = messages.find(message => message.actor === 'system');
-    const systemMessage = systemMessageObj && LanguageModelMessage.isTextMessage(systemMessageObj) && systemMessageObj.text || undefined;
+    const systemMessageText = systemMessageObj && LanguageModelMessage.isTextMessage(systemMessageObj) && systemMessageObj.text || undefined;
+    const systemMessage: Anthropic.Messages.TextBlockParam[] | undefined =
+        systemMessageText ? [{ type: 'text', text: systemMessageText, cache_control: addCacheControl ? { type: 'ephemeral' } : undefined }] : undefined;

     const convertedMessages = messages
         .filter(message => message.actor !== 'system')
@@ -77,6 +87,35 @@ function transformToAnthropicParams(
     };
 }

+/**
+ * If possible adds a cache control to the last message in the conversation.
+ * This is used to enable incremental caching of the conversation.
+ * @param messages The messages to process
+ * @returns A new messages array with the last message adapted to include cache control. If no cache control can be added, the original messages are returned.
+ * In any case, the original messages are not modified
+ */
+function addCacheControlToLastMessage(messages: Anthropic.Messages.MessageParam[]): Anthropic.Messages.MessageParam[] {
+    const clonedMessages = [...messages];
+    const latestMessage = clonedMessages.pop();
+    if (latestMessage) {
+        let content: NonThinkingParam | undefined = undefined;
+        if (typeof latestMessage.content === 'string') {
+            content = { type: 'text', text: latestMessage.content };
+        } else if (Array.isArray(latestMessage.content)) {
+            // we can't set cache control on thinking messages, so we only set it on the last non-thinking block
+            const filteredContent = latestMessage.content.filter(isNonThinkingParam);
+            if (filteredContent.length) {
+                content = filteredContent[filteredContent.length - 1];
+            }
+        }
+        if (content) {
+            const cachedContent: NonThinkingParam = { ...content, cache_control: { type: 'ephemeral' } };
+            return [...clonedMessages, { ...latestMessage, content: [cachedContent] }];
+        }
+    }
+    return messages;
+}
+
 export const AnthropicModelIdentifier = Symbol('AnthropicModelIdentifier');

 /**
@@ -102,6 +141,7 @@ export class AnthropicModel implements LanguageModel {
         public readonly id: string,
         public model: string,
         public enableStreaming: boolean,
+        public useCaching: boolean,
         public apiKey: () => string | undefined,
         public maxTokens: number = DEFAULT_MAX_TOKENS,
         protected readonly tokenUsageService?: TokenUsageService
@@ -153,11 +193,18 @@
         toolMessages?: readonly Anthropic.Messages.MessageParam[]
     ): Promise<LanguageModelStreamResponse> {
         const settings = this.getSettings(request);
-        const { messages, systemMessage } = transformToAnthropicParams(request.messages);
+        const { messages, systemMessage } = transformToAnthropicParams(request.messages, this.useCaching);
+
+        let anthropicMessages = [...messages, ...(toolMessages ?? [])];
+
+        if (this.useCaching && anthropicMessages.length) {
+            anthropicMessages = addCacheControlToLastMessage(anthropicMessages);
+        }
+
         const tools = this.createTools(request);
         const params: Anthropic.MessageCreateParams = {
             max_tokens: this.maxTokens,
-            messages: [...messages, ...(toolMessages ?? [])],
+            messages: anthropicMessages,
             tools,
             tool_choice: tools ? { type: 'auto' } : undefined,
             model: this.model,
@@ -231,6 +278,8 @@
                     const tokenUsageParams: TokenUsageParams = {
                         inputTokens: currentMessage.usage.input_tokens,
                         outputTokens: currentMessage.usage.output_tokens,
+                        cachedInputTokens: currentMessage.usage.cache_creation_input_tokens || undefined,
+                        readCachedInputTokens: currentMessage.usage.cache_read_input_tokens || undefined,
                         requestId: request.requestId
                     };
                     await that.tokenUsageService.recordTokenUsage(that.id, tokenUsageParams);
@@ -285,15 +334,21 @@
         return { stream: asyncIterator };
     }

-    private createTools(request: LanguageModelRequest): Anthropic.Messages.Tool[] | undefined {
+    protected createTools(request: LanguageModelRequest): Anthropic.Messages.Tool[] | undefined {
         if (request.tools?.length === 0) {
             return undefined;
         }
-        return request.tools?.map(tool => ({
+        const tools = request.tools?.map(tool => ({
             name: tool.name,
             description: tool.description,
             input_schema: tool.parameters
         } as Anthropic.Messages.Tool));
+        if (this.useCaching) {
+            if (tools?.length) {
+                tools[tools.length - 1].cache_control = { type: 'ephemeral' };
+            }
+        }
+        return tools;
     }

     protected async handleNonStreamingRequest(
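
For the common case of a last message with plain string content, a hedged illustration of what the addCacheControlToLastMessage helper above returns (invented values; the helper is module-private in this commit and is shown here as if it were callable directly):

// Illustration only - expected input/output shape of the helper defined in the diff above.
import { Anthropic } from '@anthropic-ai/sdk';

const before: Anthropic.Messages.MessageParam[] = [
    { role: 'assistant', content: 'Here is the plan ...' },
    { role: 'user', content: 'Apply step 1' }
];
const after = addCacheControlToLastMessage(before);
console.log(JSON.stringify(after, undefined, 2));
// 'before' is not modified; the last element of 'after' becomes:
// {
//     role: 'user',
//     content: [{ type: 'text', text: 'Apply step 1', cache_control: { type: 'ephemeral' } }]
// }
// Thinking blocks never receive the marker; if no suitable block exists, the input is returned unchanged.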

packages/ai-anthropic/src/node/anthropic-language-models-manager-impl.ts

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ export class AnthropicLanguageModelsManagerImpl implements AnthropicLanguageMode
                 modelDescription.id,
                 modelDescription.model,
                 modelDescription.enableStreaming,
+                modelDescription.useCaching,
                 apiKeyProvider,
                 modelDescription.maxTokens,
                 this.tokenUsageService

packages/ai-core/src/browser/token-usage-frontend-service-impl.ts

Lines changed: 27 additions & 2 deletions
@@ -76,6 +76,8 @@ export class TokenUsageFrontendServiceImpl implements TokenUsageFrontendService
         const modelMap = new Map<string, {
             inputTokens: number;
             outputTokens: number;
+            cachedInputTokens: number;
+            readCachedInputTokens: number;
             lastUsed?: Date;
         }>();

@@ -87,6 +89,16 @@ export class TokenUsageFrontendServiceImpl implements TokenUsageFrontendService
                 existing.inputTokens += usage.inputTokens;
                 existing.outputTokens += usage.outputTokens;

+                // Add cached tokens if they exist
+                if (usage.cachedInputTokens !== undefined) {
+                    existing.cachedInputTokens += usage.cachedInputTokens;
+                }
+
+                // Add read cached tokens if they exist
+                if (usage.readCachedInputTokens !== undefined) {
+                    existing.readCachedInputTokens += usage.readCachedInputTokens;
+                }
+
                 // Update last used if this usage is more recent
                 if (!existing.lastUsed || (usage.timestamp && usage.timestamp > existing.lastUsed)) {
                     existing.lastUsed = usage.timestamp;
@@ -95,6 +107,8 @@ export class TokenUsageFrontendServiceImpl implements TokenUsageFrontendService
                 modelMap.set(usage.model, {
                     inputTokens: usage.inputTokens,
                     outputTokens: usage.outputTokens,
+                    cachedInputTokens: usage.cachedInputTokens || 0,
+                    readCachedInputTokens: usage.readCachedInputTokens || 0,
                     lastUsed: usage.timestamp
                 });
             }
@@ -104,12 +118,23 @@ export class TokenUsageFrontendServiceImpl implements TokenUsageFrontendService
         const result: ModelTokenUsageData[] = [];

         for (const [modelId, data] of modelMap.entries()) {
-            result.push({
+            const modelData: ModelTokenUsageData = {
                 modelId,
                 inputTokens: data.inputTokens,
                 outputTokens: data.outputTokens,
                 lastUsed: data.lastUsed
-            });
+            };
+
+            // Only include cache-related fields if they have non-zero values
+            if (data.cachedInputTokens > 0) {
+                modelData.cachedInputTokens = data.cachedInputTokens;
+            }
+
+            if (data.readCachedInputTokens > 0) {
+                modelData.readCachedInputTokens = data.readCachedInputTokens;
+            }
+
+            result.push(modelData);
         }

         return result;

packages/ai-core/src/browser/token-usage-frontend-service.ts

Lines changed: 4 additions & 0 deletions
@@ -26,6 +26,10 @@ export interface ModelTokenUsageData {
     inputTokens: number;
     /** Number of output tokens used */
     outputTokens: number;
+    /** Number of input tokens written to cache */
+    cachedInputTokens?: number;
+    /** Number of input tokens read from cache */
+    readCachedInputTokens?: number;
     /** Date when the model was last used */
     lastUsed?: Date;
 }

packages/ai-core/src/common/token-usage-service.ts

Lines changed: 8 additions & 0 deletions
@@ -23,6 +23,10 @@ export interface TokenUsage {
     inputTokens: number;
     /** The output token count */
     outputTokens: number;
+    /** Input tokens written to cache */
+    cachedInputTokens?: number;
+    /** Input tokens read from cache */
+    readCachedInputTokens?: number;
     /** The model identifier */
     model: string;
     /** The timestamp of when the tokens were used */
@@ -36,6 +40,10 @@ export interface TokenUsageParams {
     inputTokens: number;
     /** The output token count */
     outputTokens: number;
+    /** Input tokens placed in cache */
+    cachedInputTokens?: number;
+    /** Input tokens read from cache */
+    readCachedInputTokens?: number;
     /** Request identifier */
     requestId: string;
 }

packages/ai-core/src/node/token-usage-service-impl.ts

Lines changed: 19 additions & 1 deletion
@@ -41,6 +41,8 @@ export class TokenUsageServiceImpl implements TokenUsageService {
     async recordTokenUsage(model: string, params: TokenUsageParams): Promise<void> {
         const usage: TokenUsage = {
             inputTokens: params.inputTokens,
+            cachedInputTokens: params.cachedInputTokens,
+            readCachedInputTokens: params.readCachedInputTokens,
             outputTokens: params.outputTokens,
             model,
             timestamp: new Date(),
@@ -50,7 +52,23 @@ export class TokenUsageServiceImpl implements TokenUsageService {
         this.tokenUsages.push(usage);
         this.client?.notifyTokenUsage(usage);

-        console.log(`Input Tokens: ${params.inputTokens}; Output Tokens: ${params.outputTokens}; Model: ${model}${params.requestId ? `; RequestId: ${params.requestId}` : ''}`);
+        let logMessage = `Input Tokens: ${params.inputTokens};`;
+
+        if (params.cachedInputTokens) {
+            logMessage += ` Input Tokens written to cache: ${params.cachedInputTokens};`;
+        }
+
+        if (params.readCachedInputTokens) {
+            logMessage += ` Input Tokens read from cache: ${params.readCachedInputTokens};`;
+        }
+
+        logMessage += ` Output Tokens: ${params.outputTokens}; Model: ${model};`;
+
+        if (params.requestId) {
+            logMessage += `; RequestId: ${params.requestId}`;
+        }
+
+        console.debug(logMessage);
         // For now we just store in memory
         // In the future, this could be persisted to disk, a database, or sent to a service
         return Promise.resolve();
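
To make the new cache fields concrete, a hedged usage example (invented numbers and model id; TokenUsageService is assumed to be exported from @theia/ai-core as elsewhere in this commit), together with the debug line the code above would emit:

// Illustration only - field names come from TokenUsageParams; all values are examples.
import { TokenUsageService } from '@theia/ai-core';

async function recordUsageExample(tokenUsageService: TokenUsageService): Promise<void> {
    await tokenUsageService.recordTokenUsage('claude-3-5-sonnet-latest', {
        inputTokens: 1200,
        outputTokens: 350,
        cachedInputTokens: 4096,       // written to the prompt cache on this request
        readCachedInputTokens: 20480,  // served from the prompt cache
        requestId: 'example-request-1'
    });
    // Resulting console.debug output, roughly:
    // Input Tokens: 1200; Input Tokens written to cache: 4096; Input Tokens read from cache: 20480; Output Tokens: 350; Model: claude-3-5-sonnet-latest;; RequestId: example-request-1
}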
