diff --git a/packages/types/src/providers/groq.ts b/packages/types/src/providers/groq.ts
index a3fc284bb50a..99bf4be3d019 100644
--- a/packages/types/src/providers/groq.ts
+++ b/packages/types/src/providers/groq.ts
@@ -17,7 +17,7 @@ export const groqDefaultModelId: GroqModelId = "llama-3.3-70b-versatile" // Defa
 export const groqModels = {
 	// Models based on API response: https://api.groq.com/openai/v1/models
 	"llama-3.1-8b-instant": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -26,7 +26,7 @@ export const groqModels = {
 		description: "Meta Llama 3.1 8B Instant model, 128K context.",
 	},
 	"llama-3.3-70b-versatile": {
-		maxTokens: 32768,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -53,7 +53,7 @@ export const groqModels = {
 		description: "Meta Llama 4 Maverick 17B Instruct model, 128K context.",
 	},
 	"mistral-saba-24b": {
-		maxTokens: 32768,
+		maxTokens: 8192,
 		contextWindow: 32768,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -62,7 +62,7 @@ export const groqModels = {
 		description: "Mistral Saba 24B model, 32K context.",
 	},
 	"qwen-qwq-32b": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -71,7 +71,7 @@ export const groqModels = {
 		description: "Alibaba Qwen QwQ 32B model, 128K context.",
 	},
 	"qwen/qwen3-32b": {
-		maxTokens: 40960,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -80,7 +80,7 @@ export const groqModels = {
 		description: "Alibaba Qwen 3 32B model, 128K context.",
 	},
 	"deepseek-r1-distill-llama-70b": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -89,7 +89,7 @@ export const groqModels = {
 		description: "DeepSeek R1 Distill Llama 70B model, 128K context.",
 	},
 	"moonshotai/kimi-k2-instruct": {
-		maxTokens: 131072,
+		maxTokens: 8192,
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
diff --git a/src/api/transform/model-params.ts b/src/api/transform/model-params.ts
index 8b6069666c0b..6ed975ac5aac 100644
--- a/src/api/transform/model-params.ts
+++ b/src/api/transform/model-params.ts
@@ -5,6 +5,7 @@ import {
 	DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS,
 	shouldUseReasoningBudget,
 	shouldUseReasoningEffort,
+	getModelMaxOutputTokens,
 } from "../../shared/api"
 
 import {
@@ -76,20 +77,25 @@ export function getModelParams({
 		reasoningEffort: customReasoningEffort,
 	} = settings
 
-	let maxTokens = model.maxTokens ?? undefined
+	// Use the centralized logic for computing maxTokens
+	const maxTokens = getModelMaxOutputTokens({
+		modelId,
+		model,
+		settings,
+		format,
+	})
+
 	let temperature = customTemperature ?? defaultTemperature
 	let reasoningBudget: ModelParams["reasoningBudget"] = undefined
 	let reasoningEffort: ModelParams["reasoningEffort"] = undefined
 
 	if (shouldUseReasoningBudget({ model, settings })) {
-		// If `customMaxTokens` is not specified use the default.
-		maxTokens = customMaxTokens ?? DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS
-
 		// If `customMaxThinkingTokens` is not specified use the default.
 		reasoningBudget = customMaxThinkingTokens ?? DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS
 
 		// Reasoning cannot exceed 80% of the `maxTokens` value.
-		if (reasoningBudget > Math.floor(maxTokens * 0.8)) {
+		// maxTokens should always be defined for reasoning budget models, but add a guard just in case
+		if (maxTokens && reasoningBudget > Math.floor(maxTokens * 0.8)) {
 			reasoningBudget = Math.floor(maxTokens * 0.8)
 		}
 
@@ -106,24 +112,6 @@ export function getModelParams({
 		reasoningEffort = customReasoningEffort ?? model.reasoningEffort
 	}
 
-	// TODO: We should consolidate this logic to compute `maxTokens` with
-	// `getModelMaxOutputTokens` in order to maintain a single source of truth.
-
-	const isAnthropic = format === "anthropic" || (format === "openrouter" && modelId.startsWith("anthropic/"))
-
-	// For "Hybrid" reasoning models, we should discard the model's actual
-	// `maxTokens` value if we're not using reasoning. We do this for Anthropic
-	// models only for now. Should we do this for Gemini too?
-	if (model.supportsReasoningBudget && !reasoningBudget && isAnthropic) {
-		maxTokens = ANTHROPIC_DEFAULT_MAX_TOKENS
-	}
-
-	// For Anthropic models we should always make sure a `maxTokens` value is
-	// set.
-	if (!maxTokens && isAnthropic) {
-		maxTokens = ANTHROPIC_DEFAULT_MAX_TOKENS
-	}
-
 	const params: BaseModelParams = { maxTokens, temperature, reasoningEffort, reasoningBudget }
 
 	if (format === "anthropic") {
diff --git a/src/shared/__tests__/api.spec.ts b/src/shared/__tests__/api.spec.ts
index a13e823a90f7..08d4bdf3bbc8 100644
--- a/src/shared/__tests__/api.spec.ts
+++ b/src/shared/__tests__/api.spec.ts
@@ -76,7 +76,7 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(32000)
 	})
 
-	test("should return 20% of context window when maxTokens is undefined", () => {
+	test("should return default of 8192 when maxTokens is undefined", () => {
 		const modelWithoutMaxTokens: ModelInfo = {
 			contextWindow: 100000,
 			supportsPromptCache: true,
@@ -88,7 +88,7 @@ describe("getModelMaxOutputTokens", () => {
 			settings: {},
 		})
 
-		expect(result).toBe(20000) // 20% of 100000
+		expect(result).toBe(8192)
 	})
 
 	test("should return ANTHROPIC_DEFAULT_MAX_TOKENS for Anthropic models that support reasoning budget but aren't using it", () => {
diff --git a/src/shared/api.ts b/src/shared/api.ts
index a1603fc77618..8cbfc721336b 100644
--- a/src/shared/api.ts
+++ b/src/shared/api.ts
@@ -58,14 +58,15 @@ export const getModelMaxOutputTokens = ({
 	modelId,
 	model,
 	settings,
+	format,
 }: {
 	modelId: string
 	model: ModelInfo
 	settings?: ProviderSettings
+	format?: "anthropic" | "openai" | "gemini" | "openrouter"
 }): number | undefined => {
 	// Check for Claude Code specific max output tokens setting
 	if (settings?.apiProvider === "claude-code") {
-		// Return the configured value or default to CLAUDE_CODE_DEFAULT_MAX_OUTPUT_TOKENS
 		return settings.claudeCodeMaxOutputTokens || CLAUDE_CODE_DEFAULT_MAX_OUTPUT_TOKENS
 	}
 
@@ -73,21 +74,33 @@ export const getModelMaxOutputTokens = ({
 		return settings?.modelMaxTokens || DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS
 	}
 
-	const isAnthropicModel = modelId.includes("claude")
+	const isAnthropicContext =
+		modelId.includes("claude") ||
+		format === "anthropic" ||
+		(format === "openrouter" && modelId.startsWith("anthropic/"))
 
-	// For "Hybrid" reasoning models, we should discard the model's actual
-	// `maxTokens` value if we're not using reasoning. We do this for Anthropic
-	// models only for now. Should we do this for Gemini too?
-	if (model.supportsReasoningBudget && isAnthropicModel) {
+	// For "Hybrid" reasoning models, discard the model's actual maxTokens for Anthropic contexts
+	if (model.supportsReasoningBudget && isAnthropicContext) {
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}
 
-	// If maxTokens is 0 or undefined or the full context window, fall back to 20% of context window
+	// For Anthropic contexts, always ensure a maxTokens value is set
+	if (isAnthropicContext && (!model.maxTokens || model.maxTokens === 0)) {
+		return ANTHROPIC_DEFAULT_MAX_TOKENS
+	}
+
+	// If model has explicit maxTokens and it's not the full context window, use it
 	if (model.maxTokens && model.maxTokens !== model.contextWindow) {
 		return model.maxTokens
-	} else {
-		return Math.ceil(model.contextWindow * 0.2)
 	}
+
+	// For non-Anthropic formats without explicit maxTokens, return undefined
+	if (format) {
+		return undefined
+	}
+
+	// Default fallback
+	return ANTHROPIC_DEFAULT_MAX_TOKENS
 }
 
 // GetModelsOptions
diff --git a/webview-ui/src/__tests__/ContextWindowProgressLogic.spec.ts b/webview-ui/src/__tests__/ContextWindowProgressLogic.spec.ts
index 39c6cd40d5ca..0f9314218c50 100644
--- a/webview-ui/src/__tests__/ContextWindowProgressLogic.spec.ts
+++ b/webview-ui/src/__tests__/ContextWindowProgressLogic.spec.ts
@@ -7,41 +7,41 @@ export {} // This makes the file a proper TypeScript module
 
 describe("ContextWindowProgress Logic", () => {
 	// Using the shared utility function from model-utils.ts instead of reimplementing it
-	test("calculates correct token distribution with default 20% reservation", () => {
-		const contextWindow = 4000
+	test("calculates correct token distribution with default 8192 reservation", () => {
+		const contextWindow = 10000
 		const contextTokens = 1000
 
 		const result = calculateTokenDistribution(contextWindow, contextTokens)
 
 		// Expected calculations:
-		// reservedForOutput = 0.2 * 4000 = 800
-		// availableSize = 4000 - 1000 - 800 = 2200
-		// total = 1000 + 800 + 2200 = 4000
-		expect(result.reservedForOutput).toBe(800)
-		expect(result.availableSize).toBe(2200)
+		// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
+		// availableSize = 10000 - 1000 - 8192 = 808
+		// total = 1000 + 8192 + 808 = 10000
+		expect(result.reservedForOutput).toBe(8192)
+		expect(result.availableSize).toBe(808)
 
 		// Check percentages
-		expect(result.currentPercent).toBeCloseTo(25) // 1000/4000 * 100 = 25%
-		expect(result.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
-		expect(result.availablePercent).toBeCloseTo(55) // 2200/4000 * 100 = 55%
+		expect(result.currentPercent).toBeCloseTo(10) // 1000/10000 * 100 = 10%
+		expect(result.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
+		expect(result.availablePercent).toBeCloseTo(8.08) // 808/10000 * 100 = 8.08%
 
 		// Verify percentages sum to 100%
 		expect(result.currentPercent + result.reservedPercent + result.availablePercent).toBeCloseTo(100)
 	})
 
 	test("uses provided maxTokens when available instead of default calculation", () => {
-		const contextWindow = 4000
+		const contextWindow = 10000
 		const contextTokens = 1000
 
-		// First calculate with default 20% reservation (no maxTokens provided)
+		// First calculate with default 8192 reservation (no maxTokens provided)
 		const defaultResult = calculateTokenDistribution(contextWindow, contextTokens)
 
 		// Then calculate with custom maxTokens value
-		const customMaxTokens = 1500 // Custom maxTokens instead of default 20%
+		const customMaxTokens = 1500 // Custom maxTokens instead of default 8192
 		const customResult =
 			calculateTokenDistribution(contextWindow, contextTokens, customMaxTokens)
 
-		// VERIFY MAXTOKEN PROP EFFECT: Custom maxTokens should be used directly instead of 20% calculation
-		const defaultReserved = Math.ceil(contextWindow * 0.2) // 800 tokens (20% of 4000)
+		// VERIFY MAXTOKEN PROP EFFECT: Custom maxTokens should be used directly instead of 8192 calculation
+		const defaultReserved = 8192 // ANTHROPIC_DEFAULT_MAX_TOKENS
 		expect(defaultResult.reservedForOutput).toBe(defaultReserved)
 		expect(customResult.reservedForOutput).toBe(customMaxTokens) // Should use exact provided value
@@ -51,13 +51,13 @@ describe("ContextWindowProgress Logic", () => {
 		expect(defaultTooltip).not.toBe(customTooltip)
 
 		// Verify the effect on available space
-		expect(customResult.availableSize).toBe(4000 - 1000 - 1500) // 1500 tokens available
-		expect(defaultResult.availableSize).toBe(4000 - 1000 - 800) // 2200 tokens available
+		expect(customResult.availableSize).toBe(10000 - 1000 - 1500) // 7500 tokens available
+		expect(defaultResult.availableSize).toBe(10000 - 1000 - 8192) // 808 tokens available
 
 		// Verify the effect on percentages
-		// With custom maxTokens (1500), the reserved percentage should be higher
-		expect(defaultResult.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
-		expect(customResult.reservedPercent).toBeCloseTo(37.5) // 1500/4000 * 100 = 37.5%
+		// With custom maxTokens (1500), the reserved percentage should be lower than default
+		expect(defaultResult.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
+		expect(customResult.reservedPercent).toBeCloseTo(15) // 1500/10000 * 100 = 15%
 
 		// Verify percentages still sum to 100%
 		expect(customResult.currentPercent + customResult.reservedPercent + customResult.availablePercent).toBeCloseTo(
 			100,
 		)
 	})
 
@@ -66,19 +66,19 @@ describe("ContextWindowProgress Logic", () => {
 	test("handles negative input values", () => {
-		const contextWindow = 4000
+		const contextWindow = 10000
 		const contextTokens = -500 // Negative tokens should be handled gracefully
 
 		const result = calculateTokenDistribution(contextWindow, contextTokens)
 
 		// Expected calculations:
 		// safeContextTokens = Math.max(0, -500) = 0
-		// reservedForOutput = 0.2 * 4000 = 800
-		// availableSize = 4000 - 0 - 800 = 3200
-		// total = 0 + 800 + 3200 = 4000
-		expect(result.currentPercent).toBeCloseTo(0) // 0/4000 * 100 = 0%
-		expect(result.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
-		expect(result.availablePercent).toBeCloseTo(80) // 3200/4000 * 100 = 80%
+		// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
+		// availableSize = 10000 - 0 - 8192 = 1808
+		// total = 0 + 8192 + 1808 = 10000
+		expect(result.currentPercent).toBeCloseTo(0) // 0/10000 * 100 = 0%
+		expect(result.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
+		expect(result.availablePercent).toBeCloseTo(18.08) // 1808/10000 * 100 = 18.08%
 	})
 
 	test("handles zero context window gracefully", () => {
@@ -87,9 +87,9 @@ describe("ContextWindowProgress Logic", () => {
 
 		const result = calculateTokenDistribution(contextWindow, contextTokens)
 
-		// With zero context window, everything should be zero
-		expect(result.reservedForOutput).toBe(0)
-		expect(result.availableSize).toBe(0)
+		// With zero context window, the function uses ANTHROPIC_DEFAULT_MAX_TOKENS but available size becomes 0
+		expect(result.reservedForOutput).toBe(8192) // ANTHROPIC_DEFAULT_MAX_TOKENS
+		expect(result.availableSize).toBe(0) // max(0, 0 - 1000 - 8192) = 0
 
 		// The percentages maintain total of 100% even with zero context window
 		// due to how the division handles this edge case
 	})
 
@@ -98,20 +98,20 @@ describe("ContextWindowProgress Logic", () => {
 	test("handles case where tokens exceed context window", () => {
-		const contextWindow = 4000
-		const contextTokens = 5000 // More tokens than the window size
+		const contextWindow = 10000
+		const contextTokens = 12000 // More tokens than the window size
 
 		const result = calculateTokenDistribution(contextWindow, contextTokens)
 
 		// Expected calculations:
-		// reservedForOutput = 0.2 * 4000 = 800
-		// availableSize = Math.max(0, 4000 - 5000 - 800) = 0
-		expect(result.reservedForOutput).toBe(800)
+		// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
+		// availableSize = Math.max(0, 10000 - 12000 - 8192) = 0
+		expect(result.reservedForOutput).toBe(8192)
 		expect(result.availableSize).toBe(0)
 
-		// Percentages should be calculated based on total (5000 + 800 + 0 = 5800)
-		expect(result.currentPercent).toBeCloseTo((5000 / 5800) * 100)
-		expect(result.reservedPercent).toBeCloseTo((800 / 5800) * 100)
+		// Percentages should be calculated based on total (12000 + 8192 + 0 = 20192)
+		expect(result.currentPercent).toBeCloseTo((12000 / 20192) * 100)
+		expect(result.reservedPercent).toBeCloseTo((8192 / 20192) * 100)
 		expect(result.availablePercent).toBeCloseTo(0)
 
 		// Verify percentages sum to 100%
diff --git a/webview-ui/src/utils/__tests__/model-utils.spec.ts b/webview-ui/src/utils/__tests__/model-utils.spec.ts
index 7b630e906eed..a8ae33300ad1 100644
--- a/webview-ui/src/utils/__tests__/model-utils.spec.ts
+++ b/webview-ui/src/utils/__tests__/model-utils.spec.ts
@@ -17,33 +17,33 @@ describe("calculateTokenDistribution", () => {
 		expect(Math.round(result.currentPercent + result.reservedPercent + result.availablePercent)).toBe(100)
 	})
 
-	it("should default to 20% of context window when maxTokens not provided", () => {
-		const contextWindow = 10000
+	it("should default to 8192 when maxTokens not provided", () => {
+		const contextWindow = 20000
 		const contextTokens = 5000
 
 		const result = calculateTokenDistribution(contextWindow, contextTokens)
 
-		expect(result.reservedForOutput).toBe(2000) // 20% of 10000
-		expect(result.availableSize).toBe(3000) // 10000 - 5000 - 2000
+		expect(result.reservedForOutput).toBe(8192)
+		expect(result.availableSize).toBe(6808) // 20000 - 5000 - 8192
 	})
 
 	it("should handle negative or zero inputs by using positive fallbacks", () => {
 		const result = calculateTokenDistribution(-1000, -500)
 
 		expect(result.currentPercent).toBe(0)
-		expect(result.reservedPercent).toBe(0)
+		expect(result.reservedPercent).toBe(100) // 8192 / 8192 = 100%
 		expect(result.availablePercent).toBe(0)
-		expect(result.reservedForOutput).toBe(0) // With negative inputs, both context window and tokens become 0, so 20% of 0 is 0
-		expect(result.availableSize).toBe(0)
+		expect(result.reservedForOutput).toBe(8192) // Uses ANTHROPIC_DEFAULT_MAX_TOKENS
+		expect(result.availableSize).toBe(0) // max(0, 0 - 0 - 8192) = 0
 	})
 
-	it("should handle zero total tokens without division by zero errors", () => {
-		const result = calculateTokenDistribution(0, 0, 0)
+	it("should handle zero context window without division by zero errors", () => {
+		const result = calculateTokenDistribution(0, 0)
 
 		expect(result.currentPercent).toBe(0)
-		expect(result.reservedPercent).toBe(0)
+		expect(result.reservedPercent).toBe(100) // When contextWindow is 0, reserved gets 100%
 		expect(result.availablePercent).toBe(0)
-		expect(result.reservedForOutput).toBe(0)
+		expect(result.reservedForOutput).toBe(8192) // Uses ANTHROPIC_DEFAULT_MAX_TOKENS when no maxTokens provided
 		expect(result.availableSize).toBe(0)
 	})
 })
diff --git a/webview-ui/src/utils/model-utils.ts b/webview-ui/src/utils/model-utils.ts
index 269f9865fb88..6ac31f5f1115 100644
--- a/webview-ui/src/utils/model-utils.ts
+++ b/webview-ui/src/utils/model-utils.ts
@@ -1,3 +1,5 @@
+import { ANTHROPIC_DEFAULT_MAX_TOKENS } from "@roo-code/types"
+
 /**
  * Result of token distribution calculation
  */
@@ -34,7 +36,7 @@ export interface TokenDistributionResult {
  *
  * @param contextWindow The total size of the context window
  * @param contextTokens The number of tokens currently used
- * @param maxTokens Optional override for tokens reserved for model output (otherwise uses 20% of window)
+ * @param maxTokens Optional override for tokens reserved for model output (otherwise uses 8192)
 * @returns Distribution of tokens with percentages and raw numbers
 */
 export const calculateTokenDistribution = (
@@ -47,9 +49,9 @@ export const calculateTokenDistribution = (
 	const safeContextTokens = Math.max(0, contextTokens)
 
 	// Get the actual max tokens value from the model
-	// If maxTokens is valid, use it, otherwise reserve 20% of the context window as a default
+	// If maxTokens is valid (positive and not equal to context window), use it, otherwise reserve 8192 tokens as a default
 	const reservedForOutput =
-		maxTokens && maxTokens > 0 && maxTokens !== safeContextWindow ? maxTokens : Math.ceil(safeContextWindow * 0.2)
+		maxTokens && maxTokens > 0 && maxTokens !== safeContextWindow ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS
 
 	// Calculate sizes directly without buffer display
 	const availableSize = Math.max(0, safeContextWindow - safeContextTokens - reservedForOutput)
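Note on the webview fallback above: the "reserve 20% of the context window" default is replaced by a flat reservation. Below is a minimal TypeScript sketch of that decision, not the actual implementation — the constant is inlined (assumed to be 8192, matching the updated test expectations) and the helper name is illustrative; it simply mirrors the ternary in `model-utils.ts`.

// Illustrative sketch only: ANTHROPIC_DEFAULT_MAX_TOKENS inlined as 8192 for this example.
const ANTHROPIC_DEFAULT_MAX_TOKENS = 8192

// Same choice the patched calculateTokenDistribution makes for its reserved slice:
// a positive maxTokens that isn't the whole window is used as-is, otherwise reserve a flat 8192.
function reservedForOutput(contextWindow: number, maxTokens?: number): number {
	const safeContextWindow = Math.max(0, contextWindow)
	return maxTokens && maxTokens > 0 && maxTokens !== safeContextWindow
		? maxTokens
		: ANTHROPIC_DEFAULT_MAX_TOKENS
}

console.log(reservedForOutput(10000)) // 8192 — default reservation, as in the updated specs
console.log(reservedForOutput(10000, 1500)) // 1500 — explicit override is used verbatim
console.log(reservedForOutput(32768, 32768)) // 8192 — maxTokens equal to the window is ignored
console.log(reservedForOutput(0)) // 8192 — why the zero-window test now expects 8192 reserved

One visible consequence, reflected in the zero-window and negative-input tests: when the window is smaller than 8192, the reserved slice exceeds the window itself, so availableSize clamps to 0 and reservedPercent dominates the percentage split.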