Merged
14 changes: 7 additions & 7 deletions packages/types/src/providers/groq.ts
@@ -17,7 +17,7 @@ export const groqDefaultModelId: GroqModelId = "llama-3.3-70b-versatile" // Defa
export const groqModels = {
// Models based on API response: https://api.groq.com/openai/v1/models
"llama-3.1-8b-instant": {
maxTokens: 131072,
maxTokens: 8192,
contextWindow: 131072,
supportsImages: false,
supportsPromptCache: false,
@@ -26,7 +26,7 @@ export const groqModels = {
description: "Meta Llama 3.1 8B Instant model, 128K context.",
},
"llama-3.3-70b-versatile": {
maxTokens: 32768,
maxTokens: 8192,
contextWindow: 131072,
supportsImages: false,
supportsPromptCache: false,
@@ -53,7 +53,7 @@ export const groqModels = {
description: "Meta Llama 4 Maverick 17B Instruct model, 128K context.",
},
"mistral-saba-24b": {
maxTokens: 32768,
maxTokens: 8192,
contextWindow: 32768,
supportsImages: false,
supportsPromptCache: false,
@@ -62,7 +62,7 @@ export const groqModels = {
description: "Mistral Saba 24B model, 32K context.",
},
"qwen-qwq-32b": {
maxTokens: 131072,
maxTokens: 8192,
contextWindow: 131072,
supportsImages: false,
supportsPromptCache: false,
@@ -71,7 +71,7 @@ export const groqModels = {
description: "Alibaba Qwen QwQ 32B model, 128K context.",
},
"qwen/qwen3-32b": {
maxTokens: 40960,
maxTokens: 8192,
contextWindow: 131072,
supportsImages: false,
supportsPromptCache: false,
@@ -80,7 +80,7 @@ export const groqModels = {
description: "Alibaba Qwen 3 32B model, 128K context.",
},
"deepseek-r1-distill-llama-70b": {
maxTokens: 131072,
maxTokens: 8192,
contextWindow: 131072,
supportsImages: false,
supportsPromptCache: false,
@@ -89,7 +89,7 @@ export const groqModels = {
description: "DeepSeek R1 Distill Llama 70B model, 128K context.",
},
"moonshotai/kimi-k2-instruct": {
maxTokens: 131072,
maxTokens: 8192,
contextWindow: 131072,
supportsImages: false,
supportsPromptCache: false,
34 changes: 11 additions & 23 deletions src/api/transform/model-params.ts
@@ -5,6 +5,7 @@ import {
DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS,
shouldUseReasoningBudget,
shouldUseReasoningEffort,
getModelMaxOutputTokens,
} from "../../shared/api"

import {
@@ -76,20 +77,25 @@ export function getModelParams({
reasoningEffort: customReasoningEffort,
} = settings

let maxTokens = model.maxTokens ?? undefined
// Use the centralized logic for computing maxTokens
const maxTokens = getModelMaxOutputTokens({
modelId,
model,
settings,
format,
})

let temperature = customTemperature ?? defaultTemperature
let reasoningBudget: ModelParams["reasoningBudget"] = undefined
let reasoningEffort: ModelParams["reasoningEffort"] = undefined

if (shouldUseReasoningBudget({ model, settings })) {
// If `customMaxTokens` is not specified use the default.
maxTokens = customMaxTokens ?? DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS

// If `customMaxThinkingTokens` is not specified use the default.
reasoningBudget = customMaxThinkingTokens ?? DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS

// Reasoning cannot exceed 80% of the `maxTokens` value.
if (reasoningBudget > Math.floor(maxTokens * 0.8)) {
// maxTokens should always be defined for reasoning budget models, but add a guard just in case
if (maxTokens && reasoningBudget > Math.floor(maxTokens * 0.8)) {
reasoningBudget = Math.floor(maxTokens * 0.8)
}

@@ -106,24 +112,6 @@
reasoningEffort = customReasoningEffort ?? model.reasoningEffort
}

// TODO: We should consolidate this logic to compute `maxTokens` with
// `getModelMaxOutputTokens` in order to maintain a single source of truth.

const isAnthropic = format === "anthropic" || (format === "openrouter" && modelId.startsWith("anthropic/"))

// For "Hybrid" reasoning models, we should discard the model's actual
// `maxTokens` value if we're not using reasoning. We do this for Anthropic
// models only for now. Should we do this for Gemini too?
if (model.supportsReasoningBudget && !reasoningBudget && isAnthropic) {
maxTokens = ANTHROPIC_DEFAULT_MAX_TOKENS
}

// For Anthropic models we should always make sure a `maxTokens` value is
// set.
if (!maxTokens && isAnthropic) {
maxTokens = ANTHROPIC_DEFAULT_MAX_TOKENS
}

const params: BaseModelParams = { maxTokens, temperature, reasoningEffort, reasoningBudget }

if (format === "anthropic") {
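For reference, a minimal worked sketch of the reasoning-budget clamp that survives the consolidation above. The numbers are illustrative stand-ins for whatever the settings actually provide, not values taken from the codebase:

```ts
// Sketch only: mirrors the clamp in getModelParams, with made-up numbers.
// For a hybrid reasoning model, maxTokens now comes from getModelMaxOutputTokens
// (settings.modelMaxTokens || DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS).
const maxTokens = 16384 // e.g. a user-configured modelMaxTokens
let reasoningBudget = 15000 // e.g. customMaxThinkingTokens from settings

// Reasoning may not exceed 80% of maxTokens, so 15000 is clamped down.
if (maxTokens && reasoningBudget > Math.floor(maxTokens * 0.8)) {
	reasoningBudget = Math.floor(maxTokens * 0.8) // => 13107
}
```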
4 changes: 2 additions & 2 deletions src/shared/__tests__/api.spec.ts
@@ -76,7 +76,7 @@ describe("getModelMaxOutputTokens", () => {
expect(result).toBe(32000)
})

test("should return 20% of context window when maxTokens is undefined", () => {
test("should return default of 8192 when maxTokens is undefined", () => {
const modelWithoutMaxTokens: ModelInfo = {
contextWindow: 100000,
supportsPromptCache: true,
@@ -88,7 +88,7 @@
settings: {},
})

expect(result).toBe(20000) // 20% of 100000
expect(result).toBe(8192)
})

test("should return ANTHROPIC_DEFAULT_MAX_TOKENS for Anthropic models that support reasoning budget but aren't using it", () => {
31 changes: 22 additions & 9 deletions src/shared/api.ts
@@ -58,36 +58,49 @@ export const getModelMaxOutputTokens = ({
modelId,
model,
settings,
format,
}: {
modelId: string
model: ModelInfo
settings?: ProviderSettings
format?: "anthropic" | "openai" | "gemini" | "openrouter"
}): number | undefined => {
// Check for Claude Code specific max output tokens setting
if (settings?.apiProvider === "claude-code") {
// Return the configured value or default to CLAUDE_CODE_DEFAULT_MAX_OUTPUT_TOKENS
return settings.claudeCodeMaxOutputTokens || CLAUDE_CODE_DEFAULT_MAX_OUTPUT_TOKENS
}

if (shouldUseReasoningBudget({ model, settings })) {
return settings?.modelMaxTokens || DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS
}

const isAnthropicModel = modelId.includes("claude")
const isAnthropicContext =
modelId.includes("claude") ||
format === "anthropic" ||
(format === "openrouter" && modelId.startsWith("anthropic/"))

// For "Hybrid" reasoning models, we should discard the model's actual
// `maxTokens` value if we're not using reasoning. We do this for Anthropic
// models only for now. Should we do this for Gemini too?
if (model.supportsReasoningBudget && isAnthropicModel) {
// For "Hybrid" reasoning models, discard the model's actual maxTokens for Anthropic contexts
if (model.supportsReasoningBudget && isAnthropicContext) {
return ANTHROPIC_DEFAULT_MAX_TOKENS
}

// If maxTokens is 0 or undefined or the full context window, fall back to 20% of context window
// For Anthropic contexts, always ensure a maxTokens value is set
if (isAnthropicContext && (!model.maxTokens || model.maxTokens === 0)) {
return ANTHROPIC_DEFAULT_MAX_TOKENS
}

// If model has explicit maxTokens and it's not the full context window, use it
if (model.maxTokens && model.maxTokens !== model.contextWindow) {
return model.maxTokens
} else {
return Math.ceil(model.contextWindow * 0.2)
}

// For non-Anthropic formats without explicit maxTokens, return undefined
if (format) {
return undefined
}

// Default fallback
return ANTHROPIC_DEFAULT_MAX_TOKENS
}

// GetModelsOptions
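Read as a whole, the updated getModelMaxOutputTokens resolves in this order: Claude Code setting → reasoning budget → Anthropic-context fallbacks → explicit model.maxTokens (when it differs from the context window) → undefined when a format is given → ANTHROPIC_DEFAULT_MAX_TOKENS otherwise. A hedged sketch of a few calls and the values this diff implies; the results are inferred from the hunks above rather than from running the code, and the model literals are trimmed to the fields that matter here:

```ts
import { getModelMaxOutputTokens } from "../../shared/api" // relative path as used from src/api/transform

// A Groq entry from the updated packages/types: explicit 8192 cap, 131K window.
const qwq = { contextWindow: 131072, maxTokens: 8192, supportsImages: false, supportsPromptCache: false }
// Explicit maxTokens that differs from the context window is returned as-is.
console.log(getModelMaxOutputTokens({ modelId: "qwen-qwq-32b", model: qwq, settings: {} })) // 8192

// No maxTokens plus a non-Anthropic format: the old 20%-of-context fallback is gone.
const bare = { contextWindow: 100000, supportsImages: false, supportsPromptCache: true }
console.log(getModelMaxOutputTokens({ modelId: "some/model", model: bare, settings: {}, format: "openai" })) // undefined

// Anthropic contexts (a "claude" model id, format "anthropic", or an "anthropic/" id on OpenRouter)
// fall back to ANTHROPIC_DEFAULT_MAX_TOKENS (8192) when the model has no explicit cap.
console.log(
	getModelMaxOutputTokens({ modelId: "anthropic/claude-3.5-sonnet", model: bare, settings: {}, format: "openrouter" }),
) // 8192
```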
76 changes: 38 additions & 38 deletions webview-ui/src/__tests__/ContextWindowProgressLogic.spec.ts
@@ -7,41 +7,41 @@ export {} // This makes the file a proper TypeScript module
describe("ContextWindowProgress Logic", () => {
// Using the shared utility function from model-utils.ts instead of reimplementing it

test("calculates correct token distribution with default 20% reservation", () => {
const contextWindow = 4000
test("calculates correct token distribution with default 8192 reservation", () => {
const contextWindow = 10000
const contextTokens = 1000

const result = calculateTokenDistribution(contextWindow, contextTokens)

// Expected calculations:
// reservedForOutput = 0.2 * 4000 = 800
// availableSize = 4000 - 1000 - 800 = 2200
// total = 1000 + 800 + 2200 = 4000
expect(result.reservedForOutput).toBe(800)
expect(result.availableSize).toBe(2200)
// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
// availableSize = 10000 - 1000 - 8192 = 808
// total = 1000 + 8192 + 808 = 10000
expect(result.reservedForOutput).toBe(8192)
expect(result.availableSize).toBe(808)

// Check percentages
expect(result.currentPercent).toBeCloseTo(25) // 1000/4000 * 100 = 25%
expect(result.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
expect(result.availablePercent).toBeCloseTo(55) // 2200/4000 * 100 = 55%
expect(result.currentPercent).toBeCloseTo(10) // 1000/10000 * 100 = 10%
expect(result.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
expect(result.availablePercent).toBeCloseTo(8.08) // 808/10000 * 100 = 8.08%

// Verify percentages sum to 100%
expect(result.currentPercent + result.reservedPercent + result.availablePercent).toBeCloseTo(100)
})

test("uses provided maxTokens when available instead of default calculation", () => {
const contextWindow = 4000
const contextWindow = 10000
const contextTokens = 1000

// First calculate with default 20% reservation (no maxTokens provided)
// First calculate with default 8192 reservation (no maxTokens provided)
const defaultResult = calculateTokenDistribution(contextWindow, contextTokens)

// Then calculate with custom maxTokens value
const customMaxTokens = 1500 // Custom maxTokens instead of default 20%
const customMaxTokens = 1500 // Custom maxTokens instead of default 8192
const customResult = calculateTokenDistribution(contextWindow, contextTokens, customMaxTokens)

// VERIFY MAXTOKEN PROP EFFECT: Custom maxTokens should be used directly instead of 20% calculation
const defaultReserved = Math.ceil(contextWindow * 0.2) // 800 tokens (20% of 4000)
// VERIFY MAXTOKEN PROP EFFECT: Custom maxTokens should be used directly instead of 8192 calculation
const defaultReserved = 8192 // ANTHROPIC_DEFAULT_MAX_TOKENS
expect(defaultResult.reservedForOutput).toBe(defaultReserved)
expect(customResult.reservedForOutput).toBe(customMaxTokens) // Should use exact provided value

@@ -51,13 +51,13 @@ describe("ContextWindowProgress Logic", () => {
expect(defaultTooltip).not.toBe(customTooltip)

// Verify the effect on available space
expect(customResult.availableSize).toBe(4000 - 1000 - 1500) // 1500 tokens available
expect(defaultResult.availableSize).toBe(4000 - 1000 - 800) // 2200 tokens available
expect(customResult.availableSize).toBe(10000 - 1000 - 1500) // 7500 tokens available
expect(defaultResult.availableSize).toBe(10000 - 1000 - 8192) // 808 tokens available

// Verify the effect on percentages
// With custom maxTokens (1500), the reserved percentage should be higher
expect(defaultResult.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
expect(customResult.reservedPercent).toBeCloseTo(37.5) // 1500/4000 * 100 = 37.5%
// With custom maxTokens (1500), the reserved percentage should be lower than default
expect(defaultResult.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
expect(customResult.reservedPercent).toBeCloseTo(15) // 1500/10000 * 100 = 15%

// Verify percentages still sum to 100%
expect(customResult.currentPercent + customResult.reservedPercent + customResult.availablePercent).toBeCloseTo(
@@ -66,19 +66,19 @@
})

test("handles negative input values", () => {
const contextWindow = 4000
const contextWindow = 10000
const contextTokens = -500 // Negative tokens should be handled gracefully

const result = calculateTokenDistribution(contextWindow, contextTokens)

// Expected calculations:
// safeContextTokens = Math.max(0, -500) = 0
// reservedForOutput = 0.2 * 4000 = 800
// availableSize = 4000 - 0 - 800 = 3200
// total = 0 + 800 + 3200 = 4000
expect(result.currentPercent).toBeCloseTo(0) // 0/4000 * 100 = 0%
expect(result.reservedPercent).toBeCloseTo(20) // 800/4000 * 100 = 20%
expect(result.availablePercent).toBeCloseTo(80) // 3200/4000 * 100 = 80%
// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
// availableSize = 10000 - 0 - 8192 = 1808
// total = 0 + 8192 + 1808 = 10000
expect(result.currentPercent).toBeCloseTo(0) // 0/10000 * 100 = 0%
expect(result.reservedPercent).toBeCloseTo(81.92) // 8192/10000 * 100 = 81.92%
expect(result.availablePercent).toBeCloseTo(18.08) // 1808/10000 * 100 = 18.08%
})

test("handles zero context window gracefully", () => {
@@ -87,9 +87,9 @@

const result = calculateTokenDistribution(contextWindow, contextTokens)

// With zero context window, everything should be zero
expect(result.reservedForOutput).toBe(0)
expect(result.availableSize).toBe(0)
// With zero context window, the function uses ANTHROPIC_DEFAULT_MAX_TOKENS but available size becomes 0
expect(result.reservedForOutput).toBe(8192) // ANTHROPIC_DEFAULT_MAX_TOKENS
expect(result.availableSize).toBe(0) // max(0, 0 - 1000 - 8192) = 0

// The percentages maintain total of 100% even with zero context window
// due to how the division handles this edge case
@@ -98,20 +98,20 @@
})

test("handles case where tokens exceed context window", () => {
const contextWindow = 4000
const contextTokens = 5000 // More tokens than the window size
const contextWindow = 10000
const contextTokens = 12000 // More tokens than the window size

const result = calculateTokenDistribution(contextWindow, contextTokens)

// Expected calculations:
// reservedForOutput = 0.2 * 4000 = 800
// availableSize = Math.max(0, 4000 - 5000 - 800) = 0
expect(result.reservedForOutput).toBe(800)
// reservedForOutput = 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS)
// availableSize = Math.max(0, 10000 - 12000 - 8192) = 0
expect(result.reservedForOutput).toBe(8192)
expect(result.availableSize).toBe(0)

// Percentages should be calculated based on total (5000 + 800 + 0 = 5800)
expect(result.currentPercent).toBeCloseTo((5000 / 5800) * 100)
expect(result.reservedPercent).toBeCloseTo((800 / 5800) * 100)
// Percentages should be calculated based on total (12000 + 8192 + 0 = 20192)
expect(result.currentPercent).toBeCloseTo((12000 / 20192) * 100)
expect(result.reservedPercent).toBeCloseTo((8192 / 20192) * 100)
expect(result.availablePercent).toBeCloseTo(0)

// Verify percentages sum to 100%
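The same 8192 default drives the webview-side math. A small hand-worked sketch of the first test case above, assuming calculateTokenDistribution behaves exactly as these assertions describe; the import path is the one implied by the test locations and the numbers are computed by hand, not by running the utility:

```ts
import { calculateTokenDistribution } from "../utils/model-utils" // as seen from webview-ui/src/__tests__

// No maxTokens argument: 8192 (ANTHROPIC_DEFAULT_MAX_TOKENS) is reserved for output
// instead of 20% of the context window.
const result = calculateTokenDistribution(10000, 1000)

// reservedForOutput = 8192
// availableSize     = max(0, 10000 - 1000 - 8192) = 808
// Percentages are taken over contextTokens + reserved + available = 10000:
//   currentPercent ≈ 10, reservedPercent ≈ 81.92, availablePercent ≈ 8.08
console.log(result.reservedForOutput, result.availableSize)
```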
22 changes: 11 additions & 11 deletions webview-ui/src/utils/__tests__/model-utils.spec.ts
@@ -17,33 +17,33 @@ describe("calculateTokenDistribution", () => {
expect(Math.round(result.currentPercent + result.reservedPercent + result.availablePercent)).toBe(100)
})

it("should default to 20% of context window when maxTokens not provided", () => {
const contextWindow = 10000
it("should default to 8192 when maxTokens not provided", () => {
const contextWindow = 20000
const contextTokens = 5000

const result = calculateTokenDistribution(contextWindow, contextTokens)

expect(result.reservedForOutput).toBe(2000) // 20% of 10000
expect(result.availableSize).toBe(3000) // 10000 - 5000 - 2000
expect(result.reservedForOutput).toBe(8192)
expect(result.availableSize).toBe(6808) // 20000 - 5000 - 8192
})

it("should handle negative or zero inputs by using positive fallbacks", () => {
const result = calculateTokenDistribution(-1000, -500)

expect(result.currentPercent).toBe(0)
expect(result.reservedPercent).toBe(0)
expect(result.reservedPercent).toBe(100) // 8192 / 8192 = 100%
expect(result.availablePercent).toBe(0)
expect(result.reservedForOutput).toBe(0) // With negative inputs, both context window and tokens become 0, so 20% of 0 is 0
expect(result.availableSize).toBe(0)
expect(result.reservedForOutput).toBe(8192) // Uses ANTHROPIC_DEFAULT_MAX_TOKENS
expect(result.availableSize).toBe(0) // max(0, 0 - 0 - 8192) = 0
})

it("should handle zero total tokens without division by zero errors", () => {
const result = calculateTokenDistribution(0, 0, 0)
it("should handle zero context window without division by zero errors", () => {
const result = calculateTokenDistribution(0, 0)

expect(result.currentPercent).toBe(0)
expect(result.reservedPercent).toBe(0)
expect(result.reservedPercent).toBe(100) // When contextWindow is 0, reserved gets 100%
expect(result.availablePercent).toBe(0)
expect(result.reservedForOutput).toBe(0)
expect(result.reservedForOutput).toBe(8192) // Uses ANTHROPIC_DEFAULT_MAX_TOKENS when no maxTokens provided
expect(result.availableSize).toBe(0)
})
})