From 3bb20f52831b5d202ec305ffc0b56c9c0472efbe Mon Sep 17 00:00:00 2001 From: Ed Zynda Date: Thu, 16 Apr 2026 23:12:10 +0300 Subject: [PATCH] feat(models): surface and prevent silent max-tokens truncation - Raise --max-tokens default from 4096 to 8192. - Auto-raise MaxTokens toward the model's catalog Limit.Output (capped at 32768) when the user hasn't set --max-tokens explicitly and no per-model modelSettings override applied. Prevents silent 4k/8k truncation on models that support 32k-262k output. - Surface FinishReasonLength at turn end: the app now subscribes to TurnEndEvent and renders a system-message banner explaining the current cap, the model's known ceiling, and how to raise it. Previously the TUI swallowed 'length' stops, producing 'ghost' truncations. - Export FinishReason* constants on pkg/kit (Stop, Length, ToolCalls, ContentFilter, Error, Other, Unknown) and fix stale comments that used Anthropic-style strings. - Add Kit.MaxTokens() and Kit.MaxOutputLimit() SDK accessors, backed by Agent.GetMaxTokens() which correctly returns 0 for providers that suppress the param (e.g. Codex OAuth). - Tests: rightSizeMaxTokens covers 7 paths (cap, raise, exact ceiling below cap, preserve, explicit flag, nil info, zero limit); handleTurnEnd covers length/ non-length/nil-sendFn and the fallback message formatter. - Docs: update configuration.md, cli/flags.md, and kit-extensions skill to reflect the new default and behavior. 
--- cmd/root.go | 2 +- internal/agent/agent.go | 16 ++++ internal/app/app.go | 60 ++++++++++++ internal/app/app_test.go | 93 +++++++++++++++++++ internal/models/providers.go | 36 ++++++++ internal/models/rightsize_test.go | 148 ++++++++++++++++++++++++++++++ pkg/kit/events.go | 42 ++++++++- pkg/kit/kit.go | 34 ++++++- skills/kit-extensions/SKILL.md | 2 +- www/pages/cli/flags.md | 2 +- www/pages/configuration.md | 4 +- 11 files changed, 429 insertions(+), 10 deletions(-) create mode 100644 internal/models/rightsize_test.go diff --git a/cmd/root.go b/cmd/root.go index cfc69cda..250918cc 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -297,7 +297,7 @@ func init() { flags.BoolVar(&noPromptTemplates, "no-prompt-templates", false, "disable prompt template discovery") // Model generation parameters - flags.IntVar(&maxTokens, "max-tokens", 4096, "maximum number of tokens in the response") + flags.IntVar(&maxTokens, "max-tokens", 8192, "maximum number of output tokens per response (auto-raised up to 32768 for models with higher known output limits; see internal/models/embedded_models.json)") flags.Float32Var(&temperature, "temperature", 0.7, "controls randomness in responses (0.0-1.0)") flags.Float32Var(&topP, "top-p", 0.95, "controls diversity via nucleus sampling (0.0-1.0)") flags.Int32Var(&topK, "top-k", 40, "controls diversity by limiting top K tokens to sample from") diff --git a/internal/agent/agent.go b/internal/agent/agent.go index 1d2bf286..8a5bd55f 100644 --- a/internal/agent/agent.go +++ b/internal/agent/agent.go @@ -1025,6 +1025,22 @@ func (a *Agent) GetModel() fantasy.LanguageModel { return a.model } +// GetMaxTokens returns the effective max output tokens the agent currently +// sends to the LLM provider, after per-model defaults, right-sizing, and any +// Anthropic thinking-budget adjustments. Returns 0 when no ModelConfig is +// attached (e.g. early init) or when the provider suppresses the parameter +// (e.g. 
Codex OAuth), which allows callers to differentiate "default" from +// "explicitly capped". +func (a *Agent) GetMaxTokens() int { + if a.skipMaxOutputTokens { + return 0 + } + if a.modelConfig == nil { + return 0 + } + return a.modelConfig.MaxTokens +} + // Close closes the agent and cleans up resources. // If MCP tools are still loading in the background, Close waits for them // to finish before closing connections to avoid resource leaks. diff --git a/internal/app/app.go b/internal/app/app.go index d2881f1e..e2d04e18 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -932,6 +932,8 @@ func (a *App) subscribeSDKEvents(sendFn func(tea.Msg), stepUsageSeen *atomic.Boo Password: resp.Password, Cancelled: resp.Cancelled, } + case kit.TurnEndEvent: + a.handleTurnEnd(ev, sendFn) } })) @@ -942,6 +944,64 @@ func (a *App) subscribeSDKEvents(sendFn func(tea.Msg), stepUsageSeen *atomic.Boo } } +// handleTurnEnd inspects a turn's final StopReason and surfaces actionable +// feedback to the user when the turn ended in a state they can act on. +// +// Today the only surfaced case is FinishReasonLength — the model hit its +// configured max_output_tokens budget and the reply was truncated. Without +// this banner the TUI used to swallow the truncation silently, leading to +// "ghost" cut-offs with no indication of why. +// +// Separated from subscribeSDKEvents so tests can exercise it directly via a +// stubbed sendFn without standing up a full Kit. +func (a *App) handleTurnEnd(ev kit.TurnEndEvent, sendFn func(tea.Msg)) { + if sendFn == nil { + return + } + if ev.StopReason != kit.FinishReasonLength { + return + } + sendFn(ExtensionPrintEvent{ + Level: "info", + Text: a.formatMaxTokensTruncatedMessage(), + }) +} + +// formatMaxTokensTruncatedMessage builds the user-facing explanation for a +// truncated turn. It reports the active max_output_tokens budget and, when +// known, the model's catalog output ceiling so the user can judge how much +// headroom is available. 
+func (a *App) formatMaxTokensTruncatedMessage() string { + k := a.opts.Kit + if k == nil { + // Extremely early / test-stub case: still emit a useful generic hint. + return "⚠ Response truncated: the model hit the configured max_output_tokens limit. " + + "Raise it with --max-tokens N, KIT_MAX_TOKENS=N, or per-model " + + "modelSettings[provider/model].maxTokens in config." + } + current := k.MaxTokens() + ceiling := k.MaxOutputLimit() + model := k.GetModelString() + + msg := "⚠ Response truncated: " + if model != "" { + msg += fmt.Sprintf("%s hit the configured max_output_tokens limit", model) + } else { + msg += "the model hit the configured max_output_tokens limit" + } + if current > 0 { + msg += fmt.Sprintf(" (%d)", current) + } + msg += "." + if ceiling > 0 && current > 0 && ceiling > current { + msg += fmt.Sprintf(" This model supports up to %d output tokens.", ceiling) + } + msg += "\n\nRaise it with --max-tokens N, KIT_MAX_TOKENS=N, " + + "or per-model modelSettings[provider/model].maxTokens in your config. " + + "Re-run the last prompt after raising it to get the full response." + return msg +} + // QuitFromExtension triggers a graceful shutdown. In interactive mode it // sends a tea.QuitMsg to the program so the TUI exits cleanly. 
In // non-interactive mode it cancels the root context, stopping any in-flight diff --git a/internal/app/app_test.go b/internal/app/app_test.go index fab6b6c6..3202af6b 100644 --- a/internal/app/app_test.go +++ b/internal/app/app_test.go @@ -3,10 +3,12 @@ package app import ( "context" "errors" + "strings" "sync" "testing" "time" + tea "charm.land/bubbletea/v2" kit "github.com/mark3labs/kit/pkg/kit" ) @@ -666,3 +668,94 @@ func TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories(t *testing.T) expected, usage.contextCalls, usage.lastContextTokens) } } + +// TestHandleTurnEnd_LengthEmitsWarning verifies that when the SDK reports a +// FinishReasonLength (max_output_tokens hit), the app surfaces a user-visible +// ExtensionPrintEvent with Level="info" so the TUI can render a banner +// instead of silently showing a truncated reply. +func TestHandleTurnEnd_LengthEmitsWarning(t *testing.T) { + app := New(Options{}, nil) + defer app.Close() + + var mu sync.Mutex + var received []tea.Msg + sendFn := func(m tea.Msg) { + mu.Lock() + defer mu.Unlock() + received = append(received, m) + } + + app.handleTurnEnd(kit.TurnEndEvent{StopReason: kit.FinishReasonLength}, sendFn) + + mu.Lock() + defer mu.Unlock() + if len(received) != 1 { + t.Fatalf("expected 1 event on length stop, got %d", len(received)) + } + ev, ok := received[0].(ExtensionPrintEvent) + if !ok { + t.Fatalf("expected ExtensionPrintEvent, got %T", received[0]) + } + if ev.Level != "info" { + t.Errorf("expected Level=info, got %q", ev.Level) + } + if ev.Text == "" { + t.Error("expected non-empty warning text") + } + if !strings.Contains(ev.Text, "max_output_tokens") { + t.Errorf("warning text should mention max_output_tokens, got: %s", ev.Text) + } +} + +// TestHandleTurnEnd_NonLengthIgnored verifies that ordinary stop reasons +// (stop, tool-calls, error, unknown, "") do not produce a warning banner. 
+func TestHandleTurnEnd_NonLengthIgnored(t *testing.T) { + app := New(Options{}, nil) + defer app.Close() + + reasons := []string{ + kit.FinishReasonStop, + kit.FinishReasonToolCalls, + kit.FinishReasonError, + kit.FinishReasonContentFilter, + kit.FinishReasonOther, + kit.FinishReasonUnknown, + "", + } + for _, r := range reasons { + var called bool + app.handleTurnEnd(kit.TurnEndEvent{StopReason: r}, func(m tea.Msg) { + called = true + }) + if called { + t.Errorf("stop reason %q unexpectedly emitted a warning", r) + } + } +} + +// TestHandleTurnEnd_NilSendFn guards against panics when no TUI listener is +// attached (e.g. early init or headless teardown). +func TestHandleTurnEnd_NilSendFn(t *testing.T) { + app := New(Options{}, nil) + defer app.Close() + + // Should not panic with a nil sendFn. + app.handleTurnEnd(kit.TurnEndEvent{StopReason: kit.FinishReasonLength}, nil) +} + +// TestFormatMaxTokensTruncatedMessage_NoKit verifies the fallback message +// when Options.Kit is nil (test/stub path). +func TestFormatMaxTokensTruncatedMessage_NoKit(t *testing.T) { + app := New(Options{}, nil) + defer app.Close() + + msg := app.formatMaxTokensTruncatedMessage() + if msg == "" { + t.Fatal("expected non-empty fallback message") + } + for _, needle := range []string{"max_output_tokens", "--max-tokens", "KIT_MAX_TOKENS", "modelSettings"} { + if !strings.Contains(msg, needle) { + t.Errorf("fallback message missing %q:\n%s", needle, msg) + } + } +} diff --git a/internal/models/providers.go b/internal/models/providers.go index d026180d..a339f757 100644 --- a/internal/models/providers.go +++ b/internal/models/providers.go @@ -251,6 +251,11 @@ func CreateProvider(ctx context.Context, config *ProviderConfig) (*ProviderResul // via CLI flag or global config. ApplyModelSettings(config, modelInfo) + // Auto-raise MaxTokens toward the model's known output ceiling when the + // user hasn't explicitly set --max-tokens and no per-model override + // applied. 
Runs after ApplyModelSettings so explicit modelSettings win. + rightSizeMaxTokens(config, modelInfo) + // Create the base provider var result *ProviderResult var createErr error @@ -489,6 +494,37 @@ func validateModelConfig(config *ProviderConfig, modelInfo *ModelInfo) { } } +// defaultRightSizeCap bounds auto-raised MaxTokens so that we don't silently +// allocate enormous output budgets for models with very high ceilings (e.g. +// Devstral at 262144, Mistral at 128000). Users who genuinely want more can +// pass --max-tokens explicitly or set modelSettings[...].maxTokens in config. +const defaultRightSizeCap = 32768 + +// rightSizeMaxTokens raises config.MaxTokens toward the model's known output +// ceiling when: +// - the user has not explicitly set --max-tokens (or the KIT_MAX_TOKENS env +// var, or the top-level max-tokens key in config.yaml), AND +// - no per-model override already bumped MaxTokens (ApplyModelSettings runs +// before this function), AND +// - modelInfo.Limit.Output is known and larger than the current MaxTokens. +// +// The raised value is capped at defaultRightSizeCap to keep accidental +// allocations reasonable on very-large-output models. This prevents the +// common "ghost" where the agent's reply is silently truncated at the 8192 +// default even though the selected model supports 64k or 262k output tokens. +func rightSizeMaxTokens(config *ProviderConfig, modelInfo *ModelInfo) { + if modelInfo == nil || modelInfo.Limit.Output <= 0 { + return + } + if isExplicitlySet("max-tokens") { + return + } + target := min(modelInfo.Limit.Output, defaultRightSizeCap) + if config.MaxTokens < target { + config.MaxTokens = target + } +} + // clearConflictingAnthropicSamplingParams ensures that temperature and top_p are // not both sent to the Anthropic API, which rejects requests containing both. 
// When both are set (typically from defaults), top_p is cleared so that diff --git a/internal/models/rightsize_test.go b/internal/models/rightsize_test.go new file mode 100644 index 00000000..51c5067a --- /dev/null +++ b/internal/models/rightsize_test.go @@ -0,0 +1,148 @@ +package models + +import ( + "testing" + + "github.com/spf13/pflag" + "github.com/spf13/viper" +) + +// bindMaxTokensFlag wires a fresh pflag-backed "max-tokens" key into viper so +// isExplicitlySet behaves the same way it does in production. Returns a +// cleanup function that removes the binding so sibling tests see a clean +// state. +func bindMaxTokensFlag(t *testing.T, args []string) func() { + t.Helper() + fs := pflag.NewFlagSet("test", pflag.ContinueOnError) + fs.Int("max-tokens", 8192, "") + if err := viper.BindPFlag("max-tokens", fs.Lookup("max-tokens")); err != nil { + t.Fatalf("BindPFlag: %v", err) + } + if err := fs.Parse(args); err != nil { + t.Fatalf("fs.Parse: %v", err) + } + return func() { + viper.Reset() + } +} + +func TestRightSizeMaxTokens_RaisesWhenBelowCeiling(t *testing.T) { + cleanup := bindMaxTokensFlag(t, nil) // no args → flag.Changed = false + defer cleanup() + + config := &ProviderConfig{MaxTokens: 8192} + modelInfo := &ModelInfo{ + ID: "claude-sonnet-4-5", + Limit: Limit{Context: 200000, Output: 64000}, + } + + rightSizeMaxTokens(config, modelInfo) + + if config.MaxTokens != 32768 { + t.Errorf("expected MaxTokens raised to defaultRightSizeCap (32768), got %d", config.MaxTokens) + } +} + +func TestRightSizeMaxTokens_CapsAtDefaultRightSizeCap(t *testing.T) { + cleanup := bindMaxTokensFlag(t, nil) + defer cleanup() + + config := &ProviderConfig{MaxTokens: 8192} + // Mistral Devstral has 262144 output — we should still cap at 32768. 
+ modelInfo := &ModelInfo{ + ID: "devstral-medium-latest", + Limit: Limit{Context: 262144, Output: 262144}, + } + + rightSizeMaxTokens(config, modelInfo) + + if config.MaxTokens != defaultRightSizeCap { + t.Errorf("expected MaxTokens capped at %d, got %d", defaultRightSizeCap, config.MaxTokens) + } +} + +func TestRightSizeMaxTokens_UsesExactOutputWhenBelowCap(t *testing.T) { + cleanup := bindMaxTokensFlag(t, nil) + defer cleanup() + + config := &ProviderConfig{MaxTokens: 4096} + // Model with output limit smaller than the cap. + modelInfo := &ModelInfo{ + ID: "gpt-4", + Limit: Limit{Context: 8192, Output: 8192}, + } + + rightSizeMaxTokens(config, modelInfo) + + if config.MaxTokens != 8192 { + t.Errorf("expected MaxTokens raised to model output ceiling (8192), got %d", config.MaxTokens) + } +} + +func TestRightSizeMaxTokens_DoesNotLowerCurrentValue(t *testing.T) { + cleanup := bindMaxTokensFlag(t, nil) + defer cleanup() + + // User (via per-model settings, applied earlier) already bumped MaxTokens + // above the cap — we must not clobber their choice. + config := &ProviderConfig{MaxTokens: 100000} + modelInfo := &ModelInfo{ + ID: "devstral-medium-latest", + Limit: Limit{Context: 262144, Output: 262144}, + } + + rightSizeMaxTokens(config, modelInfo) + + if config.MaxTokens != 100000 { + t.Errorf("expected MaxTokens preserved at 100000, got %d", config.MaxTokens) + } +} + +func TestRightSizeMaxTokens_RespectsExplicitFlag(t *testing.T) { + // Simulate `--max-tokens 4096` on the command line. 
+ cleanup := bindMaxTokensFlag(t, []string{"--max-tokens", "4096"}) + defer cleanup() + + config := &ProviderConfig{MaxTokens: 4096} + modelInfo := &ModelInfo{ + ID: "claude-sonnet-4-5", + Limit: Limit{Context: 200000, Output: 64000}, + } + + rightSizeMaxTokens(config, modelInfo) + + if config.MaxTokens != 4096 { + t.Errorf("expected explicit --max-tokens to be preserved (4096), got %d", config.MaxTokens) + } +} + +func TestRightSizeMaxTokens_NilModelInfo(t *testing.T) { + cleanup := bindMaxTokensFlag(t, nil) + defer cleanup() + + config := &ProviderConfig{MaxTokens: 8192} + // Custom model / Ollama / unknown provider → no model info. + rightSizeMaxTokens(config, nil) + + if config.MaxTokens != 8192 { + t.Errorf("expected MaxTokens unchanged with nil modelInfo, got %d", config.MaxTokens) + } +} + +func TestRightSizeMaxTokens_ZeroOutputLimit(t *testing.T) { + cleanup := bindMaxTokensFlag(t, nil) + defer cleanup() + + config := &ProviderConfig{MaxTokens: 8192} + // Model present in catalog but with no known output limit. + modelInfo := &ModelInfo{ + ID: "unknown-model", + Limit: Limit{Context: 0, Output: 0}, + } + + rightSizeMaxTokens(config, modelInfo) + + if config.MaxTokens != 8192 { + t.Errorf("expected MaxTokens unchanged with zero output limit, got %d", config.MaxTokens) + } +} diff --git a/pkg/kit/events.go b/pkg/kit/events.go index dc9c9a15..96bafcde 100644 --- a/pkg/kit/events.go +++ b/pkg/kit/events.go @@ -110,6 +110,38 @@ func parseToolArgs(toolArgs string) map[string]any { return nil } +// --------------------------------------------------------------------------- +// Finish reason constants +// --------------------------------------------------------------------------- + +// Finish reasons reported by the LLM provider on a completed turn. These +// mirror fantasy.FinishReason string values so comparisons against +// TurnEndEvent.StopReason / TurnResult.StopReason are stable across +// providers. 
+const ( + // FinishReasonStop: the model produced a natural stop (e.g. stop sequence + // or end-of-turn signal). + FinishReasonStop = "stop" + // FinishReasonLength: the model hit the configured max_output_tokens + // budget. The response is truncated. Surface this to the user and + // consider raising --max-tokens / KIT_MAX_TOKENS / modelSettings[...] + // .maxTokens. + FinishReasonLength = "length" + // FinishReasonToolCalls: the model stopped to emit tool calls (normal + // mid-turn state during agentic loops). + FinishReasonToolCalls = "tool-calls" + // FinishReasonContentFilter: the provider's safety filter stopped + // generation. + FinishReasonContentFilter = "content-filter" + // FinishReasonError: the model stopped because of an error. + FinishReasonError = "error" + // FinishReasonOther: provider-specific reason that doesn't map to any of + // the above. + FinishReasonOther = "other" + // FinishReasonUnknown: the provider didn't report a finish reason. + FinishReasonUnknown = "unknown" +) + // --------------------------------------------------------------------------- // Concrete event structs // --------------------------------------------------------------------------- @@ -124,9 +156,13 @@ func (e TurnStartEvent) EventType() EventType { return EventTurnStart } // TurnEndEvent fires after the agent finishes processing. type TurnEndEvent struct { - Response string - Error error - StopReason string // "end_turn", "max_tokens", "tool_use", "error", etc. + Response string + Error error + // StopReason is the LLM provider's finish reason for the final step of + // the turn. Compare against the FinishReason* constants — in particular, + // FinishReasonLength indicates the response was truncated because the + // agent hit its max_output_tokens budget. + StopReason string } // EventType implements Event. 
diff --git a/pkg/kit/kit.go b/pkg/kit/kit.go index d19b71ac..4e587525 100644 --- a/pkg/kit/kit.go +++ b/pkg/kit/kit.go @@ -1451,8 +1451,9 @@ type TurnResult struct { Response string // StopReason indicates why the turn ended. Derived from the LLM - // provider's finish reason: "stop", "length" (max tokens), "tool-calls", - // "content-filter", "error", "other", "unknown". + // provider's finish reason: FinishReasonStop, FinishReasonLength (max + // output tokens reached), FinishReasonToolCalls, FinishReasonContentFilter, + // FinishReasonError, FinishReasonOther, FinishReasonUnknown. StopReason string // SessionID is the UUID of the session this turn belongs to. @@ -2249,6 +2250,35 @@ func (m *Kit) GetTools() []Tool { return m.agent.GetTools() } +// MaxTokens returns the effective max output tokens currently configured for +// the agent. This is the value actually sent to the LLM provider on each +// request, after CLI/env/config resolution, per-model overrides, model-aware +// right-sizing, and any Anthropic thinking-budget adjustments. +// +// Returns 0 when the active provider suppresses the max_output_tokens +// parameter (e.g. OpenAI Codex OAuth) or when no model is configured yet. +// A non-zero value is the number that will cause a FinishReasonLength +// truncation if the model tries to generate beyond it. +func (m *Kit) MaxTokens() int { + if m.agent == nil { + return 0 + } + return m.agent.GetMaxTokens() +} + +// MaxOutputLimit returns the catalog-reported output ceiling for the current +// model in tokens, or 0 when the model isn't in the registry (custom models, +// new releases, Ollama, etc.). Pair with MaxTokens() to detect when the agent +// is configured well below what the model supports and surface a hint to the +// user. +func (m *Kit) MaxOutputLimit() int { + info := m.GetModelInfo() + if info == nil { + return 0 + } + return info.Limit.Output +} + // extractFileParts returns all FilePart entries from a message's Content. 
// Used to preserve image attachments when replacing user message text. func extractFileParts(msg fantasy.Message) []fantasy.FilePart { diff --git a/skills/kit-extensions/SKILL.md b/skills/kit-extensions/SKILL.md index 98d20209..546096b1 100644 --- a/skills/kit-extensions/SKILL.md +++ b/skills/kit-extensions/SKILL.md @@ -93,7 +93,7 @@ api.OnAgentEnd(func(e ext.AgentEndEvent, ctx ext.Context) { // e.Response string // e.StopReason string — "error" (on failure), "completed" (when LLM returns // empty stop reason), or the raw LLM provider value passed through - // (e.g. "stop", "end_turn", "max_tokens", "tool_use"). + // (e.g. "stop", "length" (max output tokens hit), "tool-calls", "content-filter"). // To detect errors, check e.StopReason == "error". // Do NOT compare against "completed" for success — instead check != "error". }) diff --git a/www/pages/cli/flags.md b/www/pages/cli/flags.md index 89c4e5b0..52be0b5f 100644 --- a/www/pages/cli/flags.md +++ b/www/pages/cli/flags.md @@ -52,7 +52,7 @@ These flags control Kit's behavior. When a prompt is passed as a positional argu | Flag | Short | Default | Description | |------|-------|---------|-------------| -| `--max-tokens` | — | `4096` | Maximum tokens in response | +| `--max-tokens` | — | `8192` | Base cap for output tokens. Auto-raised per-model up to 32768 when the model's catalog ceiling is higher and no explicit value is set. 
| | `--temperature` | — | `0.7` | Randomness 0.0–1.0 | | `--top-p` | — | `0.95` | Nucleus sampling 0.0–1.0 | | `--top-k` | — | `40` | Limit top K tokens | diff --git a/www/pages/configuration.md b/www/pages/configuration.md index 00a113d4..802fa5c4 100644 --- a/www/pages/configuration.md +++ b/www/pages/configuration.md @@ -18,7 +18,7 @@ Create `~/.kit.yml`: ```yaml model: anthropic/claude-sonnet-latest -max-tokens: 4096 +max-tokens: 8192 temperature: 0.7 stream: true ``` @@ -28,7 +28,7 @@ stream: true | Key | Type | Default | Description | |-----|------|---------|-------------| | `model` | string | `anthropic/claude-sonnet-latest` | Model to use (provider/model format) | -| `max-tokens` | int | `4096` | Maximum tokens in response | +| `max-tokens` | int | `8192` | Base cap for output tokens. Auto-raised per-model up to 32768 when the model's catalog ceiling is higher and no explicit value is set. Use [`modelSettings[provider/model].maxTokens`](#per-model-settings) to override per-model. | | `temperature` | float | `0.7` | Randomness 0.0–1.0 | | `top-p` | float | `0.95` | Nucleus sampling 0.0–1.0 | | `top-k` | int | `40` | Limit top K tokens |