feat(models): surface and prevent silent max-tokens truncation

- Raise --max-tokens default from 4096 to 8192.
- Auto-raise MaxTokens toward the model's catalog Limit.Output (capped at 32768) when the user hasn't set --max-tokens explicitly and no per-model modelSettings override applied. Prevents silent 4k/8k truncation on models that support 32k-262k output.
- Surface FinishReasonLength at turn end: the app now subscribes to TurnEndEvent and renders a system-message banner explaining the current cap, the model's known ceiling, and how to raise it. Previously the TUI swallowed 'length' stops, producing 'ghost' truncations.
- Export FinishReason* constants on pkg/kit (Stop, Length, ToolCalls, ContentFilter, Error, Other, Unknown) and fix stale comments that used Anthropic-style strings.
- Add Kit.MaxTokens() and Kit.MaxOutputLimit() SDK accessors, backed by Agent.GetMaxTokens() which correctly returns 0 for providers that suppress the param (e.g. Codex OAuth).
- Tests: rightSizeMaxTokens covers 7 paths (raise, cap, exact output below cap, preserve, explicit flag, nil info, zero limit); handleTurnEnd covers length/non-length/nil-sendFn and the fallback message formatter.
- Docs: update configuration.md, cli/flags.md, and kit-extensions skill to reflect the new default and behavior.
2026-06-14 03:30:26 +00:00 · 2026-04-16 23:12:10 +03:00
parent 633fa38b2b
commit 3bb20f5283
11 changed files with 429 additions and 10 deletions
@@ -297,7 +297,7 @@ func init() {
 	flags.BoolVar(&noPromptTemplates, "no-prompt-templates", false, "disable prompt template discovery")

 	// Model generation parameters
-	flags.IntVar(&maxTokens, "max-tokens", 4096, "maximum number of tokens in the response")
+	flags.IntVar(&maxTokens, "max-tokens", 8192, "maximum number of output tokens per response (auto-raised up to 32768 for models with higher known output limits; see internal/models/embedded_models.json)")
 	flags.Float32Var(&temperature, "temperature", 0.7, "controls randomness in responses (0.0-1.0)")
 	flags.Float32Var(&topP, "top-p", 0.95, "controls diversity via nucleus sampling (0.0-1.0)")
 	flags.Int32Var(&topK, "top-k", 40, "controls diversity by limiting top K tokens to sample from")
@@ -1025,6 +1025,22 @@ func (a *Agent) GetModel() fantasy.LanguageModel {
 	return a.model
 }

+// GetMaxTokens returns the effective max output tokens the agent currently
+// sends to the LLM provider, after per-model defaults, right-sizing, and any
+// Anthropic thinking-budget adjustments. Returns 0 when no ModelConfig is
+// attached (e.g. early init) or when the provider suppresses the parameter
+// (e.g. Codex OAuth), which allows callers to differentiate "default" from
+// "explicitly capped".
+func (a *Agent) GetMaxTokens() int {
+	if a.skipMaxOutputTokens {
+		return 0
+	}
+	if a.modelConfig == nil {
+		return 0
+	}
+	return a.modelConfig.MaxTokens
+}
+
 // Close closes the agent and cleans up resources.
 // If MCP tools are still loading in the background, Close waits for them
 // to finish before closing connections to avoid resource leaks.
@@ -932,6 +932,8 @@ func (a *App) subscribeSDKEvents(sendFn func(tea.Msg), stepUsageSeen *atomic.Boo
 				Password:  resp.Password,
 				Cancelled: resp.Cancelled,
 			}
+		case kit.TurnEndEvent:
+			a.handleTurnEnd(ev, sendFn)
 		}
 	}))

@@ -942,6 +944,64 @@ func (a *App) subscribeSDKEvents(sendFn func(tea.Msg), stepUsageSeen *atomic.Boo
 	}
 }

+// handleTurnEnd inspects a turn's final StopReason and, when it equals
+// kit.FinishReasonLength, sends one ExtensionPrintEvent (Level "info") whose
+// text comes from formatMaxTokensTruncatedMessage. Every other stop reason,
+// and a nil sendFn, is a silent no-op.
+//
+// FinishReasonLength means the model hit its configured max_output_tokens
+// budget and the reply was truncated; without this banner the TUI swallowed
+// the truncation, producing "ghost" cut-offs with no indication of why.
+// Kept separate from subscribeSDKEvents so tests can call it directly with
+// a stubbed sendFn instead of standing up a full Kit.
+func (a *App) handleTurnEnd(ev kit.TurnEndEvent, sendFn func(tea.Msg)) {
+	if sendFn == nil {
+		return
+	}
+	if ev.StopReason != kit.FinishReasonLength {
+		return
+	}
+	sendFn(ExtensionPrintEvent{
+		Level: "info",
+		Text:  a.formatMaxTokensTruncatedMessage(),
+	})
+}
+
+// formatMaxTokensTruncatedMessage builds the user-facing explanation for a
+// truncated turn: the model string and active budget when Kit reports them,
+// the catalog ceiling only when it exceeds a non-zero budget, then how to
+// raise the cap. With a nil Kit it returns a generic hint instead.
+func (a *App) formatMaxTokensTruncatedMessage() string {
+	k := a.opts.Kit
+	if k == nil {
+		// Extremely early / test-stub case: still emit a useful generic hint.
+		return "⚠ Response truncated: the model hit the configured max_output_tokens limit. " +
+			"Raise it with --max-tokens N, KIT_MAX_TOKENS=N, or per-model " +
+			"modelSettings[provider/model].maxTokens in config."
+	}
+	current := k.MaxTokens()
+	ceiling := k.MaxOutputLimit()
+	model := k.GetModelString()
+
+	msg := "⚠ Response truncated: "
+	if model != "" {
+		msg += fmt.Sprintf("%s hit the configured max_output_tokens limit", model)
+	} else {
+		msg += "the model hit the configured max_output_tokens limit"
+	}
+	if current > 0 {
+		msg += fmt.Sprintf(" (%d)", current)
+	}
+	msg += "."
+	if ceiling > 0 && current > 0 && ceiling > current {
+		msg += fmt.Sprintf(" This model supports up to %d output tokens.", ceiling)
+	}
+	msg += "\n\nRaise it with --max-tokens N, KIT_MAX_TOKENS=N, " +
+		"or per-model modelSettings[provider/model].maxTokens in your config. " +
+		"Re-run the last prompt after raising it to get the full response."
+	return msg
+}
+
 // QuitFromExtension triggers a graceful shutdown. In interactive mode it
 // sends a tea.QuitMsg to the program so the TUI exits cleanly. In
 // non-interactive mode it cancels the root context, stopping any in-flight
@@ -3,10 +3,12 @@ package app
 import (
 	"context"
 	"errors"
+	"strings"
 	"sync"
 	"testing"
 	"time"

+	tea "charm.land/bubbletea/v2"
 	kit "github.com/mark3labs/kit/pkg/kit"
 )

@@ -666,3 +668,94 @@ func TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories(t *testing.T)
 			expected, usage.contextCalls, usage.lastContextTokens)
 	}
 }
+
+// TestHandleTurnEnd_LengthEmitsWarning verifies that when the SDK reports a
+// FinishReasonLength (max_output_tokens hit), the app surfaces a user-visible
+// ExtensionPrintEvent with Level="info" so the TUI can render a banner
+// instead of silently showing a truncated reply.
+func TestHandleTurnEnd_LengthEmitsWarning(t *testing.T) {
+	app := New(Options{}, nil)
+	defer app.Close()
+
+	var mu sync.Mutex
+	var received []tea.Msg
+	sendFn := func(m tea.Msg) {
+		mu.Lock()
+		defer mu.Unlock()
+		received = append(received, m)
+	}
+
+	app.handleTurnEnd(kit.TurnEndEvent{StopReason: kit.FinishReasonLength}, sendFn)
+
+	mu.Lock()
+	defer mu.Unlock()
+	if len(received) != 1 {
+		t.Fatalf("expected 1 event on length stop, got %d", len(received))
+	}
+	ev, ok := received[0].(ExtensionPrintEvent)
+	if !ok {
+		t.Fatalf("expected ExtensionPrintEvent, got %T", received[0])
+	}
+	if ev.Level != "info" {
+		t.Errorf("expected Level=info, got %q", ev.Level)
+	}
+	if ev.Text == "" {
+		t.Error("expected non-empty warning text")
+	}
+	if !strings.Contains(ev.Text, "max_output_tokens") {
+		t.Errorf("warning text should mention max_output_tokens, got: %s", ev.Text)
+	}
+}
+
+// TestHandleTurnEnd_NonLengthIgnored verifies that ordinary stop reasons
+// (stop, tool-calls, error, unknown, "") do not produce a warning banner.
+func TestHandleTurnEnd_NonLengthIgnored(t *testing.T) {
+	app := New(Options{}, nil)
+	defer app.Close()
+
+	reasons := []string{
+		kit.FinishReasonStop,
+		kit.FinishReasonToolCalls,
+		kit.FinishReasonError,
+		kit.FinishReasonContentFilter,
+		kit.FinishReasonOther,
+		kit.FinishReasonUnknown,
+		"",
+	}
+	for _, r := range reasons {
+		var called bool
+		app.handleTurnEnd(kit.TurnEndEvent{StopReason: r}, func(m tea.Msg) {
+			called = true
+		})
+		if called {
+			t.Errorf("stop reason %q unexpectedly emitted a warning", r)
+		}
+	}
+}
+
+// TestHandleTurnEnd_NilSendFn guards against panics when no TUI listener is
+// attached (e.g. early init or headless teardown).
+func TestHandleTurnEnd_NilSendFn(t *testing.T) {
+	app := New(Options{}, nil)
+	defer app.Close()
+
+	// Should not panic with a nil sendFn.
+	app.handleTurnEnd(kit.TurnEndEvent{StopReason: kit.FinishReasonLength}, nil)
+}
+
+// TestFormatMaxTokensTruncatedMessage_NoKit verifies the fallback message
+// when Options.Kit is nil (test/stub path).
+func TestFormatMaxTokensTruncatedMessage_NoKit(t *testing.T) {
+	app := New(Options{}, nil)
+	defer app.Close()
+
+	msg := app.formatMaxTokensTruncatedMessage()
+	if msg == "" {
+		t.Fatal("expected non-empty fallback message")
+	}
+	for _, needle := range []string{"max_output_tokens", "--max-tokens", "KIT_MAX_TOKENS", "modelSettings"} {
+		if !strings.Contains(msg, needle) {
+			t.Errorf("fallback message missing %q:\n%s", needle, msg)
+		}
+	}
+}
@@ -251,6 +251,11 @@ func CreateProvider(ctx context.Context, config *ProviderConfig) (*ProviderResul
 	// via CLI flag or global config.
 	ApplyModelSettings(config, modelInfo)

+	// Auto-raise MaxTokens toward the model's known output ceiling when the
+	// user hasn't explicitly set --max-tokens. Runs after ApplyModelSettings,
+	// so a per-model value is kept only if it already meets the raised target.
+	rightSizeMaxTokens(config, modelInfo)
+
 	// Create the base provider
 	var result *ProviderResult
 	var createErr error
@@ -489,6 +494,37 @@ func validateModelConfig(config *ProviderConfig, modelInfo *ModelInfo) {
 	}
 }

+// defaultRightSizeCap bounds auto-raised MaxTokens so that we don't silently
+// allocate enormous output budgets for models with very high ceilings (e.g.
+// Devstral at 262144, Mistral at 128000). Users who genuinely want more can
+// pass --max-tokens explicitly or set modelSettings[...].maxTokens in config.
+const defaultRightSizeCap = 32768
+
+// rightSizeMaxTokens raises config.MaxTokens toward the model's known output
+// ceiling. It is a no-op unless all of the following hold:
+//   - modelInfo is non-nil and modelInfo.Limit.Output is positive, AND
+//   - isExplicitlySet("max-tokens") is false, i.e. the user did not pin the
+//     value (--max-tokens, KIT_MAX_TOKENS, or max-tokens in config.yaml), AND
+//   - config.MaxTokens is below min(Limit.Output, defaultRightSizeCap).
+//
+// The comparison is purely numeric: a value written earlier by
+// ApplyModelSettings survives only if it already meets that target.
+// NOTE(review): a per-model maxTokens below the target is raised too, unless
+// isExplicitlySet accounts for it — confirm this is intended. The cap bounds
+// budgets on very-large-output models; the raise avoids silent truncation.
+func rightSizeMaxTokens(config *ProviderConfig, modelInfo *ModelInfo) {
+	if modelInfo == nil || modelInfo.Limit.Output <= 0 {
+		return
+	}
+	if isExplicitlySet("max-tokens") {
+		return
+	}
+	target := min(modelInfo.Limit.Output, defaultRightSizeCap)
+	if config.MaxTokens < target {
+		config.MaxTokens = target
+	}
+}
+
 // clearConflictingAnthropicSamplingParams ensures that temperature and top_p are
 // not both sent to the Anthropic API, which rejects requests containing both.
 // When both are set (typically from defaults), top_p is cleared so that
@@ -0,0 +1,148 @@
+package models
+
+import (
+	"testing"
+
+	"github.com/spf13/pflag"
+	"github.com/spf13/viper"
+)
+
+// bindMaxTokensFlag binds a fresh pflag "max-tokens" flag (default 8192) into
+// the global viper and parses args, so isExplicitlySet sees what production
+// would. It fails the test on any error. The returned cleanup calls
+// viper.Reset(), which wipes all global viper state, not just this binding.
+func bindMaxTokensFlag(t *testing.T, args []string) func() {
+	t.Helper()
+	fs := pflag.NewFlagSet("test", pflag.ContinueOnError)
+	fs.Int("max-tokens", 8192, "")
+	if err := viper.BindPFlag("max-tokens", fs.Lookup("max-tokens")); err != nil {
+		t.Fatalf("BindPFlag: %v", err)
+	}
+	if err := fs.Parse(args); err != nil {
+		t.Fatalf("fs.Parse: %v", err)
+	}
+	return func() {
+		viper.Reset()
+	}
+}
+
+func TestRightSizeMaxTokens_RaisesWhenBelowCeiling(t *testing.T) {
+	cleanup := bindMaxTokensFlag(t, nil) // no args → flag.Changed = false
+	defer cleanup()
+
+	config := &ProviderConfig{MaxTokens: 8192}
+	modelInfo := &ModelInfo{
+		ID:    "claude-sonnet-4-5",
+		Limit: Limit{Context: 200000, Output: 64000},
+	}
+
+	rightSizeMaxTokens(config, modelInfo)
+
+	if config.MaxTokens != 32768 {
+		t.Errorf("expected MaxTokens raised to defaultRightSizeCap (32768), got %d", config.MaxTokens)
+	}
+}
+
+func TestRightSizeMaxTokens_CapsAtDefaultRightSizeCap(t *testing.T) {
+	cleanup := bindMaxTokensFlag(t, nil)
+	defer cleanup()
+
+	config := &ProviderConfig{MaxTokens: 8192}
+	// Mistral Devstral has 262144 output — we should still cap at 32768.
+	modelInfo := &ModelInfo{
+		ID:    "devstral-medium-latest",
+		Limit: Limit{Context: 262144, Output: 262144},
+	}
+
+	rightSizeMaxTokens(config, modelInfo)
+
+	if config.MaxTokens != defaultRightSizeCap {
+		t.Errorf("expected MaxTokens capped at %d, got %d", defaultRightSizeCap, config.MaxTokens)
+	}
+}
+
+func TestRightSizeMaxTokens_UsesExactOutputWhenBelowCap(t *testing.T) {
+	cleanup := bindMaxTokensFlag(t, nil)
+	defer cleanup()
+
+	config := &ProviderConfig{MaxTokens: 4096}
+	// Model with output limit smaller than the cap.
+	modelInfo := &ModelInfo{
+		ID:    "gpt-4",
+		Limit: Limit{Context: 8192, Output: 8192},
+	}
+
+	rightSizeMaxTokens(config, modelInfo)
+
+	if config.MaxTokens != 8192 {
+		t.Errorf("expected MaxTokens raised to model output ceiling (8192), got %d", config.MaxTokens)
+	}
+}
+
+func TestRightSizeMaxTokens_DoesNotLowerCurrentValue(t *testing.T) {
+	cleanup := bindMaxTokensFlag(t, nil)
+	defer cleanup()
+
+	// User (via per-model settings, applied earlier) already bumped MaxTokens
+	// above the cap — we must not clobber their choice.
+	config := &ProviderConfig{MaxTokens: 100000}
+	modelInfo := &ModelInfo{
+		ID:    "devstral-medium-latest",
+		Limit: Limit{Context: 262144, Output: 262144},
+	}
+
+	rightSizeMaxTokens(config, modelInfo)
+
+	if config.MaxTokens != 100000 {
+		t.Errorf("expected MaxTokens preserved at 100000, got %d", config.MaxTokens)
+	}
+}
+
+func TestRightSizeMaxTokens_RespectsExplicitFlag(t *testing.T) {
+	// Simulate `--max-tokens 4096` on the command line.
+	cleanup := bindMaxTokensFlag(t, []string{"--max-tokens", "4096"})
+	defer cleanup()
+
+	config := &ProviderConfig{MaxTokens: 4096}
+	modelInfo := &ModelInfo{
+		ID:    "claude-sonnet-4-5",
+		Limit: Limit{Context: 200000, Output: 64000},
+	}
+
+	rightSizeMaxTokens(config, modelInfo)
+
+	if config.MaxTokens != 4096 {
+		t.Errorf("expected explicit --max-tokens to be preserved (4096), got %d", config.MaxTokens)
+	}
+}
+
+func TestRightSizeMaxTokens_NilModelInfo(t *testing.T) {
+	cleanup := bindMaxTokensFlag(t, nil)
+	defer cleanup()
+
+	config := &ProviderConfig{MaxTokens: 8192}
+	// Custom model / Ollama / unknown provider → no model info.
+	rightSizeMaxTokens(config, nil)
+
+	if config.MaxTokens != 8192 {
+		t.Errorf("expected MaxTokens unchanged with nil modelInfo, got %d", config.MaxTokens)
+	}
+}
+
+func TestRightSizeMaxTokens_ZeroOutputLimit(t *testing.T) {
+	cleanup := bindMaxTokensFlag(t, nil)
+	defer cleanup()
+
+	config := &ProviderConfig{MaxTokens: 8192}
+	// Model present in catalog but with no known output limit.
+	modelInfo := &ModelInfo{
+		ID:    "unknown-model",
+		Limit: Limit{Context: 0, Output: 0},
+	}
+
+	rightSizeMaxTokens(config, modelInfo)
+
+	if config.MaxTokens != 8192 {
+		t.Errorf("expected MaxTokens unchanged with zero output limit, got %d", config.MaxTokens)
+	}
+}
@@ -110,6 +110,38 @@ func parseToolArgs(toolArgs string) map[string]any {
 	return nil
 }

+// ---------------------------------------------------------------------------
+// Finish reason constants
+// ---------------------------------------------------------------------------
+
+// Finish reasons reported by the LLM provider on a completed turn. These
+// mirror fantasy.FinishReason string values so comparisons against
+// TurnEndEvent.StopReason / TurnResult.StopReason are stable across
+// providers.
+const (
+	// FinishReasonStop: the model produced a natural stop (e.g. stop sequence
+	// or end-of-turn signal).
+	FinishReasonStop = "stop"
+	// FinishReasonLength: the model hit the configured max_output_tokens
+	// budget. The response is truncated. Surface this to the user and
+	// consider raising --max-tokens / KIT_MAX_TOKENS / modelSettings[...]
+	// .maxTokens.
+	FinishReasonLength = "length"
+	// FinishReasonToolCalls: the model stopped to emit tool calls (normal
+	// mid-turn state during agentic loops).
+	FinishReasonToolCalls = "tool-calls"
+	// FinishReasonContentFilter: the provider's safety filter stopped
+	// generation.
+	FinishReasonContentFilter = "content-filter"
+	// FinishReasonError: the model stopped because of an error.
+	FinishReasonError = "error"
+	// FinishReasonOther: provider-specific reason that doesn't map to any of
+	// the above.
+	FinishReasonOther = "other"
+	// FinishReasonUnknown: the provider didn't report a finish reason.
+	FinishReasonUnknown = "unknown"
+)
+
 // ---------------------------------------------------------------------------
 // Concrete event structs
 // ---------------------------------------------------------------------------
@@ -124,9 +156,13 @@ func (e TurnStartEvent) EventType() EventType { return EventTurnStart }

 // TurnEndEvent fires after the agent finishes processing.
 type TurnEndEvent struct {
-	Response   string
-	Error      error
-	StopReason string // "end_turn", "max_tokens", "tool_use", "error", etc.
+	Response string
+	Error    error
+	// StopReason is the LLM provider's finish reason for the final step of
+	// the turn. Compare against the FinishReason* constants — in particular,
+	// FinishReasonLength indicates the response was truncated because the
+	// agent hit its max_output_tokens budget.
+	StopReason string
 }

 // EventType implements Event.
@@ -1451,8 +1451,9 @@ type TurnResult struct {
 	Response string

 	// StopReason indicates why the turn ended. Derived from the LLM
-	// provider's finish reason: "stop", "length" (max tokens), "tool-calls",
-	// "content-filter", "error", "other", "unknown".
+	// provider's finish reason: FinishReasonStop, FinishReasonLength (max
+	// output tokens reached), FinishReasonToolCalls, FinishReasonContentFilter,
+	// FinishReasonError, FinishReasonOther, FinishReasonUnknown.
 	StopReason string

 	// SessionID is the UUID of the session this turn belongs to.
@@ -2249,6 +2250,35 @@ func (m *Kit) GetTools() []Tool {
 	return m.agent.GetTools()
 }

+// MaxTokens returns the effective max output tokens currently configured for
+// the agent. This is the value actually sent to the LLM provider on each
+// request, after CLI/env/config resolution, per-model overrides, model-aware
+// right-sizing, and any Anthropic thinking-budget adjustments.
+//
+// Returns 0 when the active provider suppresses the max_output_tokens
+// parameter (e.g. OpenAI Codex OAuth) or when no model is configured yet.
+// A non-zero value is the number that will cause a FinishReasonLength
+// truncation if the model tries to generate beyond it.
+func (m *Kit) MaxTokens() int {
+	if m.agent == nil {
+		return 0
+	}
+	return m.agent.GetMaxTokens()
+}
+
+// MaxOutputLimit returns the catalog-reported output ceiling for the current
+// model in tokens, or 0 when the model isn't in the registry (custom models,
+// new releases, Ollama, etc.). Pair with MaxTokens() to detect when the agent
+// is configured well below what the model supports and surface a hint to the
+// user.
+func (m *Kit) MaxOutputLimit() int {
+	info := m.GetModelInfo()
+	if info == nil {
+		return 0
+	}
+	return info.Limit.Output
+}
+
 // extractFileParts returns all FilePart entries from a message's Content.
 // Used to preserve image attachments when replacing user message text.
 func extractFileParts(msg fantasy.Message) []fantasy.FilePart {
@@ -93,7 +93,7 @@ api.OnAgentEnd(func(e ext.AgentEndEvent, ctx ext.Context) {
    // e.Response string
    // e.StopReason string — "error" (on failure), "completed" (when LLM returns
    //   empty stop reason), or the raw LLM provider value passed through
-    //   (e.g. "stop", "end_turn", "max_tokens", "tool_use").
+    //   (e.g. "stop", "length" (max output tokens hit), "tool-calls", "content-filter").
    //   To detect errors, check e.StopReason == "error".
    //   Do NOT compare against "completed" for success — instead check != "error".
 })
@@ -52,7 +52,7 @@ These flags control Kit's behavior. When a prompt is passed as a positional argu

 | Flag | Short | Default | Description |
 |------|-------|---------|-------------|
-| `--max-tokens` | — | `4096` | Maximum tokens in response |
+| `--max-tokens` | — | `8192` | Base cap for output tokens. Auto-raised per-model up to 32768 when the model's catalog ceiling is higher and no explicit value is set. |
 | `--temperature` | — | `0.7` | Randomness 0.0–1.0 |
 | `--top-p` | — | `0.95` | Nucleus sampling 0.0–1.0 |
 | `--top-k` | — | `40` | Limit top K tokens |
@@ -18,7 +18,7 @@ Create `~/.kit.yml`:

 ```yaml
 model: anthropic/claude-sonnet-latest
-max-tokens: 4096
+max-tokens: 8192
 temperature: 0.7
 stream: true
 ```
@@ -28,7 +28,7 @@ stream: true
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
 | `model` | string | `anthropic/claude-sonnet-latest` | Model to use (provider/model format) |
-| `max-tokens` | int | `4096` | Maximum tokens in response |
+| `max-tokens` | int | `8192` | Base cap for output tokens. Auto-raised per-model up to 32768 when the model's catalog ceiling is higher and no explicit value is set. Use [`modelSettings[provider/model].maxTokens`](#per-model-settings) to override per-model. |
 | `temperature` | float | `0.7` | Randomness 0.0–1.0 |
 | `top-p` | float | `0.95` | Nucleus sampling 0.0–1.0 |
 | `top-k` | int | `40` | Limit top K tokens |