mirror of
https://github.com/mark3labs/kit.git
synced 2026-06-14 03:30:26 +00:00
feat(models): surface and prevent silent max-tokens truncation
- Raise --max-tokens default from 4096 to 8192.
- Auto-raise MaxTokens toward the model's catalog Limit.Output (capped at 32768) when the user hasn't set --max-tokens explicitly and no per-model modelSettings override applied. Prevents silent 4k/8k truncation on models that support 32k-262k output.
- Surface FinishReasonLength at turn end: the app now subscribes to TurnEndEvent and renders a system-message banner explaining the current cap, the model's known ceiling, and how to raise it. Previously the TUI swallowed 'length' stops, producing 'ghost' truncations.
- Export FinishReason* constants on pkg/kit (Stop, Length, ToolCalls, ContentFilter, Error, Other, Unknown) and fix stale comments that used Anthropic-style strings.
- Add Kit.MaxTokens() and Kit.MaxOutputLimit() SDK accessors, backed by Agent.GetMaxTokens() which correctly returns 0 for providers that suppress the param (e.g. Codex OAuth).
- Tests: rightSizeMaxTokens covers 7 paths (raise, cap, exact output below cap, preserve, explicit flag, nil info, zero limit); handleTurnEnd covers length/non-length/nil-sendFn and the fallback message formatter.
- Docs: update configuration.md, cli/flags.md, and kit-extensions skill to reflect the new default and behavior.
This commit is contained in:
+1
-1
@@ -297,7 +297,7 @@ func init() {
|
||||
flags.BoolVar(&noPromptTemplates, "no-prompt-templates", false, "disable prompt template discovery")
|
||||
|
||||
// Model generation parameters
|
||||
flags.IntVar(&maxTokens, "max-tokens", 4096, "maximum number of tokens in the response")
|
||||
flags.IntVar(&maxTokens, "max-tokens", 8192, "maximum number of output tokens per response (auto-raised up to 32768 for models with higher known output limits; see internal/models/embedded_models.json)")
|
||||
flags.Float32Var(&temperature, "temperature", 0.7, "controls randomness in responses (0.0-1.0)")
|
||||
flags.Float32Var(&topP, "top-p", 0.95, "controls diversity via nucleus sampling (0.0-1.0)")
|
||||
flags.Int32Var(&topK, "top-k", 40, "controls diversity by limiting top K tokens to sample from")
|
||||
|
||||
@@ -1025,6 +1025,22 @@ func (a *Agent) GetModel() fantasy.LanguageModel {
|
||||
return a.model
|
||||
}
|
||||
|
||||
// GetMaxTokens returns the effective max output tokens the agent currently
|
||||
// sends to the LLM provider, after per-model defaults, right-sizing, and any
|
||||
// Anthropic thinking-budget adjustments. Returns 0 when no ModelConfig is
|
||||
// attached (e.g. early init) or when the provider suppresses the parameter
|
||||
// (e.g. Codex OAuth), which allows callers to differentiate "default" from
|
||||
// "explicitly capped".
|
||||
func (a *Agent) GetMaxTokens() int {
|
||||
if a.skipMaxOutputTokens {
|
||||
return 0
|
||||
}
|
||||
if a.modelConfig == nil {
|
||||
return 0
|
||||
}
|
||||
return a.modelConfig.MaxTokens
|
||||
}
|
||||
|
||||
// Close closes the agent and cleans up resources.
|
||||
// If MCP tools are still loading in the background, Close waits for them
|
||||
// to finish before closing connections to avoid resource leaks.
|
||||
|
||||
@@ -932,6 +932,8 @@ func (a *App) subscribeSDKEvents(sendFn func(tea.Msg), stepUsageSeen *atomic.Boo
|
||||
Password: resp.Password,
|
||||
Cancelled: resp.Cancelled,
|
||||
}
|
||||
case kit.TurnEndEvent:
|
||||
a.handleTurnEnd(ev, sendFn)
|
||||
}
|
||||
}))
|
||||
|
||||
@@ -942,6 +944,64 @@ func (a *App) subscribeSDKEvents(sendFn func(tea.Msg), stepUsageSeen *atomic.Boo
|
||||
}
|
||||
}
|
||||
|
||||
// handleTurnEnd inspects a turn's final StopReason and surfaces actionable
|
||||
// feedback to the user when the turn ended in a state they can act on.
|
||||
//
|
||||
// Today the only surfaced case is FinishReasonLength — the model hit its
|
||||
// configured max_output_tokens budget and the reply was truncated. Without
|
||||
// this banner the TUI used to swallow the truncation silently, leading to
|
||||
// "ghost" cut-offs with no indication of why.
|
||||
//
|
||||
// Separated from subscribeSDKEvents so tests can exercise it directly via a
|
||||
// stubbed sendFn without standing up a full Kit.
|
||||
func (a *App) handleTurnEnd(ev kit.TurnEndEvent, sendFn func(tea.Msg)) {
|
||||
if sendFn == nil {
|
||||
return
|
||||
}
|
||||
if ev.StopReason != kit.FinishReasonLength {
|
||||
return
|
||||
}
|
||||
sendFn(ExtensionPrintEvent{
|
||||
Level: "info",
|
||||
Text: a.formatMaxTokensTruncatedMessage(),
|
||||
})
|
||||
}
|
||||
|
||||
// formatMaxTokensTruncatedMessage builds the user-facing explanation for a
|
||||
// truncated turn. It reports the active max_output_tokens budget and, when
|
||||
// known, the model's catalog output ceiling so the user can judge how much
|
||||
// headroom is available.
|
||||
func (a *App) formatMaxTokensTruncatedMessage() string {
|
||||
k := a.opts.Kit
|
||||
if k == nil {
|
||||
// Extremely early / test-stub case: still emit a useful generic hint.
|
||||
return "⚠ Response truncated: the model hit the configured max_output_tokens limit. " +
|
||||
"Raise it with --max-tokens N, KIT_MAX_TOKENS=N, or per-model " +
|
||||
"modelSettings[provider/model].maxTokens in config."
|
||||
}
|
||||
current := k.MaxTokens()
|
||||
ceiling := k.MaxOutputLimit()
|
||||
model := k.GetModelString()
|
||||
|
||||
msg := "⚠ Response truncated: "
|
||||
if model != "" {
|
||||
msg += fmt.Sprintf("%s hit the configured max_output_tokens limit", model)
|
||||
} else {
|
||||
msg += "the model hit the configured max_output_tokens limit"
|
||||
}
|
||||
if current > 0 {
|
||||
msg += fmt.Sprintf(" (%d)", current)
|
||||
}
|
||||
msg += "."
|
||||
if ceiling > 0 && current > 0 && ceiling > current {
|
||||
msg += fmt.Sprintf(" This model supports up to %d output tokens.", ceiling)
|
||||
}
|
||||
msg += "\n\nRaise it with --max-tokens N, KIT_MAX_TOKENS=N, " +
|
||||
"or per-model modelSettings[provider/model].maxTokens in your config. " +
|
||||
"Re-run the last prompt after raising it to get the full response."
|
||||
return msg
|
||||
}
|
||||
|
||||
// QuitFromExtension triggers a graceful shutdown. In interactive mode it
|
||||
// sends a tea.QuitMsg to the program so the TUI exits cleanly. In
|
||||
// non-interactive mode it cancels the root context, stopping any in-flight
|
||||
|
||||
@@ -3,10 +3,12 @@ package app
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
tea "charm.land/bubbletea/v2"
|
||||
kit "github.com/mark3labs/kit/pkg/kit"
|
||||
)
|
||||
|
||||
@@ -666,3 +668,94 @@ func TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories(t *testing.T)
|
||||
expected, usage.contextCalls, usage.lastContextTokens)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleTurnEnd_LengthEmitsWarning verifies that when the SDK reports a
|
||||
// FinishReasonLength (max_output_tokens hit), the app surfaces a user-visible
|
||||
// ExtensionPrintEvent with Level="info" so the TUI can render a banner
|
||||
// instead of silently showing a truncated reply.
|
||||
func TestHandleTurnEnd_LengthEmitsWarning(t *testing.T) {
|
||||
app := New(Options{}, nil)
|
||||
defer app.Close()
|
||||
|
||||
var mu sync.Mutex
|
||||
var received []tea.Msg
|
||||
sendFn := func(m tea.Msg) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
received = append(received, m)
|
||||
}
|
||||
|
||||
app.handleTurnEnd(kit.TurnEndEvent{StopReason: kit.FinishReasonLength}, sendFn)
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
if len(received) != 1 {
|
||||
t.Fatalf("expected 1 event on length stop, got %d", len(received))
|
||||
}
|
||||
ev, ok := received[0].(ExtensionPrintEvent)
|
||||
if !ok {
|
||||
t.Fatalf("expected ExtensionPrintEvent, got %T", received[0])
|
||||
}
|
||||
if ev.Level != "info" {
|
||||
t.Errorf("expected Level=info, got %q", ev.Level)
|
||||
}
|
||||
if ev.Text == "" {
|
||||
t.Error("expected non-empty warning text")
|
||||
}
|
||||
if !strings.Contains(ev.Text, "max_output_tokens") {
|
||||
t.Errorf("warning text should mention max_output_tokens, got: %s", ev.Text)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleTurnEnd_NonLengthIgnored verifies that ordinary stop reasons
|
||||
// (stop, tool-calls, error, unknown, "") do not produce a warning banner.
|
||||
func TestHandleTurnEnd_NonLengthIgnored(t *testing.T) {
|
||||
app := New(Options{}, nil)
|
||||
defer app.Close()
|
||||
|
||||
reasons := []string{
|
||||
kit.FinishReasonStop,
|
||||
kit.FinishReasonToolCalls,
|
||||
kit.FinishReasonError,
|
||||
kit.FinishReasonContentFilter,
|
||||
kit.FinishReasonOther,
|
||||
kit.FinishReasonUnknown,
|
||||
"",
|
||||
}
|
||||
for _, r := range reasons {
|
||||
var called bool
|
||||
app.handleTurnEnd(kit.TurnEndEvent{StopReason: r}, func(m tea.Msg) {
|
||||
called = true
|
||||
})
|
||||
if called {
|
||||
t.Errorf("stop reason %q unexpectedly emitted a warning", r)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleTurnEnd_NilSendFn guards against panics when no TUI listener is
|
||||
// attached (e.g. early init or headless teardown).
|
||||
func TestHandleTurnEnd_NilSendFn(t *testing.T) {
|
||||
app := New(Options{}, nil)
|
||||
defer app.Close()
|
||||
|
||||
// Should not panic with a nil sendFn.
|
||||
app.handleTurnEnd(kit.TurnEndEvent{StopReason: kit.FinishReasonLength}, nil)
|
||||
}
|
||||
|
||||
// TestFormatMaxTokensTruncatedMessage_NoKit verifies the fallback message
|
||||
// when Options.Kit is nil (test/stub path).
|
||||
func TestFormatMaxTokensTruncatedMessage_NoKit(t *testing.T) {
|
||||
app := New(Options{}, nil)
|
||||
defer app.Close()
|
||||
|
||||
msg := app.formatMaxTokensTruncatedMessage()
|
||||
if msg == "" {
|
||||
t.Fatal("expected non-empty fallback message")
|
||||
}
|
||||
for _, needle := range []string{"max_output_tokens", "--max-tokens", "KIT_MAX_TOKENS", "modelSettings"} {
|
||||
if !strings.Contains(msg, needle) {
|
||||
t.Errorf("fallback message missing %q:\n%s", needle, msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,6 +251,11 @@ func CreateProvider(ctx context.Context, config *ProviderConfig) (*ProviderResul
|
||||
// via CLI flag or global config.
|
||||
ApplyModelSettings(config, modelInfo)
|
||||
|
||||
// Auto-raise MaxTokens toward the model's known output ceiling when the
|
||||
// user hasn't explicitly set --max-tokens and no per-model override
|
||||
// applied. Runs after ApplyModelSettings so explicit modelSettings win.
|
||||
rightSizeMaxTokens(config, modelInfo)
|
||||
|
||||
// Create the base provider
|
||||
var result *ProviderResult
|
||||
var createErr error
|
||||
@@ -489,6 +494,37 @@ func validateModelConfig(config *ProviderConfig, modelInfo *ModelInfo) {
|
||||
}
|
||||
}
|
||||
|
||||
// defaultRightSizeCap bounds auto-raised MaxTokens so that we don't silently
|
||||
// allocate enormous output budgets for models with very high ceilings (e.g.
|
||||
// Devstral at 262144, Mistral at 128000). Users who genuinely want more can
|
||||
// pass --max-tokens explicitly or set modelSettings[...].maxTokens in config.
|
||||
const defaultRightSizeCap = 32768
|
||||
|
||||
// rightSizeMaxTokens raises config.MaxTokens toward the model's known output
|
||||
// ceiling when:
|
||||
// - the user has not explicitly set --max-tokens (or the KIT_MAX_TOKENS env
|
||||
// var, or the top-level max-tokens key in config.yaml), AND
|
||||
// - no per-model override already bumped MaxTokens (ApplyModelSettings runs
|
||||
// before this function), AND
|
||||
// - modelInfo.Limit.Output is known and larger than the current MaxTokens.
|
||||
//
|
||||
// The raised value is capped at defaultRightSizeCap to keep accidental
|
||||
// allocations reasonable on very-large-output models. This prevents the
|
||||
// common "ghost" where the agent's reply is silently truncated at the 8192
|
||||
// default even though the selected model supports 64k or 262k output tokens.
|
||||
func rightSizeMaxTokens(config *ProviderConfig, modelInfo *ModelInfo) {
|
||||
if modelInfo == nil || modelInfo.Limit.Output <= 0 {
|
||||
return
|
||||
}
|
||||
if isExplicitlySet("max-tokens") {
|
||||
return
|
||||
}
|
||||
target := min(modelInfo.Limit.Output, defaultRightSizeCap)
|
||||
if config.MaxTokens < target {
|
||||
config.MaxTokens = target
|
||||
}
|
||||
}
|
||||
|
||||
// clearConflictingAnthropicSamplingParams ensures that temperature and top_p are
|
||||
// not both sent to the Anthropic API, which rejects requests containing both.
|
||||
// When both are set (typically from defaults), top_p is cleared so that
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
package models
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
"github.com/spf13/viper"
|
||||
)
|
||||
|
||||
// bindMaxTokensFlag wires a fresh pflag-backed "max-tokens" key into viper so
|
||||
// isExplicitlySet behaves the same way it does in production. Returns a
|
||||
// cleanup function that removes the binding so sibling tests see a clean
|
||||
// state.
|
||||
func bindMaxTokensFlag(t *testing.T, args []string) func() {
|
||||
t.Helper()
|
||||
fs := pflag.NewFlagSet("test", pflag.ContinueOnError)
|
||||
fs.Int("max-tokens", 8192, "")
|
||||
if err := viper.BindPFlag("max-tokens", fs.Lookup("max-tokens")); err != nil {
|
||||
t.Fatalf("BindPFlag: %v", err)
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
t.Fatalf("fs.Parse: %v", err)
|
||||
}
|
||||
return func() {
|
||||
viper.Reset()
|
||||
}
|
||||
}
|
||||
|
||||
func TestRightSizeMaxTokens_RaisesWhenBelowCeiling(t *testing.T) {
|
||||
cleanup := bindMaxTokensFlag(t, nil) // no args → flag.Changed = false
|
||||
defer cleanup()
|
||||
|
||||
config := &ProviderConfig{MaxTokens: 8192}
|
||||
modelInfo := &ModelInfo{
|
||||
ID: "claude-sonnet-4-5",
|
||||
Limit: Limit{Context: 200000, Output: 64000},
|
||||
}
|
||||
|
||||
rightSizeMaxTokens(config, modelInfo)
|
||||
|
||||
if config.MaxTokens != 32768 {
|
||||
t.Errorf("expected MaxTokens raised to defaultRightSizeCap (32768), got %d", config.MaxTokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRightSizeMaxTokens_CapsAtDefaultRightSizeCap(t *testing.T) {
|
||||
cleanup := bindMaxTokensFlag(t, nil)
|
||||
defer cleanup()
|
||||
|
||||
config := &ProviderConfig{MaxTokens: 8192}
|
||||
// Mistral Devstral has 262144 output — we should still cap at 32768.
|
||||
modelInfo := &ModelInfo{
|
||||
ID: "devstral-medium-latest",
|
||||
Limit: Limit{Context: 262144, Output: 262144},
|
||||
}
|
||||
|
||||
rightSizeMaxTokens(config, modelInfo)
|
||||
|
||||
if config.MaxTokens != defaultRightSizeCap {
|
||||
t.Errorf("expected MaxTokens capped at %d, got %d", defaultRightSizeCap, config.MaxTokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRightSizeMaxTokens_UsesExactOutputWhenBelowCap(t *testing.T) {
|
||||
cleanup := bindMaxTokensFlag(t, nil)
|
||||
defer cleanup()
|
||||
|
||||
config := &ProviderConfig{MaxTokens: 4096}
|
||||
// Model with output limit smaller than the cap.
|
||||
modelInfo := &ModelInfo{
|
||||
ID: "gpt-4",
|
||||
Limit: Limit{Context: 8192, Output: 8192},
|
||||
}
|
||||
|
||||
rightSizeMaxTokens(config, modelInfo)
|
||||
|
||||
if config.MaxTokens != 8192 {
|
||||
t.Errorf("expected MaxTokens raised to model output ceiling (8192), got %d", config.MaxTokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRightSizeMaxTokens_DoesNotLowerCurrentValue(t *testing.T) {
|
||||
cleanup := bindMaxTokensFlag(t, nil)
|
||||
defer cleanup()
|
||||
|
||||
// User (via per-model settings, applied earlier) already bumped MaxTokens
|
||||
// above the cap — we must not clobber their choice.
|
||||
config := &ProviderConfig{MaxTokens: 100000}
|
||||
modelInfo := &ModelInfo{
|
||||
ID: "devstral-medium-latest",
|
||||
Limit: Limit{Context: 262144, Output: 262144},
|
||||
}
|
||||
|
||||
rightSizeMaxTokens(config, modelInfo)
|
||||
|
||||
if config.MaxTokens != 100000 {
|
||||
t.Errorf("expected MaxTokens preserved at 100000, got %d", config.MaxTokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRightSizeMaxTokens_RespectsExplicitFlag(t *testing.T) {
|
||||
// Simulate `--max-tokens 4096` on the command line.
|
||||
cleanup := bindMaxTokensFlag(t, []string{"--max-tokens", "4096"})
|
||||
defer cleanup()
|
||||
|
||||
config := &ProviderConfig{MaxTokens: 4096}
|
||||
modelInfo := &ModelInfo{
|
||||
ID: "claude-sonnet-4-5",
|
||||
Limit: Limit{Context: 200000, Output: 64000},
|
||||
}
|
||||
|
||||
rightSizeMaxTokens(config, modelInfo)
|
||||
|
||||
if config.MaxTokens != 4096 {
|
||||
t.Errorf("expected explicit --max-tokens to be preserved (4096), got %d", config.MaxTokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRightSizeMaxTokens_NilModelInfo(t *testing.T) {
|
||||
cleanup := bindMaxTokensFlag(t, nil)
|
||||
defer cleanup()
|
||||
|
||||
config := &ProviderConfig{MaxTokens: 8192}
|
||||
// Custom model / Ollama / unknown provider → no model info.
|
||||
rightSizeMaxTokens(config, nil)
|
||||
|
||||
if config.MaxTokens != 8192 {
|
||||
t.Errorf("expected MaxTokens unchanged with nil modelInfo, got %d", config.MaxTokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRightSizeMaxTokens_ZeroOutputLimit(t *testing.T) {
|
||||
cleanup := bindMaxTokensFlag(t, nil)
|
||||
defer cleanup()
|
||||
|
||||
config := &ProviderConfig{MaxTokens: 8192}
|
||||
// Model present in catalog but with no known output limit.
|
||||
modelInfo := &ModelInfo{
|
||||
ID: "unknown-model",
|
||||
Limit: Limit{Context: 0, Output: 0},
|
||||
}
|
||||
|
||||
rightSizeMaxTokens(config, modelInfo)
|
||||
|
||||
if config.MaxTokens != 8192 {
|
||||
t.Errorf("expected MaxTokens unchanged with zero output limit, got %d", config.MaxTokens)
|
||||
}
|
||||
}
|
||||
+39
-3
@@ -110,6 +110,38 @@ func parseToolArgs(toolArgs string) map[string]any {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Finish reason constants
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Finish reasons a turn can end with, as reported by the LLM provider. The
// values mirror the fantasy.FinishReason strings, so they can be compared
// directly against TurnEndEvent.StopReason and TurnResult.StopReason
// regardless of provider.
const (
	// FinishReasonStop means generation ended naturally (a stop sequence or
	// an end-of-turn signal).
	FinishReasonStop = "stop"

	// FinishReasonLength means the max_output_tokens budget was reached and
	// the response is truncated. Worth surfacing to the user, who can raise
	// --max-tokens / KIT_MAX_TOKENS / modelSettings[...].maxTokens.
	FinishReasonLength = "length"

	// FinishReasonToolCalls means the model paused to emit tool calls, which
	// is the normal mid-turn state in an agentic loop.
	FinishReasonToolCalls = "tool-calls"

	// FinishReasonContentFilter means the provider's safety filter halted
	// generation.
	FinishReasonContentFilter = "content-filter"

	// FinishReasonError means generation stopped because of an error.
	FinishReasonError = "error"

	// FinishReasonOther covers provider-specific reasons that match none of
	// the values above.
	FinishReasonOther = "other"

	// FinishReasonUnknown means the provider reported no finish reason.
	FinishReasonUnknown = "unknown"
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Concrete event structs
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -124,9 +156,13 @@ func (e TurnStartEvent) EventType() EventType { return EventTurnStart }
|
||||
|
||||
// TurnEndEvent fires after the agent finishes processing.
|
||||
type TurnEndEvent struct {
|
||||
Response string
|
||||
Error error
|
||||
StopReason string // "end_turn", "max_tokens", "tool_use", "error", etc.
|
||||
Response string
|
||||
Error error
|
||||
// StopReason is the LLM provider's finish reason for the final step of
|
||||
// the turn. Compare against the FinishReason* constants — in particular,
|
||||
// FinishReasonLength indicates the response was truncated because the
|
||||
// agent hit its max_output_tokens budget.
|
||||
StopReason string
|
||||
}
|
||||
|
||||
// EventType implements Event.
|
||||
|
||||
+32
-2
@@ -1451,8 +1451,9 @@ type TurnResult struct {
|
||||
Response string
|
||||
|
||||
// StopReason indicates why the turn ended. Derived from the LLM
|
||||
// provider's finish reason: "stop", "length" (max tokens), "tool-calls",
|
||||
// "content-filter", "error", "other", "unknown".
|
||||
// provider's finish reason: FinishReasonStop, FinishReasonLength (max
|
||||
// output tokens reached), FinishReasonToolCalls, FinishReasonContentFilter,
|
||||
// FinishReasonError, FinishReasonOther, FinishReasonUnknown.
|
||||
StopReason string
|
||||
|
||||
// SessionID is the UUID of the session this turn belongs to.
|
||||
@@ -2249,6 +2250,35 @@ func (m *Kit) GetTools() []Tool {
|
||||
return m.agent.GetTools()
|
||||
}
|
||||
|
||||
// MaxTokens returns the effective max output tokens currently configured for
|
||||
// the agent. This is the value actually sent to the LLM provider on each
|
||||
// request, after CLI/env/config resolution, per-model overrides, model-aware
|
||||
// right-sizing, and any Anthropic thinking-budget adjustments.
|
||||
//
|
||||
// Returns 0 when the active provider suppresses the max_output_tokens
|
||||
// parameter (e.g. OpenAI Codex OAuth) or when no model is configured yet.
|
||||
// A non-zero value is the number that will cause a FinishReasonLength
|
||||
// truncation if the model tries to generate beyond it.
|
||||
func (m *Kit) MaxTokens() int {
|
||||
if m.agent == nil {
|
||||
return 0
|
||||
}
|
||||
return m.agent.GetMaxTokens()
|
||||
}
|
||||
|
||||
// MaxOutputLimit returns the catalog-reported output ceiling for the current
|
||||
// model in tokens, or 0 when the model isn't in the registry (custom models,
|
||||
// new releases, Ollama, etc.). Pair with MaxTokens() to detect when the agent
|
||||
// is configured well below what the model supports and surface a hint to the
|
||||
// user.
|
||||
func (m *Kit) MaxOutputLimit() int {
|
||||
info := m.GetModelInfo()
|
||||
if info == nil {
|
||||
return 0
|
||||
}
|
||||
return info.Limit.Output
|
||||
}
|
||||
|
||||
// extractFileParts returns all FilePart entries from a message's Content.
|
||||
// Used to preserve image attachments when replacing user message text.
|
||||
func extractFileParts(msg fantasy.Message) []fantasy.FilePart {
|
||||
|
||||
@@ -93,7 +93,7 @@ api.OnAgentEnd(func(e ext.AgentEndEvent, ctx ext.Context) {
|
||||
// e.Response string
|
||||
// e.StopReason string — "error" (on failure), "completed" (when LLM returns
|
||||
// empty stop reason), or the raw LLM provider value passed through
|
||||
// (e.g. "stop", "end_turn", "max_tokens", "tool_use").
|
||||
// (e.g. "stop", "length" (max output tokens hit), "tool-calls", "content-filter").
|
||||
// To detect errors, check e.StopReason == "error".
|
||||
// Do NOT compare against "completed" for success — instead check != "error".
|
||||
})
|
||||
|
||||
@@ -52,7 +52,7 @@ These flags control Kit's behavior. When a prompt is passed as a positional argu
|
||||
|
||||
| Flag | Short | Default | Description |
|
||||
|------|-------|---------|-------------|
|
||||
| `--max-tokens` | — | `4096` | Maximum tokens in response |
|
||||
| `--max-tokens` | — | `8192` | Base cap for output tokens. Auto-raised per-model up to 32768 when the model's catalog ceiling is higher and no explicit value is set. |
|
||||
| `--temperature` | — | `0.7` | Randomness 0.0–1.0 |
|
||||
| `--top-p` | — | `0.95` | Nucleus sampling 0.0–1.0 |
|
||||
| `--top-k` | — | `40` | Limit top K tokens |
|
||||
|
||||
@@ -18,7 +18,7 @@ Create `~/.kit.yml`:
|
||||
|
||||
```yaml
|
||||
model: anthropic/claude-sonnet-latest
|
||||
max-tokens: 4096
|
||||
max-tokens: 8192
|
||||
temperature: 0.7
|
||||
stream: true
|
||||
```
|
||||
@@ -28,7 +28,7 @@ stream: true
|
||||
| Key | Type | Default | Description |
|
||||
|-----|------|---------|-------------|
|
||||
| `model` | string | `anthropic/claude-sonnet-latest` | Model to use (provider/model format) |
|
||||
| `max-tokens` | int | `4096` | Maximum tokens in response |
|
||||
| `max-tokens` | int | `8192` | Base cap for output tokens. Auto-raised per-model up to 32768 when the model's catalog ceiling is higher and no explicit value is set. Use [`modelSettings[provider/model].maxTokens`](#per-model-settings) to override per-model. |
|
||||
| `temperature` | float | `0.7` | Randomness 0.0–1.0 |
|
||||
| `top-p` | float | `0.95` | Nucleus sampling 0.0–1.0 |
|
||||
| `top-k` | int | `40` | Limit top K tokens |
|
||||
|
||||
Reference in New Issue
Block a user