mirror of
https://github.com/mark3labs/kit.git
synced 2026-06-13 19:20:06 +00:00
fix(ui): accurate context token tracking including cache tokens
- Include all token categories in the context fill calculation: InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens.
- With Anthropic/kimi prompt caching, InputTokens can be near-zero while CacheReadTokens holds the bulk of the context.
- Include OutputTokens, since assistant output becomes context on the next turn.
- Remove the max-only guard in SetContextTokens so context shrinks after compaction instead of staying stuck at the high-water mark.
- Reset context tokens to 0 after compaction in both the SDK and UI layers.
- Use real API-reported token counts in ShouldCompact() instead of the chars/4 text heuristic, which misses system prompts and tool defs.
This commit is contained in:
+28
-12
@@ -1165,9 +1165,10 @@ func (a *App) recordStepUsage(ev kit.StepUsageEvent, stepUsageSeen *atomic.Bool)
|
||||
int(ev.CacheWriteTokens),
|
||||
)
|
||||
// NOTE: We do NOT call SetContextTokens here. Context fill is set once
|
||||
// at turn completion via updateUsageFromTurnResult using FinalUsage.InputTokens,
|
||||
// which reflects the full accumulated context. Per-step context tokens would
|
||||
// cause the display to jump around during multi-step tool calls.
|
||||
// at turn completion via updateUsageFromTurnResult, which sums all token
|
||||
// categories (Input + CacheRead + CacheCreate + Output) from FinalUsage.
|
||||
// Per-step context tokens would cause the display to jump around during
|
||||
// multi-step tool calls.
|
||||
}
|
||||
|
||||
// updateUsageFromTurnResult records token usage from an SDK TurnResult into the
|
||||
@@ -1231,15 +1232,30 @@ func (a *App) updateUsageFromTurnResult(result *kit.TurnResult, userPrompt strin
|
||||
}
|
||||
|
||||
// --- Context window fill (drives the % bar) ---
|
||||
// Use FinalUsage.InputTokens as the context window fill. The API's InputTokens
|
||||
// already includes the full conversation history (system prompt + all previous
|
||||
// messages + current user message). Adding OutputTokens would double-count since
|
||||
// the output becomes part of the input for the next turn.
|
||||
if result.FinalUsage != nil && result.FinalUsage.InputTokens > 0 {
|
||||
if a.opts.Debug {
|
||||
log.Printf("[DEBUG] updateUsageFromTurnResult: calling SetContextTokens=%d (FinalUsage.InputTokens)",
|
||||
result.FinalUsage.InputTokens)
|
||||
// Calculate context fill from the LAST API call's usage. The context
|
||||
// window is filled by everything sent to and received from the model:
|
||||
//
|
||||
// InputTokens — non-cached input (may be small with prompt caching)
|
||||
// CacheReadTokens — input tokens served from cache
|
||||
// CacheCreationTokens — input tokens written to cache this call
|
||||
// OutputTokens — assistant output (becomes input next turn)
|
||||
//
|
||||
// With Anthropic prompt caching, InputTokens can drop to near-zero while
|
||||
// CacheReadTokens holds the bulk of the context. We must sum all four to
|
||||
// get the true context window utilization.
|
||||
//
|
||||
// We use FinalUsage (last step only), NOT TotalUsage, because TotalUsage
|
||||
// sums across all tool-calling steps — and each step re-sends the full
|
||||
// conversation, so TotalUsage massively overstates the actual window fill.
|
||||
if result.FinalUsage != nil {
|
||||
u := result.FinalUsage
|
||||
contextFill := int(u.InputTokens) + int(u.CacheReadTokens) + int(u.CacheCreationTokens) + int(u.OutputTokens)
|
||||
if contextFill > 0 {
|
||||
if a.opts.Debug {
|
||||
log.Printf("[DEBUG] updateUsageFromTurnResult: SetContextTokens=%d (Input=%d + CacheRead=%d + CacheCreate=%d + Output=%d)",
|
||||
contextFill, u.InputTokens, u.CacheReadTokens, u.CacheCreationTokens, u.OutputTokens)
|
||||
}
|
||||
a.opts.UsageTracker.SetContextTokens(contextFill)
|
||||
}
|
||||
a.opts.UsageTracker.SetContextTokens(int(result.FinalUsage.InputTokens))
|
||||
}
|
||||
}
|
||||
|
||||
+19
-13
@@ -630,10 +630,12 @@ func TestUpdateUsageFromTurnResult_recordsWhenInputTokensZero(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly verifies that context
|
||||
// window fill uses InputTokens only (not input+output). The API's InputTokens
|
||||
// already includes the full conversation history; adding output would double-count.
|
||||
func TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly(t *testing.T) {
|
||||
// TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories verifies that
|
||||
// context window fill uses all token categories from the final API call:
|
||||
// InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens.
|
||||
// With Anthropic prompt caching, InputTokens can be near-zero while
|
||||
// CacheReadTokens holds the bulk of the context.
|
||||
func TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories(t *testing.T) {
|
||||
usage := &usageUpdaterStub{}
|
||||
app := New(Options{UsageTracker: usage}, nil)
|
||||
defer app.Close()
|
||||
@@ -641,22 +643,26 @@ func TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly(t *testing.T) {
|
||||
app.updateUsageFromTurnResult(&kit.TurnResult{
|
||||
Response: "ok",
|
||||
TotalUsage: &kit.LLMUsage{
|
||||
InputTokens: 1000,
|
||||
OutputTokens: 200,
|
||||
InputTokens: 3,
|
||||
OutputTokens: 5,
|
||||
CacheReadTokens: 0,
|
||||
CacheCreationTokens: 4317,
|
||||
},
|
||||
FinalUsage: &kit.LLMUsage{
|
||||
InputTokens: 1000, // Full context including history
|
||||
OutputTokens: 200,
|
||||
InputTokens: 3, // Non-cached input (small with caching)
|
||||
OutputTokens: 5, // Assistant output
|
||||
CacheReadTokens: 0, // No cache reads on first call
|
||||
CacheCreationTokens: 4317, // System prompt + tools written to cache
|
||||
},
|
||||
}, "prompt", false)
|
||||
|
||||
usage.mu.Lock()
|
||||
defer usage.mu.Unlock()
|
||||
|
||||
// Context tokens should be InputTokens only (1000), not input+output (1200)
|
||||
// because InputTokens already includes the full conversation history
|
||||
if usage.contextCalls != 1 || usage.lastContextTokens != 1000 {
|
||||
t.Fatalf("expected context tokens=1000 (InputTokens only), got calls=%d tokens=%d",
|
||||
usage.contextCalls, usage.lastContextTokens)
|
||||
// Context tokens should be Input + CacheRead + CacheCreate + Output = 4325
|
||||
expected := 3 + 0 + 4317 + 5
|
||||
if usage.contextCalls != 1 || usage.lastContextTokens != expected {
|
||||
t.Fatalf("expected context tokens=%d (all categories), got calls=%d tokens=%d",
|
||||
expected, usage.contextCalls, usage.lastContextTokens)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,8 +21,10 @@ type UsageUpdater interface {
|
||||
// the provider does not return exact counts.
|
||||
EstimateAndUpdateUsage(inputText, outputText string)
|
||||
// SetContextTokens records the approximate current context window fill
|
||||
// level. This should be the final API call's input+output tokens (from
|
||||
// FinalResponse.Usage), NOT the aggregate TotalUsage.
|
||||
// level. This should be the sum of ALL token categories from the last
|
||||
// API call: InputTokens + CacheReadTokens + CacheCreationTokens +
|
||||
// OutputTokens. With Anthropic prompt caching, InputTokens can be
|
||||
// near-zero while CacheReadTokens holds the bulk of the context.
|
||||
SetContextTokens(tokens int)
|
||||
}
|
||||
|
||||
|
||||
@@ -1820,6 +1820,12 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
// Refresh content to show the finalized message.
|
||||
m.refreshContent()
|
||||
|
||||
// Reset context token display — the pre-compaction count is stale.
|
||||
// The next API call will set the accurate post-compaction value.
|
||||
if m.usageTracker != nil {
|
||||
m.usageTracker.SetContextTokens(0)
|
||||
}
|
||||
|
||||
// Print stats as a separate system message.
|
||||
saved := msg.OriginalTokens - msg.CompactedTokens
|
||||
statsMsg := fmt.Sprintf(
|
||||
|
||||
@@ -134,23 +134,28 @@ func (ut *UsageTracker) EstimateAndUpdateUsage(inputText, outputText string) {
|
||||
}
|
||||
|
||||
// SetContextTokens records the approximate current context window utilization.
|
||||
// This should be set from FinalUsage.InputTokens, which already includes the
|
||||
// full conversation history (system prompt + all previous messages). Do NOT
|
||||
// add OutputTokens as that would double-count (output becomes input next turn).
|
||||
// Use FinalResponse.Usage rather than aggregate TotalUsage, because TotalUsage
|
||||
// sums across all tool-calling steps and overstates the actual window fill level.
|
||||
//
|
||||
// The value should include ALL token categories from the last API call:
|
||||
//
|
||||
// InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens
|
||||
//
|
||||
// With Anthropic prompt caching, InputTokens can be near-zero while
|
||||
// CacheReadTokens holds the bulk of the context. All four must be summed
|
||||
// to get the true context window fill level.
|
||||
//
|
||||
// OutputTokens is included because the assistant's output becomes part of
|
||||
// the context on the next turn.
|
||||
//
|
||||
// Use FinalResponse.Usage (last step only) rather than aggregate TotalUsage,
|
||||
// because TotalUsage sums across all tool-calling steps and overstates the
|
||||
// actual window fill level.
|
||||
//
|
||||
// The value is set unconditionally (not max-only) so that context shrinks
|
||||
// correctly after compaction.
|
||||
func (ut *UsageTracker) SetContextTokens(tokens int) {
|
||||
ut.mu.Lock()
|
||||
defer ut.mu.Unlock()
|
||||
// Track the maximum context seen so far. In multi-step tool calls,
|
||||
// FinalUsage.InputTokens may reflect only the last step's input, which
|
||||
// can be smaller than previous steps. We want to show the largest context
|
||||
// the model has processed in this session.
|
||||
if tokens > ut.contextTokens {
|
||||
ut.contextTokens = tokens
|
||||
}
|
||||
// If tokens < current, we keep the larger value (no-op)
|
||||
// This prevents the display from dropping during multi-step tool calls.
|
||||
ut.contextTokens = tokens
|
||||
}
|
||||
|
||||
// RenderUsageInfo generates a formatted string displaying current usage statistics
|
||||
|
||||
@@ -31,6 +31,11 @@ func (m *Kit) EstimateContextTokens() int {
|
||||
// limit and should be compacted.
|
||||
// Formula: contextTokens > contextWindow − reserveTokens.
|
||||
// Returns false if the model's context limit is unknown.
|
||||
//
|
||||
// When API-reported token counts are available (after at least one turn),
|
||||
// the real count is used instead of the text-based heuristic. This is
|
||||
// significantly more accurate because it includes system prompts, tool
|
||||
// definitions, and other overhead that the heuristic cannot account for.
|
||||
func (m *Kit) ShouldCompact() bool {
|
||||
info := m.GetModelInfo()
|
||||
if info == nil || info.Limit.Context <= 0 {
|
||||
@@ -42,6 +47,16 @@ func (m *Kit) ShouldCompact() bool {
|
||||
reserveTokens = m.compactionOpts.ReserveTokens
|
||||
}
|
||||
|
||||
// Prefer the real API-reported token count when available.
|
||||
m.lastInputTokensMu.RLock()
|
||||
realTokens := m.lastInputTokens
|
||||
m.lastInputTokensMu.RUnlock()
|
||||
|
||||
if realTokens > 0 {
|
||||
return realTokens > info.Limit.Context-reserveTokens
|
||||
}
|
||||
|
||||
// Fall back to text-based heuristic before first turn completes.
|
||||
messages := m.session.GetMessages()
|
||||
return compaction.ShouldCompact(convertKitMessagesToFantasy(messages), info.Limit.Context, reserveTokens)
|
||||
}
|
||||
@@ -245,6 +260,14 @@ func (m *Kit) persistAndEmitCompaction(
|
||||
); err != nil {
|
||||
return fmt.Errorf("failed to persist compaction entry: %w", err)
|
||||
}
|
||||
|
||||
// Reset the API-reported token count so GetContextStats() and
|
||||
// ShouldCompact() don't use stale pre-compaction values. The next
|
||||
// API call will set the accurate post-compaction count.
|
||||
m.lastInputTokensMu.Lock()
|
||||
m.lastInputTokens = 0
|
||||
m.lastInputTokensMu.Unlock()
|
||||
|
||||
m.events.emit(CompactionEvent{
|
||||
Summary: summary,
|
||||
OriginalTokens: originalTokens,
|
||||
|
||||
+10
-6
@@ -1183,9 +1183,11 @@ type TurnResult struct {
|
||||
// report usage.
|
||||
TotalUsage *LLMUsage
|
||||
|
||||
// FinalUsage is the token usage from the last API call only. Use this
|
||||
// for context window fill estimation (InputTokens + OutputTokens ≈
|
||||
// current context size). Nil if unavailable.
|
||||
// FinalUsage is the token usage from the last API call only. For context
|
||||
// window fill, sum all categories: InputTokens + CacheReadTokens +
|
||||
// CacheCreationTokens + OutputTokens. With prompt caching, InputTokens
|
||||
// alone understates the context (cached tokens are reported separately).
|
||||
// Nil if unavailable.
|
||||
FinalUsage *LLMUsage
|
||||
|
||||
// Messages is the full updated conversation after the turn, including
|
||||
@@ -1664,12 +1666,14 @@ func (m *Kit) runTurn(ctx context.Context, promptLabel string, prompt string, pr
|
||||
}
|
||||
|
||||
// Store the API-reported token count so GetContextStats() matches the
|
||||
// built-in status bar (which uses input + output tokens). The
|
||||
// text-based heuristic misses system prompts, tool definitions, etc.
|
||||
// built-in status bar. The context window is filled by all token
|
||||
// categories: non-cached input, cache reads, cache writes, and output.
|
||||
// With Anthropic prompt caching, InputTokens can be near-zero while
|
||||
// CacheReadTokens/CacheCreationTokens hold the bulk of the context.
|
||||
if result.FinalResponse != nil {
|
||||
u := result.FinalResponse.Usage
|
||||
m.lastInputTokensMu.Lock()
|
||||
m.lastInputTokens = int(u.InputTokens) + int(u.OutputTokens)
|
||||
m.lastInputTokens = int(u.InputTokens) + int(u.CacheReadTokens) + int(u.CacheCreationTokens) + int(u.OutputTokens)
|
||||
m.lastInputTokensMu.Unlock()
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user