From 2de98d32be6f98b328497f53391e3262ed293c21 Mon Sep 17 00:00:00 2001 From: Ed Zynda Date: Fri, 10 Apr 2026 17:05:47 +0300 Subject: [PATCH] fix(ui): accurate context token tracking including cache tokens - Include all token categories in context fill calculation: InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens - With Anthropic/kimi prompt caching, InputTokens can be near-zero while CacheReadTokens holds the bulk of the context - Include OutputTokens since assistant output becomes context next turn - Remove max-only guard in SetContextTokens so context shrinks after compaction instead of staying stuck at the high-water mark - Reset context tokens to 0 after compaction in both SDK and UI layers - Use real API-reported token counts in ShouldCompact() instead of the chars/4 text heuristic which misses system prompts and tool defs --- internal/app/app.go | 40 +++++++++++++++++++++++++----------- internal/app/app_test.go | 32 +++++++++++++++++------------ internal/app/options.go | 6 ++++-- internal/ui/model.go | 6 ++++++ internal/ui/usage_tracker.go | 33 ++++++++++++++++------------- pkg/kit/compaction.go | 23 +++++++++++++++++++++ pkg/kit/kit.go | 16 +++++++++------ 7 files changed, 109 insertions(+), 47 deletions(-) diff --git a/internal/app/app.go b/internal/app/app.go index c1566743..e414a062 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -1165,9 +1165,10 @@ func (a *App) recordStepUsage(ev kit.StepUsageEvent, stepUsageSeen *atomic.Bool) int(ev.CacheWriteTokens), ) // NOTE: We do NOT call SetContextTokens here. Context fill is set once - // at turn completion via updateUsageFromTurnResult using FinalUsage.InputTokens, - // which reflects the full accumulated context. Per-step context tokens would - // cause the display to jump around during multi-step tool calls. + // at turn completion via updateUsageFromTurnResult, which sums all token + // categories (Input + CacheRead + CacheCreate + Output) from FinalUsage. + // Per-step context tokens would cause the display to jump around during + // multi-step tool calls. } // updateUsageFromTurnResult records token usage from an SDK TurnResult into the @@ -1231,15 +1232,30 @@ func (a *App) updateUsageFromTurnResult(result *kit.TurnResult, userPrompt strin } // --- Context window fill (drives the % bar) --- - // Use FinalUsage.InputTokens as the context window fill. The API's InputTokens - // already includes the full conversation history (system prompt + all previous - // messages + current user message). Adding OutputTokens would double-count since - // the output becomes part of the input for the next turn. - if result.FinalUsage != nil && result.FinalUsage.InputTokens > 0 { - if a.opts.Debug { - log.Printf("[DEBUG] updateUsageFromTurnResult: calling SetContextTokens=%d (FinalUsage.InputTokens)", - result.FinalUsage.InputTokens) + // Calculate context fill from the LAST API call's usage. The context + // window is filled by everything sent to and received from the model: + // + // InputTokens — non-cached input (may be small with prompt caching) + // CacheReadTokens — input tokens served from cache + // CacheCreationTokens — input tokens written to cache this call + // OutputTokens — assistant output (becomes input next turn) + // + // With Anthropic prompt caching, InputTokens can drop to near-zero while + // CacheReadTokens holds the bulk of the context. We must sum all four to + // get the true context window utilization. + // + // We use FinalUsage (last step only), NOT TotalUsage, because TotalUsage + // sums across all tool-calling steps — and each step re-sends the full + // conversation, so TotalUsage massively overstates the actual window fill. + if result.FinalUsage != nil { + u := result.FinalUsage + contextFill := int(u.InputTokens) + int(u.CacheReadTokens) + int(u.CacheCreationTokens) + int(u.OutputTokens) + if contextFill > 0 { + if a.opts.Debug { + log.Printf("[DEBUG] updateUsageFromTurnResult: SetContextTokens=%d (Input=%d + CacheRead=%d + CacheCreate=%d + Output=%d)", + contextFill, u.InputTokens, u.CacheReadTokens, u.CacheCreationTokens, u.OutputTokens) + } + a.opts.UsageTracker.SetContextTokens(contextFill) } - a.opts.UsageTracker.SetContextTokens(int(result.FinalUsage.InputTokens)) } } diff --git a/internal/app/app_test.go b/internal/app/app_test.go index d0096203..fab6b6c6 100644 --- a/internal/app/app_test.go +++ b/internal/app/app_test.go @@ -630,10 +630,12 @@ func TestUpdateUsageFromTurnResult_recordsWhenInputTokensZero(t *testing.T) { } } -// TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly verifies that context -// window fill uses InputTokens only (not input+output). The API's InputTokens -// already includes the full conversation history; adding output would double-count. -func TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly(t *testing.T) { +// TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories verifies that +// context window fill uses all token categories from the final API call: +// InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens. +// With Anthropic prompt caching, InputTokens can be near-zero while +// CacheReadTokens holds the bulk of the context. +func TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories(t *testing.T) { usage := &usageUpdaterStub{} app := New(Options{UsageTracker: usage}, nil) defer app.Close() @@ -641,22 +643,26 @@ func TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly(t *testing.T) { app.updateUsageFromTurnResult(&kit.TurnResult{ Response: "ok", TotalUsage: &kit.LLMUsage{ - InputTokens: 1000, - OutputTokens: 200, + InputTokens: 3, + OutputTokens: 5, + CacheReadTokens: 0, + CacheCreationTokens: 4317, }, FinalUsage: &kit.LLMUsage{ - InputTokens: 1000, // Full context including history - OutputTokens: 200, + InputTokens: 3, // Non-cached input (small with caching) + OutputTokens: 5, // Assistant output + CacheReadTokens: 0, // No cache reads on first call + CacheCreationTokens: 4317, // System prompt + tools written to cache }, }, "prompt", false) usage.mu.Lock() defer usage.mu.Unlock() - // Context tokens should be InputTokens only (1000), not input+output (1200) - // because InputTokens already includes the full conversation history - if usage.contextCalls != 1 || usage.lastContextTokens != 1000 { - t.Fatalf("expected context tokens=1000 (InputTokens only), got calls=%d tokens=%d", - usage.contextCalls, usage.lastContextTokens) + // Context tokens should be Input + CacheRead + CacheCreate + Output = 4325 + expected := 3 + 0 + 4317 + 5 + if usage.contextCalls != 1 || usage.lastContextTokens != expected { + t.Fatalf("expected context tokens=%d (all categories), got calls=%d tokens=%d", + expected, usage.contextCalls, usage.lastContextTokens) } } diff --git a/internal/app/options.go b/internal/app/options.go index 7c1b9a76..8bf983ac 100644 --- a/internal/app/options.go +++ b/internal/app/options.go @@ -21,8 +21,10 @@ type UsageUpdater interface { // the provider does not return exact counts. EstimateAndUpdateUsage(inputText, outputText string) // SetContextTokens records the approximate current context window fill - // level. This should be the final API call's input+output tokens (from - // FinalResponse.Usage), NOT the aggregate TotalUsage. + // level. This should be the sum of ALL token categories from the last + // API call: InputTokens + CacheReadTokens + CacheCreationTokens + + // OutputTokens. With Anthropic prompt caching, InputTokens can be + // near-zero while CacheReadTokens holds the bulk of the context. SetContextTokens(tokens int) } diff --git a/internal/ui/model.go b/internal/ui/model.go index a2373ceb..9c10e589 100644 --- a/internal/ui/model.go +++ b/internal/ui/model.go @@ -1820,6 +1820,12 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) { // Refresh content to show the finalized message. m.refreshContent() + // Reset context token display — the pre-compaction count is stale. + // The next API call will set the accurate post-compaction value. + if m.usageTracker != nil { + m.usageTracker.SetContextTokens(0) + } + // Print stats as a separate system message. saved := msg.OriginalTokens - msg.CompactedTokens statsMsg := fmt.Sprintf( diff --git a/internal/ui/usage_tracker.go b/internal/ui/usage_tracker.go index e0cbfea2..f88f884a 100644 --- a/internal/ui/usage_tracker.go +++ b/internal/ui/usage_tracker.go @@ -134,23 +134,28 @@ func (ut *UsageTracker) EstimateAndUpdateUsage(inputText, outputText string) { } // SetContextTokens records the approximate current context window utilization. -// This should be set from FinalUsage.InputTokens, which already includes the -// full conversation history (system prompt + all previous messages). Do NOT -// add OutputTokens as that would double-count (output becomes input next turn). -// Use FinalResponse.Usage rather than aggregate TotalUsage, because TotalUsage -// sums across all tool-calling steps and overstates the actual window fill level. +// +// The value should include ALL token categories from the last API call: +// +// InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens +// +// With Anthropic prompt caching, InputTokens can be near-zero while +// CacheReadTokens holds the bulk of the context. All four must be summed +// to get the true context window fill level. +// +// OutputTokens is included because the assistant's output becomes part of +// the context on the next turn. +// +// Use FinalResponse.Usage (last step only) rather than aggregate TotalUsage, +// because TotalUsage sums across all tool-calling steps and overstates the +// actual window fill level. +// +// The value is set unconditionally (not max-only) so that context shrinks +// correctly after compaction. func (ut *UsageTracker) SetContextTokens(tokens int) { ut.mu.Lock() defer ut.mu.Unlock() - // Track the maximum context seen so far. In multi-step tool calls, - // FinalUsage.InputTokens may reflect only the last step's input, which - // can be smaller than previous steps. We want to show the largest context - // the model has processed in this session. - if tokens > ut.contextTokens { - ut.contextTokens = tokens - } - // If tokens < current, we keep the larger value (no-op) - // This prevents the display from dropping during multi-step tool calls. + ut.contextTokens = tokens } // RenderUsageInfo generates a formatted string displaying current usage statistics diff --git a/pkg/kit/compaction.go b/pkg/kit/compaction.go index 2420d19f..f99edd25 100644 --- a/pkg/kit/compaction.go +++ b/pkg/kit/compaction.go @@ -31,6 +31,11 @@ func (m *Kit) EstimateContextTokens() int { // limit and should be compacted. // Formula: contextTokens > contextWindow − reserveTokens. // Returns false if the model's context limit is unknown. +// +// When API-reported token counts are available (after at least one turn), +// the real count is used instead of the text-based heuristic. This is +// significantly more accurate because it includes system prompts, tool +// definitions, and other overhead that the heuristic cannot account for. func (m *Kit) ShouldCompact() bool { info := m.GetModelInfo() if info == nil || info.Limit.Context <= 0 { @@ -42,6 +47,16 @@ func (m *Kit) ShouldCompact() bool { reserveTokens = m.compactionOpts.ReserveTokens } + // Prefer the real API-reported token count when available. + m.lastInputTokensMu.RLock() + realTokens := m.lastInputTokens + m.lastInputTokensMu.RUnlock() + + if realTokens > 0 { + return realTokens > info.Limit.Context-reserveTokens + } + + // Fall back to text-based heuristic before first turn completes. messages := m.session.GetMessages() return compaction.ShouldCompact(convertKitMessagesToFantasy(messages), info.Limit.Context, reserveTokens) } @@ -245,6 +260,14 @@ func (m *Kit) persistAndEmitCompaction( ); err != nil { return fmt.Errorf("failed to persist compaction entry: %w", err) } + + // Reset the API-reported token count so GetContextStats() and + // ShouldCompact() don't use stale pre-compaction values. The next + // API call will set the accurate post-compaction count. + m.lastInputTokensMu.Lock() + m.lastInputTokens = 0 + m.lastInputTokensMu.Unlock() + m.events.emit(CompactionEvent{ Summary: summary, OriginalTokens: originalTokens, diff --git a/pkg/kit/kit.go b/pkg/kit/kit.go index 69c87eae..aec56ecb 100644 --- a/pkg/kit/kit.go +++ b/pkg/kit/kit.go @@ -1183,9 +1183,11 @@ type TurnResult struct { // report usage. TotalUsage *LLMUsage - // FinalUsage is the token usage from the last API call only. Use this - // for context window fill estimation (InputTokens + OutputTokens ≈ - // current context size). Nil if unavailable. + // FinalUsage is the token usage from the last API call only. For context + // window fill, sum all categories: InputTokens + CacheReadTokens + + // CacheCreationTokens + OutputTokens. With prompt caching, InputTokens + // alone understates the context (cached tokens are reported separately). + // Nil if unavailable. FinalUsage *LLMUsage // Messages is the full updated conversation after the turn, including @@ -1664,12 +1666,14 @@ func (m *Kit) runTurn(ctx context.Context, promptLabel string, prompt string, pr } // Store the API-reported token count so GetContextStats() matches the - // built-in status bar (which uses input + output tokens). The - // text-based heuristic misses system prompts, tool definitions, etc. + // built-in status bar. The context window is filled by all token + // categories: non-cached input, cache reads, cache writes, and output. + // With Anthropic prompt caching, InputTokens can be near-zero while + // CacheReadTokens/CacheCreationTokens hold the bulk of the context. if result.FinalResponse != nil { u := result.FinalResponse.Usage m.lastInputTokensMu.Lock() - m.lastInputTokens = int(u.InputTokens) + int(u.OutputTokens) + m.lastInputTokens = int(u.InputTokens) + int(u.CacheReadTokens) + int(u.CacheCreationTokens) + int(u.OutputTokens) m.lastInputTokensMu.Unlock() }