fix(ui): accurate context token tracking including cache tokens

- Include all token categories in context fill calculation:
  InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens
- With Anthropic/kimi prompt caching, InputTokens can be near-zero
  while CacheReadTokens holds the bulk of the context
- Include OutputTokens since assistant output becomes context next turn
- Remove max-only guard in SetContextTokens so context shrinks after
  compaction instead of staying stuck at the high-water mark
- Reset context tokens to 0 after compaction in both SDK and UI layers
- Use real API-reported token counts in ShouldCompact() instead of
  the chars/4 text heuristic which misses system prompts and tool defs
This commit is contained in:
Ed Zynda
2026-04-10 17:05:47 +03:00
parent 83127467c5
commit 2de98d32be
7 changed files with 109 additions and 47 deletions
+28 -12
View File
@@ -1165,9 +1165,10 @@ func (a *App) recordStepUsage(ev kit.StepUsageEvent, stepUsageSeen *atomic.Bool)
int(ev.CacheWriteTokens),
)
// NOTE: We do NOT call SetContextTokens here. Context fill is set once
// at turn completion via updateUsageFromTurnResult using FinalUsage.InputTokens,
// which reflects the full accumulated context. Per-step context tokens would
// cause the display to jump around during multi-step tool calls.
// at turn completion via updateUsageFromTurnResult, which sums all token
// categories (Input + CacheRead + CacheCreate + Output) from FinalUsage.
// Per-step context tokens would cause the display to jump around during
// multi-step tool calls.
}
// updateUsageFromTurnResult records token usage from an SDK TurnResult into the
@@ -1231,15 +1232,30 @@ func (a *App) updateUsageFromTurnResult(result *kit.TurnResult, userPrompt strin
}
// --- Context window fill (drives the % bar) ---
// Use FinalUsage.InputTokens as the context window fill. The API's InputTokens
// already includes the full conversation history (system prompt + all previous
// messages + current user message). Adding OutputTokens would double-count since
// the output becomes part of the input for the next turn.
if result.FinalUsage != nil && result.FinalUsage.InputTokens > 0 {
if a.opts.Debug {
log.Printf("[DEBUG] updateUsageFromTurnResult: calling SetContextTokens=%d (FinalUsage.InputTokens)",
result.FinalUsage.InputTokens)
// Calculate context fill from the LAST API call's usage. The context
// window is filled by everything sent to and received from the model:
//
// InputTokens — non-cached input (may be small with prompt caching)
// CacheReadTokens — input tokens served from cache
// CacheCreationTokens — input tokens written to cache this call
// OutputTokens — assistant output (becomes input next turn)
//
// With Anthropic prompt caching, InputTokens can drop to near-zero while
// CacheReadTokens holds the bulk of the context. We must sum all four to
// get the true context window utilization.
//
// We use FinalUsage (last step only), NOT TotalUsage, because TotalUsage
// sums across all tool-calling steps — and each step re-sends the full
// conversation, so TotalUsage massively overstates the actual window fill.
if result.FinalUsage != nil {
u := result.FinalUsage
contextFill := int(u.InputTokens) + int(u.CacheReadTokens) + int(u.CacheCreationTokens) + int(u.OutputTokens)
if contextFill > 0 {
if a.opts.Debug {
log.Printf("[DEBUG] updateUsageFromTurnResult: SetContextTokens=%d (Input=%d + CacheRead=%d + CacheCreate=%d + Output=%d)",
contextFill, u.InputTokens, u.CacheReadTokens, u.CacheCreationTokens, u.OutputTokens)
}
a.opts.UsageTracker.SetContextTokens(contextFill)
}
a.opts.UsageTracker.SetContextTokens(int(result.FinalUsage.InputTokens))
}
}
+19 -13
View File
@@ -630,10 +630,12 @@ func TestUpdateUsageFromTurnResult_recordsWhenInputTokensZero(t *testing.T) {
}
}
// TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly verifies that context
// window fill uses InputTokens only (not input+output). The API's InputTokens
// already includes the full conversation history; adding output would double-count.
func TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly(t *testing.T) {
// TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories verifies that
// context window fill uses all token categories from the final API call:
// InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens.
// With Anthropic prompt caching, InputTokens can be near-zero while
// CacheReadTokens holds the bulk of the context.
func TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories(t *testing.T) {
usage := &usageUpdaterStub{}
app := New(Options{UsageTracker: usage}, nil)
defer app.Close()
@@ -641,22 +643,26 @@ func TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly(t *testing.T) {
app.updateUsageFromTurnResult(&kit.TurnResult{
Response: "ok",
TotalUsage: &kit.LLMUsage{
InputTokens: 1000,
OutputTokens: 200,
InputTokens: 3,
OutputTokens: 5,
CacheReadTokens: 0,
CacheCreationTokens: 4317,
},
FinalUsage: &kit.LLMUsage{
InputTokens: 1000, // Full context including history
OutputTokens: 200,
InputTokens: 3, // Non-cached input (small with caching)
OutputTokens: 5, // Assistant output
CacheReadTokens: 0, // No cache reads on first call
CacheCreationTokens: 4317, // System prompt + tools written to cache
},
}, "prompt", false)
usage.mu.Lock()
defer usage.mu.Unlock()
// Context tokens should be InputTokens only (1000), not input+output (1200)
// because InputTokens already includes the full conversation history
if usage.contextCalls != 1 || usage.lastContextTokens != 1000 {
t.Fatalf("expected context tokens=1000 (InputTokens only), got calls=%d tokens=%d",
usage.contextCalls, usage.lastContextTokens)
// Context tokens should be Input + CacheRead + CacheCreate + Output = 4325
expected := 3 + 0 + 4317 + 5
if usage.contextCalls != 1 || usage.lastContextTokens != expected {
t.Fatalf("expected context tokens=%d (all categories), got calls=%d tokens=%d",
expected, usage.contextCalls, usage.lastContextTokens)
}
}
+4 -2
View File
@@ -21,8 +21,10 @@ type UsageUpdater interface {
// the provider does not return exact counts.
EstimateAndUpdateUsage(inputText, outputText string)
// SetContextTokens records the approximate current context window fill
// level. This should be the final API call's input+output tokens (from
// FinalResponse.Usage), NOT the aggregate TotalUsage.
// level. This should be the sum of ALL token categories from the last
// API call: InputTokens + CacheReadTokens + CacheCreationTokens +
// OutputTokens. With Anthropic prompt caching, InputTokens can be
// near-zero while CacheReadTokens holds the bulk of the context.
SetContextTokens(tokens int)
}
+6
View File
@@ -1820,6 +1820,12 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
// Refresh content to show the finalized message.
m.refreshContent()
// Reset context token display — the pre-compaction count is stale.
// The next API call will set the accurate post-compaction value.
if m.usageTracker != nil {
m.usageTracker.SetContextTokens(0)
}
// Print stats as a separate system message.
saved := msg.OriginalTokens - msg.CompactedTokens
statsMsg := fmt.Sprintf(
+19 -14
View File
@@ -134,23 +134,28 @@ func (ut *UsageTracker) EstimateAndUpdateUsage(inputText, outputText string) {
}
// SetContextTokens records the approximate current context window utilization.
// This should be set from FinalUsage.InputTokens, which already includes the
// full conversation history (system prompt + all previous messages). Do NOT
// add OutputTokens as that would double-count (output becomes input next turn).
// Use FinalResponse.Usage rather than aggregate TotalUsage, because TotalUsage
// sums across all tool-calling steps and overstates the actual window fill level.
//
// The value should include ALL token categories from the last API call:
//
// InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens
//
// With Anthropic prompt caching, InputTokens can be near-zero while
// CacheReadTokens holds the bulk of the context. All four must be summed
// to get the true context window fill level.
//
// OutputTokens is included because the assistant's output becomes part of
// the context on the next turn.
//
// Use FinalResponse.Usage (last step only) rather than aggregate TotalUsage,
// because TotalUsage sums across all tool-calling steps and overstates the
// actual window fill level.
//
// The value is set unconditionally (not max-only) so that context shrinks
// correctly after compaction.
func (ut *UsageTracker) SetContextTokens(tokens int) {
ut.mu.Lock()
defer ut.mu.Unlock()
// Track the maximum context seen so far. In multi-step tool calls,
// FinalUsage.InputTokens may reflect only the last step's input, which
// can be smaller than previous steps. We want to show the largest context
// the model has processed in this session.
if tokens > ut.contextTokens {
ut.contextTokens = tokens
}
// If tokens < current, we keep the larger value (no-op)
// This prevents the display from dropping during multi-step tool calls.
ut.contextTokens = tokens
}
// RenderUsageInfo generates a formatted string displaying current usage statistics
+23
View File
@@ -31,6 +31,11 @@ func (m *Kit) EstimateContextTokens() int {
// limit and should be compacted.
// Formula: contextTokens > contextWindow - reserveTokens.
// Returns false if the model's context limit is unknown.
//
// When API-reported token counts are available (after at least one turn),
// the real count is used instead of the text-based heuristic. This is
// significantly more accurate because it includes system prompts, tool
// definitions, and other overhead that the heuristic cannot account for.
func (m *Kit) ShouldCompact() bool {
info := m.GetModelInfo()
if info == nil || info.Limit.Context <= 0 {
@@ -42,6 +47,16 @@ func (m *Kit) ShouldCompact() bool {
reserveTokens = m.compactionOpts.ReserveTokens
}
// Prefer the real API-reported token count when available.
m.lastInputTokensMu.RLock()
realTokens := m.lastInputTokens
m.lastInputTokensMu.RUnlock()
if realTokens > 0 {
return realTokens > info.Limit.Context-reserveTokens
}
// Fall back to text-based heuristic before first turn completes.
messages := m.session.GetMessages()
return compaction.ShouldCompact(convertKitMessagesToFantasy(messages), info.Limit.Context, reserveTokens)
}
@@ -245,6 +260,14 @@ func (m *Kit) persistAndEmitCompaction(
); err != nil {
return fmt.Errorf("failed to persist compaction entry: %w", err)
}
// Reset the API-reported token count so GetContextStats() and
// ShouldCompact() don't use stale pre-compaction values. The next
// API call will set the accurate post-compaction count.
m.lastInputTokensMu.Lock()
m.lastInputTokens = 0
m.lastInputTokensMu.Unlock()
m.events.emit(CompactionEvent{
Summary: summary,
OriginalTokens: originalTokens,
+10 -6
View File
@@ -1183,9 +1183,11 @@ type TurnResult struct {
// report usage.
TotalUsage *LLMUsage
// FinalUsage is the token usage from the last API call only. Use this
// for context window fill estimation (InputTokens + OutputTokens ≈
// current context size). Nil if unavailable.
// FinalUsage is the token usage from the last API call only. For context
// window fill, sum all categories: InputTokens + CacheReadTokens +
// CacheCreationTokens + OutputTokens. With prompt caching, InputTokens
// alone understates the context (cached tokens are reported separately).
// Nil if unavailable.
FinalUsage *LLMUsage
// Messages is the full updated conversation after the turn, including
@@ -1664,12 +1666,14 @@ func (m *Kit) runTurn(ctx context.Context, promptLabel string, prompt string, pr
}
// Store the API-reported token count so GetContextStats() matches the
// built-in status bar (which uses input + output tokens). The
// text-based heuristic misses system prompts, tool definitions, etc.
// built-in status bar. The context window is filled by all token
// categories: non-cached input, cache reads, cache writes, and output.
// With Anthropic prompt caching, InputTokens can be near-zero while
// CacheReadTokens/CacheCreationTokens hold the bulk of the context.
if result.FinalResponse != nil {
u := result.FinalResponse.Usage
m.lastInputTokensMu.Lock()
m.lastInputTokens = int(u.InputTokens) + int(u.OutputTokens)
m.lastInputTokens = int(u.InputTokens) + int(u.CacheReadTokens) + int(u.CacheCreationTokens) + int(u.OutputTokens)
m.lastInputTokensMu.Unlock()
}