fix(ui): accurate context token tracking including cache tokens

- Include all token categories in context fill calculation: InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens
- With Anthropic/kimi prompt caching, InputTokens can be near-zero while CacheReadTokens holds the bulk of the context
- Include OutputTokens since assistant output becomes context next turn
- Remove max-only guard in SetContextTokens so context shrinks after compaction instead of staying stuck at the high-water mark
- Reset context tokens to 0 after compaction in both SDK and UI layers
- Use real API-reported token counts in ShouldCompact() instead of the chars/4 text heuristic which misses system prompts and tool defs
2026-06-13 19:20:06 +00:00 · 2026-04-10 17:05:47 +03:00
parent 83127467c5
commit 2de98d32be
7 changed files with 109 additions and 47 deletions
@@ -1165,9 +1165,10 @@ func (a *App) recordStepUsage(ev kit.StepUsageEvent, stepUsageSeen *atomic.Bool)
 		int(ev.CacheWriteTokens),
 	)
 	// NOTE: We do NOT call SetContextTokens here. Context fill is set once
-	// at turn completion via updateUsageFromTurnResult using FinalUsage.InputTokens,
-	// which reflects the full accumulated context. Per-step context tokens would
-	// cause the display to jump around during multi-step tool calls.
+	// at turn completion via updateUsageFromTurnResult, which sums all token
+	// categories (Input + CacheRead + CacheCreate + Output) from FinalUsage.
+	// Per-step context tokens would cause the display to jump around during
+	// multi-step tool calls.
 }

 // updateUsageFromTurnResult records token usage from an SDK TurnResult into the
@@ -1231,15 +1232,30 @@ func (a *App) updateUsageFromTurnResult(result *kit.TurnResult, userPrompt strin
 	}

 	// --- Context window fill (drives the % bar) ---
-	// Use FinalUsage.InputTokens as the context window fill. The API's InputTokens
-	// already includes the full conversation history (system prompt + all previous
-	// messages + current user message). Adding OutputTokens would double-count since
-	// the output becomes part of the input for the next turn.
-	if result.FinalUsage != nil && result.FinalUsage.InputTokens > 0 {
-		if a.opts.Debug {
-			log.Printf("[DEBUG] updateUsageFromTurnResult: calling SetContextTokens=%d (FinalUsage.InputTokens)",
-				result.FinalUsage.InputTokens)
+	// Calculate context fill from the LAST API call's usage. The context
+	// window is filled by everything sent to and received from the model:
+	//
+	//   InputTokens       — non-cached input (may be small with prompt caching)
+	//   CacheReadTokens   — input tokens served from cache
+	//   CacheCreationTokens — input tokens written to cache this call
+	//   OutputTokens      — assistant output (becomes input next turn)
+	//
+	// With Anthropic prompt caching, InputTokens can drop to near-zero while
+	// CacheReadTokens holds the bulk of the context. We must sum all four to
+	// get the true context window utilization.
+	//
+	// We use FinalUsage (last step only), NOT TotalUsage, because TotalUsage
+	// sums across all tool-calling steps — and each step re-sends the full
+	// conversation, so TotalUsage massively overstates the actual window fill.
+	if result.FinalUsage != nil {
+		u := result.FinalUsage
+		contextFill := int(u.InputTokens) + int(u.CacheReadTokens) + int(u.CacheCreationTokens) + int(u.OutputTokens)
+		if contextFill > 0 {
+			if a.opts.Debug {
+				log.Printf("[DEBUG] updateUsageFromTurnResult: SetContextTokens=%d (Input=%d + CacheRead=%d + CacheCreate=%d + Output=%d)",
+					contextFill, u.InputTokens, u.CacheReadTokens, u.CacheCreationTokens, u.OutputTokens)
+			}
+			a.opts.UsageTracker.SetContextTokens(contextFill)
 		}
-		a.opts.UsageTracker.SetContextTokens(int(result.FinalUsage.InputTokens))
 	}
 }
@@ -630,10 +630,12 @@ func TestUpdateUsageFromTurnResult_recordsWhenInputTokensZero(t *testing.T) {
 	}
 }

-// TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly verifies that context
-// window fill uses InputTokens only (not input+output). The API's InputTokens
-// already includes the full conversation history; adding output would double-count.
-func TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly(t *testing.T) {
+// TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories verifies that
+// context window fill uses all token categories from the final API call:
+// InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens.
+// With Anthropic prompt caching, InputTokens can be near-zero while
+// CacheReadTokens holds the bulk of the context.
+func TestUpdateUsageFromTurnResult_contextTokensUsesAllCategories(t *testing.T) {
 	usage := &usageUpdaterStub{}
 	app := New(Options{UsageTracker: usage}, nil)
 	defer app.Close()
@@ -641,22 +643,26 @@ func TestUpdateUsageFromTurnResult_contextTokensUsesInputOnly(t *testing.T) {
 	app.updateUsageFromTurnResult(&kit.TurnResult{
 		Response: "ok",
 		TotalUsage: &kit.LLMUsage{
-			InputTokens:  1000,
-			OutputTokens: 200,
+			InputTokens:         3,
+			OutputTokens:        5,
+			CacheReadTokens:     0,
+			CacheCreationTokens: 4317,
 		},
 		FinalUsage: &kit.LLMUsage{
-			InputTokens:  1000, // Full context including history
-			OutputTokens: 200,
+			InputTokens:         3,    // Non-cached input (small with caching)
+			OutputTokens:        5,    // Assistant output
+			CacheReadTokens:     0,    // No cache reads on first call
+			CacheCreationTokens: 4317, // System prompt + tools written to cache
 		},
 	}, "prompt", false)

 	usage.mu.Lock()
 	defer usage.mu.Unlock()

-	// Context tokens should be InputTokens only (1000), not input+output (1200)
-	// because InputTokens already includes the full conversation history
-	if usage.contextCalls != 1 || usage.lastContextTokens != 1000 {
-		t.Fatalf("expected context tokens=1000 (InputTokens only), got calls=%d tokens=%d",
-			usage.contextCalls, usage.lastContextTokens)
+	// Context tokens should be Input + CacheRead + CacheCreate + Output = 4325
+	expected := 3 + 0 + 4317 + 5
+	if usage.contextCalls != 1 || usage.lastContextTokens != expected {
+		t.Fatalf("expected context tokens=%d (all categories), got calls=%d tokens=%d",
+			expected, usage.contextCalls, usage.lastContextTokens)
 	}
 }
@@ -21,8 +21,10 @@ type UsageUpdater interface {
 	// the provider does not return exact counts.
 	EstimateAndUpdateUsage(inputText, outputText string)
 	// SetContextTokens records the approximate current context window fill
-	// level. This should be the final API call's input+output tokens (from
-	// FinalResponse.Usage), NOT the aggregate TotalUsage.
+	// level. This should be the sum of ALL token categories from the last
+	// API call: InputTokens + CacheReadTokens + CacheCreationTokens +
+	// OutputTokens. With Anthropic prompt caching, InputTokens can be
+	// near-zero while CacheReadTokens holds the bulk of the context.
 	SetContextTokens(tokens int)
 }

@@ -1820,6 +1820,12 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 		// Refresh content to show the finalized message.
 		m.refreshContent()

+		// Reset context token display — the pre-compaction count is stale.
+		// The next API call will set the accurate post-compaction value.
+		if m.usageTracker != nil {
+			m.usageTracker.SetContextTokens(0)
+		}
+
 		// Print stats as a separate system message.
 		saved := msg.OriginalTokens - msg.CompactedTokens
 		statsMsg := fmt.Sprintf(
@@ -134,23 +134,28 @@ func (ut *UsageTracker) EstimateAndUpdateUsage(inputText, outputText string) {
 }

 // SetContextTokens records the approximate current context window utilization.
-// This should be set from FinalUsage.InputTokens, which already includes the
-// full conversation history (system prompt + all previous messages). Do NOT
-// add OutputTokens as that would double-count (output becomes input next turn).
-// Use FinalResponse.Usage rather than aggregate TotalUsage, because TotalUsage
-// sums across all tool-calling steps and overstates the actual window fill level.
+//
+// The value should include ALL token categories from the last API call:
+//
+//	InputTokens + CacheReadTokens + CacheCreationTokens + OutputTokens
+//
+// With Anthropic prompt caching, InputTokens can be near-zero while
+// CacheReadTokens holds the bulk of the context. All four must be summed
+// to get the true context window fill level.
+//
+// OutputTokens is included because the assistant's output becomes part of
+// the context on the next turn.
+//
+// Use FinalResponse.Usage (last step only) rather than aggregate TotalUsage,
+// because TotalUsage sums across all tool-calling steps and overstates the
+// actual window fill level.
+//
+// The value is set unconditionally (not max-only) so that context shrinks
+// correctly after compaction.
 func (ut *UsageTracker) SetContextTokens(tokens int) {
 	ut.mu.Lock()
 	defer ut.mu.Unlock()
-	// Track the maximum context seen so far. In multi-step tool calls,
-	// FinalUsage.InputTokens may reflect only the last step's input, which
-	// can be smaller than previous steps. We want to show the largest context
-	// the model has processed in this session.
-	if tokens > ut.contextTokens {
-		ut.contextTokens = tokens
-	}
-	// If tokens < current, we keep the larger value (no-op)
-	// This prevents the display from dropping during multi-step tool calls.
+	ut.contextTokens = tokens
 }

 // RenderUsageInfo generates a formatted string displaying current usage statistics
@@ -31,6 +31,11 @@ func (m *Kit) EstimateContextTokens() int {
 // limit and should be compacted.
 // Formula: contextTokens > contextWindow − reserveTokens.
 // Returns false if the model's context limit is unknown.
+//
+// When API-reported token counts are available (after at least one turn),
+// the real count is used instead of the text-based heuristic. This is
+// significantly more accurate because it includes system prompts, tool
+// definitions, and other overhead that the heuristic cannot account for.
 func (m *Kit) ShouldCompact() bool {
 	info := m.GetModelInfo()
 	if info == nil || info.Limit.Context <= 0 {
@@ -42,6 +47,16 @@ func (m *Kit) ShouldCompact() bool {
 		reserveTokens = m.compactionOpts.ReserveTokens
 	}

+	// Prefer the real API-reported token count when available.
+	m.lastInputTokensMu.RLock()
+	realTokens := m.lastInputTokens
+	m.lastInputTokensMu.RUnlock()
+
+	if realTokens > 0 {
+		return realTokens > info.Limit.Context-reserveTokens
+	}
+
+	// Fall back to text-based heuristic before first turn completes.
 	messages := m.session.GetMessages()
 	return compaction.ShouldCompact(convertKitMessagesToFantasy(messages), info.Limit.Context, reserveTokens)
 }
@@ -245,6 +260,14 @@ func (m *Kit) persistAndEmitCompaction(
 	); err != nil {
 		return fmt.Errorf("failed to persist compaction entry: %w", err)
 	}
+
+	// Reset the API-reported token count so GetContextStats() and
+	// ShouldCompact() don't use stale pre-compaction values. The next
+	// API call will set the accurate post-compaction count.
+	m.lastInputTokensMu.Lock()
+	m.lastInputTokens = 0
+	m.lastInputTokensMu.Unlock()
+
 	m.events.emit(CompactionEvent{
 		Summary:         summary,
 		OriginalTokens:  originalTokens,
@@ -1183,9 +1183,11 @@ type TurnResult struct {
 	// report usage.
 	TotalUsage *LLMUsage

-	// FinalUsage is the token usage from the last API call only. Use this
-	// for context window fill estimation (InputTokens + OutputTokens ≈
-	// current context size). Nil if unavailable.
+	// FinalUsage is the token usage from the last API call only. For context
+	// window fill, sum all categories: InputTokens + CacheReadTokens +
+	// CacheCreationTokens + OutputTokens. With prompt caching, InputTokens
+	// alone understates the context (cached tokens are reported separately).
+	// Nil if unavailable.
 	FinalUsage *LLMUsage

 	// Messages is the full updated conversation after the turn, including
@@ -1664,12 +1666,14 @@ func (m *Kit) runTurn(ctx context.Context, promptLabel string, prompt string, pr
 	}

 	// Store the API-reported token count so GetContextStats() matches the
-	// built-in status bar (which uses input + output tokens). The
-	// text-based heuristic misses system prompts, tool definitions, etc.
+	// built-in status bar. The context window is filled by all token
+	// categories: non-cached input, cache reads, cache writes, and output.
+	// With Anthropic prompt caching, InputTokens can be near-zero while
+	// CacheReadTokens/CacheCreationTokens hold the bulk of the context.
 	if result.FinalResponse != nil {
 		u := result.FinalResponse.Usage
 		m.lastInputTokensMu.Lock()
-		m.lastInputTokens = int(u.InputTokens) + int(u.OutputTokens)
+		m.lastInputTokens = int(u.InputTokens) + int(u.CacheReadTokens) + int(u.CacheCreationTokens) + int(u.OutputTokens)
 		m.lastInputTokensMu.Unlock()
 	}