From c5b75674a31de23d7fc694c161c320d496b3ec0c Mon Sep 17 00:00:00 2001
From: Ed Zynda <ezynda3@gmail.com>
Date: Thu, 26 Feb 2026 16:10:43 +0300
Subject: [PATCH] fix: token usage tracking with fantasy and sticky display
 across all visual modes

- Fix context percentage: use FinalResponse.Usage (last API call) instead of
  TotalUsage (sum of all tool-calling steps), which overstated the context
  fill level
- Fix token count: display current context window tokens, not cumulative session
  total, so the number and percentage tell a consistent story
- Fix script mode double-counting: app.updateUsage already updates the shared
  tracker before sending StepCompleteEvent, so remove redundant
  UpdateUsageFromResponse call
- Add sticky usage display in TUI: render it in the View() layout between the
  stream and the separator, instead of printing it via tea.Println, so it
  updates in place
- Add usage display for non-interactive --prompt mode (non-quiet)
- Add SetContextTokens to UsageUpdater interface for separating billing tokens
  (TotalUsage) from context utilization (FinalResponse.Usage)
---
 cmd/root.go                              | 11 +++-
 cmd/script.go                            |  8 +--
 internal/app/app.go                      | 16 ++++++
 internal/app/options.go                  |  6 +++
 internal/ui/cli.go                       |  6 ++-
 internal/ui/model.go                     | 37 +++++++++++--
 internal/ui/usage_tracker.go             | 67 +++++++++++++++---------
 internal/ui/usage_tracker_render_test.go |  4 +-
 8 files changed, 119 insertions(+), 36 deletions(-)

diff --git a/cmd/root.go b/cmd/root.go
index a3de7bd7..95e894ec 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -755,11 +755,20 @@ func runNormalMode(ctx context.Context) error {
 //
 // When --no-exit is set, after RunOnce completes the interactive BubbleTea TUI
 // is started so the user can continue the conversation.
-func runNonInteractiveModeApp(ctx context.Context, appInstance *app.App, prompt string, _, noExit bool, modelName, providerName, loadingMessage string, serverNames, toolNames []string, usageTracker *ui.UsageTracker) error {
+func runNonInteractiveModeApp(ctx context.Context, appInstance *app.App, prompt string, quiet, noExit bool, modelName, providerName, loadingMessage string, serverNames, toolNames []string, usageTracker *ui.UsageTracker) error {
 	if err := appInstance.RunOnce(ctx, prompt); err != nil {
 		return err
 	}
 
+	// Display token usage after the response (unless quiet mode suppresses output).
+	// The app layer has already updated the tracker inside RunOnce.
+	if !quiet && usageTracker != nil {
+		usageInfo := usageTracker.RenderUsageInfo()
+		if usageInfo != "" {
+			fmt.Println(usageInfo)
+		}
+	}
+
 	// If --no-exit was requested, hand off to the interactive TUI.
 	if noExit {
 		return runInteractiveModeBubbleTea(ctx, appInstance, modelName, providerName, loadingMessage, serverNames, toolNames, usageTracker)
diff --git a/cmd/script.go b/cmd/script.go
index dd369bfb..d487fe95 100644
--- a/cmd/script.go
+++ b/cmd/script.go
@@ -851,10 +851,10 @@ func (h *scriptEventHandler) Handle(msg tea.Msg) {
 			_ = h.cli.DisplayAssistantMessageWithModel(responseText, h.modelName)
 		}
 
-		// Update and display usage.
-		if e.Response != nil {
-			h.cli.UpdateUsageFromResponse(e.Response, "")
-		}
+		// Display usage. The app layer has already updated the shared
+		// UsageTracker (via app.updateUsage in RunOnceWithDisplay) before
+		// sending this event, so we only need to render — calling
+		// UpdateUsageFromResponse here would double-count.
 		h.cli.DisplayUsageAfterResponse()
 
 		// Reset for next step in the agentic loop.
diff --git a/internal/app/app.go b/internal/app/app.go
index 02bacbe0..004e00d9 100644
--- a/internal/app/app.go
+++ b/internal/app/app.go
@@ -419,6 +419,11 @@ func (a *App) sendEvent(msg tea.Msg) {
 // updateUsage records token usage from a completed agent step into the configured
 // UsageTracker (if any). It uses the actual token counts from the agent result's
 // TotalUsage field when available; otherwise it falls back to text-based estimation.
+//
+// TotalUsage is the sum across all tool-calling steps in a single agent run and
+// is used for session cost tracking. For context window utilization we use the
+// final response's per-call usage (FinalResponse.Usage) which reflects the actual
+// context size at the last API call.
 func (a *App) updateUsage(result *agent.GenerateWithLoopResult, userPrompt string) {
 	if a.opts.UsageTracker == nil || result == nil {
 		return
@@ -438,5 +443,16 @@ func (a *App) updateUsage(result *agent.GenerateWithLoopResult, userPrompt strin
 			responseText = result.FinalResponse.Content.Text()
 		}
 		a.opts.UsageTracker.EstimateAndUpdateUsage(userPrompt, responseText)
+		return // EstimateAndUpdateUsage already sets context tokens internally
+	}
+
+	// Set context window utilization from the final API call's per-step usage.
+	// FinalResponse.Usage represents the last step only (not the aggregate),
+	// so input+output there reflects the actual context fill level.
+	if result.FinalResponse != nil {
+		fu := result.FinalResponse.Usage
+		if ct := int(fu.InputTokens) + int(fu.OutputTokens); ct > 0 {
+			a.opts.UsageTracker.SetContextTokens(ct)
+		}
 	}
 }
diff --git a/internal/app/options.go b/internal/app/options.go
index c563ae82..9b9cc8f8 100644
--- a/internal/app/options.go
+++ b/internal/app/options.go
@@ -32,10 +32,16 @@ type AgentRunner interface {
 // in cmd/root.go, which can import both packages.
 type UsageUpdater interface {
 	// UpdateUsage records actual token counts returned by the provider.
+	// The counts come from fantasy's TotalUsage (aggregate across all steps
+	// in a multi-step tool-calling run) and are used for session cost tracking.
 	UpdateUsage(inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens int)
 	// EstimateAndUpdateUsage falls back to text-based token estimation when
 	// the provider does not return exact counts.
 	EstimateAndUpdateUsage(inputText, outputText string)
+	// SetContextTokens records the approximate current context window fill
+	// level. This should be the final API call's input+output tokens (from
+	// FinalResponse.Usage), NOT the aggregate TotalUsage.
+	SetContextTokens(tokens int)
 }
 
 // Options configures an App instance. It mirrors the fields from AgenticLoopConfig
diff --git a/internal/ui/cli.go b/internal/ui/cli.go
index 1b2abfe9..459fe72a 100644
--- a/internal/ui/cli.go
+++ b/internal/ui/cli.go
@@ -262,8 +262,12 @@ func (c *CLI) UpdateUsageFromResponse(response *fantasy.Response, inputText stri
 		cacheReadTokens := int(usage.CacheReadTokens)
 		cacheWriteTokens := int(usage.CacheCreationTokens)
 		c.usageTracker.UpdateUsage(inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens)
+		// Per-response usage is a single API call, so it represents the
+		// actual context window fill level.
+		c.usageTracker.SetContextTokens(inputTokens + outputTokens)
 	} else {
-		// Fallback to estimation if no metadata is available
+		// Fallback to estimation if no metadata is available.
+		// EstimateAndUpdateUsage sets context tokens internally.
 		c.usageTracker.EstimateAndUpdateUsage(inputText, response.Content.Text())
 	}
 }
diff --git a/internal/ui/model.go b/internal/ui/model.go
index d9b85714..47ae15f7 100644
--- a/internal/ui/model.go
+++ b/internal/ui/model.go
@@ -433,7 +433,9 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 
 	case app.StepCompleteEvent:
 		// Flush any remaining streamed text to scrollback, then reset stream
-		// and return to input state.
+		// and return to input state. Token usage is rendered as a sticky
+		// element in View() — the app layer has already updated the shared
+		// UsageTracker before sending this event.
 		cmds = append(cmds, m.flushStreamContent())
 		if m.stream != nil {
 			m.stream.Reset()
@@ -479,13 +481,21 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 }
 
 // View implements tea.Model. It renders the stacked layout:
-// stream region + separator + [queued messages] + input region.
+// stream region + [usage info] + separator + [queued messages] + input region.
 func (m *AppModel) View() tea.View {
 	streamView := m.renderStream()
 	separator := m.renderSeparator()
 	inputView := m.renderInput()
 
-	parts := []string{streamView, separator}
+	parts := []string{streamView}
+
+	// Sticky usage info sits between the stream and separator so it is
+	// always visible at the bottom of the messages area and updates in place.
+	if usageView := m.renderUsageInfo(); usageView != "" {
+		parts = append(parts, usageView)
+	}
+
+	parts = append(parts, separator)
 
 	if queuedView := m.renderQueuedMessages(); queuedView != "" {
 		parts = append(parts, queuedView)
@@ -523,6 +533,16 @@ func (m *AppModel) renderStream() string {
 	return m.stream.View().Content
 }
 
+// renderUsageInfo returns the sticky token usage line (tokens + context% + cost).
+// Returns an empty string when no usage data is available (no requests yet or
+// tracker is nil), so the element is invisible until the first response arrives.
+func (m *AppModel) renderUsageInfo() string {
+	if m.usageTracker == nil {
+		return ""
+	}
+	return m.usageTracker.RenderUsageInfo()
+}
+
 // renderSeparator renders the separator line with an optional queue count badge.
 func (m *AppModel) renderSeparator() string {
 	theme := GetTheme()
@@ -796,7 +816,8 @@ func (m *AppModel) flushStreamContent() tea.Cmd {
 //
 // Layout (line counts):
 //
-//	stream region  = total - separator(1) - queued(N*5) - input(5)
+//	stream region  = total - usage(0-1) - separator(1) - queued(N*5) - input(5)
+//	usage info     = 0 or 1 line (visible only after first response)
 //	separator      = 1 line
 //	queued msgs    = ~5 lines per message (padding + text + badge + padding)
 //	input region   = 5 lines: title(1) + textarea(3) + help(1)
@@ -806,7 +827,13 @@ func (m *AppModel) distributeHeight() {
 	const linesPerQueuedMsg = 5
 	queuedLines := len(m.queuedMessages) * linesPerQueuedMsg
 
-	streamHeight := max(m.height-separatorLines-queuedLines-inputLines, 0)
+	// Reserve space for the sticky usage line when the tracker has data.
+	usageLines := 0
+	if m.usageTracker != nil && m.usageTracker.GetSessionStats().RequestCount > 0 {
+		usageLines = 1
+	}
+
+	streamHeight := max(m.height-usageLines-separatorLines-queuedLines-inputLines, 0)
 
 	if m.stream != nil {
 		m.stream.SetHeight(streamHeight)
diff --git a/internal/ui/usage_tracker.go b/internal/ui/usage_tracker.go
index 1a9879e5..91439fdd 100644
--- a/internal/ui/usage_tracker.go
+++ b/internal/ui/usage_tracker.go
@@ -41,13 +41,14 @@ type SessionStats struct {
 // for LLM interactions throughout a session. It provides real-time usage information
 // and supports both estimated and actual token counts. OAuth users see $0 costs.
 type UsageTracker struct {
-	mu           sync.RWMutex
-	modelInfo    *models.ModelInfo
-	provider     string
-	sessionStats SessionStats
-	lastRequest  *UsageStats
-	width        int
-	isOAuth      bool // Whether OAuth credentials are being used (costs should be $0)
+	mu            sync.RWMutex
+	modelInfo     *models.ModelInfo
+	provider      string
+	sessionStats  SessionStats
+	lastRequest   *UsageStats
+	contextTokens int // approximate current context window utilization (last API call)
+	width         int
+	isOAuth       bool // Whether OAuth credentials are being used (costs should be $0)
 }
 
 // NewUsageTracker creates and initializes a new UsageTracker for the specified model.
@@ -119,11 +120,27 @@ func (ut *UsageTracker) UpdateUsage(inputTokens, outputTokens, cacheReadTokens,
 
 // EstimateAndUpdateUsage estimates token counts from raw text strings and updates
 // the usage statistics. This method is used when actual token counts are not available
-// from the API response.
+// from the API response. The estimated values also serve as the context utilization
+// approximation since they represent a single API call.
 func (ut *UsageTracker) EstimateAndUpdateUsage(inputText, outputText string) {
 	inputTokens := estimateTokens(inputText)
 	outputTokens := estimateTokens(outputText)
 	ut.UpdateUsage(inputTokens, outputTokens, 0, 0)
+	// For estimated usage the values represent a single call, so they are a
+	// reasonable proxy for the current context window fill level.
+	ut.mu.Lock()
+	ut.contextTokens = inputTokens + outputTokens
+	ut.mu.Unlock()
+}
+
+// SetContextTokens records the approximate current context window utilization.
+// This should be set from the final API call's input + output tokens (i.e.
+// FinalResponse.Usage) rather than the aggregate TotalUsage, because TotalUsage
+// sums across all tool-calling steps and overstates the actual window fill level.
+func (ut *UsageTracker) SetContextTokens(tokens int) {
+	ut.mu.Lock()
+	defer ut.mu.Unlock()
+	ut.contextTokens = tokens
 }
 
 // RenderUsageInfo generates a formatted string displaying current usage statistics
@@ -134,31 +151,32 @@ func (ut *UsageTracker) RenderUsageInfo() string {
 	ut.mu.RLock()
 	defer ut.mu.RUnlock()
 
-	// if ut.sessionStats.RequestCount == 0 {
-	// 	return ""
-	// }
+	if ut.sessionStats.RequestCount == 0 {
+		return ""
+	}
 
-	// Import lipgloss for styling
 	baseStyle := lipgloss.NewStyle()
 
-	// Calculate total tokens
-	totalTokens := ut.sessionStats.TotalInputTokens + ut.sessionStats.TotalOutputTokens
+	// Display the current context window token count (from the last API call),
+	// not the cumulative session total. This keeps the number consistent with
+	// the percentage and answers "how full is my context right now?".
+	displayTokens := ut.contextTokens
 
 	// Format tokens with K/M suffix for better readability
 	var tokenStr string
-	if totalTokens >= 1000000 {
-		tokenStr = fmt.Sprintf("%.1fM", float64(totalTokens)/1000000)
-	} else if totalTokens >= 1000 {
-		tokenStr = fmt.Sprintf("%.1fK", float64(totalTokens)/1000)
+	if displayTokens >= 1000000 {
+		tokenStr = fmt.Sprintf("%.1fM", float64(displayTokens)/1000000)
+	} else if displayTokens >= 1000 {
+		tokenStr = fmt.Sprintf("%.1fK", float64(displayTokens)/1000)
 	} else {
-		tokenStr = fmt.Sprintf("%d", totalTokens)
+		tokenStr = fmt.Sprintf("%d", displayTokens)
 	}
 
-	// Calculate percentage based on context limit with color coding
+	// Calculate context window utilization percentage from the same value.
 	var percentageStr string
 	var percentageColor color.Color
-	if ut.modelInfo.Limit.Context > 0 {
-		percentage := float64(totalTokens) / float64(ut.modelInfo.Limit.Context) * 100
+	if ut.modelInfo.Limit.Context > 0 && displayTokens > 0 {
+		percentage := float64(displayTokens) / float64(ut.modelInfo.Limit.Context) * 100
 
 		// Color code based on usage percentage
 		theme := GetTheme()
@@ -202,8 +220,8 @@ func (ut *UsageTracker) RenderUsageInfo() string {
 		Foreground(theme.Muted).
 		Render(" | Cost: ")
 
-	// Build the enhanced display
-	return fmt.Sprintf("%s%s%s%s%s\n",
+	// Build the enhanced display (no trailing newline — callers control spacing).
+	return fmt.Sprintf("%s%s%s%s%s",
 		tokensLabel, tokensValue, percentageStr, costLabel, costStr)
 }
 
@@ -237,6 +255,7 @@ func (ut *UsageTracker) Reset() {
 	defer ut.mu.Unlock()
 	ut.sessionStats = SessionStats{}
 	ut.lastRequest = nil
+	ut.contextTokens = 0
 }
 
 // SetWidth updates the terminal width used for formatting usage information display.
diff --git a/internal/ui/usage_tracker_render_test.go b/internal/ui/usage_tracker_render_test.go
index e683b587..68a192b8 100644
--- a/internal/ui/usage_tracker_render_test.go
+++ b/internal/ui/usage_tracker_render_test.go
@@ -32,7 +32,8 @@ func TestUsageTracker_RenderUsageInfo_OAuth(t *testing.T) {
 
 	// Test OAuth rendering (should show $0.00)
 	oauthTracker := NewUsageTracker(modelInfo, "anthropic", 80, true)
-	oauthTracker.UpdateUsage(1500, 500, 0, 0) // 2000 total tokens
+	oauthTracker.UpdateUsage(1500, 500, 0, 0) // 2000 total tokens (session/billing)
+	oauthTracker.SetContextTokens(1500 + 500) // context window utilization
 
 	rendered := stripAnsi(oauthTracker.RenderUsageInfo())
 
@@ -50,6 +51,7 @@ func TestUsageTracker_RenderUsageInfo_OAuth(t *testing.T) {
 	// Test regular API key rendering (should show actual cost)
 	regularTracker := NewUsageTracker(modelInfo, "anthropic", 80, false)
 	regularTracker.UpdateUsage(1500, 500, 0, 0) // Same token usage
+	regularTracker.SetContextTokens(1500 + 500) // context window utilization
 
 	regularRendered := stripAnsi(regularTracker.RenderUsageInfo())