fix: token usage tracking with fantasy and sticky display across all visual modes

- Fix context percentage: use FinalResponse.Usage (last API call) instead of
  TotalUsage (sum of all tool-calling steps) which overstated context fill level
- Fix token count: display current context window tokens, not cumulative session
  total, so the number and percentage tell a consistent story
- Fix script mode double-counting: app.updateUsage already updates the shared
  tracker before sending StepCompleteEvent, so remove redundant
  UpdateUsageFromResponse call
- Add sticky usage display in TUI: render in View() layout between stream and
  separator instead of tea.Println so it updates in place
- Add usage display for non-interactive --prompt mode (non-quiet)
- Add SetContextTokens to UsageUpdater interface for separating billing tokens
  (TotalUsage) from context utilization (FinalResponse.Usage)
This commit is contained in:
Ed Zynda
2026-02-26 16:10:43 +03:00
parent 41f1198cb6
commit c5b75674a3
8 changed files with 119 additions and 36 deletions
+10 -1
View File
@@ -755,11 +755,20 @@ func runNormalMode(ctx context.Context) error {
//
// When --no-exit is set, after RunOnce completes the interactive BubbleTea TUI
// is started so the user can continue the conversation.
func runNonInteractiveModeApp(ctx context.Context, appInstance *app.App, prompt string, _, noExit bool, modelName, providerName, loadingMessage string, serverNames, toolNames []string, usageTracker *ui.UsageTracker) error {
func runNonInteractiveModeApp(ctx context.Context, appInstance *app.App, prompt string, quiet, noExit bool, modelName, providerName, loadingMessage string, serverNames, toolNames []string, usageTracker *ui.UsageTracker) error {
if err := appInstance.RunOnce(ctx, prompt); err != nil {
return err
}
// Display token usage after the response (unless quiet mode suppresses output).
// The app layer has already updated the tracker inside RunOnce.
if !quiet && usageTracker != nil {
usageInfo := usageTracker.RenderUsageInfo()
if usageInfo != "" {
fmt.Println(usageInfo)
}
}
// If --no-exit was requested, hand off to the interactive TUI.
if noExit {
return runInteractiveModeBubbleTea(ctx, appInstance, modelName, providerName, loadingMessage, serverNames, toolNames, usageTracker)
+4 -4
View File
@@ -851,10 +851,10 @@ func (h *scriptEventHandler) Handle(msg tea.Msg) {
_ = h.cli.DisplayAssistantMessageWithModel(responseText, h.modelName)
}
// Update and display usage.
if e.Response != nil {
h.cli.UpdateUsageFromResponse(e.Response, "")
}
// Display usage. The app layer has already updated the shared
// UsageTracker (via app.updateUsage in RunOnceWithDisplay) before
// sending this event, so we only need to render — calling
// UpdateUsageFromResponse here would double-count.
h.cli.DisplayUsageAfterResponse()
// Reset for next step in the agentic loop.
+16
View File
@@ -419,6 +419,11 @@ func (a *App) sendEvent(msg tea.Msg) {
// updateUsage records token usage from a completed agent step into the configured
// UsageTracker (if any). It uses the actual token counts from the agent result's
// TotalUsage field when available; otherwise it falls back to text-based estimation.
//
// TotalUsage is the sum across all tool-calling steps in a single agent run and
// is used for session cost tracking. For context window utilization we use the
// final response's per-call usage (FinalResponse.Usage) which reflects the actual
// context size at the last API call.
func (a *App) updateUsage(result *agent.GenerateWithLoopResult, userPrompt string) {
if a.opts.UsageTracker == nil || result == nil {
return
@@ -438,5 +443,16 @@ func (a *App) updateUsage(result *agent.GenerateWithLoopResult, userPrompt strin
responseText = result.FinalResponse.Content.Text()
}
a.opts.UsageTracker.EstimateAndUpdateUsage(userPrompt, responseText)
return // EstimateAndUpdateUsage already sets context tokens internally
}
// Set context window utilization from the final API call's per-step usage.
// FinalResponse.Usage represents the last step only (not the aggregate),
// so input+output there reflects the actual context fill level.
if result.FinalResponse != nil {
fu := result.FinalResponse.Usage
if ct := int(fu.InputTokens) + int(fu.OutputTokens); ct > 0 {
a.opts.UsageTracker.SetContextTokens(ct)
}
}
}
+6
View File
@@ -32,10 +32,16 @@ type AgentRunner interface {
// in cmd/root.go, which can import both packages.
type UsageUpdater interface {
// UpdateUsage records actual token counts returned by the provider.
// The counts come from fantasy's TotalUsage (aggregate across all steps
// in a multi-step tool-calling run) and are used for session cost tracking.
// Because the counts are aggregated over every step, they are not a
// measure of how full the context window is; use SetContextTokens for that.
UpdateUsage(inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens int)
// EstimateAndUpdateUsage falls back to text-based token estimation when
// the provider does not return exact counts. inputText is the user prompt
// and outputText is the response text the estimate is derived from.
EstimateAndUpdateUsage(inputText, outputText string)
// SetContextTokens records the approximate current context window fill
// level. This should be the final API call's input+output tokens (from
// FinalResponse.Usage), NOT the aggregate TotalUsage.
SetContextTokens(tokens int)
}
// Options configures an App instance. It mirrors the fields from AgenticLoopConfig
+5 -1
View File
@@ -262,8 +262,12 @@ func (c *CLI) UpdateUsageFromResponse(response *fantasy.Response, inputText stri
cacheReadTokens := int(usage.CacheReadTokens)
cacheWriteTokens := int(usage.CacheCreationTokens)
c.usageTracker.UpdateUsage(inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens)
// Per-response usage is a single API call, so it represents the
// actual context window fill level.
c.usageTracker.SetContextTokens(inputTokens + outputTokens)
} else {
// Fallback to estimation if no metadata is available
// Fallback to estimation if no metadata is available.
// EstimateAndUpdateUsage sets context tokens internally.
c.usageTracker.EstimateAndUpdateUsage(inputText, response.Content.Text())
}
}
+32 -5
View File
@@ -433,7 +433,9 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
case app.StepCompleteEvent:
// Flush any remaining streamed text to scrollback, then reset stream
// and return to input state.
// and return to input state. Token usage is rendered as a sticky
// element in View() — the app layer has already updated the shared
// UsageTracker before sending this event.
cmds = append(cmds, m.flushStreamContent())
if m.stream != nil {
m.stream.Reset()
@@ -479,13 +481,21 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
}
// View implements tea.Model. It renders the stacked layout:
// stream region + separator + [queued messages] + input region.
// stream region + [usage info] + separator + [queued messages] + input region.
func (m *AppModel) View() tea.View {
streamView := m.renderStream()
separator := m.renderSeparator()
inputView := m.renderInput()
parts := []string{streamView, separator}
parts := []string{streamView}
// Sticky usage info sits between the stream and separator so it is
// always visible at the bottom of the messages area and updates in place.
if usageView := m.renderUsageInfo(); usageView != "" {
parts = append(parts, usageView)
}
parts = append(parts, separator)
if queuedView := m.renderQueuedMessages(); queuedView != "" {
parts = append(parts, queuedView)
@@ -523,6 +533,16 @@ func (m *AppModel) renderStream() string {
return m.stream.View().Content
}
// renderUsageInfo produces the sticky usage line (tokens + context% + cost)
// by delegating to the usage tracker's RenderUsageInfo. When no tracker is
// attached it yields an empty string; an empty result means there is nothing
// to show yet, so the element stays hidden.
func (m *AppModel) renderUsageInfo() string {
	if tracker := m.usageTracker; tracker != nil {
		return tracker.RenderUsageInfo()
	}
	return ""
}
// renderSeparator renders the separator line with an optional queue count badge.
func (m *AppModel) renderSeparator() string {
theme := GetTheme()
@@ -796,7 +816,8 @@ func (m *AppModel) flushStreamContent() tea.Cmd {
//
// Layout (line counts):
//
// stream region = total - separator(1) - queued(N*5) - input(5)
// stream region = total - usage(0-1) - separator(1) - queued(N*5) - input(5)
// usage info = 0 or 1 line (visible only after first response)
// separator = 1 line
// queued msgs = ~5 lines per message (padding + text + badge + padding)
// input region = 5 lines: title(1) + textarea(3) + help(1)
@@ -806,7 +827,13 @@ func (m *AppModel) distributeHeight() {
const linesPerQueuedMsg = 5
queuedLines := len(m.queuedMessages) * linesPerQueuedMsg
streamHeight := max(m.height-separatorLines-queuedLines-inputLines, 0)
// Reserve space for the sticky usage line when the tracker has data.
usageLines := 0
if m.usageTracker != nil && m.usageTracker.GetSessionStats().RequestCount > 0 {
usageLines = 1
}
streamHeight := max(m.height-usageLines-separatorLines-queuedLines-inputLines, 0)
if m.stream != nil {
m.stream.SetHeight(streamHeight)
+43 -24
View File
@@ -41,13 +41,14 @@ type SessionStats struct {
// for LLM interactions throughout a session. It provides real-time usage information
// and supports both estimated and actual token counts. OAuth users see $0 costs.
type UsageTracker struct {
mu sync.RWMutex
modelInfo *models.ModelInfo
provider string
sessionStats SessionStats
lastRequest *UsageStats
width int
isOAuth bool // Whether OAuth credentials are being used (costs should be $0)
mu sync.RWMutex
modelInfo *models.ModelInfo
provider string
sessionStats SessionStats
lastRequest *UsageStats
contextTokens int // approximate current context window utilization (last API call)
width int
isOAuth bool // Whether OAuth credentials are being used (costs should be $0)
}
// NewUsageTracker creates and initializes a new UsageTracker for the specified model.
@@ -119,11 +120,27 @@ func (ut *UsageTracker) UpdateUsage(inputTokens, outputTokens, cacheReadTokens,
// EstimateAndUpdateUsage estimates token counts from raw text strings and updates
// the usage statistics. This method is used when actual token counts are not available
// from the API response.
// from the API response. The estimated values also serve as the context utilization
// approximation since they represent a single API call.
func (ut *UsageTracker) EstimateAndUpdateUsage(inputText, outputText string) {
	estIn, estOut := estimateTokens(inputText), estimateTokens(outputText)

	// Record the estimates as an ordinary request with no cache traffic.
	ut.UpdateUsage(estIn, estOut, 0, 0)

	// An estimate covers exactly one call, so its combined size doubles as
	// a reasonable proxy for the current context window fill level.
	ut.SetContextTokens(estIn + estOut)
}
// SetContextTokens stores tokens as the tracker's approximate context window
// utilization, overwriting any previous value. Pass the input + output tokens
// of the final API call (FinalResponse.Usage) rather than the aggregate
// TotalUsage: TotalUsage sums every tool-calling step and therefore
// overstates how full the window actually is.
func (ut *UsageTracker) SetContextTokens(tokens int) {
	ut.mu.Lock()
	ut.contextTokens = tokens
	ut.mu.Unlock()
}
// RenderUsageInfo generates a formatted string displaying current usage statistics
@@ -134,31 +151,32 @@ func (ut *UsageTracker) RenderUsageInfo() string {
ut.mu.RLock()
defer ut.mu.RUnlock()
// if ut.sessionStats.RequestCount == 0 {
// return ""
// }
if ut.sessionStats.RequestCount == 0 {
return ""
}
// Create a fresh lipgloss style for rendering.
baseStyle := lipgloss.NewStyle()
// Calculate total tokens
totalTokens := ut.sessionStats.TotalInputTokens + ut.sessionStats.TotalOutputTokens
// Display the current context window token count (from the last API call),
// not the cumulative session total. This keeps the number consistent with
// the percentage and answers "how full is my context right now?".
displayTokens := ut.contextTokens
// Format tokens with K/M suffix for better readability
var tokenStr string
if totalTokens >= 1000000 {
tokenStr = fmt.Sprintf("%.1fM", float64(totalTokens)/1000000)
} else if totalTokens >= 1000 {
tokenStr = fmt.Sprintf("%.1fK", float64(totalTokens)/1000)
if displayTokens >= 1000000 {
tokenStr = fmt.Sprintf("%.1fM", float64(displayTokens)/1000000)
} else if displayTokens >= 1000 {
tokenStr = fmt.Sprintf("%.1fK", float64(displayTokens)/1000)
} else {
tokenStr = fmt.Sprintf("%d", totalTokens)
tokenStr = fmt.Sprintf("%d", displayTokens)
}
// Calculate percentage based on context limit with color coding
// Calculate context window utilization percentage from the same value.
var percentageStr string
var percentageColor color.Color
if ut.modelInfo.Limit.Context > 0 {
percentage := float64(totalTokens) / float64(ut.modelInfo.Limit.Context) * 100
if ut.modelInfo.Limit.Context > 0 && displayTokens > 0 {
percentage := float64(displayTokens) / float64(ut.modelInfo.Limit.Context) * 100
// Color code based on usage percentage
theme := GetTheme()
@@ -202,8 +220,8 @@ func (ut *UsageTracker) RenderUsageInfo() string {
Foreground(theme.Muted).
Render(" | Cost: ")
// Build the enhanced display
return fmt.Sprintf("%s%s%s%s%s\n",
// Build the enhanced display (no trailing newline — callers control spacing).
return fmt.Sprintf("%s%s%s%s%s",
tokensLabel, tokensValue, percentageStr, costLabel, costStr)
}
@@ -237,6 +255,7 @@ func (ut *UsageTracker) Reset() {
defer ut.mu.Unlock()
ut.sessionStats = SessionStats{}
ut.lastRequest = nil
ut.contextTokens = 0
}
// SetWidth updates the terminal width used for formatting usage information display.
+3 -1
View File
@@ -32,7 +32,8 @@ func TestUsageTracker_RenderUsageInfo_OAuth(t *testing.T) {
// Test OAuth rendering (should show $0.00)
oauthTracker := NewUsageTracker(modelInfo, "anthropic", 80, true)
oauthTracker.UpdateUsage(1500, 500, 0, 0) // 2000 total tokens
oauthTracker.UpdateUsage(1500, 500, 0, 0) // 2000 total tokens (session/billing)
oauthTracker.SetContextTokens(1500 + 500) // context window utilization
rendered := stripAnsi(oauthTracker.RenderUsageInfo())
@@ -50,6 +51,7 @@ func TestUsageTracker_RenderUsageInfo_OAuth(t *testing.T) {
// Test regular API key rendering (should show actual cost)
regularTracker := NewUsageTracker(modelInfo, "anthropic", 80, false)
regularTracker.UpdateUsage(1500, 500, 0, 0) // Same token usage
regularTracker.SetContextTokens(1500 + 500) // context window utilization
regularRendered := stripAnsi(regularTracker.RenderUsageInfo())