mirror of
https://github.com/mark3labs/kit.git
synced 2026-06-14 03:30:26 +00:00
fix: token usage tracking with fantasy and sticky display across all visual modes
- Fix context percentage: use FinalResponse.Usage (last API call) instead of TotalUsage (sum of all tool-calling steps), which overstated the context fill level.
- Fix token count: display the current context window tokens, not the cumulative session total, so the number and the percentage tell a consistent story.
- Fix script mode double-counting: app.updateUsage already updates the shared tracker before sending StepCompleteEvent, so remove the redundant UpdateUsageFromResponse call.
- Add sticky usage display in the TUI: render it in the View() layout between the stream and the separator instead of via tea.Println, so it updates in place.
- Add usage display for non-interactive --prompt mode (non-quiet).
- Add SetContextTokens to the UsageUpdater interface to separate billing tokens (TotalUsage) from context utilization (FinalResponse.Usage).
This commit is contained in:
+10
-1
@@ -755,11 +755,20 @@ func runNormalMode(ctx context.Context) error {
|
||||
//
|
||||
// When --no-exit is set, after RunOnce completes the interactive BubbleTea TUI
|
||||
// is started so the user can continue the conversation.
|
||||
func runNonInteractiveModeApp(ctx context.Context, appInstance *app.App, prompt string, _, noExit bool, modelName, providerName, loadingMessage string, serverNames, toolNames []string, usageTracker *ui.UsageTracker) error {
|
||||
func runNonInteractiveModeApp(ctx context.Context, appInstance *app.App, prompt string, quiet, noExit bool, modelName, providerName, loadingMessage string, serverNames, toolNames []string, usageTracker *ui.UsageTracker) error {
|
||||
if err := appInstance.RunOnce(ctx, prompt); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Display token usage after the response (unless quiet mode suppresses output).
|
||||
// The app layer has already updated the tracker inside RunOnce.
|
||||
if !quiet && usageTracker != nil {
|
||||
usageInfo := usageTracker.RenderUsageInfo()
|
||||
if usageInfo != "" {
|
||||
fmt.Println(usageInfo)
|
||||
}
|
||||
}
|
||||
|
||||
// If --no-exit was requested, hand off to the interactive TUI.
|
||||
if noExit {
|
||||
return runInteractiveModeBubbleTea(ctx, appInstance, modelName, providerName, loadingMessage, serverNames, toolNames, usageTracker)
|
||||
|
||||
+4
-4
@@ -851,10 +851,10 @@ func (h *scriptEventHandler) Handle(msg tea.Msg) {
|
||||
_ = h.cli.DisplayAssistantMessageWithModel(responseText, h.modelName)
|
||||
}
|
||||
|
||||
// Update and display usage.
|
||||
if e.Response != nil {
|
||||
h.cli.UpdateUsageFromResponse(e.Response, "")
|
||||
}
|
||||
// Display usage. The app layer has already updated the shared
|
||||
// UsageTracker (via app.updateUsage in RunOnceWithDisplay) before
|
||||
// sending this event, so we only need to render — calling
|
||||
// UpdateUsageFromResponse here would double-count.
|
||||
h.cli.DisplayUsageAfterResponse()
|
||||
|
||||
// Reset for next step in the agentic loop.
|
||||
|
||||
@@ -419,6 +419,11 @@ func (a *App) sendEvent(msg tea.Msg) {
|
||||
// updateUsage records token usage from a completed agent step into the configured
|
||||
// UsageTracker (if any). It uses the actual token counts from the agent result's
|
||||
// TotalUsage field when available; otherwise it falls back to text-based estimation.
|
||||
//
|
||||
// TotalUsage is the sum across all tool-calling steps in a single agent run and
|
||||
// is used for session cost tracking. For context window utilization we use the
|
||||
// final response's per-call usage (FinalResponse.Usage) which reflects the actual
|
||||
// context size at the last API call.
|
||||
func (a *App) updateUsage(result *agent.GenerateWithLoopResult, userPrompt string) {
|
||||
if a.opts.UsageTracker == nil || result == nil {
|
||||
return
|
||||
@@ -438,5 +443,16 @@ func (a *App) updateUsage(result *agent.GenerateWithLoopResult, userPrompt strin
|
||||
responseText = result.FinalResponse.Content.Text()
|
||||
}
|
||||
a.opts.UsageTracker.EstimateAndUpdateUsage(userPrompt, responseText)
|
||||
return // EstimateAndUpdateUsage already sets context tokens internally
|
||||
}
|
||||
|
||||
// Set context window utilization from the final API call's per-step usage.
|
||||
// FinalResponse.Usage represents the last step only (not the aggregate),
|
||||
// so input+output there reflects the actual context fill level.
|
||||
if result.FinalResponse != nil {
|
||||
fu := result.FinalResponse.Usage
|
||||
if ct := int(fu.InputTokens) + int(fu.OutputTokens); ct > 0 {
|
||||
a.opts.UsageTracker.SetContextTokens(ct)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,10 +32,16 @@ type AgentRunner interface {
|
||||
// in cmd/root.go, which can import both packages.
|
||||
type UsageUpdater interface {
	// UpdateUsage records the actual token counts returned by the provider.
	// The counts come from fantasy's TotalUsage (the aggregate across all
	// steps of a multi-step tool-calling run) and are used for session cost
	// tracking.
	UpdateUsage(inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens int)

	// EstimateAndUpdateUsage falls back to text-based token estimation when
	// the provider does not return exact counts.
	EstimateAndUpdateUsage(inputText, outputText string)

	// SetContextTokens records the approximate current context window fill
	// level. Callers should pass the final API call's input+output tokens
	// (from FinalResponse.Usage), NOT the aggregate TotalUsage.
	SetContextTokens(tokens int)
}
|
||||
|
||||
// Options configures an App instance. It mirrors the fields from AgenticLoopConfig
|
||||
|
||||
+5
-1
@@ -262,8 +262,12 @@ func (c *CLI) UpdateUsageFromResponse(response *fantasy.Response, inputText stri
|
||||
cacheReadTokens := int(usage.CacheReadTokens)
|
||||
cacheWriteTokens := int(usage.CacheCreationTokens)
|
||||
c.usageTracker.UpdateUsage(inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens)
|
||||
// Per-response usage is a single API call, so it represents the
|
||||
// actual context window fill level.
|
||||
c.usageTracker.SetContextTokens(inputTokens + outputTokens)
|
||||
} else {
|
||||
// Fallback to estimation if no metadata is available
|
||||
// Fallback to estimation if no metadata is available.
|
||||
// EstimateAndUpdateUsage sets context tokens internally.
|
||||
c.usageTracker.EstimateAndUpdateUsage(inputText, response.Content.Text())
|
||||
}
|
||||
}
|
||||
|
||||
+32
-5
@@ -433,7 +433,9 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
|
||||
case app.StepCompleteEvent:
|
||||
// Flush any remaining streamed text to scrollback, then reset stream
|
||||
// and return to input state.
|
||||
// and return to input state. Token usage is rendered as a sticky
|
||||
// element in View() — the app layer has already updated the shared
|
||||
// UsageTracker before sending this event.
|
||||
cmds = append(cmds, m.flushStreamContent())
|
||||
if m.stream != nil {
|
||||
m.stream.Reset()
|
||||
@@ -479,13 +481,21 @@ func (m *AppModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
}
|
||||
|
||||
// View implements tea.Model. It renders the stacked layout:
|
||||
// stream region + separator + [queued messages] + input region.
|
||||
// stream region + [usage info] + separator + [queued messages] + input region.
|
||||
func (m *AppModel) View() tea.View {
|
||||
streamView := m.renderStream()
|
||||
separator := m.renderSeparator()
|
||||
inputView := m.renderInput()
|
||||
|
||||
parts := []string{streamView, separator}
|
||||
parts := []string{streamView}
|
||||
|
||||
// Sticky usage info sits between the stream and separator so it is
|
||||
// always visible at the bottom of the messages area and updates in place.
|
||||
if usageView := m.renderUsageInfo(); usageView != "" {
|
||||
parts = append(parts, usageView)
|
||||
}
|
||||
|
||||
parts = append(parts, separator)
|
||||
|
||||
if queuedView := m.renderQueuedMessages(); queuedView != "" {
|
||||
parts = append(parts, queuedView)
|
||||
@@ -523,6 +533,16 @@ func (m *AppModel) renderStream() string {
|
||||
return m.stream.View().Content
|
||||
}
|
||||
|
||||
// renderUsageInfo returns the sticky token usage line (tokens + context% + cost).
|
||||
// Returns an empty string when no usage data is available (no requests yet or
|
||||
// tracker is nil), so the element is invisible until the first response arrives.
|
||||
func (m *AppModel) renderUsageInfo() string {
|
||||
if m.usageTracker == nil {
|
||||
return ""
|
||||
}
|
||||
return m.usageTracker.RenderUsageInfo()
|
||||
}
|
||||
|
||||
// renderSeparator renders the separator line with an optional queue count badge.
|
||||
func (m *AppModel) renderSeparator() string {
|
||||
theme := GetTheme()
|
||||
@@ -796,7 +816,8 @@ func (m *AppModel) flushStreamContent() tea.Cmd {
|
||||
//
|
||||
// Layout (line counts):
|
||||
//
|
||||
// stream region = total - separator(1) - queued(N*5) - input(5)
|
||||
// stream region = total - usage(0-1) - separator(1) - queued(N*5) - input(5)
|
||||
// usage info = 0 or 1 line (visible only after first response)
|
||||
// separator = 1 line
|
||||
// queued msgs = ~5 lines per message (padding + text + badge + padding)
|
||||
// input region = 5 lines: title(1) + textarea(3) + help(1)
|
||||
@@ -806,7 +827,13 @@ func (m *AppModel) distributeHeight() {
|
||||
const linesPerQueuedMsg = 5
|
||||
queuedLines := len(m.queuedMessages) * linesPerQueuedMsg
|
||||
|
||||
streamHeight := max(m.height-separatorLines-queuedLines-inputLines, 0)
|
||||
// Reserve space for the sticky usage line when the tracker has data.
|
||||
usageLines := 0
|
||||
if m.usageTracker != nil && m.usageTracker.GetSessionStats().RequestCount > 0 {
|
||||
usageLines = 1
|
||||
}
|
||||
|
||||
streamHeight := max(m.height-usageLines-separatorLines-queuedLines-inputLines, 0)
|
||||
|
||||
if m.stream != nil {
|
||||
m.stream.SetHeight(streamHeight)
|
||||
|
||||
@@ -41,13 +41,14 @@ type SessionStats struct {
|
||||
// for LLM interactions throughout a session. It provides real-time usage information
|
||||
// and supports both estimated and actual token counts. OAuth users see $0 costs.
|
||||
type UsageTracker struct {
|
||||
mu sync.RWMutex
|
||||
modelInfo *models.ModelInfo
|
||||
provider string
|
||||
sessionStats SessionStats
|
||||
lastRequest *UsageStats
|
||||
width int
|
||||
isOAuth bool // Whether OAuth credentials are being used (costs should be $0)
|
||||
mu sync.RWMutex
|
||||
modelInfo *models.ModelInfo
|
||||
provider string
|
||||
sessionStats SessionStats
|
||||
lastRequest *UsageStats
|
||||
contextTokens int // approximate current context window utilization (last API call)
|
||||
width int
|
||||
isOAuth bool // Whether OAuth credentials are being used (costs should be $0)
|
||||
}
|
||||
|
||||
// NewUsageTracker creates and initializes a new UsageTracker for the specified model.
|
||||
@@ -119,11 +120,27 @@ func (ut *UsageTracker) UpdateUsage(inputTokens, outputTokens, cacheReadTokens,
|
||||
|
||||
// EstimateAndUpdateUsage estimates token counts from raw text strings and updates
|
||||
// the usage statistics. This method is used when actual token counts are not available
|
||||
// from the API response.
|
||||
// from the API response. The estimated values also serve as the context utilization
|
||||
// approximation since they represent a single API call.
|
||||
func (ut *UsageTracker) EstimateAndUpdateUsage(inputText, outputText string) {
|
||||
inputTokens := estimateTokens(inputText)
|
||||
outputTokens := estimateTokens(outputText)
|
||||
ut.UpdateUsage(inputTokens, outputTokens, 0, 0)
|
||||
// For estimated usage the values represent a single call, so they are a
|
||||
// reasonable proxy for the current context window fill level.
|
||||
ut.mu.Lock()
|
||||
ut.contextTokens = inputTokens + outputTokens
|
||||
ut.mu.Unlock()
|
||||
}
|
||||
|
||||
// SetContextTokens records the approximate current context window utilization.
|
||||
// This should be set from the final API call's input + output tokens (i.e.
|
||||
// FinalResponse.Usage) rather than the aggregate TotalUsage, because TotalUsage
|
||||
// sums across all tool-calling steps and overstates the actual window fill level.
|
||||
func (ut *UsageTracker) SetContextTokens(tokens int) {
|
||||
ut.mu.Lock()
|
||||
defer ut.mu.Unlock()
|
||||
ut.contextTokens = tokens
|
||||
}
|
||||
|
||||
// RenderUsageInfo generates a formatted string displaying current usage statistics
|
||||
@@ -134,31 +151,32 @@ func (ut *UsageTracker) RenderUsageInfo() string {
|
||||
ut.mu.RLock()
|
||||
defer ut.mu.RUnlock()
|
||||
|
||||
// if ut.sessionStats.RequestCount == 0 {
|
||||
// return ""
|
||||
// }
|
||||
if ut.sessionStats.RequestCount == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Import lipgloss for styling
|
||||
baseStyle := lipgloss.NewStyle()
|
||||
|
||||
// Calculate total tokens
|
||||
totalTokens := ut.sessionStats.TotalInputTokens + ut.sessionStats.TotalOutputTokens
|
||||
// Display the current context window token count (from the last API call),
|
||||
// not the cumulative session total. This keeps the number consistent with
|
||||
// the percentage and answers "how full is my context right now?".
|
||||
displayTokens := ut.contextTokens
|
||||
|
||||
// Format tokens with K/M suffix for better readability
|
||||
var tokenStr string
|
||||
if totalTokens >= 1000000 {
|
||||
tokenStr = fmt.Sprintf("%.1fM", float64(totalTokens)/1000000)
|
||||
} else if totalTokens >= 1000 {
|
||||
tokenStr = fmt.Sprintf("%.1fK", float64(totalTokens)/1000)
|
||||
if displayTokens >= 1000000 {
|
||||
tokenStr = fmt.Sprintf("%.1fM", float64(displayTokens)/1000000)
|
||||
} else if displayTokens >= 1000 {
|
||||
tokenStr = fmt.Sprintf("%.1fK", float64(displayTokens)/1000)
|
||||
} else {
|
||||
tokenStr = fmt.Sprintf("%d", totalTokens)
|
||||
tokenStr = fmt.Sprintf("%d", displayTokens)
|
||||
}
|
||||
|
||||
// Calculate percentage based on context limit with color coding
|
||||
// Calculate context window utilization percentage from the same value.
|
||||
var percentageStr string
|
||||
var percentageColor color.Color
|
||||
if ut.modelInfo.Limit.Context > 0 {
|
||||
percentage := float64(totalTokens) / float64(ut.modelInfo.Limit.Context) * 100
|
||||
if ut.modelInfo.Limit.Context > 0 && displayTokens > 0 {
|
||||
percentage := float64(displayTokens) / float64(ut.modelInfo.Limit.Context) * 100
|
||||
|
||||
// Color code based on usage percentage
|
||||
theme := GetTheme()
|
||||
@@ -202,8 +220,8 @@ func (ut *UsageTracker) RenderUsageInfo() string {
|
||||
Foreground(theme.Muted).
|
||||
Render(" | Cost: ")
|
||||
|
||||
// Build the enhanced display
|
||||
return fmt.Sprintf("%s%s%s%s%s\n",
|
||||
// Build the enhanced display (no trailing newline — callers control spacing).
|
||||
return fmt.Sprintf("%s%s%s%s%s",
|
||||
tokensLabel, tokensValue, percentageStr, costLabel, costStr)
|
||||
}
|
||||
|
||||
@@ -237,6 +255,7 @@ func (ut *UsageTracker) Reset() {
|
||||
defer ut.mu.Unlock()
|
||||
ut.sessionStats = SessionStats{}
|
||||
ut.lastRequest = nil
|
||||
ut.contextTokens = 0
|
||||
}
|
||||
|
||||
// SetWidth updates the terminal width used for formatting usage information display.
|
||||
|
||||
@@ -32,7 +32,8 @@ func TestUsageTracker_RenderUsageInfo_OAuth(t *testing.T) {
|
||||
|
||||
// Test OAuth rendering (should show $0.00)
|
||||
oauthTracker := NewUsageTracker(modelInfo, "anthropic", 80, true)
|
||||
oauthTracker.UpdateUsage(1500, 500, 0, 0) // 2000 total tokens
|
||||
oauthTracker.UpdateUsage(1500, 500, 0, 0) // 2000 total tokens (session/billing)
|
||||
oauthTracker.SetContextTokens(1500 + 500) // context window utilization
|
||||
|
||||
rendered := stripAnsi(oauthTracker.RenderUsageInfo())
|
||||
|
||||
@@ -50,6 +51,7 @@ func TestUsageTracker_RenderUsageInfo_OAuth(t *testing.T) {
|
||||
// Test regular API key rendering (should show actual cost)
|
||||
regularTracker := NewUsageTracker(modelInfo, "anthropic", 80, false)
|
||||
regularTracker.UpdateUsage(1500, 500, 0, 0) // Same token usage
|
||||
regularTracker.SetContextTokens(1500 + 500) // context window utilization
|
||||
|
||||
regularRendered := stripAnsi(regularTracker.RenderUsageInfo())
|
||||
|
||||
|
||||
Reference in New Issue
Block a user