mirror of
https://github.com/mark3labs/kit.git
synced 2026-06-14 03:30:26 +00:00
fix(subagent): prevent instant failure from already-dead parent contexts
- Replace detachedWithCancel (goroutine-based) with context.WithoutCancel + valuesContext; the old goroutine would fire immediately if the parent was already cancelled/deadline-exceeded, causing 'failed after 0s' - Kit.Subagent() pre-flight: if the incoming ctx is already done, reset to context.Background() before applying the subagent timeout - Both Subagent() error paths now return a non-nil *SubagentResult with Elapsed set, so the tool response always shows accurate timing - Narrow viperInitMu scope in Kit.New(): snapshot viper state + call BuildProviderConfig under the lock, then release before SetupAgent / MCP loading; parallel subagent spawns no longer serialise on viper I/O - AgentSetupOptions gains ProviderConfig + scalar fields so SetupAgent can skip viper reads when a pre-built config is supplied - Add subagent_test.go covering the fixed context detachment behaviour
This commit is contained in:
+26
-33
@@ -130,13 +130,22 @@ func executeSubagent(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolRe
|
||||
), fmt.Errorf("no subagent spawner in context")
|
||||
}
|
||||
|
||||
// Detach from the parent's deadline so the subagent gets its own
|
||||
// independent timeout (applied downstream in Kit.Subagent). The parent
|
||||
// context may carry a tight deadline from the LLM generation loop or
|
||||
// other tool timeouts that would prematurely kill the subagent.
|
||||
// We preserve context values (spawner, etc.) and propagate parent
|
||||
// cancellation (e.g. user hits Ctrl-C) without inheriting the deadline.
|
||||
spawnCtx := detachedWithCancel(ctx)
|
||||
// Build a clean context for the subagent that inherits values (e.g. the
|
||||
// spawner callback) but is completely detached from the parent's
|
||||
// deadline AND cancellation. The subagent gets its own independent
|
||||
// timeout (applied downstream in Kit.Subagent).
|
||||
//
|
||||
// Why full detachment instead of propagating parent cancellation?
|
||||
// The parent context may already be done (deadline exceeded or
|
||||
// cancelled) by the time this tool handler executes — for example when
|
||||
// the generation loop context carries a deadline, when the user
|
||||
// double-ESC cancels mid-turn, or when parallel tool execution
|
||||
// encounters a race between stream completion and tool dispatch. Using
|
||||
// context.WithoutCancel (Go 1.21+) ensures the subagent always starts
|
||||
// cleanly with a fresh timeout, following the pattern used by crush for
|
||||
// shutdown-resilient child work. The subagent's own timeout
|
||||
// (defaultSubagentTimeout / user-specified) provides the safety net.
|
||||
spawnCtx := context.WithoutCancel(valuesContext{parent: ctx})
|
||||
|
||||
// Spawn in-process subagent.
|
||||
result, err := spawner(spawnCtx, call.ID, args.Task, args.Model, args.SystemPrompt, timeout)
|
||||
@@ -173,37 +182,21 @@ func executeSubagent(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolRe
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Context detachment
|
||||
// Context helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// detachedContext wraps a parent context, preserving its values but removing
|
||||
// its deadline and cancellation. This allows the subagent to have its own
|
||||
// independent timeout while still accessing context-stored values (e.g. the
|
||||
// subagent spawner function).
|
||||
type detachedContext struct {
|
||||
// valuesContext preserves a parent context's values (e.g. the subagent
|
||||
// spawner callback) while stripping its deadline and cancellation. Combined
|
||||
// with context.WithoutCancel() this gives the subagent a completely clean
|
||||
// context that only inherits value-based dependencies.
|
||||
type valuesContext struct {
|
||||
parent context.Context
|
||||
}
|
||||
|
||||
func (d detachedContext) Deadline() (time.Time, bool) { return time.Time{}, false }
|
||||
func (d detachedContext) Done() <-chan struct{} { return nil }
|
||||
func (d detachedContext) Err() error { return nil }
|
||||
func (d detachedContext) Value(key any) any { return d.parent.Value(key) }
|
||||
|
||||
// detachedWithCancel creates a new context that inherits values from the
|
||||
// parent but has no deadline. Cancellation of the parent is propagated: when
|
||||
// the parent is cancelled the returned context is also cancelled, but the
|
||||
// parent's deadline does not apply to the child.
|
||||
func detachedWithCancel(parent context.Context) context.Context {
|
||||
child, cancel := context.WithCancel(detachedContext{parent: parent})
|
||||
go func() {
|
||||
select {
|
||||
case <-parent.Done():
|
||||
cancel()
|
||||
case <-child.Done():
|
||||
}
|
||||
}()
|
||||
return child
|
||||
}
|
||||
func (v valuesContext) Deadline() (time.Time, bool) { return time.Time{}, false }
|
||||
func (v valuesContext) Done() <-chan struct{} { return nil }
|
||||
func (v valuesContext) Err() error { return nil }
|
||||
func (v valuesContext) Value(key any) any { return v.parent.Value(key) }
|
||||
|
||||
// truncateResponse limits the response length to avoid overwhelming context windows.
|
||||
func truncateResponse(s string, maxLen int) string {
|
||||
|
||||
@@ -0,0 +1,115 @@
|
||||
package core
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestValuesContext_StripsDeadlineAndCancellation(t *testing.T) {
|
||||
// Parent with a tight deadline.
|
||||
parent, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
|
||||
defer cancel()
|
||||
time.Sleep(5 * time.Millisecond) // Let deadline expire.
|
||||
|
||||
if parent.Err() == nil {
|
||||
t.Fatal("expected parent to be expired")
|
||||
}
|
||||
|
||||
vc := valuesContext{parent: parent}
|
||||
|
||||
if _, ok := vc.Deadline(); ok {
|
||||
t.Error("valuesContext should report no deadline")
|
||||
}
|
||||
if vc.Done() != nil {
|
||||
t.Error("valuesContext.Done() should return nil")
|
||||
}
|
||||
if vc.Err() != nil {
|
||||
t.Errorf("valuesContext.Err() should be nil, got %v", vc.Err())
|
||||
}
|
||||
}
|
||||
|
||||
func TestValuesContext_PreservesValues(t *testing.T) {
|
||||
type testKey struct{}
|
||||
parent := context.WithValue(context.Background(), testKey{}, "hello")
|
||||
|
||||
vc := valuesContext{parent: parent}
|
||||
|
||||
got, ok := vc.Value(testKey{}).(string)
|
||||
if !ok || got != "hello" {
|
||||
t.Errorf("expected value 'hello', got %q (ok=%v)", got, ok)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpawnContext_SurvivesCancelledParent(t *testing.T) {
|
||||
// Simulate the exact scenario from the bug: the parent generation
|
||||
// context is already cancelled when the subagent tool handler runs.
|
||||
parent, cancel := context.WithCancel(context.Background())
|
||||
cancel() // Cancelled before detach.
|
||||
|
||||
// This is what executeSubagent now does:
|
||||
spawnCtx := context.WithoutCancel(valuesContext{parent: parent})
|
||||
|
||||
// The spawn context must be alive.
|
||||
if spawnCtx.Err() != nil {
|
||||
t.Fatalf("spawnCtx should be alive, got err: %v", spawnCtx.Err())
|
||||
}
|
||||
|
||||
// Adding a timeout should produce a working context.
|
||||
tCtx, tCancel := context.WithTimeout(spawnCtx, 5*time.Second)
|
||||
defer tCancel()
|
||||
|
||||
if tCtx.Err() != nil {
|
||||
t.Fatalf("timeout context should be alive, got err: %v", tCtx.Err())
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpawnContext_SurvivesDeadlineExceededParent(t *testing.T) {
|
||||
// Simulate: parent had a deadline that already expired.
|
||||
parent, pCancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
|
||||
defer pCancel()
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
|
||||
if parent.Err() != context.DeadlineExceeded {
|
||||
t.Fatalf("expected parent deadline exceeded, got: %v", parent.Err())
|
||||
}
|
||||
|
||||
spawnCtx := context.WithoutCancel(valuesContext{parent: parent})
|
||||
|
||||
if spawnCtx.Err() != nil {
|
||||
t.Fatalf("spawnCtx should be alive after deadline-exceeded parent, got: %v", spawnCtx.Err())
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpawnContext_PreservesSpawnerValue(t *testing.T) {
|
||||
// Verify the subagent spawner callback survives context detachment.
|
||||
called := false
|
||||
spawner := SubagentSpawnFunc(func(ctx context.Context, toolCallID, prompt, model, systemPrompt string, timeout time.Duration) (*SubagentSpawnResult, error) {
|
||||
called = true
|
||||
return &SubagentSpawnResult{Response: "ok"}, nil
|
||||
})
|
||||
|
||||
parent := WithSubagentSpawner(context.Background(), spawner)
|
||||
// Cancel the parent.
|
||||
parentCtx, cancel := context.WithCancel(parent)
|
||||
cancel()
|
||||
|
||||
spawnCtx := context.WithoutCancel(valuesContext{parent: parentCtx})
|
||||
|
||||
// Should be able to retrieve the spawner from the detached context.
|
||||
recovered := getSubagentSpawner(spawnCtx)
|
||||
if recovered == nil {
|
||||
t.Fatal("spawner should be recoverable from detached context")
|
||||
}
|
||||
|
||||
result, err := recovered(spawnCtx, "tc1", "test task", "", "", time.Minute)
|
||||
if err != nil {
|
||||
t.Fatalf("spawner call failed: %v", err)
|
||||
}
|
||||
if !called {
|
||||
t.Error("spawner was not called")
|
||||
}
|
||||
if result.Response != "ok" {
|
||||
t.Errorf("expected 'ok', got %q", result.Response)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user