fix(subagent): prevent instant failure from already-dead parent contexts

- Replace detachedWithCancel (goroutine-based) with context.WithoutCancel
  + valuesContext; the old goroutine would fire immediately if the parent
  was already cancelled/deadline-exceeded, causing 'failed after 0s'
- Kit.Subagent() pre-flight: if the incoming ctx is already done, reset
  to context.Background() before applying the subagent timeout
- Both Subagent() error paths now return a non-nil *SubagentResult with
  Elapsed set, so the tool response always shows accurate timing
- Narrow viperInitMu scope in Kit.New(): snapshot viper state + call
  BuildProviderConfig under the lock, then release before SetupAgent /
  MCP loading; parallel subagent spawns no longer serialise on viper I/O
- AgentSetupOptions gains ProviderConfig + scalar fields so SetupAgent
  can skip viper reads when a pre-built config is supplied
- Add subagent_test.go covering the fixed context detachment behaviour
This commit is contained in:
Ed Zynda
2026-04-02 15:54:47 +03:00
parent 685aaf207f
commit ead4afbfe6
4 changed files with 305 additions and 111 deletions
+26 -33
View File
@@ -130,13 +130,22 @@ func executeSubagent(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolRe
), fmt.Errorf("no subagent spawner in context")
}
// Detach from the parent's deadline so the subagent gets its own
// independent timeout (applied downstream in Kit.Subagent). The parent
// context may carry a tight deadline from the LLM generation loop or
// other tool timeouts that would prematurely kill the subagent.
// We preserve context values (spawner, etc.) and propagate parent
// cancellation (e.g. user hits Ctrl-C) without inheriting the deadline.
spawnCtx := detachedWithCancel(ctx)
// Build a clean context for the subagent that inherits values (e.g. the
// spawner callback) but is completely detached from the parent's
// deadline AND cancellation. The subagent gets its own independent
// timeout (applied downstream in Kit.Subagent).
//
// Why full detachment instead of propagating parent cancellation?
// The parent context may already be done (deadline exceeded or
// cancelled) by the time this tool handler executes — for example when
// the generation loop context carries a deadline, when the user
// double-ESC cancels mid-turn, or when parallel tool execution
// encounters a race between stream completion and tool dispatch. Using
// context.WithoutCancel (Go 1.21+) ensures the subagent always starts
// cleanly with a fresh timeout, following the pattern used by crush for
// shutdown-resilient child work. The subagent's own timeout
// (defaultSubagentTimeout / user-specified) provides the safety net.
spawnCtx := context.WithoutCancel(valuesContext{parent: ctx})
// Spawn in-process subagent.
result, err := spawner(spawnCtx, call.ID, args.Task, args.Model, args.SystemPrompt, timeout)
@@ -173,37 +182,21 @@ func executeSubagent(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolRe
}
// ---------------------------------------------------------------------------
// Context detachment
// Context helpers
// ---------------------------------------------------------------------------
// detachedContext wraps a parent context, preserving its values but removing
// its deadline and cancellation. This allows the subagent to have its own
// independent timeout while still accessing context-stored values (e.g. the
// subagent spawner function).
type detachedContext struct {
// valuesContext preserves a parent context's values (e.g. the subagent
// spawner callback) while stripping its deadline and cancellation. Combined
// with context.WithoutCancel() this gives the subagent a completely clean
// context that only inherits value-based dependencies.
type valuesContext struct {
parent context.Context
}
func (d detachedContext) Deadline() (time.Time, bool) { return time.Time{}, false }
func (d detachedContext) Done() <-chan struct{} { return nil }
func (d detachedContext) Err() error { return nil }
func (d detachedContext) Value(key any) any { return d.parent.Value(key) }
// detachedWithCancel creates a new context that inherits values from the
// parent but has no deadline. Cancellation of the parent is propagated: when
// the parent is cancelled the returned context is also cancelled, but the
// parent's deadline does not apply to the child.
func detachedWithCancel(parent context.Context) context.Context {
child, cancel := context.WithCancel(detachedContext{parent: parent})
go func() {
select {
case <-parent.Done():
cancel()
case <-child.Done():
}
}()
return child
}
func (v valuesContext) Deadline() (time.Time, bool) { return time.Time{}, false }
func (v valuesContext) Done() <-chan struct{} { return nil }
func (v valuesContext) Err() error { return nil }
func (v valuesContext) Value(key any) any { return v.parent.Value(key) }
// truncateResponse limits the response length to avoid overwhelming context windows.
func truncateResponse(s string, maxLen int) string {
+115
View File
@@ -0,0 +1,115 @@
package core
import (
"context"
"testing"
"time"
)
func TestValuesContext_StripsDeadlineAndCancellation(t *testing.T) {
// Parent with a tight deadline.
parent, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
defer cancel()
time.Sleep(5 * time.Millisecond) // Let deadline expire.
if parent.Err() == nil {
t.Fatal("expected parent to be expired")
}
vc := valuesContext{parent: parent}
if _, ok := vc.Deadline(); ok {
t.Error("valuesContext should report no deadline")
}
if vc.Done() != nil {
t.Error("valuesContext.Done() should return nil")
}
if vc.Err() != nil {
t.Errorf("valuesContext.Err() should be nil, got %v", vc.Err())
}
}
func TestValuesContext_PreservesValues(t *testing.T) {
type testKey struct{}
parent := context.WithValue(context.Background(), testKey{}, "hello")
vc := valuesContext{parent: parent}
got, ok := vc.Value(testKey{}).(string)
if !ok || got != "hello" {
t.Errorf("expected value 'hello', got %q (ok=%v)", got, ok)
}
}
func TestSpawnContext_SurvivesCancelledParent(t *testing.T) {
// Simulate the exact scenario from the bug: the parent generation
// context is already cancelled when the subagent tool handler runs.
parent, cancel := context.WithCancel(context.Background())
cancel() // Cancelled before detach.
// This is what executeSubagent now does:
spawnCtx := context.WithoutCancel(valuesContext{parent: parent})
// The spawn context must be alive.
if spawnCtx.Err() != nil {
t.Fatalf("spawnCtx should be alive, got err: %v", spawnCtx.Err())
}
// Adding a timeout should produce a working context.
tCtx, tCancel := context.WithTimeout(spawnCtx, 5*time.Second)
defer tCancel()
if tCtx.Err() != nil {
t.Fatalf("timeout context should be alive, got err: %v", tCtx.Err())
}
}
func TestSpawnContext_SurvivesDeadlineExceededParent(t *testing.T) {
// Simulate: parent had a deadline that already expired.
parent, pCancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
defer pCancel()
time.Sleep(5 * time.Millisecond)
if parent.Err() != context.DeadlineExceeded {
t.Fatalf("expected parent deadline exceeded, got: %v", parent.Err())
}
spawnCtx := context.WithoutCancel(valuesContext{parent: parent})
if spawnCtx.Err() != nil {
t.Fatalf("spawnCtx should be alive after deadline-exceeded parent, got: %v", spawnCtx.Err())
}
}
func TestSpawnContext_PreservesSpawnerValue(t *testing.T) {
// Verify the subagent spawner callback survives context detachment.
called := false
spawner := SubagentSpawnFunc(func(ctx context.Context, toolCallID, prompt, model, systemPrompt string, timeout time.Duration) (*SubagentSpawnResult, error) {
called = true
return &SubagentSpawnResult{Response: "ok"}, nil
})
parent := WithSubagentSpawner(context.Background(), spawner)
// Cancel the parent.
parentCtx, cancel := context.WithCancel(parent)
cancel()
spawnCtx := context.WithoutCancel(valuesContext{parent: parentCtx})
// Should be able to retrieve the spawner from the detached context.
recovered := getSubagentSpawner(spawnCtx)
if recovered == nil {
t.Fatal("spawner should be recoverable from detached context")
}
result, err := recovered(spawnCtx, "tc1", "test task", "", "", time.Minute)
if err != nil {
t.Fatalf("spawner call failed: %v", err)
}
if !called {
t.Error("spawner was not called")
}
if result.Response != "ok" {
t.Errorf("expected 'ok', got %q", result.Response)
}
}
+46 -7
View File
@@ -40,6 +40,24 @@ type AgentSetupOptions struct {
// wrapping. Used by the SDK hook system. Both wrappers compose:
// extension wrapper runs first (inner), then this wrapper (outer).
ToolWrapper func([]fantasy.AgentTool) []fantasy.AgentTool
// ProviderConfig, when non-nil, is used directly instead of calling
// BuildProviderConfig(). Callers that already hold viperInitMu can
// pre-build this and release the lock before calling SetupAgent, so the
// slow agent/MCP initialisation runs concurrently with other New() calls.
ProviderConfig *models.ProviderConfig
// Debug enables debug logging. When zero-value, viper is consulted.
// Only meaningful when ProviderConfig is also set.
Debug bool
// NoExtensions skips extension loading. When false, viper is consulted.
// Only meaningful when ProviderConfig is also set.
NoExtensions bool
// MaxSteps overrides the agent step limit. 0 means use viper value.
// Only meaningful when ProviderConfig is also set.
MaxSteps int
// StreamingEnabled controls streaming. Only meaningful when ProviderConfig
// is also set.
StreamingEnabled bool
}
// AgentSetupResult bundles the created agent and any debug logger so the caller
@@ -88,15 +106,36 @@ func BuildProviderConfig() (*models.ProviderConfig, string, error) {
// SetupAgent creates an agent from the current viper state + the provided
// options. It wraps BuildProviderConfig and agent.CreateAgent.
func SetupAgent(ctx context.Context, opts AgentSetupOptions) (*AgentSetupResult, error) {
modelConfig, systemPrompt, err := BuildProviderConfig()
if err != nil {
return nil, err
var modelConfig *models.ProviderConfig
var systemPrompt string
if opts.ProviderConfig != nil {
// Pre-built config supplied by caller (e.g. Kit.New after releasing
// viperInitMu). Use it directly — no viper reads needed here.
modelConfig = opts.ProviderConfig
systemPrompt = modelConfig.SystemPrompt
} else {
var err error
modelConfig, systemPrompt, err = BuildProviderConfig()
if err != nil {
return nil, err
}
}
// Resolve debug / no-extensions / max-steps / streaming: prefer explicit
// fields (set when ProviderConfig was pre-built) over viper fallback.
debugEnabled := opts.Debug || viper.GetBool("debug")
noExtensions := opts.NoExtensions || viper.GetBool("no-extensions")
maxSteps := opts.MaxSteps
if maxSteps == 0 {
maxSteps = viper.GetInt("max-steps")
}
streamingEnabled := opts.StreamingEnabled || viper.GetBool("stream")
// Create the appropriate debug logger.
var debugLogger tools.DebugLogger
var bufferedLogger *tools.BufferedDebugLogger
if viper.GetBool("debug") {
if debugEnabled {
if opts.UseBufferedLogger {
bufferedLogger = tools.NewBufferedDebugLogger(true)
debugLogger = bufferedLogger
@@ -108,7 +147,7 @@ func SetupAgent(ctx context.Context, opts AgentSetupOptions) (*AgentSetupResult,
// Load extensions unless --no-extensions is set.
var extRunner *extensions.Runner
var extCreationOpts extensionCreationOpts
if !viper.GetBool("no-extensions") {
if !noExtensions {
var extErr error
extRunner, extCreationOpts, extErr = loadExtensions()
if extErr != nil {
@@ -140,8 +179,8 @@ func SetupAgent(ctx context.Context, opts AgentSetupOptions) (*AgentSetupResult,
ModelConfig: modelConfig,
MCPConfig: opts.MCPConfig,
SystemPrompt: systemPrompt,
MaxSteps: viper.GetInt("max-steps"),
StreamingEnabled: viper.GetBool("stream"),
MaxSteps: maxSteps,
StreamingEnabled: streamingEnabled,
ShowSpinner: opts.ShowSpinner,
Quiet: opts.Quiet,
SpinnerFunc: opts.SpinnerFunc,
+118 -71
View File
@@ -505,85 +505,125 @@ func New(ctx context.Context, opts *Options) (*Kit, error) {
opts = &Options{}
}
viperInitMu.Lock()
defer viperInitMu.Unlock()
// All viper writes (SetSDKDefaults, InitConfig, Set calls, system-prompt
// composition) happen under viperInitMu. We also call BuildProviderConfig
// here — it's fast (just reads) — so we can capture the full config
// snapshot before releasing the lock. The expensive work (MCP loading,
// provider creation, session init) then runs outside the lock, allowing
// parallel subagent spawns to proceed concurrently.
var (
providerConfig *models.ProviderConfig
modelString string
cwd string
contextFiles []*ContextFile
loadedSkills []*Skill
mcpConfig *config.Config
debug bool
noExtensions bool
maxSteps int
streaming bool
)
// Set CLI-equivalent defaults for viper. When used as an SDK (without
// cobra), these defaults are not registered via flag bindings.
setSDKDefaults()
if err := func() error {
viperInitMu.Lock()
defer viperInitMu.Unlock()
// Initialize config (loads config files and env vars).
// Only initialize if not already done (e.g., by CLI's cobra.OnInitialize).
// Check if model is already set, which indicates config was loaded.
if viper.GetString("model") == "" {
if err := InitConfig(opts.ConfigFile, false); err != nil {
return nil, fmt.Errorf("failed to initialize config: %w", err)
}
}
// Set CLI-equivalent defaults for viper. When used as an SDK (without
// cobra), these defaults are not registered via flag bindings.
setSDKDefaults()
// Handle CLI debug mode.
if opts.Debug {
viper.Set("debug", true)
}
// Override viper settings with options.
if opts.Model != "" {
viper.Set("model", opts.Model)
}
if opts.SystemPrompt != "" {
viper.Set("system-prompt", opts.SystemPrompt)
}
if opts.MaxSteps > 0 {
viper.Set("max-steps", opts.MaxSteps)
}
viper.Set("stream", opts.Streaming)
// Resolve working directory for context/skill discovery.
cwd := opts.SessionDir
if cwd == "" {
cwd, _ = os.Getwd()
}
// Load context files (AGENTS.md) from the project root.
contextFiles := loadContextFiles(cwd)
// Load skills — either from explicit paths or via auto-discovery.
loadedSkills, err := loadSkills(opts)
if err != nil {
return nil, fmt.Errorf("failed to load skills: %w", err)
}
// Always compose the system prompt with runtime context: base prompt +
// AGENTS.md context + skills metadata + date/cwd.
{
basePrompt := viper.GetString("system-prompt")
pb := skills.NewPromptBuilder(basePrompt)
// Inject AGENTS.md content as project context.
for _, cf := range contextFiles {
pb.WithSection("", fmt.Sprintf("Instructions from: %s\n\n%s", cf.Path, cf.Content))
// Initialize config (loads config files and env vars).
// Only initialize if not already done (e.g., by CLI's cobra.OnInitialize).
// Check if model is already set, which indicates config was loaded.
if viper.GetString("model") == "" {
if err := InitConfig(opts.ConfigFile, false); err != nil {
return fmt.Errorf("failed to initialize config: %w", err)
}
}
// Inject skills metadata (name + description + location).
if len(loadedSkills) > 0 {
pb.WithSkills(loadedSkills)
// Handle CLI debug mode.
if opts.Debug {
viper.Set("debug", true)
}
// Append current date/time and working directory.
pb.WithSection("", fmt.Sprintf(
"Current date and time: %s\nCurrent working directory: %s",
time.Now().Format("Monday, January 2, 2006, 3:04:05 PM MST"), cwd,
))
// Override viper settings with options.
if opts.Model != "" {
viper.Set("model", opts.Model)
}
if opts.SystemPrompt != "" {
viper.Set("system-prompt", opts.SystemPrompt)
}
if opts.MaxSteps > 0 {
viper.Set("max-steps", opts.MaxSteps)
}
viper.Set("stream", opts.Streaming)
viper.Set("system-prompt", pb.Build())
// Resolve working directory for context/skill discovery.
cwd = opts.SessionDir
if cwd == "" {
cwd, _ = os.Getwd()
}
// Load context files (AGENTS.md) from the project root.
contextFiles = loadContextFiles(cwd)
// Load skills — either from explicit paths or via auto-discovery.
var err error
loadedSkills, err = loadSkills(opts)
if err != nil {
return fmt.Errorf("failed to load skills: %w", err)
}
// Always compose the system prompt with runtime context: base prompt +
// AGENTS.md context + skills metadata + date/cwd.
{
basePrompt := viper.GetString("system-prompt")
pb := skills.NewPromptBuilder(basePrompt)
// Inject AGENTS.md content as project context.
for _, cf := range contextFiles {
pb.WithSection("", fmt.Sprintf("Instructions from: %s\n\n%s", cf.Path, cf.Content))
}
// Inject skills metadata (name + description + location).
if len(loadedSkills) > 0 {
pb.WithSkills(loadedSkills)
}
// Append current date/time and working directory.
pb.WithSection("", fmt.Sprintf(
"Current date and time: %s\nCurrent working directory: %s",
time.Now().Format("Monday, January 2, 2006, 3:04:05 PM MST"), cwd,
))
viper.Set("system-prompt", pb.Build())
}
// Snapshot all viper-derived values now, while the lock is held.
// BuildProviderConfig is fast (pure reads), so we do it here.
var pcErr error
providerConfig, _, pcErr = kitsetup.BuildProviderConfig()
if pcErr != nil {
return fmt.Errorf("failed to build provider config: %w", pcErr)
}
modelString = viper.GetString("model")
debug = viper.GetBool("debug")
noExtensions = viper.GetBool("no-extensions")
maxSteps = viper.GetInt("max-steps")
streaming = viper.GetBool("stream")
return nil
}(); err != nil {
return nil, err
}
// ---- viperInitMu released — heavy I/O below runs concurrently ----
// Load MCP configuration. Use pre-loaded config if provided via CLI options.
var mcpConfig *config.Config
if opts.CLI != nil {
if opts.CLI != nil && opts.CLI.MCPConfig != nil {
mcpConfig = opts.CLI.MCPConfig
}
if mcpConfig == nil {
var err error
mcpConfig, err = config.LoadAndValidateConfig()
if err != nil {
return nil, fmt.Errorf("failed to load MCP config: %w", err)
@@ -601,12 +641,19 @@ func New(ctx context.Context, opts *Options) (*Kit, error) {
beforeCompact := newHookRegistry[BeforeCompactHook, BeforeCompactResult]()
// Build agent setup options, pulling CLI-specific fields when available.
// Pass the pre-built ProviderConfig and scalar viper snapshots so
// SetupAgent doesn't need to re-read viper (which would require the lock).
setupOpts := kitsetup.AgentSetupOptions{
MCPConfig: mcpConfig,
Quiet: opts.Quiet,
CoreTools: opts.Tools,
ExtraTools: opts.ExtraTools,
ToolWrapper: hookToolWrapper(beforeToolCall, afterToolResult),
MCPConfig: mcpConfig,
Quiet: opts.Quiet,
CoreTools: opts.Tools,
ExtraTools: opts.ExtraTools,
ToolWrapper: hookToolWrapper(beforeToolCall, afterToolResult),
ProviderConfig: providerConfig,
Debug: debug,
NoExtensions: noExtensions,
MaxSteps: maxSteps,
StreamingEnabled: streaming,
}
if opts.CLI != nil {
setupOpts.ShowSpinner = opts.CLI.ShowSpinner
@@ -630,7 +677,7 @@ func New(ctx context.Context, opts *Options) (*Kit, error) {
k := &Kit{
agent: agentResult.Agent,
treeSession: treeSession,
modelString: viper.GetString("model"),
modelString: modelString,
events: newEventBus(),
autoCompact: opts.AutoCompact,
compactionOpts: opts.CompactionOptions,