From ead4afbfe6d3c1924b4bf5dd2371ffdb61e5040a Mon Sep 17 00:00:00 2001 From: Ed Zynda Date: Thu, 2 Apr 2026 15:54:47 +0300 Subject: [PATCH] fix(subagent): prevent instant failure from already-dead parent contexts - Replace detachedWithCancel (goroutine-based) with context.WithoutCancel + valuesContext; the old goroutine would fire immediately if the parent was already cancelled/deadline-exceeded, causing 'failed after 0s' - Kit.Subagent() pre-flight: if the incoming ctx is already done, reset to context.Background() before applying the subagent timeout - Both Subagent() error paths now return a non-nil *SubagentResult with Elapsed set, so the tool response always shows accurate timing - Narrow viperInitMu scope in Kit.New(): snapshot viper state + call BuildProviderConfig under the lock, then release before SetupAgent / MCP loading; parallel subagent spawns no longer serialise on viper I/O - AgentSetupOptions gains ProviderConfig + scalar fields so SetupAgent can skip viper reads when a pre-built config is supplied - Add subagent_test.go covering the fixed context detachment behaviour --- internal/core/subagent.go | 59 +++++----- internal/core/subagent_test.go | 115 ++++++++++++++++++++ internal/kitsetup/setup.go | 53 +++++++-- pkg/kit/kit.go | 189 ++++++++++++++++++++------------- 4 files changed, 305 insertions(+), 111 deletions(-) create mode 100644 internal/core/subagent_test.go diff --git a/internal/core/subagent.go b/internal/core/subagent.go index 2181cd8e..4f96ec2b 100644 --- a/internal/core/subagent.go +++ b/internal/core/subagent.go @@ -130,13 +130,22 @@ func executeSubagent(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolRe ), fmt.Errorf("no subagent spawner in context") } - // Detach from the parent's deadline so the subagent gets its own - // independent timeout (applied downstream in Kit.Subagent). The parent - // context may carry a tight deadline from the LLM generation loop or - // other tool timeouts that would prematurely kill the subagent. - // We preserve context values (spawner, etc.) and propagate parent - // cancellation (e.g. user hits Ctrl-C) without inheriting the deadline. - spawnCtx := detachedWithCancel(ctx) + // Build a clean context for the subagent that inherits values (e.g. the + // spawner callback) but is completely detached from the parent's + // deadline AND cancellation. The subagent gets its own independent + // timeout (applied downstream in Kit.Subagent). + // + // Why full detachment instead of propagating parent cancellation? + // The parent context may already be done (deadline exceeded or + // cancelled) by the time this tool handler executes — for example when + // the generation loop context carries a deadline, when the user + // double-ESC cancels mid-turn, or when parallel tool execution + // encounters a race between stream completion and tool dispatch. Using + // context.WithoutCancel (Go 1.21+) ensures the subagent always starts + // cleanly with a fresh timeout, following the pattern used by crush for + // shutdown-resilient child work. The subagent's own timeout + // (defaultSubagentTimeout / user-specified) provides the safety net. + spawnCtx := context.WithoutCancel(valuesContext{parent: ctx}) // Spawn in-process subagent. result, err := spawner(spawnCtx, call.ID, args.Task, args.Model, args.SystemPrompt, timeout) @@ -173,37 +182,21 @@ func executeSubagent(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolRe } // --------------------------------------------------------------------------- -// Context detachment +// Context helpers // --------------------------------------------------------------------------- -// detachedContext wraps a parent context, preserving its values but removing -// its deadline and cancellation. This allows the subagent to have its own -// independent timeout while still accessing context-stored values (e.g. the -// subagent spawner function). -type detachedContext struct { +// valuesContext preserves a parent context's values (e.g. the subagent +// spawner callback) while stripping its deadline and cancellation. Combined +// with context.WithoutCancel() this gives the subagent a completely clean +// context that only inherits value-based dependencies. +type valuesContext struct { parent context.Context } -func (d detachedContext) Deadline() (time.Time, bool) { return time.Time{}, false } -func (d detachedContext) Done() <-chan struct{} { return nil } -func (d detachedContext) Err() error { return nil } -func (d detachedContext) Value(key any) any { return d.parent.Value(key) } - -// detachedWithCancel creates a new context that inherits values from the -// parent but has no deadline. Cancellation of the parent is propagated: when -// the parent is cancelled the returned context is also cancelled, but the -// parent's deadline does not apply to the child. -func detachedWithCancel(parent context.Context) context.Context { - child, cancel := context.WithCancel(detachedContext{parent: parent}) - go func() { - select { - case <-parent.Done(): - cancel() - case <-child.Done(): - } - }() - return child -} +func (v valuesContext) Deadline() (time.Time, bool) { return time.Time{}, false } +func (v valuesContext) Done() <-chan struct{} { return nil } +func (v valuesContext) Err() error { return nil } +func (v valuesContext) Value(key any) any { return v.parent.Value(key) } // truncateResponse limits the response length to avoid overwhelming context windows. func truncateResponse(s string, maxLen int) string { diff --git a/internal/core/subagent_test.go b/internal/core/subagent_test.go new file mode 100644 index 00000000..3f1913a4 --- /dev/null +++ b/internal/core/subagent_test.go @@ -0,0 +1,115 @@ +package core + +import ( + "context" + "testing" + "time" +) + +func TestValuesContext_StripsDeadlineAndCancellation(t *testing.T) { + // Parent with a tight deadline. + parent, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond) + defer cancel() + time.Sleep(5 * time.Millisecond) // Let deadline expire. + + if parent.Err() == nil { + t.Fatal("expected parent to be expired") + } + + vc := valuesContext{parent: parent} + + if _, ok := vc.Deadline(); ok { + t.Error("valuesContext should report no deadline") + } + if vc.Done() != nil { + t.Error("valuesContext.Done() should return nil") + } + if vc.Err() != nil { + t.Errorf("valuesContext.Err() should be nil, got %v", vc.Err()) + } +} + +func TestValuesContext_PreservesValues(t *testing.T) { + type testKey struct{} + parent := context.WithValue(context.Background(), testKey{}, "hello") + + vc := valuesContext{parent: parent} + + got, ok := vc.Value(testKey{}).(string) + if !ok || got != "hello" { + t.Errorf("expected value 'hello', got %q (ok=%v)", got, ok) + } +} + +func TestSpawnContext_SurvivesCancelledParent(t *testing.T) { + // Simulate the exact scenario from the bug: the parent generation + // context is already cancelled when the subagent tool handler runs. + parent, cancel := context.WithCancel(context.Background()) + cancel() // Cancelled before detach. + + // This is what executeSubagent now does: + spawnCtx := context.WithoutCancel(valuesContext{parent: parent}) + + // The spawn context must be alive. + if spawnCtx.Err() != nil { + t.Fatalf("spawnCtx should be alive, got err: %v", spawnCtx.Err()) + } + + // Adding a timeout should produce a working context. + tCtx, tCancel := context.WithTimeout(spawnCtx, 5*time.Second) + defer tCancel() + + if tCtx.Err() != nil { + t.Fatalf("timeout context should be alive, got err: %v", tCtx.Err()) + } +} + +func TestSpawnContext_SurvivesDeadlineExceededParent(t *testing.T) { + // Simulate: parent had a deadline that already expired. + parent, pCancel := context.WithTimeout(context.Background(), 1*time.Millisecond) + defer pCancel() + time.Sleep(5 * time.Millisecond) + + if parent.Err() != context.DeadlineExceeded { + t.Fatalf("expected parent deadline exceeded, got: %v", parent.Err()) + } + + spawnCtx := context.WithoutCancel(valuesContext{parent: parent}) + + if spawnCtx.Err() != nil { + t.Fatalf("spawnCtx should be alive after deadline-exceeded parent, got: %v", spawnCtx.Err()) + } +} + +func TestSpawnContext_PreservesSpawnerValue(t *testing.T) { + // Verify the subagent spawner callback survives context detachment. + called := false + spawner := SubagentSpawnFunc(func(ctx context.Context, toolCallID, prompt, model, systemPrompt string, timeout time.Duration) (*SubagentSpawnResult, error) { + called = true + return &SubagentSpawnResult{Response: "ok"}, nil + }) + + parent := WithSubagentSpawner(context.Background(), spawner) + // Cancel the parent. + parentCtx, cancel := context.WithCancel(parent) + cancel() + + spawnCtx := context.WithoutCancel(valuesContext{parent: parentCtx}) + + // Should be able to retrieve the spawner from the detached context. + recovered := getSubagentSpawner(spawnCtx) + if recovered == nil { + t.Fatal("spawner should be recoverable from detached context") + } + + result, err := recovered(spawnCtx, "tc1", "test task", "", "", time.Minute) + if err != nil { + t.Fatalf("spawner call failed: %v", err) + } + if !called { + t.Error("spawner was not called") + } + if result.Response != "ok" { + t.Errorf("expected 'ok', got %q", result.Response) + } +} diff --git a/internal/kitsetup/setup.go b/internal/kitsetup/setup.go index 977be308..4d46e2f8 100644 --- a/internal/kitsetup/setup.go +++ b/internal/kitsetup/setup.go @@ -40,6 +40,24 @@ type AgentSetupOptions struct { // wrapping. Used by the SDK hook system. Both wrappers compose: // extension wrapper runs first (inner), then this wrapper (outer). ToolWrapper func([]fantasy.AgentTool) []fantasy.AgentTool + + // ProviderConfig, when non-nil, is used directly instead of calling + // BuildProviderConfig(). Callers that already hold viperInitMu can + // pre-build this and release the lock before calling SetupAgent, so the + // slow agent/MCP initialisation runs concurrently with other New() calls. + ProviderConfig *models.ProviderConfig + // Debug enables debug logging. When zero-value, viper is consulted. + // Only meaningful when ProviderConfig is also set. + Debug bool + // NoExtensions skips extension loading. When false, viper is consulted. + // Only meaningful when ProviderConfig is also set. + NoExtensions bool + // MaxSteps overrides the agent step limit. 0 means use viper value. + // Only meaningful when ProviderConfig is also set. + MaxSteps int + // StreamingEnabled controls streaming. Only meaningful when ProviderConfig + // is also set. + StreamingEnabled bool } // AgentSetupResult bundles the created agent and any debug logger so the caller @@ -88,15 +106,36 @@ func BuildProviderConfig() (*models.ProviderConfig, string, error) { // SetupAgent creates an agent from the current viper state + the provided // options. It wraps BuildProviderConfig and agent.CreateAgent. func SetupAgent(ctx context.Context, opts AgentSetupOptions) (*AgentSetupResult, error) { - modelConfig, systemPrompt, err := BuildProviderConfig() - if err != nil { - return nil, err + var modelConfig *models.ProviderConfig + var systemPrompt string + + if opts.ProviderConfig != nil { + // Pre-built config supplied by caller (e.g. Kit.New after releasing + // viperInitMu). Use it directly — no viper reads needed here. + modelConfig = opts.ProviderConfig + systemPrompt = modelConfig.SystemPrompt + } else { + var err error + modelConfig, systemPrompt, err = BuildProviderConfig() + if err != nil { + return nil, err + } } + // Resolve debug / no-extensions / max-steps / streaming: prefer explicit + // fields (set when ProviderConfig was pre-built) over viper fallback. + debugEnabled := opts.Debug || viper.GetBool("debug") + noExtensions := opts.NoExtensions || viper.GetBool("no-extensions") + maxSteps := opts.MaxSteps + if maxSteps == 0 { + maxSteps = viper.GetInt("max-steps") + } + streamingEnabled := opts.StreamingEnabled || viper.GetBool("stream") + // Create the appropriate debug logger. var debugLogger tools.DebugLogger var bufferedLogger *tools.BufferedDebugLogger - if viper.GetBool("debug") { + if debugEnabled { if opts.UseBufferedLogger { bufferedLogger = tools.NewBufferedDebugLogger(true) debugLogger = bufferedLogger @@ -108,7 +147,7 @@ func SetupAgent(ctx context.Context, opts AgentSetupOptions) (*AgentSetupResult, // Load extensions unless --no-extensions is set. var extRunner *extensions.Runner var extCreationOpts extensionCreationOpts - if !viper.GetBool("no-extensions") { + if !noExtensions { var extErr error extRunner, extCreationOpts, extErr = loadExtensions() if extErr != nil { @@ -140,8 +179,8 @@ func SetupAgent(ctx context.Context, opts AgentSetupOptions) (*AgentSetupResult, ModelConfig: modelConfig, MCPConfig: opts.MCPConfig, SystemPrompt: systemPrompt, - MaxSteps: viper.GetInt("max-steps"), - StreamingEnabled: viper.GetBool("stream"), + MaxSteps: maxSteps, + StreamingEnabled: streamingEnabled, ShowSpinner: opts.ShowSpinner, Quiet: opts.Quiet, SpinnerFunc: opts.SpinnerFunc, diff --git a/pkg/kit/kit.go b/pkg/kit/kit.go index 625f9b48..bd16dd70 100644 --- a/pkg/kit/kit.go +++ b/pkg/kit/kit.go @@ -505,85 +505,125 @@ func New(ctx context.Context, opts *Options) (*Kit, error) { opts = &Options{} } - viperInitMu.Lock() - defer viperInitMu.Unlock() + // All viper writes (SetSDKDefaults, InitConfig, Set calls, system-prompt + // composition) happen under viperInitMu. We also call BuildProviderConfig + // here — it's fast (just reads) — so we can capture the full config + // snapshot before releasing the lock. The expensive work (MCP loading, + // provider creation, session init) then runs outside the lock, allowing + // parallel subagent spawns to proceed concurrently. + var ( + providerConfig *models.ProviderConfig + modelString string + cwd string + contextFiles []*ContextFile + loadedSkills []*Skill + mcpConfig *config.Config + debug bool + noExtensions bool + maxSteps int + streaming bool + ) - // Set CLI-equivalent defaults for viper. When used as an SDK (without - // cobra), these defaults are not registered via flag bindings. - setSDKDefaults() + if err := func() error { + viperInitMu.Lock() + defer viperInitMu.Unlock() - // Initialize config (loads config files and env vars). - // Only initialize if not already done (e.g., by CLI's cobra.OnInitialize). - // Check if model is already set, which indicates config was loaded. - if viper.GetString("model") == "" { - if err := InitConfig(opts.ConfigFile, false); err != nil { - return nil, fmt.Errorf("failed to initialize config: %w", err) - } - } + // Set CLI-equivalent defaults for viper. When used as an SDK (without + // cobra), these defaults are not registered via flag bindings. + setSDKDefaults() - // Handle CLI debug mode. - if opts.Debug { - viper.Set("debug", true) - } - - // Override viper settings with options. - if opts.Model != "" { - viper.Set("model", opts.Model) - } - if opts.SystemPrompt != "" { - viper.Set("system-prompt", opts.SystemPrompt) - } - if opts.MaxSteps > 0 { - viper.Set("max-steps", opts.MaxSteps) - } - viper.Set("stream", opts.Streaming) - - // Resolve working directory for context/skill discovery. - cwd := opts.SessionDir - if cwd == "" { - cwd, _ = os.Getwd() - } - - // Load context files (AGENTS.md) from the project root. - contextFiles := loadContextFiles(cwd) - - // Load skills — either from explicit paths or via auto-discovery. - loadedSkills, err := loadSkills(opts) - if err != nil { - return nil, fmt.Errorf("failed to load skills: %w", err) - } - - // Always compose the system prompt with runtime context: base prompt + - // AGENTS.md context + skills metadata + date/cwd. - { - basePrompt := viper.GetString("system-prompt") - pb := skills.NewPromptBuilder(basePrompt) - - // Inject AGENTS.md content as project context. - for _, cf := range contextFiles { - pb.WithSection("", fmt.Sprintf("Instructions from: %s\n\n%s", cf.Path, cf.Content)) + // Initialize config (loads config files and env vars). + // Only initialize if not already done (e.g., by CLI's cobra.OnInitialize). + // Check if model is already set, which indicates config was loaded. + if viper.GetString("model") == "" { + if err := InitConfig(opts.ConfigFile, false); err != nil { + return fmt.Errorf("failed to initialize config: %w", err) + } } - // Inject skills metadata (name + description + location). - if len(loadedSkills) > 0 { - pb.WithSkills(loadedSkills) + // Handle CLI debug mode. + if opts.Debug { + viper.Set("debug", true) } - // Append current date/time and working directory. - pb.WithSection("", fmt.Sprintf( - "Current date and time: %s\nCurrent working directory: %s", - time.Now().Format("Monday, January 2, 2006, 3:04:05 PM MST"), cwd, - )) + // Override viper settings with options. + if opts.Model != "" { + viper.Set("model", opts.Model) + } + if opts.SystemPrompt != "" { + viper.Set("system-prompt", opts.SystemPrompt) + } + if opts.MaxSteps > 0 { + viper.Set("max-steps", opts.MaxSteps) + } + viper.Set("stream", opts.Streaming) - viper.Set("system-prompt", pb.Build()) + // Resolve working directory for context/skill discovery. + cwd = opts.SessionDir + if cwd == "" { + cwd, _ = os.Getwd() + } + + // Load context files (AGENTS.md) from the project root. + contextFiles = loadContextFiles(cwd) + + // Load skills — either from explicit paths or via auto-discovery. + var err error + loadedSkills, err = loadSkills(opts) + if err != nil { + return fmt.Errorf("failed to load skills: %w", err) + } + + // Always compose the system prompt with runtime context: base prompt + + // AGENTS.md context + skills metadata + date/cwd. + { + basePrompt := viper.GetString("system-prompt") + pb := skills.NewPromptBuilder(basePrompt) + + // Inject AGENTS.md content as project context. + for _, cf := range contextFiles { + pb.WithSection("", fmt.Sprintf("Instructions from: %s\n\n%s", cf.Path, cf.Content)) + } + + // Inject skills metadata (name + description + location). + if len(loadedSkills) > 0 { + pb.WithSkills(loadedSkills) + } + + // Append current date/time and working directory. + pb.WithSection("", fmt.Sprintf( + "Current date and time: %s\nCurrent working directory: %s", + time.Now().Format("Monday, January 2, 2006, 3:04:05 PM MST"), cwd, + )) + + viper.Set("system-prompt", pb.Build()) + } + + // Snapshot all viper-derived values now, while the lock is held. + // BuildProviderConfig is fast (pure reads), so we do it here. + var pcErr error + providerConfig, _, pcErr = kitsetup.BuildProviderConfig() + if pcErr != nil { + return fmt.Errorf("failed to build provider config: %w", pcErr) + } + modelString = viper.GetString("model") + debug = viper.GetBool("debug") + noExtensions = viper.GetBool("no-extensions") + maxSteps = viper.GetInt("max-steps") + streaming = viper.GetBool("stream") + + return nil + }(); err != nil { + return nil, err } + // ---- viperInitMu released — heavy I/O below runs concurrently ---- // Load MCP configuration. Use pre-loaded config if provided via CLI options. - var mcpConfig *config.Config - if opts.CLI != nil { + if opts.CLI != nil && opts.CLI.MCPConfig != nil { mcpConfig = opts.CLI.MCPConfig } if mcpConfig == nil { + var err error mcpConfig, err = config.LoadAndValidateConfig() if err != nil { return nil, fmt.Errorf("failed to load MCP config: %w", err) @@ -601,12 +641,19 @@ func New(ctx context.Context, opts *Options) (*Kit, error) { beforeCompact := newHookRegistry[BeforeCompactHook, BeforeCompactResult]() // Build agent setup options, pulling CLI-specific fields when available. + // Pass the pre-built ProviderConfig and scalar viper snapshots so + // SetupAgent doesn't need to re-read viper (which would require the lock). setupOpts := kitsetup.AgentSetupOptions{ - MCPConfig: mcpConfig, - Quiet: opts.Quiet, - CoreTools: opts.Tools, - ExtraTools: opts.ExtraTools, - ToolWrapper: hookToolWrapper(beforeToolCall, afterToolResult), + MCPConfig: mcpConfig, + Quiet: opts.Quiet, + CoreTools: opts.Tools, + ExtraTools: opts.ExtraTools, + ToolWrapper: hookToolWrapper(beforeToolCall, afterToolResult), + ProviderConfig: providerConfig, + Debug: debug, + NoExtensions: noExtensions, + MaxSteps: maxSteps, + StreamingEnabled: streaming, } if opts.CLI != nil { setupOpts.ShowSpinner = opts.CLI.ShowSpinner @@ -630,7 +677,7 @@ func New(ctx context.Context, opts *Options) (*Kit, error) { k := &Kit{ agent: agentResult.Agent, treeSession: treeSession, - modelString: viper.GetString("model"), + modelString: modelString, events: newEventBus(), autoCompact: opts.AutoCompact, compactionOpts: opts.CompactionOptions,