From ead4afbfe6d3c1924b4bf5dd2371ffdb61e5040a Mon Sep 17 00:00:00 2001
From: Ed Zynda <ezynda3@gmail.com>
Date: Thu, 2 Apr 2026 15:54:47 +0300
Subject: [PATCH] fix(subagent): prevent instant failure from already-dead
 parent contexts

- Replace detachedWithCancel (goroutine-based) with context.WithoutCancel
  + valuesContext; the old goroutine would fire immediately if the parent
  was already cancelled/deadline-exceeded, causing 'failed after 0s'
- Kit.Subagent() pre-flight: if the incoming ctx is already done, reset
  to context.Background() before applying the subagent timeout
- Both Subagent() error paths now return a non-nil *SubagentResult with
  Elapsed set, so the tool response always shows accurate timing
- Narrow viperInitMu scope in Kit.New(): snapshot viper state + call
  BuildProviderConfig under the lock, then release before SetupAgent /
  MCP loading; parallel subagent spawns no longer serialise on viper I/O
- AgentSetupOptions gains ProviderConfig + scalar fields so SetupAgent
  can skip viper reads when a pre-built config is supplied
- Add subagent_test.go covering the fixed context detachment behaviour
---
 internal/core/subagent.go      |  59 +++++-----
 internal/core/subagent_test.go | 115 ++++++++++++++++++++
 internal/kitsetup/setup.go     |  53 +++++++--
 pkg/kit/kit.go                 | 189 ++++++++++++++++++++-------------
 4 files changed, 305 insertions(+), 111 deletions(-)
 create mode 100644 internal/core/subagent_test.go

diff --git a/internal/core/subagent.go b/internal/core/subagent.go
index 2181cd8e..4f96ec2b 100644
--- a/internal/core/subagent.go
+++ b/internal/core/subagent.go
@@ -130,13 +130,22 @@ func executeSubagent(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolRe
 		), fmt.Errorf("no subagent spawner in context")
 	}
 
-	// Detach from the parent's deadline so the subagent gets its own
-	// independent timeout (applied downstream in Kit.Subagent). The parent
-	// context may carry a tight deadline from the LLM generation loop or
-	// other tool timeouts that would prematurely kill the subagent.
-	// We preserve context values (spawner, etc.) and propagate parent
-	// cancellation (e.g. user hits Ctrl-C) without inheriting the deadline.
-	spawnCtx := detachedWithCancel(ctx)
+	// Build a clean context for the subagent that inherits values (e.g. the
+	// spawner callback) but is completely detached from the parent's
+	// deadline AND cancellation. The subagent gets its own independent
+	// timeout (applied downstream in Kit.Subagent).
+	//
+	// Why full detachment instead of propagating parent cancellation?
+	// The parent context may already be done (deadline exceeded or
+	// cancelled) by the time this tool handler executes — for example when
+	// the generation loop context carries a deadline, when the user
+	// double-ESC cancels mid-turn, or when parallel tool execution
+	// encounters a race between stream completion and tool dispatch. Using
+	// context.WithoutCancel (Go 1.21+) ensures the subagent always starts
+	// cleanly with a fresh timeout, following the pattern used by crush for
+	// shutdown-resilient child work. The subagent's own timeout
+	// (defaultSubagentTimeout / user-specified) provides the safety net.
+	spawnCtx := context.WithoutCancel(valuesContext{parent: ctx})
 
 	// Spawn in-process subagent.
 	result, err := spawner(spawnCtx, call.ID, args.Task, args.Model, args.SystemPrompt, timeout)
@@ -173,37 +182,21 @@ func executeSubagent(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolRe
 }
 
 // ---------------------------------------------------------------------------
-// Context detachment
+// Context helpers
 // ---------------------------------------------------------------------------
 
-// detachedContext wraps a parent context, preserving its values but removing
-// its deadline and cancellation. This allows the subagent to have its own
-// independent timeout while still accessing context-stored values (e.g. the
-// subagent spawner function).
-type detachedContext struct {
+// valuesContext preserves a parent context's values (e.g. the subagent
+// spawner callback) while stripping its deadline and cancellation. Combined
+// with context.WithoutCancel() this gives the subagent a completely clean
+// context that only inherits value-based dependencies.
+type valuesContext struct {
 	parent context.Context
 }
 
-func (d detachedContext) Deadline() (time.Time, bool) { return time.Time{}, false }
-func (d detachedContext) Done() <-chan struct{}       { return nil }
-func (d detachedContext) Err() error                  { return nil }
-func (d detachedContext) Value(key any) any           { return d.parent.Value(key) }
-
-// detachedWithCancel creates a new context that inherits values from the
-// parent but has no deadline. Cancellation of the parent is propagated: when
-// the parent is cancelled the returned context is also cancelled, but the
-// parent's deadline does not apply to the child.
-func detachedWithCancel(parent context.Context) context.Context {
-	child, cancel := context.WithCancel(detachedContext{parent: parent})
-	go func() {
-		select {
-		case <-parent.Done():
-			cancel()
-		case <-child.Done():
-		}
-	}()
-	return child
-}
+func (v valuesContext) Deadline() (time.Time, bool) { return time.Time{}, false }
+func (v valuesContext) Done() <-chan struct{}       { return nil }
+func (v valuesContext) Err() error                  { return nil }
+func (v valuesContext) Value(key any) any           { return v.parent.Value(key) }
 
 // truncateResponse limits the response length to avoid overwhelming context windows.
 func truncateResponse(s string, maxLen int) string {
diff --git a/internal/core/subagent_test.go b/internal/core/subagent_test.go
new file mode 100644
index 00000000..3f1913a4
--- /dev/null
+++ b/internal/core/subagent_test.go
@@ -0,0 +1,115 @@
+package core
+
+import (
+	"context"
+	"testing"
+	"time"
+)
+
+func TestValuesContext_StripsDeadlineAndCancellation(t *testing.T) {
+	// Parent with a tight deadline.
+	parent, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
+	defer cancel()
+	time.Sleep(5 * time.Millisecond) // Let deadline expire.
+
+	if parent.Err() == nil {
+		t.Fatal("expected parent to be expired")
+	}
+
+	vc := valuesContext{parent: parent}
+
+	if _, ok := vc.Deadline(); ok {
+		t.Error("valuesContext should report no deadline")
+	}
+	if vc.Done() != nil {
+		t.Error("valuesContext.Done() should return nil")
+	}
+	if vc.Err() != nil {
+		t.Errorf("valuesContext.Err() should be nil, got %v", vc.Err())
+	}
+}
+
+func TestValuesContext_PreservesValues(t *testing.T) {
+	type testKey struct{}
+	parent := context.WithValue(context.Background(), testKey{}, "hello")
+
+	vc := valuesContext{parent: parent}
+
+	got, ok := vc.Value(testKey{}).(string)
+	if !ok || got != "hello" {
+		t.Errorf("expected value 'hello', got %q (ok=%v)", got, ok)
+	}
+}
+
+func TestSpawnContext_SurvivesCancelledParent(t *testing.T) {
+	// Simulate the exact scenario from the bug: the parent generation
+	// context is already cancelled when the subagent tool handler runs.
+	parent, cancel := context.WithCancel(context.Background())
+	cancel() // Cancelled before detach.
+
+	// This is what executeSubagent now does:
+	spawnCtx := context.WithoutCancel(valuesContext{parent: parent})
+
+	// The spawn context must be alive.
+	if spawnCtx.Err() != nil {
+		t.Fatalf("spawnCtx should be alive, got err: %v", spawnCtx.Err())
+	}
+
+	// Adding a timeout should produce a working context.
+	tCtx, tCancel := context.WithTimeout(spawnCtx, 5*time.Second)
+	defer tCancel()
+
+	if tCtx.Err() != nil {
+		t.Fatalf("timeout context should be alive, got err: %v", tCtx.Err())
+	}
+}
+
+func TestSpawnContext_SurvivesDeadlineExceededParent(t *testing.T) {
+	// Simulate: parent had a deadline that already expired.
+	parent, pCancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
+	defer pCancel()
+	time.Sleep(5 * time.Millisecond)
+
+	if parent.Err() != context.DeadlineExceeded {
+		t.Fatalf("expected parent deadline exceeded, got: %v", parent.Err())
+	}
+
+	spawnCtx := context.WithoutCancel(valuesContext{parent: parent})
+
+	if spawnCtx.Err() != nil {
+		t.Fatalf("spawnCtx should be alive after deadline-exceeded parent, got: %v", spawnCtx.Err())
+	}
+}
+
+func TestSpawnContext_PreservesSpawnerValue(t *testing.T) {
+	// Verify the subagent spawner callback survives context detachment.
+	called := false
+	spawner := SubagentSpawnFunc(func(ctx context.Context, toolCallID, prompt, model, systemPrompt string, timeout time.Duration) (*SubagentSpawnResult, error) {
+		called = true
+		return &SubagentSpawnResult{Response: "ok"}, nil
+	})
+
+	parent := WithSubagentSpawner(context.Background(), spawner)
+	// Cancel the parent.
+	parentCtx, cancel := context.WithCancel(parent)
+	cancel()
+
+	spawnCtx := context.WithoutCancel(valuesContext{parent: parentCtx})
+
+	// Should be able to retrieve the spawner from the detached context.
+	recovered := getSubagentSpawner(spawnCtx)
+	if recovered == nil {
+		t.Fatal("spawner should be recoverable from detached context")
+	}
+
+	result, err := recovered(spawnCtx, "tc1", "test task", "", "", time.Minute)
+	if err != nil {
+		t.Fatalf("spawner call failed: %v", err)
+	}
+	if !called {
+		t.Error("spawner was not called")
+	}
+	if result.Response != "ok" {
+		t.Errorf("expected 'ok', got %q", result.Response)
+	}
+}
diff --git a/internal/kitsetup/setup.go b/internal/kitsetup/setup.go
index 977be308..4d46e2f8 100644
--- a/internal/kitsetup/setup.go
+++ b/internal/kitsetup/setup.go
@@ -40,6 +40,24 @@ type AgentSetupOptions struct {
 	// wrapping. Used by the SDK hook system. Both wrappers compose:
 	// extension wrapper runs first (inner), then this wrapper (outer).
 	ToolWrapper func([]fantasy.AgentTool) []fantasy.AgentTool
+
+	// ProviderConfig, when non-nil, is used directly instead of calling
+	// BuildProviderConfig(). Callers that already hold viperInitMu can
+	// pre-build this and release the lock before calling SetupAgent, so the
+	// slow agent/MCP initialisation runs concurrently with other New() calls.
+	ProviderConfig *models.ProviderConfig
+	// Debug enables debug logging. When zero-value, viper is consulted.
+	// Only meaningful when ProviderConfig is also set.
+	Debug bool
+	// NoExtensions skips extension loading. When false, viper is consulted.
+	// Only meaningful when ProviderConfig is also set.
+	NoExtensions bool
+	// MaxSteps overrides the agent step limit. 0 means use viper value.
+	// Only meaningful when ProviderConfig is also set.
+	MaxSteps int
+	// StreamingEnabled controls streaming. Only meaningful when ProviderConfig
+	// is also set.
+	StreamingEnabled bool
 }
 
 // AgentSetupResult bundles the created agent and any debug logger so the caller
@@ -88,15 +106,36 @@ func BuildProviderConfig() (*models.ProviderConfig, string, error) {
 // SetupAgent creates an agent from the current viper state + the provided
 // options. It wraps BuildProviderConfig and agent.CreateAgent.
 func SetupAgent(ctx context.Context, opts AgentSetupOptions) (*AgentSetupResult, error) {
-	modelConfig, systemPrompt, err := BuildProviderConfig()
-	if err != nil {
-		return nil, err
+	var modelConfig *models.ProviderConfig
+	var systemPrompt string
+
+	if opts.ProviderConfig != nil {
+		// Pre-built config supplied by caller (e.g. Kit.New after releasing
+		// viperInitMu). Use it directly — no viper reads needed here.
+		modelConfig = opts.ProviderConfig
+		systemPrompt = modelConfig.SystemPrompt
+	} else {
+		var err error
+		modelConfig, systemPrompt, err = BuildProviderConfig()
+		if err != nil {
+			return nil, err
+		}
 	}
 
+	// Resolve debug / no-extensions / max-steps / streaming: prefer explicit
+	// fields (set when ProviderConfig was pre-built) over viper fallback.
+	debugEnabled := opts.Debug || viper.GetBool("debug")
+	noExtensions := opts.NoExtensions || viper.GetBool("no-extensions")
+	maxSteps := opts.MaxSteps
+	if maxSteps == 0 {
+		maxSteps = viper.GetInt("max-steps")
+	}
+	streamingEnabled := opts.StreamingEnabled || viper.GetBool("stream")
+
 	// Create the appropriate debug logger.
 	var debugLogger tools.DebugLogger
 	var bufferedLogger *tools.BufferedDebugLogger
-	if viper.GetBool("debug") {
+	if debugEnabled {
 		if opts.UseBufferedLogger {
 			bufferedLogger = tools.NewBufferedDebugLogger(true)
 			debugLogger = bufferedLogger
@@ -108,7 +147,7 @@ func SetupAgent(ctx context.Context, opts AgentSetupOptions) (*AgentSetupResult,
 	// Load extensions unless --no-extensions is set.
 	var extRunner *extensions.Runner
 	var extCreationOpts extensionCreationOpts
-	if !viper.GetBool("no-extensions") {
+	if !noExtensions {
 		var extErr error
 		extRunner, extCreationOpts, extErr = loadExtensions()
 		if extErr != nil {
@@ -140,8 +179,8 @@ func SetupAgent(ctx context.Context, opts AgentSetupOptions) (*AgentSetupResult,
 		ModelConfig:      modelConfig,
 		MCPConfig:        opts.MCPConfig,
 		SystemPrompt:     systemPrompt,
-		MaxSteps:         viper.GetInt("max-steps"),
-		StreamingEnabled: viper.GetBool("stream"),
+		MaxSteps:         maxSteps,
+		StreamingEnabled: streamingEnabled,
 		ShowSpinner:      opts.ShowSpinner,
 		Quiet:            opts.Quiet,
 		SpinnerFunc:      opts.SpinnerFunc,
diff --git a/pkg/kit/kit.go b/pkg/kit/kit.go
index 625f9b48..bd16dd70 100644
--- a/pkg/kit/kit.go
+++ b/pkg/kit/kit.go
@@ -505,85 +505,125 @@ func New(ctx context.Context, opts *Options) (*Kit, error) {
 		opts = &Options{}
 	}
 
-	viperInitMu.Lock()
-	defer viperInitMu.Unlock()
+	// All viper writes (SetSDKDefaults, InitConfig, Set calls, system-prompt
+	// composition) happen under viperInitMu. We also call BuildProviderConfig
+	// here — it's fast (just reads) — so we can capture the full config
+	// snapshot before releasing the lock. The expensive work (MCP loading,
+	// provider creation, session init) then runs outside the lock, allowing
+	// parallel subagent spawns to proceed concurrently.
+	var (
+		providerConfig *models.ProviderConfig
+		modelString    string
+		cwd            string
+		contextFiles   []*ContextFile
+		loadedSkills   []*Skill
+		mcpConfig      *config.Config
+		debug          bool
+		noExtensions   bool
+		maxSteps       int
+		streaming      bool
+	)
 
-	// Set CLI-equivalent defaults for viper. When used as an SDK (without
-	// cobra), these defaults are not registered via flag bindings.
-	setSDKDefaults()
+	if err := func() error {
+		viperInitMu.Lock()
+		defer viperInitMu.Unlock()
 
-	// Initialize config (loads config files and env vars).
-	// Only initialize if not already done (e.g., by CLI's cobra.OnInitialize).
-	// Check if model is already set, which indicates config was loaded.
-	if viper.GetString("model") == "" {
-		if err := InitConfig(opts.ConfigFile, false); err != nil {
-			return nil, fmt.Errorf("failed to initialize config: %w", err)
-		}
-	}
+		// Set CLI-equivalent defaults for viper. When used as an SDK (without
+		// cobra), these defaults are not registered via flag bindings.
+		setSDKDefaults()
 
-	// Handle CLI debug mode.
-	if opts.Debug {
-		viper.Set("debug", true)
-	}
-
-	// Override viper settings with options.
-	if opts.Model != "" {
-		viper.Set("model", opts.Model)
-	}
-	if opts.SystemPrompt != "" {
-		viper.Set("system-prompt", opts.SystemPrompt)
-	}
-	if opts.MaxSteps > 0 {
-		viper.Set("max-steps", opts.MaxSteps)
-	}
-	viper.Set("stream", opts.Streaming)
-
-	// Resolve working directory for context/skill discovery.
-	cwd := opts.SessionDir
-	if cwd == "" {
-		cwd, _ = os.Getwd()
-	}
-
-	// Load context files (AGENTS.md) from the project root.
-	contextFiles := loadContextFiles(cwd)
-
-	// Load skills — either from explicit paths or via auto-discovery.
-	loadedSkills, err := loadSkills(opts)
-	if err != nil {
-		return nil, fmt.Errorf("failed to load skills: %w", err)
-	}
-
-	// Always compose the system prompt with runtime context: base prompt +
-	// AGENTS.md context + skills metadata + date/cwd.
-	{
-		basePrompt := viper.GetString("system-prompt")
-		pb := skills.NewPromptBuilder(basePrompt)
-
-		// Inject AGENTS.md content as project context.
-		for _, cf := range contextFiles {
-			pb.WithSection("", fmt.Sprintf("Instructions from: %s\n\n%s", cf.Path, cf.Content))
+		// Initialize config (loads config files and env vars).
+		// Only initialize if not already done (e.g., by CLI's cobra.OnInitialize).
+		// Check if model is already set, which indicates config was loaded.
+		if viper.GetString("model") == "" {
+			if err := InitConfig(opts.ConfigFile, false); err != nil {
+				return fmt.Errorf("failed to initialize config: %w", err)
+			}
 		}
 
-		// Inject skills metadata (name + description + location).
-		if len(loadedSkills) > 0 {
-			pb.WithSkills(loadedSkills)
+		// Handle CLI debug mode.
+		if opts.Debug {
+			viper.Set("debug", true)
 		}
 
-		// Append current date/time and working directory.
-		pb.WithSection("", fmt.Sprintf(
-			"Current date and time: %s\nCurrent working directory: %s",
-			time.Now().Format("Monday, January 2, 2006, 3:04:05 PM MST"), cwd,
-		))
+		// Override viper settings with options.
+		if opts.Model != "" {
+			viper.Set("model", opts.Model)
+		}
+		if opts.SystemPrompt != "" {
+			viper.Set("system-prompt", opts.SystemPrompt)
+		}
+		if opts.MaxSteps > 0 {
+			viper.Set("max-steps", opts.MaxSteps)
+		}
+		viper.Set("stream", opts.Streaming)
 
-		viper.Set("system-prompt", pb.Build())
+		// Resolve working directory for context/skill discovery.
+		cwd = opts.SessionDir
+		if cwd == "" {
+			cwd, _ = os.Getwd()
+		}
+
+		// Load context files (AGENTS.md) from the project root.
+		contextFiles = loadContextFiles(cwd)
+
+		// Load skills — either from explicit paths or via auto-discovery.
+		var err error
+		loadedSkills, err = loadSkills(opts)
+		if err != nil {
+			return fmt.Errorf("failed to load skills: %w", err)
+		}
+
+		// Always compose the system prompt with runtime context: base prompt +
+		// AGENTS.md context + skills metadata + date/cwd.
+		{
+			basePrompt := viper.GetString("system-prompt")
+			pb := skills.NewPromptBuilder(basePrompt)
+
+			// Inject AGENTS.md content as project context.
+			for _, cf := range contextFiles {
+				pb.WithSection("", fmt.Sprintf("Instructions from: %s\n\n%s", cf.Path, cf.Content))
+			}
+
+			// Inject skills metadata (name + description + location).
+			if len(loadedSkills) > 0 {
+				pb.WithSkills(loadedSkills)
+			}
+
+			// Append current date/time and working directory.
+			pb.WithSection("", fmt.Sprintf(
+				"Current date and time: %s\nCurrent working directory: %s",
+				time.Now().Format("Monday, January 2, 2006, 3:04:05 PM MST"), cwd,
+			))
+
+			viper.Set("system-prompt", pb.Build())
+		}
+
+		// Snapshot all viper-derived values now, while the lock is held.
+		// BuildProviderConfig is fast (pure reads), so we do it here.
+		var pcErr error
+		providerConfig, _, pcErr = kitsetup.BuildProviderConfig()
+		if pcErr != nil {
+			return fmt.Errorf("failed to build provider config: %w", pcErr)
+		}
+		modelString = viper.GetString("model")
+		debug = viper.GetBool("debug")
+		noExtensions = viper.GetBool("no-extensions")
+		maxSteps = viper.GetInt("max-steps")
+		streaming = viper.GetBool("stream")
+
+		return nil
+	}(); err != nil {
+		return nil, err
 	}
+	// ---- viperInitMu released — heavy I/O below runs concurrently ----
 
 	// Load MCP configuration. Use pre-loaded config if provided via CLI options.
-	var mcpConfig *config.Config
-	if opts.CLI != nil {
+	if opts.CLI != nil && opts.CLI.MCPConfig != nil {
 		mcpConfig = opts.CLI.MCPConfig
 	}
 	if mcpConfig == nil {
+		var err error
 		mcpConfig, err = config.LoadAndValidateConfig()
 		if err != nil {
 			return nil, fmt.Errorf("failed to load MCP config: %w", err)
@@ -601,12 +641,19 @@ func New(ctx context.Context, opts *Options) (*Kit, error) {
 	beforeCompact := newHookRegistry[BeforeCompactHook, BeforeCompactResult]()
 
 	// Build agent setup options, pulling CLI-specific fields when available.
+	// Pass the pre-built ProviderConfig and scalar viper snapshots so
+	// SetupAgent doesn't need to re-read viper (which would require the lock).
 	setupOpts := kitsetup.AgentSetupOptions{
-		MCPConfig:   mcpConfig,
-		Quiet:       opts.Quiet,
-		CoreTools:   opts.Tools,
-		ExtraTools:  opts.ExtraTools,
-		ToolWrapper: hookToolWrapper(beforeToolCall, afterToolResult),
+		MCPConfig:        mcpConfig,
+		Quiet:            opts.Quiet,
+		CoreTools:        opts.Tools,
+		ExtraTools:       opts.ExtraTools,
+		ToolWrapper:      hookToolWrapper(beforeToolCall, afterToolResult),
+		ProviderConfig:   providerConfig,
+		Debug:            debug,
+		NoExtensions:     noExtensions,
+		MaxSteps:         maxSteps,
+		StreamingEnabled: streaming,
 	}
 	if opts.CLI != nil {
 		setupOpts.ShowSpinner = opts.CLI.ShowSpinner
@@ -630,7 +677,7 @@ func New(ctx context.Context, opts *Options) (*Kit, error) {
 	k := &Kit{
 		agent:           agentResult.Agent,
 		treeSession:     treeSession,
-		modelString:     viper.GetString("model"),
+		modelString:     modelString,
 		events:          newEventBus(),
 		autoCompact:     opts.AutoCompact,
 		compactionOpts:  opts.CompactionOptions,