fix(extensions): serialize handler calls per-extension to prevent data races

- Add per-extension reentrant mutex to Runner that serializes handler
  invocations from concurrent goroutines (e.g. parallel subagent events)
  while allowing re-entrant calls (handler → EmitCustomEvent → handler)
- Fix subagent-monitor slice aliasing bug: submonEntries[:0] reuses the
  backing array, corrupting entries during in-place filtering
- Pass parent's loaded MCPConfig to child subagents in Kit.Subagent(),
  eliminating concurrent viper map access during parallel kit.New() calls
- Add Options.MCPConfig field so SDK consumers can also skip viper reads
- Add tests for concurrent emit, cross-extension concurrency, and
  re-entrant EmitCustomEvent
This commit is contained in:
Ed Zynda
2026-04-16 12:11:10 +03:00
parent 7e6455732c
commit fcaa52bf1c
5 changed files with 307 additions and 9 deletions
+6 -3
View File
@@ -13,6 +13,8 @@
// - No channels in maps (Yaegi panics on range over map[string]chan)
// - All ctx.* calls guarded with nil checks
// - Simple data structures only
// - The extension runner serializes handler calls per-extension, so
// concurrent subagent events cannot race on this shared state.
package main
import (
@@ -43,7 +45,8 @@ const (
)
// ---------------------------------------------------------------------------
// Package-level state - all simple types
// Package-level state — safe because the runner serializes all handler
// invocations for the same extension (per-extension reentrant mutex).
// ---------------------------------------------------------------------------
var (
@@ -282,8 +285,8 @@ func Init(api ext.API) {
submonPushWidget()
// Remove the entry immediately (no goroutine to avoid races)
newEntries := submonEntries[:0]
// Remove the entry — build a new slice to avoid aliasing bugs
newEntries := make([]*submonEntry, 0, len(submonEntries))
for _, en := range submonEntries {
if en.callID != e.ToolCallID {
newEntries = append(newEntries, en)
@@ -130,6 +130,58 @@ func TestSubagentMonitor_MultipleSubagents(t *testing.T) {
time.Sleep(100 * time.Millisecond)
}
// TestSubagentMonitor_ConcurrentSubagents verifies no panics when multiple
// subagents emit events concurrently from different goroutines.
func TestSubagentMonitor_ConcurrentSubagents(t *testing.T) {
harness := test.New(t)
harness.LoadFile("../../.kit/extensions/subagent-monitor.go")
_, err := harness.Emit(extensions.SessionStartEvent{SessionID: "test-session"})
if err != nil {
t.Fatalf("SessionStart should not error: %v", err)
}
// Start 5 subagents concurrently
done := make(chan struct{}, 5)
for i := range 5 {
go func(idx int) {
defer func() { done <- struct{}{} }()
callID := fmt.Sprintf("concurrent-%d", idx)
task := fmt.Sprintf("concurrent task %d", idx)
_, _ = harness.Emit(extensions.SubagentStartEvent{
ToolCallID: callID,
Task: task,
})
// Emit many chunks rapidly
for j := range 20 {
_, _ = harness.Emit(extensions.SubagentChunkEvent{
ToolCallID: callID,
Task: task,
ChunkType: "text",
Content: fmt.Sprintf("agent %d chunk %d", idx, j),
})
}
_, _ = harness.Emit(extensions.SubagentEndEvent{
ToolCallID: callID,
Task: task,
Response: "done",
})
}(i)
}
// Wait for all goroutines
for range 5 {
<-done
}
// Allow any final processing
time.Sleep(200 * time.Millisecond)
}
// TestSubagentMonitor_SessionShutdown verifies shutdown doesn't panic
// even with nil ctx functions.
func TestSubagentMonitor_SessionShutdown(t *testing.T) {
+92 -3
View File
@@ -1,21 +1,93 @@
package extensions
import (
"bytes"
"fmt"
"log"
"os"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"github.com/spf13/viper"
)
// ---------------------------------------------------------------------------
// reentrantMu — a per-extension mutex that allows the same goroutine to
// re-enter (e.g. handler → ctx.EmitCustomEvent → handler in same extension).
// Different goroutines are serialized, preventing concurrent state mutation.
// ---------------------------------------------------------------------------
type reentrantMu struct {
mu sync.Mutex
cond *sync.Cond
owner int64 // goroutine ID that holds the lock, or 0
depth int // re-entrancy depth
}
// initReentrantMu initializes the reentrant mutex in-place. Must be called
// after the struct is at its final memory location (not before copying).
func (r *reentrantMu) init() {
r.cond = sync.NewCond(&r.mu)
}
// lock acquires the mutex. If the calling goroutine already holds it, the
// call succeeds immediately (re-entrant). Every call to lock must be paired
// with a call to unlock.
func (r *reentrantMu) lock() {
gid := goroutineID()
r.mu.Lock()
if r.owner == gid {
// Re-entrant: same goroutine already holds the lock.
r.depth++
r.mu.Unlock()
return
}
// Wait for the current owner to release.
for r.owner != 0 {
r.cond.Wait() // releases mu, blocks, re-acquires mu on wake
}
r.owner = gid
r.depth = 1
r.mu.Unlock()
}
// unlock releases the mutex (or decrements re-entrancy depth).
func (r *reentrantMu) unlock() {
r.mu.Lock()
r.depth--
if r.depth == 0 {
r.owner = 0
r.cond.Signal()
}
r.mu.Unlock()
}
// goroutineID extracts the current goroutine's ID from runtime.Stack output.
// This is a well-known technique used by Go testing infrastructure.
func goroutineID() int64 {
var buf [64]byte
n := runtime.Stack(buf[:], false)
// Stack output starts with "goroutine NNN ["
s := buf[:n]
s = s[len("goroutine "):]
s = s[:bytes.IndexByte(s, ' ')]
id, _ := strconv.ParseInt(string(s), 10, 64)
return id
}
// Runner manages loaded extensions and dispatches events to their handlers
// sequentially. Handlers execute in extension
// load order; for cancellable events the first blocking result wins.
//
// Each extension has a dedicated reentrant mutex so that handlers for the
// same extension are serialized (preventing data races on shared package-level
// state), while handlers for different extensions may execute concurrently.
type Runner struct {
extensions []LoadedExtension
extMu []reentrantMu // per-extension reentrant mutex, indexed by extension position
ctx Context
widgets map[string]WidgetConfig // keyed by widget ID
statusEntries map[string]StatusBarEntry // keyed by status key
@@ -52,7 +124,11 @@ type LoadedExtension struct {
// NewRunner creates a Runner from a set of loaded extensions.
func NewRunner(exts []LoadedExtension) *Runner {
return &Runner{extensions: exts}
mus := make([]reentrantMu, len(exts))
for i := range mus {
mus[i].init()
}
return &Runner{extensions: exts, extMu: mus}
}
// SetContext updates the runtime context (session ID, model, etc.) that is
@@ -367,6 +443,11 @@ func (r *Runner) Emit(event Event) (Result, error) {
for i := range r.extensions {
ext := &r.extensions[i]
handlers := ext.Handlers[event.Type()]
if len(handlers) == 0 {
continue
}
r.extMu[i].lock()
for _, handler := range handlers {
result, err := safeCall(handler, event, ctx)
if err != nil {
@@ -379,6 +460,7 @@ func (r *Runner) Emit(event Event) (Result, error) {
// Check for blocking/short-circuit results.
if isBlocking(result) {
r.extMu[i].unlock()
return result, nil
}
@@ -386,6 +468,7 @@ func (r *Runner) Emit(event Event) (Result, error) {
// the caller is responsible for applying the modifications.
accumulated = result
}
r.extMu[i].unlock()
}
return accumulated, nil
}
@@ -712,11 +795,17 @@ func (r *Runner) EmitCustomEvent(name, data string) {
// Extension-registered handlers first (in load order).
for i := range r.extensions {
for _, h := range r.extensions[i].CustomEventHandlers[name] {
extHandlers := r.extensions[i].CustomEventHandlers[name]
if len(extHandlers) == 0 {
continue
}
r.extMu[i].lock()
for _, h := range extHandlers {
safeInvoke(h)
}
r.extMu[i].unlock()
}
// Then dynamic subscriptions.
// Then dynamic subscriptions (not extension-scoped, no per-ext lock).
for _, h := range dynamicHandlers {
safeInvoke(h)
}
+140
View File
@@ -1,6 +1,7 @@
package extensions
import (
"sync"
"testing"
)
@@ -571,3 +572,142 @@ func TestRunner_ContextPrintNilSafe(t *testing.T) {
t.Fatalf("unexpected error: %v", err)
}
}
func TestRunner_ConcurrentEmitSameExtension(t *testing.T) {
// Verify that concurrent Emit calls for the same extension are serialized
// and don't cause data races on shared handler state.
var counter int
ext := makeHandlerExt("shared-state.go", map[EventType][]HandlerFunc{
SubagentStart: {
func(e Event, c Context) Result {
// Read-modify-write: racy without serialization.
v := counter
counter = v + 1
return nil
},
},
SubagentChunk: {
func(e Event, c Context) Result {
v := counter
counter = v + 1
return nil
},
},
})
r := makeRunner(ext)
var wg sync.WaitGroup
const goroutines = 20
const iterations = 50
wg.Add(goroutines)
for range goroutines {
go func() {
defer wg.Done()
for range iterations {
_, _ = r.Emit(SubagentStartEvent{ToolCallID: "x"})
_, _ = r.Emit(SubagentChunkEvent{ToolCallID: "x"})
}
}()
}
wg.Wait()
if counter != goroutines*iterations*2 {
t.Errorf("expected counter=%d, got %d (race detected)", goroutines*iterations*2, counter)
}
}
func TestRunner_ConcurrentEmitDifferentExtensions(t *testing.T) {
// Two extensions with independent state should not block each other
// and should both run correctly under concurrent Emit calls.
var counter1, counter2 int
ext1 := makeHandlerExt("ext1.go", map[EventType][]HandlerFunc{
SubagentStart: {
func(e Event, c Context) Result {
v := counter1
counter1 = v + 1
return nil
},
},
})
ext2 := makeHandlerExt("ext2.go", map[EventType][]HandlerFunc{
SubagentStart: {
func(e Event, c Context) Result {
v := counter2
counter2 = v + 1
return nil
},
},
})
r := makeRunner(ext1, ext2)
var wg sync.WaitGroup
const goroutines = 20
const iterations = 50
wg.Add(goroutines)
for range goroutines {
go func() {
defer wg.Done()
for range iterations {
_, _ = r.Emit(SubagentStartEvent{ToolCallID: "x"})
}
}()
}
wg.Wait()
expected := goroutines * iterations
if counter1 != expected {
t.Errorf("ext1 counter: expected %d, got %d", expected, counter1)
}
if counter2 != expected {
t.Errorf("ext2 counter: expected %d, got %d", expected, counter2)
}
}
func TestRunner_ReentrantEmitCustomEvent(t *testing.T) {
// Verify that a handler can call EmitCustomEvent (which dispatches to
// the same extension's custom event handlers) without deadlocking.
var order []string
ext := LoadedExtension{
Path: "reentrant.go",
Handlers: map[EventType][]HandlerFunc{
SessionStart: {
func(e Event, c Context) Result {
order = append(order, "session_start")
// This triggers EmitCustomEvent for the same extension
// via a direct runner call (simulating ctx.EmitCustomEvent).
return nil
},
},
},
CustomEventHandlers: map[string][]func(string){
"test-event": {
func(data string) {
order = append(order, "custom:"+data)
},
},
},
}
r := makeRunner(ext)
// Wire up the handler to call EmitCustomEvent re-entrantly.
ext.Handlers[SessionStart] = []HandlerFunc{
func(e Event, c Context) Result {
order = append(order, "session_start")
r.EmitCustomEvent("test-event", "hello")
return nil
},
}
r.extensions[0] = ext
// Rebuild mutexes after modifying extensions slice.
r.extMu = make([]reentrantMu, len(r.extensions))
for i := range r.extMu {
r.extMu[i].init()
}
_, err := r.Emit(SessionStartEvent{})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(order) != 2 || order[0] != "session_start" || order[1] != "custom:hello" {
t.Errorf("expected [session_start, custom:hello], got %v", order)
}
}
+17 -3
View File
@@ -51,6 +51,7 @@ type Kit struct {
bufferedLogger *tools.BufferedDebugLogger
authHandler MCPAuthHandler // OAuth handler for remote MCP servers (may need Close)
opts *Options // stored for reload operations (skills, etc.)
mcpConfig *config.Config // loaded MCP/server config, shared with subagents
// hasCustomSystemPrompt is true when the user explicitly configured a
// system prompt (via --system-prompt flag, config file, or SDK option).
@@ -849,6 +850,13 @@ type Options struct {
// (e.g. AGENTS.md) from the working directory.
NoContextFiles bool
// MCPConfig provides a pre-loaded MCP configuration. When set,
// LoadAndValidateConfig is skipped during Kit creation — avoiding
// viper access entirely. This is set automatically for in-process
// subagents (inheriting the parent's loaded config) and can be used
// by SDK consumers who build config programmatically.
MCPConfig *config.Config
// InProcessMCPServers registers mcp-go servers that run in the same
// process. Each key is the server name (used to prefix tool names, e.g.
// "docs__search"). The value must be a *[server.MCPServer].
@@ -1136,8 +1144,11 @@ func New(ctx context.Context, opts *Options) (*Kit, error) {
}
// ---- viperInitMu released — heavy I/O below runs concurrently ----
// Load MCP configuration. Use pre-loaded config if provided via CLI options.
if opts.CLI != nil && opts.CLI.MCPConfig != nil {
// Load MCP configuration. Use pre-loaded config if provided directly,
// via CLI options, or load from viper as a last resort.
if opts.MCPConfig != nil {
mcpConfig = opts.MCPConfig
} else if opts.CLI != nil && opts.CLI.MCPConfig != nil {
mcpConfig = opts.CLI.MCPConfig
}
if mcpConfig == nil {
@@ -1258,6 +1269,7 @@ func New(ctx context.Context, opts *Options) (*Kit, error) {
bufferedLogger: agentResult.BufferedLogger,
authHandler: setupOpts.AuthHandler,
opts: opts,
mcpConfig: mcpConfig,
hasCustomSystemPrompt: hasCustomSystemPrompt,
beforeToolCall: beforeToolCall,
afterToolResult: afterToolResult,
@@ -1582,13 +1594,15 @@ func (m *Kit) Subagent(ctx context.Context, cfg SubagentConfig) (*SubagentResult
tools = SubagentTools()
}
// Create child Kit instance.
// Create child Kit instance. Pass the parent's loaded MCP config to
// avoid re-reading viper (which races with concurrent subagent spawns).
childOpts := &Options{
Model: model,
SystemPrompt: systemPrompt,
Tools: tools,
NoSession: cfg.NoSession,
Quiet: true,
MCPConfig: m.mcpConfig,
}
child, err := New(ctx, childOpts)
if err != nil {