From 61408ed4907d9fdcbaf5e088358cec7ec6bf2659 Mon Sep 17 00:00:00 2001 From: Ed Zynda Date: Wed, 22 Apr 2026 16:58:07 +0300 Subject: [PATCH] fix(sdk): infer ToolResponse.Type for binary data in NewTool/NewParallelTool - Infer Type="image" for image/* MIME types and Type="media" for all other binary content so the downstream framework creates a media content block instead of silently discarding Data bytes (#17) - Extract shared toolOutputToResponse() helper to eliminate duplication - Add ImageResult() and MediaResult() convenience constructors - Add LLMToolCall and LLMToolResponse type aliases so SDK consumers can call Tool.Run() without importing the underlying framework - Add 6 regression tests covering image, media, and text responses Closes #17 --- README.md | 23 +++++- pkg/kit/tools.go | 74 +++++++++++++------ pkg/kit/tools_test.go | 146 ++++++++++++++++++++++++++++++++++++++ pkg/kit/types.go | 12 ++++ skills/kit-sdk/SKILL.md | 4 ++ www/pages/sdk/overview.md | 4 +- 6 files changed, 239 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 0c43d8f7..cce98d7f 100644 --- a/README.md +++ b/README.md @@ -646,7 +646,28 @@ host, _ := kit.New(ctx, &kit.Options{ }) ``` -Use `kit.NewParallelTool` for tools safe to run concurrently. See the [SDK docs](/sdk/overview) for full details on struct tags, `ToolOutput` fields, and `ToolCallIDFromContext`. +Use `kit.NewParallelTool` for tools safe to run concurrently. Binary data (images, audio, etc.) in `ToolOutput.Data` is automatically forwarded to the LLM when `MediaType` is set. See the [SDK docs](/sdk/overview) for full details on struct tags, `ToolOutput` fields, and `ToolCallIDFromContext`. + +#### Return Helpers + +| Helper | Description | +| --- | --- | +| `kit.TextResult(content)` | Successful text result | +| `kit.ErrorResult(content)` | Error result (LLM sees it as a tool error) | +| `kit.ImageResult(content, data, mediaType)` | Image result with binary data (e.g. `"image/png"`) | +| `kit.MediaResult(content, data, mediaType)` | Non-image media result (e.g. `"audio/mpeg"`) | + +#### ToolOutput Fields + +```go +kit.ToolOutput{ + Content: "result text", // text returned to the LLM + IsError: false, // true = LLM sees this as an error + Data: pngBytes, // optional binary data (images, audio) + MediaType: "image/png", // MIME type for binary Data + Metadata: map[string]any{}, // opaque metadata for hooks/UI (not sent to LLM) +} +``` ### With Callbacks diff --git a/pkg/kit/tools.go b/pkg/kit/tools.go index c20b9c33..07b5996c 100644 --- a/pkg/kit/tools.go +++ b/pkg/kit/tools.go @@ -2,6 +2,7 @@ package kit import ( "context" + "strings" "charm.land/fantasy" @@ -52,6 +53,22 @@ func ErrorResult(content string) ToolOutput { return ToolOutput{Content: content, IsError: true} } +// ImageResult creates a [ToolOutput] that returns an image to the LLM. +// The data is the raw image bytes and mediaType is the MIME type +// (e.g. "image/png", "image/jpeg"). The optional text content accompanies +// the image and is visible to the LLM alongside it. +func ImageResult(content string, data []byte, mediaType string) ToolOutput { + return ToolOutput{Content: content, Data: data, MediaType: mediaType} +} + +// MediaResult creates a [ToolOutput] that returns non-image binary media +// (e.g. audio, video) to the LLM. The data is the raw bytes and mediaType +// is the MIME type (e.g. "audio/wav", "video/mp4"). The optional text +// content accompanies the media. +func MediaResult(content string, data []byte, mediaType string) ToolOutput { + return ToolOutput{Content: content, Data: data, MediaType: mediaType} +} + // toolCallIDKey is the context key for the tool call ID. type toolCallIDKey struct{} @@ -63,9 +80,35 @@ func ToolCallIDFromContext(ctx context.Context) string { return s } +// toolOutputToResponse converts a [ToolOutput] into the underlying +// framework's ToolResponse, inferring the response Type from Data/MediaType +// so that binary content (images, audio, etc.) is forwarded to the LLM +// instead of being silently dropped. +func toolOutputToResponse(result ToolOutput) fantasy.ToolResponse { + resp := fantasy.ToolResponse{ + Content: result.Content, + IsError: result.IsError, + Data: result.Data, + MediaType: result.MediaType, + } + // Infer response type from binary data so the downstream framework + // creates a media content block instead of a plain-text one. + if len(result.Data) > 0 && result.MediaType != "" { + if strings.HasPrefix(result.MediaType, "image/") { + resp.Type = "image" + } else { + resp.Type = "media" + } + } + if result.Metadata != nil { + resp = fantasy.WithResponseMetadata(resp, result.Metadata) + } + return resp +} + // NewTool creates a custom [Tool] with automatic JSON schema generation from // the TInput struct type. The handler receives a typed input (deserialized -// from the LLM's JSON arguments) and returns a [ToolResult]. +// from the LLM's JSON arguments) and returns a [ToolOutput]. // // Struct tags on TInput control the generated schema: // @@ -77,6 +120,11 @@ func ToolCallIDFromContext(ctx context.Context) string { // The tool call ID is injected into the context and can be retrieved with // [ToolCallIDFromContext]. // +// Binary results: When [ToolOutput.Data] and [ToolOutput.MediaType] are set, +// the response type is automatically inferred so the LLM receives the binary +// content (e.g. an image) instead of only the text. Use [ImageResult] or +// [MediaResult] for convenience. +// // Example: // // type WeatherInput struct { @@ -84,7 +132,7 @@ func ToolCallIDFromContext(ctx context.Context) string { // } // // tool := kit.NewTool("get_weather", "Get weather for a city", -// func(ctx context.Context, input WeatherInput) (kit.ToolResult, error) { +// func(ctx context.Context, input WeatherInput) (kit.ToolOutput, error) { // return kit.TextResult("72°F, sunny in " + input.City), nil // }, // ) @@ -96,16 +144,7 @@ func NewTool[TInput any](name, description string, fn func(ctx context.Context, if err != nil { return fantasy.NewTextErrorResponse(err.Error()), nil } - resp := fantasy.ToolResponse{ - Content: result.Content, - IsError: result.IsError, - Data: result.Data, - MediaType: result.MediaType, - } - if result.Metadata != nil { - resp = fantasy.WithResponseMetadata(resp, result.Metadata) - } - return resp, nil + return toolOutputToResponse(result), nil }, ) } @@ -121,16 +160,7 @@ func NewParallelTool[TInput any](name, description string, fn func(ctx context.C if err != nil { return fantasy.NewTextErrorResponse(err.Error()), nil } - resp := fantasy.ToolResponse{ - Content: result.Content, - IsError: result.IsError, - Data: result.Data, - MediaType: result.MediaType, - } - if result.Metadata != nil { - resp = fantasy.WithResponseMetadata(resp, result.Metadata) - } - return resp, nil + return toolOutputToResponse(result), nil }, ) } diff --git a/pkg/kit/tools_test.go b/pkg/kit/tools_test.go index 901d26fd..25e78269 100644 --- a/pkg/kit/tools_test.go +++ b/pkg/kit/tools_test.go @@ -117,3 +117,149 @@ func TestToolOutput_BinaryData(t *testing.T) { t.Errorf("MediaType = %q, want %q", r.MediaType, "image/png") } } + +// TestImageResult verifies the ImageResult convenience constructor. +func TestImageResult(t *testing.T) { + data := []byte{0x89, 0x50, 0x4E, 0x47} + r := kit.ImageResult("here is the image", data, "image/png") + if r.Content != "here is the image" { + t.Errorf("Content = %q, want %q", r.Content, "here is the image") + } + if len(r.Data) != 4 { + t.Errorf("Data len = %d, want 4", len(r.Data)) + } + if r.MediaType != "image/png" { + t.Errorf("MediaType = %q, want %q", r.MediaType, "image/png") + } + if r.IsError { + t.Error("ImageResult should not set IsError") + } +} + +// TestMediaResult verifies the MediaResult convenience constructor. +func TestMediaResult(t *testing.T) { + data := []byte{0xFF, 0xFB, 0x90, 0x00} + r := kit.MediaResult("audio clip", data, "audio/mpeg") + if r.Content != "audio clip" { + t.Errorf("Content = %q, want %q", r.Content, "audio clip") + } + if len(r.Data) != 4 { + t.Errorf("Data len = %d, want 4", len(r.Data)) + } + if r.MediaType != "audio/mpeg" { + t.Errorf("MediaType = %q, want %q", r.MediaType, "audio/mpeg") + } + if r.IsError { + t.Error("MediaResult should not set IsError") + } +} + +// TestNewTool_BinaryImageResponse verifies that NewTool correctly infers the +// response type for image data so binary content is forwarded to the LLM +// (issue #17). +func TestNewTool_BinaryImageResponse(t *testing.T) { + type Input struct { + Path string `json:"path"` + } + + imgData := []byte{0x89, 0x50, 0x4E, 0x47} // PNG magic bytes + + tool := kit.NewTool("read_image", "Read an image file", + func(ctx context.Context, input Input) (kit.ToolOutput, error) { + return kit.ImageResult("Here is the image", imgData, "image/png"), nil + }, + ) + + // Run the tool and inspect the raw ToolResponse via the AgentTool interface. + resp, err := tool.Run(context.Background(), kit.LLMToolCall{ + ID: "call_1", + Name: "read_image", + Input: `{"path": "test.png"}`, + }) + if err != nil { + t.Fatalf("Run() error: %v", err) + } + + // The Type field must be "image" so the downstream framework creates a + // media content block instead of discarding the binary data. + if resp.Type != "image" { + t.Errorf("ToolResponse.Type = %q, want %q", resp.Type, "image") + } + if len(resp.Data) != 4 { + t.Errorf("ToolResponse.Data len = %d, want 4", len(resp.Data)) + } + if resp.MediaType != "image/png" { + t.Errorf("ToolResponse.MediaType = %q, want %q", resp.MediaType, "image/png") + } + if resp.Content != "Here is the image" { + t.Errorf("ToolResponse.Content = %q, want %q", resp.Content, "Here is the image") + } +} + +// TestNewTool_BinaryMediaResponse verifies type inference for non-image media. +func TestNewTool_BinaryMediaResponse(t *testing.T) { + type Input struct{} + + tool := kit.NewTool("get_audio", "Get audio", + func(ctx context.Context, input Input) (kit.ToolOutput, error) { + return kit.MediaResult("audio clip", []byte{0xFF, 0xFB}, "audio/mpeg"), nil + }, + ) + + resp, err := tool.Run(context.Background(), kit.LLMToolCall{ + ID: "call_2", + Name: "get_audio", + Input: `{}`, + }) + if err != nil { + t.Fatalf("Run() error: %v", err) + } + if resp.Type != "media" { + t.Errorf("ToolResponse.Type = %q, want %q", resp.Type, "media") + } +} + +// TestNewTool_TextResponseTypeNotSet verifies that text-only responses do NOT +// get an inferred type (preserving existing behavior). +func TestNewTool_TextResponseTypeNotSet(t *testing.T) { + type Input struct{} + + tool := kit.NewTool("echo", "Echo", + func(ctx context.Context, input Input) (kit.ToolOutput, error) { + return kit.TextResult("hello"), nil + }, + ) + + resp, err := tool.Run(context.Background(), kit.LLMToolCall{ + ID: "call_3", Name: "echo", Input: `{}`, + }) + if err != nil { + t.Fatalf("Run() error: %v", err) + } + // Text responses should not have Type set (the framework treats "" as text). + if resp.Type != "" { + t.Errorf("ToolResponse.Type = %q, want empty string for text responses", resp.Type) + } +} + +// TestNewParallelTool_BinaryImageResponse mirrors the NewTool binary test for +// NewParallelTool. +func TestNewParallelTool_BinaryImageResponse(t *testing.T) { + type Input struct{} + + tool := kit.NewParallelTool("snap", "Take a snapshot", + func(ctx context.Context, input Input) (kit.ToolOutput, error) { + return kit.ImageResult("snapshot", []byte{0xFF, 0xD8}, "image/jpeg"), nil + }, + ) + + resp, err := tool.Run(context.Background(), kit.LLMToolCall{ + ID: "call_4", Name: "snap", Input: `{}`, + }) + if err != nil { + t.Fatalf("Run() error: %v", err) + } + if resp.Type != "image" { + t.Errorf("ToolResponse.Type = %q, want %q", resp.Type, "image") + } +} diff --git a/pkg/kit/types.go b/pkg/kit/types.go index a48e5211..d450c646 100644 --- a/pkg/kit/types.go +++ b/pkg/kit/types.go @@ -157,6 +157,18 @@ type LLMTextPart = fantasy.TextPart // LLMReasoningPart is a reasoning/chain-of-thought content part. type LLMReasoningPart = fantasy.ReasoningPart +// LLMToolCall represents the raw tool invocation passed to a [Tool]'s Run +// method. It carries the call ID, tool name, and the JSON-encoded input +// arguments from the LLM. This is the execution-layer call object — distinct +// from [ToolCall] (a message content part). +type LLMToolCall = fantasy.ToolCall + +// LLMToolResponse represents the raw response returned from a [Tool]'s Run +// method. Most SDK consumers should use [ToolOutput] with [NewTool] / +// [NewParallelTool] instead — this alias is provided for advanced use cases +// that need to call Tool.Run() directly (e.g. testing). +type LLMToolResponse = fantasy.ToolResponse + // LLMToolCallPart represents an LLM-initiated tool invocation within a message. type LLMToolCallPart = fantasy.ToolCallPart diff --git a/skills/kit-sdk/SKILL.md b/skills/kit-sdk/SKILL.md index f6acb004..47297d91 100644 --- a/skills/kit-sdk/SKILL.md +++ b/skills/kit-sdk/SKILL.md @@ -493,6 +493,8 @@ host, _ := kit.New(ctx, &kit.Options{ |----------|-------------| | `kit.TextResult(content)` | Successful text result | | `kit.ErrorResult(content)` | Error result (LLM sees it as a tool error) | +| `kit.ImageResult(content, data, mediaType)` | Image result with binary data (e.g. `"image/png"`) | +| `kit.MediaResult(content, data, mediaType)` | Non-image media result (e.g. `"audio/mpeg"`) | **ToolOutput fields** (for advanced use): @@ -1095,6 +1097,8 @@ kit.LLMUsage // {InputTokens, OutputTokens, TotalTokens, ReasoningTokens, // CacheCreationTokens, CacheReadTokens} kit.LLMResponse // {Content, FinishReason, Usage} kit.LLMFilePart // {Filename, Data []byte, MediaType} +kit.LLMToolCall // {ID, Name, Input string} — execution-layer tool call (for Tool.Run) +kit.LLMToolResponse // {Type, Content, Data, MediaType, IsError, ...} — raw tool response // Compaction types kit.CompactionResult, kit.CompactionOptions diff --git a/www/pages/sdk/overview.md b/www/pages/sdk/overview.md index 942d85fb..d3cee101 100644 --- a/www/pages/sdk/overview.md +++ b/www/pages/sdk/overview.md @@ -101,8 +101,10 @@ Return values: |--------|-------------| | `kit.TextResult(s)` | Successful text result | | `kit.ErrorResult(s)` | Error result (LLM sees it as a tool error) | +| `kit.ImageResult(s, data, mediaType)` | Image result with binary data (e.g. `"image/png"`) | +| `kit.MediaResult(s, data, mediaType)` | Non-image media result (e.g. `"audio/mpeg"`) | -For advanced use, return a `kit.ToolOutput` struct directly with `Data`, `MediaType`, and `Metadata` fields. +Binary data (images, audio, etc.) in `ToolOutput.Data` is automatically forwarded to the LLM when `MediaType` is set. For advanced use, return a `kit.ToolOutput` struct directly with `Data`, `MediaType`, and `Metadata` fields. Use `kit.NewParallelTool` for tools that are safe to run concurrently. Use `kit.ToolCallIDFromContext(ctx)` to retrieve the LLM-assigned call ID for logging or tracing.