fix(sdk): infer ToolResponse.Type for binary data in NewTool/NewParallelTool

- Infer Type="image" for image/* MIME types and Type="media" for all other
  binary content so the downstream framework creates a media content block
  instead of silently discarding Data bytes (#17)
- Extract shared toolOutputToResponse() helper to eliminate duplication
- Add ImageResult() and MediaResult() convenience constructors
- Add LLMToolCall and LLMToolResponse type aliases so SDK consumers can call
  Tool.Run() without importing the underlying framework
- Add 6 regression tests covering image, media, and text responses

Closes #17
2026-06-13 19:20:06 +00:00 · 2026-04-22 16:58:07 +03:00
parent 3cfb6437f9
commit 61408ed490
6 changed files with 239 additions and 24 deletions
@@ -646,7 +646,28 @@ host, _ := kit.New(ctx, &kit.Options{
 })
 ```

-Use `kit.NewParallelTool` for tools safe to run concurrently. See the [SDK docs](/sdk/overview) for full details on struct tags, `ToolOutput` fields, and `ToolCallIDFromContext`.
+Use `kit.NewParallelTool` for tools safe to run concurrently. Binary data (images, audio, etc.) in `ToolOutput.Data` is automatically forwarded to the LLM when `MediaType` is set. See the [SDK docs](/sdk/overview) for full details on struct tags, `ToolOutput` fields, and `ToolCallIDFromContext`.
+
+#### Return Helpers
+
+| Helper | Description |
+| --- | --- |
+| `kit.TextResult(content)` | Successful text result |
+| `kit.ErrorResult(content)` | Error result (LLM sees it as a tool error) |
+| `kit.ImageResult(content, data, mediaType)` | Image result with binary data (e.g. `"image/png"`) |
+| `kit.MediaResult(content, data, mediaType)` | Non-image media result (e.g. `"audio/mpeg"`) |
+
+#### ToolOutput Fields
+
+```go
+kit.ToolOutput{
+    Content:   "result text",     // text returned to the LLM
+    IsError:   false,             // true = LLM sees this as an error
+    Data:      pngBytes,          // optional binary data (images, audio)
+    MediaType: "image/png",       // MIME type for binary Data
+    Metadata:  map[string]any{},  // opaque metadata for hooks/UI (not sent to LLM)
+}
+```

 ### With Callbacks

@@ -2,6 +2,7 @@ package kit

 import (
 	"context"
+	"strings"

 	"charm.land/fantasy"

@@ -52,6 +53,22 @@ func ErrorResult(content string) ToolOutput {
 	return ToolOutput{Content: content, IsError: true}
 }

+// ImageResult creates a [ToolOutput] that returns an image to the LLM.
+// The data is the raw image bytes and mediaType is the MIME type
+// (e.g. "image/png", "image/jpeg"). The optional text content accompanies
+// the image and is visible to the LLM alongside it.
+func ImageResult(content string, data []byte, mediaType string) ToolOutput {
+	return ToolOutput{Content: content, Data: data, MediaType: mediaType}
+}
+
+// MediaResult creates a [ToolOutput] that returns non-image binary media
+// (e.g. audio, video) to the LLM. The data is the raw bytes and mediaType
+// is the MIME type (e.g. "audio/wav", "video/mp4"). The optional text
+// content accompanies the media.
+func MediaResult(content string, data []byte, mediaType string) ToolOutput {
+	return ToolOutput{Content: content, Data: data, MediaType: mediaType}
+}
+
 // toolCallIDKey is the context key for the tool call ID.
 type toolCallIDKey struct{}

@@ -63,9 +80,35 @@ func ToolCallIDFromContext(ctx context.Context) string {
 	return s
 }

+// toolOutputToResponse converts a [ToolOutput] into the underlying
+// framework's ToolResponse, inferring the response Type from Data/MediaType
+// so that binary content (images, audio, etc.) is forwarded to the LLM
+// instead of being silently dropped.
+func toolOutputToResponse(result ToolOutput) fantasy.ToolResponse {
+	resp := fantasy.ToolResponse{
+		Content:   result.Content,
+		IsError:   result.IsError,
+		Data:      result.Data,
+		MediaType: result.MediaType,
+	}
+	// Infer response type from binary data so the downstream framework
+	// creates a media content block instead of a plain-text one.
+	if len(result.Data) > 0 && result.MediaType != "" {
+		if strings.HasPrefix(result.MediaType, "image/") {
+			resp.Type = "image"
+		} else {
+			resp.Type = "media"
+		}
+	}
+	if result.Metadata != nil {
+		resp = fantasy.WithResponseMetadata(resp, result.Metadata)
+	}
+	return resp
+}
+
 // NewTool creates a custom [Tool] with automatic JSON schema generation from
 // the TInput struct type. The handler receives a typed input (deserialized
-// from the LLM's JSON arguments) and returns a [ToolResult].
+// from the LLM's JSON arguments) and returns a [ToolOutput].
 //
 // Struct tags on TInput control the generated schema:
 //
@@ -77,6 +120,11 @@ func ToolCallIDFromContext(ctx context.Context) string {
 // The tool call ID is injected into the context and can be retrieved with
 // [ToolCallIDFromContext].
 //
+// Binary results: When [ToolOutput.Data] and [ToolOutput.MediaType] are set,
+// the response type is automatically inferred so the LLM receives the binary
+// content (e.g. an image) instead of only the text. Use [ImageResult] or
+// [MediaResult] for convenience.
+//
 // Example:
 //
 //	type WeatherInput struct {
@@ -84,7 +132,7 @@ func ToolCallIDFromContext(ctx context.Context) string {
 //	}
 //
 //	tool := kit.NewTool("get_weather", "Get weather for a city",
-//	    func(ctx context.Context, input WeatherInput) (kit.ToolResult, error) {
+//	    func(ctx context.Context, input WeatherInput) (kit.ToolOutput, error) {
 //	        return kit.TextResult("72°F, sunny in " + input.City), nil
 //	    },
 //	)
@@ -96,16 +144,7 @@ func NewTool[TInput any](name, description string, fn func(ctx context.Context,
 			if err != nil {
 				return fantasy.NewTextErrorResponse(err.Error()), nil
 			}
-			resp := fantasy.ToolResponse{
-				Content:   result.Content,
-				IsError:   result.IsError,
-				Data:      result.Data,
-				MediaType: result.MediaType,
-			}
-			if result.Metadata != nil {
-				resp = fantasy.WithResponseMetadata(resp, result.Metadata)
-			}
-			return resp, nil
+			return toolOutputToResponse(result), nil
 		},
 	)
 }
@@ -121,16 +160,7 @@ func NewParallelTool[TInput any](name, description string, fn func(ctx context.C
 			if err != nil {
 				return fantasy.NewTextErrorResponse(err.Error()), nil
 			}
-			resp := fantasy.ToolResponse{
-				Content:   result.Content,
-				IsError:   result.IsError,
-				Data:      result.Data,
-				MediaType: result.MediaType,
-			}
-			if result.Metadata != nil {
-				resp = fantasy.WithResponseMetadata(resp, result.Metadata)
-			}
-			return resp, nil
+			return toolOutputToResponse(result), nil
 		},
 	)
 }
@@ -117,3 +117,149 @@ func TestToolOutput_BinaryData(t *testing.T) {
 		t.Errorf("MediaType = %q, want %q", r.MediaType, "image/png")
 	}
 }
+
+// TestImageResult verifies the ImageResult convenience constructor.
+func TestImageResult(t *testing.T) {
+	data := []byte{0x89, 0x50, 0x4E, 0x47}
+	r := kit.ImageResult("here is the image", data, "image/png")
+	if r.Content != "here is the image" {
+		t.Errorf("Content = %q, want %q", r.Content, "here is the image")
+	}
+	if len(r.Data) != 4 {
+		t.Errorf("Data len = %d, want 4", len(r.Data))
+	}
+	if r.MediaType != "image/png" {
+		t.Errorf("MediaType = %q, want %q", r.MediaType, "image/png")
+	}
+	if r.IsError {
+		t.Error("ImageResult should not set IsError")
+	}
+}
+
+// TestMediaResult verifies the MediaResult convenience constructor.
+func TestMediaResult(t *testing.T) {
+	data := []byte{0xFF, 0xFB, 0x90, 0x00}
+	r := kit.MediaResult("audio clip", data, "audio/mpeg")
+	if r.Content != "audio clip" {
+		t.Errorf("Content = %q, want %q", r.Content, "audio clip")
+	}
+	if len(r.Data) != 4 {
+		t.Errorf("Data len = %d, want 4", len(r.Data))
+	}
+	if r.MediaType != "audio/mpeg" {
+		t.Errorf("MediaType = %q, want %q", r.MediaType, "audio/mpeg")
+	}
+	if r.IsError {
+		t.Error("MediaResult should not set IsError")
+	}
+}
+
+// TestNewTool_BinaryImageResponse verifies that NewTool correctly infers the
+// response type for image data so binary content is forwarded to the LLM
+// (issue #17).
+func TestNewTool_BinaryImageResponse(t *testing.T) {
+	type Input struct {
+		Path string `json:"path"`
+	}
+
+	imgData := []byte{0x89, 0x50, 0x4E, 0x47} // PNG magic bytes
+
+	tool := kit.NewTool("read_image", "Read an image file",
+		func(ctx context.Context, input Input) (kit.ToolOutput, error) {
+			return kit.ImageResult("Here is the image", imgData, "image/png"), nil
+		},
+	)
+
+	// Run the tool and inspect the raw ToolResponse via the AgentTool interface.
+	resp, err := tool.Run(context.Background(), kit.LLMToolCall{
+		ID:    "call_1",
+		Name:  "read_image",
+		Input: `{"path": "test.png"}`,
+	})
+	if err != nil {
+		t.Fatalf("Run() error: %v", err)
+	}
+
+	// The Type field must be "image" so the downstream framework creates a
+	// media content block instead of discarding the binary data.
+	if resp.Type != "image" {
+		t.Errorf("ToolResponse.Type = %q, want %q", resp.Type, "image")
+	}
+	if len(resp.Data) != 4 {
+		t.Errorf("ToolResponse.Data len = %d, want 4", len(resp.Data))
+	}
+	if resp.MediaType != "image/png" {
+		t.Errorf("ToolResponse.MediaType = %q, want %q", resp.MediaType, "image/png")
+	}
+	if resp.Content != "Here is the image" {
+		t.Errorf("ToolResponse.Content = %q, want %q", resp.Content, "Here is the image")
+	}
+}
+
+// TestNewTool_BinaryMediaResponse verifies type inference for non-image media.
+func TestNewTool_BinaryMediaResponse(t *testing.T) {
+	type Input struct{}
+
+	tool := kit.NewTool("get_audio", "Get audio",
+		func(ctx context.Context, input Input) (kit.ToolOutput, error) {
+			return kit.MediaResult("audio clip", []byte{0xFF, 0xFB}, "audio/mpeg"), nil
+		},
+	)
+
+	resp, err := tool.Run(context.Background(), kit.LLMToolCall{
+		ID:    "call_2",
+		Name:  "get_audio",
+		Input: `{}`,
+	})
+	if err != nil {
+		t.Fatalf("Run() error: %v", err)
+	}
+	if resp.Type != "media" {
+		t.Errorf("ToolResponse.Type = %q, want %q", resp.Type, "media")
+	}
+}
+
+// TestNewTool_TextResponseTypeNotSet verifies that text-only responses do NOT
+// get an inferred type (preserving existing behavior).
+func TestNewTool_TextResponseTypeNotSet(t *testing.T) {
+	type Input struct{}
+
+	tool := kit.NewTool("echo", "Echo",
+		func(ctx context.Context, input Input) (kit.ToolOutput, error) {
+			return kit.TextResult("hello"), nil
+		},
+	)
+
+	resp, err := tool.Run(context.Background(), kit.LLMToolCall{
+		ID: "call_3", Name: "echo", Input: `{}`,
+	})
+	if err != nil {
+		t.Fatalf("Run() error: %v", err)
+	}
+	// Text responses should not have Type set (the framework treats "" as text).
+	if resp.Type != "" {
+		t.Errorf("ToolResponse.Type = %q, want empty string for text responses", resp.Type)
+	}
+}
+
+// TestNewParallelTool_BinaryImageResponse mirrors the NewTool binary test for
+// NewParallelTool.
+func TestNewParallelTool_BinaryImageResponse(t *testing.T) {
+	type Input struct{}
+
+	tool := kit.NewParallelTool("snap", "Take a snapshot",
+		func(ctx context.Context, input Input) (kit.ToolOutput, error) {
+			return kit.ImageResult("snapshot", []byte{0xFF, 0xD8}, "image/jpeg"), nil
+		},
+	)
+
+	resp, err := tool.Run(context.Background(), kit.LLMToolCall{
+		ID: "call_4", Name: "snap", Input: `{}`,
+	})
+	if err != nil {
+		t.Fatalf("Run() error: %v", err)
+	}
+	if resp.Type != "image" {
+		t.Errorf("ToolResponse.Type = %q, want %q", resp.Type, "image")
+	}
+}
@@ -157,6 +157,18 @@ type LLMTextPart = fantasy.TextPart
 // LLMReasoningPart is a reasoning/chain-of-thought content part.
 type LLMReasoningPart = fantasy.ReasoningPart

+// LLMToolCall represents the raw tool invocation passed to a [Tool]'s Run
+// method. It carries the call ID, tool name, and the JSON-encoded input
+// arguments from the LLM. This is the execution-layer call object — distinct
+// from [ToolCall] (a message content part).
+type LLMToolCall = fantasy.ToolCall
+
+// LLMToolResponse represents the raw response returned from a [Tool]'s Run
+// method. Most SDK consumers should use [ToolOutput] with [NewTool] /
+// [NewParallelTool] instead — this alias is provided for advanced use cases
+// that need to call Tool.Run() directly (e.g. testing).
+type LLMToolResponse = fantasy.ToolResponse
+
 // LLMToolCallPart represents an LLM-initiated tool invocation within a message.
 type LLMToolCallPart = fantasy.ToolCallPart

@@ -493,6 +493,8 @@ host, _ := kit.New(ctx, &kit.Options{
 |----------|-------------|
 | `kit.TextResult(content)` | Successful text result |
 | `kit.ErrorResult(content)` | Error result (LLM sees it as a tool error) |
+| `kit.ImageResult(content, data, mediaType)` | Image result with binary data (e.g. `"image/png"`) |
+| `kit.MediaResult(content, data, mediaType)` | Non-image media result (e.g. `"audio/mpeg"`) |

 **ToolOutput fields** (for advanced use):

@@ -1095,6 +1097,8 @@ kit.LLMUsage        // {InputTokens, OutputTokens, TotalTokens, ReasoningTokens,
                     //  CacheCreationTokens, CacheReadTokens}
 kit.LLMResponse     // {Content, FinishReason, Usage}
 kit.LLMFilePart     // {Filename, Data []byte, MediaType}
+kit.LLMToolCall     // {ID, Name, Input string} — execution-layer tool call (for Tool.Run)
+kit.LLMToolResponse // {Type, Content, Data, MediaType, IsError, ...} — raw tool response

 // Compaction types
 kit.CompactionResult, kit.CompactionOptions
@@ -101,8 +101,10 @@ Return values:
 |--------|-------------|
 | `kit.TextResult(s)` | Successful text result |
 | `kit.ErrorResult(s)` | Error result (LLM sees it as a tool error) |
+| `kit.ImageResult(s, data, mediaType)` | Image result with binary data (e.g. `"image/png"`) |
+| `kit.MediaResult(s, data, mediaType)` | Non-image media result (e.g. `"audio/mpeg"`) |

-For advanced use, return a `kit.ToolOutput` struct directly with `Data`, `MediaType`, and `Metadata` fields.
+Binary data (images, audio, etc.) in `ToolOutput.Data` is automatically forwarded to the LLM when `MediaType` is set. For advanced use, return a `kit.ToolOutput` struct directly with `Data`, `MediaType`, and `Metadata` fields.

 Use `kit.NewParallelTool` for tools that are safe to run concurrently. Use `kit.ToolCallIDFromContext(ctx)` to retrieve the LLM-assigned call ID for logging or tracing.