mirror of
https://github.com/mark3labs/kit.git
synced 2026-06-13 19:20:06 +00:00
fix(sdk): infer ToolResponse.Type for binary data in NewTool/NewParallelTool
- Infer Type="image" for image/* MIME types and Type="media" for all other binary content so the downstream framework creates a media content block instead of silently discarding Data bytes (#17) - Extract shared toolOutputToResponse() helper to eliminate duplication - Add ImageResult() and MediaResult() convenience constructors - Add LLMToolCall and LLMToolResponse type aliases so SDK consumers can call Tool.Run() without importing the underlying framework - Add 6 regression tests covering image, media, and text responses Closes #17
This commit is contained in:
@@ -646,7 +646,28 @@ host, _ := kit.New(ctx, &kit.Options{
|
||||
})
|
||||
```
|
||||
|
||||
Use `kit.NewParallelTool` for tools safe to run concurrently. See the [SDK docs](/sdk/overview) for full details on struct tags, `ToolOutput` fields, and `ToolCallIDFromContext`.
|
||||
Use `kit.NewParallelTool` for tools safe to run concurrently. Binary data (images, audio, etc.) in `ToolOutput.Data` is automatically forwarded to the LLM when `MediaType` is set. See the [SDK docs](/sdk/overview) for full details on struct tags, `ToolOutput` fields, and `ToolCallIDFromContext`.
|
||||
|
||||
#### Return Helpers
|
||||
|
||||
| Helper | Description |
|
||||
| --- | --- |
|
||||
| `kit.TextResult(content)` | Successful text result |
|
||||
| `kit.ErrorResult(content)` | Error result (LLM sees it as a tool error) |
|
||||
| `kit.ImageResult(content, data, mediaType)` | Image result with binary data (e.g. `"image/png"`) |
|
||||
| `kit.MediaResult(content, data, mediaType)` | Non-image media result (e.g. `"audio/mpeg"`) |
|
||||
|
||||
#### ToolOutput Fields
|
||||
|
||||
```go
|
||||
kit.ToolOutput{
|
||||
Content: "result text", // text returned to the LLM
|
||||
IsError: false, // true = LLM sees this as an error
|
||||
Data: pngBytes, // optional binary data (images, audio)
|
||||
MediaType: "image/png", // MIME type for binary Data
|
||||
Metadata: map[string]any{}, // opaque metadata for hooks/UI (not sent to LLM)
|
||||
}
|
||||
```
|
||||
|
||||
### With Callbacks
|
||||
|
||||
|
||||
+52
-22
@@ -2,6 +2,7 @@ package kit
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
|
||||
"charm.land/fantasy"
|
||||
|
||||
@@ -52,6 +53,22 @@ func ErrorResult(content string) ToolOutput {
|
||||
return ToolOutput{Content: content, IsError: true}
|
||||
}
|
||||
|
||||
// ImageResult creates a [ToolOutput] that returns an image to the LLM.
|
||||
// The data is the raw image bytes and mediaType is the MIME type
|
||||
// (e.g. "image/png", "image/jpeg"). The optional text content accompanies
|
||||
// the image and is visible to the LLM alongside it.
|
||||
func ImageResult(content string, data []byte, mediaType string) ToolOutput {
|
||||
return ToolOutput{Content: content, Data: data, MediaType: mediaType}
|
||||
}
|
||||
|
||||
// MediaResult creates a [ToolOutput] that returns non-image binary media
|
||||
// (e.g. audio, video) to the LLM. The data is the raw bytes and mediaType
|
||||
// is the MIME type (e.g. "audio/wav", "video/mp4"). The optional text
|
||||
// content accompanies the media.
|
||||
func MediaResult(content string, data []byte, mediaType string) ToolOutput {
|
||||
return ToolOutput{Content: content, Data: data, MediaType: mediaType}
|
||||
}
|
||||
|
||||
// toolCallIDKey is the context key for the tool call ID.
|
||||
type toolCallIDKey struct{}
|
||||
|
||||
@@ -63,9 +80,35 @@ func ToolCallIDFromContext(ctx context.Context) string {
|
||||
return s
|
||||
}
|
||||
|
||||
// toolOutputToResponse converts a [ToolOutput] into the underlying
|
||||
// framework's ToolResponse, inferring the response Type from Data/MediaType
|
||||
// so that binary content (images, audio, etc.) is forwarded to the LLM
|
||||
// instead of being silently dropped.
|
||||
func toolOutputToResponse(result ToolOutput) fantasy.ToolResponse {
|
||||
resp := fantasy.ToolResponse{
|
||||
Content: result.Content,
|
||||
IsError: result.IsError,
|
||||
Data: result.Data,
|
||||
MediaType: result.MediaType,
|
||||
}
|
||||
// Infer response type from binary data so the downstream framework
|
||||
// creates a media content block instead of a plain-text one.
|
||||
if len(result.Data) > 0 && result.MediaType != "" {
|
||||
if strings.HasPrefix(result.MediaType, "image/") {
|
||||
resp.Type = "image"
|
||||
} else {
|
||||
resp.Type = "media"
|
||||
}
|
||||
}
|
||||
if result.Metadata != nil {
|
||||
resp = fantasy.WithResponseMetadata(resp, result.Metadata)
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// NewTool creates a custom [Tool] with automatic JSON schema generation from
|
||||
// the TInput struct type. The handler receives a typed input (deserialized
|
||||
// from the LLM's JSON arguments) and returns a [ToolResult].
|
||||
// from the LLM's JSON arguments) and returns a [ToolOutput].
|
||||
//
|
||||
// Struct tags on TInput control the generated schema:
|
||||
//
|
||||
@@ -77,6 +120,11 @@ func ToolCallIDFromContext(ctx context.Context) string {
|
||||
// The tool call ID is injected into the context and can be retrieved with
|
||||
// [ToolCallIDFromContext].
|
||||
//
|
||||
// Binary results: When [ToolOutput.Data] and [ToolOutput.MediaType] are set,
|
||||
// the response type is automatically inferred so the LLM receives the binary
|
||||
// content (e.g. an image) instead of only the text. Use [ImageResult] or
|
||||
// [MediaResult] for convenience.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// type WeatherInput struct {
|
||||
@@ -84,7 +132,7 @@ func ToolCallIDFromContext(ctx context.Context) string {
|
||||
// }
|
||||
//
|
||||
// tool := kit.NewTool("get_weather", "Get weather for a city",
|
||||
// func(ctx context.Context, input WeatherInput) (kit.ToolResult, error) {
|
||||
// func(ctx context.Context, input WeatherInput) (kit.ToolOutput, error) {
|
||||
// return kit.TextResult("72°F, sunny in " + input.City), nil
|
||||
// },
|
||||
// )
|
||||
@@ -96,16 +144,7 @@ func NewTool[TInput any](name, description string, fn func(ctx context.Context,
|
||||
if err != nil {
|
||||
return fantasy.NewTextErrorResponse(err.Error()), nil
|
||||
}
|
||||
resp := fantasy.ToolResponse{
|
||||
Content: result.Content,
|
||||
IsError: result.IsError,
|
||||
Data: result.Data,
|
||||
MediaType: result.MediaType,
|
||||
}
|
||||
if result.Metadata != nil {
|
||||
resp = fantasy.WithResponseMetadata(resp, result.Metadata)
|
||||
}
|
||||
return resp, nil
|
||||
return toolOutputToResponse(result), nil
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -121,16 +160,7 @@ func NewParallelTool[TInput any](name, description string, fn func(ctx context.C
|
||||
if err != nil {
|
||||
return fantasy.NewTextErrorResponse(err.Error()), nil
|
||||
}
|
||||
resp := fantasy.ToolResponse{
|
||||
Content: result.Content,
|
||||
IsError: result.IsError,
|
||||
Data: result.Data,
|
||||
MediaType: result.MediaType,
|
||||
}
|
||||
if result.Metadata != nil {
|
||||
resp = fantasy.WithResponseMetadata(resp, result.Metadata)
|
||||
}
|
||||
return resp, nil
|
||||
return toolOutputToResponse(result), nil
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
@@ -117,3 +117,149 @@ func TestToolOutput_BinaryData(t *testing.T) {
|
||||
t.Errorf("MediaType = %q, want %q", r.MediaType, "image/png")
|
||||
}
|
||||
}
|
||||
|
||||
// TestImageResult verifies the ImageResult convenience constructor.
|
||||
func TestImageResult(t *testing.T) {
|
||||
data := []byte{0x89, 0x50, 0x4E, 0x47}
|
||||
r := kit.ImageResult("here is the image", data, "image/png")
|
||||
if r.Content != "here is the image" {
|
||||
t.Errorf("Content = %q, want %q", r.Content, "here is the image")
|
||||
}
|
||||
if len(r.Data) != 4 {
|
||||
t.Errorf("Data len = %d, want 4", len(r.Data))
|
||||
}
|
||||
if r.MediaType != "image/png" {
|
||||
t.Errorf("MediaType = %q, want %q", r.MediaType, "image/png")
|
||||
}
|
||||
if r.IsError {
|
||||
t.Error("ImageResult should not set IsError")
|
||||
}
|
||||
}
|
||||
|
||||
// TestMediaResult verifies the MediaResult convenience constructor.
|
||||
func TestMediaResult(t *testing.T) {
|
||||
data := []byte{0xFF, 0xFB, 0x90, 0x00}
|
||||
r := kit.MediaResult("audio clip", data, "audio/mpeg")
|
||||
if r.Content != "audio clip" {
|
||||
t.Errorf("Content = %q, want %q", r.Content, "audio clip")
|
||||
}
|
||||
if len(r.Data) != 4 {
|
||||
t.Errorf("Data len = %d, want 4", len(r.Data))
|
||||
}
|
||||
if r.MediaType != "audio/mpeg" {
|
||||
t.Errorf("MediaType = %q, want %q", r.MediaType, "audio/mpeg")
|
||||
}
|
||||
if r.IsError {
|
||||
t.Error("MediaResult should not set IsError")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewTool_BinaryImageResponse verifies that NewTool correctly infers the
|
||||
// response type for image data so binary content is forwarded to the LLM
|
||||
// (issue #17).
|
||||
func TestNewTool_BinaryImageResponse(t *testing.T) {
|
||||
type Input struct {
|
||||
Path string `json:"path"`
|
||||
}
|
||||
|
||||
imgData := []byte{0x89, 0x50, 0x4E, 0x47} // PNG magic bytes
|
||||
|
||||
tool := kit.NewTool("read_image", "Read an image file",
|
||||
func(ctx context.Context, input Input) (kit.ToolOutput, error) {
|
||||
return kit.ImageResult("Here is the image", imgData, "image/png"), nil
|
||||
},
|
||||
)
|
||||
|
||||
// Run the tool and inspect the raw ToolResponse via the AgentTool interface.
|
||||
resp, err := tool.Run(context.Background(), kit.LLMToolCall{
|
||||
ID: "call_1",
|
||||
Name: "read_image",
|
||||
Input: `{"path": "test.png"}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Run() error: %v", err)
|
||||
}
|
||||
|
||||
// The Type field must be "image" so the downstream framework creates a
|
||||
// media content block instead of discarding the binary data.
|
||||
if resp.Type != "image" {
|
||||
t.Errorf("ToolResponse.Type = %q, want %q", resp.Type, "image")
|
||||
}
|
||||
if len(resp.Data) != 4 {
|
||||
t.Errorf("ToolResponse.Data len = %d, want 4", len(resp.Data))
|
||||
}
|
||||
if resp.MediaType != "image/png" {
|
||||
t.Errorf("ToolResponse.MediaType = %q, want %q", resp.MediaType, "image/png")
|
||||
}
|
||||
if resp.Content != "Here is the image" {
|
||||
t.Errorf("ToolResponse.Content = %q, want %q", resp.Content, "Here is the image")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewTool_BinaryMediaResponse verifies type inference for non-image media.
|
||||
func TestNewTool_BinaryMediaResponse(t *testing.T) {
|
||||
type Input struct{}
|
||||
|
||||
tool := kit.NewTool("get_audio", "Get audio",
|
||||
func(ctx context.Context, input Input) (kit.ToolOutput, error) {
|
||||
return kit.MediaResult("audio clip", []byte{0xFF, 0xFB}, "audio/mpeg"), nil
|
||||
},
|
||||
)
|
||||
|
||||
resp, err := tool.Run(context.Background(), kit.LLMToolCall{
|
||||
ID: "call_2",
|
||||
Name: "get_audio",
|
||||
Input: `{}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Run() error: %v", err)
|
||||
}
|
||||
if resp.Type != "media" {
|
||||
t.Errorf("ToolResponse.Type = %q, want %q", resp.Type, "media")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewTool_TextResponseTypeNotSet verifies that text-only responses do NOT
|
||||
// get an inferred type (preserving existing behavior).
|
||||
func TestNewTool_TextResponseTypeNotSet(t *testing.T) {
|
||||
type Input struct{}
|
||||
|
||||
tool := kit.NewTool("echo", "Echo",
|
||||
func(ctx context.Context, input Input) (kit.ToolOutput, error) {
|
||||
return kit.TextResult("hello"), nil
|
||||
},
|
||||
)
|
||||
|
||||
resp, err := tool.Run(context.Background(), kit.LLMToolCall{
|
||||
ID: "call_3", Name: "echo", Input: `{}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Run() error: %v", err)
|
||||
}
|
||||
// Text responses should not have Type set (the framework treats "" as text).
|
||||
if resp.Type != "" {
|
||||
t.Errorf("ToolResponse.Type = %q, want empty string for text responses", resp.Type)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewParallelTool_BinaryImageResponse mirrors the NewTool binary test for
|
||||
// NewParallelTool.
|
||||
func TestNewParallelTool_BinaryImageResponse(t *testing.T) {
|
||||
type Input struct{}
|
||||
|
||||
tool := kit.NewParallelTool("snap", "Take a snapshot",
|
||||
func(ctx context.Context, input Input) (kit.ToolOutput, error) {
|
||||
return kit.ImageResult("snapshot", []byte{0xFF, 0xD8}, "image/jpeg"), nil
|
||||
},
|
||||
)
|
||||
|
||||
resp, err := tool.Run(context.Background(), kit.LLMToolCall{
|
||||
ID: "call_4", Name: "snap", Input: `{}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Run() error: %v", err)
|
||||
}
|
||||
if resp.Type != "image" {
|
||||
t.Errorf("ToolResponse.Type = %q, want %q", resp.Type, "image")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,6 +157,18 @@ type LLMTextPart = fantasy.TextPart
|
||||
// LLMReasoningPart is a reasoning/chain-of-thought content part.
|
||||
type LLMReasoningPart = fantasy.ReasoningPart
|
||||
|
||||
// LLMToolCall represents the raw tool invocation passed to a [Tool]'s Run
|
||||
// method. It carries the call ID, tool name, and the JSON-encoded input
|
||||
// arguments from the LLM. This is the execution-layer call object — distinct
|
||||
// from [ToolCall] (a message content part).
|
||||
type LLMToolCall = fantasy.ToolCall
|
||||
|
||||
// LLMToolResponse represents the raw response returned from a [Tool]'s Run
|
||||
// method. Most SDK consumers should use [ToolOutput] with [NewTool] /
|
||||
// [NewParallelTool] instead — this alias is provided for advanced use cases
|
||||
// that need to call Tool.Run() directly (e.g. testing).
|
||||
type LLMToolResponse = fantasy.ToolResponse
|
||||
|
||||
// LLMToolCallPart represents an LLM-initiated tool invocation within a message.
|
||||
type LLMToolCallPart = fantasy.ToolCallPart
|
||||
|
||||
|
||||
@@ -493,6 +493,8 @@ host, _ := kit.New(ctx, &kit.Options{
|
||||
|----------|-------------|
|
||||
| `kit.TextResult(content)` | Successful text result |
|
||||
| `kit.ErrorResult(content)` | Error result (LLM sees it as a tool error) |
|
||||
| `kit.ImageResult(content, data, mediaType)` | Image result with binary data (e.g. `"image/png"`) |
|
||||
| `kit.MediaResult(content, data, mediaType)` | Non-image media result (e.g. `"audio/mpeg"`) |
|
||||
|
||||
**ToolOutput fields** (for advanced use):
|
||||
|
||||
@@ -1095,6 +1097,8 @@ kit.LLMUsage // {InputTokens, OutputTokens, TotalTokens, ReasoningTokens,
|
||||
// CacheCreationTokens, CacheReadTokens}
|
||||
kit.LLMResponse // {Content, FinishReason, Usage}
|
||||
kit.LLMFilePart // {Filename, Data []byte, MediaType}
|
||||
kit.LLMToolCall // {ID, Name, Input string} — execution-layer tool call (for Tool.Run)
|
||||
kit.LLMToolResponse // {Type, Content, Data, MediaType, IsError, ...} — raw tool response
|
||||
|
||||
// Compaction types
|
||||
kit.CompactionResult, kit.CompactionOptions
|
||||
|
||||
@@ -101,8 +101,10 @@ Return values:
|
||||
|--------|-------------|
|
||||
| `kit.TextResult(s)` | Successful text result |
|
||||
| `kit.ErrorResult(s)` | Error result (LLM sees it as a tool error) |
|
||||
| `kit.ImageResult(s, data, mediaType)` | Image result with binary data (e.g. `"image/png"`) |
|
||||
| `kit.MediaResult(s, data, mediaType)` | Non-image media result (e.g. `"audio/mpeg"`) |
|
||||
|
||||
For advanced use, return a `kit.ToolOutput` struct directly with `Data`, `MediaType`, and `Metadata` fields.
|
||||
Binary data (images, audio, etc.) in `ToolOutput.Data` is automatically forwarded to the LLM when `MediaType` is set. For advanced use, return a `kit.ToolOutput` struct directly with `Data`, `MediaType`, and `Metadata` fields.
|
||||
|
||||
Use `kit.NewParallelTool` for tools that are safe to run concurrently. Use `kit.ToolCallIDFromContext(ctx)` to retrieve the LLM-assigned call ID for logging or tracing.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user