Skip to content

Commit 32fc2fb

Browse files
feat(compose): add enhanced tool interfaces with multimodal support (#760)
Introduce EnhancedInvokableTool and EnhancedStreamableTool interfaces that support returning structured multimodal results (ToolResult) instead of plain strings.

Key changes:

1. New Tool Interfaces (components/tool/interface.go):
   - EnhancedInvokableTool: accepts ToolArgument, returns *ToolResult
   - EnhancedStreamableTool: accepts ToolArgument, returns StreamReader[*ToolResult]

2. Tool Utils (components/tool/utils/):
   - Added InferEnhancedTool and InferEnhancedStreamTool for type inference
   - Added NewEnhancedTool and NewEnhancedStreamTool factory functions
   - Support both regular and optionable variants

3. ToolsNode Enhancement (compose/tool_node.go):
   - Added EnhancedInvokableToolMiddleware and EnhancedStreamableToolMiddleware
   - Auto-conversion between enhanced invokable and streamable endpoints
   - Priority given to enhanced tools when both interfaces are implemented
   - Updated interrupt/rerun state to track enhanced tool results separately

4. Schema Extensions (schema/message.go):
   - Added ToolPartType enum (text, image, audio, video, file)
   - Added ToolArgument and ToolResult structs for structured I/O
   - Added ToolOutputPart with multimodal content support
   - Added ConcatToolResults for stream chunk merging
   - Added ToMessageInputParts() for ToolResult to Message conversion
   - Enhanced Message.String() to display multimodal content

5. React Agent Integration (flow/agent/react/):
   - Added enhanced tool result senders for MessageFuture
   - Added middleware support for enhanced tool result collection

This enables tools to return rich content like images, audio, video, and files alongside text, supporting advanced multimodal AI agent scenarios.
1 parent e328752 commit 32fc2fb

File tree

11 files changed

+2332
-112
lines changed

11 files changed

+2332
-112
lines changed

components/tool/callback_extra.go

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ package tool
1919

2020
import (
2121
"github.com/cloudwego/eino/callbacks"
22+
"github.com/cloudwego/eino/schema"
2223
)
2324

2425
// CallbackInput is the input for the tool callback.
@@ -33,6 +34,8 @@ type CallbackInput struct {
3334
type CallbackOutput struct {
3435
// Response is the response for the tool.
3536
Response string
37+
// ToolOutput is the multimodal output for the tool. Used when the tool returns structured data.
38+
ToolOutput *schema.ToolResult
3639
// Extra is the extra information for the tool.
3740
Extra map[string]any
3841
}
@@ -44,6 +47,8 @@ func ConvCallbackInput(src callbacks.CallbackInput) *CallbackInput {
4447
return t
4548
case string:
4649
return &CallbackInput{ArgumentsInJSON: t}
50+
case *schema.ToolArgument:
51+
return &CallbackInput{ArgumentsInJSON: t.TextArgument}
4752
default:
4853
return nil
4954
}
@@ -56,6 +61,8 @@ func ConvCallbackOutput(src callbacks.CallbackOutput) *CallbackOutput {
5661
return t
5762
case string:
5863
return &CallbackOutput{Response: t}
64+
case *schema.ToolResult:
65+
return &CallbackOutput{ToolOutput: t}
5966
default:
6067
return nil
6168
}

components/tool/interface.go

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,3 +41,18 @@ type StreamableTool interface {
4141

4242
StreamableRun(ctx context.Context, argumentsInJSON string, opts ...Option) (*schema.StreamReader[string], error)
4343
}
44+
45+
// EnhancedInvokableTool is a tool interface that supports returning structured multimodal results.
46+
// Unlike InvokableTool which returns a string, this interface returns *schema.ToolResult
47+
// which can contain text, images, audio, video, and files.
48+
type EnhancedInvokableTool interface {
49+
BaseTool
50+
InvokableRun(ctx context.Context, toolArgument *schema.ToolArgument, opts ...Option) (*schema.ToolResult, error)
51+
}
52+
53+
// EnhancedStreamableTool is a streaming tool interface that supports returning structured multimodal results.
54+
// It provides a stream reader for accessing multimodal content progressively.
55+
type EnhancedStreamableTool interface {
56+
BaseTool
57+
StreamableRun(ctx context.Context, toolArgument *schema.ToolArgument, opts ...Option) (*schema.StreamReader[*schema.ToolResult], error)
58+
}

components/tool/utils/invokable_func.go

Lines changed: 100 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,33 @@ func InferOptionableTool[T, D any](toolName, toolDesc string, i OptionableInvoke
5656
return newOptionableTool(ti, i, opts...), nil
5757
}
5858

59+
// EnhancedInvokeFunc is the function type for the enhanced tool.
60+
type EnhancedInvokeFunc[T any] func(ctx context.Context, input T) (output *schema.ToolResult, err error)
61+
62+
// OptionableEnhancedInvokeFunc is the function type for an enhanced tool that also receives tool options.
63+
type OptionableEnhancedInvokeFunc[T any] func(ctx context.Context, input T, opts ...tool.Option) (output *schema.ToolResult, err error)
64+
65+
// InferEnhancedTool creates an EnhancedInvokableTool from a given function by inferring the ToolInfo from the function's request parameters.
66+
// End users can pass a SchemaCustomizerFn in opts to customize how Go struct tags are parsed, overriding the default behavior.
67+
func InferEnhancedTool[T any](toolName, toolDesc string, i EnhancedInvokeFunc[T], opts ...Option) (tool.EnhancedInvokableTool, error) {
68+
ti, err := goStruct2ToolInfo[T](toolName, toolDesc, opts...)
69+
if err != nil {
70+
return nil, err
71+
}
72+
73+
return NewEnhancedTool(ti, i, opts...), nil
74+
}
75+
76+
// InferOptionableEnhancedTool creates an EnhancedInvokableTool from a given function by inferring the ToolInfo from the function's request parameters; the function also receives the tool options.
77+
func InferOptionableEnhancedTool[T any](toolName, toolDesc string, i OptionableEnhancedInvokeFunc[T], opts ...Option) (tool.EnhancedInvokableTool, error) {
78+
ti, err := goStruct2ToolInfo[T](toolName, toolDesc, opts...)
79+
if err != nil {
80+
return nil, err
81+
}
82+
83+
return newOptionableEnhancedTool(ti, i, opts...), nil
84+
}
85+
5986
// GoStruct2ParamsOneOf converts a go struct to a ParamsOneOf.
6087
// If you want to use the ResponseFormat of a ChatModel to get structured output, you can infer the JSON Schema from the Go struct.
6188
func GoStruct2ParamsOneOf[T any](opts ...Option) (*schema.ParamsOneOf, error) {
@@ -200,3 +227,76 @@ func snakeToCamel(s string) string {
200227

201228
return strings.Join(parts, "")
202229
}
230+
231+
// NewEnhancedTool creates an enhanced tool, where the input is in JSON format and the output is a *schema.ToolResult.
232+
func NewEnhancedTool[T any](desc *schema.ToolInfo, i EnhancedInvokeFunc[T], opts ...Option) tool.EnhancedInvokableTool {
233+
return newOptionableEnhancedTool(desc, func(ctx context.Context, input T, _ ...tool.Option) (*schema.ToolResult, error) {
234+
return i(ctx, input)
235+
}, opts...)
236+
}
237+
238+
func newOptionableEnhancedTool[T any](desc *schema.ToolInfo, i OptionableEnhancedInvokeFunc[T], opts ...Option) tool.EnhancedInvokableTool {
239+
to := getToolOptions(opts...)
240+
241+
return &enhancedInvokableTool[T]{
242+
info: desc,
243+
um: to.um,
244+
Fn: i,
245+
}
246+
}
247+
248+
type enhancedInvokableTool[T any] struct {
249+
info *schema.ToolInfo
250+
251+
um UnmarshalArguments
252+
253+
Fn OptionableEnhancedInvokeFunc[T]
254+
}
255+
256+
func (e *enhancedInvokableTool[T]) Info(ctx context.Context) (*schema.ToolInfo, error) {
257+
return e.info, nil
258+
}
259+
260+
func (e *enhancedInvokableTool[T]) InvokableRun(ctx context.Context, toolArgument *schema.ToolArgument, opts ...tool.Option) (*schema.ToolResult, error) {
261+
var inst T
262+
var err error
263+
264+
if e.um != nil {
265+
var val any
266+
val, err = e.um(ctx, toolArgument.TextArgument)
267+
if err != nil {
268+
return nil, fmt.Errorf("[EnhancedLocalFunc] failed to unmarshal arguments, toolName=%s, err=%w", e.getToolName(), err)
269+
}
270+
gt, ok := val.(T)
271+
if !ok {
272+
return nil, fmt.Errorf("[EnhancedLocalFunc] invalid type, toolName=%s, expected=%T, given=%T", e.getToolName(), inst, val)
273+
}
274+
inst = gt
275+
} else {
276+
inst = generic.NewInstance[T]()
277+
278+
err = sonic.UnmarshalString(toolArgument.TextArgument, &inst)
279+
if err != nil {
280+
return nil, fmt.Errorf("[EnhancedLocalFunc] failed to unmarshal arguments in json, toolName=%s, err=%w", e.getToolName(), err)
281+
}
282+
}
283+
284+
resp, err := e.Fn(ctx, inst, opts...)
285+
if err != nil {
286+
return nil, fmt.Errorf("[EnhancedLocalFunc] failed to invoke tool, toolName=%s, err=%w", e.getToolName(), err)
287+
}
288+
289+
return resp, nil
290+
}
291+
292+
func (e *enhancedInvokableTool[T]) GetType() string {
293+
return snakeToCamel(e.getToolName())
294+
}
295+
296+
func (e *enhancedInvokableTool[T]) getToolName() string {
297+
if e.info == nil {
298+
return ""
299+
}
300+
301+
return e.info.Name
302+
}

components/tool/utils/streamable_func.go

Lines changed: 98 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -155,3 +155,101 @@ func (s *streamableTool[T, D]) getToolName() string {
155155

156156
return s.info.Name
157157
}
158+
159+
// EnhancedStreamFunc is the function type for the enhanced streamable tool.
160+
type EnhancedStreamFunc[T any] func(ctx context.Context, input T) (output *schema.StreamReader[*schema.ToolResult], err error)
161+
162+
// OptionableEnhancedStreamFunc is the function type for an enhanced streamable tool that also receives tool options.
163+
type OptionableEnhancedStreamFunc[T any] func(ctx context.Context, input T, opts ...tool.Option) (output *schema.StreamReader[*schema.ToolResult], err error)
164+
165+
// InferEnhancedStreamTool creates an EnhancedStreamableTool from a given function by inferring the ToolInfo from the function's request parameters.
166+
// End users can pass a SchemaCustomizerFn in opts to customize how Go struct tags are parsed, overriding the default behavior.
167+
func InferEnhancedStreamTool[T any](toolName, toolDesc string, s EnhancedStreamFunc[T], opts ...Option) (tool.EnhancedStreamableTool, error) {
168+
ti, err := goStruct2ToolInfo[T](toolName, toolDesc, opts...)
169+
if err != nil {
170+
return nil, err
171+
}
172+
173+
return NewEnhancedStreamTool(ti, s, opts...), nil
174+
}
175+
176+
// InferOptionableEnhancedStreamTool creates an EnhancedStreamableTool from a given function by inferring the ToolInfo from the function's request parameters; the function also receives the tool options.
177+
func InferOptionableEnhancedStreamTool[T any](toolName, toolDesc string, s OptionableEnhancedStreamFunc[T], opts ...Option) (tool.EnhancedStreamableTool, error) {
178+
ti, err := goStruct2ToolInfo[T](toolName, toolDesc, opts...)
179+
if err != nil {
180+
return nil, err
181+
}
182+
183+
return newOptionableEnhancedStreamTool(ti, s, opts...), nil
184+
}
185+
186+
// NewEnhancedStreamTool creates an enhanced streaming tool, where the input is in JSON format and the output is a *schema.StreamReader[*schema.ToolResult].
187+
func NewEnhancedStreamTool[T any](desc *schema.ToolInfo, s EnhancedStreamFunc[T], opts ...Option) tool.EnhancedStreamableTool {
188+
return newOptionableEnhancedStreamTool(desc,
189+
func(ctx context.Context, input T, _ ...tool.Option) (output *schema.StreamReader[*schema.ToolResult], err error) {
190+
return s(ctx, input)
191+
},
192+
opts...)
193+
}
194+
195+
func newOptionableEnhancedStreamTool[T any](desc *schema.ToolInfo, s OptionableEnhancedStreamFunc[T], opts ...Option) tool.EnhancedStreamableTool {
196+
to := getToolOptions(opts...)
197+
198+
return &enhancedStreamableTool[T]{
199+
info: desc,
200+
um: to.um,
201+
Fn: s,
202+
}
203+
}
204+
205+
type enhancedStreamableTool[T any] struct {
206+
info *schema.ToolInfo
207+
208+
um UnmarshalArguments
209+
210+
Fn OptionableEnhancedStreamFunc[T]
211+
}
212+
213+
func (s *enhancedStreamableTool[T]) Info(ctx context.Context) (*schema.ToolInfo, error) {
214+
return s.info, nil
215+
}
216+
217+
func (s *enhancedStreamableTool[T]) StreamableRun(ctx context.Context, toolArgument *schema.ToolArgument, opts ...tool.Option) (
218+
outStream *schema.StreamReader[*schema.ToolResult], err error) {
219+
220+
var inst T
221+
if s.um != nil {
222+
var val any
223+
val, err = s.um(ctx, toolArgument.TextArgument)
224+
if err != nil {
225+
return nil, fmt.Errorf("[EnhancedLocalStreamFunc] failed to unmarshal arguments, toolName=%s, err=%w", s.getToolName(), err)
226+
}
227+
228+
gt, ok := val.(T)
229+
if !ok {
230+
return nil, fmt.Errorf("[EnhancedLocalStreamFunc] type err, toolName=%s, expected=%T, given=%T", s.getToolName(), inst, val)
231+
}
232+
inst = gt
233+
} else {
234+
inst = generic.NewInstance[T]()
235+
236+
err = sonic.UnmarshalString(toolArgument.TextArgument, &inst)
237+
if err != nil {
238+
return nil, fmt.Errorf("[EnhancedLocalStreamFunc] failed to unmarshal arguments in json, toolName=%s, err=%w", s.getToolName(), err)
239+
}
240+
}
241+
242+
return s.Fn(ctx, inst, opts...)
243+
}
244+
245+
func (s *enhancedStreamableTool[T]) GetType() string {
246+
return snakeToCamel(s.getToolName())
247+
}
248+
249+
func (s *enhancedStreamableTool[T]) getToolName() string {
250+
if s.info == nil {
251+
return ""
252+
}
253+
254+
return s.info.Name
255+
}

0 commit comments

Comments
 (0)