Skip to content

Commit e1fb4a4

Browse files
committed
Attach files instead of base64-encoding them into the main message
Signed-off-by: Jean-Laurent de Morlhon <jeanlaurent@morlhon.net>
1 parent eb8f299 commit e1fb4a4

18 files changed

+2018
-187
lines changed

pkg/chat/chat.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,44 @@ const (
2626
ImageURLDetailAuto ImageURLDetail = "auto"
2727
)
2828

29+
// FileSourceType indicates how the file should be referenced in API calls.
// It is a string-backed enum; its known values are the FileSourceType*
// constants declared in the const block that follows.
30+
type FileSourceType string
31+
32+
// Known FileSourceType values. Each non-empty value names the FileReference
// field that carries the actual reference (FileID, FileURI or LocalPath).
const (
33+
// FileSourceTypeNone means no file reference, use URL or base64.
// It is the zero value of FileSourceType (the empty string).
34+
FileSourceTypeNone FileSourceType = ""
35+
// FileSourceTypeFileID means the file was uploaded and should be referenced by ID.
36+
FileSourceTypeFileID FileSourceType = "file_id"
37+
// FileSourceTypeFileURI means the file was uploaded and should be referenced by URI (Gemini).
38+
FileSourceTypeFileURI FileSourceType = "file_uri"
39+
// FileSourceTypeLocalPath means the file is a local path that needs to be uploaded/converted
// before it can be sent to a provider.
40+
FileSourceTypeLocalPath FileSourceType = "local_path"
41+
)
42+
43+
// FileReference contains information about a file attachment.
// SourceType selects which of FileID, FileURI or LocalPath is meaningful.
// Every field is tagged omitempty, so empty fields are left out of the JSON
// encoding.
44+
type FileReference struct {
45+
// SourceType indicates how this file should be referenced.
46+
SourceType FileSourceType `json:"source_type,omitempty"`
47+
// FileID is the provider-specific file identifier (for FileSourceTypeFileID).
48+
FileID string `json:"file_id,omitempty"`
49+
// FileURI is the file URI (for FileSourceTypeFileURI, used by Gemini).
50+
FileURI string `json:"file_uri,omitempty"`
51+
// LocalPath is the path to a local file (for FileSourceTypeLocalPath).
52+
LocalPath string `json:"local_path,omitempty"`
53+
// MimeType is the MIME type of the file, e.g. "image/png" or "application/pdf".
54+
MimeType string `json:"mime_type,omitempty"`
55+
// Provider identifies which provider this reference is for (when uploaded).
56+
Provider string `json:"provider,omitempty"`
57+
}
58+
2959
// MessageImageURL describes the image/file payload of a message part.
// The payload is carried either inline in URL or indirectly through FileRef;
// Detail holds an ImageURLDetail value such as ImageURLDetailAuto.
// Every field is tagged omitempty, so empty fields are left out of the JSON
// encoding.
type MessageImageURL struct {
60+
// URL contains a data URL (base64) or a public HTTP(S) URL.
3061
URL string `json:"url,omitempty"`
3162
Detail ImageURLDetail `json:"detail,omitempty"`
63+
64+
// FileRef contains file reference info when the image was uploaded via Files API
65+
// or references a local file path that needs to be processed.
// It is a pointer, so it is nil when no file reference is attached.
66+
FileRef *FileReference `json:"file_ref,omitempty"`
3267
}
3368

3469
type Message struct {

pkg/cli/runner.go

Lines changed: 41 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package cli
33
import (
44
"cmp"
55
"context"
6-
"encoding/base64"
76
"encoding/json"
87
"fmt"
98
"io"
@@ -307,55 +306,29 @@ func ParseAttachCommand(userInput string) (messageText, attachPath string) {
307306
return messageText, attachPath
308307
}
309308

310-
// CreateUserMessageWithAttachment creates a user message with optional image attachment
309+
// CreateUserMessageWithAttachment creates a user message with an optional file attachment (image or PDF).
310+
// Instead of converting to base64, this stores the file path for later processing
311+
// by the provider (which may use Files API or base64 as appropriate).
311312
func CreateUserMessageWithAttachment(userContent, attachmentPath string) *session.Message {
312313
if attachmentPath == "" {
313314
return session.UserMessage(userContent)
314315
}
315316

316-
// Convert file to data URL
317-
dataURL, err := fileToDataURL(attachmentPath)
317+
// Resolve to absolute path
318+
absPath, err := filepath.Abs(attachmentPath)
318319
if err != nil {
319-
slog.Warn("Failed to attach file", "path", attachmentPath, "error", err)
320+
slog.Warn("Failed to resolve attachment path", "path", attachmentPath, "error", err)
320321
return session.UserMessage(userContent)
321322
}
322323

323-
// Ensure we have some text content when attaching a file
324-
textContent := cmp.Or(strings.TrimSpace(userContent), "Please analyze this attached file.")
325-
326-
// Create message with multi-content including text and image
327-
multiContent := []chat.MessagePart{
328-
{
329-
Type: chat.MessagePartTypeText,
330-
Text: textContent,
331-
},
332-
{
333-
Type: chat.MessagePartTypeImageURL,
334-
ImageURL: &chat.MessageImageURL{
335-
URL: dataURL,
336-
Detail: chat.ImageURLDetailAuto,
337-
},
338-
},
339-
}
340-
341-
return session.UserMessage("", multiContent...)
342-
}
343-
344-
// fileToDataURL converts a file to a data URL
345-
func fileToDataURL(filePath string) (string, error) {
346324
// Check if file exists
347-
if _, err := os.Stat(filePath); os.IsNotExist(err) {
348-
return "", fmt.Errorf("file does not exist: %s", filePath)
349-
}
350-
351-
// Read file content
352-
fileBytes, err := os.ReadFile(filePath)
353-
if err != nil {
354-
return "", fmt.Errorf("failed to read file: %w", err)
325+
if _, err := os.Stat(absPath); os.IsNotExist(err) {
326+
slog.Warn("Attachment file does not exist", "path", absPath)
327+
return session.UserMessage(userContent)
355328
}
356329

357-
// Determine MIME type based on file extension
358-
ext := strings.ToLower(filepath.Ext(filePath))
330+
// Determine MIME type from extension
331+
ext := strings.ToLower(filepath.Ext(absPath))
359332
var mimeType string
360333
switch ext {
361334
case ".jpg", ".jpeg":
@@ -370,15 +343,39 @@ func fileToDataURL(filePath string) (string, error) {
370343
mimeType = "image/bmp"
371344
case ".svg":
372345
mimeType = "image/svg+xml"
346+
case ".pdf":
347+
mimeType = "application/pdf"
373348
default:
374-
return "", fmt.Errorf("unsupported image format: %s", ext)
349+
slog.Warn("Unsupported file format for attachment", "path", absPath, "ext", ext)
350+
return session.UserMessage(userContent)
375351
}
376352

377-
// Encode to base64
378-
encoded := base64.StdEncoding.EncodeToString(fileBytes)
353+
slog.Debug("Creating message with file attachment",
354+
"path", absPath,
355+
"mime_type", mimeType)
356+
357+
// Ensure we have some text content when attaching a file
358+
textContent := cmp.Or(strings.TrimSpace(userContent), "Please analyze this attached file.")
379359

380-
// Create data URL
381-
dataURL := fmt.Sprintf("data:%s;base64,%s", mimeType, encoded)
360+
// Create message with file reference (not base64)
361+
// The provider will handle uploading via Files API or converting to base64
362+
multiContent := []chat.MessagePart{
363+
{
364+
Type: chat.MessagePartTypeText,
365+
Text: textContent,
366+
},
367+
{
368+
Type: chat.MessagePartTypeImageURL,
369+
ImageURL: &chat.MessageImageURL{
370+
Detail: chat.ImageURLDetailAuto,
371+
FileRef: &chat.FileReference{
372+
SourceType: chat.FileSourceTypeLocalPath,
373+
LocalPath: absPath,
374+
MimeType: mimeType,
375+
},
376+
},
377+
},
378+
}
382379

383-
return dataURL, nil
380+
return session.UserMessage("", multiContent...)
384381
}

pkg/cli/runner_attachment_test.go

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
package cli
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"testing"
7+
8+
"github.com/stretchr/testify/assert"
9+
"github.com/stretchr/testify/require"
10+
11+
"github.com/docker/cagent/pkg/chat"
12+
)
13+
14+
// TestCreateUserMessageWithAttachment is a table-driven test for
// CreateUserMessageWithAttachment. It writes placeholder files with several
// extensions into a temp directory and checks two outcomes:
//   - a supported extension yields a two-part message: a text part followed by
//     a part whose FileRef has SourceType FileSourceTypeLocalPath, a non-empty
//     LocalPath and the expected MIME type;
//   - an empty path, a missing file, or an unsupported extension yields a
//     plain text-only user message carrying the original content.
func TestCreateUserMessageWithAttachment(t *testing.T) {
15+
t.Parallel()
16+
17+
// Create a temp directory and one placeholder file path per extension under test
18+
tmpDir := t.TempDir()
19+
jpegPath := filepath.Join(tmpDir, "test.jpg")
20+
pngPath := filepath.Join(tmpDir, "test.png")
21+
gifPath := filepath.Join(tmpDir, "test.gif")
22+
webpPath := filepath.Join(tmpDir, "test.webp")
23+
pdfPath := filepath.Join(tmpDir, "test.pdf")
24+
unsupportedPath := filepath.Join(tmpDir, "test.xyz")
25+
26+
// Create test files. The contents are arbitrary bytes, not real image/PDF
// data; the expectations below depend only on the file extension.
27+
for _, path := range []string{jpegPath, pngPath, gifPath, webpPath, pdfPath, unsupportedPath} {
28+
err := os.WriteFile(path, []byte("test data"), 0o644)
29+
require.NoError(t, err)
30+
}
31+
32+
// Test table. The want* fields describe the expected shape of the message:
// wantMultiContent - message must have exactly two MultiContent parts
// wantFileRef - second part must carry a local-path FileRef
// wantMimeType - expected FileRef.MimeType
// wantDefaultPrompt - text part must be the built-in default prompt
tests := []struct {
33+
name string
34+
userContent string
35+
attachmentPath string
36+
wantMultiContent bool
37+
wantFileRef bool
38+
wantMimeType string
39+
wantDefaultPrompt bool
40+
}{
41+
{
42+
name: "no attachment",
43+
userContent: "Hello world",
44+
attachmentPath: "",
45+
wantMultiContent: false,
46+
},
47+
{
48+
name: "jpeg attachment",
49+
userContent: "Check this image",
50+
attachmentPath: jpegPath,
51+
wantMultiContent: true,
52+
wantFileRef: true,
53+
wantMimeType: "image/jpeg",
54+
},
55+
{
56+
name: "png attachment",
57+
userContent: "Analyze this",
58+
attachmentPath: pngPath,
59+
wantMultiContent: true,
60+
wantFileRef: true,
61+
wantMimeType: "image/png",
62+
},
63+
{
64+
name: "gif attachment",
65+
userContent: "What's in this gif?",
66+
attachmentPath: gifPath,
67+
wantMultiContent: true,
68+
wantFileRef: true,
69+
wantMimeType: "image/gif",
70+
},
71+
{
72+
name: "webp attachment",
73+
userContent: "Describe this",
74+
attachmentPath: webpPath,
75+
wantMultiContent: true,
76+
wantFileRef: true,
77+
wantMimeType: "image/webp",
78+
},
79+
{
80+
name: "pdf attachment",
81+
userContent: "Summarize this PDF",
82+
attachmentPath: pdfPath,
83+
wantMultiContent: true,
84+
wantFileRef: true,
85+
wantMimeType: "application/pdf",
86+
},
87+
{
88+
name: "attachment with empty content gets default prompt",
89+
userContent: "",
90+
attachmentPath: jpegPath,
91+
wantMultiContent: true,
92+
wantFileRef: true,
93+
wantMimeType: "image/jpeg",
94+
wantDefaultPrompt: true,
95+
},
96+
{
97+
name: "attachment with whitespace content gets default prompt",
98+
userContent: " ",
99+
attachmentPath: jpegPath,
100+
wantMultiContent: true,
101+
wantFileRef: true,
102+
wantMimeType: "image/jpeg",
103+
wantDefaultPrompt: true,
104+
},
105+
{
106+
name: "non-existent file falls back to text only",
107+
userContent: "Hello",
108+
attachmentPath: "/non/existent/file.jpg",
109+
wantMultiContent: false,
110+
},
111+
{
112+
name: "unsupported format falls back to text only",
113+
userContent: "Hello",
114+
attachmentPath: unsupportedPath,
115+
wantMultiContent: false,
116+
},
117+
}
118+
119+
// NOTE(review): tt is captured by a parallel subtest closure; this relies on
// per-iteration loop variables (Go 1.22+) — confirm against go.mod.
for _, tt := range tests {
120+
t.Run(tt.name, func(t *testing.T) {
121+
t.Parallel()
122+
msg := CreateUserMessageWithAttachment(tt.userContent, tt.attachmentPath)
123+
124+
// Every case, including the fallbacks, must produce a user-role message.
require.NotNil(t, msg)
125+
assert.Equal(t, chat.MessageRoleUser, msg.Message.Role)
126+
127+
if tt.wantMultiContent {
128+
// NOTE(review): assert (not require) is used here, so if the length
// check fails the indexing below may panic instead of failing cleanly.
assert.NotEmpty(t, msg.Message.MultiContent)
129+
assert.Len(t, msg.Message.MultiContent, 2) // text part + file part
130+
131+
// Check text part: either the default prompt or the caller's text unchanged
132+
textPart := msg.Message.MultiContent[0]
133+
assert.Equal(t, chat.MessagePartTypeText, textPart.Type)
134+
if tt.wantDefaultPrompt {
135+
assert.Equal(t, "Please analyze this attached file.", textPart.Text)
136+
} else {
137+
assert.Equal(t, tt.userContent, textPart.Text)
138+
}
139+
140+
// Check file part (typed as an image URL part even for PDFs)
141+
imagePart := msg.Message.MultiContent[1]
142+
assert.Equal(t, chat.MessagePartTypeImageURL, imagePart.Type)
143+
assert.NotNil(t, imagePart.ImageURL)
144+
145+
if tt.wantFileRef {
146+
// The attachment must be a local-path reference with the expected MIME type.
assert.NotNil(t, imagePart.ImageURL.FileRef)
147+
assert.Equal(t, chat.FileSourceTypeLocalPath, imagePart.ImageURL.FileRef.SourceType)
148+
assert.NotEmpty(t, imagePart.ImageURL.FileRef.LocalPath)
149+
assert.Equal(t, tt.wantMimeType, imagePart.ImageURL.FileRef.MimeType)
150+
}
151+
} else {
152+
// Fallback: no parts at all, and the original text is kept in Content.
assert.Empty(t, msg.Message.MultiContent)
153+
assert.Equal(t, tt.userContent, msg.Message.Content)
154+
}
155+
})
156+
}
157+
}
158+
159+
// TestParseAttachCommand is a table-driven test for ParseAttachCommand. Each
// case feeds one input string and checks both return values: the message text
// with the "/attach <path>" token removed, and the extracted attachment path
// (empty when the input contains no /attach command).
func TestParseAttachCommand(t *testing.T) {
160+
t.Parallel()
161+
162+
// Test table: input is the raw user input; wantText and wantAttachPath are
// the expected first and second return values of ParseAttachCommand.
tests := []struct {
163+
name string
164+
input string
165+
wantText string
166+
wantAttachPath string
167+
}{
168+
{
169+
name: "no attach command",
170+
input: "Hello world",
171+
wantText: "Hello world",
172+
wantAttachPath: "",
173+
},
174+
{
175+
name: "attach at start",
176+
input: "/attach image.png describe this",
177+
wantText: "describe this",
178+
wantAttachPath: "image.png",
179+
},
180+
{
181+
name: "attach in middle",
182+
input: "please /attach photo.jpg analyze it",
183+
wantText: "please analyze it",
184+
wantAttachPath: "photo.jpg",
185+
},
186+
{
187+
name: "attach only",
188+
input: "/attach test.gif",
189+
wantText: "",
190+
wantAttachPath: "test.gif",
191+
},
192+
{
193+
// NOTE(review): despite the case name, this path contains no spaces (it
// uses an underscore); paths with spaces are not exercised by this table.
name: "attach with path containing spaces handled",
194+
input: "/attach my_image.png what is this?",
195+
wantText: "what is this?",
196+
wantAttachPath: "my_image.png",
197+
},
198+
{
199+
// The /attach token sits on the second line; the other lines are kept as-is.
name: "multiline with attach",
200+
input: "First line\n/attach image.jpg second part\nThird line",
201+
wantText: "First line\nsecond part\nThird line",
202+
wantAttachPath: "image.jpg",
203+
},
204+
}
205+
206+
// NOTE(review): tt is captured by a parallel subtest closure; this relies on
// per-iteration loop variables (Go 1.22+) — confirm against go.mod.
for _, tt := range tests {
207+
t.Run(tt.name, func(t *testing.T) {
208+
t.Parallel()
209+
text, path := ParseAttachCommand(tt.input)
210+
assert.Equal(t, tt.wantText, text)
211+
assert.Equal(t, tt.wantAttachPath, path)
212+
})
213+
}
214+
}

pkg/model/provider/anthropic/beta_client.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ func (c *Client) createBetaStream(
3838
return nil, err
3939
}
4040

41-
converted := convertBetaMessages(messages)
41+
converted := convertBetaMessagesWithClient(ctx, &client, messages)
4242
if err := validateAnthropicSequencingBeta(converted); err != nil {
4343
slog.Warn("Invalid message sequencing for Anthropic Beta API detected, attempting self-repair", "error", err)
4444
converted = repairAnthropicSequencingBeta(converted)

0 commit comments

Comments
 (0)