From e9c9ed5ab157a5443b1cfdf49c18a3a845246b28 Mon Sep 17 00:00:00 2001 From: Acbox Date: Tue, 24 Mar 2026 19:14:33 +0800 Subject: [PATCH] fix(agent): route native images into user message for vision models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Images sent by users were silently dropped when the model supported vision: routeAttachmentsByCapability classified them as "Native", but extractFileRefPaths only collected "Fallback" (tool_file_ref) paths, so the image data URL was computed and then discarded — the model saw neither the image nor its container path. - Add InlineImages field to RunConfig to carry native image data - Replace extractFileRefPaths with extractAttachmentPaths that collects paths from both Native (FallbackPath) and Fallback attachments so the YAML header always lists every attachment - Add extractNativeImageParts to extract inline image data URLs - Pass InlineImages as sdk.ImagePart in prepareRunConfig so the LLM receives the actual image content alongside the text query --- internal/agent/types.go | 1 + internal/conversation/flow/resolver.go | 56 +++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/internal/agent/types.go b/internal/agent/types.go index ff37a9f4..dc69d457 100644 --- a/internal/agent/types.go +++ b/internal/agent/types.go @@ -51,6 +51,7 @@ type RunConfig struct { System string SessionType string SupportsImageInput bool + InlineImages []sdk.ImagePart Identity SessionContext Skills []SkillEntry LoopDetection LoopDetectionConfig diff --git a/internal/conversation/flow/resolver.go b/internal/conversation/flow/resolver.go index b12c1b18..e9293fef 100644 --- a/internal/conversation/flow/resolver.go +++ b/internal/conversation/flow/resolver.go @@ -239,6 +239,7 @@ func (r *Resolver) resolve(ctx context.Context, req conversation.ChatRequest) (r } displayName := r.resolveDisplayName(ctx, req) + mergedAttachments := r.routeAndMergeAttachments(ctx, chatModel, req) headerifiedQuery := FormatUserHeader( strings.TrimSpace(req.ExternalMessageID), strings.TrimSpace(req.SourceChannelIdentityID), @@ -246,9 +247,10 @@ func (r *Resolver) resolve(ctx context.Context, req conversation.ChatRequest) (r req.CurrentChannel, strings.TrimSpace(req.ConversationType), strings.TrimSpace(req.ConversationName), - extractFileRefPaths(r.routeAndMergeAttachments(ctx, chatModel, req)), + extractAttachmentPaths(mergedAttachments), req.Query, ) + inlineImages := extractNativeImageParts(mergedAttachments) reasoningEffort := "" if chatModel.HasCompatibility(models.CompatReasoning) && botSettings.ReasoningEnabled { @@ -280,6 +282,7 @@ func (r *Resolver) resolve(ctx context.Context, req conversation.ChatRequest) (r Messages: sdkMessages, Query: headerifiedQuery, SupportsImageInput: chatModel.HasCompatibility(models.CompatVision), + InlineImages: inlineImages, Identity: agentpkg.SessionContext{ BotID: req.BotID, ChatID: req.ChatID, @@ -348,7 +351,13 @@ func (r *Resolver) prepareRunConfig(ctx context.Context, cfg agentpkg.RunConfig) }) if cfg.Query != "" { - cfg.Messages = append(cfg.Messages, sdk.UserMessage(cfg.Query)) + var extra []sdk.MessagePart + for _, img := range cfg.InlineImages { + if strings.TrimSpace(img.Image) != "" { + extra = append(extra, img) + } + } + cfg.Messages = append(cfg.Messages, sdk.UserMessage(cfg.Query, extra...)) } return cfg @@ -510,14 +519,49 @@ func anyNumberToByte(value any) (byte, bool) { return byte(parsed), true } -// extractFileRefPaths collects container file paths from gateway attachments -// that use the tool_file_ref transport. -func extractFileRefPaths(attachments []any) []string { +// extractAttachmentPaths collects container file paths from ALL gateway +// attachments — both tool_file_ref (fallback) and native images that carry a +// FallbackPath. This ensures the YAML user header always lists every +// attachment the user sent, regardless of whether the model consumes the +// image natively or via the read_media tool. +func extractAttachmentPaths(attachments []any) []string { var paths []string for _, att := range attachments { - if ga, ok := att.(gatewayAttachment); ok && ga.Transport == gatewayTransportToolFileRef && strings.TrimSpace(ga.Payload) != "" { + ga, ok := att.(gatewayAttachment) + if !ok { + continue + } + if ga.Transport == gatewayTransportToolFileRef && strings.TrimSpace(ga.Payload) != "" { paths = append(paths, ga.Payload) + } else if strings.TrimSpace(ga.FallbackPath) != "" { + paths = append(paths, ga.FallbackPath) } } return paths } + +// extractNativeImageParts returns sdk.ImagePart entries for attachments that +// the model can consume as inline multimodal input (vision-capable images with +// an inline data URL or public URL payload). +func extractNativeImageParts(attachments []any) []sdk.ImagePart { + var parts []sdk.ImagePart + for _, att := range attachments { + ga, ok := att.(gatewayAttachment) + if !ok || ga.Type != "image" { + continue + } + transport := strings.ToLower(strings.TrimSpace(ga.Transport)) + if transport != gatewayTransportInlineDataURL && transport != gatewayTransportPublicURL { + continue + } + payload := strings.TrimSpace(ga.Payload) + if payload == "" { + continue + } + parts = append(parts, sdk.ImagePart{ + Image: payload, + MediaType: strings.TrimSpace(ga.Mime), + }) + } + return parts +}