mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-25 07:00:48 +09:00
fix(agent): route native images into user message for vision models
Images sent by users were silently dropped when the model supported vision: routeAttachmentsByCapability classified them as "Native", but extractFileRefPaths only collected "Fallback" (tool_file_ref) paths, so the image data URL was computed and then discarded — the model saw neither the image nor its container path.

- Add InlineImages field to RunConfig to carry native image data
- Replace extractFileRefPaths with extractAttachmentPaths that collects paths from both Native (FallbackPath) and Fallback attachments so the YAML header always lists every attachment
- Add extractNativeImageParts to extract inline image data URLs
- Pass InlineImages as sdk.ImagePart in prepareRunConfig so the LLM receives the actual image content alongside the text query
This commit is contained in:
@@ -51,6 +51,7 @@ type RunConfig struct {
|
||||
System string
|
||||
SessionType string
|
||||
SupportsImageInput bool
|
||||
InlineImages []sdk.ImagePart
|
||||
Identity SessionContext
|
||||
Skills []SkillEntry
|
||||
LoopDetection LoopDetectionConfig
|
||||
|
||||
@@ -239,6 +239,7 @@ func (r *Resolver) resolve(ctx context.Context, req conversation.ChatRequest) (r
|
||||
}
|
||||
|
||||
displayName := r.resolveDisplayName(ctx, req)
|
||||
mergedAttachments := r.routeAndMergeAttachments(ctx, chatModel, req)
|
||||
headerifiedQuery := FormatUserHeader(
|
||||
strings.TrimSpace(req.ExternalMessageID),
|
||||
strings.TrimSpace(req.SourceChannelIdentityID),
|
||||
@@ -246,9 +247,10 @@ func (r *Resolver) resolve(ctx context.Context, req conversation.ChatRequest) (r
|
||||
req.CurrentChannel,
|
||||
strings.TrimSpace(req.ConversationType),
|
||||
strings.TrimSpace(req.ConversationName),
|
||||
extractFileRefPaths(r.routeAndMergeAttachments(ctx, chatModel, req)),
|
||||
extractAttachmentPaths(mergedAttachments),
|
||||
req.Query,
|
||||
)
|
||||
inlineImages := extractNativeImageParts(mergedAttachments)
|
||||
|
||||
reasoningEffort := ""
|
||||
if chatModel.HasCompatibility(models.CompatReasoning) && botSettings.ReasoningEnabled {
|
||||
@@ -280,6 +282,7 @@ func (r *Resolver) resolve(ctx context.Context, req conversation.ChatRequest) (r
|
||||
Messages: sdkMessages,
|
||||
Query: headerifiedQuery,
|
||||
SupportsImageInput: chatModel.HasCompatibility(models.CompatVision),
|
||||
InlineImages: inlineImages,
|
||||
Identity: agentpkg.SessionContext{
|
||||
BotID: req.BotID,
|
||||
ChatID: req.ChatID,
|
||||
@@ -348,7 +351,13 @@ func (r *Resolver) prepareRunConfig(ctx context.Context, cfg agentpkg.RunConfig)
|
||||
})
|
||||
|
||||
if cfg.Query != "" {
|
||||
cfg.Messages = append(cfg.Messages, sdk.UserMessage(cfg.Query))
|
||||
var extra []sdk.MessagePart
|
||||
for _, img := range cfg.InlineImages {
|
||||
if strings.TrimSpace(img.Image) != "" {
|
||||
extra = append(extra, img)
|
||||
}
|
||||
}
|
||||
cfg.Messages = append(cfg.Messages, sdk.UserMessage(cfg.Query, extra...))
|
||||
}
|
||||
|
||||
return cfg
|
||||
@@ -510,14 +519,49 @@ func anyNumberToByte(value any) (byte, bool) {
|
||||
return byte(parsed), true
|
||||
}
|
||||
|
||||
// extractFileRefPaths collects container file paths from gateway attachments
|
||||
// that use the tool_file_ref transport.
|
||||
func extractFileRefPaths(attachments []any) []string {
|
||||
// extractAttachmentPaths collects container file paths from ALL gateway
|
||||
// attachments — both tool_file_ref (fallback) and native images that carry a
|
||||
// FallbackPath. This ensures the YAML user header always lists every
|
||||
// attachment the user sent, regardless of whether the model consumes the
|
||||
// image natively or via the read_media tool.
|
||||
func extractAttachmentPaths(attachments []any) []string {
|
||||
var paths []string
|
||||
for _, att := range attachments {
|
||||
if ga, ok := att.(gatewayAttachment); ok && ga.Transport == gatewayTransportToolFileRef && strings.TrimSpace(ga.Payload) != "" {
|
||||
ga, ok := att.(gatewayAttachment)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if ga.Transport == gatewayTransportToolFileRef && strings.TrimSpace(ga.Payload) != "" {
|
||||
paths = append(paths, ga.Payload)
|
||||
} else if strings.TrimSpace(ga.FallbackPath) != "" {
|
||||
paths = append(paths, ga.FallbackPath)
|
||||
}
|
||||
}
|
||||
return paths
|
||||
}
|
||||
|
||||
// extractNativeImageParts returns sdk.ImagePart entries for attachments that
|
||||
// the model can consume as inline multimodal input (vision-capable images with
|
||||
// an inline data URL or public URL payload).
|
||||
func extractNativeImageParts(attachments []any) []sdk.ImagePart {
|
||||
var parts []sdk.ImagePart
|
||||
for _, att := range attachments {
|
||||
ga, ok := att.(gatewayAttachment)
|
||||
if !ok || ga.Type != "image" {
|
||||
continue
|
||||
}
|
||||
transport := strings.ToLower(strings.TrimSpace(ga.Transport))
|
||||
if transport != gatewayTransportInlineDataURL && transport != gatewayTransportPublicURL {
|
||||
continue
|
||||
}
|
||||
payload := strings.TrimSpace(ga.Payload)
|
||||
if payload == "" {
|
||||
continue
|
||||
}
|
||||
parts = append(parts, sdk.ImagePart{
|
||||
Image: payload,
|
||||
MediaType: strings.TrimSpace(ga.Mime),
|
||||
})
|
||||
}
|
||||
return parts
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user