Files
Memoh/internal/agent/types.go
T
Acbox e9c9ed5ab1 fix(agent): route native images into user message for vision models
Images sent by users were silently dropped when the model supported
vision: routeAttachmentsByCapability classified them as "Native", but
extractFileRefPaths only collected "Fallback" (tool_file_ref) paths,
so the image data URL was computed and then discarded — the model saw
neither the image nor its container path.

- Add InlineImages field to RunConfig to carry native image data
- Replace extractFileRefPaths with extractAttachmentPaths that
  collects paths from both Native (FallbackPath) and Fallback
  attachments so the YAML header always lists every attachment
- Add extractNativeImageParts to extract inline image data URLs
- Pass InlineImages as sdk.ImagePart in prepareRunConfig so the
  LLM receives the actual image content alongside the text query
2026-03-24 19:14:33 +08:00

144 lines
3.7 KiB
Go

package agent
import (
"encoding/json"
"time"
sdk "github.com/memohai/twilight-ai/sdk"
)
// SessionContext carries request-scoped identity and routing information.
// RunConfig holds one by value in its Identity field.
// NOTE(review): the per-field notes below are inferred from field names only;
// confirm them against the code that populates this struct.
type SessionContext struct {
// Presumably the identifier of the bot serving the request.
BotID string
// Presumably the identifier of the chat/conversation the request belongs to.
ChatID string
// Presumably the identifier of the agent session.
SessionID string
// Presumably the identity of the sender on the originating channel.
ChannelIdentityID string
// Presumably the name of the platform the request arrived on.
CurrentPlatform string
// Presumably where replies should be delivered; format not visible here.
ReplyTarget string
SessionToken string //nolint:gosec // carries session credential material at runtime
// Presumably true when this invocation runs as a sub-agent of another agent.
IsSubagent bool
}
// SkillEntry represents a skill loaded from the bot container.
// RunConfig carries a slice of these in its Skills field.
type SkillEntry struct {
// Name of the skill.
Name string
// Description of the skill (presumably human/LLM-readable; confirm with the loader).
Description string
// Content of the skill; the exact format is not visible in this file.
Content string
// Metadata holds free-form key/value data; its schema is not defined here.
Metadata map[string]any
}
// Schedule represents a scheduled task definition.
// It serializes to JSON with the keys id, name, description, pattern,
// maxCalls and command.
type Schedule struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
// NOTE(review): presumably a cron-style expression; confirm with the scheduler.
Pattern string `json:"pattern"`
// MaxCalls is optional: a nil pointer is omitted from the JSON output.
// NOTE(review): presumably nil means "no call limit"; confirm with the scheduler.
MaxCalls *int `json:"maxCalls,omitempty"`
Command string `json:"command"`
}
// LoopDetectionConfig controls loop detection behavior.
// RunConfig holds one by value in its LoopDetection field; the zero value has
// Enabled set to false.
type LoopDetectionConfig struct {
// Enabled toggles loop detection.
Enabled bool
}
// RunConfig holds everything needed for a single agent invocation.
type RunConfig struct {
// Model is the SDK model handle used for the invocation.
Model *sdk.Model
// NOTE(review): presumably mirrors ReasoningConfig.Effort; confirm with the caller.
ReasoningEffort string
// Messages is the SDK message list supplied for the run (presumably prior history).
Messages []sdk.Message
// Query is the text query for this invocation.
Query string
// System is presumably the system prompt text; confirm with the prompt builder.
System string
// SessionType is a free-form string; its accepted values are not visible here.
SessionType string
// SupportsImageInput presumably reports whether Model accepts image input.
SupportsImageInput bool
// InlineImages carries native image data for the run. Per the commit that
// introduced this field, these parts are passed to the model alongside the
// text query so that vision-capable models receive the actual image content.
InlineImages []sdk.ImagePart
// Identity is the request-scoped identity and routing information.
Identity SessionContext
// Skills are the skill entries made available to this run.
Skills []SkillEntry
// LoopDetection configures loop detection for this run.
LoopDetection LoopDetectionConfig
}
// GenerateResult holds the result of a non-streaming agent invocation.
type GenerateResult struct {
// Messages are the SDK messages associated with the run's output.
Messages []sdk.Message
// Text is the textual output of the run.
// NOTE(review): presumably with attachment/reaction/speech tags already removed; confirm.
Text string
// Attachments are file references extracted from agent output.
Attachments []FileAttachment
// Reactions are emoji reactions extracted from agent output.
Reactions []ReactionItem
// Speeches are TTS requests extracted from agent output.
Speeches []SpeechItem
// Usage is the SDK usage record; being a pointer it may be nil.
Usage *sdk.Usage
}
// FileAttachment represents a file reference extracted from agent output.
// In JSON, "type" is always emitted; "path", "url", "mime" and "name" are
// omitted when empty.
type FileAttachment struct {
// Type is the attachment kind; the set of accepted values is not visible here.
Type string `json:"type"`
// NOTE(review): presumably a path inside the bot container; confirm with the producer.
Path string `json:"path,omitempty"`
URL string `json:"url,omitempty"`
// Mime is presumably the MIME type of the file.
Mime string `json:"mime,omitempty"`
Name string `json:"name,omitempty"`
}
// ReactionItem represents an emoji reaction extracted from agent output.
// It serializes to JSON as {"emoji": "..."}.
type ReactionItem struct {
Emoji string `json:"emoji"`
}
// SpeechItem represents a TTS request extracted from agent output.
// It serializes to JSON as {"text": "..."}.
type SpeechItem struct {
// Text is the text of the TTS request.
Text string `json:"text"`
}
// SystemFile is a file loaded from the bot container for prompt generation.
type SystemFile struct {
// Filename identifies the file; whether it is a bare name or a path is not visible here.
Filename string
// Content is the file's content as a string.
Content string
}
// ModelConfig holds provider and model information resolved from DB.
type ModelConfig struct {
// ModelID identifies the model.
ModelID string
// ClientType presumably selects which provider client implementation to use; confirm.
ClientType string
APIKey string //nolint:gosec // carries provider credential material at runtime
// BaseURL is presumably the provider API endpoint; confirm with the client factory.
BaseURL string
// ReasoningConfig is optional (pointer); nil handling is up to the consumer.
ReasoningConfig *ReasoningConfig
}
// ReasoningConfig controls extended thinking/reasoning behavior.
// ModelConfig references one through an optional pointer.
type ReasoningConfig struct {
// Enabled toggles extended reasoning.
Enabled bool
// Effort is a free-form effort level; accepted values are not visible here.
Effort string
}
// mustMarshal encodes v as JSON and returns the raw bytes.
// Despite the "must" prefix it never panics: if encoding fails (for example
// for an unsupported type such as a channel) it returns nil and drops the error.
func mustMarshal(v any) json.RawMessage {
	if encoded, err := json.Marshal(v); err == nil {
		return encoded
	}
	return nil
}
// StripTagsFromMessages strips attachment/reaction/speech tags from assistant messages.
//
// It returns a new slice of the same length as msgs; the input slice itself is
// not modified. Messages whose role is not assistant are copied through
// unchanged. For assistant messages the content is rebuilt: every sdk.TextPart
// is replaced by a fresh sdk.TextPart holding only the text returned by
// ExtractTagsFromText (any other fields on the original part are not carried
// over), and every other part is kept as-is.
func StripTagsFromMessages(msgs []sdk.Message) []sdk.Message {
	tagResolvers := DefaultTagResolvers()
	out := make([]sdk.Message, len(msgs))
	for i, m := range msgs {
		if m.Role == sdk.MessageRoleAssistant {
			parts := make([]sdk.MessagePart, len(m.Content))
			for j, p := range m.Content {
				switch textPart := p.(type) {
				case sdk.TextPart:
					// Only the cleaned text is needed; the second return value
					// (presumably the extracted tags) is intentionally discarded.
					stripped, _ := ExtractTagsFromText(textPart.Text, tagResolvers)
					parts[j] = sdk.TextPart{Text: stripped}
				default:
					parts[j] = p
				}
			}
			// m is the loop's copy, so the caller's message is left untouched.
			m.Content = parts
		}
		out[i] = m
	}
	return out
}
// TimeNow is a hook for testing. Defaults to time.Now.
// It is an exported package-level variable, so it can be reassigned (for
// example to a function returning a fixed instant). Nothing in this
// declaration synchronizes such writes, so replace it before any concurrent
// use and restore the original afterwards.
var TimeNow = time.Now