refactor(agent): replace XML tag extraction with tool-based send/react/speak (#330)

* refactor(agent): replace XML tag extraction with tool-based send/react/speak

Remove the <attachments>, <reactions>, and <speech> XML tag extraction
system from the agent streaming pipeline. Instead, the send/react/speak
tools now handle both same-conversation and cross-conversation delivery:

- send: omit target to deliver attachments in the current conversation;
  specify target for cross-channel messaging
- react: omit target to react in the current conversation
- speak: omit target to speak in the current conversation

Backend changes:
- Add StreamEmitter callback to tools.SessionContext so tools can push
  attachment/reaction/speech events directly into the agent stream
- Wire emitter in agent.go for both streaming and non-streaming paths
- Remove StreamTagExtractor, DefaultTagResolvers, emitTagEvents, and
  delete internal/agent/tags.go entirely
- Remove StripAgentTags calls from assistant_output.go
- Add IsSameConversation detection in the messaging executor;
  same-conversation sends pass raw paths through the emitter for
  downstream ingestion
- Auto-resolve relative paths (e.g. "IDENTITY.md" -> "/data/IDENTITY.md")
- Add Metadata propagation through the full attachment chain
  (tools.Attachment -> agent.FileAttachment -> parseAttachmentDelta)
- Update system_chat.md and _contacts.md prompts

Frontend changes (apps/web):
- Hide send/react/speak tool_call blocks when the result indicates the
  message was delivered to the current conversation
- Defer attachment_delta blocks to end of message (flush on stream
  completion) for consistent positioning with DB-loaded history

* fix(agent): speak tool emits synthesized audio directly as voice attachment

Instead of emitting speech_delta (which requires downstream re-synthesis),
the speak tool now emits the already-synthesized audio as an attachment_delta
with the voice type. This avoids double TTS synthesis and eliminates the
dependency on ttsService being configured on the inbound processor.

Also fixes speak on WebUI where ReplyTarget is empty (same fix as send).
This commit is contained in:
Acbox Liu
2026-04-04 20:55:03 +08:00
committed by GitHub
parent a5f59ea6a5
commit 5cfbaa40e2
12 changed files with 426 additions and 483 deletions
+80 -11
View File
@@ -62,6 +62,10 @@ type SendResult struct {
BotID string
Platform string
Target string
// Local is true when the message targets the current conversation.
// The caller should emit the resolved attachments as stream events.
Local bool
LocalAttachments []channel.Attachment
}
// ReactResult is the success payload returned after reacting.
@@ -95,6 +99,59 @@ func (e *Executor) Send(ctx context.Context, session SessionContext, args map[st
}
outboundMessage = channel.Message{Text: strings.TrimSpace(messageText)}
}
target := firstStringArg(args, "target")
if target == "" {
target = strings.TrimSpace(session.ReplyTarget)
}
// Same-conversation send: no explicit target from the LLM, or target
// matches the current session. Pass raw paths through for downstream
// ingestOutboundAttachments to resolve (matches old tag-based flow).
isSameConv := target == "" || IsSameConversation(session, channelType.String(), target)
if isSameConv {
if rawAttachments, ok := args["attachments"]; ok && rawAttachments != nil {
for _, item := range NormalizeAttachmentInputs(rawAttachments) {
ref := ""
name := ""
attType := ""
switch v := item.(type) {
case string:
ref = strings.TrimSpace(v)
case map[string]any:
ref = firstStringArg(v, "path", "url")
name = firstStringArg(v, "name")
attType = firstStringArg(v, "type")
}
if ref == "" {
continue
}
lower := strings.ToLower(ref)
if !strings.HasPrefix(ref, "/") &&
!strings.HasPrefix(lower, "http://") &&
!strings.HasPrefix(lower, "https://") &&
!strings.HasPrefix(lower, "data:") {
ref = "/data/" + ref
}
if name == "" {
name = filepath.Base(ref)
}
t := channel.AttachmentType(attType)
if t == "" {
t = InferAttachmentTypeFromExt(ref)
}
outboundMessage.Attachments = append(outboundMessage.Attachments,
channel.Attachment{Type: t, URL: ref, Name: name})
}
}
if outboundMessage.IsEmpty() {
return nil, errors.New("message or attachments required")
}
return &SendResult{
BotID: botID, Platform: channelType.String(), Target: target,
Local: true, LocalAttachments: outboundMessage.Attachments,
}, nil
}
if rawAttachments, ok := args["attachments"]; ok && rawAttachments != nil {
items := NormalizeAttachmentInputs(rawAttachments)
if items == nil {
@@ -117,18 +174,8 @@ func (e *Executor) Send(ctx context.Context, session SessionContext, args map[st
if outboundMessage.Format == "" && channel.ContainsMarkdown(outboundMessage.Text) {
outboundMessage.Format = channel.MessageFormatMarkdown
}
target := firstStringArg(args, "target")
if target == "" {
target = strings.TrimSpace(session.ReplyTarget)
}
if target == "" {
return nil, errors.New("target is required")
}
if strings.EqualFold(channelType.String(), strings.TrimSpace(session.CurrentPlatform)) &&
target == strings.TrimSpace(session.ReplyTarget) {
return nil, errors.New("you are trying to send a message to the same conversation you are already in. " +
"Do not use the send tool for this. Instead, write your reply as plain text directly. " +
"To include files, use the <attachments> block in your response (e.g. <attachments>[{\"type\":\"image\",\"path\":\"/data/media/file.jpg\"}]</attachments>)")
return nil, errors.New("target is required for cross-conversation send")
}
if err := e.Sender.Send(ctx, botID, channelType, channel.SendRequest{Target: target, Message: outboundMessage}); err != nil {
if e.Logger != nil {
@@ -189,6 +236,23 @@ func (e *Executor) CanSend() bool { return e.Sender != nil && e.Resolver != nil
// CanReact reports whether the executor is able to dispatch reactions, which
// requires both a reactor and a resolver to be set.
func (e *Executor) CanReact() bool {
	hasReactor := e.Reactor != nil
	hasResolver := e.Resolver != nil
	return hasReactor && hasResolver
}
// IsSameConversation reports whether the given platform and target address the
// conversation the session is currently in.
//
// The session's ReplyTarget and CurrentPlatform are whitespace-trimmed before
// comparison; the platform and target arguments are compared as passed. An
// empty platform or target argument falls back to the session's own value.
// Platforms are compared case-insensitively, targets exactly. When the session
// has no reply target the result is always false.
func IsSameConversation(session SessionContext, platform, target string) bool {
	sessionTarget := strings.TrimSpace(session.ReplyTarget)
	if sessionTarget == "" {
		// Without a known reply target there is nothing to match against.
		return false
	}
	// An explicit target must match exactly; an omitted one defaults to the
	// session's reply target and therefore always matches.
	if target != "" && target != sessionTarget {
		return false
	}
	// An omitted platform defaults to the session's platform, which trivially
	// equals itself.
	if platform == "" {
		return true
	}
	sessionPlatform := strings.TrimSpace(session.CurrentPlatform)
	return strings.EqualFold(platform, sessionPlatform)
}
func (*Executor) resolveBotID(args map[string]any, session SessionContext) (string, error) {
botID := firstStringArg(args, "bot_id")
if botID == "" {
@@ -263,6 +327,11 @@ func (e *Executor) resolveAttachmentRef(ctx context.Context, botID, ref, attType
}
return &channel.Attachment{Type: t, Base64: ref, Name: name}
}
// Resolve relative paths against the container's data mount.
// LLMs often pass bare filenames like "IDENTITY.md" instead of "/data/IDENTITY.md".
if !strings.HasPrefix(ref, "/") {
ref = "/data/" + ref
}
if name == "" {
name = filepath.Base(ref)
}