Memoh/agent/src/agent.ts

import {
  generateText,
  ImagePart,
  LanguageModelUsage,
  ModelMessage,
  stepCountIs,
  streamText,
  ToolSet,
  UserModelMessage,
} from 'ai'
import {
  AgentInput,
  AgentParams,
  AgentSkill,
  allActions,
  MCPConnection,
  Schedule,
} from './types'
import { ModelInput, hasInputModality } from './types/model'
import { system, schedule, user, subagentSystem } from './prompts'
import { AuthFetcher } from './index'
import { createModel } from './model'
import { AgentAction } from './types/action'
import {
  extractAttachmentsFromText,
  stripAttachmentsFromMessages,
  dedupeAttachments,
  AttachmentsStreamExtractor,
} from './utils/attachments'
import type {
  ContainerFileAttachment,
  ImageAttachment,
} from './types/attachment'
import { getMCPTools } from './tools/mcp'
import { getTools } from './tools'
import { buildIdentityHeaders } from './utils/headers'

export const createAgent = (
  {
    model: modelConfig,
    activeContextTime = 24 * 60,
    language = 'Same as the user input',
    allowedActions = allActions,
    channels = [],
    skills = [],
    mcpConnections = [],
    currentChannel = 'Unknown Channel',
    identity = {
      botId: '',
      containerId: '',
      channelIdentityId: '',
      displayName: '',
    },
    auth,
  }: AgentParams,
  fetch: AuthFetcher,
) => {
  const model = createModel(modelConfig)
  const enabledSkills: AgentSkill[] = []

  const enableSkill = (skill: string) => {
    const agentSkill = skills.find((s) => s.name === skill)
    if (agentSkill) {
      enabledSkills.push(agentSkill)
    }
  }

  const getEnabledSkills = () => {
    return enabledSkills.map((skill) => skill.name)
  }

  const loadSystemFiles = async () => {
    if (!auth?.bearer || !identity.botId) {
      return {
        identityContent: '',
        soulContent: '',
        toolsContent: '',
      }
    }
    const readViaMCP = async (path: string): Promise<string> => {
      const url = `${auth.baseUrl.replace(/\/$/, '')}/bots/${identity.botId}/tools`
      const headers: Record<string, string> = {
        'Content-Type': 'application/json',
        Accept: 'application/json, text/event-stream',
        Authorization: `Bearer ${auth.bearer}`,
      }
      if (identity.channelIdentityId) {
        headers['X-Memoh-Channel-Identity-Id'] = identity.channelIdentityId
      }
      const body = JSON.stringify({
        jsonrpc: '2.0',
        id: `read-${path}`,
        method: 'tools/call',
        params: { name: 'read', arguments: { path } },
      })
      const response = await fetch(url, { method: 'POST', headers, body })
      if (!response.ok) return ''
      const data = await response.json().catch(() => ({}))
      const structured =
        data?.result?.structuredContent ?? data?.result?.content?.[0]?.text
      if (typeof structured === 'string') {
        try {
          const parsed = JSON.parse(structured)
          return typeof parsed?.content === 'string' ? parsed.content : ''
        } catch {
          return structured
        }
      }
      if (typeof structured === 'object' && structured?.content) {
        return typeof structured.content === 'string' ? structured.content : ''
      }
      return ''
    }
    const [identityContent, soulContent, toolsContent] = await Promise.all([
      readViaMCP('IDENTITY.md'),
      readViaMCP('SOUL.md'),
      readViaMCP('TOOLS.md'),
    ])
    return {
      identityContent,
      soulContent,
      toolsContent,
    }
  }

  const prepareInputWithMCPImageBase64 = async (
    input: AgentInput,
  ): Promise<AgentInput> => {
    if (!auth?.bearer || !identity.botId) {
      return input
    }
    const url = `${auth.baseUrl.replace(/\/$/, '')}/bots/${identity.botId}/tools`
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
      Accept: 'application/json, text/event-stream',
      Authorization: `Bearer ${auth.bearer}`,
    }
    if (identity.channelIdentityId) {
      headers['X-Memoh-Channel-Identity-Id'] = identity.channelIdentityId
    }
    if (identity.sessionToken) {
      headers['X-Memoh-Session-Token'] = identity.sessionToken
    }
    if (identity.currentPlatform) {
      headers['X-Memoh-Current-Platform'] = identity.currentPlatform
    }
    if (identity.replyTarget) {
      headers['X-Memoh-Reply-Target'] = identity.replyTarget
    }
    const attachments = await Promise.all(
      input.attachments.map(async (attachment) => {
        if (attachment.type !== 'image') {
          return attachment
        }
        const image = attachment as ImageAttachment
        if (typeof image.base64 === 'string' && image.base64.trim() !== '') {
          return image
        }
        const path = String(image.path ?? '').trim()
        if (!path) {
          return image
        }
        const quotedPath = `'${path.replace(/'/g, '\'\\\'\'')}'`
        const command = `base64 ${quotedPath} | tr -d '\\n'`
        const body = JSON.stringify({
          jsonrpc: '2.0',
          id: `read-image-${quotedPath}`,
          method: 'tools/call',
          params: {
            name: 'exec',
            arguments: { command },
          },
        })
        try {
          const response = await fetch(url, { method: 'POST', headers, body })
          if (!response.ok) {
            return image
          }
          const payload = await response.json().catch(() => ({}))
          const structured = payload?.result?.structuredContent
          const execResult = (
            structured && typeof structured === 'object' ? structured : null
          ) as { stdout?: unknown; exit_code?: unknown } | null
          const exitCode = Number(execResult?.exit_code ?? 1)
          const stdout =
            typeof execResult?.stdout === 'string'
              ? execResult.stdout.trim()
              : ''
          if (exitCode !== 0 || stdout === '') {
            return image
          }
          const mime = String(image.mime ?? '').trim() || 'image/png'
          return {
            ...image,
            base64: `data:${mime};base64,${stdout}`,
          }
        } catch {
          return image
        }
      }),
    )
    return { ...input, attachments }
  }

  const generateSystemPrompt = async () => {
    const { identityContent, soulContent, toolsContent } =
      await loadSystemFiles()
    return system({
      date: new Date(),
      language,
      maxContextLoadTime: activeContextTime,
      channels,
      currentChannel,
      skills,
      enabledSkills,
      identityContent,
      soulContent,
      toolsContent,
    })
  }

  const getAgentTools = async () => {
    const baseUrl = auth.baseUrl.replace(/\/$/, '')
    const botId = identity.botId.trim()
    if (!baseUrl || !botId) {
      return {
        tools: {},
        close: async () => {},
      }
    }
    const headers = buildIdentityHeaders(identity, auth)
    const builtins: MCPConnection[] = [
      {
        type: 'http',
        name: 'builtin',
        url: `${baseUrl}/bots/${botId}/tools`,
        headers,
      },
    ]
    const { tools: mcpTools, close: closeMCP } = await getMCPTools(
      [...builtins, ...mcpConnections],
      {
        auth,
        fetch,
        botId,
      },
    )
    const tools = getTools(allowedActions, {
      fetch,
      model: modelConfig,
      identity,
      auth,
      enableSkill,
    })
    return {
      tools: { ...mcpTools, ...tools } as ToolSet,
      close: closeMCP,
    }
  }

  const generateUserPrompt = (input: AgentInput) => {
    const supportsImage = hasInputModality(modelConfig, ModelInput.Image)

    // Separate attachments by model capability: native images vs fallback file paths.
    const nativeImages = supportsImage
      ? input.attachments.filter((a) => a.type === 'image')
      : []
    const fallbackFiles = input.attachments.filter(
      (a): a is ContainerFileAttachment => a.type === 'file',
    )
    // Images the model cannot handle natively are mentioned as path references.
    const unsupportedImages: ContainerFileAttachment[] = supportsImage
      ? []
      : input.attachments
          .filter((a) => a.type === 'image')
          .map((a) => ({
            type: 'file' as const,
            path: String(
              (a as ImageAttachment).path || a.metadata?.path || '[image]',
            ),
            metadata: a.metadata,
          }))
    const allFiles: ContainerFileAttachment[] = [
      ...fallbackFiles,
      ...unsupportedImages,
    ]

    const text = user(input.query, {
      channelIdentityId: identity.channelIdentityId || identity.contactId || '',
      displayName: identity.displayName || identity.contactName || 'User',
      channel: currentChannel,
      conversationType: identity.conversationType || 'direct',
      date: new Date(),
      attachments: allFiles,
    })
    const imageParts: ImagePart[] = nativeImages
      .map((image) => {
        const img = image as ImageAttachment
        if (img.base64) {
          return { type: 'image', image: img.base64 } as ImagePart
        }
        if (img.url) {
          return { type: 'image', image: img.url } as ImagePart
        }
        return { type: 'image', image: '' } as ImagePart
      })
      .filter((p) => p.image !== '')
    const userMessage: UserModelMessage = {
      role: 'user',
      content: [{ type: 'text', text }, ...imageParts],
    }
    return userMessage
  }

  const ask = async (input: AgentInput) => {
    const preparedInput = await prepareInputWithMCPImageBase64(input)
    const userPrompt = generateUserPrompt(preparedInput)
    const messages = [...preparedInput.messages, userPrompt]
    preparedInput.skills.forEach((skill) => enableSkill(skill))
    const systemPrompt = await generateSystemPrompt()
    const { tools, close } = await getAgentTools()
    const { response, reasoning, text, usage } = await generateText({
      model,
      messages,
      system: systemPrompt,
      stopWhen: stepCountIs(Infinity),
      prepareStep: () => {
        return {
          system: systemPrompt,
        }
      },
      onFinish: async () => {
        await close()
      },
      tools,
    })
    const { cleanedText, attachments: textAttachments } =
      extractAttachmentsFromText(text)
    const { messages: strippedMessages, attachments: messageAttachments } =
      stripAttachmentsFromMessages(response.messages)
    const allAttachments = dedupeAttachments([
      ...textAttachments,
      ...messageAttachments,
    ])
    return {
      messages: strippedMessages,
      reasoning: reasoning.map((part) => part.text),
      usage,
      text: cleanedText,
      attachments: allAttachments,
      skills: getEnabledSkills(),
    }
  }

  const askAsSubagent = async (params: {
    input: string;
    name: string;
    description: string;
    messages: ModelMessage[];
  }) => {
    const userPrompt: UserModelMessage = {
      role: 'user',
      content: [{ type: 'text', text: params.input }],
    }
    const generateSubagentSystemPrompt = () => {
      return subagentSystem({
        date: new Date(),
        name: params.name,
        description: params.description,
      })
    }
    const messages = [...params.messages, userPrompt]
    const { tools, close } = await getAgentTools()
    const { response, reasoning, text, usage } = await generateText({
      model,
      messages,
      system: generateSubagentSystemPrompt(),
      stopWhen: stepCountIs(Infinity),
      prepareStep: () => {
        return {
          system: generateSubagentSystemPrompt(),
        }
      },
      onFinish: async () => {
        await close()
      },
      tools,
    })
    return {
      messages: [userPrompt, ...response.messages],
      reasoning: reasoning.map((part) => part.text),
      usage,
      text,
      skills: getEnabledSkills(),
    }
  }

  const triggerSchedule = async (params: {
    schedule: Schedule;
    messages: ModelMessage[];
    skills: string[];
  }) => {
    const scheduleMessage: UserModelMessage = {
      role: 'user',
      content: [
        {
          type: 'text',
          text: schedule({ schedule: params.schedule, date: new Date() }),
        },
      ],
    }
    const messages = [...params.messages, scheduleMessage]
    params.skills.forEach((skill) => enableSkill(skill))
    const { tools, close } = await getAgentTools()
    const { response, reasoning, text, usage } = await generateText({
      model,
      messages,
      system: await generateSystemPrompt(),
      stopWhen: stepCountIs(Infinity),
      onFinish: async () => {
        await close()
      },
      tools,
    })
    return {
      messages: [scheduleMessage, ...response.messages],
      reasoning: reasoning.map((part) => part.text),
      usage,
      text,
      skills: getEnabledSkills(),
    }
  }

  const resolveStreamErrorMessage = (raw: unknown): string => {
    if (raw instanceof Error && raw.message.trim()) {
      return raw.message
    }
    if (typeof raw === 'string' && raw.trim()) {
      return raw
    }
    if (raw && typeof raw === 'object') {
      const candidate = raw as { message?: unknown; error?: unknown }
      if (typeof candidate.message === 'string' && candidate.message.trim()) {
        return candidate.message
      }
      if (typeof candidate.error === 'string' && candidate.error.trim()) {
        return candidate.error
      }
      if (candidate.error instanceof Error && candidate.error.message.trim()) {
        return candidate.error.message
      }
    }
    return 'Model stream failed'
  }

  async function* stream(input: AgentInput): AsyncGenerator<AgentAction> {
    const preparedInput = await prepareInputWithMCPImageBase64(input)
    const userPrompt = generateUserPrompt(preparedInput)
    const messages = [...preparedInput.messages, userPrompt]
    preparedInput.skills.forEach((skill) => enableSkill(skill))
    const systemPrompt = await generateSystemPrompt()
    const attachmentsExtractor = new AttachmentsStreamExtractor()
    const result: {
      messages: ModelMessage[];
      reasoning: string[];
      usage: LanguageModelUsage | null;
    } = {
      messages: [],
      reasoning: [],
      usage: null,
    }
    const { tools, close } = await getAgentTools()
    const { fullStream } = streamText({
      model,
      messages,
      system: systemPrompt,
      stopWhen: stepCountIs(Infinity),
      prepareStep: () => {
        return {
          system: systemPrompt,
        }
      },
      tools,
      onFinish: async ({ usage, reasoning, response }) => {
        await close()
        result.usage = usage as never
        result.reasoning = reasoning.map((part) => part.text)
        result.messages = response.messages
      },
    })
    yield {
      type: 'agent_start',
      input,
    }
    for await (const chunk of fullStream) {
      if (chunk.type === 'error') {
        throw new Error(
          resolveStreamErrorMessage((chunk as { error?: unknown }).error),
        )
      }
      switch (chunk.type) {
        case 'reasoning-start':
          yield {
            type: 'reasoning_start',
            metadata: chunk,
          }
          break
        case 'reasoning-delta':
          yield {
            type: 'reasoning_delta',
            delta: chunk.text,
          }
          break
        case 'reasoning-end':
          yield {
            type: 'reasoning_end',
            metadata: chunk,
          }
          break
        case 'text-start':
          yield {
            type: 'text_start',
          }
          break
        case 'text-delta': {
          const { visibleText, attachments } = attachmentsExtractor.push(
            chunk.text,
          )
          if (visibleText) {
            yield {
              type: 'text_delta',
              delta: visibleText,
            }
          }
          if (attachments.length) {
            yield {
              type: 'attachment_delta',
              attachments,
            }
          }
          break
        }
        case 'text-end': {
          // Flush any remaining buffered content before ending the text stream.
          const remainder = attachmentsExtractor.flushRemainder()
          if (remainder.visibleText) {
            yield {
              type: 'text_delta',
              delta: remainder.visibleText,
            }
          }
          if (remainder.attachments.length) {
            yield {
              type: 'attachment_delta',
              attachments: remainder.attachments,
            }
          }
          yield {
            type: 'text_end',
            metadata: chunk,
          }
          break
        }
        case 'tool-call':
          yield {
            type: 'tool_call_start',
            toolName: chunk.toolName,
            toolCallId: chunk.toolCallId,
            input: chunk.input,
            metadata: chunk,
          }
          break
        case 'tool-result':
          yield {
            type: 'tool_call_end',
            toolName: chunk.toolName,
            toolCallId: chunk.toolCallId,
            input: chunk.input,
            result: chunk.output,
            metadata: chunk,
          }
          break
        case 'file':
          yield {
            type: 'attachment_delta',
            attachments: [
              {
                type: 'image',
                url: `data:${chunk.file.mediaType ?? 'image/png'};base64,${chunk.file.base64}`,
                mime: chunk.file.mediaType ?? 'image/png',
              },
            ],
          }
      }
    }

    const { messages: strippedMessages } = stripAttachmentsFromMessages(
      result.messages,
    )
    yield {
      type: 'agent_end',
      messages: strippedMessages,
      reasoning: result.reasoning,
      usage: result.usage!,
      skills: getEnabledSkills(),
    }
  }

  return {
    stream,
    ask,
    askAsSubagent,
    triggerSchedule,
  }
}