// Memoh/packages/agent/src/agent.ts
import {
generateText,
type ImagePart,
LanguageModelUsage,
ModelMessage,
stepCountIs,
streamText,
ToolSet,
UserModelMessage,
type PrepareStepFunction,
} from 'ai'
import {
AgentInput,
AgentParams,
AgentSkill,
AgentStreamAction,
allActions,
Heartbeat,
MCPConnection,
Schedule,
SystemFile,
} from './types'
import { ClientType, ModelConfig, ModelInput, hasInputModality } from './types/model'
import { system, schedule, heartbeat, subagentSystem } from './prompts'
import { AuthFetcher } from './types'
import { createModel } from './model'
import {
extractAttachmentsFromText,
stripAttachmentsFromMessages,
dedupeAttachments,
AttachmentsStreamExtractor,
} from './utils/attachments'
import type { GatewayInputAttachment } from './types/attachment'
import { getMCPTools } from './tools/mcp'
import { getTools } from './tools'
import { buildIdentityHeaders } from './utils/headers'
import { createFS } from './utils'
import { createTextLoopGuard, createTextLoopProbeBuffer } from './sential'
import { createToolLoopGuardedTools } from './tool-loop'
import { createPrepareStepWithReadMedia } from './utils/read-media-injector'
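// Thinking-token budgets per reasoning effort level, for providers that take
// an explicit budget rather than an effort label.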
const ANTHROPIC_BUDGET: Record<string, number> = { low: 5000, medium: 16000, high: 50000 }
const GOOGLE_BUDGET: Record<string, number> = { low: 5000, medium: 16000, high: 50000 }
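// Thresholds for the text/tool loop guards shared by the streaming and
// non-streaming paths below.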
const LOOP_DETECTED_ABORT_MESSAGE = 'loop detected, stream aborted'
const LOOP_DETECTED_STREAK_THRESHOLD = 3
const LOOP_DETECTED_MIN_NEW_GRAMS_PER_CHUNK = 8
const LOOP_DETECTED_PROBE_CHARS = 256
const TOOL_LOOP_DETECTED_ABORT_MESSAGE = 'tool loop detected, stream aborted'
const TOOL_LOOP_REPEAT_THRESHOLD = 5
const TOOL_LOOP_WARNINGS_BEFORE_ABORT = 1
const TOOL_LOOP_WARNING_KEY = '__memoh_tool_loop_warning'
const TOOL_LOOP_WARNING_TEXT = '[MEMOH_TOOL_LOOP_WARNING] Repeated identical tool invocation (same tool + arguments) was detected more than 5 times. Stop looping this tool and either summarize current results or change strategy.'
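// Maps the generic reasoning config onto provider-specific options:
// Anthropic gets a thinking budget, OpenAI a reasoningEffort label, and
// Google a thinkingConfig budget. Returns undefined when reasoning is off.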
const buildProviderOptions = (config: ModelConfig): Record<string, Record<string, unknown>> | undefined => {
if (!config.reasoning?.enabled) return undefined
const effort = config.reasoning.effort ?? 'medium'
switch (config.clientType) {
case ClientType.AnthropicMessages:
return { anthropic: { thinking: { type: 'enabled' as const, budgetTokens: ANTHROPIC_BUDGET[effort] } } }
case ClientType.OpenAIResponses:
case ClientType.OpenAICompletions:
return { openai: { reasoningEffort: effort } }
case ClientType.GoogleGenerativeAI:
return { google: { thinkingConfig: { thinkingBudget: GOOGLE_BUDGET[effort] } } }
default:
return undefined
}
}
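// Aligns usage records with the flattened response messages: each step's
// usage is attributed to that step's first message, the rest get null.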
const buildStepUsages = (
steps: { usage: LanguageModelUsage; response: { messages: unknown[] } }[],
): (LanguageModelUsage | null)[] => {
const usages: (LanguageModelUsage | null)[] = []
for (const step of steps) {
for (let i = 0; i < step.response.messages.length; i++) {
usages.push(i === 0 ? step.usage : null)
}
}
return usages
}
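// Converts image attachments delivered as inline data URLs or public URLs
// into native ImagePart entries for multimodal model input.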
export const buildNativeImageParts = (attachments: GatewayInputAttachment[]): ImagePart[] => {
return attachments
.filter((attachment) =>
attachment.type === 'image' &&
(attachment.transport === 'inline_data_url' || attachment.transport === 'public_url') &&
Boolean(attachment.payload),
)
.map((attachment): ImagePart => ({ type: 'image', image: attachment.payload }))
}
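// Factory for a single-bot agent. Binds the model config, bot identity, and
// authenticated fetch, and exposes streaming and non-streaming entrypoints.
//
// Minimal usage sketch (assumes a valid ModelConfig and AuthFetcher in scope,
// and that AgentInput carries only the fields referenced in this file; the
// remaining AgentParams fall back to their defaults):
//
//   const agent = createAgent({ model: modelConfig, auth }, authFetch)
//   const { text } = await agent.ask({ query: 'hi', messages: [], attachments: [], skills: [] })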
export const createAgent = (
{
model: modelConfig,
activeContextTime = 24 * 60,
language = 'Same as the user input',
allowedActions = allActions,
channels = [],
skills = [],
mcpConnections = [],
currentChannel = 'Unknown Channel',
identity = {
botId: '',
containerId: '',
channelIdentityId: '',
displayName: '',
},
auth,
inbox = [],
loopDetection = { enabled: false },
}: AgentParams,
fetch: AuthFetcher,
) => {
const model = createModel(modelConfig)
const supportsImageInput = hasInputModality(modelConfig, ModelInput.Image)
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const providerOptions = buildProviderOptions(modelConfig) as any
const loopDetectionEnabled = loopDetection?.enabled === true
const enabledSkills: AgentSkill[] = []
const fs = createFS({ fetch, botId: identity.botId })
const enableSkill = (skill: string) => {
const agentSkill = skills.find((s) => s.name === skill)
if (agentSkill) {
enabledSkills.push(agentSkill)
}
}
const getEnabledSkills = () => {
return enabledSkills.map((skill) => skill.name)
}
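// Reads the bot's system markdown files (identity, soul, tools, memory,
// profiles, plus today's and yesterday's memory logs) from /data; files that
// fail to load resolve to empty content.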
const loadSystemFiles = async (): Promise<SystemFile[]> => {
const home = '/data'
const pad = (n: number) => n.toString().padStart(2, '0')
const getDateString = (date: Date) =>
`${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`
const today = getDateString(new Date())
const yesterday = getDateString(new Date(Date.now() - 24 * 60 * 60 * 1000))
const files = [
'IDENTITY.md',
'SOUL.md',
'TOOLS.md',
'MEMORY.md',
'PROFILES.md',
`memory/${today}.md`,
`memory/${yesterday}.md`,
]
const promises = files.map(async (file) => ({
filename: file,
content: await fs.readText(`${home}/${file}`).catch(() => ''),
}))
return await Promise.all(promises) as SystemFile[]
}
const generateSystemPrompt = async () => {
const files = await loadSystemFiles()
return system({
date: new Date(),
language,
maxContextLoadTime: activeContextTime,
channels,
currentChannel,
skills,
enabledSkills,
inbox,
supportsImageInput,
files,
})
}
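// Assembles the tool set: a built-in MCP endpoint at /bots/{botId}/tools,
// any configured MCP connections, and the local action tools. Returns a
// close() that tears down the MCP connections; the set is empty when
// baseUrl or botId is missing.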
const getAgentTools = async () => {
const baseUrl = auth.baseUrl.replace(/\/$/, '')
const botId = identity.botId.trim()
if (!baseUrl || !botId) {
return {
tools: {},
close: async () => {},
}
}
const headers = buildIdentityHeaders(identity, auth)
const builtins: MCPConnection[] = [
{
type: 'http',
name: 'builtin',
url: `${baseUrl}/bots/${botId}/tools`,
headers,
},
]
const { tools: mcpTools, close: closeMCP } = await getMCPTools(
[...builtins, ...mcpConnections],
{
auth,
fetch,
botId,
},
)
const tools = getTools(allowedActions, {
fetch,
model: modelConfig,
identity,
auth,
enableSkill,
})
return {
tools: { ...mcpTools, ...tools } as ToolSet,
close: closeMCP,
}
}
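// Builds the user message, appending native image parts when the model
// supports image input.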
const generateUserPrompt = (input: AgentInput) => {
const imageParts = supportsImageInput ? buildNativeImageParts(input.attachments) : []
const userMessage: UserModelMessage = {
role: 'user',
content: [{ type: 'text', text: input.query }, ...imageParts],
}
return userMessage
}
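// Per-run text loop inspector for the non-streaming path; returns null when
// loop detection is disabled, otherwise throws to abort on repetitive output.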
const createNonStreamTextLoopInspector = () => {
if (!loopDetectionEnabled) {
return null
}
const textLoopGuard = createTextLoopGuard({
consecutiveHitsToAbort: LOOP_DETECTED_STREAK_THRESHOLD,
minNewGramsPerChunk: LOOP_DETECTED_MIN_NEW_GRAMS_PER_CHUNK,
})
return (text: string) => {
const result = textLoopGuard.inspect(text)
if (result.abort) {
throw new Error(LOOP_DETECTED_ABORT_MESSAGE)
}
}
}
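// Wraps the tool set with a repeated-invocation guard (same tool + same
// arguments) when loop detection is enabled; otherwise passes tools through.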
const buildGuardedTools = (
tools: ToolSet,
onAbortToolCall: (toolCallId: string) => void = () => {},
): ToolSet => {
if (!loopDetectionEnabled) {
return tools
}
return createToolLoopGuardedTools(tools, {
repeatThreshold: TOOL_LOOP_REPEAT_THRESHOLD,
warningsBeforeAbort: TOOL_LOOP_WARNINGS_BEFORE_ABORT,
onAbortToolCall,
warningKey: TOOL_LOOP_WARNING_KEY,
warningText: TOOL_LOOP_WARNING_TEXT,
})
}
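// Shared non-streaming driver: wires the readMedia prepareStep, guarded
// tools, and loop checks into generateText, and always closes the MCP
// connections afterwards (close errors are rethrown only if the run itself
// succeeded).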
const runTextGeneration = async ({
messages,
systemPrompt,
basePrepareStep,
}: {
messages: ModelMessage[]
systemPrompt: string
basePrepareStep?: PrepareStepFunction
}) => {
const { tools: baseTools, close } = await getAgentTools()
const { prepareStep, tools: readMediaTools } = createPrepareStepWithReadMedia({
modelConfig,
fs,
systemPrompt,
basePrepareStep,
})
const tools = { ...baseTools, ...readMediaTools }
let shouldAbortForToolLoop = false
const guardedTools = buildGuardedTools(tools, () => {
shouldAbortForToolLoop = true
})
const inspectTextLoop = createNonStreamTextLoopInspector()
let runError: unknown = null
try {
return await generateText({
model,
messages,
system: systemPrompt,
...(providerOptions && { providerOptions }),
stopWhen: stepCountIs(Infinity),
prepareStep,
...(loopDetectionEnabled && {
onStepFinish: ({ text }: { text: string }) => {
if (shouldAbortForToolLoop) {
throw new Error(TOOL_LOOP_DETECTED_ABORT_MESSAGE)
}
if (inspectTextLoop) {
inspectTextLoop(text)
}
},
}),
tools: guardedTools,
})
} catch (error) {
runError = error
throw error
} finally {
try {
await close()
} catch (closeError) {
if (runError == null) {
throw closeError
}
console.error(closeError)
}
}
}
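// Primary request/response entrypoint: builds the prompts, runs generation,
// then extracts inline attachments from the final text and strips them from
// the response messages.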
const ask = async (input: AgentInput) => {
const userPrompt = generateUserPrompt(input)
const messages = [...input.messages, userPrompt]
input.skills.forEach((skill) => enableSkill(skill))
const systemPrompt = await generateSystemPrompt()
const { response, reasoning, text, usage, steps } = await runTextGeneration({
messages,
systemPrompt,
basePrepareStep: () => ({ system: systemPrompt }),
})
const stepUsages = buildStepUsages(steps)
const { cleanedText, attachments: textAttachments } =
extractAttachmentsFromText(text)
const { messages: strippedMessages, attachments: messageAttachments } =
stripAttachmentsFromMessages(response.messages)
const allAttachments = dedupeAttachments([
...textAttachments,
...messageAttachments,
])
return {
messages: [
userPrompt,
...strippedMessages,
],
usages: [null, ...stepUsages] as (LanguageModelUsage | null)[],
reasoning: reasoning.map((part) => part.text),
usage,
text: cleanedText,
attachments: allAttachments,
skills: getEnabledSkills(),
}
}
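// Runs a generation under a lightweight subagent system prompt built from
// the given name and description; no attachment extraction is applied.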
const askAsSubagent = async (params: {
input: string;
name: string;
description: string;
messages: ModelMessage[];
}) => {
const userPrompt: UserModelMessage = {
role: 'user',
content: [{ type: 'text', text: params.input }],
}
const generateSubagentSystemPrompt = () => {
return subagentSystem({
date: new Date(),
name: params.name,
description: params.description,
})
}
const systemPrompt = generateSubagentSystemPrompt()
const messages = [...params.messages, userPrompt]
const { response, reasoning, text, usage, steps } = await runTextGeneration({
messages,
systemPrompt,
basePrepareStep: () => ({ system: generateSubagentSystemPrompt() }),
})
const stepUsages = buildStepUsages(steps)
return {
messages: [userPrompt, ...response.messages],
usages: [null, ...stepUsages] as (LanguageModelUsage | null)[],
reasoning: reasoning.map((part) => part.text),
usage,
text,
skills: getEnabledSkills(),
}
}
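// Runs a generation triggered by a schedule: the rendered schedule prompt is
// injected as a user message ahead of the normal system prompt flow.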
const triggerSchedule = async (params: {
schedule: Schedule;
messages: ModelMessage[];
skills: string[];
}) => {
const scheduleMessage: UserModelMessage = {
role: 'user',
content: [
{
type: 'text',
text: schedule({ schedule: params.schedule, date: new Date() }),
},
],
}
const messages = [...params.messages, scheduleMessage]
params.skills.forEach((skill) => enableSkill(skill))
const { response, reasoning, text, usage, steps } = await runTextGeneration({
messages,
systemPrompt: await generateSystemPrompt(),
})
const stepUsages = buildStepUsages(steps)
return {
messages: [scheduleMessage, ...response.messages],
usages: [null, ...stepUsages] as (LanguageModelUsage | null)[],
reasoning: reasoning.map((part) => part.text),
usage,
text,
skills: getEnabledSkills(),
}
}
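// Runs a generation triggered by a heartbeat tick, with the heartbeat prompt
// rendered against the bot filesystem.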
const triggerHeartbeat = async (params: {
heartbeat: Heartbeat;
messages: ModelMessage[];
skills: string[];
}) => {
const heartbeatText = await heartbeat({ interval: params.heartbeat.interval, date: new Date(), fs })
const heartbeatMessage: UserModelMessage = {
role: 'user',
content: [
{
type: 'text',
text: heartbeatText,
},
],
}
const messages = [...params.messages, heartbeatMessage]
params.skills.forEach((skill) => enableSkill(skill))
const { response, reasoning, text, usage, steps } = await runTextGeneration({
messages,
systemPrompt: await generateSystemPrompt(),
})
const stepUsages = buildStepUsages(steps)
return {
messages: [heartbeatMessage, ...response.messages],
usages: [null, ...stepUsages] as (LanguageModelUsage | null)[],
reasoning: reasoning.map((part) => part.text),
usage,
text,
skills: getEnabledSkills(),
}
}
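// Normalizes an unknown stream error into a non-empty, human-readable
// message, falling back to a generic one.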
const resolveStreamErrorMessage = (raw: unknown): string => {
if (raw instanceof Error && raw.message.trim()) {
return raw.message
}
if (typeof raw === 'string' && raw.trim()) {
return raw
}
if (raw && typeof raw === 'object') {
const candidate = raw as { message?: unknown; error?: unknown }
if (typeof candidate.message === 'string' && candidate.message.trim()) {
return candidate.message
}
if (typeof candidate.error === 'string' && candidate.error.trim()) {
return candidate.error
}
if (candidate.error instanceof Error && candidate.error.message.trim()) {
return candidate.error.message
}
}
return 'Model stream failed'
}
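// Streaming entrypoint: yields AgentStreamAction events as the model runs.
// Attachments are carved out of text deltas on the fly, loop guards abort
// via thrown errors only after the matching terminal event has been emitted,
// and the MCP tools are closed exactly once.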
async function* stream(input: AgentInput): AsyncGenerator<AgentStreamAction> {
const userPrompt = generateUserPrompt(input)
const messages = [...input.messages, userPrompt]
input.skills.forEach((skill) => enableSkill(skill))
const systemPrompt = await generateSystemPrompt()
const attachmentsExtractor = new AttachmentsStreamExtractor()
const textLoopGuard = loopDetectionEnabled
? createTextLoopGuard({
consecutiveHitsToAbort: LOOP_DETECTED_STREAK_THRESHOLD,
minNewGramsPerChunk: LOOP_DETECTED_MIN_NEW_GRAMS_PER_CHUNK,
})
: null
const guardLoopOutput = (text: string) => {
if (!textLoopGuard) {
return
}
const result = textLoopGuard.inspect(text)
if (result.abort) {
throw new Error(LOOP_DETECTED_ABORT_MESSAGE)
}
}
const textLoopProbeBuffer = textLoopGuard
? createTextLoopProbeBuffer(
LOOP_DETECTED_PROBE_CHARS,
guardLoopOutput,
)
: null
const result: {
messages: ModelMessage[];
reasoning: string[];
usage: LanguageModelUsage | null;
usages: (LanguageModelUsage | null)[];
} = {
messages: [],
reasoning: [],
usage: null,
usages: [],
}
const toolLoopAbortCallIds = new Set<string>()
const { tools: baseTools, close } = await getAgentTools()
const { prepareStep, tools: readMediaTools } = createPrepareStepWithReadMedia({
modelConfig,
fs,
systemPrompt,
basePrepareStep: () => ({ system: systemPrompt }),
})
const tools = { ...baseTools, ...readMediaTools }
// Stream path needs deferred abort to keep tool_call_start/tool_call_end event pairing.
const guardedTools = buildGuardedTools(tools, (toolCallId) => {
toolLoopAbortCallIds.add(toolCallId)
})
let closePromise: Promise<void> | null = null
const closeTools = async () => {
if (!closePromise) {
closePromise = Promise.resolve().then(() => close())
}
await closePromise
}
let streamError: unknown = null
try {
const { fullStream } = streamText({
model,
messages,
system: systemPrompt,
...(providerOptions && { providerOptions }),
stopWhen: stepCountIs(Infinity),
prepareStep,
tools: guardedTools,
onFinish: async ({ usage, reasoning, response, steps }) => {
await closeTools()
result.usage = usage as never
result.reasoning = reasoning.map((part) => part.text)
result.messages = response.messages
result.usages = buildStepUsages(steps)
},
})
yield {
type: 'agent_start',
input,
}
for await (const chunk of fullStream) {
if (chunk.type === 'error') {
throw new Error(
resolveStreamErrorMessage((chunk as { error?: unknown }).error),
)
}
switch (chunk.type) {
case 'reasoning-start':
yield {
type: 'reasoning_start',
metadata: chunk,
}
break
case 'reasoning-delta':
yield {
type: 'reasoning_delta',
delta: chunk.text,
}
break
case 'reasoning-end':
yield {
type: 'reasoning_end',
metadata: chunk,
}
break
case 'text-start':
yield {
type: 'text_start',
}
break
case 'text-delta': {
const { visibleText, attachments } = attachmentsExtractor.push(
chunk.text,
)
if (visibleText) {
if (textLoopProbeBuffer) {
textLoopProbeBuffer.push(visibleText)
}
yield {
type: 'text_delta',
delta: visibleText,
}
}
if (attachments.length) {
yield {
type: 'attachment_delta',
attachments,
}
}
break
}
case 'text-end': {
// Flush any remaining buffered content before ending the text stream.
const remainder = attachmentsExtractor.flushRemainder()
if (remainder.visibleText) {
if (textLoopProbeBuffer) {
textLoopProbeBuffer.push(remainder.visibleText)
}
yield {
type: 'text_delta',
delta: remainder.visibleText,
}
}
if (textLoopProbeBuffer) {
textLoopProbeBuffer.flush()
}
if (remainder.attachments.length) {
yield {
type: 'attachment_delta',
attachments: remainder.attachments,
}
}
yield {
type: 'text_end',
metadata: chunk,
}
break
}
case 'tool-call': {
// Flush any buffered text and attachments before surfacing the tool call,
// so no text deltas land after tool_call_start.
const remainder = attachmentsExtractor.flushRemainder()
if (remainder.visibleText) {
if (textLoopProbeBuffer) {
textLoopProbeBuffer.push(remainder.visibleText)
}
yield {
type: 'text_delta',
delta: remainder.visibleText,
}
}
if (textLoopProbeBuffer) {
textLoopProbeBuffer.flush()
}
if (remainder.attachments.length) {
yield {
type: 'attachment_delta',
attachments: remainder.attachments,
}
}
yield {
type: 'tool_call_start',
toolName: chunk.toolName,
toolCallId: chunk.toolCallId,
input: chunk.input,
metadata: chunk,
}
break
}
case 'tool-result': {
// Always emit the terminal tool event first so downstream reducers
// can close the in-flight tool block before the stream aborts.
const shouldAbortForToolLoop = toolLoopAbortCallIds.delete(chunk.toolCallId)
yield {
type: 'tool_call_end',
toolName: chunk.toolName,
toolCallId: chunk.toolCallId,
input: chunk.input,
result: chunk.output,
metadata: chunk,
}
if (shouldAbortForToolLoop) {
throw new Error(TOOL_LOOP_DETECTED_ABORT_MESSAGE)
}
break
}
case 'file':
yield {
type: 'attachment_delta',
attachments: [
{
type: 'image',
url: `data:${chunk.file.mediaType ?? 'image/png'};base64,${chunk.file.base64}`,
mime: chunk.file.mediaType ?? 'image/png',
},
],
}
}
}
if (textLoopProbeBuffer) {
textLoopProbeBuffer.flush()
}
const { messages: strippedMessages } = stripAttachmentsFromMessages(
result.messages,
)
yield {
type: 'agent_end',
messages: [
userPrompt,
...strippedMessages,
],
usages: [null, ...result.usages],
reasoning: result.reasoning,
usage: result.usage!,
skills: getEnabledSkills(),
}
} catch (error) {
streamError = error
console.error(error)
throw error
} finally {
try {
await closeTools()
} catch (closeError) {
if (streamError == null) {
throw closeError
}
console.error(closeError)
}
}
}
return {
stream,
ask,
askAsSubagent,
triggerSchedule,
triggerHeartbeat,
}
}