feat(bots): add MCP connection runtime health checks

- Add RuntimeChecker interface for extensible bot health checks
- Implement MCP ConnectionChecker: probes active connections via tools/list
- Show MCP server status (healthy/unreachable/no tools) in bot checks
- Register checker in main.go alongside container lifecycle
This commit is contained in:
BBQ
2026-02-13 02:13:52 +08:00
parent f9be6baa4e
commit 1d1586186c
4 changed files with 146 additions and 1 deletions
+2 -1
View File
@@ -460,7 +460,7 @@ func startContainerReconciliation(lc fx.Lifecycle, containerdHandler *handlers.C
})
}
func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, containerdHandler *handlers.ContainerdHandler) {
func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService) {
fmt.Printf("Starting Memoh Agent %s\n", version.GetInfo())
lc.Append(fx.Hook{
@@ -469,6 +469,7 @@ func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutd
return err
}
botService.SetContainerLifecycle(containerdHandler)
botService.AddRuntimeChecker(mcp.NewConnectionChecker(logger, mcpConnService, toolGateway))
go func() {
if err := srv.Start(); err != nil && !errors.Is(err, http.ErrServerClosed) {
+13
View File
@@ -23,6 +23,7 @@ type Service struct {
queries *sqlc.Queries
logger *slog.Logger
containerLifecycle ContainerLifecycle
checkers []RuntimeChecker
}
const (
@@ -56,6 +57,13 @@ func (s *Service) SetContainerLifecycle(lc ContainerLifecycle) {
s.containerLifecycle = lc
}
// AddRuntimeChecker appends c to the list of extra runtime checkers held by
// the service. A nil c is ignored.
//
// Only a nil interface value is filtered out; an interface wrapping a nil
// pointer compares non-nil in Go and would still be registered.
// NOTE(review): the slice is appended to without any locking here —
// presumably registration happens during startup only; confirm.
func (s *Service) AddRuntimeChecker(c RuntimeChecker) {
	if c == nil {
		return
	}
	s.checkers = append(s.checkers, c)
}
// AuthorizeAccess checks whether userID may access the given bot.
func (s *Service) AuthorizeAccess(ctx context.Context, userID, botID string, isAdmin bool, policy AccessPolicy) (Bot, error) {
if s.queries == nil {
@@ -836,6 +844,11 @@ func (s *Service) buildRuntimeChecks(ctx context.Context, row sqlc.Bot) ([]BotCh
}
checks = append(checks, dataCheck)
botID := uuid.UUID(row.ID.Bytes).String()
for _, checker := range s.checkers {
checks = append(checks, checker.CheckBot(ctx, botID)...)
}
return checks, nil
}
+5
View File
@@ -87,6 +87,11 @@ type ContainerLifecycle interface {
CleanupBotContainer(ctx context.Context, botID string) error
}
// RuntimeChecker produces runtime check items for a bot.
//
// Implementations are registered on the bot service and their results are
// appended to the checks built for a bot.
type RuntimeChecker interface {
// CheckBot returns the check items for the bot identified by botID.
// It has no error return, so an implementation has to express failures
// through the returned items (or return none).
CheckBot(ctx context.Context, botID string) []BotCheck
}
const (
BotTypePersonal = "personal"
BotTypePublic = "public"
+126
View File
@@ -0,0 +1,126 @@
package mcp
import (
"context"
"fmt"
"log/slog"
"strings"
"time"
"github.com/memohai/memoh/internal/bots"
)
// mcpCheckTimeout is the upper bound applied to a single tool-listing probe
// issued by the MCP runtime checker.
const mcpCheckTimeout = 8 * time.Second
// ConnectionChecker implements bots.RuntimeChecker for MCP connections.
//
// For a bot it lists the active MCP connections and reports one
// bots.BotCheck per connection.
type ConnectionChecker struct {
// logger is the checker's logger; NewConnectionChecker tags it with
// checker=mcp.
logger *slog.Logger
// connections lists the active MCP connections of a bot. When nil,
// CheckBot returns no checks.
connections *ConnectionService
// gateway is used to list the tools available to a bot. When nil, each
// connection is reported with a warn status instead of being probed.
gateway *ToolGatewayService
}
// NewConnectionChecker builds a ConnectionChecker from its collaborators.
//
// A nil log falls back to slog.Default(). The stored logger always carries
// the attribute checker=mcp. connections and gateway are stored as given,
// including nil.
func NewConnectionChecker(log *slog.Logger, connections *ConnectionService, gateway *ToolGatewayService) *ConnectionChecker {
	base := log
	if base == nil {
		base = slog.Default()
	}

	checker := new(ConnectionChecker)
	checker.logger = base.With(slog.String("checker", "mcp"))
	checker.connections = connections
	checker.gateway = gateway
	return checker
}
// CheckBot returns one check per active MCP connection of the bot botID.
//
// It returns nil when no connection service is configured, when listing the
// active connections fails (the failure is logged at warn level and not
// surfaced as a check), or when the bot has no active connections.
// Connections are probed one after another, in the order they were listed.
func (c *ConnectionChecker) CheckBot(ctx context.Context, botID string) []bots.BotCheck {
	if c.connections == nil {
		return nil
	}

	active, err := c.connections.ListActiveByBot(ctx, botID)
	switch {
	case err != nil:
		c.logger.Warn("mcp checker: list connections failed",
			slog.String("bot_id", botID), slog.Any("error", err))
		return nil
	case len(active) == 0:
		return nil
	}

	results := make([]bots.BotCheck, len(active))
	for i, conn := range active {
		results[i] = c.probeConnection(ctx, botID, conn)
	}
	return results
}
// probeConnection builds the runtime check for a single MCP connection.
//
// The check key is "mcp." followed by the sanitized connection name, and the
// metadata always carries connection_id, name and type. The resulting status is:
//   - warn  when no tool gateway is configured (nothing is probed);
//   - error when listing tools fails or exceeds mcpCheckTimeout; the error
//     text becomes the check detail;
//   - ok    when at least one listed tool is named exactly like the connection
//     or starts with "<sanitized name>."; tool_count is added to the metadata;
//   - warn  when the listing succeeds but no tool matches.
//
// NOTE(review): the listing request carries only the bot ID, so every
// connection of the same bot issues an equivalent request, and a listing
// failure marks each probed connection as not reachable — confirm intended.
func (c *ConnectionChecker) probeConnection(ctx context.Context, botID string, conn Connection) bots.BotCheck {
	safeName := sanitizeCheckKey(conn.Name)

	result := bots.BotCheck{
		CheckKey: "mcp." + safeName,
		Status:   bots.BotCheckStatusUnknown,
		Summary:  fmt.Sprintf("MCP server %q is being checked.", conn.Name),
		Metadata: map[string]any{
			"connection_id": conn.ID,
			"name":          conn.Name,
			"type":          conn.Type,
		},
	}

	// Without a gateway there is nothing to probe with.
	if c.gateway == nil {
		result.Status = bots.BotCheckStatusWarn
		result.Summary = fmt.Sprintf("MCP server %q cannot be checked.", conn.Name)
		result.Detail = "tool gateway not available"
		return result
	}

	probeCtx, cancel := context.WithTimeout(ctx, mcpCheckTimeout)
	defer cancel()

	tools, err := c.gateway.ListTools(probeCtx, ToolSessionContext{BotID: botID})
	if err != nil {
		result.Status = bots.BotCheckStatusError
		result.Summary = fmt.Sprintf("MCP server %q is not reachable.", conn.Name)
		result.Detail = err.Error()
		return result
	}

	// A tool is attributed to this connection by its name: either the bare
	// connection name or the sanitized name followed by a dot.
	ownPrefix := safeName + "."
	matched := 0
	for _, tool := range tools {
		if tool.Name == conn.Name || strings.HasPrefix(tool.Name, ownPrefix) {
			matched++
		}
	}

	if matched == 0 {
		result.Status = bots.BotCheckStatusWarn
		result.Summary = fmt.Sprintf("MCP server %q is reachable but no tools found.", conn.Name)
		result.Detail = "The server responded but exposed no tools."
		return result
	}

	result.Status = bots.BotCheckStatusOK
	result.Summary = fmt.Sprintf("MCP server %q is healthy (%d tools).", conn.Name, matched)
	result.Metadata["tool_count"] = matched
	return result
}
// sanitizeCheckKey normalizes raw into a token that can be embedded in a
// check key.
//
// The input is lower-cased and trimmed of surrounding whitespace. Every rune
// other than a-z, 0-9, '_' or '-' is replaced by '_', and leading/trailing
// '_' and '-' are then stripped.
//
// It returns "unknown" when raw is empty or whitespace-only, and also when
// nothing survives sanitizing (for example "!!!" or "---"), so the result is
// never an empty string.
func sanitizeCheckKey(raw string) string {
	const fallback = "unknown"

	raw = strings.TrimSpace(strings.ToLower(raw))
	if raw == "" {
		return fallback
	}

	var b strings.Builder
	b.Grow(len(raw))
	for _, ch := range raw {
		switch {
		case ch >= 'a' && ch <= 'z', ch >= '0' && ch <= '9', ch == '_', ch == '-':
			b.WriteRune(ch)
		default:
			b.WriteRune('_')
		}
	}

	// Trimming can consume the whole string (e.g. "!!!" became "___"), so the
	// fallback has to be applied again after it.
	key := strings.Trim(b.String(), "_-")
	if key == "" {
		return fallback
	}
	return key
}