feat(bots): add MCP connection runtime health checks

- Add RuntimeChecker interface for extensible bot health checks - Implement MCP ConnectionChecker: probes active connections via tools/list - Show MCP server status (healthy/unreachable/no tools) in bot checks - Register checker in main.go alongside container lifecycle
2026-04-27 07:16:19 +09:00 · 2026-02-13 02:13:52 +08:00
parent f9be6baa4e
commit 1d1586186c
4 changed files with 146 additions and 1 deletions
@@ -460,7 +460,7 @@ func startContainerReconciliation(lc fx.Lifecycle, containerdHandler *handlers.C
 	})
 }

-func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, containerdHandler *handlers.ContainerdHandler) {
+func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService) {
 	fmt.Printf("Starting Memoh Agent %s\n", version.GetInfo())

 	lc.Append(fx.Hook{
@@ -469,6 +469,7 @@ func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutd
 				return err
 			}
 			botService.SetContainerLifecycle(containerdHandler)
+			botService.AddRuntimeChecker(mcp.NewConnectionChecker(logger, mcpConnService, toolGateway))

 			go func() {
 				if err := srv.Start(); err != nil && !errors.Is(err, http.ErrServerClosed) {
@@ -23,6 +23,7 @@ type Service struct {
 	queries            *sqlc.Queries
 	logger             *slog.Logger
 	containerLifecycle ContainerLifecycle
+	checkers           []RuntimeChecker
 }

 const (
@@ -56,6 +57,13 @@ func (s *Service) SetContainerLifecycle(lc ContainerLifecycle) {
 	s.containerLifecycle = lc
 }

+// AddRuntimeChecker registers an additional runtime checker.
+func (s *Service) AddRuntimeChecker(c RuntimeChecker) {
+	if c != nil {
+		s.checkers = append(s.checkers, c)
+	}
+}
+
 // AuthorizeAccess checks whether userID may access the given bot.
 func (s *Service) AuthorizeAccess(ctx context.Context, userID, botID string, isAdmin bool, policy AccessPolicy) (Bot, error) {
 	if s.queries == nil {
@@ -836,6 +844,11 @@ func (s *Service) buildRuntimeChecks(ctx context.Context, row sqlc.Bot) ([]BotCh
 	}
 	checks = append(checks, dataCheck)

+	botID := uuid.UUID(row.ID.Bytes).String()
+	for _, checker := range s.checkers {
+		checks = append(checks, checker.CheckBot(ctx, botID)...)
+	}
+
 	return checks, nil
 }

@@ -87,6 +87,11 @@ type ContainerLifecycle interface {
 	CleanupBotContainer(ctx context.Context, botID string) error
 }

+// RuntimeChecker produces runtime check items for a bot.
+type RuntimeChecker interface {
+	CheckBot(ctx context.Context, botID string) []BotCheck
+}
+
 const (
 	BotTypePersonal = "personal"
 	BotTypePublic   = "public"
@@ -0,0 +1,126 @@
+package mcp
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"strings"
+	"time"
+
+	"github.com/memohai/memoh/internal/bots"
+)
+
+const (
+	mcpCheckTimeout = 8 * time.Second
+)
+
+// ConnectionChecker implements bots.RuntimeChecker for MCP connections.
+type ConnectionChecker struct {
+	logger      *slog.Logger
+	connections *ConnectionService
+	gateway     *ToolGatewayService
+}
+
+// NewConnectionChecker creates an MCP runtime checker.
+func NewConnectionChecker(log *slog.Logger, connections *ConnectionService, gateway *ToolGatewayService) *ConnectionChecker {
+	if log == nil {
+		log = slog.Default()
+	}
+	return &ConnectionChecker{
+		logger:      log.With(slog.String("checker", "mcp")),
+		connections: connections,
+		gateway:     gateway,
+	}
+}
+
+// CheckBot probes each active MCP connection for a bot and returns check results.
+func (c *ConnectionChecker) CheckBot(ctx context.Context, botID string) []bots.BotCheck {
+	if c.connections == nil {
+		return nil
+	}
+	items, err := c.connections.ListActiveByBot(ctx, botID)
+	if err != nil {
+		c.logger.Warn("mcp checker: list connections failed",
+			slog.String("bot_id", botID), slog.Any("error", err))
+		return nil
+	}
+	if len(items) == 0 {
+		return nil
+	}
+
+	checks := make([]bots.BotCheck, 0, len(items))
+	for _, conn := range items {
+		check := c.probeConnection(ctx, botID, conn)
+		checks = append(checks, check)
+	}
+	return checks
+}
+
+func (c *ConnectionChecker) probeConnection(ctx context.Context, botID string, conn Connection) bots.BotCheck {
+	checkKey := "mcp." + sanitizeCheckKey(conn.Name)
+	check := bots.BotCheck{
+		CheckKey: checkKey,
+		Status:   bots.BotCheckStatusUnknown,
+		Summary:  fmt.Sprintf("MCP server %q is being checked.", conn.Name),
+		Metadata: map[string]any{
+			"connection_id": conn.ID,
+			"name":          conn.Name,
+			"type":          conn.Type,
+		},
+	}
+
+	if c.gateway == nil {
+		check.Status = bots.BotCheckStatusWarn
+		check.Summary = fmt.Sprintf("MCP server %q cannot be checked.", conn.Name)
+		check.Detail = "tool gateway not available"
+		return check
+	}
+
+	probeCtx, cancel := context.WithTimeout(ctx, mcpCheckTimeout)
+	defer cancel()
+
+	session := ToolSessionContext{BotID: botID}
+	tools, err := c.gateway.ListTools(probeCtx, session)
+	if err != nil {
+		check.Status = bots.BotCheckStatusError
+		check.Summary = fmt.Sprintf("MCP server %q is not reachable.", conn.Name)
+		check.Detail = err.Error()
+		return check
+	}
+
+	// Count tools belonging to this connection (prefixed with connection name).
+	prefix := sanitizeCheckKey(conn.Name) + "."
+	toolCount := 0
+	for _, t := range tools {
+		if strings.HasPrefix(t.Name, prefix) || t.Name == conn.Name {
+			toolCount++
+		}
+	}
+
+	if toolCount > 0 {
+		check.Status = bots.BotCheckStatusOK
+		check.Summary = fmt.Sprintf("MCP server %q is healthy (%d tools).", conn.Name, toolCount)
+		check.Metadata["tool_count"] = toolCount
+	} else {
+		check.Status = bots.BotCheckStatusWarn
+		check.Summary = fmt.Sprintf("MCP server %q is reachable but no tools found.", conn.Name)
+		check.Detail = "The server responded but exposed no tools."
+	}
+	return check
+}
+
+func sanitizeCheckKey(raw string) string {
+	raw = strings.TrimSpace(strings.ToLower(raw))
+	if raw == "" {
+		return "unknown"
+	}
+	b := strings.Builder{}
+	for _, ch := range raw {
+		if (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9') || ch == '_' || ch == '-' {
+			b.WriteRune(ch)
+		} else {
+			b.WriteRune('_')
+		}
+	}
+	return strings.Trim(b.String(), "_-")
+}