mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-25 07:00:48 +09:00
e4aca0db13
Expose a dedicated container metrics endpoint and surface current CPU, memory, and root filesystem usage in the bot container view. This gives operators a quick health snapshot while degrading cleanly on unsupported backends.
519 lines
16 KiB
Go
519 lines
16 KiB
Go
package workspace
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/containerd/errdefs"
|
|
"github.com/jackc/pgx/v5/pgxpool"
|
|
|
|
"github.com/memohai/memoh/internal/config"
|
|
ctr "github.com/memohai/memoh/internal/containerd"
|
|
dbsqlc "github.com/memohai/memoh/internal/db/sqlc"
|
|
"github.com/memohai/memoh/internal/identity"
|
|
skillset "github.com/memohai/memoh/internal/skills"
|
|
"github.com/memohai/memoh/internal/workspace/bridge"
|
|
)
|
|
|
|
// Labels and naming conventions applied to workspace containers in containerd.
const (
	// BotLabelKey stores the owning bot's ID on a workspace container.
	BotLabelKey = "memoh.bot_id"
	// WorkspaceLabelKey marks a container as managed by this package; its
	// value is WorkspaceLabelValue (the workspace architecture generation).
	WorkspaceLabelKey   = "memoh.workspace"
	WorkspaceLabelValue = "v3"
	// WorkspaceCDIDevicesLabelKey stores the comma-joined CDI GPU device
	// names requested for the container (see workspaceCDIDevicesLabelValue).
	WorkspaceCDIDevicesLabelKey = "memoh.workspace.cdi_devices"
	// ContainerPrefix is the container-ID prefix for current (bridge-era)
	// containers; LegacyContainerPrefix identifies pre-bridge containers.
	ContainerPrefix       = "workspace-"
	LegacyContainerPrefix = "mcp-"

	// legacyGRPCPort is the TCP port used to reach gRPC in legacy
	// (pre-bridge) containers; bridge containers use a UDS instead.
	legacyGRPCPort = 9090
)
|
|
|
|
// ErrContainerNotFound is returned when no container exists for a bot.
// Callers should compare with errors.Is.
var ErrContainerNotFound = errors.New("container not found for bot")
|
|
|
|
// ContainerStatus combines DB records with live containerd state.
type ContainerStatus struct {
	ContainerID   string `json:"container_id"`
	Image         string `json:"image"`
	Status        string `json:"status"`
	Namespace     string `json:"namespace"`
	ContainerPath string `json:"container_path"`
	// CDIDevices lists the CDI GPU devices attached to the container;
	// omitted from JSON when none are configured.
	CDIDevices []string `json:"cdi_devices,omitempty"`
	// TaskRunning reports whether the containerd task is currently running.
	TaskRunning bool `json:"task_running"`
	// HasPreservedData reports whether a /data backup exists for the bot
	// (from a previous delete-with-preserve or snapshot recovery).
	HasPreservedData bool `json:"has_preserved_data"`
	// Legacy marks pre-bridge ("mcp-" prefixed) containers.
	Legacy    bool      `json:"legacy"`
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
|
|
|
// ContainerMetricsStatus describes the container's existence and task state
// at the moment metrics were sampled.
type ContainerMetricsStatus struct {
	// Exists reports whether a container is present for the bot.
	Exists bool `json:"exists"`
	// TaskRunning reports whether the containerd task is currently running.
	TaskRunning bool `json:"task_running"`
}
|
|
|
|
// ContainerStorageMetrics reports filesystem usage for a container path.
type ContainerStorageMetrics struct {
	// Path is the filesystem path the usage was measured for.
	Path string `json:"path"`
	// UsedBytes is the number of bytes in use under Path.
	UsedBytes uint64 `json:"used_bytes"`
}
|
|
|
|
// ContainerMetricsResult is a point-in-time health snapshot of a bot's
// container. On backends that cannot provide metrics, Supported is false and
// UnsupportedReason explains why; the pointer fields are nil when the
// corresponding metric is unavailable.
type ContainerMetricsResult struct {
	// Supported indicates whether the backend can provide metrics at all.
	Supported bool
	// UnsupportedReason is a human-readable explanation when Supported is false.
	UnsupportedReason string
	// Status reflects container existence and task state at sample time.
	Status ContainerMetricsStatus
	// SampledAt is when the snapshot was taken.
	SampledAt time.Time
	CPU       *ctr.CPUMetrics
	Memory    *ctr.MemoryMetrics
	Storage   *ContainerStorageMetrics
}
|
|
|
|
// Manager owns the lifecycle of per-bot workspace containers: creation,
// start/stop, deletion, CNI networking, and gRPC connectivity to the
// in-container bridge. Methods use pointer receivers; Manager must not be
// copied (it embeds mutexes).
type Manager struct {
	service   ctr.Service            // containerd operations backend
	cfg       config.WorkspaceConfig // image, snapshotter, CNI dirs, data root
	namespace string                 // containerd namespace
	db        *pgxpool.Pool
	queries   *dbsqlc.Queries
	logger    *slog.Logger
	// containerLockMu guards containerLocks; each map entry serializes
	// operations on a single container ID.
	containerLockMu sync.Mutex
	containerLocks  map[string]*sync.Mutex
	// grpcPool caches bridge gRPC clients keyed by bot ID, dialing via
	// dialTarget.
	grpcPool *bridge.Pool
	// legacyMu guards legacyIPs.
	legacyMu  sync.RWMutex
	legacyIPs map[string]string // botID → IP for pre-bridge containers
}
|
|
|
|
func NewManager(log *slog.Logger, service ctr.Service, cfg config.WorkspaceConfig, namespace string, conn *pgxpool.Pool) *Manager {
|
|
if namespace == "" {
|
|
namespace = config.DefaultNamespace
|
|
}
|
|
m := &Manager{
|
|
service: service,
|
|
cfg: cfg,
|
|
namespace: namespace,
|
|
db: conn,
|
|
queries: dbsqlc.New(conn),
|
|
logger: log.With(slog.String("component", "workspace")),
|
|
containerLocks: make(map[string]*sync.Mutex),
|
|
legacyIPs: make(map[string]string),
|
|
}
|
|
m.grpcPool = bridge.NewPool(m.dialTarget)
|
|
return m
|
|
}
|
|
|
|
// resolveContainerID resolves the actual containerd container ID for a bot.
|
|
// This is the SINGLE point of container ID resolution for all lookup operations.
|
|
// It delegates to ContainerID (DB → label → scan) and falls back to the
|
|
// new-style prefix if no container exists yet.
|
|
func (m *Manager) resolveContainerID(ctx context.Context, botID string) string {
|
|
id, err := m.ContainerID(ctx, botID)
|
|
if err != nil {
|
|
return ContainerPrefix + botID
|
|
}
|
|
return id
|
|
}
|
|
|
|
func (m *Manager) lockContainer(containerID string) func() {
|
|
m.containerLockMu.Lock()
|
|
lock, ok := m.containerLocks[containerID]
|
|
if !ok {
|
|
lock = &sync.Mutex{}
|
|
m.containerLocks[containerID] = lock
|
|
}
|
|
m.containerLockMu.Unlock()
|
|
|
|
lock.Lock()
|
|
return lock.Unlock
|
|
}
|
|
|
|
// socketDir returns the host-side directory that is bind-mounted into the
|
|
// container at /run/memoh, holding the UDS socket file.
|
|
func (m *Manager) socketDir(botID string) string {
|
|
return filepath.Join(m.dataRoot(), "run", botID)
|
|
}
|
|
|
|
// socketPath returns the path to the UDS socket file for a bot's container.
|
|
func (m *Manager) socketPath(botID string) string {
|
|
return filepath.Join(m.socketDir(botID), "bridge.sock")
|
|
}
|
|
|
|
// dialTarget returns the gRPC dial target for a bot. Legacy containers
|
|
// (pre-bridge) are reached via TCP; bridge containers use UDS.
|
|
func (m *Manager) dialTarget(botID string) string {
|
|
m.legacyMu.RLock()
|
|
ip, legacy := m.legacyIPs[botID]
|
|
m.legacyMu.RUnlock()
|
|
if legacy {
|
|
return fmt.Sprintf("%s:%d", ip, legacyGRPCPort)
|
|
}
|
|
return "unix://" + m.socketPath(botID)
|
|
}
|
|
|
|
// SetLegacyIP records the IP address of a legacy (pre-bridge) container
|
|
// so the gRPC pool can reach it via TCP.
|
|
func (m *Manager) SetLegacyIP(botID, ip string) {
|
|
m.legacyMu.Lock()
|
|
m.legacyIPs[botID] = ip
|
|
m.legacyMu.Unlock()
|
|
}
|
|
|
|
// ClearLegacyIP removes a cached legacy IP (e.g. when the container is deleted).
|
|
func (m *Manager) ClearLegacyIP(botID string) {
|
|
m.legacyMu.Lock()
|
|
delete(m.legacyIPs, botID)
|
|
m.legacyMu.Unlock()
|
|
}
|
|
|
|
// clearLegacyRoute evicts any stale TCP fallback state for a bot so future
// gRPC dials use the bridge container's Unix socket. Removing the pooled
// client forces the next MCPClient call to re-dial with the fresh target.
func (m *Manager) clearLegacyRoute(botID string) {
	m.ClearLegacyIP(botID)
	m.grpcPool.Remove(botID)
}
|
|
|
|
// MCPClient returns a gRPC client for the given bot's container.
// Clients are pooled; the pool dials lazily via dialTarget.
// Implements bridge.Provider.
func (m *Manager) MCPClient(ctx context.Context, botID string) (*bridge.Client, error) {
	return m.grpcPool.Get(ctx, botID)
}
|
|
|
|
func (m *Manager) Init(ctx context.Context) error {
|
|
image := m.imageRef()
|
|
|
|
// Pre-pull the default base image so container creation doesn't block
|
|
// on a network download. If the image is already present, this is a no-op.
|
|
if _, err := m.service.GetImage(ctx, image); err != nil {
|
|
m.logger.Info("pulling base image for workspace containers", slog.String("image", image))
|
|
if _, pullErr := m.service.PullImage(ctx, image, &ctr.PullImageOptions{
|
|
Unpack: true,
|
|
Snapshotter: m.cfg.Snapshotter,
|
|
}); pullErr != nil {
|
|
m.logger.Warn("base image pull failed", slog.String("image", image), slog.Any("error", pullErr))
|
|
return pullErr
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// EnsureBot creates the workspace container for a bot if it does not exist.
|
|
// Bot data lives in the container's writable layer (snapshot), not bind mounts.
|
|
// The Memoh runtime (bridge binary + toolkit) is injected via read-only bind mount.
|
|
// If imageOverride is non-empty, it is used instead of the configured default.
|
|
func (m *Manager) EnsureBot(ctx context.Context, botID, imageOverride string) error {
|
|
image := m.imageRef()
|
|
if imageOverride != "" {
|
|
image = config.NormalizeImageRef(imageOverride)
|
|
}
|
|
gpu, err := m.resolveWorkspaceGPU(ctx, botID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return m.ensureBotWithImage(ctx, botID, image, gpu)
|
|
}
|
|
|
|
func workspaceCDIDevicesLabelValue(devices []string) string {
|
|
devices = normalizeWorkspaceGPUDevices(devices)
|
|
return strings.Join(devices, ",")
|
|
}
|
|
|
|
func workspaceCDIDevicesFromLabels(labels map[string]string) []string {
|
|
if len(labels) == 0 {
|
|
return nil
|
|
}
|
|
value := strings.TrimSpace(labels[WorkspaceCDIDevicesLabelKey])
|
|
if value == "" {
|
|
return nil
|
|
}
|
|
return normalizeWorkspaceGPUDevices(strings.Split(value, ","))
|
|
}
|
|
|
|
// buildWorkspaceContainerSpec assembles the OCI container spec for a bot's
// workspace container: DNS config, the read-only Memoh runtime, the writable
// bridge-socket directory, timezone data, skill-discovery env vars, and any
// CDI GPU devices. It also creates the host-side socket directory.
func (m *Manager) buildWorkspaceContainerSpec(ctx context.Context, botID string, gpu WorkspaceGPUConfig) (ctr.ContainerSpec, error) {
	// Host resolv.conf source so the container gets working DNS.
	resolvPath, err := ctr.ResolveConfSource(m.dataRoot())
	if err != nil {
		return ctr.ContainerSpec{}, err
	}

	runtimeDir := m.cfg.RuntimePath()
	sockDir := m.socketDir(botID)
	// 0o750: owner rwx, group rx — the socket dir must not be world-readable.
	if err := os.MkdirAll(sockDir, 0o750); err != nil {
		return ctr.ContainerSpec{}, fmt.Errorf("create socket dir: %w", err)
	}

	mounts := []ctr.MountSpec{
		{
			// Read-only DNS configuration from the host.
			Destination: "/etc/resolv.conf",
			Type:        "bind",
			Source:      resolvPath,
			Options:     []string{"rbind", "ro"},
		},
		{
			// Memoh runtime (bridge binary + toolkit), injected read-only.
			Destination: "/opt/memoh",
			Type:        "bind",
			Source:      runtimeDir,
			Options:     []string{"rbind", "ro"},
		},
		{
			// Writable: the bridge creates its UDS under /run/memoh.
			Destination: "/run/memoh",
			Type:        "bind",
			Source:      sockDir,
			Options:     []string{"rbind", "rw"},
		},
	}
	tzMounts, tzEnv := ctr.TimezoneSpec()
	mounts = append(mounts, tzMounts...)

	skillRoots, err := m.ResolveWorkspaceSkillDiscoveryRoots(ctx, botID)
	if err != nil {
		return ctr.ContainerSpec{}, err
	}
	skillEnv := skillset.ContainerEnv(skillRoots)
	// Capacity: timezone vars + BRIDGE_SOCKET_PATH + skill vars.
	env := make([]string, 0, len(tzEnv)+1+len(skillEnv))
	env = append(env, tzEnv...)
	env = append(env, "BRIDGE_SOCKET_PATH=/run/memoh/bridge.sock")
	env = append(env, skillEnv...)

	return ctr.ContainerSpec{
		// The bridge binary is the container's entry point.
		Cmd:        []string{"/opt/memoh/bridge"},
		Mounts:     mounts,
		Env:        env,
		CDIDevices: normalizeWorkspaceGPUDevices(gpu.Devices),
	}, nil
}
|
|
|
|
func (m *Manager) ensureBotWithImage(ctx context.Context, botID, image string, gpu WorkspaceGPUConfig) error {
|
|
if err := validateBotID(botID); err != nil {
|
|
return err
|
|
}
|
|
spec, err := m.buildWorkspaceContainerSpec(ctx, botID, gpu)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
labels := map[string]string{
|
|
BotLabelKey: botID,
|
|
WorkspaceLabelKey: WorkspaceLabelValue,
|
|
}
|
|
if value := workspaceCDIDevicesLabelValue(gpu.Devices); value != "" {
|
|
labels[WorkspaceCDIDevicesLabelKey] = value
|
|
}
|
|
|
|
_, err = m.service.CreateContainer(ctx, ctr.CreateContainerRequest{
|
|
ID: ContainerPrefix + botID,
|
|
ImageRef: image,
|
|
Snapshotter: m.cfg.Snapshotter,
|
|
Labels: labels,
|
|
Spec: spec,
|
|
})
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
|
|
if !errdefs.IsAlreadyExists(err) {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// ListBots returns the bot IDs that have workspace containers.
|
|
func (m *Manager) ListBots(ctx context.Context) ([]string, error) {
|
|
containers, err := m.service.ListContainers(ctx)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
botIDs := make([]string, 0, len(containers))
|
|
for _, info := range containers {
|
|
if botID, ok := BotIDFromContainerInfo(info); ok {
|
|
botIDs = append(botIDs, botID)
|
|
}
|
|
}
|
|
return botIDs, nil
|
|
}
|
|
|
|
func (m *Manager) Start(ctx context.Context, botID string) error {
|
|
image, err := m.resolveWorkspaceImage(ctx, botID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
gpu, err := m.resolveWorkspaceGPU(ctx, botID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return m.startWithResolvedConfig(ctx, botID, image, gpu)
|
|
}
|
|
|
|
// StartWithImage creates and starts the MCP container for a bot.
|
|
// If imageOverride is non-empty, it is used as the base image instead of the
|
|
// configured default. The override only applies when creating a new container.
|
|
func (m *Manager) StartWithImage(ctx context.Context, botID, imageOverride string) error {
|
|
image := strings.TrimSpace(imageOverride)
|
|
if image == "" {
|
|
return m.Start(ctx, botID)
|
|
}
|
|
gpu, err := m.resolveWorkspaceGPU(ctx, botID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return m.startWithResolvedConfig(ctx, botID, config.NormalizeImageRef(image), gpu)
|
|
}
|
|
|
|
// StartWithResolvedImage creates and starts the workspace container for a bot
|
|
// using an explicit image reference.
|
|
func (m *Manager) StartWithResolvedImage(ctx context.Context, botID, image string) error {
|
|
image = strings.TrimSpace(image)
|
|
if image == "" {
|
|
return errors.New("image is required")
|
|
}
|
|
gpu, err := m.resolveWorkspaceGPU(ctx, botID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return m.startWithResolvedConfig(ctx, botID, image, gpu)
|
|
}
|
|
|
|
func (m *Manager) StartWithResolvedConfig(ctx context.Context, botID, image string, gpu WorkspaceGPUConfig) error {
|
|
image = strings.TrimSpace(image)
|
|
if image == "" {
|
|
return errors.New("image is required")
|
|
}
|
|
return m.startWithResolvedConfig(ctx, botID, image, gpu)
|
|
}
|
|
|
|
// startWithResolvedConfig creates (if needed) and starts the workspace
// container using an already-resolved image and GPU config, recovering
// orphaned snapshot data and restoring preserved /data before the task
// starts. On network-setup failure the freshly started task is stopped
// again so no half-configured container is left running.
func (m *Manager) startWithResolvedConfig(ctx context.Context, botID, image string, gpu WorkspaceGPUConfig) error {
	containerID := m.resolveContainerID(ctx, botID)

	// Before creating a new container, check for an orphaned snapshot
	// (container deleted but snapshot with /data survived). Export /data
	// to a backup so it can be restored after EnsureBot creates a fresh
	// container. This covers dev image rebuilds, containerd metadata loss,
	// and manual container deletion.
	if _, err := m.service.GetContainer(ctx, containerID); errdefs.IsNotFound(err) {
		m.recoverOrphanedSnapshot(ctx, botID)
	}

	if err := m.ensureBotWithImage(ctx, botID, image, gpu); err != nil {
		return err
	}

	// Restore preserved data (from orphaned snapshot recovery or a previous
	// CleanupBotContainer with preserveData) into the fresh snapshot before
	// starting the task, avoiding a redundant stop/start cycle.
	if m.HasPreservedData(botID) {
		if err := m.restorePreservedIntoSnapshot(ctx, botID); err != nil {
			return fmt.Errorf("restore preserved data: %w", err)
		}
	}

	if err := m.service.StartContainer(ctx, containerID, nil); err != nil {
		return err
	}

	// CNI network setup (for outbound connectivity — container processes
	// may need to download packages). Server communicates via UDS, not IP.
	if _, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
		ContainerID: containerID,
		CNIBinDir:   m.cfg.CNIBinaryDir,
		CNIConfDir:  m.cfg.CNIConfigDir,
	}); err != nil {
		// Roll back the just-started task so we don't leave a running
		// container without connectivity; surface the original error.
		if stopErr := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{Force: true}); stopErr != nil {
			m.logger.Warn("cleanup: stop task failed", slog.String("container_id", containerID), slog.Any("error", stopErr))
		}
		return err
	}
	// Bridge containers are reached via UDS: drop any stale TCP fallback
	// route so the gRPC pool re-dials the Unix socket.
	if !m.IsLegacyContainer(ctx, containerID) {
		m.clearLegacyRoute(botID)
	}
	return nil
}
|
|
|
|
func (m *Manager) Stop(ctx context.Context, botID string, timeout time.Duration) error {
|
|
if err := validateBotID(botID); err != nil {
|
|
return err
|
|
}
|
|
return m.service.StopContainer(ctx, m.resolveContainerID(ctx, botID), &ctr.StopTaskOptions{
|
|
Timeout: timeout,
|
|
Force: true,
|
|
})
|
|
}
|
|
|
|
// Delete removes a bot's container, task, network attachment, and snapshot.
// When preserveData is true, /data is exported to a backup first; if that
// export fails the deletion is aborted (and the task restarted if it was
// stopped for the export) to prevent data loss. Network/task removal
// failures are logged but do not abort the container deletion.
func (m *Manager) Delete(ctx context.Context, botID string, preserveData bool) error {
	if err := validateBotID(botID); err != nil {
		return err
	}

	containerID := m.resolveContainerID(ctx, botID)

	// Tracks whether we stopped the task purely to export /data, so we know
	// to restart it if the export fails.
	stoppedForPreserve := false

	if preserveData {
		info, err := m.service.GetContainer(ctx, containerID)
		if err != nil {
			// NOTE(review): a NotFound here also aborts the delete — confirm
			// that is intended when preserveData is requested.
			return fmt.Errorf("get container for preserve: %w", err)
		}

		if _, err := m.snapshotMounts(ctx, info); errors.Is(err, errMountNotSupported) {
			// Apple backend fallback uses gRPC against a running container.
		} else if err != nil {
			return err
		} else {
			// Snapshot mounts are accessible: stop the task so the export
			// sees a quiescent filesystem.
			if err := m.safeStopTask(ctx, containerID); err != nil {
				return fmt.Errorf("stop for data preserve: %w", err)
			}
			stoppedForPreserve = true
		}

		if err := m.PreserveData(ctx, botID); err != nil {
			// Export failed — restart only if we stopped the task, and abort
			// deletion to prevent data loss.
			if stoppedForPreserve {
				m.restartContainer(ctx, botID, containerID)
			}
			return fmt.Errorf("preserve data: %w", err)
		}
	}

	// Drop cached gRPC routing before the container disappears.
	m.clearLegacyRoute(botID)

	if err := m.service.RemoveNetwork(ctx, ctr.NetworkSetupRequest{
		ContainerID: containerID,
		CNIBinDir:   m.cfg.CNIBinaryDir,
		CNIConfDir:  m.cfg.CNIConfigDir,
	}); err != nil {
		// Best effort: log and continue with deletion.
		m.logger.Warn("delete: remove network failed",
			slog.String("container_id", containerID), slog.Any("error", err))
	}
	if err := m.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil {
		// Best effort: log and continue with deletion.
		m.logger.Warn("delete: delete task failed",
			slog.String("container_id", containerID), slog.Any("error", err))
	}
	return m.service.DeleteContainer(ctx, containerID, &ctr.DeleteContainerOptions{
		CleanupSnapshot: true,
	})
}
|
|
|
|
func (m *Manager) dataRoot() string {
|
|
if m.cfg.DataRoot == "" {
|
|
return config.DefaultDataRoot
|
|
}
|
|
return m.cfg.DataRoot
|
|
}
|
|
|
|
// imageRef returns the configured default base image reference for
// workspace containers.
func (m *Manager) imageRef() string {
	return m.cfg.ImageRef()
}
|
|
|
|
// IsLegacyContainer returns true if the container was created before the
|
|
// bridge runtime injection architecture (uses the legacy "mcp-" prefix).
|
|
// Legacy containers are functional but unreachable from the server (they
|
|
// use TCP gRPC instead of UDS). Users should delete and recreate them.
|
|
func (*Manager) IsLegacyContainer(_ context.Context, containerID string) bool {
|
|
return strings.HasPrefix(containerID, LegacyContainerPrefix)
|
|
}
|
|
|
|
// validateBotID delegates bot-ID validation to the identity package; bot IDs
// are also used to build filesystem paths, so callers rely on this check
// before any path construction.
func validateBotID(botID string) error {
	return identity.ValidateChannelIdentityID(botID)
}
|