Files
Memoh/internal/workspace/versioning.go
T
Ming Lin 4d3f2de7e2 feat: Add GPU CDI support for workspace containers (#332)
* feat: add CDI GPU support for workspace containers

* feat: expose GPU CDI settings in bot container UI

* feat: move GPU settings into advanced container options

* docs: document advanced CDI device configuration
2026-04-10 14:52:17 +08:00

555 lines
16 KiB
Go

package workspace
import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"strings"
"time"
"github.com/containerd/errdefs"
"github.com/google/uuid"
"github.com/jackc/pgx/v5/pgtype"
"github.com/memohai/memoh/internal/config"
ctr "github.com/memohai/memoh/internal/containerd"
"github.com/memohai/memoh/internal/db"
dbsqlc "github.com/memohai/memoh/internal/db/sqlc"
)
const (
SnapshotSourceManual = "manual"
SnapshotSourcePreExec = "pre_exec"
SnapshotSourceRollback = "rollback"
)
type VersionInfo struct {
ID string
Version int
SnapshotName string
RuntimeSnapshotName string
DisplayName string
CreatedAt time.Time
}
type SnapshotCreateInfo struct {
ContainerID string
SnapshotName string
RuntimeSnapshotName string
DisplayName string
Snapshotter string
Version int
CreatedAt time.Time
}
type ManagedSnapshotMeta struct {
Source string
Version *int
DisplayName string
}
type BotSnapshotData struct {
ContainerID string
Info ctr.ContainerInfo
Snapshotter string
RuntimeSnapshots []ctr.SnapshotInfo
ManagedMeta map[string]ManagedSnapshotMeta
}
func (m *Manager) CreateSnapshot(ctx context.Context, botID, snapshotName, source string) (*SnapshotCreateInfo, error) {
if m.db == nil || m.queries == nil {
return nil, errors.New("db is not configured")
}
if err := validateBotID(botID); err != nil {
return nil, err
}
containerID := m.resolveContainerID(ctx, botID)
unlock := m.lockContainer(containerID)
defer unlock()
info, err := m.service.GetContainer(ctx, containerID)
if err != nil {
return nil, err
}
if _, err := m.ensureDBRecords(ctx, botID, info.ID, info.Runtime.Name, info.Image); err != nil {
return nil, err
}
displayName := strings.TrimSpace(snapshotName)
runtimeSnapshotName := fmt.Sprintf("%s-snapshot-%d", containerID, time.Now().UnixNano())
normalizedSource := normalizeSnapshotSource(source)
// The sequence below (stop → commit → replace → start) is atomic from the
// container's perspective: interrupting it mid-way leaves the container missing.
// Use a detached context so a cancelled HTTP request cannot break it.
dctx := context.WithoutCancel(ctx)
if err := m.safeStopTask(dctx, containerID); err != nil {
return nil, err
}
if err := m.service.CommitSnapshot(dctx, info.Snapshotter, runtimeSnapshotName, info.SnapshotKey); err != nil {
return nil, err
}
activeSnapshotName := fmt.Sprintf("%s-active-%d", containerID, time.Now().UnixNano())
if err := m.replaceContainerSnapshot(dctx, botID, containerID, info, activeSnapshotName, runtimeSnapshotName); err != nil {
return nil, err
}
_, versionNumber, createdAt, err := m.recordSnapshotVersion(
dctx,
containerID,
runtimeSnapshotName,
displayName,
info.SnapshotKey,
info.Snapshotter,
normalizedSource,
)
if err != nil {
return nil, err
}
if err := m.insertEvent(dctx, containerID, "snapshot_create", map[string]any{
"snapshot_name": coalesceSnapshotName(displayName, versionNumber),
"display_name": displayName,
"runtime_snapshot_name": runtimeSnapshotName,
"snapshotter": info.Snapshotter,
"source": normalizedSource,
"version": versionNumber,
}); err != nil {
return nil, err
}
return &SnapshotCreateInfo{
ContainerID: containerID,
SnapshotName: coalesceSnapshotName(displayName, versionNumber),
RuntimeSnapshotName: runtimeSnapshotName,
DisplayName: displayName,
Snapshotter: info.Snapshotter,
Version: versionNumber,
CreatedAt: createdAt,
}, nil
}
func (m *Manager) CreateVersion(ctx context.Context, botID string) (*VersionInfo, error) {
if m.db == nil || m.queries == nil {
return nil, errors.New("db is not configured")
}
if err := validateBotID(botID); err != nil {
return nil, err
}
containerID := m.resolveContainerID(ctx, botID)
unlock := m.lockContainer(containerID)
defer unlock()
info, err := m.service.GetContainer(ctx, containerID)
if err != nil {
return nil, err
}
if _, err := m.ensureDBRecords(ctx, botID, info.ID, info.Runtime.Name, info.Image); err != nil {
return nil, err
}
dctx := context.WithoutCancel(ctx)
if err := m.safeStopTask(dctx, containerID); err != nil {
return nil, err
}
versionSnapshotName := fmt.Sprintf("%s-v%d", containerID, time.Now().UnixNano())
if err := m.service.CommitSnapshot(dctx, info.Snapshotter, versionSnapshotName, info.SnapshotKey); err != nil {
return nil, err
}
activeSnapshotName := fmt.Sprintf("%s-active-%d", containerID, time.Now().UnixNano())
if err := m.replaceContainerSnapshot(dctx, botID, containerID, info, activeSnapshotName, versionSnapshotName); err != nil {
return nil, err
}
versionID, versionNumber, createdAt, err := m.recordSnapshotVersion(
dctx,
containerID,
versionSnapshotName,
"",
info.SnapshotKey,
info.Snapshotter,
SnapshotSourcePreExec,
)
if err != nil {
return nil, err
}
if err := m.insertEvent(dctx, containerID, "version_create", map[string]any{
"snapshot_name": versionSnapshotName,
"version": versionNumber,
"version_id": versionID,
}); err != nil {
return nil, err
}
return &VersionInfo{
ID: versionID,
Version: versionNumber,
SnapshotName: fmt.Sprintf("Version %d", versionNumber),
RuntimeSnapshotName: versionSnapshotName,
DisplayName: "",
CreatedAt: createdAt,
}, nil
}
// ListBotSnapshotData returns the raw snapshot data for a bot under the
// per-container lock, so callers never observe transient state during
// snapshot/version operations.
func (m *Manager) ListBotSnapshotData(ctx context.Context, botID string) (*BotSnapshotData, error) {
if err := validateBotID(botID); err != nil {
return nil, err
}
containerID := m.resolveContainerID(ctx, botID)
unlock := m.lockContainer(containerID)
defer unlock()
info, err := m.service.GetContainer(ctx, containerID)
if err != nil {
return nil, err
}
snapshotter := strings.TrimSpace(info.Snapshotter)
if snapshotter == "" {
snapshotter = m.cfg.Snapshotter
}
if snapshotter == "" {
snapshotter = "overlayfs"
}
runtimeSnapshots, err := m.service.ListSnapshots(ctx, snapshotter)
if err != nil {
return nil, err
}
managedMeta := make(map[string]ManagedSnapshotMeta)
if m.queries != nil {
rows, err := m.queries.ListSnapshotsWithVersionByContainerID(ctx, containerID)
if err != nil {
return nil, err
}
for _, row := range rows {
name := strings.TrimSpace(row.RuntimeSnapshotName)
if name == "" {
continue
}
meta := ManagedSnapshotMeta{
Source: strings.TrimSpace(row.Source),
DisplayName: strings.TrimSpace(row.DisplayName.String),
}
if row.Version.Valid {
v := int(row.Version.Int32)
meta.Version = &v
}
managedMeta[name] = meta
}
}
return &BotSnapshotData{
ContainerID: containerID,
Info: info,
Snapshotter: snapshotter,
RuntimeSnapshots: runtimeSnapshots,
ManagedMeta: managedMeta,
}, nil
}
func (m *Manager) ListVersions(ctx context.Context, botID string) ([]VersionInfo, error) {
if m.db == nil || m.queries == nil {
return nil, errors.New("db is not configured")
}
if err := validateBotID(botID); err != nil {
return nil, err
}
containerID := m.resolveContainerID(ctx, botID)
versions, err := m.queries.ListVersionsByContainerID(ctx, containerID)
if err != nil {
return nil, err
}
out := make([]VersionInfo, 0, len(versions))
for _, row := range versions {
createdAt := time.Time{}
if row.CreatedAt.Valid {
createdAt = row.CreatedAt.Time
}
out = append(out, VersionInfo{
ID: uuidString(row.ID),
Version: int(row.Version),
SnapshotName: coalesceSnapshotName(row.DisplayName.String, int(row.Version)),
RuntimeSnapshotName: row.RuntimeSnapshotName,
DisplayName: strings.TrimSpace(row.DisplayName.String),
CreatedAt: createdAt,
})
}
return out, nil
}
func (m *Manager) RollbackVersion(ctx context.Context, botID string, version int) error {
if m.db == nil || m.queries == nil {
return errors.New("db is not configured")
}
if err := validateBotID(botID); err != nil {
return err
}
if version < 1 || version > math.MaxInt32 {
return errors.New("version out of range")
}
containerID := m.resolveContainerID(ctx, botID)
unlock := m.lockContainer(containerID)
defer unlock()
snapshotName, err := m.queries.GetVersionSnapshotRuntimeName(ctx, dbsqlc.GetVersionSnapshotRuntimeNameParams{
ContainerID: containerID,
Version: int32(version),
})
if err != nil {
return err
}
info, err := m.service.GetContainer(ctx, containerID)
if err != nil {
return err
}
dctx := context.WithoutCancel(ctx)
if err := m.safeStopTask(dctx, containerID); err != nil {
return err
}
activeSnapshotName := fmt.Sprintf("%s-rollback-%d", containerID, time.Now().UnixNano())
if err := m.replaceContainerSnapshot(dctx, botID, containerID, info, activeSnapshotName, snapshotName); err != nil {
return err
}
return m.insertEvent(dctx, containerID, "version_rollback", map[string]any{
"snapshot_name": snapshotName,
"version": version,
"source": SnapshotSourceRollback,
})
}
func (m *Manager) VersionSnapshotName(ctx context.Context, botID string, version int) (string, error) {
if m.db == nil || m.queries == nil {
return "", errors.New("db is not configured")
}
if err := validateBotID(botID); err != nil {
return "", err
}
if version < 1 || version > math.MaxInt32 {
return "", errors.New("version out of range")
}
containerID := m.resolveContainerID(ctx, botID)
return m.queries.GetVersionSnapshotRuntimeName(ctx, dbsqlc.GetVersionSnapshotRuntimeNameParams{
ContainerID: containerID,
Version: int32(version),
})
}
// replaceContainerSnapshot prepares a new active snapshot from parentSnapshot,
// deletes the old container, recreates it on the new snapshot, and restarts the task.
// Caller must pass a detached context (context.WithoutCancel) to guarantee atomicity.
func (m *Manager) replaceContainerSnapshot(ctx context.Context, botID, containerID string, info ctr.ContainerInfo, activeSnapshotName, parentSnapshot string) error {
if err := m.service.PrepareSnapshot(ctx, info.Snapshotter, activeSnapshotName, parentSnapshot); err != nil {
return err
}
if err := m.service.DeleteContainer(ctx, containerID, &ctr.DeleteContainerOptions{CleanupSnapshot: false}); err != nil {
return err
}
spec, err := m.buildVersionSpec(ctx, botID, workspaceCDIDevicesFromLabels(info.Labels))
if err != nil {
return err
}
if _, err := m.service.CreateContainerFromSnapshot(ctx, ctr.CreateContainerRequest{
ID: containerID,
ImageRef: info.Image,
SnapshotID: activeSnapshotName,
Snapshotter: info.Snapshotter,
Labels: info.Labels,
Spec: spec,
}); err != nil {
return err
}
if err := m.service.StartContainer(ctx, containerID, nil); err != nil {
return err
}
// Container process was recreated — evict the stale gRPC connection
// unconditionally so the next call dials fresh to the new process.
m.grpcPool.Remove(botID)
// CNI network setup (for outbound connectivity).
if _, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
ContainerID: containerID,
CNIBinDir: m.cfg.CNIBinaryDir,
CNIConfDir: m.cfg.CNIConfigDir,
}); err != nil {
return fmt.Errorf("network setup after snapshot replace: %w", err)
}
return nil
}
func (m *Manager) buildVersionSpec(ctx context.Context, botID string, cdiDevices []string) (ctr.ContainerSpec, error) {
if len(cdiDevices) == 0 {
gpu, err := m.resolveWorkspaceGPU(ctx, botID)
if err != nil {
return ctr.ContainerSpec{}, err
}
cdiDevices = gpu.Devices
}
return m.buildWorkspaceContainerSpec(botID, WorkspaceGPUConfig{Devices: cdiDevices})
}
func (m *Manager) safeStopTask(ctx context.Context, containerID string) error {
err := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{
Timeout: 10 * time.Second,
Force: true,
})
if err == nil {
return nil
}
if errdefs.IsNotFound(err) {
return nil
}
return err
}
func (m *Manager) ensureDBRecords(ctx context.Context, botID, containerID, _ string, imageRef string) (pgtype.UUID, error) {
botUUID, err := db.ParseUUID(botID)
if err != nil {
return pgtype.UUID{}, err
}
if _, err := m.queries.GetBotByID(ctx, botUUID); err != nil {
return pgtype.UUID{}, err
}
containerPath := config.DefaultDataMount
if err := m.queries.UpsertContainer(ctx, dbsqlc.UpsertContainerParams{
BotID: botUUID,
ContainerID: containerID,
ContainerName: containerID,
Image: imageRef,
Status: "created",
Namespace: "default",
AutoStart: true,
ContainerPath: containerPath,
LastStartedAt: pgtype.Timestamptz{},
LastStoppedAt: pgtype.Timestamptz{},
}); err != nil {
return pgtype.UUID{}, err
}
return botUUID, nil
}
func (m *Manager) recordSnapshotVersion(ctx context.Context, containerID, runtimeSnapshotName, displayName, parentRuntimeSnapshotName, snapshotter, source string) (string, int, time.Time, error) {
containerID = strings.TrimSpace(containerID)
runtimeSnapshotName = strings.TrimSpace(runtimeSnapshotName)
snapshotter = strings.TrimSpace(snapshotter)
if containerID == "" || runtimeSnapshotName == "" || snapshotter == "" {
return "", 0, time.Time{}, ctr.ErrInvalidArgument
}
tx, err := m.db.Begin(ctx)
if err != nil {
return "", 0, time.Time{}, err
}
defer func() { _ = tx.Rollback(ctx) }()
qtx := m.queries.WithTx(tx)
parent := pgtype.Text{}
normalizedParent := strings.TrimSpace(parentRuntimeSnapshotName)
if normalizedParent != "" {
parent = pgtype.Text{String: normalizedParent, Valid: true}
}
snapshotRow, err := qtx.UpsertSnapshot(ctx, dbsqlc.UpsertSnapshotParams{
ContainerID: containerID,
RuntimeSnapshotName: runtimeSnapshotName,
DisplayName: pgtype.Text{String: strings.TrimSpace(displayName), Valid: strings.TrimSpace(displayName) != ""},
ParentRuntimeSnapshotName: parent,
Snapshotter: snapshotter,
Source: normalizeSnapshotSource(source),
})
if err != nil {
return "", 0, time.Time{}, err
}
version, err := qtx.NextVersion(ctx, containerID)
if err != nil {
return "", 0, time.Time{}, err
}
versionRow, err := qtx.InsertVersion(ctx, dbsqlc.InsertVersionParams{
ContainerID: containerID,
SnapshotID: snapshotRow.ID,
Version: version,
})
if err != nil {
return "", 0, time.Time{}, err
}
if err := tx.Commit(ctx); err != nil {
return "", 0, time.Time{}, err
}
createdAt := time.Time{}
if versionRow.CreatedAt.Valid {
createdAt = versionRow.CreatedAt.Time
}
return uuidString(versionRow.ID), int(version), createdAt, nil
}
func (m *Manager) insertEvent(ctx context.Context, containerID, eventType string, payload map[string]any) error {
b, err := json.Marshal(payload)
if err != nil {
return err
}
return m.queries.InsertLifecycleEvent(ctx, dbsqlc.InsertLifecycleEventParams{
ID: fmt.Sprintf("%s-%d", containerID, time.Now().UnixNano()),
ContainerID: containerID,
EventType: eventType,
Payload: b,
})
}
func normalizeSnapshotSource(source string) string {
s := strings.TrimSpace(source)
if s == "" {
return SnapshotSourceManual
}
return s
}
func coalesceSnapshotName(displayName string, version int) string {
displayName = strings.TrimSpace(displayName)
if displayName != "" {
return displayName
}
if version > 0 {
return fmt.Sprintf("Version %d", version)
}
return ""
}
func uuidString(v pgtype.UUID) string {
if !v.Valid {
return ""
}
return uuid.UUID(v.Bytes).String()
}