mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-25 07:00:48 +09:00
555 lines
16 KiB
Go
555 lines
16 KiB
Go
package workspace
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"math"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/containerd/errdefs"
|
|
"github.com/google/uuid"
|
|
"github.com/jackc/pgx/v5/pgtype"
|
|
|
|
"github.com/memohai/memoh/internal/config"
|
|
ctr "github.com/memohai/memoh/internal/containerd"
|
|
"github.com/memohai/memoh/internal/db"
|
|
dbsqlc "github.com/memohai/memoh/internal/db/sqlc"
|
|
)
|
|
|
|
const (
|
|
SnapshotSourceManual = "manual"
|
|
SnapshotSourcePreExec = "pre_exec"
|
|
SnapshotSourceRollback = "rollback"
|
|
)
|
|
|
|
type VersionInfo struct {
|
|
ID string
|
|
Version int
|
|
SnapshotName string
|
|
RuntimeSnapshotName string
|
|
DisplayName string
|
|
CreatedAt time.Time
|
|
}
|
|
|
|
type SnapshotCreateInfo struct {
|
|
ContainerID string
|
|
SnapshotName string
|
|
RuntimeSnapshotName string
|
|
DisplayName string
|
|
Snapshotter string
|
|
Version int
|
|
CreatedAt time.Time
|
|
}
|
|
|
|
type ManagedSnapshotMeta struct {
|
|
Source string
|
|
Version *int
|
|
DisplayName string
|
|
}
|
|
|
|
type BotSnapshotData struct {
|
|
ContainerID string
|
|
Info ctr.ContainerInfo
|
|
Snapshotter string
|
|
RuntimeSnapshots []ctr.SnapshotInfo
|
|
ManagedMeta map[string]ManagedSnapshotMeta
|
|
}
|
|
|
|
func (m *Manager) CreateSnapshot(ctx context.Context, botID, snapshotName, source string) (*SnapshotCreateInfo, error) {
|
|
if m.db == nil || m.queries == nil {
|
|
return nil, errors.New("db is not configured")
|
|
}
|
|
if err := validateBotID(botID); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
containerID := m.resolveContainerID(ctx, botID)
|
|
unlock := m.lockContainer(containerID)
|
|
defer unlock()
|
|
|
|
info, err := m.service.GetContainer(ctx, containerID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if _, err := m.ensureDBRecords(ctx, botID, info.ID, info.Runtime.Name, info.Image); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
displayName := strings.TrimSpace(snapshotName)
|
|
runtimeSnapshotName := fmt.Sprintf("%s-snapshot-%d", containerID, time.Now().UnixNano())
|
|
normalizedSource := normalizeSnapshotSource(source)
|
|
|
|
// The sequence below (stop → commit → replace → start) is atomic from the
|
|
// container's perspective: interrupting it mid-way leaves the container missing.
|
|
// Use a detached context so a cancelled HTTP request cannot break it.
|
|
dctx := context.WithoutCancel(ctx)
|
|
|
|
if err := m.safeStopTask(dctx, containerID); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err := m.service.CommitSnapshot(dctx, info.Snapshotter, runtimeSnapshotName, info.SnapshotKey); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
activeSnapshotName := fmt.Sprintf("%s-active-%d", containerID, time.Now().UnixNano())
|
|
if err := m.replaceContainerSnapshot(dctx, botID, containerID, info, activeSnapshotName, runtimeSnapshotName); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
_, versionNumber, createdAt, err := m.recordSnapshotVersion(
|
|
dctx,
|
|
containerID,
|
|
runtimeSnapshotName,
|
|
displayName,
|
|
info.SnapshotKey,
|
|
info.Snapshotter,
|
|
normalizedSource,
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if err := m.insertEvent(dctx, containerID, "snapshot_create", map[string]any{
|
|
"snapshot_name": coalesceSnapshotName(displayName, versionNumber),
|
|
"display_name": displayName,
|
|
"runtime_snapshot_name": runtimeSnapshotName,
|
|
"snapshotter": info.Snapshotter,
|
|
"source": normalizedSource,
|
|
"version": versionNumber,
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &SnapshotCreateInfo{
|
|
ContainerID: containerID,
|
|
SnapshotName: coalesceSnapshotName(displayName, versionNumber),
|
|
RuntimeSnapshotName: runtimeSnapshotName,
|
|
DisplayName: displayName,
|
|
Snapshotter: info.Snapshotter,
|
|
Version: versionNumber,
|
|
CreatedAt: createdAt,
|
|
}, nil
|
|
}
|
|
|
|
func (m *Manager) CreateVersion(ctx context.Context, botID string) (*VersionInfo, error) {
|
|
if m.db == nil || m.queries == nil {
|
|
return nil, errors.New("db is not configured")
|
|
}
|
|
if err := validateBotID(botID); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
containerID := m.resolveContainerID(ctx, botID)
|
|
unlock := m.lockContainer(containerID)
|
|
defer unlock()
|
|
|
|
info, err := m.service.GetContainer(ctx, containerID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if _, err := m.ensureDBRecords(ctx, botID, info.ID, info.Runtime.Name, info.Image); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
dctx := context.WithoutCancel(ctx)
|
|
|
|
if err := m.safeStopTask(dctx, containerID); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
versionSnapshotName := fmt.Sprintf("%s-v%d", containerID, time.Now().UnixNano())
|
|
if err := m.service.CommitSnapshot(dctx, info.Snapshotter, versionSnapshotName, info.SnapshotKey); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
activeSnapshotName := fmt.Sprintf("%s-active-%d", containerID, time.Now().UnixNano())
|
|
if err := m.replaceContainerSnapshot(dctx, botID, containerID, info, activeSnapshotName, versionSnapshotName); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
versionID, versionNumber, createdAt, err := m.recordSnapshotVersion(
|
|
dctx,
|
|
containerID,
|
|
versionSnapshotName,
|
|
"",
|
|
info.SnapshotKey,
|
|
info.Snapshotter,
|
|
SnapshotSourcePreExec,
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err := m.insertEvent(dctx, containerID, "version_create", map[string]any{
|
|
"snapshot_name": versionSnapshotName,
|
|
"version": versionNumber,
|
|
"version_id": versionID,
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &VersionInfo{
|
|
ID: versionID,
|
|
Version: versionNumber,
|
|
SnapshotName: fmt.Sprintf("Version %d", versionNumber),
|
|
RuntimeSnapshotName: versionSnapshotName,
|
|
DisplayName: "",
|
|
CreatedAt: createdAt,
|
|
}, nil
|
|
}
|
|
|
|
// ListBotSnapshotData returns the raw snapshot data for a bot under the
|
|
// per-container lock, so callers never observe transient state during
|
|
// snapshot/version operations.
|
|
func (m *Manager) ListBotSnapshotData(ctx context.Context, botID string) (*BotSnapshotData, error) {
|
|
if err := validateBotID(botID); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
containerID := m.resolveContainerID(ctx, botID)
|
|
unlock := m.lockContainer(containerID)
|
|
defer unlock()
|
|
|
|
info, err := m.service.GetContainer(ctx, containerID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
snapshotter := strings.TrimSpace(info.Snapshotter)
|
|
if snapshotter == "" {
|
|
snapshotter = m.cfg.Snapshotter
|
|
}
|
|
if snapshotter == "" {
|
|
snapshotter = "overlayfs"
|
|
}
|
|
|
|
runtimeSnapshots, err := m.service.ListSnapshots(ctx, snapshotter)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
managedMeta := make(map[string]ManagedSnapshotMeta)
|
|
if m.db != nil && m.queries != nil {
|
|
rows, err := m.queries.ListSnapshotsWithVersionByContainerID(ctx, containerID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for _, row := range rows {
|
|
name := strings.TrimSpace(row.RuntimeSnapshotName)
|
|
if name == "" {
|
|
continue
|
|
}
|
|
meta := ManagedSnapshotMeta{
|
|
Source: strings.TrimSpace(row.Source),
|
|
DisplayName: strings.TrimSpace(row.DisplayName.String),
|
|
}
|
|
if row.Version.Valid {
|
|
v := int(row.Version.Int32)
|
|
meta.Version = &v
|
|
}
|
|
managedMeta[name] = meta
|
|
}
|
|
}
|
|
|
|
return &BotSnapshotData{
|
|
ContainerID: containerID,
|
|
Info: info,
|
|
Snapshotter: snapshotter,
|
|
RuntimeSnapshots: runtimeSnapshots,
|
|
ManagedMeta: managedMeta,
|
|
}, nil
|
|
}
|
|
|
|
func (m *Manager) ListVersions(ctx context.Context, botID string) ([]VersionInfo, error) {
|
|
if m.db == nil || m.queries == nil {
|
|
return nil, errors.New("db is not configured")
|
|
}
|
|
if err := validateBotID(botID); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
containerID := m.resolveContainerID(ctx, botID)
|
|
versions, err := m.queries.ListVersionsByContainerID(ctx, containerID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
out := make([]VersionInfo, 0, len(versions))
|
|
for _, row := range versions {
|
|
createdAt := time.Time{}
|
|
if row.CreatedAt.Valid {
|
|
createdAt = row.CreatedAt.Time
|
|
}
|
|
out = append(out, VersionInfo{
|
|
ID: uuidString(row.ID),
|
|
Version: int(row.Version),
|
|
SnapshotName: coalesceSnapshotName(row.DisplayName.String, int(row.Version)),
|
|
RuntimeSnapshotName: row.RuntimeSnapshotName,
|
|
DisplayName: strings.TrimSpace(row.DisplayName.String),
|
|
CreatedAt: createdAt,
|
|
})
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func (m *Manager) RollbackVersion(ctx context.Context, botID string, version int) error {
|
|
if m.db == nil || m.queries == nil {
|
|
return errors.New("db is not configured")
|
|
}
|
|
if err := validateBotID(botID); err != nil {
|
|
return err
|
|
}
|
|
if version < 1 || version > math.MaxInt32 {
|
|
return errors.New("version out of range")
|
|
}
|
|
|
|
containerID := m.resolveContainerID(ctx, botID)
|
|
unlock := m.lockContainer(containerID)
|
|
defer unlock()
|
|
|
|
snapshotName, err := m.queries.GetVersionSnapshotRuntimeName(ctx, dbsqlc.GetVersionSnapshotRuntimeNameParams{
|
|
ContainerID: containerID,
|
|
Version: int32(version),
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
info, err := m.service.GetContainer(ctx, containerID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
dctx := context.WithoutCancel(ctx)
|
|
|
|
if err := m.safeStopTask(dctx, containerID); err != nil {
|
|
return err
|
|
}
|
|
|
|
activeSnapshotName := fmt.Sprintf("%s-rollback-%d", containerID, time.Now().UnixNano())
|
|
if err := m.replaceContainerSnapshot(dctx, botID, containerID, info, activeSnapshotName, snapshotName); err != nil {
|
|
return err
|
|
}
|
|
|
|
return m.insertEvent(dctx, containerID, "version_rollback", map[string]any{
|
|
"snapshot_name": snapshotName,
|
|
"version": version,
|
|
"source": SnapshotSourceRollback,
|
|
})
|
|
}
|
|
|
|
func (m *Manager) VersionSnapshotName(ctx context.Context, botID string, version int) (string, error) {
|
|
if m.db == nil || m.queries == nil {
|
|
return "", errors.New("db is not configured")
|
|
}
|
|
if err := validateBotID(botID); err != nil {
|
|
return "", err
|
|
}
|
|
if version < 1 || version > math.MaxInt32 {
|
|
return "", errors.New("version out of range")
|
|
}
|
|
|
|
containerID := m.resolveContainerID(ctx, botID)
|
|
return m.queries.GetVersionSnapshotRuntimeName(ctx, dbsqlc.GetVersionSnapshotRuntimeNameParams{
|
|
ContainerID: containerID,
|
|
Version: int32(version),
|
|
})
|
|
}
|
|
|
|
// replaceContainerSnapshot prepares a new active snapshot from parentSnapshot,
|
|
// deletes the old container, recreates it on the new snapshot, and restarts the task.
|
|
// Caller must pass a detached context (context.WithoutCancel) to guarantee atomicity.
|
|
func (m *Manager) replaceContainerSnapshot(ctx context.Context, botID, containerID string, info ctr.ContainerInfo, activeSnapshotName, parentSnapshot string) error {
|
|
if err := m.service.PrepareSnapshot(ctx, info.Snapshotter, activeSnapshotName, parentSnapshot); err != nil {
|
|
return err
|
|
}
|
|
if err := m.service.DeleteContainer(ctx, containerID, &ctr.DeleteContainerOptions{CleanupSnapshot: false}); err != nil {
|
|
return err
|
|
}
|
|
spec, err := m.buildVersionSpec(ctx, botID, workspaceCDIDevicesFromLabels(info.Labels))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if _, err := m.service.CreateContainerFromSnapshot(ctx, ctr.CreateContainerRequest{
|
|
ID: containerID,
|
|
ImageRef: info.Image,
|
|
SnapshotID: activeSnapshotName,
|
|
Snapshotter: info.Snapshotter,
|
|
Labels: info.Labels,
|
|
Spec: spec,
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
if err := m.service.StartContainer(ctx, containerID, nil); err != nil {
|
|
return err
|
|
}
|
|
// Container process was recreated — evict the stale gRPC connection
|
|
// unconditionally so the next call dials fresh to the new process.
|
|
m.grpcPool.Remove(botID)
|
|
|
|
// CNI network setup (for outbound connectivity).
|
|
if _, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
|
|
ContainerID: containerID,
|
|
CNIBinDir: m.cfg.CNIBinaryDir,
|
|
CNIConfDir: m.cfg.CNIConfigDir,
|
|
}); err != nil {
|
|
return fmt.Errorf("network setup after snapshot replace: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (m *Manager) buildVersionSpec(ctx context.Context, botID string, cdiDevices []string) (ctr.ContainerSpec, error) {
|
|
if len(cdiDevices) == 0 {
|
|
gpu, err := m.resolveWorkspaceGPU(ctx, botID)
|
|
if err != nil {
|
|
return ctr.ContainerSpec{}, err
|
|
}
|
|
cdiDevices = gpu.Devices
|
|
}
|
|
return m.buildWorkspaceContainerSpec(ctx, botID, WorkspaceGPUConfig{Devices: cdiDevices})
|
|
}
|
|
|
|
func (m *Manager) safeStopTask(ctx context.Context, containerID string) error {
|
|
err := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{
|
|
Timeout: 10 * time.Second,
|
|
Force: true,
|
|
})
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
if errdefs.IsNotFound(err) {
|
|
return nil
|
|
}
|
|
return err
|
|
}
|
|
|
|
func (m *Manager) ensureDBRecords(ctx context.Context, botID, containerID, _ string, imageRef string) (pgtype.UUID, error) {
|
|
botUUID, err := db.ParseUUID(botID)
|
|
if err != nil {
|
|
return pgtype.UUID{}, err
|
|
}
|
|
if _, err := m.queries.GetBotByID(ctx, botUUID); err != nil {
|
|
return pgtype.UUID{}, err
|
|
}
|
|
|
|
containerPath := config.DefaultDataMount
|
|
if err := m.queries.UpsertContainer(ctx, dbsqlc.UpsertContainerParams{
|
|
BotID: botUUID,
|
|
ContainerID: containerID,
|
|
ContainerName: containerID,
|
|
Image: imageRef,
|
|
Status: "created",
|
|
Namespace: "default",
|
|
AutoStart: true,
|
|
ContainerPath: containerPath,
|
|
LastStartedAt: pgtype.Timestamptz{},
|
|
LastStoppedAt: pgtype.Timestamptz{},
|
|
}); err != nil {
|
|
return pgtype.UUID{}, err
|
|
}
|
|
|
|
return botUUID, nil
|
|
}
|
|
|
|
func (m *Manager) recordSnapshotVersion(ctx context.Context, containerID, runtimeSnapshotName, displayName, parentRuntimeSnapshotName, snapshotter, source string) (string, int, time.Time, error) {
|
|
containerID = strings.TrimSpace(containerID)
|
|
runtimeSnapshotName = strings.TrimSpace(runtimeSnapshotName)
|
|
snapshotter = strings.TrimSpace(snapshotter)
|
|
if containerID == "" || runtimeSnapshotName == "" || snapshotter == "" {
|
|
return "", 0, time.Time{}, ctr.ErrInvalidArgument
|
|
}
|
|
|
|
tx, err := m.db.Begin(ctx)
|
|
if err != nil {
|
|
return "", 0, time.Time{}, err
|
|
}
|
|
defer func() { _ = tx.Rollback(ctx) }()
|
|
|
|
qtx := m.queries.WithTx(tx)
|
|
|
|
parent := pgtype.Text{}
|
|
normalizedParent := strings.TrimSpace(parentRuntimeSnapshotName)
|
|
if normalizedParent != "" {
|
|
parent = pgtype.Text{String: normalizedParent, Valid: true}
|
|
}
|
|
snapshotRow, err := qtx.UpsertSnapshot(ctx, dbsqlc.UpsertSnapshotParams{
|
|
ContainerID: containerID,
|
|
RuntimeSnapshotName: runtimeSnapshotName,
|
|
DisplayName: pgtype.Text{String: strings.TrimSpace(displayName), Valid: strings.TrimSpace(displayName) != ""},
|
|
ParentRuntimeSnapshotName: parent,
|
|
Snapshotter: snapshotter,
|
|
Source: normalizeSnapshotSource(source),
|
|
})
|
|
if err != nil {
|
|
return "", 0, time.Time{}, err
|
|
}
|
|
|
|
version, err := qtx.NextVersion(ctx, containerID)
|
|
if err != nil {
|
|
return "", 0, time.Time{}, err
|
|
}
|
|
|
|
versionRow, err := qtx.InsertVersion(ctx, dbsqlc.InsertVersionParams{
|
|
ContainerID: containerID,
|
|
SnapshotID: snapshotRow.ID,
|
|
Version: version,
|
|
})
|
|
if err != nil {
|
|
return "", 0, time.Time{}, err
|
|
}
|
|
|
|
if err := tx.Commit(ctx); err != nil {
|
|
return "", 0, time.Time{}, err
|
|
}
|
|
|
|
createdAt := time.Time{}
|
|
if versionRow.CreatedAt.Valid {
|
|
createdAt = versionRow.CreatedAt.Time
|
|
}
|
|
|
|
return uuidString(versionRow.ID), int(version), createdAt, nil
|
|
}
|
|
|
|
func (m *Manager) insertEvent(ctx context.Context, containerID, eventType string, payload map[string]any) error {
|
|
b, err := json.Marshal(payload)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return m.queries.InsertLifecycleEvent(ctx, dbsqlc.InsertLifecycleEventParams{
|
|
ID: fmt.Sprintf("%s-%d", containerID, time.Now().UnixNano()),
|
|
ContainerID: containerID,
|
|
EventType: eventType,
|
|
Payload: b,
|
|
})
|
|
}
|
|
|
|
func normalizeSnapshotSource(source string) string {
|
|
s := strings.TrimSpace(source)
|
|
if s == "" {
|
|
return SnapshotSourceManual
|
|
}
|
|
return s
|
|
}
|
|
|
|
func coalesceSnapshotName(displayName string, version int) string {
|
|
displayName = strings.TrimSpace(displayName)
|
|
if displayName != "" {
|
|
return displayName
|
|
}
|
|
if version > 0 {
|
|
return fmt.Sprintf("Version %d", version)
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func uuidString(v pgtype.UUID) string {
|
|
if !v.Valid {
|
|
return ""
|
|
}
|
|
return uuid.UUID(v.Bytes).String()
|
|
}
|