feat(container): add current container metrics view

Expose a dedicated container metrics endpoint and surface current CPU, memory, and root filesystem usage in the bot container view. This gives operators a quick health snapshot while degrading cleanly on unsupported backends.
This commit is contained in:
Acbox
2026-04-24 15:10:47 +08:00
parent 8136ef6ed6
commit e4aca0db13
20 changed files with 1198 additions and 6 deletions
+15
View File
@@ -869,6 +869,21 @@
"snapshotLoadFailed": "Failed to load snapshots", "snapshotLoadFailed": "Failed to load snapshots",
"snapshotNamePlaceholder": "Snapshot display name (optional)", "snapshotNamePlaceholder": "Snapshot display name (optional)",
"snapshotNameHint": "This field is only for the user-visible display name. The internal snapshot name is generated automatically.", "snapshotNameHint": "This field is only for the user-visible display name. The internal snapshot name is generated automatically.",
"metricsTitle": "Resource Status",
"metricsSubtitle": "View CPU, memory, and storage usage for the container's entire filesystem.",
"metricsLoadFailed": "Failed to load container resource status",
"metricsUnsupported": "The current container backend does not support resource monitoring.",
"metricsUnavailable": "No resource metrics available.",
"metricsStopped": "The container task is not running; CPU and memory metrics are unavailable. Storage information will still be shown if available.",
"metricsPath": "Scope",
"metricsUnlimited": "No memory limit configured",
"currentSample": "Current sample",
"sampledAt": "Sampled at",
"metricsLabels": {
"cpu": "CPU",
"memory": "Memory",
"storage": "Storage"
},
"dataTitle": "Data Operations", "dataTitle": "Data Operations",
"dataSubtitle": "Independently manage import, export, and restore for the container `/data` directory.", "dataSubtitle": "Independently manage import, export, and restore for the container `/data` directory.",
"deleteTitle": "Delete Container", "deleteTitle": "Delete Container",
+15
View File
@@ -865,6 +865,21 @@
"snapshotLoadFailed": "加载快照失败", "snapshotLoadFailed": "加载快照失败",
"snapshotNamePlaceholder": "快照显示名称(可选)", "snapshotNamePlaceholder": "快照显示名称(可选)",
"snapshotNameHint": "这里只填写用户可见的显示名称,系统会自动生成内部快照名。", "snapshotNameHint": "这里只填写用户可见的显示名称,系统会自动生成内部快照名。",
"metricsTitle": "资源状态",
"metricsSubtitle": "查看当前容器的 CPU、内存与整个容器文件系统的存储使用情况。",
"metricsLoadFailed": "加载容器资源状态失败",
"metricsUnsupported": "当前容器后端暂不支持资源监控。",
"metricsUnavailable": "当前暂无可用的资源指标。",
"metricsStopped": "容器任务未运行,CPU 和内存指标暂不可用;如有存储信息仍会继续显示。",
"metricsPath": "统计范围",
"metricsUnlimited": "未配置内存限制",
"currentSample": "当前采样",
"sampledAt": "采样时间",
"metricsLabels": {
"cpu": "CPU",
"memory": "内存",
"storage": "存储"
},
"dataTitle": "数据操作", "dataTitle": "数据操作",
"dataSubtitle": "独立管理容器 `/data` 目录的导入、导出与恢复。", "dataSubtitle": "独立管理容器 `/data` 目录的导入、导出与恢复。",
"deleteTitle": "删除容器", "deleteTitle": "删除容器",
@@ -8,6 +8,7 @@ import { ChevronRight } from 'lucide-vue-next'
import { import {
deleteBotsByBotIdContainer, deleteBotsByBotIdContainer,
getBotsByBotIdContainer, getBotsByBotIdContainer,
getBotsByBotIdContainerMetrics,
getBotsByBotIdContainerSnapshots, getBotsByBotIdContainerSnapshots,
getBotsById, getBotsById,
postBotsByBotIdContainerDataExport, postBotsByBotIdContainerDataExport,
@@ -18,6 +19,7 @@ import {
postBotsByBotIdContainerStart, postBotsByBotIdContainerStart,
postBotsByBotIdContainerStop, postBotsByBotIdContainerStop,
type HandlersCreateContainerRequest, type HandlersCreateContainerRequest,
type HandlersGetContainerMetricsResponse,
type HandlersGetContainerResponse, type HandlersGetContainerResponse,
type HandlersListSnapshotsResponse, type HandlersListSnapshotsResponse,
} from '@memohai/sdk' } from '@memohai/sdk'
@@ -29,6 +31,7 @@ import {
import { Button, Collapsible, CollapsibleContent, CollapsibleTrigger, Input, Label, Separator, Spinner, Switch, Textarea } from '@memohai/ui' import { Button, Collapsible, CollapsibleContent, CollapsibleTrigger, Input, Label, Separator, Spinner, Switch, Textarea } from '@memohai/ui'
import ConfirmPopover from '@/components/confirm-popover/index.vue' import ConfirmPopover from '@/components/confirm-popover/index.vue'
import ContainerCreateProgress from './container-create-progress.vue' import ContainerCreateProgress from './container-create-progress.vue'
import ContainerMetricsPanel from './container-metrics-panel.vue'
import { useSyncedQueryParam } from '@/composables/useSyncedQueryParam' import { useSyncedQueryParam } from '@/composables/useSyncedQueryParam'
import { useBotStatusMeta } from '@/composables/useBotStatusMeta' import { useBotStatusMeta } from '@/composables/useBotStatusMeta'
import { useCapabilitiesStore } from '@/store/capabilities' import { useCapabilitiesStore } from '@/store/capabilities'
@@ -92,11 +95,14 @@ const botId = computed(() => route.params.botId as string)
const containerBusy = computed(() => containerLoading.value || containerAction.value !== '') const containerBusy = computed(() => containerLoading.value || containerAction.value !== '')
type BotContainerInfo = HandlersGetContainerResponse type BotContainerInfo = HandlersGetContainerResponse
type BotContainerMetrics = HandlersGetContainerMetricsResponse
type BotContainerSnapshot = HandlersListSnapshotsResponse extends { snapshots?: (infer T)[] } ? T : never type BotContainerSnapshot = HandlersListSnapshotsResponse extends { snapshots?: (infer T)[] } ? T : never
const containerInfo = ref<BotContainerInfo | null>(null) const containerInfo = ref<BotContainerInfo | null>(null)
const containerMetrics = ref<BotContainerMetrics | null>(null)
const containerMissing = ref(false) const containerMissing = ref(false)
const snapshots = ref<BotContainerSnapshot[]>([]) const snapshots = ref<BotContainerSnapshot[]>([])
const metricsLoading = ref(false)
const snapshotsLoading = ref(false) const snapshotsLoading = ref(false)
function resolveErrorMessage(error: unknown, fallback: string): string { function resolveErrorMessage(error: unknown, fallback: string): string {
@@ -134,6 +140,7 @@ async function loadContainerData(showLoadingToast: boolean) {
if (result.error !== undefined) { if (result.error !== undefined) {
if (result.response.status === 404) { if (result.response.status === 404) {
containerInfo.value = null containerInfo.value = null
containerMetrics.value = null
containerMissing.value = true containerMissing.value = true
snapshots.value = [] snapshots.value = []
return return
@@ -144,10 +151,13 @@ async function loadContainerData(showLoadingToast: boolean) {
containerInfo.value = result.data containerInfo.value = result.data
containerMissing.value = false containerMissing.value = false
const metricsPromise = loadContainerMetrics(showLoadingToast)
if (capabilitiesStore.snapshotSupported) { if (capabilitiesStore.snapshotSupported) {
await loadSnapshots() await Promise.all([metricsPromise, loadSnapshots()])
} else { } else {
snapshots.value = [] snapshots.value = []
await metricsPromise
} }
} catch (error) { } catch (error) {
if (showLoadingToast) { if (showLoadingToast) {
@@ -158,6 +168,24 @@ async function loadContainerData(showLoadingToast: boolean) {
} }
} }
/**
 * Fetches the current metrics snapshot for the active bot's container and
 * stores it in `containerMetrics`.
 *
 * `metricsLoading` is raised for the duration of the request. On failure the
 * previously stored snapshot is cleared, and an error toast is shown only when
 * `showLoadingToast` is true.
 */
async function loadContainerMetrics(showLoadingToast: boolean) {
  metricsLoading.value = true
  try {
    const response = await getBotsByBotIdContainerMetrics({
      path: { bot_id: botId.value },
      throwOnError: true,
    })
    containerMetrics.value = response.data
  } catch (error) {
    // Never keep showing a sample that could not be refreshed.
    containerMetrics.value = null
    if (!showLoadingToast) {
      return
    }
    toast.error(resolveErrorMessage(error, t('bots.container.metricsLoadFailed')))
  } finally {
    metricsLoading.value = false
  }
}
async function loadSnapshots() { async function loadSnapshots() {
if (!containerInfo.value || !capabilitiesStore.snapshotSupported) { if (!containerInfo.value || !capabilitiesStore.snapshotSupported) {
snapshots.value = [] snapshots.value = []
@@ -411,6 +439,7 @@ async function handleDeleteContainer(preserveData: boolean) {
throwOnError: true, throwOnError: true,
}) })
containerInfo.value = null containerInfo.value = null
containerMetrics.value = null
containerMissing.value = true containerMissing.value = true
snapshots.value = [] snapshots.value = []
createRestoreData.value = preserveData createRestoreData.value = preserveData
@@ -958,6 +987,12 @@ watch([activeTab, botId], ([tab]) => {
</dl> </dl>
</div> </div>
<ContainerMetricsPanel
:backend="capabilitiesStore.containerBackend"
:loading="metricsLoading"
:metrics="containerMetrics"
/>
<div class="rounded-md border px-3 py-2 text-xs text-muted-foreground"> <div class="rounded-md border px-3 py-2 text-xs text-muted-foreground">
{{ $t('bots.container.gpuRecreateHint') }} {{ $t('bots.container.gpuRecreateHint') }}
</div> </div>
@@ -0,0 +1,159 @@
<template>
<div class="space-y-4 rounded-md border p-4">
<div class="space-y-1">
<h4 class="text-xs font-medium">
{{ t('bots.container.metricsTitle') }}
</h4>
<p class="text-xs text-muted-foreground">
{{ t('bots.container.metricsSubtitle') }}
</p>
</div>
<div
v-if="loading && !metrics"
class="flex items-center gap-2 text-xs text-muted-foreground"
>
<Spinner />
<span>{{ t('common.loading') }}</span>
</div>
<div
v-else-if="backendUnsupported"
class="rounded-md border border-dashed px-3 py-2 text-xs text-muted-foreground"
>
{{ t('bots.container.metricsUnsupported') }}
</div>
<div
v-else-if="!hasAnyMetric"
class="rounded-md border border-dashed px-3 py-2 text-xs text-muted-foreground"
>
{{ taskRunning === false ? t('bots.container.metricsStopped') : t('bots.container.metricsUnavailable') }}
</div>
<template v-else>
<div
v-if="taskRunning === false"
class="rounded-md border border-primary/20 bg-primary/5 px-3 py-2 text-xs"
>
{{ t('bots.container.metricsStopped') }}
</div>
<div class="grid gap-3 md:grid-cols-3">
<div class="rounded-md border bg-background/70 p-3">
<p class="text-xs text-muted-foreground">
{{ t('bots.container.metricsLabels.cpu') }}
</p>
<p class="mt-2 text-2xl font-semibold">
{{ cpuValueText }}
</p>
<p class="mt-2 text-[11px] text-muted-foreground">
{{ t('bots.container.currentSample') }}
</p>
</div>
<div class="rounded-md border bg-background/70 p-3">
<p class="text-xs text-muted-foreground">
{{ t('bots.container.metricsLabels.memory') }}
</p>
<p class="mt-2 text-2xl font-semibold">
{{ memoryValueText }}
</p>
<p class="mt-2 text-[11px] text-muted-foreground">
{{ memoryHintText }}
</p>
</div>
<div class="rounded-md border bg-background/70 p-3">
<p class="text-xs text-muted-foreground">
{{ t('bots.container.metricsLabels.storage') }}
</p>
<p class="mt-2 text-2xl font-semibold">
{{ storageValueText }}
</p>
<p class="mt-2 text-[11px] text-muted-foreground break-all">
{{ t('bots.container.metricsPath') }}: {{ storagePathText }}
</p>
</div>
</div>
<p
v-if="sampledAtText !== '-'"
class="text-[11px] text-muted-foreground"
>
{{ t('bots.container.sampledAt') }}: {{ sampledAtText }}
</p>
</template>
</div>
</template>
<script setup lang="ts">
import { computed } from 'vue'
import { useI18n } from 'vue-i18n'
import { Spinner } from '@memohai/ui'
import type { HandlersGetContainerMetricsResponse } from '@memohai/sdk'
import { formatDateTime } from '@/utils/date-time'
// Inputs supplied by the parent container view.
const props = defineProps<{
  // Name of the container backend in use; compared against 'containerd' below.
  backend: string
  // True while the parent is fetching metrics.
  loading: boolean
  // Latest metrics response, or null when none has been loaded.
  metrics: HandlersGetContainerMetricsResponse | null
}>()
const { t } = useI18n()
// Shortcuts into the (possibly absent) metrics response; each is undefined
// when the corresponding section is missing.
const status = computed(() => props.metrics?.status)
const cpuMetrics = computed(() => props.metrics?.metrics?.cpu)
const memoryMetrics = computed(() => props.metrics?.metrics?.memory)
const storageMetrics = computed(() => props.metrics?.metrics?.storage)
// Unsupported when the backend is anything other than 'containerd', or when
// the response explicitly reports `supported: false`.
const backendUnsupported = computed(() =>
  props.backend !== 'containerd' || props.metrics?.supported === false,
)
// undefined until a response with a status section is available; the template
// only treats an explicit `false` as "stopped".
const taskRunning = computed(() => status.value?.task_running)
// True as soon as at least one of the CPU / memory / storage sections exists.
const hasAnyMetric = computed(() =>
  !!cpuMetrics.value || !!memoryMetrics.value || !!storageMetrics.value,
)
// Display strings for the three metric cards.
const cpuValueText = computed(() => formatPercent(cpuMetrics.value?.usage_percent))
const memoryValueText = computed(() => formatBytes(memoryMetrics.value?.usage_bytes))
const storageValueText = computed(() => formatBytes(storageMetrics.value?.used_bytes))
// Falls back to '-' when the path is missing or empty.
const storagePathText = computed(() => storageMetrics.value?.path || '-')
// NOTE(review): presumably formatDateTime returns the '-' fallback when
// sampled_at is missing; the template hides the line when the text is '-'.
const sampledAtText = computed(() =>
  formatDateTime(props.metrics?.sampled_at, { fallback: '-' }),
)
// Secondary line of the memory card:
//  - no memory section      -> "metrics unavailable" message
//  - no positive limit      -> "no memory limit configured" message
//  - positive limit         -> "<used> / <limit>" plus " (<percent>)" when known
const memoryHintText = computed(() => {
  const memory = memoryMetrics.value
  if (!memory) {
    return t('bots.container.metricsUnavailable')
  }

  const limit = memory.limit_bytes
  if (!limit || limit <= 0) {
    return t('bots.container.metricsUnlimited')
  }

  const percentText = formatPercent(memory.usage_percent)
  const percentSuffix = percentText === '--' ? '' : ` (${percentText})`
  return `${formatBytes(memory.usage_bytes)} / ${formatBytes(limit)}${percentSuffix}`
})
/**
 * Formats a byte count using binary units (B, KiB, MiB, GiB, TiB).
 *
 * @param value - Byte count; may be undefined when the metric is absent.
 * @returns `'--'` for missing, non-finite or negative input, `'0 B'` for zero,
 *   otherwise the scaled value with one fraction digit below 100 (none for
 *   plain bytes or for values that display as 100 or more).
 */
function formatBytes(value?: number): string {
  // Number.isFinite also rejects NaN and +/-Infinity ("Infinity TiB" is meaningless).
  if (typeof value !== 'number' || !Number.isFinite(value) || value < 0) return '--'
  if (value === 0) return '0 B'

  const units = ['B', 'KiB', 'MiB', 'GiB', 'TiB']
  const lastUnit = units.length - 1
  let size = value
  let unitIndex = 0
  while (size >= 1024 && unitIndex < lastUnit) {
    size /= 1024
    unitIndex += 1
  }

  // A value just below the next unit (e.g. 1023.999 KiB) would round up to
  // "1024 KiB"; promote it so it reads "1.0 MiB" instead.
  if (Math.round(size) >= 1024 && unitIndex < lastUnit) {
    size /= 1024
    unitIndex += 1
  }

  // Decide the precision from the rounded value so 99.97 prints "100", not "100.0".
  const fractionDigits = unitIndex === 0 || Number(size.toFixed(1)) >= 100 ? 0 : 1
  return `${size.toFixed(fractionDigits)} ${units[unitIndex]}`
}
/**
 * Formats a percentage for display.
 *
 * @param value - Percentage (0-100 scale, may exceed 100); may be undefined.
 * @returns `'--'` for missing, non-finite or negative input; otherwise the
 *   value with one fraction digit, or none once it displays as 100 or more.
 */
function formatPercent(value?: number): string {
  // Number.isFinite also rejects NaN and +/-Infinity.
  if (typeof value !== 'number' || !Number.isFinite(value) || value < 0) return '--'
  // Pick the precision from the rounded text so 99.96 prints "100%", matching
  // how an exact 100 is rendered, instead of "100.0%".
  const oneDecimal = value.toFixed(1)
  return Number(oneDecimal) >= 100 ? `${value.toFixed(0)}%` : `${oneDecimal}%`
}
</script>
+168
View File
@@ -0,0 +1,168 @@
package containerd
import (
"context"
"fmt"
"time"
cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats"
cgroup2stats "github.com/containerd/cgroups/v3/cgroup2/stats"
containerd "github.com/containerd/containerd/v2/client"
"google.golang.org/protobuf/proto"
"google.golang.org/protobuf/types/known/anypb"
)
// metricsSampleInterval is how long GetContainerMetrics waits between its two
// task metric samples; the CPU percentage is derived from the delta.
const metricsSampleInterval = 200 * time.Millisecond

// maxPracticalMemoryLimitBytes is the largest memory limit treated as a real
// limit. normalizeMemoryLimit maps anything above it to 0.
const maxPracticalMemoryLimitBytes = uint64(1) << 60
// taskMetricsSample is one point-in-time reading of a task's cgroup counters,
// normalized to common units regardless of cgroup version.
type taskMetricsSample struct {
	// timestamp is the metric's own timestamp when present, otherwise the
	// local time at which the sample was decoded.
	timestamp time.Time
	// CPU counters in nanoseconds (cgroup v2 microseconds are scaled up).
	cpuUsageNS  uint64
	cpuUserNS   uint64
	cpuKernelNS uint64
	// memoryUsage is the reported memory usage.
	memoryUsage uint64
	// memoryLimit is the limit after normalizeMemoryLimit; 0 when the limit is
	// unset or larger than maxPracticalMemoryLimitBytes.
	memoryLimit uint64
}
// GetContainerMetrics reads the task's metrics twice, metricsSampleInterval
// apart, and reports CPU usage derived from the delta together with the memory
// figures of the later sample.
//
// It returns the error from task lookup or sampling unchanged, and ctx.Err()
// when the context ends while waiting between the two samples.
func (s *DefaultService) GetContainerMetrics(ctx context.Context, containerID string) (ContainerMetrics, error) {
	task, ctx, err := s.getTask(ctx, containerID)
	if err != nil {
		return ContainerMetrics{}, err
	}

	baseline, err := sampleTaskMetrics(ctx, task)
	if err != nil {
		return ContainerMetrics{}, err
	}

	// Wait out the sampling window, but give up early on cancellation.
	wait := time.NewTimer(metricsSampleInterval)
	defer wait.Stop()
	select {
	case <-wait.C:
	case <-ctx.Done():
		return ContainerMetrics{}, ctx.Err()
	}

	latest, err := sampleTaskMetrics(ctx, task)
	if err != nil {
		return ContainerMetrics{}, err
	}

	var result ContainerMetrics
	result.SampledAt = latest.timestamp
	result.CPU = buildCPUMetrics(baseline, latest)
	result.Memory = buildMemoryMetrics(latest)
	return result, nil
}
// sampleTaskMetrics fetches the task's current metrics and decodes them into a
// taskMetricsSample.
//
// Both cgroup v1 and cgroup v2 payloads are understood. A missing payload, or a
// payload of any other message type, yields an error wrapping ErrNotSupported.
func sampleTaskMetrics(ctx context.Context, task containerd.Task) (taskMetricsSample, error) {
	var none taskMetricsSample

	metric, err := task.Metrics(ctx)
	if err != nil {
		return none, err
	}
	if metric == nil || metric.Data == nil {
		return none, ErrNotSupported
	}

	// Prefer the timestamp attached to the metric; fall back to local time.
	sampledAt := time.Now()
	if reported := metric.GetTimestamp(); reported != nil {
		sampledAt = reported.AsTime()
	}

	payload := metric.Data

	if payload.MessageIs(&cgroup1stats.Metrics{}) {
		var v1 cgroup1stats.Metrics
		if err := anypb.UnmarshalTo(payload, &v1, proto.UnmarshalOptions{}); err != nil {
			return none, fmt.Errorf("decode cgroup v1 metrics: %w", err)
		}
		return sampleFromCgroup1(sampledAt, &v1), nil
	}

	if payload.MessageIs(&cgroup2stats.Metrics{}) {
		var v2 cgroup2stats.Metrics
		if err := anypb.UnmarshalTo(payload, &v2, proto.UnmarshalOptions{}); err != nil {
			return none, fmt.Errorf("decode cgroup v2 metrics: %w", err)
		}
		return sampleFromCgroup2(sampledAt, &v2), nil
	}

	// Unknown payload: decode it only to name its concrete type in the error.
	unknown, decodeErr := anypb.UnmarshalNew(payload, proto.UnmarshalOptions{})
	if decodeErr != nil {
		return none, fmt.Errorf("decode task metrics: %w", decodeErr)
	}
	return none, fmt.Errorf("%w: unsupported task metrics type %T", ErrNotSupported, unknown)
}
// sampleFromCgroup1 converts cgroup v1 stats into a taskMetricsSample stamped
// with timestamp. Sections that are absent leave their fields at zero; nil
// stats produce a sample carrying only the timestamp.
func sampleFromCgroup1(timestamp time.Time, stats *cgroup1stats.Metrics) taskMetricsSample {
	out := taskMetricsSample{timestamp: timestamp}
	if stats == nil {
		return out
	}

	if cpuStat := stats.GetCPU(); cpuStat != nil {
		cpuUsage := cpuStat.GetUsage()
		out.cpuUsageNS, out.cpuUserNS, out.cpuKernelNS = cpuUsage.GetTotal(), cpuUsage.GetUser(), cpuUsage.GetKernel()
	}

	if memStat := stats.GetMemory(); memStat != nil {
		usageEntry := memStat.GetUsage()
		out.memoryUsage = usageEntry.GetUsage()
		out.memoryLimit = normalizeMemoryLimit(usageEntry.GetLimit())
	}

	return out
}
// sampleFromCgroup2 converts cgroup v2 stats into a taskMetricsSample stamped
// with timestamp. CPU counters arrive in microseconds and are scaled to
// nanoseconds. Absent sections leave their fields at zero; nil stats produce a
// sample carrying only the timestamp.
func sampleFromCgroup2(timestamp time.Time, stats *cgroup2stats.Metrics) taskMetricsSample {
	const nanosPerMicro uint64 = 1_000

	out := taskMetricsSample{timestamp: timestamp}
	if stats == nil {
		return out
	}

	if cpuStat := stats.GetCPU(); cpuStat != nil {
		out.cpuUsageNS = cpuStat.GetUsageUsec() * nanosPerMicro
		out.cpuUserNS = cpuStat.GetUserUsec() * nanosPerMicro
		out.cpuKernelNS = cpuStat.GetSystemUsec() * nanosPerMicro
	}

	if memStat := stats.GetMemory(); memStat != nil {
		out.memoryUsage = memStat.GetUsage()
		out.memoryLimit = normalizeMemoryLimit(memStat.GetUsageLimit())
	}

	return out
}
// buildCPUMetrics reports the counters of the later sample and derives the CPU
// usage percentage from the delta between the two samples.
//
// UsagePercent is CPU time consumed divided by wall-clock time elapsed, times
// 100. It is not divided by a CPU count, so it can exceed 100 when more CPU
// time than wall time was consumed.
//
// UsagePercent stays 0 when no rate can be computed: the samples' timestamps
// are not strictly increasing, or the cumulative counter went backwards.
func buildCPUMetrics(first, second taskMetricsSample) *CPUMetrics {
	metrics := &CPUMetrics{
		UsageNanoseconds:  second.cpuUsageNS,
		UserNanoseconds:   second.cpuUserNS,
		KernelNanoseconds: second.cpuKernelNS,
	}

	elapsedNS := second.timestamp.Sub(first.timestamp).Nanoseconds()
	if elapsedNS <= 0 || second.cpuUsageNS < first.cpuUsageNS {
		return metrics
	}

	// Both operands are non-negative here (guarded above), so the result needs
	// no clamping.
	usedNS := second.cpuUsageNS - first.cpuUsageNS
	metrics.UsagePercent = (float64(usedNS) / float64(elapsedNS)) * 100
	return metrics
}
// buildMemoryMetrics reports memory usage and limit from a single sample.
// UsagePercent is usage relative to the limit and is left at 0 when the sample
// carries no limit.
func buildMemoryMetrics(sample taskMetricsSample) *MemoryMetrics {
	var percent float64
	if limit := sample.memoryLimit; limit > 0 {
		percent = (float64(sample.memoryUsage) / float64(limit)) * 100
	}

	return &MemoryMetrics{
		UsageBytes:   sample.memoryUsage,
		LimitBytes:   sample.memoryLimit,
		UsagePercent: percent,
	}
}
// normalizeMemoryLimit returns limit unchanged when it is a plausible memory
// limit, and 0 when it is unset (already 0) or larger than
// maxPracticalMemoryLimitBytes.
func normalizeMemoryLimit(limit uint64) uint64 {
	if limit > maxPracticalMemoryLimitBytes {
		return 0
	}
	// A zero limit passes through as zero.
	return limit
}
+88
View File
@@ -0,0 +1,88 @@
package containerd
import (
"testing"
"time"
cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats"
cgroup2stats "github.com/containerd/cgroups/v3/cgroup2/stats"
)
// TestBuildCPUMetricsUsesCumulativeDelta checks that the CPU percentage comes
// from the counter delta over the elapsed time (100ms of CPU across a 200ms
// window is 50%) and that the reported counters are those of the later sample.
func TestBuildCPUMetricsUsesCumulativeDelta(t *testing.T) {
	start := time.Unix(0, 0)
	first := taskMetricsSample{
		timestamp:   start,
		cpuUsageNS:  100_000_000,
		cpuUserNS:   60_000_000,
		cpuKernelNS: 40_000_000,
	}
	second := taskMetricsSample{
		timestamp:   start.Add(200 * time.Millisecond),
		cpuUsageNS:  200_000_000,
		cpuUserNS:   120_000_000,
		cpuKernelNS: 80_000_000,
	}
	metrics := buildCPUMetrics(first, second)
	if metrics == nil {
		t.Fatal("expected cpu metrics")
	}
	if metrics.UsagePercent != 50 {
		t.Fatalf("expected cpu usage percent 50, got %v", metrics.UsagePercent)
	}
	if metrics.UsageNanoseconds != second.cpuUsageNS {
		t.Fatalf("expected latest cpu usage %d, got %d", second.cpuUsageNS, metrics.UsageNanoseconds)
	}
}
// TestSampleFromCgroup1 checks that cgroup v1 CPU and memory fields are copied
// into the sample unchanged (v1 CPU counters are taken as nanoseconds).
func TestSampleFromCgroup1(t *testing.T) {
	sample := sampleFromCgroup1(time.Unix(1, 0), &cgroup1stats.Metrics{
		CPU: &cgroup1stats.CPUStat{
			Usage: &cgroup1stats.CPUUsage{
				Total:  12,
				User:   7,
				Kernel: 5,
			},
		},
		Memory: &cgroup1stats.MemoryStat{
			Usage: &cgroup1stats.MemoryEntry{
				Usage: 4096,
				Limit: 8192,
			},
		},
	})
	if sample.cpuUsageNS != 12 || sample.cpuUserNS != 7 || sample.cpuKernelNS != 5 {
		t.Fatalf("unexpected cpu sample: %+v", sample)
	}
	if sample.memoryUsage != 4096 || sample.memoryLimit != 8192 {
		t.Fatalf("unexpected memory sample: %+v", sample)
	}
}
// TestSampleFromCgroup2 checks that cgroup v2 CPU counters are scaled from
// microseconds to nanoseconds and that memory usage and limit are copied.
func TestSampleFromCgroup2(t *testing.T) {
	sample := sampleFromCgroup2(time.Unix(2, 0), &cgroup2stats.Metrics{
		CPU: &cgroup2stats.CPUStat{
			UsageUsec:  12,
			UserUsec:   7,
			SystemUsec: 5,
		},
		Memory: &cgroup2stats.MemoryStat{
			Usage:      16_384,
			UsageLimit: 32_768,
		},
	})
	// 12/7/5 microseconds become 12_000/7_000/5_000 nanoseconds.
	if sample.cpuUsageNS != 12_000 || sample.cpuUserNS != 7_000 || sample.cpuKernelNS != 5_000 {
		t.Fatalf("unexpected cpu sample: %+v", sample)
	}
	if sample.memoryUsage != 16_384 || sample.memoryLimit != 32_768 {
		t.Fatalf("unexpected memory sample: %+v", sample)
	}
}
// TestNormalizeMemoryLimitTreatsHugeValueAsUnlimited checks that a limit just
// above maxPracticalMemoryLimitBytes is normalized to 0.
func TestNormalizeMemoryLimitTreatsHugeValueAsUnlimited(t *testing.T) {
	if got := normalizeMemoryLimit(maxPracticalMemoryLimitBytes + 1); got != 0 {
		t.Fatalf("expected unlimited memory limit to normalize to 0, got %d", got)
	}
}
+1
View File
@@ -100,6 +100,7 @@ type Service interface {
StopContainer(ctx context.Context, containerID string, opts *StopTaskOptions) error StopContainer(ctx context.Context, containerID string, opts *StopTaskOptions) error
DeleteTask(ctx context.Context, containerID string, opts *DeleteTaskOptions) error DeleteTask(ctx context.Context, containerID string, opts *DeleteTaskOptions) error
GetTaskInfo(ctx context.Context, containerID string) (TaskInfo, error) GetTaskInfo(ctx context.Context, containerID string) (TaskInfo, error)
GetContainerMetrics(ctx context.Context, containerID string) (ContainerMetrics, error)
ListTasks(ctx context.Context, opts *ListTasksOptions) ([]TaskInfo, error) ListTasks(ctx context.Context, opts *ListTasksOptions) ([]TaskInfo, error)
SetupNetwork(ctx context.Context, req NetworkSetupRequest) (NetworkResult, error) SetupNetwork(ctx context.Context, req NetworkSetupRequest) (NetworkResult, error)
RemoveNetwork(ctx context.Context, req NetworkSetupRequest) error RemoveNetwork(ctx context.Context, req NetworkSetupRequest) error
+4
View File
@@ -327,6 +327,10 @@ func (s *AppleService) GetTaskInfo(ctx context.Context, containerID string) (Tas
}, nil }, nil
} }
// GetContainerMetrics always returns ErrNotSupported: this backend provides no
// runtime metrics.
func (*AppleService) GetContainerMetrics(context.Context, string) (ContainerMetrics, error) {
	return ContainerMetrics{}, ErrNotSupported
}
func (s *AppleService) ListTasks(ctx context.Context, opts *ListTasksOptions) ([]TaskInfo, error) { func (s *AppleService) ListTasks(ctx context.Context, opts *ListTasksOptions) ([]TaskInfo, error) {
if err := s.ensureHealthy(ctx); err != nil { if err := s.ensureHealthy(ctx); err != nil {
return nil, err return nil, err
+19
View File
@@ -61,6 +61,25 @@ type TaskInfo struct {
ExitCode uint32 ExitCode uint32
} }
// ContainerMetrics is the runtime metrics snapshot returned by
// Service.GetContainerMetrics.
type ContainerMetrics struct {
	// SampledAt is the time of the sample the snapshot is based on.
	SampledAt time.Time
	// CPU and Memory are pointers so that a section can be absent (nil).
	CPU    *CPUMetrics
	Memory *MemoryMetrics
}
// CPUMetrics describes a container's CPU consumption.
type CPUMetrics struct {
	// UsagePercent is CPU time used relative to elapsed wall-clock time, on a
	// 0-100 scale per fully used CPU.
	UsagePercent float64
	// CPU time counters in nanoseconds, as reported by the latest sample.
	UsageNanoseconds  uint64
	UserNanoseconds   uint64
	KernelNanoseconds uint64
}
// MemoryMetrics describes a container's memory consumption.
type MemoryMetrics struct {
	// UsageBytes is the memory currently in use.
	UsageBytes uint64
	// LimitBytes is the memory limit; 0 means no usable limit is known.
	LimitBytes uint64
	// UsagePercent is usage relative to the limit (0-100 scale); 0 when
	// LimitBytes is 0.
	UsagePercent float64
}
type SnapshotInfo struct { type SnapshotInfo struct {
Name string Name string
Parent string Parent string
+112
View File
@@ -106,6 +106,44 @@ type GetContainerResponse struct {
UpdatedAt time.Time `json:"updated_at"` UpdatedAt time.Time `json:"updated_at"`
} }
// ContainerMetricsStatusResponse reports whether the bot's container exists
// and whether its task is currently running.
type ContainerMetricsStatusResponse struct {
	Exists      bool `json:"exists"`
	TaskRunning bool `json:"task_running"`
}
// ContainerCPUMetricsResponse is the JSON form of the container's CPU
// metrics: a usage percentage plus CPU time counters in nanoseconds.
type ContainerCPUMetricsResponse struct {
	UsagePercent      float64 `json:"usage_percent"`
	UsageNanoseconds  uint64  `json:"usage_nanoseconds"`
	UserNanoseconds   uint64  `json:"user_nanoseconds"`
	KernelNanoseconds uint64  `json:"kernel_nanoseconds"`
}
// ContainerMemoryMetricsResponse is the JSON form of the container's memory
// metrics: usage and limit in bytes plus usage as a percentage of the limit.
type ContainerMemoryMetricsResponse struct {
	UsageBytes   uint64  `json:"usage_bytes"`
	LimitBytes   uint64  `json:"limit_bytes"`
	UsagePercent float64 `json:"usage_percent"`
}
// ContainerStorageMetricsResponse is the JSON form of the container's storage
// usage: the measured path and the bytes used beneath it.
type ContainerStorageMetricsResponse struct {
	Path      string `json:"path"`
	UsedBytes uint64 `json:"used_bytes"`
}
// ContainerMetricsPayloadResponse groups the individual metric sections. Each
// section is omitted from the JSON when it is nil.
type ContainerMetricsPayloadResponse struct {
	CPU     *ContainerCPUMetricsResponse     `json:"cpu,omitempty"`
	Memory  *ContainerMemoryMetricsResponse  `json:"memory,omitempty"`
	Storage *ContainerStorageMetricsResponse `json:"storage,omitempty"`
}
// GetContainerMetricsResponse is the body returned by
// GET /bots/{bot_id}/container/metrics.
type GetContainerMetricsResponse struct {
	// Supported is false when the backend cannot provide metrics;
	// UnsupportedReason then carries a reason code.
	Supported         bool                            `json:"supported"`
	Backend           string                          `json:"backend"`
	UnsupportedReason string                          `json:"unsupported_reason,omitempty"`
	Status            ContainerMetricsStatusResponse  `json:"status"`
	Metrics           ContainerMetricsPayloadResponse `json:"metrics"`
	// SampledAt is omitted when no sample time is available.
	SampledAt *time.Time `json:"sampled_at,omitempty"`
}
type RollbackRequest struct { type RollbackRequest struct {
Version int `json:"version"` Version int `json:"version"`
} }
@@ -163,6 +201,7 @@ func (h *ContainerdHandler) Register(e *echo.Echo) {
group := e.Group("/bots/:bot_id/container") group := e.Group("/bots/:bot_id/container")
group.POST("", h.CreateContainer) group.POST("", h.CreateContainer)
group.GET("", h.GetContainer) group.GET("", h.GetContainer)
group.GET("/metrics", h.GetContainerMetrics)
group.DELETE("", h.DeleteContainer) group.DELETE("", h.DeleteContainer)
group.POST("/start", h.StartContainer) group.POST("/start", h.StartContainer)
group.POST("/stop", h.StopContainer) group.POST("/stop", h.StopContainer)
@@ -400,6 +439,46 @@ func (h *ContainerdHandler) GetContainer(c echo.Context) error {
}) })
} }
// GetContainerMetrics godoc
// @Summary Get current container metrics for bot
// @Tags containerd
// @Param bot_id path string true "Bot ID"
// @Success 200 {object} GetContainerMetricsResponse
// @Failure 500 {object} ErrorResponse
// @Router /bots/{bot_id}/container/metrics [get].
func (h *ContainerdHandler) GetContainerMetrics(c echo.Context) error {
	botID, err := h.requireBotAccess(c)
	if err != nil {
		return err
	}

	result, err := h.manager.GetContainerMetrics(c.Request().Context(), botID)
	if err != nil {
		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
	}

	// Only expose a sample time when one was actually recorded; the copy keeps
	// the response from pointing into the manager's result.
	var sampledAt *time.Time
	if !result.SampledAt.IsZero() {
		ts := result.SampledAt
		sampledAt = &ts
	}

	status := ContainerMetricsStatusResponse{
		Exists:      result.Status.Exists,
		TaskRunning: result.Status.TaskRunning,
	}
	payload := ContainerMetricsPayloadResponse{
		CPU:     toContainerCPUMetricsResponse(result.CPU),
		Memory:  toContainerMemoryMetricsResponse(result.Memory),
		Storage: toContainerStorageMetricsResponse(result.Storage),
	}

	return c.JSON(http.StatusOK, GetContainerMetricsResponse{
		Supported:         result.Supported,
		Backend:           h.containerBackend,
		UnsupportedReason: result.UnsupportedReason,
		Status:            status,
		Metrics:           payload,
		SampledAt:         sampledAt,
	})
}
// DeleteContainer godoc // DeleteContainer godoc
// @Summary Delete MCP container for bot // @Summary Delete MCP container for bot
// @Tags containerd // @Tags containerd
@@ -763,6 +842,39 @@ func (h *ContainerdHandler) RestorePreservedData(c echo.Context) error {
return c.JSON(http.StatusOK, map[string]bool{"restored": true}) return c.JSON(http.StatusOK, map[string]bool{"restored": true})
} }
// toContainerCPUMetricsResponse maps service CPU metrics onto the API type,
// passing nil through so an absent section stays absent.
func toContainerCPUMetricsResponse(metrics *ctr.CPUMetrics) *ContainerCPUMetricsResponse {
	if metrics == nil {
		return nil
	}
	out := new(ContainerCPUMetricsResponse)
	out.UsagePercent = metrics.UsagePercent
	out.UsageNanoseconds = metrics.UsageNanoseconds
	out.UserNanoseconds = metrics.UserNanoseconds
	out.KernelNanoseconds = metrics.KernelNanoseconds
	return out
}
// toContainerMemoryMetricsResponse maps service memory metrics onto the API
// type, passing nil through so an absent section stays absent.
func toContainerMemoryMetricsResponse(metrics *ctr.MemoryMetrics) *ContainerMemoryMetricsResponse {
	if metrics == nil {
		return nil
	}
	out := new(ContainerMemoryMetricsResponse)
	out.UsageBytes = metrics.UsageBytes
	out.LimitBytes = metrics.LimitBytes
	out.UsagePercent = metrics.UsagePercent
	return out
}
// toContainerStorageMetricsResponse maps workspace storage metrics onto the
// API type, passing nil through so an absent section stays absent.
func toContainerStorageMetricsResponse(metrics *workspace.ContainerStorageMetrics) *ContainerStorageMetricsResponse {
	if metrics == nil {
		return nil
	}
	out := new(ContainerStorageMetricsResponse)
	out.Path = metrics.Path
	out.UsedBytes = metrics.UsedBytes
	return out
}
func snapshotLineage(root string, all []ctr.SnapshotInfo) ([]ctr.SnapshotInfo, bool) { func snapshotLineage(root string, all []ctr.SnapshotInfo) ([]ctr.SnapshotInfo, bool) {
root = strings.TrimSpace(root) root = strings.TrimSpace(root)
if root == "" { if root == "" {
+20
View File
@@ -51,6 +51,26 @@ type ContainerStatus struct {
UpdatedAt time.Time `json:"updated_at"` UpdatedAt time.Time `json:"updated_at"`
} }
// ContainerMetricsStatus records whether the bot's container exists and
// whether its task is currently running.
type ContainerMetricsStatus struct {
	Exists      bool `json:"exists"`
	TaskRunning bool `json:"task_running"`
}
// ContainerStorageMetrics reports the bytes used beneath Path in the
// container's filesystem.
type ContainerStorageMetrics struct {
	Path      string `json:"path"`
	UsedBytes uint64 `json:"used_bytes"`
}
// ContainerMetricsResult is what Manager.GetContainerMetrics returns: the
// container status plus whichever metric sections could be collected.
type ContainerMetricsResult struct {
	// Supported is false when the backend cannot provide runtime metrics;
	// UnsupportedReason then holds a reason code.
	Supported         bool
	UnsupportedReason string
	Status            ContainerMetricsStatus
	// SampledAt is the zero time when nothing was sampled.
	SampledAt time.Time
	// Each section is nil when it is unavailable.
	CPU     *ctr.CPUMetrics
	Memory  *ctr.MemoryMetrics
	Storage *ContainerStorageMetrics
}
type Manager struct { type Manager struct {
service ctr.Service service ctr.Service
cfg config.WorkspaceConfig cfg config.WorkspaceConfig
@@ -111,6 +111,10 @@ func (*legacyRouteTestService) GetTaskInfo(context.Context, string) (ctr.TaskInf
return ctr.TaskInfo{}, errdefs.ErrNotFound return ctr.TaskInfo{}, errdefs.ErrNotFound
} }
// GetContainerMetrics is a test stub that always reports ctr.ErrNotSupported.
func (*legacyRouteTestService) GetContainerMetrics(context.Context, string) (ctr.ContainerMetrics, error) {
	return ctr.ContainerMetrics{}, ctr.ErrNotSupported
}
func (*legacyRouteTestService) ListTasks(context.Context, *ctr.ListTasksOptions) ([]ctr.TaskInfo, error) { func (*legacyRouteTestService) ListTasks(context.Context, *ctr.ListTasksOptions) ([]ctr.TaskInfo, error) {
return nil, nil return nil, nil
} }
+142
View File
@@ -0,0 +1,142 @@
package workspace
import (
"context"
"errors"
"io/fs"
"os"
"path/filepath"
"time"
"github.com/containerd/containerd/v2/core/mount"
"github.com/containerd/errdefs"
ctr "github.com/memohai/memoh/internal/containerd"
)
// unsupportedReasonBackend is the reason code reported when the container
// backend returns ctr.ErrNotSupported for runtime metrics.
const unsupportedReasonBackend = "backend_not_supported"
// GetContainerMetrics collects the current status and resource metrics for the
// bot's container.
//
// A missing container is not an error: the result is returned with
// Status.Exists false and no metric sections. Runtime (CPU/memory) metrics are
// left nil when the task cannot be found, and Supported is set to false when
// the backend reports ctr.ErrNotSupported. Storage usage is collected only
// while Supported is true. Any other failure is returned as an error with a
// nil result.
func (m *Manager) GetContainerMetrics(ctx context.Context, botID string) (*ContainerMetricsResult, error) {
	// Start optimistic; the steps below downgrade the result as needed.
	result := &ContainerMetricsResult{
		Supported: true,
		Status: ContainerMetricsStatus{
			Exists: false,
		},
	}
	containerID, err := m.ContainerID(ctx, botID)
	if err != nil {
		// No container recorded for this bot: report "does not exist".
		if errors.Is(err, ErrContainerNotFound) {
			return result, nil
		}
		return nil, err
	}
	info, err := m.service.GetContainer(ctx, containerID)
	if err != nil {
		// The backend does not know the container: same "does not exist" result.
		if errdefs.IsNotFound(err) {
			return result, nil
		}
		return nil, err
	}
	result.Status.Exists = true
	taskInfo, err := m.service.GetTaskInfo(ctx, containerID)
	if err == nil {
		result.Status.TaskRunning = taskInfo.Status == ctr.TaskStatusRunning
	} else if !errdefs.IsNotFound(err) {
		// A missing task simply leaves TaskRunning false; anything else is fatal.
		return nil, err
	}
	runtimeMetrics, err := m.service.GetContainerMetrics(ctx, containerID)
	switch {
	case err == nil:
		result.CPU = runtimeMetrics.CPU
		result.Memory = runtimeMetrics.Memory
		result.SampledAt = runtimeMetrics.SampledAt
	case errors.Is(err, ctr.ErrNotSupported):
		result.Supported = false
		result.UnsupportedReason = unsupportedReasonBackend
	case errdefs.IsNotFound(err):
		// Task is not running, so CPU and memory metrics are unavailable.
	default:
		return nil, err
	}
	if result.Supported {
		storage, err := m.collectStorageMetrics(ctx, info)
		if err != nil {
			// Storage measurement unsupported: return what was gathered so far,
			// without a storage section.
			if errors.Is(err, ctr.ErrNotSupported) {
				return result, nil
			}
			return nil, err
		}
		result.Storage = storage
		// Storage was measured without a runtime sample (e.g. task not found):
		// stamp the result with the current time instead.
		if result.SampledAt.IsZero() {
			result.SampledAt = time.Now()
		}
	}
	return result, nil
}
// collectStorageMetrics measures how many bytes the container's filesystem
// uses by mounting its snapshot read-only at a temporary location and summing
// the file sizes beneath it. The reported path is always "/".
//
// It returns ctr.ErrNotSupported when snapshot mounts are unavailable
// (errMountNotSupported). A mount root that does not exist counts as zero
// bytes used.
func (m *Manager) collectStorageMetrics(ctx context.Context, info ctr.ContainerInfo) (*ContainerStorageMetrics, error) {
	mounts, err := m.snapshotMounts(ctx, info)
	switch {
	case err == nil:
		// proceed with the measurement below
	case errors.Is(err, errMountNotSupported):
		return nil, ctr.ErrNotSupported
	default:
		return nil, err
	}

	metrics := &ContainerStorageMetrics{Path: "/"}
	measure := func(root string) error {
		if _, statErr := os.Stat(root); statErr != nil {
			if os.IsNotExist(statErr) {
				return nil
			}
			return statErr
		}
		total, sizeErr := dirSize(root)
		if sizeErr != nil {
			return sizeErr
		}
		metrics.UsedBytes = total
		return nil
	}

	if err := mount.WithReadonlyTempMount(ctx, mounts, measure); err != nil {
		return nil, err
	}
	return metrics, nil
}
func dirSize(root string) (uint64, error) {
var size uint64
err := filepath.WalkDir(root, func(_ string, entry fs.DirEntry, walkErr error) error {
if walkErr != nil {
return walkErr
}
if entry.IsDir() {
return nil
}
info, err := entry.Info()
if err != nil {
return err
}
fileSize := info.Size()
if fileSize > 0 {
size += uint64(fileSize) //nolint:gosec // file sizes are checked to be positive before conversion
}
return nil
})
if err != nil {
return 0, err
}
return size, nil
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+68
View File
@@ -882,10 +882,39 @@ export type HandlersChannelMeta = {
user_config_schema?: ChannelConfigSchema; user_config_schema?: ChannelConfigSchema;
}; };
export type HandlersContainerCpuMetricsResponse = {
kernel_nanoseconds?: number;
usage_nanoseconds?: number;
usage_percent?: number;
user_nanoseconds?: number;
};
export type HandlersContainerGpuRequest = { export type HandlersContainerGpuRequest = {
devices?: Array<string>; devices?: Array<string>;
}; };
/**
 * Memory section of a container metrics payload; referenced by the `memory`
 * field of `HandlersContainerMetricsPayloadResponse`. Every field is optional.
 *
 * NOTE(review): how an unlimited container is represented in `limit_bytes`
 * (absent vs. zero) is not visible here — confirm before relying on it.
 */
export type HandlersContainerMemoryMetricsResponse = {
limit_bytes?: number;
usage_bytes?: number;
usage_percent?: number;
};
/**
 * Groups the per-resource metric sections (CPU, memory, storage) returned in
 * the `metrics` field of `HandlersGetContainerMetricsResponse`. Each section
 * is optional, so consumers must handle any of them being absent.
 */
export type HandlersContainerMetricsPayloadResponse = {
cpu?: HandlersContainerCpuMetricsResponse;
memory?: HandlersContainerMemoryMetricsResponse;
storage?: HandlersContainerStorageMetricsResponse;
};
/**
 * Container state flags returned in the `status` field of
 * `HandlersGetContainerMetricsResponse`. Both flags are optional.
 */
export type HandlersContainerMetricsStatusResponse = {
exists?: boolean;
task_running?: boolean;
};
/**
 * Storage section of a container metrics payload; referenced by the `storage`
 * field of `HandlersContainerMetricsPayloadResponse`. Carries the measured
 * path and the number of bytes used; both fields are optional.
 */
export type HandlersContainerStorageMetricsResponse = {
path?: string;
used_bytes?: number;
};
export type HandlersContextUsage = { export type HandlersContextUsage = {
context_window?: number; context_window?: number;
used_tokens?: number; used_tokens?: number;
@@ -979,6 +1008,15 @@ export type HandlersFsWriteRequest = {
path?: string; path?: string;
}; };
/**
 * Body of the 200 response for `/bots/{bot_id}/container/metrics`
 * (see `GetBotsByBotIdContainerMetricsResponses`).
 *
 * Combines the backend name, a support flag with an optional reason, the
 * container status, the metric sections and a sample timestamp. All fields
 * are optional.
 *
 * NOTE(review): `sampled_at` is typed as a plain string; its format
 * (presumably an RFC 3339 timestamp) is not visible here — confirm.
 */
export type HandlersGetContainerMetricsResponse = {
backend?: string;
metrics?: HandlersContainerMetricsPayloadResponse;
sampled_at?: string;
status?: HandlersContainerMetricsStatusResponse;
supported?: boolean;
unsupported_reason?: string;
};
export type HandlersGetContainerResponse = { export type HandlersGetContainerResponse = {
cdi_devices?: Array<string>; cdi_devices?: Array<string>;
container_id?: string; container_id?: string;
@@ -3038,6 +3076,36 @@ export type PostBotsByBotIdContainerFsWriteResponses = {
export type PostBotsByBotIdContainerFsWriteResponse = PostBotsByBotIdContainerFsWriteResponses[keyof PostBotsByBotIdContainerFsWriteResponses]; export type PostBotsByBotIdContainerFsWriteResponse = PostBotsByBotIdContainerFsWriteResponses[keyof PostBotsByBotIdContainerFsWriteResponses];
/**
 * Request shape for the container metrics endpoint
 * `/bots/{bot_id}/container/metrics`: the only input is the `bot_id` path
 * parameter; a body and query parameters are not accepted (`never`).
 */
export type GetBotsByBotIdContainerMetricsData = {
body?: never;
path: {
/**
* Bot ID
*/
bot_id: string;
};
query?: never;
url: '/bots/{bot_id}/container/metrics';
};
/**
 * Error responses of the container metrics endpoint, keyed by HTTP status
 * code. Only status 500 is declared.
 */
export type GetBotsByBotIdContainerMetricsErrors = {
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
/** Union of every error body declared in `GetBotsByBotIdContainerMetricsErrors`. */
export type GetBotsByBotIdContainerMetricsError = GetBotsByBotIdContainerMetricsErrors[keyof GetBotsByBotIdContainerMetricsErrors];
/**
 * Successful responses of the container metrics endpoint, keyed by HTTP
 * status code. Only status 200 is declared.
 */
export type GetBotsByBotIdContainerMetricsResponses = {
/**
* OK
*/
200: HandlersGetContainerMetricsResponse;
};
/** Union of every success body declared in `GetBotsByBotIdContainerMetricsResponses`. */
export type GetBotsByBotIdContainerMetricsResponse = GetBotsByBotIdContainerMetricsResponses[keyof GetBotsByBotIdContainerMetricsResponses];
export type DeleteBotsByBotIdContainerSkillsData = { export type DeleteBotsByBotIdContainerSkillsData = {
/** /**
* Delete skills payload * Delete skills payload
+121
View File
@@ -1503,6 +1503,37 @@ const docTemplate = `{
} }
} }
}, },
"/bots/{bot_id}/container/metrics": {
"get": {
"tags": [
"containerd"
],
"summary": "Get current container metrics for bot",
"parameters": [
{
"type": "string",
"description": "Bot ID",
"name": "bot_id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/handlers.GetContainerMetricsResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
}
}
}
},
"/bots/{bot_id}/container/skills": { "/bots/{bot_id}/container/skills": {
"get": { "get": {
"tags": [ "tags": [
@@ -11653,6 +11684,23 @@ const docTemplate = `{
} }
} }
}, },
"handlers.ContainerCPUMetricsResponse": {
"type": "object",
"properties": {
"kernel_nanoseconds": {
"type": "integer"
},
"usage_nanoseconds": {
"type": "integer"
},
"usage_percent": {
"type": "number"
},
"user_nanoseconds": {
"type": "integer"
}
}
},
"handlers.ContainerGPURequest": { "handlers.ContainerGPURequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@@ -11664,6 +11712,56 @@ const docTemplate = `{
} }
} }
}, },
"handlers.ContainerMemoryMetricsResponse": {
"type": "object",
"properties": {
"limit_bytes": {
"type": "integer"
},
"usage_bytes": {
"type": "integer"
},
"usage_percent": {
"type": "number"
}
}
},
"handlers.ContainerMetricsPayloadResponse": {
"type": "object",
"properties": {
"cpu": {
"$ref": "#/definitions/handlers.ContainerCPUMetricsResponse"
},
"memory": {
"$ref": "#/definitions/handlers.ContainerMemoryMetricsResponse"
},
"storage": {
"$ref": "#/definitions/handlers.ContainerStorageMetricsResponse"
}
}
},
"handlers.ContainerMetricsStatusResponse": {
"type": "object",
"properties": {
"exists": {
"type": "boolean"
},
"task_running": {
"type": "boolean"
}
}
},
"handlers.ContainerStorageMetricsResponse": {
"type": "object",
"properties": {
"path": {
"type": "string"
},
"used_bytes": {
"type": "integer"
}
}
},
"handlers.ContextUsage": { "handlers.ContextUsage": {
"type": "object", "type": "object",
"properties": { "properties": {
@@ -11889,6 +11987,29 @@ const docTemplate = `{
} }
} }
}, },
"handlers.GetContainerMetricsResponse": {
"type": "object",
"properties": {
"backend": {
"type": "string"
},
"metrics": {
"$ref": "#/definitions/handlers.ContainerMetricsPayloadResponse"
},
"sampled_at": {
"type": "string"
},
"status": {
"$ref": "#/definitions/handlers.ContainerMetricsStatusResponse"
},
"supported": {
"type": "boolean"
},
"unsupported_reason": {
"type": "string"
}
}
},
"handlers.GetContainerResponse": { "handlers.GetContainerResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
+121
View File
@@ -1494,6 +1494,37 @@
} }
} }
}, },
"/bots/{bot_id}/container/metrics": {
"get": {
"tags": [
"containerd"
],
"summary": "Get current container metrics for bot",
"parameters": [
{
"type": "string",
"description": "Bot ID",
"name": "bot_id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/handlers.GetContainerMetricsResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
}
}
}
},
"/bots/{bot_id}/container/skills": { "/bots/{bot_id}/container/skills": {
"get": { "get": {
"tags": [ "tags": [
@@ -11644,6 +11675,23 @@
} }
} }
}, },
"handlers.ContainerCPUMetricsResponse": {
"type": "object",
"properties": {
"kernel_nanoseconds": {
"type": "integer"
},
"usage_nanoseconds": {
"type": "integer"
},
"usage_percent": {
"type": "number"
},
"user_nanoseconds": {
"type": "integer"
}
}
},
"handlers.ContainerGPURequest": { "handlers.ContainerGPURequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@@ -11655,6 +11703,56 @@
} }
} }
}, },
"handlers.ContainerMemoryMetricsResponse": {
"type": "object",
"properties": {
"limit_bytes": {
"type": "integer"
},
"usage_bytes": {
"type": "integer"
},
"usage_percent": {
"type": "number"
}
}
},
"handlers.ContainerMetricsPayloadResponse": {
"type": "object",
"properties": {
"cpu": {
"$ref": "#/definitions/handlers.ContainerCPUMetricsResponse"
},
"memory": {
"$ref": "#/definitions/handlers.ContainerMemoryMetricsResponse"
},
"storage": {
"$ref": "#/definitions/handlers.ContainerStorageMetricsResponse"
}
}
},
"handlers.ContainerMetricsStatusResponse": {
"type": "object",
"properties": {
"exists": {
"type": "boolean"
},
"task_running": {
"type": "boolean"
}
}
},
"handlers.ContainerStorageMetricsResponse": {
"type": "object",
"properties": {
"path": {
"type": "string"
},
"used_bytes": {
"type": "integer"
}
}
},
"handlers.ContextUsage": { "handlers.ContextUsage": {
"type": "object", "type": "object",
"properties": { "properties": {
@@ -11880,6 +11978,29 @@
} }
} }
}, },
"handlers.GetContainerMetricsResponse": {
"type": "object",
"properties": {
"backend": {
"type": "string"
},
"metrics": {
"$ref": "#/definitions/handlers.ContainerMetricsPayloadResponse"
},
"sampled_at": {
"type": "string"
},
"status": {
"$ref": "#/definitions/handlers.ContainerMetricsStatusResponse"
},
"supported": {
"type": "boolean"
},
"unsupported_reason": {
"type": "string"
}
}
},
"handlers.GetContainerResponse": { "handlers.GetContainerResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
+78
View File
@@ -1486,6 +1486,17 @@ definitions:
user_config_schema: user_config_schema:
$ref: '#/definitions/channel.ConfigSchema' $ref: '#/definitions/channel.ConfigSchema'
type: object type: object
handlers.ContainerCPUMetricsResponse:
properties:
kernel_nanoseconds:
type: integer
usage_nanoseconds:
type: integer
usage_percent:
type: number
user_nanoseconds:
type: integer
type: object
handlers.ContainerGPURequest: handlers.ContainerGPURequest:
properties: properties:
devices: devices:
@@ -1493,6 +1504,38 @@ definitions:
type: string type: string
type: array type: array
type: object type: object
handlers.ContainerMemoryMetricsResponse:
properties:
limit_bytes:
type: integer
usage_bytes:
type: integer
usage_percent:
type: number
type: object
handlers.ContainerMetricsPayloadResponse:
properties:
cpu:
$ref: '#/definitions/handlers.ContainerCPUMetricsResponse'
memory:
$ref: '#/definitions/handlers.ContainerMemoryMetricsResponse'
storage:
$ref: '#/definitions/handlers.ContainerStorageMetricsResponse'
type: object
handlers.ContainerMetricsStatusResponse:
properties:
exists:
type: boolean
task_running:
type: boolean
type: object
handlers.ContainerStorageMetricsResponse:
properties:
path:
type: string
used_bytes:
type: integer
type: object
handlers.ContextUsage: handlers.ContextUsage:
properties: properties:
context_window: context_window:
@@ -1638,6 +1681,21 @@ definitions:
path: path:
type: string type: string
type: object type: object
handlers.GetContainerMetricsResponse:
properties:
backend:
type: string
metrics:
$ref: '#/definitions/handlers.ContainerMetricsPayloadResponse'
sampled_at:
type: string
status:
$ref: '#/definitions/handlers.ContainerMetricsStatusResponse'
supported:
type: boolean
unsupported_reason:
type: string
type: object
handlers.GetContainerResponse: handlers.GetContainerResponse:
properties: properties:
cdi_devices: cdi_devices:
@@ -4037,6 +4095,26 @@ paths:
summary: Write text content to a file summary: Write text content to a file
tags: tags:
- containerd - containerd
/bots/{bot_id}/container/metrics:
get:
parameters:
- description: Bot ID
in: path
name: bot_id
required: true
type: string
responses:
"200":
description: OK
schema:
$ref: '#/definitions/handlers.GetContainerMetricsResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Get current container metrics for bot
tags:
- containerd
/bots/{bot_id}/container/skills: /bots/{bot_id}/container/skills:
delete: delete:
parameters: parameters: