feat(container): add current container metrics view

Expose a dedicated container metrics endpoint and surface current CPU, memory, and root filesystem usage in the bot container view. This gives operators a quick health snapshot while degrading cleanly on unsupported backends.
This commit is contained in:
Acbox
2026-04-24 15:10:47 +08:00
parent 8136ef6ed6
commit e4aca0db13
20 changed files with 1198 additions and 6 deletions
+168
View File
@@ -0,0 +1,168 @@
package containerd
import (
"context"
"fmt"
"time"
cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats"
cgroup2stats "github.com/containerd/cgroups/v3/cgroup2/stats"
containerd "github.com/containerd/containerd/v2/client"
"google.golang.org/protobuf/proto"
"google.golang.org/protobuf/types/known/anypb"
)
// Tunables for container metrics sampling.
const (
	// metricsSampleInterval is how long GetContainerMetrics waits between its
	// two task metric samples before diffing their CPU counters.
	metricsSampleInterval = 200 * time.Millisecond

	// maxPracticalMemoryLimitBytes (1 << 60 bytes) is the largest memory limit
	// that normalizeMemoryLimit keeps; larger values are reported as 0.
	maxPracticalMemoryLimitBytes uint64 = 1 << 60
)
// taskMetricsSample is a single reading of a task's cumulative CPU counters
// and memory figures, stamped with the time the reading refers to.
type taskMetricsSample struct {
	// timestamp is the instant the sample was taken.
	timestamp time.Time

	// Cumulative CPU time in nanoseconds: total, user-mode and kernel-mode.
	cpuUsageNS, cpuUserNS, cpuKernelNS uint64

	// Memory usage and limit in bytes. The cgroup converters pass the limit
	// through normalizeMemoryLimit, so 0 means "no practical limit".
	memoryUsage, memoryLimit uint64
}
// GetContainerMetrics returns a CPU and memory snapshot for the task of the
// container identified by containerID.
//
// The task is sampled twice, metricsSampleInterval apart. The CPU percentage
// is derived from the difference between the two samples; the cumulative CPU
// counters, the memory figures and SampledAt all come from the second sample.
//
// Errors from the task lookup or from either sample are returned as-is. If ctx
// is done before the interval elapses, ctx.Err() is returned.
func (s *DefaultService) GetContainerMetrics(ctx context.Context, containerID string) (ContainerMetrics, error) {
	var none ContainerMetrics

	// getTask also returns the context that the following task calls must use.
	task, ctx, err := s.getTask(ctx, containerID)
	if err != nil {
		return none, err
	}

	before, err := sampleTaskMetrics(ctx, task)
	if err != nil {
		return none, err
	}

	// Wait out the sampling window, but give up early on cancellation.
	pause := time.NewTimer(metricsSampleInterval)
	defer pause.Stop()
	select {
	case <-pause.C:
	case <-ctx.Done():
		return none, ctx.Err()
	}

	after, err := sampleTaskMetrics(ctx, task)
	if err != nil {
		return none, err
	}

	result := ContainerMetrics{SampledAt: after.timestamp}
	result.CPU = buildCPUMetrics(before, after)
	result.Memory = buildMemoryMetrics(after)
	return result, nil
}
// sampleTaskMetrics fetches the task's current metrics and converts them into
// a taskMetricsSample.
//
// The sample is stamped with the timestamp carried by the metric when it has
// one, otherwise with the local time at which the metrics call returned.
//
// Only cgroup v1 and cgroup v2 payloads are understood. A missing payload, or
// a payload of any other type, yields an error for which
// errors.Is(err, ErrNotSupported) is true, so callers can tell "backend has no
// usable metrics" apart from real failures. Errors from task.Metrics are
// returned unchanged; failures decoding a cgroup payload are wrapped with the
// cgroup version.
func sampleTaskMetrics(ctx context.Context, task containerd.Task) (taskMetricsSample, error) {
	metric, err := task.Metrics(ctx)
	if err != nil {
		return taskMetricsSample{}, err
	}
	if metric == nil || metric.Data == nil {
		return taskMetricsSample{}, ErrNotSupported
	}

	timestamp := time.Now()
	if ts := metric.GetTimestamp(); ts != nil {
		timestamp = ts.AsTime()
	}

	switch {
	case metric.Data.MessageIs(&cgroup1stats.Metrics{}):
		var stats cgroup1stats.Metrics
		if err := anypb.UnmarshalTo(metric.Data, &stats, proto.UnmarshalOptions{}); err != nil {
			return taskMetricsSample{}, fmt.Errorf("decode cgroup v1 metrics: %w", err)
		}
		return sampleFromCgroup1(timestamp, &stats), nil
	case metric.Data.MessageIs(&cgroup2stats.Metrics{}):
		var stats cgroup2stats.Metrics
		if err := anypb.UnmarshalTo(metric.Data, &stats, proto.UnmarshalOptions{}); err != nil {
			return taskMetricsSample{}, fmt.Errorf("decode cgroup v2 metrics: %w", err)
		}
		return sampleFromCgroup2(timestamp, &stats), nil
	default:
		// The payload is neither cgroup v1 nor v2, so it is unsupported whether
		// or not it can be decoded. Decoding is attempted only to name the Go
		// type in the error. A message type that is not registered in this
		// binary makes UnmarshalNew fail; that must still surface as
		// ErrNotSupported (with the decode error kept in the chain), not as a
		// plain decode failure.
		msg, decodeErr := anypb.UnmarshalNew(metric.Data, proto.UnmarshalOptions{})
		if decodeErr != nil {
			return taskMetricsSample{}, fmt.Errorf(
				"%w: unsupported task metrics type %q: %w",
				ErrNotSupported, metric.Data.GetTypeUrl(), decodeErr,
			)
		}
		return taskMetricsSample{}, fmt.Errorf("%w: unsupported task metrics type %T", ErrNotSupported, msg)
	}
}
// sampleFromCgroup1 converts cgroup v1 statistics into a taskMetricsSample
// stamped with timestamp.
//
// CPU counters are copied as reported and stored in the sample's nanosecond
// fields. The memory limit is passed through normalizeMemoryLimit. A nil stats
// value, or a missing CPU or memory section, leaves the corresponding fields
// at zero.
func sampleFromCgroup1(timestamp time.Time, stats *cgroup1stats.Metrics) taskMetricsSample {
	if stats == nil {
		return taskMetricsSample{timestamp: timestamp}
	}

	var result taskMetricsSample
	result.timestamp = timestamp

	if cpuStat := stats.GetCPU(); cpuStat != nil {
		cpuUsage := cpuStat.GetUsage()
		result.cpuUsageNS = cpuUsage.GetTotal()
		result.cpuUserNS = cpuUsage.GetUser()
		result.cpuKernelNS = cpuUsage.GetKernel()
	}

	if memStat := stats.GetMemory(); memStat != nil {
		memUsage := memStat.GetUsage()
		result.memoryUsage = memUsage.GetUsage()
		result.memoryLimit = normalizeMemoryLimit(memUsage.GetLimit())
	}

	return result
}
// sampleFromCgroup2 converts cgroup v2 statistics into a taskMetricsSample
// stamped with timestamp.
//
// cgroup v2 reports CPU time in microseconds, so each counter is multiplied by
// 1000 to fit the sample's nanosecond fields. The memory limit is passed
// through normalizeMemoryLimit. A nil stats value, or a missing CPU or memory
// section, leaves the corresponding fields at zero.
func sampleFromCgroup2(timestamp time.Time, stats *cgroup2stats.Metrics) taskMetricsSample {
	const nanosPerMicro = 1_000

	result := taskMetricsSample{timestamp: timestamp}
	if stats == nil {
		return result
	}

	if cpuStat := stats.GetCPU(); cpuStat != nil {
		result.cpuUsageNS = cpuStat.GetUsageUsec() * nanosPerMicro
		result.cpuUserNS = cpuStat.GetUserUsec() * nanosPerMicro
		result.cpuKernelNS = cpuStat.GetSystemUsec() * nanosPerMicro
	}

	if memStat := stats.GetMemory(); memStat != nil {
		result.memoryUsage = memStat.GetUsage()
		result.memoryLimit = normalizeMemoryLimit(memStat.GetUsageLimit())
	}

	return result
}
// buildCPUMetrics derives CPU metrics from two consecutive samples.
//
// The cumulative counters are taken from second. UsagePercent is the CPU time
// consumed between the samples divided by the wall-clock time between them,
// times 100. It is therefore relative to a single CPU and exceeds 100 whenever
// more CPU time than wall time accrued.
//
// UsagePercent is left at 0 when no rate can be computed: the timestamps do
// not advance, or the cumulative counter went backwards. After that guard the
// delta is a non-negative unsigned value and the elapsed time is positive, so
// the result can never be negative and needs no clamping.
func buildCPUMetrics(first, second taskMetricsSample) *CPUMetrics {
	metrics := &CPUMetrics{
		UsageNanoseconds:  second.cpuUsageNS,
		UserNanoseconds:   second.cpuUserNS,
		KernelNanoseconds: second.cpuKernelNS,
	}

	elapsedNS := second.timestamp.Sub(first.timestamp).Nanoseconds()
	if elapsedNS <= 0 || second.cpuUsageNS < first.cpuUsageNS {
		return metrics
	}

	metrics.UsagePercent = float64(second.cpuUsageNS-first.cpuUsageNS) / float64(elapsedNS) * 100
	return metrics
}
// buildMemoryMetrics turns a sample's memory figures into MemoryMetrics.
//
// UsagePercent is usage divided by limit, times 100. When the sample carries
// no limit (memoryLimit is 0) the percentage is left at 0.
func buildMemoryMetrics(sample taskMetricsSample) *MemoryMetrics {
	usage, limit := sample.memoryUsage, sample.memoryLimit

	var percent float64
	if limit != 0 {
		percent = float64(usage) / float64(limit) * 100
	}

	return &MemoryMetrics{
		UsageBytes:   usage,
		LimitBytes:   limit,
		UsagePercent: percent,
	}
}
// normalizeMemoryLimit maps a raw memory limit to the value reported to
// callers: limits up to maxPracticalMemoryLimitBytes are returned unchanged,
// and anything larger is reported as 0. A raw limit of 0 stays 0.
func normalizeMemoryLimit(limit uint64) uint64 {
	if limit <= maxPracticalMemoryLimitBytes {
		// Covers limit == 0 as well, which is returned as 0.
		return limit
	}
	return 0
}
+88
View File
@@ -0,0 +1,88 @@
package containerd
import (
"testing"
"time"
cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats"
cgroup2stats "github.com/containerd/cgroups/v3/cgroup2/stats"
)
// TestBuildCPUMetricsUsesCumulativeDelta checks that the CPU percentage is
// computed from the counter delta between two samples (100ms of CPU over 200ms
// of wall time is 50%) and that the cumulative total is taken from the later
// sample.
func TestBuildCPUMetricsUsesCumulativeDelta(t *testing.T) {
	base := time.Unix(0, 0)
	earlier := taskMetricsSample{
		timestamp:   base,
		cpuUsageNS:  100_000_000,
		cpuUserNS:   60_000_000,
		cpuKernelNS: 40_000_000,
	}
	later := taskMetricsSample{
		timestamp:   base.Add(200 * time.Millisecond),
		cpuUsageNS:  200_000_000,
		cpuUserNS:   120_000_000,
		cpuKernelNS: 80_000_000,
	}

	got := buildCPUMetrics(earlier, later)
	if got == nil {
		t.Fatal("expected cpu metrics")
	}
	if pct := got.UsagePercent; pct != 50 {
		t.Fatalf("expected cpu usage percent 50, got %v", pct)
	}
	if want := later.cpuUsageNS; got.UsageNanoseconds != want {
		t.Fatalf("expected latest cpu usage %d, got %d", want, got.UsageNanoseconds)
	}
}
func TestSampleFromCgroup1(t *testing.T) {
sample := sampleFromCgroup1(time.Unix(1, 0), &cgroup1stats.Metrics{
CPU: &cgroup1stats.CPUStat{
Usage: &cgroup1stats.CPUUsage{
Total: 12,
User: 7,
Kernel: 5,
},
},
Memory: &cgroup1stats.MemoryStat{
Usage: &cgroup1stats.MemoryEntry{
Usage: 4096,
Limit: 8192,
},
},
})
if sample.cpuUsageNS != 12 || sample.cpuUserNS != 7 || sample.cpuKernelNS != 5 {
t.Fatalf("unexpected cpu sample: %+v", sample)
}
if sample.memoryUsage != 4096 || sample.memoryLimit != 8192 {
t.Fatalf("unexpected memory sample: %+v", sample)
}
}
func TestSampleFromCgroup2(t *testing.T) {
sample := sampleFromCgroup2(time.Unix(2, 0), &cgroup2stats.Metrics{
CPU: &cgroup2stats.CPUStat{
UsageUsec: 12,
UserUsec: 7,
SystemUsec: 5,
},
Memory: &cgroup2stats.MemoryStat{
Usage: 16_384,
UsageLimit: 32_768,
},
})
if sample.cpuUsageNS != 12_000 || sample.cpuUserNS != 7_000 || sample.cpuKernelNS != 5_000 {
t.Fatalf("unexpected cpu sample: %+v", sample)
}
if sample.memoryUsage != 16_384 || sample.memoryLimit != 32_768 {
t.Fatalf("unexpected memory sample: %+v", sample)
}
}
func TestNormalizeMemoryLimitTreatsHugeValueAsUnlimited(t *testing.T) {
if got := normalizeMemoryLimit(maxPracticalMemoryLimitBytes + 1); got != 0 {
t.Fatalf("expected unlimited memory limit to normalize to 0, got %d", got)
}
}
+1
View File
@@ -100,6 +100,7 @@ type Service interface {
StopContainer(ctx context.Context, containerID string, opts *StopTaskOptions) error
DeleteTask(ctx context.Context, containerID string, opts *DeleteTaskOptions) error
GetTaskInfo(ctx context.Context, containerID string) (TaskInfo, error)
GetContainerMetrics(ctx context.Context, containerID string) (ContainerMetrics, error)
ListTasks(ctx context.Context, opts *ListTasksOptions) ([]TaskInfo, error)
SetupNetwork(ctx context.Context, req NetworkSetupRequest) (NetworkResult, error)
RemoveNetwork(ctx context.Context, req NetworkSetupRequest) error
+4
View File
@@ -327,6 +327,10 @@ func (s *AppleService) GetTaskInfo(ctx context.Context, containerID string) (Tas
}, nil
}
// GetContainerMetrics always returns ErrNotSupported with a zero
// ContainerMetrics; this backend does not implement container metrics.
func (*AppleService) GetContainerMetrics(_ context.Context, _ string) (ContainerMetrics, error) {
	var unsupported ContainerMetrics
	return unsupported, ErrNotSupported
}
func (s *AppleService) ListTasks(ctx context.Context, opts *ListTasksOptions) ([]TaskInfo, error) {
if err := s.ensureHealthy(ctx); err != nil {
return nil, err
+19
View File
@@ -61,6 +61,25 @@ type TaskInfo struct {
ExitCode uint32
}
// ContainerMetrics is a point-in-time resource usage snapshot for a
// container's task.
type ContainerMetrics struct {
	// SampledAt is the timestamp of the sample the snapshot is based on
	// (DefaultService.GetContainerMetrics uses its second sample).
	SampledAt time.Time
	// CPU holds the CPU figures; it is a pointer, so it may be nil.
	CPU *CPUMetrics
	// Memory holds the memory figures; it is a pointer, so it may be nil.
	Memory *MemoryMetrics
}
// CPUMetrics describes a task's CPU consumption.
type CPUMetrics struct {
	// UsagePercent is CPU time consumed over a sampling window divided by the
	// window's wall-clock length, times 100. It is 0 when no rate could be
	// computed.
	UsagePercent float64

	// Cumulative CPU time in nanoseconds: total, user-mode and kernel-mode.
	UsageNanoseconds, UserNanoseconds, KernelNanoseconds uint64
}
// MemoryMetrics describes a task's memory consumption.
type MemoryMetrics struct {
	// UsageBytes is the current memory usage; LimitBytes is the memory limit,
	// or 0 when no practical limit applies.
	UsageBytes, LimitBytes uint64

	// UsagePercent is UsageBytes relative to LimitBytes, times 100. It is 0
	// when LimitBytes is 0.
	UsagePercent float64
}
type SnapshotInfo struct {
Name string
Parent string