mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-27 07:16:19 +09:00
feat(container): add current container metrics view
Expose a dedicated container metrics endpoint and surface current CPU, memory, and root filesystem usage in the bot container view. This gives operators a quick health snapshot while degrading cleanly on unsupported backends.
This commit is contained in:
@@ -0,0 +1,168 @@
|
||||
package containerd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats"
|
||||
cgroup2stats "github.com/containerd/cgroups/v3/cgroup2/stats"
|
||||
containerd "github.com/containerd/containerd/v2/client"
|
||||
"google.golang.org/protobuf/proto"
|
||||
"google.golang.org/protobuf/types/known/anypb"
|
||||
)
|
||||
|
||||
// Tuning values for container metrics sampling.
const (
	// metricsSampleInterval is the pause between the two task samples whose
	// difference is used to compute the CPU usage percentage.
	metricsSampleInterval = 200 * time.Millisecond

	// maxPracticalMemoryLimitBytes (1 EiB) is the largest memory limit that is
	// reported as-is; normalizeMemoryLimit maps anything above it to 0.
	maxPracticalMemoryLimitBytes = uint64(1) << 60
)
|
||||
|
||||
// taskMetricsSample is a single reading of a task's resource counters, reduced
// to the fields the metrics view needs regardless of cgroup version.
type taskMetricsSample struct {
	// timestamp is the moment the reading refers to.
	timestamp time.Time

	// CPU time counters in nanoseconds: total, user and kernel.
	cpuUsageNS, cpuUserNS, cpuKernelNS uint64

	// Memory usage and limit as reported by the cgroup stats (surfaced as
	// bytes in MemoryMetrics). memoryLimit is 0 when no practical limit applies.
	memoryUsage, memoryLimit uint64
}
|
||||
|
||||
func (s *DefaultService) GetContainerMetrics(ctx context.Context, containerID string) (ContainerMetrics, error) {
|
||||
task, ctx, err := s.getTask(ctx, containerID)
|
||||
if err != nil {
|
||||
return ContainerMetrics{}, err
|
||||
}
|
||||
|
||||
first, err := sampleTaskMetrics(ctx, task)
|
||||
if err != nil {
|
||||
return ContainerMetrics{}, err
|
||||
}
|
||||
|
||||
timer := time.NewTimer(metricsSampleInterval)
|
||||
defer timer.Stop()
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ContainerMetrics{}, ctx.Err()
|
||||
case <-timer.C:
|
||||
}
|
||||
|
||||
second, err := sampleTaskMetrics(ctx, task)
|
||||
if err != nil {
|
||||
return ContainerMetrics{}, err
|
||||
}
|
||||
|
||||
return ContainerMetrics{
|
||||
SampledAt: second.timestamp,
|
||||
CPU: buildCPUMetrics(first, second),
|
||||
Memory: buildMemoryMetrics(second),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func sampleTaskMetrics(ctx context.Context, task containerd.Task) (taskMetricsSample, error) {
|
||||
metric, err := task.Metrics(ctx)
|
||||
if err != nil {
|
||||
return taskMetricsSample{}, err
|
||||
}
|
||||
if metric == nil || metric.Data == nil {
|
||||
return taskMetricsSample{}, ErrNotSupported
|
||||
}
|
||||
|
||||
timestamp := time.Now()
|
||||
if ts := metric.GetTimestamp(); ts != nil {
|
||||
timestamp = ts.AsTime()
|
||||
}
|
||||
|
||||
switch {
|
||||
case metric.Data.MessageIs(&cgroup1stats.Metrics{}):
|
||||
var stats cgroup1stats.Metrics
|
||||
if err := anypb.UnmarshalTo(metric.Data, &stats, proto.UnmarshalOptions{}); err != nil {
|
||||
return taskMetricsSample{}, fmt.Errorf("decode cgroup v1 metrics: %w", err)
|
||||
}
|
||||
return sampleFromCgroup1(timestamp, &stats), nil
|
||||
case metric.Data.MessageIs(&cgroup2stats.Metrics{}):
|
||||
var stats cgroup2stats.Metrics
|
||||
if err := anypb.UnmarshalTo(metric.Data, &stats, proto.UnmarshalOptions{}); err != nil {
|
||||
return taskMetricsSample{}, fmt.Errorf("decode cgroup v2 metrics: %w", err)
|
||||
}
|
||||
return sampleFromCgroup2(timestamp, &stats), nil
|
||||
default:
|
||||
msg, decodeErr := anypb.UnmarshalNew(metric.Data, proto.UnmarshalOptions{})
|
||||
if decodeErr != nil {
|
||||
return taskMetricsSample{}, fmt.Errorf("decode task metrics: %w", decodeErr)
|
||||
}
|
||||
return taskMetricsSample{}, fmt.Errorf("%w: unsupported task metrics type %T", ErrNotSupported, msg)
|
||||
}
|
||||
}
|
||||
|
||||
func sampleFromCgroup1(timestamp time.Time, stats *cgroup1stats.Metrics) taskMetricsSample {
|
||||
sample := taskMetricsSample{timestamp: timestamp}
|
||||
if stats == nil {
|
||||
return sample
|
||||
}
|
||||
if cpu := stats.GetCPU(); cpu != nil {
|
||||
usage := cpu.GetUsage()
|
||||
sample.cpuUsageNS = usage.GetTotal()
|
||||
sample.cpuUserNS = usage.GetUser()
|
||||
sample.cpuKernelNS = usage.GetKernel()
|
||||
}
|
||||
if memory := stats.GetMemory(); memory != nil {
|
||||
entry := memory.GetUsage()
|
||||
sample.memoryUsage = entry.GetUsage()
|
||||
sample.memoryLimit = normalizeMemoryLimit(entry.GetLimit())
|
||||
}
|
||||
return sample
|
||||
}
|
||||
|
||||
func sampleFromCgroup2(timestamp time.Time, stats *cgroup2stats.Metrics) taskMetricsSample {
|
||||
sample := taskMetricsSample{timestamp: timestamp}
|
||||
if stats == nil {
|
||||
return sample
|
||||
}
|
||||
if cpu := stats.GetCPU(); cpu != nil {
|
||||
sample.cpuUsageNS = cpu.GetUsageUsec() * 1_000
|
||||
sample.cpuUserNS = cpu.GetUserUsec() * 1_000
|
||||
sample.cpuKernelNS = cpu.GetSystemUsec() * 1_000
|
||||
}
|
||||
if memory := stats.GetMemory(); memory != nil {
|
||||
sample.memoryUsage = memory.GetUsage()
|
||||
sample.memoryLimit = normalizeMemoryLimit(memory.GetUsageLimit())
|
||||
}
|
||||
return sample
|
||||
}
|
||||
|
||||
func buildCPUMetrics(first, second taskMetricsSample) *CPUMetrics {
|
||||
metrics := &CPUMetrics{
|
||||
UsageNanoseconds: second.cpuUsageNS,
|
||||
UserNanoseconds: second.cpuUserNS,
|
||||
KernelNanoseconds: second.cpuKernelNS,
|
||||
}
|
||||
|
||||
elapsedNS := second.timestamp.Sub(first.timestamp).Nanoseconds()
|
||||
if elapsedNS <= 0 || second.cpuUsageNS < first.cpuUsageNS {
|
||||
return metrics
|
||||
}
|
||||
|
||||
metrics.UsagePercent = (float64(second.cpuUsageNS-first.cpuUsageNS) / float64(elapsedNS)) * 100
|
||||
if metrics.UsagePercent < 0 {
|
||||
metrics.UsagePercent = 0
|
||||
}
|
||||
|
||||
return metrics
|
||||
}
|
||||
|
||||
func buildMemoryMetrics(sample taskMetricsSample) *MemoryMetrics {
|
||||
metrics := &MemoryMetrics{
|
||||
UsageBytes: sample.memoryUsage,
|
||||
LimitBytes: sample.memoryLimit,
|
||||
}
|
||||
if sample.memoryLimit > 0 {
|
||||
metrics.UsagePercent = (float64(sample.memoryUsage) / float64(sample.memoryLimit)) * 100
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
|
||||
func normalizeMemoryLimit(limit uint64) uint64 {
|
||||
if limit == 0 || limit > maxPracticalMemoryLimitBytes {
|
||||
return 0
|
||||
}
|
||||
return limit
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
package containerd
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats"
|
||||
cgroup2stats "github.com/containerd/cgroups/v3/cgroup2/stats"
|
||||
)
|
||||
|
||||
func TestBuildCPUMetricsUsesCumulativeDelta(t *testing.T) {
|
||||
start := time.Unix(0, 0)
|
||||
first := taskMetricsSample{
|
||||
timestamp: start,
|
||||
cpuUsageNS: 100_000_000,
|
||||
cpuUserNS: 60_000_000,
|
||||
cpuKernelNS: 40_000_000,
|
||||
}
|
||||
second := taskMetricsSample{
|
||||
timestamp: start.Add(200 * time.Millisecond),
|
||||
cpuUsageNS: 200_000_000,
|
||||
cpuUserNS: 120_000_000,
|
||||
cpuKernelNS: 80_000_000,
|
||||
}
|
||||
|
||||
metrics := buildCPUMetrics(first, second)
|
||||
if metrics == nil {
|
||||
t.Fatal("expected cpu metrics")
|
||||
}
|
||||
if metrics.UsagePercent != 50 {
|
||||
t.Fatalf("expected cpu usage percent 50, got %v", metrics.UsagePercent)
|
||||
}
|
||||
if metrics.UsageNanoseconds != second.cpuUsageNS {
|
||||
t.Fatalf("expected latest cpu usage %d, got %d", second.cpuUsageNS, metrics.UsageNanoseconds)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSampleFromCgroup1(t *testing.T) {
|
||||
sample := sampleFromCgroup1(time.Unix(1, 0), &cgroup1stats.Metrics{
|
||||
CPU: &cgroup1stats.CPUStat{
|
||||
Usage: &cgroup1stats.CPUUsage{
|
||||
Total: 12,
|
||||
User: 7,
|
||||
Kernel: 5,
|
||||
},
|
||||
},
|
||||
Memory: &cgroup1stats.MemoryStat{
|
||||
Usage: &cgroup1stats.MemoryEntry{
|
||||
Usage: 4096,
|
||||
Limit: 8192,
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
if sample.cpuUsageNS != 12 || sample.cpuUserNS != 7 || sample.cpuKernelNS != 5 {
|
||||
t.Fatalf("unexpected cpu sample: %+v", sample)
|
||||
}
|
||||
if sample.memoryUsage != 4096 || sample.memoryLimit != 8192 {
|
||||
t.Fatalf("unexpected memory sample: %+v", sample)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSampleFromCgroup2(t *testing.T) {
|
||||
sample := sampleFromCgroup2(time.Unix(2, 0), &cgroup2stats.Metrics{
|
||||
CPU: &cgroup2stats.CPUStat{
|
||||
UsageUsec: 12,
|
||||
UserUsec: 7,
|
||||
SystemUsec: 5,
|
||||
},
|
||||
Memory: &cgroup2stats.MemoryStat{
|
||||
Usage: 16_384,
|
||||
UsageLimit: 32_768,
|
||||
},
|
||||
})
|
||||
|
||||
if sample.cpuUsageNS != 12_000 || sample.cpuUserNS != 7_000 || sample.cpuKernelNS != 5_000 {
|
||||
t.Fatalf("unexpected cpu sample: %+v", sample)
|
||||
}
|
||||
if sample.memoryUsage != 16_384 || sample.memoryLimit != 32_768 {
|
||||
t.Fatalf("unexpected memory sample: %+v", sample)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeMemoryLimitTreatsHugeValueAsUnlimited(t *testing.T) {
|
||||
if got := normalizeMemoryLimit(maxPracticalMemoryLimitBytes + 1); got != 0 {
|
||||
t.Fatalf("expected unlimited memory limit to normalize to 0, got %d", got)
|
||||
}
|
||||
}
|
||||
@@ -100,6 +100,7 @@ type Service interface {
|
||||
StopContainer(ctx context.Context, containerID string, opts *StopTaskOptions) error
|
||||
DeleteTask(ctx context.Context, containerID string, opts *DeleteTaskOptions) error
|
||||
GetTaskInfo(ctx context.Context, containerID string) (TaskInfo, error)
|
||||
GetContainerMetrics(ctx context.Context, containerID string) (ContainerMetrics, error)
|
||||
ListTasks(ctx context.Context, opts *ListTasksOptions) ([]TaskInfo, error)
|
||||
SetupNetwork(ctx context.Context, req NetworkSetupRequest) (NetworkResult, error)
|
||||
RemoveNetwork(ctx context.Context, req NetworkSetupRequest) error
|
||||
|
||||
@@ -327,6 +327,10 @@ func (s *AppleService) GetTaskInfo(ctx context.Context, containerID string) (Tas
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (*AppleService) GetContainerMetrics(context.Context, string) (ContainerMetrics, error) {
|
||||
return ContainerMetrics{}, ErrNotSupported
|
||||
}
|
||||
|
||||
func (s *AppleService) ListTasks(ctx context.Context, opts *ListTasksOptions) ([]TaskInfo, error) {
|
||||
if err := s.ensureHealthy(ctx); err != nil {
|
||||
return nil, err
|
||||
|
||||
@@ -61,6 +61,25 @@ type TaskInfo struct {
|
||||
ExitCode uint32
|
||||
}
|
||||
|
||||
type ContainerMetrics struct {
|
||||
SampledAt time.Time
|
||||
CPU *CPUMetrics
|
||||
Memory *MemoryMetrics
|
||||
}
|
||||
|
||||
// CPUMetrics describes a container's CPU consumption.
type CPUMetrics struct {
	// UsagePercent is CPU time consumed relative to elapsed wall time over the
	// sampling window, times 100.
	UsagePercent float64

	// CPU time counters in nanoseconds: total, user and kernel.
	UsageNanoseconds, UserNanoseconds, KernelNanoseconds uint64
}
|
||||
|
||||
// MemoryMetrics describes a container's memory consumption.
type MemoryMetrics struct {
	// UsageBytes is the memory in use; LimitBytes is the memory limit, with 0
	// meaning that no limit applies.
	UsageBytes, LimitBytes uint64

	// UsagePercent is usage relative to the limit, times 100; it is 0 when
	// there is no limit.
	UsagePercent float64
}
|
||||
|
||||
type SnapshotInfo struct {
|
||||
Name string
|
||||
Parent string
|
||||
|
||||
Reference in New Issue
Block a user