mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-27 07:16:19 +09:00
abbb14c59f
* fix(containerd): prevent silent network failures from leaving containers unreachable Container network setup failures were silently swallowed at multiple points in the call chain, leaving containers in a "running but unreachable" ghost state. This patch closes every silent-failure path: - setupCNINetwork: return error when CNI yields no usable IP - Manager.Start: roll back container when IP is empty instead of returning success - ensureContainerAndTask: extract setupNetworkOrFail with 1 retry, propagate error to callers - ReconcileContainers: stop reporting "healthy" when network setup fails - recoverContainerIP: retry up to 2 times with backoff for transient CNI failures (IPAM lock contention, etc.) - gRPC Pool: evict connections stuck in Connecting state for >30s * fix(containerd): clean stale cni0 bridge on startup to prevent MAC error After a Docker container restart, the cni0 bridge interface can linger with a zeroed MAC (00:00:00:00:00:00) and DOWN state. The CNI bridge plugin then fails with "could not set bridge's mac: invalid argument", making all MCP containers unreachable. Two-layer fix: - Entrypoint: delete cni0 and flush IPAM state before starting containerd - Go: detect bridge MAC errors in setupCNINetwork and auto-delete cni0 before retrying, as defense-in-depth for runtime recovery * fix(containerd): use exec.CommandContext to satisfy noctx linter
160 lines
4.1 KiB
Go
160 lines
4.1 KiB
Go
package containerd
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/containerd/containerd/v2/client"
|
|
gocni "github.com/containerd/go-cni"
|
|
)
|
|
|
|
func setupCNINetwork(ctx context.Context, task client.Task, containerID string, cniBinDir string, cniConfDir string) (string, error) {
|
|
if task == nil {
|
|
return "", ErrInvalidArgument
|
|
}
|
|
if containerID == "" {
|
|
containerID = task.ID()
|
|
}
|
|
if containerID == "" {
|
|
return "", ErrInvalidArgument
|
|
}
|
|
|
|
pid := task.Pid()
|
|
if pid == 0 {
|
|
return "", fmt.Errorf("task pid not available for %s", containerID)
|
|
}
|
|
|
|
if _, err := os.Stat(cniConfDir); err != nil {
|
|
return "", fmt.Errorf("cni config dir missing: %s: %w", cniConfDir, err)
|
|
}
|
|
if _, err := os.Stat(cniBinDir); err != nil {
|
|
return "", fmt.Errorf("cni bin dir missing: %s: %w", cniBinDir, err)
|
|
}
|
|
netnsPath := filepath.Join("/proc", strconv.FormatUint(uint64(pid), 10), "ns", "net")
|
|
if _, err := os.Stat(netnsPath); err != nil {
|
|
return "", fmt.Errorf("netns not found: %s: %w", netnsPath, err)
|
|
}
|
|
|
|
cni, err := gocni.New(
|
|
gocni.WithPluginDir([]string{cniBinDir}),
|
|
gocni.WithPluginConfDir(cniConfDir),
|
|
)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if err := cni.Load(gocni.WithLoNetwork, gocni.WithDefaultConf); err != nil {
|
|
return "", err
|
|
}
|
|
result, err := cni.Setup(ctx, containerID, netnsPath)
|
|
if err != nil {
|
|
retryable := isDuplicateAllocationError(err) || isVethExistsError(err) || isBridgeMACError(err)
|
|
if !retryable {
|
|
return "", err
|
|
}
|
|
if isBridgeMACError(err) {
|
|
// Stale bridge with zeroed MAC after container restart; delete it so
|
|
// the plugin can recreate a healthy one.
|
|
_ = exec.CommandContext(ctx, "ip", "link", "delete", "cni0").Run()
|
|
}
|
|
_ = cni.Remove(ctx, containerID, netnsPath)
|
|
result, err = cni.Setup(ctx, containerID, netnsPath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
}
|
|
ip := extractIP(result)
|
|
if ip == "" {
|
|
return "", fmt.Errorf("cni setup returned no usable IP for %s", containerID)
|
|
}
|
|
return ip, nil
|
|
}
|
|
|
|
func extractIP(result *gocni.Result) string {
|
|
if result == nil {
|
|
return ""
|
|
}
|
|
for _, cfg := range result.Interfaces {
|
|
for _, ipCfg := range cfg.IPConfigs {
|
|
if ipCfg.IP != nil {
|
|
ip := ipCfg.IP.String()
|
|
if ip != "" && ip != "127.0.0.1" && ip != "::1" {
|
|
return ip
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func removeCNINetwork(ctx context.Context, task client.Task, containerID string, cniBinDir string, cniConfDir string) error {
|
|
if task == nil {
|
|
return ErrInvalidArgument
|
|
}
|
|
if containerID == "" {
|
|
containerID = task.ID()
|
|
}
|
|
if containerID == "" {
|
|
return ErrInvalidArgument
|
|
}
|
|
|
|
pid := task.Pid()
|
|
if pid == 0 {
|
|
return fmt.Errorf("task pid not available for %s", containerID)
|
|
}
|
|
|
|
if _, err := os.Stat(cniConfDir); err != nil {
|
|
return fmt.Errorf("cni config dir missing: %s: %w", cniConfDir, err)
|
|
}
|
|
if _, err := os.Stat(cniBinDir); err != nil {
|
|
return fmt.Errorf("cni bin dir missing: %s: %w", cniBinDir, err)
|
|
}
|
|
|
|
netnsPath := filepath.Join("/proc", strconv.FormatUint(uint64(pid), 10), "ns", "net")
|
|
if _, err := os.Stat(netnsPath); err != nil {
|
|
return fmt.Errorf("netns not found: %s: %w", netnsPath, err)
|
|
}
|
|
|
|
cni, err := gocni.New(
|
|
gocni.WithPluginDir([]string{cniBinDir}),
|
|
gocni.WithPluginConfDir(cniConfDir),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := cni.Load(gocni.WithLoNetwork, gocni.WithDefaultConf); err != nil {
|
|
return err
|
|
}
|
|
return cni.Remove(ctx, containerID, netnsPath)
|
|
}
|
|
|
|
// isDuplicateAllocationError reports whether err is non-nil and its message
// contains the text "duplicate allocation". The match is case-sensitive.
func isDuplicateAllocationError(err error) bool {
	return err != nil && strings.Contains(err.Error(), "duplicate allocation")
}
|
|
|
|
// isVethExistsError reports whether a CNI setup error looks like a leftover
// device conflict, e.g. veth devices surviving a container restart with stale
// network state. Detection is a case-sensitive substring match on
// "already exists" in the error message; a nil error never matches.
func isVethExistsError(err error) bool {
	var msg string
	if err != nil {
		msg = err.Error()
	}
	return strings.Contains(msg, "already exists")
}
|
|
|
|
// isBridgeMACError reports whether a CNI error looks like the bridge plugin
// failing to set the MAC address of a stale cni0 bridge (seen after a
// container restart leaves the bridge with a zeroed MAC). The message must
// contain both "set bridge" and "mac"; matching is case-sensitive and a nil
// error never matches.
func isBridgeMACError(err error) bool {
	if err == nil {
		return false
	}
	text := err.Error()
	for _, fragment := range [...]string{"set bridge", "mac"} {
		if !strings.Contains(text, fragment) {
			return false
		}
	}
	return true
}
|