mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-27 07:16:19 +09:00
dae772f729
* fix(containerd): prevent silent network failures from leaving containers unreachable (#202) * fix(containerd): prevent silent network failures from leaving containers unreachable Container network setup failures were silently swallowed at multiple points in the call chain, leaving containers in a "running but unreachable" ghost state. This patch closes every silent-failure path: - setupCNINetwork: return error when CNI yields no usable IP - Manager.Start: roll back container when IP is empty instead of returning success - ensureContainerAndTask: extract setupNetworkOrFail with 1 retry, propagate error to callers - ReconcileContainers: stop reporting "healthy" when network setup fails - recoverContainerIP: retry up to 2 times with backoff for transient CNI failures (IPAM lock contention, etc.) - gRPC Pool: evict connections stuck in Connecting state for >30s * fix(containerd): clean stale cni0 bridge on startup to prevent MAC error After a Docker container restart, the cni0 bridge interface can linger with a zeroed MAC (00:00:00:00:00:00) and DOWN state. The CNI bridge plugin then fails with "could not set bridge's mac: invalid argument", making all MCP containers unreachable. Two-layer fix: - Entrypoint: delete cni0 and flush IPAM state before starting containerd - Go: detect bridge MAC errors in setupCNINetwork and auto-delete cni0 before retrying, as defense-in-depth for runtime recovery * fix(containerd): use exec.CommandContext to satisfy noctx linter * fix(mcp): propagate network errors from replaceContainerSnapshot Network setup failure after snapshot replace (rollback/commit) was silently swallowed — the container would start but remain unreachable via gRPC. Return the error so callers (CreateSnapshot, RollbackVersion, etc.) surface the failure instead of reporting success.
160 lines
4.1 KiB
Go
160 lines
4.1 KiB
Go
package containerd
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/containerd/containerd/v2/client"
|
|
gocni "github.com/containerd/go-cni"
|
|
)
|
|
|
|
func setupCNINetwork(ctx context.Context, task client.Task, containerID string, cniBinDir string, cniConfDir string) (string, error) {
|
|
if task == nil {
|
|
return "", ErrInvalidArgument
|
|
}
|
|
if containerID == "" {
|
|
containerID = task.ID()
|
|
}
|
|
if containerID == "" {
|
|
return "", ErrInvalidArgument
|
|
}
|
|
|
|
pid := task.Pid()
|
|
if pid == 0 {
|
|
return "", fmt.Errorf("task pid not available for %s", containerID)
|
|
}
|
|
|
|
if _, err := os.Stat(cniConfDir); err != nil {
|
|
return "", fmt.Errorf("cni config dir missing: %s: %w", cniConfDir, err)
|
|
}
|
|
if _, err := os.Stat(cniBinDir); err != nil {
|
|
return "", fmt.Errorf("cni bin dir missing: %s: %w", cniBinDir, err)
|
|
}
|
|
netnsPath := filepath.Join("/proc", strconv.FormatUint(uint64(pid), 10), "ns", "net")
|
|
if _, err := os.Stat(netnsPath); err != nil {
|
|
return "", fmt.Errorf("netns not found: %s: %w", netnsPath, err)
|
|
}
|
|
|
|
cni, err := gocni.New(
|
|
gocni.WithPluginDir([]string{cniBinDir}),
|
|
gocni.WithPluginConfDir(cniConfDir),
|
|
)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if err := cni.Load(gocni.WithLoNetwork, gocni.WithDefaultConf); err != nil {
|
|
return "", err
|
|
}
|
|
result, err := cni.Setup(ctx, containerID, netnsPath)
|
|
if err != nil {
|
|
retryable := isDuplicateAllocationError(err) || isVethExistsError(err) || isBridgeMACError(err)
|
|
if !retryable {
|
|
return "", err
|
|
}
|
|
if isBridgeMACError(err) {
|
|
// Stale bridge with zeroed MAC after container restart; delete it so
|
|
// the plugin can recreate a healthy one.
|
|
_ = exec.CommandContext(ctx, "ip", "link", "delete", "cni0").Run()
|
|
}
|
|
_ = cni.Remove(ctx, containerID, netnsPath)
|
|
result, err = cni.Setup(ctx, containerID, netnsPath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
}
|
|
ip := extractIP(result)
|
|
if ip == "" {
|
|
return "", fmt.Errorf("cni setup returned no usable IP for %s", containerID)
|
|
}
|
|
return ip, nil
|
|
}
|
|
|
|
func extractIP(result *gocni.Result) string {
|
|
if result == nil {
|
|
return ""
|
|
}
|
|
for _, cfg := range result.Interfaces {
|
|
for _, ipCfg := range cfg.IPConfigs {
|
|
if ipCfg.IP != nil {
|
|
ip := ipCfg.IP.String()
|
|
if ip != "" && ip != "127.0.0.1" && ip != "::1" {
|
|
return ip
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func removeCNINetwork(ctx context.Context, task client.Task, containerID string, cniBinDir string, cniConfDir string) error {
|
|
if task == nil {
|
|
return ErrInvalidArgument
|
|
}
|
|
if containerID == "" {
|
|
containerID = task.ID()
|
|
}
|
|
if containerID == "" {
|
|
return ErrInvalidArgument
|
|
}
|
|
|
|
pid := task.Pid()
|
|
if pid == 0 {
|
|
return fmt.Errorf("task pid not available for %s", containerID)
|
|
}
|
|
|
|
if _, err := os.Stat(cniConfDir); err != nil {
|
|
return fmt.Errorf("cni config dir missing: %s: %w", cniConfDir, err)
|
|
}
|
|
if _, err := os.Stat(cniBinDir); err != nil {
|
|
return fmt.Errorf("cni bin dir missing: %s: %w", cniBinDir, err)
|
|
}
|
|
|
|
netnsPath := filepath.Join("/proc", strconv.FormatUint(uint64(pid), 10), "ns", "net")
|
|
if _, err := os.Stat(netnsPath); err != nil {
|
|
return fmt.Errorf("netns not found: %s: %w", netnsPath, err)
|
|
}
|
|
|
|
cni, err := gocni.New(
|
|
gocni.WithPluginDir([]string{cniBinDir}),
|
|
gocni.WithPluginConfDir(cniConfDir),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := cni.Load(gocni.WithLoNetwork, gocni.WithDefaultConf); err != nil {
|
|
return err
|
|
}
|
|
return cni.Remove(ctx, containerID, netnsPath)
|
|
}
|
|
|
|
// isDuplicateAllocationError reports whether err is non-nil and its message
// contains the substring "duplicate allocation" (case-sensitive).
func isDuplicateAllocationError(err error) bool {
	return err != nil && strings.Contains(err.Error(), "duplicate allocation")
}
|
|
|
|
// isVethExistsError reports whether a CNI setup failure looks like it was
// caused by veth devices that already exist (e.g. stale network state left
// behind after a container restart). Detection is a case-sensitive substring
// match for "already exists" in the error message.
func isVethExistsError(err error) bool {
	switch {
	case err == nil:
		return false
	default:
		return strings.Contains(err.Error(), "already exists")
	}
}
|
|
|
|
// isBridgeMACError reports whether the CNI bridge plugin failed because the
// stale cni0 bridge has a zeroed MAC address (common after container restart).
// Detection requires the error message to contain both "set bridge" and "mac"
// (case-sensitive substrings).
func isBridgeMACError(err error) bool {
	if err == nil {
		return false
	}
	text := err.Error()
	if !strings.Contains(text, "set bridge") {
		return false
	}
	return strings.Contains(text, "mac")
}
|