Files
Memoh/internal/containerd/network.go
T
BBQ dae772f729 fix(containerd): backport network fallback fixes to v0.4 (#205)
* fix(containerd): prevent silent network failures from leaving containers unreachable (#202)

* fix(containerd): prevent silent network failures from leaving containers unreachable

Container network setup failures were silently swallowed at multiple
points in the call chain, leaving containers in a "running but
unreachable" ghost state. This patch closes every silent-failure path:

- setupCNINetwork: return error when CNI yields no usable IP
- Manager.Start: roll back container when IP is empty instead of
  returning success
- ensureContainerAndTask: extract setupNetworkOrFail with 1 retry,
  propagate error to callers
- ReconcileContainers: stop reporting "healthy" when network setup fails
- recoverContainerIP: retry up to 2 times with backoff for transient
  CNI failures (IPAM lock contention, etc.)
- gRPC Pool: evict connections stuck in Connecting state for >30s

* fix(containerd): clean stale cni0 bridge on startup to prevent MAC error

After a Docker container restart, the cni0 bridge interface can linger
with a zeroed MAC (00:00:00:00:00:00) and DOWN state. The CNI bridge
plugin then fails with "could not set bridge's mac: invalid argument",
making all MCP containers unreachable.

Two-layer fix:
- Entrypoint: delete cni0 and flush IPAM state before starting containerd
- Go: detect bridge MAC errors in setupCNINetwork and auto-delete cni0
  before retrying, as defense-in-depth for runtime recovery

* fix(containerd): use exec.CommandContext to satisfy noctx linter

* fix(mcp): propagate network errors from replaceContainerSnapshot

Network setup failure after snapshot replace (rollback/commit) was
silently swallowed — the container would start but remain unreachable
via gRPC. Return the error so callers (CreateSnapshot, RollbackVersion,
etc.) surface the failure instead of reporting success.
2026-03-07 18:13:06 +08:00

160 lines
4.1 KiB
Go

package containerd
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"github.com/containerd/containerd/v2/client"
gocni "github.com/containerd/go-cni"
)
// setupCNINetwork attaches the network namespace of a running task to the CNI
// networks found in cniConfDir and returns the address chosen by extractIP.
//
// Parameters:
//   - ctx: passed to the CNI Setup/Remove calls and to the "ip link delete"
//     helper command.
//   - task: must be non-nil; its PID locates the network namespace under /proc.
//   - containerID: ID handed to CNI; when empty, task.ID() is used instead.
//   - cniBinDir, cniConfDir: CNI plugin and configuration directories; both
//     must exist.
//
// It returns ErrInvalidArgument for a nil task or an empty ID, and a wrapped
// error when the PID, a directory, or the netns path is unavailable, when the
// CNI library cannot be initialised or loaded, when Setup fails, or when Setup
// succeeds without producing a usable IP.
//
// If the first Setup fails with an error recognised by
// isDuplicateAllocationError, isVethExistsError or isBridgeMACError, leftover
// state is removed on a best-effort basis and Setup is attempted exactly once
// more.
func setupCNINetwork(ctx context.Context, task client.Task, containerID string, cniBinDir string, cniConfDir string) (string, error) {
	if task == nil {
		return "", ErrInvalidArgument
	}
	if containerID == "" {
		containerID = task.ID()
	}
	if containerID == "" {
		return "", ErrInvalidArgument
	}
	pid := task.Pid()
	if pid == 0 {
		return "", fmt.Errorf("task pid not available for %s", containerID)
	}
	if _, err := os.Stat(cniConfDir); err != nil {
		return "", fmt.Errorf("cni config dir missing: %s: %w", cniConfDir, err)
	}
	if _, err := os.Stat(cniBinDir); err != nil {
		return "", fmt.Errorf("cni bin dir missing: %s: %w", cniBinDir, err)
	}
	netnsPath := filepath.Join("/proc", strconv.FormatUint(uint64(pid), 10), "ns", "net")
	if _, err := os.Stat(netnsPath); err != nil {
		return "", fmt.Errorf("netns not found: %s: %w", netnsPath, err)
	}

	cni, err := gocni.New(
		gocni.WithPluginDir([]string{cniBinDir}),
		gocni.WithPluginConfDir(cniConfDir),
	)
	if err != nil {
		return "", fmt.Errorf("cni init for %s: %w", containerID, err)
	}
	if err := cni.Load(gocni.WithLoNetwork, gocni.WithDefaultConf); err != nil {
		return "", fmt.Errorf("cni load config from %s: %w", cniConfDir, err)
	}

	result, err := cni.Setup(ctx, containerID, netnsPath)
	if err != nil {
		if !isDuplicateAllocationError(err) && !isVethExistsError(err) && !isBridgeMACError(err) {
			return "", fmt.Errorf("cni setup for %s: %w", containerID, err)
		}
		// Keep the original failure so it is still visible if the retry fails.
		firstErr := err
		if isBridgeMACError(firstErr) {
			// Stale bridge with zeroed MAC after container restart; delete it so
			// the plugin can recreate a healthy one. Best effort: if the delete
			// fails, the retry below surfaces the real problem.
			_ = exec.CommandContext(ctx, "ip", "link", "delete", "cni0").Run()
		}
		// Best effort: clear whatever the failed attempt left behind before
		// trying again; a failure here is reported by the retry itself.
		_ = cni.Remove(ctx, containerID, netnsPath)
		result, err = cni.Setup(ctx, containerID, netnsPath)
		if err != nil {
			return "", fmt.Errorf("cni setup retry for %s: %w (first attempt: %v)", containerID, err, firstErr)
		}
	}

	ip := extractIP(result)
	if ip == "" {
		// Setup succeeded, so the attachment exists even though it is unusable.
		// Tear it down (best effort) rather than leaving it behind for a
		// container that is about to be reported as failed.
		_ = cni.Remove(ctx, containerID, netnsPath)
		return "", fmt.Errorf("cni setup returned no usable IP for %s", containerID)
	}
	return ip, nil
}
// extractIP returns the string form of one usable address from a CNI result,
// or "" when result is nil or carries no usable address.
//
// An address is usable when it has a valid IPv4 or IPv6 length and is neither
// a loopback nor an unspecified address. Validity is checked on the net.IP
// value itself because net.IP.String never returns "" (an empty IP renders as
// "<nil>"), so comparing the rendered string against "" cannot reject it.
//
// Within one interface the first usable entry of IPConfigs wins. Across
// interfaces the entry with the smallest map key (the interface name in
// go-cni's Result) wins, so the answer does not depend on Go's randomized map
// iteration order.
func extractIP(result *gocni.Result) string {
	if result == nil {
		return ""
	}
	var chosenName, chosenIP string
	for name, cfg := range result.Interfaces {
		if cfg == nil {
			continue
		}
		if chosenIP != "" && name >= chosenName {
			// An interface that sorts earlier already supplied an address.
			continue
		}
		for _, ipCfg := range cfg.IPConfigs {
			if ipCfg == nil {
				continue
			}
			addr := ipCfg.IP
			// To16 returns nil for nil, empty, or wrongly sized addresses.
			if addr.To16() == nil || addr.IsLoopback() || addr.IsUnspecified() {
				continue
			}
			chosenName, chosenIP = name, addr.String()
			break
		}
	}
	return chosenIP
}
// removeCNINetwork detaches the network namespace of a task from the CNI
// networks found in cniConfDir.
//
// Parameters:
//   - ctx: passed to the CNI Remove call.
//   - task: must be non-nil; its PID locates the network namespace under /proc.
//   - containerID: ID handed to CNI; when empty, task.ID() is used instead.
//   - cniBinDir, cniConfDir: CNI plugin and configuration directories; both
//     must exist.
//
// It returns ErrInvalidArgument for a nil task or an empty ID, and a wrapped
// error when the PID, a directory, or the netns path is unavailable, or when
// initialising, loading, or running the CNI removal fails. The underlying
// error stays reachable through errors.Is / errors.As.
func removeCNINetwork(ctx context.Context, task client.Task, containerID string, cniBinDir string, cniConfDir string) error {
	if task == nil {
		return ErrInvalidArgument
	}
	if containerID == "" {
		containerID = task.ID()
	}
	if containerID == "" {
		return ErrInvalidArgument
	}
	pid := task.Pid()
	if pid == 0 {
		return fmt.Errorf("task pid not available for %s", containerID)
	}
	if _, err := os.Stat(cniConfDir); err != nil {
		return fmt.Errorf("cni config dir missing: %s: %w", cniConfDir, err)
	}
	if _, err := os.Stat(cniBinDir); err != nil {
		return fmt.Errorf("cni bin dir missing: %s: %w", cniBinDir, err)
	}
	netnsPath := filepath.Join("/proc", strconv.FormatUint(uint64(pid), 10), "ns", "net")
	if _, err := os.Stat(netnsPath); err != nil {
		return fmt.Errorf("netns not found: %s: %w", netnsPath, err)
	}

	cni, err := gocni.New(
		gocni.WithPluginDir([]string{cniBinDir}),
		gocni.WithPluginConfDir(cniConfDir),
	)
	if err != nil {
		return fmt.Errorf("cni init for %s: %w", containerID, err)
	}
	if err := cni.Load(gocni.WithLoNetwork, gocni.WithDefaultConf); err != nil {
		return fmt.Errorf("cni load config from %s: %w", cniConfDir, err)
	}
	if err := cni.Remove(ctx, containerID, netnsPath); err != nil {
		return fmt.Errorf("cni remove for %s: %w", containerID, err)
	}
	return nil
}
// isDuplicateAllocationError reports whether err is non-nil and its message
// contains the case-sensitive substring "duplicate allocation".
// NOTE(review): presumably this text comes from the CNI IPAM plugin when an
// address is already reserved for the container — confirm against the plugin.
func isDuplicateAllocationError(err error) bool {
	return err != nil && strings.Contains(err.Error(), "duplicate allocation")
}
// isVethExistsError reports whether err looks like a CNI setup failure caused
// by veth devices that are still present (e.g. stale network state after a
// container restart). The check is purely textual: any non-nil error whose
// message contains "already exists" matches.
func isVethExistsError(err error) bool {
	if err != nil {
		return strings.Contains(err.Error(), "already exists")
	}
	return false
}
// isBridgeMACError reports whether err looks like the CNI bridge plugin
// failing to set the bridge's MAC address, which happens when a stale cni0
// bridge with a zeroed MAC is left over (common after container restart).
// The check is textual and case-sensitive: the message must contain both
// "set bridge" and "mac".
func isBridgeMACError(err error) bool {
	if err == nil {
		return false
	}
	text := err.Error()
	switch {
	case !strings.Contains(text, "set bridge"):
		return false
	case !strings.Contains(text, "mac"):
		return false
	}
	return true
}