From b8a6a85fbb577552c154fbcc07bdc6b794c88e79 Mon Sep 17 00:00:00 2001 From: BBQ Date: Mon, 16 Feb 2026 16:45:07 +0800 Subject: [PATCH] fix(containerd): re-apply CNI network after server restart for running MCP tasks Server container restart drops cni0 bridge, veth and iptables masquerade in its network namespace while MCP tasks keep running in containerd. Reconcile and ensureContainerAndTask now re-run SetupNetwork for already-running tasks so outbound connectivity is restored. --- internal/handlers/containerd.go | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/internal/handlers/containerd.go b/internal/handlers/containerd.go index 1b725263..a3433221 100644 --- a/internal/handlers/containerd.go +++ b/internal/handlers/containerd.go @@ -301,6 +301,14 @@ func (h *ContainerdHandler) ensureContainerAndTask(ctx context.Context, containe } if len(tasks) > 0 { if tasks[0].Status == tasktypes.Status_RUNNING { + // Task is running but CNI state may be stale (e.g. server container restarted). + // Re-apply network to ensure connectivity. + if task, taskErr := h.service.GetTask(ctx, containerID); taskErr == nil { + if netErr := ctr.SetupNetwork(ctx, task, containerID); netErr != nil { + h.logger.Warn("network re-setup failed for running task", + slog.String("container_id", containerID), slog.Any("error", netErr)) + } + } return nil } if err := h.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil { @@ -928,6 +936,20 @@ func (h *ContainerdHandler) ReconcileContainers(ctx context.Context) { slog.String("bot_id", botID), slog.Any("error", dbErr)) } } + // Re-apply CNI networking: server container restart drops cni0 bridge, + // veth endpoints and iptables masquerade rules while the MCP task keeps + // running inside containerd. 
+ if task, taskErr := h.service.GetTask(ctx, containerID); taskErr == nil { + if netErr := ctr.SetupNetwork(ctx, task, containerID); netErr != nil { + h.logger.Warn("reconcile: network re-setup failed for running task", + slog.String("bot_id", botID), + slog.String("container_id", containerID), + slog.Any("error", netErr)) + } + } else { + h.logger.Warn("reconcile: failed to get task for network re-setup", + slog.String("bot_id", botID), slog.Any("error", taskErr)) + } h.logger.Info("reconcile: container healthy", slog.String("bot_id", botID), slog.String("container_id", containerID)) continue