mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-25 07:00:48 +09:00
fix(containerd): re-apply CNI network after server restart for running MCP tasks
A server container restart drops the cni0 bridge, veth and iptables masquerade in its network namespace while MCP tasks keep running in containerd. Reconcile and ensureContainerAndTask now re-run SetupNetwork for already-running tasks so outbound connectivity is restored.
This commit is contained in:
@@ -301,6 +301,14 @@ func (h *ContainerdHandler) ensureContainerAndTask(ctx context.Context, containe
|
||||
}
|
||||
if len(tasks) > 0 {
|
||||
if tasks[0].Status == tasktypes.Status_RUNNING {
|
||||
// Task is running but CNI state may be stale (e.g. server container restarted).
|
||||
// Re-apply network to ensure connectivity.
|
||||
if task, taskErr := h.service.GetTask(ctx, containerID); taskErr == nil {
|
||||
if netErr := ctr.SetupNetwork(ctx, task, containerID); netErr != nil {
|
||||
h.logger.Warn("network re-setup failed for running task",
|
||||
slog.String("container_id", containerID), slog.Any("error", netErr))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if err := h.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil {
|
||||
@@ -928,6 +936,20 @@ func (h *ContainerdHandler) ReconcileContainers(ctx context.Context) {
|
||||
slog.String("bot_id", botID), slog.Any("error", dbErr))
|
||||
}
|
||||
}
|
||||
// Re-apply CNI networking: server container restart drops cni0 bridge,
|
||||
// veth endpoints and iptables masquerade rules while the MCP task keeps
|
||||
// running inside containerd.
|
||||
if task, taskErr := h.service.GetTask(ctx, containerID); taskErr == nil {
|
||||
if netErr := ctr.SetupNetwork(ctx, task, containerID); netErr != nil {
|
||||
h.logger.Warn("reconcile: network re-setup failed for running task",
|
||||
slog.String("bot_id", botID),
|
||||
slog.String("container_id", containerID),
|
||||
slog.Any("error", netErr))
|
||||
}
|
||||
} else {
|
||||
h.logger.Warn("reconcile: failed to get task for network re-setup",
|
||||
slog.String("bot_id", botID), slog.Any("error", taskErr))
|
||||
}
|
||||
h.logger.Info("reconcile: container healthy",
|
||||
slog.String("bot_id", botID), slog.String("container_id", containerID))
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user