mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-27 07:16:19 +09:00
dae772f729
* fix(containerd): prevent silent network failures from leaving containers unreachable (#202)

* fix(containerd): prevent silent network failures from leaving containers unreachable

  Container network setup failures were silently swallowed at multiple points in the call chain, leaving containers in a "running but unreachable" ghost state. This patch closes every silent-failure path:

  - setupCNINetwork: return error when CNI yields no usable IP
  - Manager.Start: roll back container when IP is empty instead of returning success
  - ensureContainerAndTask: extract setupNetworkOrFail with 1 retry, propagate error to callers
  - ReconcileContainers: stop reporting "healthy" when network setup fails
  - recoverContainerIP: retry up to 2 times with backoff for transient CNI failures (IPAM lock contention, etc.)
  - gRPC Pool: evict connections stuck in Connecting state for >30s

* fix(containerd): clean stale cni0 bridge on startup to prevent MAC error

  After a Docker container restart, the cni0 bridge interface can linger with a zeroed MAC (00:00:00:00:00:00) and DOWN state. The CNI bridge plugin then fails with "could not set bridge's mac: invalid argument", making all MCP containers unreachable.

  Two-layer fix:

  - Entrypoint: delete cni0 and flush IPAM state before starting containerd
  - Go: detect bridge MAC errors in setupCNINetwork and auto-delete cni0 before retrying, as defense-in-depth for runtime recovery

* fix(containerd): use exec.CommandContext to satisfy noctx linter

* fix(mcp): propagate network errors from replaceContainerSnapshot

  Network setup failure after snapshot replace (rollback/commit) was silently swallowed — the container would start but remain unreachable via gRPC. Return the error so callers (CreateSnapshot, RollbackVersion, etc.) surface the failure instead of reporting success.
63 lines
1.8 KiB
Bash
63 lines
1.8 KiB
Bash
#!/bin/sh
#
# Container entrypoint.
#
# Usage: <this-script> <server-command> [args...]
#
# Steps, in order:
#   1. Remove stale CNI state and make sure forwarding/NAT rules exist.
#   2. Delegate cgroup v2 controllers (when cgroup v2 is mounted).
#   3. Start containerd in the background and wait until `ctr version` works.
#   4. Build the MCP image via devenv/mcp-build.sh.
#   5. Run the server command given as "$@", forward TERM/INT to it and to
#      containerd, stop containerd once the server exits, and exit with the
#      server's status.

set -e

# Clean up stale CNI state from previous runs. After a container restart the
# cni0 bridge may linger with a zeroed MAC (00:00:00:00:00:00), causing the
# bridge plugin to fail with "could not set bridge's mac: invalid argument".
# Best effort: the bridge or the state directories may simply not exist.
ip link delete cni0 2>/dev/null || true
rm -rf /var/lib/cni/networks/* /var/lib/cni/results/* 2>/dev/null || true

# Ensure IP forwarding and subnet MASQUERADE for CNI.
# `-C` checks whether the rule already exists so it is appended at most once.
sysctl -w net.ipv4.ip_forward=1 2>/dev/null || true
iptables -t nat -C POSTROUTING -s 10.88.0.0/16 ! -o cni0 -j MASQUERADE 2>/dev/null || \
  iptables -t nat -A POSTROUTING -s 10.88.0.0/16 ! -o cni0 -j MASQUERADE 2>/dev/null || true

# Setup cgroup v2 delegation for nested containerd.
if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
  # Move every process out of the root cgroup into "init" first, then enable
  # the controllers for child cgroups.
  mkdir -p /sys/fs/cgroup/init
  while read -r pid; do
    # Individual moves may fail (e.g. the process already exited); ignore.
    echo "$pid" > /sys/fs/cgroup/init/cgroup.procs 2>/dev/null || true
  done < /sys/fs/cgroup/cgroup.procs

  # Turn "cpu io memory" into "+cpu +io +memory" and enable them all.
  sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \
    > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true
fi

mkdir -p /run/containerd
containerd &
CONTAINERD_PID=$!
SERVER_PID=

echo "Waiting for containerd..."
attempt=0
while [ "$attempt" -lt 30 ]; do
  if ctr version >/dev/null 2>&1; then
    break
  fi
  # No point polling for the remaining time if the daemon is already gone.
  if ! kill -0 "$CONTAINERD_PID" 2>/dev/null; then
    echo "ERROR: containerd exited during startup" >&2
    exit 1
  fi
  attempt=$((attempt + 1))
  sleep 1
done

if ! ctr version >/dev/null 2>&1; then
  echo "ERROR: containerd not responsive after 30s" >&2
  exit 1
fi
echo "containerd is running (pid $CONTAINERD_PID)"

# Build MCP binary and import as containerd image
echo "Building MCP image..."
(cd /workspace && sh devenv/mcp-build.sh)
echo "MCP image ready."

echo "Starting server..."

# Forward TERM/INT to the server and containerd, then reap them.
# A PID is only signalled once it has been recorded: signalling PID 0 would
# hit the whole process group, including this shell.
forward_signal() {
  if [ -n "${SERVER_PID:-}" ]; then
    kill "$SERVER_PID" 2>/dev/null || true
  fi
  if [ -n "${CONTAINERD_PID:-}" ]; then
    kill "$CONTAINERD_PID" 2>/dev/null || true
  fi
  wait
}
trap forward_signal TERM INT

"$@" &
SERVER_PID=$!

# Capture the status with `||` so that `set -e` does not abort the script on a
# non-zero server exit (or an interrupted wait) before containerd is stopped.
EXIT_CODE=0
wait "$SERVER_PID" || EXIT_CODE=$?

kill "$CONTAINERD_PID" 2>/dev/null || true
wait "$CONTAINERD_PID" 2>/dev/null || true

exit "$EXIT_CODE"