mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-27 07:16:19 +09:00
dae772f729
* fix(containerd): prevent silent network failures from leaving containers unreachable (#202) * fix(containerd): prevent silent network failures from leaving containers unreachable Container network setup failures were silently swallowed at multiple points in the call chain, leaving containers in a "running but unreachable" ghost state. This patch closes every silent-failure path: - setupCNINetwork: return error when CNI yields no usable IP - Manager.Start: roll back container when IP is empty instead of returning success - ensureContainerAndTask: extract setupNetworkOrFail with 1 retry, propagate error to callers - ReconcileContainers: stop reporting "healthy" when network setup fails - recoverContainerIP: retry up to 2 times with backoff for transient CNI failures (IPAM lock contention, etc.) - gRPC Pool: evict connections stuck in Connecting state for >30s * fix(containerd): clean stale cni0 bridge on startup to prevent MAC error After a Docker container restart, the cni0 bridge interface can linger with a zeroed MAC (00:00:00:00:00:00) and DOWN state. The CNI bridge plugin then fails with "could not set bridge's mac: invalid argument", making all MCP containers unreachable. Two-layer fix: - Entrypoint: delete cni0 and flush IPAM state before starting containerd - Go: detect bridge MAC errors in setupCNINetwork and auto-delete cni0 before retrying, as defense-in-depth for runtime recovery * fix(containerd): use exec.CommandContext to satisfy noctx linter * fix(mcp): propagate network errors from replaceContainerSnapshot Network setup failure after snapshot replace (rollback/commit) was silently swallowed — the container would start but remain unreachable via gRPC. Return the error so callers (CreateSnapshot, RollbackVersion, etc.) surface the failure instead of reporting success.
81 lines
2.7 KiB
Bash
81 lines
2.7 KiB
Bash
#!/bin/sh
|
|
set -e
|
|
|
|
MCP_IMAGE="${MCP_IMAGE:-docker.io/library/memoh-mcp:latest}"
|
|
|
|
# ---- Clean up stale CNI state from previous runs ----
|
|
# After a container restart the cni0 bridge may linger with a zeroed MAC
|
|
# (00:00:00:00:00:00), causing "could not set bridge's mac: invalid argument".
|
|
ip link delete cni0 2>/dev/null || true
|
|
rm -rf /var/lib/cni/networks/* /var/lib/cni/results/* 2>/dev/null || true
|
|
|
|
# ---- Ensure IP forwarding and subnet MASQUERADE for CNI ----
|
|
sysctl -w net.ipv4.ip_forward=1 2>/dev/null || true
|
|
iptables -t nat -C POSTROUTING -s 10.88.0.0/16 ! -o cni0 -j MASQUERADE 2>/dev/null || \
|
|
iptables -t nat -A POSTROUTING -s 10.88.0.0/16 ! -o cni0 -j MASQUERADE 2>/dev/null || true
|
|
|
|
# ---- Setup cgroup v2 delegation for nested containerd ----
|
|
if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
|
|
echo "Setting up cgroup v2 delegation..."
|
|
mkdir -p /sys/fs/cgroup/init
|
|
# Move existing processes out of root cgroup to allow subtree control
|
|
while read -r pid; do
|
|
echo "$pid" > /sys/fs/cgroup/init/cgroup.procs 2>/dev/null || true
|
|
done < /sys/fs/cgroup/cgroup.procs
|
|
# Enable all available controllers for subtree delegation
|
|
sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \
|
|
> /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true
|
|
fi
|
|
|
|
# ---- Start containerd in background ----
|
|
mkdir -p /run/containerd
|
|
containerd &
|
|
CONTAINERD_PID=$!
|
|
|
|
echo "Waiting for containerd..."
|
|
for i in $(seq 1 30); do
|
|
if ctr version >/dev/null 2>&1; then
|
|
break
|
|
fi
|
|
sleep 1
|
|
done
|
|
|
|
if ! ctr version >/dev/null 2>&1; then
|
|
echo "ERROR: containerd not responsive after 30s"
|
|
exit 1
|
|
fi
|
|
echo "containerd is running (pid $CONTAINERD_PID)"
|
|
|
|
# ---- Import MCP image if not already present ----
|
|
if ! ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then
|
|
echo "Importing MCP image into containerd..."
|
|
for tar in /opt/images/*.tar; do
|
|
if [ -f "$tar" ]; then
|
|
ctr -n default images import --all-platforms "$tar" 2>&1 || true
|
|
fi
|
|
done
|
|
if ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then
|
|
echo "MCP image ready: ${MCP_IMAGE}"
|
|
else
|
|
echo "WARNING: MCP image not available after import, will try pull at runtime"
|
|
fi
|
|
else
|
|
echo "MCP image already present: ${MCP_IMAGE}"
|
|
fi
|
|
|
|
echo "containerd is ready, starting memoh-server..."
|
|
|
|
# ---- Start server (foreground, trap signals for graceful shutdown) ----
|
|
trap 'echo "Shutting down..."; kill $SERVER_PID 2>/dev/null; kill $CONTAINERD_PID 2>/dev/null; wait' TERM INT
|
|
|
|
/app/memoh-server serve &
|
|
SERVER_PID=$!
|
|
|
|
wait $SERVER_PID
|
|
EXIT_CODE=$?
|
|
|
|
kill $CONTAINERD_PID 2>/dev/null || true
|
|
wait $CONTAINERD_PID 2>/dev/null || true
|
|
|
|
exit $EXIT_CODE
|