mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-25 07:00:48 +09:00
abbb14c59f
* fix(containerd): prevent silent network failures from leaving containers unreachable Container network setup failures were silently swallowed at multiple points in the call chain, leaving containers in a "running but unreachable" ghost state. This patch closes every silent-failure path: - setupCNINetwork: return error when CNI yields no usable IP - Manager.Start: roll back container when IP is empty instead of returning success - ensureContainerAndTask: extract setupNetworkOrFail with 1 retry, propagate error to callers - ReconcileContainers: stop reporting "healthy" when network setup fails - recoverContainerIP: retry up to 2 times with backoff for transient CNI failures (IPAM lock contention, etc.) - gRPC Pool: evict connections stuck in Connecting state for >30s * fix(containerd): clean stale cni0 bridge on startup to prevent MAC error After a Docker container restart, the cni0 bridge interface can linger with a zeroed MAC (00:00:00:00:00:00) and DOWN state. The CNI bridge plugin then fails with "could not set bridge's mac: invalid argument", making all MCP containers unreachable. Two-layer fix: - Entrypoint: delete cni0 and flush IPAM state before starting containerd - Go: detect bridge MAC errors in setupCNINetwork and auto-delete cni0 before retrying, as defense-in-depth for runtime recovery * fix(containerd): use exec.CommandContext to satisfy noctx linter
81 lines
2.7 KiB
Bash
81 lines
2.7 KiB
Bash
#!/bin/sh
# Container entrypoint: prepares networking and cgroups, starts containerd,
# makes sure the MCP image is loaded, then runs memoh-server.

# Abort on the first unhandled command failure.
set -o errexit

# Image reference for the MCP container; an already-set, non-empty value from
# the environment wins over the built-in default.
: "${MCP_IMAGE:=docker.io/library/memoh-mcp:latest}"

# ---- Clean up stale CNI state from previous runs ----
# A cni0 bridge that survives a container restart may be left with an all-zero
# MAC (00:00:00:00:00:00), which leads to
# "could not set bridge's mac: invalid argument".
# Remove the bridge and the stored CNI state. Every step is best-effort:
# errors are silenced and never abort the script.
ip link delete cni0 2>/dev/null || :
for cni_state_dir in /var/lib/cni/networks /var/lib/cni/results; do
  rm -rf "${cni_state_dir}"/* 2>/dev/null || :
done

# ---- Ensure IP forwarding and subnet MASQUERADE for CNI ----
# Both steps are best-effort: errors are silenced and never abort the script.
sysctl -w net.ipv4.ip_forward=1 2>/dev/null || :

# Append the MASQUERADE rule only when the -C probe reports it is missing,
# so repeated runs do not stack duplicate rules.
if ! iptables -t nat -C POSTROUTING -s 10.88.0.0/16 ! -o cni0 -j MASQUERADE 2>/dev/null; then
  iptables -t nat -A POSTROUTING -s 10.88.0.0/16 ! -o cni0 -j MASQUERADE 2>/dev/null || :
fi

# ---- Setup cgroup v2 delegation for nested containerd ----
# The section only runs when /sys/fs/cgroup/cgroup.controllers exists.
if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
  echo "Setting up cgroup v2 delegation..."
  mkdir -p /sys/fs/cgroup/init

  # Move existing processes out of the root cgroup to allow subtree control:
  # each PID listed in the root cgroup.procs is written, one at a time, into
  # the "init" child. A PID that cannot be moved is skipped.
  while read -r proc_id; do
    echo "$proc_id" > /sys/fs/cgroup/init/cgroup.procs 2>/dev/null || :
  done < /sys/fs/cgroup/cgroup.procs

  # Enable all available controllers for subtree delegation: rewrite the
  # space-separated controller list "a b c" as "+a +b +c" and write it to
  # cgroup.subtree_control. Failure is ignored.
  sed 's/ / +/g; s/^/+/' < /sys/fs/cgroup/cgroup.controllers \
    > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || :
fi

# ---- Start containerd in background ----
# Launches containerd, remembers its PID in CONTAINERD_PID, and blocks until
# `ctr version` succeeds. Exits 1 if containerd dies during startup or is
# still unresponsive after the polling window.
mkdir -p /run/containerd
containerd &
CONTAINERD_PID=$!

echo "Waiting for containerd..."
startup_attempt=0
while [ "$startup_attempt" -lt 30 ]; do
  if ctr version >/dev/null 2>&1; then
    break
  fi
  # kill -0 sends no signal; it only probes whether the PID still exists.
  # Fail fast instead of polling for the full window when containerd is gone.
  if ! kill -0 "$CONTAINERD_PID" 2>/dev/null; then
    echo "ERROR: containerd exited during startup" >&2
    exit 1
  fi
  sleep 1
  startup_attempt=$((startup_attempt + 1))
done

# Final probe: covers both the "loop ran out" case and the last sleep.
if ! ctr version >/dev/null 2>&1; then
  echo "ERROR: containerd not responsive after 30s" >&2
  exit 1
fi
echo "containerd is running (pid $CONTAINERD_PID)"

# ---- Import MCP image if not already present ----

# Succeeds when `ctr images check` in the "default" namespace prints a line
# containing ${MCP_IMAGE}. The reference is matched as a fixed string (-F) so
# that regex metacharacters in it (e.g. the dots in "docker.io") are literal,
# and `--` keeps a reference starting with "-" from being read as an option.
mcp_image_present() {
  ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null \
    | grep -qF -- "${MCP_IMAGE}"
}

if mcp_image_present; then
  echo "MCP image already present: ${MCP_IMAGE}"
else
  echo "Importing MCP image into containerd..."
  for tar in /opt/images/*.tar; do
    # An unmatched glob is left as the literal pattern; -f filters it out.
    [ -f "$tar" ] || continue
    # Import stays best-effort, but a failure is reported instead of hidden.
    ctr -n default images import --all-platforms "$tar" 2>&1 \
      || echo "WARNING: failed to import $tar" >&2
  done

  if mcp_image_present; then
    echo "MCP image ready: ${MCP_IMAGE}"
  else
    echo "WARNING: MCP image not available after import, will try pull at runtime" >&2
  fi
fi

echo "containerd is ready, starting memoh-server..."

# ---- Start server (foreground, trap signals for graceful shutdown) ----
# Runs memoh-server as a background job and waits on it so that TERM/INT can
# be handled by the shell. When the server ends, containerd is stopped and the
# script exits with the status recorded from the wait.

# Defined before the trap is installed so the handler never sees it unset.
SERVER_PID=""

# TERM/INT handler: signal the server (if it was started) and containerd, then
# wait for all background jobs. Each kill is best-effort so that, under
# `set -e`, an already-gone process cannot abort the handler half-way.
graceful_shutdown() {
  echo "Shutting down..."
  if [ -n "$SERVER_PID" ]; then
    kill "$SERVER_PID" 2>/dev/null || true
  fi
  kill "$CONTAINERD_PID" 2>/dev/null || true
  wait
}
trap graceful_shutdown TERM INT

/app/memoh-server serve &
SERVER_PID=$!

# Capture the status through `||` so a non-zero result does not trip `set -e`
# and skip the containerd shutdown below. If the wait is cut short by a
# trapped signal, the recorded status is the wait's own (>128).
EXIT_CODE=0
wait "$SERVER_PID" || EXIT_CODE=$?

kill "$CONTAINERD_PID" 2>/dev/null || true
wait "$CONTAINERD_PID" 2>/dev/null || true

exit "$EXIT_CODE"