From d5b410d7e39ae1c52b10eb787ab0f4d36c69b895 Mon Sep 17 00:00:00 2001 From: Menci Date: Wed, 18 Mar 2026 15:19:09 +0800 Subject: [PATCH] refactor(workspace): new workspace v3 container architecture (#244) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(mcp): workspace container with bridge architecture Migrate MCP containers to use UDS-based bridge communication instead of TCP gRPC. Containers now mount runtime binaries and Unix domain sockets from the host, eliminating the need for a dedicated MCP Docker image. - Remove Dockerfile.mcp and entrypoint.sh in favor of standard base images - Add toolkit Dockerfile for building MCP binary separately - Containers use bind mounts for /opt/memoh (runtime) and /run/memoh (UDS) - Update all config files with new runtime_path and socket_dir settings - Support custom base images per bot (debian, alpine, ubuntu, etc.) - Legacy container detection and TCP fallback for pre-bridge containers - Frontend: add base image selector in container creation UI * feat(container): SSE progress bar for container creation Add real-time progress feedback during container image pull and creation using Server-Sent Events, without breaking the existing synchronous JSON API (content negotiation via Accept header). 
Backend: - Add PullProgress/LayerStatus types and OnProgress callback to PullImageOptions (containerd service layer) - DefaultService.PullImage polls ContentStore.ListStatuses every 500ms when OnProgress is set; AppleService ignores it - CreateContainer handler checks Accept: text/event-stream and switches to SSE branch: pulling → pull_progress → creating → complete/error Frontend: - handleCreateContainer/handleRecreateContainer use fetch + SSE instead of the SDK's synchronous postBotsByBotIdContainer - Progress bar shows layer-level pull progress (offset/total) during pulling phase and indeterminate animation during creating phase - i18n keys added for pullingImage and creatingContainer (en/zh) * fix(container): clear stale legacy route and type create SSE * fix(ci): resolve lint errors and arm64 musl node.js download - Fix unused-receiver lint: rename `s` to `_` on stub methods in manager_legacy_test.go - Fix sloglint: use slog.DiscardHandler instead of slog.NewTextHandler(io.Discard, nil) - Handle missing arm64 musl Node.js builds: unofficial-builds.nodejs.org does not provide arm64 musl binaries, fall back to glibc build * fix(lint): address errcheck, staticcheck, and gosec findings - Discard os.Setenv/os.Remove return values explicitly with _ - Use omitted receiver name instead of _ (staticcheck ST1006) - Tighten directory permissions from 0o755 to 0o750 (gosec G301) * fix(lint): sanitize socket path to satisfy gosec G703 filepath.Clean the env-sourced socket path before os.Remove to avoid path-traversal taint warning. * fix(lint): use nolint directive for gosec G703 on socket path filepath.Clean does not satisfy gosec's taint analysis. The socket path comes from MCP_SOCKET_PATH env (operator-configured) or a compiled-in default, not from end-user input. 
* refactor: rename MCP container/bridge to workspace/bridge Split internal/mcp/ to separate container lifecycle management from Model Context Protocol connections, eliminating naming confusion: - internal/mcp/ (container mgmt) → internal/workspace/ - internal/mcp/mcpclient/ → internal/workspace/bridge/ - internal/mcp/mcpcontainer/ → internal/workspace/bridgepb/ - cmd/mcp/ → cmd/bridge/ - config: MCPConfig → WorkspaceConfig, [mcp] → [workspace] - container prefix: mcp-{id} → workspace-{id} - labels: mcp.bot_id → memoh.bot_id, add memoh.workspace=v1 - socket: mcp.sock → bridge.sock, env BRIDGE_SOCKET_PATH - runtime: /opt/memoh/runtime/mcp → /opt/memoh/runtime/bridge - devenv: mcp-build.sh → bridge-build.sh Legacy containers (mcp- prefix) detected by container name prefix and handled via existing fallback path. * fix(container): use memoh.workspace=v3 label value * refactor(container): drop LegacyBotLabelKey, infer bot ID from container name Legacy containers use mcp-{botID} naming, so bot ID can be derived via TrimPrefix instead of looking up the mcp.bot_id label. 
* fix(workspace): resolve containers via manager and drop gateway container ID * docs: fix stale mcp references in AGENTS.md and DEPLOYMENT.md * refactor(workspace): move container lifecycle ownership into manager * dev: isolate local devenv from prod config * toolkit: support musl node runtime * containerd: fix fallback resolv.conf permissions * web: preserve container create progress on completion * web: add bot creation wait hint * fix(workspace): preserve image selection across recreate * feat(web): shorten default docker hub image refs * fix(container): address code review findings - Remove synchronous CreateContainer path (SSE-only now) - Move flusher check before WriteHeader to avoid committed 200 on error - Fix legacy container IP not cached via ensureContainerAndTask path - Add atomic guard to prevent stale pull_progress after PullImage returns - Defensive copy for tzEnv slice to avoid mutating shared backing array - Restore network failure severity in restartContainer (return + Error) - Extract duplicate progress bar into ContainerCreateProgress component - Fix codesync comments to use repo-relative paths - Add SaaS image validation note and kernel version comment on reaper * refactor(devenv): extract toolkit install into shared script Unify the Node.js + uv download logic into docker/toolkit/install.sh, used by the production Dockerfile and runnable locally for dev. Dev environment no longer bakes toolkit into the Docker image — it is volume-mounted from .toolkit/ instead, so wrapper script changes take effect immediately without rebuilding. The entrypoint checks for the toolkit directory and prints a clear error if missing. * fix(ci): address go ci failures * chore(docker): remove unused containerd image * refactor(config): rename workspace image key * fix(workspace): fix legacy container data loss on migration and stop swallowing errors Three root causes were identified and fixed: 1. 
Delete() used hardcoded "workspace-" prefix to look up legacy "mcp-" containers, causing GetContainer to return NotFound. CleanupBotContainer then silently skipped the error and deleted the DB record without ever calling PreserveData. Fix: resolve the actual container ID via ContainerID() (DB → label → scan) before operating. 2. Multiple restore error paths were silently swallowed (logged as Warn but not returned), so the user saw HTTP 200/204 with no data and no error. Fix: all errors in the preserve/restore chain now block the workflow and propagate to the caller. 3. tarGzDir used cached DirEntry.Info() for tar header size, which on overlayfs can differ from the actual file size, causing "archive/tar: write too long". Fix: open the file first, Fstat the fd for a race-free size, and use LimitReader as a safeguard. Also adds a "restoring" SSE phase so the frontend shows a progress indicator ("Restoring data, this may take a while...") during data migration on container recreation. * refactor(workspace): single-point container ID resolution Replace the `containerID func(string) string` field with a single `resolveContainerID(ctx, botID)` method that resolves the actual container ID via DB → label → scan → fallback. All ~16 lookup callsites across manager.go, dataio.go, versioning.go, and manager_lifecycle.go now go through this single resolver, which correctly handles both legacy "mcp-" and new "workspace-" containers. Only `ensureBotWithImage` inlines `ContainerPrefix + botID` for creating brand-new containers — every other path resolves dynamically. * fix(web): show progress during data backup phase of container recreate The recreate flow (delete with preserve_data + create with restore_data) blocked on the DELETE call while backing up /data with no progress indication. Add a 'preserving' phase to the progress component so users see "正在备份数据..." instead of an unexplained hang. 
* chore: remove [MYDEBUG] debug logging Clean up all 112 temporary debug log statements added during the legacy container migration investigation. Kept only meaningful warn-level logs for non-fatal errors (network teardown, rename failures). --- .air.toml | 2 +- .github/workflows/docker.yml | 12 +- .gitignore | 3 +- AGENTS.md | 9 +- CONTRIBUTING.md | 10 +- DEPLOYMENT.md | 2 +- apps/agent/src/models.ts | 5 +- apps/web/AGENTS.md | 2 +- apps/web/src/i18n/locales/en.json | 10 + apps/web/src/i18n/locales/zh.json | 10 + .../pages/bots/components/bot-container.vue | 229 ++++++- .../components/container-create-progress.vue | 48 ++ .../src/pages/bots/components/create-bot.vue | 3 + apps/web/src/utils/image-ref.test.ts | 22 + apps/web/src/utils/image-ref.ts | 8 + apps/web/vite.config.ts | 9 +- cmd/agent/main.go | 37 +- cmd/{mcp => bridge}/main.go | 40 +- cmd/{mcp => bridge}/server.go | 2 +- cmd/{mcp => bridge}/template/HEARTBEAT.md | 0 cmd/{mcp => bridge}/template/IDENTITY.md | 0 cmd/{mcp => bridge}/template/MEMORY.md | 0 cmd/{mcp => bridge}/template/PROFILES.md | 0 cmd/{mcp => bridge}/template/SOUL.md | 0 cmd/{mcp => bridge}/template/TOOLS.md | 0 cmd/mcp/entrypoint.sh | 5 - cmd/memoh/serve.go | 35 +- conf/app.apple.toml | 6 +- conf/app.docker.toml | 5 +- conf/app.example.toml | 4 +- conf/app.windows.toml | 4 +- devenv/Dockerfile.server | 30 +- devenv/app.dev.toml | 5 +- devenv/bridge-build.sh | 26 + devenv/docker-compose.yml | 32 +- devenv/mcp-build.sh | 70 --- devenv/server-entrypoint.sh | 16 +- docker/Dockerfile.containerd | 86 --- docker/Dockerfile.mcp | 43 -- docker/Dockerfile.server | 63 +- docker/containerd-entrypoint.sh | 43 -- docker/server-entrypoint.sh | 19 - docker/toolkit/bin/node | 9 + docker/toolkit/bin/npm | 10 + docker/toolkit/bin/npx | 10 + docker/toolkit/bin/uv | 2 + docker/toolkit/bin/uvx | 2 + docker/toolkit/install.sh | 132 ++++ docs/docs/installation/docker.md | 2 +- internal/config/config.go | 61 +- internal/config/config_test.go | 59 ++ 
internal/containerd/resolv.go | 25 +- internal/containerd/resolv_test.go | 101 +++ internal/containerd/service.go | 34 + internal/containerd/types.go | 10 + internal/conversation/flow/resolver.go | 23 - internal/conversation/flow/resolver_test.go | 1 - internal/conversation/types.go | 1 - internal/handlers/containerd.go | 579 +++++------------- internal/handlers/containerd_terminal.go | 2 +- internal/handlers/filemanager.go | 12 +- internal/handlers/mcp_federation_gateway.go | 8 +- internal/handlers/mcp_stdio.go | 20 +- internal/handlers/memory.go | 6 +- internal/handlers/skills.go | 4 +- internal/mcp/manager.go | 409 ------------- internal/mcp/providers/browser/provider.go | 6 +- internal/mcp/providers/container/provider.go | 16 +- .../mcp/providers/container/provider_test.go | 16 +- internal/memory/storefs/service.go | 10 +- .../storage/providers/containerfs/provider.go | 6 +- .../mcpclient => workspace/bridge}/client.go | 42 +- .../bridge}/client_test.go | 4 +- .../mcpclient => workspace/bridge}/errors.go | 2 +- .../bridgepb/bridge.pb.go} | 4 +- .../bridgepb/bridge.proto} | 4 +- .../bridgepb/bridge_grpc.pb.go} | 6 +- internal/{mcp => workspace}/dataio.go | 79 +-- internal/workspace/identity.go | 34 + internal/workspace/identity_test.go | 52 ++ internal/workspace/image_preference.go | 176 ++++++ internal/workspace/image_preference_test.go | 53 ++ internal/workspace/manager.go | 435 +++++++++++++ internal/workspace/manager_legacy_test.go | 292 +++++++++ internal/workspace/manager_lifecycle.go | 515 ++++++++++++++++ internal/{mcp => workspace}/versioning.go | 50 +- mise.toml | 9 +- packages/agent/src/agent.ts | 1 - packages/agent/src/types/agent.ts | 5 +- packages/config/src/index.ts | 10 +- packages/config/src/types.ts | 11 +- packages/sdk/package.json | 3 +- packages/sdk/src/container-stream.ts | 105 ++++ packages/sdk/src/extra/index.ts | 10 + packages/sdk/src/types.gen.ts | 2 + spec/docs.go | 6 + spec/swagger.json | 6 + spec/swagger.yaml | 4 + 98 files changed, 
2979 insertions(+), 1472 deletions(-) create mode 100644 apps/web/src/pages/bots/components/container-create-progress.vue create mode 100644 apps/web/src/utils/image-ref.test.ts create mode 100644 apps/web/src/utils/image-ref.ts rename cmd/{mcp => bridge}/main.go (54%) rename cmd/{mcp => bridge}/server.go (99%) rename cmd/{mcp => bridge}/template/HEARTBEAT.md (100%) rename cmd/{mcp => bridge}/template/IDENTITY.md (100%) rename cmd/{mcp => bridge}/template/MEMORY.md (100%) rename cmd/{mcp => bridge}/template/PROFILES.md (100%) rename cmd/{mcp => bridge}/template/SOUL.md (100%) rename cmd/{mcp => bridge}/template/TOOLS.md (100%) delete mode 100644 cmd/mcp/entrypoint.sh create mode 100755 devenv/bridge-build.sh delete mode 100755 devenv/mcp-build.sh mode change 100644 => 100755 devenv/server-entrypoint.sh delete mode 100644 docker/Dockerfile.containerd delete mode 100644 docker/Dockerfile.mcp delete mode 100644 docker/containerd-entrypoint.sh create mode 100755 docker/toolkit/bin/node create mode 100755 docker/toolkit/bin/npm create mode 100755 docker/toolkit/bin/npx create mode 100755 docker/toolkit/bin/uv create mode 100755 docker/toolkit/bin/uvx create mode 100755 docker/toolkit/install.sh create mode 100644 internal/config/config_test.go create mode 100644 internal/containerd/resolv_test.go delete mode 100644 internal/mcp/manager.go rename internal/{mcp/mcpclient => workspace/bridge}/client.go (90%) rename internal/{mcp/mcpclient => workspace/bridge}/client_test.go (97%) rename internal/{mcp/mcpclient => workspace/bridge}/errors.go (98%) rename internal/{mcp/mcpcontainer/mcpcontainer.pb.go => workspace/bridgepb/bridge.pb.go} (99%) rename internal/{mcp/mcpcontainer/mcpcontainer.proto => workspace/bridgepb/bridge.proto} (95%) rename internal/{mcp/mcpcontainer/mcpcontainer_grpc.pb.go => workspace/bridgepb/bridge_grpc.pb.go} (99%) rename internal/{mcp => workspace}/dataio.go (92%) create mode 100644 internal/workspace/identity.go create mode 100644 
internal/workspace/identity_test.go create mode 100644 internal/workspace/image_preference.go create mode 100644 internal/workspace/image_preference_test.go create mode 100644 internal/workspace/manager.go create mode 100644 internal/workspace/manager_legacy_test.go create mode 100644 internal/workspace/manager_lifecycle.go rename internal/{mcp => workspace}/versioning.go (93%) create mode 100644 packages/sdk/src/container-stream.ts create mode 100644 packages/sdk/src/extra/index.ts diff --git a/.air.toml b/.air.toml index 44899831..9bd18b43 100644 --- a/.air.toml +++ b/.air.toml @@ -2,7 +2,7 @@ root = "." tmp_dir = "tmp" [build] -cmd = "go build -o ./tmp/memoh-server ./cmd/agent/main.go && sh devenv/mcp-build.sh" +cmd = "go build -o ./tmp/memoh-server ./cmd/agent/main.go && sh devenv/bridge-build.sh" bin = "./tmp/memoh-server" args_bin = ["serve"] include_ext = ["go", "toml"] diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 254f3b32..fff2cb83 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - image: [server, agent, web, mcp, browser, sparse] + image: [server, agent, web, browser, sparse] platform: [linux/amd64, linux/arm64] include: - image: server @@ -45,8 +45,6 @@ jobs: dockerfile: docker/Dockerfile.agent - image: web dockerfile: docker/Dockerfile.web - - image: mcp - dockerfile: docker/Dockerfile.mcp - image: browser dockerfile: docker/Dockerfile.browser - image: sparse @@ -64,14 +62,14 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Set up Go - if: matrix.image == 'server' || matrix.image == 'mcp' + if: matrix.image == 'server' uses: actions/setup-go@v5 with: go-version: '1.25' cache: true - name: Pre-warm Go mod cache - if: matrix.image == 'server' || matrix.image == 'mcp' + if: matrix.image == 'server' run: | mkdir -p .go-cache GOMODCACHE=$(pwd)/.go-cache go mod download @@ -99,7 +97,7 @@ jobs: file: ${{ matrix.dockerfile }} platforms: 
${{ matrix.platform }} outputs: ${{ env.PUSH == 'true' && format('type=image,"name={0}/{1}/{2}",push-by-digest=true,name-canonical=true,push=true,compression=zstd', env.REGISTRY, github.repository_owner, matrix.image) || '' }} - build-contexts: ${{ (matrix.image == 'server' || matrix.image == 'mcp') && format('gomodcache={0}/.go-cache', github.workspace) || '' }} + build-contexts: ${{ matrix.image == 'server' && format('gomodcache={0}/.go-cache', github.workspace) || '' }} build-args: | VERSION=${{ github.ref_name }} COMMIT_HASH=${{ github.sha }} @@ -134,7 +132,7 @@ jobs: needs: build strategy: matrix: - image: [server, agent, web, mcp, browser, sparse] + image: [server, agent, web, browser, sparse] steps: - name: Download digests uses: actions/download-artifact@v4 diff --git a/.gitignore b/.gitignore index 215184b9..2a5be4ee 100644 --- a/.gitignore +++ b/.gitignore @@ -93,7 +93,7 @@ tmp/ # compiled files /memoh /agent -/mcp +/bridge docs/docs/.vitepress/cache .pnpm-store @@ -106,3 +106,4 @@ config.toml .workdocs/ data _main-ref/ +.toolkit/ diff --git a/AGENTS.md b/AGENTS.md index 00166f62..9728a740 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -60,7 +60,7 @@ Infrastructure dependencies: Memoh/ ├── cmd/ # Go application entry points │ ├── agent/ # Main backend server (main.go) -│ ├── mcp/ # MCP server binary (stdio transport, template/, entrypoint.sh) +│ ├── bridge/ # Bridge server binary (in-container gRPC, template/) │ └── memoh/ # Unified binary wrapper (Cobra CLI) ├── internal/ # Go backend core code (domain packages) │ ├── accounts/ # User account management (CRUD, password hashing) @@ -86,7 +86,8 @@ Memoh/ │ ├── identity/ # Identity type utilities (human vs bot) │ ├── inbox/ # Bot inbox service (notifications, triggers) │ ├── logger/ # Structured logging (slog) -│ ├── mcp/ # MCP protocol manager (container lifecycle, tool gateway) +│ ├── mcp/ # MCP protocol manager (connections, OAuth, tool gateway) +│ ├── workspace/ # Workspace container lifecycle (bridge 
client, protobuf) │ ├── media/ # Content-addressed media asset service │ ├── memory/ # Long-term memory system (Qdrant, BM25, LLM extraction) │ ├── message/ # Message persistence and event publishing @@ -141,7 +142,7 @@ Memoh/ │ ├── migrations/ # SQL migration files │ └── queries/ # SQL query files (sqlc input) ├── conf/ # Configuration templates (app.example.toml, app.docker.toml, app.apple.toml, app.windows.toml) -├── devenv/ # Dev environment (docker-compose, dev Dockerfiles, app.dev.toml, mcp-build.sh, server-entrypoint.sh) +├── devenv/ # Dev environment (docker-compose, dev Dockerfiles, app.dev.toml, bridge-build.sh, server-entrypoint.sh) ├── docker/ # Production Docker (Dockerfiles, entrypoints, nginx.conf, docker-compose.yml, docker-compose.cn.yml) ├── docs/ # Documentation site ├── scripts/ # Utility scripts (db, release, install) @@ -291,7 +292,7 @@ The main configuration file is `config.toml` (copied from `conf/app.example.toml - `[admin]` — Admin account credentials - `[auth]` — JWT authentication settings - `[containerd]` — Container runtime configuration (socket path, namespace) -- `[mcp]` — MCP image and data configuration +- `[workspace]` — Workspace image and data configuration - `[postgres]` — PostgreSQL connection - `[qdrant]` — Qdrant vector database connection - `[agent_gateway]` — Agent Gateway address diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 539b41b1..c5c8a8b3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,8 @@ winget install jdx.mise ```bash mise install # Install toolchains (Go, Node, Bun, pnpm, sqlc) -mise run setup # Copy config + install deps +./docker/toolkit/install.sh # Install toolkit used by the nested workspace runtime +mise run setup # Install deps and prepare local tooling mise run dev # Start full containerized dev environment ``` @@ -32,6 +33,9 @@ That's it. `dev` launches everything in Docker containers: 4. Agent Gateway (Bun, hot-reload) 5. 
Web frontend (Vite, hot-reload) +The dev stack uses `devenv/app.dev.toml` directly and no longer overwrites the repo root `config.toml`. +Default host ports are shifted away from the production compose stack: Web `18082`, API `18080`, Agent `18081`, Postgres `15432`, Qdrant `16333`/`16334`, Sparse `18085`. + ## Daily Development ```bash @@ -49,7 +53,7 @@ mise run dev:restart -- server # Restart a specific service | `mise run dev:down` | Stop dev environment | | `mise run dev:logs` | View dev logs | | `mise run dev:restart` | Restart a service (e.g. `-- server`) | -| `mise run setup` | Copy config + install deps | +| `mise run setup` | Install deps and prepare local tooling | | `mise run db-up` | Run database migrations | | `mise run db-down` | Roll back database migrations | | `mise run swagger-generate` | Generate Swagger documentation | @@ -60,7 +64,7 @@ mise run dev:restart -- server # Restart a specific service ``` conf/ — Configuration templates (app.example.toml, app.docker.toml) -devenv/ — Dev environment (docker-compose, dev Dockerfiles, app.dev.toml, mcp-build.sh) +devenv/ — Dev environment (docker-compose, dev Dockerfiles, app.dev.toml, bridge-build.sh) docker/ — Production Docker build & runtime (Dockerfiles, entrypoints) cmd/ — Go application entry points internal/ — Go backend core code diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md index 04d4473a..5d54d4fd 100644 --- a/DEPLOYMENT.md +++ b/DEPLOYMENT.md @@ -66,7 +66,7 @@ For Mem0 or OpenViking SaaS, no profile is needed. 
Configure the provider direct ### China Mainland Mirror -Uncomment `registry = "memoh.cn"` in `config.toml` under `[mcp]`, then add the CN overlay: +Uncomment `registry = "memoh.cn"` in `config.toml` under `[workspace]`, then add the CN overlay: ```bash sudo docker compose -f docker-compose.yml -f docker/docker-compose.cn.yml \ diff --git a/apps/agent/src/models.ts b/apps/agent/src/models.ts index d5966e9f..e6903ca4 100644 --- a/apps/agent/src/models.ts +++ b/apps/agent/src/models.ts @@ -27,9 +27,8 @@ export const ModelConfigModel = z.object({ export const IdentityContextModel = z.object({ botId: z.string().min(1, 'Bot ID is required'), - containerId: z.string().optional().default(''), - channelIdentityId: z.string().optional().default(''), - displayName: z.string().optional().default(''), + channelIdentityId: z.string().min(1, 'Channel identity ID is required'), + displayName: z.string().min(1, 'Display name is required'), currentPlatform: z.string().optional(), replyTarget: z.string().optional(), conversationType: z.string().optional(), diff --git a/apps/web/AGENTS.md b/apps/web/AGENTS.md index b532909f..818f8215 100644 --- a/apps/web/AGENTS.md +++ b/apps/web/AGENTS.md @@ -319,7 +319,7 @@ Chat responses are streamed via Server-Sent Events: - Dev server port: 8082 (from `config.toml`) - Proxy: `/api` → backend (default `http://localhost:8080`) - Aliases: `@` → `./src`, `#` → `../ui/src` -- Config: reads from `../../config.toml` via `@memoh/config` +- Config: reads from `MEMOH_CONFIG_PATH` / `CONFIG_PATH` when provided, otherwise `../../config.toml`, via `@memoh/config` ## Development Rules diff --git a/apps/web/src/i18n/locales/en.json b/apps/web/src/i18n/locales/en.json index a3d3dc5d..5439e3af 100644 --- a/apps/web/src/i18n/locales/en.json +++ b/apps/web/src/i18n/locales/en.json @@ -486,6 +486,7 @@ "title": "Bots", "searchPlaceholder": "Search bots…", "createBot": "New Bot", + "createBotWaitHint": "Creating a bot may need to pull the base image on first use. 
Please wait a moment after submission.", "editBot": "Edit Bot", "deleteConfirm": "Are you sure you want to delete this bot?", "renameSuccess": "Bot name updated", @@ -636,9 +637,14 @@ "subtitle": "Manage the runtime container attached to this bot.", "botNotReady": "This bot is in lifecycle transition. Container actions are temporarily disabled.", "empty": "No container found for this bot. Create one to enable runtime tooling.", + "legacyWarning": "This container uses an older architecture and needs to be recreated for full compatibility. Your data will be preserved automatically.", + "legacyRecreate": "Recreate Container", + "legacyRecreateSuccess": "Container recreated successfully", "createHint": "The container is created from the current image. If you explicitly enable restore, preserved data will be restored after creation.", "createRestoreDataLabel": "Restore preserved data after creation", "createRestoreDataDescription": "If a previously exported backup or legacy bind-mounted data exists, it will be restored into `/data` after the container is created.", + "createImageLabel": "Base image", + "createImageDescription": "Docker image to use as the container base (e.g. debian:bookworm-slim, alpine:latest, ubuntu:24.04). Leave empty for the default.", "deleteConfirm": "Are you sure you want to permanently delete this container? 
Unpreserved data cannot be recovered.", "deletePreserveConfirm": "Are you sure you want to export `/data` and then delete this container?", "restoreConfirm": "Are you sure you want to restore preserved data into this container's `/data`?", @@ -664,6 +670,10 @@ "importSuccess": "Data imported", "restoreSuccess": "Preserved data restored", "rollbackSuccess": "Snapshot rolled back", + "pullingImage": "Pulling image...", + "creatingContainer": "Creating container...", + "preservingData": "Backing up data, this may take a while for large volumes...", + "restoringData": "Restoring data, this may take a while for large volumes...", "snapshotEmpty": "No snapshots found", "snapshotLoadFailed": "Failed to load snapshots", "snapshotNamePlaceholder": "Snapshot display name (optional)", diff --git a/apps/web/src/i18n/locales/zh.json b/apps/web/src/i18n/locales/zh.json index b2704c18..a4b95216 100644 --- a/apps/web/src/i18n/locales/zh.json +++ b/apps/web/src/i18n/locales/zh.json @@ -482,6 +482,7 @@ "title": "Bots", "searchPlaceholder": "搜索 Bot…", "createBot": "新建 Bot", + "createBotWaitHint": "首次创建时可能需要拉取基础镜像,提交后请耐心等待片刻。", "editBot": "编辑 Bot", "deleteConfirm": "确定要删除这个 Bot 吗?", "renameSuccess": "Bot 名称已更新", @@ -632,9 +633,14 @@ "subtitle": "管理当前 Bot 对应的运行容器。", "botNotReady": "当前 Bot 正在生命周期变更中,暂时不可操作容器。", "empty": "当前 Bot 尚未创建容器,创建后可启用运行环境能力。", + "legacyWarning": "当前容器使用旧版架构,需要重建以获得完整兼容性。重建时数据会自动保留。", + "legacyRecreate": "重建容器", + "legacyRecreateSuccess": "容器重建成功", "createHint": "容器会基于当前镜像创建;如你显式开启恢复,则会在创建后尝试恢复已保留的数据。", "createRestoreDataLabel": "创建后恢复已保留数据", "createRestoreDataDescription": "如果存在之前导出的备份或旧版 bind mount 数据,将在容器创建后恢复到 `/data`。", + "createImageLabel": "基础镜像", + "createImageDescription": "作为容器基础环境的 Docker 镜像(如 debian:bookworm-slim、alpine:latest、ubuntu:24.04)。留空则使用默认镜像。", "deleteConfirm": "确定要彻底删除这个容器吗?未保留的数据将无法恢复。", "deletePreserveConfirm": "确定要先导出 `/data` 再删除这个容器吗?", "restoreConfirm": "确定要将已保留的数据恢复到当前容器的 `/data` 吗?", @@ -660,6 +666,10 @@ "importSuccess": "数据导入成功", 
"restoreSuccess": "已恢复保留数据", "rollbackSuccess": "快照回滚成功", + "pullingImage": "正在拉取镜像...", + "creatingContainer": "正在创建容器...", + "preservingData": "正在备份数据,数据量较大时可能需要一段时间...", + "restoringData": "正在迁移数据,数据量较大时可能需要一段时间...", "snapshotEmpty": "暂无快照", "snapshotLoadFailed": "加载快照失败", "snapshotNamePlaceholder": "快照显示名称(可选)", diff --git a/apps/web/src/pages/bots/components/bot-container.vue b/apps/web/src/pages/bots/components/bot-container.vue index 1575ba94..e0bcee37 100644 --- a/apps/web/src/pages/bots/components/bot-container.vue +++ b/apps/web/src/pages/bots/components/bot-container.vue @@ -9,7 +9,6 @@ import { getBotsByBotIdContainer, getBotsByBotIdContainerSnapshots, getBotsById, - postBotsByBotIdContainer, postBotsByBotIdContainerDataExport, postBotsByBotIdContainerDataImport, postBotsByBotIdContainerDataRestore, @@ -17,15 +16,23 @@ import { postBotsByBotIdContainerSnapshotsRollback, postBotsByBotIdContainerStart, postBotsByBotIdContainerStop, + type HandlersCreateContainerRequest, type HandlersGetContainerResponse, type HandlersListSnapshotsResponse, } from '@memoh/sdk' +import { + postBotsByBotIdContainerStream, + type ContainerCreateLayerStatus, + type ContainerCreateStreamEvent, +} from '@memoh/sdk/extra' import { Button, Input, Label, Separator, Spinner, Switch } from '@memoh/ui' import ConfirmPopover from '@/components/confirm-popover/index.vue' +import ContainerCreateProgress from './container-create-progress.vue' import { useSyncedQueryParam } from '@/composables/useSyncedQueryParam' import { useBotStatusMeta } from '@/composables/useBotStatusMeta' import { useCapabilitiesStore } from '@/store/capabilities' import { formatDateTime } from '@/utils/date-time' +import { shortenImageRef } from '@/utils/image-ref' import { resolveApiErrorMessage } from '@/utils/api-error' const route = useRoute() @@ -43,15 +50,38 @@ type ContainerAction = | 'import' | 'restore' | 'rollback' + | 'recreate' | '' const containerLoading = ref(false) const containerAction = ref('') 
const rollbackVersion = ref(null) const createRestoreData = ref(false) +const createImage = ref('') +const createImagePrefilled = ref(false) const newSnapshotName = ref('') const importInputRef = ref(null) +interface CreateProgress { + phase: 'preserving' | 'pulling' | 'creating' | 'restoring' | 'complete' | 'error' + layers?: ContainerCreateLayerStatus[] + image?: string + error?: string +} +const createProgress = ref(null) + +const createProgressPercent = computed(() => { + const layers = createProgress.value?.layers + if (!layers || layers.length === 0) return 0 + let totalOffset = 0 + let totalSize = 0 + for (const l of layers) { + totalOffset += l.offset + totalSize += l.total + } + return totalSize > 0 ? Math.round((totalOffset / totalSize) * 100) : 0 +}) + const capabilitiesStore = useCapabilitiesStore() const botId = computed(() => route.params.botId as string) const containerBusy = computed(() => containerLoading.value || containerAction.value !== '') @@ -157,29 +187,88 @@ const { data: bot } = useQuery({ enabled: () => !!botId.value, }) +function rememberedWorkspaceImage(metadata: Record | undefined): string { + const workspace = metadata?.workspace + if (!workspace || typeof workspace !== 'object' || Array.isArray(workspace)) return '' + const image = (workspace as Record).image + return typeof image === 'string' ? 
shortenImageRef(image) : '' +} + +const rememberedCreateImage = computed(() => rememberedWorkspaceImage(bot.value?.metadata as Record | undefined)) +const displayedContainerImage = computed(() => shortenImageRef(containerInfo.value?.image)) + const { isPending: botLifecyclePending } = useBotStatusMeta(bot, t) +function applyCreateContainerEvent(event: ContainerCreateStreamEvent): boolean { + switch (event.type) { + case 'pulling': + createProgress.value = { phase: 'pulling', image: event.image } + return false + case 'pull_progress': + createProgress.value = { + phase: 'pulling', + image: createProgress.value?.image, + layers: event.layers, + } + return false + case 'creating': + createProgress.value = { phase: 'creating' } + return false + case 'restoring': + createProgress.value = { phase: 'restoring' } + return false + case 'complete': + // Keep the last visible progress state until the container detail view loads. + // Rendering a separate "complete" phase here looks like the bar jumped back to 0. 
+ return !!event.container.data_restored + case 'error': + createProgress.value = { phase: 'error', error: event.message } + throw new Error(event.message || 'Unknown error') + } +} + +async function createContainerSSE(body: HandlersCreateContainerRequest): Promise<{ dataRestored: boolean }> { + const { stream } = await postBotsByBotIdContainerStream({ + path: { bot_id: botId.value }, + body, + throwOnError: true, + }) + + let dataRestored = false + for await (const event of stream) { + dataRestored = applyCreateContainerEvent(event) || dataRestored + } + + return { dataRestored } +} + async function handleCreateContainer() { if (botLifecyclePending.value) return - await runContainerAction( - 'create', - async () => { - const { data } = await postBotsByBotIdContainer({ - path: { bot_id: botId.value }, - body: { - restore_data: createRestoreData.value, - }, - throwOnError: true, - }) - createRestoreData.value = false - await loadContainerData(false) - return data - }, - (result) => result.data_restored + containerAction.value = 'create' + createProgress.value = { phase: 'pulling' } + try { + const body: HandlersCreateContainerRequest = { + restore_data: createRestoreData.value, + } + const trimmedImage = createImage.value.trim() + if (trimmedImage) body.image = trimmedImage + + const { dataRestored } = await createContainerSSE(body) + createRestoreData.value = false + createImage.value = '' + await loadContainerData(false) + toast.success(dataRestored ? 
t('bots.container.createRestoreSuccess') - : t('bots.container.createSuccess'), - ) + : t('bots.container.createSuccess')) + } + catch (error) { + toast.error(resolveErrorMessage(error, t('bots.container.actionFailed'))) + } + finally { + containerAction.value = '' + createProgress.value = null + } } const isContainerTaskRunning = computed(() => { @@ -192,6 +281,33 @@ const isContainerTaskRunning = computed(() => { }) const hasPreservedData = computed(() => !!containerInfo.value?.has_preserved_data) +const isLegacy = computed(() => !!containerInfo.value?.legacy) + +async function handleRecreateContainer() { + if (botLifecyclePending.value || !containerInfo.value) return + + containerAction.value = 'recreate' + try { + createProgress.value = { phase: 'preserving' } + await deleteBotsByBotIdContainer({ + path: { bot_id: botId.value }, + query: { preserve_data: true }, + throwOnError: true, + }) + + createProgress.value = { phase: 'pulling' } + await createContainerSSE({ restore_data: true }) + await loadContainerData(false) + toast.success(t('bots.container.legacyRecreateSuccess')) + } + catch (error) { + toast.error(resolveErrorMessage(error, t('bots.container.actionFailed'))) + } + finally { + containerAction.value = '' + createProgress.value = null + } +} async function handleStopContainer() { if (botLifecyclePending.value || !containerInfo.value) return @@ -226,6 +342,7 @@ async function handleDeleteContainer(preserveData: boolean) { const successMessage = preserveData ? 
t('bots.container.deletePreserveSuccess') : t('bots.container.deleteSuccess') + const lastImage = shortenImageRef(containerInfo.value.image) await runContainerAction( action, @@ -239,6 +356,8 @@ async function handleDeleteContainer(preserveData: boolean) { containerMissing.value = true snapshots.value = [] createRestoreData.value = preserveData + createImage.value = lastImage + createImagePrefilled.value = !!lastImage }, successMessage, ) @@ -445,6 +564,19 @@ const sortedSnapshots = computed(() => { const activeTab = useSyncedQueryParam('tab', 'overview') +watch(containerMissing, (missing) => { + if (!missing) { + createImagePrefilled.value = false + } +}) + +watch([containerMissing, rememberedCreateImage], ([missing, remembered]) => { + if (!missing || createImagePrefilled.value) return + if (!remembered || createImage.value.trim()) return + createImage.value = remembered + createImagePrefilled.value = true +}, { immediate: true }) + watch([activeTab, botId], ([tab]) => { if (!botId.value) return if (tab === 'container') { @@ -540,6 +672,19 @@ watch([activeTab, botId], ([tab]) => { /> +
+ + +

+ {{ $t('bots.container.createImageDescription') }} +

+
+
+ +
+ +
@@ -559,6 +715,39 @@ watch([activeTab, botId], ([tab]) => { v-else-if="containerInfo" class="space-y-5" > +
+

+ {{ $t('bots.container.legacyWarning') }} +

+ +
+ +
+ +
+
@@ -592,7 +781,7 @@ watch([activeTab, botId], ([tab]) => { {{ $t('bots.container.fields.image') }}
- {{ containerInfo.image }} + {{ displayedContainerImage }}
@@ -969,4 +1158,4 @@ watch([activeTab, botId], ([tab]) => {
- \ No newline at end of file + diff --git a/apps/web/src/pages/bots/components/container-create-progress.vue b/apps/web/src/pages/bots/components/container-create-progress.vue new file mode 100644 index 00000000..36f99ffb --- /dev/null +++ b/apps/web/src/pages/bots/components/container-create-progress.vue @@ -0,0 +1,48 @@ + + + diff --git a/apps/web/src/pages/bots/components/create-bot.vue b/apps/web/src/pages/bots/components/create-bot.vue index 46758d88..813ade60 100644 --- a/apps/web/src/pages/bots/components/create-bot.vue +++ b/apps/web/src/pages/bots/components/create-bot.vue @@ -57,6 +57,9 @@ +
+ {{ $t('bots.createBotWaitHint') }} +
diff --git a/apps/web/src/utils/image-ref.test.ts b/apps/web/src/utils/image-ref.test.ts new file mode 100644 index 00000000..d26e6279 --- /dev/null +++ b/apps/web/src/utils/image-ref.test.ts @@ -0,0 +1,22 @@ +import { describe, expect, it } from 'vitest' +import { shortenImageRef } from './image-ref' + +describe('shortenImageRef', () => { + it('returns empty string for missing values', () => { + expect(shortenImageRef(undefined)).toBe('') + expect(shortenImageRef(null)).toBe('') + expect(shortenImageRef('')).toBe('') + }) + + it('strips docker hub library prefix', () => { + expect(shortenImageRef('docker.io/library/nginx:latest')).toBe('nginx:latest') + }) + + it('strips docker hub registry prefix for namespaced images', () => { + expect(shortenImageRef('docker.io/memohai/memoh:latest')).toBe('memohai/memoh:latest') + }) + + it('preserves non-docker-hub registries', () => { + expect(shortenImageRef('ghcr.io/memohai/memoh:latest')).toBe('ghcr.io/memohai/memoh:latest') + }) +}) diff --git a/apps/web/src/utils/image-ref.ts b/apps/web/src/utils/image-ref.ts new file mode 100644 index 00000000..7933d80d --- /dev/null +++ b/apps/web/src/utils/image-ref.ts @@ -0,0 +1,8 @@ +// Keep this display helper aligned with internal/config.NormalizeImageRef. +export function shortenImageRef(value: string | null | undefined): string { + const ref = value?.trim() ?? '' + if (!ref) return '' + if (ref.startsWith('docker.io/library/')) return ref.slice('docker.io/library/'.length) + if (ref.startsWith('docker.io/')) return ref.slice('docker.io/'.length) + return ref +} diff --git a/apps/web/vite.config.ts b/apps/web/vite.config.ts index 2c61cd3a..b18414d9 100644 --- a/apps/web/vite.config.ts +++ b/apps/web/vite.config.ts @@ -10,10 +10,13 @@ export default defineConfig(({ command }) => { const defaultPort = 8082 const defaultHost = '127.0.0.1' const defaultApiBaseUrl = process.env.VITE_API_URL ?? 
'http://localhost:8080' + const configuredProxyTarget = process.env.MEMOH_WEB_PROXY_TARGET?.trim() + const configuredPath = process.env.MEMOH_CONFIG_PATH?.trim() || process.env.CONFIG_PATH?.trim() + const configPath = configuredPath && configuredPath.length > 0 ? configuredPath : '../../config.toml' let port = defaultPort let host = defaultHost - let baseUrl = defaultApiBaseUrl + let baseUrl = configuredProxyTarget || defaultApiBaseUrl if (command !== 'build') { try { @@ -25,13 +28,13 @@ export default defineConfig(({ command }) => { } let config try { - config = loadConfig('../../config.toml') + config = loadConfig(configPath) } catch { config = loadConfig('../../conf/app.docker.toml') } port = config.web?.port ?? defaultPort host = config.web?.host ?? defaultHost - baseUrl = getBaseUrl(config) + baseUrl = configuredProxyTarget || getBaseUrl(config) } catch { // Fall back to env/default values when config.toml is unavailable. } diff --git a/cmd/agent/main.go b/cmd/agent/main.go index 58f43a2d..b8a37202 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -90,6 +90,7 @@ import ( ttspkg "github.com/memohai/memoh/internal/tts" ttsedge "github.com/memohai/memoh/internal/tts/adapter/edge" "github.com/memohai/memoh/internal/version" + "github.com/memohai/memoh/internal/workspace" ) func migrationsFS() fs.FS { @@ -156,8 +157,8 @@ func runServe() { provideDBConn, provideDBQueries, - // container & mcp infrastructure - provideMCPManager, + // container & workspace infrastructure + provideWorkspaceManager, // memory pipeline provideMemoryLLM, @@ -336,8 +337,8 @@ func provideDBQueries(conn *pgxpool.Pool) *dbsqlc.Queries { return dbsqlc.New(conn) } -func provideMCPManager(log *slog.Logger, service ctr.Service, cfg config.Config, conn *pgxpool.Pool) *mcp.Manager { - return mcp.NewManager(log, service, cfg.MCP, cfg.Containerd.Namespace, conn) +func provideWorkspaceManager(log *slog.Logger, service ctr.Service, cfg config.Config, conn *pgxpool.Pool) *workspace.Manager { + 
return workspace.NewManager(log, service, cfg.Workspace, cfg.Containerd.Namespace, conn) } // --------------------------------------------------------------------------- @@ -353,7 +354,7 @@ func provideMemoryLLM(modelsService *models.Service, queries *dbsqlc.Queries, lo } } -func provideMemoryProviderRegistry(log *slog.Logger, chatService *conversation.Service, accountService *accounts.Service, manager *mcp.Manager, queries *dbsqlc.Queries, cfg config.Config) *memprovider.Registry { +func provideMemoryProviderRegistry(log *slog.Logger, chatService *conversation.Service, accountService *accounts.Service, manager *workspace.Manager, queries *dbsqlc.Queries, cfg config.Config) *memprovider.Registry { registry := memprovider.NewRegistry(log) fileRuntime := handlers.NewBuiltinMemoryRuntime(manager) fileStore := storefs.New(log, manager) @@ -466,7 +467,7 @@ func provideChannelRouter( heartbeatService *heartbeat.Service, queries *dbsqlc.Queries, containerdHandler *handlers.ContainerdHandler, - manager *mcp.Manager, + manager *workspace.Manager, rc *boot.RuntimeConfig, ) *inbound.ChannelInboundProcessor { adapter, ok := registry.Get(qq.Type) @@ -526,8 +527,8 @@ func provideChannelLifecycleService(channelStore *channel.Store, channelManager // containerd handler & tool gateway // --------------------------------------------------------------------------- -func provideContainerdHandler(log *slog.Logger, service ctr.Service, manager *mcp.Manager, cfg config.Config, rc *boot.RuntimeConfig, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service, queries *dbsqlc.Queries) *handlers.ContainerdHandler { - return handlers.NewContainerdHandler(log, service, manager, cfg.MCP, cfg.Containerd.Namespace, rc.ContainerBackend, botService, accountService, policyService, queries) +func provideContainerdHandler(log *slog.Logger, manager *workspace.Manager, cfg config.Config, rc *boot.RuntimeConfig, botService *bots.Service, accountService *accounts.Service, 
policyService *policy.Service) *handlers.ContainerdHandler { + return handlers.NewContainerdHandler(log, manager, cfg.Workspace, rc.ContainerBackend, botService, accountService, policyService) } func provideFederationGateway(log *slog.Logger, containerdHandler *handlers.ContainerdHandler) *handlers.MCPFederationGateway { @@ -547,7 +548,7 @@ func provideOAuthService(log *slog.Logger, queries *dbsqlc.Queries, cfg config.C return mcp.NewOAuthService(log, queries, callbackURL) } -func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, _ *conversation.Service, _ *accounts.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *mcp.Manager, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, mediaService *media.Service, inboxService *inbox.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, oauthService *mcp.OAuthService, subagentService *subagent.Service, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service) *mcp.ToolGatewayService { +func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, _ *conversation.Service, _ *accounts.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, mediaService *media.Service, inboxService *inbox.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, oauthService 
*mcp.OAuthService, subagentService *subagent.Service, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service) *mcp.ToolGatewayService { fedGateway.SetOAuthService(oauthService) var assetResolver mcpmessage.AssetResolver if mediaService != nil { @@ -581,7 +582,7 @@ func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManag // handler providers (interface adaptation / config extraction) // --------------------------------------------------------------------------- -func provideMemoryHandler(log *slog.Logger, botService *bots.Service, accountService *accounts.Service, _ config.Config, manager *mcp.Manager, memoryRegistry *memprovider.Registry, settingsService *settings.Service, _ *handlers.ContainerdHandler) *handlers.MemoryHandler { +func provideMemoryHandler(log *slog.Logger, botService *bots.Service, accountService *accounts.Service, _ config.Config, manager *workspace.Manager, memoryRegistry *memprovider.Registry, settingsService *settings.Service, _ *handlers.ContainerdHandler) *handlers.MemoryHandler { h := handlers.NewMemoryHandler(log, botService, accountService) h.SetMemoryRegistry(memoryRegistry) h.SetSettingsService(settingsService) @@ -599,7 +600,7 @@ func provideMessageHandler(log *slog.Logger, chatService *conversation.Service, return h } -func provideMediaService(log *slog.Logger, manager *mcp.Manager) *media.Service { +func provideMediaService(log *slog.Logger, manager *workspace.Manager) *media.Service { provider := containerfs.New(manager) return media.NewService(log, provider) } @@ -788,16 +789,16 @@ func startChannelManager(lc fx.Lifecycle, channelManager *channel.Manager) { }) } -func startContainerReconciliation(lc fx.Lifecycle, containerdHandler *handlers.ContainerdHandler, _ *mcp.ToolGatewayService) { +func startContainerReconciliation(lc fx.Lifecycle, manager *workspace.Manager, _ *handlers.ContainerdHandler, _ *mcp.ToolGatewayService) { 
lc.Append(fx.Hook{ OnStart: func(ctx context.Context) error { - go containerdHandler.ReconcileContainers(ctx) + go manager.ReconcileContainers(ctx) return nil }, }) } -func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, containerdHandler *handlers.ContainerdHandler, manager *mcp.Manager, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService, channelManager *channel.Manager, modelsService *models.Service) { +func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, _ *handlers.ContainerdHandler, manager *workspace.Manager, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService, channelManager *channel.Manager, modelsService *models.Service) { fmt.Printf("Starting Memoh Agent %s\n", version.GetInfo()) lc.Append(fx.Hook{ @@ -805,7 +806,7 @@ func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutd if err := ensureAdminUser(ctx, logger, queries, cfg); err != nil { return err } - botService.SetContainerLifecycle(containerdHandler) + botService.SetContainerLifecycle(manager) botService.SetContainerReachability(func(ctx context.Context, botID string) error { _, err := manager.MCPClient(ctx, botID) return err @@ -881,7 +882,7 @@ func ensureAdminUser(ctx context.Context, log *slog.Logger, queries *dbsqlc.Quer emailValue = pgtype.Text{String: email, Valid: true} } displayName := pgtype.Text{String: username, Valid: true} - dataRoot := pgtype.Text{String: cfg.MCP.DataRoot, Valid: cfg.MCP.DataRoot != ""} + dataRoot := pgtype.Text{String: cfg.Workspace.DataRoot, Valid: cfg.Workspace.DataRoot != ""} _, err = queries.CreateAccount(ctx, dbsqlc.CreateAccountParams{ UserID: user.ID, @@ -1056,9 +1057,9 @@ func (a *commandSkillLoaderAdapter) LoadSkills(ctx context.Context, botID string return 
skills, nil } -// commandContainerFSAdapter bridges mcp.Manager to command.ContainerFS. +// commandContainerFSAdapter bridges workspace.Manager to command.ContainerFS. type commandContainerFSAdapter struct { - manager *mcp.Manager + manager *workspace.Manager } func (a *commandContainerFSAdapter) ListDir(ctx context.Context, botID, dirPath string) ([]command.FSEntry, error) { diff --git a/cmd/mcp/main.go b/cmd/bridge/main.go similarity index 54% rename from cmd/mcp/main.go rename to cmd/bridge/main.go index d494211a..b8dde819 100644 --- a/cmd/mcp/main.go +++ b/cmd/bridge/main.go @@ -9,17 +9,18 @@ import ( "os/signal" "path/filepath" "syscall" + "time" "google.golang.org/grpc" "google.golang.org/grpc/reflection" "github.com/memohai/memoh/internal/logger" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const ( - defaultListenAddr = ":9090" - templateDir = "/opt/mcp-template" + defaultSocketPath = "/run/memoh/bridge.sock" + templateDir = "/opt/memoh/templates" ) // initDataDir ensures /data exists and seeds template files on first boot. @@ -60,14 +61,33 @@ func main() { initDataDir() - addr := os.Getenv("MCP_LISTEN_ADDR") - if addr == "" { - addr = defaultListenAddr - } + // Append toolkit to PATH so child processes (via /bin/sh -c) can find npx/uvx. + // Container-native tools take priority since toolkit is appended at the end. + _ = os.Setenv("PATH", os.Getenv("PATH")+":/opt/memoh/toolkit/bin") - lis, err := (&net.ListenConfig{}).Listen(ctx, "tcp", addr) + // PID 1 zombie reaping: when bridge runs as PID 1 inside a container, + // orphaned child processes become zombies unless reaped. + // On Linux 5.3+, Go's os/exec uses pidfd_open which avoids races between + // this reaper and cmd.Wait(). Kernels below 5.3 may see rare ECHILD errors. 
+ go func() { + var status syscall.WaitStatus + for { + if _, err := syscall.Wait4(-1, &status, 0, nil); err != nil { + time.Sleep(time.Second) + } + } + }() + + socketPath := os.Getenv("BRIDGE_SOCKET_PATH") + if socketPath == "" { + socketPath = defaultSocketPath + } + // Clean up residual socket from a previous run. + _ = os.Remove(filepath.Clean(socketPath)) //nolint:gosec // G703: socketPath is from BRIDGE_SOCKET_PATH env or a compiled-in default, not end-user input + + lis, err := (&net.ListenConfig{}).Listen(ctx, "unix", socketPath) if err != nil { - logger.Error("failed to listen", slog.String("addr", addr), slog.Any("error", err)) + logger.Error("failed to listen", slog.String("socket", socketPath), slog.Any("error", err)) return } @@ -81,7 +101,7 @@ func main() { srv.GracefulStop() }() - logger.Info("mcp gRPC server listening", slog.String("addr", addr)) + logger.Info("bridge gRPC server listening", slog.String("socket", socketPath)) if err := srv.Serve(lis); err != nil { logger.Error("gRPC server failed", slog.Any("error", err)) return diff --git a/cmd/mcp/server.go b/cmd/bridge/server.go similarity index 99% rename from cmd/mcp/server.go rename to cmd/bridge/server.go index 0899186a..6fa325f7 100644 --- a/cmd/mcp/server.go +++ b/cmd/bridge/server.go @@ -20,7 +20,7 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const ( diff --git a/cmd/mcp/template/HEARTBEAT.md b/cmd/bridge/template/HEARTBEAT.md similarity index 100% rename from cmd/mcp/template/HEARTBEAT.md rename to cmd/bridge/template/HEARTBEAT.md diff --git a/cmd/mcp/template/IDENTITY.md b/cmd/bridge/template/IDENTITY.md similarity index 100% rename from cmd/mcp/template/IDENTITY.md rename to cmd/bridge/template/IDENTITY.md diff --git a/cmd/mcp/template/MEMORY.md b/cmd/bridge/template/MEMORY.md similarity index 100% rename from cmd/mcp/template/MEMORY.md 
rename to cmd/bridge/template/MEMORY.md diff --git a/cmd/mcp/template/PROFILES.md b/cmd/bridge/template/PROFILES.md similarity index 100% rename from cmd/mcp/template/PROFILES.md rename to cmd/bridge/template/PROFILES.md diff --git a/cmd/mcp/template/SOUL.md b/cmd/bridge/template/SOUL.md similarity index 100% rename from cmd/mcp/template/SOUL.md rename to cmd/bridge/template/SOUL.md diff --git a/cmd/mcp/template/TOOLS.md b/cmd/bridge/template/TOOLS.md similarity index 100% rename from cmd/mcp/template/TOOLS.md rename to cmd/bridge/template/TOOLS.md diff --git a/cmd/mcp/entrypoint.sh b/cmd/mcp/entrypoint.sh deleted file mode 100644 index 118d32e4..00000000 --- a/cmd/mcp/entrypoint.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh -# Copy binary to writable layer so it survives snapshot restores. -[ -e /app/mcp ] || { mkdir -p /app; [ -f /opt/mcp ] && cp -a /opt/mcp /app/mcp 2>/dev/null || true; } -if [ -x /app/mcp ]; then exec /app/mcp "$@"; fi -exec /opt/mcp "$@" diff --git a/cmd/memoh/serve.go b/cmd/memoh/serve.go index f0951355..be81895d 100644 --- a/cmd/memoh/serve.go +++ b/cmd/memoh/serve.go @@ -92,6 +92,7 @@ import ( ttspkg "github.com/memohai/memoh/internal/tts" ttsedge "github.com/memohai/memoh/internal/tts/adapter/edge" "github.com/memohai/memoh/internal/version" + "github.com/memohai/memoh/internal/workspace" ) func runServe() { @@ -103,7 +104,7 @@ func runServe() { provideContainerService, provideDBConn, provideDBQueries, - provideMCPManager, + provideWorkspaceManager, provideAgentRuntimeManager, provideMemoryLLM, memprovider.NewService, @@ -245,8 +246,8 @@ func provideDBConn(lc fx.Lifecycle, cfg config.Config) (*pgxpool.Pool, error) { } func provideDBQueries(conn *pgxpool.Pool) *dbsqlc.Queries { return dbsqlc.New(conn) } -func provideMCPManager(log *slog.Logger, service ctr.Service, cfg config.Config, conn *pgxpool.Pool) *mcp.Manager { - return mcp.NewManager(log, service, cfg.MCP, cfg.Containerd.Namespace, conn) +func provideWorkspaceManager(log *slog.Logger, 
service ctr.Service, cfg config.Config, conn *pgxpool.Pool) *workspace.Manager { + return workspace.NewManager(log, service, cfg.Workspace, cfg.Containerd.Namespace, conn) } func provideAgentRuntimeManager(log *slog.Logger, cfg config.Config) *agentruntime.Manager { @@ -257,7 +258,7 @@ func provideMemoryLLM(modelsService *models.Service, queries *dbsqlc.Queries, lo return &lazyLLMClient{modelsService: modelsService, queries: queries, timeout: 30 * time.Second, logger: log} } -func provideMemoryProviderRegistry(log *slog.Logger, chatService *conversation.Service, accountService *accounts.Service, manager *mcp.Manager, queries *dbsqlc.Queries, cfg config.Config) *memprovider.Registry { +func provideMemoryProviderRegistry(log *slog.Logger, chatService *conversation.Service, accountService *accounts.Service, manager *workspace.Manager, queries *dbsqlc.Queries, cfg config.Config) *memprovider.Registry { registry := memprovider.NewRegistry(log) builtinRuntime := handlers.NewBuiltinMemoryRuntime(manager) fileStore := storefs.New(log, manager) @@ -342,7 +343,7 @@ func provideChannelRegistry(log *slog.Logger, hub *local.RouteHub, mediaService return registry } -func provideChannelRouter(log *slog.Logger, registry *channel.Registry, hub *local.RouteHub, routeService *route.DBService, msgService *message.DBService, resolver *flow.Resolver, identityService *identities.Service, botService *bots.Service, aclService *acl.Service, policyService *policy.Service, bindService *bind.Service, mediaService *media.Service, inboxService *inbox.Service, ttsService *ttspkg.Service, settingsService *settings.Service, subagentService *subagent.Service, scheduleService *schedule.Service, mcpConnService *mcp.ConnectionService, modelsService *models.Service, providersService *providers.Service, memProvService *memprovider.Service, searchProvService *searchproviders.Service, browserCtxService *browsercontexts.Service, emailService *emailpkg.Service, emailOutboxService *emailpkg.OutboxService, 
heartbeatService *heartbeat.Service, queries *dbsqlc.Queries, containerdHandler *handlers.ContainerdHandler, manager *mcp.Manager, rc *boot.RuntimeConfig) *inbound.ChannelInboundProcessor { +func provideChannelRouter(log *slog.Logger, registry *channel.Registry, hub *local.RouteHub, routeService *route.DBService, msgService *message.DBService, resolver *flow.Resolver, identityService *identities.Service, botService *bots.Service, aclService *acl.Service, policyService *policy.Service, bindService *bind.Service, mediaService *media.Service, inboxService *inbox.Service, ttsService *ttspkg.Service, settingsService *settings.Service, subagentService *subagent.Service, scheduleService *schedule.Service, mcpConnService *mcp.ConnectionService, modelsService *models.Service, providersService *providers.Service, memProvService *memprovider.Service, searchProvService *searchproviders.Service, browserCtxService *browsercontexts.Service, emailService *emailpkg.Service, emailOutboxService *emailpkg.OutboxService, heartbeatService *heartbeat.Service, queries *dbsqlc.Queries, containerdHandler *handlers.ContainerdHandler, manager *workspace.Manager, rc *boot.RuntimeConfig) *inbound.ChannelInboundProcessor { adapter, ok := registry.Get(qq.Type) if !ok { panic("qq adapter not registered") @@ -395,8 +396,8 @@ func provideChannelLifecycleService(channelStore *channel.Store, channelManager return channel.NewLifecycle(channelStore, channelManager) } -func provideContainerdHandler(log *slog.Logger, service ctr.Service, manager *mcp.Manager, cfg config.Config, rc *boot.RuntimeConfig, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service, queries *dbsqlc.Queries) *handlers.ContainerdHandler { - return handlers.NewContainerdHandler(log, service, manager, cfg.MCP, cfg.Containerd.Namespace, rc.ContainerBackend, botService, accountService, policyService, queries) +func provideContainerdHandler(log *slog.Logger, manager *workspace.Manager, cfg config.Config, 
rc *boot.RuntimeConfig, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service) *handlers.ContainerdHandler { + return handlers.NewContainerdHandler(log, manager, cfg.Workspace, rc.ContainerBackend, botService, accountService, policyService) } func provideFederationGateway(log *slog.Logger, containerdHandler *handlers.ContainerdHandler) *handlers.MCPFederationGateway { @@ -416,7 +417,7 @@ func provideOAuthService(log *slog.Logger, queries *dbsqlc.Queries, cfg config.C return mcp.NewOAuthService(log, queries, callbackURL) } -func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, _ *conversation.Service, _ *accounts.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *mcp.Manager, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, mediaService *media.Service, inboxService *inbox.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, oauthService *mcp.OAuthService, subagentService *subagent.Service, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service) *mcp.ToolGatewayService { +func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, _ *conversation.Service, _ *accounts.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, mediaService *media.Service, inboxService *inbox.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, 
emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, oauthService *mcp.OAuthService, subagentService *subagent.Service, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service) *mcp.ToolGatewayService { fedGateway.SetOAuthService(oauthService) var assetResolver mcpmessage.AssetResolver if mediaService != nil { @@ -441,7 +442,7 @@ func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManag return svc } -func provideMemoryHandler(log *slog.Logger, botService *bots.Service, accountService *accounts.Service, _ config.Config, manager *mcp.Manager, memoryRegistry *memprovider.Registry, settingsService *settings.Service, _ *handlers.ContainerdHandler) *handlers.MemoryHandler { +func provideMemoryHandler(log *slog.Logger, botService *bots.Service, accountService *accounts.Service, _ config.Config, manager *workspace.Manager, memoryRegistry *memprovider.Registry, settingsService *settings.Service, _ *handlers.ContainerdHandler) *handlers.MemoryHandler { h := handlers.NewMemoryHandler(log, botService, accountService) h.SetMemoryRegistry(memoryRegistry) h.SetSettingsService(settingsService) @@ -466,7 +467,7 @@ func (h *memohAuthHandler) Register(e *echo.Echo) { e.POST("/api/auth/refresh", h.inner.Refresh) } -func provideMediaService(log *slog.Logger, manager *mcp.Manager) *media.Service { +func provideMediaService(log *slog.Logger, manager *workspace.Manager) *media.Service { provider := containerfs.New(manager) return media.NewService(log, provider) } @@ -619,8 +620,8 @@ func startChannelManager(lc fx.Lifecycle, channelManager *channel.Manager) { }) } -func startContainerReconciliation(lc fx.Lifecycle, containerdHandler *handlers.ContainerdHandler, _ *mcp.ToolGatewayService) { - lc.Append(fx.Hook{OnStart: func(ctx context.Context) error { go containerdHandler.ReconcileContainers(ctx); return nil }}) +func startContainerReconciliation(lc fx.Lifecycle, manager 
*workspace.Manager, _ *handlers.ContainerdHandler, _ *mcp.ToolGatewayService) { + lc.Append(fx.Hook{OnStart: func(ctx context.Context) error { go manager.ReconcileContainers(ctx); return nil }}) } func startAgentRuntime(lc fx.Lifecycle, manager *agentruntime.Manager) { @@ -630,14 +631,14 @@ func startAgentRuntime(lc fx.Lifecycle, manager *agentruntime.Manager) { }) } -func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *memohServer, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, containerdHandler *handlers.ContainerdHandler, manager *mcp.Manager, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService, channelManager *channel.Manager, modelsService *models.Service) { +func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *memohServer, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, _ *handlers.ContainerdHandler, manager *workspace.Manager, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService, channelManager *channel.Manager, modelsService *models.Service) { fmt.Printf("Starting Memoh Agent %s\n", version.GetInfo()) lc.Append(fx.Hook{ OnStart: func(ctx context.Context) error { if err := ensureAdminUser(ctx, logger, queries, cfg); err != nil { return err } - botService.SetContainerLifecycle(containerdHandler) + botService.SetContainerLifecycle(manager) botService.SetContainerReachability(func(ctx context.Context, botID string) error { _, err := manager.MCPClient(ctx, botID) return err @@ -831,7 +832,7 @@ func ensureAdminUser(ctx context.Context, log *slog.Logger, queries *dbsqlc.Quer emailValue = pgtype.Text{String: email, Valid: true} } displayName := pgtype.Text{String: username, Valid: true} - dataRoot := pgtype.Text{String: cfg.MCP.DataRoot, Valid: cfg.MCP.DataRoot != ""} + dataRoot := pgtype.Text{String: cfg.Workspace.DataRoot, Valid: cfg.Workspace.DataRoot != ""} _, err = queries.CreateAccount(ctx, 
dbsqlc.CreateAccountParams{ UserID: user.ID, Username: pgtype.Text{String: username, Valid: true}, Email: emailValue, PasswordHash: pgtype.Text{String: string(hashed), Valid: true}, Role: "admin", @@ -971,9 +972,9 @@ func (a *commandSkillLoaderAdapter) LoadSkills(ctx context.Context, botID string return skills, nil } -// commandContainerFSAdapter bridges mcp.Manager to command.ContainerFS. +// commandContainerFSAdapter bridges workspace.Manager to command.ContainerFS. type commandContainerFSAdapter struct { - manager *mcp.Manager + manager *workspace.Manager } func (a *commandContainerFSAdapter) ListDir(ctx context.Context, botID, dirPath string) ([]command.FSEntry, error) { diff --git a/conf/app.apple.toml b/conf/app.apple.toml index e226cd7d..1c561b54 100644 --- a/conf/app.apple.toml +++ b/conf/app.apple.toml @@ -26,8 +26,8 @@ jwt_expires_in = "168h" # socket_path = "/path/to/your/.socktainer/container.sock" # binary_path = "/opt/homebrew/bin/socktainer" -[mcp] -image = "memohai/mcp:latest" +[workspace] +default_image = "debian:bookworm-slim" data_root = "data" [postgres] @@ -54,4 +54,4 @@ server_addr = "127.0.0.1:8080" [browser_gateway] host = "127.0.0.1" port = 8083 -server_addr = "127.0.0.1:8080" \ No newline at end of file +server_addr = "127.0.0.1:8080" diff --git a/conf/app.docker.toml b/conf/app.docker.toml index 113e02df..79571e07 100644 --- a/conf/app.docker.toml +++ b/conf/app.docker.toml @@ -22,11 +22,12 @@ jwt_expires_in = "168h" socket_path = "/run/containerd/containerd.sock" namespace = "default" -[mcp] +[workspace] # registry = "memoh.cn" # Uncomment for China mainland mirror -image = "memohai/mcp:latest" +default_image = "debian:bookworm-slim" snapshotter = "overlayfs" data_root = "/opt/memoh/data" +runtime_dir = "/opt/memoh/runtime" ## Postgres configuration [postgres] diff --git a/conf/app.example.toml b/conf/app.example.toml index 2ef45d7a..bedabc14 100644 --- a/conf/app.example.toml +++ b/conf/app.example.toml @@ -22,9 +22,9 @@ jwt_expires_in 
= "168h" socket_path = "/run/containerd/containerd.sock" namespace = "default" -[mcp] +[workspace] # registry = "memoh.cn" # Uncomment for China mainland mirror -image = "memohai/mcp:latest" +default_image = "debian:bookworm-slim" snapshotter = "overlayfs" data_root = "data" cni_bin_dir = "/opt/cni/bin" diff --git a/conf/app.windows.toml b/conf/app.windows.toml index 88b1eea3..dba41a4d 100644 --- a/conf/app.windows.toml +++ b/conf/app.windows.toml @@ -23,9 +23,9 @@ jwt_expires_in = "168h" socket_path = "npipe:////./pipe/containerd-containerd" namespace = "default" -[mcp] +[workspace] # registry = "memoh.cn" # Uncomment for China mainland mirror -image = "memohai/mcp:latest" +default_image = "debian:bookworm-slim" snapshotter = "overlayfs" data_root = "data" diff --git a/devenv/Dockerfile.server b/devenv/Dockerfile.server index 63eac40b..ad4f094e 100644 --- a/devenv/Dockerfile.server +++ b/devenv/Dockerfile.server @@ -1,28 +1,9 @@ # syntax=docker/dockerfile:1 -# ---- Stage 1: Assemble MCP image rootfs (runtime deps only, no Go binary) ---- -FROM alpine:latest AS mcp-rootfs +# Dev server image: Go + containerd + CNI. +# Toolkit (Node.js, uv) is NOT baked in — it is volume-mounted from the host. +# Run ./docker/toolkit/install.sh once before first use. -RUN apk add --no-cache grep curl bash -RUN apk add --no-cache nodejs npm -RUN apk add --no-cache python3 && \ - curl -LsSf https://astral.sh/uv/install.sh | sh && \ - ln -sf /root/.local/bin/uv /usr/local/bin/uv && \ - ln -sf /root/.local/bin/uvx /usr/local/bin/uvx - -COPY cmd/mcp/template /opt/mcp-template - -RUN printf '#!/bin/sh\n\ -[ -e /app/mcp ] || { mkdir -p /app; [ -f /opt/mcp ] && cp -a /opt/mcp /app/mcp 2>/dev/null || true; }\n\ -if [ -x /app/mcp ]; then exec /app/mcp "$@"; fi\n\ -exec /opt/mcp "$@"\n' > /opt/entrypoint.sh && chmod +x /opt/entrypoint.sh - -RUN tar -cf /tmp/rootfs.tar \ - --exclude='./proc' --exclude='./sys' --exclude='./dev' \ - --exclude='./tmp' --exclude='./run' \ - -C / . 
- -# ---- Stage 2: Dev server image ---- FROM golang:1.25-alpine WORKDIR /workspace @@ -43,7 +24,7 @@ RUN apk add --no-cache \ && mkdir -p /opt/cni/bin \ && (cp -a /usr/lib/cni/. /opt/cni/bin/ 2>/dev/null || true) \ && (cp -a /usr/libexec/cni/. /opt/cni/bin/ 2>/dev/null || true) \ - && mkdir -p /etc/cni/net.d /var/lib/cni /run/containerd /var/lib/containerd /opt/memoh/data + && mkdir -p /etc/cni/net.d /var/lib/cni /run/containerd /var/lib/containerd /opt/memoh/data /opt/memoh/runtime RUN printf '%s\n' \ '{' \ @@ -73,9 +54,6 @@ RUN printf '%s\n' \ ' ]' \ '}' > /etc/cni/net.d/10-memoh.conflist -# Raw MCP rootfs for mcp-build.sh to package with compiled binary -COPY --from=mcp-rootfs /tmp/rootfs.tar /opt/images/memoh-mcp-rootfs.tar - COPY devenv/server-entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh diff --git a/devenv/app.dev.toml b/devenv/app.dev.toml index 3b03ce59..6e57f2d4 100644 --- a/devenv/app.dev.toml +++ b/devenv/app.dev.toml @@ -21,11 +21,12 @@ jwt_expires_in = "168h" socket_path = "/run/containerd/containerd.sock" namespace = "default" -[mcp] +[workspace] # registry = "memoh.cn" # Uncomment for China mainland mirror -image = "memohai/mcp:latest" +default_image = "debian:bookworm-slim" snapshotter = "overlayfs" data_root = "/opt/memoh/data" +runtime_dir = "/opt/memoh/runtime" cni_bin_dir = "/opt/cni/bin" cni_conf_dir = "/etc/cni/net.d" diff --git a/devenv/bridge-build.sh b/devenv/bridge-build.sh new file mode 100755 index 00000000..c6528d03 --- /dev/null +++ b/devenv/bridge-build.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# Build bridge binary and place in runtime directory. +# Called by air after server build — safe to skip outside dev container. 
+set -e + +RUNTIME_DIR="/opt/memoh/runtime" +BRIDGE_BINARY="$RUNTIME_DIR/bridge" +STAGING="${BRIDGE_BINARY}.new" + +[ -d "$RUNTIME_DIR" ] || exit 0 +command -v ctr >/dev/null 2>&1 || exit 0 + +OLD_HASH=$(sha256sum "$BRIDGE_BINARY" 2>/dev/null | cut -d' ' -f1) +go build -o "$STAGING" ./cmd/bridge || exit 0 +NEW_HASH=$(sha256sum "$STAGING" | cut -d' ' -f1) + +if [ "$OLD_HASH" = "$NEW_HASH" ]; then + rm -f "$STAGING" + exit 0 +fi + +# Atomic replace avoids "text busy" when the old binary is running. +mv -f "$STAGING" "$BRIDGE_BINARY" +chmod +x "$BRIDGE_BINARY" + +echo "[bridge-dev] Done. Containers will restart with new binary on next access." diff --git a/devenv/docker-compose.yml b/devenv/docker-compose.yml index eb4fb3f4..e8571b98 100644 --- a/devenv/docker-compose.yml +++ b/devenv/docker-compose.yml @@ -11,7 +11,7 @@ services: - postgres_data:/var/lib/postgresql/data - /etc/localtime:/etc/localtime:ro ports: - - "5432:5432" + - "${MEMOH_DEV_POSTGRES_PORT:-15432}:5432" healthcheck: test: ["CMD-SHELL", "pg_isready -U memoh"] interval: 5s @@ -25,8 +25,8 @@ services: volumes: - qdrant_data:/qdrant/storage ports: - - "6333:6333" - - "6334:6334" + - "${MEMOH_DEV_QDRANT_HTTP_PORT:-16333}:6333" + - "${MEMOH_DEV_QDRANT_GRPC_PORT:-16334}:6334" healthcheck: test: ["CMD-SHELL", "timeout 5s bash -c ':> /dev/tcp/127.0.0.1/6333' || exit 1"] interval: 5s @@ -57,6 +57,7 @@ services: command: ["go", "run", "./cmd/agent/main.go", "migrate", "up"] environment: CONFIG_PATH: /workspace/devenv/app.dev.toml + GOFLAGS: -buildvcs=false volumes: - ..:/workspace - go_mod_cache:/go/pkg/mod @@ -79,6 +80,7 @@ services: command: ["air", "-c", ".air.toml"] environment: CONFIG_PATH: /workspace/devenv/app.dev.toml + GOFLAGS: -buildvcs=false volumes: - ..:/workspace - go_mod_cache:/go/pkg/mod @@ -86,9 +88,13 @@ services: - containerd_data:/var/lib/containerd - server_cni_state:/var/lib/cni - memoh_data:/opt/memoh/data + # Toolkit: run ./docker/toolkit/install.sh once before first use + - 
../.toolkit:/opt/memoh/runtime/toolkit + - ../docker/toolkit/bin:/opt/memoh/runtime/toolkit/bin + - ../cmd/bridge/template:/opt/memoh/runtime/templates - /etc/localtime:/etc/localtime:ro ports: - - "8080:8080" + - "${MEMOH_DEV_SERVER_PORT:-18080}:8080" healthcheck: test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://127.0.0.1:8080/health || exit 1"] interval: 5s @@ -106,11 +112,13 @@ services: container_name: memoh-dev-agent working_dir: /workspace/apps/agent command: ["bun", "run", "--watch", "src/index.ts"] + environment: + MEMOH_CONFIG_PATH: /workspace/devenv/app.dev.toml volumes: - ..:/workspace - node_modules:/workspace/node_modules ports: - - "8081:8081" + - "${MEMOH_DEV_AGENT_PORT:-18081}:8081" depends_on: deps: condition: service_completed_successfully @@ -124,12 +132,17 @@ services: dockerfile: devenv/Dockerfile.web container_name: memoh-dev-web working_dir: /workspace/apps/web - command: ["pnpm", "dev"] + command: ["pnpm", "exec", "vite", "--host", "0.0.0.0", "--port", "8082"] + environment: + MEMOH_CONFIG_PATH: /workspace/devenv/app.dev.toml + MEMOH_WEB_PROXY_TARGET: http://host.docker.internal:18080 + extra_hosts: + - "host.docker.internal:host-gateway" volumes: - ..:/workspace - node_modules:/workspace/node_modules ports: - - "8082:8082" + - "${MEMOH_DEV_WEB_PORT:-18082}:8082" depends_on: deps: condition: service_completed_successfully @@ -147,12 +160,13 @@ services: # working_dir: /workspace/apps/browser # command: ["bun", "run", "--watch", "src/index.ts"] # environment: + # - MEMOH_CONFIG_PATH=/workspace/devenv/app.dev.toml # - BROWSER_CORES=${BROWSER_CORES:-chromium,firefox} # volumes: # - ..:/workspace # - node_modules:/workspace/node_modules # ports: - # - "8083:8083" + # - "${MEMOH_DEV_BROWSER_PORT:-18083}:8083" # depends_on: # deps: # condition: service_completed_successfully @@ -166,7 +180,7 @@ services: dockerfile: docker/Dockerfile.sparse container_name: memoh-dev-sparse ports: - - "8085:8085" + - 
"${MEMOH_DEV_SPARSE_PORT:-18085}:8085" healthcheck: test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://127.0.0.1:8085/health')\" || exit 1"] interval: 15s diff --git a/devenv/mcp-build.sh b/devenv/mcp-build.sh deleted file mode 100755 index f4acab56..00000000 --- a/devenv/mcp-build.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/sh -# Build MCP binary, package as containerd image, and import. -# Called by air after server build — safe to skip outside dev container. -set -e - -MCP_IMAGE="${MCP_IMAGE:-docker.io/memohai/mcp:latest}" -MCP_BINARY="/opt/memoh/data/.dev/mcp" -BASE_ROOTFS="/opt/images/memoh-mcp-rootfs.tar" - -[ -f "$BASE_ROOTFS" ] || exit 0 -command -v ctr >/dev/null 2>&1 || exit 0 - -mkdir -p "$(dirname "$MCP_BINARY")" - -OLD_HASH=$(sha256sum "$MCP_BINARY" 2>/dev/null | cut -d' ' -f1) -go build -o "$MCP_BINARY" ./cmd/mcp || exit 0 -NEW_HASH=$(sha256sum "$MCP_BINARY" | cut -d' ' -f1) - -[ "$OLD_HASH" = "$NEW_HASH" ] && exit 0 - -echo "[mcp-dev] Binary changed, rebuilding MCP image..." 
- -WORK=$(mktemp -d) -trap 'rm -rf "$WORK"' EXIT - -# Layer 1: base rootfs (symlink to avoid copying the large file) -LAYER1_SHA=$(sha256sum "$BASE_ROOTFS" | cut -d' ' -f1) -mkdir -p "$WORK/$LAYER1_SHA" -ln -s "$BASE_ROOTFS" "$WORK/$LAYER1_SHA/layer.tar" - -# Layer 2: compiled binary + template + entrypoint overlay -mkdir -p "$WORK/overlay/opt" -cp "$MCP_BINARY" "$WORK/overlay/opt/mcp" -chmod +x "$WORK/overlay/opt/mcp" -cp -a /workspace/cmd/mcp/template "$WORK/overlay/opt/mcp-template" -cp /workspace/cmd/mcp/entrypoint.sh "$WORK/overlay/opt/entrypoint.sh" -chmod +x "$WORK/overlay/opt/entrypoint.sh" -tar -cf "$WORK/layer2.tar" -C "$WORK/overlay" opt -LAYER2_SHA=$(sha256sum "$WORK/layer2.tar" | cut -d' ' -f1) -mkdir -p "$WORK/$LAYER2_SHA" -mv "$WORK/layer2.tar" "$WORK/$LAYER2_SHA/layer.tar" - -# OCI image config -ARCH=$(uname -m) -case "$ARCH" in aarch64|arm64) ARCH="arm64" ;; x86_64|amd64) ARCH="amd64" ;; esac - -printf '{"architecture":"%s","os":"linux","created":"1970-01-01T00:00:00Z","config":{"Entrypoint":["/opt/entrypoint.sh"],"WorkingDir":"/app","Env":["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"]},"rootfs":{"type":"layers","diff_ids":["sha256:%s","sha256:%s"]},"history":[{"created":"1970-01-01T00:00:00Z","comment":"memoh-mcp rootfs"},{"created":"1970-01-01T00:00:00Z","comment":"memoh-mcp binary"}]}' \ - "$ARCH" "$LAYER1_SHA" "$LAYER2_SHA" > "$WORK/config.json" - -CONFIG_SHA=$(sha256sum "$WORK/config.json" | cut -d' ' -f1) -mv "$WORK/config.json" "$WORK/$CONFIG_SHA.json" - -printf '[{"Config":"%s.json","RepoTags":["%s"],"Layers":["%s/layer.tar","%s/layer.tar"]}]' \ - "$CONFIG_SHA" "$MCP_IMAGE" "$LAYER1_SHA" "$LAYER2_SHA" > "$WORK/manifest.json" - -# -h follows symlinks (layer 1 is symlinked to avoid copying) -tar -chf "$WORK/memoh-mcp.tar" -C "$WORK" manifest.json "$CONFIG_SHA.json" "$LAYER1_SHA/" "$LAYER2_SHA/" - -# Replace image in containerd -ctr -n default images rm "$MCP_IMAGE" 2>/dev/null || true -ctr -n default images import 
--all-platforms "$WORK/memoh-mcp.tar" 2>&1 || true - -# Clean old MCP containers so they recreate with new image -for c in $(ctr -n default containers ls -q 2>/dev/null | grep "^mcp-"); do - ctr -n default tasks kill "$c" 2>/dev/null || true - ctr -n default tasks delete "$c" 2>/dev/null || true - ctr -n default containers delete "$c" 2>/dev/null || true -done - -echo "[mcp-dev] Done. Containers will auto-recreate with new image." diff --git a/devenv/server-entrypoint.sh b/devenv/server-entrypoint.sh old mode 100644 new mode 100755 index 69e03bfc..7f3a0403 --- a/devenv/server-entrypoint.sh +++ b/devenv/server-entrypoint.sh @@ -1,6 +1,14 @@ #!/bin/sh set -e +# Toolkit is volume-mounted from the host (.toolkit/). +# If missing, the user forgot to run the install script. +if [ ! -d /opt/memoh/runtime/toolkit/node-glibc ]; then + echo "ERROR: Toolkit not found at /opt/memoh/runtime/toolkit/." >&2 + echo " Run ./docker/toolkit/install.sh before starting the dev environment." >&2 + exit 1 +fi + # Clean up stale CNI state from previous runs. After a container restart the # cni0 bridge may linger with a zeroed MAC (00:00:00:00:00:00), causing the # bridge plugin to fail with "could not set bridge's mac: invalid argument". @@ -41,10 +49,10 @@ if ! ctr version >/dev/null 2>&1; then fi echo "containerd is running (pid $CONTAINERD_PID)" -# Build MCP binary and import as containerd image -echo "Building MCP image..." -(cd /workspace && sh devenv/mcp-build.sh) -echo "MCP image ready." +# Build bridge binary into runtime directory (first boot) +echo "Building bridge binary..." +(cd /workspace && go build -o /opt/memoh/runtime/bridge ./cmd/bridge) +echo "Bridge binary ready." echo "Starting server..." 
diff --git a/docker/Dockerfile.containerd b/docker/Dockerfile.containerd deleted file mode 100644 index ae847b5d..00000000 --- a/docker/Dockerfile.containerd +++ /dev/null @@ -1,86 +0,0 @@ -# syntax=docker/dockerfile:1 - -# ---- Stage 1: Build MCP binary ---- -FROM golang:1.25-alpine AS mcp-builder - -WORKDIR /src -RUN apk add --no-cache ca-certificates git - -COPY go.mod go.sum ./ -RUN --mount=type=cache,target=/go/pkg/mod \ - go mod download - -COPY . . - -ARG TARGETARCH=amd64 -ARG COMMIT_HASH=unknown -RUN --mount=type=cache,target=/go/pkg/mod \ - --mount=type=cache,target=/root/.cache/go-build \ - CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} \ - go build -trimpath \ - -ldflags "-s -w -X github.com/memohai/memoh/internal/version.CommitHash=${COMMIT_HASH}" \ - -o /out/mcp ./cmd/mcp - -# ---- Stage 2: Assemble MCP image rootfs ---- -FROM alpine:latest AS mcp-rootfs - -# Base utilities -RUN apk add --no-cache grep curl bash - -# Node.js + npm (provides npx for JS/TS MCP servers) -RUN apk add --no-cache nodejs npm - -# Python 3 + uv (provides uvx for Python MCP servers) -RUN apk add --no-cache python3 && \ - curl -LsSf https://astral.sh/uv/install.sh | sh && \ - ln -sf /root/.local/bin/uv /usr/local/bin/uv && \ - ln -sf /root/.local/bin/uvx /usr/local/bin/uvx - -COPY --from=mcp-builder /out/mcp /opt/mcp -COPY cmd/mcp/template /opt/mcp-template - -RUN printf '#!/bin/sh\n\ -[ -e /app/mcp ] || { mkdir -p /app; [ -f /opt/mcp ] && cp -a /opt/mcp /app/mcp 2>/dev/null || true; }\n\ -if [ -x /app/mcp ]; then exec /app/mcp "$@"; fi\n\ -exec /opt/mcp "$@"\n' > /opt/entrypoint.sh && chmod +x /opt/entrypoint.sh - -# Create rootfs tar excluding pseudo-filesystems -RUN tar -cf /tmp/rootfs.tar \ - --exclude='./proc' --exclude='./sys' --exclude='./dev' \ - --exclude='./tmp' --exclude='./run' \ - -C / . 
- -# ---- Stage 3: Package rootfs as Docker image tar ---- -FROM alpine:latest AS oci-exporter - -COPY --from=mcp-rootfs /tmp/rootfs.tar /tmp/layer.tar -ARG MCP_IMAGE_TAG=docker.io/library/memoh-mcp:latest - -RUN set -e \ - && LAYER_SHA=$(sha256sum /tmp/layer.tar | awk '{print $1}') \ - && LAYER_SIZE=$(wc -c < /tmp/layer.tar) \ - && mkdir -p "/tmp/image/${LAYER_SHA}" /out \ - && mv /tmp/layer.tar "/tmp/image/${LAYER_SHA}/layer.tar" \ - && printf '{"architecture":"amd64","os":"linux","created":"1970-01-01T00:00:00Z","config":{"Entrypoint":["/opt/entrypoint.sh"],"WorkingDir":"/app","Env":["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"]},"rootfs":{"type":"layers","diff_ids":["sha256:%s"]},"history":[{"created":"1970-01-01T00:00:00Z","comment":"memoh-mcp image"}]}' \ - "${LAYER_SHA}" > /tmp/config.json \ - && CONFIG_SHA=$(sha256sum /tmp/config.json | awk '{print $1}') \ - && mv /tmp/config.json "/tmp/image/${CONFIG_SHA}.json" \ - && printf '[{"Config":"%s.json","RepoTags":["%s"],"Layers":["%s/layer.tar"]}]' \ - "${CONFIG_SHA}" "${MCP_IMAGE_TAG}" "${LAYER_SHA}" > /tmp/image/manifest.json \ - && cd /tmp/image && tar -cf /out/memoh-mcp.tar manifest.json "${CONFIG_SHA}.json" "${LAYER_SHA}/" - -# ---- Stage 4: Containerd runtime ---- -FROM alpine:latest - -RUN apk add --no-cache containerd containerd-ctr - -COPY --from=oci-exporter /out/memoh-mcp.tar /opt/images/memoh-mcp.tar -COPY docker/containerd-entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh - -VOLUME ["/run/containerd", "/var/lib/containerd", "/opt/memoh/data"] - -HEALTHCHECK --interval=5s --timeout=3s --start-period=10s --retries=10 \ - CMD test -S /run/containerd/containerd.sock - -ENTRYPOINT ["/entrypoint.sh"] diff --git a/docker/Dockerfile.mcp b/docker/Dockerfile.mcp deleted file mode 100644 index 770344d4..00000000 --- a/docker/Dockerfile.mcp +++ /dev/null @@ -1,43 +0,0 @@ -# syntax=docker/dockerfile:1 -FROM scratch AS gomodcache - -FROM --platform=$BUILDPLATFORM golang:1.25-alpine 
AS build - -WORKDIR /src -COPY go.mod go.sum ./ -RUN --mount=type=cache,target=/go/pkg/mod \ - --mount=type=bind,from=gomodcache,target=/tmp/gomodcache \ - set -eux; \ - if [ -d /tmp/gomodcache/cache/download ]; then \ - cp -a /tmp/gomodcache/. /go/pkg/mod/; \ - fi; \ - go mod download - -COPY . . -ARG TARGETARCH -ARG COMMIT_HASH=unknown -RUN --mount=type=cache,target=/go/pkg/mod \ - --mount=type=cache,target=/root/.cache/go-build \ - CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH:-amd64} \ - go build -trimpath -ldflags "-s -w -X github.com/memohai/memoh/internal/version.CommitHash=${COMMIT_HASH}" -o /out/mcp ./cmd/mcp - -FROM alpine:latest - -# Base utilities -RUN apk add --no-cache grep curl bash dumb-init - -# Node.js + npm (provides npx for JS/TS MCP servers) -RUN apk add --no-cache nodejs npm - -# Python 3 + uv (provides uvx for Python MCP servers) -RUN apk add --no-cache python3 && \ - curl -LsSf https://astral.sh/uv/install.sh | sh && \ - ln -sf /root/.local/bin/uv /usr/local/bin/uv && \ - ln -sf /root/.local/bin/uvx /usr/local/bin/uvx - -WORKDIR /app -COPY --from=build /out/mcp /opt/mcp -COPY cmd/mcp/template /opt/mcp-template -COPY cmd/mcp/entrypoint.sh /opt/entrypoint.sh -RUN chmod +x /opt/entrypoint.sh -ENTRYPOINT ["/usr/bin/dumb-init", "--", "/opt/entrypoint.sh"] diff --git a/docker/Dockerfile.server b/docker/Dockerfile.server index 3e820b8e..8515b5dc 100644 --- a/docker/Dockerfile.server +++ b/docker/Dockerfile.server @@ -35,8 +35,8 @@ RUN --mount=type=cache,target=/go/pkg/mod \ -X github.com/memohai/memoh/internal/version.BuildTime=${BUILD_TIME}" \ -o memoh-server ./cmd/agent/main.go -# ---- Stage 3: Build MCP binary ---- -FROM build-base AS mcp-builder +# ---- Stage 3: Build bridge binary ---- +FROM build-base AS bridge-builder ARG TARGETARCH ARG COMMIT_HASH=unknown RUN --mount=type=cache,target=/go/pkg/mod \ @@ -44,50 +44,21 @@ RUN --mount=type=cache,target=/go/pkg/mod \ CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH:-amd64} \ go build -trimpath \ 
-ldflags "-s -w -X github.com/memohai/memoh/internal/version.CommitHash=${COMMIT_HASH}" \ - -o /out/mcp ./cmd/mcp + -o /out/bridge ./cmd/bridge -# ---- Stage 3: Assemble MCP image rootfs ---- -FROM alpine:latest AS mcp-rootfs - -RUN apk add --no-cache grep curl bash -RUN apk add --no-cache nodejs npm -RUN apk add --no-cache python3 && \ - curl -LsSf https://astral.sh/uv/install.sh | sh && \ - ln -sf /root/.local/bin/uv /usr/local/bin/uv && \ - ln -sf /root/.local/bin/uvx /usr/local/bin/uvx - -COPY --from=mcp-builder /out/mcp /opt/mcp -COPY cmd/mcp/template /opt/mcp-template - -RUN printf '#!/bin/sh\n\ -[ -e /app/mcp ] || { mkdir -p /app; [ -f /opt/mcp ] && cp -a /opt/mcp /app/mcp 2>/dev/null || true; }\n\ -if [ -x /app/mcp ]; then exec /app/mcp "$@"; fi\n\ -exec /opt/mcp "$@"\n' > /opt/entrypoint.sh && chmod +x /opt/entrypoint.sh - -RUN tar -cf /tmp/rootfs.tar \ - --exclude='./proc' --exclude='./sys' --exclude='./dev' \ - --exclude='./tmp' --exclude='./run' \ - -C / . - -# ---- Stage 4: Package rootfs as OCI image tar ---- -FROM alpine:latest AS oci-exporter - -COPY --from=mcp-rootfs /tmp/rootfs.tar /tmp/layer.tar -ARG MCP_IMAGE_TAG=docker.io/library/memoh-mcp:latest +# ---- Stage 4: Assemble workspace runtime + toolkit ---- +FROM alpine:latest AS toolkit-assembly ARG TARGETARCH -RUN set -e \ - && LAYER_SHA=$(sha256sum /tmp/layer.tar | awk '{print $1}') \ - && LAYER_SIZE=$(wc -c < /tmp/layer.tar) \ - && mkdir -p "/tmp/image/${LAYER_SHA}" /out \ - && mv /tmp/layer.tar "/tmp/image/${LAYER_SHA}/layer.tar" \ - && printf '{"architecture":"%s","os":"linux","created":"1970-01-01T00:00:00Z","config":{"Entrypoint":["/opt/entrypoint.sh"],"WorkingDir":"/app","Env":["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"]},"rootfs":{"type":"layers","diff_ids":["sha256:%s"]},"history":[{"created":"1970-01-01T00:00:00Z","comment":"memoh-mcp image"}]}' \ - "${TARGETARCH:-amd64}" "${LAYER_SHA}" > /tmp/config.json \ - && CONFIG_SHA=$(sha256sum /tmp/config.json | awk 
'{print $1}') \ - && mv /tmp/config.json "/tmp/image/${CONFIG_SHA}.json" \ - && printf '[{"Config":"%s.json","RepoTags":["%s"],"Layers":["%s/layer.tar"]}]' \ - "${CONFIG_SHA}" "${MCP_IMAGE_TAG}" "${LAYER_SHA}" > /tmp/image/manifest.json \ - && cd /tmp/image && tar -cf /out/memoh-mcp.tar manifest.json "${CONFIG_SHA}.json" "${LAYER_SHA}/" +RUN apk add --no-cache xz +COPY docker/toolkit/install.sh /tmp/install.sh +RUN /tmp/install.sh /assembly/toolkit "${TARGETARCH:-amd64}" + +# Assemble runtime directory +COPY --from=bridge-builder /out/bridge /assembly/bridge +COPY cmd/bridge/template /assembly/templates +COPY docker/toolkit/bin /assembly/toolkit/bin +RUN chmod +x /assembly/toolkit/bin/* # ---- Stage 5: Final runtime (containerd + server + CNI) ---- FROM alpine:latest @@ -97,7 +68,7 @@ WORKDIR /app # containerd runtime RUN apk add --no-cache containerd containerd-ctr -# CNI plugins + iptables (for MCP container networking) +# CNI plugins + iptables (for workspace container networking) RUN apk add --no-cache ca-certificates tzdata wget cni-plugins iptables \ && mkdir -p /opt/cni/bin \ && (cp -a /usr/lib/cni/. /opt/cni/bin/ 2>/dev/null || true) \ @@ -131,8 +102,8 @@ RUN apk add --no-cache ca-certificates tzdata wget cni-plugins iptables \ ' ]' \ '}' > /etc/cni/net.d/10-memoh.conflist -# MCP image for containerd import -COPY --from=oci-exporter /out/memoh-mcp.tar /opt/images/memoh-mcp.tar +# Workspace runtime (bind-mounted into bot containers) +COPY --from=toolkit-assembly /assembly /opt/memoh/runtime # Server binary and spec COPY --from=server-builder /build/memoh-server /app/memoh-server diff --git a/docker/containerd-entrypoint.sh b/docker/containerd-entrypoint.sh deleted file mode 100644 index 31c811e5..00000000 --- a/docker/containerd-entrypoint.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/sh - -MCP_IMAGE="${MCP_IMAGE:-docker.io/library/memoh-mcp:latest}" - -# Start containerd in background -mkdir -p /run/containerd -containerd & -CONTAINERD_PID=$! 
- -# Wait for containerd to be fully responsive -echo "Waiting for containerd..." -for i in $(seq 1 30); do - if ctr version >/dev/null 2>&1; then - break - fi - sleep 1 -done - -if ! ctr version >/dev/null 2>&1; then - echo "ERROR: containerd not responsive after 30s" - exit 1 -fi -echo "containerd is running" - -# Import MCP image if not already present -if ! ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then - echo "Importing MCP image into containerd..." - for tar in /opt/images/*.tar; do - if [ -f "$tar" ]; then - ctr -n default images import --all-platforms "$tar" 2>&1 || true - fi - done - if ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then - echo "MCP image ready: ${MCP_IMAGE}" - else - echo "WARNING: MCP image not available after import, will try pull at runtime" - fi -else - echo "MCP image already present: ${MCP_IMAGE}" -fi - -echo "containerd is ready" -wait $CONTAINERD_PID diff --git a/docker/server-entrypoint.sh b/docker/server-entrypoint.sh index 6dbf1df1..27e68e35 100644 --- a/docker/server-entrypoint.sh +++ b/docker/server-entrypoint.sh @@ -1,8 +1,6 @@ #!/bin/sh set -e -MCP_IMAGE="${MCP_IMAGE:-docker.io/library/memoh-mcp:latest}" - # ---- Clean up stale CNI state from previous runs ---- # After a container restart the cni0 bridge may linger with a zeroed MAC # (00:00:00:00:00:00), causing "could not set bridge's mac: invalid argument". @@ -46,23 +44,6 @@ if ! ctr version >/dev/null 2>&1; then fi echo "containerd is running (pid $CONTAINERD_PID)" -# ---- Import MCP image if not already present ---- -if ! ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then - echo "Importing MCP image into containerd..." 
- for tar in /opt/images/*.tar; do - if [ -f "$tar" ]; then - ctr -n default images import --all-platforms "$tar" 2>&1 || true - fi - done - if ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then - echo "MCP image ready: ${MCP_IMAGE}" - else - echo "WARNING: MCP image not available after import, will try pull at runtime" - fi -else - echo "MCP image already present: ${MCP_IMAGE}" -fi - echo "containerd is ready, starting memoh-server..." # ---- Start server (foreground, trap signals for graceful shutdown) ---- diff --git a/docker/toolkit/bin/node b/docker/toolkit/bin/node new file mode 100755 index 00000000..0cd59973 --- /dev/null +++ b/docker/toolkit/bin/node @@ -0,0 +1,9 @@ +#!/bin/sh +TOOLKIT=/opt/memoh/toolkit +if [ -f /lib/ld-musl-*.so.1 ] 2>/dev/null; then + NODEDIR="$TOOLKIT/node-musl" + export LD_LIBRARY_PATH="$NODEDIR/runtime-lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" +else + NODEDIR="$TOOLKIT/node-glibc" +fi +exec "$NODEDIR/bin/node" "$@" diff --git a/docker/toolkit/bin/npm b/docker/toolkit/bin/npm new file mode 100755 index 00000000..6b54ac26 --- /dev/null +++ b/docker/toolkit/bin/npm @@ -0,0 +1,10 @@ +#!/bin/sh +TOOLKIT=/opt/memoh/toolkit +if [ -f /lib/ld-musl-*.so.1 ] 2>/dev/null; then + NODEDIR="$TOOLKIT/node-musl" + export LD_LIBRARY_PATH="$NODEDIR/runtime-lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" +else + NODEDIR="$TOOLKIT/node-glibc" +fi +export PATH="$NODEDIR/bin:$PATH" +exec "$NODEDIR/bin/npm" "$@" diff --git a/docker/toolkit/bin/npx b/docker/toolkit/bin/npx new file mode 100755 index 00000000..fa27a8e5 --- /dev/null +++ b/docker/toolkit/bin/npx @@ -0,0 +1,10 @@ +#!/bin/sh +TOOLKIT=/opt/memoh/toolkit +if [ -f /lib/ld-musl-*.so.1 ] 2>/dev/null; then + NODEDIR="$TOOLKIT/node-musl" + export LD_LIBRARY_PATH="$NODEDIR/runtime-lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" +else + NODEDIR="$TOOLKIT/node-glibc" +fi +export PATH="$NODEDIR/bin:$PATH" +exec "$NODEDIR/bin/npx" "$@" diff --git a/docker/toolkit/bin/uv 
b/docker/toolkit/bin/uv new file mode 100755 index 00000000..d5393f1e --- /dev/null +++ b/docker/toolkit/bin/uv @@ -0,0 +1,2 @@ +#!/bin/sh +exec /opt/memoh/toolkit/uv "$@" diff --git a/docker/toolkit/bin/uvx b/docker/toolkit/bin/uvx new file mode 100755 index 00000000..c717cf7c --- /dev/null +++ b/docker/toolkit/bin/uvx @@ -0,0 +1,2 @@ +#!/bin/sh +exec /opt/memoh/toolkit/uv tool run "$@" diff --git a/docker/toolkit/install.sh b/docker/toolkit/install.sh new file mode 100755 index 00000000..9372d44c --- /dev/null +++ b/docker/toolkit/install.sh @@ -0,0 +1,132 @@ +#!/bin/sh +# Download Node.js (glibc + musl) and uv into a toolkit directory. +# +# Usage: +# ./docker/toolkit/install.sh [output_dir] [arch] +# +# Arguments: +# output_dir Target directory (default: .toolkit) +# arch amd64 or arm64 (default: auto-detect from uname -m) +# +# Environment variables for mirrors (useful in mainland China): +# NODEJS_MIRROR Default: https://nodejs.org/dist +# NODEJS_MUSL_MIRROR Default: https://unofficial-builds.nodejs.org/download/release +# NPM_MIRROR Default: https://registry.npmjs.org +# ALPINE_MIRROR Default: https://dl-cdn.alpinelinux.org/alpine +# UV_MIRROR Default: https://github.com/astral-sh/uv/releases/latest/download +# +set -eu + +ALPINE_VERSION=3.23 +NODE_VERSION=24.14.0 +NPM_VERSION=10.9.2 + +OUTDIR="${1:-.toolkit}" +ARCH="${2:-}" + +if [ -z "$ARCH" ]; then + case "$(uname -m)" in + x86_64) ARCH=amd64 ;; + aarch64) ARCH=arm64 ;; + arm64) ARCH=arm64 ;; + *) echo "ERROR: unsupported architecture: $(uname -m)" >&2; exit 1 ;; + esac +fi + +NODEJS_MIRROR="${NODEJS_MIRROR:-https://nodejs.org/dist}" +NODEJS_MUSL_MIRROR="${NODEJS_MUSL_MIRROR:-https://unofficial-builds.nodejs.org/download/release}" +NPM_MIRROR="${NPM_MIRROR:-https://registry.npmjs.org}" +ALPINE_MIRROR="${ALPINE_MIRROR:-https://dl-cdn.alpinelinux.org/alpine}" +UV_MIRROR="${UV_MIRROR:-https://github.com/astral-sh/uv/releases/latest/download}" + +case "$ARCH" in + amd64) NODE_ARCH=x64; UV_ARCH=x86_64; 
APK_ARCH=x86_64 ;; + arm64) NODE_ARCH=arm64; UV_ARCH=aarch64; APK_ARCH=aarch64 ;; + *) echo "ERROR: unsupported arch: $ARCH" >&2; exit 1 ;; +esac + +ALPINE_REPO="${ALPINE_MIRROR}/v${ALPINE_VERSION}/main/${APK_ARCH}" + +TMPDIR="$(mktemp -d)" +cleanup() { + rm -rf "$TMPDIR" +} +trap cleanup EXIT INT TERM + +apk_index_path="$TMPDIR/APKINDEX.tar.gz" + +apk_package_filename() { + pkg="$1" + tar -xzOf "$apk_index_path" APKINDEX | awk -v pkg="$pkg" ' + $0 == "P:" pkg { hit = 1; next } + hit && /^V:/ { print pkg "-" substr($0, 3) ".apk"; exit } + /^$/ { hit = 0 } + ' +} + +install_musl_runtime_libs() { + dest_dir="$OUTDIR/node-musl/runtime-lib" + rm -rf "$dest_dir" + mkdir -p "$dest_dir" + + echo "Downloading musl runtime libs (${APK_ARCH})..." + wget -qO "$apk_index_path" "${ALPINE_REPO}/APKINDEX.tar.gz" + + for pkg in libgcc libstdc++; do + apk_file="$(apk_package_filename "$pkg")" + if [ -z "$apk_file" ]; then + echo "ERROR: failed to resolve Alpine package for $pkg (${APK_ARCH})" >&2 + exit 1 + fi + pkg_path="$TMPDIR/$apk_file" + extract_dir="$TMPDIR/extract-$pkg" + rm -rf "$extract_dir" + mkdir -p "$extract_dir" + wget -qO "$pkg_path" "${ALPINE_REPO}/$apk_file" + tar -xzf "$pkg_path" -C "$extract_dir" + cp -a "$extract_dir/usr/lib/." "$dest_dir/" + done +} + +install_pinned_npm() { + node_dir="$1" + dest_dir="$OUTDIR/$node_dir/lib/node_modules/npm" + extract_dir="$TMPDIR/npm-$node_dir" + + rm -rf "$dest_dir" "$extract_dir" + mkdir -p "$extract_dir" "$(dirname "$dest_dir")" + tar -xzf "$npm_archive" -C "$extract_dir" + mv "$extract_dir/package" "$dest_dir" +} + +mkdir -p "$OUTDIR/node-glibc" "$OUTDIR/node-musl" + +echo "Downloading Node.js v${NODE_VERSION} (glibc, ${NODE_ARCH})..." 
+wget -qO- "${NODEJS_MIRROR}/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-${NODE_ARCH}.tar.xz" \ + | tar -xJf - --strip-components=1 -C "$OUTDIR/node-glibc" + +MUSL_URL="${NODEJS_MUSL_MIRROR}/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-${NODE_ARCH}-musl.tar.xz" +echo "Downloading Node.js v${NODE_VERSION} (musl, ${NODE_ARCH})..." +musl_archive="$TMPDIR/node-musl.tar.xz" +if wget -qO "$musl_archive" "$MUSL_URL" 2>/dev/null; then + tar -xJf "$musl_archive" --strip-components=1 -C "$OUTDIR/node-musl" +else + echo "ERROR: failed to download musl Node.js build for ${NODE_ARCH}" >&2 + exit 1 +fi + +install_musl_runtime_libs + +echo "Downloading npm v${NPM_VERSION}..." +npm_archive="$TMPDIR/npm.tgz" +wget -qO "$npm_archive" "${NPM_MIRROR}/npm/-/npm-${NPM_VERSION}.tgz" +install_pinned_npm node-glibc +install_pinned_npm node-musl + +echo "Downloading uv (${UV_ARCH})..." +wget -qO- "${UV_MIRROR}/uv-${UV_ARCH}-unknown-linux-musl.tar.gz" \ + | tar -xzf - --strip-components=1 -C /tmp +mv /tmp/uv "$OUTDIR/uv" +chmod +x "$OUTDIR/uv" + +echo "Toolkit installed to $OUTDIR" diff --git a/docs/docs/installation/docker.md b/docs/docs/installation/docker.md index 72525eef..01cb8665 100644 --- a/docs/docs/installation/docker.md +++ b/docs/docs/installation/docker.md @@ -134,7 +134,7 @@ sudo POSTGRES_PASSWORD=your-db-password docker compose up -d For users in mainland China who cannot access Docker Hub directly, uncomment the `registry` line in `config.toml`: ```toml -[mcp] +[workspace] registry = "memoh.cn" ``` diff --git a/internal/config/config.go b/internal/config/config.go index 68ec6206..a8de1d89 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -1,7 +1,9 @@ package config import ( + "errors" "os" + "path/filepath" "strconv" "strings" @@ -13,7 +15,6 @@ const ( DefaultHTTPAddr = ":8080" DefaultNamespace = "default" DefaultSocketPath = "/run/containerd/containerd.sock" - DefaultMCPImage = "memohai/mcp:latest" DefaultDataRoot = "data" DefaultDataMount = "/data" 
DefaultCNIBinaryDir = "/opt/cni/bin" @@ -26,7 +27,8 @@ const ( DefaultPGSSLMode = "disable" DefaultQdrantURL = "http://127.0.0.1:6334" DefaultQdrantCollection = "memory" - MCPGRPCPort = 9090 + DefaultRuntimeDir = "/opt/memoh/runtime" + DefaultBaseImage = "debian:bookworm-slim" ) type Config struct { @@ -35,7 +37,7 @@ type Config struct { Admin AdminConfig `toml:"admin"` Auth AuthConfig `toml:"auth"` Containerd ContainerdConfig `toml:"containerd"` - MCP MCPConfig `toml:"mcp"` + Workspace WorkspaceConfig `toml:"workspace"` Postgres PostgresConfig `toml:"postgres"` Qdrant QdrantConfig `toml:"qdrant"` Sparse SparseConfig `toml:"sparse"` @@ -74,23 +76,23 @@ type SocktainerConfig struct { BinaryPath string `toml:"binary_path"` } -type MCPConfig struct { +type WorkspaceConfig struct { Registry string `toml:"registry"` - Image string `toml:"image"` + DefaultImage string `toml:"default_image"` Snapshotter string `toml:"snapshotter"` DataRoot string `toml:"data_root"` CNIBinaryDir string `toml:"cni_bin_dir"` CNIConfigDir string `toml:"cni_conf_dir"` + RuntimeDir string `toml:"runtime_dir"` } -// ImageRef returns the fully qualified image reference, prepending the -// registry mirror when configured and normalizing for containerd compatibility. -// Containerd requires a fully-qualified domain in image references — short -// Docker Hub names like "memohai/mcp:latest" are misinterpreted as hosts. -func (c MCPConfig) ImageRef() string { - img := c.Image +// ImageRef returns the fully qualified image reference for the base image, +// prepending the registry mirror when configured and normalizing for containerd +// compatibility. +func (c WorkspaceConfig) ImageRef() string { + img := c.DefaultImage if img == "" { - img = DefaultMCPImage + img = DefaultBaseImage } if c.Registry != "" { return c.Registry + "/" + img @@ -98,6 +100,14 @@ func (c MCPConfig) ImageRef() string { return NormalizeImageRef(img) } +// RuntimePath returns the path to the workspace runtime directory. 
+func (c WorkspaceConfig) RuntimePath() string { + if c.RuntimeDir != "" { + return c.RuntimeDir + } + return DefaultRuntimeDir +} + // NormalizeImageRef ensures an image reference is fully qualified for containerd. func NormalizeImageRef(ref string) string { firstSlash := strings.Index(ref, "/") @@ -185,8 +195,8 @@ func Load(path string) (Config, error) { SocketPath: DefaultSocketPath, Namespace: DefaultNamespace, }, - MCP: MCPConfig{ - Image: DefaultMCPImage, + Workspace: WorkspaceConfig{ + DefaultImage: DefaultBaseImage, DataRoot: DefaultDataRoot, CNIBinaryDir: DefaultCNIBinaryDir, CNIConfigDir: DefaultCNIConfigDir, @@ -211,6 +221,7 @@ func Load(path string) (Config, error) { if path == "" { path = DefaultConfigPath } + path = filepath.Clean(path) if _, err := os.Stat(path); err != nil { if os.IsNotExist(err) { @@ -219,7 +230,27 @@ func Load(path string) (Config, error) { return cfg, err } - if _, err := toml.DecodeFile(path, &cfg); err != nil { + //nolint:gosec // config path is intentionally user-configurable + data, err := os.ReadFile(path) + if err != nil { + return cfg, err + } + + var raw struct { + Workspace map[string]any `toml:"workspace"` + MCP map[string]any `toml:"mcp"` + } + if _, err := toml.Decode(string(data), &raw); err != nil { + return cfg, err + } + if raw.MCP != nil { + if raw.Workspace != nil { + return cfg, errors.New("config uses both [mcp] and [workspace]; remove [mcp] and keep only [workspace]") + } + return cfg, errors.New("config section [mcp] has been renamed to [workspace]; update your config.toml and restart") + } + + if _, err := toml.Decode(string(data), &cfg); err != nil { return cfg, err } diff --git a/internal/config/config_test.go b/internal/config/config_test.go new file mode 100644 index 00000000..5908631e --- /dev/null +++ b/internal/config/config_test.go @@ -0,0 +1,59 @@ +package config + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestLoadRejectsLegacyMCPSection(t *testing.T) { + t.Parallel() + 
+ configPath := filepath.Join(t.TempDir(), "config.toml") + if err := os.WriteFile(configPath, []byte("[mcp]\nfoo = \"legacy\"\n"), 0o600); err != nil { + t.Fatalf("write config: %v", err) + } + + _, err := Load(configPath) + if err == nil { + t.Fatal("expected load to fail for legacy [mcp] section") + } + if !strings.Contains(err.Error(), "[mcp]") || !strings.Contains(err.Error(), "[workspace]") { + t.Fatalf("expected migration error mentioning [mcp] and [workspace], got %v", err) + } +} + +func TestLoadRejectsMixedMCPAndWorkspaceSections(t *testing.T) { + t.Parallel() + + configPath := filepath.Join(t.TempDir(), "config.toml") + if err := os.WriteFile(configPath, []byte("[mcp]\nfoo = \"legacy\"\n[workspace]\ndefault_image = \"current\"\n"), 0o600); err != nil { + t.Fatalf("write config: %v", err) + } + + _, err := Load(configPath) + if err == nil { + t.Fatal("expected load to fail when both [mcp] and [workspace] are present") + } + if !strings.Contains(err.Error(), "both [mcp] and [workspace]") { + t.Fatalf("expected mixed-section error, got %v", err) + } +} + +func TestLoadReadsWorkspaceDefaultImage(t *testing.T) { + t.Parallel() + + configPath := filepath.Join(t.TempDir(), "config.toml") + if err := os.WriteFile(configPath, []byte("[workspace]\ndefault_image = \"alpine:3.22\"\n"), 0o600); err != nil { + t.Fatalf("write config: %v", err) + } + + cfg, err := Load(configPath) + if err != nil { + t.Fatalf("load config: %v", err) + } + if cfg.Workspace.DefaultImage != "alpine:3.22" { + t.Fatalf("expected default_image to load, got %q", cfg.Workspace.DefaultImage) + } +} diff --git a/internal/containerd/resolv.go b/internal/containerd/resolv.go index de6ddbbe..620742ee 100644 --- a/internal/containerd/resolv.go +++ b/internal/containerd/resolv.go @@ -1,25 +1,35 @@ package containerd import ( + "errors" "os" "path/filepath" "strings" ) const ( - systemdResolvConf = "/run/systemd/resolve/resolv.conf" - fallbackResolv = "nameserver 1.1.1.1\nnameserver 8.8.8.8\n" + 
systemdResolvConf = "/run/systemd/resolve/resolv.conf" + fallbackResolv = "nameserver 1.1.1.1\nnameserver 8.8.8.8\n" + fallbackResolvPerm = 0o644 ) // ResolveConfSource returns a host path to mount as /etc/resolv.conf. // If systemd-resolved config is available, use it. Otherwise write a fallback // resolv.conf under dataDir and return that path. func ResolveConfSource(dataDir string) (string, error) { + return resolveConfSource(dataDir, systemdResolvConf) +} + +func resolveConfSource(dataDir, preferredPath string) (string, error) { if strings.TrimSpace(dataDir) == "" { return "", ErrInvalidArgument } - if _, err := os.Stat(systemdResolvConf); err == nil { - return systemdResolvConf, nil + if strings.TrimSpace(preferredPath) != "" { + if _, err := os.Stat(preferredPath); err == nil { + return preferredPath, nil + } else if !errors.Is(err, os.ErrNotExist) { + return "", err + } } if err := os.MkdirAll(dataDir, 0o750); err != nil { @@ -27,11 +37,14 @@ func ResolveConfSource(dataDir string) (string, error) { } fallbackPath := filepath.Join(dataDir, "resolv.conf") if _, err := os.Stat(fallbackPath); err == nil { + if err := os.Chmod(fallbackPath, fallbackResolvPerm); err != nil { + return "", err + } return fallbackPath, nil - } else if !os.IsNotExist(err) { + } else if !errors.Is(err, os.ErrNotExist) { return "", err } - if err := os.WriteFile(fallbackPath, []byte(fallbackResolv), 0o600); err != nil { + if err := os.WriteFile(fallbackPath, []byte(fallbackResolv), fallbackResolvPerm); err != nil { return "", err } return fallbackPath, nil diff --git a/internal/containerd/resolv_test.go b/internal/containerd/resolv_test.go new file mode 100644 index 00000000..4b3accbd --- /dev/null +++ b/internal/containerd/resolv_test.go @@ -0,0 +1,101 @@ +package containerd + +import ( + "errors" + "os" + "path/filepath" + "testing" +) + +func TestResolveConfSource_InvalidArgument(t *testing.T) { + if _, err := ResolveConfSource(""); !errors.Is(err, ErrInvalidArgument) { + 
t.Fatalf("expected ErrInvalidArgument, got %v", err) + } +} + +func TestResolveConfSource_UsesPreferredResolvedWhenAvailable(t *testing.T) { + dataDir := t.TempDir() + preferredPath := filepath.Join(dataDir, "preferred-resolv.conf") + if err := os.WriteFile(preferredPath, []byte("nameserver 9.9.9.9\n"), 0o600); err != nil { + t.Fatalf("failed to seed preferred resolv.conf: %v", err) + } + + path, err := resolveConfSource(dataDir, preferredPath) + if err != nil { + t.Fatalf("resolveConfSource returned error: %v", err) + } + if path != preferredPath { + t.Fatalf("expected preferred path, got %q", path) + } +} + +func TestResolveConfSource_UsesSystemdResolvedWhenAvailable(t *testing.T) { + if _, err := os.Stat(systemdResolvConf); errors.Is(err, os.ErrNotExist) { + t.Skip("systemd-resolved config not available on this host") + } else if err != nil { + t.Fatalf("failed to stat %s: %v", systemdResolvConf, err) + } + + path, err := ResolveConfSource(t.TempDir()) + if err != nil { + t.Fatalf("ResolveConfSource returned error: %v", err) + } + if path != systemdResolvConf { + t.Fatalf("expected systemd-resolved path, got %q", path) + } +} + +func TestResolveConfSource_FallbackCreatesReadableFile(t *testing.T) { + dataDir := t.TempDir() + preferredPath := filepath.Join(dataDir, "missing-preferred-resolv.conf") + + path, err := resolveConfSource(dataDir, preferredPath) + if err != nil { + t.Fatalf("resolveConfSource returned error: %v", err) + } + + if path != filepath.Join(dataDir, "resolv.conf") { + t.Fatalf("expected fallback path, got %q", path) + } + + //nolint:gosec // test reads a file it just created in a temp directory + content, err := os.ReadFile(path) + if err != nil { + t.Fatalf("failed to read fallback resolv.conf: %v", err) + } + if string(content) != fallbackResolv { + t.Fatalf("unexpected fallback resolv.conf contents: %q", string(content)) + } + + info, err := os.Stat(path) + if err != nil { + t.Fatalf("failed to stat fallback resolv.conf: %v", err) + } + if 
perm := info.Mode().Perm(); perm != fallbackResolvPerm { + t.Fatalf("expected permissions %o, got %o", fallbackResolvPerm, perm) + } +} + +func TestResolveConfSource_FallbackFixesExistingPermissions(t *testing.T) { + dataDir := t.TempDir() + fallbackPath := filepath.Join(dataDir, "resolv.conf") + if err := os.WriteFile(fallbackPath, []byte(fallbackResolv), 0o600); err != nil { + t.Fatalf("failed to seed fallback resolv.conf: %v", err) + } + + path, err := resolveConfSource(dataDir, filepath.Join(dataDir, "missing-preferred-resolv.conf")) + if err != nil { + t.Fatalf("resolveConfSource returned error: %v", err) + } + if path != fallbackPath { + t.Fatalf("expected existing fallback path, got %q", path) + } + + info, err := os.Stat(fallbackPath) + if err != nil { + t.Fatalf("failed to stat fallback resolv.conf: %v", err) + } + if perm := info.Mode().Perm(); perm != fallbackResolvPerm { + t.Fatalf("expected permissions %o, got %o", fallbackResolvPerm, perm) + } +} diff --git a/internal/containerd/service.go b/internal/containerd/service.go index f4a929e0..ed868dd9 100644 --- a/internal/containerd/service.go +++ b/internal/containerd/service.go @@ -34,6 +34,7 @@ var ( type PullImageOptions struct { Unpack bool Snapshotter string + OnProgress func(PullProgress) // optional, nil = no progress reporting } type DeleteImageOptions struct { @@ -140,6 +141,39 @@ func (s *DefaultService) PullImage(ctx context.Context, ref string, opts *PullIm pullOpts = append(pullOpts, containerd.WithPullSnapshotter(opts.Snapshotter)) } + // When OnProgress is set, poll content store for active download statuses. 
+ if opts != nil && opts.OnProgress != nil { + stop := make(chan struct{}) + go func() { + ticker := time.NewTicker(500 * time.Millisecond) + defer ticker.Stop() + cs := s.client.ContentStore() + for { + select { + case <-stop: + return + case <-ctx.Done(): + return + case <-ticker.C: + statuses, err := cs.ListStatuses(ctx) + if err != nil { + continue + } + layers := make([]LayerStatus, len(statuses)) + for i, st := range statuses { + layers[i] = LayerStatus{ + Ref: st.Ref, + Offset: st.Offset, + Total: st.Total, + } + } + opts.OnProgress(PullProgress{Layers: layers}) + } + } + }() + defer close(stop) + } + img, err := s.client.Pull(ctx, ref, pullOpts...) if err != nil { return ImageInfo{}, err diff --git a/internal/containerd/types.go b/internal/containerd/types.go index a98ff169..31242f0e 100644 --- a/internal/containerd/types.go +++ b/internal/containerd/types.go @@ -94,6 +94,16 @@ type ContainerSpec struct { TTY bool } +type LayerStatus struct { + Ref string `json:"ref"` + Offset int64 `json:"offset"` + Total int64 `json:"total"` +} + +type PullProgress struct { + Layers []LayerStatus `json:"layers"` +} + type NetworkSetupRequest struct { ContainerID string PID uint32 diff --git a/internal/conversation/flow/resolver.go b/internal/conversation/flow/resolver.go index b93f5b5b..434adfdf 100644 --- a/internal/conversation/flow/resolver.go +++ b/internal/conversation/flow/resolver.go @@ -157,7 +157,6 @@ type gatewayModelConfig struct { type gatewayIdentity struct { BotID string `json:"botId"` - ContainerID string `json:"containerId"` ChannelIdentityID string `json:"channelIdentityId"` DisplayName string `json:"displayName"` CurrentPlatform string `json:"currentPlatform,omitempty"` @@ -379,8 +378,6 @@ func (r *Resolver) resolve(ctx context.Context, req conversation.ChatRequest) (r messages = append(messages, reqMessages...) 
messages = sanitizeMessages(messages) skills := dedup(req.Skills) - containerID := r.resolveContainerID(ctx, req.BotID, req.ContainerID) - var usableSkills []gatewaySkill if r.skillLoader != nil { entries, err := r.skillLoader.LoadSkills(ctx, req.BotID) @@ -467,7 +464,6 @@ func (r *Resolver) resolve(ctx context.Context, req conversation.ChatRequest) (r Query: headerifiedQuery, Identity: gatewayIdentity{ BotID: req.BotID, - ContainerID: containerID, ChannelIdentityID: strings.TrimSpace(req.SourceChannelIdentityID), DisplayName: displayName, CurrentPlatform: req.CurrentChannel, @@ -1275,25 +1271,6 @@ func encodeReaderAsDataURL(reader io.Reader, maxBytes int64, attachmentType, fal return encoded.String(), mime, nil } -// --- container resolution --- - -func (r *Resolver) resolveContainerID(ctx context.Context, botID, explicit string) string { - if strings.TrimSpace(explicit) != "" { - return explicit - } - if r.queries != nil { - pgBotID, err := parseResolverUUID(botID) - if err == nil { - row, err := r.queries.GetContainerByBotID(ctx, pgBotID) - if err == nil && strings.TrimSpace(row.ContainerID) != "" { - return row.ContainerID - } - } - } - r.logger.Warn("no container found for bot, using fallback", slog.String("bot_id", botID)) - return "mcp-" + botID -} - // --- message loading --- type messageWithUsage struct { diff --git a/internal/conversation/flow/resolver_test.go b/internal/conversation/flow/resolver_test.go index 6c1ac84a..eb7c4367 100644 --- a/internal/conversation/flow/resolver_test.go +++ b/internal/conversation/flow/resolver_test.go @@ -57,7 +57,6 @@ func TestPostTriggerSchedule_Endpoint(t *testing.T) { Skills: []string{}, Identity: gatewayIdentity{ BotID: "bot-123", - ContainerID: "mcp-bot-123", ChannelIdentityID: "owner-user-1", DisplayName: "Scheduler", }, diff --git a/internal/conversation/types.go b/internal/conversation/types.go index 8ff0bda2..cf1cdf41 100644 --- a/internal/conversation/types.go +++ b/internal/conversation/types.go @@ -220,7 
+220,6 @@ type ChatRequest struct { Token string `json:"-"` UserID string `json:"-"` SourceChannelIdentityID string `json:"-"` - ContainerID string `json:"-"` DisplayName string `json:"-"` RouteID string `json:"-"` ChatToken string `json:"-"` diff --git a/internal/handlers/containerd.go b/internal/handlers/containerd.go index fd5ce7cb..a086aca5 100644 --- a/internal/handlers/containerd.go +++ b/internal/handlers/containerd.go @@ -1,7 +1,9 @@ package handlers import ( + "bufio" "context" + "encoding/json" "errors" "fmt" "io" @@ -10,28 +12,24 @@ import ( "sort" "strings" "sync" + "sync/atomic" "time" "github.com/containerd/errdefs" - "github.com/google/uuid" - "github.com/jackc/pgx/v5" "github.com/labstack/echo/v4" "github.com/memohai/memoh/internal/accounts" "github.com/memohai/memoh/internal/bots" "github.com/memohai/memoh/internal/config" ctr "github.com/memohai/memoh/internal/containerd" - "github.com/memohai/memoh/internal/db" - dbsqlc "github.com/memohai/memoh/internal/db/sqlc" "github.com/memohai/memoh/internal/mcp" "github.com/memohai/memoh/internal/policy" + "github.com/memohai/memoh/internal/workspace" ) type ContainerdHandler struct { - service ctr.Service - manager *mcp.Manager - cfg config.MCPConfig - namespace string + manager *workspace.Manager + cfg config.WorkspaceConfig containerBackend string logger *slog.Logger toolGateway *mcp.ToolGatewayService @@ -41,12 +39,12 @@ type ContainerdHandler struct { botService *bots.Service accountService *accounts.Service policyService *policy.Service - queries *dbsqlc.Queries } type CreateContainerRequest struct { Snapshotter string `json:"snapshotter,omitempty"` RestoreData bool `json:"restore_data,omitempty"` + Image string `json:"image,omitempty"` } type CreateContainerResponse struct { @@ -58,6 +56,36 @@ type CreateContainerResponse struct { HasPreservedData bool `json:"has_preserved_data"` } +// codesync(container-create-stream): keep these SSE payloads in sync with +// packages/sdk/src/container-stream.ts. 
+type createContainerPullingEvent struct { + Type string `json:"type"` + Image string `json:"image"` +} + +type createContainerPullProgressEvent struct { + Type string `json:"type"` + Layers []ctr.LayerStatus `json:"layers"` +} + +type createContainerCreatingEvent struct { + Type string `json:"type"` +} + +type createContainerCompleteEvent struct { + Type string `json:"type"` + Container CreateContainerResponse `json:"container"` +} + +type createContainerRestoringEvent struct { + Type string `json:"type"` +} + +type createContainerErrorEvent struct { + Type string `json:"type"` + Message string `json:"message"` +} + type GetContainerResponse struct { ContainerID string `json:"container_id"` Image string `json:"image"` @@ -66,6 +94,7 @@ type GetContainerResponse struct { ContainerPath string `json:"container_path"` TaskRunning bool `json:"task_running"` HasPreservedData bool `json:"has_preserved_data"` + Legacy bool `json:"legacy"` CreatedAt time.Time `json:"created_at"` UpdatedAt time.Time `json:"updated_at"` } @@ -108,12 +137,10 @@ type ListSnapshotsResponse struct { Snapshots []SnapshotInfo `json:"snapshots"` } -func NewContainerdHandler(log *slog.Logger, service ctr.Service, manager *mcp.Manager, cfg config.MCPConfig, namespace string, containerBackend string, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service, queries *dbsqlc.Queries) *ContainerdHandler { +func NewContainerdHandler(log *slog.Logger, manager *workspace.Manager, cfg config.WorkspaceConfig, containerBackend string, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service) *ContainerdHandler { h := &ContainerdHandler{ - service: service, manager: manager, cfg: cfg, - namespace: namespace, containerBackend: containerBackend, logger: log.With(slog.String("handler", "containerd")), mcpSess: make(map[string]*mcpSession), @@ -121,7 +148,6 @@ func NewContainerdHandler(log *slog.Logger, service ctr.Service, manager *mcp.Ma 
botService: botService, accountService: accountService, policyService: policyService, - queries: queries, } return h } @@ -166,7 +192,7 @@ func (h *ContainerdHandler) Register(e *echo.Echo) { // @Tags containerd // @Param bot_id path string true "Bot ID" // @Param payload body CreateContainerRequest true "Create container payload" -// @Success 200 {object} CreateContainerResponse +// @Success 200 {object} CreateContainerResponse "SSE stream of container creation events" // @Failure 400 {object} ErrorResponse // @Failure 500 {object} ErrorResponse // @Router /bots/{bot_id}/container [post]. @@ -180,156 +206,129 @@ func (h *ContainerdHandler) CreateContainer(c echo.Context) error { if err := c.Bind(&req); err != nil { return echo.NewHTTPError(http.StatusBadRequest, err.Error()) } - containerID := mcp.ContainerPrefix + botID + // Image override lets administrators specify a custom base image. + // NOTE(saas): if this becomes a multi-tenant SaaS, image override must be + // validated against an allowlist to prevent SSRF and resource abuse. 
+ ctx := c.Request().Context() + imageOverride := strings.TrimSpace(req.Image) + image, err := h.manager.ResolveWorkspaceImage(ctx, botID) + if err != nil { + h.logger.Error("resolve workspace image failed", + slog.String("bot_id", botID), slog.Any("error", err)) + return nil + } + if imageOverride != "" { + image = config.NormalizeImageRef(imageOverride) + } - image := h.mcpImageRef() snapshotter := strings.TrimSpace(req.Snapshotter) if snapshotter == "" { snapshotter = h.cfg.Snapshotter } - ctx := c.Request().Context() - - if h.manager == nil { - return echo.NewHTTPError(http.StatusInternalServerError, "manager not configured") + flusher, ok := c.Response().Writer.(http.Flusher) + if !ok { + return echo.NewHTTPError(http.StatusInternalServerError, "streaming not supported") } - started := false - if err := h.manager.Start(ctx, botID); err != nil { - h.logger.Error("mcp container start failed", - slog.String("container_id", containerID), - slog.Any("error", err), - ) - } else { - started = true + c.Response().Header().Set(echo.HeaderContentType, "text/event-stream") + c.Response().Header().Set(echo.HeaderCacheControl, "no-cache") + c.Response().Header().Set(echo.HeaderConnection, "keep-alive") + c.Response().WriteHeader(http.StatusOK) + writer := bufio.NewWriter(c.Response().Writer) + + var mu sync.Mutex + send := func(payload any) { + mu.Lock() + defer mu.Unlock() + data, err := json.Marshal(payload) + if err != nil { + return + } + _ = writeSSEData(writer, flusher, string(data)) + } + + sendError := func(msg string) { + send(createContainerErrorEvent{Type: "error", Message: msg}) + } + + // Phase 1: Pull image with progress + send(createContainerPullingEvent{Type: "pulling", Image: image}) + + var pullDone atomic.Bool + _, pullErr := h.manager.PullImage(ctx, image, &ctr.PullImageOptions{ + Unpack: true, + Snapshotter: snapshotter, + OnProgress: func(p ctr.PullProgress) { + if pullDone.Load() { + return + } + send(createContainerPullProgressEvent{Type: 
"pull_progress", Layers: p.Layers}) + }, + }) + pullDone.Store(true) + if pullErr != nil { + h.logger.Error("image pull failed", + slog.String("image", image), slog.Any("error", pullErr)) + sendError("image pull failed: " + pullErr.Error()) + return nil + } + + // Phase 2: Create container (image is local, should be fast) + send(createContainerCreatingEvent{Type: "creating"}) + + // Notify the client before starting if data migration will happen, + // since restoring a large /data volume can take a while. + if h.manager.HasPreservedData(botID) { + send(createContainerRestoringEvent{Type: "restoring"}) + } + + if err := h.manager.StartWithResolvedImage(ctx, botID, image); err != nil { + h.logger.Error("container start failed", + slog.String("bot_id", botID), slog.Any("error", err)) + sendError("container start failed: " + err.Error()) + return nil + } + if err := h.manager.RememberWorkspaceImage(ctx, botID, image); err != nil { + h.logger.Warn("remember workspace image failed", + slog.String("bot_id", botID), slog.String("image", image), slog.Any("error", err)) + } + + containerID, err := h.manager.ContainerID(ctx, botID) + if err != nil { + h.logger.Error("container ID resolution failed after start", + slog.String("bot_id", botID), slog.Any("error", err)) + sendError("container ID resolution failed: " + err.Error()) + return nil } dataRestored := false - if started && req.RestoreData && h.manager.HasPreservedData(botID) { + if req.RestoreData && h.manager.HasPreservedData(botID) { if err := h.manager.RestorePreservedData(ctx, botID); err != nil { - h.logger.Warn("restore preserved data on create failed", + h.logger.Error("restore preserved data failed", slog.String("bot_id", botID), slog.Any("error", err)) - } else { - dataRestored = true - } - } - - h.upsertContainerRecord(ctx, botID, containerID, map[bool]string{true: "running", false: "created"}[started]) - - return c.JSON(http.StatusOK, CreateContainerResponse{ - ContainerID: containerID, - Image: image, - 
Snapshotter: snapshotter, - Started: started, - DataRestored: dataRestored, - HasPreservedData: h.manager.HasPreservedData(botID), - }) -} - -// ensureContainerAndTask verifies the container exists in containerd and its task is -// running. If the container is missing (e.g. after a VM restart) it is recreated via -// SetupBotContainer. This prevents permanent desync between DB and containerd state. -func (h *ContainerdHandler) ensureContainerAndTask(ctx context.Context, containerID, botID string) error { - _, err := h.service.GetContainer(ctx, containerID) - if err != nil { - if !errdefs.IsNotFound(err) { - return err - } - h.logger.Warn("container missing in containerd, rebuilding", - slog.String("bot_id", botID), - slog.String("container_id", containerID), - ) - return h.SetupBotContainer(ctx, botID) - } - - tasks, err := h.service.ListTasks(ctx, &ctr.ListTasksOptions{ - Filter: "container.id==" + containerID, - }) - if err != nil { - return err - } - if len(tasks) > 0 { - if tasks[0].Status == ctr.TaskStatusRunning { - if err := h.setupNetworkOrFail(ctx, containerID, botID); err != nil { - return err - } + sendError("restore preserved data failed: " + err.Error()) return nil } - if err := h.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil { - if !errdefs.IsNotFound(err) { - h.logger.Warn("cleanup: delete task failed", slog.String("container_id", containerID), slog.Any("error", err)) - return err - } - } + dataRestored = true } - if err := h.service.StartContainer(ctx, containerID, nil); err != nil { - return err - } - return h.setupNetworkOrFail(ctx, containerID, botID) -} + h.manager.RecordContainerRunning(ctx, botID, containerID, image) -// setupNetworkOrFail attempts CNI network setup with one retry. Returns an error -// if no usable IP is obtained — callers must not silently ignore this. 
-func (h *ContainerdHandler) setupNetworkOrFail(ctx context.Context, containerID, botID string) error { - var lastErr error - for attempt := 0; attempt < 2; attempt++ { - netResult, err := h.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ - ContainerID: containerID, - CNIBinDir: h.cfg.CNIBinaryDir, - CNIConfDir: h.cfg.CNIConfigDir, - }) - if err != nil { - lastErr = err - h.logger.Warn("network setup attempt failed", - slog.String("container_id", containerID), - slog.Int("attempt", attempt+1), - slog.Any("error", err)) - continue - } - if netResult.IP == "" { - lastErr = fmt.Errorf("network setup returned no IP for %s", containerID) - continue - } - if h.manager != nil { - h.manager.SetContainerIP(botID, netResult.IP) - } - return nil - } - return fmt.Errorf("network setup failed for container %s: %w", containerID, lastErr) -} + // Phase 3: Complete + send(createContainerCompleteEvent{ + Type: "complete", + Container: CreateContainerResponse{ + ContainerID: containerID, + Image: image, + Snapshotter: snapshotter, + Started: true, + DataRestored: dataRestored, + HasPreservedData: h.manager.HasPreservedData(botID), + }, + }) -// botContainerID resolves container_id for a bot from the database. 
-func (h *ContainerdHandler) botContainerID(ctx context.Context, botID string) (string, error) { - if h.queries != nil { - pgBotID, err := db.ParseUUID(botID) - if err == nil { - row, dbErr := h.queries.GetContainerByBotID(ctx, pgBotID) - if dbErr == nil && strings.TrimSpace(row.ContainerID) != "" { - return row.ContainerID, nil - } - if dbErr != nil && !errors.Is(dbErr, pgx.ErrNoRows) { - h.logger.Warn("botContainerID: db lookup failed", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } - } - containers, err := h.service.ListContainersByLabel(ctx, mcp.BotLabelKey, botID) - if err != nil { - return "", err - } - if len(containers) == 0 { - return "", echo.NewHTTPError(http.StatusNotFound, "container not found") - } - bestID := "" - var bestUpdated time.Time - for _, info := range containers { - if bestID == "" || info.UpdatedAt.After(bestUpdated) { - bestID = info.ID - bestUpdated = info.UpdatedAt - } - } - return bestID, nil + return nil } // GetContainer godoc @@ -345,57 +344,24 @@ func (h *ContainerdHandler) GetContainer(c echo.Context) error { if err != nil { return err } - ctx := c.Request().Context() - - if h.queries != nil { - pgBotID, parseErr := db.ParseUUID(botID) - if parseErr == nil { - row, dbErr := h.queries.GetContainerByBotID(ctx, pgBotID) - if dbErr == nil { - taskRunning := h.isTaskRunning(ctx, row.ContainerID) - createdAt := time.Time{} - if row.CreatedAt.Valid { - createdAt = row.CreatedAt.Time - } - updatedAt := time.Time{} - if row.UpdatedAt.Valid { - updatedAt = row.UpdatedAt.Time - } - return c.JSON(http.StatusOK, GetContainerResponse{ - ContainerID: row.ContainerID, - Image: row.Image, - Status: row.Status, - Namespace: row.Namespace, - ContainerPath: row.ContainerPath, - TaskRunning: taskRunning, - HasPreservedData: h.manager.HasPreservedData(botID), - CreatedAt: createdAt, - UpdatedAt: updatedAt, - }) - } - } - } - - containerID, err := h.botContainerID(ctx, botID) + status, err := 
h.manager.GetContainerInfo(c.Request().Context(), botID) if err != nil { - return echo.NewHTTPError(http.StatusNotFound, "container not found for bot") - } - info, err := h.service.GetContainer(ctx, containerID) - if err != nil { - if errdefs.IsNotFound(err) { - return echo.NewHTTPError(http.StatusNotFound, "container not found") + if errors.Is(err, workspace.ErrContainerNotFound) { + return echo.NewHTTPError(http.StatusNotFound, "container not found for bot") } return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) } return c.JSON(http.StatusOK, GetContainerResponse{ - ContainerID: info.ID, - Image: info.Image, - Status: "unknown", - Namespace: h.namespace, - TaskRunning: h.isTaskRunning(ctx, containerID), - HasPreservedData: h.manager.HasPreservedData(botID), - CreatedAt: info.CreatedAt, - UpdatedAt: info.UpdatedAt, + ContainerID: status.ContainerID, + Image: status.Image, + Status: status.Status, + Namespace: status.Namespace, + ContainerPath: status.ContainerPath, + TaskRunning: status.TaskRunning, + HasPreservedData: status.HasPreservedData, + Legacy: status.Legacy, + CreatedAt: status.CreatedAt, + UpdatedAt: status.UpdatedAt, }) } @@ -414,7 +380,7 @@ func (h *ContainerdHandler) DeleteContainer(c echo.Context) error { return err } preserveData := c.QueryParam("preserve_data") == "true" - if err := h.CleanupBotContainer(c.Request().Context(), botID, preserveData); err != nil { + if err := h.manager.CleanupBotContainer(c.Request().Context(), botID, preserveData); err != nil { return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) } return c.NoContent(http.StatusNoContent) @@ -433,21 +399,11 @@ func (h *ContainerdHandler) StartContainer(c echo.Context) error { if err != nil { return err } - ctx := c.Request().Context() - containerID, err := h.botContainerID(ctx, botID) - if err != nil { - return echo.NewHTTPError(http.StatusNotFound, "container not found for bot") - } - if err := h.ensureContainerAndTask(ctx, containerID, botID); err 
!= nil { - return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) - } - if h.queries != nil { - if pgBotID, parseErr := db.ParseUUID(botID); parseErr == nil { - if dbErr := h.queries.UpdateContainerStarted(ctx, pgBotID); dbErr != nil { - h.logger.Error("failed to update container started status", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } + if err := h.manager.EnsureRunning(c.Request().Context(), botID); err != nil { + if errors.Is(err, workspace.ErrContainerNotFound) { + return echo.NewHTTPError(http.StatusNotFound, "container not found for bot") } + return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) } return c.JSON(http.StatusOK, map[string]bool{"started": true}) } @@ -465,27 +421,11 @@ func (h *ContainerdHandler) StopContainer(c echo.Context) error { if err != nil { return err } - ctx := c.Request().Context() - containerID, err := h.botContainerID(ctx, botID) - if err != nil { - return echo.NewHTTPError(http.StatusNotFound, "container not found for bot") - } - if err := h.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{ - Timeout: 10 * time.Second, - Force: true, - }); err != nil && !errdefs.IsNotFound(err) { - return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) - } - if err := h.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil { - h.logger.Warn("cleanup: delete task failed", slog.String("container_id", containerID), slog.Any("error", err)) - } - if h.queries != nil { - if pgBotID, parseErr := db.ParseUUID(botID); parseErr == nil { - if dbErr := h.queries.UpdateContainerStopped(ctx, pgBotID); dbErr != nil { - h.logger.Error("failed to update container stopped status", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } + if err := h.manager.StopBot(c.Request().Context(), botID); err != nil { + if errors.Is(err, workspace.ErrContainerNotFound) { + return echo.NewHTTPError(http.StatusNotFound, "container not found for bot") } + return 
echo.NewHTTPError(http.StatusInternalServerError, err.Error()) } return c.JSON(http.StatusOK, map[string]bool{"stopped": true}) } @@ -515,7 +455,7 @@ func (h *ContainerdHandler) CreateSnapshot(c echo.Context) error { if err := c.Bind(&req); err != nil { return echo.NewHTTPError(http.StatusBadRequest, err.Error()) } - created, err := h.manager.CreateSnapshot(c.Request().Context(), botID, req.SnapshotName, mcp.SnapshotSourceManual) + created, err := h.manager.CreateSnapshot(c.Request().Context(), botID, req.SnapshotName, workspace.SnapshotSourceManual) if err != nil { if errdefs.IsNotFound(err) { return echo.NewHTTPError(http.StatusNotFound, "container not found") @@ -529,7 +469,7 @@ func (h *ContainerdHandler) CreateSnapshot(c echo.Context) error { DisplayName: created.DisplayName, Snapshotter: created.Snapshotter, Version: created.Version, - Source: mcp.SnapshotSourceManual, + Source: workspace.SnapshotSourceManual, }) } @@ -590,7 +530,7 @@ func (h *ContainerdHandler) ListSnapshots(c echo.Context) error { items := make([]SnapshotInfo, 0, len(lineage)+len(data.ManagedMeta)) seen := make(map[string]struct{}, len(lineage)+len(data.ManagedMeta)) - appendRuntime := func(runtimeInfo ctr.SnapshotInfo, fallbackSource string, meta *mcp.ManagedSnapshotMeta) { + appendRuntime := func(runtimeInfo ctr.SnapshotInfo, fallbackSource string, meta *workspace.ManagedSnapshotMeta) { source := fallbackSource managed := false var version *int @@ -824,10 +764,6 @@ func snapshotLineage(root string, all []ctr.SnapshotInfo) ([]ctr.SnapshotInfo, b // ---------- auth helpers ---------- -func (h *ContainerdHandler) mcpImageRef() string { - return h.cfg.ImageRef() -} - // requireBotAccess extracts bot_id from path, validates user auth, and authorizes bot access. 
func (h *ContainerdHandler) requireBotAccess(c echo.Context) (string, error) { channelIdentityID, err := h.requireChannelIdentityID(c) @@ -868,180 +804,3 @@ func (h *ContainerdHandler) requireBotAccessWithGuest(c echo.Context) (string, e } return botID, nil } - -// SetupBotContainer creates and starts the MCP container for a bot. -func (h *ContainerdHandler) SetupBotContainer(ctx context.Context, botID string) error { - containerID := mcp.ContainerPrefix + botID - - if h.manager == nil { - return errors.New("manager not configured") - } - - if err := h.manager.Start(ctx, botID); err != nil { - h.logger.Error("setup bot container: start failed", - slog.String("bot_id", botID), - slog.String("container_id", containerID), - slog.Any("error", err), - ) - return err - } - - h.upsertContainerRecord(ctx, botID, containerID, "running") - return nil -} - -// CleanupBotContainer removes the containerd container and DB record for a bot. -// When preserveData is true, /data is exported to a backup archive before -// deletion so it can be restored into a future container. 
-func (h *ContainerdHandler) CleanupBotContainer(ctx context.Context, botID string, preserveData bool) error { - h.logger.Info("CleanupBotContainer starting", - slog.String("bot_id", botID), slog.Bool("preserve_data", preserveData)) - - if h.manager != nil { - if err := h.manager.Delete(ctx, botID, preserveData); err != nil { - if !errdefs.IsNotFound(err) { - return err - } - h.logger.Warn("CleanupBotContainer: container not found in containerd", - slog.String("bot_id", botID)) - } - } - - if h.queries != nil { - if pgBotID, parseErr := db.ParseUUID(botID); parseErr == nil { - if dbErr := h.queries.DeleteContainerByBotID(ctx, pgBotID); dbErr != nil { - h.logger.Error("CleanupBotContainer: failed to delete DB record", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } - } - h.logger.Info("CleanupBotContainer finished", slog.String("bot_id", botID)) - return nil -} - -func (h *ContainerdHandler) isTaskRunning(ctx context.Context, containerID string) bool { - tasks, err := h.service.ListTasks(ctx, &ctr.ListTasksOptions{ - Filter: "container.id==" + containerID, - }) - return err == nil && len(tasks) > 0 && tasks[0].Status == ctr.TaskStatusRunning -} - -// ReconcileContainers compares the DB containers table against actual containerd -// state on startup. For each auto_start container in DB it verifies the container -// and task exist; if missing they are rebuilt via SetupBotContainer. Containers that -// the DB claims are running but are not present in containerd get corrected. 
-func (h *ContainerdHandler) ReconcileContainers(ctx context.Context) { - if h.queries == nil { - return - } - rows, err := h.queries.ListAutoStartContainers(ctx) - if err != nil { - h.logger.Error("reconcile: failed to list containers from DB", slog.Any("error", err)) - return - } - if len(rows) == 0 { - h.logger.Info("reconcile: no auto-start containers in DB") - return - } - - h.logger.Info("reconcile: checking containers", slog.Int("count", len(rows))) - for _, row := range rows { - containerID := row.ContainerID - botID := uuid.UUID(row.BotID.Bytes).String() - - _, err := h.service.GetContainer(ctx, containerID) - if err != nil { - if !errdefs.IsNotFound(err) { - h.logger.Error("reconcile: failed to get container", - slog.String("container_id", containerID), slog.Any("error", err)) - continue - } - // Container missing in containerd — rebuild. - h.logger.Warn("reconcile: container missing, rebuilding", - slog.String("bot_id", botID), slog.String("container_id", containerID)) - if setupErr := h.SetupBotContainer(ctx, botID); setupErr != nil { - h.logger.Error("reconcile: rebuild failed", - slog.String("bot_id", botID), slog.Any("error", setupErr)) - if dbErr := h.queries.UpdateContainerStatus(ctx, dbsqlc.UpdateContainerStatusParams{ - Status: "error", - BotID: row.BotID, - }); dbErr != nil { - h.logger.Error("reconcile: failed to mark container as error", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } - continue - } - - // Container exists — ensure the task is running. 
- running := h.isTaskRunning(ctx, containerID) - if running { - if row.Status != "running" { - if dbErr := h.queries.UpdateContainerStarted(ctx, row.BotID); dbErr != nil { - h.logger.Error("reconcile: failed to update DB status to running", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } - if netErr := h.setupNetworkOrFail(ctx, containerID, botID); netErr != nil { - h.logger.Error("reconcile: network setup failed for running task, container unreachable", - slog.String("bot_id", botID), - slog.String("container_id", containerID), - slog.Any("error", netErr)) - } else { - h.logger.Info("reconcile: container healthy", - slog.String("bot_id", botID), slog.String("container_id", containerID)) - } - continue - } - - // Task not running — try to start it. - h.logger.Warn("reconcile: task not running, starting", - slog.String("bot_id", botID), slog.String("container_id", containerID)) - if err := h.ensureContainerAndTask(ctx, containerID, botID); err != nil { - h.logger.Error("reconcile: failed to start task", - slog.String("bot_id", botID), slog.Any("error", err)) - if dbErr := h.queries.UpdateContainerStopped(ctx, row.BotID); dbErr != nil { - h.logger.Error("reconcile: failed to mark container as stopped", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } else { - if dbErr := h.queries.UpdateContainerStarted(ctx, row.BotID); dbErr != nil { - h.logger.Error("reconcile: failed to update DB status to running", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } - } - h.logger.Info("reconcile: completed") -} - -func (h *ContainerdHandler) upsertContainerRecord(ctx context.Context, botID, containerID, status string) { - if h.queries == nil { - return - } - pgBotID, err := db.ParseUUID(botID) - if err != nil { - return - } - ns := strings.TrimSpace(h.namespace) - if ns == "" { - ns = "default" - } - if dbErr := h.queries.UpsertContainer(ctx, dbsqlc.UpsertContainerParams{ - BotID: pgBotID, - ContainerID: containerID, - 
ContainerName: containerID, - Image: h.mcpImageRef(), - Status: status, - Namespace: ns, - AutoStart: true, - }); dbErr != nil { - h.logger.Error("failed to upsert container record", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - if status == "running" { - if dbErr := h.queries.UpdateContainerStarted(ctx, pgBotID); dbErr != nil { - h.logger.Error("failed to update container started status", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } -} diff --git a/internal/handlers/containerd_terminal.go b/internal/handlers/containerd_terminal.go index c26fef65..d0300212 100644 --- a/internal/handlers/containerd_terminal.go +++ b/internal/handlers/containerd_terminal.go @@ -9,7 +9,7 @@ import ( "github.com/gorilla/websocket" "github.com/labstack/echo/v4" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) var terminalUpgrader = websocket.Upgrader{ diff --git a/internal/handlers/filemanager.go b/internal/handlers/filemanager.go index 627e0c78..b40cdbe7 100644 --- a/internal/handlers/filemanager.go +++ b/internal/handlers/filemanager.go @@ -12,7 +12,7 @@ import ( "github.com/labstack/echo/v4" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) // ---------- request / response types ---------- @@ -84,7 +84,7 @@ func resolveContainerPath(rawPath string) (string, error) { } // getGRPCClient returns the gRPC client for the bot's container. -func (h *ContainerdHandler) getGRPCClient(ctx context.Context, botID string) (*mcpclient.Client, error) { +func (h *ContainerdHandler) getGRPCClient(ctx context.Context, botID string) (*bridge.Client, error) { return h.manager.MCPClient(ctx, botID) } @@ -103,13 +103,13 @@ func fsFileInfoFromEntry(containerPath, name string, isDir bool, size int64, mod // fsHTTPError maps mcpclient domain errors to HTTP status codes. 
func fsHTTPError(err error) *echo.HTTPError { switch { - case errors.Is(err, mcpclient.ErrNotFound): + case errors.Is(err, bridge.ErrNotFound): return echo.NewHTTPError(http.StatusNotFound, err.Error()) - case errors.Is(err, mcpclient.ErrBadRequest): + case errors.Is(err, bridge.ErrBadRequest): return echo.NewHTTPError(http.StatusBadRequest, err.Error()) - case errors.Is(err, mcpclient.ErrForbidden): + case errors.Is(err, bridge.ErrForbidden): return echo.NewHTTPError(http.StatusForbidden, err.Error()) - case errors.Is(err, mcpclient.ErrUnavailable): + case errors.Is(err, bridge.ErrUnavailable): return echo.NewHTTPError(http.StatusServiceUnavailable, err.Error()) default: return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) diff --git a/internal/handlers/mcp_federation_gateway.go b/internal/handlers/mcp_federation_gateway.go index 0db5d2ac..701d1290 100644 --- a/internal/handlers/mcp_federation_gateway.go +++ b/internal/handlers/mcp_federation_gateway.go @@ -277,11 +277,11 @@ func (g *MCPFederationGateway) startStdioConnectionSession(ctx context.Context, if g.handler == nil { return nil, errors.New("containerd handler not configured") } - containerID, err := g.handler.botContainerID(ctx, botID) - if err != nil { + if err := g.handler.manager.EnsureRunning(ctx, botID); err != nil { return nil, err } - if err := g.handler.ensureContainerAndTask(ctx, containerID, botID); err != nil { + containerID, err := g.handler.manager.ContainerID(ctx, botID) + if err != nil { return nil, err } @@ -296,7 +296,7 @@ func (g *MCPFederationGateway) startStdioConnectionSession(ctx context.Context, Env: normalizeStringMap(connection.Config["env"]), Cwd: strings.TrimSpace(anyToString(connection.Config["cwd"])), } - return g.handler.startContainerdMCPCommandSession(ctx, containerID, request) + return g.handler.startContainerdMCPCommandSession(ctx, botID, containerID, request) } func parseGatewayToolsListPayload(payload map[string]any) ([]mcpgw.ToolDescriptor, error) { 
diff --git a/internal/handlers/mcp_stdio.go b/internal/handlers/mcp_stdio.go index 27e4545a..ac37e83b 100644 --- a/internal/handlers/mcp_stdio.go +++ b/internal/handlers/mcp_stdio.go @@ -20,7 +20,7 @@ import ( sdkmcp "github.com/modelcontextprotocol/go-sdk/mcp" mcptools "github.com/memohai/memoh/internal/mcp" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) // MCPStdioRequest represents a request to create an MCP stdio session. @@ -588,15 +588,15 @@ func (h *ContainerdHandler) CreateMCPStdio(c echo.Context) error { return echo.NewHTTPError(http.StatusBadRequest, "command is required") } ctx := c.Request().Context() - containerID, err := h.botContainerID(ctx, botID) + if err := h.manager.EnsureRunning(ctx, botID); err != nil { + return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) + } + containerID, err := h.manager.ContainerID(ctx, botID) if err != nil { return echo.NewHTTPError(http.StatusNotFound, "container not found for bot") } - if err := h.ensureContainerAndTask(ctx, containerID, botID); err != nil { - return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) - } - sess, err := h.startContainerdMCPCommandSession(ctx, containerID, req) + sess, err := h.startContainerdMCPCommandSession(ctx, botID, containerID, req) if err != nil { return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) } @@ -686,13 +686,7 @@ func (h *ContainerdHandler) HandleMCPStdio(c echo.Context) error { return c.JSON(http.StatusOK, payload) } -func (h *ContainerdHandler) startContainerdMCPCommandSession(ctx context.Context, containerID string, req MCPStdioRequest) (*mcpSession, error) { - // Extract bot_id from container_id (remove "mcp-" prefix) - botID := strings.TrimPrefix(containerID, "mcp-") - if botID == "" || botID == containerID { - return nil, fmt.Errorf("invalid container_id: %s", containerID) - } - +func (h *ContainerdHandler) startContainerdMCPCommandSession(ctx 
context.Context, botID, containerID string, req MCPStdioRequest) (*mcpSession, error) { // Get gRPC client for the bot container via manager client, err := h.manager.MCPClient(ctx, botID) if err != nil { diff --git a/internal/handlers/memory.go b/internal/handlers/memory.go index b9844990..4810ddbc 100644 --- a/internal/handlers/memory.go +++ b/internal/handlers/memory.go @@ -18,10 +18,10 @@ import ( "github.com/memohai/memoh/internal/accounts" "github.com/memohai/memoh/internal/bots" "github.com/memohai/memoh/internal/config" - "github.com/memohai/memoh/internal/mcp/mcpclient" memprovider "github.com/memohai/memoh/internal/memory/adapters" storefs "github.com/memohai/memoh/internal/memory/storefs" "github.com/memohai/memoh/internal/settings" + "github.com/memohai/memoh/internal/workspace/bridge" ) // MemoryHandler handles memory CRUD operations scoped by bot. @@ -120,7 +120,7 @@ func (h *MemoryHandler) resolveProvider(ctx context.Context, botID string) mempr } // SetMCPClientProvider sets the gRPC client provider for filesystem persistence. -func (h *MemoryHandler) SetMCPClientProvider(p mcpclient.Provider) { +func (h *MemoryHandler) SetMCPClientProvider(p bridge.Provider) { if p == nil { h.memoryStore = nil return @@ -671,7 +671,7 @@ func (h *MemoryHandler) requireBotAccess(c echo.Context) (string, error) { } // NewBuiltinMemoryRuntime keeps provider architecture while using file memory backend. 
-func NewBuiltinMemoryRuntime(p mcpclient.Provider) any { +func NewBuiltinMemoryRuntime(p bridge.Provider) any { if p == nil { return nil } diff --git a/internal/handlers/skills.go b/internal/handlers/skills.go index a3f9787a..9239d3e1 100644 --- a/internal/handlers/skills.go +++ b/internal/handlers/skills.go @@ -11,7 +11,7 @@ import ( "gopkg.in/yaml.v3" "github.com/memohai/memoh/internal/config" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) const skillsDirPath = config.DefaultDataMount + "/.skills" @@ -196,7 +196,7 @@ func (h *ContainerdHandler) loadSkillsFromContainer(ctx context.Context, botID s return skills, nil } -func readContainerSkillFile(ctx context.Context, client *mcpclient.Client, filePath string) (string, error) { +func readContainerSkillFile(ctx context.Context, client *bridge.Client, filePath string) (string, error) { resp, err := client.ReadFile(ctx, filePath, 0, 0) if err != nil { return "", err diff --git a/internal/mcp/manager.go b/internal/mcp/manager.go deleted file mode 100644 index 26b737ec..00000000 --- a/internal/mcp/manager.go +++ /dev/null @@ -1,409 +0,0 @@ -package mcp - -import ( - "context" - "errors" - "fmt" - "log/slog" - "strings" - "sync" - "time" - - "github.com/containerd/errdefs" - "github.com/jackc/pgx/v5/pgxpool" - - "github.com/memohai/memoh/internal/config" - ctr "github.com/memohai/memoh/internal/containerd" - dbsqlc "github.com/memohai/memoh/internal/db/sqlc" - "github.com/memohai/memoh/internal/identity" - "github.com/memohai/memoh/internal/mcp/mcpclient" -) - -const ( - BotLabelKey = "mcp.bot_id" - ContainerPrefix = "mcp-" -) - -type Manager struct { - service ctr.Service - cfg config.MCPConfig - namespace string - containerID func(string) string - db *pgxpool.Pool - queries *dbsqlc.Queries - logger *slog.Logger - containerLockMu sync.Mutex - containerLocks map[string]*sync.Mutex - mu sync.RWMutex - containerIPs map[string]string - grpcPool *mcpclient.Pool 
-} - -func NewManager(log *slog.Logger, service ctr.Service, cfg config.MCPConfig, namespace string, conn *pgxpool.Pool) *Manager { - if namespace == "" { - namespace = config.DefaultNamespace - } - m := &Manager{ - service: service, - cfg: cfg, - namespace: namespace, - db: conn, - queries: dbsqlc.New(conn), - logger: log.With(slog.String("component", "mcp")), - containerLocks: make(map[string]*sync.Mutex), - containerIPs: make(map[string]string), - containerID: func(botID string) string { - return ContainerPrefix + botID - }, - } - m.grpcPool = mcpclient.NewPool(m.ContainerIP) - return m -} - -func (m *Manager) lockContainer(containerID string) func() { - m.containerLockMu.Lock() - lock, ok := m.containerLocks[containerID] - if !ok { - lock = &sync.Mutex{} - m.containerLocks[containerID] = lock - } - m.containerLockMu.Unlock() - - lock.Lock() - return lock.Unlock -} - -// ContainerIP returns the cached IP address for a bot's container. -// If not cached, it attempts to recover the IP by re-running CNI setup. -func (m *Manager) ContainerIP(botID string) string { - m.mu.RLock() - if ip, ok := m.containerIPs[botID]; ok { - m.mu.RUnlock() - return ip - } - m.mu.RUnlock() - - // Cache miss - try to recover IP via CNI setup (idempotent) - ip, err := m.recoverContainerIP(botID) - if err != nil { - m.logger.Warn("container IP recovery failed", slog.String("bot_id", botID), slog.Any("error", err)) - return "" - } - if ip != "" { - m.mu.Lock() - m.containerIPs[botID] = ip - m.mu.Unlock() - m.logger.Info("container IP recovered", slog.String("bot_id", botID), slog.String("ip", ip)) - } - return ip -} - -// SetContainerIP stores the container IP in the cache. -// If the IP changed, the stale gRPC connection is evicted from the pool. 
-func (m *Manager) SetContainerIP(botID, ip string) { - if ip == "" { - return - } - m.mu.Lock() - old := m.containerIPs[botID] - m.containerIPs[botID] = ip - m.mu.Unlock() - - if old != "" && old != ip { - m.grpcPool.Remove(botID) - m.logger.Info("evicted stale gRPC connection", slog.String("bot_id", botID), slog.String("old_ip", old), slog.String("new_ip", ip)) - } -} - -// recoverContainerIP attempts to restore the container IP by re-running CNI setup. -// CNI plugins are idempotent — calling Setup again returns the existing IP allocation. -// Retries up to 2 times to tolerate transient CNI failures (IPAM lock contention, etc.). -func (m *Manager) recoverContainerIP(botID string) (string, error) { - ctx := context.Background() - containerID := m.containerID(botID) - - info, err := m.service.GetContainer(ctx, containerID) - if err != nil { - return "", err - } - - if ip, ok := info.Labels["mcp.container_ip"]; ok { - return ip, nil - } - - const maxAttempts = 2 - var lastErr error - for i := 0; i < maxAttempts; i++ { - netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ - ContainerID: containerID, - CNIBinDir: m.cfg.CNIBinaryDir, - CNIConfDir: m.cfg.CNIConfigDir, - }) - if err != nil { - lastErr = err - m.logger.Warn("IP recovery attempt failed", - slog.String("bot_id", botID), slog.Int("attempt", i+1), slog.Any("error", err)) - time.Sleep(time.Duration(i+1) * 500 * time.Millisecond) - continue - } - return netResult.IP, nil - } - return "", fmt.Errorf("network setup for IP recovery after %d attempts: %w", maxAttempts, lastErr) -} - -// MCPClient returns a gRPC client for the given bot's container. -// Implements mcpclient.Provider. 
-func (m *Manager) MCPClient(ctx context.Context, botID string) (*mcpclient.Client, error) { - return m.grpcPool.Get(ctx, botID) -} - -func (m *Manager) Init(ctx context.Context) error { - image := m.imageRef() - - needsPull, remoteErr := m.checkImageUpgrade(ctx, image) - if remoteErr != nil { - // Remote check failed (network unavailable, registry down, etc.). - // Fall back to local image if available; fail only when nothing is cached. - m.logger.Warn("image upgrade check failed, falling back to local", - slog.String("image", image), slog.Any("error", remoteErr)) - if _, err := m.service.GetImage(ctx, image); err != nil { - _, err = m.service.PullImage(ctx, image, &ctr.PullImageOptions{ - Unpack: true, - Snapshotter: m.cfg.Snapshotter, - }) - return err - } - return nil - } - - if !needsPull { - return nil - } - - m.logger.Info("pulling updated MCP image", slog.String("image", image)) - if _, err := m.service.PullImage(ctx, image, &ctr.PullImageOptions{ - Unpack: true, - Snapshotter: m.cfg.Snapshotter, - }); err != nil { - m.logger.Warn("image pull failed, using existing version", slog.Any("error", err)) - if _, err2 := m.service.GetImage(ctx, image); err2 != nil { - return err - } - return nil - } - - // Existing bot containers keep running with their current image. - // New containers created after this point will use the updated image. - return nil -} - -// checkImageUpgrade compares the local image digest against the remote registry. -// Returns (true, nil) when a newer image is available or no local image exists. -// Returns (false, err) when the remote cannot be reached. 
-func (m *Manager) checkImageUpgrade(ctx context.Context, image string) (needsPull bool, _ error) { - checkCtx, cancel := context.WithTimeout(ctx, 15*time.Second) - defer cancel() - - remoteDigest, err := m.service.ResolveRemoteDigest(checkCtx, image) - if err != nil { - return false, err - } - - localImg, err := m.service.GetImage(ctx, image) - if err != nil { - return true, nil // no local image - } - return localImg.ID != remoteDigest, nil -} - -// EnsureBot creates the MCP container for a bot if it does not exist. -// Bot data lives in the container's writable layer (snapshot), not bind mounts. -func (m *Manager) EnsureBot(ctx context.Context, botID string) error { - if err := validateBotID(botID); err != nil { - return err - } - - image := m.imageRef() - resolvPath, err := ctr.ResolveConfSource(m.dataRoot()) - if err != nil { - return err - } - - mounts := []ctr.MountSpec{ - { - Destination: "/etc/resolv.conf", - Type: "bind", - Source: resolvPath, - Options: []string{"rbind", "ro"}, - }, - } - tzMounts, tzEnv := ctr.TimezoneSpec() - mounts = append(mounts, tzMounts...) - - _, err = m.service.CreateContainer(ctx, ctr.CreateContainerRequest{ - ID: m.containerID(botID), - ImageRef: image, - Snapshotter: m.cfg.Snapshotter, - Labels: map[string]string{ - BotLabelKey: botID, - }, - Spec: ctr.ContainerSpec{ - Mounts: mounts, - Env: tzEnv, - }, - }) - if err == nil { - return nil - } - - if !errdefs.IsAlreadyExists(err) { - return err - } - - return nil -} - -// ListBots returns the bot IDs that have MCP containers. 
-func (m *Manager) ListBots(ctx context.Context) ([]string, error) { - containers, err := m.service.ListContainers(ctx) - if err != nil { - return nil, err - } - - botIDs := make([]string, 0, len(containers)) - for _, info := range containers { - if strings.HasPrefix(info.ID, ContainerPrefix) { - if botID, ok := info.Labels[BotLabelKey]; ok { - botIDs = append(botIDs, botID) - } - } - } - return botIDs, nil -} - -func (m *Manager) Start(ctx context.Context, botID string) error { - containerID := m.containerID(botID) - - // Before creating a new container, check for an orphaned snapshot - // (container deleted but snapshot with /data survived). Export /data - // to a backup so it can be restored after EnsureBot creates a fresh - // container. This covers dev image rebuilds, containerd metadata loss, - // and manual container deletion. - if _, err := m.service.GetContainer(ctx, containerID); errdefs.IsNotFound(err) { - m.recoverOrphanedSnapshot(ctx, botID) - } - - if err := m.EnsureBot(ctx, botID); err != nil { - return err - } - - // Restore preserved data (from orphaned snapshot recovery or a previous - // CleanupBotContainer with preserveData) into the fresh snapshot before - // starting the task, avoiding a redundant stop/start cycle. 
- if m.HasPreservedData(botID) { - if err := m.restorePreservedIntoSnapshot(ctx, botID); err != nil { - m.logger.Warn("restore preserved data into new container failed", - slog.String("bot_id", botID), slog.Any("error", err)) - } - } - - if err := m.service.StartContainer(ctx, containerID, nil); err != nil { - return err - } - netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ - ContainerID: containerID, - CNIBinDir: m.cfg.CNIBinaryDir, - CNIConfDir: m.cfg.CNIConfigDir, - }) - if err != nil { - if stopErr := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{Force: true}); stopErr != nil { - m.logger.Warn("cleanup: stop task failed", slog.String("container_id", containerID), slog.Any("error", stopErr)) - } - return err - } - if netResult.IP == "" { - if stopErr := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{Force: true}); stopErr != nil { - m.logger.Warn("cleanup: stop task failed", slog.String("container_id", containerID), slog.Any("error", stopErr)) - } - return fmt.Errorf("network setup returned no IP for bot %s", botID) - } - m.SetContainerIP(botID, netResult.IP) - m.logger.Info("container network ready", slog.String("bot_id", botID), slog.String("ip", netResult.IP)) - return nil -} - -func (m *Manager) Stop(ctx context.Context, botID string, timeout time.Duration) error { - if err := validateBotID(botID); err != nil { - return err - } - return m.service.StopContainer(ctx, m.containerID(botID), &ctr.StopTaskOptions{ - Timeout: timeout, - Force: true, - }) -} - -func (m *Manager) Delete(ctx context.Context, botID string, preserveData bool) error { - if err := validateBotID(botID); err != nil { - return err - } - - containerID := m.containerID(botID) - stoppedForPreserve := false - - if preserveData { - info, err := m.service.GetContainer(ctx, containerID) - if err != nil { - return fmt.Errorf("get container for preserve: %w", err) - } - if _, err := m.snapshotMounts(ctx, info); errors.Is(err, 
errMountNotSupported) { - // Apple backend fallback uses gRPC against a running container. - } else if err != nil { - return err - } else { - if err := m.safeStopTask(ctx, containerID); err != nil { - return fmt.Errorf("stop for data preserve: %w", err) - } - stoppedForPreserve = true - } - - if err := m.PreserveData(ctx, botID); err != nil { - // Export failed — restart only if we stopped the task, and abort - // deletion to prevent data loss. - if stoppedForPreserve { - m.restartContainer(ctx, botID, containerID) - } - return fmt.Errorf("preserve data: %w", err) - } - } - - m.grpcPool.Remove(botID) - - if err := m.service.RemoveNetwork(ctx, ctr.NetworkSetupRequest{ - ContainerID: containerID, - CNIBinDir: m.cfg.CNIBinaryDir, - CNIConfDir: m.cfg.CNIConfigDir, - }); err != nil { - m.logger.Warn("cleanup: remove network failed", slog.String("container_id", containerID), slog.Any("error", err)) - } - if err := m.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil { - m.logger.Warn("cleanup: delete task failed", slog.String("container_id", containerID), slog.Any("error", err)) - } - return m.service.DeleteContainer(ctx, containerID, &ctr.DeleteContainerOptions{ - CleanupSnapshot: true, - }) -} - -func (m *Manager) dataRoot() string { - if m.cfg.DataRoot == "" { - return config.DefaultDataRoot - } - return m.cfg.DataRoot -} - -func (m *Manager) imageRef() string { - return m.cfg.ImageRef() -} - -func validateBotID(botID string) error { - return identity.ValidateChannelIdentityID(botID) -} diff --git a/internal/mcp/providers/browser/provider.go b/internal/mcp/providers/browser/provider.go index 17061d6a..00fc7a4b 100644 --- a/internal/mcp/providers/browser/provider.go +++ b/internal/mcp/providers/browser/provider.go @@ -15,8 +15,8 @@ import ( "github.com/memohai/memoh/internal/browsercontexts" "github.com/memohai/memoh/internal/config" mcpgw "github.com/memohai/memoh/internal/mcp" - "github.com/memohai/memoh/internal/mcp/mcpclient" 
"github.com/memohai/memoh/internal/settings" + "github.com/memohai/memoh/internal/workspace/bridge" ) const ( @@ -28,12 +28,12 @@ type Executor struct { logger *slog.Logger settings *settings.Service browserContexts *browsercontexts.Service - containers mcpclient.Provider + containers bridge.Provider gatewayBaseURL string httpClient *http.Client } -func NewExecutor(log *slog.Logger, settingsSvc *settings.Service, browserSvc *browsercontexts.Service, containers mcpclient.Provider, gatewayCfg config.BrowserGatewayConfig) *Executor { +func NewExecutor(log *slog.Logger, settingsSvc *settings.Service, browserSvc *browsercontexts.Service, containers bridge.Provider, gatewayCfg config.BrowserGatewayConfig) *Executor { if log == nil { log = slog.Default() } diff --git a/internal/mcp/providers/container/provider.go b/internal/mcp/providers/container/provider.go index 2a400567..8a4cc14c 100644 --- a/internal/mcp/providers/container/provider.go +++ b/internal/mcp/providers/container/provider.go @@ -9,7 +9,7 @@ import ( "strings" mcpgw "github.com/memohai/memoh/internal/mcp" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) const ( @@ -26,13 +26,13 @@ const ( // operate inside the bot container via gRPC. All I/O goes through the container // sandbox — no direct host filesystem access. type Executor struct { - clients mcpclient.Provider + clients bridge.Provider execWorkDir string logger *slog.Logger } // NewExecutor returns a tool executor backed by gRPC container clients. 
-func NewExecutor(log *slog.Logger, clients mcpclient.Provider, execWorkDir string) *Executor { +func NewExecutor(log *slog.Logger, clients bridge.Provider, execWorkDir string) *Executor { if log == nil { log = slog.Default() } @@ -187,7 +187,7 @@ func (p *Executor) CallTool(ctx context.Context, session mcpgw.ToolSessionContex } } -func (p *Executor) callRead(ctx context.Context, client *mcpclient.Client, args map[string]any) (map[string]any, error) { +func (p *Executor) callRead(ctx context.Context, client *bridge.Client, args map[string]any) (map[string]any, error) { filePath := p.normalizePath(mcpgw.StringArg(args, "path")) if filePath == "" { return mcpgw.BuildToolErrorResult("path is required"), nil @@ -233,7 +233,7 @@ func (p *Executor) callRead(ctx context.Context, client *mcpclient.Client, args }), nil } -func (p *Executor) callWrite(ctx context.Context, client *mcpclient.Client, args map[string]any) (map[string]any, error) { +func (p *Executor) callWrite(ctx context.Context, client *bridge.Client, args map[string]any) (map[string]any, error) { filePath := p.normalizePath(mcpgw.StringArg(args, "path")) content := mcpgw.StringArg(args, "content") if filePath == "" { @@ -245,7 +245,7 @@ func (p *Executor) callWrite(ctx context.Context, client *mcpclient.Client, args return mcpgw.BuildToolSuccessResult(map[string]any{"ok": true}), nil } -func (p *Executor) callList(ctx context.Context, client *mcpclient.Client, args map[string]any) (map[string]any, error) { +func (p *Executor) callList(ctx context.Context, client *bridge.Client, args map[string]any) (map[string]any, error) { dirPath := p.normalizePath(mcpgw.StringArg(args, "path")) if dirPath == "" { dirPath = "." 
@@ -269,7 +269,7 @@ func (p *Executor) callList(ctx context.Context, client *mcpclient.Client, args return mcpgw.BuildToolSuccessResult(map[string]any{"path": dirPath, "entries": entriesMaps}), nil } -func (p *Executor) callEdit(ctx context.Context, client *mcpclient.Client, args map[string]any) (map[string]any, error) { +func (p *Executor) callEdit(ctx context.Context, client *bridge.Client, args map[string]any) (map[string]any, error) { filePath := p.normalizePath(mcpgw.StringArg(args, "path")) oldText := mcpgw.StringArg(args, "old_text") newText := mcpgw.StringArg(args, "new_text") @@ -298,7 +298,7 @@ func (p *Executor) callEdit(ctx context.Context, client *mcpclient.Client, args return mcpgw.BuildToolSuccessResult(map[string]any{"ok": true}), nil } -func (p *Executor) callExec(ctx context.Context, client *mcpclient.Client, botID string, args map[string]any) (map[string]any, error) { +func (p *Executor) callExec(ctx context.Context, client *bridge.Client, botID string, args map[string]any) (map[string]any, error) { command := strings.TrimSpace(mcpgw.StringArg(args, "command")) if command == "" { return mcpgw.BuildToolErrorResult("command is required"), nil diff --git a/internal/mcp/providers/container/provider_test.go b/internal/mcp/providers/container/provider_test.go index f9972f97..d978725a 100644 --- a/internal/mcp/providers/container/provider_test.go +++ b/internal/mcp/providers/container/provider_test.go @@ -13,8 +13,8 @@ import ( "google.golang.org/grpc/test/bufconn" mcpgw "github.com/memohai/memoh/internal/mcp" - "github.com/memohai/memoh/internal/mcp/mcpclient" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + "github.com/memohai/memoh/internal/workspace/bridge" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const bufSize = 1 << 20 @@ -164,8 +164,8 @@ func splitLines(s string) []string { return lines } -// testSetup creates a bufconn gRPC server and a matching mcpclient.Provider. 
-func testSetup(t *testing.T, svc *fakeContainerService) mcpclient.Provider { +// testSetup creates a bufconn gRPC server and a matching bridge.Provider. +func testSetup(t *testing.T, svc *fakeContainerService) bridge.Provider { t.Helper() lis := bufconn.Listen(bufSize) srv := grpc.NewServer() @@ -193,16 +193,16 @@ func testSetup(t *testing.T, svc *fakeContainerService) mcpclient.Provider { } t.Cleanup(func() { _ = conn.Close() }) - client := mcpclient.NewClientFromConn(conn) + client := bridge.NewClientFromConn(conn) return &staticProvider{client: client} } // staticProvider always returns the same client, ignoring botID. type staticProvider struct { - client *mcpclient.Client + client *bridge.Client } -func (p *staticProvider) MCPClient(_ context.Context, _ string) (*mcpclient.Client, error) { +func (p *staticProvider) MCPClient(_ context.Context, _ string) (*bridge.Client, error) { return p.client, nil } @@ -210,7 +210,7 @@ func session() mcpgw.ToolSessionContext { return mcpgw.ToolSessionContext{BotID: "bot-test"} } -func executor(provider mcpclient.Provider) *Executor { +func executor(provider bridge.Provider) *Executor { return NewExecutor(nil, provider, defaultExecWorkDir) } diff --git a/internal/memory/storefs/service.go b/internal/memory/storefs/service.go index d7f0536b..6b5c0107 100644 --- a/internal/memory/storefs/service.go +++ b/internal/memory/storefs/service.go @@ -16,7 +16,7 @@ import ( "gopkg.in/yaml.v3" "github.com/memohai/memoh/internal/config" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) const ( @@ -34,7 +34,7 @@ type scanEntry struct { } type Service struct { - provider mcpclient.Provider + provider bridge.Provider logger *slog.Logger } @@ -59,14 +59,14 @@ type memoryEntryMeta struct { Metadata map[string]any `yaml:"metadata,omitempty"` } -func New(log *slog.Logger, provider mcpclient.Provider) *Service { +func New(log *slog.Logger, provider bridge.Provider) *Service { if log == 
nil { log = slog.Default() } return &Service{provider: provider, logger: log.With(slog.String("component", "storefs"))} } -func (s *Service) client(ctx context.Context, botID string) (*mcpclient.Client, error) { +func (s *Service) client(ctx context.Context, botID string) (*bridge.Client, error) { if s.provider == nil { return nil, ErrNotConfigured } @@ -665,7 +665,7 @@ func formatMemoryOverviewMD(items []MemoryItem) string { // --- utility helpers --- func isNotFound(err error) bool { - return errors.Is(err, mcpclient.ErrNotFound) + return errors.Is(err, bridge.ErrNotFound) } func toItemMap(items []MemoryItem) map[string]MemoryItem { diff --git a/internal/storage/providers/containerfs/provider.go b/internal/storage/providers/containerfs/provider.go index e6eb120c..b23a70b5 100644 --- a/internal/storage/providers/containerfs/provider.go +++ b/internal/storage/providers/containerfs/provider.go @@ -11,18 +11,18 @@ import ( "path/filepath" "strings" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) const containerMediaRoot = "media" // Provider stores media assets inside bot containers via gRPC. type Provider struct { - clients mcpclient.Provider + clients bridge.Provider } // New creates a container-based storage provider. -func New(clients mcpclient.Provider) *Provider { +func New(clients bridge.Provider) *Provider { return &Provider{clients: clients} } diff --git a/internal/mcp/mcpclient/client.go b/internal/workspace/bridge/client.go similarity index 90% rename from internal/mcp/mcpclient/client.go rename to internal/workspace/bridge/client.go index 9cb27ab9..57462762 100644 --- a/internal/mcp/mcpclient/client.go +++ b/internal/workspace/bridge/client.go @@ -1,8 +1,8 @@ -// Package mcpclient provides a gRPC client for the MCP container service. -// Each bot container runs a gRPC server on port 9090 exposing file and exec -// operations. 
This client wraps the generated gRPC stubs with connection -// pooling and a simplified API for callers. -package mcpclient +// Package bridge provides a gRPC client for the workspace container bridge service. +// Each bot container runs a gRPC server listening on a Unix domain socket. +// This client wraps the generated gRPC stubs with connection pooling and a +// simplified API for callers. +package bridge import ( "bytes" @@ -17,8 +17,7 @@ import ( "google.golang.org/grpc/connectivity" "google.golang.org/grpc/credentials/insecure" - "github.com/memohai/memoh/internal/config" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const connectingTimeout = 30 * time.Second @@ -41,9 +40,9 @@ func NewClientFromConn(conn *grpc.ClientConn) *Client { } } -// Dial creates a new Client connected to the given container IP. -func Dial(_ context.Context, ip string) (*Client, error) { - target := fmt.Sprintf("%s:%d", ip, config.MCPGRPCPort) +// Dial creates a new Client connected to the given gRPC target. +// For UDS use "unix:///path/to/sock", for TCP use "host:port". +func Dial(_ context.Context, target string) (*Client, error) { conn, err := grpc.NewClient(target, grpc.WithTransportCredentials(insecure.NewCredentials()), ) @@ -347,16 +346,17 @@ type Provider interface { // Pool manages cached gRPC clients keyed by bot ID. type Pool struct { - mu sync.RWMutex - clients map[string]*Client - ipFunc func(botID string) string + mu sync.RWMutex + clients map[string]*Client + dialTargetFunc func(botID string) string } -// NewPool creates a client pool. ipFunc maps bot ID to container IP. -func NewPool(ipFunc func(string) string) *Pool { +// NewPool creates a client pool. dialTargetFunc maps bot ID to a gRPC target +// string (e.g. "unix:///path/sock" or "host:port"). 
+func NewPool(dialTargetFunc func(string) string) *Pool { return &Pool{ - clients: make(map[string]*Client), - ipFunc: ipFunc, + clients: make(map[string]*Client), + dialTargetFunc: dialTargetFunc, } } @@ -383,12 +383,12 @@ func (p *Pool) Get(ctx context.Context, botID string) (*Client, error) { p.mu.RUnlock() } - ip := p.ipFunc(botID) - if ip == "" { - return nil, fmt.Errorf("no IP for bot %s", botID) + target := p.dialTargetFunc(botID) + if target == "" { + return nil, fmt.Errorf("no dial target for bot %s", botID) } - c, err := Dial(ctx, ip) + c, err := Dial(ctx, target) if err != nil { return nil, err } diff --git a/internal/mcp/mcpclient/client_test.go b/internal/workspace/bridge/client_test.go similarity index 97% rename from internal/mcp/mcpclient/client_test.go rename to internal/workspace/bridge/client_test.go index df75c1e5..62106269 100644 --- a/internal/mcp/mcpclient/client_test.go +++ b/internal/workspace/bridge/client_test.go @@ -1,4 +1,4 @@ -package mcpclient +package bridge import ( "context" @@ -13,7 +13,7 @@ import ( "google.golang.org/grpc/status" "google.golang.org/grpc/test/bufconn" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const testBufSize = 1 << 20 diff --git a/internal/mcp/mcpclient/errors.go b/internal/workspace/bridge/errors.go similarity index 98% rename from internal/mcp/mcpclient/errors.go rename to internal/workspace/bridge/errors.go index f7ba2d42..93918083 100644 --- a/internal/mcp/mcpclient/errors.go +++ b/internal/workspace/bridge/errors.go @@ -1,4 +1,4 @@ -package mcpclient +package bridge import ( "errors" diff --git a/internal/mcp/mcpcontainer/mcpcontainer.pb.go b/internal/workspace/bridgepb/bridge.pb.go similarity index 99% rename from internal/mcp/mcpcontainer/mcpcontainer.pb.go rename to internal/workspace/bridgepb/bridge.pb.go index b88c865f..7e2910c9 100644 --- a/internal/mcp/mcpcontainer/mcpcontainer.pb.go +++ 
b/internal/workspace/bridgepb/bridge.pb.go @@ -2,9 +2,9 @@ // versions: // protoc-gen-go v1.36.11 // protoc v7.34.0 -// source: internal/mcp/mcpcontainer/mcpcontainer.proto +// source: internal/workspace/bridgepb/bridge.proto -package mcpcontainer +package bridgepb import ( reflect "reflect" diff --git a/internal/mcp/mcpcontainer/mcpcontainer.proto b/internal/workspace/bridgepb/bridge.proto similarity index 95% rename from internal/mcp/mcpcontainer/mcpcontainer.proto rename to internal/workspace/bridgepb/bridge.proto index 0b7be27f..220d1596 100644 --- a/internal/mcp/mcpcontainer/mcpcontainer.proto +++ b/internal/workspace/bridgepb/bridge.proto @@ -1,8 +1,8 @@ syntax = "proto3"; -package mcpcontainer; +package bridgepb; -option go_package = "github.com/memohai/memoh/internal/mcp/mcpcontainer"; +option go_package = "github.com/memohai/memoh/internal/workspace/bridgepb"; service ContainerService { rpc ReadFile(ReadFileRequest) returns (ReadFileResponse); diff --git a/internal/mcp/mcpcontainer/mcpcontainer_grpc.pb.go b/internal/workspace/bridgepb/bridge_grpc.pb.go similarity index 99% rename from internal/mcp/mcpcontainer/mcpcontainer_grpc.pb.go rename to internal/workspace/bridgepb/bridge_grpc.pb.go index 01a7744e..90f39ab2 100644 --- a/internal/mcp/mcpcontainer/mcpcontainer_grpc.pb.go +++ b/internal/workspace/bridgepb/bridge_grpc.pb.go @@ -2,9 +2,9 @@ // versions: // - protoc-gen-go-grpc v1.6.1 // - protoc v7.34.0 -// source: internal/mcp/mcpcontainer/mcpcontainer.proto +// source: internal/workspace/bridgepb/bridge.proto -package mcpcontainer +package bridgepb import ( context "context" @@ -451,5 +451,5 @@ var ContainerService_ServiceDesc = grpc.ServiceDesc{ ClientStreams: true, }, }, - Metadata: "internal/mcp/mcpcontainer/mcpcontainer.proto", + Metadata: "internal/workspace/bridgepb/bridge.proto", } diff --git a/internal/mcp/dataio.go b/internal/workspace/dataio.go similarity index 92% rename from internal/mcp/dataio.go rename to internal/workspace/dataio.go index 
2546dfbd..27f61c9c 100644 --- a/internal/mcp/dataio.go +++ b/internal/workspace/dataio.go @@ -1,4 +1,4 @@ -package mcp +package workspace import ( "archive/tar" @@ -30,7 +30,7 @@ const ( // The container is stopped during export and restarted afterwards. // Caller must consume the returned reader before the context is cancelled. func (m *Manager) ExportData(ctx context.Context, botID string) (io.ReadCloser, error) { - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -75,7 +75,7 @@ func (m *Manager) ExportData(ctx context.Context, botID string) (io.ReadCloser, // ImportData extracts a tar.gz archive into the container's /data directory. // The container is stopped during import and restarted afterwards. func (m *Manager) ImportData(ctx context.Context, botID string, r io.Reader) error { - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -112,7 +112,7 @@ func (m *Manager) ImportData(ctx context.Context, botID string, r io.Reader) err // mounted snapshot is consistent; the Apple fallback uses gRPC and does not // require a stop. 
func (m *Manager) PreserveData(ctx context.Context, botID string) error { - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) info, err := m.service.GetContainer(ctx, containerID) if err != nil { @@ -150,7 +150,10 @@ func (m *Manager) PreserveData(ctx context.Context, botID string) error { _ = os.Remove(backupPath) return fmt.Errorf("export data: %w", writeErr) } - return closeErr + if closeErr != nil { + return closeErr + } + return nil } // RestorePreservedData imports preserved data (backup tar.gz or legacy @@ -172,15 +175,13 @@ func (m *Manager) RestorePreservedData(ctx context.Context, botID string) error // Legacy bind-mount directory legacyDir := m.legacyDataDir(botID) - migratedDir := legacyDir + migratedSuffix - if _, err := os.Stat(migratedDir); err == nil { + if _, err := os.Stat(legacyDir + migratedSuffix); err == nil { return nil // already imported previously } info, err := os.Stat(legacyDir) if err != nil || !info.IsDir() { return errors.New("no preserved data found") } - return m.importLegacyDir(ctx, botID, legacyDir) } @@ -201,7 +202,7 @@ func (m *Manager) HasPreservedData(botID string) bool { // importLegacyDir copies a legacy bind-mount directory into the container // via snapshot mount, then renames the source to .migrated. 
func (m *Manager) importLegacyDir(ctx context.Context, botID, srcDir string) error { - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) info, err := m.service.GetContainer(ctx, containerID) if err != nil { @@ -233,7 +234,7 @@ func (m *Manager) importLegacyDir(ctx context.Context, botID, srcDir string) err } if err := os.Rename(srcDir, srcDir+migratedSuffix); err != nil { - m.logger.Warn("legacy import: rename failed", + m.logger.Warn("legacy import: rename to .migrated failed", slog.String("src", srcDir), slog.Any("error", err)) } return nil @@ -249,7 +250,7 @@ func (m *Manager) recoverOrphanedSnapshot(ctx context.Context, botID string) boo return false } - snapshotKey := m.containerID(botID) + snapshotKey := m.resolveContainerID(ctx, botID) raw, err := m.service.SnapshotMounts(ctx, snapshotter, snapshotKey) if err != nil { return false @@ -269,7 +270,7 @@ func (m *Manager) recoverOrphanedSnapshot(ctx context.Context, botID string) boo f, err := os.Create(backupPath) //nolint:gosec // G304: operator-controlled path if err != nil { - m.logger.Warn("recover orphaned snapshot: create backup failed", + m.logger.Warn("recover orphaned snapshot: create backup file failed", slog.String("bot_id", botID), slog.Any("error", err)) return false } @@ -293,9 +294,6 @@ func (m *Manager) recoverOrphanedSnapshot(ctx context.Context, botID string) boo _ = os.Remove(backupPath) return false } - - m.logger.Info("recovered data from orphaned snapshot", - slog.String("bot_id", botID), slog.String("backup", backupPath)) return true } @@ -310,7 +308,7 @@ func (m *Manager) restorePreservedIntoSnapshot(ctx context.Context, botID string } defer func() { _ = f.Close() }() - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) info, err := m.service.GetContainer(ctx, containerID) if err != nil { return fmt.Errorf("get container: %w", err) @@ -332,8 +330,6 @@ func (m *Manager) restorePreservedIntoSnapshot(ctx context.Context, 
botID string } _ = os.Remove(bp) - m.logger.Info("restored preserved data into new container", - slog.String("bot_id", botID)) return nil } @@ -372,17 +368,17 @@ func (m *Manager) restartContainer(ctx context.Context, botID, containerID strin slog.String("container_id", containerID), slog.Any("error", err)) return } - netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ + // CNI network setup — outbound connectivity is required for package + // downloads and other network-dependent operations in the container. + if _, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ ContainerID: containerID, CNIBinDir: m.cfg.CNIBinaryDir, CNIConfDir: m.cfg.CNIConfigDir, - }) - if err != nil { - m.logger.Warn("network setup after restart failed", + }); err != nil { + m.logger.Error("network setup after restart failed", slog.String("container_id", containerID), slog.Any("error", err)) return } - m.SetContainerIP(botID, netResult.IP) } func mountedDataDir(root string) string { @@ -560,11 +556,34 @@ func tarGzDir(w io.Writer, dir string) error { if err != nil || rel == "." { return err } - info, err := d.Info() + + if d.IsDir() { + info, err := d.Info() + if err != nil { + return err + } + header, err := tar.FileInfoHeader(info, "") + if err != nil { + return err + } + header.Name = filepath.ToSlash(rel) + return tw.WriteHeader(header) + } + + // For regular files: open first, then Fstat on the same fd so that + // the size in the tar header is guaranteed to match the content we + // read. This avoids race conditions and overlayfs size mismatches + // that cause "archive/tar: write too long". 
+ f, err := os.Open(path) //nolint:gosec // G304: iterating operator-controlled data directory if err != nil { return err } + defer func() { _ = f.Close() }() + info, err := f.Stat() + if err != nil { + return err + } header, err := tar.FileInfoHeader(info, "") if err != nil { return err @@ -574,17 +593,7 @@ func tarGzDir(w io.Writer, dir string) error { if err := tw.WriteHeader(header); err != nil { return err } - - if d.IsDir() { - return nil - } - - f, err := os.Open(path) //nolint:gosec // G304: iterating operator-controlled data directory - if err != nil { - return err - } - defer func() { _ = f.Close() }() - _, err = io.Copy(tw, f) + _, err = io.Copy(tw, io.LimitReader(f, info.Size())) return err }) } diff --git a/internal/workspace/identity.go b/internal/workspace/identity.go new file mode 100644 index 00000000..52746b84 --- /dev/null +++ b/internal/workspace/identity.go @@ -0,0 +1,34 @@ +package workspace + +import ( + "strings" + + ctr "github.com/memohai/memoh/internal/containerd" +) + +var knownContainerPrefixes = []string{ContainerPrefix, LegacyContainerPrefix} + +// BotIDFromContainerID infers a bot ID from a known container naming scheme. +// This is only used as a fallback for legacy containers when labels are missing. +func BotIDFromContainerID(containerID string) (string, bool) { + for _, prefix := range knownContainerPrefixes { + if !strings.HasPrefix(containerID, prefix) { + continue + } + botID := strings.TrimPrefix(containerID, prefix) + if botID == "" { + return "", false + } + return botID, true + } + return "", false +} + +// BotIDFromContainerInfo resolves the bot ID from container metadata. +// It prefers the current label and only falls back to name inference. 
+func BotIDFromContainerInfo(info ctr.ContainerInfo) (string, bool) { + if botID := strings.TrimSpace(info.Labels[BotLabelKey]); botID != "" { + return botID, true + } + return BotIDFromContainerID(info.ID) +} diff --git a/internal/workspace/identity_test.go b/internal/workspace/identity_test.go new file mode 100644 index 00000000..65328271 --- /dev/null +++ b/internal/workspace/identity_test.go @@ -0,0 +1,52 @@ +package workspace + +import ( + "testing" + "time" + + ctr "github.com/memohai/memoh/internal/containerd" +) + +func TestBotIDFromContainerInfoPrefersCurrentLabel(t *testing.T) { + t.Parallel() + + info := ctr.ContainerInfo{ + ID: "workspace-ignored", + Labels: map[string]string{ + BotLabelKey: "bot-from-label", + }, + UpdatedAt: time.Now(), + } + + botID, ok := BotIDFromContainerInfo(info) + if !ok { + t.Fatal("expected bot ID to resolve") + } + if botID != "bot-from-label" { + t.Fatalf("expected labeled bot ID, got %q", botID) + } +} + +func TestBotIDFromContainerInfoFallsBackToKnownPrefixes(t *testing.T) { + t.Parallel() + + for _, tc := range []struct { + name string + containerID string + want string + }{ + {name: "workspace", containerID: "workspace-bot-123", want: "bot-123"}, + {name: "legacy", containerID: "mcp-bot-456", want: "bot-456"}, + } { + t.Run(tc.name, func(t *testing.T) { + info := ctr.ContainerInfo{ID: tc.containerID} + got, ok := BotIDFromContainerInfo(info) + if !ok { + t.Fatal("expected bot ID to resolve") + } + if got != tc.want { + t.Fatalf("expected %q, got %q", tc.want, got) + } + }) + } +} diff --git a/internal/workspace/image_preference.go b/internal/workspace/image_preference.go new file mode 100644 index 00000000..a298e202 --- /dev/null +++ b/internal/workspace/image_preference.go @@ -0,0 +1,176 @@ +package workspace + +import ( + "context" + "encoding/json" + "errors" + "strings" + + "github.com/jackc/pgx/v5" + + "github.com/memohai/memoh/internal/config" + "github.com/memohai/memoh/internal/db" + dbsqlc 
"github.com/memohai/memoh/internal/db/sqlc" +) + +const ( + workspaceMetadataKey = "workspace" + workspaceImageMetadataKey = "image" +) + +func decodeBotMetadata(payload []byte) (map[string]any, error) { + if len(payload) == 0 { + return map[string]any{}, nil + } + var data map[string]any + if err := json.Unmarshal(payload, &data); err != nil { + return nil, err + } + if data == nil { + data = map[string]any{} + } + return data, nil +} + +func cloneAnyMap(src map[string]any) map[string]any { + if src == nil { + return map[string]any{} + } + cloned := make(map[string]any, len(src)) + for key, value := range src { + cloned[key] = value + } + return cloned +} + +func workspaceSection(metadata map[string]any) map[string]any { + raw, ok := metadata[workspaceMetadataKey] + if !ok { + return map[string]any{} + } + section, ok := raw.(map[string]any) + if !ok { + return map[string]any{} + } + return cloneAnyMap(section) +} + +func workspaceImageFromMetadata(metadata map[string]any) string { + section := workspaceSection(metadata) + image, _ := section[workspaceImageMetadataKey].(string) + return strings.TrimSpace(image) +} + +func withWorkspaceImagePreference(metadata map[string]any, image string) map[string]any { + next := cloneAnyMap(metadata) + section := workspaceSection(next) + section[workspaceImageMetadataKey] = strings.TrimSpace(image) + next[workspaceMetadataKey] = section + return next +} + +func withoutWorkspaceImagePreference(metadata map[string]any) map[string]any { + next := cloneAnyMap(metadata) + section := workspaceSection(next) + delete(section, workspaceImageMetadataKey) + if len(section) == 0 { + delete(next, workspaceMetadataKey) + return next + } + next[workspaceMetadataKey] = section + return next +} + +func (m *Manager) botWorkspaceImagePreference(ctx context.Context, botID string) (string, error) { + if m.queries == nil { + return "", nil + } + botUUID, err := db.ParseUUID(botID) + if err != nil { + return "", err + } + row, err := 
m.queries.GetBotByID(ctx, botUUID) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return "", nil + } + return "", err + } + metadata, err := decodeBotMetadata(row.Metadata) + if err != nil { + return "", err + } + return workspaceImageFromMetadata(metadata), nil +} + +func (m *Manager) updateBotWorkspaceImagePreference(ctx context.Context, botID, image string, clearPreference bool) error { + if m.queries == nil { + return nil + } + botUUID, err := db.ParseUUID(botID) + if err != nil { + return err + } + row, err := m.queries.GetBotByID(ctx, botUUID) + if err != nil { + return err + } + metadata, err := decodeBotMetadata(row.Metadata) + if err != nil { + return err + } + if clearPreference { + metadata = withoutWorkspaceImagePreference(metadata) + } else { + metadata = withWorkspaceImagePreference(metadata, image) + } + payload, err := json.Marshal(metadata) + if err != nil { + return err + } + _, err = m.queries.UpdateBotProfile(ctx, dbsqlc.UpdateBotProfileParams{ + ID: botUUID, + DisplayName: row.DisplayName, + AvatarUrl: row.AvatarUrl, + IsActive: row.IsActive, + Metadata: payload, + }) + return err +} + +func (m *Manager) RememberWorkspaceImage(ctx context.Context, botID, image string) error { + return m.updateBotWorkspaceImagePreference(ctx, botID, config.NormalizeImageRef(image), false) +} + +func (m *Manager) ClearWorkspaceImagePreference(ctx context.Context, botID string) error { + return m.updateBotWorkspaceImagePreference(ctx, botID, "", true) +} + +func (m *Manager) ResolveWorkspaceImage(ctx context.Context, botID string) (string, error) { + return m.resolveWorkspaceImage(ctx, botID) +} + +func (m *Manager) resolveWorkspaceImage(ctx context.Context, botID string) (string, error) { + if m.queries != nil { + pgBotID, err := db.ParseUUID(botID) + if err == nil { + row, dbErr := m.queries.GetContainerByBotID(ctx, pgBotID) + if dbErr == nil && strings.TrimSpace(row.Image) != "" { + return config.NormalizeImageRef(strings.TrimSpace(row.Image)), nil + 
} + if dbErr != nil && !errors.Is(dbErr, pgx.ErrNoRows) { + return "", dbErr + } + } + } + + preferredImage, err := m.botWorkspaceImagePreference(ctx, botID) + if err != nil { + return "", err + } + if preferredImage != "" { + return config.NormalizeImageRef(preferredImage), nil + } + + return m.imageRef(), nil +} diff --git a/internal/workspace/image_preference_test.go b/internal/workspace/image_preference_test.go new file mode 100644 index 00000000..dcbe68fc --- /dev/null +++ b/internal/workspace/image_preference_test.go @@ -0,0 +1,53 @@ +package workspace + +import "testing" + +func TestWorkspaceImageMetadataRoundTrip(t *testing.T) { + t.Parallel() + + metadata := map[string]any{ + "name": "test", + workspaceMetadataKey: map[string]any{ + "keep": "value", + }, + } + + updated := withWorkspaceImagePreference(metadata, "alpine:3.20") + + if got := workspaceImageFromMetadata(updated); got != "alpine:3.20" { + t.Fatalf("expected image preference to round-trip, got %q", got) + } + workspace, ok := updated[workspaceMetadataKey].(map[string]any) + if !ok { + t.Fatal("expected workspace metadata section") + } + if workspace["keep"] != "value" { + t.Fatalf("expected existing workspace metadata to be preserved, got %#v", workspace) + } + if _, exists := metadata[workspaceMetadataKey].(map[string]any)[workspaceImageMetadataKey]; exists { + t.Fatal("expected original metadata map to remain unchanged") + } +} + +func TestWithoutWorkspaceImagePreferenceRemovesOnlyImageKey(t *testing.T) { + t.Parallel() + + metadata := map[string]any{ + workspaceMetadataKey: map[string]any{ + workspaceImageMetadataKey: "debian:bookworm-slim", + "keep": true, + }, + } + + updated := withoutWorkspaceImagePreference(metadata) + if got := workspaceImageFromMetadata(updated); got != "" { + t.Fatalf("expected image preference to be cleared, got %q", got) + } + workspace, ok := updated[workspaceMetadataKey].(map[string]any) + if !ok { + t.Fatal("expected workspace metadata section to remain") + } + 
if workspace["keep"] != true { + t.Fatalf("expected unrelated workspace metadata to remain, got %#v", workspace) + } +} diff --git a/internal/workspace/manager.go b/internal/workspace/manager.go new file mode 100644 index 00000000..7315e75e --- /dev/null +++ b/internal/workspace/manager.go @@ -0,0 +1,435 @@ +package workspace + +import ( + "context" + "errors" + "fmt" + "log/slog" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/containerd/errdefs" + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/memohai/memoh/internal/config" + ctr "github.com/memohai/memoh/internal/containerd" + dbsqlc "github.com/memohai/memoh/internal/db/sqlc" + "github.com/memohai/memoh/internal/identity" + "github.com/memohai/memoh/internal/workspace/bridge" +) + +const ( + BotLabelKey = "memoh.bot_id" + WorkspaceLabelKey = "memoh.workspace" + WorkspaceLabelValue = "v3" + ContainerPrefix = "workspace-" + LegacyContainerPrefix = "mcp-" + + legacyGRPCPort = 9090 +) + +// ErrContainerNotFound is returned when no container exists for a bot. +var ErrContainerNotFound = errors.New("container not found for bot") + +// ContainerStatus combines DB records with live containerd state. 
+type ContainerStatus struct { + ContainerID string `json:"container_id"` + Image string `json:"image"` + Status string `json:"status"` + Namespace string `json:"namespace"` + ContainerPath string `json:"container_path"` + TaskRunning bool `json:"task_running"` + HasPreservedData bool `json:"has_preserved_data"` + Legacy bool `json:"legacy"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type Manager struct { + service ctr.Service + cfg config.WorkspaceConfig + namespace string + db *pgxpool.Pool + queries *dbsqlc.Queries + logger *slog.Logger + containerLockMu sync.Mutex + containerLocks map[string]*sync.Mutex + grpcPool *bridge.Pool + legacyMu sync.RWMutex + legacyIPs map[string]string // botID → IP for pre-bridge containers +} + +func NewManager(log *slog.Logger, service ctr.Service, cfg config.WorkspaceConfig, namespace string, conn *pgxpool.Pool) *Manager { + if namespace == "" { + namespace = config.DefaultNamespace + } + m := &Manager{ + service: service, + cfg: cfg, + namespace: namespace, + db: conn, + queries: dbsqlc.New(conn), + logger: log.With(slog.String("component", "workspace")), + containerLocks: make(map[string]*sync.Mutex), + legacyIPs: make(map[string]string), + } + m.grpcPool = bridge.NewPool(m.dialTarget) + return m +} + +// resolveContainerID resolves the actual containerd container ID for a bot. +// This is the SINGLE point of container ID resolution for all lookup operations. +// It delegates to ContainerID (DB → label → scan) and falls back to the +// new-style prefix if no container exists yet. 
+func (m *Manager) resolveContainerID(ctx context.Context, botID string) string { + id, err := m.ContainerID(ctx, botID) + if err != nil { + return ContainerPrefix + botID + } + return id +} + +func (m *Manager) lockContainer(containerID string) func() { + m.containerLockMu.Lock() + lock, ok := m.containerLocks[containerID] + if !ok { + lock = &sync.Mutex{} + m.containerLocks[containerID] = lock + } + m.containerLockMu.Unlock() + + lock.Lock() + return lock.Unlock +} + +// socketDir returns the host-side directory that is bind-mounted into the +// container at /run/memoh, holding the UDS socket file. +func (m *Manager) socketDir(botID string) string { + return filepath.Join(m.dataRoot(), "run", botID) +} + +// socketPath returns the path to the UDS socket file for a bot's container. +func (m *Manager) socketPath(botID string) string { + return filepath.Join(m.socketDir(botID), "bridge.sock") +} + +// dialTarget returns the gRPC dial target for a bot. Legacy containers +// (pre-bridge) are reached via TCP; bridge containers use UDS. +func (m *Manager) dialTarget(botID string) string { + m.legacyMu.RLock() + ip, legacy := m.legacyIPs[botID] + m.legacyMu.RUnlock() + if legacy { + return fmt.Sprintf("%s:%d", ip, legacyGRPCPort) + } + return "unix://" + m.socketPath(botID) +} + +// SetLegacyIP records the IP address of a legacy (pre-bridge) container +// so the gRPC pool can reach it via TCP. +func (m *Manager) SetLegacyIP(botID, ip string) { + m.legacyMu.Lock() + m.legacyIPs[botID] = ip + m.legacyMu.Unlock() +} + +// ClearLegacyIP removes a cached legacy IP (e.g. when the container is deleted). +func (m *Manager) ClearLegacyIP(botID string) { + m.legacyMu.Lock() + delete(m.legacyIPs, botID) + m.legacyMu.Unlock() +} + +// clearLegacyRoute evicts any stale TCP fallback state for a bot so future +// gRPC dials use the bridge container's Unix socket. 
func (m *Manager) clearLegacyRoute(botID string) {
	m.ClearLegacyIP(botID)
	// Drop any pooled client that was dialed against the old TCP target.
	m.grpcPool.Remove(botID)
}

// MCPClient returns a gRPC client for the given bot's container.
// Implements bridge.Provider.
func (m *Manager) MCPClient(ctx context.Context, botID string) (*bridge.Client, error) {
	return m.grpcPool.Get(ctx, botID)
}

// Init pre-warms the workspace subsystem by ensuring the default base image
// is available locally. Returns the pull error if the download fails.
func (m *Manager) Init(ctx context.Context) error {
	image := m.imageRef()

	// Pre-pull the default base image so container creation doesn't block
	// on a network download. If the image is already present, this is a no-op.
	if _, err := m.service.GetImage(ctx, image); err != nil {
		m.logger.Info("pulling base image for workspace containers", slog.String("image", image))
		if _, pullErr := m.service.PullImage(ctx, image, &ctr.PullImageOptions{
			Unpack:      true,
			Snapshotter: m.cfg.Snapshotter,
		}); pullErr != nil {
			m.logger.Warn("base image pull failed", slog.String("image", image), slog.Any("error", pullErr))
			return pullErr
		}
	}
	return nil
}

// EnsureBot creates the workspace container for a bot if it does not exist.
// Bot data lives in the container's writable layer (snapshot), not bind mounts.
// The Memoh runtime (bridge binary + toolkit) is injected via read-only bind mount.
// If imageOverride is non-empty, it is used instead of the configured default.
+func (m *Manager) EnsureBot(ctx context.Context, botID, imageOverride string) error { + image := m.imageRef() + if imageOverride != "" { + image = config.NormalizeImageRef(imageOverride) + } + return m.ensureBotWithImage(ctx, botID, image) +} + +func (m *Manager) ensureBotWithImage(ctx context.Context, botID, image string) error { + if err := validateBotID(botID); err != nil { + return err + } + + resolvPath, err := ctr.ResolveConfSource(m.dataRoot()) + if err != nil { + return err + } + + runtimeDir := m.cfg.RuntimePath() + sockDir := m.socketDir(botID) + if err := os.MkdirAll(sockDir, 0o750); err != nil { + return fmt.Errorf("create socket dir: %w", err) + } + + mounts := []ctr.MountSpec{ + { + Destination: "/etc/resolv.conf", + Type: "bind", + Source: resolvPath, + Options: []string{"rbind", "ro"}, + }, + { + Destination: "/opt/memoh", + Type: "bind", + Source: runtimeDir, + Options: []string{"rbind", "ro"}, + }, + { + Destination: "/run/memoh", + Type: "bind", + Source: sockDir, + Options: []string{"rbind", "rw"}, + }, + } + tzMounts, tzEnv := ctr.TimezoneSpec() + mounts = append(mounts, tzMounts...) + + env := make([]string, 0, len(tzEnv)+1) + env = append(env, tzEnv...) + env = append(env, "BRIDGE_SOCKET_PATH=/run/memoh/bridge.sock") + + _, err = m.service.CreateContainer(ctx, ctr.CreateContainerRequest{ + ID: ContainerPrefix + botID, + ImageRef: image, + Snapshotter: m.cfg.Snapshotter, + Labels: map[string]string{ + BotLabelKey: botID, + WorkspaceLabelKey: WorkspaceLabelValue, + }, + Spec: ctr.ContainerSpec{ + Cmd: []string{"/opt/memoh/bridge"}, + Mounts: mounts, + Env: env, + }, + }) + if err == nil { + return nil + } + + if !errdefs.IsAlreadyExists(err) { + return err + } + + return nil +} + +// ListBots returns the bot IDs that have workspace containers. 
func (m *Manager) ListBots(ctx context.Context) ([]string, error) {
	containers, err := m.service.ListContainers(ctx)
	if err != nil {
		return nil, err
	}

	botIDs := make([]string, 0, len(containers))
	for _, info := range containers {
		if botID, ok := BotIDFromContainerInfo(info); ok {
			botIDs = append(botIDs, botID)
		}
	}
	return botIDs, nil
}

// Start creates and starts the workspace container for a bot using the image
// resolved for it (remembered image or configured default).
func (m *Manager) Start(ctx context.Context, botID string) error {
	image, err := m.resolveWorkspaceImage(ctx, botID)
	if err != nil {
		return err
	}
	return m.startWithResolvedImage(ctx, botID, image)
}

// StartWithImage creates and starts the MCP container for a bot.
// If imageOverride is non-empty, it is used as the base image instead of the
// configured default. The override only applies when creating a new container.
func (m *Manager) StartWithImage(ctx context.Context, botID, imageOverride string) error {
	image := strings.TrimSpace(imageOverride)
	if image == "" {
		return m.Start(ctx, botID)
	}
	return m.startWithResolvedImage(ctx, botID, config.NormalizeImageRef(image))
}

// StartWithResolvedImage creates and starts the workspace container for a bot
// using an explicit image reference.
func (m *Manager) StartWithResolvedImage(ctx context.Context, botID, image string) error {
	image = strings.TrimSpace(image)
	if image == "" {
		return errors.New("image is required")
	}
	return m.startWithResolvedImage(ctx, botID, image)
}

// startWithResolvedImage is the shared start path: recover orphaned data,
// ensure the container exists, restore preserved data, start the task, and
// set up CNI networking (rolling back the task on network failure).
func (m *Manager) startWithResolvedImage(ctx context.Context, botID, image string) error {
	containerID := m.resolveContainerID(ctx, botID)

	// Before creating a new container, check for an orphaned snapshot
	// (container deleted but snapshot with /data survived). Export /data
	// to a backup so it can be restored after EnsureBot creates a fresh
	// container. This covers dev image rebuilds, containerd metadata loss,
	// and manual container deletion.
	if _, err := m.service.GetContainer(ctx, containerID); errdefs.IsNotFound(err) {
		m.recoverOrphanedSnapshot(ctx, botID)
	}

	if err := m.ensureBotWithImage(ctx, botID, image); err != nil {
		return err
	}

	// Restore preserved data (from orphaned snapshot recovery or a previous
	// CleanupBotContainer with preserveData) into the fresh snapshot before
	// starting the task, avoiding a redundant stop/start cycle.
	if m.HasPreservedData(botID) {
		if err := m.restorePreservedIntoSnapshot(ctx, botID); err != nil {
			return fmt.Errorf("restore preserved data: %w", err)
		}
	}

	if err := m.service.StartContainer(ctx, containerID, nil); err != nil {
		return err
	}

	// CNI network setup (for outbound connectivity — container processes
	// may need to download packages). Server communicates via UDS, not IP.
	if _, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
		ContainerID: containerID,
		CNIBinDir:   m.cfg.CNIBinaryDir,
		CNIConfDir:  m.cfg.CNIConfigDir,
	}); err != nil {
		// Network setup failed: stop the freshly started task so we don't
		// leave a half-initialized container running.
		if stopErr := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{Force: true}); stopErr != nil {
			m.logger.Warn("cleanup: stop task failed", slog.String("container_id", containerID), slog.Any("error", stopErr))
		}
		return err
	}
	if !m.IsLegacyContainer(ctx, containerID) {
		// Bridge container started: drop any stale TCP fallback route.
		m.clearLegacyRoute(botID)
	}
	return nil
}

// Stop force-stops the bot's container task after the given timeout.
func (m *Manager) Stop(ctx context.Context, botID string, timeout time.Duration) error {
	if err := validateBotID(botID); err != nil {
		return err
	}
	return m.service.StopContainer(ctx, m.resolveContainerID(ctx, botID), &ctr.StopTaskOptions{
		Timeout: timeout,
		Force:   true,
	})
}

// Delete removes the bot's container, task, and network. When preserveData is
// true, /data is exported to a backup archive first; a preserve failure aborts
// the deletion (and restarts the task if we had stopped it) to prevent data loss.
func (m *Manager) Delete(ctx context.Context, botID string, preserveData bool) error {
	if err := validateBotID(botID); err != nil {
		return err
	}

	containerID := m.resolveContainerID(ctx, botID)

	stoppedForPreserve := false

	if preserveData {
		info, err := m.service.GetContainer(ctx, containerID)
		if err != nil {
			return fmt.Errorf("get container for preserve: %w", err)
		}

		if _, err := m.snapshotMounts(ctx, info); errors.Is(err, errMountNotSupported) {
			// Apple backend fallback uses gRPC against a running container.
		} else if err != nil {
			return err
		} else {
			// Snapshot-mount export requires the task to be stopped first.
			if err := m.safeStopTask(ctx, containerID); err != nil {
				return fmt.Errorf("stop for data preserve: %w", err)
			}
			stoppedForPreserve = true
		}

		if err := m.PreserveData(ctx, botID); err != nil {
			// Export failed — restart only if we stopped the task, and abort
			// deletion to prevent data loss.
			if stoppedForPreserve {
				m.restartContainer(ctx, botID, containerID)
			}
			return fmt.Errorf("preserve data: %w", err)
		}
	}

	m.clearLegacyRoute(botID)

	// Network and task teardown are best-effort; only the final container
	// deletion error is propagated.
	if err := m.service.RemoveNetwork(ctx, ctr.NetworkSetupRequest{
		ContainerID: containerID,
		CNIBinDir:   m.cfg.CNIBinaryDir,
		CNIConfDir:  m.cfg.CNIConfigDir,
	}); err != nil {
		m.logger.Warn("delete: remove network failed",
			slog.String("container_id", containerID), slog.Any("error", err))
	}
	if err := m.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil {
		m.logger.Warn("delete: delete task failed",
			slog.String("container_id", containerID), slog.Any("error", err))
	}
	return m.service.DeleteContainer(ctx, containerID, &ctr.DeleteContainerOptions{
		CleanupSnapshot: true,
	})
}

// dataRoot returns the configured data root, defaulting to config.DefaultDataRoot.
func (m *Manager) dataRoot() string {
	if m.cfg.DataRoot == "" {
		return config.DefaultDataRoot
	}
	return m.cfg.DataRoot
}

// imageRef returns the configured default workspace base image reference.
func (m *Manager) imageRef() string {
	return m.cfg.ImageRef()
}

// IsLegacyContainer returns true if the container was created before the
// bridge runtime injection architecture (uses the legacy "mcp-" prefix).
// Legacy containers are functional but unreachable from the server (they
// use TCP gRPC instead of UDS). Users should delete and recreate them.
func (*Manager) IsLegacyContainer(_ context.Context, containerID string) bool {
	return strings.HasPrefix(containerID, LegacyContainerPrefix)
}

// validateBotID checks that botID is a well-formed channel identity ID.
func validateBotID(botID string) error {
	return identity.ValidateChannelIdentityID(botID)
}
diff --git a/internal/workspace/manager_legacy_test.go b/internal/workspace/manager_legacy_test.go
new file mode 100644
index 00000000..ac921dc4
--- /dev/null
+++ b/internal/workspace/manager_legacy_test.go
@@ -0,0 +1,292 @@
package workspace

import (
	"context"
	"log/slog"
	"os"
	"path/filepath"
	"sync"
	"testing"
	"time"

	"github.com/containerd/errdefs"

	"github.com/memohai/memoh/internal/config"
	ctr "github.com/memohai/memoh/internal/containerd"
	"github.com/memohai/memoh/internal/workspace/bridge"
)

// legacyRouteTestService is a hand-rolled ctr.Service stub. It simulates a
// single container's create/start/delete lifecycle and records call counts
// so tests can assert which service operations the Manager performed.
type legacyRouteTestService struct {
	container ctr.ContainerInfo
	created   bool
	byLabel   []ctr.ContainerInfo

	createCalls int
	startCalls  int
	deleteCalls int
	removeNet   int
	deleteTask  int
	setupNet    int

	getContainerBeforeCreateErr error
	setupNetworkResults         []ctr.NetworkResult
	setupNetworkErrs            []error
}

func (*legacyRouteTestService) PullImage(context.Context, string, *ctr.PullImageOptions) (ctr.ImageInfo, error) {
	return ctr.ImageInfo{}, nil
}

func (*legacyRouteTestService) GetImage(context.Context, string) (ctr.ImageInfo, error) {
	return ctr.ImageInfo{}, nil
}

func (*legacyRouteTestService) ListImages(context.Context) ([]ctr.ImageInfo, error) {
	return nil, nil
}

func (*legacyRouteTestService) DeleteImage(context.Context, string, *ctr.DeleteImageOptions) error {
	return nil
}

func (*legacyRouteTestService) ResolveRemoteDigest(context.Context, string) (string, error) {
	return "", nil
}

func (s *legacyRouteTestService) CreateContainer(_ context.Context, req ctr.CreateContainerRequest) (ctr.ContainerInfo, error) {
	s.createCalls++
	s.created = true
	s.container = ctr.ContainerInfo{
		ID:          req.ID,
		Image:       req.ImageRef,
		Labels:      req.Labels,
		Snapshotter: req.Snapshotter,
		SnapshotKey: req.ID,
	}
	return s.container, nil
}

func (s *legacyRouteTestService) GetContainer(context.Context, string) (ctr.ContainerInfo, error) {
	if !s.created {
		// Allow tests to inject a custom pre-create error; default NotFound.
		if s.getContainerBeforeCreateErr != nil {
			return ctr.ContainerInfo{}, s.getContainerBeforeCreateErr
		}
		return ctr.ContainerInfo{}, errdefs.ErrNotFound
	}
	return s.container, nil
}

func (s *legacyRouteTestService) ListContainers(context.Context) ([]ctr.ContainerInfo, error) {
	if !s.created {
		return nil, nil
	}
	return []ctr.ContainerInfo{s.container}, nil
}

func (s *legacyRouteTestService) DeleteContainer(context.Context, string, *ctr.DeleteContainerOptions) error {
	s.deleteCalls++
	s.created = false
	return nil
}

func (s *legacyRouteTestService) ListContainersByLabel(context.Context, string, string) ([]ctr.ContainerInfo, error) {
	return s.byLabel, nil
}

func (s *legacyRouteTestService) StartContainer(context.Context, string, *ctr.StartTaskOptions) error {
	s.startCalls++
	return nil
}

func (*legacyRouteTestService) StopContainer(context.Context, string, *ctr.StopTaskOptions) error {
	return nil
}

func (s *legacyRouteTestService) DeleteTask(context.Context, string, *ctr.DeleteTaskOptions) error {
	s.deleteTask++
	return nil
}

func (*legacyRouteTestService) GetTaskInfo(context.Context, string) (ctr.TaskInfo, error) {
	return ctr.TaskInfo{}, errdefs.ErrNotFound
}

func (*legacyRouteTestService) ListTasks(context.Context, *ctr.ListTasksOptions) ([]ctr.TaskInfo, error) {
	return nil, nil
}

// SetupNetwork replays scripted per-attempt errors/results, falling back to a
// fixed IP once the script is exhausted.
func (s *legacyRouteTestService) SetupNetwork(context.Context, ctr.NetworkSetupRequest) (ctr.NetworkResult, error) {
	idx := s.setupNet
	s.setupNet++
	if idx < len(s.setupNetworkErrs) && s.setupNetworkErrs[idx] != nil {
		return ctr.NetworkResult{}, s.setupNetworkErrs[idx]
	}
	if idx < len(s.setupNetworkResults) {
		return s.setupNetworkResults[idx], nil
	}
	return ctr.NetworkResult{IP: "10.0.0.2"}, nil
}

func (s *legacyRouteTestService) RemoveNetwork(context.Context, ctr.NetworkSetupRequest) error {
	s.removeNet++
	return nil
}

func (*legacyRouteTestService) CommitSnapshot(context.Context, string, string, string) error {
	return nil
}

func (*legacyRouteTestService) ListSnapshots(context.Context, string) ([]ctr.SnapshotInfo, error) {
	return nil, nil
}

func (*legacyRouteTestService) PrepareSnapshot(context.Context, string, string, string) error {
	return nil
}

func (*legacyRouteTestService) CreateContainerFromSnapshot(context.Context, ctr.CreateContainerRequest) (ctr.ContainerInfo, error) {
	return ctr.ContainerInfo{}, nil
}

func (*legacyRouteTestService) SnapshotMounts(context.Context, string, string) ([]ctr.MountInfo, error) {
	return nil, ctr.ErrNotSupported
}

// newLegacyRouteTestManager builds a Manager wired to the stub service with a
// discarded logger and no DB connection, mirroring NewManager's field setup.
func newLegacyRouteTestManager(t *testing.T, svc ctr.Service, cfg config.WorkspaceConfig) *Manager {
	t.Helper()
	logger := slog.New(slog.DiscardHandler)
	m := &Manager{
		service:        svc,
		cfg:            cfg,
		namespace:      config.DefaultNamespace,
		containerLocks: make(map[string]*sync.Mutex),
		legacyIPs:      make(map[string]string),
		logger:         logger,
	}
	m.grpcPool = bridge.NewPool(m.dialTarget)
	return m
}

func TestStartWithImageClearsLegacyRouteForBridgeContainer(t *testing.T) {
	dataRoot := t.TempDir()
	runtimeDir := filepath.Join(dataRoot, "runtime")
	if err := os.MkdirAll(runtimeDir, 0o750); err != nil {
		t.Fatalf("mkdir runtime dir: %v", err)
	}

	svc := &legacyRouteTestService{}
	m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{
		DataRoot:     dataRoot,
		RuntimeDir:   runtimeDir,
		Snapshotter:  "overlayfs",
		CNIBinaryDir: "/opt/cni/bin",
		CNIConfigDir: "/etc/cni/net.d",
	})

	botID := "00000000-0000-0000-0000-000000000001"
	m.SetLegacyIP(botID, "10.0.0.9")

	// Before the start, the cached legacy IP must win.
	if got := m.dialTarget(botID); got != "10.0.0.9:9090" {
		t.Fatalf("expected legacy dial target before start, got %q", got)
	}

	if err := m.StartWithImage(context.Background(), botID, ""); err != nil {
		t.Fatalf("StartWithImage failed: %v", err)
	}

	// After a bridge container starts, dials must switch to the UDS target.
	if got := m.dialTarget(botID); got != "unix://"+filepath.Join(dataRoot, "run", botID, "bridge.sock") {
		t.Fatalf("expected unix dial target after bridge start, got %q", got)
	}
	if svc.createCalls != 1 || svc.startCalls != 1 {
		t.Fatalf("expected create/start once, got create=%d start=%d", svc.createCalls, svc.startCalls)
	}
}

func TestDeleteClearsLegacyRoute(t *testing.T) {
	svc := &legacyRouteTestService{created: true, container: ctr.ContainerInfo{ID: "workspace-00000000-0000-0000-0000-000000000001"}}
	m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{
		DataRoot:     t.TempDir(),
		Snapshotter:  "overlayfs",
		CNIBinaryDir: "/opt/cni/bin",
		CNIConfigDir: "/etc/cni/net.d",
	})

	botID := "00000000-0000-0000-0000-000000000001"
	m.SetLegacyIP(botID, "10.0.0.9")

	if err := m.Delete(context.Background(), botID, false); err != nil {
		t.Fatalf("Delete failed: %v", err)
	}

	if got := m.dialTarget(botID); got == "10.0.0.9:9090" {
		t.Fatalf("expected legacy TCP target to be cleared, got %q", got)
	}
	if svc.removeNet != 1 || svc.deleteTask != 1 || svc.deleteCalls != 1 {
		t.Fatalf("expected delete cleanup once, got removeNet=%d deleteTask=%d delete=%d", svc.removeNet, svc.deleteTask, svc.deleteCalls)
	}
}

func TestSetupNetworkAndGetIPRejectsEmptyIP(t *testing.T) {
	// First attempt returns an empty IP (invalid); the retry must be used.
	svc := &legacyRouteTestService{
		setupNetworkResults: []ctr.NetworkResult{{IP: ""}, {IP: "10.0.0.3"}},
	}
	m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{
		CNIBinaryDir: "/opt/cni/bin",
		CNIConfigDir: "/etc/cni/net.d",
	})

	ip, err := m.setupNetworkAndGetIP(context.Background(), "workspace-bot")
	if err != nil {
		t.Fatalf("setupNetworkAndGetIP failed: %v", err)
	}
	if ip != "10.0.0.3" {
		t.Fatalf("expected retry IP, got %q", ip)
	}
	if svc.setupNet != 2 {
		t.Fatalf("expected two network setup attempts, got %d", svc.setupNet)
	}
}

func TestContainerIDPrefersCurrentLabelSearch(t *testing.T) {
	t.Parallel()

	botID := "00000000-0000-0000-0000-000000000001"
	svc := &legacyRouteTestService{
		byLabel: []ctr.ContainerInfo{{
			ID:        "workspace-from-label",
			Labels:    map[string]string{BotLabelKey: botID},
			UpdatedAt: time.Now(),
		}},
	}
	m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{})

	containerID, err := m.ContainerID(context.Background(), botID)
	if err != nil {
		t.Fatalf("ContainerID failed: %v", err)
	}
	if containerID != "workspace-from-label" {
		t.Fatalf("expected label-resolved container ID, got %q", containerID)
	}
}

func TestContainerIDFallsBackToNameInference(t *testing.T) {
	t.Parallel()

	botID := "00000000-0000-0000-0000-000000000001"
	svc := &legacyRouteTestService{
		created: true,
		container: ctr.ContainerInfo{
			ID:        ContainerPrefix + botID,
			UpdatedAt: time.Now(),
		},
	}
	m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{})

	containerID, err := m.ContainerID(context.Background(), botID)
	if err != nil {
		t.Fatalf("ContainerID failed: %v", err)
	}
	if containerID != ContainerPrefix+botID {
		t.Fatalf("expected inferred container ID, got %q", containerID)
	}
}
diff --git a/internal/workspace/manager_lifecycle.go b/internal/workspace/manager_lifecycle.go
new file mode 100644
index 00000000..7f8e56a3
--- /dev/null
+++ b/internal/workspace/manager_lifecycle.go
@@ -0,0 +1,515 @@
package workspace

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"github.com/containerd/errdefs"
	"github.com/google/uuid"
	"github.com/jackc/pgx/v5"

	ctr "github.com/memohai/memoh/internal/containerd"
	"github.com/memohai/memoh/internal/db"
	dbsqlc "github.com/memohai/memoh/internal/db/sqlc"
)

// ---------------------------------------------------------------------------
// Container ID resolution
// ---------------------------------------------------------------------------

// ContainerID resolves the containerd container ID for a bot.
// Resolution order: DB lookup → label search → full container scan.
func (m *Manager) ContainerID(ctx context.Context, botID string) (string, error) {
	if m.queries != nil {
		pgBotID, err := db.ParseUUID(botID)
		if err == nil {
			row, dbErr := m.queries.GetContainerByBotID(ctx, pgBotID)
			if dbErr == nil && strings.TrimSpace(row.ContainerID) != "" {
				return row.ContainerID, nil
			}
			// ErrNoRows just means no record yet; anything else is logged
			// and resolution falls through to the containerd lookups.
			if dbErr != nil && !errors.Is(dbErr, pgx.ErrNoRows) {
				m.logger.Warn("ContainerID: db lookup failed",
					slog.String("bot_id", botID), slog.Any("error", dbErr))
			}
		}
	}

	containers, err := m.service.ListContainersByLabel(ctx, BotLabelKey, botID)
	if err != nil {
		return "", err
	}
	if id, ok := newestContainerID(containers); ok {
		return id, nil
	}

	// Last resort: scan all containers and match the bot ID from labels/name.
	containers, err = m.service.ListContainers(ctx)
	if err != nil {
		return "", err
	}
	matched := make([]ctr.ContainerInfo, 0, len(containers))
	for _, info := range containers {
		resolvedBotID, ok := BotIDFromContainerInfo(info)
		if !ok || resolvedBotID != botID {
			continue
		}
		matched = append(matched, info)
	}
	if id, ok := newestContainerID(matched); ok {
		return id, nil
	}

	return "", ErrContainerNotFound
}

// newestContainerID returns the ID of the most recently updated container in
// the slice, or ("", false) if the slice is empty.
func newestContainerID(containers []ctr.ContainerInfo) (string, bool) {
	bestID := ""
	var bestUpdated time.Time
	for _, info := range containers {
		if bestID == "" || info.UpdatedAt.After(bestUpdated) {
			bestID = info.ID
			bestUpdated = info.UpdatedAt
		}
	}
	return bestID, bestID != ""
}

// ---------------------------------------------------------------------------
// Task & network helpers
// ---------------------------------------------------------------------------

// isTaskRunning reports whether the container's task exists and is in the
// running state. Errors are treated as "not running".
func (m *Manager) isTaskRunning(ctx context.Context, containerID string) bool {
	tasks, err := m.service.ListTasks(ctx, &ctr.ListTasksOptions{
		Filter: "container.id==" + containerID,
	})
	return err == nil && len(tasks) > 0 && tasks[0].Status == ctr.TaskStatusRunning
}
+ +func (m *Manager) setupNetworkAndGetIP(ctx context.Context, containerID string) (string, error) { + var lastErr error + for attempt := range 2 { + result, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ + ContainerID: containerID, + CNIBinDir: m.cfg.CNIBinaryDir, + CNIConfDir: m.cfg.CNIConfigDir, + }) + if err != nil { + lastErr = err + m.logger.Warn("network setup attempt failed", + slog.String("container_id", containerID), + slog.Int("attempt", attempt+1), + slog.Any("error", err)) + continue + } + if strings.TrimSpace(result.IP) == "" { + lastErr = fmt.Errorf("network setup returned no IP for %s", containerID) + continue + } + return result.IP, nil + } + return "", fmt.Errorf("network setup failed for container %s: %w", containerID, lastErr) +} + +func (m *Manager) setupNetworkOrFail(ctx context.Context, containerID, botID string) error { + ip, err := m.setupNetworkAndGetIP(ctx, containerID) + if err != nil { + return err + } + // Legacy containers use TCP gRPC — cache their IP for the pool. + if m.IsLegacyContainer(ctx, containerID) { + m.SetLegacyIP(botID, ip) + } + return nil +} + +// --------------------------------------------------------------------------- +// Lifecycle: ensure / stop / info +// --------------------------------------------------------------------------- + +// EnsureRunning verifies the container exists and its task is running. +// If the container is missing, it rebuilds via SetupBotContainer. +// If the task is stopped, it restarts and sets up networking. 
func (m *Manager) EnsureRunning(ctx context.Context, botID string) error {
	containerID, err := m.ContainerID(ctx, botID)
	if err != nil {
		if errors.Is(err, ErrContainerNotFound) {
			m.logger.Warn("container missing, rebuilding", slog.String("bot_id", botID))
			return m.SetupBotContainer(ctx, botID)
		}
		return err
	}

	// The ID may come from a stale DB record — confirm with containerd.
	_, err = m.service.GetContainer(ctx, containerID)
	if err != nil {
		if !errdefs.IsNotFound(err) {
			return err
		}
		m.logger.Warn("container missing in containerd, rebuilding",
			slog.String("bot_id", botID), slog.String("container_id", containerID))
		return m.SetupBotContainer(ctx, botID)
	}

	tasks, err := m.service.ListTasks(ctx, &ctr.ListTasksOptions{
		Filter: "container.id==" + containerID,
	})
	if err != nil {
		return err
	}
	if len(tasks) > 0 {
		if tasks[0].Status == ctr.TaskStatusRunning {
			// Already running — just (re-)attach networking.
			return m.setupNetworkOrFail(ctx, containerID, botID)
		}
		// A dead task must be deleted before the container can start again.
		if err := m.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil {
			if !errdefs.IsNotFound(err) {
				m.logger.Warn("cleanup: delete task failed",
					slog.String("container_id", containerID), slog.Any("error", err))
				return err
			}
		}
	}

	if err := m.service.StartContainer(ctx, containerID, nil); err != nil {
		return err
	}
	return m.setupNetworkOrFail(ctx, containerID, botID)
}

// StopBot stops the container task for a bot and marks it stopped in DB.
func (m *Manager) StopBot(ctx context.Context, botID string) error {
	containerID, err := m.ContainerID(ctx, botID)
	if err != nil {
		return err
	}

	// NotFound is fine here — the task may already be gone.
	if err := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{
		Timeout: 10 * time.Second,
		Force:   true,
	}); err != nil && !errdefs.IsNotFound(err) {
		return err
	}
	if err := m.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil {
		m.logger.Warn("cleanup: delete task failed",
			slog.String("container_id", containerID), slog.Any("error", err))
	}

	m.markContainerStopped(ctx, botID)
	return nil
}

// GetContainerInfo returns current container status for a bot,
// combining DB records with live containerd state.
func (m *Manager) GetContainerInfo(ctx context.Context, botID string) (*ContainerStatus, error) {
	// Preferred path: a DB record with its live task/preserve/legacy flags.
	if m.queries != nil {
		pgBotID, parseErr := db.ParseUUID(botID)
		if parseErr == nil {
			row, dbErr := m.queries.GetContainerByBotID(ctx, pgBotID)
			if dbErr == nil {
				createdAt := time.Time{}
				if row.CreatedAt.Valid {
					createdAt = row.CreatedAt.Time
				}
				updatedAt := time.Time{}
				if row.UpdatedAt.Valid {
					updatedAt = row.UpdatedAt.Time
				}
				return &ContainerStatus{
					ContainerID:      row.ContainerID,
					Image:            row.Image,
					Status:           row.Status,
					Namespace:        row.Namespace,
					ContainerPath:    row.ContainerPath,
					TaskRunning:      m.isTaskRunning(ctx, row.ContainerID),
					HasPreservedData: m.HasPreservedData(botID),
					Legacy:           m.IsLegacyContainer(ctx, row.ContainerID),
					CreatedAt:        createdAt,
					UpdatedAt:        updatedAt,
				}, nil
			}
		}
	}

	// Fallback: build the status from containerd alone (status "unknown").
	containerID, err := m.ContainerID(ctx, botID)
	if err != nil {
		return nil, err
	}
	info, err := m.service.GetContainer(ctx, containerID)
	if err != nil {
		if errdefs.IsNotFound(err) {
			return nil, ErrContainerNotFound
		}
		return nil, err
	}
	return &ContainerStatus{
		ContainerID:      info.ID,
		Image:            info.Image,
		Status:           "unknown",
		Namespace:        m.namespace,
		TaskRunning:      m.isTaskRunning(ctx, containerID),
		HasPreservedData: m.HasPreservedData(botID),
		Legacy:           m.IsLegacyContainer(ctx, containerID),
		CreatedAt:        info.CreatedAt,
		UpdatedAt:        info.UpdatedAt,
	}, nil
}

// PullImage pulls a container image. This is exposed so the HTTP layer can
// pass progress callbacks for SSE streaming without needing direct ctr.Service access.
func (m *Manager) PullImage(ctx context.Context, image string, opts *ctr.PullImageOptions) (ctr.ImageInfo, error) {
	return m.service.PullImage(ctx, image, opts)
}

// ---------------------------------------------------------------------------
// Container lifecycle (bots.ContainerLifecycle interface)
// ---------------------------------------------------------------------------

// SetupBotContainer creates/starts the container and upserts the DB record.
func (m *Manager) SetupBotContainer(ctx context.Context, botID string) error {
	image, err := m.resolveWorkspaceImage(ctx, botID)
	if err != nil {
		m.logger.Error("setup bot container: resolve image failed",
			slog.String("bot_id", botID),
			slog.Any("error", err))
		return err
	}

	if err := m.startWithResolvedImage(ctx, botID, image); err != nil {
		m.logger.Error("setup bot container: start failed",
			slog.String("bot_id", botID),
			slog.Any("error", err))
		return err
	}
	// Best-effort: failing to remember the image must not fail the setup.
	if err := m.RememberWorkspaceImage(ctx, botID, image); err != nil {
		m.logger.Warn("setup bot container: remember workspace image failed",
			slog.String("bot_id", botID),
			slog.String("image", image),
			slog.Any("error", err))
	}

	containerID := m.resolveContainerID(ctx, botID)
	m.upsertContainerRecord(ctx, botID, containerID, "running", image)
	return nil
}

// CleanupBotContainer removes the container and DB record for a bot.
// When preserveData is true, /data is exported to a backup archive before deletion.
func (m *Manager) CleanupBotContainer(ctx context.Context, botID string, preserveData bool) error {
	if err := m.Delete(ctx, botID, preserveData); err != nil {
		if preserveData {
			// When preserving data, any error (including NotFound) must
			// block the workflow — we cannot delete the DB record if we
			// failed to preserve data.
			return err
		}
		if !errdefs.IsNotFound(err) {
			return err
		}
		m.logger.Warn("cleanup: container not found in containerd, continuing",
			slog.String("bot_id", botID))
	}

	m.deleteContainerRecord(ctx, botID)
	return nil
}

// ---------------------------------------------------------------------------
// Reconciliation
// ---------------------------------------------------------------------------

// ReconcileContainers compares the DB containers table against actual containerd
// state on startup. For each auto_start container in DB it verifies the container
// and task exist; if missing they are rebuilt.
func (m *Manager) ReconcileContainers(ctx context.Context) {
	if m.queries == nil {
		return
	}
	rows, err := m.queries.ListAutoStartContainers(ctx)
	if err != nil {
		m.logger.Error("reconcile: failed to list containers from DB", slog.Any("error", err))
		return
	}
	if len(rows) == 0 {
		m.logger.Info("reconcile: no auto-start containers in DB")
		return
	}

	m.logger.Info("reconcile: checking containers", slog.Int("count", len(rows)))
	for _, row := range rows {
		containerID := row.ContainerID
		botID := uuid.UUID(row.BotID.Bytes).String()

		_, err := m.service.GetContainer(ctx, containerID)
		if err != nil {
			if !errdefs.IsNotFound(err) {
				m.logger.Error("reconcile: failed to get container",
					slog.String("container_id", containerID), slog.Any("error", err))
				continue
			}
			// Container missing in containerd — rebuild.
			m.logger.Warn("reconcile: container missing, rebuilding",
				slog.String("bot_id", botID), slog.String("container_id", containerID))
			if setupErr := m.SetupBotContainer(ctx, botID); setupErr != nil {
				m.logger.Error("reconcile: rebuild failed",
					slog.String("bot_id", botID), slog.Any("error", setupErr))
				m.markContainerStatus(ctx, botID, "error")
			}
			continue
		}

		// --- legacy container support (mcp- prefix, TCP gRPC) ---
		// Remove when all deployments have migrated to workspace- containers.
		if m.IsLegacyContainer(ctx, containerID) {
			m.logger.Warn("reconcile: legacy container (pre-bridge), using TCP fallback",
				slog.String("bot_id", botID), slog.String("container_id", containerID))

			running := m.isTaskRunning(ctx, containerID)
			if !running {
				if err := m.EnsureRunning(ctx, botID); err != nil {
					m.logger.Error("reconcile: failed to start legacy container",
						slog.String("bot_id", botID), slog.Any("error", err))
					continue
				}
			}
			// Cache the legacy container's IP so the gRPC pool can dial TCP.
			if ip, netErr := m.setupNetworkAndGetIP(ctx, containerID); netErr != nil {
				m.logger.Error("reconcile: network setup failed for legacy container",
					slog.String("bot_id", botID), slog.Any("error", netErr))
			} else {
				m.SetLegacyIP(botID, ip)
				m.logger.Info("reconcile: legacy container reachable via TCP",
					slog.String("bot_id", botID), slog.String("ip", ip))
			}
			continue
		}

		// Container exists — ensure the task is running.
		running := m.isTaskRunning(ctx, containerID)
		if running {
			if row.Status != "running" {
				m.markContainerStarted(ctx, botID)
			}
			if netErr := m.setupNetworkOrFail(ctx, containerID, botID); netErr != nil {
				m.logger.Error("reconcile: network setup failed for running task, container unreachable",
					slog.String("bot_id", botID),
					slog.String("container_id", containerID),
					slog.Any("error", netErr))
			} else {
				m.logger.Info("reconcile: container healthy",
					slog.String("bot_id", botID), slog.String("container_id", containerID))
			}
			continue
		}

		// Task not running — try to start it.
		m.logger.Warn("reconcile: task not running, starting",
			slog.String("bot_id", botID), slog.String("container_id", containerID))
		if err := m.EnsureRunning(ctx, botID); err != nil {
			m.logger.Error("reconcile: failed to start task",
				slog.String("bot_id", botID), slog.Any("error", err))
			m.markContainerStopped(ctx, botID)
		} else {
			m.markContainerStarted(ctx, botID)
		}
	}
	m.logger.Info("reconcile: completed")
}

// RecordContainerRunning upserts a DB record marking the resolved container as running.
// This is exported for the HTTP handler's SSE-based creation flow, where the
// pull + start happen in the handler but the DB write belongs to Manager.
+func (m *Manager) RecordContainerRunning(ctx context.Context, botID, containerID, image string) { + m.upsertContainerRecord(ctx, botID, containerID, "running", image) +} + +// --------------------------------------------------------------------------- +// DB record helpers (unexported) +// --------------------------------------------------------------------------- + +func (m *Manager) upsertContainerRecord(ctx context.Context, botID, containerID, status, image string) { + if m.queries == nil { + return + } + pgBotID, err := db.ParseUUID(botID) + if err != nil { + return + } + ns := strings.TrimSpace(m.namespace) + if ns == "" { + ns = "default" + } + if dbErr := m.queries.UpsertContainer(ctx, dbsqlc.UpsertContainerParams{ + BotID: pgBotID, + ContainerID: containerID, + ContainerName: containerID, + Image: image, + Status: status, + Namespace: ns, + AutoStart: true, + }); dbErr != nil { + m.logger.Error("failed to upsert container record", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } + if status == "running" { + m.markContainerStarted(ctx, botID) + } +} + +func (m *Manager) deleteContainerRecord(ctx context.Context, botID string) { + if m.queries == nil { + return + } + pgBotID, err := db.ParseUUID(botID) + if err != nil { + return + } + if dbErr := m.queries.DeleteContainerByBotID(ctx, pgBotID); dbErr != nil { + m.logger.Error("failed to delete container record", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } +} + +func (m *Manager) markContainerStarted(ctx context.Context, botID string) { + if m.queries == nil { + return + } + pgBotID, err := db.ParseUUID(botID) + if err != nil { + return + } + if dbErr := m.queries.UpdateContainerStarted(ctx, pgBotID); dbErr != nil { + m.logger.Error("failed to update container started status", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } +} + +func (m *Manager) markContainerStopped(ctx context.Context, botID string) { + if m.queries == nil { + return + } + pgBotID, err := 
db.ParseUUID(botID) + if err != nil { + return + } + if dbErr := m.queries.UpdateContainerStopped(ctx, pgBotID); dbErr != nil { + m.logger.Error("failed to update container stopped status", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } +} + +func (m *Manager) markContainerStatus(ctx context.Context, botID, status string) { + if m.queries == nil { + return + } + pgBotID, err := db.ParseUUID(botID) + if err != nil { + return + } + if dbErr := m.queries.UpdateContainerStatus(ctx, dbsqlc.UpdateContainerStatusParams{ + Status: status, + BotID: pgBotID, + }); dbErr != nil { + m.logger.Error("failed to update container status", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } +} diff --git a/internal/mcp/versioning.go b/internal/workspace/versioning.go similarity index 93% rename from internal/mcp/versioning.go rename to internal/workspace/versioning.go index 17505ffb..b5c70d9c 100644 --- a/internal/mcp/versioning.go +++ b/internal/workspace/versioning.go @@ -1,4 +1,4 @@ -package mcp +package workspace import ( "context" @@ -66,7 +66,7 @@ func (m *Manager) CreateSnapshot(ctx context.Context, botID, snapshotName, sourc return nil, err } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -142,7 +142,7 @@ func (m *Manager) CreateVersion(ctx context.Context, botID string) (*VersionInfo return nil, err } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -210,7 +210,7 @@ func (m *Manager) ListBotSnapshotData(ctx context.Context, botID string) (*BotSn return nil, err } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -272,7 +272,7 @@ func (m *Manager) ListVersions(ctx context.Context, botID string) ([]VersionInfo return nil, err } - containerID := m.containerID(botID) + 
containerID := m.resolveContainerID(ctx, botID) versions, err := m.queries.ListVersionsByContainerID(ctx, containerID) if err != nil { return nil, err @@ -307,7 +307,7 @@ func (m *Manager) RollbackVersion(ctx context.Context, botID string, version int return errors.New("version out of range") } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -353,7 +353,7 @@ func (m *Manager) VersionSnapshotName(ctx context.Context, botID string, version return "", errors.New("version out of range") } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) return m.queries.GetVersionSnapshotRuntimeName(ctx, dbsqlc.GetVersionSnapshotRuntimeNameParams{ ContainerID: containerID, Version: int32(version), @@ -391,26 +391,26 @@ func (m *Manager) replaceContainerSnapshot(ctx context.Context, botID, container // unconditionally so the next call dials fresh to the new process. m.grpcPool.Remove(botID) - netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ + // CNI network setup (for outbound connectivity). 
+ if _, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ ContainerID: containerID, CNIBinDir: m.cfg.CNIBinaryDir, CNIConfDir: m.cfg.CNIConfigDir, - }) - if err != nil { + }); err != nil { return fmt.Errorf("network setup after snapshot replace: %w", err) } - if netResult.IP == "" { - return fmt.Errorf("network setup returned no IP after snapshot replace for %s", containerID) - } - m.SetContainerIP(botID, netResult.IP) return nil } -func (m *Manager) buildVersionSpec(_ string) (ctr.ContainerSpec, error) { +func (m *Manager) buildVersionSpec(botID string) (ctr.ContainerSpec, error) { resolvPath, err := ctr.ResolveConfSource(m.dataRoot()) if err != nil { return ctr.ContainerSpec{}, err } + + runtimeDir := m.cfg.RuntimePath() + sockDir := m.socketDir(botID) + mounts := []ctr.MountSpec{ { Destination: "/etc/resolv.conf", @@ -418,12 +418,30 @@ func (m *Manager) buildVersionSpec(_ string) (ctr.ContainerSpec, error) { Source: resolvPath, Options: []string{"rbind", "ro"}, }, + { + Destination: "/opt/memoh", + Type: "bind", + Source: runtimeDir, + Options: []string{"rbind", "ro"}, + }, + { + Destination: "/run/memoh", + Type: "bind", + Source: sockDir, + Options: []string{"rbind", "rw"}, + }, } tzMounts, tzEnv := ctr.TimezoneSpec() mounts = append(mounts, tzMounts...) + + env := make([]string, 0, len(tzEnv)+1) + env = append(env, tzEnv...) 
+ env = append(env, "BRIDGE_SOCKET_PATH=/run/memoh/bridge.sock") + return ctr.ContainerSpec{ + Cmd: []string{"/opt/memoh/bridge"}, Mounts: mounts, - Env: tzEnv, + Env: env, }, nil } diff --git a/mise.toml b/mise.toml index ae21575c..62d09979 100644 --- a/mise.toml +++ b/mise.toml @@ -59,7 +59,6 @@ description = "Start development environment" run = """ #!/bin/bash set -e -cp devenv/app.dev.toml config.toml docker compose -f devenv/docker-compose.yml up --build """ @@ -75,13 +74,13 @@ run = "docker compose -f devenv/docker-compose.yml logs -f" description = "Restart a service (usage: mise run dev:restart -- server)" run = "docker compose -f devenv/docker-compose.yml restart $@" -[tasks."mcp:build"] -description = "Manually build MCP dev binary (normally auto-triggered by air)" +[tasks."bridge:build"] +description = "Manually rebuild bridge binary in dev container (normally auto-triggered by air)" run = """ #!/bin/bash set -e docker compose -f devenv/docker-compose.yml exec server \ - sh -c 'cd /workspace && sh devenv/mcp-build.sh' + sh -c 'cd /workspace && sh devenv/bridge-build.sh' """ [tasks.db-up] @@ -161,6 +160,6 @@ depends = [ run = """ #!/bin/bash set -e -cp devenv/app.dev.toml config.toml echo '✓ Setup complete! 
Run: mise run dev' +echo ' Dev web UI will be available at http://localhost:18082' """ diff --git a/packages/agent/src/agent.ts b/packages/agent/src/agent.ts index 94e2b830..eab1bde2 100644 --- a/packages/agent/src/agent.ts +++ b/packages/agent/src/agent.ts @@ -110,7 +110,6 @@ export const createAgent = ( currentChannel = 'Unknown Channel', identity = { botId: '', - containerId: '', channelIdentityId: '', displayName: '', }, diff --git a/packages/agent/src/types/agent.ts b/packages/agent/src/types/agent.ts index d6fb1da8..6aa6dbab 100644 --- a/packages/agent/src/types/agent.ts +++ b/packages/agent/src/types/agent.ts @@ -5,9 +5,8 @@ import { MCPConnection } from './mcp' export interface IdentityContext { botId: string - containerId?: string - channelIdentityId?: string - displayName?: string + channelIdentityId: string + displayName: string currentPlatform?: string replyTarget?: string conversationType?: string diff --git a/packages/config/src/index.ts b/packages/config/src/index.ts index 32ee2f98..8deba1a7 100644 --- a/packages/config/src/index.ts +++ b/packages/config/src/index.ts @@ -1,9 +1,15 @@ import { parse } from 'toml' import { readFileSync } from 'fs' -import type { Config } from './types.ts' +import type { Config } from './types' export const loadConfig = (path: string = './config.toml'): Config => { const config = parse(readFileSync(path, 'utf-8')) + if ('mcp' in config) { + if ('workspace' in config) { + throw new Error('config uses both [mcp] and [workspace]; remove [mcp] and keep only [workspace]') + } + throw new Error('config section [mcp] has been renamed to [workspace]; update your config.toml and restart') + } return config satisfies Config } @@ -25,4 +31,4 @@ export const getBaseUrl = (config: Config) => { return `http://${rawAddr}` } -export * from './types.ts' +export * from './types' diff --git a/packages/config/src/types.ts b/packages/config/src/types.ts index a5fc862d..a4da3f08 100644 --- a/packages/config/src/types.ts +++ 
b/packages/config/src/types.ts @@ -4,7 +4,7 @@ export interface Config { admin: AdminConfig; auth: AuthConfig; containerd: ContainerdConfig; - mcp: McpConfig; + workspace: WorkspaceConfig; postgres: PostgresConfig; qdrant: QdrantConfig; sparse: SparseConfig; @@ -38,10 +38,14 @@ export interface ContainerdConfig { namespace: string; } -export interface McpConfig { - image: string; +export interface WorkspaceConfig { + registry?: string; + default_image: string; snapshotter: string; data_root: string; + cni_bin_dir?: string; + cni_conf_dir?: string; + runtime_dir?: string; } export interface PostgresConfig { @@ -80,4 +84,3 @@ export interface WebConfig { host: string; port: number; } - diff --git a/packages/sdk/package.json b/packages/sdk/package.json index 720667f7..04f68440 100644 --- a/packages/sdk/package.json +++ b/packages/sdk/package.json @@ -5,7 +5,8 @@ "exports": { ".": "./src/index.ts", "./client": "./src/client.gen.ts", - "./colada": "./src/@pinia/colada.gen.ts" + "./colada": "./src/@pinia/colada.gen.ts", + "./extra": "./src/extra/index.ts" }, "main": "index.js", "scripts": { diff --git a/packages/sdk/src/container-stream.ts b/packages/sdk/src/container-stream.ts new file mode 100644 index 00000000..70aae99a --- /dev/null +++ b/packages/sdk/src/container-stream.ts @@ -0,0 +1,105 @@ +import { mergeHeaders } from './client' +import { client } from './client.gen' +import type { Options } from './sdk.gen' +import type { + HandlersCreateContainerResponse, + PostBotsByBotIdContainerData, +} from './types.gen' + +// Handwritten SDK supplement for container-create SSE. +// Re-export this module via @memoh/sdk/extra instead of the generated root entry, +// because packages/sdk/src/index.ts is regenerated from OpenAPI. + +export type ContainerCreateLayerStatus = { + ref: string + offset: number + total: number +} + +// codesync(container-create-stream): keep these manual SSE payload types in sync +// with internal/handlers/containerd.go. 
+export type ContainerCreateStreamEvent = + | { type: 'pulling'; image: string } + | { type: 'pull_progress'; layers: ContainerCreateLayerStatus[] } + | { type: 'creating' } + | { type: 'restoring' } + | { type: 'complete'; container: HandlersCreateContainerResponse } + | { type: 'error'; message: string } + +export type ContainerCreateStreamResult = { + stream: AsyncGenerator +} + +function isLayerStatus(value: unknown): value is ContainerCreateLayerStatus { + return !!value + && typeof value === 'object' + && typeof (value as { ref?: unknown }).ref === 'string' + && typeof (value as { offset?: unknown }).offset === 'number' + && typeof (value as { total?: unknown }).total === 'number' +} + +function isContainerCreateStreamEvent(value: unknown): value is ContainerCreateStreamEvent { + if (!value || typeof value !== 'object') return false + + const event = value as Record + switch (event.type) { + case 'pulling': + return typeof event.image === 'string' + case 'pull_progress': + return Array.isArray(event.layers) && event.layers.every(isLayerStatus) + case 'creating': + case 'restoring': + return true + case 'complete': + return !!event.container && typeof event.container === 'object' + case 'error': + return typeof event.message === 'string' + default: + return false + } +} + +function toError(error: unknown): Error { + if (error instanceof Error) return error + if (typeof error === 'string' && error.trim()) return new Error(error) + return new Error('Container create stream failed') +} + +export async function postBotsByBotIdContainerStream( + options: Options, +): Promise { + let streamError: unknown + + const result = await client.sse.post({ + url: '/bots/{bot_id}/container', + ...options, + headers: mergeHeaders(options.headers, { + Accept: 'text/event-stream', + 'Content-Type': 'application/json', + }), + onSseError: (error) => { + streamError = error + }, + responseValidator: async (data) => { + if (!isContainerCreateStreamEvent(data)) { + throw new 
Error('Invalid container create stream event') + } + }, + sseMaxRetryAttempts: 1, + }) + + return { + stream: (async function* () { + for await (const event of result.stream as AsyncGenerator) { + if (!isContainerCreateStreamEvent(event)) { + throw new Error('Invalid container create stream event') + } + yield event + } + + if (streamError) { + throw toError(streamError) + } + })(), + } +} diff --git a/packages/sdk/src/extra/index.ts b/packages/sdk/src/extra/index.ts new file mode 100644 index 00000000..bee9b7a5 --- /dev/null +++ b/packages/sdk/src/extra/index.ts @@ -0,0 +1,10 @@ +// Handwritten SDK supplements that OpenAPI generation cannot express cleanly. +// Keep these exports under @memoh/sdk/extra instead of the generated root entry, +// because packages/sdk/src/index.ts is overwritten by sdk generation. + +export { postBotsByBotIdContainerStream } from '../container-stream' +export type { + ContainerCreateLayerStatus, + ContainerCreateStreamEvent, + ContainerCreateStreamResult, +} from '../container-stream' diff --git a/packages/sdk/src/types.gen.ts b/packages/sdk/src/types.gen.ts index 72269806..1da5fdc4 100644 --- a/packages/sdk/src/types.gen.ts +++ b/packages/sdk/src/types.gen.ts @@ -690,6 +690,7 @@ export type HandlersChannelMeta = { }; export type HandlersCreateContainerRequest = { + image?: string; restore_data?: boolean; snapshotter?: string; }; @@ -780,6 +781,7 @@ export type HandlersGetContainerResponse = { created_at?: string; has_preserved_data?: boolean; image?: string; + legacy?: boolean; namespace?: string; status?: string; task_running?: boolean; diff --git a/spec/docs.go b/spec/docs.go index 009e0685..42e50096 100644 --- a/spec/docs.go +++ b/spec/docs.go @@ -10566,6 +10566,9 @@ const docTemplate = `{ "handlers.CreateContainerRequest": { "type": "object", "properties": { + "image": { + "type": "string" + }, "restore_data": { "type": "boolean" }, @@ -10783,6 +10786,9 @@ const docTemplate = `{ "image": { "type": "string" }, + "legacy": { + 
"type": "boolean" + }, "namespace": { "type": "string" }, diff --git a/spec/swagger.json b/spec/swagger.json index 9b3c3e65..5b9805e6 100644 --- a/spec/swagger.json +++ b/spec/swagger.json @@ -10557,6 +10557,9 @@ "handlers.CreateContainerRequest": { "type": "object", "properties": { + "image": { + "type": "string" + }, "restore_data": { "type": "boolean" }, @@ -10774,6 +10777,9 @@ "image": { "type": "string" }, + "legacy": { + "type": "boolean" + }, "namespace": { "type": "string" }, diff --git a/spec/swagger.yaml b/spec/swagger.yaml index bd41330c..c6731446 100644 --- a/spec/swagger.yaml +++ b/spec/swagger.yaml @@ -1140,6 +1140,8 @@ definitions: type: object handlers.CreateContainerRequest: properties: + image: + type: string restore_data: type: boolean snapshotter: @@ -1280,6 +1282,8 @@ definitions: type: boolean image: type: string + legacy: + type: boolean namespace: type: string status: