diff --git a/.air.toml b/.air.toml index 44899831..9bd18b43 100644 --- a/.air.toml +++ b/.air.toml @@ -2,7 +2,7 @@ root = "." tmp_dir = "tmp" [build] -cmd = "go build -o ./tmp/memoh-server ./cmd/agent/main.go && sh devenv/mcp-build.sh" +cmd = "go build -o ./tmp/memoh-server ./cmd/agent/main.go && sh devenv/bridge-build.sh" bin = "./tmp/memoh-server" args_bin = ["serve"] include_ext = ["go", "toml"] diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 254f3b32..fff2cb83 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - image: [server, agent, web, mcp, browser, sparse] + image: [server, agent, web, browser, sparse] platform: [linux/amd64, linux/arm64] include: - image: server @@ -45,8 +45,6 @@ jobs: dockerfile: docker/Dockerfile.agent - image: web dockerfile: docker/Dockerfile.web - - image: mcp - dockerfile: docker/Dockerfile.mcp - image: browser dockerfile: docker/Dockerfile.browser - image: sparse @@ -64,14 +62,14 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Set up Go - if: matrix.image == 'server' || matrix.image == 'mcp' + if: matrix.image == 'server' uses: actions/setup-go@v5 with: go-version: '1.25' cache: true - name: Pre-warm Go mod cache - if: matrix.image == 'server' || matrix.image == 'mcp' + if: matrix.image == 'server' run: | mkdir -p .go-cache GOMODCACHE=$(pwd)/.go-cache go mod download @@ -99,7 +97,7 @@ jobs: file: ${{ matrix.dockerfile }} platforms: ${{ matrix.platform }} outputs: ${{ env.PUSH == 'true' && format('type=image,"name={0}/{1}/{2}",push-by-digest=true,name-canonical=true,push=true,compression=zstd', env.REGISTRY, github.repository_owner, matrix.image) || '' }} - build-contexts: ${{ (matrix.image == 'server' || matrix.image == 'mcp') && format('gomodcache={0}/.go-cache', github.workspace) || '' }} + build-contexts: ${{ matrix.image == 'server' && format('gomodcache={0}/.go-cache', github.workspace) || '' }} build-args: | VERSION=${{ github.ref_name }} COMMIT_HASH=${{ github.sha }} @@ -134,7 +132,7 @@ jobs: needs: build strategy: matrix: - image: [server, agent, web, mcp, browser, sparse] + image: [server, agent, web, browser, sparse] steps: - name: Download digests uses: actions/download-artifact@v4 diff --git a/.gitignore b/.gitignore index 215184b9..2a5be4ee 100644 --- a/.gitignore +++ b/.gitignore @@ -93,7 +93,7 @@ tmp/ # compiled files /memoh /agent -/mcp +/bridge docs/docs/.vitepress/cache .pnpm-store @@ -106,3 +106,4 @@ config.toml .workdocs/ data _main-ref/ +.toolkit/ diff --git a/AGENTS.md b/AGENTS.md index 00166f62..9728a740 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -60,7 +60,7 @@ Infrastructure dependencies: Memoh/ ├── cmd/ # Go application entry points │ ├── agent/ # Main backend server (main.go) -│ ├── mcp/ # MCP server binary (stdio transport, template/, entrypoint.sh) +│ ├── bridge/ # Bridge server binary (in-container gRPC, template/) │ └── memoh/ # Unified binary wrapper (Cobra CLI) ├── internal/ # Go backend core code (domain packages) │ ├── accounts/ # User account management (CRUD, password hashing) @@ -86,7 +86,8 @@ Memoh/ │ ├── identity/ # Identity type utilities (human vs bot) │ ├── inbox/ # Bot inbox service (notifications, triggers) │ ├── logger/ # Structured logging (slog) -│ ├── mcp/ # MCP protocol manager (container lifecycle, tool gateway) +│ ├── mcp/ # MCP protocol manager (connections, OAuth, tool gateway) +│ ├── workspace/ # Workspace container lifecycle (bridge client, protobuf) │ ├── media/ # Content-addressed 
media asset service │ ├── memory/ # Long-term memory system (Qdrant, BM25, LLM extraction) │ ├── message/ # Message persistence and event publishing @@ -141,7 +142,7 @@ Memoh/ │ ├── migrations/ # SQL migration files │ └── queries/ # SQL query files (sqlc input) ├── conf/ # Configuration templates (app.example.toml, app.docker.toml, app.apple.toml, app.windows.toml) -├── devenv/ # Dev environment (docker-compose, dev Dockerfiles, app.dev.toml, mcp-build.sh, server-entrypoint.sh) +├── devenv/ # Dev environment (docker-compose, dev Dockerfiles, app.dev.toml, bridge-build.sh, server-entrypoint.sh) ├── docker/ # Production Docker (Dockerfiles, entrypoints, nginx.conf, docker-compose.yml, docker-compose.cn.yml) ├── docs/ # Documentation site ├── scripts/ # Utility scripts (db, release, install) @@ -291,7 +292,7 @@ The main configuration file is `config.toml` (copied from `conf/app.example.toml - `[admin]` — Admin account credentials - `[auth]` — JWT authentication settings - `[containerd]` — Container runtime configuration (socket path, namespace) -- `[mcp]` — MCP image and data configuration +- `[workspace]` — Workspace image and data configuration - `[postgres]` — PostgreSQL connection - `[qdrant]` — Qdrant vector database connection - `[agent_gateway]` — Agent Gateway address diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 539b41b1..c5c8a8b3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,8 @@ winget install jdx.mise ```bash mise install # Install toolchains (Go, Node, Bun, pnpm, sqlc) -mise run setup # Copy config + install deps +./docker/toolkit/install.sh # Install toolkit used by the nested workspace runtime +mise run setup # Install deps and prepare local tooling mise run dev # Start full containerized dev environment ``` @@ -32,6 +33,9 @@ That's it. `dev` launches everything in Docker containers: 4. Agent Gateway (Bun, hot-reload) 5. Web frontend (Vite, hot-reload) +The dev stack uses `devenv/app.dev.toml` directly and no longer overwrites the repo root `config.toml`. +Default host ports are shifted away from the production compose stack: Web `18082`, API `18080`, Agent `18081`, Postgres `15432`, Qdrant `16333`/`16334`, Sparse `18085`. + ## Daily Development ```bash @@ -49,7 +53,7 @@ mise run dev:restart -- server # Restart a specific service | `mise run dev:down` | Stop dev environment | | `mise run dev:logs` | View dev logs | | `mise run dev:restart` | Restart a service (e.g. `-- server`) | -| `mise run setup` | Copy config + install deps | +| `mise run setup` | Install deps and prepare local tooling | | `mise run db-up` | Run database migrations | | `mise run db-down` | Roll back database migrations | | `mise run swagger-generate` | Generate Swagger documentation | @@ -60,7 +64,7 @@ mise run dev:restart -- server # Restart a specific service ``` conf/ — Configuration templates (app.example.toml, app.docker.toml) -devenv/ — Dev environment (docker-compose, dev Dockerfiles, app.dev.toml, mcp-build.sh) +devenv/ — Dev environment (docker-compose, dev Dockerfiles, app.dev.toml, bridge-build.sh) docker/ — Production Docker build & runtime (Dockerfiles, entrypoints) cmd/ — Go application entry points internal/ — Go backend core code diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md index 04d4473a..5d54d4fd 100644 --- a/DEPLOYMENT.md +++ b/DEPLOYMENT.md @@ -66,7 +66,7 @@ For Mem0 or OpenViking SaaS, no profile is needed. 
Configure the provider directly. ### China Mainland Mirror -Uncomment `registry = "memoh.cn"` in `config.toml` under `[mcp]`, then add the CN overlay: +Uncomment `registry = "memoh.cn"` in `config.toml` under `[workspace]`, then add the CN overlay: ```bash sudo docker compose -f docker-compose.yml -f docker/docker-compose.cn.yml \ diff --git a/apps/agent/src/models.ts b/apps/agent/src/models.ts index d5966e9f..e6903ca4 100644 --- a/apps/agent/src/models.ts +++ b/apps/agent/src/models.ts @@ -27,9 +27,8 @@ export const ModelConfigModel = z.object({ export const IdentityContextModel = z.object({ botId: z.string().min(1, 'Bot ID is required'), - containerId: z.string().optional().default(''), - channelIdentityId: z.string().optional().default(''), - displayName: z.string().optional().default(''), + channelIdentityId: z.string().min(1, 'Channel identity ID is required'), + displayName: z.string().min(1, 'Display name is required'), currentPlatform: z.string().optional(), replyTarget: z.string().optional(), conversationType: z.string().optional(), diff --git a/apps/web/AGENTS.md b/apps/web/AGENTS.md index b532909f..818f8215 100644 --- a/apps/web/AGENTS.md +++ b/apps/web/AGENTS.md @@ -319,7 +319,7 @@ Chat responses are streamed via Server-Sent Events: - Dev server port: 8082 (from `config.toml`) - Proxy: `/api` → backend (default `http://localhost:8080`) - Aliases: `@` → `./src`, `#` → `../ui/src` -- Config: reads from `../../config.toml` via `@memoh/config` +- Config: reads from `MEMOH_CONFIG_PATH` / `CONFIG_PATH` when provided, otherwise `../../config.toml`, via `@memoh/config` ## Development Rules diff --git a/apps/web/src/i18n/locales/en.json b/apps/web/src/i18n/locales/en.json index a3d3dc5d..5439e3af 100644 --- a/apps/web/src/i18n/locales/en.json +++ b/apps/web/src/i18n/locales/en.json @@ -486,6 +486,7 @@ "title": "Bots", "searchPlaceholder": "Search bots…", "createBot": "New Bot", + "createBotWaitHint": "Creating a bot may require pulling the base image on first use. Please wait a moment after submission.", "editBot": "Edit Bot", "deleteConfirm": "Are you sure you want to delete this bot?", "renameSuccess": "Bot name updated", @@ -636,9 +637,14 @@ "subtitle": "Manage the runtime container attached to this bot.", "botNotReady": "This bot is in lifecycle transition. Container actions are temporarily disabled.", "empty": "No container found for this bot. Create one to enable runtime tooling.", + "legacyWarning": "This container uses an older architecture and needs to be recreated for full compatibility. Your data will be preserved automatically.", + "legacyRecreate": "Recreate Container", + "legacyRecreateSuccess": "Container recreated successfully", "createHint": "The container is created from the current image. If you explicitly enable restore, preserved data will be restored after creation.", "createRestoreDataLabel": "Restore preserved data after creation", "createRestoreDataDescription": "If a previously exported backup or legacy bind-mounted data exists, it will be restored into `/data` after the container is created.", + "createImageLabel": "Base image", + "createImageDescription": "Docker image to use as the container base (e.g. debian:bookworm-slim, alpine:latest, ubuntu:24.04). Leave empty for the default.", "deleteConfirm": "Are you sure you want to permanently delete this container?
Unpreserved data cannot be recovered.", "deletePreserveConfirm": "Are you sure you want to export `/data` and then delete this container?", "restoreConfirm": "Are you sure you want to restore preserved data into this container's `/data`?", @@ -664,6 +670,10 @@ "importSuccess": "Data imported", "restoreSuccess": "Preserved data restored", "rollbackSuccess": "Snapshot rolled back", + "pullingImage": "Pulling image...", + "creatingContainer": "Creating container...", + "preservingData": "Backing up data, this may take a while for large volumes...", + "restoringData": "Restoring data, this may take a while for large volumes...", "snapshotEmpty": "No snapshots found", "snapshotLoadFailed": "Failed to load snapshots", "snapshotNamePlaceholder": "Snapshot display name (optional)", diff --git a/apps/web/src/i18n/locales/zh.json b/apps/web/src/i18n/locales/zh.json index b2704c18..a4b95216 100644 --- a/apps/web/src/i18n/locales/zh.json +++ b/apps/web/src/i18n/locales/zh.json @@ -482,6 +482,7 @@ "title": "Bots", "searchPlaceholder": "搜索 Bot…", "createBot": "新建 Bot", + "createBotWaitHint": "首次创建时可能需要拉取基础镜像,提交后请耐心等待片刻。", "editBot": "编辑 Bot", "deleteConfirm": "确定要删除这个 Bot 吗?", "renameSuccess": "Bot 名称已更新", @@ -632,9 +633,14 @@ "subtitle": "管理当前 Bot 对应的运行容器。", "botNotReady": "当前 Bot 正在生命周期变更中,暂时不可操作容器。", "empty": "当前 Bot 尚未创建容器,创建后可启用运行环境能力。", + "legacyWarning": "当前容器使用旧版架构,需要重建以获得完整兼容性。重建时数据会自动保留。", + "legacyRecreate": "重建容器", + "legacyRecreateSuccess": "容器重建成功", "createHint": "容器会基于当前镜像创建;如你显式开启恢复,则会在创建后尝试恢复已保留的数据。", "createRestoreDataLabel": "创建后恢复已保留数据", "createRestoreDataDescription": "如果存在之前导出的备份或旧版 bind mount 数据,将在容器创建后恢复到 `/data`。", + "createImageLabel": "基础镜像", + "createImageDescription": "作为容器基础环境的 Docker 镜像(如 debian:bookworm-slim、alpine:latest、ubuntu:24.04)。留空则使用默认镜像。", "deleteConfirm": "确定要彻底删除这个容器吗?未保留的数据将无法恢复。", "deletePreserveConfirm": "确定要先导出 `/data` 再删除这个容器吗?", "restoreConfirm": "确定要将已保留的数据恢复到当前容器的 `/data` 吗?", @@ -660,6 +666,10 @@ "importSuccess": "数据导入成功", "restoreSuccess": "已恢复保留数据", "rollbackSuccess": "快照回滚成功", + "pullingImage": "正在拉取镜像...", + "creatingContainer": "正在创建容器...", + "preservingData": "正在备份数据,数据量较大时可能需要一段时间...", + "restoringData": "正在迁移数据,数据量较大时可能需要一段时间...", "snapshotEmpty": "暂无快照", "snapshotLoadFailed": "加载快照失败", "snapshotNamePlaceholder": "快照显示名称(可选)", diff --git a/apps/web/src/pages/bots/components/bot-container.vue b/apps/web/src/pages/bots/components/bot-container.vue index 1575ba94..e0bcee37 100644 --- a/apps/web/src/pages/bots/components/bot-container.vue +++ b/apps/web/src/pages/bots/components/bot-container.vue @@ -9,7 +9,6 @@ import { getBotsByBotIdContainer, getBotsByBotIdContainerSnapshots, getBotsById, - postBotsByBotIdContainer, postBotsByBotIdContainerDataExport, postBotsByBotIdContainerDataImport, postBotsByBotIdContainerDataRestore, @@ -17,15 +16,23 @@ import { postBotsByBotIdContainerSnapshotsRollback, postBotsByBotIdContainerStart, postBotsByBotIdContainerStop, + type HandlersCreateContainerRequest, type HandlersGetContainerResponse, type HandlersListSnapshotsResponse, } from '@memoh/sdk' +import { + postBotsByBotIdContainerStream, + type ContainerCreateLayerStatus, + type ContainerCreateStreamEvent, +} from '@memoh/sdk/extra' import { Button, Input, Label, Separator, Spinner, Switch } from '@memoh/ui' import ConfirmPopover from '@/components/confirm-popover/index.vue' +import ContainerCreateProgress from './container-create-progress.vue' import { useSyncedQueryParam } from '@/composables/useSyncedQueryParam' import { useBotStatusMeta } from 
'@/composables/useBotStatusMeta' import { useCapabilitiesStore } from '@/store/capabilities' import { formatDateTime } from '@/utils/date-time' +import { shortenImageRef } from '@/utils/image-ref' import { resolveApiErrorMessage } from '@/utils/api-error' const route = useRoute() @@ -43,15 +50,38 @@ type ContainerAction = | 'import' | 'restore' | 'rollback' + | 'recreate' | '' const containerLoading = ref(false) const containerAction = ref('') const rollbackVersion = ref(null) const createRestoreData = ref(false) +const createImage = ref('') +const createImagePrefilled = ref(false) const newSnapshotName = ref('') const importInputRef = ref(null) +interface CreateProgress { + phase: 'preserving' | 'pulling' | 'creating' | 'restoring' | 'complete' | 'error' + layers?: ContainerCreateLayerStatus[] + image?: string + error?: string +} +const createProgress = ref(null) + +const createProgressPercent = computed(() => { + const layers = createProgress.value?.layers + if (!layers || layers.length === 0) return 0 + let totalOffset = 0 + let totalSize = 0 + for (const l of layers) { + totalOffset += l.offset + totalSize += l.total + } + return totalSize > 0 ? Math.round((totalOffset / totalSize) * 100) : 0 +}) + const capabilitiesStore = useCapabilitiesStore() const botId = computed(() => route.params.botId as string) const containerBusy = computed(() => containerLoading.value || containerAction.value !== '') @@ -157,29 +187,88 @@ const { data: bot } = useQuery({ enabled: () => !!botId.value, }) +function rememberedWorkspaceImage(metadata: Record | undefined): string { + const workspace = metadata?.workspace + if (!workspace || typeof workspace !== 'object' || Array.isArray(workspace)) return '' + const image = (workspace as Record).image + return typeof image === 'string' ? shortenImageRef(image) : '' +} + +const rememberedCreateImage = computed(() => rememberedWorkspaceImage(bot.value?.metadata as Record | undefined)) +const displayedContainerImage = computed(() => shortenImageRef(containerInfo.value?.image)) + const { isPending: botLifecyclePending } = useBotStatusMeta(bot, t) +function applyCreateContainerEvent(event: ContainerCreateStreamEvent): boolean { + switch (event.type) { + case 'pulling': + createProgress.value = { phase: 'pulling', image: event.image } + return false + case 'pull_progress': + createProgress.value = { + phase: 'pulling', + image: createProgress.value?.image, + layers: event.layers, + } + return false + case 'creating': + createProgress.value = { phase: 'creating' } + return false + case 'restoring': + createProgress.value = { phase: 'restoring' } + return false + case 'complete': + // Keep the last visible progress state until the container detail view loads. + // Rendering a separate "complete" phase here looks like the bar jumped back to 0. 
+ return !!event.container.data_restored + case 'error': + createProgress.value = { phase: 'error', error: event.message } + throw new Error(event.message || 'Unknown error') + } +} + +async function createContainerSSE(body: HandlersCreateContainerRequest): Promise<{ dataRestored: boolean }> { + const { stream } = await postBotsByBotIdContainerStream({ + path: { bot_id: botId.value }, + body, + throwOnError: true, + }) + + let dataRestored = false + for await (const event of stream) { + dataRestored = applyCreateContainerEvent(event) || dataRestored + } + + return { dataRestored } +} + async function handleCreateContainer() { if (botLifecyclePending.value) return - await runContainerAction( - 'create', - async () => { - const { data } = await postBotsByBotIdContainer({ - path: { bot_id: botId.value }, - body: { - restore_data: createRestoreData.value, - }, - throwOnError: true, - }) - createRestoreData.value = false - await loadContainerData(false) - return data - }, - (result) => result.data_restored + containerAction.value = 'create' + createProgress.value = { phase: 'pulling' } + try { + const body: HandlersCreateContainerRequest = { + restore_data: createRestoreData.value, + } + const trimmedImage = createImage.value.trim() + if (trimmedImage) body.image = trimmedImage + + const { dataRestored } = await createContainerSSE(body) + createRestoreData.value = false + createImage.value = '' + await loadContainerData(false) + toast.success(dataRestored ? t('bots.container.createRestoreSuccess') - : t('bots.container.createSuccess'), - ) + : t('bots.container.createSuccess')) + } + catch (error) { + toast.error(resolveErrorMessage(error, t('bots.container.actionFailed'))) + } + finally { + containerAction.value = '' + createProgress.value = null + } } const isContainerTaskRunning = computed(() => { @@ -192,6 +281,33 @@ const isContainerTaskRunning = computed(() => { }) const hasPreservedData = computed(() => !!containerInfo.value?.has_preserved_data) +const isLegacy = computed(() => !!containerInfo.value?.legacy) + +async function handleRecreateContainer() { + if (botLifecyclePending.value || !containerInfo.value) return + + containerAction.value = 'recreate' + try { + createProgress.value = { phase: 'preserving' } + await deleteBotsByBotIdContainer({ + path: { bot_id: botId.value }, + query: { preserve_data: true }, + throwOnError: true, + }) + + createProgress.value = { phase: 'pulling' } + await createContainerSSE({ restore_data: true }) + await loadContainerData(false) + toast.success(t('bots.container.legacyRecreateSuccess')) + } + catch (error) { + toast.error(resolveErrorMessage(error, t('bots.container.actionFailed'))) + } + finally { + containerAction.value = '' + createProgress.value = null + } +} async function handleStopContainer() { if (botLifecyclePending.value || !containerInfo.value) return @@ -226,6 +342,7 @@ async function handleDeleteContainer(preserveData: boolean) { const successMessage = preserveData ? 
t('bots.container.deletePreserveSuccess') : t('bots.container.deleteSuccess') + const lastImage = shortenImageRef(containerInfo.value.image) await runContainerAction( action, @@ -239,6 +356,8 @@ async function handleDeleteContainer(preserveData: boolean) { containerMissing.value = true snapshots.value = [] createRestoreData.value = preserveData + createImage.value = lastImage + createImagePrefilled.value = !!lastImage }, successMessage, ) @@ -445,6 +564,19 @@ const sortedSnapshots = computed(() => { const activeTab = useSyncedQueryParam('tab', 'overview') +watch(containerMissing, (missing) => { + if (!missing) { + createImagePrefilled.value = false + } +}) + +watch([containerMissing, rememberedCreateImage], ([missing, remembered]) => { + if (!missing || createImagePrefilled.value) return + if (!remembered || createImage.value.trim()) return + createImage.value = remembered + createImagePrefilled.value = true +}, { immediate: true }) + watch([activeTab, botId], ([tab]) => { if (!botId.value) return if (tab === 'container') { @@ -540,6 +672,19 @@ watch([activeTab, botId], ([tab]) => { /> +
+ + +

+ {{ $t('bots.container.createImageDescription') }} +

+
+
+ +
+ +
@@ -559,6 +715,39 @@ watch([activeTab, botId], ([tab]) => { v-else-if="containerInfo" class="space-y-5" > +
+

+ {{ $t('bots.container.legacyWarning') }} +

+ +
+ +
+ +
+
@@ -592,7 +781,7 @@ watch([activeTab, botId], ([tab]) => { {{ $t('bots.container.fields.image') }}
- {{ containerInfo.image }} + {{ displayedContainerImage }}
@@ -969,4 +1158,4 @@ watch([activeTab, botId], ([tab]) => {
- \ No newline at end of file + diff --git a/apps/web/src/pages/bots/components/container-create-progress.vue b/apps/web/src/pages/bots/components/container-create-progress.vue new file mode 100644 index 00000000..36f99ffb --- /dev/null +++ b/apps/web/src/pages/bots/components/container-create-progress.vue @@ -0,0 +1,48 @@ + + + diff --git a/apps/web/src/pages/bots/components/create-bot.vue b/apps/web/src/pages/bots/components/create-bot.vue index 46758d88..813ade60 100644 --- a/apps/web/src/pages/bots/components/create-bot.vue +++ b/apps/web/src/pages/bots/components/create-bot.vue @@ -57,6 +57,9 @@ +
+ {{ $t('bots.createBotWaitHint') }} +
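Note on the stream consumed above: `bot-container.vue` iterates the container-create SSE stream returned by `postBotsByBotIdContainerStream` from `@memoh/sdk/extra`, but the SDK's type definitions are outside this diff. The sketch below reconstructs the event shapes purely from the fields the component reads (`image`, `layers`, `offset`, `total`, `container.data_restored`, `message`); field optionality and anything beyond those reads is an assumption, not the SDK's real API.

```ts
// Hypothetical reconstruction of the stream event types used by
// bot-container.vue — NOT the SDK's actual definitions ('@memoh/sdk/extra'
// is the source of truth). Only fields the component reads are included.
export interface ContainerCreateLayerStatus {
  offset: number // bytes pulled so far for this layer
  total: number // total layer size in bytes
}

export type ContainerCreateStreamEvent =
  | { type: 'pulling'; image?: string }
  | { type: 'pull_progress'; layers: ContainerCreateLayerStatus[] }
  | { type: 'creating' }
  | { type: 'restoring' }
  | { type: 'complete'; container: { data_restored?: boolean } }
  | { type: 'error'; message?: string }

// Aggregate pull progress the same way createProgressPercent does:
// sum of layer offsets over sum of layer totals.
export function aggregatePercent(layers: ContainerCreateLayerStatus[]): number {
  const done = layers.reduce((sum, layer) => sum + layer.offset, 0)
  const total = layers.reduce((sum, layer) => sum + layer.total, 0)
  return total > 0 ? Math.round((done / total) * 100) : 0
}
```

The discriminated union mirrors the `switch` in `applyCreateContainerEvent`, and the percent helper reproduces the math in the component's `createProgressPercent` computed.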
diff --git a/apps/web/src/utils/image-ref.test.ts b/apps/web/src/utils/image-ref.test.ts new file mode 100644 index 00000000..d26e6279 --- /dev/null +++ b/apps/web/src/utils/image-ref.test.ts @@ -0,0 +1,22 @@ +import { describe, expect, it } from 'vitest' +import { shortenImageRef } from './image-ref' + +describe('shortenImageRef', () => { + it('returns empty string for missing values', () => { + expect(shortenImageRef(undefined)).toBe('') + expect(shortenImageRef(null)).toBe('') + expect(shortenImageRef('')).toBe('') + }) + + it('strips docker hub library prefix', () => { + expect(shortenImageRef('docker.io/library/nginx:latest')).toBe('nginx:latest') + }) + + it('strips docker hub registry prefix for namespaced images', () => { + expect(shortenImageRef('docker.io/memohai/memoh:latest')).toBe('memohai/memoh:latest') + }) + + it('preserves non-docker-hub registries', () => { + expect(shortenImageRef('ghcr.io/memohai/memoh:latest')).toBe('ghcr.io/memohai/memoh:latest') + }) +}) diff --git a/apps/web/src/utils/image-ref.ts b/apps/web/src/utils/image-ref.ts new file mode 100644 index 00000000..7933d80d --- /dev/null +++ b/apps/web/src/utils/image-ref.ts @@ -0,0 +1,8 @@ +// Keep this display helper aligned with internal/config.NormalizeImageRef. +export function shortenImageRef(value: string | null | undefined): string { + const ref = value?.trim() ?? '' + if (!ref) return '' + if (ref.startsWith('docker.io/library/')) return ref.slice('docker.io/library/'.length) + if (ref.startsWith('docker.io/')) return ref.slice('docker.io/'.length) + return ref +} diff --git a/apps/web/vite.config.ts b/apps/web/vite.config.ts index 2c61cd3a..b18414d9 100644 --- a/apps/web/vite.config.ts +++ b/apps/web/vite.config.ts @@ -10,10 +10,13 @@ export default defineConfig(({ command }) => { const defaultPort = 8082 const defaultHost = '127.0.0.1' const defaultApiBaseUrl = process.env.VITE_API_URL ?? 'http://localhost:8080' + const configuredProxyTarget = process.env.MEMOH_WEB_PROXY_TARGET?.trim() + const configuredPath = process.env.MEMOH_CONFIG_PATH?.trim() || process.env.CONFIG_PATH?.trim() + const configPath = configuredPath && configuredPath.length > 0 ? configuredPath : '../../config.toml' let port = defaultPort let host = defaultHost - let baseUrl = defaultApiBaseUrl + let baseUrl = configuredProxyTarget || defaultApiBaseUrl if (command !== 'build') { try { @@ -25,13 +28,13 @@ export default defineConfig(({ command }) => { } let config try { - config = loadConfig('../../config.toml') + config = loadConfig(configPath) } catch { config = loadConfig('../../conf/app.docker.toml') } port = config.web?.port ?? defaultPort host = config.web?.host ?? defaultHost - baseUrl = getBaseUrl(config) + baseUrl = configuredProxyTarget || getBaseUrl(config) } catch { // Fall back to env/default values when config.toml is unavailable. 
} diff --git a/cmd/agent/main.go b/cmd/agent/main.go index 58f43a2d..b8a37202 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -90,6 +90,7 @@ import ( ttspkg "github.com/memohai/memoh/internal/tts" ttsedge "github.com/memohai/memoh/internal/tts/adapter/edge" "github.com/memohai/memoh/internal/version" + "github.com/memohai/memoh/internal/workspace" ) func migrationsFS() fs.FS { @@ -156,8 +157,8 @@ func runServe() { provideDBConn, provideDBQueries, - // container & mcp infrastructure - provideMCPManager, + // container & workspace infrastructure + provideWorkspaceManager, // memory pipeline provideMemoryLLM, @@ -336,8 +337,8 @@ func provideDBQueries(conn *pgxpool.Pool) *dbsqlc.Queries { return dbsqlc.New(conn) } -func provideMCPManager(log *slog.Logger, service ctr.Service, cfg config.Config, conn *pgxpool.Pool) *mcp.Manager { - return mcp.NewManager(log, service, cfg.MCP, cfg.Containerd.Namespace, conn) +func provideWorkspaceManager(log *slog.Logger, service ctr.Service, cfg config.Config, conn *pgxpool.Pool) *workspace.Manager { + return workspace.NewManager(log, service, cfg.Workspace, cfg.Containerd.Namespace, conn) } // --------------------------------------------------------------------------- @@ -353,7 +354,7 @@ func provideMemoryLLM(modelsService *models.Service, queries *dbsqlc.Queries, lo } } -func provideMemoryProviderRegistry(log *slog.Logger, chatService *conversation.Service, accountService *accounts.Service, manager *mcp.Manager, queries *dbsqlc.Queries, cfg config.Config) *memprovider.Registry { +func provideMemoryProviderRegistry(log *slog.Logger, chatService *conversation.Service, accountService *accounts.Service, manager *workspace.Manager, queries *dbsqlc.Queries, cfg config.Config) *memprovider.Registry { registry := memprovider.NewRegistry(log) fileRuntime := handlers.NewBuiltinMemoryRuntime(manager) fileStore := storefs.New(log, manager) @@ -466,7 +467,7 @@ func provideChannelRouter( heartbeatService *heartbeat.Service, queries *dbsqlc.Queries, containerdHandler *handlers.ContainerdHandler, - manager *mcp.Manager, + manager *workspace.Manager, rc *boot.RuntimeConfig, ) *inbound.ChannelInboundProcessor { adapter, ok := registry.Get(qq.Type) @@ -526,8 +527,8 @@ func provideChannelLifecycleService(channelStore *channel.Store, channelManager // containerd handler & tool gateway // --------------------------------------------------------------------------- -func provideContainerdHandler(log *slog.Logger, service ctr.Service, manager *mcp.Manager, cfg config.Config, rc *boot.RuntimeConfig, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service, queries *dbsqlc.Queries) *handlers.ContainerdHandler { - return handlers.NewContainerdHandler(log, service, manager, cfg.MCP, cfg.Containerd.Namespace, rc.ContainerBackend, botService, accountService, policyService, queries) +func provideContainerdHandler(log *slog.Logger, manager *workspace.Manager, cfg config.Config, rc *boot.RuntimeConfig, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service) *handlers.ContainerdHandler { + return handlers.NewContainerdHandler(log, manager, cfg.Workspace, rc.ContainerBackend, botService, accountService, policyService) } func provideFederationGateway(log *slog.Logger, containerdHandler *handlers.ContainerdHandler) *handlers.MCPFederationGateway { @@ -547,7 +548,7 @@ func provideOAuthService(log *slog.Logger, queries *dbsqlc.Queries, cfg config.C return mcp.NewOAuthService(log, queries, callbackURL) } -func 
provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, _ *conversation.Service, _ *accounts.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *mcp.Manager, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, mediaService *media.Service, inboxService *inbox.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, oauthService *mcp.OAuthService, subagentService *subagent.Service, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service) *mcp.ToolGatewayService { +func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, _ *conversation.Service, _ *accounts.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, mediaService *media.Service, inboxService *inbox.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, oauthService *mcp.OAuthService, subagentService *subagent.Service, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service) *mcp.ToolGatewayService { fedGateway.SetOAuthService(oauthService) var assetResolver mcpmessage.AssetResolver if mediaService != nil { @@ -581,7 +582,7 @@ func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManag // handler providers (interface adaptation / config extraction) // --------------------------------------------------------------------------- -func provideMemoryHandler(log *slog.Logger, botService *bots.Service, accountService *accounts.Service, _ config.Config, manager *mcp.Manager, memoryRegistry *memprovider.Registry, settingsService *settings.Service, _ *handlers.ContainerdHandler) *handlers.MemoryHandler { +func provideMemoryHandler(log *slog.Logger, botService *bots.Service, accountService *accounts.Service, _ config.Config, manager *workspace.Manager, memoryRegistry *memprovider.Registry, settingsService *settings.Service, _ *handlers.ContainerdHandler) *handlers.MemoryHandler { h := handlers.NewMemoryHandler(log, botService, accountService) h.SetMemoryRegistry(memoryRegistry) h.SetSettingsService(settingsService) @@ -599,7 +600,7 @@ func provideMessageHandler(log *slog.Logger, chatService *conversation.Service, return h } -func provideMediaService(log *slog.Logger, manager *mcp.Manager) *media.Service { +func provideMediaService(log *slog.Logger, manager *workspace.Manager) *media.Service { provider := containerfs.New(manager) return media.NewService(log, provider) } @@ -788,16 +789,16 @@ func startChannelManager(lc fx.Lifecycle, channelManager *channel.Manager) { }) } -func startContainerReconciliation(lc fx.Lifecycle, containerdHandler *handlers.ContainerdHandler, _ *mcp.ToolGatewayService) { +func startContainerReconciliation(lc fx.Lifecycle, manager *workspace.Manager, _ *handlers.ContainerdHandler, _ *mcp.ToolGatewayService) { lc.Append(fx.Hook{ OnStart: func(ctx context.Context) 
error { - go containerdHandler.ReconcileContainers(ctx) + go manager.ReconcileContainers(ctx) return nil }, }) } -func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, containerdHandler *handlers.ContainerdHandler, manager *mcp.Manager, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService, channelManager *channel.Manager, modelsService *models.Service) { +func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, _ *handlers.ContainerdHandler, manager *workspace.Manager, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService, channelManager *channel.Manager, modelsService *models.Service) { fmt.Printf("Starting Memoh Agent %s\n", version.GetInfo()) lc.Append(fx.Hook{ @@ -805,7 +806,7 @@ func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *server.Server, shutd if err := ensureAdminUser(ctx, logger, queries, cfg); err != nil { return err } - botService.SetContainerLifecycle(containerdHandler) + botService.SetContainerLifecycle(manager) botService.SetContainerReachability(func(ctx context.Context, botID string) error { _, err := manager.MCPClient(ctx, botID) return err @@ -881,7 +882,7 @@ func ensureAdminUser(ctx context.Context, log *slog.Logger, queries *dbsqlc.Quer emailValue = pgtype.Text{String: email, Valid: true} } displayName := pgtype.Text{String: username, Valid: true} - dataRoot := pgtype.Text{String: cfg.MCP.DataRoot, Valid: cfg.MCP.DataRoot != ""} + dataRoot := pgtype.Text{String: cfg.Workspace.DataRoot, Valid: cfg.Workspace.DataRoot != ""} _, err = queries.CreateAccount(ctx, dbsqlc.CreateAccountParams{ UserID: user.ID, @@ -1056,9 +1057,9 @@ func (a *commandSkillLoaderAdapter) LoadSkills(ctx context.Context, botID string return skills, nil } -// commandContainerFSAdapter bridges mcp.Manager to command.ContainerFS. +// commandContainerFSAdapter bridges workspace.Manager to command.ContainerFS. type commandContainerFSAdapter struct { - manager *mcp.Manager + manager *workspace.Manager } func (a *commandContainerFSAdapter) ListDir(ctx context.Context, botID, dirPath string) ([]command.FSEntry, error) { diff --git a/cmd/mcp/main.go b/cmd/bridge/main.go similarity index 54% rename from cmd/mcp/main.go rename to cmd/bridge/main.go index d494211a..b8dde819 100644 --- a/cmd/mcp/main.go +++ b/cmd/bridge/main.go @@ -9,17 +9,18 @@ import ( "os/signal" "path/filepath" "syscall" + "time" "google.golang.org/grpc" "google.golang.org/grpc/reflection" "github.com/memohai/memoh/internal/logger" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const ( - defaultListenAddr = ":9090" - templateDir = "/opt/mcp-template" + defaultSocketPath = "/run/memoh/bridge.sock" + templateDir = "/opt/memoh/templates" ) // initDataDir ensures /data exists and seeds template files on first boot. @@ -60,14 +61,33 @@ func main() { initDataDir() - addr := os.Getenv("MCP_LISTEN_ADDR") - if addr == "" { - addr = defaultListenAddr - } + // Append toolkit to PATH so child processes (via /bin/sh -c) can find npx/uvx. + // Container-native tools take priority since toolkit is appended at the end. 
+ _ = os.Setenv("PATH", os.Getenv("PATH")+":/opt/memoh/toolkit/bin") - lis, err := (&net.ListenConfig{}).Listen(ctx, "tcp", addr) + // PID 1 zombie reaping: when bridge runs as PID 1 inside a container, + // orphaned child processes become zombies unless reaped. + // On Linux 5.3+, Go's os/exec uses pidfd_open which avoids races between + // this reaper and cmd.Wait(). Kernels below 5.3 may see rare ECHILD errors. + go func() { + var status syscall.WaitStatus + for { + if _, err := syscall.Wait4(-1, &status, 0, nil); err != nil { + time.Sleep(time.Second) + } + } + }() + + socketPath := os.Getenv("BRIDGE_SOCKET_PATH") + if socketPath == "" { + socketPath = defaultSocketPath + } + // Clean up residual socket from a previous run. + _ = os.Remove(filepath.Clean(socketPath)) //nolint:gosec // G703: socketPath is from BRIDGE_SOCKET_PATH env or a compiled-in default, not end-user input + + lis, err := (&net.ListenConfig{}).Listen(ctx, "unix", socketPath) if err != nil { - logger.Error("failed to listen", slog.String("addr", addr), slog.Any("error", err)) + logger.Error("failed to listen", slog.String("socket", socketPath), slog.Any("error", err)) return } @@ -81,7 +101,7 @@ func main() { srv.GracefulStop() }() - logger.Info("mcp gRPC server listening", slog.String("addr", addr)) + logger.Info("bridge gRPC server listening", slog.String("socket", socketPath)) if err := srv.Serve(lis); err != nil { logger.Error("gRPC server failed", slog.Any("error", err)) return diff --git a/cmd/mcp/server.go b/cmd/bridge/server.go similarity index 99% rename from cmd/mcp/server.go rename to cmd/bridge/server.go index 0899186a..6fa325f7 100644 --- a/cmd/mcp/server.go +++ b/cmd/bridge/server.go @@ -20,7 +20,7 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const ( diff --git a/cmd/mcp/template/HEARTBEAT.md b/cmd/bridge/template/HEARTBEAT.md similarity index 100% rename from cmd/mcp/template/HEARTBEAT.md rename to cmd/bridge/template/HEARTBEAT.md diff --git a/cmd/mcp/template/IDENTITY.md b/cmd/bridge/template/IDENTITY.md similarity index 100% rename from cmd/mcp/template/IDENTITY.md rename to cmd/bridge/template/IDENTITY.md diff --git a/cmd/mcp/template/MEMORY.md b/cmd/bridge/template/MEMORY.md similarity index 100% rename from cmd/mcp/template/MEMORY.md rename to cmd/bridge/template/MEMORY.md diff --git a/cmd/mcp/template/PROFILES.md b/cmd/bridge/template/PROFILES.md similarity index 100% rename from cmd/mcp/template/PROFILES.md rename to cmd/bridge/template/PROFILES.md diff --git a/cmd/mcp/template/SOUL.md b/cmd/bridge/template/SOUL.md similarity index 100% rename from cmd/mcp/template/SOUL.md rename to cmd/bridge/template/SOUL.md diff --git a/cmd/mcp/template/TOOLS.md b/cmd/bridge/template/TOOLS.md similarity index 100% rename from cmd/mcp/template/TOOLS.md rename to cmd/bridge/template/TOOLS.md diff --git a/cmd/mcp/entrypoint.sh b/cmd/mcp/entrypoint.sh deleted file mode 100644 index 118d32e4..00000000 --- a/cmd/mcp/entrypoint.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh -# Copy binary to writable layer so it survives snapshot restores. 
-[ -e /app/mcp ] || { mkdir -p /app; [ -f /opt/mcp ] && cp -a /opt/mcp /app/mcp 2>/dev/null || true; } -if [ -x /app/mcp ]; then exec /app/mcp "$@"; fi -exec /opt/mcp "$@" diff --git a/cmd/memoh/serve.go b/cmd/memoh/serve.go index f0951355..be81895d 100644 --- a/cmd/memoh/serve.go +++ b/cmd/memoh/serve.go @@ -92,6 +92,7 @@ import ( ttspkg "github.com/memohai/memoh/internal/tts" ttsedge "github.com/memohai/memoh/internal/tts/adapter/edge" "github.com/memohai/memoh/internal/version" + "github.com/memohai/memoh/internal/workspace" ) func runServe() { @@ -103,7 +104,7 @@ func runServe() { provideContainerService, provideDBConn, provideDBQueries, - provideMCPManager, + provideWorkspaceManager, provideAgentRuntimeManager, provideMemoryLLM, memprovider.NewService, @@ -245,8 +246,8 @@ func provideDBConn(lc fx.Lifecycle, cfg config.Config) (*pgxpool.Pool, error) { } func provideDBQueries(conn *pgxpool.Pool) *dbsqlc.Queries { return dbsqlc.New(conn) } -func provideMCPManager(log *slog.Logger, service ctr.Service, cfg config.Config, conn *pgxpool.Pool) *mcp.Manager { - return mcp.NewManager(log, service, cfg.MCP, cfg.Containerd.Namespace, conn) +func provideWorkspaceManager(log *slog.Logger, service ctr.Service, cfg config.Config, conn *pgxpool.Pool) *workspace.Manager { + return workspace.NewManager(log, service, cfg.Workspace, cfg.Containerd.Namespace, conn) } func provideAgentRuntimeManager(log *slog.Logger, cfg config.Config) *agentruntime.Manager { @@ -257,7 +258,7 @@ func provideMemoryLLM(modelsService *models.Service, queries *dbsqlc.Queries, lo return &lazyLLMClient{modelsService: modelsService, queries: queries, timeout: 30 * time.Second, logger: log} } -func provideMemoryProviderRegistry(log *slog.Logger, chatService *conversation.Service, accountService *accounts.Service, manager *mcp.Manager, queries *dbsqlc.Queries, cfg config.Config) *memprovider.Registry { +func provideMemoryProviderRegistry(log *slog.Logger, chatService *conversation.Service, accountService *accounts.Service, manager *workspace.Manager, queries *dbsqlc.Queries, cfg config.Config) *memprovider.Registry { registry := memprovider.NewRegistry(log) builtinRuntime := handlers.NewBuiltinMemoryRuntime(manager) fileStore := storefs.New(log, manager) @@ -342,7 +343,7 @@ func provideChannelRegistry(log *slog.Logger, hub *local.RouteHub, mediaService return registry } -func provideChannelRouter(log *slog.Logger, registry *channel.Registry, hub *local.RouteHub, routeService *route.DBService, msgService *message.DBService, resolver *flow.Resolver, identityService *identities.Service, botService *bots.Service, aclService *acl.Service, policyService *policy.Service, bindService *bind.Service, mediaService *media.Service, inboxService *inbox.Service, ttsService *ttspkg.Service, settingsService *settings.Service, subagentService *subagent.Service, scheduleService *schedule.Service, mcpConnService *mcp.ConnectionService, modelsService *models.Service, providersService *providers.Service, memProvService *memprovider.Service, searchProvService *searchproviders.Service, browserCtxService *browsercontexts.Service, emailService *emailpkg.Service, emailOutboxService *emailpkg.OutboxService, heartbeatService *heartbeat.Service, queries *dbsqlc.Queries, containerdHandler *handlers.ContainerdHandler, manager *mcp.Manager, rc *boot.RuntimeConfig) *inbound.ChannelInboundProcessor { +func provideChannelRouter(log *slog.Logger, registry *channel.Registry, hub *local.RouteHub, routeService *route.DBService, msgService *message.DBService, resolver 
*flow.Resolver, identityService *identities.Service, botService *bots.Service, aclService *acl.Service, policyService *policy.Service, bindService *bind.Service, mediaService *media.Service, inboxService *inbox.Service, ttsService *ttspkg.Service, settingsService *settings.Service, subagentService *subagent.Service, scheduleService *schedule.Service, mcpConnService *mcp.ConnectionService, modelsService *models.Service, providersService *providers.Service, memProvService *memprovider.Service, searchProvService *searchproviders.Service, browserCtxService *browsercontexts.Service, emailService *emailpkg.Service, emailOutboxService *emailpkg.OutboxService, heartbeatService *heartbeat.Service, queries *dbsqlc.Queries, containerdHandler *handlers.ContainerdHandler, manager *workspace.Manager, rc *boot.RuntimeConfig) *inbound.ChannelInboundProcessor { adapter, ok := registry.Get(qq.Type) if !ok { panic("qq adapter not registered") @@ -395,8 +396,8 @@ func provideChannelLifecycleService(channelStore *channel.Store, channelManager return channel.NewLifecycle(channelStore, channelManager) } -func provideContainerdHandler(log *slog.Logger, service ctr.Service, manager *mcp.Manager, cfg config.Config, rc *boot.RuntimeConfig, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service, queries *dbsqlc.Queries) *handlers.ContainerdHandler { - return handlers.NewContainerdHandler(log, service, manager, cfg.MCP, cfg.Containerd.Namespace, rc.ContainerBackend, botService, accountService, policyService, queries) +func provideContainerdHandler(log *slog.Logger, manager *workspace.Manager, cfg config.Config, rc *boot.RuntimeConfig, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service) *handlers.ContainerdHandler { + return handlers.NewContainerdHandler(log, manager, cfg.Workspace, rc.ContainerBackend, botService, accountService, policyService) } func provideFederationGateway(log *slog.Logger, containerdHandler *handlers.ContainerdHandler) *handlers.MCPFederationGateway { @@ -416,7 +417,7 @@ func provideOAuthService(log *slog.Logger, queries *dbsqlc.Queries, cfg config.C return mcp.NewOAuthService(log, queries, callbackURL) } -func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, _ *conversation.Service, _ *accounts.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *mcp.Manager, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, mediaService *media.Service, inboxService *inbox.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, oauthService *mcp.OAuthService, subagentService *subagent.Service, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service) *mcp.ToolGatewayService { +func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, _ *conversation.Service, _ *accounts.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, containerdHandler *handlers.ContainerdHandler, mcpConnService *mcp.ConnectionService, mediaService *media.Service, inboxService 
*inbox.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, oauthService *mcp.OAuthService, subagentService *subagent.Service, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service) *mcp.ToolGatewayService { fedGateway.SetOAuthService(oauthService) var assetResolver mcpmessage.AssetResolver if mediaService != nil { @@ -441,7 +442,7 @@ func provideToolGatewayService(log *slog.Logger, cfg config.Config, channelManag return svc } -func provideMemoryHandler(log *slog.Logger, botService *bots.Service, accountService *accounts.Service, _ config.Config, manager *mcp.Manager, memoryRegistry *memprovider.Registry, settingsService *settings.Service, _ *handlers.ContainerdHandler) *handlers.MemoryHandler { +func provideMemoryHandler(log *slog.Logger, botService *bots.Service, accountService *accounts.Service, _ config.Config, manager *workspace.Manager, memoryRegistry *memprovider.Registry, settingsService *settings.Service, _ *handlers.ContainerdHandler) *handlers.MemoryHandler { h := handlers.NewMemoryHandler(log, botService, accountService) h.SetMemoryRegistry(memoryRegistry) h.SetSettingsService(settingsService) @@ -466,7 +467,7 @@ func (h *memohAuthHandler) Register(e *echo.Echo) { e.POST("/api/auth/refresh", h.inner.Refresh) } -func provideMediaService(log *slog.Logger, manager *mcp.Manager) *media.Service { +func provideMediaService(log *slog.Logger, manager *workspace.Manager) *media.Service { provider := containerfs.New(manager) return media.NewService(log, provider) } @@ -619,8 +620,8 @@ func startChannelManager(lc fx.Lifecycle, channelManager *channel.Manager) { }) } -func startContainerReconciliation(lc fx.Lifecycle, containerdHandler *handlers.ContainerdHandler, _ *mcp.ToolGatewayService) { - lc.Append(fx.Hook{OnStart: func(ctx context.Context) error { go containerdHandler.ReconcileContainers(ctx); return nil }}) +func startContainerReconciliation(lc fx.Lifecycle, manager *workspace.Manager, _ *handlers.ContainerdHandler, _ *mcp.ToolGatewayService) { + lc.Append(fx.Hook{OnStart: func(ctx context.Context) error { go manager.ReconcileContainers(ctx); return nil }}) } func startAgentRuntime(lc fx.Lifecycle, manager *agentruntime.Manager) { @@ -630,14 +631,14 @@ func startAgentRuntime(lc fx.Lifecycle, manager *agentruntime.Manager) { }) } -func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *memohServer, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, containerdHandler *handlers.ContainerdHandler, manager *mcp.Manager, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService, channelManager *channel.Manager, modelsService *models.Service) { +func startServer(lc fx.Lifecycle, logger *slog.Logger, srv *memohServer, shutdowner fx.Shutdowner, cfg config.Config, queries *dbsqlc.Queries, botService *bots.Service, _ *handlers.ContainerdHandler, manager *workspace.Manager, mcpConnService *mcp.ConnectionService, toolGateway *mcp.ToolGatewayService, channelManager *channel.Manager, modelsService *models.Service) { fmt.Printf("Starting Memoh Agent %s\n", version.GetInfo()) lc.Append(fx.Hook{ OnStart: func(ctx context.Context) error { if err := ensureAdminUser(ctx, logger, queries, cfg); err != nil { return err } - botService.SetContainerLifecycle(containerdHandler) + botService.SetContainerLifecycle(manager) botService.SetContainerReachability(func(ctx 
context.Context, botID string) error { _, err := manager.MCPClient(ctx, botID) return err @@ -831,7 +832,7 @@ func ensureAdminUser(ctx context.Context, log *slog.Logger, queries *dbsqlc.Quer emailValue = pgtype.Text{String: email, Valid: true} } displayName := pgtype.Text{String: username, Valid: true} - dataRoot := pgtype.Text{String: cfg.MCP.DataRoot, Valid: cfg.MCP.DataRoot != ""} + dataRoot := pgtype.Text{String: cfg.Workspace.DataRoot, Valid: cfg.Workspace.DataRoot != ""} _, err = queries.CreateAccount(ctx, dbsqlc.CreateAccountParams{ UserID: user.ID, Username: pgtype.Text{String: username, Valid: true}, Email: emailValue, PasswordHash: pgtype.Text{String: string(hashed), Valid: true}, Role: "admin", @@ -971,9 +972,9 @@ func (a *commandSkillLoaderAdapter) LoadSkills(ctx context.Context, botID string return skills, nil } -// commandContainerFSAdapter bridges mcp.Manager to command.ContainerFS. +// commandContainerFSAdapter bridges workspace.Manager to command.ContainerFS. type commandContainerFSAdapter struct { - manager *mcp.Manager + manager *workspace.Manager } func (a *commandContainerFSAdapter) ListDir(ctx context.Context, botID, dirPath string) ([]command.FSEntry, error) { diff --git a/conf/app.apple.toml b/conf/app.apple.toml index e226cd7d..1c561b54 100644 --- a/conf/app.apple.toml +++ b/conf/app.apple.toml @@ -26,8 +26,8 @@ jwt_expires_in = "168h" # socket_path = "/path/to/your/.socktainer/container.sock" # binary_path = "/opt/homebrew/bin/socktainer" -[mcp] -image = "memohai/mcp:latest" +[workspace] +default_image = "debian:bookworm-slim" data_root = "data" [postgres] @@ -54,4 +54,4 @@ server_addr = "127.0.0.1:8080" [browser_gateway] host = "127.0.0.1" port = 8083 -server_addr = "127.0.0.1:8080" \ No newline at end of file +server_addr = "127.0.0.1:8080" diff --git a/conf/app.docker.toml b/conf/app.docker.toml index 113e02df..79571e07 100644 --- a/conf/app.docker.toml +++ b/conf/app.docker.toml @@ -22,11 +22,12 @@ jwt_expires_in = "168h" socket_path = "/run/containerd/containerd.sock" namespace = "default" -[mcp] +[workspace] # registry = "memoh.cn" # Uncomment for China mainland mirror -image = "memohai/mcp:latest" +default_image = "debian:bookworm-slim" snapshotter = "overlayfs" data_root = "/opt/memoh/data" +runtime_dir = "/opt/memoh/runtime" ## Postgres configuration [postgres] diff --git a/conf/app.example.toml b/conf/app.example.toml index 2ef45d7a..bedabc14 100644 --- a/conf/app.example.toml +++ b/conf/app.example.toml @@ -22,9 +22,9 @@ jwt_expires_in = "168h" socket_path = "/run/containerd/containerd.sock" namespace = "default" -[mcp] +[workspace] # registry = "memoh.cn" # Uncomment for China mainland mirror -image = "memohai/mcp:latest" +default_image = "debian:bookworm-slim" snapshotter = "overlayfs" data_root = "data" cni_bin_dir = "/opt/cni/bin" diff --git a/conf/app.windows.toml b/conf/app.windows.toml index 88b1eea3..dba41a4d 100644 --- a/conf/app.windows.toml +++ b/conf/app.windows.toml @@ -23,9 +23,9 @@ jwt_expires_in = "168h" socket_path = "npipe:////./pipe/containerd-containerd" namespace = "default" -[mcp] +[workspace] # registry = "memoh.cn" # Uncomment for China mainland mirror -image = "memohai/mcp:latest" +default_image = "debian:bookworm-slim" snapshotter = "overlayfs" data_root = "data" diff --git a/devenv/Dockerfile.server b/devenv/Dockerfile.server index 63eac40b..ad4f094e 100644 --- a/devenv/Dockerfile.server +++ b/devenv/Dockerfile.server @@ -1,28 +1,9 @@ # syntax=docker/dockerfile:1 -# ---- Stage 1: Assemble MCP image rootfs (runtime deps only, 
no Go binary) ---- -FROM alpine:latest AS mcp-rootfs +# Dev server image: Go + containerd + CNI. +# Toolkit (Node.js, uv) is NOT baked in — it is volume-mounted from the host. +# Run ./docker/toolkit/install.sh once before first use. -RUN apk add --no-cache grep curl bash -RUN apk add --no-cache nodejs npm -RUN apk add --no-cache python3 && \ - curl -LsSf https://astral.sh/uv/install.sh | sh && \ - ln -sf /root/.local/bin/uv /usr/local/bin/uv && \ - ln -sf /root/.local/bin/uvx /usr/local/bin/uvx - -COPY cmd/mcp/template /opt/mcp-template - -RUN printf '#!/bin/sh\n\ -[ -e /app/mcp ] || { mkdir -p /app; [ -f /opt/mcp ] && cp -a /opt/mcp /app/mcp 2>/dev/null || true; }\n\ -if [ -x /app/mcp ]; then exec /app/mcp "$@"; fi\n\ -exec /opt/mcp "$@"\n' > /opt/entrypoint.sh && chmod +x /opt/entrypoint.sh - -RUN tar -cf /tmp/rootfs.tar \ - --exclude='./proc' --exclude='./sys' --exclude='./dev' \ - --exclude='./tmp' --exclude='./run' \ - -C / . - -# ---- Stage 2: Dev server image ---- FROM golang:1.25-alpine WORKDIR /workspace @@ -43,7 +24,7 @@ RUN apk add --no-cache \ && mkdir -p /opt/cni/bin \ && (cp -a /usr/lib/cni/. /opt/cni/bin/ 2>/dev/null || true) \ && (cp -a /usr/libexec/cni/. /opt/cni/bin/ 2>/dev/null || true) \ - && mkdir -p /etc/cni/net.d /var/lib/cni /run/containerd /var/lib/containerd /opt/memoh/data + && mkdir -p /etc/cni/net.d /var/lib/cni /run/containerd /var/lib/containerd /opt/memoh/data /opt/memoh/runtime RUN printf '%s\n' \ '{' \ @@ -73,9 +54,6 @@ RUN printf '%s\n' \ ' ]' \ '}' > /etc/cni/net.d/10-memoh.conflist -# Raw MCP rootfs for mcp-build.sh to package with compiled binary -COPY --from=mcp-rootfs /tmp/rootfs.tar /opt/images/memoh-mcp-rootfs.tar - COPY devenv/server-entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh diff --git a/devenv/app.dev.toml b/devenv/app.dev.toml index 3b03ce59..6e57f2d4 100644 --- a/devenv/app.dev.toml +++ b/devenv/app.dev.toml @@ -21,11 +21,12 @@ jwt_expires_in = "168h" socket_path = "/run/containerd/containerd.sock" namespace = "default" -[mcp] +[workspace] # registry = "memoh.cn" # Uncomment for China mainland mirror -image = "memohai/mcp:latest" +default_image = "debian:bookworm-slim" snapshotter = "overlayfs" data_root = "/opt/memoh/data" +runtime_dir = "/opt/memoh/runtime" cni_bin_dir = "/opt/cni/bin" cni_conf_dir = "/etc/cni/net.d" diff --git a/devenv/bridge-build.sh b/devenv/bridge-build.sh new file mode 100755 index 00000000..c6528d03 --- /dev/null +++ b/devenv/bridge-build.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# Build bridge binary and place in runtime directory. +# Called by air after server build — safe to skip outside dev container. +set -e + +RUNTIME_DIR="/opt/memoh/runtime" +BRIDGE_BINARY="$RUNTIME_DIR/bridge" +STAGING="${BRIDGE_BINARY}.new" + +[ -d "$RUNTIME_DIR" ] || exit 0 +command -v ctr >/dev/null 2>&1 || exit 0 + +OLD_HASH=$(sha256sum "$BRIDGE_BINARY" 2>/dev/null | cut -d' ' -f1) +go build -o "$STAGING" ./cmd/bridge || exit 0 +NEW_HASH=$(sha256sum "$STAGING" | cut -d' ' -f1) + +if [ "$OLD_HASH" = "$NEW_HASH" ]; then + rm -f "$STAGING" + exit 0 +fi + +# Atomic replace avoids "text busy" when the old binary is running. +mv -f "$STAGING" "$BRIDGE_BINARY" +chmod +x "$BRIDGE_BINARY" + +echo "[bridge-dev] Done. Containers will restart with new binary on next access." 
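The hash check plus `mv -f` in `bridge-build.sh` above is the standard way to hot-swap a binary that may currently be executing: writing into the file in place fails with `ETXTBSY` ("text file busy"), while `rename(2)` atomically replaces the directory entry and any running process keeps its already-open inode. A sketch of the same compare-and-swap pattern in TypeScript (Node) for illustration — the paths are hypothetical, mirroring the shell script, not an API in this repo:

```ts
import { createHash } from 'node:crypto'
import { chmodSync, existsSync, readFileSync, renameSync } from 'node:fs'

// SHA-256 of a file, or null if it does not exist yet (first build).
function sha256(path: string): string | null {
  if (!existsSync(path)) return null
  return createHash('sha256').update(readFileSync(path)).digest('hex')
}

// Move a freshly built staging binary into place only when its contents
// changed. renameSync is atomic on the same filesystem, so a running
// process never observes a half-written executable.
function swapIfChanged(staging: string, target: string): boolean {
  if (sha256(staging) === sha256(target)) return false // unchanged build: keep the old binary
  renameSync(staging, target) // atomic replace, no ETXTBSY
  chmodSync(target, 0o755)
  return true
}

// Example usage (paths mirror the script's staging convention):
// swapIfChanged('/opt/memoh/runtime/bridge.new', '/opt/memoh/runtime/bridge')
```

The same constraint explains why the script builds into `${BRIDGE_BINARY}.new` first: staging and target must live on the same filesystem for the rename to stay atomic.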
diff --git a/devenv/docker-compose.yml b/devenv/docker-compose.yml index eb4fb3f4..e8571b98 100644 --- a/devenv/docker-compose.yml +++ b/devenv/docker-compose.yml @@ -11,7 +11,7 @@ services: - postgres_data:/var/lib/postgresql/data - /etc/localtime:/etc/localtime:ro ports: - - "5432:5432" + - "${MEMOH_DEV_POSTGRES_PORT:-15432}:5432" healthcheck: test: ["CMD-SHELL", "pg_isready -U memoh"] interval: 5s @@ -25,8 +25,8 @@ services: volumes: - qdrant_data:/qdrant/storage ports: - - "6333:6333" - - "6334:6334" + - "${MEMOH_DEV_QDRANT_HTTP_PORT:-16333}:6333" + - "${MEMOH_DEV_QDRANT_GRPC_PORT:-16334}:6334" healthcheck: test: ["CMD-SHELL", "timeout 5s bash -c ':> /dev/tcp/127.0.0.1/6333' || exit 1"] interval: 5s @@ -57,6 +57,7 @@ services: command: ["go", "run", "./cmd/agent/main.go", "migrate", "up"] environment: CONFIG_PATH: /workspace/devenv/app.dev.toml + GOFLAGS: -buildvcs=false volumes: - ..:/workspace - go_mod_cache:/go/pkg/mod @@ -79,6 +80,7 @@ services: command: ["air", "-c", ".air.toml"] environment: CONFIG_PATH: /workspace/devenv/app.dev.toml + GOFLAGS: -buildvcs=false volumes: - ..:/workspace - go_mod_cache:/go/pkg/mod @@ -86,9 +88,13 @@ services: - containerd_data:/var/lib/containerd - server_cni_state:/var/lib/cni - memoh_data:/opt/memoh/data + # Toolkit: run ./docker/toolkit/install.sh once before first use + - ../.toolkit:/opt/memoh/runtime/toolkit + - ../docker/toolkit/bin:/opt/memoh/runtime/toolkit/bin + - ../cmd/bridge/template:/opt/memoh/runtime/templates - /etc/localtime:/etc/localtime:ro ports: - - "8080:8080" + - "${MEMOH_DEV_SERVER_PORT:-18080}:8080" healthcheck: test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://127.0.0.1:8080/health || exit 1"] interval: 5s @@ -106,11 +112,13 @@ services: container_name: memoh-dev-agent working_dir: /workspace/apps/agent command: ["bun", "run", "--watch", "src/index.ts"] + environment: + MEMOH_CONFIG_PATH: /workspace/devenv/app.dev.toml volumes: - ..:/workspace - node_modules:/workspace/node_modules ports: - - "8081:8081" + - "${MEMOH_DEV_AGENT_PORT:-18081}:8081" depends_on: deps: condition: service_completed_successfully @@ -124,12 +132,17 @@ services: dockerfile: devenv/Dockerfile.web container_name: memoh-dev-web working_dir: /workspace/apps/web - command: ["pnpm", "dev"] + command: ["pnpm", "exec", "vite", "--host", "0.0.0.0", "--port", "8082"] + environment: + MEMOH_CONFIG_PATH: /workspace/devenv/app.dev.toml + MEMOH_WEB_PROXY_TARGET: http://host.docker.internal:18080 + extra_hosts: + - "host.docker.internal:host-gateway" volumes: - ..:/workspace - node_modules:/workspace/node_modules ports: - - "8082:8082" + - "${MEMOH_DEV_WEB_PORT:-18082}:8082" depends_on: deps: condition: service_completed_successfully @@ -147,12 +160,13 @@ services: # working_dir: /workspace/apps/browser # command: ["bun", "run", "--watch", "src/index.ts"] # environment: + # - MEMOH_CONFIG_PATH=/workspace/devenv/app.dev.toml # - BROWSER_CORES=${BROWSER_CORES:-chromium,firefox} # volumes: # - ..:/workspace # - node_modules:/workspace/node_modules # ports: - # - "8083:8083" + # - "${MEMOH_DEV_BROWSER_PORT:-18083}:8083" # depends_on: # deps: # condition: service_completed_successfully @@ -166,7 +180,7 @@ services: dockerfile: docker/Dockerfile.sparse container_name: memoh-dev-sparse ports: - - "8085:8085" + - "${MEMOH_DEV_SPARSE_PORT:-18085}:8085" healthcheck: test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://127.0.0.1:8085/health')\" || exit 1"] interval: 15s diff --git a/devenv/mcp-build.sh b/devenv/mcp-build.sh 
diff --git a/devenv/mcp-build.sh b/devenv/mcp-build.sh
deleted file mode 100755
index f4acab56..00000000
--- a/devenv/mcp-build.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/sh
-# Build MCP binary, package as containerd image, and import.
-# Called by air after server build — safe to skip outside dev container.
-set -e
-
-MCP_IMAGE="${MCP_IMAGE:-docker.io/memohai/mcp:latest}"
-MCP_BINARY="/opt/memoh/data/.dev/mcp"
-BASE_ROOTFS="/opt/images/memoh-mcp-rootfs.tar"
-
-[ -f "$BASE_ROOTFS" ] || exit 0
-command -v ctr >/dev/null 2>&1 || exit 0
-
-mkdir -p "$(dirname "$MCP_BINARY")"
-
-OLD_HASH=$(sha256sum "$MCP_BINARY" 2>/dev/null | cut -d' ' -f1)
-go build -o "$MCP_BINARY" ./cmd/mcp || exit 0
-NEW_HASH=$(sha256sum "$MCP_BINARY" | cut -d' ' -f1)
-
-[ "$OLD_HASH" = "$NEW_HASH" ] && exit 0
-
-echo "[mcp-dev] Binary changed, rebuilding MCP image..."
-
-WORK=$(mktemp -d)
-trap 'rm -rf "$WORK"' EXIT
-
-# Layer 1: base rootfs (symlink to avoid copying the large file)
-LAYER1_SHA=$(sha256sum "$BASE_ROOTFS" | cut -d' ' -f1)
-mkdir -p "$WORK/$LAYER1_SHA"
-ln -s "$BASE_ROOTFS" "$WORK/$LAYER1_SHA/layer.tar"
-
-# Layer 2: compiled binary + template + entrypoint overlay
-mkdir -p "$WORK/overlay/opt"
-cp "$MCP_BINARY" "$WORK/overlay/opt/mcp"
-chmod +x "$WORK/overlay/opt/mcp"
-cp -a /workspace/cmd/mcp/template "$WORK/overlay/opt/mcp-template"
-cp /workspace/cmd/mcp/entrypoint.sh "$WORK/overlay/opt/entrypoint.sh"
-chmod +x "$WORK/overlay/opt/entrypoint.sh"
-tar -cf "$WORK/layer2.tar" -C "$WORK/overlay" opt
-LAYER2_SHA=$(sha256sum "$WORK/layer2.tar" | cut -d' ' -f1)
-mkdir -p "$WORK/$LAYER2_SHA"
-mv "$WORK/layer2.tar" "$WORK/$LAYER2_SHA/layer.tar"
-
-# OCI image config
-ARCH=$(uname -m)
-case "$ARCH" in aarch64|arm64) ARCH="arm64" ;; x86_64|amd64) ARCH="amd64" ;; esac
-
-printf '{"architecture":"%s","os":"linux","created":"1970-01-01T00:00:00Z","config":{"Entrypoint":["/opt/entrypoint.sh"],"WorkingDir":"/app","Env":["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"]},"rootfs":{"type":"layers","diff_ids":["sha256:%s","sha256:%s"]},"history":[{"created":"1970-01-01T00:00:00Z","comment":"memoh-mcp rootfs"},{"created":"1970-01-01T00:00:00Z","comment":"memoh-mcp binary"}]}' \
-    "$ARCH" "$LAYER1_SHA" "$LAYER2_SHA" > "$WORK/config.json"
-
-CONFIG_SHA=$(sha256sum "$WORK/config.json" | cut -d' ' -f1)
-mv "$WORK/config.json" "$WORK/$CONFIG_SHA.json"
-
-printf '[{"Config":"%s.json","RepoTags":["%s"],"Layers":["%s/layer.tar","%s/layer.tar"]}]' \
-    "$CONFIG_SHA" "$MCP_IMAGE" "$LAYER1_SHA" "$LAYER2_SHA" > "$WORK/manifest.json"
-
-# -h follows symlinks (layer 1 is symlinked to avoid copying)
-tar -chf "$WORK/memoh-mcp.tar" -C "$WORK" manifest.json "$CONFIG_SHA.json" "$LAYER1_SHA/" "$LAYER2_SHA/"
-
-# Replace image in containerd
-ctr -n default images rm "$MCP_IMAGE" 2>/dev/null || true
-ctr -n default images import --all-platforms "$WORK/memoh-mcp.tar" 2>&1 || true
-
-# Clean old MCP containers so they recreate with new image
-for c in $(ctr -n default containers ls -q 2>/dev/null | grep "^mcp-"); do
-    ctr -n default tasks kill "$c" 2>/dev/null || true
-    ctr -n default tasks delete "$c" 2>/dev/null || true
-    ctr -n default containers delete "$c" 2>/dev/null || true
-done
-
-echo "[mcp-dev] Done. Containers will auto-recreate with new image."
diff --git a/devenv/server-entrypoint.sh b/devenv/server-entrypoint.sh
old mode 100644
new mode 100755
index 69e03bfc..7f3a0403
--- a/devenv/server-entrypoint.sh
+++ b/devenv/server-entrypoint.sh
@@ -1,6 +1,14 @@
 #!/bin/sh
 set -e
 
+# Toolkit is volume-mounted from the host (.toolkit/).
+# If missing, the user forgot to run the install script.
+if [ ! -d /opt/memoh/runtime/toolkit/node-glibc ]; then
+    echo "ERROR: Toolkit not found at /opt/memoh/runtime/toolkit/." >&2
+    echo "       Run ./docker/toolkit/install.sh before starting the dev environment." >&2
+    exit 1
+fi
+
 # Clean up stale CNI state from previous runs. After a container restart the
 # cni0 bridge may linger with a zeroed MAC (00:00:00:00:00:00), causing the
 # bridge plugin to fail with "could not set bridge's mac: invalid argument".
@@ -41,10 +49,10 @@ if ! ctr version >/dev/null 2>&1; then
 fi
 echo "containerd is running (pid $CONTAINERD_PID)"
 
-# Build MCP binary and import as containerd image
-echo "Building MCP image..."
-(cd /workspace && sh devenv/mcp-build.sh)
-echo "MCP image ready."
+# Build bridge binary into runtime directory (first boot)
+echo "Building bridge binary..."
+(cd /workspace && go build -o /opt/memoh/runtime/bridge ./cmd/bridge)
+echo "Bridge binary ready."
 
 echo "Starting server..."
diff --git a/docker/Dockerfile.containerd b/docker/Dockerfile.containerd
deleted file mode 100644
index ae847b5d..00000000
--- a/docker/Dockerfile.containerd
+++ /dev/null
@@ -1,86 +0,0 @@
-# syntax=docker/dockerfile:1
-
-# ---- Stage 1: Build MCP binary ----
-FROM golang:1.25-alpine AS mcp-builder
-
-WORKDIR /src
-RUN apk add --no-cache ca-certificates git
-
-COPY go.mod go.sum ./
-RUN --mount=type=cache,target=/go/pkg/mod \
-    go mod download
-
-COPY . .
-
-ARG TARGETARCH=amd64
-ARG COMMIT_HASH=unknown
-RUN --mount=type=cache,target=/go/pkg/mod \
-    --mount=type=cache,target=/root/.cache/go-build \
-    CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} \
-    go build -trimpath \
-    -ldflags "-s -w -X github.com/memohai/memoh/internal/version.CommitHash=${COMMIT_HASH}" \
-    -o /out/mcp ./cmd/mcp
-
-# ---- Stage 2: Assemble MCP image rootfs ----
-FROM alpine:latest AS mcp-rootfs
-
-# Base utilities
-RUN apk add --no-cache grep curl bash
-
-# Node.js + npm (provides npx for JS/TS MCP servers)
-RUN apk add --no-cache nodejs npm
-
-# Python 3 + uv (provides uvx for Python MCP servers)
-RUN apk add --no-cache python3 && \
-    curl -LsSf https://astral.sh/uv/install.sh | sh && \
-    ln -sf /root/.local/bin/uv /usr/local/bin/uv && \
-    ln -sf /root/.local/bin/uvx /usr/local/bin/uvx
-
-COPY --from=mcp-builder /out/mcp /opt/mcp
-COPY cmd/mcp/template /opt/mcp-template
-
-RUN printf '#!/bin/sh\n\
-[ -e /app/mcp ] || { mkdir -p /app; [ -f /opt/mcp ] && cp -a /opt/mcp /app/mcp 2>/dev/null || true; }\n\
-if [ -x /app/mcp ]; then exec /app/mcp "$@"; fi\n\
-exec /opt/mcp "$@"\n' > /opt/entrypoint.sh && chmod +x /opt/entrypoint.sh
-
-# Create rootfs tar excluding pseudo-filesystems
-RUN tar -cf /tmp/rootfs.tar \
-    --exclude='./proc' --exclude='./sys' --exclude='./dev' \
-    --exclude='./tmp' --exclude='./run' \
-    -C / .
-
-# ---- Stage 3: Package rootfs as Docker image tar ----
-FROM alpine:latest AS oci-exporter
-
-COPY --from=mcp-rootfs /tmp/rootfs.tar /tmp/layer.tar
-ARG MCP_IMAGE_TAG=docker.io/library/memoh-mcp:latest
-
-RUN set -e \
-    && LAYER_SHA=$(sha256sum /tmp/layer.tar | awk '{print $1}') \
-    && LAYER_SIZE=$(wc -c < /tmp/layer.tar) \
-    && mkdir -p "/tmp/image/${LAYER_SHA}" /out \
-    && mv /tmp/layer.tar "/tmp/image/${LAYER_SHA}/layer.tar" \
-    && printf '{"architecture":"amd64","os":"linux","created":"1970-01-01T00:00:00Z","config":{"Entrypoint":["/opt/entrypoint.sh"],"WorkingDir":"/app","Env":["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"]},"rootfs":{"type":"layers","diff_ids":["sha256:%s"]},"history":[{"created":"1970-01-01T00:00:00Z","comment":"memoh-mcp image"}]}' \
-    "${LAYER_SHA}" > /tmp/config.json \
-    && CONFIG_SHA=$(sha256sum /tmp/config.json | awk '{print $1}') \
-    && mv /tmp/config.json "/tmp/image/${CONFIG_SHA}.json" \
-    && printf '[{"Config":"%s.json","RepoTags":["%s"],"Layers":["%s/layer.tar"]}]' \
-    "${CONFIG_SHA}" "${MCP_IMAGE_TAG}" "${LAYER_SHA}" > /tmp/image/manifest.json \
-    && cd /tmp/image && tar -cf /out/memoh-mcp.tar manifest.json "${CONFIG_SHA}.json" "${LAYER_SHA}/"
-
-# ---- Stage 4: Containerd runtime ----
-FROM alpine:latest
-
-RUN apk add --no-cache containerd containerd-ctr
-
-COPY --from=oci-exporter /out/memoh-mcp.tar /opt/images/memoh-mcp.tar
-COPY docker/containerd-entrypoint.sh /entrypoint.sh
-RUN chmod +x /entrypoint.sh
-
-VOLUME ["/run/containerd", "/var/lib/containerd", "/opt/memoh/data"]
-
-HEALTHCHECK --interval=5s --timeout=3s --start-period=10s --retries=10 \
-    CMD test -S /run/containerd/containerd.sock
-
-ENTRYPOINT ["/entrypoint.sh"]
diff --git a/docker/Dockerfile.mcp b/docker/Dockerfile.mcp
deleted file mode 100644
index 770344d4..00000000
--- a/docker/Dockerfile.mcp
+++ /dev/null
@@ -1,43 +0,0 @@
-# syntax=docker/dockerfile:1
-FROM scratch AS gomodcache
-
-FROM --platform=$BUILDPLATFORM golang:1.25-alpine AS build
-
-WORKDIR /src
-COPY go.mod go.sum ./
-RUN --mount=type=cache,target=/go/pkg/mod \
-    --mount=type=bind,from=gomodcache,target=/tmp/gomodcache \
-    set -eux; \
-    if [ -d /tmp/gomodcache/cache/download ]; then \
-        cp -a /tmp/gomodcache/. /go/pkg/mod/; \
-    fi; \
-    go mod download
-
-COPY . .
-
-ARG TARGETARCH
-ARG COMMIT_HASH=unknown
-RUN --mount=type=cache,target=/go/pkg/mod \
-    --mount=type=cache,target=/root/.cache/go-build \
-    CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH:-amd64} \
-    go build -trimpath -ldflags "-s -w -X github.com/memohai/memoh/internal/version.CommitHash=${COMMIT_HASH}" -o /out/mcp ./cmd/mcp
-
-FROM alpine:latest
-
-# Base utilities
-RUN apk add --no-cache grep curl bash dumb-init
-
-# Node.js + npm (provides npx for JS/TS MCP servers)
-RUN apk add --no-cache nodejs npm
-
-# Python 3 + uv (provides uvx for Python MCP servers)
-RUN apk add --no-cache python3 && \
-    curl -LsSf https://astral.sh/uv/install.sh | sh && \
-    ln -sf /root/.local/bin/uv /usr/local/bin/uv && \
-    ln -sf /root/.local/bin/uvx /usr/local/bin/uvx
-
-WORKDIR /app
-COPY --from=build /out/mcp /opt/mcp
-COPY cmd/mcp/template /opt/mcp-template
-COPY cmd/mcp/entrypoint.sh /opt/entrypoint.sh
-RUN chmod +x /opt/entrypoint.sh
-ENTRYPOINT ["/usr/bin/dumb-init", "--", "/opt/entrypoint.sh"]
diff --git a/docker/Dockerfile.server b/docker/Dockerfile.server
index 3e820b8e..8515b5dc 100644
--- a/docker/Dockerfile.server
+++ b/docker/Dockerfile.server
@@ -35,8 +35,8 @@ RUN --mount=type=cache,target=/go/pkg/mod \
     -X github.com/memohai/memoh/internal/version.BuildTime=${BUILD_TIME}" \
     -o memoh-server ./cmd/agent/main.go
 
-# ---- Stage 3: Build MCP binary ----
-FROM build-base AS mcp-builder
+# ---- Stage 3: Build bridge binary ----
+FROM build-base AS bridge-builder
 ARG TARGETARCH
 ARG COMMIT_HASH=unknown
 RUN --mount=type=cache,target=/go/pkg/mod \
@@ -44,50 +44,21 @@ RUN --mount=type=cache,target=/go/pkg/mod \
     CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH:-amd64} \
     go build -trimpath \
     -ldflags "-s -w -X github.com/memohai/memoh/internal/version.CommitHash=${COMMIT_HASH}" \
-    -o /out/mcp ./cmd/mcp
+    -o /out/bridge ./cmd/bridge
 
-# ---- Stage 3: Assemble MCP image rootfs ----
-FROM alpine:latest AS mcp-rootfs
-
-RUN apk add --no-cache grep curl bash
-RUN apk add --no-cache nodejs npm
-RUN apk add --no-cache python3 && \
-    curl -LsSf https://astral.sh/uv/install.sh | sh && \
-    ln -sf /root/.local/bin/uv /usr/local/bin/uv && \
-    ln -sf /root/.local/bin/uvx /usr/local/bin/uvx
-
-COPY --from=mcp-builder /out/mcp /opt/mcp
-COPY cmd/mcp/template /opt/mcp-template
-
-RUN printf '#!/bin/sh\n\
-[ -e /app/mcp ] || { mkdir -p /app; [ -f /opt/mcp ] && cp -a /opt/mcp /app/mcp 2>/dev/null || true; }\n\
-if [ -x /app/mcp ]; then exec /app/mcp "$@"; fi\n\
-exec /opt/mcp "$@"\n' > /opt/entrypoint.sh && chmod +x /opt/entrypoint.sh
-
-RUN tar -cf /tmp/rootfs.tar \
-    --exclude='./proc' --exclude='./sys' --exclude='./dev' \
-    --exclude='./tmp' --exclude='./run' \
-    -C / .
-
-# ---- Stage 4: Package rootfs as OCI image tar ----
-FROM alpine:latest AS oci-exporter
-
-COPY --from=mcp-rootfs /tmp/rootfs.tar /tmp/layer.tar
-ARG MCP_IMAGE_TAG=docker.io/library/memoh-mcp:latest
+# ---- Stage 4: Assemble workspace runtime + toolkit ----
+FROM alpine:latest AS toolkit-assembly
 ARG TARGETARCH
-RUN set -e \
-    && LAYER_SHA=$(sha256sum /tmp/layer.tar | awk '{print $1}') \
-    && LAYER_SIZE=$(wc -c < /tmp/layer.tar) \
-    && mkdir -p "/tmp/image/${LAYER_SHA}" /out \
-    && mv /tmp/layer.tar "/tmp/image/${LAYER_SHA}/layer.tar" \
-    && printf '{"architecture":"%s","os":"linux","created":"1970-01-01T00:00:00Z","config":{"Entrypoint":["/opt/entrypoint.sh"],"WorkingDir":"/app","Env":["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"]},"rootfs":{"type":"layers","diff_ids":["sha256:%s"]},"history":[{"created":"1970-01-01T00:00:00Z","comment":"memoh-mcp image"}]}' \
-    "${TARGETARCH:-amd64}" "${LAYER_SHA}" > /tmp/config.json \
-    && CONFIG_SHA=$(sha256sum /tmp/config.json | awk '{print $1}') \
-    && mv /tmp/config.json "/tmp/image/${CONFIG_SHA}.json" \
-    && printf '[{"Config":"%s.json","RepoTags":["%s"],"Layers":["%s/layer.tar"]}]' \
-    "${CONFIG_SHA}" "${MCP_IMAGE_TAG}" "${LAYER_SHA}" > /tmp/image/manifest.json \
-    && cd /tmp/image && tar -cf /out/memoh-mcp.tar manifest.json "${CONFIG_SHA}.json" "${LAYER_SHA}/"
+RUN apk add --no-cache xz
+COPY docker/toolkit/install.sh /tmp/install.sh
+RUN /tmp/install.sh /assembly/toolkit "${TARGETARCH:-amd64}"
+
+# Assemble runtime directory
+COPY --from=bridge-builder /out/bridge /assembly/bridge
+COPY cmd/bridge/template /assembly/templates
+COPY docker/toolkit/bin /assembly/toolkit/bin
+RUN chmod +x /assembly/toolkit/bin/*
 
 # ---- Stage 5: Final runtime (containerd + server + CNI) ----
 FROM alpine:latest
@@ -97,7 +68,7 @@ WORKDIR /app
 # containerd runtime
 RUN apk add --no-cache containerd containerd-ctr
 
-# CNI plugins + iptables (for MCP container networking)
+# CNI plugins + iptables (for workspace container networking)
 RUN apk add --no-cache ca-certificates tzdata wget cni-plugins iptables \
     && mkdir -p /opt/cni/bin \
     && (cp -a /usr/lib/cni/. /opt/cni/bin/ 2>/dev/null || true) \
@@ -131,8 +102,8 @@ RUN apk add --no-cache ca-certificates tzdata wget cni-plugins iptables \
     '  ]' \
     '}' > /etc/cni/net.d/10-memoh.conflist
 
-# MCP image for containerd import
-COPY --from=oci-exporter /out/memoh-mcp.tar /opt/images/memoh-mcp.tar
+# Workspace runtime (bind-mounted into bot containers)
+COPY --from=toolkit-assembly /assembly /opt/memoh/runtime
 
 # Server binary and spec
 COPY --from=server-builder /build/memoh-server /app/memoh-server
diff --git a/docker/containerd-entrypoint.sh b/docker/containerd-entrypoint.sh
deleted file mode 100644
index 31c811e5..00000000
--- a/docker/containerd-entrypoint.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/sh
-
-MCP_IMAGE="${MCP_IMAGE:-docker.io/library/memoh-mcp:latest}"
-
-# Start containerd in background
-mkdir -p /run/containerd
-containerd &
-CONTAINERD_PID=$!
-
-# Wait for containerd to be fully responsive
-echo "Waiting for containerd..."
-for i in $(seq 1 30); do
-    if ctr version >/dev/null 2>&1; then
-        break
-    fi
-    sleep 1
-done
-
-if ! ctr version >/dev/null 2>&1; then
-    echo "ERROR: containerd not responsive after 30s"
-    exit 1
-fi
-echo "containerd is running"
-
-# Import MCP image if not already present
-if ! ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then
-    echo "Importing MCP image into containerd..."
-    for tar in /opt/images/*.tar; do
-        if [ -f "$tar" ]; then
-            ctr -n default images import --all-platforms "$tar" 2>&1 || true
-        fi
-    done
-    if ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then
-        echo "MCP image ready: ${MCP_IMAGE}"
-    else
-        echo "WARNING: MCP image not available after import, will try pull at runtime"
-    fi
-else
-    echo "MCP image already present: ${MCP_IMAGE}"
-fi
-
-echo "containerd is ready"
-wait $CONTAINERD_PID
diff --git a/docker/server-entrypoint.sh b/docker/server-entrypoint.sh
index 6dbf1df1..27e68e35 100644
--- a/docker/server-entrypoint.sh
+++ b/docker/server-entrypoint.sh
@@ -1,8 +1,6 @@
 #!/bin/sh
 set -e
 
-MCP_IMAGE="${MCP_IMAGE:-docker.io/library/memoh-mcp:latest}"
-
 # ---- Clean up stale CNI state from previous runs ----
 # After a container restart the cni0 bridge may linger with a zeroed MAC
 # (00:00:00:00:00:00), causing "could not set bridge's mac: invalid argument".
@@ -46,23 +44,6 @@ if ! ctr version >/dev/null 2>&1; then
 fi
 echo "containerd is running (pid $CONTAINERD_PID)"
 
-# ---- Import MCP image if not already present ----
-if ! ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then
-    echo "Importing MCP image into containerd..."
-    for tar in /opt/images/*.tar; do
-        if [ -f "$tar" ]; then
-            ctr -n default images import --all-platforms "$tar" 2>&1 || true
-        fi
-    done
-    if ctr -n default images check "name==${MCP_IMAGE}" 2>/dev/null | grep -q "${MCP_IMAGE}"; then
-        echo "MCP image ready: ${MCP_IMAGE}"
-    else
-        echo "WARNING: MCP image not available after import, will try pull at runtime"
-    fi
-else
-    echo "MCP image already present: ${MCP_IMAGE}"
-fi
-
 echo "containerd is ready, starting memoh-server..."
 
 # ---- Start server (foreground, trap signals for graceful shutdown) ----
diff --git a/docker/toolkit/bin/node b/docker/toolkit/bin/node
new file mode 100755
index 00000000..0cd59973
--- /dev/null
+++ b/docker/toolkit/bin/node
@@ -0,0 +1,9 @@
+#!/bin/sh
+TOOLKIT=/opt/memoh/toolkit
+if [ -f /lib/ld-musl-*.so.1 ] 2>/dev/null; then
+    NODEDIR="$TOOLKIT/node-musl"
+    export LD_LIBRARY_PATH="$NODEDIR/runtime-lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+else
+    NODEDIR="$TOOLKIT/node-glibc"
+fi
+exec "$NODEDIR/bin/node" "$@"
diff --git a/docker/toolkit/bin/npm b/docker/toolkit/bin/npm
new file mode 100755
index 00000000..6b54ac26
--- /dev/null
+++ b/docker/toolkit/bin/npm
@@ -0,0 +1,10 @@
+#!/bin/sh
+TOOLKIT=/opt/memoh/toolkit
+if [ -f /lib/ld-musl-*.so.1 ] 2>/dev/null; then
+    NODEDIR="$TOOLKIT/node-musl"
+    export LD_LIBRARY_PATH="$NODEDIR/runtime-lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+else
+    NODEDIR="$TOOLKIT/node-glibc"
+fi
+export PATH="$NODEDIR/bin:$PATH"
+exec "$NODEDIR/bin/npm" "$@"
diff --git a/docker/toolkit/bin/npx b/docker/toolkit/bin/npx
new file mode 100755
index 00000000..fa27a8e5
--- /dev/null
+++ b/docker/toolkit/bin/npx
@@ -0,0 +1,10 @@
+#!/bin/sh
+TOOLKIT=/opt/memoh/toolkit
+if [ -f /lib/ld-musl-*.so.1 ] 2>/dev/null; then
+    NODEDIR="$TOOLKIT/node-musl"
+    export LD_LIBRARY_PATH="$NODEDIR/runtime-lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+else
+    NODEDIR="$TOOLKIT/node-glibc"
+fi
+export PATH="$NODEDIR/bin:$PATH"
+exec "$NODEDIR/bin/npx" "$@"
diff --git a/docker/toolkit/bin/uv b/docker/toolkit/bin/uv
new file mode 100755
index 00000000..d5393f1e
--- /dev/null
+++ b/docker/toolkit/bin/uv
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec /opt/memoh/toolkit/uv "$@"
diff --git a/docker/toolkit/bin/uvx b/docker/toolkit/bin/uvx
new file mode 100755
index 00000000..c717cf7c
--- /dev/null
+++ b/docker/toolkit/bin/uvx
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec /opt/memoh/toolkit/uv tool run "$@"
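The wrapper scripts above pick a Node.js build by probing for the musl dynamic loader at container start, so one toolkit directory serves both Alpine-style (musl) and Debian-style (glibc) base images. A rough Go equivalent of that probe, in case the bridge ever needs the same decision natively; the toolkit path mirrors the scripts, everything else is illustrative:

```go
package main

import (
	"fmt"
	"path/filepath"
)

// nodeDir mirrors the shell wrappers' libc probe: the presence of
// /lib/ld-musl-<arch>.so.1 marks a musl-based rootfs such as Alpine;
// anything else is assumed to be glibc.
func nodeDir(toolkit string) string {
	matches, err := filepath.Glob("/lib/ld-musl-*.so.1")
	if err == nil && len(matches) > 0 {
		return filepath.Join(toolkit, "node-musl")
	}
	return filepath.Join(toolkit, "node-glibc")
}

func main() {
	fmt.Println(nodeDir("/opt/memoh/toolkit"))
}
```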
diff --git a/docker/toolkit/install.sh b/docker/toolkit/install.sh
new file mode 100755
index 00000000..9372d44c
--- /dev/null
+++ b/docker/toolkit/install.sh
@@ -0,0 +1,132 @@
+#!/bin/sh
+# Download Node.js (glibc + musl) and uv into a toolkit directory.
+#
+# Usage:
+#   ./docker/toolkit/install.sh [output_dir] [arch]
+#
+# Arguments:
+#   output_dir   Target directory (default: .toolkit)
+#   arch         amd64 or arm64 (default: auto-detect from uname -m)
+#
+# Environment variables for mirrors (useful in mainland China):
+#   NODEJS_MIRROR        Default: https://nodejs.org/dist
+#   NODEJS_MUSL_MIRROR   Default: https://unofficial-builds.nodejs.org/download/release
+#   NPM_MIRROR           Default: https://registry.npmjs.org
+#   ALPINE_MIRROR        Default: https://dl-cdn.alpinelinux.org/alpine
+#   UV_MIRROR            Default: https://github.com/astral-sh/uv/releases/latest/download
+#
+set -eu
+
+ALPINE_VERSION=3.23
+NODE_VERSION=24.14.0
+NPM_VERSION=10.9.2
+
+OUTDIR="${1:-.toolkit}"
+ARCH="${2:-}"
+
+if [ -z "$ARCH" ]; then
+    case "$(uname -m)" in
+        x86_64) ARCH=amd64 ;;
+        aarch64) ARCH=arm64 ;;
+        arm64) ARCH=arm64 ;;
+        *) echo "ERROR: unsupported architecture: $(uname -m)" >&2; exit 1 ;;
+    esac
+fi
+
+NODEJS_MIRROR="${NODEJS_MIRROR:-https://nodejs.org/dist}"
+NODEJS_MUSL_MIRROR="${NODEJS_MUSL_MIRROR:-https://unofficial-builds.nodejs.org/download/release}"
+NPM_MIRROR="${NPM_MIRROR:-https://registry.npmjs.org}"
+ALPINE_MIRROR="${ALPINE_MIRROR:-https://dl-cdn.alpinelinux.org/alpine}"
+UV_MIRROR="${UV_MIRROR:-https://github.com/astral-sh/uv/releases/latest/download}"
+
+case "$ARCH" in
+    amd64) NODE_ARCH=x64; UV_ARCH=x86_64; APK_ARCH=x86_64 ;;
+    arm64) NODE_ARCH=arm64; UV_ARCH=aarch64; APK_ARCH=aarch64 ;;
+    *) echo "ERROR: unsupported arch: $ARCH" >&2; exit 1 ;;
+esac
+
+ALPINE_REPO="${ALPINE_MIRROR}/v${ALPINE_VERSION}/main/${APK_ARCH}"
+
+TMPDIR="$(mktemp -d)"
+cleanup() {
+    rm -rf "$TMPDIR"
+}
+trap cleanup EXIT INT TERM
+
+apk_index_path="$TMPDIR/APKINDEX.tar.gz"
+
+apk_package_filename() {
+    pkg="$1"
+    tar -xzOf "$apk_index_path" APKINDEX | awk -v pkg="$pkg" '
+        $0 == "P:" pkg { hit = 1; next }
+        hit && /^V:/ { print pkg "-" substr($0, 3) ".apk"; exit }
+        /^$/ { hit = 0 }
+    '
+}
+
+install_musl_runtime_libs() {
+    dest_dir="$OUTDIR/node-musl/runtime-lib"
+    rm -rf "$dest_dir"
+    mkdir -p "$dest_dir"
+
+    echo "Downloading musl runtime libs (${APK_ARCH})..."
+    wget -qO "$apk_index_path" "${ALPINE_REPO}/APKINDEX.tar.gz"
+
+    for pkg in libgcc libstdc++; do
+        apk_file="$(apk_package_filename "$pkg")"
+        if [ -z "$apk_file" ]; then
+            echo "ERROR: failed to resolve Alpine package for $pkg (${APK_ARCH})" >&2
+            exit 1
+        fi
+        pkg_path="$TMPDIR/$apk_file"
+        extract_dir="$TMPDIR/extract-$pkg"
+        rm -rf "$extract_dir"
+        mkdir -p "$extract_dir"
+        wget -qO "$pkg_path" "${ALPINE_REPO}/$apk_file"
+        tar -xzf "$pkg_path" -C "$extract_dir"
+        cp -a "$extract_dir/usr/lib/." "$dest_dir/"
+    done
+}
+
+install_pinned_npm() {
+    node_dir="$1"
+    dest_dir="$OUTDIR/$node_dir/lib/node_modules/npm"
+    extract_dir="$TMPDIR/npm-$node_dir"
+
+    rm -rf "$dest_dir" "$extract_dir"
+    mkdir -p "$extract_dir" "$(dirname "$dest_dir")"
+    tar -xzf "$npm_archive" -C "$extract_dir"
+    mv "$extract_dir/package" "$dest_dir"
+}
+
+mkdir -p "$OUTDIR/node-glibc" "$OUTDIR/node-musl"
+
+echo "Downloading Node.js v${NODE_VERSION} (glibc, ${NODE_ARCH})..."
+wget -qO- "${NODEJS_MIRROR}/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-${NODE_ARCH}.tar.xz" \
+    | tar -xJf - --strip-components=1 -C "$OUTDIR/node-glibc"
+
+MUSL_URL="${NODEJS_MUSL_MIRROR}/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-${NODE_ARCH}-musl.tar.xz"
+echo "Downloading Node.js v${NODE_VERSION} (musl, ${NODE_ARCH})..."
+musl_archive="$TMPDIR/node-musl.tar.xz"
+if wget -qO "$musl_archive" "$MUSL_URL" 2>/dev/null; then
+    tar -xJf "$musl_archive" --strip-components=1 -C "$OUTDIR/node-musl"
+else
+    echo "ERROR: failed to download musl Node.js build for ${NODE_ARCH}" >&2
+    exit 1
+fi
+
+install_musl_runtime_libs
+
+echo "Downloading npm v${NPM_VERSION}..."
+npm_archive="$TMPDIR/npm.tgz"
+wget -qO "$npm_archive" "${NPM_MIRROR}/npm/-/npm-${NPM_VERSION}.tgz"
+install_pinned_npm node-glibc
+install_pinned_npm node-musl
+
+echo "Downloading uv (${UV_ARCH})..."
+wget -qO- "${UV_MIRROR}/uv-${UV_ARCH}-unknown-linux-musl.tar.gz" \
+    | tar -xzf - --strip-components=1 -C /tmp
+mv /tmp/uv "$OUTDIR/uv"
+chmod +x "$OUTDIR/uv"
+
+echo "Toolkit installed to $OUTDIR"
+wget -qO- "${NODEJS_MIRROR}/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-${NODE_ARCH}.tar.xz" \ + | tar -xJf - --strip-components=1 -C "$OUTDIR/node-glibc" + +MUSL_URL="${NODEJS_MUSL_MIRROR}/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-${NODE_ARCH}-musl.tar.xz" +echo "Downloading Node.js v${NODE_VERSION} (musl, ${NODE_ARCH})..." +musl_archive="$TMPDIR/node-musl.tar.xz" +if wget -qO "$musl_archive" "$MUSL_URL" 2>/dev/null; then + tar -xJf "$musl_archive" --strip-components=1 -C "$OUTDIR/node-musl" +else + echo "ERROR: failed to download musl Node.js build for ${NODE_ARCH}" >&2 + exit 1 +fi + +install_musl_runtime_libs + +echo "Downloading npm v${NPM_VERSION}..." +npm_archive="$TMPDIR/npm.tgz" +wget -qO "$npm_archive" "${NPM_MIRROR}/npm/-/npm-${NPM_VERSION}.tgz" +install_pinned_npm node-glibc +install_pinned_npm node-musl + +echo "Downloading uv (${UV_ARCH})..." +wget -qO- "${UV_MIRROR}/uv-${UV_ARCH}-unknown-linux-musl.tar.gz" \ + | tar -xzf - --strip-components=1 -C /tmp +mv /tmp/uv "$OUTDIR/uv" +chmod +x "$OUTDIR/uv" + +echo "Toolkit installed to $OUTDIR" diff --git a/docs/docs/installation/docker.md b/docs/docs/installation/docker.md index 72525eef..01cb8665 100644 --- a/docs/docs/installation/docker.md +++ b/docs/docs/installation/docker.md @@ -134,7 +134,7 @@ sudo POSTGRES_PASSWORD=your-db-password docker compose up -d For users in mainland China who cannot access Docker Hub directly, uncomment the `registry` line in `config.toml`: ```toml -[mcp] +[workspace] registry = "memoh.cn" ``` diff --git a/internal/config/config.go b/internal/config/config.go index 68ec6206..a8de1d89 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -1,7 +1,9 @@ package config import ( + "errors" "os" + "path/filepath" "strconv" "strings" @@ -13,7 +15,6 @@ const ( DefaultHTTPAddr = ":8080" DefaultNamespace = "default" DefaultSocketPath = "/run/containerd/containerd.sock" - DefaultMCPImage = "memohai/mcp:latest" DefaultDataRoot = "data" DefaultDataMount = "/data" DefaultCNIBinaryDir = "/opt/cni/bin" @@ -26,7 +27,8 @@ const ( DefaultPGSSLMode = "disable" DefaultQdrantURL = "http://127.0.0.1:6334" DefaultQdrantCollection = "memory" - MCPGRPCPort = 9090 + DefaultRuntimeDir = "/opt/memoh/runtime" + DefaultBaseImage = "debian:bookworm-slim" ) type Config struct { @@ -35,7 +37,7 @@ type Config struct { Admin AdminConfig `toml:"admin"` Auth AuthConfig `toml:"auth"` Containerd ContainerdConfig `toml:"containerd"` - MCP MCPConfig `toml:"mcp"` + Workspace WorkspaceConfig `toml:"workspace"` Postgres PostgresConfig `toml:"postgres"` Qdrant QdrantConfig `toml:"qdrant"` Sparse SparseConfig `toml:"sparse"` @@ -74,23 +76,23 @@ type SocktainerConfig struct { BinaryPath string `toml:"binary_path"` } -type MCPConfig struct { +type WorkspaceConfig struct { Registry string `toml:"registry"` - Image string `toml:"image"` + DefaultImage string `toml:"default_image"` Snapshotter string `toml:"snapshotter"` DataRoot string `toml:"data_root"` CNIBinaryDir string `toml:"cni_bin_dir"` CNIConfigDir string `toml:"cni_conf_dir"` + RuntimeDir string `toml:"runtime_dir"` } -// ImageRef returns the fully qualified image reference, prepending the -// registry mirror when configured and normalizing for containerd compatibility. -// Containerd requires a fully-qualified domain in image references — short -// Docker Hub names like "memohai/mcp:latest" are misinterpreted as hosts. 
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
new file mode 100644
index 00000000..5908631e
--- /dev/null
+++ b/internal/config/config_test.go
@@ -0,0 +1,59 @@
+package config
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+func TestLoadRejectsLegacyMCPSection(t *testing.T) {
+	t.Parallel()
+
+	configPath := filepath.Join(t.TempDir(), "config.toml")
+	if err := os.WriteFile(configPath, []byte("[mcp]\nfoo = \"legacy\"\n"), 0o600); err != nil {
+		t.Fatalf("write config: %v", err)
+	}
+
+	_, err := Load(configPath)
+	if err == nil {
+		t.Fatal("expected load to fail for legacy [mcp] section")
+	}
+	if !strings.Contains(err.Error(), "[mcp]") || !strings.Contains(err.Error(), "[workspace]") {
+		t.Fatalf("expected migration error mentioning [mcp] and [workspace], got %v", err)
+	}
+}
+
+func TestLoadRejectsMixedMCPAndWorkspaceSections(t *testing.T) {
+	t.Parallel()
+
+	configPath := filepath.Join(t.TempDir(), "config.toml")
+	if err := os.WriteFile(configPath, []byte("[mcp]\nfoo = \"legacy\"\n[workspace]\ndefault_image = \"current\"\n"), 0o600); err != nil {
+		t.Fatalf("write config: %v", err)
+	}
+
+	_, err := Load(configPath)
+	if err == nil {
+		t.Fatal("expected load to fail when both [mcp] and [workspace] are present")
+	}
+	if !strings.Contains(err.Error(), "both [mcp] and [workspace]") {
+		t.Fatalf("expected mixed-section error, got %v", err)
+	}
+}
+
+func TestLoadReadsWorkspaceDefaultImage(t *testing.T) {
+	t.Parallel()
+
+	configPath := filepath.Join(t.TempDir(), "config.toml")
+	if err := os.WriteFile(configPath, []byte("[workspace]\ndefault_image = \"alpine:3.22\"\n"), 0o600); err != nil {
+		t.Fatalf("write config: %v", err)
+	}
+
+	cfg, err := Load(configPath)
+	if err != nil {
+		t.Fatalf("load config: %v", err)
+	}
+	if cfg.Workspace.DefaultImage != "alpine:3.22" {
+		t.Fatalf("expected default_image to load, got %q", cfg.Workspace.DefaultImage)
+	}
+}
diff --git a/internal/containerd/resolv.go b/internal/containerd/resolv.go
index de6ddbbe..620742ee 100644
--- a/internal/containerd/resolv.go
+++ b/internal/containerd/resolv.go
@@ -1,25 +1,35 @@
 package containerd
 
 import (
+	"errors"
 	"os"
 	"path/filepath"
 	"strings"
 )
 
 const (
-	systemdResolvConf = "/run/systemd/resolve/resolv.conf"
-	fallbackResolv    = "nameserver 1.1.1.1\nnameserver 8.8.8.8\n"
+	systemdResolvConf  = "/run/systemd/resolve/resolv.conf"
+	fallbackResolv     = "nameserver 1.1.1.1\nnameserver 8.8.8.8\n"
+	fallbackResolvPerm = 0o644
 )
 
 // ResolveConfSource returns a host path to mount as /etc/resolv.conf.
 // If systemd-resolved config is available, use it. Otherwise write a fallback
 // resolv.conf under dataDir and return that path.
 func ResolveConfSource(dataDir string) (string, error) {
+	return resolveConfSource(dataDir, systemdResolvConf)
+}
+
+func resolveConfSource(dataDir, preferredPath string) (string, error) {
 	if strings.TrimSpace(dataDir) == "" {
 		return "", ErrInvalidArgument
 	}
 
-	if _, err := os.Stat(systemdResolvConf); err == nil {
-		return systemdResolvConf, nil
+	if strings.TrimSpace(preferredPath) != "" {
+		if _, err := os.Stat(preferredPath); err == nil {
+			return preferredPath, nil
+		} else if !errors.Is(err, os.ErrNotExist) {
+			return "", err
+		}
 	}
 
 	if err := os.MkdirAll(dataDir, 0o750); err != nil {
@@ -27,11 +37,14 @@ func ResolveConfSource(dataDir string) (string, error) {
 	}
 	fallbackPath := filepath.Join(dataDir, "resolv.conf")
 	if _, err := os.Stat(fallbackPath); err == nil {
+		if err := os.Chmod(fallbackPath, fallbackResolvPerm); err != nil {
+			return "", err
+		}
 		return fallbackPath, nil
-	} else if !os.IsNotExist(err) {
+	} else if !errors.Is(err, os.ErrNotExist) {
 		return "", err
 	}
-	if err := os.WriteFile(fallbackPath, []byte(fallbackResolv), 0o600); err != nil {
+	if err := os.WriteFile(fallbackPath, []byte(fallbackResolv), fallbackResolvPerm); err != nil {
 		return "", err
 	}
 	return fallbackPath, nil
diff --git a/internal/containerd/resolv_test.go b/internal/containerd/resolv_test.go
new file mode 100644
index 00000000..4b3accbd
--- /dev/null
+++ b/internal/containerd/resolv_test.go
@@ -0,0 +1,101 @@
+package containerd
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func TestResolveConfSource_InvalidArgument(t *testing.T) {
+	if _, err := ResolveConfSource(""); !errors.Is(err, ErrInvalidArgument) {
+		t.Fatalf("expected ErrInvalidArgument, got %v", err)
+	}
+}
+
+func TestResolveConfSource_UsesPreferredResolvedWhenAvailable(t *testing.T) {
+	dataDir := t.TempDir()
+	preferredPath := filepath.Join(dataDir, "preferred-resolv.conf")
+	if err := os.WriteFile(preferredPath, []byte("nameserver 9.9.9.9\n"), 0o600); err != nil {
+		t.Fatalf("failed to seed preferred resolv.conf: %v", err)
+	}
+
+	path, err := resolveConfSource(dataDir, preferredPath)
+	if err != nil {
+		t.Fatalf("resolveConfSource returned error: %v", err)
+	}
+	if path != preferredPath {
+		t.Fatalf("expected preferred path, got %q", path)
+	}
+}
+
+func TestResolveConfSource_UsesSystemdResolvedWhenAvailable(t *testing.T) {
+	if _, err := os.Stat(systemdResolvConf); errors.Is(err, os.ErrNotExist) {
+		t.Skip("systemd-resolved config not available on this host")
+	} else if err != nil {
+		t.Fatalf("failed to stat %s: %v", systemdResolvConf, err)
+	}
+
+	path, err := ResolveConfSource(t.TempDir())
+	if err != nil {
+		t.Fatalf("ResolveConfSource returned error: %v", err)
+	}
+	if path != systemdResolvConf {
+		t.Fatalf("expected systemd-resolved path, got %q", path)
+	}
+}
+
+func TestResolveConfSource_FallbackCreatesReadableFile(t *testing.T) {
+	dataDir := t.TempDir()
+	preferredPath := filepath.Join(dataDir, "missing-preferred-resolv.conf")
+
+	path, err := resolveConfSource(dataDir, preferredPath)
+	if err != nil {
+		t.Fatalf("resolveConfSource returned error: %v", err)
+	}
+
+	if path != filepath.Join(dataDir, "resolv.conf") {
+		t.Fatalf("expected fallback path, got %q", path)
+	}
+
+	//nolint:gosec // test reads a file it just created in a temp directory
+	content, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("failed to read fallback resolv.conf: %v", err)
+	}
+	if string(content) != fallbackResolv {
+		t.Fatalf("unexpected fallback resolv.conf contents: %q", string(content))
+	}
+
+	info, err := os.Stat(path)
+	if err != nil {
+		t.Fatalf("failed to stat fallback resolv.conf: %v", err)
+	}
+	if perm := info.Mode().Perm(); perm != fallbackResolvPerm {
+		t.Fatalf("expected permissions %o, got %o", fallbackResolvPerm, perm)
+	}
+}
+
+func TestResolveConfSource_FallbackFixesExistingPermissions(t *testing.T) {
+	dataDir := t.TempDir()
+	fallbackPath := filepath.Join(dataDir, "resolv.conf")
+	if err := os.WriteFile(fallbackPath, []byte(fallbackResolv), 0o600); err != nil {
+		t.Fatalf("failed to seed fallback resolv.conf: %v", err)
+	}
+
+	path, err := resolveConfSource(dataDir, filepath.Join(dataDir, "missing-preferred-resolv.conf"))
+	if err != nil {
+		t.Fatalf("resolveConfSource returned error: %v", err)
+	}
+	if path != fallbackPath {
+		t.Fatalf("expected existing fallback path, got %q", path)
+	}
+
+	info, err := os.Stat(fallbackPath)
+	if err != nil {
+		t.Fatalf("failed to stat fallback resolv.conf: %v", err)
+	}
+	if perm := info.Mode().Perm(); perm != fallbackResolvPerm {
+		t.Fatalf("expected permissions %o, got %o", fallbackResolvPerm, perm)
+	}
+}
diff --git a/internal/containerd/service.go b/internal/containerd/service.go
index f4a929e0..ed868dd9 100644
--- a/internal/containerd/service.go
+++ b/internal/containerd/service.go
@@ -34,6 +34,7 @@ var (
 type PullImageOptions struct {
 	Unpack      bool
 	Snapshotter string
+	OnProgress  func(PullProgress) // optional, nil = no progress reporting
 }
 
 type DeleteImageOptions struct {
@@ -140,6 +141,39 @@ func (s *DefaultService) PullImage(ctx context.Context, ref string, opts *PullIm
 		pullOpts = append(pullOpts, containerd.WithPullSnapshotter(opts.Snapshotter))
 	}
 
+	// When OnProgress is set, poll content store for active download statuses.
+	if opts != nil && opts.OnProgress != nil {
+		stop := make(chan struct{})
+		go func() {
+			ticker := time.NewTicker(500 * time.Millisecond)
+			defer ticker.Stop()
+			cs := s.client.ContentStore()
+			for {
+				select {
+				case <-stop:
+					return
+				case <-ctx.Done():
+					return
+				case <-ticker.C:
+					statuses, err := cs.ListStatuses(ctx)
+					if err != nil {
+						continue
+					}
+					layers := make([]LayerStatus, len(statuses))
+					for i, st := range statuses {
+						layers[i] = LayerStatus{
+							Ref:    st.Ref,
+							Offset: st.Offset,
+							Total:  st.Total,
+						}
+					}
+					opts.OnProgress(PullProgress{Layers: layers})
+				}
+			}
+		}()
+		defer close(stop)
+	}
+
 	img, err := s.client.Pull(ctx, ref, pullOpts...)
 	if err != nil {
 		return ImageInfo{}, err
diff --git a/internal/containerd/types.go b/internal/containerd/types.go
index a98ff169..31242f0e 100644
--- a/internal/containerd/types.go
+++ b/internal/containerd/types.go
@@ -94,6 +94,16 @@ type ContainerSpec struct {
 	TTY bool
 }
 
+type LayerStatus struct {
+	Ref    string `json:"ref"`
+	Offset int64  `json:"offset"`
+	Total  int64  `json:"total"`
+}
+
+type PullProgress struct {
+	Layers []LayerStatus `json:"layers"`
+}
+
 type NetworkSetupRequest struct {
 	ContainerID string
 	PID         uint32
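A caller-side sketch of the progress hook: the callback fires roughly every 500 ms with a snapshot of in-flight layers, so a consumer can fold per-layer offsets into a single percentage. The concrete service type is `*DefaultService` from the diff; the helper itself and the log sink are illustrative:

```go
package example

import (
	"context"
	"log"

	ctr "github.com/memohai/memoh/internal/containerd"
)

// pullWithProgress aggregates layer offsets from each PullProgress snapshot
// into one percentage. Layers with unknown Total report 0 and simply dilute
// the estimate until the content store learns their size.
func pullWithProgress(ctx context.Context, svc *ctr.DefaultService, ref string) error {
	_, err := svc.PullImage(ctx, ref, &ctr.PullImageOptions{
		Unpack:      true,
		Snapshotter: "overlayfs",
		OnProgress: func(p ctr.PullProgress) {
			var done, total int64
			for _, l := range p.Layers {
				done += l.Offset
				total += l.Total
			}
			if total > 0 {
				log.Printf("pull %s: %d%% (%d layers)", ref, done*100/total, len(p.Layers))
			}
		},
	})
	return err
}
```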
messages = sanitizeMessages(messages) skills := dedup(req.Skills) - containerID := r.resolveContainerID(ctx, req.BotID, req.ContainerID) - var usableSkills []gatewaySkill if r.skillLoader != nil { entries, err := r.skillLoader.LoadSkills(ctx, req.BotID) @@ -467,7 +464,6 @@ func (r *Resolver) resolve(ctx context.Context, req conversation.ChatRequest) (r Query: headerifiedQuery, Identity: gatewayIdentity{ BotID: req.BotID, - ContainerID: containerID, ChannelIdentityID: strings.TrimSpace(req.SourceChannelIdentityID), DisplayName: displayName, CurrentPlatform: req.CurrentChannel, @@ -1275,25 +1271,6 @@ func encodeReaderAsDataURL(reader io.Reader, maxBytes int64, attachmentType, fal return encoded.String(), mime, nil } -// --- container resolution --- - -func (r *Resolver) resolveContainerID(ctx context.Context, botID, explicit string) string { - if strings.TrimSpace(explicit) != "" { - return explicit - } - if r.queries != nil { - pgBotID, err := parseResolverUUID(botID) - if err == nil { - row, err := r.queries.GetContainerByBotID(ctx, pgBotID) - if err == nil && strings.TrimSpace(row.ContainerID) != "" { - return row.ContainerID - } - } - } - r.logger.Warn("no container found for bot, using fallback", slog.String("bot_id", botID)) - return "mcp-" + botID -} - // --- message loading --- type messageWithUsage struct { diff --git a/internal/conversation/flow/resolver_test.go b/internal/conversation/flow/resolver_test.go index 6c1ac84a..eb7c4367 100644 --- a/internal/conversation/flow/resolver_test.go +++ b/internal/conversation/flow/resolver_test.go @@ -57,7 +57,6 @@ func TestPostTriggerSchedule_Endpoint(t *testing.T) { Skills: []string{}, Identity: gatewayIdentity{ BotID: "bot-123", - ContainerID: "mcp-bot-123", ChannelIdentityID: "owner-user-1", DisplayName: "Scheduler", }, diff --git a/internal/conversation/types.go b/internal/conversation/types.go index 8ff0bda2..cf1cdf41 100644 --- a/internal/conversation/types.go +++ b/internal/conversation/types.go @@ -220,7 +220,6 @@ type ChatRequest struct { Token string `json:"-"` UserID string `json:"-"` SourceChannelIdentityID string `json:"-"` - ContainerID string `json:"-"` DisplayName string `json:"-"` RouteID string `json:"-"` ChatToken string `json:"-"` diff --git a/internal/handlers/containerd.go b/internal/handlers/containerd.go index fd5ce7cb..a086aca5 100644 --- a/internal/handlers/containerd.go +++ b/internal/handlers/containerd.go @@ -1,7 +1,9 @@ package handlers import ( + "bufio" "context" + "encoding/json" "errors" "fmt" "io" @@ -10,28 +12,24 @@ import ( "sort" "strings" "sync" + "sync/atomic" "time" "github.com/containerd/errdefs" - "github.com/google/uuid" - "github.com/jackc/pgx/v5" "github.com/labstack/echo/v4" "github.com/memohai/memoh/internal/accounts" "github.com/memohai/memoh/internal/bots" "github.com/memohai/memoh/internal/config" ctr "github.com/memohai/memoh/internal/containerd" - "github.com/memohai/memoh/internal/db" - dbsqlc "github.com/memohai/memoh/internal/db/sqlc" "github.com/memohai/memoh/internal/mcp" "github.com/memohai/memoh/internal/policy" + "github.com/memohai/memoh/internal/workspace" ) type ContainerdHandler struct { - service ctr.Service - manager *mcp.Manager - cfg config.MCPConfig - namespace string + manager *workspace.Manager + cfg config.WorkspaceConfig containerBackend string logger *slog.Logger toolGateway *mcp.ToolGatewayService @@ -41,12 +39,12 @@ type ContainerdHandler struct { botService *bots.Service accountService *accounts.Service policyService *policy.Service - queries *dbsqlc.Queries } type 
diff --git a/internal/handlers/containerd.go b/internal/handlers/containerd.go
index fd5ce7cb..a086aca5 100644
--- a/internal/handlers/containerd.go
+++ b/internal/handlers/containerd.go
@@ -1,7 +1,9 @@
 package handlers
 
 import (
+	"bufio"
 	"context"
+	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
@@ -10,28 +12,24 @@ import (
 	"sort"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/containerd/errdefs"
-	"github.com/google/uuid"
-	"github.com/jackc/pgx/v5"
 	"github.com/labstack/echo/v4"
 
 	"github.com/memohai/memoh/internal/accounts"
 	"github.com/memohai/memoh/internal/bots"
 	"github.com/memohai/memoh/internal/config"
 	ctr "github.com/memohai/memoh/internal/containerd"
-	"github.com/memohai/memoh/internal/db"
-	dbsqlc "github.com/memohai/memoh/internal/db/sqlc"
 	"github.com/memohai/memoh/internal/mcp"
 	"github.com/memohai/memoh/internal/policy"
+	"github.com/memohai/memoh/internal/workspace"
 )
 
 type ContainerdHandler struct {
-	service          ctr.Service
-	manager          *mcp.Manager
-	cfg              config.MCPConfig
-	namespace        string
+	manager          *workspace.Manager
+	cfg              config.WorkspaceConfig
 	containerBackend string
 	logger           *slog.Logger
 	toolGateway      *mcp.ToolGatewayService
@@ -41,12 +39,12 @@ type ContainerdHandler struct {
 	botService     *bots.Service
 	accountService *accounts.Service
 	policyService  *policy.Service
-	queries        *dbsqlc.Queries
 }
 
 type CreateContainerRequest struct {
 	Snapshotter string `json:"snapshotter,omitempty"`
 	RestoreData bool   `json:"restore_data,omitempty"`
+	Image       string `json:"image,omitempty"`
 }
 
 type CreateContainerResponse struct {
@@ -58,6 +56,36 @@ type CreateContainerResponse struct {
 	HasPreservedData bool `json:"has_preserved_data"`
 }
 
+// codesync(container-create-stream): keep these SSE payloads in sync with
+// packages/sdk/src/container-stream.ts.
+type createContainerPullingEvent struct {
+	Type  string `json:"type"`
+	Image string `json:"image"`
+}
+
+type createContainerPullProgressEvent struct {
+	Type   string            `json:"type"`
+	Layers []ctr.LayerStatus `json:"layers"`
+}
+
+type createContainerCreatingEvent struct {
+	Type string `json:"type"`
+}
+
+type createContainerCompleteEvent struct {
+	Type      string                  `json:"type"`
+	Container CreateContainerResponse `json:"container"`
+}
+
+type createContainerRestoringEvent struct {
+	Type string `json:"type"`
+}
+
+type createContainerErrorEvent struct {
+	Type    string `json:"type"`
+	Message string `json:"message"`
+}
+
 type GetContainerResponse struct {
 	ContainerID      string    `json:"container_id"`
 	Image            string    `json:"image"`
@@ -66,6 +94,7 @@ type GetContainerResponse struct {
 	ContainerPath    string    `json:"container_path"`
 	TaskRunning      bool      `json:"task_running"`
 	HasPreservedData bool      `json:"has_preserved_data"`
+	Legacy           bool      `json:"legacy"`
 	CreatedAt        time.Time `json:"created_at"`
 	UpdatedAt        time.Time `json:"updated_at"`
 }
@@ -108,12 +137,10 @@ type ListSnapshotsResponse struct {
 	Snapshots []SnapshotInfo `json:"snapshots"`
 }
 
-func NewContainerdHandler(log *slog.Logger, service ctr.Service, manager *mcp.Manager, cfg config.MCPConfig, namespace string, containerBackend string, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service, queries *dbsqlc.Queries) *ContainerdHandler {
+func NewContainerdHandler(log *slog.Logger, manager *workspace.Manager, cfg config.WorkspaceConfig, containerBackend string, botService *bots.Service, accountService *accounts.Service, policyService *policy.Service) *ContainerdHandler {
 	h := &ContainerdHandler{
-		service:          service,
 		manager:          manager,
 		cfg:              cfg,
-		namespace:        namespace,
 		containerBackend: containerBackend,
 		logger:           log.With(slog.String("handler", "containerd")),
 		mcpSess:          make(map[string]*mcpSession),
@@ -121,7 +148,6 @@ func NewContainerdHandler(log *slog.Logger, service ctr.Service, manager *mcp.Ma
 		botService:     botService,
 		accountService: accountService,
 		policyService:  policyService,
-		queries:        queries,
 	}
 	return h
 }
@@ -166,7 +192,7 @@ func (h *ContainerdHandler) Register(e *echo.Echo) {
 // @Tags containerd
 // @Param bot_id path string true "Bot ID"
 // @Param payload body CreateContainerRequest true "Create container payload"
-// @Success 200 {object} CreateContainerResponse
+// @Success 200 {object} CreateContainerResponse "SSE stream of container creation events"
 // @Failure 400 {object} ErrorResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /bots/{bot_id}/container [post].
@@ -180,156 +206,129 @@ func (h *ContainerdHandler) CreateContainer(c echo.Context) error {
 	if err := c.Bind(&req); err != nil {
 		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
 	}
-	containerID := mcp.ContainerPrefix + botID
+	// Image override lets administrators specify a custom base image.
+	// NOTE(saas): if this becomes a multi-tenant SaaS, image override must be
+	// validated against an allowlist to prevent SSRF and resource abuse.
+	ctx := c.Request().Context()
+	imageOverride := strings.TrimSpace(req.Image)
+	image, err := h.manager.ResolveWorkspaceImage(ctx, botID)
+	if err != nil {
+		h.logger.Error("resolve workspace image failed",
+			slog.String("bot_id", botID), slog.Any("error", err))
+		return echo.NewHTTPError(http.StatusInternalServerError, "resolve workspace image failed")
+	}
+	if imageOverride != "" {
+		image = config.NormalizeImageRef(imageOverride)
+	}
 
-	image := h.mcpImageRef()
 	snapshotter := strings.TrimSpace(req.Snapshotter)
 	if snapshotter == "" {
 		snapshotter = h.cfg.Snapshotter
 	}
-	ctx := c.Request().Context()
-
-	if h.manager == nil {
-		return echo.NewHTTPError(http.StatusInternalServerError, "manager not configured")
+	flusher, ok := c.Response().Writer.(http.Flusher)
+	if !ok {
+		return echo.NewHTTPError(http.StatusInternalServerError, "streaming not supported")
 	}
 
-	started := false
-	if err := h.manager.Start(ctx, botID); err != nil {
-		h.logger.Error("mcp container start failed",
-			slog.String("container_id", containerID),
-			slog.Any("error", err),
-		)
-	} else {
-		started = true
+	c.Response().Header().Set(echo.HeaderContentType, "text/event-stream")
+	c.Response().Header().Set(echo.HeaderCacheControl, "no-cache")
+	c.Response().Header().Set(echo.HeaderConnection, "keep-alive")
+	c.Response().WriteHeader(http.StatusOK)
+	writer := bufio.NewWriter(c.Response().Writer)
+
+	var mu sync.Mutex
+	send := func(payload any) {
+		mu.Lock()
+		defer mu.Unlock()
+		data, err := json.Marshal(payload)
+		if err != nil {
+			return
+		}
+		_ = writeSSEData(writer, flusher, string(data))
+	}
+
+	sendError := func(msg string) {
+		send(createContainerErrorEvent{Type: "error", Message: msg})
+	}
+
+	// Phase 1: Pull image with progress
+	send(createContainerPullingEvent{Type: "pulling", Image: image})
+
+	var pullDone atomic.Bool
+	_, pullErr := h.manager.PullImage(ctx, image, &ctr.PullImageOptions{
+		Unpack:      true,
+		Snapshotter: snapshotter,
+		OnProgress: func(p ctr.PullProgress) {
+			if pullDone.Load() {
+				return
+			}
+			send(createContainerPullProgressEvent{Type: "pull_progress", Layers: p.Layers})
+		},
+	})
+	pullDone.Store(true)
+	if pullErr != nil {
+		h.logger.Error("image pull failed",
+			slog.String("image", image), slog.Any("error", pullErr))
+		sendError("image pull failed: " + pullErr.Error())
+		return nil
+	}
+
+	// Phase 2: Create container (image is local, should be fast)
+	send(createContainerCreatingEvent{Type: "creating"})
+
+	// Notify the client before starting if data migration will happen,
+	// since restoring a large /data volume can take a while.
+	if h.manager.HasPreservedData(botID) {
+		send(createContainerRestoringEvent{Type: "restoring"})
+	}
+
+	if err := h.manager.StartWithResolvedImage(ctx, botID, image); err != nil {
+		h.logger.Error("container start failed",
+			slog.String("bot_id", botID), slog.Any("error", err))
+		sendError("container start failed: " + err.Error())
+		return nil
+	}
+	if err := h.manager.RememberWorkspaceImage(ctx, botID, image); err != nil {
+		h.logger.Warn("remember workspace image failed",
+			slog.String("bot_id", botID), slog.String("image", image), slog.Any("error", err))
+	}
+
+	containerID, err := h.manager.ContainerID(ctx, botID)
+	if err != nil {
+		h.logger.Error("container ID resolution failed after start",
+			slog.String("bot_id", botID), slog.Any("error", err))
+		sendError("container ID resolution failed: " + err.Error())
+		return nil
+	}
 
 	dataRestored := false
-	if started && req.RestoreData && h.manager.HasPreservedData(botID) {
+	if req.RestoreData && h.manager.HasPreservedData(botID) {
 		if err := h.manager.RestorePreservedData(ctx, botID); err != nil {
-			h.logger.Warn("restore preserved data on create failed",
+			h.logger.Error("restore preserved data failed",
 				slog.String("bot_id", botID), slog.Any("error", err))
-		} else {
-			dataRestored = true
-		}
-	}
-
-	h.upsertContainerRecord(ctx, botID, containerID, map[bool]string{true: "running", false: "created"}[started])
-
-	return c.JSON(http.StatusOK, CreateContainerResponse{
-		ContainerID:      containerID,
-		Image:            image,
-		Snapshotter:      snapshotter,
-		Started:          started,
-		DataRestored:     dataRestored,
-		HasPreservedData: h.manager.HasPreservedData(botID),
-	})
-}
-
-// ensureContainerAndTask verifies the container exists in containerd and its task is
-// running. If the container is missing (e.g. after a VM restart) it is recreated via
-// SetupBotContainer. This prevents permanent desync between DB and containerd state.
-func (h *ContainerdHandler) ensureContainerAndTask(ctx context.Context, containerID, botID string) error {
-	_, err := h.service.GetContainer(ctx, containerID)
-	if err != nil {
-		if !errdefs.IsNotFound(err) {
-			return err
-		}
-		h.logger.Warn("container missing in containerd, rebuilding",
-			slog.String("bot_id", botID),
-			slog.String("container_id", containerID),
-		)
-		return h.SetupBotContainer(ctx, botID)
-	}
-
-	tasks, err := h.service.ListTasks(ctx, &ctr.ListTasksOptions{
-		Filter: "container.id==" + containerID,
-	})
-	if err != nil {
-		return err
-	}
-	if len(tasks) > 0 {
-		if tasks[0].Status == ctr.TaskStatusRunning {
-			if err := h.setupNetworkOrFail(ctx, containerID, botID); err != nil {
-				return err
-			}
+			sendError("restore preserved data failed: " + err.Error())
 			return nil
 		}
-		if err := h.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil {
-			if !errdefs.IsNotFound(err) {
-				h.logger.Warn("cleanup: delete task failed", slog.String("container_id", containerID), slog.Any("error", err))
-				return err
-			}
-		}
+		dataRestored = true
 	}
-	if err := h.service.StartContainer(ctx, containerID, nil); err != nil {
-		return err
-	}
-	return h.setupNetworkOrFail(ctx, containerID, botID)
-}
+	h.manager.RecordContainerRunning(ctx, botID, containerID, image)
 
-// setupNetworkOrFail attempts CNI network setup with one retry. Returns an error
-// if no usable IP is obtained — callers must not silently ignore this.
-func (h *ContainerdHandler) setupNetworkOrFail(ctx context.Context, containerID, botID string) error {
-	var lastErr error
-	for attempt := 0; attempt < 2; attempt++ {
-		netResult, err := h.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
-			ContainerID: containerID,
-			CNIBinDir:   h.cfg.CNIBinaryDir,
-			CNIConfDir:  h.cfg.CNIConfigDir,
-		})
-		if err != nil {
-			lastErr = err
-			h.logger.Warn("network setup attempt failed",
-				slog.String("container_id", containerID),
-				slog.Int("attempt", attempt+1),
-				slog.Any("error", err))
-			continue
-		}
-		if netResult.IP == "" {
-			lastErr = fmt.Errorf("network setup returned no IP for %s", containerID)
-			continue
-		}
-		if h.manager != nil {
-			h.manager.SetContainerIP(botID, netResult.IP)
-		}
-		return nil
-	}
-	return fmt.Errorf("network setup failed for container %s: %w", containerID, lastErr)
-}
+	// Phase 3: Complete
+	send(createContainerCompleteEvent{
+		Type: "complete",
+		Container: CreateContainerResponse{
+			ContainerID:      containerID,
+			Image:            image,
+			Snapshotter:      snapshotter,
+			Started:          true,
+			DataRestored:     dataRestored,
+			HasPreservedData: h.manager.HasPreservedData(botID),
+		},
+	})
 
-// botContainerID resolves container_id for a bot from the database.
-func (h *ContainerdHandler) botContainerID(ctx context.Context, botID string) (string, error) {
-	if h.queries != nil {
-		pgBotID, err := db.ParseUUID(botID)
-		if err == nil {
-			row, dbErr := h.queries.GetContainerByBotID(ctx, pgBotID)
-			if dbErr == nil && strings.TrimSpace(row.ContainerID) != "" {
-				return row.ContainerID, nil
-			}
-			if dbErr != nil && !errors.Is(dbErr, pgx.ErrNoRows) {
-				h.logger.Warn("botContainerID: db lookup failed",
-					slog.String("bot_id", botID), slog.Any("error", dbErr))
-			}
-		}
-	}
-	containers, err := h.service.ListContainersByLabel(ctx, mcp.BotLabelKey, botID)
-	if err != nil {
-		return "", err
-	}
-	if len(containers) == 0 {
-		return "", echo.NewHTTPError(http.StatusNotFound, "container not found")
-	}
-	bestID := ""
-	var bestUpdated time.Time
-	for _, info := range containers {
-		if bestID == "" || info.UpdatedAt.After(bestUpdated) {
-			bestID = info.ID
-			bestUpdated = info.UpdatedAt
-		}
-	}
-	return bestID, nil
+	return nil
 }
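For client authors: a minimal Go sketch of consuming the create-container stream defined above. The endpoint path follows the `@Router` annotation; `writeSSEData` is not shown in this diff, so the standard `data: <json>` SSE framing, the base URL, and auth handling are assumptions here (the canonical client lives in packages/sdk/src/container-stream.ts):

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

// createEvent covers the shared fields of all event payloads; per-type
// fields such as layers are omitted for brevity.
type createEvent struct {
	Type    string `json:"type"`
	Image   string `json:"image,omitempty"`
	Message string `json:"message,omitempty"`
}

func main() {
	req, _ := http.NewRequest(http.MethodPost,
		"http://127.0.0.1:8080/bots/BOT_ID/container", strings.NewReader("{}"))
	req.Header.Set("Content-Type", "application/json")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "data: ") {
			continue // skip blank separator lines between events
		}
		var ev createEvent
		if err := json.Unmarshal([]byte(strings.TrimPrefix(line, "data: ")), &ev); err != nil {
			continue
		}
		fmt.Println("event:", ev.Type)
		if ev.Type == "complete" || ev.Type == "error" {
			break // terminal events end the stream
		}
	}
}
```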
 
 // GetContainer godoc
@@ -345,57 +344,24 @@
 	if err != nil {
 		return err
 	}
-	ctx := c.Request().Context()
-
-	if h.queries != nil {
-		pgBotID, parseErr := db.ParseUUID(botID)
-		if parseErr == nil {
-			row, dbErr := h.queries.GetContainerByBotID(ctx, pgBotID)
-			if dbErr == nil {
-				taskRunning := h.isTaskRunning(ctx, row.ContainerID)
-				createdAt := time.Time{}
-				if row.CreatedAt.Valid {
-					createdAt = row.CreatedAt.Time
-				}
-				updatedAt := time.Time{}
-				if row.UpdatedAt.Valid {
-					updatedAt = row.UpdatedAt.Time
-				}
-				return c.JSON(http.StatusOK, GetContainerResponse{
-					ContainerID:      row.ContainerID,
-					Image:            row.Image,
-					Status:           row.Status,
-					Namespace:        row.Namespace,
-					ContainerPath:    row.ContainerPath,
-					TaskRunning:      taskRunning,
-					HasPreservedData: h.manager.HasPreservedData(botID),
-					CreatedAt:        createdAt,
-					UpdatedAt:        updatedAt,
-				})
-			}
-		}
-	}
-
-	containerID, err := h.botContainerID(ctx, botID)
+	status, err := h.manager.GetContainerInfo(c.Request().Context(), botID)
 	if err != nil {
-		return echo.NewHTTPError(http.StatusNotFound, "container not found for bot")
-	}
-	info, err := h.service.GetContainer(ctx, containerID)
-	if err != nil {
-		if errdefs.IsNotFound(err) {
-			return echo.NewHTTPError(http.StatusNotFound, "container not found")
+		if errors.Is(err, workspace.ErrContainerNotFound) {
+			return echo.NewHTTPError(http.StatusNotFound, "container not found for bot")
 		}
 		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
 	}
 
 	return c.JSON(http.StatusOK, GetContainerResponse{
-		ContainerID:      info.ID,
-		Image:            info.Image,
-		Status:           "unknown",
-		Namespace:        h.namespace,
-		TaskRunning:      h.isTaskRunning(ctx, containerID),
-		HasPreservedData: h.manager.HasPreservedData(botID),
-		CreatedAt:        info.CreatedAt,
-		UpdatedAt:        info.UpdatedAt,
+		ContainerID:      status.ContainerID,
+		Image:            status.Image,
+		Status:           status.Status,
+		Namespace:        status.Namespace,
+		ContainerPath:    status.ContainerPath,
+		TaskRunning:      status.TaskRunning,
+		HasPreservedData: status.HasPreservedData,
+		Legacy:           status.Legacy,
+		CreatedAt:        status.CreatedAt,
+		UpdatedAt:        status.UpdatedAt,
 	})
 }
 
@@ -414,7 +380,7 @@ func (h *ContainerdHandler) DeleteContainer(c echo.Context) error {
 		return err
 	}
 	preserveData := c.QueryParam("preserve_data") == "true"
-	if err := h.CleanupBotContainer(c.Request().Context(), botID, preserveData); err != nil {
+	if err := h.manager.CleanupBotContainer(c.Request().Context(), botID, preserveData); err != nil {
 		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
 	}
 	return c.NoContent(http.StatusNoContent)
@@ -433,21 +399,11 @@ func (h *ContainerdHandler) StartContainer(c echo.Context) error {
 	if err != nil {
 		return err
 	}
-	ctx := c.Request().Context()
-	containerID, err := h.botContainerID(ctx, botID)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusNotFound, "container not found for bot")
-	}
-	if err := h.ensureContainerAndTask(ctx, containerID, botID); err != nil {
-		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
-	}
-	if h.queries != nil {
-		if pgBotID, parseErr := db.ParseUUID(botID); parseErr == nil {
-			if dbErr := h.queries.UpdateContainerStarted(ctx, pgBotID); dbErr != nil {
-				h.logger.Error("failed to update container started status",
-					slog.String("bot_id", botID), slog.Any("error", dbErr))
-			}
+	if err := h.manager.EnsureRunning(c.Request().Context(), botID); err != nil {
+		if errors.Is(err, workspace.ErrContainerNotFound) {
+			return echo.NewHTTPError(http.StatusNotFound, "container not found for bot")
 		}
+		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
 	}
 	return c.JSON(http.StatusOK, map[string]bool{"started": true})
 }
@@ -465,27 +421,11 @@ func (h *ContainerdHandler) StopContainer(c echo.Context) error {
 	if err != nil {
 		return err
 	}
-	ctx := c.Request().Context()
-	containerID, err := h.botContainerID(ctx, botID)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusNotFound, "container not found for bot")
-	}
-	if err := h.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{
-		Timeout: 10 * time.Second,
-		Force:   true,
-	}); err != nil && !errdefs.IsNotFound(err) {
-		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
-	}
-	if err := h.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil {
-		h.logger.Warn("cleanup: delete task failed", slog.String("container_id", containerID), slog.Any("error", err))
-	}
-	if h.queries != nil {
-		if pgBotID, parseErr := db.ParseUUID(botID); parseErr == nil {
-			if dbErr := h.queries.UpdateContainerStopped(ctx, pgBotID); dbErr != nil {
-				h.logger.Error("failed to update container stopped status",
-					slog.String("bot_id", botID), slog.Any("error", dbErr))
-			}
+	if err := h.manager.StopBot(c.Request().Context(), botID); err != nil {
{ + return echo.NewHTTPError(http.StatusNotFound, "container not found for bot") } + return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) } return c.JSON(http.StatusOK, map[string]bool{"stopped": true}) } @@ -515,7 +455,7 @@ func (h *ContainerdHandler) CreateSnapshot(c echo.Context) error { if err := c.Bind(&req); err != nil { return echo.NewHTTPError(http.StatusBadRequest, err.Error()) } - created, err := h.manager.CreateSnapshot(c.Request().Context(), botID, req.SnapshotName, mcp.SnapshotSourceManual) + created, err := h.manager.CreateSnapshot(c.Request().Context(), botID, req.SnapshotName, workspace.SnapshotSourceManual) if err != nil { if errdefs.IsNotFound(err) { return echo.NewHTTPError(http.StatusNotFound, "container not found") @@ -529,7 +469,7 @@ func (h *ContainerdHandler) CreateSnapshot(c echo.Context) error { DisplayName: created.DisplayName, Snapshotter: created.Snapshotter, Version: created.Version, - Source: mcp.SnapshotSourceManual, + Source: workspace.SnapshotSourceManual, }) } @@ -590,7 +530,7 @@ func (h *ContainerdHandler) ListSnapshots(c echo.Context) error { items := make([]SnapshotInfo, 0, len(lineage)+len(data.ManagedMeta)) seen := make(map[string]struct{}, len(lineage)+len(data.ManagedMeta)) - appendRuntime := func(runtimeInfo ctr.SnapshotInfo, fallbackSource string, meta *mcp.ManagedSnapshotMeta) { + appendRuntime := func(runtimeInfo ctr.SnapshotInfo, fallbackSource string, meta *workspace.ManagedSnapshotMeta) { source := fallbackSource managed := false var version *int @@ -824,10 +764,6 @@ func snapshotLineage(root string, all []ctr.SnapshotInfo) ([]ctr.SnapshotInfo, b // ---------- auth helpers ---------- -func (h *ContainerdHandler) mcpImageRef() string { - return h.cfg.ImageRef() -} - // requireBotAccess extracts bot_id from path, validates user auth, and authorizes bot access. func (h *ContainerdHandler) requireBotAccess(c echo.Context) (string, error) { channelIdentityID, err := h.requireChannelIdentityID(c) @@ -868,180 +804,3 @@ func (h *ContainerdHandler) requireBotAccessWithGuest(c echo.Context) (string, e } return botID, nil } - -// SetupBotContainer creates and starts the MCP container for a bot. -func (h *ContainerdHandler) SetupBotContainer(ctx context.Context, botID string) error { - containerID := mcp.ContainerPrefix + botID - - if h.manager == nil { - return errors.New("manager not configured") - } - - if err := h.manager.Start(ctx, botID); err != nil { - h.logger.Error("setup bot container: start failed", - slog.String("bot_id", botID), - slog.String("container_id", containerID), - slog.Any("error", err), - ) - return err - } - - h.upsertContainerRecord(ctx, botID, containerID, "running") - return nil -} - -// CleanupBotContainer removes the containerd container and DB record for a bot. -// When preserveData is true, /data is exported to a backup archive before -// deletion so it can be restored into a future container. 
-func (h *ContainerdHandler) CleanupBotContainer(ctx context.Context, botID string, preserveData bool) error { - h.logger.Info("CleanupBotContainer starting", - slog.String("bot_id", botID), slog.Bool("preserve_data", preserveData)) - - if h.manager != nil { - if err := h.manager.Delete(ctx, botID, preserveData); err != nil { - if !errdefs.IsNotFound(err) { - return err - } - h.logger.Warn("CleanupBotContainer: container not found in containerd", - slog.String("bot_id", botID)) - } - } - - if h.queries != nil { - if pgBotID, parseErr := db.ParseUUID(botID); parseErr == nil { - if dbErr := h.queries.DeleteContainerByBotID(ctx, pgBotID); dbErr != nil { - h.logger.Error("CleanupBotContainer: failed to delete DB record", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } - } - h.logger.Info("CleanupBotContainer finished", slog.String("bot_id", botID)) - return nil -} - -func (h *ContainerdHandler) isTaskRunning(ctx context.Context, containerID string) bool { - tasks, err := h.service.ListTasks(ctx, &ctr.ListTasksOptions{ - Filter: "container.id==" + containerID, - }) - return err == nil && len(tasks) > 0 && tasks[0].Status == ctr.TaskStatusRunning -} - -// ReconcileContainers compares the DB containers table against actual containerd -// state on startup. For each auto_start container in DB it verifies the container -// and task exist; if missing they are rebuilt via SetupBotContainer. Containers that -// the DB claims are running but are not present in containerd get corrected. -func (h *ContainerdHandler) ReconcileContainers(ctx context.Context) { - if h.queries == nil { - return - } - rows, err := h.queries.ListAutoStartContainers(ctx) - if err != nil { - h.logger.Error("reconcile: failed to list containers from DB", slog.Any("error", err)) - return - } - if len(rows) == 0 { - h.logger.Info("reconcile: no auto-start containers in DB") - return - } - - h.logger.Info("reconcile: checking containers", slog.Int("count", len(rows))) - for _, row := range rows { - containerID := row.ContainerID - botID := uuid.UUID(row.BotID.Bytes).String() - - _, err := h.service.GetContainer(ctx, containerID) - if err != nil { - if !errdefs.IsNotFound(err) { - h.logger.Error("reconcile: failed to get container", - slog.String("container_id", containerID), slog.Any("error", err)) - continue - } - // Container missing in containerd — rebuild. - h.logger.Warn("reconcile: container missing, rebuilding", - slog.String("bot_id", botID), slog.String("container_id", containerID)) - if setupErr := h.SetupBotContainer(ctx, botID); setupErr != nil { - h.logger.Error("reconcile: rebuild failed", - slog.String("bot_id", botID), slog.Any("error", setupErr)) - if dbErr := h.queries.UpdateContainerStatus(ctx, dbsqlc.UpdateContainerStatusParams{ - Status: "error", - BotID: row.BotID, - }); dbErr != nil { - h.logger.Error("reconcile: failed to mark container as error", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } - continue - } - - // Container exists — ensure the task is running. 
- running := h.isTaskRunning(ctx, containerID) - if running { - if row.Status != "running" { - if dbErr := h.queries.UpdateContainerStarted(ctx, row.BotID); dbErr != nil { - h.logger.Error("reconcile: failed to update DB status to running", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } - if netErr := h.setupNetworkOrFail(ctx, containerID, botID); netErr != nil { - h.logger.Error("reconcile: network setup failed for running task, container unreachable", - slog.String("bot_id", botID), - slog.String("container_id", containerID), - slog.Any("error", netErr)) - } else { - h.logger.Info("reconcile: container healthy", - slog.String("bot_id", botID), slog.String("container_id", containerID)) - } - continue - } - - // Task not running — try to start it. - h.logger.Warn("reconcile: task not running, starting", - slog.String("bot_id", botID), slog.String("container_id", containerID)) - if err := h.ensureContainerAndTask(ctx, containerID, botID); err != nil { - h.logger.Error("reconcile: failed to start task", - slog.String("bot_id", botID), slog.Any("error", err)) - if dbErr := h.queries.UpdateContainerStopped(ctx, row.BotID); dbErr != nil { - h.logger.Error("reconcile: failed to mark container as stopped", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } else { - if dbErr := h.queries.UpdateContainerStarted(ctx, row.BotID); dbErr != nil { - h.logger.Error("reconcile: failed to update DB status to running", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } - } - h.logger.Info("reconcile: completed") -} - -func (h *ContainerdHandler) upsertContainerRecord(ctx context.Context, botID, containerID, status string) { - if h.queries == nil { - return - } - pgBotID, err := db.ParseUUID(botID) - if err != nil { - return - } - ns := strings.TrimSpace(h.namespace) - if ns == "" { - ns = "default" - } - if dbErr := h.queries.UpsertContainer(ctx, dbsqlc.UpsertContainerParams{ - BotID: pgBotID, - ContainerID: containerID, - ContainerName: containerID, - Image: h.mcpImageRef(), - Status: status, - Namespace: ns, - AutoStart: true, - }); dbErr != nil { - h.logger.Error("failed to upsert container record", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - if status == "running" { - if dbErr := h.queries.UpdateContainerStarted(ctx, pgBotID); dbErr != nil { - h.logger.Error("failed to update container started status", - slog.String("bot_id", botID), slog.Any("error", dbErr)) - } - } -} diff --git a/internal/handlers/containerd_terminal.go b/internal/handlers/containerd_terminal.go index c26fef65..d0300212 100644 --- a/internal/handlers/containerd_terminal.go +++ b/internal/handlers/containerd_terminal.go @@ -9,7 +9,7 @@ import ( "github.com/gorilla/websocket" "github.com/labstack/echo/v4" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) var terminalUpgrader = websocket.Upgrader{ diff --git a/internal/handlers/filemanager.go b/internal/handlers/filemanager.go index 627e0c78..b40cdbe7 100644 --- a/internal/handlers/filemanager.go +++ b/internal/handlers/filemanager.go @@ -12,7 +12,7 @@ import ( "github.com/labstack/echo/v4" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) // ---------- request / response types ---------- @@ -84,7 +84,7 @@ func resolveContainerPath(rawPath string) (string, error) { } // getGRPCClient returns the gRPC client for the bot's container. 
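+// The MCP-era method name is unchanged; it now resolves a workspace bridge
+// client through the manager.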
-func (h *ContainerdHandler) getGRPCClient(ctx context.Context, botID string) (*mcpclient.Client, error) { +func (h *ContainerdHandler) getGRPCClient(ctx context.Context, botID string) (*bridge.Client, error) { return h.manager.MCPClient(ctx, botID) } @@ -103,13 +103,13 @@ func fsFileInfoFromEntry(containerPath, name string, isDir bool, size int64, mod -// fsHTTPError maps mcpclient domain errors to HTTP status codes. +// fsHTTPError maps bridge domain errors to HTTP status codes. func fsHTTPError(err error) *echo.HTTPError { switch { - case errors.Is(err, mcpclient.ErrNotFound): + case errors.Is(err, bridge.ErrNotFound): return echo.NewHTTPError(http.StatusNotFound, err.Error()) - case errors.Is(err, mcpclient.ErrBadRequest): + case errors.Is(err, bridge.ErrBadRequest): return echo.NewHTTPError(http.StatusBadRequest, err.Error()) - case errors.Is(err, mcpclient.ErrForbidden): + case errors.Is(err, bridge.ErrForbidden): return echo.NewHTTPError(http.StatusForbidden, err.Error()) - case errors.Is(err, mcpclient.ErrUnavailable): + case errors.Is(err, bridge.ErrUnavailable): return echo.NewHTTPError(http.StatusServiceUnavailable, err.Error()) default: return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) diff --git a/internal/handlers/mcp_federation_gateway.go b/internal/handlers/mcp_federation_gateway.go index 0db5d2ac..701d1290 100644 --- a/internal/handlers/mcp_federation_gateway.go +++ b/internal/handlers/mcp_federation_gateway.go @@ -277,11 +277,11 @@ func (g *MCPFederationGateway) startStdioConnectionSession(ctx context.Context, if g.handler == nil { return nil, errors.New("containerd handler not configured") } - containerID, err := g.handler.botContainerID(ctx, botID) - if err != nil { + if err := g.handler.manager.EnsureRunning(ctx, botID); err != nil { return nil, err } - if err := g.handler.ensureContainerAndTask(ctx, containerID, botID); err != nil { + containerID, err := g.handler.manager.ContainerID(ctx, botID) + if err != nil { return nil, err } @@ -296,7 +296,7 @@ func (g *MCPFederationGateway) startStdioConnectionSession(ctx context.Context, Env: normalizeStringMap(connection.Config["env"]), Cwd: strings.TrimSpace(anyToString(connection.Config["cwd"])), } - return g.handler.startContainerdMCPCommandSession(ctx, containerID, request) + return g.handler.startContainerdMCPCommandSession(ctx, botID, containerID, request) } func parseGatewayToolsListPayload(payload map[string]any) ([]mcpgw.ToolDescriptor, error) { diff --git a/internal/handlers/mcp_stdio.go b/internal/handlers/mcp_stdio.go index 27e4545a..ac37e83b 100644 --- a/internal/handlers/mcp_stdio.go +++ b/internal/handlers/mcp_stdio.go @@ -20,7 +20,7 @@ import ( sdkmcp "github.com/modelcontextprotocol/go-sdk/mcp" mcptools "github.com/memohai/memoh/internal/mcp" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) // MCPStdioRequest represents a request to create an MCP stdio session.
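Both the federation gateway above and CreateMCPStdio below now share the same two-step manager flow before starting a command session. A condensed sketch of the pattern (HTTP error mapping elided; the hunks show the exact translations):

    // Ensure the workspace container exists and its task is running,
    // then resolve the concrete containerd ID for the session APIs.
    if err := h.manager.EnsureRunning(ctx, botID); err != nil {
        return nil, err
    }
    containerID, err := h.manager.ContainerID(ctx, botID)
    if err != nil {
        return nil, err
    }
    return h.startContainerdMCPCommandSession(ctx, botID, containerID, req)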
@@ -588,15 +588,15 @@ func (h *ContainerdHandler) CreateMCPStdio(c echo.Context) error { return echo.NewHTTPError(http.StatusBadRequest, "command is required") } ctx := c.Request().Context() - containerID, err := h.botContainerID(ctx, botID) + if err := h.manager.EnsureRunning(ctx, botID); err != nil { + return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) + } + containerID, err := h.manager.ContainerID(ctx, botID) if err != nil { return echo.NewHTTPError(http.StatusNotFound, "container not found for bot") } - if err := h.ensureContainerAndTask(ctx, containerID, botID); err != nil { - return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) - } - sess, err := h.startContainerdMCPCommandSession(ctx, containerID, req) + sess, err := h.startContainerdMCPCommandSession(ctx, botID, containerID, req) if err != nil { return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) } @@ -686,13 +686,7 @@ func (h *ContainerdHandler) HandleMCPStdio(c echo.Context) error { return c.JSON(http.StatusOK, payload) } -func (h *ContainerdHandler) startContainerdMCPCommandSession(ctx context.Context, containerID string, req MCPStdioRequest) (*mcpSession, error) { - // Extract bot_id from container_id (remove "mcp-" prefix) - botID := strings.TrimPrefix(containerID, "mcp-") - if botID == "" || botID == containerID { - return nil, fmt.Errorf("invalid container_id: %s", containerID) - } - +func (h *ContainerdHandler) startContainerdMCPCommandSession(ctx context.Context, botID, containerID string, req MCPStdioRequest) (*mcpSession, error) { // Get gRPC client for the bot container via manager client, err := h.manager.MCPClient(ctx, botID) if err != nil { diff --git a/internal/handlers/memory.go b/internal/handlers/memory.go index b9844990..4810ddbc 100644 --- a/internal/handlers/memory.go +++ b/internal/handlers/memory.go @@ -18,10 +18,10 @@ import ( "github.com/memohai/memoh/internal/accounts" "github.com/memohai/memoh/internal/bots" "github.com/memohai/memoh/internal/config" - "github.com/memohai/memoh/internal/mcp/mcpclient" memprovider "github.com/memohai/memoh/internal/memory/adapters" storefs "github.com/memohai/memoh/internal/memory/storefs" "github.com/memohai/memoh/internal/settings" + "github.com/memohai/memoh/internal/workspace/bridge" ) // MemoryHandler handles memory CRUD operations scoped by bot. @@ -120,7 +120,7 @@ func (h *MemoryHandler) resolveProvider(ctx context.Context, botID string) mempr } // SetMCPClientProvider sets the gRPC client provider for filesystem persistence. -func (h *MemoryHandler) SetMCPClientProvider(p mcpclient.Provider) { +func (h *MemoryHandler) SetMCPClientProvider(p bridge.Provider) { if p == nil { h.memoryStore = nil return @@ -671,7 +671,7 @@ func (h *MemoryHandler) requireBotAccess(c echo.Context) (string, error) { } // NewBuiltinMemoryRuntime keeps provider architecture while using file memory backend. 
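+// The provider is typically the workspace Manager, which implements
+// bridge.Provider; a nil provider yields no runtime.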
-func NewBuiltinMemoryRuntime(p mcpclient.Provider) any { +func NewBuiltinMemoryRuntime(p bridge.Provider) any { if p == nil { return nil } diff --git a/internal/handlers/skills.go b/internal/handlers/skills.go index a3f9787a..9239d3e1 100644 --- a/internal/handlers/skills.go +++ b/internal/handlers/skills.go @@ -11,7 +11,7 @@ import ( "gopkg.in/yaml.v3" "github.com/memohai/memoh/internal/config" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) const skillsDirPath = config.DefaultDataMount + "/.skills" @@ -196,7 +196,7 @@ func (h *ContainerdHandler) loadSkillsFromContainer(ctx context.Context, botID s return skills, nil } -func readContainerSkillFile(ctx context.Context, client *mcpclient.Client, filePath string) (string, error) { +func readContainerSkillFile(ctx context.Context, client *bridge.Client, filePath string) (string, error) { resp, err := client.ReadFile(ctx, filePath, 0, 0) if err != nil { return "", err diff --git a/internal/mcp/manager.go b/internal/mcp/manager.go deleted file mode 100644 index 26b737ec..00000000 --- a/internal/mcp/manager.go +++ /dev/null @@ -1,409 +0,0 @@ -package mcp - -import ( - "context" - "errors" - "fmt" - "log/slog" - "strings" - "sync" - "time" - - "github.com/containerd/errdefs" - "github.com/jackc/pgx/v5/pgxpool" - - "github.com/memohai/memoh/internal/config" - ctr "github.com/memohai/memoh/internal/containerd" - dbsqlc "github.com/memohai/memoh/internal/db/sqlc" - "github.com/memohai/memoh/internal/identity" - "github.com/memohai/memoh/internal/mcp/mcpclient" -) - -const ( - BotLabelKey = "mcp.bot_id" - ContainerPrefix = "mcp-" -) - -type Manager struct { - service ctr.Service - cfg config.MCPConfig - namespace string - containerID func(string) string - db *pgxpool.Pool - queries *dbsqlc.Queries - logger *slog.Logger - containerLockMu sync.Mutex - containerLocks map[string]*sync.Mutex - mu sync.RWMutex - containerIPs map[string]string - grpcPool *mcpclient.Pool -} - -func NewManager(log *slog.Logger, service ctr.Service, cfg config.MCPConfig, namespace string, conn *pgxpool.Pool) *Manager { - if namespace == "" { - namespace = config.DefaultNamespace - } - m := &Manager{ - service: service, - cfg: cfg, - namespace: namespace, - db: conn, - queries: dbsqlc.New(conn), - logger: log.With(slog.String("component", "mcp")), - containerLocks: make(map[string]*sync.Mutex), - containerIPs: make(map[string]string), - containerID: func(botID string) string { - return ContainerPrefix + botID - }, - } - m.grpcPool = mcpclient.NewPool(m.ContainerIP) - return m -} - -func (m *Manager) lockContainer(containerID string) func() { - m.containerLockMu.Lock() - lock, ok := m.containerLocks[containerID] - if !ok { - lock = &sync.Mutex{} - m.containerLocks[containerID] = lock - } - m.containerLockMu.Unlock() - - lock.Lock() - return lock.Unlock -} - -// ContainerIP returns the cached IP address for a bot's container. -// If not cached, it attempts to recover the IP by re-running CNI setup. 
-func (m *Manager) ContainerIP(botID string) string { - m.mu.RLock() - if ip, ok := m.containerIPs[botID]; ok { - m.mu.RUnlock() - return ip - } - m.mu.RUnlock() - - // Cache miss - try to recover IP via CNI setup (idempotent) - ip, err := m.recoverContainerIP(botID) - if err != nil { - m.logger.Warn("container IP recovery failed", slog.String("bot_id", botID), slog.Any("error", err)) - return "" - } - if ip != "" { - m.mu.Lock() - m.containerIPs[botID] = ip - m.mu.Unlock() - m.logger.Info("container IP recovered", slog.String("bot_id", botID), slog.String("ip", ip)) - } - return ip -} - -// SetContainerIP stores the container IP in the cache. -// If the IP changed, the stale gRPC connection is evicted from the pool. -func (m *Manager) SetContainerIP(botID, ip string) { - if ip == "" { - return - } - m.mu.Lock() - old := m.containerIPs[botID] - m.containerIPs[botID] = ip - m.mu.Unlock() - - if old != "" && old != ip { - m.grpcPool.Remove(botID) - m.logger.Info("evicted stale gRPC connection", slog.String("bot_id", botID), slog.String("old_ip", old), slog.String("new_ip", ip)) - } -} - -// recoverContainerIP attempts to restore the container IP by re-running CNI setup. -// CNI plugins are idempotent — calling Setup again returns the existing IP allocation. -// Retries up to 2 times to tolerate transient CNI failures (IPAM lock contention, etc.). -func (m *Manager) recoverContainerIP(botID string) (string, error) { - ctx := context.Background() - containerID := m.containerID(botID) - - info, err := m.service.GetContainer(ctx, containerID) - if err != nil { - return "", err - } - - if ip, ok := info.Labels["mcp.container_ip"]; ok { - return ip, nil - } - - const maxAttempts = 2 - var lastErr error - for i := 0; i < maxAttempts; i++ { - netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ - ContainerID: containerID, - CNIBinDir: m.cfg.CNIBinaryDir, - CNIConfDir: m.cfg.CNIConfigDir, - }) - if err != nil { - lastErr = err - m.logger.Warn("IP recovery attempt failed", - slog.String("bot_id", botID), slog.Int("attempt", i+1), slog.Any("error", err)) - time.Sleep(time.Duration(i+1) * 500 * time.Millisecond) - continue - } - return netResult.IP, nil - } - return "", fmt.Errorf("network setup for IP recovery after %d attempts: %w", maxAttempts, lastErr) -} - -// MCPClient returns a gRPC client for the given bot's container. -// Implements mcpclient.Provider. -func (m *Manager) MCPClient(ctx context.Context, botID string) (*mcpclient.Client, error) { - return m.grpcPool.Get(ctx, botID) -} - -func (m *Manager) Init(ctx context.Context) error { - image := m.imageRef() - - needsPull, remoteErr := m.checkImageUpgrade(ctx, image) - if remoteErr != nil { - // Remote check failed (network unavailable, registry down, etc.). - // Fall back to local image if available; fail only when nothing is cached. 
- m.logger.Warn("image upgrade check failed, falling back to local", - slog.String("image", image), slog.Any("error", remoteErr)) - if _, err := m.service.GetImage(ctx, image); err != nil { - _, err = m.service.PullImage(ctx, image, &ctr.PullImageOptions{ - Unpack: true, - Snapshotter: m.cfg.Snapshotter, - }) - return err - } - return nil - } - - if !needsPull { - return nil - } - - m.logger.Info("pulling updated MCP image", slog.String("image", image)) - if _, err := m.service.PullImage(ctx, image, &ctr.PullImageOptions{ - Unpack: true, - Snapshotter: m.cfg.Snapshotter, - }); err != nil { - m.logger.Warn("image pull failed, using existing version", slog.Any("error", err)) - if _, err2 := m.service.GetImage(ctx, image); err2 != nil { - return err - } - return nil - } - - // Existing bot containers keep running with their current image. - // New containers created after this point will use the updated image. - return nil -} - -// checkImageUpgrade compares the local image digest against the remote registry. -// Returns (true, nil) when a newer image is available or no local image exists. -// Returns (false, err) when the remote cannot be reached. -func (m *Manager) checkImageUpgrade(ctx context.Context, image string) (needsPull bool, _ error) { - checkCtx, cancel := context.WithTimeout(ctx, 15*time.Second) - defer cancel() - - remoteDigest, err := m.service.ResolveRemoteDigest(checkCtx, image) - if err != nil { - return false, err - } - - localImg, err := m.service.GetImage(ctx, image) - if err != nil { - return true, nil // no local image - } - return localImg.ID != remoteDigest, nil -} - -// EnsureBot creates the MCP container for a bot if it does not exist. -// Bot data lives in the container's writable layer (snapshot), not bind mounts. -func (m *Manager) EnsureBot(ctx context.Context, botID string) error { - if err := validateBotID(botID); err != nil { - return err - } - - image := m.imageRef() - resolvPath, err := ctr.ResolveConfSource(m.dataRoot()) - if err != nil { - return err - } - - mounts := []ctr.MountSpec{ - { - Destination: "/etc/resolv.conf", - Type: "bind", - Source: resolvPath, - Options: []string{"rbind", "ro"}, - }, - } - tzMounts, tzEnv := ctr.TimezoneSpec() - mounts = append(mounts, tzMounts...) - - _, err = m.service.CreateContainer(ctx, ctr.CreateContainerRequest{ - ID: m.containerID(botID), - ImageRef: image, - Snapshotter: m.cfg.Snapshotter, - Labels: map[string]string{ - BotLabelKey: botID, - }, - Spec: ctr.ContainerSpec{ - Mounts: mounts, - Env: tzEnv, - }, - }) - if err == nil { - return nil - } - - if !errdefs.IsAlreadyExists(err) { - return err - } - - return nil -} - -// ListBots returns the bot IDs that have MCP containers. -func (m *Manager) ListBots(ctx context.Context) ([]string, error) { - containers, err := m.service.ListContainers(ctx) - if err != nil { - return nil, err - } - - botIDs := make([]string, 0, len(containers)) - for _, info := range containers { - if strings.HasPrefix(info.ID, ContainerPrefix) { - if botID, ok := info.Labels[BotLabelKey]; ok { - botIDs = append(botIDs, botID) - } - } - } - return botIDs, nil -} - -func (m *Manager) Start(ctx context.Context, botID string) error { - containerID := m.containerID(botID) - - // Before creating a new container, check for an orphaned snapshot - // (container deleted but snapshot with /data survived). Export /data - // to a backup so it can be restored after EnsureBot creates a fresh - // container. This covers dev image rebuilds, containerd metadata loss, - // and manual container deletion. 
- if _, err := m.service.GetContainer(ctx, containerID); errdefs.IsNotFound(err) { - m.recoverOrphanedSnapshot(ctx, botID) - } - - if err := m.EnsureBot(ctx, botID); err != nil { - return err - } - - // Restore preserved data (from orphaned snapshot recovery or a previous - // CleanupBotContainer with preserveData) into the fresh snapshot before - // starting the task, avoiding a redundant stop/start cycle. - if m.HasPreservedData(botID) { - if err := m.restorePreservedIntoSnapshot(ctx, botID); err != nil { - m.logger.Warn("restore preserved data into new container failed", - slog.String("bot_id", botID), slog.Any("error", err)) - } - } - - if err := m.service.StartContainer(ctx, containerID, nil); err != nil { - return err - } - netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ - ContainerID: containerID, - CNIBinDir: m.cfg.CNIBinaryDir, - CNIConfDir: m.cfg.CNIConfigDir, - }) - if err != nil { - if stopErr := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{Force: true}); stopErr != nil { - m.logger.Warn("cleanup: stop task failed", slog.String("container_id", containerID), slog.Any("error", stopErr)) - } - return err - } - if netResult.IP == "" { - if stopErr := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{Force: true}); stopErr != nil { - m.logger.Warn("cleanup: stop task failed", slog.String("container_id", containerID), slog.Any("error", stopErr)) - } - return fmt.Errorf("network setup returned no IP for bot %s", botID) - } - m.SetContainerIP(botID, netResult.IP) - m.logger.Info("container network ready", slog.String("bot_id", botID), slog.String("ip", netResult.IP)) - return nil -} - -func (m *Manager) Stop(ctx context.Context, botID string, timeout time.Duration) error { - if err := validateBotID(botID); err != nil { - return err - } - return m.service.StopContainer(ctx, m.containerID(botID), &ctr.StopTaskOptions{ - Timeout: timeout, - Force: true, - }) -} - -func (m *Manager) Delete(ctx context.Context, botID string, preserveData bool) error { - if err := validateBotID(botID); err != nil { - return err - } - - containerID := m.containerID(botID) - stoppedForPreserve := false - - if preserveData { - info, err := m.service.GetContainer(ctx, containerID) - if err != nil { - return fmt.Errorf("get container for preserve: %w", err) - } - if _, err := m.snapshotMounts(ctx, info); errors.Is(err, errMountNotSupported) { - // Apple backend fallback uses gRPC against a running container. - } else if err != nil { - return err - } else { - if err := m.safeStopTask(ctx, containerID); err != nil { - return fmt.Errorf("stop for data preserve: %w", err) - } - stoppedForPreserve = true - } - - if err := m.PreserveData(ctx, botID); err != nil { - // Export failed — restart only if we stopped the task, and abort - // deletion to prevent data loss. 
- if stoppedForPreserve { - m.restartContainer(ctx, botID, containerID) - } - return fmt.Errorf("preserve data: %w", err) - } - } - - m.grpcPool.Remove(botID) - - if err := m.service.RemoveNetwork(ctx, ctr.NetworkSetupRequest{ - ContainerID: containerID, - CNIBinDir: m.cfg.CNIBinaryDir, - CNIConfDir: m.cfg.CNIConfigDir, - }); err != nil { - m.logger.Warn("cleanup: remove network failed", slog.String("container_id", containerID), slog.Any("error", err)) - } - if err := m.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil { - m.logger.Warn("cleanup: delete task failed", slog.String("container_id", containerID), slog.Any("error", err)) - } - return m.service.DeleteContainer(ctx, containerID, &ctr.DeleteContainerOptions{ - CleanupSnapshot: true, - }) -} - -func (m *Manager) dataRoot() string { - if m.cfg.DataRoot == "" { - return config.DefaultDataRoot - } - return m.cfg.DataRoot -} - -func (m *Manager) imageRef() string { - return m.cfg.ImageRef() -} - -func validateBotID(botID string) error { - return identity.ValidateChannelIdentityID(botID) -} diff --git a/internal/mcp/providers/browser/provider.go b/internal/mcp/providers/browser/provider.go index 17061d6a..00fc7a4b 100644 --- a/internal/mcp/providers/browser/provider.go +++ b/internal/mcp/providers/browser/provider.go @@ -15,8 +15,8 @@ import ( "github.com/memohai/memoh/internal/browsercontexts" "github.com/memohai/memoh/internal/config" mcpgw "github.com/memohai/memoh/internal/mcp" - "github.com/memohai/memoh/internal/mcp/mcpclient" "github.com/memohai/memoh/internal/settings" + "github.com/memohai/memoh/internal/workspace/bridge" ) const ( @@ -28,12 +28,12 @@ type Executor struct { logger *slog.Logger settings *settings.Service browserContexts *browsercontexts.Service - containers mcpclient.Provider + containers bridge.Provider gatewayBaseURL string httpClient *http.Client } -func NewExecutor(log *slog.Logger, settingsSvc *settings.Service, browserSvc *browsercontexts.Service, containers mcpclient.Provider, gatewayCfg config.BrowserGatewayConfig) *Executor { +func NewExecutor(log *slog.Logger, settingsSvc *settings.Service, browserSvc *browsercontexts.Service, containers bridge.Provider, gatewayCfg config.BrowserGatewayConfig) *Executor { if log == nil { log = slog.Default() } diff --git a/internal/mcp/providers/container/provider.go b/internal/mcp/providers/container/provider.go index 2a400567..8a4cc14c 100644 --- a/internal/mcp/providers/container/provider.go +++ b/internal/mcp/providers/container/provider.go @@ -9,7 +9,7 @@ import ( "strings" mcpgw "github.com/memohai/memoh/internal/mcp" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) const ( @@ -26,13 +26,13 @@ const ( // operate inside the bot container via gRPC. All I/O goes through the container // sandbox — no direct host filesystem access. type Executor struct { - clients mcpclient.Provider + clients bridge.Provider execWorkDir string logger *slog.Logger } // NewExecutor returns a tool executor backed by gRPC container clients. 
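+// Typical wiring (sketch; names are illustrative):
+// NewExecutor(log, workspaceManager, execWorkDir), where the workspace
+// Manager acts as the bridge.Provider.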
-func NewExecutor(log *slog.Logger, clients mcpclient.Provider, execWorkDir string) *Executor { +func NewExecutor(log *slog.Logger, clients bridge.Provider, execWorkDir string) *Executor { if log == nil { log = slog.Default() } @@ -187,7 +187,7 @@ func (p *Executor) CallTool(ctx context.Context, session mcpgw.ToolSessionContex } } -func (p *Executor) callRead(ctx context.Context, client *mcpclient.Client, args map[string]any) (map[string]any, error) { +func (p *Executor) callRead(ctx context.Context, client *bridge.Client, args map[string]any) (map[string]any, error) { filePath := p.normalizePath(mcpgw.StringArg(args, "path")) if filePath == "" { return mcpgw.BuildToolErrorResult("path is required"), nil @@ -233,7 +233,7 @@ func (p *Executor) callRead(ctx context.Context, client *mcpclient.Client, args }), nil } -func (p *Executor) callWrite(ctx context.Context, client *mcpclient.Client, args map[string]any) (map[string]any, error) { +func (p *Executor) callWrite(ctx context.Context, client *bridge.Client, args map[string]any) (map[string]any, error) { filePath := p.normalizePath(mcpgw.StringArg(args, "path")) content := mcpgw.StringArg(args, "content") if filePath == "" { @@ -245,7 +245,7 @@ func (p *Executor) callWrite(ctx context.Context, client *mcpclient.Client, args return mcpgw.BuildToolSuccessResult(map[string]any{"ok": true}), nil } -func (p *Executor) callList(ctx context.Context, client *mcpclient.Client, args map[string]any) (map[string]any, error) { +func (p *Executor) callList(ctx context.Context, client *bridge.Client, args map[string]any) (map[string]any, error) { dirPath := p.normalizePath(mcpgw.StringArg(args, "path")) if dirPath == "" { dirPath = "." @@ -269,7 +269,7 @@ func (p *Executor) callList(ctx context.Context, client *mcpclient.Client, args return mcpgw.BuildToolSuccessResult(map[string]any{"path": dirPath, "entries": entriesMaps}), nil } -func (p *Executor) callEdit(ctx context.Context, client *mcpclient.Client, args map[string]any) (map[string]any, error) { +func (p *Executor) callEdit(ctx context.Context, client *bridge.Client, args map[string]any) (map[string]any, error) { filePath := p.normalizePath(mcpgw.StringArg(args, "path")) oldText := mcpgw.StringArg(args, "old_text") newText := mcpgw.StringArg(args, "new_text") @@ -298,7 +298,7 @@ func (p *Executor) callEdit(ctx context.Context, client *mcpclient.Client, args return mcpgw.BuildToolSuccessResult(map[string]any{"ok": true}), nil } -func (p *Executor) callExec(ctx context.Context, client *mcpclient.Client, botID string, args map[string]any) (map[string]any, error) { +func (p *Executor) callExec(ctx context.Context, client *bridge.Client, botID string, args map[string]any) (map[string]any, error) { command := strings.TrimSpace(mcpgw.StringArg(args, "command")) if command == "" { return mcpgw.BuildToolErrorResult("command is required"), nil diff --git a/internal/mcp/providers/container/provider_test.go b/internal/mcp/providers/container/provider_test.go index f9972f97..d978725a 100644 --- a/internal/mcp/providers/container/provider_test.go +++ b/internal/mcp/providers/container/provider_test.go @@ -13,8 +13,8 @@ import ( "google.golang.org/grpc/test/bufconn" mcpgw "github.com/memohai/memoh/internal/mcp" - "github.com/memohai/memoh/internal/mcp/mcpclient" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + "github.com/memohai/memoh/internal/workspace/bridge" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const bufSize = 1 << 20 @@ -164,8 +164,8 @@ func splitLines(s string) []string { 
return lines } -// testSetup creates a bufconn gRPC server and a matching mcpclient.Provider. -func testSetup(t *testing.T, svc *fakeContainerService) mcpclient.Provider { +// testSetup creates a bufconn gRPC server and a matching bridge.Provider. +func testSetup(t *testing.T, svc *fakeContainerService) bridge.Provider { t.Helper() lis := bufconn.Listen(bufSize) srv := grpc.NewServer() @@ -193,16 +193,16 @@ func testSetup(t *testing.T, svc *fakeContainerService) mcpclient.Provider { } t.Cleanup(func() { _ = conn.Close() }) - client := mcpclient.NewClientFromConn(conn) + client := bridge.NewClientFromConn(conn) return &staticProvider{client: client} } // staticProvider always returns the same client, ignoring botID. type staticProvider struct { - client *mcpclient.Client + client *bridge.Client } -func (p *staticProvider) MCPClient(_ context.Context, _ string) (*mcpclient.Client, error) { +func (p *staticProvider) MCPClient(_ context.Context, _ string) (*bridge.Client, error) { return p.client, nil } @@ -210,7 +210,7 @@ func session() mcpgw.ToolSessionContext { return mcpgw.ToolSessionContext{BotID: "bot-test"} } -func executor(provider mcpclient.Provider) *Executor { +func executor(provider bridge.Provider) *Executor { return NewExecutor(nil, provider, defaultExecWorkDir) } diff --git a/internal/memory/storefs/service.go b/internal/memory/storefs/service.go index d7f0536b..6b5c0107 100644 --- a/internal/memory/storefs/service.go +++ b/internal/memory/storefs/service.go @@ -16,7 +16,7 @@ import ( "gopkg.in/yaml.v3" "github.com/memohai/memoh/internal/config" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) const ( @@ -34,7 +34,7 @@ type scanEntry struct { } type Service struct { - provider mcpclient.Provider + provider bridge.Provider logger *slog.Logger } @@ -59,14 +59,14 @@ type memoryEntryMeta struct { Metadata map[string]any `yaml:"metadata,omitempty"` } -func New(log *slog.Logger, provider mcpclient.Provider) *Service { +func New(log *slog.Logger, provider bridge.Provider) *Service { if log == nil { log = slog.Default() } return &Service{provider: provider, logger: log.With(slog.String("component", "storefs"))} } -func (s *Service) client(ctx context.Context, botID string) (*mcpclient.Client, error) { +func (s *Service) client(ctx context.Context, botID string) (*bridge.Client, error) { if s.provider == nil { return nil, ErrNotConfigured } @@ -665,7 +665,7 @@ func formatMemoryOverviewMD(items []MemoryItem) string { // --- utility helpers --- func isNotFound(err error) bool { - return errors.Is(err, mcpclient.ErrNotFound) + return errors.Is(err, bridge.ErrNotFound) } func toItemMap(items []MemoryItem) map[string]MemoryItem { diff --git a/internal/storage/providers/containerfs/provider.go b/internal/storage/providers/containerfs/provider.go index e6eb120c..b23a70b5 100644 --- a/internal/storage/providers/containerfs/provider.go +++ b/internal/storage/providers/containerfs/provider.go @@ -11,18 +11,18 @@ import ( "path/filepath" "strings" - "github.com/memohai/memoh/internal/mcp/mcpclient" + "github.com/memohai/memoh/internal/workspace/bridge" ) const containerMediaRoot = "media" // Provider stores media assets inside bot containers via gRPC. type Provider struct { - clients mcpclient.Provider + clients bridge.Provider } // New creates a container-based storage provider. 
-func New(clients mcpclient.Provider) *Provider { +func New(clients bridge.Provider) *Provider { return &Provider{clients: clients} } diff --git a/internal/mcp/mcpclient/client.go b/internal/workspace/bridge/client.go similarity index 90% rename from internal/mcp/mcpclient/client.go rename to internal/workspace/bridge/client.go index 9cb27ab9..57462762 100644 --- a/internal/mcp/mcpclient/client.go +++ b/internal/workspace/bridge/client.go @@ -1,8 +1,8 @@ -// Package mcpclient provides a gRPC client for the MCP container service. -// Each bot container runs a gRPC server on port 9090 exposing file and exec -// operations. This client wraps the generated gRPC stubs with connection -// pooling and a simplified API for callers. -package mcpclient +// Package bridge provides a gRPC client for the workspace container bridge service. +// Each bot container runs a gRPC server listening on a Unix domain socket. +// This client wraps the generated gRPC stubs with connection pooling and a +// simplified API for callers. +package bridge import ( "bytes" @@ -17,8 +17,7 @@ import ( "google.golang.org/grpc/connectivity" "google.golang.org/grpc/credentials/insecure" - "github.com/memohai/memoh/internal/config" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const connectingTimeout = 30 * time.Second @@ -41,9 +40,9 @@ func NewClientFromConn(conn *grpc.ClientConn) *Client { } } -// Dial creates a new Client connected to the given container IP. -func Dial(_ context.Context, ip string) (*Client, error) { - target := fmt.Sprintf("%s:%d", ip, config.MCPGRPCPort) +// Dial creates a new Client connected to the given gRPC target. +// For UDS use "unix:///path/to/sock", for TCP use "host:port". +func Dial(_ context.Context, target string) (*Client, error) { conn, err := grpc.NewClient(target, grpc.WithTransportCredentials(insecure.NewCredentials()), ) @@ -347,16 +346,17 @@ type Provider interface { // Pool manages cached gRPC clients keyed by bot ID. type Pool struct { - mu sync.RWMutex - clients map[string]*Client - ipFunc func(botID string) string + mu sync.RWMutex + clients map[string]*Client + dialTargetFunc func(botID string) string } -// NewPool creates a client pool. ipFunc maps bot ID to container IP. -func NewPool(ipFunc func(string) string) *Pool { +// NewPool creates a client pool. dialTargetFunc maps bot ID to a gRPC target +// string (e.g. "unix:///path/sock" or "host:port"). 
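+//
+// Illustrative wiring (dataRoot stands in for the configured data root; the
+// workspace Manager supplies the real mapping):
+//
+//	pool := NewPool(func(botID string) string {
+//		return "unix://" + filepath.Join(dataRoot, "run", botID, "bridge.sock")
+//	})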
+func NewPool(dialTargetFunc func(string) string) *Pool { return &Pool{ - clients: make(map[string]*Client), - ipFunc: ipFunc, + clients: make(map[string]*Client), + dialTargetFunc: dialTargetFunc, } } @@ -383,12 +383,12 @@ func (p *Pool) Get(ctx context.Context, botID string) (*Client, error) { p.mu.RUnlock() } - ip := p.ipFunc(botID) - if ip == "" { - return nil, fmt.Errorf("no IP for bot %s", botID) + target := p.dialTargetFunc(botID) + if target == "" { + return nil, fmt.Errorf("no dial target for bot %s", botID) } - c, err := Dial(ctx, ip) + c, err := Dial(ctx, target) if err != nil { return nil, err } diff --git a/internal/mcp/mcpclient/client_test.go b/internal/workspace/bridge/client_test.go similarity index 97% rename from internal/mcp/mcpclient/client_test.go rename to internal/workspace/bridge/client_test.go index df75c1e5..62106269 100644 --- a/internal/mcp/mcpclient/client_test.go +++ b/internal/workspace/bridge/client_test.go @@ -1,4 +1,4 @@ -package mcpclient +package bridge import ( "context" @@ -13,7 +13,7 @@ import ( "google.golang.org/grpc/status" "google.golang.org/grpc/test/bufconn" - pb "github.com/memohai/memoh/internal/mcp/mcpcontainer" + pb "github.com/memohai/memoh/internal/workspace/bridgepb" ) const testBufSize = 1 << 20 diff --git a/internal/mcp/mcpclient/errors.go b/internal/workspace/bridge/errors.go similarity index 98% rename from internal/mcp/mcpclient/errors.go rename to internal/workspace/bridge/errors.go index f7ba2d42..93918083 100644 --- a/internal/mcp/mcpclient/errors.go +++ b/internal/workspace/bridge/errors.go @@ -1,4 +1,4 @@ -package mcpclient +package bridge import ( "errors" diff --git a/internal/mcp/mcpcontainer/mcpcontainer.pb.go b/internal/workspace/bridgepb/bridge.pb.go similarity index 99% rename from internal/mcp/mcpcontainer/mcpcontainer.pb.go rename to internal/workspace/bridgepb/bridge.pb.go index b88c865f..7e2910c9 100644 --- a/internal/mcp/mcpcontainer/mcpcontainer.pb.go +++ b/internal/workspace/bridgepb/bridge.pb.go @@ -2,9 +2,9 @@ // versions: // protoc-gen-go v1.36.11 // protoc v7.34.0 -// source: internal/mcp/mcpcontainer/mcpcontainer.proto +// source: internal/workspace/bridgepb/bridge.proto -package mcpcontainer +package bridgepb import ( reflect "reflect" diff --git a/internal/mcp/mcpcontainer/mcpcontainer.proto b/internal/workspace/bridgepb/bridge.proto similarity index 95% rename from internal/mcp/mcpcontainer/mcpcontainer.proto rename to internal/workspace/bridgepb/bridge.proto index 0b7be27f..220d1596 100644 --- a/internal/mcp/mcpcontainer/mcpcontainer.proto +++ b/internal/workspace/bridgepb/bridge.proto @@ -1,8 +1,8 @@ syntax = "proto3"; -package mcpcontainer; +package bridgepb; -option go_package = "github.com/memohai/memoh/internal/mcp/mcpcontainer"; +option go_package = "github.com/memohai/memoh/internal/workspace/bridgepb"; service ContainerService { rpc ReadFile(ReadFileRequest) returns (ReadFileResponse); diff --git a/internal/mcp/mcpcontainer/mcpcontainer_grpc.pb.go b/internal/workspace/bridgepb/bridge_grpc.pb.go similarity index 99% rename from internal/mcp/mcpcontainer/mcpcontainer_grpc.pb.go rename to internal/workspace/bridgepb/bridge_grpc.pb.go index 01a7744e..90f39ab2 100644 --- a/internal/mcp/mcpcontainer/mcpcontainer_grpc.pb.go +++ b/internal/workspace/bridgepb/bridge_grpc.pb.go @@ -2,9 +2,9 @@ // versions: // - protoc-gen-go-grpc v1.6.1 // - protoc v7.34.0 -// source: internal/mcp/mcpcontainer/mcpcontainer.proto +// source: internal/workspace/bridgepb/bridge.proto -package mcpcontainer +package bridgepb 
import ( context "context" @@ -451,5 +451,5 @@ var ContainerService_ServiceDesc = grpc.ServiceDesc{ ClientStreams: true, }, }, - Metadata: "internal/mcp/mcpcontainer/mcpcontainer.proto", + Metadata: "internal/workspace/bridgepb/bridge.proto", } diff --git a/internal/mcp/dataio.go b/internal/workspace/dataio.go similarity index 92% rename from internal/mcp/dataio.go rename to internal/workspace/dataio.go index 2546dfbd..27f61c9c 100644 --- a/internal/mcp/dataio.go +++ b/internal/workspace/dataio.go @@ -1,4 +1,4 @@ -package mcp +package workspace import ( "archive/tar" @@ -30,7 +30,7 @@ const ( // The container is stopped during export and restarted afterwards. // Caller must consume the returned reader before the context is cancelled. func (m *Manager) ExportData(ctx context.Context, botID string) (io.ReadCloser, error) { - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -75,7 +75,7 @@ func (m *Manager) ExportData(ctx context.Context, botID string) (io.ReadCloser, // ImportData extracts a tar.gz archive into the container's /data directory. // The container is stopped during import and restarted afterwards. func (m *Manager) ImportData(ctx context.Context, botID string, r io.Reader) error { - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -112,7 +112,7 @@ func (m *Manager) ImportData(ctx context.Context, botID string, r io.Reader) err // mounted snapshot is consistent; the Apple fallback uses gRPC and does not // require a stop. func (m *Manager) PreserveData(ctx context.Context, botID string) error { - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) info, err := m.service.GetContainer(ctx, containerID) if err != nil { @@ -150,7 +150,10 @@ func (m *Manager) PreserveData(ctx context.Context, botID string) error { _ = os.Remove(backupPath) return fmt.Errorf("export data: %w", writeErr) } - return closeErr + if closeErr != nil { + return closeErr + } + return nil } // RestorePreservedData imports preserved data (backup tar.gz or legacy @@ -172,15 +175,13 @@ func (m *Manager) RestorePreservedData(ctx context.Context, botID string) error // Legacy bind-mount directory legacyDir := m.legacyDataDir(botID) - migratedDir := legacyDir + migratedSuffix - if _, err := os.Stat(migratedDir); err == nil { + if _, err := os.Stat(legacyDir + migratedSuffix); err == nil { return nil // already imported previously } info, err := os.Stat(legacyDir) if err != nil || !info.IsDir() { return errors.New("no preserved data found") } - return m.importLegacyDir(ctx, botID, legacyDir) } @@ -201,7 +202,7 @@ func (m *Manager) HasPreservedData(botID string) bool { // importLegacyDir copies a legacy bind-mount directory into the container // via snapshot mount, then renames the source to .migrated. 
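+// The .migrated suffix doubles as an idempotency marker: RestorePreservedData
+// sees it and skips the legacy import on subsequent calls.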
func (m *Manager) importLegacyDir(ctx context.Context, botID, srcDir string) error { - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) info, err := m.service.GetContainer(ctx, containerID) if err != nil { @@ -233,7 +234,7 @@ func (m *Manager) importLegacyDir(ctx context.Context, botID, srcDir string) err } if err := os.Rename(srcDir, srcDir+migratedSuffix); err != nil { - m.logger.Warn("legacy import: rename failed", + m.logger.Warn("legacy import: rename to .migrated failed", slog.String("src", srcDir), slog.Any("error", err)) } return nil @@ -249,7 +250,7 @@ func (m *Manager) recoverOrphanedSnapshot(ctx context.Context, botID string) boo return false } - snapshotKey := m.containerID(botID) + snapshotKey := m.resolveContainerID(ctx, botID) raw, err := m.service.SnapshotMounts(ctx, snapshotter, snapshotKey) if err != nil { return false @@ -269,7 +270,7 @@ func (m *Manager) recoverOrphanedSnapshot(ctx context.Context, botID string) boo f, err := os.Create(backupPath) //nolint:gosec // G304: operator-controlled path if err != nil { - m.logger.Warn("recover orphaned snapshot: create backup failed", + m.logger.Warn("recover orphaned snapshot: create backup file failed", slog.String("bot_id", botID), slog.Any("error", err)) return false } @@ -293,9 +294,6 @@ func (m *Manager) recoverOrphanedSnapshot(ctx context.Context, botID string) boo _ = os.Remove(backupPath) return false } - - m.logger.Info("recovered data from orphaned snapshot", - slog.String("bot_id", botID), slog.String("backup", backupPath)) return true } @@ -310,7 +308,7 @@ func (m *Manager) restorePreservedIntoSnapshot(ctx context.Context, botID string } defer func() { _ = f.Close() }() - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) info, err := m.service.GetContainer(ctx, containerID) if err != nil { return fmt.Errorf("get container: %w", err) @@ -332,8 +330,6 @@ func (m *Manager) restorePreservedIntoSnapshot(ctx context.Context, botID string } _ = os.Remove(bp) - m.logger.Info("restored preserved data into new container", - slog.String("bot_id", botID)) return nil } @@ -372,17 +368,17 @@ func (m *Manager) restartContainer(ctx context.Context, botID, containerID strin slog.String("container_id", containerID), slog.Any("error", err)) return } - netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ + // CNI network setup — outbound connectivity is required for package + // downloads and other network-dependent operations in the container. + if _, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ ContainerID: containerID, CNIBinDir: m.cfg.CNIBinaryDir, CNIConfDir: m.cfg.CNIConfigDir, - }) - if err != nil { - m.logger.Warn("network setup after restart failed", + }); err != nil { + m.logger.Error("network setup after restart failed", slog.String("container_id", containerID), slog.Any("error", err)) return } - m.SetContainerIP(botID, netResult.IP) } func mountedDataDir(root string) string { @@ -560,11 +556,34 @@ func tarGzDir(w io.Writer, dir string) error { if err != nil || rel == "." { return err } - info, err := d.Info() + + if d.IsDir() { + info, err := d.Info() + if err != nil { + return err + } + header, err := tar.FileInfoHeader(info, "") + if err != nil { + return err + } + header.Name = filepath.ToSlash(rel) + return tw.WriteHeader(header) + } + + // For regular files: open first, then Fstat on the same fd so that + // the size in the tar header is guaranteed to match the content we + // read. 
This avoids race conditions and overlayfs size mismatches + // that cause "archive/tar: write too long". + f, err := os.Open(path) //nolint:gosec // G304: iterating operator-controlled data directory if err != nil { return err } + defer func() { _ = f.Close() }() + info, err := f.Stat() + if err != nil { + return err + } header, err := tar.FileInfoHeader(info, "") if err != nil { return err @@ -574,17 +593,7 @@ func tarGzDir(w io.Writer, dir string) error { if err := tw.WriteHeader(header); err != nil { return err } - - if d.IsDir() { - return nil - } - - f, err := os.Open(path) //nolint:gosec // G304: iterating operator-controlled data directory - if err != nil { - return err - } - defer func() { _ = f.Close() }() - _, err = io.Copy(tw, f) + _, err = io.Copy(tw, io.LimitReader(f, info.Size())) return err }) } diff --git a/internal/workspace/identity.go b/internal/workspace/identity.go new file mode 100644 index 00000000..52746b84 --- /dev/null +++ b/internal/workspace/identity.go @@ -0,0 +1,34 @@ +package workspace + +import ( + "strings" + + ctr "github.com/memohai/memoh/internal/containerd" +) + +var knownContainerPrefixes = []string{ContainerPrefix, LegacyContainerPrefix} + +// BotIDFromContainerID infers a bot ID from a known container naming scheme. +// This is only used as a fallback for legacy containers when labels are missing. +func BotIDFromContainerID(containerID string) (string, bool) { + for _, prefix := range knownContainerPrefixes { + if !strings.HasPrefix(containerID, prefix) { + continue + } + botID := strings.TrimPrefix(containerID, prefix) + if botID == "" { + return "", false + } + return botID, true + } + return "", false +} + +// BotIDFromContainerInfo resolves the bot ID from container metadata. +// It prefers the current label and only falls back to name inference. 
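+// For example, an unlabeled legacy container "mcp-bot-456" resolves to
+// "bot-456"; when the memoh.bot_id label is present it wins over the name.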
+func BotIDFromContainerInfo(info ctr.ContainerInfo) (string, bool) { + if botID := strings.TrimSpace(info.Labels[BotLabelKey]); botID != "" { + return botID, true + } + return BotIDFromContainerID(info.ID) +} diff --git a/internal/workspace/identity_test.go b/internal/workspace/identity_test.go new file mode 100644 index 00000000..65328271 --- /dev/null +++ b/internal/workspace/identity_test.go @@ -0,0 +1,52 @@ +package workspace + +import ( + "testing" + "time" + + ctr "github.com/memohai/memoh/internal/containerd" +) + +func TestBotIDFromContainerInfoPrefersCurrentLabel(t *testing.T) { + t.Parallel() + + info := ctr.ContainerInfo{ + ID: "workspace-ignored", + Labels: map[string]string{ + BotLabelKey: "bot-from-label", + }, + UpdatedAt: time.Now(), + } + + botID, ok := BotIDFromContainerInfo(info) + if !ok { + t.Fatal("expected bot ID to resolve") + } + if botID != "bot-from-label" { + t.Fatalf("expected labeled bot ID, got %q", botID) + } +} + +func TestBotIDFromContainerInfoFallsBackToKnownPrefixes(t *testing.T) { + t.Parallel() + + for _, tc := range []struct { + name string + containerID string + want string + }{ + {name: "workspace", containerID: "workspace-bot-123", want: "bot-123"}, + {name: "legacy", containerID: "mcp-bot-456", want: "bot-456"}, + } { + t.Run(tc.name, func(t *testing.T) { + info := ctr.ContainerInfo{ID: tc.containerID} + got, ok := BotIDFromContainerInfo(info) + if !ok { + t.Fatal("expected bot ID to resolve") + } + if got != tc.want { + t.Fatalf("expected %q, got %q", tc.want, got) + } + }) + } +} diff --git a/internal/workspace/image_preference.go b/internal/workspace/image_preference.go new file mode 100644 index 00000000..a298e202 --- /dev/null +++ b/internal/workspace/image_preference.go @@ -0,0 +1,176 @@ +package workspace + +import ( + "context" + "encoding/json" + "errors" + "strings" + + "github.com/jackc/pgx/v5" + + "github.com/memohai/memoh/internal/config" + "github.com/memohai/memoh/internal/db" + dbsqlc "github.com/memohai/memoh/internal/db/sqlc" +) + +const ( + workspaceMetadataKey = "workspace" + workspaceImageMetadataKey = "image" +) + +func decodeBotMetadata(payload []byte) (map[string]any, error) { + if len(payload) == 0 { + return map[string]any{}, nil + } + var data map[string]any + if err := json.Unmarshal(payload, &data); err != nil { + return nil, err + } + if data == nil { + data = map[string]any{} + } + return data, nil +} + +func cloneAnyMap(src map[string]any) map[string]any { + if src == nil { + return map[string]any{} + } + cloned := make(map[string]any, len(src)) + for key, value := range src { + cloned[key] = value + } + return cloned +} + +func workspaceSection(metadata map[string]any) map[string]any { + raw, ok := metadata[workspaceMetadataKey] + if !ok { + return map[string]any{} + } + section, ok := raw.(map[string]any) + if !ok { + return map[string]any{} + } + return cloneAnyMap(section) +} + +func workspaceImageFromMetadata(metadata map[string]any) string { + section := workspaceSection(metadata) + image, _ := section[workspaceImageMetadataKey].(string) + return strings.TrimSpace(image) +} + +func withWorkspaceImagePreference(metadata map[string]any, image string) map[string]any { + next := cloneAnyMap(metadata) + section := workspaceSection(next) + section[workspaceImageMetadataKey] = strings.TrimSpace(image) + next[workspaceMetadataKey] = section + return next +} + +func withoutWorkspaceImagePreference(metadata map[string]any) map[string]any { + next := cloneAnyMap(metadata) + section := workspaceSection(next) + 
delete(section, workspaceImageMetadataKey) + if len(section) == 0 { + delete(next, workspaceMetadataKey) + return next + } + next[workspaceMetadataKey] = section + return next +} + +func (m *Manager) botWorkspaceImagePreference(ctx context.Context, botID string) (string, error) { + if m.queries == nil { + return "", nil + } + botUUID, err := db.ParseUUID(botID) + if err != nil { + return "", err + } + row, err := m.queries.GetBotByID(ctx, botUUID) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return "", nil + } + return "", err + } + metadata, err := decodeBotMetadata(row.Metadata) + if err != nil { + return "", err + } + return workspaceImageFromMetadata(metadata), nil +} + +func (m *Manager) updateBotWorkspaceImagePreference(ctx context.Context, botID, image string, clearPreference bool) error { + if m.queries == nil { + return nil + } + botUUID, err := db.ParseUUID(botID) + if err != nil { + return err + } + row, err := m.queries.GetBotByID(ctx, botUUID) + if err != nil { + return err + } + metadata, err := decodeBotMetadata(row.Metadata) + if err != nil { + return err + } + if clearPreference { + metadata = withoutWorkspaceImagePreference(metadata) + } else { + metadata = withWorkspaceImagePreference(metadata, image) + } + payload, err := json.Marshal(metadata) + if err != nil { + return err + } + _, err = m.queries.UpdateBotProfile(ctx, dbsqlc.UpdateBotProfileParams{ + ID: botUUID, + DisplayName: row.DisplayName, + AvatarUrl: row.AvatarUrl, + IsActive: row.IsActive, + Metadata: payload, + }) + return err +} + +func (m *Manager) RememberWorkspaceImage(ctx context.Context, botID, image string) error { + return m.updateBotWorkspaceImagePreference(ctx, botID, config.NormalizeImageRef(image), false) +} + +func (m *Manager) ClearWorkspaceImagePreference(ctx context.Context, botID string) error { + return m.updateBotWorkspaceImagePreference(ctx, botID, "", true) +} + +func (m *Manager) ResolveWorkspaceImage(ctx context.Context, botID string) (string, error) { + return m.resolveWorkspaceImage(ctx, botID) +} + +func (m *Manager) resolveWorkspaceImage(ctx context.Context, botID string) (string, error) { + if m.queries != nil { + pgBotID, err := db.ParseUUID(botID) + if err == nil { + row, dbErr := m.queries.GetContainerByBotID(ctx, pgBotID) + if dbErr == nil && strings.TrimSpace(row.Image) != "" { + return config.NormalizeImageRef(strings.TrimSpace(row.Image)), nil + } + if dbErr != nil && !errors.Is(dbErr, pgx.ErrNoRows) { + return "", dbErr + } + } + } + + preferredImage, err := m.botWorkspaceImagePreference(ctx, botID) + if err != nil { + return "", err + } + if preferredImage != "" { + return config.NormalizeImageRef(preferredImage), nil + } + + return m.imageRef(), nil +} diff --git a/internal/workspace/image_preference_test.go b/internal/workspace/image_preference_test.go new file mode 100644 index 00000000..dcbe68fc --- /dev/null +++ b/internal/workspace/image_preference_test.go @@ -0,0 +1,53 @@ +package workspace + +import "testing" + +func TestWorkspaceImageMetadataRoundTrip(t *testing.T) { + t.Parallel() + + metadata := map[string]any{ + "name": "test", + workspaceMetadataKey: map[string]any{ + "keep": "value", + }, + } + + updated := withWorkspaceImagePreference(metadata, "alpine:3.20") + + if got := workspaceImageFromMetadata(updated); got != "alpine:3.20" { + t.Fatalf("expected image preference to round-trip, got %q", got) + } + workspace, ok := updated[workspaceMetadataKey].(map[string]any) + if !ok { + t.Fatal("expected workspace metadata section") + } + if 
workspace["keep"] != "value" { + t.Fatalf("expected existing workspace metadata to be preserved, got %#v", workspace) + } + if _, exists := metadata[workspaceMetadataKey].(map[string]any)[workspaceImageMetadataKey]; exists { + t.Fatal("expected original metadata map to remain unchanged") + } +} + +func TestWithoutWorkspaceImagePreferenceRemovesOnlyImageKey(t *testing.T) { + t.Parallel() + + metadata := map[string]any{ + workspaceMetadataKey: map[string]any{ + workspaceImageMetadataKey: "debian:bookworm-slim", + "keep": true, + }, + } + + updated := withoutWorkspaceImagePreference(metadata) + if got := workspaceImageFromMetadata(updated); got != "" { + t.Fatalf("expected image preference to be cleared, got %q", got) + } + workspace, ok := updated[workspaceMetadataKey].(map[string]any) + if !ok { + t.Fatal("expected workspace metadata section to remain") + } + if workspace["keep"] != true { + t.Fatalf("expected unrelated workspace metadata to remain, got %#v", workspace) + } +} diff --git a/internal/workspace/manager.go b/internal/workspace/manager.go new file mode 100644 index 00000000..7315e75e --- /dev/null +++ b/internal/workspace/manager.go @@ -0,0 +1,435 @@ +package workspace + +import ( + "context" + "errors" + "fmt" + "log/slog" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/containerd/errdefs" + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/memohai/memoh/internal/config" + ctr "github.com/memohai/memoh/internal/containerd" + dbsqlc "github.com/memohai/memoh/internal/db/sqlc" + "github.com/memohai/memoh/internal/identity" + "github.com/memohai/memoh/internal/workspace/bridge" +) + +const ( + BotLabelKey = "memoh.bot_id" + WorkspaceLabelKey = "memoh.workspace" + WorkspaceLabelValue = "v3" + ContainerPrefix = "workspace-" + LegacyContainerPrefix = "mcp-" + + legacyGRPCPort = 9090 +) + +// ErrContainerNotFound is returned when no container exists for a bot. +var ErrContainerNotFound = errors.New("container not found for bot") + +// ContainerStatus combines DB records with live containerd state. +type ContainerStatus struct { + ContainerID string `json:"container_id"` + Image string `json:"image"` + Status string `json:"status"` + Namespace string `json:"namespace"` + ContainerPath string `json:"container_path"` + TaskRunning bool `json:"task_running"` + HasPreservedData bool `json:"has_preserved_data"` + Legacy bool `json:"legacy"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type Manager struct { + service ctr.Service + cfg config.WorkspaceConfig + namespace string + db *pgxpool.Pool + queries *dbsqlc.Queries + logger *slog.Logger + containerLockMu sync.Mutex + containerLocks map[string]*sync.Mutex + grpcPool *bridge.Pool + legacyMu sync.RWMutex + legacyIPs map[string]string // botID → IP for pre-bridge containers +} + +func NewManager(log *slog.Logger, service ctr.Service, cfg config.WorkspaceConfig, namespace string, conn *pgxpool.Pool) *Manager { + if namespace == "" { + namespace = config.DefaultNamespace + } + m := &Manager{ + service: service, + cfg: cfg, + namespace: namespace, + db: conn, + queries: dbsqlc.New(conn), + logger: log.With(slog.String("component", "workspace")), + containerLocks: make(map[string]*sync.Mutex), + legacyIPs: make(map[string]string), + } + m.grpcPool = bridge.NewPool(m.dialTarget) + return m +} + +// resolveContainerID resolves the actual containerd container ID for a bot. +// This is the SINGLE point of container ID resolution for all lookup operations. 
+// It delegates to ContainerID (DB → label → scan) and falls back to the +// new-style prefix if no container exists yet. +func (m *Manager) resolveContainerID(ctx context.Context, botID string) string { + id, err := m.ContainerID(ctx, botID) + if err != nil { + return ContainerPrefix + botID + } + return id +} + +func (m *Manager) lockContainer(containerID string) func() { + m.containerLockMu.Lock() + lock, ok := m.containerLocks[containerID] + if !ok { + lock = &sync.Mutex{} + m.containerLocks[containerID] = lock + } + m.containerLockMu.Unlock() + + lock.Lock() + return lock.Unlock +} + +// socketDir returns the host-side directory that is bind-mounted into the +// container at /run/memoh, holding the UDS socket file. +func (m *Manager) socketDir(botID string) string { + return filepath.Join(m.dataRoot(), "run", botID) +} + +// socketPath returns the path to the UDS socket file for a bot's container. +func (m *Manager) socketPath(botID string) string { + return filepath.Join(m.socketDir(botID), "bridge.sock") +} + +// dialTarget returns the gRPC dial target for a bot. Legacy containers +// (pre-bridge) are reached via TCP; bridge containers use UDS. +func (m *Manager) dialTarget(botID string) string { + m.legacyMu.RLock() + ip, legacy := m.legacyIPs[botID] + m.legacyMu.RUnlock() + if legacy { + return fmt.Sprintf("%s:%d", ip, legacyGRPCPort) + } + return "unix://" + m.socketPath(botID) +} + +// SetLegacyIP records the IP address of a legacy (pre-bridge) container +// so the gRPC pool can reach it via TCP. +func (m *Manager) SetLegacyIP(botID, ip string) { + m.legacyMu.Lock() + m.legacyIPs[botID] = ip + m.legacyMu.Unlock() +} + +// ClearLegacyIP removes a cached legacy IP (e.g. when the container is deleted). +func (m *Manager) ClearLegacyIP(botID string) { + m.legacyMu.Lock() + delete(m.legacyIPs, botID) + m.legacyMu.Unlock() +} + +// clearLegacyRoute evicts any stale TCP fallback state for a bot so future +// gRPC dials use the bridge container's Unix socket. +func (m *Manager) clearLegacyRoute(botID string) { + m.ClearLegacyIP(botID) + m.grpcPool.Remove(botID) +} + +// MCPClient returns a gRPC client for the given bot's container. +// Implements bridge.Provider. +func (m *Manager) MCPClient(ctx context.Context, botID string) (*bridge.Client, error) { + return m.grpcPool.Get(ctx, botID) +} + +func (m *Manager) Init(ctx context.Context) error { + image := m.imageRef() + + // Pre-pull the default base image so container creation doesn't block + // on a network download. If the image is already present, this is a no-op. + if _, err := m.service.GetImage(ctx, image); err != nil { + m.logger.Info("pulling base image for workspace containers", slog.String("image", image)) + if _, pullErr := m.service.PullImage(ctx, image, &ctr.PullImageOptions{ + Unpack: true, + Snapshotter: m.cfg.Snapshotter, + }); pullErr != nil { + m.logger.Warn("base image pull failed", slog.String("image", image), slog.Any("error", pullErr)) + return pullErr + } + } + return nil +} + +// EnsureBot creates the workspace container for a bot if it does not exist. +// Bot data lives in the container's writable layer (snapshot), not bind mounts. +// The Memoh runtime (bridge binary + toolkit) is injected via read-only bind mount. +// If imageOverride is non-empty, it is used instead of the configured default. 
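+// Note that EnsureBot itself never consults the bot's stored image
+// preference; callers that want it honored go through Start or
+// SetupBotContainer, which resolve via resolveWorkspaceImage.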
+func (m *Manager) EnsureBot(ctx context.Context, botID, imageOverride string) error {
+	image := m.imageRef()
+	if imageOverride != "" {
+		image = config.NormalizeImageRef(imageOverride)
+	}
+	return m.ensureBotWithImage(ctx, botID, image)
+}
+
+func (m *Manager) ensureBotWithImage(ctx context.Context, botID, image string) error {
+	if err := validateBotID(botID); err != nil {
+		return err
+	}
+
+	resolvPath, err := ctr.ResolveConfSource(m.dataRoot())
+	if err != nil {
+		return err
+	}
+
+	runtimeDir := m.cfg.RuntimePath()
+	sockDir := m.socketDir(botID)
+	if err := os.MkdirAll(sockDir, 0o750); err != nil {
+		return fmt.Errorf("create socket dir: %w", err)
+	}
+
+	mounts := []ctr.MountSpec{
+		{
+			Destination: "/etc/resolv.conf",
+			Type:        "bind",
+			Source:      resolvPath,
+			Options:     []string{"rbind", "ro"},
+		},
+		{
+			Destination: "/opt/memoh",
+			Type:        "bind",
+			Source:      runtimeDir,
+			Options:     []string{"rbind", "ro"},
+		},
+		{
+			Destination: "/run/memoh",
+			Type:        "bind",
+			Source:      sockDir,
+			Options:     []string{"rbind", "rw"},
+		},
+	}
+	tzMounts, tzEnv := ctr.TimezoneSpec()
+	mounts = append(mounts, tzMounts...)
+
+	env := make([]string, 0, len(tzEnv)+1)
+	env = append(env, tzEnv...)
+	env = append(env, "BRIDGE_SOCKET_PATH=/run/memoh/bridge.sock")
+
+	_, err = m.service.CreateContainer(ctx, ctr.CreateContainerRequest{
+		ID:          ContainerPrefix + botID,
+		ImageRef:    image,
+		Snapshotter: m.cfg.Snapshotter,
+		Labels: map[string]string{
+			BotLabelKey:       botID,
+			WorkspaceLabelKey: WorkspaceLabelValue,
+		},
+		Spec: ctr.ContainerSpec{
+			Cmd:    []string{"/opt/memoh/bridge"},
+			Mounts: mounts,
+			Env:    env,
+		},
+	})
+	if err == nil {
+		return nil
+	}
+
+	if !errdefs.IsAlreadyExists(err) {
+		return err
+	}
+
+	return nil
+}
+
+// ListBots returns the bot IDs that have workspace containers.
+func (m *Manager) ListBots(ctx context.Context) ([]string, error) {
+	containers, err := m.service.ListContainers(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	botIDs := make([]string, 0, len(containers))
+	for _, info := range containers {
+		if botID, ok := BotIDFromContainerInfo(info); ok {
+			botIDs = append(botIDs, botID)
+		}
+	}
+	return botIDs, nil
+}
+
+func (m *Manager) Start(ctx context.Context, botID string) error {
+	image, err := m.resolveWorkspaceImage(ctx, botID)
+	if err != nil {
+		return err
+	}
+	return m.startWithResolvedImage(ctx, botID, image)
+}
+
+// StartWithImage creates and starts the workspace container for a bot.
+// If imageOverride is non-empty, it is used as the base image instead of the
+// configured default. The override only applies when creating a new container.
+func (m *Manager) StartWithImage(ctx context.Context, botID, imageOverride string) error {
+	image := strings.TrimSpace(imageOverride)
+	if image == "" {
+		return m.Start(ctx, botID)
+	}
+	return m.startWithResolvedImage(ctx, botID, config.NormalizeImageRef(image))
+}
+
+// StartWithResolvedImage creates and starts the workspace container for a bot
+// using an explicit image reference.
+func (m *Manager) StartWithResolvedImage(ctx context.Context, botID, image string) error {
+	image = strings.TrimSpace(image)
+	if image == "" {
+		return errors.New("image is required")
+	}
+	return m.startWithResolvedImage(ctx, botID, image)
+}
+
+func (m *Manager) startWithResolvedImage(ctx context.Context, botID, image string) error {
+	containerID := m.resolveContainerID(ctx, botID)
+
+	// Before creating a new container, check for an orphaned snapshot
+	// (container deleted but snapshot with /data survived). Export /data
+	// to a backup so it can be restored after EnsureBot creates a fresh
+	// container. This covers dev image rebuilds, containerd metadata loss,
+	// and manual container deletion.
+	if _, err := m.service.GetContainer(ctx, containerID); errdefs.IsNotFound(err) {
+		m.recoverOrphanedSnapshot(ctx, botID)
+	}
+
+	if err := m.ensureBotWithImage(ctx, botID, image); err != nil {
+		return err
+	}
+
+	// Restore preserved data (from orphaned snapshot recovery or a previous
+	// CleanupBotContainer with preserveData) into the fresh snapshot before
+	// starting the task, avoiding a redundant stop/start cycle.
+	if m.HasPreservedData(botID) {
+		if err := m.restorePreservedIntoSnapshot(ctx, botID); err != nil {
+			return fmt.Errorf("restore preserved data: %w", err)
+		}
+	}
+
+	if err := m.service.StartContainer(ctx, containerID, nil); err != nil {
+		return err
+	}
+
+	// CNI network setup (for outbound connectivity — container processes
+	// may need to download packages). Server communicates via UDS, not IP.
+	if _, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
+		ContainerID: containerID,
+		CNIBinDir:   m.cfg.CNIBinaryDir,
+		CNIConfDir:  m.cfg.CNIConfigDir,
+	}); err != nil {
+		if stopErr := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{Force: true}); stopErr != nil {
+			m.logger.Warn("cleanup: stop task failed", slog.String("container_id", containerID), slog.Any("error", stopErr))
+		}
+		return err
+	}
+	if !m.IsLegacyContainer(ctx, containerID) {
+		m.clearLegacyRoute(botID)
+	}
+	return nil
+}
+
+func (m *Manager) Stop(ctx context.Context, botID string, timeout time.Duration) error {
+	if err := validateBotID(botID); err != nil {
+		return err
+	}
+	return m.service.StopContainer(ctx, m.resolveContainerID(ctx, botID), &ctr.StopTaskOptions{
+		Timeout: timeout,
+		Force:   true,
+	})
+}
+
+func (m *Manager) Delete(ctx context.Context, botID string, preserveData bool) error {
+	if err := validateBotID(botID); err != nil {
+		return err
+	}
+
+	containerID := m.resolveContainerID(ctx, botID)
+
+	stoppedForPreserve := false
+
+	if preserveData {
+		info, err := m.service.GetContainer(ctx, containerID)
+		if err != nil {
+			return fmt.Errorf("get container for preserve: %w", err)
+		}
+
+		if _, err := m.snapshotMounts(ctx, info); errors.Is(err, errMountNotSupported) {
+			// Apple backend fallback uses gRPC against a running container.
+		} else if err != nil {
+			return err
+		} else {
+			if err := m.safeStopTask(ctx, containerID); err != nil {
+				return fmt.Errorf("stop for data preserve: %w", err)
+			}
+			stoppedForPreserve = true
+		}
+
+		if err := m.PreserveData(ctx, botID); err != nil {
+			// Export failed — restart only if we stopped the task, and abort
+			// deletion to prevent data loss.
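+			// The restart below is best-effort; the error is still returned so
+			// the caller knows the container and its data were left in place.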
+ if stoppedForPreserve { + m.restartContainer(ctx, botID, containerID) + } + return fmt.Errorf("preserve data: %w", err) + } + } + + m.clearLegacyRoute(botID) + + if err := m.service.RemoveNetwork(ctx, ctr.NetworkSetupRequest{ + ContainerID: containerID, + CNIBinDir: m.cfg.CNIBinaryDir, + CNIConfDir: m.cfg.CNIConfigDir, + }); err != nil { + m.logger.Warn("delete: remove network failed", + slog.String("container_id", containerID), slog.Any("error", err)) + } + if err := m.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil { + m.logger.Warn("delete: delete task failed", + slog.String("container_id", containerID), slog.Any("error", err)) + } + return m.service.DeleteContainer(ctx, containerID, &ctr.DeleteContainerOptions{ + CleanupSnapshot: true, + }) +} + +func (m *Manager) dataRoot() string { + if m.cfg.DataRoot == "" { + return config.DefaultDataRoot + } + return m.cfg.DataRoot +} + +func (m *Manager) imageRef() string { + return m.cfg.ImageRef() +} + +// IsLegacyContainer returns true if the container was created before the +// bridge runtime injection architecture (uses the legacy "mcp-" prefix). +// Legacy containers are functional but unreachable from the server (they +// use TCP gRPC instead of UDS). Users should delete and recreate them. +func (*Manager) IsLegacyContainer(_ context.Context, containerID string) bool { + return strings.HasPrefix(containerID, LegacyContainerPrefix) +} + +func validateBotID(botID string) error { + return identity.ValidateChannelIdentityID(botID) +} diff --git a/internal/workspace/manager_legacy_test.go b/internal/workspace/manager_legacy_test.go new file mode 100644 index 00000000..ac921dc4 --- /dev/null +++ b/internal/workspace/manager_legacy_test.go @@ -0,0 +1,292 @@ +package workspace + +import ( + "context" + "log/slog" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/containerd/errdefs" + + "github.com/memohai/memoh/internal/config" + ctr "github.com/memohai/memoh/internal/containerd" + "github.com/memohai/memoh/internal/workspace/bridge" +) + +type legacyRouteTestService struct { + container ctr.ContainerInfo + created bool + byLabel []ctr.ContainerInfo + + createCalls int + startCalls int + deleteCalls int + removeNet int + deleteTask int + setupNet int + + getContainerBeforeCreateErr error + setupNetworkResults []ctr.NetworkResult + setupNetworkErrs []error +} + +func (*legacyRouteTestService) PullImage(context.Context, string, *ctr.PullImageOptions) (ctr.ImageInfo, error) { + return ctr.ImageInfo{}, nil +} + +func (*legacyRouteTestService) GetImage(context.Context, string) (ctr.ImageInfo, error) { + return ctr.ImageInfo{}, nil +} + +func (*legacyRouteTestService) ListImages(context.Context) ([]ctr.ImageInfo, error) { + return nil, nil +} + +func (*legacyRouteTestService) DeleteImage(context.Context, string, *ctr.DeleteImageOptions) error { + return nil +} + +func (*legacyRouteTestService) ResolveRemoteDigest(context.Context, string) (string, error) { + return "", nil +} + +func (s *legacyRouteTestService) CreateContainer(_ context.Context, req ctr.CreateContainerRequest) (ctr.ContainerInfo, error) { + s.createCalls++ + s.created = true + s.container = ctr.ContainerInfo{ + ID: req.ID, + Image: req.ImageRef, + Labels: req.Labels, + Snapshotter: req.Snapshotter, + SnapshotKey: req.ID, + } + return s.container, nil +} + +func (s *legacyRouteTestService) GetContainer(context.Context, string) (ctr.ContainerInfo, error) { + if !s.created { + if s.getContainerBeforeCreateErr != nil { + return 
ctr.ContainerInfo{}, s.getContainerBeforeCreateErr + } + return ctr.ContainerInfo{}, errdefs.ErrNotFound + } + return s.container, nil +} + +func (s *legacyRouteTestService) ListContainers(context.Context) ([]ctr.ContainerInfo, error) { + if !s.created { + return nil, nil + } + return []ctr.ContainerInfo{s.container}, nil +} + +func (s *legacyRouteTestService) DeleteContainer(context.Context, string, *ctr.DeleteContainerOptions) error { + s.deleteCalls++ + s.created = false + return nil +} + +func (s *legacyRouteTestService) ListContainersByLabel(context.Context, string, string) ([]ctr.ContainerInfo, error) { + return s.byLabel, nil +} + +func (s *legacyRouteTestService) StartContainer(context.Context, string, *ctr.StartTaskOptions) error { + s.startCalls++ + return nil +} + +func (*legacyRouteTestService) StopContainer(context.Context, string, *ctr.StopTaskOptions) error { + return nil +} + +func (s *legacyRouteTestService) DeleteTask(context.Context, string, *ctr.DeleteTaskOptions) error { + s.deleteTask++ + return nil +} + +func (*legacyRouteTestService) GetTaskInfo(context.Context, string) (ctr.TaskInfo, error) { + return ctr.TaskInfo{}, errdefs.ErrNotFound +} + +func (*legacyRouteTestService) ListTasks(context.Context, *ctr.ListTasksOptions) ([]ctr.TaskInfo, error) { + return nil, nil +} + +func (s *legacyRouteTestService) SetupNetwork(context.Context, ctr.NetworkSetupRequest) (ctr.NetworkResult, error) { + idx := s.setupNet + s.setupNet++ + if idx < len(s.setupNetworkErrs) && s.setupNetworkErrs[idx] != nil { + return ctr.NetworkResult{}, s.setupNetworkErrs[idx] + } + if idx < len(s.setupNetworkResults) { + return s.setupNetworkResults[idx], nil + } + return ctr.NetworkResult{IP: "10.0.0.2"}, nil +} + +func (s *legacyRouteTestService) RemoveNetwork(context.Context, ctr.NetworkSetupRequest) error { + s.removeNet++ + return nil +} + +func (*legacyRouteTestService) CommitSnapshot(context.Context, string, string, string) error { + return nil +} + +func (*legacyRouteTestService) ListSnapshots(context.Context, string) ([]ctr.SnapshotInfo, error) { + return nil, nil +} + +func (*legacyRouteTestService) PrepareSnapshot(context.Context, string, string, string) error { + return nil +} + +func (*legacyRouteTestService) CreateContainerFromSnapshot(context.Context, ctr.CreateContainerRequest) (ctr.ContainerInfo, error) { + return ctr.ContainerInfo{}, nil +} + +func (*legacyRouteTestService) SnapshotMounts(context.Context, string, string) ([]ctr.MountInfo, error) { + return nil, ctr.ErrNotSupported +} + +func newLegacyRouteTestManager(t *testing.T, svc ctr.Service, cfg config.WorkspaceConfig) *Manager { + t.Helper() + logger := slog.New(slog.DiscardHandler) + m := &Manager{ + service: svc, + cfg: cfg, + namespace: config.DefaultNamespace, + containerLocks: make(map[string]*sync.Mutex), + legacyIPs: make(map[string]string), + logger: logger, + } + m.grpcPool = bridge.NewPool(m.dialTarget) + return m +} + +func TestStartWithImageClearsLegacyRouteForBridgeContainer(t *testing.T) { + dataRoot := t.TempDir() + runtimeDir := filepath.Join(dataRoot, "runtime") + if err := os.MkdirAll(runtimeDir, 0o750); err != nil { + t.Fatalf("mkdir runtime dir: %v", err) + } + + svc := &legacyRouteTestService{} + m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{ + DataRoot: dataRoot, + RuntimeDir: runtimeDir, + Snapshotter: "overlayfs", + CNIBinaryDir: "/opt/cni/bin", + CNIConfigDir: "/etc/cni/net.d", + }) + + botID := "00000000-0000-0000-0000-000000000001" + m.SetLegacyIP(botID, "10.0.0.9") + + if got := 
m.dialTarget(botID); got != "10.0.0.9:9090" { + t.Fatalf("expected legacy dial target before start, got %q", got) + } + + if err := m.StartWithImage(context.Background(), botID, ""); err != nil { + t.Fatalf("StartWithImage failed: %v", err) + } + + if got := m.dialTarget(botID); got != "unix://"+filepath.Join(dataRoot, "run", botID, "bridge.sock") { + t.Fatalf("expected unix dial target after bridge start, got %q", got) + } + if svc.createCalls != 1 || svc.startCalls != 1 { + t.Fatalf("expected create/start once, got create=%d start=%d", svc.createCalls, svc.startCalls) + } +} + +func TestDeleteClearsLegacyRoute(t *testing.T) { + svc := &legacyRouteTestService{created: true, container: ctr.ContainerInfo{ID: "workspace-00000000-0000-0000-0000-000000000001"}} + m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{ + DataRoot: t.TempDir(), + Snapshotter: "overlayfs", + CNIBinaryDir: "/opt/cni/bin", + CNIConfigDir: "/etc/cni/net.d", + }) + + botID := "00000000-0000-0000-0000-000000000001" + m.SetLegacyIP(botID, "10.0.0.9") + + if err := m.Delete(context.Background(), botID, false); err != nil { + t.Fatalf("Delete failed: %v", err) + } + + if got := m.dialTarget(botID); got == "10.0.0.9:9090" { + t.Fatalf("expected legacy TCP target to be cleared, got %q", got) + } + if svc.removeNet != 1 || svc.deleteTask != 1 || svc.deleteCalls != 1 { + t.Fatalf("expected delete cleanup once, got removeNet=%d deleteTask=%d delete=%d", svc.removeNet, svc.deleteTask, svc.deleteCalls) + } +} + +func TestSetupNetworkAndGetIPRejectsEmptyIP(t *testing.T) { + svc := &legacyRouteTestService{ + setupNetworkResults: []ctr.NetworkResult{{IP: ""}, {IP: "10.0.0.3"}}, + } + m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{ + CNIBinaryDir: "/opt/cni/bin", + CNIConfigDir: "/etc/cni/net.d", + }) + + ip, err := m.setupNetworkAndGetIP(context.Background(), "workspace-bot") + if err != nil { + t.Fatalf("setupNetworkAndGetIP failed: %v", err) + } + if ip != "10.0.0.3" { + t.Fatalf("expected retry IP, got %q", ip) + } + if svc.setupNet != 2 { + t.Fatalf("expected two network setup attempts, got %d", svc.setupNet) + } +} + +func TestContainerIDPrefersCurrentLabelSearch(t *testing.T) { + t.Parallel() + + botID := "00000000-0000-0000-0000-000000000001" + svc := &legacyRouteTestService{ + byLabel: []ctr.ContainerInfo{{ + ID: "workspace-from-label", + Labels: map[string]string{BotLabelKey: botID}, + UpdatedAt: time.Now(), + }}, + } + m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{}) + + containerID, err := m.ContainerID(context.Background(), botID) + if err != nil { + t.Fatalf("ContainerID failed: %v", err) + } + if containerID != "workspace-from-label" { + t.Fatalf("expected label-resolved container ID, got %q", containerID) + } +} + +func TestContainerIDFallsBackToNameInference(t *testing.T) { + t.Parallel() + + botID := "00000000-0000-0000-0000-000000000001" + svc := &legacyRouteTestService{ + created: true, + container: ctr.ContainerInfo{ + ID: ContainerPrefix + botID, + UpdatedAt: time.Now(), + }, + } + m := newLegacyRouteTestManager(t, svc, config.WorkspaceConfig{}) + + containerID, err := m.ContainerID(context.Background(), botID) + if err != nil { + t.Fatalf("ContainerID failed: %v", err) + } + if containerID != ContainerPrefix+botID { + t.Fatalf("expected inferred container ID, got %q", containerID) + } +} diff --git a/internal/workspace/manager_lifecycle.go b/internal/workspace/manager_lifecycle.go new file mode 100644 index 00000000..7f8e56a3 --- /dev/null +++ 
b/internal/workspace/manager_lifecycle.go @@ -0,0 +1,515 @@ +package workspace + +import ( + "context" + "errors" + "fmt" + "log/slog" + "strings" + "time" + + "github.com/containerd/errdefs" + "github.com/google/uuid" + "github.com/jackc/pgx/v5" + + ctr "github.com/memohai/memoh/internal/containerd" + "github.com/memohai/memoh/internal/db" + dbsqlc "github.com/memohai/memoh/internal/db/sqlc" +) + +// --------------------------------------------------------------------------- +// Container ID resolution +// --------------------------------------------------------------------------- + +// ContainerID resolves the containerd container ID for a bot. +// Resolution order: DB lookup → label search → full container scan. +func (m *Manager) ContainerID(ctx context.Context, botID string) (string, error) { + if m.queries != nil { + pgBotID, err := db.ParseUUID(botID) + if err == nil { + row, dbErr := m.queries.GetContainerByBotID(ctx, pgBotID) + if dbErr == nil && strings.TrimSpace(row.ContainerID) != "" { + return row.ContainerID, nil + } + if dbErr != nil && !errors.Is(dbErr, pgx.ErrNoRows) { + m.logger.Warn("ContainerID: db lookup failed", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } + } + } + + containers, err := m.service.ListContainersByLabel(ctx, BotLabelKey, botID) + if err != nil { + return "", err + } + if id, ok := newestContainerID(containers); ok { + return id, nil + } + + containers, err = m.service.ListContainers(ctx) + if err != nil { + return "", err + } + matched := make([]ctr.ContainerInfo, 0, len(containers)) + for _, info := range containers { + resolvedBotID, ok := BotIDFromContainerInfo(info) + if !ok || resolvedBotID != botID { + continue + } + matched = append(matched, info) + } + if id, ok := newestContainerID(matched); ok { + return id, nil + } + + return "", ErrContainerNotFound +} + +func newestContainerID(containers []ctr.ContainerInfo) (string, bool) { + bestID := "" + var bestUpdated time.Time + for _, info := range containers { + if bestID == "" || info.UpdatedAt.After(bestUpdated) { + bestID = info.ID + bestUpdated = info.UpdatedAt + } + } + return bestID, bestID != "" +} + +// --------------------------------------------------------------------------- +// Task & network helpers +// --------------------------------------------------------------------------- + +func (m *Manager) isTaskRunning(ctx context.Context, containerID string) bool { + tasks, err := m.service.ListTasks(ctx, &ctr.ListTasksOptions{ + Filter: "container.id==" + containerID, + }) + return err == nil && len(tasks) > 0 && tasks[0].Status == ctr.TaskStatusRunning +} + +func (m *Manager) setupNetworkAndGetIP(ctx context.Context, containerID string) (string, error) { + var lastErr error + for attempt := range 2 { + result, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ + ContainerID: containerID, + CNIBinDir: m.cfg.CNIBinaryDir, + CNIConfDir: m.cfg.CNIConfigDir, + }) + if err != nil { + lastErr = err + m.logger.Warn("network setup attempt failed", + slog.String("container_id", containerID), + slog.Int("attempt", attempt+1), + slog.Any("error", err)) + continue + } + if strings.TrimSpace(result.IP) == "" { + lastErr = fmt.Errorf("network setup returned no IP for %s", containerID) + continue + } + return result.IP, nil + } + return "", fmt.Errorf("network setup failed for container %s: %w", containerID, lastErr) +} + +func (m *Manager) setupNetworkOrFail(ctx context.Context, containerID, botID string) error { + ip, err := m.setupNetworkAndGetIP(ctx, containerID) + if err != 
nil { + return err + } + // Legacy containers use TCP gRPC — cache their IP for the pool. + if m.IsLegacyContainer(ctx, containerID) { + m.SetLegacyIP(botID, ip) + } + return nil +} + +// --------------------------------------------------------------------------- +// Lifecycle: ensure / stop / info +// --------------------------------------------------------------------------- + +// EnsureRunning verifies the container exists and its task is running. +// If the container is missing, it rebuilds via SetupBotContainer. +// If the task is stopped, it restarts and sets up networking. +func (m *Manager) EnsureRunning(ctx context.Context, botID string) error { + containerID, err := m.ContainerID(ctx, botID) + if err != nil { + if errors.Is(err, ErrContainerNotFound) { + m.logger.Warn("container missing, rebuilding", slog.String("bot_id", botID)) + return m.SetupBotContainer(ctx, botID) + } + return err + } + + _, err = m.service.GetContainer(ctx, containerID) + if err != nil { + if !errdefs.IsNotFound(err) { + return err + } + m.logger.Warn("container missing in containerd, rebuilding", + slog.String("bot_id", botID), slog.String("container_id", containerID)) + return m.SetupBotContainer(ctx, botID) + } + + tasks, err := m.service.ListTasks(ctx, &ctr.ListTasksOptions{ + Filter: "container.id==" + containerID, + }) + if err != nil { + return err + } + if len(tasks) > 0 { + if tasks[0].Status == ctr.TaskStatusRunning { + return m.setupNetworkOrFail(ctx, containerID, botID) + } + if err := m.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil { + if !errdefs.IsNotFound(err) { + m.logger.Warn("cleanup: delete task failed", + slog.String("container_id", containerID), slog.Any("error", err)) + return err + } + } + } + + if err := m.service.StartContainer(ctx, containerID, nil); err != nil { + return err + } + return m.setupNetworkOrFail(ctx, containerID, botID) +} + +// StopBot stops the container task for a bot and marks it stopped in DB. +func (m *Manager) StopBot(ctx context.Context, botID string) error { + containerID, err := m.ContainerID(ctx, botID) + if err != nil { + return err + } + + if err := m.service.StopContainer(ctx, containerID, &ctr.StopTaskOptions{ + Timeout: 10 * time.Second, + Force: true, + }); err != nil && !errdefs.IsNotFound(err) { + return err + } + if err := m.service.DeleteTask(ctx, containerID, &ctr.DeleteTaskOptions{Force: true}); err != nil { + m.logger.Warn("cleanup: delete task failed", + slog.String("container_id", containerID), slog.Any("error", err)) + } + + m.markContainerStopped(ctx, botID) + return nil +} + +// GetContainerInfo returns current container status for a bot, +// combining DB records with live containerd state. 
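+// The DB row is authoritative when present; otherwise the status is
+// reconstructed from containerd alone and reported as "unknown".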
+func (m *Manager) GetContainerInfo(ctx context.Context, botID string) (*ContainerStatus, error) { + if m.queries != nil { + pgBotID, parseErr := db.ParseUUID(botID) + if parseErr == nil { + row, dbErr := m.queries.GetContainerByBotID(ctx, pgBotID) + if dbErr == nil { + createdAt := time.Time{} + if row.CreatedAt.Valid { + createdAt = row.CreatedAt.Time + } + updatedAt := time.Time{} + if row.UpdatedAt.Valid { + updatedAt = row.UpdatedAt.Time + } + return &ContainerStatus{ + ContainerID: row.ContainerID, + Image: row.Image, + Status: row.Status, + Namespace: row.Namespace, + ContainerPath: row.ContainerPath, + TaskRunning: m.isTaskRunning(ctx, row.ContainerID), + HasPreservedData: m.HasPreservedData(botID), + Legacy: m.IsLegacyContainer(ctx, row.ContainerID), + CreatedAt: createdAt, + UpdatedAt: updatedAt, + }, nil + } + } + } + + containerID, err := m.ContainerID(ctx, botID) + if err != nil { + return nil, err + } + info, err := m.service.GetContainer(ctx, containerID) + if err != nil { + if errdefs.IsNotFound(err) { + return nil, ErrContainerNotFound + } + return nil, err + } + return &ContainerStatus{ + ContainerID: info.ID, + Image: info.Image, + Status: "unknown", + Namespace: m.namespace, + TaskRunning: m.isTaskRunning(ctx, containerID), + HasPreservedData: m.HasPreservedData(botID), + Legacy: m.IsLegacyContainer(ctx, containerID), + CreatedAt: info.CreatedAt, + UpdatedAt: info.UpdatedAt, + }, nil +} + +// PullImage pulls a container image. This is exposed so the HTTP layer can +// pass progress callbacks for SSE streaming without needing direct ctr.Service access. +func (m *Manager) PullImage(ctx context.Context, image string, opts *ctr.PullImageOptions) (ctr.ImageInfo, error) { + return m.service.PullImage(ctx, image, opts) +} + +// --------------------------------------------------------------------------- +// Container lifecycle (bots.ContainerLifecycle interface) +// --------------------------------------------------------------------------- + +// SetupBotContainer creates/starts the container and upserts the DB record. +func (m *Manager) SetupBotContainer(ctx context.Context, botID string) error { + image, err := m.resolveWorkspaceImage(ctx, botID) + if err != nil { + m.logger.Error("setup bot container: resolve image failed", + slog.String("bot_id", botID), + slog.Any("error", err)) + return err + } + + if err := m.startWithResolvedImage(ctx, botID, image); err != nil { + m.logger.Error("setup bot container: start failed", + slog.String("bot_id", botID), + slog.Any("error", err)) + return err + } + if err := m.RememberWorkspaceImage(ctx, botID, image); err != nil { + m.logger.Warn("setup bot container: remember workspace image failed", + slog.String("bot_id", botID), + slog.String("image", image), + slog.Any("error", err)) + } + + containerID := m.resolveContainerID(ctx, botID) + m.upsertContainerRecord(ctx, botID, containerID, "running", image) + return nil +} + +// CleanupBotContainer removes the container and DB record for a bot. +// When preserveData is true, /data is exported to a backup archive before deletion. +func (m *Manager) CleanupBotContainer(ctx context.Context, botID string, preserveData bool) error { + if err := m.Delete(ctx, botID, preserveData); err != nil { + if preserveData { + // When preserving data, any error (including NotFound) must + // block the workflow — we cannot delete the DB record if we + // failed to preserve data. 
+ return err + } + if !errdefs.IsNotFound(err) { + return err + } + m.logger.Warn("cleanup: container not found in containerd, continuing", + slog.String("bot_id", botID)) + } + + m.deleteContainerRecord(ctx, botID) + return nil +} + +// --------------------------------------------------------------------------- +// Reconciliation +// --------------------------------------------------------------------------- + +// ReconcileContainers compares the DB containers table against actual containerd +// state on startup. For each auto_start container in DB it verifies the container +// and task exist; if missing they are rebuilt. +func (m *Manager) ReconcileContainers(ctx context.Context) { + if m.queries == nil { + return + } + rows, err := m.queries.ListAutoStartContainers(ctx) + if err != nil { + m.logger.Error("reconcile: failed to list containers from DB", slog.Any("error", err)) + return + } + if len(rows) == 0 { + m.logger.Info("reconcile: no auto-start containers in DB") + return + } + + m.logger.Info("reconcile: checking containers", slog.Int("count", len(rows))) + for _, row := range rows { + containerID := row.ContainerID + botID := uuid.UUID(row.BotID.Bytes).String() + + _, err := m.service.GetContainer(ctx, containerID) + if err != nil { + if !errdefs.IsNotFound(err) { + m.logger.Error("reconcile: failed to get container", + slog.String("container_id", containerID), slog.Any("error", err)) + continue + } + // Container missing in containerd — rebuild. + m.logger.Warn("reconcile: container missing, rebuilding", + slog.String("bot_id", botID), slog.String("container_id", containerID)) + if setupErr := m.SetupBotContainer(ctx, botID); setupErr != nil { + m.logger.Error("reconcile: rebuild failed", + slog.String("bot_id", botID), slog.Any("error", setupErr)) + m.markContainerStatus(ctx, botID, "error") + } + continue + } + + // --- legacy container support (mcp- prefix, TCP gRPC) --- + // Remove when all deployments have migrated to workspace- containers. + if m.IsLegacyContainer(ctx, containerID) { + m.logger.Warn("reconcile: legacy container (pre-bridge), using TCP fallback", + slog.String("bot_id", botID), slog.String("container_id", containerID)) + + running := m.isTaskRunning(ctx, containerID) + if !running { + if err := m.EnsureRunning(ctx, botID); err != nil { + m.logger.Error("reconcile: failed to start legacy container", + slog.String("bot_id", botID), slog.Any("error", err)) + continue + } + } + if ip, netErr := m.setupNetworkAndGetIP(ctx, containerID); netErr != nil { + m.logger.Error("reconcile: network setup failed for legacy container", + slog.String("bot_id", botID), slog.Any("error", netErr)) + } else { + m.SetLegacyIP(botID, ip) + m.logger.Info("reconcile: legacy container reachable via TCP", + slog.String("bot_id", botID), slog.String("ip", ip)) + } + continue + } + + // Container exists — ensure the task is running. + running := m.isTaskRunning(ctx, containerID) + if running { + if row.Status != "running" { + m.markContainerStarted(ctx, botID) + } + if netErr := m.setupNetworkOrFail(ctx, containerID, botID); netErr != nil { + m.logger.Error("reconcile: network setup failed for running task, container unreachable", + slog.String("bot_id", botID), + slog.String("container_id", containerID), + slog.Any("error", netErr)) + } else { + m.logger.Info("reconcile: container healthy", + slog.String("bot_id", botID), slog.String("container_id", containerID)) + } + continue + } + + // Task not running — try to start it. 
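+		// Typical after a host reboot: the auto_start row survived but the
+		// task did not. EnsureRunning restarts it and the outcome is recorded
+		// below via markContainerStarted/markContainerStopped.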
+ m.logger.Warn("reconcile: task not running, starting", + slog.String("bot_id", botID), slog.String("container_id", containerID)) + if err := m.EnsureRunning(ctx, botID); err != nil { + m.logger.Error("reconcile: failed to start task", + slog.String("bot_id", botID), slog.Any("error", err)) + m.markContainerStopped(ctx, botID) + } else { + m.markContainerStarted(ctx, botID) + } + } + m.logger.Info("reconcile: completed") +} + +// RecordContainerRunning upserts a DB record marking the resolved container as running. +// This is exported for the HTTP handler's SSE-based creation flow, where the +// pull + start happen in the handler but the DB write belongs to Manager. +func (m *Manager) RecordContainerRunning(ctx context.Context, botID, containerID, image string) { + m.upsertContainerRecord(ctx, botID, containerID, "running", image) +} + +// --------------------------------------------------------------------------- +// DB record helpers (unexported) +// --------------------------------------------------------------------------- + +func (m *Manager) upsertContainerRecord(ctx context.Context, botID, containerID, status, image string) { + if m.queries == nil { + return + } + pgBotID, err := db.ParseUUID(botID) + if err != nil { + return + } + ns := strings.TrimSpace(m.namespace) + if ns == "" { + ns = "default" + } + if dbErr := m.queries.UpsertContainer(ctx, dbsqlc.UpsertContainerParams{ + BotID: pgBotID, + ContainerID: containerID, + ContainerName: containerID, + Image: image, + Status: status, + Namespace: ns, + AutoStart: true, + }); dbErr != nil { + m.logger.Error("failed to upsert container record", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } + if status == "running" { + m.markContainerStarted(ctx, botID) + } +} + +func (m *Manager) deleteContainerRecord(ctx context.Context, botID string) { + if m.queries == nil { + return + } + pgBotID, err := db.ParseUUID(botID) + if err != nil { + return + } + if dbErr := m.queries.DeleteContainerByBotID(ctx, pgBotID); dbErr != nil { + m.logger.Error("failed to delete container record", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } +} + +func (m *Manager) markContainerStarted(ctx context.Context, botID string) { + if m.queries == nil { + return + } + pgBotID, err := db.ParseUUID(botID) + if err != nil { + return + } + if dbErr := m.queries.UpdateContainerStarted(ctx, pgBotID); dbErr != nil { + m.logger.Error("failed to update container started status", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } +} + +func (m *Manager) markContainerStopped(ctx context.Context, botID string) { + if m.queries == nil { + return + } + pgBotID, err := db.ParseUUID(botID) + if err != nil { + return + } + if dbErr := m.queries.UpdateContainerStopped(ctx, pgBotID); dbErr != nil { + m.logger.Error("failed to update container stopped status", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } +} + +func (m *Manager) markContainerStatus(ctx context.Context, botID, status string) { + if m.queries == nil { + return + } + pgBotID, err := db.ParseUUID(botID) + if err != nil { + return + } + if dbErr := m.queries.UpdateContainerStatus(ctx, dbsqlc.UpdateContainerStatusParams{ + Status: status, + BotID: pgBotID, + }); dbErr != nil { + m.logger.Error("failed to update container status", + slog.String("bot_id", botID), slog.Any("error", dbErr)) + } +} diff --git a/internal/mcp/versioning.go b/internal/workspace/versioning.go similarity index 93% rename from internal/mcp/versioning.go rename to 
internal/workspace/versioning.go index 17505ffb..b5c70d9c 100644 --- a/internal/mcp/versioning.go +++ b/internal/workspace/versioning.go @@ -1,4 +1,4 @@ -package mcp +package workspace import ( "context" @@ -66,7 +66,7 @@ func (m *Manager) CreateSnapshot(ctx context.Context, botID, snapshotName, sourc return nil, err } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -142,7 +142,7 @@ func (m *Manager) CreateVersion(ctx context.Context, botID string) (*VersionInfo return nil, err } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -210,7 +210,7 @@ func (m *Manager) ListBotSnapshotData(ctx context.Context, botID string) (*BotSn return nil, err } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -272,7 +272,7 @@ func (m *Manager) ListVersions(ctx context.Context, botID string) ([]VersionInfo return nil, err } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) versions, err := m.queries.ListVersionsByContainerID(ctx, containerID) if err != nil { return nil, err @@ -307,7 +307,7 @@ func (m *Manager) RollbackVersion(ctx context.Context, botID string, version int return errors.New("version out of range") } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) unlock := m.lockContainer(containerID) defer unlock() @@ -353,7 +353,7 @@ func (m *Manager) VersionSnapshotName(ctx context.Context, botID string, version return "", errors.New("version out of range") } - containerID := m.containerID(botID) + containerID := m.resolveContainerID(ctx, botID) return m.queries.GetVersionSnapshotRuntimeName(ctx, dbsqlc.GetVersionSnapshotRuntimeNameParams{ ContainerID: containerID, Version: int32(version), @@ -391,26 +391,26 @@ func (m *Manager) replaceContainerSnapshot(ctx context.Context, botID, container // unconditionally so the next call dials fresh to the new process. m.grpcPool.Remove(botID) - netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ + // CNI network setup (for outbound connectivity). 
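+	// The server reaches the replaced task over the bridge UDS, so the
+	// container IP no longer needs to be captured here.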
+ if _, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{ ContainerID: containerID, CNIBinDir: m.cfg.CNIBinaryDir, CNIConfDir: m.cfg.CNIConfigDir, - }) - if err != nil { + }); err != nil { return fmt.Errorf("network setup after snapshot replace: %w", err) } - if netResult.IP == "" { - return fmt.Errorf("network setup returned no IP after snapshot replace for %s", containerID) - } - m.SetContainerIP(botID, netResult.IP) return nil } -func (m *Manager) buildVersionSpec(_ string) (ctr.ContainerSpec, error) { +func (m *Manager) buildVersionSpec(botID string) (ctr.ContainerSpec, error) { resolvPath, err := ctr.ResolveConfSource(m.dataRoot()) if err != nil { return ctr.ContainerSpec{}, err } + + runtimeDir := m.cfg.RuntimePath() + sockDir := m.socketDir(botID) + mounts := []ctr.MountSpec{ { Destination: "/etc/resolv.conf", @@ -418,12 +418,30 @@ func (m *Manager) buildVersionSpec(_ string) (ctr.ContainerSpec, error) { Source: resolvPath, Options: []string{"rbind", "ro"}, }, + { + Destination: "/opt/memoh", + Type: "bind", + Source: runtimeDir, + Options: []string{"rbind", "ro"}, + }, + { + Destination: "/run/memoh", + Type: "bind", + Source: sockDir, + Options: []string{"rbind", "rw"}, + }, } tzMounts, tzEnv := ctr.TimezoneSpec() mounts = append(mounts, tzMounts...) + + env := make([]string, 0, len(tzEnv)+1) + env = append(env, tzEnv...) + env = append(env, "BRIDGE_SOCKET_PATH=/run/memoh/bridge.sock") + return ctr.ContainerSpec{ + Cmd: []string{"/opt/memoh/bridge"}, Mounts: mounts, - Env: tzEnv, + Env: env, }, nil } diff --git a/mise.toml b/mise.toml index ae21575c..62d09979 100644 --- a/mise.toml +++ b/mise.toml @@ -59,7 +59,6 @@ description = "Start development environment" run = """ #!/bin/bash set -e -cp devenv/app.dev.toml config.toml docker compose -f devenv/docker-compose.yml up --build """ @@ -75,13 +74,13 @@ run = "docker compose -f devenv/docker-compose.yml logs -f" description = "Restart a service (usage: mise run dev:restart -- server)" run = "docker compose -f devenv/docker-compose.yml restart $@" -[tasks."mcp:build"] -description = "Manually build MCP dev binary (normally auto-triggered by air)" +[tasks."bridge:build"] +description = "Manually rebuild bridge binary in dev container (normally auto-triggered by air)" run = """ #!/bin/bash set -e docker compose -f devenv/docker-compose.yml exec server \ - sh -c 'cd /workspace && sh devenv/mcp-build.sh' + sh -c 'cd /workspace && sh devenv/bridge-build.sh' """ [tasks.db-up] @@ -161,6 +160,6 @@ depends = [ run = """ #!/bin/bash set -e -cp devenv/app.dev.toml config.toml echo '✓ Setup complete! 
Run: mise run dev' +echo ' Dev web UI will be available at http://localhost:18082' """ diff --git a/packages/agent/src/agent.ts b/packages/agent/src/agent.ts index 94e2b830..eab1bde2 100644 --- a/packages/agent/src/agent.ts +++ b/packages/agent/src/agent.ts @@ -110,7 +110,6 @@ export const createAgent = ( currentChannel = 'Unknown Channel', identity = { botId: '', - containerId: '', channelIdentityId: '', displayName: '', }, diff --git a/packages/agent/src/types/agent.ts b/packages/agent/src/types/agent.ts index d6fb1da8..6aa6dbab 100644 --- a/packages/agent/src/types/agent.ts +++ b/packages/agent/src/types/agent.ts @@ -5,9 +5,8 @@ import { MCPConnection } from './mcp' export interface IdentityContext { botId: string - containerId?: string - channelIdentityId?: string - displayName?: string + channelIdentityId: string + displayName: string currentPlatform?: string replyTarget?: string conversationType?: string diff --git a/packages/config/src/index.ts b/packages/config/src/index.ts index 32ee2f98..8deba1a7 100644 --- a/packages/config/src/index.ts +++ b/packages/config/src/index.ts @@ -1,9 +1,15 @@ import { parse } from 'toml' import { readFileSync } from 'fs' -import type { Config } from './types.ts' +import type { Config } from './types' export const loadConfig = (path: string = './config.toml'): Config => { const config = parse(readFileSync(path, 'utf-8')) + if ('mcp' in config) { + if ('workspace' in config) { + throw new Error('config uses both [mcp] and [workspace]; remove [mcp] and keep only [workspace]') + } + throw new Error('config section [mcp] has been renamed to [workspace]; update your config.toml and restart') + } return config satisfies Config } @@ -25,4 +31,4 @@ export const getBaseUrl = (config: Config) => { return `http://${rawAddr}` } -export * from './types.ts' +export * from './types' diff --git a/packages/config/src/types.ts b/packages/config/src/types.ts index a5fc862d..a4da3f08 100644 --- a/packages/config/src/types.ts +++ b/packages/config/src/types.ts @@ -4,7 +4,7 @@ export interface Config { admin: AdminConfig; auth: AuthConfig; containerd: ContainerdConfig; - mcp: McpConfig; + workspace: WorkspaceConfig; postgres: PostgresConfig; qdrant: QdrantConfig; sparse: SparseConfig; @@ -38,10 +38,14 @@ export interface ContainerdConfig { namespace: string; } -export interface McpConfig { - image: string; +export interface WorkspaceConfig { + registry?: string; + default_image: string; snapshotter: string; data_root: string; + cni_bin_dir?: string; + cni_conf_dir?: string; + runtime_dir?: string; } export interface PostgresConfig { @@ -80,4 +84,3 @@ export interface WebConfig { host: string; port: number; } - diff --git a/packages/sdk/package.json b/packages/sdk/package.json index 720667f7..04f68440 100644 --- a/packages/sdk/package.json +++ b/packages/sdk/package.json @@ -5,7 +5,8 @@ "exports": { ".": "./src/index.ts", "./client": "./src/client.gen.ts", - "./colada": "./src/@pinia/colada.gen.ts" + "./colada": "./src/@pinia/colada.gen.ts", + "./extra": "./src/extra/index.ts" }, "main": "index.js", "scripts": { diff --git a/packages/sdk/src/container-stream.ts b/packages/sdk/src/container-stream.ts new file mode 100644 index 00000000..70aae99a --- /dev/null +++ b/packages/sdk/src/container-stream.ts @@ -0,0 +1,105 @@ +import { mergeHeaders } from './client' +import { client } from './client.gen' +import type { Options } from './sdk.gen' +import type { + HandlersCreateContainerResponse, + PostBotsByBotIdContainerData, +} from './types.gen' + +// Handwritten SDK supplement 
for container-create SSE.
+// Re-export this module via @memoh/sdk/extra instead of the generated root entry,
+// because packages/sdk/src/index.ts is regenerated from OpenAPI.
+
+export type ContainerCreateLayerStatus = {
+  ref: string
+  offset: number
+  total: number
+}
+
+// codesync(container-create-stream): keep these manual SSE payload types in sync
+// with internal/handlers/containerd.go.
+export type ContainerCreateStreamEvent =
+  | { type: 'pulling'; image: string }
+  | { type: 'pull_progress'; layers: ContainerCreateLayerStatus[] }
+  | { type: 'creating' }
+  | { type: 'restoring' }
+  | { type: 'complete'; container: HandlersCreateContainerResponse }
+  | { type: 'error'; message: string }
+
+export type ContainerCreateStreamResult = {
+  stream: AsyncGenerator<ContainerCreateStreamEvent>
+}
+
+function isLayerStatus(value: unknown): value is ContainerCreateLayerStatus {
+  return !!value
+    && typeof value === 'object'
+    && typeof (value as { ref?: unknown }).ref === 'string'
+    && typeof (value as { offset?: unknown }).offset === 'number'
+    && typeof (value as { total?: unknown }).total === 'number'
+}
+
+function isContainerCreateStreamEvent(value: unknown): value is ContainerCreateStreamEvent {
+  if (!value || typeof value !== 'object') return false
+
+  const event = value as Record<string, unknown>
+  switch (event.type) {
+    case 'pulling':
+      return typeof event.image === 'string'
+    case 'pull_progress':
+      return Array.isArray(event.layers) && event.layers.every(isLayerStatus)
+    case 'creating':
+    case 'restoring':
+      return true
+    case 'complete':
+      return !!event.container && typeof event.container === 'object'
+    case 'error':
+      return typeof event.message === 'string'
+    default:
+      return false
+  }
+}
+
+function toError(error: unknown): Error {
+  if (error instanceof Error) return error
+  if (typeof error === 'string' && error.trim()) return new Error(error)
+  return new Error('Container create stream failed')
+}
+
+export async function postBotsByBotIdContainerStream(
+  options: Options<PostBotsByBotIdContainerData>,
+): Promise<ContainerCreateStreamResult> {
+  let streamError: unknown
+
+  const result = await client.sse.post({
+    url: '/bots/{bot_id}/container',
+    ...options,
+    headers: mergeHeaders(options.headers, {
+      Accept: 'text/event-stream',
+      'Content-Type': 'application/json',
+    }),
+    onSseError: (error) => {
+      streamError = error
+    },
+    responseValidator: async (data) => {
+      if (!isContainerCreateStreamEvent(data)) {
+        throw new Error('Invalid container create stream event')
+      }
+    },
+    sseMaxRetryAttempts: 1,
+  })
+
+  return {
+    stream: (async function* () {
+      for await (const event of result.stream as AsyncGenerator<unknown>) {
+        if (!isContainerCreateStreamEvent(event)) {
+          throw new Error('Invalid container create stream event')
+        }
+        yield event
+      }
+
+      if (streamError) {
+        throw toError(streamError)
+      }
+    })(),
+  }
+}
diff --git a/packages/sdk/src/extra/index.ts b/packages/sdk/src/extra/index.ts
new file mode 100644
index 00000000..bee9b7a5
--- /dev/null
+++ b/packages/sdk/src/extra/index.ts
@@ -0,0 +1,10 @@
+// Handwritten SDK supplements that OpenAPI generation cannot express cleanly.
+// Keep these exports under @memoh/sdk/extra instead of the generated root entry,
+// because packages/sdk/src/index.ts is overwritten by sdk generation.
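+//
+// Illustrative usage (request fields follow the generated SDK types):
+//
+//   import { postBotsByBotIdContainerStream } from '@memoh/sdk/extra'
+//
+//   const { stream } = await postBotsByBotIdContainerStream({
+//     path: { bot_id: botId },
+//     body: { image: 'alpine:3.20', restore_data: false },
+//   })
+//   for await (const event of stream) {
+//     if (event.type === 'error') throw new Error(event.message)
+//   }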
+ +export { postBotsByBotIdContainerStream } from '../container-stream' +export type { + ContainerCreateLayerStatus, + ContainerCreateStreamEvent, + ContainerCreateStreamResult, +} from '../container-stream' diff --git a/packages/sdk/src/types.gen.ts b/packages/sdk/src/types.gen.ts index 72269806..1da5fdc4 100644 --- a/packages/sdk/src/types.gen.ts +++ b/packages/sdk/src/types.gen.ts @@ -690,6 +690,7 @@ export type HandlersChannelMeta = { }; export type HandlersCreateContainerRequest = { + image?: string; restore_data?: boolean; snapshotter?: string; }; @@ -780,6 +781,7 @@ export type HandlersGetContainerResponse = { created_at?: string; has_preserved_data?: boolean; image?: string; + legacy?: boolean; namespace?: string; status?: string; task_running?: boolean; diff --git a/spec/docs.go b/spec/docs.go index 009e0685..42e50096 100644 --- a/spec/docs.go +++ b/spec/docs.go @@ -10566,6 +10566,9 @@ const docTemplate = `{ "handlers.CreateContainerRequest": { "type": "object", "properties": { + "image": { + "type": "string" + }, "restore_data": { "type": "boolean" }, @@ -10783,6 +10786,9 @@ const docTemplate = `{ "image": { "type": "string" }, + "legacy": { + "type": "boolean" + }, "namespace": { "type": "string" }, diff --git a/spec/swagger.json b/spec/swagger.json index 9b3c3e65..5b9805e6 100644 --- a/spec/swagger.json +++ b/spec/swagger.json @@ -10557,6 +10557,9 @@ "handlers.CreateContainerRequest": { "type": "object", "properties": { + "image": { + "type": "string" + }, "restore_data": { "type": "boolean" }, @@ -10774,6 +10777,9 @@ "image": { "type": "string" }, + "legacy": { + "type": "boolean" + }, "namespace": { "type": "string" }, diff --git a/spec/swagger.yaml b/spec/swagger.yaml index bd41330c..c6731446 100644 --- a/spec/swagger.yaml +++ b/spec/swagger.yaml @@ -1140,6 +1140,8 @@ definitions: type: object handlers.CreateContainerRequest: properties: + image: + type: string restore_data: type: boolean snapshotter: @@ -1280,6 +1282,8 @@ definitions: type: boolean image: type: string + legacy: + type: boolean namespace: type: string status:
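
For reference, the schema additions above translate to the following request shape on the Go side. This is an illustrative sketch, not the actual handlers source: the field set (image, restore_data, snapshotter) is pinned by the swagger definitions, while the struct layout and json tags are assumptions.

```go
// Sketch of the request body for POST /bots/{bot_id}/container after this
// change. Field names come from spec/swagger.yaml; the tags are assumed.
type CreateContainerRequest struct {
	Image       string `json:"image,omitempty"`        // optional per-bot base image override
	RestoreData bool   `json:"restore_data,omitempty"` // restore a preserved /data backup
	Snapshotter string `json:"snapshotter,omitempty"`
}
```

An empty `image` would keep today's behavior: the server falls back to the bot's stored preference and then the configured default, as resolveWorkspaceImage above implements.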