From 4d3f2de7e2aca9222cfcd10f392ba038d4036411 Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Fri, 10 Apr 2026 14:52:17 +0800 Subject: [PATCH] feat: Add GPU CDI support for workspace containers (#332) * feat: add CDI GPU support for workspace containers * feat: expose GPU CDI settings in bot container UI * feat: move GPU settings into advanced container options * docs: document advanced CDI device configuration --- apps/web/src/i18n/locales/en.json | 11 ++ apps/web/src/i18n/locales/zh.json | 11 ++ .../pages/bots/components/bot-container.vue | 153 ++++++++++++++++- devenv/docker-compose.yml | 3 + docker-compose.yml | 3 + docs/docs/getting-started/container.md | 46 +++++ go.mod | 7 + go.sum | 14 ++ internal/containerd/service.go | 19 ++ internal/containerd/service_apple.go | 3 + internal/containerd/types.go | 5 +- internal/handlers/containerd.go | 54 ++++-- internal/workspace/gpu_labels_test.go | 29 ++++ internal/workspace/image_preference.go | 162 ++++++++++++++++++ internal/workspace/image_preference_test.go | 68 ++++++++ internal/workspace/manager.go | 114 ++++++++---- internal/workspace/manager_lifecycle.go | 8 +- internal/workspace/versioning.go | 50 +----- packages/sdk/src/types.gen.ts | 7 + spec/docs.go | 26 +++ spec/swagger.json | 26 +++ spec/swagger.yaml | 17 ++ 22 files changed, 752 insertions(+), 84 deletions(-) create mode 100644 internal/workspace/gpu_labels_test.go diff --git a/apps/web/src/i18n/locales/en.json b/apps/web/src/i18n/locales/en.json index 09050332..22ee44f0 100644 --- a/apps/web/src/i18n/locales/en.json +++ b/apps/web/src/i18n/locales/en.json @@ -748,6 +748,16 @@ "createRestoreDataDescription": "If a previously exported backup or legacy bind-mounted data exists, it will be restored into `/data` after the container is created.", "createImageLabel": "Base image", "createImageDescription": "Docker image to use as the container base (e.g. debian:bookworm-slim, alpine:latest, ubuntu:24.04). Leave empty for the default.", + "createAdvancedTitle": "Advanced options", + "createAdvancedDescription": "Configure optional GPU access and raw CDI device mappings for this container.", + "createGpuLabel": "Enable GPU", + "createGpuDescription": "Attach host GPU access to the new container.", + "createGpuDevicesLabel": "CDI devices", + "createGpuDevicesDescription": "Enter one CDI device per line or separate them with commas, for example `nvidia.com/gpu=0` or `amd.com/gpu=0`. Turning GPU off explicitly clears the saved GPU preference.", + "createGpuDevicesPlaceholder": "nvidia.com/gpu=0\namd.com/gpu=0", + "gpuDevicesRequired": "At least one CDI device is required when GPU is enabled.", + "cdiDevicesEmpty": "No GPU attached", + "gpuRecreateHint": "Changing GPU settings requires recreating the container. A simple start or stop will not change the devices already attached.", "deleteConfirm": "Are you sure you want to permanently delete this container? Unpreserved data cannot be recovered.", "deletePreserveConfirm": "Are you sure you want to export `/data` and then delete this container?", "restoreConfirm": "Are you sure you want to restore preserved data into this container's `/data`?", @@ -797,6 +807,7 @@ "task": "Task", "namespace": "Namespace", "image": "Image", + "cdiDevices": "CDI Devices", "hostPath": "Host Path", "containerPath": "Container Path", "preservedData": "Preserved Data", diff --git a/apps/web/src/i18n/locales/zh.json b/apps/web/src/i18n/locales/zh.json index d59da1cb..bed56e6c 100644 --- a/apps/web/src/i18n/locales/zh.json +++ b/apps/web/src/i18n/locales/zh.json @@ -744,6 +744,16 @@ "createRestoreDataDescription": "如果存在之前导出的备份或旧版 bind mount 数据,将在容器创建后恢复到 `/data`。", "createImageLabel": "基础镜像", "createImageDescription": "作为容器基础环境的 Docker 镜像(如 debian:bookworm-slim、alpine:latest、ubuntu:24.04)。留空则使用默认镜像。", + "createAdvancedTitle": "高级选项", + "createAdvancedDescription": "配置该容器的可选 GPU 访问能力与原始 CDI 设备映射。", + "createGpuLabel": "启用 GPU", + "createGpuDescription": "为新容器开启宿主机 GPU 访问。", + "createGpuDevicesLabel": "CDI 设备", + "createGpuDevicesDescription": "每行或用逗号填写一个 CDI 设备名,例如 `nvidia.com/gpu=0` 或 `amd.com/gpu=0`。关闭 GPU 后会显式清空已保存的 GPU 偏好。", + "createGpuDevicesPlaceholder": "nvidia.com/gpu=0\namd.com/gpu=0", + "gpuDevicesRequired": "已启用 GPU 时,至少需要填写一个 CDI 设备名。", + "cdiDevicesEmpty": "未附加 GPU", + "gpuRecreateHint": "GPU 配置变更需要重建容器后才会生效,单纯启动或停止不会更新当前已附加的设备。", "deleteConfirm": "确定要彻底删除这个容器吗?未保留的数据将无法恢复。", "deletePreserveConfirm": "确定要先导出 `/data` 再删除这个容器吗?", "restoreConfirm": "确定要将已保留的数据恢复到当前容器的 `/data` 吗?", @@ -793,6 +803,7 @@ "task": "任务状态", "namespace": "命名空间", "image": "镜像", + "cdiDevices": "CDI 设备", "hostPath": "主机路径", "containerPath": "容器路径", "preservedData": "保留数据", diff --git a/apps/web/src/pages/bots/components/bot-container.vue b/apps/web/src/pages/bots/components/bot-container.vue index 8d49da2c..02209fbf 100644 --- a/apps/web/src/pages/bots/components/bot-container.vue +++ b/apps/web/src/pages/bots/components/bot-container.vue @@ -4,6 +4,7 @@ import { toast } from 'vue-sonner' import { useI18n } from 'vue-i18n' import { useRoute } from 'vue-router' import { useQuery } from '@pinia/colada' +import { ChevronRight } from 'lucide-vue-next' import { deleteBotsByBotIdContainer, getBotsByBotIdContainer, @@ -25,7 +26,7 @@ import { type ContainerCreateLayerStatus, type ContainerCreateStreamEvent, } from '@/composables/api/useContainerStream' -import { Button, Input, Label, Separator, Spinner, Switch } from '@memohai/ui' +import { Button, Collapsible, CollapsibleContent, CollapsibleTrigger, Input, Label, Separator, Spinner, Switch, Textarea } from '@memohai/ui' import ConfirmPopover from '@/components/confirm-popover/index.vue' import ContainerCreateProgress from './container-create-progress.vue' import { useSyncedQueryParam } from '@/composables/useSyncedQueryParam' @@ -59,6 +60,10 @@ const rollbackVersion = ref(null) const createRestoreData = ref(false) const createImage = ref('') const createImagePrefilled = ref(false) +const createGPUEnabled = ref(false) +const createGPUDevices = ref('') +const createGPUPrefilled = ref(false) +const createAdvancedOpen = ref(false) const newSnapshotName = ref('') const importInputRef = ref(null) @@ -178,7 +183,7 @@ async function handleRefreshContainer() { await runContainerAction('refresh', () => loadContainerData(false)) } -const { data: bot } = useQuery({ +const { data: bot, refetch: refetchBot } = useQuery({ key: () => ['bot', botId.value], query: async () => { const { data } = await getBotsById({ path: { id: botId.value }, throwOnError: true }) @@ -194,8 +199,48 @@ function rememberedWorkspaceImage(metadata: Record | undefined) return typeof image === 'string' ? shortenImageRef(image) : '' } +type RememberedWorkspaceGPU = { + exists: boolean + devices: string[] +} + +function rememberedWorkspaceGPU(metadata: Record | undefined): RememberedWorkspaceGPU { + const workspace = metadata?.workspace + if (!workspace || typeof workspace !== 'object' || Array.isArray(workspace)) { + return { exists: false, devices: [] } + } + + const workspaceRecord = workspace as Record + if (!Object.prototype.hasOwnProperty.call(workspaceRecord, 'gpu')) { + return { exists: false, devices: [] } + } + + const gpu = workspaceRecord.gpu + if (!gpu || typeof gpu !== 'object' || Array.isArray(gpu)) { + return { exists: true, devices: [] } + } + + const rawDevices = (gpu as Record).devices + const devices = Array.isArray(rawDevices) + ? rawDevices.filter((value): value is string => typeof value === 'string').map(value => value.trim()).filter(Boolean) + : [] + + return { exists: true, devices: [...new Set(devices)] } +} + +function parseCDIDevices(value: string): string[] { + return [...new Set( + value + .split(/[\n,]/) + .map(item => item.trim()) + .filter(Boolean), + )] +} + const rememberedCreateImage = computed(() => rememberedWorkspaceImage(bot.value?.metadata as Record | undefined)) +const rememberedCreateGPU = computed(() => rememberedWorkspaceGPU(bot.value?.metadata as Record | undefined)) const displayedContainerImage = computed(() => shortenImageRef(containerInfo.value?.image)) +const displayedCDIDevices = computed(() => containerInfo.value?.cdi_devices ?? []) const { isPending: botLifecyclePending } = useBotStatusMeta(bot, t) @@ -248,16 +293,29 @@ async function handleCreateContainer() { containerAction.value = 'create' createProgress.value = { phase: 'pulling' } try { + const gpuDevices = parseCDIDevices(createGPUDevices.value) + if (createGPUEnabled.value && gpuDevices.length === 0) { + throw new Error(t('bots.container.gpuDevicesRequired')) + } + const body: HandlersCreateContainerRequest = { restore_data: createRestoreData.value, } const trimmedImage = createImage.value.trim() if (trimmedImage) body.image = trimmedImage + if (createGPUEnabled.value || rememberedCreateGPU.value.exists) { + body.gpu = { + devices: createGPUEnabled.value ? gpuDevices : [], + } + } const { dataRestored } = await createContainerSSE(body) createRestoreData.value = false createImage.value = '' + createGPUEnabled.value = false + createGPUDevices.value = '' await loadContainerData(false) + await refetchBot() toast.success(dataRestored ? t('bots.container.createRestoreSuccess') : t('bots.container.createSuccess')) @@ -567,6 +625,8 @@ const activeTab = useSyncedQueryParam('tab', 'overview') watch(containerMissing, (missing) => { if (!missing) { createImagePrefilled.value = false + createGPUPrefilled.value = false + createAdvancedOpen.value = false } }) @@ -577,6 +637,15 @@ watch([containerMissing, rememberedCreateImage], ([missing, remembered]) => { createImagePrefilled.value = true }, { immediate: true }) +watch([containerMissing, rememberedCreateGPU], ([missing, remembered]) => { + if (!missing || createGPUPrefilled.value) return + if (!remembered.exists) return + if (createGPUEnabled.value || createGPUDevices.value.trim()) return + createGPUEnabled.value = remembered.devices.length > 0 + createGPUDevices.value = remembered.devices.join('\n') + createGPUPrefilled.value = true +}, { immediate: true }) + watch([activeTab, botId], ([tab]) => { if (!botId.value) return if (tab === 'container') { @@ -685,6 +754,59 @@ watch([activeTab, botId], ([tab]) => {

+ +
+ +
+

+ {{ $t('bots.container.createAdvancedTitle') }} +

+

+ {{ $t('bots.container.createAdvancedDescription') }} +

+
+ +
+ + +
+
+
+ +

+ {{ $t('bots.container.createGpuDescription') }} +

+
+ +
+ +
+ +