From fd8f1ec078dd43f3657017a41c059e3a7f4f64fb Mon Sep 17 00:00:00 2001 From: Acbox Date: Wed, 22 Apr 2026 00:11:16 +0800 Subject: [PATCH] Revert "Feat/speech support (#392)" (#393) This reverts commit c9dcfe287fcbbf9ff4d82121a7ebf3356717bc32. --- .../web/src/components/create-model/index.vue | 63 +- .../src/components/settings-sidebar/index.vue | 7 +- apps/web/src/constants/client-types.ts | 32 +- apps/web/src/i18n/locales/en.json | 27 - apps/web/src/i18n/locales/zh.json | 27 - .../pages/bots/components/bot-settings.vue | 35 +- .../speech/components/model-config-editor.vue | 95 +- .../speech/components/provider-setting.vue | 73 +- apps/web/src/pages/transcription/index.vue | 126 -- .../pages/transcription/provider-setting.vue | 480 -------- apps/web/src/router.ts | 8 - cmd/agent/app.go | 66 +- cmd/agent/module.go | 16 +- conf/providers/deepgram-transcription.yaml | 9 - conf/providers/elevenlabs-transcription.yaml | 9 - conf/providers/google-transcription.yaml | 9 - conf/providers/openai-transcription.yaml | 9 - conf/providers/openrouter-transcription.yaml | 9 - db/migrations/0001_init.up.sql | 11 +- ...cription_models_and_speech_domain.down.sql | 33 - ...nscription_models_and_speech_domain.up.sql | 31 - .../0070_add_bot_transcription_model.down.sql | 8 - .../0070_add_bot_transcription_model.up.sql | 5 - ...071_split_transcription_providers.down.sql | 33 - .../0071_split_transcription_providers.up.sql | 29 - db/queries/models.sql | 72 +- db/queries/settings.sql | 8 +- internal/agent/background/manager_test.go | 6 +- internal/agent/retry.go | 2 +- internal/agent/tools/container.go | 6 +- internal/agent/tools/transcribe.go | 232 ---- internal/agent/tools/tts.go | 12 +- internal/audio/bootstrap.go | 100 -- internal/audio/service.go | 769 ------------ internal/audio/types.go | 102 -- .../identities/service_integration_test.go | 1 + internal/channel/inbound/channel.go | 208 +--- internal/db/sqlc/conversations.sql.go | 2 +- internal/db/sqlc/models.go | 1 - internal/db/sqlc/models.sql.go | 236 +--- internal/db/sqlc/settings.sql.go | 20 +- internal/handlers/bot_tts.go | 26 +- internal/handlers/local_channel.go | 48 +- internal/handlers/tts_providers.go | 356 +----- internal/models/models.go | 22 +- internal/models/types.go | 45 +- internal/settings/service.go | 15 - internal/settings/types.go | 2 - internal/{audio => tts}/adapter.go | 2 +- internal/{audio => tts}/adapter/edge/edge.go | 28 +- .../{audio => tts}/adapter/edge/edge_test.go | 8 +- internal/{audio => tts}/adapter/edge/type.go | 0 .../{audio => tts}/adapter/edge/voices.json | 0 internal/{audio => tts}/adapter/edge/ws.go | 10 +- .../adapter/edge/ws_integration_test.go | 24 +- .../{audio => tts}/adapter/edge/ws_test.go | 8 +- internal/tts/bootstrap.go | 68 ++ internal/{audio => tts}/config.go | 2 +- internal/{audio => tts}/registry.go | 344 +----- internal/tts/service.go | 435 +++++++ internal/{audio => tts}/tempstore.go | 6 +- internal/tts/types.go | 62 + internal/workspace/image_preference.go | 2 - packages/sdk/src/@pinia/colada.gen.ts | 193 +-- packages/sdk/src/index.ts | 4 +- packages/sdk/src/sdk.gen.ts | 93 +- packages/sdk/src/types.gen.ts | 609 ++-------- spec/docs.go | 1072 ++++------------- spec/swagger.json | 1072 ++++------------- spec/swagger.yaml | 715 +++-------- 70 files changed, 1689 insertions(+), 6609 deletions(-) delete mode 100644 apps/web/src/pages/transcription/index.vue delete mode 100644 apps/web/src/pages/transcription/provider-setting.vue delete mode 100644 conf/providers/deepgram-transcription.yaml delete mode 100644 conf/providers/elevenlabs-transcription.yaml delete mode 100644 conf/providers/google-transcription.yaml delete mode 100644 conf/providers/openai-transcription.yaml delete mode 100644 conf/providers/openrouter-transcription.yaml delete mode 100644 db/migrations/0069_add_transcription_models_and_speech_domain.down.sql delete mode 100644 db/migrations/0069_add_transcription_models_and_speech_domain.up.sql delete mode 100644 db/migrations/0070_add_bot_transcription_model.down.sql delete mode 100644 db/migrations/0070_add_bot_transcription_model.up.sql delete mode 100644 db/migrations/0071_split_transcription_providers.down.sql delete mode 100644 db/migrations/0071_split_transcription_providers.up.sql delete mode 100644 internal/agent/tools/transcribe.go delete mode 100644 internal/audio/bootstrap.go delete mode 100644 internal/audio/service.go delete mode 100644 internal/audio/types.go rename internal/{audio => tts}/adapter.go (97%) rename internal/{audio => tts}/adapter/edge/edge.go (78%) rename internal/{audio => tts}/adapter/edge/edge_test.go (90%) rename internal/{audio => tts}/adapter/edge/type.go (100%) rename internal/{audio => tts}/adapter/edge/voices.json (100%) rename internal/{audio => tts}/adapter/edge/ws.go (97%) rename internal/{audio => tts}/adapter/edge/ws_integration_test.go (74%) rename internal/{audio => tts}/adapter/edge/ws_test.go (94%) create mode 100644 internal/tts/bootstrap.go rename internal/{audio => tts}/config.go (99%) rename internal/{audio => tts}/registry.go (64%) create mode 100644 internal/tts/service.go rename internal/{audio => tts}/tempstore.go (96%) create mode 100644 internal/tts/types.go diff --git a/apps/web/src/components/create-model/index.vue b/apps/web/src/components/create-model/index.vue index d168332b..93442bfb 100644 --- a/apps/web/src/components/create-model/index.vue +++ b/apps/web/src/components/create-model/index.vue @@ -18,7 +18,6 @@
@@ -36,12 +35,11 @@ - - {{ opt.label }} + + Chat + + + Embedding @@ -183,11 +181,6 @@ import { COMPATIBILITY_OPTIONS } from '@/constants/compatibilities' import FormDialogShell from '@/components/form-dialog-shell/index.vue' import { useDialogMutation } from '@/composables/useDialogMutation' -interface ModelTypeOption { - value: string - label: string -} - const selectedCompat = ref([]) const { t } = useI18n() const { run } = useDialogMutation() @@ -200,30 +193,14 @@ const formSchema = toTypedSchema(z.object({ context_window: z.coerce.number().min(1).optional(), })) -const props = withDefaults(defineProps<{ - id: string - typeOptions?: ModelTypeOption[] - defaultType?: string - hideType?: boolean - invalidateKeys?: string[] -}>(), { - typeOptions: () => [ - { value: 'chat', label: 'Chat' }, - { value: 'embedding', label: 'Embedding' }, - ], - defaultType: 'chat', - hideType: false, - invalidateKeys: () => ['provider-models'], -}) - const form = useForm({ validationSchema: formSchema, initialValues: { - type: props.defaultType, + type: 'chat', }, }) -const selectedType = computed(() => form.values.type || props.defaultType) +const selectedType = computed(() => form.values.type || 'chat') const open = inject>('openModel', ref(false)) const title = inject>('openModelTitle', ref('title')) @@ -260,19 +237,15 @@ function onNameInput(e: Event) { form.setFieldValue('name', (e.target as HTMLInputElement).value) } -const queryCache = useQueryCache() -function invalidateModelQueries() { - for (const key of props.invalidateKeys) { - queryCache.invalidateQueries({ key: [key] }) - } -} +const { id } = defineProps<{ id: string }>() +const queryCache = useQueryCache() const { mutateAsync: createModel, isLoading: createLoading } = useMutation({ mutation: async (data: Record) => { const { data: result } = await postModels({ body: data as ModelsAddRequest, throwOnError: true }) return result }, - onSettled: invalidateModelQueries, + onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }), }) const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({ mutation: async ({ id, data }: { id: string; data: Record }) => { @@ -283,7 +256,7 @@ const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({ }) return result }, - onSettled: invalidateModelQueries, + onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }), }) const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading } = useMutation({ mutation: async ({ modelId, data }: { modelId: string; data: Record }) => { @@ -294,7 +267,7 @@ const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading }) return result }, - onSettled: invalidateModelQueries, + onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }), }) const isLoading = computed(() => createLoading.value || updateLoading.value || updateLegacyLoading.value) @@ -324,7 +297,7 @@ async function addModel() { const payload: Record = { type, model_id, - provider_id: props.id, + provider_id: id, config, } @@ -375,15 +348,7 @@ watch(open, async () => { selectedCompat.value = config?.compatibilities ?? [] userEditedName.value = !!(name && name !== model_id) } else { - form.resetForm({ - values: { - type: props.defaultType, - model_id: '', - name: '', - dimensions: undefined, - context_window: undefined, - }, - }) + form.resetForm({ values: { type: 'chat', model_id: '', name: '', dimensions: undefined, context_window: undefined } }) selectedCompat.value = [] userEditedName.value = false } diff --git a/apps/web/src/components/settings-sidebar/index.vue b/apps/web/src/components/settings-sidebar/index.vue index 66e81ea0..aef7f5fd 100644 --- a/apps/web/src/components/settings-sidebar/index.vue +++ b/apps/web/src/components/settings-sidebar/index.vue @@ -52,7 +52,7 @@ import { computed, type Component } from 'vue' import { storeToRefs } from 'pinia' import { useRouter, useRoute } from 'vue-router' import { useI18n } from 'vue-i18n' -import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, AudioLines, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next' +import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next' import { useChatSelectionStore } from '@/store/chat-selection' import { Sidebar, @@ -118,11 +118,6 @@ const navItems = computed<{ title: string; name: string; icon: Component }[]>(() name: 'speech', icon: Volume2, }, - { - title: t('sidebar.transcription'), - name: 'transcription', - icon: AudioLines, - }, { title: t('sidebar.email'), name: 'email', diff --git a/apps/web/src/constants/client-types.ts b/apps/web/src/constants/client-types.ts index c69615c3..812786b2 100644 --- a/apps/web/src/constants/client-types.ts +++ b/apps/web/src/constants/client-types.ts @@ -45,41 +45,21 @@ export const CLIENT_TYPE_META: Record = { label: 'OpenAI Speech', hint: 'OpenAI /audio/speech compatible TTS', }, - 'openai-transcription': { - value: 'openai-transcription', - label: 'OpenAI Transcription', - hint: 'OpenAI audio transcription', - }, 'openrouter-speech': { value: 'openrouter-speech', label: 'OpenRouter Speech', hint: 'OpenRouter audio modality TTS', }, - 'openrouter-transcription': { - value: 'openrouter-transcription', - label: 'OpenRouter Transcription', - hint: 'OpenRouter transcription models', - }, 'elevenlabs-speech': { value: 'elevenlabs-speech', label: 'ElevenLabs Speech', hint: 'ElevenLabs text-to-speech', }, - 'elevenlabs-transcription': { - value: 'elevenlabs-transcription', - label: 'ElevenLabs Transcription', - hint: 'ElevenLabs speech-to-text', - }, 'deepgram-speech': { value: 'deepgram-speech', label: 'Deepgram Speech', hint: 'Deepgram TTS', }, - 'deepgram-transcription': { - value: 'deepgram-transcription', - label: 'Deepgram Transcription', - hint: 'Deepgram speech-to-text', - }, 'minimax-speech': { value: 'minimax-speech', label: 'MiniMax Speech', @@ -100,19 +80,9 @@ export const CLIENT_TYPE_META: Record = { label: 'Microsoft Speech', hint: 'Azure Cognitive Services TTS', }, - 'google-speech': { - value: 'google-speech', - label: 'Google Speech', - hint: 'Gemini speech transcription', - }, - 'google-transcription': { - value: 'google-transcription', - label: 'Google Transcription', - hint: 'Gemini speech transcription', - }, } export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META) export const LLM_CLIENT_TYPE_LIST: ClientTypeMeta[] = CLIENT_TYPE_LIST - .filter(ct => !ct.value.endsWith('-speech') && !ct.value.endsWith('-transcription')) + .filter(ct => !ct.value.endsWith('-speech')) diff --git a/apps/web/src/i18n/locales/en.json b/apps/web/src/i18n/locales/en.json index bdaaefc9..59f83d6c 100644 --- a/apps/web/src/i18n/locales/en.json +++ b/apps/web/src/i18n/locales/en.json @@ -63,7 +63,6 @@ "webSearch": "Web Search", "memory": "Memory", "speech": "Speech", - "transcription": "Transcription", "email": "Email", "settings": "Settings", "profile": "Profile", @@ -426,9 +425,6 @@ "noModels": "No models found. Click \"Import Models\" to discover available models or \"Add Model\" to create one manually.", "noCapabilities": "No capabilities available for this model.", "saveSuccess": "Speech configuration saved", - "synthesis": { - "models": "Synthesis Models" - }, "advanced": { "title": "Advanced Settings", "description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults." @@ -452,27 +448,6 @@ "failed": "Synthesis failed" } }, - "transcription": { - "title": "Transcription", - "emptyTitle": "No Transcription Providers", - "emptyDescription": "Add a transcription provider to enable speech-to-text for your bots", - "models": "Transcription Models", - "noModels": "No transcription models found. Import available models or keep the default template model.", - "noCapabilities": "No capabilities available for this model.", - "importModels": "Import Models", - "importSuccess": "Transcription models imported successfully", - "importFailed": "Failed to import transcription models", - "saveSuccess": "Transcription configuration saved", - "advanced": { - "title": "Advanced Settings", - "description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults." - }, - "test": { - "title": "Test Transcription", - "run": "Transcribe", - "failed": "Transcription failed" - } - }, "email": { "title": "Email", "add": "Add Email", @@ -945,8 +920,6 @@ "memoryHealthUnavailable": "Unavailable", "ttsModel": "TTS Model", "ttsModelPlaceholder": "Select TTS model", - "transcriptionModel": "Transcription Model", - "transcriptionModelPlaceholder": "Select transcription model", "imageModel": "Image Generation Model", "imageModelDescription": "Model used for the generate_image tool. Must support image-output compatibility.", "imageModelPlaceholder": "Select image model (optional)", diff --git a/apps/web/src/i18n/locales/zh.json b/apps/web/src/i18n/locales/zh.json index 382a3ee9..8891cad6 100644 --- a/apps/web/src/i18n/locales/zh.json +++ b/apps/web/src/i18n/locales/zh.json @@ -64,7 +64,6 @@ "webSearch": "搜索", "memory": "记忆", "speech": "语音", - "transcription": "转写", "email": "邮件", "profile": "用户", "home": "首页", @@ -422,9 +421,6 @@ "noModels": "暂无模型,点击\"导入模型\"发现可用模型,或点击\"新建模型\"手动创建。", "noCapabilities": "该模型暂无可用能力信息。", "saveSuccess": "语音配置已保存", - "synthesis": { - "models": "语音合成模型" - }, "advanced": { "title": "高级设置", "description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。" @@ -448,27 +444,6 @@ "failed": "合成失败" } }, - "transcription": { - "title": "语音转写", - "emptyTitle": "暂无转写提供方", - "emptyDescription": "添加转写提供方以为 Bot 启用语音转文字功能", - "models": "语音识别模型", - "noModels": "暂无语音识别模型,可导入可用模型,或保留默认模板模型。", - "importModels": "导入模型", - "importSuccess": "识别模型导入成功", - "importFailed": "识别模型导入失败", - "saveSuccess": "转写配置已保存", - "noCapabilities": "该模型暂无可用能力信息。", - "advanced": { - "title": "高级设置", - "description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。" - }, - "test": { - "title": "测试识别", - "run": "开始识别", - "failed": "识别失败" - } - }, "email": { "title": "邮件提供方", "add": "添加邮件提供方", @@ -941,8 +916,6 @@ "memoryHealthUnavailable": "暂不可用", "ttsModel": "语音合成模型", "ttsModelPlaceholder": "选择语音合成模型", - "transcriptionModel": "转写模型", - "transcriptionModelPlaceholder": "选择语音转写模型", "imageModel": "图片生成模型", "imageModelDescription": "用于 generate_image 工具的模型,必须支持 image-output 兼容性。", "imageModelPlaceholder": "选择图片模型(可选)", diff --git a/apps/web/src/pages/bots/components/bot-settings.vue b/apps/web/src/pages/bots/components/bot-settings.vue index 7f1e3e84..7fd4632b 100644 --- a/apps/web/src/pages/bots/components/bot-settings.vue +++ b/apps/web/src/pages/bots/components/bot-settings.vue @@ -187,17 +187,6 @@ />
- -
- - -
-
@@ -367,7 +356,7 @@ import MemoryProviderSelect from './memory-provider-select.vue' import TtsModelSelect from './tts-model-select.vue' import BrowserContextSelect from './browser-context-select.vue' import { useQuery, useMutation, useQueryCache } from '@pinia/colada' -import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getTranscriptionProviders, getTranscriptionModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk' +import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk' import type { SettingsSettings } from '@memohai/sdk' import type { Ref } from 'vue' import { resolveApiErrorMessage } from '@/utils/api-error' @@ -451,22 +440,6 @@ const { data: ttsModelData } = useQuery({ }, }) -const { data: transcriptionModelData } = useQuery({ - key: ['transcription-models'], - query: async () => { - const { data } = await getTranscriptionModels({ throwOnError: true }) - return data - }, -}) - -const { data: transcriptionProviderData } = useQuery({ - key: ['transcription-providers'], - query: async () => { - const { data } = await getTranscriptionProviders({ throwOnError: true }) - return data - }, -}) - const { data: browserContextData } = useQuery({ key: ['all-browser-contexts'], query: async () => { @@ -521,10 +494,7 @@ const searchProviders = computed(() => (searchProviderData.value ?? []).filter(( const memoryProviders = computed(() => memoryProviderData.value ?? []) const ttsProviders = computed(() => (ttsProviderData.value ?? []).filter((p) => p.enable !== false)) const enabledTtsProviderIds = computed(() => new Set(ttsProviders.value.map((p) => p.id))) -const transcriptionProviders = computed(() => (transcriptionProviderData.value ?? []).filter((p: Record) => p.enable !== false)) -const enabledTranscriptionProviderIds = computed(() => new Set(transcriptionProviders.value.map((p: Record) => p.id as string))) const ttsModels = computed(() => (ttsModelData.value ?? []).filter((m: Record) => enabledTtsProviderIds.value.has(m.provider_id as string))) -const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record) => enabledTranscriptionProviderIds.value.has(m.provider_id as string))) const browserContexts = computed(() => browserContextData.value ?? []) // ---- Form ---- @@ -535,7 +505,6 @@ const form = reactive({ search_provider_id: '', memory_provider_id: '', tts_model_id: '', - transcription_model_id: '', browser_context_id: '', timezone: '', language: '', @@ -675,7 +644,6 @@ watch(settings, (val) => { form.search_provider_id = val.search_provider_id ?? '' form.memory_provider_id = val.memory_provider_id ?? '' form.tts_model_id = val.tts_model_id ?? '' - form.transcription_model_id = val.transcription_model_id ?? '' form.browser_context_id = val.browser_context_id ?? '' form.language = val.language ?? '' form.timezone = val.timezone ?? '' @@ -698,7 +666,6 @@ const hasSettingsChanges = computed(() => { || form.search_provider_id !== (s.search_provider_id ?? '') || form.memory_provider_id !== (s.memory_provider_id ?? '') || form.tts_model_id !== (s.tts_model_id ?? '') - || form.transcription_model_id !== (s.transcription_model_id ?? '') || form.browser_context_id !== (s.browser_context_id ?? '') || form.language !== (s.language ?? '') || form.timezone !== (s.timezone ?? '') diff --git a/apps/web/src/pages/speech/components/model-config-editor.vue b/apps/web/src/pages/speech/components/model-config-editor.vue index d0b9c4cc..334d8f0d 100644 --- a/apps/web/src/pages/speech/components/model-config-editor.vue +++ b/apps/web/src/pages/speech/components/model-config-editor.vue @@ -85,7 +85,7 @@ v-else-if="advancedFields.length === 0" class="text-xs text-muted-foreground" > - {{ mode === 'transcription' ? $t('transcription.noCapabilities') : $t('speech.noCapabilities') }} + {{ $t('speech.noCapabilities') }}
- {{ mode === 'transcription' ? $t('transcription.advanced.title') : $t('speech.advanced.title') }} + {{ $t('speech.advanced.title') }}

- {{ mode === 'transcription' ? $t('transcription.advanced.description') : $t('speech.advanced.description') }} + {{ $t('speech.advanced.description') }}

- {{ mode === 'transcription' ? $t('transcription.test.title') : $t('speech.test.title') }} + {{ $t('speech.test.title') }}

-
+