From 66c529e4b17031ce3c311bb45d020d6e4f763b56 Mon Sep 17 00:00:00 2001 From: aki Date: Mon, 20 Apr 2026 17:57:27 +0900 Subject: [PATCH] feat: Ear and Mouth --- apps/web/src/constants/client-types.ts | 5 + apps/web/src/i18n/locales/en.json | 16 + apps/web/src/i18n/locales/zh.json | 16 + .../pages/bots/components/bot-settings.vue | 27 ++ .../speech/components/model-config-editor.vue | 87 ++++- .../speech/components/provider-setting.vue | 214 ++++++++++- cmd/agent/app.go | 32 ++ db/migrations/0001_init.up.sql | 6 +- ...cription_models_and_google_speech.down.sql | 33 ++ ...nscription_models_and_google_speech.up.sql | 31 ++ .../0070_add_bot_transcription_model.down.sql | 8 + .../0070_add_bot_transcription_model.up.sql | 5 + db/queries/models.sql | 51 ++- db/queries/settings.sql | 8 +- internal/agent/retry.go | 2 +- internal/agent/tools/container.go | 6 +- internal/agent/tools/transcribe.go | 232 ++++++++++++ .../identities/service_integration_test.go | 1 - internal/channel/inbound/channel.go | 140 +++++++ internal/db/sqlc/conversations.sql.go | 2 +- internal/db/sqlc/models.go | 1 + internal/db/sqlc/models.sql.go | 166 +++++++- internal/db/sqlc/settings.sql.go | 20 +- internal/handlers/tts_providers.go | 185 +++++++++ internal/models/models.go | 13 +- internal/models/types.go | 10 +- internal/settings/service.go | 15 + internal/settings/types.go | 2 + internal/tts/bootstrap.go | 34 +- internal/tts/registry.go | 216 +++++++++-- internal/tts/service.go | 358 ++++++++++++++++-- internal/tts/types.go | 52 ++- internal/workspace/image_preference.go | 2 + packages/sdk/src/types.gen.ts | 2 + 34 files changed, 1863 insertions(+), 135 deletions(-) create mode 100644 db/migrations/0069_add_transcription_models_and_google_speech.down.sql create mode 100644 db/migrations/0069_add_transcription_models_and_google_speech.up.sql create mode 100644 db/migrations/0070_add_bot_transcription_model.down.sql create mode 100644 db/migrations/0070_add_bot_transcription_model.up.sql create mode 100644 internal/agent/tools/transcribe.go diff --git a/apps/web/src/constants/client-types.ts b/apps/web/src/constants/client-types.ts index 812786b2..27e634df 100644 --- a/apps/web/src/constants/client-types.ts +++ b/apps/web/src/constants/client-types.ts @@ -80,6 +80,11 @@ export const CLIENT_TYPE_META: Record = { label: 'Microsoft Speech', hint: 'Azure Cognitive Services TTS', }, + 'google-speech': { + value: 'google-speech', + label: 'Google Speech', + hint: 'Gemini speech transcription', + }, } export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META) diff --git a/apps/web/src/i18n/locales/en.json b/apps/web/src/i18n/locales/en.json index 59f83d6c..65eaea5c 100644 --- a/apps/web/src/i18n/locales/en.json +++ b/apps/web/src/i18n/locales/en.json @@ -425,6 +425,20 @@ "noModels": "No models found. Click \"Import Models\" to discover available models or \"Add Model\" to create one manually.", "noCapabilities": "No capabilities available for this model.", "saveSuccess": "Speech configuration saved", + "synthesis": { + "models": "Synthesis Models" + }, + "transcription": { + "models": "Transcription Models", + "noModels": "No transcription models found. Import available models or keep the default template model.", + "importModels": "Import Transcription Models", + "importSuccess": "Transcription models imported successfully", + "importFailed": "Failed to import transcription models", + "test": { + "title": "Test Transcription", + "run": "Transcribe" + } + }, "advanced": { "title": "Advanced Settings", "description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults." @@ -920,6 +934,8 @@ "memoryHealthUnavailable": "Unavailable", "ttsModel": "TTS Model", "ttsModelPlaceholder": "Select TTS model", + "transcriptionModel": "Transcription Model", + "transcriptionModelPlaceholder": "Select transcription model", "imageModel": "Image Generation Model", "imageModelDescription": "Model used for the generate_image tool. Must support image-output compatibility.", "imageModelPlaceholder": "Select image model (optional)", diff --git a/apps/web/src/i18n/locales/zh.json b/apps/web/src/i18n/locales/zh.json index 8891cad6..9d3b9120 100644 --- a/apps/web/src/i18n/locales/zh.json +++ b/apps/web/src/i18n/locales/zh.json @@ -421,6 +421,20 @@ "noModels": "暂无模型,点击\"导入模型\"发现可用模型,或点击\"新建模型\"手动创建。", "noCapabilities": "该模型暂无可用能力信息。", "saveSuccess": "语音配置已保存", + "synthesis": { + "models": "语音合成模型" + }, + "transcription": { + "models": "语音识别模型", + "noModels": "暂无语音识别模型,可导入可用模型,或保留默认模板模型。", + "importModels": "导入识别模型", + "importSuccess": "识别模型导入成功", + "importFailed": "识别模型导入失败", + "test": { + "title": "测试识别", + "run": "开始识别" + } + }, "advanced": { "title": "高级设置", "description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。" @@ -916,6 +930,8 @@ "memoryHealthUnavailable": "暂不可用", "ttsModel": "语音合成模型", "ttsModelPlaceholder": "选择语音合成模型", + "transcriptionModel": "转写模型", + "transcriptionModelPlaceholder": "选择语音转写模型", "imageModel": "图片生成模型", "imageModelDescription": "用于 generate_image 工具的模型,必须支持 image-output 兼容性。", "imageModelPlaceholder": "选择图片模型(可选)", diff --git a/apps/web/src/pages/bots/components/bot-settings.vue b/apps/web/src/pages/bots/components/bot-settings.vue index 7fd4632b..8087de75 100644 --- a/apps/web/src/pages/bots/components/bot-settings.vue +++ b/apps/web/src/pages/bots/components/bot-settings.vue @@ -187,6 +187,17 @@ /> + +
+ + +
+
@@ -357,6 +368,7 @@ import TtsModelSelect from './tts-model-select.vue' import BrowserContextSelect from './browser-context-select.vue' import { useQuery, useMutation, useQueryCache } from '@pinia/colada' import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk' +import { client } from '@memohai/sdk/client' import type { SettingsSettings } from '@memohai/sdk' import type { Ref } from 'vue' import { resolveApiErrorMessage } from '@/utils/api-error' @@ -440,6 +452,17 @@ const { data: ttsModelData } = useQuery({ }, }) +const { data: transcriptionModelData } = useQuery({ + key: ['transcription-models'], + query: async () => { + const resp = await client.get({ + url: '/transcription-models', + throwOnError: true, + }) + return resp.data + }, +}) + const { data: browserContextData } = useQuery({ key: ['all-browser-contexts'], query: async () => { @@ -495,6 +518,7 @@ const memoryProviders = computed(() => memoryProviderData.value ?? []) const ttsProviders = computed(() => (ttsProviderData.value ?? []).filter((p) => p.enable !== false)) const enabledTtsProviderIds = computed(() => new Set(ttsProviders.value.map((p) => p.id))) const ttsModels = computed(() => (ttsModelData.value ?? []).filter((m: Record) => enabledTtsProviderIds.value.has(m.provider_id as string))) +const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record) => enabledTtsProviderIds.value.has(m.provider_id as string))) const browserContexts = computed(() => browserContextData.value ?? []) // ---- Form ---- @@ -505,6 +529,7 @@ const form = reactive({ search_provider_id: '', memory_provider_id: '', tts_model_id: '', + transcription_model_id: '', browser_context_id: '', timezone: '', language: '', @@ -644,6 +669,7 @@ watch(settings, (val) => { form.search_provider_id = val.search_provider_id ?? '' form.memory_provider_id = val.memory_provider_id ?? '' form.tts_model_id = val.tts_model_id ?? '' + form.transcription_model_id = val.transcription_model_id ?? '' form.browser_context_id = val.browser_context_id ?? '' form.language = val.language ?? '' form.timezone = val.timezone ?? '' @@ -666,6 +692,7 @@ const hasSettingsChanges = computed(() => { || form.search_provider_id !== (s.search_provider_id ?? '') || form.memory_provider_id !== (s.memory_provider_id ?? '') || form.tts_model_id !== (s.tts_model_id ?? '') + || form.transcription_model_id !== (s.transcription_model_id ?? '') || form.browser_context_id !== (s.browser_context_id ?? '') || form.language !== (s.language ?? '') || form.timezone !== (s.timezone ?? '') diff --git a/apps/web/src/pages/speech/components/model-config-editor.vue b/apps/web/src/pages/speech/components/model-config-editor.vue index 334d8f0d..62e24821 100644 --- a/apps/web/src/pages/speech/components/model-config-editor.vue +++ b/apps/web/src/pages/speech/components/model-config-editor.vue @@ -195,9 +195,12 @@

- {{ $t('speech.test.title') }} + {{ mode === 'transcription' ? $t('speech.transcription.test.title') : $t('speech.test.title') }}

-
+