From f845e936f89572966c56910d8613b16dad35e612 Mon Sep 17 00:00:00 2001 From: aki Date: Mon, 20 Apr 2026 22:04:58 +0900 Subject: [PATCH] fix: separate ear/mouth page --- .../web/src/components/create-model/index.vue | 63 ++- .../src/components/settings-sidebar/index.vue | 7 +- apps/web/src/constants/client-types.ts | 27 +- apps/web/src/i18n/locales/en.json | 6 + apps/web/src/i18n/locales/zh.json | 6 + .../pages/bots/components/bot-settings.vue | 15 +- .../speech/components/provider-setting.vue | 203 +-------- apps/web/src/pages/transcription/index.vue | 126 ++++++ .../pages/transcription/provider-setting.vue | 417 ++++++++++++++++++ apps/web/src/router.ts | 8 + db/migrations/0001_init.up.sql | 7 +- ...ription_models_and_speech_domain.down.sql} | 4 +- ...scription_models_and_speech_domain.up.sql} | 4 +- ...071_split_transcription_providers.down.sql | 33 ++ .../0071_split_transcription_providers.up.sql | 29 ++ db/queries/models.sql | 28 +- internal/db/sqlc/models.sql.go | 59 ++- internal/handlers/tts_providers.go | 27 +- internal/models/models.go | 11 +- internal/models/types.go | 37 +- internal/tts/bootstrap.go | 50 ++- internal/tts/registry.go | 126 +++++- internal/tts/service.go | 20 + 23 files changed, 1055 insertions(+), 258 deletions(-) create mode 100644 apps/web/src/pages/transcription/index.vue create mode 100644 apps/web/src/pages/transcription/provider-setting.vue rename db/migrations/{0069_add_transcription_models_and_google_speech.down.sql => 0069_add_transcription_models_and_speech_domain.down.sql} (86%) rename db/migrations/{0069_add_transcription_models_and_google_speech.up.sql => 0069_add_transcription_models_and_speech_domain.up.sql} (83%) create mode 100644 db/migrations/0071_split_transcription_providers.down.sql create mode 100644 db/migrations/0071_split_transcription_providers.up.sql diff --git a/apps/web/src/components/create-model/index.vue b/apps/web/src/components/create-model/index.vue index 93442bfb..d168332b 100644 --- a/apps/web/src/components/create-model/index.vue +++ b/apps/web/src/components/create-model/index.vue @@ -18,6 +18,7 @@
@@ -35,11 +36,12 @@ - - Chat - - - Embedding + + {{ opt.label }} @@ -181,6 +183,11 @@ import { COMPATIBILITY_OPTIONS } from '@/constants/compatibilities' import FormDialogShell from '@/components/form-dialog-shell/index.vue' import { useDialogMutation } from '@/composables/useDialogMutation' +interface ModelTypeOption { + value: string + label: string +} + const selectedCompat = ref([]) const { t } = useI18n() const { run } = useDialogMutation() @@ -193,14 +200,30 @@ const formSchema = toTypedSchema(z.object({ context_window: z.coerce.number().min(1).optional(), })) +const props = withDefaults(defineProps<{ + id: string + typeOptions?: ModelTypeOption[] + defaultType?: string + hideType?: boolean + invalidateKeys?: string[] +}>(), { + typeOptions: () => [ + { value: 'chat', label: 'Chat' }, + { value: 'embedding', label: 'Embedding' }, + ], + defaultType: 'chat', + hideType: false, + invalidateKeys: () => ['provider-models'], +}) + const form = useForm({ validationSchema: formSchema, initialValues: { - type: 'chat', + type: props.defaultType, }, }) -const selectedType = computed(() => form.values.type || 'chat') +const selectedType = computed(() => form.values.type || props.defaultType) const open = inject>('openModel', ref(false)) const title = inject>('openModelTitle', ref('title')) @@ -237,15 +260,19 @@ function onNameInput(e: Event) { form.setFieldValue('name', (e.target as HTMLInputElement).value) } -const { id } = defineProps<{ id: string }>() - const queryCache = useQueryCache() +function invalidateModelQueries() { + for (const key of props.invalidateKeys) { + queryCache.invalidateQueries({ key: [key] }) + } +} + const { mutateAsync: createModel, isLoading: createLoading } = useMutation({ mutation: async (data: Record) => { const { data: result } = await postModels({ body: data as ModelsAddRequest, throwOnError: true }) return result }, - onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }), + onSettled: invalidateModelQueries, }) const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({ mutation: async ({ id, data }: { id: string; data: Record }) => { @@ -256,7 +283,7 @@ const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({ }) return result }, - onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }), + onSettled: invalidateModelQueries, }) const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading } = useMutation({ mutation: async ({ modelId, data }: { modelId: string; data: Record }) => { @@ -267,7 +294,7 @@ const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading }) return result }, - onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }), + onSettled: invalidateModelQueries, }) const isLoading = computed(() => createLoading.value || updateLoading.value || updateLegacyLoading.value) @@ -297,7 +324,7 @@ async function addModel() { const payload: Record = { type, model_id, - provider_id: id, + provider_id: props.id, config, } @@ -348,7 +375,15 @@ watch(open, async () => { selectedCompat.value = config?.compatibilities ?? [] userEditedName.value = !!(name && name !== model_id) } else { - form.resetForm({ values: { type: 'chat', model_id: '', name: '', dimensions: undefined, context_window: undefined } }) + form.resetForm({ + values: { + type: props.defaultType, + model_id: '', + name: '', + dimensions: undefined, + context_window: undefined, + }, + }) selectedCompat.value = [] userEditedName.value = false } diff --git a/apps/web/src/components/settings-sidebar/index.vue b/apps/web/src/components/settings-sidebar/index.vue index aef7f5fd..66e81ea0 100644 --- a/apps/web/src/components/settings-sidebar/index.vue +++ b/apps/web/src/components/settings-sidebar/index.vue @@ -52,7 +52,7 @@ import { computed, type Component } from 'vue' import { storeToRefs } from 'pinia' import { useRouter, useRoute } from 'vue-router' import { useI18n } from 'vue-i18n' -import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next' +import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, AudioLines, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next' import { useChatSelectionStore } from '@/store/chat-selection' import { Sidebar, @@ -118,6 +118,11 @@ const navItems = computed<{ title: string; name: string; icon: Component }[]>(() name: 'speech', icon: Volume2, }, + { + title: t('sidebar.transcription'), + name: 'transcription', + icon: AudioLines, + }, { title: t('sidebar.email'), name: 'email', diff --git a/apps/web/src/constants/client-types.ts b/apps/web/src/constants/client-types.ts index 27e634df..c69615c3 100644 --- a/apps/web/src/constants/client-types.ts +++ b/apps/web/src/constants/client-types.ts @@ -45,21 +45,41 @@ export const CLIENT_TYPE_META: Record = { label: 'OpenAI Speech', hint: 'OpenAI /audio/speech compatible TTS', }, + 'openai-transcription': { + value: 'openai-transcription', + label: 'OpenAI Transcription', + hint: 'OpenAI audio transcription', + }, 'openrouter-speech': { value: 'openrouter-speech', label: 'OpenRouter Speech', hint: 'OpenRouter audio modality TTS', }, + 'openrouter-transcription': { + value: 'openrouter-transcription', + label: 'OpenRouter Transcription', + hint: 'OpenRouter transcription models', + }, 'elevenlabs-speech': { value: 'elevenlabs-speech', label: 'ElevenLabs Speech', hint: 'ElevenLabs text-to-speech', }, + 'elevenlabs-transcription': { + value: 'elevenlabs-transcription', + label: 'ElevenLabs Transcription', + hint: 'ElevenLabs speech-to-text', + }, 'deepgram-speech': { value: 'deepgram-speech', label: 'Deepgram Speech', hint: 'Deepgram TTS', }, + 'deepgram-transcription': { + value: 'deepgram-transcription', + label: 'Deepgram Transcription', + hint: 'Deepgram speech-to-text', + }, 'minimax-speech': { value: 'minimax-speech', label: 'MiniMax Speech', @@ -85,9 +105,14 @@ export const CLIENT_TYPE_META: Record = { label: 'Google Speech', hint: 'Gemini speech transcription', }, + 'google-transcription': { + value: 'google-transcription', + label: 'Google Transcription', + hint: 'Gemini speech transcription', + }, } export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META) export const LLM_CLIENT_TYPE_LIST: ClientTypeMeta[] = CLIENT_TYPE_LIST - .filter(ct => !ct.value.endsWith('-speech')) + .filter(ct => !ct.value.endsWith('-speech') && !ct.value.endsWith('-transcription')) diff --git a/apps/web/src/i18n/locales/en.json b/apps/web/src/i18n/locales/en.json index 65eaea5c..0fd14a94 100644 --- a/apps/web/src/i18n/locales/en.json +++ b/apps/web/src/i18n/locales/en.json @@ -63,6 +63,7 @@ "webSearch": "Web Search", "memory": "Memory", "speech": "Speech", + "transcription": "Transcription", "email": "Email", "settings": "Settings", "profile": "Profile", @@ -462,6 +463,11 @@ "failed": "Synthesis failed" } }, + "transcription": { + "title": "Transcription", + "emptyTitle": "No Transcription Providers", + "emptyDescription": "Add a transcription provider to enable speech-to-text for your bots" + }, "email": { "title": "Email", "add": "Add Email", diff --git a/apps/web/src/i18n/locales/zh.json b/apps/web/src/i18n/locales/zh.json index 9d3b9120..930b6028 100644 --- a/apps/web/src/i18n/locales/zh.json +++ b/apps/web/src/i18n/locales/zh.json @@ -64,6 +64,7 @@ "webSearch": "搜索", "memory": "记忆", "speech": "语音", + "transcription": "转写", "email": "邮件", "profile": "用户", "home": "首页", @@ -458,6 +459,11 @@ "failed": "合成失败" } }, + "transcription": { + "title": "语音转写", + "emptyTitle": "暂无转写提供方", + "emptyDescription": "添加转写提供方以为 Bot 启用语音转文字功能" + }, "email": { "title": "邮件提供方", "add": "添加邮件提供方", diff --git a/apps/web/src/pages/bots/components/bot-settings.vue b/apps/web/src/pages/bots/components/bot-settings.vue index 8087de75..718cc089 100644 --- a/apps/web/src/pages/bots/components/bot-settings.vue +++ b/apps/web/src/pages/bots/components/bot-settings.vue @@ -463,6 +463,17 @@ const { data: transcriptionModelData } = useQuery({ }, }) +const { data: transcriptionProviderData } = useQuery({ + key: ['transcription-providers'], + query: async () => { + const resp = await client.get({ + url: '/transcription-providers', + throwOnError: true, + }) + return resp.data + }, +}) + const { data: browserContextData } = useQuery({ key: ['all-browser-contexts'], query: async () => { @@ -517,8 +528,10 @@ const searchProviders = computed(() => (searchProviderData.value ?? []).filter(( const memoryProviders = computed(() => memoryProviderData.value ?? []) const ttsProviders = computed(() => (ttsProviderData.value ?? []).filter((p) => p.enable !== false)) const enabledTtsProviderIds = computed(() => new Set(ttsProviders.value.map((p) => p.id))) +const transcriptionProviders = computed(() => (transcriptionProviderData.value ?? []).filter((p: Record) => p.enable !== false)) +const enabledTranscriptionProviderIds = computed(() => new Set(transcriptionProviders.value.map((p: Record) => p.id as string))) const ttsModels = computed(() => (ttsModelData.value ?? []).filter((m: Record) => enabledTtsProviderIds.value.has(m.provider_id as string))) -const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record) => enabledTtsProviderIds.value.has(m.provider_id as string))) +const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record) => enabledTranscriptionProviderIds.value.has(m.provider_id as string))) const browserContexts = computed(() => browserContextData.value ?? []) // ---- Form ---- diff --git a/apps/web/src/pages/speech/components/provider-setting.vue b/apps/web/src/pages/speech/components/provider-setting.vue index feb4ff6d..ac8846e3 100644 --- a/apps/web/src/pages/speech/components/provider-setting.vue +++ b/apps/web/src/pages/speech/components/provider-setting.vue @@ -140,16 +140,27 @@

{{ $t('speech.synthesis.models') }}

- - {{ $t('speech.importModels') }} - + + {{ $t('speech.importModels') }} + + +
- - - -
-
-

- {{ $t('speech.transcription.models') }} -

- - {{ $t('speech.transcription.importModels') }} - -
- -
- {{ $t('speech.transcription.noModels') }} -
- -
- - -
- -
-
-
@@ -288,6 +233,7 @@ import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvider import type { TtsSpeechModelResponse, TtsSpeechProviderResponse } from '@memohai/sdk' import LoadingButton from '@/components/loading-button/index.vue' import ProviderIcon from '@/components/provider-icon/index.vue' +import CreateModel from '@/components/create-model/index.vue' interface SpeechFieldSchema { key: string @@ -324,8 +270,6 @@ interface SpeechProviderMeta { models?: SpeechModelMeta[] default_synthesis_model?: string synthesis_models?: SpeechModelMeta[] - default_transcription_model?: string - transcription_models?: SpeechModelMeta[] } function getInitials(name: string | undefined) { @@ -340,12 +284,13 @@ const providerName = ref('') const providerConfig = reactive>({}) const visibleSecrets = reactive>({}) const expandedModelId = ref('') -const expandedTranscriptionModelId = ref('') const enableLoading = ref(false) const saveLoading = ref(false) const importLoading = ref(false) -const importTranscriptionLoading = ref(false) const queryCache = useQueryCache() +const speechTypeOptions = [ + { value: 'speech', label: 'Speech' }, +] const { data: providerDetail } = useQuery({ key: () => ['speech-provider-detail', curProviderId.value], @@ -389,22 +334,7 @@ const { data: providerSpeechModels } = useQuery({ }, }) -const { data: providerTranscriptionModelsData } = useQuery({ - key: () => ['speech-provider-transcription-models', curProviderId.value], - query: async () => { - if (!curProviderId.value) return [] - const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api' - const token = localStorage.getItem('token') - const resp = await fetch(`${apiBase}/speech-providers/${curProviderId.value}/transcription-models`, { - headers: token ? { Authorization: `Bearer ${token}` } : undefined, - }) - if (!resp.ok) throw new Error(await resp.text()) - return await resp.json() - }, -}) - const providerModels = computed(() => ((providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? [])) -const providerTranscriptionModels = computed(() => ((providerTranscriptionModelsData.value as TtsSpeechModelResponse[] | undefined) ?? [])) watch(() => providerDetail.value, (provider) => { providerName.value = provider?.name ?? curProvider.value?.name ?? '' @@ -426,29 +356,10 @@ function getModelSchema(modelID: string): SpeechConfigSchema | null { return meta?.config_schema ?? meta?.capabilities?.config_schema ?? null } -function getTranscriptionModelMeta(modelID: string): SpeechModelMeta | null { - const models = currentMeta.value?.transcription_models ?? [] - const exact = models.find(m => m.id === modelID) - if (exact) return exact - if (currentMeta.value?.default_transcription_model) { - return models.find(m => m.id === currentMeta.value?.default_transcription_model) ?? null - } - return models[0] ?? null -} - -function getTranscriptionModelSchema(modelID: string): SpeechConfigSchema | null { - const meta = getTranscriptionModelMeta(modelID) - return meta?.config_schema ?? meta?.capabilities?.config_schema ?? null -} - function toggleModel(id: string) { expandedModelId.value = expandedModelId.value === id ? '' : id } -function toggleTranscriptionModel(id: string) { - expandedTranscriptionModelId.value = expandedTranscriptionModelId.value === id ? '' : id -} - async function handleToggleEnable(value: boolean) { if (!curProviderId.value || !curProvider.value) return const prev = curProvider.value.enable ?? false @@ -526,31 +437,6 @@ async function handleSaveModel(modelId: string, config: Record) } } -async function handleSaveTranscriptionModel(modelId: string, config: Record) { - const model = providerTranscriptionModels.value.find(item => item.id === modelId) - if (!model) return - try { - const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api' - const token = localStorage.getItem('token') - const resp = await fetch(`${apiBase}/transcription-models/${modelId}`, { - method: 'PUT', - headers: { - 'Content-Type': 'application/json', - ...(token ? { Authorization: `Bearer ${token}` } : {}), - }, - body: JSON.stringify({ - name: model.name ?? model.model_id, - config, - }), - }) - if (!resp.ok) throw new Error(await resp.text()) - toast.success(t('speech.saveSuccess')) - queryCache.invalidateQueries({ key: ['speech-provider-transcription-models', curProviderId.value] }) - } catch { - toast.error(t('common.saveFailed')) - } -} - async function handleImportModels() { if (!curProviderId.value) return importLoading.value = true @@ -573,31 +459,6 @@ async function handleImportModels() { } } -async function handleImportTranscriptionModels() { - if (!curProviderId.value) return - importTranscriptionLoading.value = true - try { - const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api' - const token = localStorage.getItem('token') - const resp = await fetch(`${apiBase}/speech-providers/${curProviderId.value}/import-transcription-models`, { - method: 'POST', - headers: token ? { Authorization: `Bearer ${token}` } : undefined, - }) - if (!resp.ok) throw new Error(await resp.text()) - const data = await resp.json() - toast.success(t('speech.transcription.importSuccess', { - created: data?.created ?? 0, - skipped: data?.skipped ?? 0, - })) - queryCache.invalidateQueries({ key: ['speech-provider-transcription-models', curProviderId.value] }) - queryCache.invalidateQueries({ key: ['speech-providers-meta'] }) - } catch { - toast.error(t('speech.transcription.importFailed')) - } finally { - importTranscriptionLoading.value = false - } -} - async function handleTestModel(modelId: string, text: string, config: Record) { const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api' const token = localStorage.getItem('token') @@ -622,24 +483,6 @@ async function handleTestModel(modelId: string, text: string, config: Record) { - const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api' - const token = localStorage.getItem('token') - const form = new FormData() - form.append('file', file) - form.append('config', JSON.stringify(config)) - const resp = await fetch(`${apiBase}/transcription-models/${modelId}/test`, { - method: 'POST', - headers: token ? { Authorization: `Bearer ${token}` } : undefined, - body: form, - }) - if (!resp.ok) { - const errBody = await resp.text() - throw new Error(errBody) - } - return await resp.json() -} - function sanitizeConfig(input: Record) { const result: Record = {} for (const [key, value] of Object.entries(input)) { diff --git a/apps/web/src/pages/transcription/index.vue b/apps/web/src/pages/transcription/index.vue new file mode 100644 index 00000000..14d85adb --- /dev/null +++ b/apps/web/src/pages/transcription/index.vue @@ -0,0 +1,126 @@ + + + diff --git a/apps/web/src/pages/transcription/provider-setting.vue b/apps/web/src/pages/transcription/provider-setting.vue new file mode 100644 index 00000000..5b127143 --- /dev/null +++ b/apps/web/src/pages/transcription/provider-setting.vue @@ -0,0 +1,417 @@ + + + diff --git a/apps/web/src/router.ts b/apps/web/src/router.ts index 5926afb9..1bac3340 100644 --- a/apps/web/src/router.ts +++ b/apps/web/src/router.ts @@ -89,6 +89,14 @@ const routes = [ breadcrumb: i18nRef('sidebar.speech'), }, }, + { + name: 'transcription', + path: 'transcription', + component: () => import('@/pages/transcription/index.vue'), + meta: { + breadcrumb: i18nRef('sidebar.transcription'), + }, + }, { name: 'email', path: 'email', diff --git a/db/migrations/0001_init.up.sql b/db/migrations/0001_init.up.sql index 45db41ea..855b6734 100644 --- a/db/migrations/0001_init.up.sql +++ b/db/migrations/0001_init.up.sql @@ -77,14 +77,19 @@ CREATE TABLE IF NOT EXISTS providers ( 'github-copilot', 'edge-speech', 'openai-speech', + 'openai-transcription', 'openrouter-speech', + 'openrouter-transcription', 'elevenlabs-speech', + 'elevenlabs-transcription', 'deepgram-speech', + 'deepgram-transcription', 'minimax-speech', 'volcengine-speech', 'alibabacloud-speech', 'microsoft-speech', - 'google-speech' + 'google-speech', + 'google-transcription' )) ); diff --git a/db/migrations/0069_add_transcription_models_and_google_speech.down.sql b/db/migrations/0069_add_transcription_models_and_speech_domain.down.sql similarity index 86% rename from db/migrations/0069_add_transcription_models_and_google_speech.down.sql rename to db/migrations/0069_add_transcription_models_and_speech_domain.down.sql index 10135402..31cd9b8a 100644 --- a/db/migrations/0069_add_transcription_models_and_google_speech.down.sql +++ b/db/migrations/0069_add_transcription_models_and_speech_domain.down.sql @@ -1,5 +1,5 @@ --- 0069_add_transcription_models_and_google_speech --- Revert transcription model type and Google speech provider support. +-- 0069_add_transcription_models_and_speech_domain +-- Revert transcription model type and speech-domain expansion. DELETE FROM models WHERE type = 'transcription'; DELETE FROM providers WHERE client_type = 'google-speech'; diff --git a/db/migrations/0069_add_transcription_models_and_google_speech.up.sql b/db/migrations/0069_add_transcription_models_and_speech_domain.up.sql similarity index 83% rename from db/migrations/0069_add_transcription_models_and_google_speech.up.sql rename to db/migrations/0069_add_transcription_models_and_speech_domain.up.sql index e1f24cc3..2342f685 100644 --- a/db/migrations/0069_add_transcription_models_and_google_speech.up.sql +++ b/db/migrations/0069_add_transcription_models_and_speech_domain.up.sql @@ -1,5 +1,5 @@ --- 0069_add_transcription_models_and_google_speech --- Expand unified speech domain to support transcription models and Google speech providers. +-- 0069_add_transcription_models_and_speech_domain +-- Expand the speech domain to support transcription models and shared speech providers. ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check; diff --git a/db/migrations/0071_split_transcription_providers.down.sql b/db/migrations/0071_split_transcription_providers.down.sql new file mode 100644 index 00000000..b250f193 --- /dev/null +++ b/db/migrations/0071_split_transcription_providers.down.sql @@ -0,0 +1,33 @@ +-- 0071_split_transcription_providers +-- Remove dedicated transcription provider client types. + +DELETE FROM providers +WHERE client_type IN ( + 'openai-transcription', + 'openrouter-transcription', + 'elevenlabs-transcription', + 'deepgram-transcription', + 'google-transcription' +); + +ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check; + +ALTER TABLE providers +ADD CONSTRAINT providers_client_type_check CHECK (client_type IN ( + 'openai-responses', + 'openai-completions', + 'anthropic-messages', + 'google-generative-ai', + 'openai-codex', + 'github-copilot', + 'edge-speech', + 'openai-speech', + 'openrouter-speech', + 'elevenlabs-speech', + 'deepgram-speech', + 'minimax-speech', + 'volcengine-speech', + 'alibabacloud-speech', + 'microsoft-speech', + 'google-speech' +)); diff --git a/db/migrations/0071_split_transcription_providers.up.sql b/db/migrations/0071_split_transcription_providers.up.sql new file mode 100644 index 00000000..2c08550c --- /dev/null +++ b/db/migrations/0071_split_transcription_providers.up.sql @@ -0,0 +1,29 @@ +-- 0071_split_transcription_providers +-- Add dedicated transcription provider client types. + +ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check; + +ALTER TABLE providers +ADD CONSTRAINT providers_client_type_check CHECK (client_type IN ( + 'openai-responses', + 'openai-completions', + 'anthropic-messages', + 'google-generative-ai', + 'openai-codex', + 'github-copilot', + 'edge-speech', + 'openai-speech', + 'openai-transcription', + 'openrouter-speech', + 'openrouter-transcription', + 'elevenlabs-speech', + 'elevenlabs-transcription', + 'deepgram-speech', + 'deepgram-transcription', + 'minimax-speech', + 'volcengine-speech', + 'alibabacloud-speech', + 'microsoft-speech', + 'google-speech', + 'google-transcription' +)); diff --git a/db/queries/models.sql b/db/queries/models.sql index fd2e0703..d87de737 100644 --- a/db/queries/models.sql +++ b/db/queries/models.sql @@ -21,14 +21,19 @@ SELECT * FROM providers WHERE client_type NOT IN ( 'edge-speech', 'openai-speech', + 'openai-transcription', 'openrouter-speech', + 'openrouter-transcription', 'elevenlabs-speech', + 'elevenlabs-transcription', 'deepgram-speech', + 'deepgram-transcription', 'minimax-speech', 'volcengine-speech', 'alibabacloud-speech', 'microsoft-speech', - 'google-speech' + 'google-speech', + 'google-transcription' ) ORDER BY created_at DESC; @@ -54,14 +59,19 @@ FROM providers WHERE client_type NOT IN ( 'edge-speech', 'openai-speech', + 'openai-transcription', 'openrouter-speech', + 'openrouter-transcription', 'elevenlabs-speech', + 'elevenlabs-transcription', 'deepgram-speech', + 'deepgram-transcription', 'minimax-speech', 'volcengine-speech', 'alibabacloud-speech', 'microsoft-speech', - 'google-speech' + 'google-speech', + 'google-transcription' ); -- name: CreateModel :one @@ -230,8 +240,18 @@ WHERE client_type IN ( 'minimax-speech', 'volcengine-speech', 'alibabacloud-speech', - 'microsoft-speech', - 'google-speech' + 'microsoft-speech' +) +ORDER BY created_at DESC; + +-- name: ListTranscriptionProviders :many +SELECT * FROM providers +WHERE client_type IN ( + 'openai-transcription', + 'openrouter-transcription', + 'elevenlabs-transcription', + 'deepgram-transcription', + 'google-transcription' ) ORDER BY created_at DESC; diff --git a/internal/db/sqlc/models.sql.go b/internal/db/sqlc/models.sql.go index a94ca59b..400100c9 100644 --- a/internal/db/sqlc/models.sql.go +++ b/internal/db/sqlc/models.sql.go @@ -40,14 +40,19 @@ FROM providers WHERE client_type NOT IN ( 'edge-speech', 'openai-speech', + 'openai-transcription', 'openrouter-speech', + 'openrouter-transcription', 'elevenlabs-speech', + 'elevenlabs-transcription', 'deepgram-speech', + 'deepgram-transcription', 'minimax-speech', 'volcengine-speech', 'alibabacloud-speech', 'microsoft-speech', - 'google-speech' + 'google-speech', + 'google-transcription' ) ` @@ -805,14 +810,19 @@ SELECT id, name, client_type, icon, enable, config, metadata, created_at, update WHERE client_type NOT IN ( 'edge-speech', 'openai-speech', + 'openai-transcription', 'openrouter-speech', + 'openrouter-transcription', 'elevenlabs-speech', + 'elevenlabs-transcription', 'deepgram-speech', + 'deepgram-transcription', 'minimax-speech', 'volcengine-speech', 'alibabacloud-speech', 'microsoft-speech', - 'google-speech' + 'google-speech', + 'google-transcription' ) ORDER BY created_at DESC ` @@ -945,8 +955,7 @@ WHERE client_type IN ( 'minimax-speech', 'volcengine-speech', 'alibabacloud-speech', - 'microsoft-speech', - 'google-speech' + 'microsoft-speech' ) ORDER BY created_at DESC ` @@ -1068,6 +1077,48 @@ func (q *Queries) ListTranscriptionModelsByProviderID(ctx context.Context, provi return items, nil } +const listTranscriptionProviders = `-- name: ListTranscriptionProviders :many +SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers +WHERE client_type IN ( + 'openai-transcription', + 'openrouter-transcription', + 'elevenlabs-transcription', + 'deepgram-transcription', + 'google-transcription' +) +ORDER BY created_at DESC +` + +func (q *Queries) ListTranscriptionProviders(ctx context.Context) ([]Provider, error) { + rows, err := q.db.Query(ctx, listTranscriptionProviders) + if err != nil { + return nil, err + } + defer rows.Close() + var items []Provider + for rows.Next() { + var i Provider + if err := rows.Scan( + &i.ID, + &i.Name, + &i.ClientType, + &i.Icon, + &i.Enable, + &i.Config, + &i.Metadata, + &i.CreatedAt, + &i.UpdatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const updateModel = `-- name: UpdateModel :one UPDATE models SET diff --git a/internal/handlers/tts_providers.go b/internal/handlers/tts_providers.go index b8440bae..e802fae3 100644 --- a/internal/handlers/tts_providers.go +++ b/internal/handlers/tts_providers.go @@ -34,11 +34,16 @@ func (h *SpeechHandler) Register(e *echo.Echo) { pg := e.Group("/speech-providers") pg.GET("", h.ListProviders) pg.GET("/:id", h.GetProvider) - pg.GET("/meta", h.ListMeta) + pg.GET("/meta", h.ListSpeechMeta) pg.GET("/:id/models", h.ListModelsByProvider) pg.POST("/:id/import-models", h.ImportModels) - pg.GET("/:id/transcription-models", h.ListTranscriptionModelsByProvider) - pg.POST("/:id/import-transcription-models", h.ImportTranscriptionModels) + + tpg := e.Group("/transcription-providers") + tpg.GET("", h.ListTranscriptionProviders) + tpg.GET("/meta", h.ListTranscriptionMeta) + tpg.GET("/:id", h.GetProvider) + tpg.GET("/:id/models", h.ListTranscriptionModelsByProvider) + tpg.POST("/:id/import-models", h.ImportTranscriptionModels) mg := e.Group("/speech-models") mg.GET("", h.ListModels) @@ -61,8 +66,12 @@ func (h *SpeechHandler) Register(e *echo.Echo) { // @Tags speech-providers // @Success 200 {array} tts.ProviderMetaResponse // @Router /speech-providers/meta [get]. -func (h *SpeechHandler) ListMeta(c echo.Context) error { - return c.JSON(http.StatusOK, h.service.ListMeta(c.Request().Context())) +func (h *SpeechHandler) ListSpeechMeta(c echo.Context) error { + return c.JSON(http.StatusOK, h.service.ListSpeechMeta(c.Request().Context())) +} + +func (h *SpeechHandler) ListTranscriptionMeta(c echo.Context) error { + return c.JSON(http.StatusOK, h.service.ListTranscriptionMeta(c.Request().Context())) } // ListProviders godoc @@ -81,6 +90,14 @@ func (h *SpeechHandler) ListProviders(c echo.Context) error { return c.JSON(http.StatusOK, items) } +func (h *SpeechHandler) ListTranscriptionProviders(c echo.Context) error { + items, err := h.service.ListTranscriptionProviders(c.Request().Context()) + if err != nil { + return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) + } + return c.JSON(http.StatusOK, items) +} + // GetProvider godoc // @Summary Get speech provider // @Description Get a speech provider with masked config values diff --git a/internal/models/models.go b/internal/models/models.go index 3c2f04d0..def74323 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -432,14 +432,19 @@ func IsValidClientType(clientType ClientType) bool { ClientTypeGitHubCopilot, ClientTypeEdgeSpeech, ClientTypeOpenAISpeech, + ClientTypeOpenAITranscription, ClientTypeOpenRouterSpeech, + ClientTypeOpenRouterTranscription, ClientTypeElevenLabsSpeech, + ClientTypeElevenLabsTranscription, ClientTypeDeepgramSpeech, + ClientTypeDeepgramTranscription, ClientTypeMiniMaxSpeech, ClientTypeVolcengineSpeech, ClientTypeAlibabaSpeech, ClientTypeMicrosoftSpeech, - ClientTypeGoogleSpeech: + ClientTypeGoogleSpeech, + ClientTypeGoogleTranscription: return true default: return false @@ -449,7 +454,9 @@ func IsValidClientType(clientType ClientType) bool { // IsLLMClientType returns true if the client type belongs to the LLM domain // (chat/embedding), excluding speech-only types (any type ending in "-speech"). func IsLLMClientType(clientType ClientType) bool { - return IsValidClientType(clientType) && !strings.HasSuffix(string(clientType), "-speech") + return IsValidClientType(clientType) && + !strings.HasSuffix(string(clientType), "-speech") && + !strings.HasSuffix(string(clientType), "-transcription") } // SelectMemoryModel selects a chat model for memory operations. diff --git a/internal/models/types.go b/internal/models/types.go index a4ef8e1b..d0b180f6 100644 --- a/internal/models/types.go +++ b/internal/models/types.go @@ -18,22 +18,27 @@ const ( type ClientType string const ( - ClientTypeOpenAIResponses ClientType = "openai-responses" - ClientTypeOpenAICompletions ClientType = "openai-completions" - ClientTypeAnthropicMessages ClientType = "anthropic-messages" - ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai" - ClientTypeOpenAICodex ClientType = "openai-codex" - ClientTypeGitHubCopilot ClientType = "github-copilot" - ClientTypeEdgeSpeech ClientType = "edge-speech" - ClientTypeOpenAISpeech ClientType = "openai-speech" - ClientTypeOpenRouterSpeech ClientType = "openrouter-speech" - ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech" - ClientTypeDeepgramSpeech ClientType = "deepgram-speech" - ClientTypeMiniMaxSpeech ClientType = "minimax-speech" - ClientTypeVolcengineSpeech ClientType = "volcengine-speech" - ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech" - ClientTypeMicrosoftSpeech ClientType = "microsoft-speech" - ClientTypeGoogleSpeech ClientType = "google-speech" + ClientTypeOpenAIResponses ClientType = "openai-responses" + ClientTypeOpenAICompletions ClientType = "openai-completions" + ClientTypeAnthropicMessages ClientType = "anthropic-messages" + ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai" + ClientTypeOpenAICodex ClientType = "openai-codex" + ClientTypeGitHubCopilot ClientType = "github-copilot" + ClientTypeEdgeSpeech ClientType = "edge-speech" + ClientTypeOpenAISpeech ClientType = "openai-speech" + ClientTypeOpenAITranscription ClientType = "openai-transcription" + ClientTypeOpenRouterSpeech ClientType = "openrouter-speech" + ClientTypeOpenRouterTranscription ClientType = "openrouter-transcription" + ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech" + ClientTypeElevenLabsTranscription ClientType = "elevenlabs-transcription" + ClientTypeDeepgramSpeech ClientType = "deepgram-speech" + ClientTypeDeepgramTranscription ClientType = "deepgram-transcription" + ClientTypeMiniMaxSpeech ClientType = "minimax-speech" + ClientTypeVolcengineSpeech ClientType = "volcengine-speech" + ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech" + ClientTypeMicrosoftSpeech ClientType = "microsoft-speech" + ClientTypeGoogleSpeech ClientType = "google-speech" + ClientTypeGoogleTranscription ClientType = "google-transcription" ) const ( diff --git a/internal/tts/bootstrap.go b/internal/tts/bootstrap.go index 91e6a0d7..5f4cbe96 100644 --- a/internal/tts/bootstrap.go +++ b/internal/tts/bootstrap.go @@ -34,32 +34,34 @@ func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Querie } synced := 0 - for _, model := range def.Models { - if shouldHideTemplateModel(def, models.ModelTypeSpeech, model.ID) { - if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{ - ProviderID: provider.ID, - ModelID: model.ID, - Type: string(models.ModelTypeSpeech), - }); err != nil { - return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err) + if !isTranscriptionClientType(def.ClientType) { + for _, model := range def.Models { + if shouldHideTemplateModel(def, models.ModelTypeSpeech, model.ID) { + if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{ + ProviderID: provider.ID, + ModelID: model.ID, + Type: string(models.ModelTypeSpeech), + }); err != nil { + return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err) + } + continue } - continue + modelConfigJSON, err := json.Marshal(map[string]any{}) + if err != nil { + return fmt.Errorf("marshal speech model config: %w", err) + } + name := pgtype.Text{String: model.Name, Valid: model.Name != ""} + if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{ + ModelID: model.ID, + Name: name, + ProviderID: provider.ID, + Type: string(models.ModelTypeSpeech), + Config: modelConfigJSON, + }); err != nil { + return fmt.Errorf("upsert speech model %s: %w", model.ID, err) + } + synced++ } - modelConfigJSON, err := json.Marshal(map[string]any{}) - if err != nil { - return fmt.Errorf("marshal speech model config: %w", err) - } - name := pgtype.Text{String: model.Name, Valid: model.Name != ""} - if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{ - ModelID: model.ID, - Name: name, - ProviderID: provider.ID, - Type: string(models.ModelTypeSpeech), - Config: modelConfigJSON, - }); err != nil { - return fmt.Errorf("upsert speech model %s: %w", model.ID, err) - } - synced++ } for _, model := range def.TranscriptionModels { if shouldHideTemplateModel(def, models.ModelTypeTranscription, model.ID) { diff --git a/internal/tts/registry.go b/internal/tts/registry.go index 8a2d9ed1..ff4800a8 100644 --- a/internal/tts/registry.go +++ b/internal/tts/registry.go @@ -53,11 +53,60 @@ type Registry struct { ordered []models.ClientType } +func isTranscriptionClientType(clientType models.ClientType) bool { + switch clientType { + case + models.ClientTypeOpenAITranscription, + models.ClientTypeOpenRouterTranscription, + models.ClientTypeElevenLabsTranscription, + models.ClientTypeDeepgramTranscription, + models.ClientTypeGoogleTranscription: + return true + default: + return false + } +} + +func speechToTranscriptionClientType(clientType models.ClientType) models.ClientType { + switch clientType { + case models.ClientTypeOpenAISpeech: + return models.ClientTypeOpenAITranscription + case models.ClientTypeOpenRouterSpeech: + return models.ClientTypeOpenRouterTranscription + case models.ClientTypeElevenLabsSpeech: + return models.ClientTypeElevenLabsTranscription + case models.ClientTypeDeepgramSpeech: + return models.ClientTypeDeepgramTranscription + case models.ClientTypeGoogleSpeech: + return models.ClientTypeGoogleTranscription + default: + return "" + } +} + +func transcriptionDisplayName(displayName string) string { + displayName = strings.TrimSpace(displayName) + if displayName == "Google Speech" { + return "Google Transcription" + } + if strings.HasSuffix(displayName, " Speech") { + return strings.TrimSuffix(displayName, " Speech") + " Transcription" + } + return displayName + " Transcription" +} + func NewRegistry() *Registry { r := &Registry{ providers: make(map[models.ClientType]ProviderDefinition), } - for _, def := range defaultProviderDefinitions() { + baseDefs := defaultProviderDefinitions() + for _, def := range baseDefs { + if def.Factory == nil && def.TranscriptionFactory != nil { + continue + } + r.Register(def) + } + for _, def := range transcriptionProviderDefinitions(baseDefs) { r.Register(def) } return r @@ -123,6 +172,81 @@ func (r *Registry) ListMeta() []ProviderMetaResponse { return metas } +func (r *Registry) ListSpeechMeta() []ProviderMetaResponse { + defs := r.List() + metas := make([]ProviderMetaResponse, 0, len(defs)) + for _, def := range defs { + if def.Factory == nil { + continue + } + metas = append(metas, ProviderMetaResponse{ + Provider: string(def.ClientType), + DisplayName: def.DisplayName, + Description: def.Description, + ConfigSchema: def.ConfigSchema, + DefaultModel: def.DefaultModel, + Models: def.Models, + DefaultSynthesisModel: def.DefaultModel, + SynthesisModels: def.Models, + SupportsSynthesisList: def.SupportsList, + }) + } + return metas +} + +func (r *Registry) ListTranscriptionMeta() []ProviderMetaResponse { + defs := r.List() + metas := make([]ProviderMetaResponse, 0, len(defs)) + for _, def := range defs { + if def.TranscriptionFactory == nil || !isTranscriptionClientType(def.ClientType) { + continue + } + modelsList := def.TranscriptionModels + if len(modelsList) == 0 { + modelsList = def.Models + } + metas = append(metas, ProviderMetaResponse{ + Provider: string(def.ClientType), + DisplayName: def.DisplayName, + Description: def.Description, + ConfigSchema: def.ConfigSchema, + DefaultModel: def.DefaultTranscriptionModel, + Models: modelsList, + DefaultTranscriptionModel: def.DefaultTranscriptionModel, + TranscriptionModels: modelsList, + SupportsTranscriptionList: def.SupportsTranscriptionList, + }) + } + return metas +} + +func transcriptionProviderDefinitions(base []ProviderDefinition) []ProviderDefinition { + out := make([]ProviderDefinition, 0, len(base)) + for _, def := range base { + clientType := speechToTranscriptionClientType(def.ClientType) + if clientType == "" || def.TranscriptionFactory == nil { + continue + } + modelsList := def.TranscriptionModels + out = append(out, ProviderDefinition{ + ClientType: clientType, + DisplayName: transcriptionDisplayName(def.DisplayName), + Icon: def.Icon, + Description: strings.TrimSpace(def.Description), + ConfigSchema: def.ConfigSchema, + DefaultModel: def.DefaultTranscriptionModel, + SupportsList: def.SupportsTranscriptionList, + Models: modelsList, + DefaultTranscriptionModel: def.DefaultTranscriptionModel, + SupportsTranscriptionList: def.SupportsTranscriptionList, + TranscriptionModels: modelsList, + TranscriptionFactory: def.TranscriptionFactory, + Order: def.Order + 1, + }) + } + return out +} + func defaultProviderDefinitions() []ProviderDefinition { edgeVoices := make([]VoiceInfo, 0) for lang, ids := range edgespeech.EdgeTTSVoices { diff --git a/internal/tts/service.go b/internal/tts/service.go index eb4da940..17c3fe51 100644 --- a/internal/tts/service.go +++ b/internal/tts/service.go @@ -35,6 +35,14 @@ func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse { return s.registry.ListMeta() } +func (s *Service) ListSpeechMeta(_ context.Context) []ProviderMetaResponse { + return s.registry.ListSpeechMeta() +} + +func (s *Service) ListTranscriptionMeta(_ context.Context) []ProviderMetaResponse { + return s.registry.ListTranscriptionMeta() +} + func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) { rows, err := s.queries.ListSpeechProviders(ctx) if err != nil { @@ -47,6 +55,18 @@ func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResp return items, nil } +func (s *Service) ListTranscriptionProviders(ctx context.Context) ([]SpeechProviderResponse, error) { + rows, err := s.queries.ListTranscriptionProviders(ctx) + if err != nil { + return nil, fmt.Errorf("list transcription providers: %w", err) + } + items := make([]SpeechProviderResponse, 0, len(rows)) + for _, row := range rows { + items = append(items, toSpeechProviderResponse(row)) + } + return items, nil +} + func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) { pgID, err := db.ParseUUID(id) if err != nil {