diff --git a/apps/web/src/components/create-model/index.vue b/apps/web/src/components/create-model/index.vue
index 93442bfb..d168332b 100644
--- a/apps/web/src/components/create-model/index.vue
+++ b/apps/web/src/components/create-model/index.vue
@@ -18,6 +18,7 @@
@@ -35,11 +36,12 @@
-
- Chat
-
-
- Embedding
+
+ {{ opt.label }}
@@ -181,6 +183,11 @@ import { COMPATIBILITY_OPTIONS } from '@/constants/compatibilities'
import FormDialogShell from '@/components/form-dialog-shell/index.vue'
import { useDialogMutation } from '@/composables/useDialogMutation'
+/** One selectable entry of the model-type selector in this dialog. */
+interface ModelTypeOption {
+ // Type identifier; the built-in defaults use 'chat' and 'embedding'.
+ value: string
+ // Text rendered for the option (`{{ opt.label }}` in the template).
+ label: string
+}
+
const selectedCompat = ref([])
const { t } = useI18n()
const { run } = useDialogMutation()
@@ -193,14 +200,30 @@ const formSchema = toTypedSchema(z.object({
context_window: z.coerce.number().min(1).optional(),
}))
+// Component props. Only `id` is required; every other prop falls back to the
+// defaults in the second argument, which reproduce the previous hard-coded
+// behavior (chat/embedding options, 'chat' preselected, 'provider-models' invalidated).
+const props = withDefaults(defineProps<{
+ // Sent as `provider_id` in the create-model payload.
+ id: string
+ // Entries offered by the type selector.
+ typeOptions?: ModelTypeOption[]
+ // Initial form `type`, also used when the form is reset and as the fallback for `selectedType`.
+ defaultType?: string
+ // NOTE(review): presumably hides the type selector in the template — its use is not visible here; confirm.
+ hideType?: boolean
+ // Query-cache keys (each wrapped as `[key]`) invalidated once a create/update mutation settles.
+ invalidateKeys?: string[]
+}>(), {
+ typeOptions: () => [
+ { value: 'chat', label: 'Chat' },
+ { value: 'embedding', label: 'Embedding' },
+ ],
+ defaultType: 'chat',
+ hideType: false,
+ invalidateKeys: () => ['provider-models'],
+})
+
const form = useForm({
validationSchema: formSchema,
initialValues: {
- type: 'chat',
+ type: props.defaultType,
},
})
-const selectedType = computed(() => form.values.type || 'chat')
+const selectedType = computed(() => form.values.type || props.defaultType)
const open = inject[>('openModel', ref(false))
const title = inject][>('openModelTitle', ref('title'))
@@ -237,15 +260,19 @@ function onNameInput(e: Event) {
form.setFieldValue('name', (e.target as HTMLInputElement).value)
}
-const { id } = defineProps<{ id: string }>()
-
const queryCache = useQueryCache()
+/**
+ * Invalidates every cached query whose key is listed in `props.invalidateKeys`.
+ *
+ * Each key is wrapped as `[key]` before being handed to the query cache. The
+ * combined promise is returned so that, when this function is used as a
+ * mutation `onSettled` hook, the mutation can wait for the invalidations the
+ * same way the previous inline `() => queryCache.invalidateQueries(...)` hooks
+ * allowed. A plain `for` loop discarded those promises.
+ *
+ * @returns A promise that settles once all invalidations have completed.
+ */
+function invalidateModelQueries() {
+  return Promise.all(
+    props.invalidateKeys.map(key => queryCache.invalidateQueries({ key: [key] })),
+  )
+}
+
const { mutateAsync: createModel, isLoading: createLoading } = useMutation({
mutation: async (data: Record) => {
const { data: result } = await postModels({ body: data as ModelsAddRequest, throwOnError: true })
return result
},
- onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
+ onSettled: invalidateModelQueries,
})
const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({
mutation: async ({ id, data }: { id: string; data: Record }) => {
@@ -256,7 +283,7 @@ const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({
})
return result
},
- onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
+ onSettled: invalidateModelQueries,
})
const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading } = useMutation({
mutation: async ({ modelId, data }: { modelId: string; data: Record }) => {
@@ -267,7 +294,7 @@ const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading
})
return result
},
- onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
+ onSettled: invalidateModelQueries,
})
const isLoading = computed(() => createLoading.value || updateLoading.value || updateLegacyLoading.value)
@@ -297,7 +324,7 @@ async function addModel() {
const payload: Record = {
type,
model_id,
- provider_id: id,
+ provider_id: props.id,
config,
}
@@ -348,7 +375,15 @@ watch(open, async () => {
selectedCompat.value = config?.compatibilities ?? []
userEditedName.value = !!(name && name !== model_id)
} else {
- form.resetForm({ values: { type: 'chat', model_id: '', name: '', dimensions: undefined, context_window: undefined } })
+ form.resetForm({
+ values: {
+ type: props.defaultType,
+ model_id: '',
+ name: '',
+ dimensions: undefined,
+ context_window: undefined,
+ },
+ })
selectedCompat.value = []
userEditedName.value = false
}
diff --git a/apps/web/src/components/settings-sidebar/index.vue b/apps/web/src/components/settings-sidebar/index.vue
index aef7f5fd..66e81ea0 100644
--- a/apps/web/src/components/settings-sidebar/index.vue
+++ b/apps/web/src/components/settings-sidebar/index.vue
@@ -52,7 +52,7 @@ import { computed, type Component } from 'vue'
import { storeToRefs } from 'pinia'
import { useRouter, useRoute } from 'vue-router'
import { useI18n } from 'vue-i18n'
-import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next'
+import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, AudioLines, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next'
import { useChatSelectionStore } from '@/store/chat-selection'
import {
Sidebar,
@@ -118,6 +118,11 @@ const navItems = computed<{ title: string; name: string; icon: Component }[]>(()
name: 'speech',
icon: Volume2,
},
+ {
+ title: t('sidebar.transcription'),
+ name: 'transcription',
+ icon: AudioLines,
+ },
{
title: t('sidebar.email'),
name: 'email',
diff --git a/apps/web/src/constants/client-types.ts b/apps/web/src/constants/client-types.ts
index 27e634df..c69615c3 100644
--- a/apps/web/src/constants/client-types.ts
+++ b/apps/web/src/constants/client-types.ts
@@ -45,21 +45,41 @@ export const CLIENT_TYPE_META: Record = {
label: 'OpenAI Speech',
hint: 'OpenAI /audio/speech compatible TTS',
},
+ 'openai-transcription': {
+ value: 'openai-transcription',
+ label: 'OpenAI Transcription',
+ hint: 'OpenAI audio transcription',
+ },
'openrouter-speech': {
value: 'openrouter-speech',
label: 'OpenRouter Speech',
hint: 'OpenRouter audio modality TTS',
},
+ 'openrouter-transcription': {
+ value: 'openrouter-transcription',
+ label: 'OpenRouter Transcription',
+ hint: 'OpenRouter transcription models',
+ },
'elevenlabs-speech': {
value: 'elevenlabs-speech',
label: 'ElevenLabs Speech',
hint: 'ElevenLabs text-to-speech',
},
+ 'elevenlabs-transcription': {
+ value: 'elevenlabs-transcription',
+ label: 'ElevenLabs Transcription',
+ hint: 'ElevenLabs speech-to-text',
+ },
'deepgram-speech': {
value: 'deepgram-speech',
label: 'Deepgram Speech',
hint: 'Deepgram TTS',
},
+ 'deepgram-transcription': {
+ value: 'deepgram-transcription',
+ label: 'Deepgram Transcription',
+ hint: 'Deepgram speech-to-text',
+ },
'minimax-speech': {
value: 'minimax-speech',
label: 'MiniMax Speech',
@@ -85,9 +105,14 @@ export const CLIENT_TYPE_META: Record = {
label: 'Google Speech',
hint: 'Gemini speech transcription',
},
+ 'google-transcription': {
+ value: 'google-transcription',
+ label: 'Google Transcription',
+ hint: 'Gemini speech transcription',
+ },
}
export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META)
export const LLM_CLIENT_TYPE_LIST: ClientTypeMeta[] = CLIENT_TYPE_LIST
- .filter(ct => !ct.value.endsWith('-speech'))
+ .filter(ct => !ct.value.endsWith('-speech') && !ct.value.endsWith('-transcription'))
diff --git a/apps/web/src/i18n/locales/en.json b/apps/web/src/i18n/locales/en.json
index 65eaea5c..0fd14a94 100644
--- a/apps/web/src/i18n/locales/en.json
+++ b/apps/web/src/i18n/locales/en.json
@@ -63,6 +63,7 @@
"webSearch": "Web Search",
"memory": "Memory",
"speech": "Speech",
+ "transcription": "Transcription",
"email": "Email",
"settings": "Settings",
"profile": "Profile",
@@ -462,6 +463,11 @@
"failed": "Synthesis failed"
}
},
+ "transcription": {
+ "title": "Transcription",
+ "emptyTitle": "No Transcription Providers",
+ "emptyDescription": "Add a transcription provider to enable speech-to-text for your bots"
+ },
"email": {
"title": "Email",
"add": "Add Email",
diff --git a/apps/web/src/i18n/locales/zh.json b/apps/web/src/i18n/locales/zh.json
index 9d3b9120..930b6028 100644
--- a/apps/web/src/i18n/locales/zh.json
+++ b/apps/web/src/i18n/locales/zh.json
@@ -64,6 +64,7 @@
"webSearch": "搜索",
"memory": "记忆",
"speech": "语音",
+ "transcription": "转写",
"email": "邮件",
"profile": "用户",
"home": "首页",
@@ -458,6 +459,11 @@
"failed": "合成失败"
}
},
+ "transcription": {
+ "title": "语音转写",
+ "emptyTitle": "暂无转写提供方",
+ "emptyDescription": "添加转写提供方以为 Bot 启用语音转文字功能"
+ },
"email": {
"title": "邮件提供方",
"add": "添加邮件提供方",
diff --git a/apps/web/src/pages/bots/components/bot-settings.vue b/apps/web/src/pages/bots/components/bot-settings.vue
index 8087de75..718cc089 100644
--- a/apps/web/src/pages/bots/components/bot-settings.vue
+++ b/apps/web/src/pages/bots/components/bot-settings.vue
@@ -463,6 +463,17 @@ const { data: transcriptionModelData } = useQuery({
},
})
+// Loads the transcription providers from GET `/transcription-providers`; the
+// result is cached under the 'transcription-providers' key.
+// NOTE(review): uses the generic `client.get` with an explicit URL — presumably
+// because no generated SDK helper exists for this endpoint yet; confirm.
+const { data: transcriptionProviderData } = useQuery({
+ key: ['transcription-providers'],
+ query: async () => {
+ const resp = await client.get({
+ url: '/transcription-providers',
+ throwOnError: true,
+ })
+ return resp.data
+ },
+})
+
const { data: browserContextData } = useQuery({
key: ['all-browser-contexts'],
query: async () => {
@@ -517,8 +528,10 @@ const searchProviders = computed(() => (searchProviderData.value ?? []).filter((
const memoryProviders = computed(() => memoryProviderData.value ?? [])
const ttsProviders = computed(() => (ttsProviderData.value ?? []).filter((p) => p.enable !== false))
const enabledTtsProviderIds = computed(() => new Set(ttsProviders.value.map((p) => p.id)))
+const transcriptionProviders = computed(() => (transcriptionProviderData.value ?? []).filter((p: Record) => p.enable !== false))
+const enabledTranscriptionProviderIds = computed(() => new Set(transcriptionProviders.value.map((p: Record) => p.id as string)))
const ttsModels = computed(() => (ttsModelData.value ?? []).filter((m: Record) => enabledTtsProviderIds.value.has(m.provider_id as string)))
-const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record) => enabledTtsProviderIds.value.has(m.provider_id as string)))
+const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record) => enabledTranscriptionProviderIds.value.has(m.provider_id as string)))
const browserContexts = computed(() => browserContextData.value ?? [])
// ---- Form ----
diff --git a/apps/web/src/pages/speech/components/provider-setting.vue b/apps/web/src/pages/speech/components/provider-setting.vue
index feb4ff6d..ac8846e3 100644
--- a/apps/web/src/pages/speech/components/provider-setting.vue
+++ b/apps/web/src/pages/speech/components/provider-setting.vue
@@ -140,16 +140,27 @@
]
{{ $t('speech.synthesis.models') }}
-
- {{ $t('speech.importModels') }}
-
+
+ {{ $t('speech.importModels') }}
+
+
+
-
-
-
-
-
-
- {{ $t('speech.transcription.models') }}
-
-
- {{ $t('speech.transcription.importModels') }}
-
-
-
-
- {{ $t('speech.transcription.noModels') }}
-
-
-
-
-
-
- handleSaveTranscriptionModel(model.id ?? '', cfg)"
- />
-
-
-
@@ -288,6 +233,7 @@ import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvider
import type { TtsSpeechModelResponse, TtsSpeechProviderResponse } from '@memohai/sdk'
import LoadingButton from '@/components/loading-button/index.vue'
import ProviderIcon from '@/components/provider-icon/index.vue'
+import CreateModel from '@/components/create-model/index.vue'
interface SpeechFieldSchema {
key: string
@@ -324,8 +270,6 @@ interface SpeechProviderMeta {
models?: SpeechModelMeta[]
default_synthesis_model?: string
synthesis_models?: SpeechModelMeta[]
- default_transcription_model?: string
- transcription_models?: SpeechModelMeta[]
}
function getInitials(name: string | undefined) {
@@ -340,12 +284,13 @@ const providerName = ref('')
const providerConfig = reactive>({})
const visibleSecrets = reactive>({})
const expandedModelId = ref('')
-const expandedTranscriptionModelId = ref('')
const enableLoading = ref(false)
const saveLoading = ref(false)
const importLoading = ref(false)
-const importTranscriptionLoading = ref(false)
const queryCache = useQueryCache()
+// Model-type options for the speech page: a single 'speech' entry.
+// NOTE(review): presumably passed to <CreateModel> as `typeOptions` — the template
+// is not visible here; confirm. The label is a fixed English string, not an i18n key.
+const speechTypeOptions = [
+ { value: 'speech', label: 'Speech' },
+]
const { data: providerDetail } = useQuery({
key: () => ['speech-provider-detail', curProviderId.value],
@@ -389,22 +334,7 @@ const { data: providerSpeechModels } = useQuery({
},
})
-const { data: providerTranscriptionModelsData } = useQuery({
- key: () => ['speech-provider-transcription-models', curProviderId.value],
- query: async () => {
- if (!curProviderId.value) return []
- const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
- const token = localStorage.getItem('token')
- const resp = await fetch(`${apiBase}/speech-providers/${curProviderId.value}/transcription-models`, {
- headers: token ? { Authorization: `Bearer ${token}` } : undefined,
- })
- if (!resp.ok) throw new Error(await resp.text())
- return await resp.json()
- },
-})
-
const providerModels = computed(() => ((providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []))
-const providerTranscriptionModels = computed(() => ((providerTranscriptionModelsData.value as TtsSpeechModelResponse[] | undefined) ?? []))
watch(() => providerDetail.value, (provider) => {
providerName.value = provider?.name ?? curProvider.value?.name ?? ''
@@ -426,29 +356,10 @@ function getModelSchema(modelID: string): SpeechConfigSchema | null {
return meta?.config_schema ?? meta?.capabilities?.config_schema ?? null
}
-function getTranscriptionModelMeta(modelID: string): SpeechModelMeta | null {
- const models = currentMeta.value?.transcription_models ?? []
- const exact = models.find(m => m.id === modelID)
- if (exact) return exact
- if (currentMeta.value?.default_transcription_model) {
- return models.find(m => m.id === currentMeta.value?.default_transcription_model) ?? null
- }
- return models[0] ?? null
-}
-
-function getTranscriptionModelSchema(modelID: string): SpeechConfigSchema | null {
- const meta = getTranscriptionModelMeta(modelID)
- return meta?.config_schema ?? meta?.capabilities?.config_schema ?? null
-}
-
function toggleModel(id: string) {
expandedModelId.value = expandedModelId.value === id ? '' : id
}
-function toggleTranscriptionModel(id: string) {
- expandedTranscriptionModelId.value = expandedTranscriptionModelId.value === id ? '' : id
-}
-
async function handleToggleEnable(value: boolean) {
if (!curProviderId.value || !curProvider.value) return
const prev = curProvider.value.enable ?? false
@@ -526,31 +437,6 @@ async function handleSaveModel(modelId: string, config: Record)
}
}
-async function handleSaveTranscriptionModel(modelId: string, config: Record) {
- const model = providerTranscriptionModels.value.find(item => item.id === modelId)
- if (!model) return
- try {
- const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
- const token = localStorage.getItem('token')
- const resp = await fetch(`${apiBase}/transcription-models/${modelId}`, {
- method: 'PUT',
- headers: {
- 'Content-Type': 'application/json',
- ...(token ? { Authorization: `Bearer ${token}` } : {}),
- },
- body: JSON.stringify({
- name: model.name ?? model.model_id,
- config,
- }),
- })
- if (!resp.ok) throw new Error(await resp.text())
- toast.success(t('speech.saveSuccess'))
- queryCache.invalidateQueries({ key: ['speech-provider-transcription-models', curProviderId.value] })
- } catch {
- toast.error(t('common.saveFailed'))
- }
-}
-
async function handleImportModels() {
if (!curProviderId.value) return
importLoading.value = true
@@ -573,31 +459,6 @@ async function handleImportModels() {
}
}
-async function handleImportTranscriptionModels() {
- if (!curProviderId.value) return
- importTranscriptionLoading.value = true
- try {
- const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
- const token = localStorage.getItem('token')
- const resp = await fetch(`${apiBase}/speech-providers/${curProviderId.value}/import-transcription-models`, {
- method: 'POST',
- headers: token ? { Authorization: `Bearer ${token}` } : undefined,
- })
- if (!resp.ok) throw new Error(await resp.text())
- const data = await resp.json()
- toast.success(t('speech.transcription.importSuccess', {
- created: data?.created ?? 0,
- skipped: data?.skipped ?? 0,
- }))
- queryCache.invalidateQueries({ key: ['speech-provider-transcription-models', curProviderId.value] })
- queryCache.invalidateQueries({ key: ['speech-providers-meta'] })
- } catch {
- toast.error(t('speech.transcription.importFailed'))
- } finally {
- importTranscriptionLoading.value = false
- }
-}
-
async function handleTestModel(modelId: string, text: string, config: Record) {
const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
const token = localStorage.getItem('token')
@@ -622,24 +483,6 @@ async function handleTestModel(modelId: string, text: string, config: Record) {
- const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
- const token = localStorage.getItem('token')
- const form = new FormData()
- form.append('file', file)
- form.append('config', JSON.stringify(config))
- const resp = await fetch(`${apiBase}/transcription-models/${modelId}/test`, {
- method: 'POST',
- headers: token ? { Authorization: `Bearer ${token}` } : undefined,
- body: form,
- })
- if (!resp.ok) {
- const errBody = await resp.text()
- throw new Error(errBody)
- }
- return await resp.json()
-}
-
function sanitizeConfig(input: Record) {
const result: Record = {}
for (const [key, value] of Object.entries(input)) {
diff --git a/apps/web/src/pages/transcription/index.vue b/apps/web/src/pages/transcription/index.vue
new file mode 100644
index 00000000..14d85adb
--- /dev/null
+++ b/apps/web/src/pages/transcription/index.vue
@@ -0,0 +1,126 @@
+
+
+
+
+
+
+
+
+ { if (isSelect) curProvider = item }"
+ >
+
+
+
+
+ {{ getInitials(item.name) }}
+
+
+
+
+ {{ item.name }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ $t('transcription.emptyTitle') }}
+ {{ $t('transcription.emptyDescription') }}
+
+
+
+
diff --git a/apps/web/src/pages/transcription/provider-setting.vue b/apps/web/src/pages/transcription/provider-setting.vue
new file mode 100644
index 00000000..5b127143
--- /dev/null
+++ b/apps/web/src/pages/transcription/provider-setting.vue
@@ -0,0 +1,417 @@
+
+
+
+
+
+
+ {{ getInitials(curProvider?.name) }}
+
+
+
+
+ {{ curProvider?.name }}
+
+
+ {{ currentMeta?.display_name ?? curProvider?.client_type }}
+
+
+
+
+ {{ $t('common.enable') }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ $t('speech.transcription.models') }}
+
+
+
+ {{ $t('speech.transcription.importModels') }}
+
+
+
+
+
+
+ {{ $t('speech.transcription.noModels') }}
+
+
+
+
+
+ handleSaveModel(model.id ?? '', cfg)"
+ />
+
+
+
+
+
+
+
diff --git a/apps/web/src/router.ts b/apps/web/src/router.ts
index 5926afb9..1bac3340 100644
--- a/apps/web/src/router.ts
+++ b/apps/web/src/router.ts
@@ -89,6 +89,14 @@ const routes = [
breadcrumb: i18nRef('sidebar.speech'),
},
},
+ {
+ name: 'transcription',
+ path: 'transcription',
+ component: () => import('@/pages/transcription/index.vue'),
+ meta: {
+ breadcrumb: i18nRef('sidebar.transcription'),
+ },
+ },
{
name: 'email',
path: 'email',
diff --git a/db/migrations/0001_init.up.sql b/db/migrations/0001_init.up.sql
index 45db41ea..855b6734 100644
--- a/db/migrations/0001_init.up.sql
+++ b/db/migrations/0001_init.up.sql
@@ -77,14 +77,19 @@ CREATE TABLE IF NOT EXISTS providers (
'github-copilot',
'edge-speech',
'openai-speech',
+ 'openai-transcription',
'openrouter-speech',
+ 'openrouter-transcription',
'elevenlabs-speech',
+ 'elevenlabs-transcription',
'deepgram-speech',
+ 'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
- 'google-speech'
+ 'google-speech',
+ 'google-transcription'
))
);
diff --git a/db/migrations/0069_add_transcription_models_and_google_speech.down.sql b/db/migrations/0069_add_transcription_models_and_speech_domain.down.sql
similarity index 86%
rename from db/migrations/0069_add_transcription_models_and_google_speech.down.sql
rename to db/migrations/0069_add_transcription_models_and_speech_domain.down.sql
index 10135402..31cd9b8a 100644
--- a/db/migrations/0069_add_transcription_models_and_google_speech.down.sql
+++ b/db/migrations/0069_add_transcription_models_and_speech_domain.down.sql
@@ -1,5 +1,5 @@
--- 0069_add_transcription_models_and_google_speech
--- Revert transcription model type and Google speech provider support.
+-- 0069_add_transcription_models_and_speech_domain
+-- Revert transcription model type and speech-domain expansion.
DELETE FROM models WHERE type = 'transcription';
DELETE FROM providers WHERE client_type = 'google-speech';
diff --git a/db/migrations/0069_add_transcription_models_and_google_speech.up.sql b/db/migrations/0069_add_transcription_models_and_speech_domain.up.sql
similarity index 83%
rename from db/migrations/0069_add_transcription_models_and_google_speech.up.sql
rename to db/migrations/0069_add_transcription_models_and_speech_domain.up.sql
index e1f24cc3..2342f685 100644
--- a/db/migrations/0069_add_transcription_models_and_google_speech.up.sql
+++ b/db/migrations/0069_add_transcription_models_and_speech_domain.up.sql
@@ -1,5 +1,5 @@
--- 0069_add_transcription_models_and_google_speech
--- Expand unified speech domain to support transcription models and Google speech providers.
+-- 0069_add_transcription_models_and_speech_domain
+-- Expand the speech domain to support transcription models and shared speech providers.
ALTER TABLE providers
DROP CONSTRAINT IF EXISTS providers_client_type_check;
diff --git a/db/migrations/0071_split_transcription_providers.down.sql b/db/migrations/0071_split_transcription_providers.down.sql
new file mode 100644
index 00000000..b250f193
--- /dev/null
+++ b/db/migrations/0071_split_transcription_providers.down.sql
@@ -0,0 +1,33 @@
+-- 0071_split_transcription_providers
+-- Remove dedicated transcription provider client types.
+
+-- Delete the transcription providers first: the narrower CHECK constraint added
+-- below is validated against existing rows, so any remaining row with one of
+-- these client types would make the ADD CONSTRAINT fail.
+DELETE FROM providers
+WHERE client_type IN (
+ 'openai-transcription',
+ 'openrouter-transcription',
+ 'elevenlabs-transcription',
+ 'deepgram-transcription',
+ 'google-transcription'
+);
+
+-- Drop the expanded constraint, then recreate it without the transcription types.
+ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
+
+ALTER TABLE providers
+ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
+ 'openai-responses',
+ 'openai-completions',
+ 'anthropic-messages',
+ 'google-generative-ai',
+ 'openai-codex',
+ 'github-copilot',
+ 'edge-speech',
+ 'openai-speech',
+ 'openrouter-speech',
+ 'elevenlabs-speech',
+ 'deepgram-speech',
+ 'minimax-speech',
+ 'volcengine-speech',
+ 'alibabacloud-speech',
+ 'microsoft-speech',
+ 'google-speech'
+));
diff --git a/db/migrations/0071_split_transcription_providers.up.sql b/db/migrations/0071_split_transcription_providers.up.sql
new file mode 100644
index 00000000..2c08550c
--- /dev/null
+++ b/db/migrations/0071_split_transcription_providers.up.sql
@@ -0,0 +1,29 @@
+-- 0071_split_transcription_providers
+-- Add dedicated transcription provider client types.
+
+-- The allowed client types live in a CHECK constraint, so the constraint is
+-- dropped and recreated with the five new '*-transcription' values added.
+-- IF EXISTS keeps the migration safe to run when the constraint is absent.
+ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
+
+ALTER TABLE providers
+ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
+ 'openai-responses',
+ 'openai-completions',
+ 'anthropic-messages',
+ 'google-generative-ai',
+ 'openai-codex',
+ 'github-copilot',
+ 'edge-speech',
+ 'openai-speech',
+ 'openai-transcription',
+ 'openrouter-speech',
+ 'openrouter-transcription',
+ 'elevenlabs-speech',
+ 'elevenlabs-transcription',
+ 'deepgram-speech',
+ 'deepgram-transcription',
+ 'minimax-speech',
+ 'volcengine-speech',
+ 'alibabacloud-speech',
+ 'microsoft-speech',
+ 'google-speech',
+ 'google-transcription'
+));
diff --git a/db/queries/models.sql b/db/queries/models.sql
index fd2e0703..d87de737 100644
--- a/db/queries/models.sql
+++ b/db/queries/models.sql
@@ -21,14 +21,19 @@ SELECT * FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
+ 'openai-transcription',
'openrouter-speech',
+ 'openrouter-transcription',
'elevenlabs-speech',
+ 'elevenlabs-transcription',
'deepgram-speech',
+ 'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
- 'google-speech'
+ 'google-speech',
+ 'google-transcription'
)
ORDER BY created_at DESC;
@@ -54,14 +59,19 @@ FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
+ 'openai-transcription',
'openrouter-speech',
+ 'openrouter-transcription',
'elevenlabs-speech',
+ 'elevenlabs-transcription',
'deepgram-speech',
+ 'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
- 'google-speech'
+ 'google-speech',
+ 'google-transcription'
);
-- name: CreateModel :one
@@ -230,8 +240,18 @@ WHERE client_type IN (
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
- 'microsoft-speech',
- 'google-speech'
+ 'microsoft-speech'
+)
+ORDER BY created_at DESC;
+
+-- name: ListTranscriptionProviders :many
+SELECT * FROM providers
+WHERE client_type IN (
+ 'openai-transcription',
+ 'openrouter-transcription',
+ 'elevenlabs-transcription',
+ 'deepgram-transcription',
+ 'google-transcription'
)
ORDER BY created_at DESC;
diff --git a/internal/db/sqlc/models.sql.go b/internal/db/sqlc/models.sql.go
index a94ca59b..400100c9 100644
--- a/internal/db/sqlc/models.sql.go
+++ b/internal/db/sqlc/models.sql.go
@@ -40,14 +40,19 @@ FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
+ 'openai-transcription',
'openrouter-speech',
+ 'openrouter-transcription',
'elevenlabs-speech',
+ 'elevenlabs-transcription',
'deepgram-speech',
+ 'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
- 'google-speech'
+ 'google-speech',
+ 'google-transcription'
)
`
@@ -805,14 +810,19 @@ SELECT id, name, client_type, icon, enable, config, metadata, created_at, update
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
+ 'openai-transcription',
'openrouter-speech',
+ 'openrouter-transcription',
'elevenlabs-speech',
+ 'elevenlabs-transcription',
'deepgram-speech',
+ 'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
- 'google-speech'
+ 'google-speech',
+ 'google-transcription'
)
ORDER BY created_at DESC
`
@@ -945,8 +955,7 @@ WHERE client_type IN (
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
- 'microsoft-speech',
- 'google-speech'
+ 'microsoft-speech'
)
ORDER BY created_at DESC
`
@@ -1068,6 +1077,48 @@ func (q *Queries) ListTranscriptionModelsByProviderID(ctx context.Context, provi
return items, nil
}
+const listTranscriptionProviders = `-- name: ListTranscriptionProviders :many
+SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers
+WHERE client_type IN (
+ 'openai-transcription',
+ 'openrouter-transcription',
+ 'elevenlabs-transcription',
+ 'deepgram-transcription',
+ 'google-transcription'
+)
+ORDER BY created_at DESC
+`
+
+func (q *Queries) ListTranscriptionProviders(ctx context.Context) ([]Provider, error) {
+ rows, err := q.db.Query(ctx, listTranscriptionProviders)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+ var items []Provider
+ for rows.Next() {
+ var i Provider
+ if err := rows.Scan(
+ &i.ID,
+ &i.Name,
+ &i.ClientType,
+ &i.Icon,
+ &i.Enable,
+ &i.Config,
+ &i.Metadata,
+ &i.CreatedAt,
+ &i.UpdatedAt,
+ ); err != nil {
+ return nil, err
+ }
+ items = append(items, i)
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+ return items, nil
+}
+
const updateModel = `-- name: UpdateModel :one
UPDATE models
SET
diff --git a/internal/handlers/tts_providers.go b/internal/handlers/tts_providers.go
index b8440bae..e802fae3 100644
--- a/internal/handlers/tts_providers.go
+++ b/internal/handlers/tts_providers.go
@@ -34,11 +34,16 @@ func (h *SpeechHandler) Register(e *echo.Echo) {
pg := e.Group("/speech-providers")
pg.GET("", h.ListProviders)
pg.GET("/:id", h.GetProvider)
- pg.GET("/meta", h.ListMeta)
+ pg.GET("/meta", h.ListSpeechMeta)
pg.GET("/:id/models", h.ListModelsByProvider)
pg.POST("/:id/import-models", h.ImportModels)
- pg.GET("/:id/transcription-models", h.ListTranscriptionModelsByProvider)
- pg.POST("/:id/import-transcription-models", h.ImportTranscriptionModels)
+
+ tpg := e.Group("/transcription-providers")
+ tpg.GET("", h.ListTranscriptionProviders)
+ tpg.GET("/meta", h.ListTranscriptionMeta)
+ tpg.GET("/:id", h.GetProvider)
+ tpg.GET("/:id/models", h.ListTranscriptionModelsByProvider)
+ tpg.POST("/:id/import-models", h.ImportTranscriptionModels)
mg := e.Group("/speech-models")
mg.GET("", h.ListModels)
@@ -61,8 +66,12 @@ func (h *SpeechHandler) Register(e *echo.Echo) {
// @Tags speech-providers
// @Success 200 {array} tts.ProviderMetaResponse
// @Router /speech-providers/meta [get].
-func (h *SpeechHandler) ListMeta(c echo.Context) error {
- return c.JSON(http.StatusOK, h.service.ListMeta(c.Request().Context()))
+func (h *SpeechHandler) ListSpeechMeta(c echo.Context) error {
+ return c.JSON(http.StatusOK, h.service.ListSpeechMeta(c.Request().Context()))
+}
+
+// ListTranscriptionMeta handles GET /transcription-providers/meta. It responds
+// with HTTP 200 and the JSON-encoded result of the service's ListTranscriptionMeta.
+func (h *SpeechHandler) ListTranscriptionMeta(c echo.Context) error {
+ return c.JSON(http.StatusOK, h.service.ListTranscriptionMeta(c.Request().Context()))
}
// ListProviders godoc
@@ -81,6 +90,14 @@ func (h *SpeechHandler) ListProviders(c echo.Context) error {
return c.JSON(http.StatusOK, items)
}
+// ListTranscriptionProviders handles GET /transcription-providers. It responds
+// with HTTP 200 and the JSON-encoded providers returned by the service, or with
+// an HTTP 500 error carrying the service error message when the lookup fails.
+func (h *SpeechHandler) ListTranscriptionProviders(c echo.Context) error {
+ items, err := h.service.ListTranscriptionProviders(c.Request().Context())
+ if err != nil {
+ return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
+ }
+ return c.JSON(http.StatusOK, items)
+}
+
// GetProvider godoc
// @Summary Get speech provider
// @Description Get a speech provider with masked config values
diff --git a/internal/models/models.go b/internal/models/models.go
index 3c2f04d0..def74323 100644
--- a/internal/models/models.go
+++ b/internal/models/models.go
@@ -432,14 +432,19 @@ func IsValidClientType(clientType ClientType) bool {
ClientTypeGitHubCopilot,
ClientTypeEdgeSpeech,
ClientTypeOpenAISpeech,
+ ClientTypeOpenAITranscription,
ClientTypeOpenRouterSpeech,
+ ClientTypeOpenRouterTranscription,
ClientTypeElevenLabsSpeech,
+ ClientTypeElevenLabsTranscription,
ClientTypeDeepgramSpeech,
+ ClientTypeDeepgramTranscription,
ClientTypeMiniMaxSpeech,
ClientTypeVolcengineSpeech,
ClientTypeAlibabaSpeech,
ClientTypeMicrosoftSpeech,
- ClientTypeGoogleSpeech:
+ ClientTypeGoogleSpeech,
+ ClientTypeGoogleTranscription:
return true
default:
return false
@@ -449,7 +454,9 @@ func IsValidClientType(clientType ClientType) bool {
// IsLLMClientType returns true if the client type belongs to the LLM domain (chat/embedding),
// excluding speech and transcription types (any type ending in "-speech" or "-transcription").
func IsLLMClientType(clientType ClientType) bool {
- return IsValidClientType(clientType) && !strings.HasSuffix(string(clientType), "-speech")
+ return IsValidClientType(clientType) &&
+ !strings.HasSuffix(string(clientType), "-speech") &&
+ !strings.HasSuffix(string(clientType), "-transcription")
}
// SelectMemoryModel selects a chat model for memory operations.
diff --git a/internal/models/types.go b/internal/models/types.go
index a4ef8e1b..d0b180f6 100644
--- a/internal/models/types.go
+++ b/internal/models/types.go
@@ -18,22 +18,27 @@ const (
type ClientType string
const (
- ClientTypeOpenAIResponses ClientType = "openai-responses"
- ClientTypeOpenAICompletions ClientType = "openai-completions"
- ClientTypeAnthropicMessages ClientType = "anthropic-messages"
- ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai"
- ClientTypeOpenAICodex ClientType = "openai-codex"
- ClientTypeGitHubCopilot ClientType = "github-copilot"
- ClientTypeEdgeSpeech ClientType = "edge-speech"
- ClientTypeOpenAISpeech ClientType = "openai-speech"
- ClientTypeOpenRouterSpeech ClientType = "openrouter-speech"
- ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech"
- ClientTypeDeepgramSpeech ClientType = "deepgram-speech"
- ClientTypeMiniMaxSpeech ClientType = "minimax-speech"
- ClientTypeVolcengineSpeech ClientType = "volcengine-speech"
- ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech"
- ClientTypeMicrosoftSpeech ClientType = "microsoft-speech"
- ClientTypeGoogleSpeech ClientType = "google-speech"
+ ClientTypeOpenAIResponses ClientType = "openai-responses"
+ ClientTypeOpenAICompletions ClientType = "openai-completions"
+ ClientTypeAnthropicMessages ClientType = "anthropic-messages"
+ ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai"
+ ClientTypeOpenAICodex ClientType = "openai-codex"
+ ClientTypeGitHubCopilot ClientType = "github-copilot"
+ ClientTypeEdgeSpeech ClientType = "edge-speech"
+ ClientTypeOpenAISpeech ClientType = "openai-speech"
+ ClientTypeOpenAITranscription ClientType = "openai-transcription"
+ ClientTypeOpenRouterSpeech ClientType = "openrouter-speech"
+ ClientTypeOpenRouterTranscription ClientType = "openrouter-transcription"
+ ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech"
+ ClientTypeElevenLabsTranscription ClientType = "elevenlabs-transcription"
+ ClientTypeDeepgramSpeech ClientType = "deepgram-speech"
+ ClientTypeDeepgramTranscription ClientType = "deepgram-transcription"
+ ClientTypeMiniMaxSpeech ClientType = "minimax-speech"
+ ClientTypeVolcengineSpeech ClientType = "volcengine-speech"
+ ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech"
+ ClientTypeMicrosoftSpeech ClientType = "microsoft-speech"
+ ClientTypeGoogleSpeech ClientType = "google-speech"
+ ClientTypeGoogleTranscription ClientType = "google-transcription"
)
const (
diff --git a/internal/tts/bootstrap.go b/internal/tts/bootstrap.go
index 91e6a0d7..5f4cbe96 100644
--- a/internal/tts/bootstrap.go
+++ b/internal/tts/bootstrap.go
@@ -34,32 +34,34 @@ func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Querie
}
synced := 0
- for _, model := range def.Models {
- if shouldHideTemplateModel(def, models.ModelTypeSpeech, model.ID) {
- if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
- ProviderID: provider.ID,
- ModelID: model.ID,
- Type: string(models.ModelTypeSpeech),
- }); err != nil {
- return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
+ if !isTranscriptionClientType(def.ClientType) {
+ for _, model := range def.Models {
+ if shouldHideTemplateModel(def, models.ModelTypeSpeech, model.ID) {
+ if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
+ ProviderID: provider.ID,
+ ModelID: model.ID,
+ Type: string(models.ModelTypeSpeech),
+ }); err != nil {
+ return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
+ }
+ continue
}
- continue
+ modelConfigJSON, err := json.Marshal(map[string]any{})
+ if err != nil {
+ return fmt.Errorf("marshal speech model config: %w", err)
+ }
+ name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
+ if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
+ ModelID: model.ID,
+ Name: name,
+ ProviderID: provider.ID,
+ Type: string(models.ModelTypeSpeech),
+ Config: modelConfigJSON,
+ }); err != nil {
+ return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
+ }
+ synced++
}
- modelConfigJSON, err := json.Marshal(map[string]any{})
- if err != nil {
- return fmt.Errorf("marshal speech model config: %w", err)
- }
- name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
- if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
- ModelID: model.ID,
- Name: name,
- ProviderID: provider.ID,
- Type: string(models.ModelTypeSpeech),
- Config: modelConfigJSON,
- }); err != nil {
- return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
- }
- synced++
}
for _, model := range def.TranscriptionModels {
if shouldHideTemplateModel(def, models.ModelTypeTranscription, model.ID) {
diff --git a/internal/tts/registry.go b/internal/tts/registry.go
index 8a2d9ed1..ff4800a8 100644
--- a/internal/tts/registry.go
+++ b/internal/tts/registry.go
@@ -53,11 +53,60 @@ type Registry struct {
ordered []models.ClientType
}
+// isTranscriptionClientType reports whether clientType is one of the
+// transcription client types enumerated below. Every other value, including
+// the speech client types, yields false.
+func isTranscriptionClientType(clientType models.ClientType) bool {
+	switch clientType {
+	case models.ClientTypeOpenAITranscription,
+		models.ClientTypeOpenRouterTranscription,
+		models.ClientTypeElevenLabsTranscription,
+		models.ClientTypeDeepgramTranscription,
+		models.ClientTypeGoogleTranscription:
+		return true
+	}
+	return false
+}
+
+// speechToTranscriptionClientType returns the transcription client type paired
+// with the given speech client type. It returns "" for any client type that
+// has no transcription counterpart listed in the table below.
+func speechToTranscriptionClientType(clientType models.ClientType) models.ClientType {
+	counterparts := map[models.ClientType]models.ClientType{
+		models.ClientTypeOpenAISpeech:     models.ClientTypeOpenAITranscription,
+		models.ClientTypeOpenRouterSpeech: models.ClientTypeOpenRouterTranscription,
+		models.ClientTypeElevenLabsSpeech: models.ClientTypeElevenLabsTranscription,
+		models.ClientTypeDeepgramSpeech:   models.ClientTypeDeepgramTranscription,
+		models.ClientTypeGoogleSpeech:     models.ClientTypeGoogleTranscription,
+	}
+	if paired, ok := counterparts[clientType]; ok {
+		return paired
+	}
+	return ""
+}
+
+// transcriptionDisplayName derives a transcription provider label from a
+// speech provider label. Surrounding whitespace is trimmed first; a trailing
+// " Speech" is then replaced by " Transcription" (so "Google Speech" becomes
+// "Google Transcription"), and any other name simply gets " Transcription"
+// appended. A blank name yields plain "Transcription" rather than a label
+// with a stray leading space.
+func transcriptionDisplayName(displayName string) string {
+	// TrimSuffix is a no-op when the suffix is absent, so one expression
+	// covers both the "X Speech" and the plain "X" case.
+	base := strings.TrimSuffix(strings.TrimSpace(displayName), " Speech")
+	if base == "" {
+		return "Transcription"
+	}
+	return base + " Transcription"
+}
+
func NewRegistry() *Registry {
r := &Registry{
providers: make(map[models.ClientType]ProviderDefinition),
}
- for _, def := range defaultProviderDefinitions() {
+ baseDefs := defaultProviderDefinitions()
+ for _, def := range baseDefs {
+ if def.Factory == nil && def.TranscriptionFactory != nil {
+ continue
+ }
+ r.Register(def)
+ }
+ for _, def := range transcriptionProviderDefinitions(baseDefs) {
r.Register(def)
}
return r
@@ -123,6 +172,81 @@ func (r *Registry) ListMeta() []ProviderMetaResponse {
return metas
}
+// ListSpeechMeta returns one ProviderMetaResponse per registered provider
+// whose Factory is non-nil, in the order produced by r.List(). The synthesis
+// fields mirror the generic ones: DefaultModel, Models and SupportsList are
+// each copied into their Synthesis-named counterpart as well.
+func (r *Registry) ListSpeechMeta() []ProviderMetaResponse {
+	registered := r.List()
+	result := make([]ProviderMetaResponse, 0, len(registered))
+	for _, provider := range registered {
+		// Providers without a Factory are left out of the speech listing.
+		if provider.Factory != nil {
+			result = append(result, ProviderMetaResponse{
+				Provider:              string(provider.ClientType),
+				DisplayName:           provider.DisplayName,
+				Description:           provider.Description,
+				ConfigSchema:          provider.ConfigSchema,
+				DefaultModel:          provider.DefaultModel,
+				Models:                provider.Models,
+				DefaultSynthesisModel: provider.DefaultModel,
+				SynthesisModels:       provider.Models,
+				SupportsSynthesisList: provider.SupportsList,
+			})
+		}
+	}
+	return result
+}
+
+// ListTranscriptionMeta returns one ProviderMetaResponse per registered
+// provider that has a non-nil TranscriptionFactory and a transcription client
+// type, in the order produced by r.List(). The model list is the provider's
+// TranscriptionModels, falling back to Models when that list is empty.
+func (r *Registry) ListTranscriptionMeta() []ProviderMetaResponse {
+	registered := r.List()
+	result := make([]ProviderMetaResponse, 0, len(registered))
+	for _, provider := range registered {
+		if provider.TranscriptionFactory != nil && isTranscriptionClientType(provider.ClientType) {
+			available := provider.TranscriptionModels
+			if len(available) == 0 {
+				available = provider.Models
+			}
+			result = append(result, ProviderMetaResponse{
+				Provider:                  string(provider.ClientType),
+				DisplayName:               provider.DisplayName,
+				Description:               provider.Description,
+				ConfigSchema:              provider.ConfigSchema,
+				DefaultModel:              provider.DefaultTranscriptionModel,
+				Models:                    available,
+				DefaultTranscriptionModel: provider.DefaultTranscriptionModel,
+				TranscriptionModels:       available,
+				SupportsTranscriptionList: provider.SupportsTranscriptionList,
+			})
+		}
+	}
+	return result
+}
+
+// transcriptionProviderDefinitions builds a transcription ProviderDefinition
+// for each base definition whose client type has a transcription counterpart
+// and whose TranscriptionFactory is non-nil. The derived definition reuses the
+// base Icon and ConfigSchema, exposes the transcription model list and default
+// through both the generic and the transcription-specific fields, leaves
+// Factory unset, trims the description, and takes the base Order plus one.
+func transcriptionProviderDefinitions(base []ProviderDefinition) []ProviderDefinition {
+	derived := make([]ProviderDefinition, 0, len(base))
+	for _, speechDef := range base {
+		target := speechToTranscriptionClientType(speechDef.ClientType)
+		if target != "" && speechDef.TranscriptionFactory != nil {
+			derived = append(derived, ProviderDefinition{
+				ClientType:                target,
+				DisplayName:               transcriptionDisplayName(speechDef.DisplayName),
+				Icon:                      speechDef.Icon,
+				Description:               strings.TrimSpace(speechDef.Description),
+				ConfigSchema:              speechDef.ConfigSchema,
+				DefaultModel:              speechDef.DefaultTranscriptionModel,
+				SupportsList:              speechDef.SupportsTranscriptionList,
+				Models:                    speechDef.TranscriptionModels,
+				DefaultTranscriptionModel: speechDef.DefaultTranscriptionModel,
+				SupportsTranscriptionList: speechDef.SupportsTranscriptionList,
+				TranscriptionModels:       speechDef.TranscriptionModels,
+				TranscriptionFactory:      speechDef.TranscriptionFactory,
+				Order:                     speechDef.Order + 1,
+			})
+		}
+	}
+	return derived
+}
+
func defaultProviderDefinitions() []ProviderDefinition {
edgeVoices := make([]VoiceInfo, 0)
for lang, ids := range edgespeech.EdgeTTSVoices {
diff --git a/internal/tts/service.go b/internal/tts/service.go
index eb4da940..17c3fe51 100644
--- a/internal/tts/service.go
+++ b/internal/tts/service.go
@@ -35,6 +35,14 @@ func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
return s.registry.ListMeta()
}
+// ListSpeechMeta returns the result of the registry's ListSpeechMeta.
+// The context argument is accepted but unused.
+func (s *Service) ListSpeechMeta(_ context.Context) []ProviderMetaResponse {
+ return s.registry.ListSpeechMeta()
+}
+
+// ListTranscriptionMeta returns the result of the registry's
+// ListTranscriptionMeta. The context argument is accepted but unused.
+func (s *Service) ListTranscriptionMeta(_ context.Context) []ProviderMetaResponse {
+ return s.registry.ListTranscriptionMeta()
+}
+
func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
rows, err := s.queries.ListSpeechProviders(ctx)
if err != nil {
@@ -47,6 +55,18 @@ func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResp
return items, nil
}
+// ListTranscriptionProviders fetches the transcription provider rows through
+// s.queries and converts each one with toSpeechProviderResponse. A query
+// failure is returned wrapped with the "list transcription providers" prefix;
+// on success the slice is non-nil even when there are no rows.
+func (s *Service) ListTranscriptionProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
+	providerRows, queryErr := s.queries.ListTranscriptionProviders(ctx)
+	if queryErr != nil {
+		return nil, fmt.Errorf("list transcription providers: %w", queryErr)
+	}
+	responses := make([]SpeechProviderResponse, len(providerRows))
+	for i := range providerRows {
+		responses[i] = toSpeechProviderResponse(providerRows[i])
+	}
+	return responses, nil
+}
+
func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) {
pgID, err := db.ParseUUID(id)
if err != nil {