Revert "Feat/speech support (#392)"

This reverts commit c9dcfe287f.
This commit is contained in:
Acbox
2026-04-22 00:10:36 +08:00
committed by GitHub
parent c9dcfe287f
commit 63fe03cfff
70 changed files with 1689 additions and 6609 deletions
+14 -49
View File
@@ -18,7 +18,6 @@
<div class="flex flex-col gap-3 mt-4">
<!-- Type -->
<FormField
v-if="!hideType"
v-slot="{ componentField }"
name="type"
>
@@ -36,12 +35,11 @@
</SelectTrigger>
<SelectContent>
<SelectGroup>
<SelectItem
v-for="opt in typeOptions"
:key="opt.value"
:value="opt.value"
>
{{ opt.label }}
<SelectItem value="chat">
Chat
</SelectItem>
<SelectItem value="embedding">
Embedding
</SelectItem>
</SelectGroup>
</SelectContent>
@@ -183,11 +181,6 @@ import { COMPATIBILITY_OPTIONS } from '@/constants/compatibilities'
import FormDialogShell from '@/components/form-dialog-shell/index.vue'
import { useDialogMutation } from '@/composables/useDialogMutation'
interface ModelTypeOption {
value: string
label: string
}
const selectedCompat = ref<string[]>([])
const { t } = useI18n()
const { run } = useDialogMutation()
@@ -200,30 +193,14 @@ const formSchema = toTypedSchema(z.object({
context_window: z.coerce.number().min(1).optional(),
}))
const props = withDefaults(defineProps<{
id: string
typeOptions?: ModelTypeOption[]
defaultType?: string
hideType?: boolean
invalidateKeys?: string[]
}>(), {
typeOptions: () => [
{ value: 'chat', label: 'Chat' },
{ value: 'embedding', label: 'Embedding' },
],
defaultType: 'chat',
hideType: false,
invalidateKeys: () => ['provider-models'],
})
const form = useForm({
validationSchema: formSchema,
initialValues: {
type: props.defaultType,
type: 'chat',
},
})
const selectedType = computed(() => form.values.type || props.defaultType)
const selectedType = computed(() => form.values.type || 'chat')
const open = inject<Ref<boolean>>('openModel', ref(false))
const title = inject<Ref<'edit' | 'title'>>('openModelTitle', ref('title'))
@@ -260,19 +237,15 @@ function onNameInput(e: Event) {
form.setFieldValue('name', (e.target as HTMLInputElement).value)
}
const queryCache = useQueryCache()
function invalidateModelQueries() {
for (const key of props.invalidateKeys) {
queryCache.invalidateQueries({ key: [key] })
}
}
const { id } = defineProps<{ id: string }>()
const queryCache = useQueryCache()
const { mutateAsync: createModel, isLoading: createLoading } = useMutation({
mutation: async (data: Record<string, unknown>) => {
const { data: result } = await postModels({ body: data as ModelsAddRequest, throwOnError: true })
return result
},
onSettled: invalidateModelQueries,
onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
})
const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({
mutation: async ({ id, data }: { id: string; data: Record<string, unknown> }) => {
@@ -283,7 +256,7 @@ const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({
})
return result
},
onSettled: invalidateModelQueries,
onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
})
const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading } = useMutation({
mutation: async ({ modelId, data }: { modelId: string; data: Record<string, unknown> }) => {
@@ -294,7 +267,7 @@ const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading
})
return result
},
onSettled: invalidateModelQueries,
onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
})
const isLoading = computed(() => createLoading.value || updateLoading.value || updateLegacyLoading.value)
@@ -324,7 +297,7 @@ async function addModel() {
const payload: Record<string, unknown> = {
type,
model_id,
provider_id: props.id,
provider_id: id,
config,
}
@@ -375,15 +348,7 @@ watch(open, async () => {
selectedCompat.value = config?.compatibilities ?? []
userEditedName.value = !!(name && name !== model_id)
} else {
form.resetForm({
values: {
type: props.defaultType,
model_id: '',
name: '',
dimensions: undefined,
context_window: undefined,
},
})
form.resetForm({ values: { type: 'chat', model_id: '', name: '', dimensions: undefined, context_window: undefined } })
selectedCompat.value = []
userEditedName.value = false
}
@@ -52,7 +52,7 @@ import { computed, type Component } from 'vue'
import { storeToRefs } from 'pinia'
import { useRouter, useRoute } from 'vue-router'
import { useI18n } from 'vue-i18n'
import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, AudioLines, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next'
import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next'
import { useChatSelectionStore } from '@/store/chat-selection'
import {
Sidebar,
@@ -118,11 +118,6 @@ const navItems = computed<{ title: string; name: string; icon: Component }[]>(()
name: 'speech',
icon: Volume2,
},
{
title: t('sidebar.transcription'),
name: 'transcription',
icon: AudioLines,
},
{
title: t('sidebar.email'),
name: 'email',
+1 -31
View File
@@ -45,41 +45,21 @@ export const CLIENT_TYPE_META: Record<string, ClientTypeMeta> = {
label: 'OpenAI Speech',
hint: 'OpenAI /audio/speech compatible TTS',
},
'openai-transcription': {
value: 'openai-transcription',
label: 'OpenAI Transcription',
hint: 'OpenAI audio transcription',
},
'openrouter-speech': {
value: 'openrouter-speech',
label: 'OpenRouter Speech',
hint: 'OpenRouter audio modality TTS',
},
'openrouter-transcription': {
value: 'openrouter-transcription',
label: 'OpenRouter Transcription',
hint: 'OpenRouter transcription models',
},
'elevenlabs-speech': {
value: 'elevenlabs-speech',
label: 'ElevenLabs Speech',
hint: 'ElevenLabs text-to-speech',
},
'elevenlabs-transcription': {
value: 'elevenlabs-transcription',
label: 'ElevenLabs Transcription',
hint: 'ElevenLabs speech-to-text',
},
'deepgram-speech': {
value: 'deepgram-speech',
label: 'Deepgram Speech',
hint: 'Deepgram TTS',
},
'deepgram-transcription': {
value: 'deepgram-transcription',
label: 'Deepgram Transcription',
hint: 'Deepgram speech-to-text',
},
'minimax-speech': {
value: 'minimax-speech',
label: 'MiniMax Speech',
@@ -100,19 +80,9 @@ export const CLIENT_TYPE_META: Record<string, ClientTypeMeta> = {
label: 'Microsoft Speech',
hint: 'Azure Cognitive Services TTS',
},
'google-speech': {
value: 'google-speech',
label: 'Google Speech',
hint: 'Gemini speech transcription',
},
'google-transcription': {
value: 'google-transcription',
label: 'Google Transcription',
hint: 'Gemini speech transcription',
},
}
export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META)
export const LLM_CLIENT_TYPE_LIST: ClientTypeMeta[] = CLIENT_TYPE_LIST
.filter(ct => !ct.value.endsWith('-speech') && !ct.value.endsWith('-transcription'))
.filter(ct => !ct.value.endsWith('-speech'))
-27
View File
@@ -63,7 +63,6 @@
"webSearch": "Web Search",
"memory": "Memory",
"speech": "Speech",
"transcription": "Transcription",
"email": "Email",
"settings": "Settings",
"profile": "Profile",
@@ -426,9 +425,6 @@
"noModels": "No models found. Click \"Import Models\" to discover available models or \"Add Model\" to create one manually.",
"noCapabilities": "No capabilities available for this model.",
"saveSuccess": "Speech configuration saved",
"synthesis": {
"models": "Synthesis Models"
},
"advanced": {
"title": "Advanced Settings",
"description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults."
@@ -452,27 +448,6 @@
"failed": "Synthesis failed"
}
},
"transcription": {
"title": "Transcription",
"emptyTitle": "No Transcription Providers",
"emptyDescription": "Add a transcription provider to enable speech-to-text for your bots",
"models": "Transcription Models",
"noModels": "No transcription models found. Import available models or keep the default template model.",
"noCapabilities": "No capabilities available for this model.",
"importModels": "Import Models",
"importSuccess": "Transcription models imported successfully",
"importFailed": "Failed to import transcription models",
"saveSuccess": "Transcription configuration saved",
"advanced": {
"title": "Advanced Settings",
"description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults."
},
"test": {
"title": "Test Transcription",
"run": "Transcribe",
"failed": "Transcription failed"
}
},
"email": {
"title": "Email",
"add": "Add Email",
@@ -945,8 +920,6 @@
"memoryHealthUnavailable": "Unavailable",
"ttsModel": "TTS Model",
"ttsModelPlaceholder": "Select TTS model",
"transcriptionModel": "Transcription Model",
"transcriptionModelPlaceholder": "Select transcription model",
"imageModel": "Image Generation Model",
"imageModelDescription": "Model used for the generate_image tool. Must support image-output compatibility.",
"imageModelPlaceholder": "Select image model (optional)",
-27
View File
@@ -64,7 +64,6 @@
"webSearch": "搜索",
"memory": "记忆",
"speech": "语音",
"transcription": "转写",
"email": "邮件",
"profile": "用户",
"home": "首页",
@@ -422,9 +421,6 @@
"noModels": "暂无模型,点击\"导入模型\"发现可用模型,或点击\"新建模型\"手动创建。",
"noCapabilities": "该模型暂无可用能力信息。",
"saveSuccess": "语音配置已保存",
"synthesis": {
"models": "语音合成模型"
},
"advanced": {
"title": "高级设置",
"description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。"
@@ -448,27 +444,6 @@
"failed": "合成失败"
}
},
"transcription": {
"title": "语音转写",
"emptyTitle": "暂无转写提供方",
"emptyDescription": "添加转写提供方以为 Bot 启用语音转文字功能",
"models": "语音识别模型",
"noModels": "暂无语音识别模型,可导入可用模型,或保留默认模板模型。",
"importModels": "导入模型",
"importSuccess": "识别模型导入成功",
"importFailed": "识别模型导入失败",
"saveSuccess": "转写配置已保存",
"noCapabilities": "该模型暂无可用能力信息。",
"advanced": {
"title": "高级设置",
"description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。"
},
"test": {
"title": "测试识别",
"run": "开始识别",
"failed": "识别失败"
}
},
"email": {
"title": "邮件提供方",
"add": "添加邮件提供方",
@@ -941,8 +916,6 @@
"memoryHealthUnavailable": "暂不可用",
"ttsModel": "语音合成模型",
"ttsModelPlaceholder": "选择语音合成模型",
"transcriptionModel": "转写模型",
"transcriptionModelPlaceholder": "选择语音转写模型",
"imageModel": "图片生成模型",
"imageModelDescription": "用于 generate_image 工具的模型,必须支持 image-output 兼容性。",
"imageModelPlaceholder": "选择图片模型(可选)",
@@ -187,17 +187,6 @@
/>
</div>
<!-- Transcription Model -->
<div class="space-y-2">
<Label>{{ $t('bots.settings.transcriptionModel') }}</Label>
<TtsModelSelect
v-model="form.transcription_model_id"
:models="transcriptionModels"
:providers="ttsProviders"
:placeholder="$t('bots.settings.transcriptionModelPlaceholder')"
/>
</div>
<!-- Image Generation Model -->
<div class="space-y-2">
<Label>{{ $t('bots.settings.imageModel') }}</Label>
@@ -367,7 +356,7 @@ import MemoryProviderSelect from './memory-provider-select.vue'
import TtsModelSelect from './tts-model-select.vue'
import BrowserContextSelect from './browser-context-select.vue'
import { useQuery, useMutation, useQueryCache } from '@pinia/colada'
import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getTranscriptionProviders, getTranscriptionModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk'
import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk'
import type { SettingsSettings } from '@memohai/sdk'
import type { Ref } from 'vue'
import { resolveApiErrorMessage } from '@/utils/api-error'
@@ -451,22 +440,6 @@ const { data: ttsModelData } = useQuery({
},
})
const { data: transcriptionModelData } = useQuery({
key: ['transcription-models'],
query: async () => {
const { data } = await getTranscriptionModels({ throwOnError: true })
return data
},
})
const { data: transcriptionProviderData } = useQuery({
key: ['transcription-providers'],
query: async () => {
const { data } = await getTranscriptionProviders({ throwOnError: true })
return data
},
})
const { data: browserContextData } = useQuery({
key: ['all-browser-contexts'],
query: async () => {
@@ -521,10 +494,7 @@ const searchProviders = computed(() => (searchProviderData.value ?? []).filter((
const memoryProviders = computed(() => memoryProviderData.value ?? [])
const ttsProviders = computed(() => (ttsProviderData.value ?? []).filter((p) => p.enable !== false))
const enabledTtsProviderIds = computed(() => new Set(ttsProviders.value.map((p) => p.id)))
const transcriptionProviders = computed(() => (transcriptionProviderData.value ?? []).filter((p: Record<string, unknown>) => p.enable !== false))
const enabledTranscriptionProviderIds = computed(() => new Set(transcriptionProviders.value.map((p: Record<string, unknown>) => p.id as string)))
const ttsModels = computed(() => (ttsModelData.value ?? []).filter((m: Record<string, unknown>) => enabledTtsProviderIds.value.has(m.provider_id as string)))
const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record<string, unknown>) => enabledTranscriptionProviderIds.value.has(m.provider_id as string)))
const browserContexts = computed(() => browserContextData.value ?? [])
// ---- Form ----
@@ -535,7 +505,6 @@ const form = reactive({
search_provider_id: '',
memory_provider_id: '',
tts_model_id: '',
transcription_model_id: '',
browser_context_id: '',
timezone: '',
language: '',
@@ -675,7 +644,6 @@ watch(settings, (val) => {
form.search_provider_id = val.search_provider_id ?? ''
form.memory_provider_id = val.memory_provider_id ?? ''
form.tts_model_id = val.tts_model_id ?? ''
form.transcription_model_id = val.transcription_model_id ?? ''
form.browser_context_id = val.browser_context_id ?? ''
form.language = val.language ?? ''
form.timezone = val.timezone ?? ''
@@ -698,7 +666,6 @@ const hasSettingsChanges = computed(() => {
|| form.search_provider_id !== (s.search_provider_id ?? '')
|| form.memory_provider_id !== (s.memory_provider_id ?? '')
|| form.tts_model_id !== (s.tts_model_id ?? '')
|| form.transcription_model_id !== (s.transcription_model_id ?? '')
|| form.browser_context_id !== (s.browser_context_id ?? '')
|| form.language !== (s.language ?? '')
|| form.timezone !== (s.timezone ?? '')
@@ -85,7 +85,7 @@
v-else-if="advancedFields.length === 0"
class="text-xs text-muted-foreground"
>
{{ mode === 'transcription' ? $t('transcription.noCapabilities') : $t('speech.noCapabilities') }}
{{ $t('speech.noCapabilities') }}
</div>
<div
@@ -97,7 +97,7 @@
class="flex w-full items-center justify-between px-3 py-2 text-left text-xs font-medium"
@click="showAdvanced = !showAdvanced"
>
<span>{{ mode === 'transcription' ? $t('transcription.advanced.title') : $t('speech.advanced.title') }}</span>
<span>{{ $t('speech.advanced.title') }}</span>
<component
:is="showAdvanced ? ChevronUp : ChevronDown"
class="size-3 text-muted-foreground"
@@ -108,7 +108,7 @@
class="space-y-4 border-t border-border px-3 py-3"
>
<p class="text-xs text-muted-foreground">
{{ mode === 'transcription' ? $t('transcription.advanced.description') : $t('speech.advanced.description') }}
{{ $t('speech.advanced.description') }}
</p>
<section
v-for="field in advancedFields"
@@ -195,12 +195,9 @@
<div class="space-y-3">
<h4 class="text-xs font-medium">
{{ mode === 'transcription' ? $t('transcription.test.title') : $t('speech.test.title') }}
{{ $t('speech.test.title') }}
</h4>
<div
v-if="mode === 'synthesis'"
class="relative"
>
<div class="relative">
<Textarea
v-model="testText"
:placeholder="$t('speech.test.placeholder')"
@@ -212,36 +209,17 @@
{{ testText.length }}/{{ maxTestTextLen }}
</span>
</div>
<div
v-else
class="space-y-2"
>
<Input
type="file"
accept="audio/*"
@change="handleFileChange"
/>
<p
v-if="selectedFileName"
class="text-xs text-muted-foreground"
>
{{ selectedFileName }}
</p>
</div>
<div class="flex items-center gap-3">
<LoadingButton
type="button"
variant="outline"
size="sm"
:loading="testLoading"
:disabled="mode === 'synthesis' ? (!testText.trim() || testText.length > maxTestTextLen) : !selectedFile"
:disabled="!testText.trim() || testText.length > maxTestTextLen"
@click="handleTest"
>
<Play
v-if="mode === 'synthesis'"
class="mr-1.5"
/>
{{ mode === 'transcription' ? $t('transcription.test.run') : $t('speech.test.generate') }}
<Play class="mr-1.5" />
{{ $t('speech.test.generate') }}
</LoadingButton>
<span
v-if="testError"
@@ -251,7 +229,7 @@
</span>
</div>
<div
v-if="mode === 'synthesis' && audioUrl"
v-if="audioUrl"
class="rounded-md border border-border bg-muted/30 p-3"
>
<audio
@@ -261,20 +239,6 @@
class="w-full"
/>
</div>
<div
v-if="mode === 'transcription' && transcriptionText"
class="rounded-md border border-border bg-muted/30 p-3 space-y-2"
>
<p class="text-sm whitespace-pre-wrap wrap-break-word">
{{ transcriptionText }}
</p>
<p
v-if="transcriptionLanguage"
class="text-xs text-muted-foreground"
>
{{ transcriptionLanguage }}
</p>
</div>
</div>
<Separator class="my-3" />
@@ -332,8 +296,7 @@ const props = defineProps<{
modelName: string
config: Record<string, unknown>
schema: SpeechConfigSchema | null
mode?: 'synthesis' | 'transcription'
onTest: (payload: string | File, config: Record<string, unknown>) => Promise<Blob | { text?: string, language?: string }>
onTest: (text: string, config: Record<string, unknown>) => Promise<Blob>
}>()
const emit = defineEmits<{
@@ -346,16 +309,11 @@ const visibleSecrets = reactive<Record<string, boolean>>({})
const saving = ref(false)
const showAdvanced = ref(false)
const testText = ref('')
const selectedFile = ref<File | null>(null)
const selectedFileName = ref('')
const testLoading = ref(false)
const testError = ref('')
const audioUrl = ref('')
const transcriptionText = ref('')
const transcriptionLanguage = ref('')
const audioEl = ref<HTMLAudioElement>()
const maxTestTextLen = 500
const mode = computed(() => props.mode ?? 'synthesis')
const orderedFields = computed(() => {
const fields = props.schema?.fields ?? []
@@ -390,11 +348,6 @@ function revokeAudio() {
}
}
function resetTranscription() {
transcriptionText.value = ''
transcriptionLanguage.value = ''
}
onBeforeUnmount(revokeAudio)
async function handleSaveConfig() {
@@ -407,39 +360,23 @@ async function handleSaveConfig() {
}
async function handleTest() {
if (mode.value === 'synthesis' && !testText.value.trim()) return
if (mode.value === 'transcription' && !selectedFile.value) return
if (!testText.value.trim()) return
testLoading.value = true
testError.value = ''
revokeAudio()
resetTranscription()
try {
const result = await props.onTest(mode.value === 'synthesis' ? testText.value : selectedFile.value as File, buildConfig())
const blob = await props.onTest(testText.value, buildConfig())
if (mode.value === 'synthesis') {
const blob = result as Blob
audioUrl.value = URL.createObjectURL(blob)
await new Promise<void>((resolve) => setTimeout(resolve, 50))
audioEl.value?.play()
} else {
const payload = result as { text?: string, language?: string }
transcriptionText.value = payload.text ?? ''
transcriptionLanguage.value = payload.language ?? ''
}
audioUrl.value = URL.createObjectURL(blob)
await new Promise<void>((resolve) => setTimeout(resolve, 50))
audioEl.value?.play()
} catch (error: unknown) {
const msg = error instanceof Error ? error.message : t(mode.value === 'transcription' ? 'transcription.test.failed' : 'speech.test.failed')
const msg = error instanceof Error ? error.message : t('speech.test.failed')
testError.value = msg
toast.error(msg)
} finally {
testLoading.value = false
}
}
function handleFileChange(event: Event) {
const input = event.target as HTMLInputElement
const file = input.files?.[0] ?? null
selectedFile.value = file
selectedFileName.value = file?.name ?? ''
}
</script>
@@ -138,29 +138,18 @@
<section>
<div class="flex justify-between items-center mb-4">
<h3 class="text-xs font-medium">
{{ $t('speech.synthesis.models') }}
{{ $t('speech.models') }}
</h3>
<div
<LoadingButton
v-if="curProviderId"
class="flex items-center gap-2"
type="button"
variant="outline"
size="sm"
:loading="importLoading"
@click="handleImportModels"
>
<LoadingButton
type="button"
variant="outline"
size="sm"
:loading="importLoading"
@click="handleImportModels"
>
{{ $t('speech.importModels') }}
</LoadingButton>
<CreateModel
:id="curProviderId"
default-type="speech"
hide-type
:type-options="speechTypeOptions"
:invalidate-keys="['speech-provider-models', 'speech-models']"
/>
</div>
{{ $t('speech.importModels') }}
</LoadingButton>
</div>
<div
@@ -202,7 +191,7 @@
:model-name="model.model_id ?? ''"
:config="model.config || {}"
:schema="getModelSchema(model.model_id ?? '')"
:on-test="(text, cfg) => handleTestModel(model.id ?? '', text as string, cfg)"
:on-test="(text, cfg) => handleTestModel(model.id ?? '', text, cfg)"
@save="(cfg) => handleSaveModel(model.id ?? '', cfg)"
/>
</div>
@@ -229,11 +218,10 @@ import { computed, inject, reactive, ref, watch } from 'vue'
import { toast } from 'vue-sonner'
import { useI18n } from 'vue-i18n'
import { useQuery, useQueryCache } from '@pinia/colada'
import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putProvidersById } from '@memohai/sdk'
import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putModelsById, putProvidersById } from '@memohai/sdk'
import type { TtsSpeechModelResponse, TtsSpeechProviderResponse } from '@memohai/sdk'
import LoadingButton from '@/components/loading-button/index.vue'
import ProviderIcon from '@/components/provider-icon/index.vue'
import CreateModel from '@/components/create-model/index.vue'
interface SpeechFieldSchema {
key: string
@@ -268,8 +256,6 @@ interface SpeechProviderMeta {
config_schema?: SpeechConfigSchema
default_model?: string
models?: SpeechModelMeta[]
default_synthesis_model?: string
synthesis_models?: SpeechModelMeta[]
}
function getInitials(name: string | undefined) {
@@ -288,9 +274,6 @@ const enableLoading = ref(false)
const saveLoading = ref(false)
const importLoading = ref(false)
const queryCache = useQueryCache()
const speechTypeOptions = [
{ value: 'speech', label: 'Speech' },
]
const { data: providerDetail } = useQuery({
key: () => ['speech-provider-detail', curProviderId.value],
@@ -314,7 +297,7 @@ const { data: metaList } = useQuery({
const currentMeta = computed(() => {
if (!metaList.value || !curProvider.value?.client_type) return null
return (metaList.value as SpeechProviderMeta[]).find(m => m.provider === curProvider.value?.client_type) ?? null
return (metaList.value as SpeechProviderMeta[]).find((m) => m.provider === curProvider.value?.client_type) ?? null
})
const orderedProviderFields = computed(() => {
@@ -334,7 +317,9 @@ const { data: providerSpeechModels } = useQuery({
},
})
const providerModels = computed(() => ((providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []))
const providerModels = computed(() => {
return (providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []
})
watch(() => providerDetail.value, (provider) => {
providerName.value = provider?.name ?? curProvider.value?.name ?? ''
@@ -343,11 +328,12 @@ watch(() => providerDetail.value, (provider) => {
}, { immediate: true, deep: true })
function getModelMeta(modelID: string): SpeechModelMeta | null {
const models = currentMeta.value?.synthesis_models ?? currentMeta.value?.models ?? []
const models = currentMeta.value?.models ?? []
const exact = models.find(m => m.id === modelID)
if (exact) return exact
const defaultModel = currentMeta.value?.default_synthesis_model ?? currentMeta.value?.default_model
if (defaultModel) return models.find(m => m.id === defaultModel) ?? null
if (currentMeta.value?.default_model) {
return models.find(m => m.id === currentMeta.value?.default_model) ?? null
}
return models[0] ?? null
}
@@ -412,23 +398,20 @@ async function handleSaveProvider() {
}
async function handleSaveModel(modelId: string, config: Record<string, unknown>) {
const model = providerModels.value.find(item => item.id === modelId)
const model = providerModels.value.find((item) => item.id === modelId)
if (!model) return
try {
const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
const token = localStorage.getItem('token')
const resp = await fetch(`${apiBase}/speech-models/${modelId}`, {
method: 'PUT',
headers: {
'Content-Type': 'application/json',
...(token ? { Authorization: `Bearer ${token}` } : {}),
},
body: JSON.stringify({
await putModelsById({
path: { id: modelId },
body: {
model_id: model.model_id,
name: model.name ?? model.model_id,
provider_id: model.provider_id,
type: 'speech',
config,
}),
},
throwOnError: true,
})
if (!resp.ok) throw new Error(await resp.text())
toast.success(t('speech.saveSuccess'))
queryCache.invalidateQueries({ key: ['speech-provider-models', curProviderId.value] })
queryCache.invalidateQueries({ key: ['speech-models'] })
-126
View File
@@ -1,126 +0,0 @@
<script setup lang="ts">
import { computed, ref, provide, watch } from 'vue'
import { useQuery } from '@pinia/colada'
import {
ScrollArea,
SidebarMenu,
SidebarMenuButton,
SidebarMenuItem,
Toggle,
Empty,
EmptyDescription,
EmptyHeader,
EmptyMedia,
EmptyTitle,
} from '@memohai/ui'
import { getTranscriptionProviders } from '@memohai/sdk'
import type { AudioSpeechProviderResponse } from '@memohai/sdk'
import ProviderSetting from './provider-setting.vue'
import { AudioLines } from 'lucide-vue-next'
import MasterDetailSidebarLayout from '@/components/master-detail-sidebar-layout/index.vue'
import ProviderIcon from '@/components/provider-icon/index.vue'
function getInitials(name: string | undefined) {
const label = name?.trim() ?? ''
return label ? label.slice(0, 2).toUpperCase() : '?'
}
const { data: providerData } = useQuery({
key: () => ['transcription-providers'],
query: async () => {
const { data } = await getTranscriptionProviders({ throwOnError: true })
return (data ?? []) as AudioSpeechProviderResponse[]
},
})
const curProvider = ref<AudioSpeechProviderResponse>()
provide('curTranscriptionProvider', curProvider)
const selectProvider = (name: string) => computed(() => curProvider.value?.name === name)
const filteredProviders = computed(() => {
if (!Array.isArray(providerData.value)) return []
return [...providerData.value].sort((a, b) => Number(b.enable !== false) - Number(a.enable !== false))
})
watch(filteredProviders, (list) => {
if (!list || list.length === 0) {
curProvider.value = { id: '' }
return
}
const currentId = curProvider.value?.id
if (currentId) {
const stillExists = list.find(p => p.id === currentId)
if (stillExists) {
curProvider.value = stillExists
return
}
}
curProvider.value = list[0]
}, { immediate: true })
</script>
<template>
<MasterDetailSidebarLayout>
<template #sidebar-content>
<SidebarMenu
v-for="item in filteredProviders"
:key="item.id"
>
<SidebarMenuItem>
<SidebarMenuButton
as-child
class="justify-start py-5! px-4"
>
<Toggle
:class="['py-4 border', curProvider?.id === item.id ? 'border-border' : 'border-transparent']"
:model-value="selectProvider(item.name ?? '').value"
@update:model-value="(isSelect) => { if (isSelect) curProvider = item }"
>
<span class="relative shrink-0">
<span class="flex size-7 items-center justify-center rounded-full bg-muted">
<ProviderIcon
v-if="item.icon"
:icon="item.icon"
size="1.25em"
/>
<span
v-else
class="text-xs font-medium text-muted-foreground"
>
{{ getInitials(item.name) }}
</span>
</span>
<span
v-if="item.enable !== false"
class="absolute -bottom-0.5 -right-0.5 size-2.5 rounded-full bg-green-500 ring-2 ring-background"
/>
</span>
<span class="truncate">{{ item.name }}</span>
</Toggle>
</SidebarMenuButton>
</SidebarMenuItem>
</SidebarMenu>
</template>
<template #detail>
<ScrollArea
v-if="curProvider?.id"
class="max-h-full h-full"
>
<ProviderSetting />
</ScrollArea>
<Empty
v-else
class="h-full flex justify-center items-center"
>
<EmptyHeader>
<EmptyMedia variant="icon">
<AudioLines />
</EmptyMedia>
</EmptyHeader>
<EmptyTitle>{{ $t('transcription.emptyTitle') }}</EmptyTitle>
<EmptyDescription>{{ $t('transcription.emptyDescription') }}</EmptyDescription>
</Empty>
</template>
</MasterDetailSidebarLayout>
</template>
@@ -1,480 +0,0 @@
<template>
<div class="p-4">
<section class="flex items-center gap-3">
<span class="flex size-10 shrink-0 items-center justify-center rounded-full bg-muted">
<ProviderIcon
v-if="curProvider?.icon"
:icon="curProvider.icon"
size="1.5em"
/>
<span
v-else
class="text-xs font-medium text-muted-foreground"
>
{{ getInitials(curProvider?.name) }}
</span>
</span>
<div class="min-w-0">
<h2 class="text-sm font-semibold truncate">
{{ curProvider?.name }}
</h2>
<p class="text-xs text-muted-foreground">
{{ currentMeta?.display_name ?? curProvider?.client_type }}
</p>
</div>
<div class="ml-auto flex items-center gap-2">
<span class="text-xs text-muted-foreground">
{{ $t('common.enable') }}
</span>
<Switch
:model-value="curProvider?.enable ?? false"
:disabled="!curProvider?.id || enableLoading"
@update:model-value="handleToggleEnable"
/>
</div>
</section>
<Separator class="mt-4 mb-6" />
<form
class="space-y-4"
@submit.prevent="handleSaveProvider"
>
<section class="space-y-2">
<Label for="transcription-provider-name">{{ $t('common.name') }}</Label>
<Input
id="transcription-provider-name"
v-model="providerName"
type="text"
:placeholder="$t('common.namePlaceholder')"
/>
</section>
<section
v-for="field in orderedProviderFields"
:key="field.key"
class="space-y-2"
>
<Label :for="field.type === 'bool' || field.type === 'enum' ? undefined : `transcription-provider-${field.key}`">
{{ field.title || field.key }}
</Label>
<p
v-if="field.description"
class="text-xs text-muted-foreground"
>
{{ field.description }}
</p>
<div
v-if="field.type === 'secret'"
class="relative"
>
<Input
:id="`transcription-provider-${field.key}`"
v-model="providerConfig[field.key] as string"
:type="visibleSecrets[field.key] ? 'text' : 'password'"
/>
<button
type="button"
class="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground"
@click="visibleSecrets[field.key] = !visibleSecrets[field.key]"
>
<component
:is="visibleSecrets[field.key] ? EyeOff : Eye"
class="size-3.5"
/>
</button>
</div>
<Switch
v-else-if="field.type === 'bool'"
:model-value="!!providerConfig[field.key]"
@update:model-value="(val) => providerConfig[field.key] = !!val"
/>
<Input
v-else-if="field.type === 'number'"
:id="`transcription-provider-${field.key}`"
v-model.number="providerConfig[field.key] as number"
type="number"
/>
<Select
v-else-if="field.type === 'enum' && field.enum"
:model-value="String(providerConfig[field.key] ?? '')"
@update:model-value="(val) => providerConfig[field.key] = val"
>
<SelectTrigger>
<SelectValue :placeholder="field.title || field.key" />
</SelectTrigger>
<SelectContent>
<SelectItem
v-for="opt in field.enum"
:key="opt"
:value="opt"
>
{{ opt }}
</SelectItem>
</SelectContent>
</Select>
<Input
v-else
:id="`transcription-provider-${field.key}`"
v-model="providerConfig[field.key] as string"
type="text"
/>
</section>
<div class="flex justify-end">
<LoadingButton
type="submit"
:loading="saveLoading"
>
{{ $t('provider.saveChanges') }}
</LoadingButton>
</div>
</form>
<Separator class="mt-6 mb-6" />
<section>
<div class="flex justify-between items-center mb-4">
<h3 class="text-xs font-medium">
{{ $t('transcription.models') }}
</h3>
<div
v-if="curProviderId"
class="flex items-center gap-2"
>
<LoadingButton
type="button"
variant="outline"
size="sm"
:loading="importLoading"
@click="handleImportModels"
>
{{ $t('transcription.importModels') }}
</LoadingButton>
<CreateModel
:id="curProviderId"
default-type="transcription"
hide-type
:type-options="transcriptionTypeOptions"
:invalidate-keys="['transcription-provider-models', 'transcription-models']"
/>
</div>
</div>
<div
v-if="providerModels.length === 0"
class="text-xs text-muted-foreground py-4 text-center"
>
{{ $t('transcription.noModels') }}
</div>
<div
v-for="model in providerModels"
:key="model.id"
class="border border-border rounded-lg mb-4"
>
<button
type="button"
class="w-full flex items-center justify-between p-3 text-left hover:bg-accent/50 rounded-t-lg transition-colors"
@click="toggleModel(model.id ?? '')"
>
<div>
<span class="text-xs font-medium">{{ model.name || model.model_id }}</span>
<span
v-if="model.name"
class="text-xs text-muted-foreground ml-2"
>
{{ model.model_id }}
</span>
</div>
<component
:is="expandedModelId === model.id ? ChevronUp : ChevronDown"
class="size-3 text-muted-foreground"
/>
</button>
<div
v-if="expandedModelId === model.id"
class="px-3 pb-3 space-y-4 border-t border-border pt-3"
>
<ModelConfigEditor
:model-id="model.id ?? ''"
:model-name="model.model_id ?? ''"
:config="model.config || {}"
:schema="getModelSchema(model.model_id ?? '')"
mode="transcription"
:on-test="(file, cfg) => handleTestModel(model.id ?? '', file as File, cfg)"
@save="(cfg) => handleSaveModel(model.id ?? '', cfg)"
/>
</div>
</div>
</section>
</div>
</template>
<script setup lang="ts">
import { computed, inject, reactive, ref, watch } from 'vue'
import { useQuery, useQueryCache } from '@pinia/colada'
import { toast } from 'vue-sonner'
import { useI18n } from 'vue-i18n'
import {
getTranscriptionProvidersById,
getTranscriptionProvidersMeta,
getTranscriptionProvidersByIdModels,
postTranscriptionProvidersByIdImportModels,
postTranscriptionModelsByIdTest,
putProvidersById,
putTranscriptionModelsById,
} from '@memohai/sdk'
import type {
AudioProviderMetaResponse,
AudioSpeechProviderResponse,
AudioTestTranscriptionResponse,
AudioTranscriptionModelResponse,
} from '@memohai/sdk'
import { ChevronDown, ChevronUp, Eye, EyeOff } from 'lucide-vue-next'
import { Input, Label, Select, SelectContent, SelectItem, SelectTrigger, SelectValue, Separator, Switch } from '@memohai/ui'
import ProviderIcon from '@/components/provider-icon/index.vue'
import LoadingButton from '@/components/loading-button/index.vue'
import ModelConfigEditor from '@/pages/speech/components/model-config-editor.vue'
import CreateModel from '@/components/create-model/index.vue'
// One field of a dynamic config form: storage key, widget type, and optional display metadata.
interface FieldSchema { key: string, type: string, title?: string, description?: string, enum?: string[], order?: number }
// A config form definition: an (optionally ordered) list of fields.
interface ConfigSchema { fields?: FieldSchema[] }
// Normalized model metadata; the config schema may sit at top level or under capabilities.
interface ModelMeta { id: string, name: string, config_schema?: ConfigSchema, capabilities?: { config_schema?: ConfigSchema } }
// Normalized provider metadata as consumed by this page (see normalizeProviderMeta).
interface ProviderMeta {
  provider: string
  display_name?: string
  config_schema?: ConfigSchema
  default_transcription_model?: string
  transcription_models?: ModelMeta[]
  models?: ModelMeta[]
}
// Build a short avatar placeholder: the first two characters of the trimmed
// name, uppercased; '?' when the name is missing or blank.
function getInitials(name: string | undefined) {
  const trimmed = (name ?? '').trim()
  if (trimmed.length === 0) return '?'
  return trimmed.substring(0, 2).toUpperCase()
}
// Convert an SDK config schema into the local ConfigSchema shape,
// dropping entries that lack a key or a type. Returns undefined when
// the provider supplies no schema at all.
function normalizeConfigSchema(schema?: AudioProviderMetaResponse['config_schema']): ConfigSchema | undefined {
  if (!schema) return undefined
  const fields = (schema.fields ?? []).reduce<FieldSchema[]>((acc, field) => {
    if (field?.key && field.type) {
      acc.push({
        key: field.key,
        type: field.type,
        title: field.title,
        description: field.description,
        enum: field.enum,
        order: field.order,
      })
    }
    return acc
  }, [])
  return { fields }
}
// Normalize one SDK model entry into ModelMeta; entries without an id are
// discarded (null) so callers can filter them out.
function normalizeModelMeta(model: NonNullable<AudioProviderMetaResponse['models']>[number]): ModelMeta | null {
  if (!model?.id) return null
  const capabilities = model.capabilities
    ? { config_schema: normalizeConfigSchema(model.capabilities.config_schema) }
    : undefined
  return {
    id: model.id,
    // Fall back to the id when the SDK provides no display name.
    name: model.name ?? model.id,
    config_schema: normalizeConfigSchema(model.config_schema),
    capabilities,
  }
}
// Normalize SDK provider metadata into the local ProviderMeta shape,
// dropping malformed (id-less) model entries from both catalogs.
function normalizeProviderMeta(meta: AudioProviderMetaResponse): ProviderMeta {
  const keep = (model: ModelMeta | null): model is ModelMeta => model !== null
  return {
    provider: meta.provider ?? '',
    display_name: meta.display_name,
    config_schema: normalizeConfigSchema(meta.config_schema),
    default_transcription_model: meta.default_transcription_model,
    transcription_models: (meta.transcription_models ?? []).map(normalizeModelMeta).filter(keep),
    models: (meta.models ?? []).map(normalizeModelMeta).filter(keep),
  }
}
const { t } = useI18n()
// Selected provider, injected by the parent page; undefined until a row is picked.
const curProvider = inject('curTranscriptionProvider', ref<AudioSpeechProviderResponse>())
const curProviderId = computed(() => curProvider.value?.id)
// Local editable copies of the provider form state (name + dynamic config fields).
const providerName = ref('')
const providerConfig = reactive<Record<string, unknown>>({})
// Which secret fields are currently shown in clear text, keyed by field key.
const visibleSecrets = reactive<Record<string, boolean>>({})
// Id of the model card currently expanded in the accordion ('' = all collapsed).
const expandedModelId = ref('')
const enableLoading = ref(false)
const saveLoading = ref(false)
const importLoading = ref(false)
const queryCache = useQueryCache()
// CreateModel is pinned to the single 'transcription' type on this page.
const transcriptionTypeOptions = [
  { value: 'transcription', label: 'Transcription' },
]
// Full provider detail (name + saved config) for the selected provider.
const { data: providerDetail } = useQuery({
  key: () => ['transcription-provider-detail', curProviderId.value ?? ''],
  query: async () => {
    if (!curProviderId.value) return null
    const { data } = await getTranscriptionProvidersById({
      path: { id: curProviderId.value },
      throwOnError: true,
    })
    return (data ?? null) as AudioSpeechProviderResponse | null
  },
})
// Static metadata (config schemas, model catalogs) for all transcription providers.
const { data: metaList } = useQuery({
  key: () => ['transcription-providers-meta'],
  query: async () => {
    const { data } = await getTranscriptionProvidersMeta({ throwOnError: true })
    return (data ?? []).map(normalizeProviderMeta)
  },
})
// Metadata entry matching the selected provider's client_type, if any.
const currentMeta = computed(() => (metaList.value ?? []).find(m => m.provider === curProvider.value?.client_type) ?? null)
// Provider config fields sorted by declared order; a missing order sorts as 0.
const orderedProviderFields = computed(() => [...(currentMeta.value?.config_schema?.fields ?? [])].sort((a, b) => (a.order ?? 0) - (b.order ?? 0)))
// Models registered under the selected provider.
const { data: providerModelData } = useQuery({
  key: () => ['transcription-provider-models', curProviderId.value ?? ''],
  query: async () => {
    if (!curProviderId.value) return []
    const { data } = await getTranscriptionProvidersByIdModels({
      path: { id: curProviderId.value },
      throwOnError: true,
    })
    return (data ?? []) as AudioTranscriptionModelResponse[]
  },
})
const providerModels = computed(() => providerModelData.value ?? [])
// Re-seed the local form whenever the fetched detail changes: reset the name,
// then replace providerConfig's keys in place (delete-then-assign keeps the
// reactive proxy intact instead of swapping the object).
watch(() => providerDetail.value, (provider) => {
  providerName.value = provider?.name ?? curProvider.value?.name ?? ''
  Object.keys(providerConfig).forEach((key) => delete providerConfig[key])
  Object.assign(providerConfig, { ...(provider?.config ?? {}) })
}, { immediate: true, deep: true })
// Resolve the config schema to show for a model id from provider metadata.
// Prefers the transcription model catalog; falls back to the generic model
// catalog when the transcription list is missing OR empty. (The original
// `transcription_models ?? models` never reached the fallback, because
// normalizeProviderMeta always produces an array — an empty one is truthy.)
// Lookup order: exact id match → provider default transcription model → first entry.
function getModelSchema(modelID: string): ConfigSchema | null {
  const transcription = currentMeta.value?.transcription_models ?? []
  const models = transcription.length > 0 ? transcription : (currentMeta.value?.models ?? [])
  const exact = models.find(m => m.id === modelID)
  const fallback = exact ?? models.find(m => m.id === currentMeta.value?.default_transcription_model) ?? models[0]
  return fallback?.config_schema ?? fallback?.capabilities?.config_schema ?? null
}
// Expand the clicked model card, or collapse it when it is already open.
function toggleModel(id: string) {
  const alreadyOpen = expandedModelId.value === id
  expandedModelId.value = alreadyOpen ? '' : id
}
// Toggle the provider's enabled flag with an optimistic UI update:
// flip the local state first, persist via the API, roll back on failure.
async function handleToggleEnable(value: boolean) {
  if (!curProviderId.value || !curProvider.value?.client_type) return
  // Remember the previous state so the catch branch can restore it.
  const prev = curProvider.value.enable ?? false
  curProvider.value = { ...curProvider.value, enable: value }
  enableLoading.value = true
  try {
    await putProvidersById({
      path: { id: curProviderId.value },
      body: {
        name: providerName.value.trim() || curProvider.value.name || '',
        client_type: curProvider.value.client_type,
        enable: value,
        // Strip empty/nullish config values before sending.
        config: sanitizeConfig(providerConfig),
      },
      throwOnError: true,
    })
    // Refresh the provider list and the detail query for this provider.
    queryCache.invalidateQueries({ key: ['transcription-providers'] })
    queryCache.invalidateQueries({ key: ['transcription-provider-detail', curProviderId.value ?? ''] })
  } catch {
    // Roll back the optimistic flip and surface the failure.
    curProvider.value = { ...curProvider.value, enable: prev }
    toast.error(t('common.saveFailed'))
  } finally {
    enableLoading.value = false
  }
}
// Persist the provider name + config form, then refresh the provider queries.
async function handleSaveProvider() {
  const id = curProviderId.value
  const clientType = curProvider.value?.client_type
  if (!id || !clientType) return
  saveLoading.value = true
  try {
    const body = {
      name: providerName.value.trim() || curProvider.value?.name || '',
      client_type: clientType,
      enable: curProvider.value?.enable,
      // Empty/nullish config values are stripped before sending.
      config: sanitizeConfig(providerConfig),
    }
    await putProvidersById({ path: { id }, body, throwOnError: true })
    toast.success(t('transcription.saveSuccess'))
    queryCache.invalidateQueries({ key: ['transcription-providers'] })
    queryCache.invalidateQueries({ key: ['transcription-provider-detail', id] })
  } catch {
    toast.error(t('common.saveFailed'))
  } finally {
    saveLoading.value = false
  }
}
// Save one model's config, keeping its existing display name.
async function handleSaveModel(modelId: string, config: Record<string, unknown>) {
  const target = providerModels.value.find(item => item.id === modelId)
  if (!target) return
  // The PUT body requires a name; reuse whatever identifier is available.
  const name = target.name ?? target.model_id ?? modelId
  try {
    await putTranscriptionModelsById({
      path: { id: modelId },
      body: { name, config },
      throwOnError: true,
    })
    toast.success(t('transcription.saveSuccess'))
    queryCache.invalidateQueries({ key: ['transcription-provider-models', curProviderId.value ?? ''] })
    queryCache.invalidateQueries({ key: ['transcription-models'] })
  } catch {
    toast.error(t('common.saveFailed'))
  }
}
// Bulk-import the provider's catalog models, then report created/skipped counts
// and refresh every query that lists models or metadata.
async function handleImportModels() {
  const id = curProviderId.value
  if (!id) return
  importLoading.value = true
  try {
    const { data } = await postTranscriptionProvidersByIdImportModels({
      path: { id },
      throwOnError: true,
    })
    const { created = 0, skipped = 0 } = (data ?? {}) as { created?: number, skipped?: number }
    toast.success(t('transcription.importSuccess', { created, skipped }))
    queryCache.invalidateQueries({ key: ['transcription-provider-models', id] })
    queryCache.invalidateQueries({ key: ['transcription-models'] })
    queryCache.invalidateQueries({ key: ['transcription-providers-meta'] })
  } catch {
    toast.error(t('transcription.importFailed'))
  } finally {
    importLoading.value = false
  }
}
// Run a one-off transcription test for a model with the given audio file and
// config overrides. The config travels as a JSON string alongside the file.
// Errors propagate to the caller (ModelConfigEditor handles them).
async function handleTestModel(modelId: string, file: File, config: Record<string, unknown>) {
  const response = await postTranscriptionModelsByIdTest({
    path: { id: modelId },
    body: { file, config: JSON.stringify(config) },
    throwOnError: true,
  })
  return (response.data ?? {}) as AudioTestTranscriptionResponse
}
// Return a copy of the config with empty-string, null, and undefined
// values removed; everything else (including false and 0) is kept.
function sanitizeConfig(input: Record<string, unknown>) {
  const kept = Object.entries(input).filter(([, value]) => value !== '' && value != null)
  return Object.fromEntries(kept) as Record<string, unknown>
}
</script>
-8
View File
@@ -89,14 +89,6 @@ const routes = [
breadcrumb: i18nRef('sidebar.speech'),
},
},
{
name: 'transcription',
path: 'transcription',
component: () => import('@/pages/transcription/index.vue'),
meta: {
breadcrumb: i18nRef('sidebar.transcription'),
},
},
{
name: 'email',
path: 'email',
+17 -49
View File
@@ -23,7 +23,6 @@ import (
agentpkg "github.com/memohai/memoh/internal/agent"
"github.com/memohai/memoh/internal/agent/background"
agenttools "github.com/memohai/memoh/internal/agent/tools"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/bind"
"github.com/memohai/memoh/internal/boot"
"github.com/memohai/memoh/internal/bots"
@@ -88,6 +87,7 @@ import (
"github.com/memohai/memoh/internal/storage/providers/containerfs"
"github.com/memohai/memoh/internal/storage/providers/fallback"
"github.com/memohai/memoh/internal/storage/providers/localfs"
ttspkg "github.com/memohai/memoh/internal/tts"
"github.com/memohai/memoh/internal/version"
"github.com/memohai/memoh/internal/workspace"
)
@@ -331,7 +331,7 @@ func provideChannelRouter(
policyService *policy.Service,
bindService *bind.Service,
mediaService *media.Service,
audioService *audiopkg.Service,
ttsService *ttspkg.Service,
settingsService *settings.Service,
scheduleService *schedule.Service,
mcpConnService *mcp.ConnectionService,
@@ -372,8 +372,7 @@ func provideChannelRouter(
processor.SetMediaService(mediaService)
processor.SetStreamObserver(local.NewRouteHubBroadcaster(hub))
processor.SetDispatcher(inbound.NewRouteDispatcher(log))
processor.SetSpeechService(audioService, &settingsSpeechModelResolver{settings: settingsService})
processor.SetTranscriptionService(&settingsTranscriptionAdapter{audio: audioService}, &settingsTranscriptionModelResolver{settings: settingsService})
processor.SetTtsService(ttsService, &settingsTtsModelResolver{settings: settingsService})
cmdHandler := command.NewHandler(
log,
&command.BotMemberRoleAdapter{BotService: botService},
@@ -450,7 +449,7 @@ func provideBackgroundManager(log *slog.Logger) *background.Manager {
return background.New(log)
}
func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, mediaService *media.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, mcpConnService *mcp.ConnectionService, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, audioService *audiopkg.Service, sessionService *sessionpkg.Service, bgManager *background.Manager) []agenttools.ToolProvider {
func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, mediaService *media.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, mcpConnService *mcp.ConnectionService, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service, sessionService *sessionpkg.Service, bgManager *background.Manager) []agenttools.ToolProvider {
var assetResolver messaging.AssetResolver
if mediaService != nil {
assetResolver = &mediaAssetResolverAdapter{media: mediaService}
@@ -468,8 +467,7 @@ func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *c
agenttools.NewSpawnProvider(log, settingsService, modelsService, queries, sessionService),
agenttools.NewSkillProvider(log),
agenttools.NewBrowserProvider(log, settingsService, browserContextService, manager, cfg.BrowserGateway),
agenttools.NewTTSProvider(log, settingsService, audioService, channelManager, registry),
agenttools.NewTranscriptionProvider(log, settingsService, audioService, mediaService),
agenttools.NewTTSProvider(log, settingsService, ttsService, channelManager, registry),
agenttools.NewImageGenProvider(log, settingsService, modelsService, queries, manager, config.DefaultDataMount),
agenttools.NewFederationProvider(log, fedSource),
agenttools.NewHistoryProvider(log, sessionService, queries),
@@ -513,23 +511,23 @@ func provideUsersHandler(log *slog.Logger, accountService *accounts.Service, ide
return handlers.NewUsersHandler(log, accountService, identityService, botService, routeService, channelStore, channelLifecycle, channelManager, registry)
}
func provideWebHandler(channelManager *channel.Manager, channelStore *channel.Store, chatService *conversation.Service, hub *local.RouteHub, botService *bots.Service, accountService *accounts.Service, resolver *flow.Resolver, mediaService *media.Service, audioService *audiopkg.Service, settingsService *settings.Service) *handlers.LocalChannelHandler {
func provideWebHandler(channelManager *channel.Manager, channelStore *channel.Store, chatService *conversation.Service, hub *local.RouteHub, botService *bots.Service, accountService *accounts.Service, resolver *flow.Resolver, mediaService *media.Service, ttsService *ttspkg.Service, settingsService *settings.Service) *handlers.LocalChannelHandler {
h := handlers.NewLocalChannelHandler(local.WebType, channelManager, channelStore, chatService, hub, botService, accountService)
h.SetResolver(resolver)
h.SetMediaService(mediaService)
h.SetSpeechService(audioService, &settingsSpeechModelResolver{settings: settingsService})
h.SetTtsService(ttsService, &settingsTtsModelResolver{settings: settingsService})
return h
}
func provideAudioRegistry() *audiopkg.Registry {
return audiopkg.NewRegistry()
func provideTtsRegistry() *ttspkg.Registry {
return ttspkg.NewRegistry()
}
func provideAudioTempStore() (*audiopkg.TempStore, error) {
return audiopkg.NewTempStore(os.TempDir())
func provideTtsTempStore() (*ttspkg.TempStore, error) {
return ttspkg.NewTempStore(os.TempDir())
}
func startAudioTempStoreCleanup(lc fx.Lifecycle, store *audiopkg.TempStore) {
func startTtsTempStoreCleanup(lc fx.Lifecycle, store *ttspkg.TempStore) {
done := make(chan struct{})
lc.Append(fx.Hook{
OnStart: func(_ context.Context) error {
@@ -585,11 +583,11 @@ func (a *sessionEnsurerAdapter) CreateNewSession(ctx context.Context, botID, rou
return inbound.SessionResult{ID: sess.ID, Type: sess.Type}, nil
}
type settingsSpeechModelResolver struct {
type settingsTtsModelResolver struct {
settings *settings.Service
}
func (r *settingsSpeechModelResolver) ResolveSpeechModelID(ctx context.Context, botID string) (string, error) {
func (r *settingsTtsModelResolver) ResolveTtsModelID(ctx context.Context, botID string) (string, error) {
s, err := r.settings.GetBot(ctx, botID)
if err != nil {
return "", err
@@ -597,36 +595,6 @@ func (r *settingsSpeechModelResolver) ResolveSpeechModelID(ctx context.Context,
return s.TtsModelID, nil
}
type settingsTranscriptionModelResolver struct {
settings *settings.Service
}
func (r *settingsTranscriptionModelResolver) ResolveTranscriptionModelID(ctx context.Context, botID string) (string, error) {
s, err := r.settings.GetBot(ctx, botID)
if err != nil {
return "", err
}
return s.TranscriptionModelID, nil
}
type settingsTranscriptionAdapter struct {
audio *audiopkg.Service
}
type inboundTranscriptionResult struct {
text string
}
func (r inboundTranscriptionResult) GetText() string { return r.text }
func (a *settingsTranscriptionAdapter) Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (inbound.TranscriptionResult, error) {
result, err := a.audio.Transcribe(ctx, modelID, audio, filename, contentType, overrideCfg)
if err != nil {
return nil, err
}
return inboundTranscriptionResult{text: result.Text}, nil
}
func provideEmailRegistry(log *slog.Logger, tokenStore *emailpkg.DBOAuthTokenStore) *emailpkg.Registry {
reg := emailpkg.NewRegistry()
reg.Register(emailgeneric.New(log))
@@ -716,11 +684,11 @@ func startRegistrySync(lc fx.Lifecycle, log *slog.Logger, cfg config.Config, que
})
}
func startAudioProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, queries *dbsqlc.Queries, registry *audiopkg.Registry) {
func startSpeechProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, queries *dbsqlc.Queries, registry *ttspkg.Registry) {
lc.Append(fx.Hook{
OnStart: func(ctx context.Context) error {
if err := audiopkg.SyncRegistry(ctx, log, queries, registry); err != nil {
log.Warn("audio registry bootstrap failed", slog.Any("error", err))
if err := ttspkg.SyncRegistry(ctx, log, queries, registry); err != nil {
log.Warn("speech registry bootstrap failed", slog.Any("error", err))
}
return nil
},
+8 -8
View File
@@ -8,7 +8,6 @@ import (
"github.com/memohai/memoh/internal/accounts"
"github.com/memohai/memoh/internal/acl"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/bind"
"github.com/memohai/memoh/internal/boot"
"github.com/memohai/memoh/internal/bots"
@@ -30,6 +29,7 @@ import (
"github.com/memohai/memoh/internal/schedule"
"github.com/memohai/memoh/internal/searchproviders"
"github.com/memohai/memoh/internal/settings"
ttspkg "github.com/memohai/memoh/internal/tts"
)
func runServe() {
@@ -63,9 +63,9 @@ func options() fx.Option {
identities.NewService,
bind.NewService,
event.NewHub,
provideAudioRegistry,
audiopkg.NewService,
provideAudioTempStore,
provideTtsRegistry,
ttspkg.NewService,
provideTtsTempStore,
emailpkg.NewDBOAuthTokenStore,
provideEmailRegistry,
emailpkg.NewService,
@@ -121,8 +121,8 @@ func options() fx.Option {
provideServerHandler(weixin.NewQRServerHandler),
provideServerHandler(provideUsersHandler),
provideServerHandler(handlers.NewMemoryProvidersHandler),
provideServerHandler(handlers.NewAudioHandler),
provideServerHandler(handlers.NewBotAudioHandler),
provideServerHandler(handlers.NewSpeechHandler),
provideServerHandler(handlers.NewBotTtsHandler),
provideServerHandler(handlers.NewEmailProvidersHandler),
provideServerHandler(handlers.NewEmailBindingsHandler),
provideServerHandler(handlers.NewEmailOutboxHandler),
@@ -141,7 +141,7 @@ func options() fx.Option {
fx.Invoke(
injectToolProviders,
startRegistrySync,
startAudioProviderBootstrap,
startSpeechProviderBootstrap,
startMemoryProviderBootstrap,
startSearchProviderBootstrap,
startScheduleService,
@@ -151,7 +151,7 @@ func options() fx.Option {
startEmailManager,
startContainerReconciliation,
startBackgroundTaskCleanup,
startAudioTempStoreCleanup,
startTtsTempStoreCleanup,
startServer,
),
fx.WithLogger(func(logger *slog.Logger) fxevent.Logger {
@@ -1,9 +0,0 @@
name: Deepgram Transcription
client_type: deepgram-transcription
icon: deepgram
base_url: https://api.deepgram.com
models:
- model_id: nova-3
name: Nova-3
type: transcription
@@ -1,9 +0,0 @@
name: ElevenLabs Transcription
client_type: elevenlabs-transcription
icon: elevenlabs
base_url: https://api.elevenlabs.io
models:
- model_id: scribe_v2
name: Scribe v2
type: transcription
-9
View File
@@ -1,9 +0,0 @@
name: Google Transcription
client_type: google-transcription
icon: google-color
base_url: https://generativelanguage.googleapis.com/v1beta
models:
- model_id: gemini-2.5-flash
name: Gemini 2.5 Flash
type: transcription
-9
View File
@@ -1,9 +0,0 @@
name: OpenAI Transcription
client_type: openai-transcription
icon: openai
base_url: https://api.openai.com/v1
models:
- model_id: gpt-4o-mini-transcribe
name: GPT-4o Mini Transcribe
type: transcription
@@ -1,9 +0,0 @@
name: OpenRouter Transcription
client_type: openrouter-transcription
icon: openrouter
base_url: https://openrouter.ai/api/v1
models:
- model_id: openai/gpt-4o-mini-transcribe
name: OpenRouter Transcription
type: transcription
+2 -9
View File
@@ -77,19 +77,13 @@ CREATE TABLE IF NOT EXISTS providers (
'github-copilot',
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
))
);
@@ -114,7 +108,7 @@ CREATE TABLE IF NOT EXISTS models (
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
CONSTRAINT models_provider_id_model_id_unique UNIQUE (provider_id, model_id),
CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech', 'transcription'))
CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech'))
);
CREATE TABLE IF NOT EXISTS model_variants (
@@ -176,7 +170,6 @@ CREATE TABLE IF NOT EXISTS bots (
image_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
discuss_probe_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
tts_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
transcription_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
browser_context_id UUID REFERENCES browser_contexts(id) ON DELETE SET NULL,
persist_full_tool_results BOOLEAN NOT NULL DEFAULT false,
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
@@ -1,33 +0,0 @@
-- 0069_add_transcription_models_and_speech_domain
-- Revert transcription model type and speech-domain expansion.
DELETE FROM models WHERE type = 'transcription';
DELETE FROM providers WHERE client_type = 'google-speech';
ALTER TABLE models
DROP CONSTRAINT IF EXISTS models_type_check;
ALTER TABLE models
ADD CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech'));
ALTER TABLE providers
DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE providers
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
));
@@ -1,31 +0,0 @@
-- 0069_add_transcription_models_and_speech_domain
-- Expand the speech domain to support transcription models and shared speech providers.
ALTER TABLE providers
DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE providers
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech'
));
ALTER TABLE models
DROP CONSTRAINT IF EXISTS models_type_check;
ALTER TABLE models
ADD CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech', 'transcription'));
@@ -1,8 +0,0 @@
-- 0070_add_bot_transcription_model
-- Remove bots.transcription_model_id.
ALTER TABLE bots
DROP CONSTRAINT IF EXISTS bots_transcription_model_id_fkey;
ALTER TABLE bots
DROP COLUMN IF EXISTS transcription_model_id;
@@ -1,5 +0,0 @@
-- 0070_add_bot_transcription_model
-- Add bots.transcription_model_id for bot-level speech-to-text defaults.
ALTER TABLE bots
ADD COLUMN IF NOT EXISTS transcription_model_id UUID REFERENCES models(id) ON DELETE SET NULL;
@@ -1,33 +0,0 @@
-- 0071_split_transcription_providers
-- Remove dedicated transcription provider client types.
DELETE FROM providers
WHERE client_type IN (
'openai-transcription',
'openrouter-transcription',
'elevenlabs-transcription',
'deepgram-transcription',
'google-transcription'
);
ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE providers
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech'
));
@@ -1,29 +0,0 @@
-- 0071_split_transcription_providers
-- Add dedicated transcription provider client types.
ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE providers
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
));
+11 -61
View File
@@ -16,27 +16,18 @@ SELECT * FROM providers WHERE id = sqlc.arg(id);
-- name: GetProviderByName :one
SELECT * FROM providers WHERE name = sqlc.arg(name);
-- name: GetProviderByClientType :one
SELECT * FROM providers WHERE client_type = sqlc.arg(client_type);
-- name: ListProviders :many
SELECT * FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
)
ORDER BY created_at DESC;
@@ -62,19 +53,13 @@ FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
);
-- name: CreateModel :one
@@ -101,7 +86,7 @@ ORDER BY created_at DESC;
-- name: ListModels :many
SELECT * FROM models
WHERE type NOT IN ('speech', 'transcription')
WHERE type != 'speech'
ORDER BY created_at DESC;
-- name: ListModelsByType :many
@@ -112,7 +97,7 @@ ORDER BY created_at DESC;
-- name: ListModelsByProviderID :many
SELECT * FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND type NOT IN ('speech', 'transcription')
AND type != 'speech'
ORDER BY created_at DESC;
-- name: ListModelsByProviderIDAndType :many
@@ -151,15 +136,9 @@ DELETE FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND model_id = sqlc.arg(model_id);
-- name: DeleteModelByProviderAndType :exec
DELETE FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND model_id = sqlc.arg(model_id)
AND type = sqlc.arg(type);
-- name: CountModels :one
SELECT COUNT(*) FROM models
WHERE type NOT IN ('speech', 'transcription');
WHERE type != 'speech';
-- name: CountModelsByType :one
SELECT COUNT(*) FROM models WHERE type = sqlc.arg(type);
@@ -171,6 +150,11 @@ VALUES (sqlc.arg(name), sqlc.arg(client_type), sqlc.arg(icon), false, sqlc.arg(c
ON CONFLICT (name) DO UPDATE SET
icon = EXCLUDED.icon,
client_type = EXCLUDED.client_type,
config = CASE
WHEN providers.config->>'api_key' IS NOT NULL AND providers.config->>'api_key' != ''
THEN jsonb_set(EXCLUDED.config, '{api_key}', providers.config->'api_key')
ELSE EXCLUDED.config
END,
updated_at = now()
RETURNING *;
@@ -189,7 +173,7 @@ SELECT m.*
FROM models m
JOIN providers p ON m.provider_id = p.id
WHERE p.enable = true
AND m.type NOT IN ('speech', 'transcription')
AND m.type != 'speech'
ORDER BY m.created_at DESC;
-- name: ListEnabledModelsByType :many
@@ -247,17 +231,6 @@ WHERE client_type IN (
)
ORDER BY created_at DESC;
-- name: ListTranscriptionProviders :many
SELECT * FROM providers
WHERE client_type IN (
'openai-transcription',
'openrouter-transcription',
'elevenlabs-transcription',
'deepgram-transcription',
'google-transcription'
)
ORDER BY created_at DESC;
-- name: ListSpeechModels :many
SELECT m.*,
p.client_type AS provider_type
@@ -277,26 +250,3 @@ SELECT * FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND model_id = sqlc.arg(model_id)
LIMIT 1;
-- name: GetTranscriptionModelWithProvider :one
SELECT
m.*,
p.client_type AS provider_type
FROM models m
JOIN providers p ON p.id = m.provider_id
WHERE m.id = sqlc.arg(id)
AND m.type = 'transcription';
-- name: ListTranscriptionModels :many
SELECT m.*,
p.client_type AS provider_type
FROM models m
JOIN providers p ON p.id = m.provider_id
WHERE m.type = 'transcription'
ORDER BY m.created_at DESC;
-- name: ListTranscriptionModelsByProviderID :many
SELECT * FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND type = 'transcription'
ORDER BY created_at DESC;
+1 -7
View File
@@ -19,7 +19,6 @@ SELECT
memory_providers.id AS memory_provider_id,
image_models.id AS image_model_id,
tts_models.id AS tts_model_id,
transcription_models.id AS transcription_model_id,
browser_contexts.id AS browser_context_id,
bots.persist_full_tool_results
FROM bots
@@ -31,7 +30,6 @@ LEFT JOIN models AS image_models ON image_models.id = bots.image_model_id
LEFT JOIN search_providers ON search_providers.id = bots.search_provider_id
LEFT JOIN memory_providers ON memory_providers.id = bots.memory_provider_id
LEFT JOIN models AS tts_models ON tts_models.id = bots.tts_model_id
LEFT JOIN models AS transcription_models ON transcription_models.id = bots.transcription_model_id
LEFT JOIN browser_contexts ON browser_contexts.id = bots.browser_context_id
WHERE bots.id = $1;
@@ -56,12 +54,11 @@ WITH updated AS (
memory_provider_id = COALESCE(sqlc.narg(memory_provider_id)::uuid, bots.memory_provider_id),
image_model_id = COALESCE(sqlc.narg(image_model_id)::uuid, bots.image_model_id),
tts_model_id = COALESCE(sqlc.narg(tts_model_id)::uuid, bots.tts_model_id),
transcription_model_id = COALESCE(sqlc.narg(transcription_model_id)::uuid, bots.transcription_model_id),
browser_context_id = COALESCE(sqlc.narg(browser_context_id)::uuid, bots.browser_context_id),
persist_full_tool_results = sqlc.arg(persist_full_tool_results),
updated_at = now()
WHERE bots.id = sqlc.arg(id)
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.transcription_model_id, bots.browser_context_id, bots.persist_full_tool_results
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.browser_context_id, bots.persist_full_tool_results
)
SELECT
updated.id AS bot_id,
@@ -83,7 +80,6 @@ SELECT
memory_providers.id AS memory_provider_id,
image_models.id AS image_model_id,
tts_models.id AS tts_model_id,
transcription_models.id AS transcription_model_id,
browser_contexts.id AS browser_context_id,
updated.persist_full_tool_results
FROM updated
@@ -95,7 +91,6 @@ LEFT JOIN models AS image_models ON image_models.id = updated.image_model_id
LEFT JOIN search_providers ON search_providers.id = updated.search_provider_id
LEFT JOIN memory_providers ON memory_providers.id = updated.memory_provider_id
LEFT JOIN models AS tts_models ON tts_models.id = updated.tts_model_id
LEFT JOIN models AS transcription_models ON transcription_models.id = updated.transcription_model_id
LEFT JOIN browser_contexts ON browser_contexts.id = updated.browser_context_id;
-- name: DeleteSettingsByBotID :exec
@@ -117,7 +112,6 @@ SET language = 'auto',
search_provider_id = NULL,
memory_provider_id = NULL,
tts_model_id = NULL,
transcription_model_id = NULL,
browser_context_id = NULL,
persist_full_tool_results = false,
updated_at = now()
+4 -2
View File
@@ -72,7 +72,8 @@ func TestSpawnAndNotify(t *testing.T) {
task := mgr.Get(taskID)
if task == nil {
t.Fatal("task not found after completion")
} else if task.Status != TaskCompleted {
}
if task.Status != TaskCompleted {
t.Errorf("expected task status completed, got %s", task.Status)
}
}
@@ -129,7 +130,8 @@ func TestKillTask(t *testing.T) {
task := mgr.Get(taskID)
if task == nil {
t.Fatal("task not found")
} else if task.Status != TaskKilled {
}
if task.Status != TaskKilled {
t.Errorf("expected status killed, got %s", task.Status)
}
+1 -1
View File
@@ -84,7 +84,7 @@ func retryDelay(attempt int, cfg RetryConfig) time.Duration {
if backoffIdx > 20 {
backoffIdx = 20
}
delay := cfg.BaseDelay * time.Duration(1<<backoffIdx)
delay := cfg.BaseDelay * time.Duration(1<<uint(backoffIdx))
delay = min(delay, cfg.MaxDelay)
// Add jitter: random value in [0, delay/2), so final delay is in [delay/2, delay).
// math/rand is intentional here — cryptographic randomness is not needed for backoff jitter.
+3 -3
View File
@@ -295,7 +295,7 @@ func (p *ContainerProvider) execRead(ctx context.Context, session SessionContext
content += "\n"
}
content = addLineNumbers(content, lineOffset)
content = addLineNumbers(content, int32(lineOffset))
return map[string]any{"content": content, "total_lines": totalLines}, nil
}
@@ -757,7 +757,7 @@ func truncateStr(s string, n int) string {
return s[:n] + "..."
}
func addLineNumbers(content string, startLine int) string {
func addLineNumbers(content string, startLine int32) string {
if content == "" {
return content
}
@@ -765,7 +765,7 @@ func addLineNumbers(content string, startLine int) string {
var out strings.Builder
out.Grow(len(content) + len(lines)*8)
for i, line := range lines {
fmt.Fprintf(&out, "%6d\t%s\n", startLine+i, line)
fmt.Fprintf(&out, "%6d\t%s\n", int(startLine)+i, line)
}
return out.String()
}
-232
View File
@@ -1,232 +0,0 @@
//nolint:gosec
package tools
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"net"
"net/http"
"net/url"
"path/filepath"
"strings"
"time"
sdk "github.com/memohai/twilight-ai/sdk"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/media"
"github.com/memohai/memoh/internal/settings"
)
// mediaDataPrefix is the virtual path prefix under which bot media
// assets are exposed to tools; paths below it map to storage keys.
const mediaDataPrefix = "/data/media/"

// TranscriptionProvider wires the transcribe_audio tool: it resolves
// the bot's configured transcription model, loads audio from media
// storage or an external URL, and runs the transcription service.
type TranscriptionProvider struct {
	logger   *slog.Logger
	settings *settings.Service
	audio    *audiopkg.Service
	media    *media.Service
	// http downloads remote audio; redirects are re-validated (SSRF guard).
	http *http.Client
}
// NewTranscriptionProvider builds the tool provider. A nil logger falls
// back to slog.Default(). The embedded HTTP client enforces a 30s
// timeout and re-checks every redirect target against validateURL so a
// public URL cannot redirect the downloader into a private network.
func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
	if log == nil {
		log = slog.Default()
	}
	return &TranscriptionProvider{
		logger:   log.With(slog.String("tool", "transcribe_audio")),
		settings: settingsSvc,
		audio:    audioSvc,
		media:    mediaSvc,
		http: &http.Client{
			Timeout: 30 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				// Cap redirect chains to avoid loops.
				if len(via) >= 10 {
					return errors.New("stopped after 10 redirects")
				}
				// Validate each hop, not just the initial URL.
				if _, err := validateURL(req.Context(), req.URL.String()); err != nil {
					return fmt.Errorf("redirect to non-public address is not allowed: %w", err)
				}
				return nil
			},
		},
	}
}
// Tools exposes the transcribe_audio tool, or nothing at all when it
// cannot work for this session: subagents, missing services, an empty
// bot id, or a bot without a configured transcription model all yield
// (nil, nil) rather than an error, silently hiding the tool.
func (p *TranscriptionProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
	if session.IsSubagent || p.settings == nil || p.audio == nil || p.media == nil {
		return nil, nil
	}
	botID := strings.TrimSpace(session.BotID)
	if botID == "" {
		return nil, nil
	}
	// NOTE(review): a settings lookup error is deliberately swallowed
	// here (tool just not offered); execTranscribe reports it instead.
	botSettings, err := p.settings.GetBot(ctx, botID)
	if err != nil || strings.TrimSpace(botSettings.TranscriptionModelID) == "" {
		return nil, nil
	}
	// Capture the session by value for the Execute closure.
	sess := session
	return []sdk.Tool{{
		Name:        "transcribe_audio",
		Description: "Transcribe an audio or voice message into text. Use this when the user sent a voice message and you need to understand its contents. Accepts a bot media path such as /data/media/... or a direct URL.",
		Parameters: map[string]any{
			"type": "object",
			"properties": map[string]any{
				"path":        map[string]any{"type": "string", "description": "Audio file path from the message context, usually under /data/media/..."},
				"url":         map[string]any{"type": "string", "description": "Direct audio URL when a path is unavailable"},
				"language":    map[string]any{"type": "string", "description": "Optional language hint"},
				"prompt":      map[string]any{"type": "string", "description": "Optional transcription prompt"},
				"contentType": map[string]any{"type": "string", "description": "Optional MIME type override"},
			},
			"required": []string{},
		},
		Execute: func(execCtx *sdk.ToolExecContext, input any) (any, error) {
			return p.execTranscribe(execCtx.Context, sess, inputAsMap(input))
		},
	}}, nil
}
// execTranscribe handles a transcribe_audio invocation: it re-validates
// the bot's transcription model (settings may have changed since Tools
// ran), loads the audio bytes from a media path or URL, and forwards
// optional language/prompt overrides to the audio service.
// Returns a map with ok, text, language and duration_seconds.
func (p *TranscriptionProvider) execTranscribe(ctx context.Context, session SessionContext, args map[string]any) (any, error) {
	botID := strings.TrimSpace(session.BotID)
	if botID == "" {
		return nil, errors.New("bot_id is required")
	}
	botSettings, err := p.settings.GetBot(ctx, botID)
	if err != nil {
		return nil, errors.New("failed to load bot settings")
	}
	modelID := strings.TrimSpace(botSettings.TranscriptionModelID)
	if modelID == "" {
		return nil, errors.New("bot has no transcription model configured")
	}
	// Accept several aliases for the same argument to be lenient with
	// model-generated tool calls.
	path := FirstStringArg(args, "path", "audio_path", "file_path")
	rawURL := FirstStringArg(args, "url", "audio_url")
	if path == "" && rawURL == "" {
		return nil, errors.New("path or url is required")
	}
	audio, filename, contentType, err := p.loadAudio(ctx, botID, path, rawURL, FirstStringArg(args, "contentType", "content_type"))
	if err != nil {
		return nil, err
	}
	// Only set override keys that were actually provided.
	override := map[string]any{}
	if language := FirstStringArg(args, "language"); language != "" {
		override["language"] = language
	}
	if prompt := FirstStringArg(args, "prompt"); prompt != "" {
		override["prompt"] = prompt
	}
	result, err := p.audio.Transcribe(ctx, modelID, audio, filename, contentType, override)
	if err != nil {
		return nil, err
	}
	return map[string]any{
		"ok":               true,
		"text":             result.Text,
		"language":         result.Language,
		"duration_seconds": result.DurationSeconds,
	}, nil
}
// loadAudio fetches audio bytes either from bot media storage (when
// pathValue is set — the path branch wins over the URL) or by
// downloading rawURL after an SSRF validation pass. It returns the
// bytes, a best-effort filename, and a content type (override first,
// then the HTTP Content-Type header).
//
// NOTE(review): the download uses io.ReadAll with no size cap, so a
// very large remote file is buffered entirely in memory — consider an
// io.LimitReader if that becomes a problem.
func (p *TranscriptionProvider) loadAudio(ctx context.Context, botID, pathValue, rawURL, contentTypeOverride string) ([]byte, string, string, error) {
	if pathValue != "" {
		return p.loadAudioFromPath(ctx, botID, pathValue, contentTypeOverride)
	}
	u, err := validateURL(ctx, rawURL)
	if err != nil {
		return nil, "", "", err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
	if err != nil {
		return nil, "", "", err
	}
	resp, err := p.http.Do(req)
	if err != nil {
		return nil, "", "", err
	}
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Close explicitly on the error path; the defer below is only
		// installed for the success path.
		_ = resp.Body.Close()
		return nil, "", "", fmt.Errorf("download audio: unexpected status %d", resp.StatusCode)
	}
	defer func(body io.ReadCloser) {
		if closeErr := body.Close(); closeErr != nil {
			p.logger.Warn("failed to close audio response body", slog.Any("error", closeErr))
		}
	}(resp.Body)
	audio, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, "", "", err
	}
	contentType := strings.TrimSpace(contentTypeOverride)
	if contentType == "" {
		contentType = strings.TrimSpace(resp.Header.Get("Content-Type"))
	}
	return audio, filepath.Base(strings.TrimSpace(req.URL.Path)), contentType, nil
}
// loadAudioFromPath resolves a /data/media/... path to a media asset
// owned by botID and reads its full contents. The storage key is the
// path with the mediaDataPrefix stripped; a path that does not carry
// the prefix (TrimPrefix left it unchanged) is rejected.
// Content type resolution: explicit override first, then the asset's
// recorded MIME type.
func (p *TranscriptionProvider) loadAudioFromPath(ctx context.Context, botID, pathValue, contentTypeOverride string) ([]byte, string, string, error) {
	storageKey := strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(pathValue), mediaDataPrefix))
	if storageKey == "" || storageKey == strings.TrimSpace(pathValue) {
		return nil, "", "", fmt.Errorf("unsupported media path: %s", pathValue)
	}
	// Scoped by botID so a bot can only read its own media.
	asset, err := p.media.GetByStorageKey(ctx, botID, storageKey)
	if err != nil {
		return nil, "", "", err
	}
	reader, _, err := p.media.Open(ctx, botID, asset.ContentHash)
	if err != nil {
		return nil, "", "", err
	}
	defer func(reader io.ReadCloser) {
		if closeErr := reader.Close(); closeErr != nil {
			p.logger.Warn("failed to close media reader", slog.Any("error", closeErr))
		}
	}(reader)
	audio, err := io.ReadAll(reader)
	if err != nil {
		return nil, "", "", err
	}
	contentType := strings.TrimSpace(contentTypeOverride)
	if contentType == "" {
		contentType = strings.TrimSpace(asset.Mime)
	}
	return audio, filepath.Base(storageKey), contentType, nil
}
func validateURL(ctx context.Context, rawURL string) (*url.URL, error) {
u, err := url.Parse(rawURL)
if err != nil {
return nil, fmt.Errorf("invalid url: %w", err)
}
if u.Scheme != "http" && u.Scheme != "https" {
return nil, fmt.Errorf("unsupported scheme: %s", u.Scheme)
}
hostname := u.Hostname()
if hostname == "" {
return nil, errors.New("missing hostname in url")
}
resolver := net.Resolver{}
ips, err := resolver.LookupIPAddr(ctx, hostname)
if err != nil {
return nil, fmt.Errorf("dns lookup failed for %s: %w", hostname, err)
}
if len(ips) == 0 {
return nil, fmt.Errorf("no ip addresses found for %s", hostname)
}
for _, ip := range ips {
if ip.IP.IsLoopback() || ip.IP.IsPrivate() || ip.IP.IsLinkLocalUnicast() || ip.IP.IsLinkLocalMulticast() {
return nil, fmt.Errorf("url resolves to a non-public ip address: %s", ip.IP.String())
}
}
return u, nil
}
+6 -6
View File
@@ -10,9 +10,9 @@ import (
sdk "github.com/memohai/twilight-ai/sdk"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/channel"
"github.com/memohai/memoh/internal/settings"
ttspkg "github.com/memohai/memoh/internal/tts"
)
const ttsMaxTextLen = 500
@@ -30,26 +30,26 @@ type TTSChannelResolver interface {
type TTSProvider struct {
logger *slog.Logger
settings *settings.Service
audio *audiopkg.Service
tts *ttspkg.Service
sender TTSSender
resolver TTSChannelResolver
}
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
if log == nil {
log = slog.Default()
}
return &TTSProvider{
logger: log.With(slog.String("tool", "tts")),
settings: settingsSvc,
audio: audioSvc,
tts: ttsSvc,
sender: sender,
resolver: resolver,
}
}
func (p *TTSProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
if session.IsSubagent || p.settings == nil || p.audio == nil || p.sender == nil || p.resolver == nil {
if session.IsSubagent || p.settings == nil || p.tts == nil || p.sender == nil || p.resolver == nil {
return nil, nil
}
botID := strings.TrimSpace(session.BotID)
@@ -115,7 +115,7 @@ func (p *TTSProvider) execSpeak(ctx context.Context, session SessionContext, arg
if botSettings.TtsModelID == "" {
return nil, errors.New("bot has no TTS model configured")
}
audioData, contentType, synthErr := p.audio.Synthesize(ctx, botSettings.TtsModelID, text, nil)
audioData, contentType, synthErr := p.tts.Synthesize(ctx, botSettings.TtsModelID, text, nil)
if synthErr != nil {
return nil, fmt.Errorf("speech synthesis failed: %s", synthErr.Error())
}
-100
View File
@@ -1,100 +0,0 @@
package audio
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgtype"
"github.com/memohai/memoh/internal/db/sqlc"
"github.com/memohai/memoh/internal/models"
)
// SyncRegistry reconciles the in-code audio provider registry with the
// models table: for every registered provider it upserts its speech and
// transcription model templates, and deletes rows for models the
// template marks hidden. Providers with no DB row are skipped with a
// warning; any other DB error aborts the sync.
// The logger parameter may be nil (logging is then suppressed).
func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
	for _, def := range registry.List() {
		provider, err := queries.GetProviderByClientType(ctx, string(def.ClientType))
		if err != nil {
			if errors.Is(err, pgx.ErrNoRows) {
				// No provider row yet — nothing to attach models to.
				if logger != nil {
					logger.Warn("audio registry skipped provider without template",
						slog.String("provider", string(def.ClientType)),
						slog.String("display_name", def.DisplayName))
				}
				continue
			}
			if logger != nil {
				logger.Warn("audio registry failed to load provider template",
					slog.String("provider", string(def.ClientType)),
					slog.String("display_name", def.DisplayName),
					slog.Any("error", err))
			}
			return fmt.Errorf("get provider by client type %s: %w", def.ClientType, err)
		}
		// NOTE(review): synced counts only speech models, so the
		// "models" field in the final log excludes transcription models.
		synced := 0
		// Transcription-only providers have no speech templates to sync.
		if !isTranscriptionClientType(def.ClientType) {
			for _, model := range def.Models {
				if shouldHideTemplateModel(def, models.ModelTypeSpeech, model.ID) {
					// Template-only entries are removed from the DB.
					if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
						ProviderID: provider.ID,
						ModelID:    model.ID,
						Type:       string(models.ModelTypeSpeech),
					}); err != nil {
						return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
					}
					continue
				}
				modelConfigJSON, err := json.Marshal(map[string]any{})
				if err != nil {
					return fmt.Errorf("marshal speech model config: %w", err)
				}
				name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
				if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
					ModelID:    model.ID,
					Name:       name,
					ProviderID: provider.ID,
					Type:       string(models.ModelTypeSpeech),
					Config:     modelConfigJSON,
				}); err != nil {
					return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
				}
				synced++
			}
		}
		// Transcription templates are synced for every provider type.
		for _, model := range def.TranscriptionModels {
			if shouldHideTemplateModel(def, models.ModelTypeTranscription, model.ID) {
				if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
					ProviderID: provider.ID,
					ModelID:    model.ID,
					Type:       string(models.ModelTypeTranscription),
				}); err != nil {
					return fmt.Errorf("delete hidden transcription template model %s: %w", model.ID, err)
				}
				continue
			}
			modelConfigJSON, err := json.Marshal(map[string]any{})
			if err != nil {
				return fmt.Errorf("marshal transcription model config: %w", err)
			}
			name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
			if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
				ModelID:    model.ID,
				Name:       name,
				ProviderID: provider.ID,
				Type:       string(models.ModelTypeTranscription),
				Config:     modelConfigJSON,
			}); err != nil {
				return fmt.Errorf("upsert transcription model %s: %w", model.ID, err)
			}
		}
		if logger != nil {
			logger.Info("speech registry synced", slog.String("provider", string(def.ClientType)), slog.Int("models", synced))
		}
	}
	return nil
}
-769
View File
@@ -1,769 +0,0 @@
package audio
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"github.com/jackc/pgx/v5/pgtype"
sdk "github.com/memohai/twilight-ai/sdk"
"github.com/memohai/memoh/internal/db"
"github.com/memohai/memoh/internal/db/sqlc"
"github.com/memohai/memoh/internal/models"
)
// Service is the audio domain service: it serves speech (TTS) and
// transcription (STT) providers/models backed by the models table and
// the static provider registry, and performs synthesis/transcription
// through the SDK.
type Service struct {
	queries  *sqlc.Queries
	logger   *slog.Logger
	registry *Registry
}

// NewService constructs the audio Service; the logger is tagged with
// service=audio. log must be non-nil (it is dereferenced immediately).
func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
	return &Service{
		queries:  queries,
		logger:   log.With(slog.String("service", "audio")),
		registry: registry,
	}
}
// Registry exposes the underlying provider registry.
func (s *Service) Registry() *Registry { return s.registry }

// ListMeta returns metadata for all registered audio providers.
// The context is unused; the registry is in-memory.
func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
	return s.registry.ListMeta()
}

// ListSpeechMeta returns metadata for speech (TTS) providers only.
func (s *Service) ListSpeechMeta(_ context.Context) []ProviderMetaResponse {
	return s.registry.ListSpeechMeta()
}

// ListTranscriptionMeta returns metadata for transcription providers only.
func (s *Service) ListTranscriptionMeta(_ context.Context) []ProviderMetaResponse {
	return s.registry.ListTranscriptionMeta()
}
// ListSpeechProviders returns all configured speech (TTS) provider rows
// mapped to the API response shape (secrets masked).
func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
	rows, err := s.queries.ListSpeechProviders(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech providers: %w", err)
	}
	items := make([]SpeechProviderResponse, 0, len(rows))
	for _, row := range rows {
		items = append(items, toSpeechProviderResponse(row))
	}
	return items, nil
}

// ListTranscriptionProviders returns all configured transcription
// provider rows; it reuses the speech provider response shape.
func (s *Service) ListTranscriptionProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
	rows, err := s.queries.ListTranscriptionProviders(ctx)
	if err != nil {
		return nil, fmt.Errorf("list transcription providers: %w", err)
	}
	items := make([]SpeechProviderResponse, 0, len(rows))
	for _, row := range rows {
		items = append(items, toSpeechProviderResponse(row))
	}
	return items, nil
}
// GetSpeechProvider looks up a single provider row by its UUID string
// and maps it to the speech provider response shape.
func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) {
	providerID, parseErr := db.ParseUUID(id)
	if parseErr != nil {
		return SpeechProviderResponse{}, parseErr
	}
	row, queryErr := s.queries.GetProviderByID(ctx, providerID)
	if queryErr != nil {
		return SpeechProviderResponse{}, fmt.Errorf("get speech provider: %w", queryErr)
	}
	return toSpeechProviderResponse(row), nil
}
// ListSpeechModels returns all speech models across providers, dropping
// models the registry template marks as template-only (hidden).
func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse, error) {
	rows, err := s.queries.ListSpeechModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech models: %w", err)
	}
	items := make([]SpeechModelResponse, 0, len(rows))
	for _, row := range rows {
		if s.shouldHideModel(row.ProviderType, models.ModelTypeSpeech, row.ModelID) {
			continue
		}
		items = append(items, toSpeechModelFromListRow(row))
	}
	return items, nil
}

// ListTranscriptionModels is the transcription counterpart of
// ListSpeechModels, with the same template-only filtering.
func (s *Service) ListTranscriptionModels(ctx context.Context) ([]TranscriptionModelResponse, error) {
	rows, err := s.queries.ListTranscriptionModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list transcription models: %w", err)
	}
	items := make([]TranscriptionModelResponse, 0, len(rows))
	for _, row := range rows {
		if s.shouldHideModel(row.ProviderType, models.ModelTypeTranscription, row.ModelID) {
			continue
		}
		items = append(items, toTranscriptionModelFromListRow(row))
	}
	return items, nil
}
// ListSpeechModelsByProvider returns the speech models of one provider,
// filtered against that provider's registry template so hidden
// (template-only) entries are omitted. Fails if the provider id is not
// a UUID, the row is missing, or the client type is not registered.
func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	// The registry definition drives the hidden-model filter below.
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	rows, err := s.queries.ListSpeechModelsByProviderID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("list speech models by provider: %w", err)
	}
	items := make([]SpeechModelResponse, 0, len(rows))
	for _, row := range rows {
		if shouldHideTemplateModel(def, models.ModelTypeSpeech, row.ModelID) {
			continue
		}
		// Provider type left empty: caller already knows the provider.
		items = append(items, toSpeechModelFromModel(row, ""))
	}
	return items, nil
}

// ListTranscriptionModelsByProvider mirrors ListSpeechModelsByProvider
// for transcription models.
func (s *Service) ListTranscriptionModelsByProvider(ctx context.Context, providerID string) ([]TranscriptionModelResponse, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	rows, err := s.queries.ListTranscriptionModelsByProviderID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("list transcription models by provider: %w", err)
	}
	items := make([]TranscriptionModelResponse, 0, len(rows))
	for _, row := range rows {
		if shouldHideTemplateModel(def, models.ModelTypeTranscription, row.ModelID) {
			continue
		}
		items = append(items, toTranscriptionModelFromModel(row, ""))
	}
	return items, nil
}
// GetSpeechModel fetches one speech model (joined with its provider's
// client type) by model UUID.
func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return SpeechModelResponse{}, err
	}
	row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return SpeechModelResponse{}, fmt.Errorf("get speech model: %w", err)
	}
	return toSpeechModelWithProviderResponse(row), nil
}

// GetTranscriptionModel fetches one transcription model (joined with
// its provider's client type) by model UUID.
func (s *Service) GetTranscriptionModel(ctx context.Context, id string) (TranscriptionModelResponse, error) {
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return TranscriptionModelResponse{}, err
	}
	row, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
	if err != nil {
		return TranscriptionModelResponse{}, fmt.Errorf("get transcription model: %w", err)
	}
	return toTranscriptionModelWithProviderResponse(row), nil
}
// UpdateSpeechModel replaces a speech model's config and, when
// req.Name is non-nil, its display name (an empty *req.Name clears the
// name by storing an invalid pgtype.Text). Model id, provider, and type
// are preserved from the existing row.
func (s *Service) UpdateSpeechModel(ctx context.Context, id string, req UpdateSpeechModelRequest) (SpeechModelResponse, error) {
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return SpeechModelResponse{}, err
	}
	row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return SpeechModelResponse{}, fmt.Errorf("get speech model: %w", err)
	}
	configJSON, err := json.Marshal(req.Config)
	if err != nil {
		return SpeechModelResponse{}, fmt.Errorf("marshal speech config: %w", err)
	}
	// Name is optional: nil means "keep current".
	name := row.Name
	if req.Name != nil {
		name = pgtype.Text{String: *req.Name, Valid: *req.Name != ""}
	}
	updated, err := s.queries.UpdateModel(ctx, sqlc.UpdateModelParams{
		ID:         pgID,
		ModelID:    row.ModelID,
		Name:       name,
		ProviderID: row.ProviderID,
		Type:       string(models.ModelTypeSpeech),
		Config:     configJSON,
	})
	if err != nil {
		return SpeechModelResponse{}, fmt.Errorf("update speech model: %w", err)
	}
	return toSpeechModelFromModel(updated, row.ProviderType), nil
}

// UpdateTranscriptionModel mirrors UpdateSpeechModel for transcription
// models; it shares UpdateSpeechModelRequest as the input shape.
func (s *Service) UpdateTranscriptionModel(ctx context.Context, id string, req UpdateSpeechModelRequest) (TranscriptionModelResponse, error) {
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return TranscriptionModelResponse{}, err
	}
	row, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
	if err != nil {
		return TranscriptionModelResponse{}, fmt.Errorf("get transcription model: %w", err)
	}
	configJSON, err := json.Marshal(req.Config)
	if err != nil {
		return TranscriptionModelResponse{}, fmt.Errorf("marshal transcription config: %w", err)
	}
	name := row.Name
	if req.Name != nil {
		name = pgtype.Text{String: *req.Name, Valid: *req.Name != ""}
	}
	updated, err := s.queries.UpdateModel(ctx, sqlc.UpdateModelParams{
		ID:         pgID,
		ModelID:    row.ModelID,
		Name:       name,
		ProviderID: row.ProviderID,
		Type:       string(models.ModelTypeTranscription),
		Config:     configJSON,
	})
	if err != nil {
		return TranscriptionModelResponse{}, fmt.Errorf("update transcription model: %w", err)
	}
	return toTranscriptionModelFromModel(updated, row.ProviderType), nil
}
// Synthesize turns text into audio with the given speech model.
// Returns the raw audio bytes and their content type. overrideCfg keys
// win over model and provider config (see resolveSpeechParams).
func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
	params, err := s.resolveSpeechParams(ctx, modelID, text, overrideCfg)
	if err != nil {
		return nil, "", err
	}
	result, err := sdk.GenerateSpeech(ctx,
		sdk.WithSpeechModel(params.model),
		sdk.WithText(text),
		sdk.WithSpeechConfig(params.config),
	)
	if err != nil {
		return nil, "", fmt.Errorf("synthesize: %w", err)
	}
	return result.Audio, result.ContentType, nil
}
// StreamToFile synthesizes text and writes the audio to w, returning
// the content type.
//
// NOTE(review): despite the name, this buffers the entire stream via
// streamResult.Bytes() before a single Write — it is not incremental.
func (s *Service) StreamToFile(ctx context.Context, modelID string, text string, w io.Writer) (string, error) {
	params, err := s.resolveSpeechParams(ctx, modelID, text, nil)
	if err != nil {
		return "", err
	}
	streamResult, err := sdk.StreamSpeech(ctx,
		sdk.WithSpeechModel(params.model),
		sdk.WithText(text),
		sdk.WithSpeechConfig(params.config),
	)
	if err != nil {
		return "", fmt.Errorf("stream: %w", err)
	}
	audio, err := streamResult.Bytes()
	if err != nil {
		return "", fmt.Errorf("stream: %w", err)
	}
	if _, writeErr := w.Write(audio); writeErr != nil {
		return "", fmt.Errorf("write chunk: %w", writeErr)
	}
	return streamResult.ContentType, nil
}
// GetModelCapabilities returns the capability descriptor of a speech
// model, taken from its registry template (matched via
// findModelTemplate, which may fall back to the provider's default or
// first template). When the capability entry carries no config schema,
// the template's top-level schema is used.
func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech model: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
	if err != nil {
		return nil, err
	}
	template := findModelTemplate(def.Models, def.DefaultModel, modelRow.ModelID)
	if template == nil {
		return nil, fmt.Errorf("speech model capabilities not found: %s", modelRow.ModelID)
	}
	// Copy, then backfill the schema so the template stays untouched.
	caps := template.Capabilities
	if len(caps.ConfigSchema.Fields) == 0 {
		caps.ConfigSchema = template.ConfigSchema
	}
	return &caps, nil
}

// GetSpeechModelCapabilities is an alias for GetModelCapabilities.
func (s *Service) GetSpeechModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
	return s.GetModelCapabilities(ctx, modelID)
}
// GetTranscriptionModelCapabilities mirrors GetModelCapabilities for
// transcription models, resolving against the provider's transcription
// templates and default transcription model.
func (s *Service) GetTranscriptionModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get transcription model: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
	if err != nil {
		return nil, err
	}
	template := findModelTemplate(def.TranscriptionModels, def.DefaultTranscriptionModel, modelRow.ModelID)
	if template == nil {
		return nil, fmt.Errorf("transcription model capabilities not found: %s", modelRow.ModelID)
	}
	caps := template.Capabilities
	if len(caps.ConfigSchema.Fields) == 0 {
		caps.ConfigSchema = template.ConfigSchema
	}
	return &caps, nil
}
// FetchRemoteModels asks the provider's remote API for its available
// speech models. Only providers whose registry definition advertises
// list support (SupportsList with a non-nil Factory) are queried.
// Remote ids that match a curated template are returned enriched via
// mergeRemoteModelInfo; unknown ids get a minimal entry.
func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	if !def.SupportsList || def.Factory == nil {
		return nil, fmt.Errorf("speech provider does not support model discovery: %s", providerRow.ClientType)
	}
	provider, err := def.Factory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build speech provider: %w", err)
	}
	remoteModels, err := provider.ListModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech models: %w", err)
	}
	discovered := make([]ModelInfo, 0, len(remoteModels))
	for _, remoteModel := range remoteModels {
		// Skip nil or id-less entries the remote API may return.
		if remoteModel == nil || remoteModel.ID == "" {
			continue
		}
		discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.Models))
	}
	return discovered, nil
}
// FetchRemoteTranscriptionModels mirrors FetchRemoteModels for
// transcription: it requires SupportsTranscriptionList and a non-nil
// TranscriptionFactory, and enriches results from the transcription
// templates.
func (s *Service) FetchRemoteTranscriptionModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	if !def.SupportsTranscriptionList || def.TranscriptionFactory == nil {
		return nil, fmt.Errorf("speech provider does not support transcription model discovery: %s", providerRow.ClientType)
	}
	provider, err := def.TranscriptionFactory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build transcription provider: %w", err)
	}
	remoteModels, err := provider.ListModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list transcription models: %w", err)
	}
	discovered := make([]ModelInfo, 0, len(remoteModels))
	for _, remoteModel := range remoteModels {
		if remoteModel == nil || remoteModel.ID == "" {
			continue
		}
		discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.TranscriptionModels))
	}
	return discovered, nil
}
// Transcribe runs speech-to-text on the given audio bytes with the
// given transcription model. filename/contentType describe the upload;
// overrideCfg wins over model and provider config.
func (s *Service) Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (*sdk.TranscriptionResult, error) {
	params, err := s.resolveTranscriptionParams(ctx, modelID, audio, filename, contentType, overrideCfg)
	if err != nil {
		return nil, err
	}
	result, err := sdk.Transcribe(ctx,
		sdk.WithTranscriptionModel(params.model),
		sdk.WithAudio(audio, filename, contentType),
		sdk.WithTranscriptionConfig(params.config),
	)
	if err != nil {
		return nil, fmt.Errorf("transcribe: %w", err)
	}
	return result, nil
}
// resolvedSpeechParams bundles an SDK speech model handle with the
// merged (provider < model < override) configuration.
type resolvedSpeechParams struct {
	model  *sdk.SpeechModel
	config map[string]any
}

// resolvedTranscriptionParams is the transcription counterpart of
// resolvedSpeechParams.
type resolvedTranscriptionParams struct {
	model  *sdk.TranscriptionModel
	config map[string]any
}
// resolveSpeechParams loads the speech model and its provider from the
// database and builds the SDK model handle plus the effective config.
// Config precedence, lowest to highest: provider config, model config,
// per-call override.
func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text string, overrideCfg map[string]any) (*resolvedSpeechParams, error) {
	_ = text // reserved for future per-text validation; currently unused
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech model: %w", err)
	}
	providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	// Guard against registry entries without a speech factory: calling
	// a nil Factory would panic (FetchRemoteModels does the same check).
	if def.Factory == nil {
		return nil, fmt.Errorf("speech provider has no factory: %s", providerRow.ClientType)
	}
	provider, err := def.Factory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build speech provider: %w", err)
	}
	cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
	return &resolvedSpeechParams{
		model:  &sdk.SpeechModel{ID: modelRow.ModelID, Provider: provider},
		config: cfg,
	}, nil
}
// resolveTranscriptionParams loads the transcription model and its
// provider and builds the SDK model handle plus the effective config
// (provider < model < override). The audio/filename/contentType
// arguments are accepted for future validation but currently unused.
func (s *Service) resolveTranscriptionParams(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (*resolvedTranscriptionParams, error) {
	_ = audio
	_ = filename
	_ = contentType
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get transcription model: %w", err)
	}
	providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	// Guard against registry entries without a transcription factory:
	// calling a nil TranscriptionFactory would panic
	// (FetchRemoteTranscriptionModels performs the same check).
	if def.TranscriptionFactory == nil {
		return nil, fmt.Errorf("speech provider has no transcription factory: %s", providerRow.ClientType)
	}
	provider, err := def.TranscriptionFactory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build transcription provider: %w", err)
	}
	cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
	return &resolvedTranscriptionParams{
		model:  &sdk.TranscriptionModel{ID: modelRow.ModelID, Provider: provider},
		config: cfg,
	}, nil
}
// parseConfig decodes a raw JSON config blob into a generic map.
// Every failure mode — empty input, malformed JSON, or a JSON null —
// yields an empty non-nil map so callers never need a nil check.
func parseConfig(raw []byte) map[string]any {
	if len(raw) == 0 {
		return map[string]any{}
	}
	var decoded map[string]any
	if unmarshalErr := json.Unmarshal(raw, &decoded); unmarshalErr != nil || decoded == nil {
		return map[string]any{}
	}
	return decoded
}
// mergeConfig flattens the given maps into a single new map; keys appearing
// in later maps override earlier ones. Always returns a non-nil map, even
// with no arguments.
func mergeConfig(parts ...map[string]any) map[string]any {
	merged := map[string]any{}
	for _, part := range parts {
		for k, v := range part {
			merged[k] = v
		}
	}
	return merged
}
// mergeRemoteModelInfo returns the registry default entry for modelID when
// one exists; otherwise it synthesizes a minimal entry whose display name is
// the ID itself.
func mergeRemoteModelInfo(modelID string, defaults []ModelInfo) ModelInfo {
	for i := range defaults {
		if defaults[i].ID == modelID {
			return defaults[i]
		}
	}
	return ModelInfo{ID: modelID, Name: modelID}
}
// shouldHideModel reports whether the given model ID is template-only for its
// client type and should therefore be hidden from listings. Unknown client
// types are never hidden.
func (s *Service) shouldHideModel(clientType string, modelType models.ModelType, modelID string) bool {
	if def, err := s.registry.Get(models.ClientType(clientType)); err == nil {
		return shouldHideTemplateModel(def, modelType, modelID)
	}
	return false
}
// shouldHideTemplateModel reports whether modelID is marked template-only in
// the provider definition's catalog for the given model type. Providers that
// do not support listing for that type never hide models, and IDs absent from
// the catalog stay visible. Model types other than speech/transcription are
// never hidden.
func shouldHideTemplateModel(def ProviderDefinition, modelType models.ModelType, modelID string) bool {
	switch modelType {
	case models.ModelTypeSpeech:
		if def.SupportsList {
			for i := range def.Models {
				if def.Models[i].ID == modelID {
					return def.Models[i].TemplateOnly
				}
			}
		}
	case models.ModelTypeTranscription:
		if def.SupportsTranscriptionList {
			for i := range def.TranscriptionModels {
				if def.TranscriptionModels[i].ID == modelID {
					return def.TranscriptionModels[i].TemplateOnly
				}
			}
		}
	}
	return false
}
// findModelTemplate picks a template entry from modelsList by preference:
// exact modelID match first, then the provider's defaultModel (when set),
// then the first list entry. Returns nil only when the list is empty.
func findModelTemplate(modelsList []ModelInfo, defaultModel string, modelID string) *ModelInfo {
	find := func(id string) *ModelInfo {
		for i := range modelsList {
			if modelsList[i].ID == id {
				return &modelsList[i]
			}
		}
		return nil
	}
	if match := find(modelID); match != nil {
		return match
	}
	if defaultModel != "" {
		if match := find(defaultModel); match != nil {
			return match
		}
	}
	if len(modelsList) == 0 {
		return nil
	}
	return &modelsList[0]
}
// toSpeechProviderResponse converts a provider DB row into the API response
// shape, masking secret config values (API keys etc.) before they leave the
// server.
func toSpeechProviderResponse(row sqlc.Provider) SpeechProviderResponse {
	resp := SpeechProviderResponse{
		ID:         row.ID.String(),
		Name:       row.Name,
		ClientType: row.ClientType,
		Enable:     row.Enable,
		Config:     maskSpeechProviderConfig(parseConfig(row.Config)),
		CreatedAt:  row.CreatedAt.Time,
		UpdatedAt:  row.UpdatedAt.Time,
	}
	// Icon is nullable in the DB; a NULL maps to the empty string.
	if row.Icon.Valid {
		resp.Icon = row.Icon.String
	}
	return resp
}
// maskSpeechProviderConfig returns a copy of cfg in which known secret string
// values (API keys and similar credentials) are replaced by a masked form.
// Non-secret keys, empty strings, and non-string values pass through
// unchanged. Always returns a non-nil map.
func maskSpeechProviderConfig(cfg map[string]any) map[string]any {
	masked := make(map[string]any, len(cfg))
	for key, value := range cfg {
		str, isString := value.(string)
		if isString && str != "" && isSpeechSecretKey(key) {
			masked[key] = maskSpeechSecret(str)
		} else {
			masked[key] = value
		}
	}
	return masked
}
// isSpeechSecretKey reports whether a provider config key holds a credential
// that must be masked before appearing in API responses.
func isSpeechSecretKey(key string) bool {
	for _, secret := range [...]string{"api_key", "access_key", "secret_key", "app_key"} {
		if key == secret {
			return true
		}
	}
	return false
}
// maskSpeechSecret obscures a credential while keeping the first and last
// four characters for recognizability. Values of eight bytes or fewer are
// fully masked so nothing of a short secret leaks.
func maskSpeechSecret(value string) string {
	const fullMask = "********"
	n := len(value)
	if n <= 8 {
		return fullMask
	}
	return value[:4] + "****" + value[n-4:]
}
// toSpeechModelFromListRow maps a joined list-query row to the speech model
// API response shape.
func toSpeechModelFromListRow(row sqlc.ListSpeechModelsRow) SpeechModelResponse {
	resp := SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toSpeechModelFromModel maps a bare models-table row to the speech model API
// response; the provider client type is supplied by the caller since the row
// carries no join data.
func toSpeechModelFromModel(row sqlc.Model, providerType string) SpeechModelResponse {
	resp := SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: providerType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toSpeechModelWithProviderResponse maps a model+provider join row to the
// speech model API response shape.
func toSpeechModelWithProviderResponse(row sqlc.GetSpeechModelWithProviderRow) SpeechModelResponse {
	resp := SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toTranscriptionModelFromListRow maps a joined list-query row to the
// transcription model API response shape.
func toTranscriptionModelFromListRow(row sqlc.ListTranscriptionModelsRow) TranscriptionModelResponse {
	resp := TranscriptionModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toTranscriptionModelFromModel maps a bare models-table row to the
// transcription model API response; the provider client type is supplied by
// the caller since the row carries no join data.
func toTranscriptionModelFromModel(row sqlc.Model, providerType string) TranscriptionModelResponse {
	resp := TranscriptionModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: providerType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toTranscriptionModelWithProviderResponse maps a model+provider join row to
// the transcription model API response shape.
func toTranscriptionModelWithProviderResponse(row sqlc.GetTranscriptionModelWithProviderRow) TranscriptionModelResponse {
	resp := TranscriptionModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
-102
View File
@@ -1,102 +0,0 @@
package audio
import "time"
// ProviderMetaResponse exposes adapter metadata (from the registry, not DB).
type ProviderMetaResponse struct {
Provider string `json:"provider"`
DisplayName string `json:"display_name"`
Description string `json:"description"`
ConfigSchema ConfigSchema `json:"config_schema,omitempty"`
DefaultModel string `json:"default_model,omitempty"`
Models []ModelInfo `json:"models,omitempty"`
DefaultSynthesisModel string `json:"default_synthesis_model,omitempty"`
SynthesisModels []ModelInfo `json:"synthesis_models,omitempty"`
SupportsSynthesisList bool `json:"supports_synthesis_list,omitempty"`
DefaultTranscriptionModel string `json:"default_transcription_model,omitempty"`
TranscriptionModels []ModelInfo `json:"transcription_models,omitempty"`
SupportsTranscriptionList bool `json:"supports_transcription_list,omitempty"`
}
// SpeechProviderResponse represents a speech-capable provider from the unified providers table.
type SpeechProviderResponse struct {
ID string `json:"id"`
Name string `json:"name"`
ClientType string `json:"client_type"`
Icon string `json:"icon,omitempty"`
Enable bool `json:"enable"`
Config map[string]any `json:"config,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// SpeechModelResponse represents a speech model from the unified models table.
type SpeechModelResponse struct {
ID string `json:"id"`
ModelID string `json:"model_id"`
Name string `json:"name"`
ProviderID string `json:"provider_id"`
ProviderType string `json:"provider_type,omitempty"`
Config map[string]any `json:"config,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// TranscriptionModelResponse represents a transcription model from the unified models table.
type TranscriptionModelResponse struct {
ID string `json:"id"`
ModelID string `json:"model_id"`
Name string `json:"name"`
ProviderID string `json:"provider_id"`
ProviderType string `json:"provider_type,omitempty"`
Config map[string]any `json:"config,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// UpdateSpeechProviderRequest is used for updating a speech provider.
type UpdateSpeechProviderRequest struct {
Name *string `json:"name,omitempty"`
Enable *bool `json:"enable,omitempty"`
}
// UpdateSpeechModelRequest is used for updating a speech model.
type UpdateSpeechModelRequest struct {
Name *string `json:"name,omitempty"`
Config map[string]any `json:"config,omitempty"`
}
// TestSynthesizeRequest represents a text-to-speech test request.
type TestSynthesizeRequest struct {
Text string `json:"text"`
Config map[string]any `json:"config,omitempty"`
}
// TestTranscriptionRequest represents an audio-to-text test request.
type TestTranscriptionRequest struct {
Config map[string]any `json:"config,omitempty"`
}
// TestTranscriptionResponse represents the result of a transcription test.
type TestTranscriptionResponse struct {
Text string `json:"text"`
Language string `json:"language,omitempty"`
DurationSeconds float64 `json:"duration_seconds,omitempty"`
Words []TranscriptionWord `json:"words,omitempty"`
Metadata map[string]any `json:"metadata,omitempty"`
}
// TranscriptionWord represents a single word alignment from a transcription result.
type TranscriptionWord struct {
Text string `json:"text"`
Start float64 `json:"start,omitempty"`
End float64 `json:"end,omitempty"`
SpeakerID string `json:"speaker_id,omitempty"`
}
// ImportModelsResponse represents the response for importing speech models.
type ImportModelsResponse struct {
Created int `json:"created"`
Skipped int `json:"skipped"`
Models []string `json:"models"`
}
@@ -1,4 +1,5 @@
//go:build ignore
// +build ignore
package identities_test
+34 -174
View File
@@ -58,29 +58,14 @@ type mediaIngestor interface {
channel.ContainerAttachmentIngester
}
// speechSynthesizer synthesizes text to speech audio.
type speechSynthesizer interface {
// ttsSynthesizer synthesizes text to speech audio.
type ttsSynthesizer interface {
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
}
// speechModelResolver looks up the speech model ID configured for a bot.
type speechModelResolver interface {
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
}
// TranscriptionResult is the minimal speech-to-text response shape needed by inbound routing.
type TranscriptionResult interface {
GetText() string
}
// transcriptionRecognizer converts inbound audio to text using a configured model.
type transcriptionRecognizer interface {
Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (TranscriptionResult, error)
}
// transcriptionModelResolver looks up the transcription model ID configured for a bot.
type transcriptionModelResolver interface {
ResolveTranscriptionModelID(ctx context.Context, botID string) (string, error)
// ttsModelResolver looks up the TTS model ID configured for a bot.
type ttsModelResolver interface {
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
}
// SessionEnsurer resolves or creates an active session for a route.
@@ -101,29 +86,27 @@ type SessionResult struct {
// ChannelInboundProcessor routes channel inbound messages to the chat gateway.
type ChannelInboundProcessor struct {
runner flow.Runner
routeResolver RouteResolver
message messagepkg.Writer
mediaService mediaIngestor
reactor channelReactor
commandHandler *command.Handler
registry *channel.Registry
logger *slog.Logger
jwtSecret string
tokenTTL time.Duration
identity *IdentityResolver
policy PolicyService
dispatcher *RouteDispatcher
acl chatACL
observer channel.StreamObserver
speechService speechSynthesizer
speechModelResolver speechModelResolver
transcriber transcriptionRecognizer
sttModelResolver transcriptionModelResolver
sessionEnsurer SessionEnsurer
pipeline *pipelinepkg.Pipeline
eventStore *pipelinepkg.EventStore
discussDriver *pipelinepkg.DiscussDriver
runner flow.Runner
routeResolver RouteResolver
message messagepkg.Writer
mediaService mediaIngestor
reactor channelReactor
commandHandler *command.Handler
registry *channel.Registry
logger *slog.Logger
jwtSecret string
tokenTTL time.Duration
identity *IdentityResolver
policy PolicyService
dispatcher *RouteDispatcher
acl chatACL
observer channel.StreamObserver
ttsService ttsSynthesizer
ttsModelResolver ttsModelResolver
sessionEnsurer SessionEnsurer
pipeline *pipelinepkg.Pipeline
eventStore *pipelinepkg.EventStore
discussDriver *pipelinepkg.DiscussDriver
// activeStreams maps "botID:routeID" to a context.CancelFunc for the
// currently running agent stream. Used by /stop to abort generation
@@ -205,23 +188,14 @@ func (p *ChannelInboundProcessor) SetStreamObserver(observer channel.StreamObser
p.observer = observer
}
// SetSpeechService configures the speech synthesizer and settings reader for
// handling <speech> tag events (speech_delta) that require server-side audio synthesis.
func (p *ChannelInboundProcessor) SetSpeechService(synth speechSynthesizer, modelResolver speechModelResolver) {
// SetTtsService configures the TTS synthesizer and settings reader for handling
// <speech> tag events (speech_delta) that require server-side audio synthesis.
func (p *ChannelInboundProcessor) SetTtsService(synth ttsSynthesizer, modelResolver ttsModelResolver) {
if p == nil {
return
}
p.speechService = synth
p.speechModelResolver = modelResolver
}
// SetTranscriptionService configures speech-to-text processing for inbound audio attachments.
func (p *ChannelInboundProcessor) SetTranscriptionService(recognizer transcriptionRecognizer, modelResolver transcriptionModelResolver) {
if p == nil {
return
}
p.transcriber = recognizer
p.sttModelResolver = modelResolver
p.ttsService = synth
p.ttsModelResolver = modelResolver
}
// SetSessionEnsurer configures the session ensurer for auto-creating sessions on routes.
@@ -352,8 +326,6 @@ func (p *ChannelInboundProcessor) HandleInbound(ctx context.Context, cfg channel
}
resolvedAttachments := p.ingestInboundAttachments(ctx, cfg, msg, strings.TrimSpace(identity.BotID), msg.Message.Attachments)
msg.Message.Attachments = resolvedAttachments
hadVoiceAttachment := containsVoiceAttachment(resolvedAttachments)
attachments := mapChannelToChatAttachments(resolvedAttachments)
text = strings.TrimSpace(msg.Message.PlainText())
@@ -494,24 +466,6 @@ func (p *ChannelInboundProcessor) HandleInbound(ctx context.Context, cfg channel
}
shouldTrigger := shouldTriggerAssistantResponse(msg) || identity.ForceReply
if sessionType == sessionpkg.TypeDiscuss || shouldTrigger {
if transcript := p.transcribeInboundAttachments(ctx, strings.TrimSpace(identity.BotID), resolvedAttachments); transcript != "" {
labeledTranscript := formatInboundTranscript(transcript)
if msg.Message.Metadata == nil {
msg.Message.Metadata = make(map[string]any)
}
msg.Message.Metadata["transcript"] = transcript
if plain := strings.TrimSpace(msg.Message.PlainText()); plain == "" {
msg.Message.Text = labeledTranscript
} else if !strings.Contains(plain, transcript) {
msg.Message.Text = plain + "\n\n" + labeledTranscript
}
} else if hadVoiceAttachment && strings.TrimSpace(msg.Message.PlainText()) == "" {
msg.Message.Text = formatVoiceTranscriptionUnavailableNotice(resolvedAttachments)
}
text = strings.TrimSpace(msg.Message.PlainText())
}
if !shouldTrigger {
p.persistPassiveMessage(ctx, identity, msg, text, attachments, resolved.RouteID, sessionID, eventID)
if p.logger != nil {
@@ -1946,97 +1900,6 @@ func (p *ChannelInboundProcessor) loadInboundAttachmentPayload(
}, nil
}
func (p *ChannelInboundProcessor) transcribeInboundAttachments(ctx context.Context, botID string, attachments []channel.Attachment) string {
if p == nil || p.transcriber == nil || p.sttModelResolver == nil || p.mediaService == nil || strings.TrimSpace(botID) == "" {
return ""
}
modelID, err := p.sttModelResolver.ResolveTranscriptionModelID(ctx, botID)
if err != nil || strings.TrimSpace(modelID) == "" {
return ""
}
transcripts := make([]string, 0, len(attachments))
for _, att := range attachments {
if att.Type != channel.AttachmentAudio && att.Type != channel.AttachmentVoice {
continue
}
if strings.TrimSpace(att.ContentHash) == "" {
continue
}
reader, asset, err := p.mediaService.Open(ctx, botID, strings.TrimSpace(att.ContentHash))
if err != nil {
if p.logger != nil {
p.logger.Warn("open inbound audio for transcription failed", slog.Any("error", err), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
}
continue
}
audio, readErr := io.ReadAll(reader)
_ = reader.Close()
if readErr != nil || len(audio) == 0 {
if p.logger != nil {
p.logger.Warn("read inbound audio for transcription failed", slog.Any("error", readErr), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
}
continue
}
filename := strings.TrimSpace(att.Name)
if filename == "" {
filename = "audio" + filepath.Ext(asset.StorageKey)
}
contentType := strings.TrimSpace(att.Mime)
if contentType == "" {
contentType = strings.TrimSpace(asset.Mime)
}
result, txErr := p.transcriber.Transcribe(ctx, modelID, audio, filename, contentType, nil)
if txErr != nil {
if p.logger != nil {
p.logger.Warn("inbound attachment transcription failed", slog.Any("error", txErr), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
}
continue
}
text := strings.TrimSpace(result.GetText())
if text == "" {
continue
}
transcripts = append(transcripts, text)
}
if len(transcripts) == 0 {
return ""
}
return strings.Join(transcripts, "\n\n")
}
func formatInboundTranscript(transcript string) string {
transcript = strings.TrimSpace(transcript)
if transcript == "" {
return ""
}
return "[Voice message transcription]\n" + transcript
}
func containsVoiceAttachment(attachments []channel.Attachment) bool {
for _, att := range attachments {
if att.Type == channel.AttachmentAudio || att.Type == channel.AttachmentVoice {
return true
}
}
return false
}
func formatVoiceTranscriptionUnavailableNotice(attachments []channel.Attachment) string {
paths := make([]string, 0, len(attachments))
for _, att := range attachments {
if att.Type != channel.AttachmentAudio && att.Type != channel.AttachmentVoice {
continue
}
if ref := strings.TrimSpace(att.URL); ref != "" {
paths = append(paths, ref)
}
}
if len(paths) == 0 {
return "[User sent a voice message, but transcription is unavailable.]"
}
return "[User sent a voice message, but transcription is unavailable. Use transcribe_audio with one of these paths if needed: " + strings.Join(paths, ", ") + "]"
}
func openInboundAttachmentURL(ctx context.Context, rawURL string) (inboundAttachmentPayload, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
if err != nil {
@@ -2227,9 +2090,6 @@ func mapChannelToChatAttachments(attachments []channel.Attachment) []conversatio
}
result := make([]conversation.ChatAttachment, 0, len(attachments))
for _, att := range attachments {
if att.Type == channel.AttachmentAudio || att.Type == channel.AttachmentVoice {
continue
}
ca := conversation.ChatAttachment{
Type: string(att.Type),
PlatformKey: att.PlatformKey,
@@ -2304,13 +2164,13 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
outboundAssetRefs *[]conversation.OutboundAssetRef,
assetMu *sync.Mutex,
) {
if p.speechService == nil || p.speechModelResolver == nil {
if p.ttsService == nil || p.ttsModelResolver == nil {
if p.logger != nil {
p.logger.Warn("speech_delta received but TTS service not configured")
}
return
}
modelID, err := p.speechModelResolver.ResolveSpeechModelID(ctx, botID)
modelID, err := p.ttsModelResolver.ResolveTtsModelID(ctx, botID)
if err != nil || strings.TrimSpace(modelID) == "" {
if p.logger != nil {
p.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
@@ -2322,7 +2182,7 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
if text == "" {
continue
}
audioData, contentType, synthErr := p.speechService.Synthesize(ctx, modelID, text, nil)
audioData, contentType, synthErr := p.ttsService.Synthesize(ctx, modelID, text, nil)
if synthErr != nil {
if p.logger != nil {
p.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
+1 -1
View File
@@ -511,7 +511,7 @@ WITH updated AS (
SET display_name = $1,
updated_at = now()
WHERE bots.id = $2
RETURNING id, owner_user_id, display_name, avatar_url, timezone, is_active, status, language, reasoning_enabled, reasoning_effort, chat_model_id, search_provider_id, memory_provider_id, heartbeat_enabled, heartbeat_interval, heartbeat_prompt, heartbeat_model_id, compaction_enabled, compaction_threshold, compaction_ratio, compaction_model_id, title_model_id, image_model_id, discuss_probe_model_id, tts_model_id, transcription_model_id, browser_context_id, persist_full_tool_results, metadata, created_at, updated_at, acl_default_effect
RETURNING id, owner_user_id, display_name, avatar_url, timezone, is_active, status, language, reasoning_enabled, reasoning_effort, chat_model_id, search_provider_id, memory_provider_id, heartbeat_enabled, heartbeat_interval, heartbeat_prompt, heartbeat_model_id, compaction_enabled, compaction_threshold, compaction_ratio, compaction_model_id, title_model_id, image_model_id, discuss_probe_model_id, tts_model_id, browser_context_id, persist_full_tool_results, metadata, created_at, updated_at, acl_default_effect
)
SELECT
updated.id AS id,
-1
View File
@@ -34,7 +34,6 @@ type Bot struct {
ImageModelID pgtype.UUID `json:"image_model_id"`
DiscussProbeModelID pgtype.UUID `json:"discuss_probe_model_id"`
TtsModelID pgtype.UUID `json:"tts_model_id"`
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
BrowserContextID pgtype.UUID `json:"browser_context_id"`
PersistFullToolResults bool `json:"persist_full_tool_results"`
Metadata []byte `json:"metadata"`
+11 -225
View File
@@ -13,7 +13,7 @@ import (
const countModels = `-- name: CountModels :one
SELECT COUNT(*) FROM models
WHERE type NOT IN ('speech', 'transcription')
WHERE type != 'speech'
`
func (q *Queries) CountModels(ctx context.Context) (int64, error) {
@@ -40,19 +40,13 @@ FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
)
`
@@ -207,24 +201,6 @@ func (q *Queries) DeleteModelByModelID(ctx context.Context, modelID string) erro
return err
}
const deleteModelByProviderAndType = `-- name: DeleteModelByProviderAndType :exec
DELETE FROM models
WHERE provider_id = $1
AND model_id = $2
AND type = $3
`
type DeleteModelByProviderAndTypeParams struct {
ProviderID pgtype.UUID `json:"provider_id"`
ModelID string `json:"model_id"`
Type string `json:"type"`
}
func (q *Queries) DeleteModelByProviderAndType(ctx context.Context, arg DeleteModelByProviderAndTypeParams) error {
_, err := q.db.Exec(ctx, deleteModelByProviderAndType, arg.ProviderID, arg.ModelID, arg.Type)
return err
}
const deleteModelByProviderIDAndModelID = `-- name: DeleteModelByProviderIDAndModelID :exec
DELETE FROM models
WHERE provider_id = $1
@@ -318,27 +294,6 @@ func (q *Queries) GetModelByProviderAndModelID(ctx context.Context, arg GetModel
return i, err
}
const getProviderByClientType = `-- name: GetProviderByClientType :one
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE client_type = $1
`
func (q *Queries) GetProviderByClientType(ctx context.Context, clientType string) (Provider, error) {
row := q.db.QueryRow(ctx, getProviderByClientType, clientType)
var i Provider
err := row.Scan(
&i.ID,
&i.Name,
&i.ClientType,
&i.Icon,
&i.Enable,
&i.Config,
&i.Metadata,
&i.CreatedAt,
&i.UpdatedAt,
)
return i, err
}
const getProviderByID = `-- name: GetProviderByID :one
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE id = $1
`
@@ -420,51 +375,12 @@ func (q *Queries) GetSpeechModelWithProvider(ctx context.Context, id pgtype.UUID
return i, err
}
const getTranscriptionModelWithProvider = `-- name: GetTranscriptionModelWithProvider :one
SELECT
m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at,
p.client_type AS provider_type
FROM models m
JOIN providers p ON p.id = m.provider_id
WHERE m.id = $1
AND m.type = 'transcription'
`
type GetTranscriptionModelWithProviderRow struct {
ID pgtype.UUID `json:"id"`
ModelID string `json:"model_id"`
Name pgtype.Text `json:"name"`
ProviderID pgtype.UUID `json:"provider_id"`
Type string `json:"type"`
Config []byte `json:"config"`
CreatedAt pgtype.Timestamptz `json:"created_at"`
UpdatedAt pgtype.Timestamptz `json:"updated_at"`
ProviderType string `json:"provider_type"`
}
func (q *Queries) GetTranscriptionModelWithProvider(ctx context.Context, id pgtype.UUID) (GetTranscriptionModelWithProviderRow, error) {
row := q.db.QueryRow(ctx, getTranscriptionModelWithProvider, id)
var i GetTranscriptionModelWithProviderRow
err := row.Scan(
&i.ID,
&i.ModelID,
&i.Name,
&i.ProviderID,
&i.Type,
&i.Config,
&i.CreatedAt,
&i.UpdatedAt,
&i.ProviderType,
)
return i, err
}
const listEnabledModels = `-- name: ListEnabledModels :many
SELECT m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at
FROM models m
JOIN providers p ON m.provider_id = p.id
WHERE p.enable = true
AND m.type NOT IN ('speech', 'transcription')
AND m.type != 'speech'
ORDER BY m.created_at DESC
`
@@ -609,7 +525,7 @@ func (q *Queries) ListModelVariantsByModelUUID(ctx context.Context, modelUuid pg
const listModels = `-- name: ListModels :many
SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
WHERE type NOT IN ('speech', 'transcription')
WHERE type != 'speech'
ORDER BY created_at DESC
`
@@ -717,7 +633,7 @@ func (q *Queries) ListModelsByProviderClientType(ctx context.Context, clientType
const listModelsByProviderID = `-- name: ListModelsByProviderID :many
SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
WHERE provider_id = $1
AND type NOT IN ('speech', 'transcription')
AND type != 'speech'
ORDER BY created_at DESC
`
@@ -831,19 +747,13 @@ SELECT id, name, client_type, icon, enable, config, metadata, created_at, update
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
)
ORDER BY created_at DESC
`
@@ -1011,135 +921,6 @@ func (q *Queries) ListSpeechProviders(ctx context.Context) ([]Provider, error) {
return items, nil
}
const listTranscriptionModels = `-- name: ListTranscriptionModels :many
SELECT m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at,
p.client_type AS provider_type
FROM models m
JOIN providers p ON p.id = m.provider_id
WHERE m.type = 'transcription'
ORDER BY m.created_at DESC
`
type ListTranscriptionModelsRow struct {
ID pgtype.UUID `json:"id"`
ModelID string `json:"model_id"`
Name pgtype.Text `json:"name"`
ProviderID pgtype.UUID `json:"provider_id"`
Type string `json:"type"`
Config []byte `json:"config"`
CreatedAt pgtype.Timestamptz `json:"created_at"`
UpdatedAt pgtype.Timestamptz `json:"updated_at"`
ProviderType string `json:"provider_type"`
}
func (q *Queries) ListTranscriptionModels(ctx context.Context) ([]ListTranscriptionModelsRow, error) {
rows, err := q.db.Query(ctx, listTranscriptionModels)
if err != nil {
return nil, err
}
defer rows.Close()
var items []ListTranscriptionModelsRow
for rows.Next() {
var i ListTranscriptionModelsRow
if err := rows.Scan(
&i.ID,
&i.ModelID,
&i.Name,
&i.ProviderID,
&i.Type,
&i.Config,
&i.CreatedAt,
&i.UpdatedAt,
&i.ProviderType,
); err != nil {
return nil, err
}
items = append(items, i)
}
if err := rows.Err(); err != nil {
return nil, err
}
return items, nil
}
const listTranscriptionModelsByProviderID = `-- name: ListTranscriptionModelsByProviderID :many
SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
WHERE provider_id = $1
AND type = 'transcription'
ORDER BY created_at DESC
`
func (q *Queries) ListTranscriptionModelsByProviderID(ctx context.Context, providerID pgtype.UUID) ([]Model, error) {
rows, err := q.db.Query(ctx, listTranscriptionModelsByProviderID, providerID)
if err != nil {
return nil, err
}
defer rows.Close()
var items []Model
for rows.Next() {
var i Model
if err := rows.Scan(
&i.ID,
&i.ModelID,
&i.Name,
&i.ProviderID,
&i.Type,
&i.Config,
&i.CreatedAt,
&i.UpdatedAt,
); err != nil {
return nil, err
}
items = append(items, i)
}
if err := rows.Err(); err != nil {
return nil, err
}
return items, nil
}
const listTranscriptionProviders = `-- name: ListTranscriptionProviders :many
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers
WHERE client_type IN (
'openai-transcription',
'openrouter-transcription',
'elevenlabs-transcription',
'deepgram-transcription',
'google-transcription'
)
ORDER BY created_at DESC
`
func (q *Queries) ListTranscriptionProviders(ctx context.Context) ([]Provider, error) {
rows, err := q.db.Query(ctx, listTranscriptionProviders)
if err != nil {
return nil, err
}
defer rows.Close()
var items []Provider
for rows.Next() {
var i Provider
if err := rows.Scan(
&i.ID,
&i.Name,
&i.ClientType,
&i.Icon,
&i.Enable,
&i.Config,
&i.Metadata,
&i.CreatedAt,
&i.UpdatedAt,
); err != nil {
return nil, err
}
items = append(items, i)
}
if err := rows.Err(); err != nil {
return nil, err
}
return items, nil
}
const updateModel = `-- name: UpdateModel :one
UPDATE models
SET
@@ -1281,6 +1062,11 @@ VALUES ($1, $2, $3, false, $4, '{}')
ON CONFLICT (name) DO UPDATE SET
icon = EXCLUDED.icon,
client_type = EXCLUDED.client_type,
config = CASE
WHEN providers.config->>'api_key' IS NOT NULL AND providers.config->>'api_key' != ''
THEN jsonb_set(EXCLUDED.config, '{api_key}', providers.config->'api_key')
ELSE EXCLUDED.config
END,
updated_at = now()
RETURNING id, name, client_type, icon, enable, config, metadata, created_at, updated_at
`
+4 -16
View File
@@ -30,7 +30,6 @@ SET language = 'auto',
search_provider_id = NULL,
memory_provider_id = NULL,
tts_model_id = NULL,
transcription_model_id = NULL,
browser_context_id = NULL,
persist_full_tool_results = false,
updated_at = now()
@@ -63,7 +62,6 @@ SELECT
memory_providers.id AS memory_provider_id,
image_models.id AS image_model_id,
tts_models.id AS tts_model_id,
transcription_models.id AS transcription_model_id,
browser_contexts.id AS browser_context_id,
bots.persist_full_tool_results
FROM bots
@@ -75,7 +73,6 @@ LEFT JOIN models AS image_models ON image_models.id = bots.image_model_id
LEFT JOIN search_providers ON search_providers.id = bots.search_provider_id
LEFT JOIN memory_providers ON memory_providers.id = bots.memory_provider_id
LEFT JOIN models AS tts_models ON tts_models.id = bots.tts_model_id
LEFT JOIN models AS transcription_models ON transcription_models.id = bots.transcription_model_id
LEFT JOIN browser_contexts ON browser_contexts.id = bots.browser_context_id
WHERE bots.id = $1
`
@@ -100,7 +97,6 @@ type GetSettingsByBotIDRow struct {
MemoryProviderID pgtype.UUID `json:"memory_provider_id"`
ImageModelID pgtype.UUID `json:"image_model_id"`
TtsModelID pgtype.UUID `json:"tts_model_id"`
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
BrowserContextID pgtype.UUID `json:"browser_context_id"`
PersistFullToolResults bool `json:"persist_full_tool_results"`
}
@@ -128,7 +124,6 @@ func (q *Queries) GetSettingsByBotID(ctx context.Context, id pgtype.UUID) (GetSe
&i.MemoryProviderID,
&i.ImageModelID,
&i.TtsModelID,
&i.TranscriptionModelID,
&i.BrowserContextID,
&i.PersistFullToolResults,
)
@@ -156,12 +151,11 @@ WITH updated AS (
memory_provider_id = COALESCE($16::uuid, bots.memory_provider_id),
image_model_id = COALESCE($17::uuid, bots.image_model_id),
tts_model_id = COALESCE($18::uuid, bots.tts_model_id),
transcription_model_id = COALESCE($19::uuid, bots.transcription_model_id),
browser_context_id = COALESCE($20::uuid, bots.browser_context_id),
persist_full_tool_results = $21,
browser_context_id = COALESCE($19::uuid, bots.browser_context_id),
persist_full_tool_results = $20,
updated_at = now()
WHERE bots.id = $22
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.transcription_model_id, bots.browser_context_id, bots.persist_full_tool_results
WHERE bots.id = $21
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.browser_context_id, bots.persist_full_tool_results
)
SELECT
updated.id AS bot_id,
@@ -183,7 +177,6 @@ SELECT
memory_providers.id AS memory_provider_id,
image_models.id AS image_model_id,
tts_models.id AS tts_model_id,
transcription_models.id AS transcription_model_id,
browser_contexts.id AS browser_context_id,
updated.persist_full_tool_results
FROM updated
@@ -195,7 +188,6 @@ LEFT JOIN models AS image_models ON image_models.id = updated.image_model_id
LEFT JOIN search_providers ON search_providers.id = updated.search_provider_id
LEFT JOIN memory_providers ON memory_providers.id = updated.memory_provider_id
LEFT JOIN models AS tts_models ON tts_models.id = updated.tts_model_id
LEFT JOIN models AS transcription_models ON transcription_models.id = updated.transcription_model_id
LEFT JOIN browser_contexts ON browser_contexts.id = updated.browser_context_id
`
@@ -218,7 +210,6 @@ type UpsertBotSettingsParams struct {
MemoryProviderID pgtype.UUID `json:"memory_provider_id"`
ImageModelID pgtype.UUID `json:"image_model_id"`
TtsModelID pgtype.UUID `json:"tts_model_id"`
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
BrowserContextID pgtype.UUID `json:"browser_context_id"`
PersistFullToolResults bool `json:"persist_full_tool_results"`
ID pgtype.UUID `json:"id"`
@@ -244,7 +235,6 @@ type UpsertBotSettingsRow struct {
MemoryProviderID pgtype.UUID `json:"memory_provider_id"`
ImageModelID pgtype.UUID `json:"image_model_id"`
TtsModelID pgtype.UUID `json:"tts_model_id"`
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
BrowserContextID pgtype.UUID `json:"browser_context_id"`
PersistFullToolResults bool `json:"persist_full_tool_results"`
}
@@ -269,7 +259,6 @@ func (q *Queries) UpsertBotSettings(ctx context.Context, arg UpsertBotSettingsPa
arg.MemoryProviderID,
arg.ImageModelID,
arg.TtsModelID,
arg.TranscriptionModelID,
arg.BrowserContextID,
arg.PersistFullToolResults,
arg.ID,
@@ -295,7 +284,6 @@ func (q *Queries) UpsertBotSettings(ctx context.Context, arg UpsertBotSettingsPa
&i.MemoryProviderID,
&i.ImageModelID,
&i.TtsModelID,
&i.TranscriptionModelID,
&i.BrowserContextID,
&i.PersistFullToolResults,
)
+13 -13
View File
@@ -7,28 +7,28 @@ import (
"github.com/labstack/echo/v4"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/settings"
"github.com/memohai/memoh/internal/tts"
)
// BotAudioHandler handles per-bot speech synthesis requests from the agent tool.
type BotAudioHandler struct {
audioService *audiopkg.Service
// BotTtsHandler handles per-bot TTS synthesis requests from the agent tool.
type BotTtsHandler struct {
ttsService *tts.Service
settingsService *settings.Service
tempStore *audiopkg.TempStore
tempStore *tts.TempStore
logger *slog.Logger
}
func NewBotAudioHandler(log *slog.Logger, audioService *audiopkg.Service, settingsService *settings.Service, tempStore *audiopkg.TempStore) *BotAudioHandler {
return &BotAudioHandler{
audioService: audioService,
func NewBotTtsHandler(log *slog.Logger, ttsService *tts.Service, settingsService *settings.Service, tempStore *tts.TempStore) *BotTtsHandler {
return &BotTtsHandler{
ttsService: ttsService,
settingsService: settingsService,
tempStore: tempStore,
logger: log.With(slog.String("handler", "bot_audio")),
logger: log.With(slog.String("handler", "bot_tts")),
}
}
func (h *BotAudioHandler) Register(e *echo.Echo) {
func (h *BotTtsHandler) Register(e *echo.Echo) {
e.POST("/bots/:bot_id/tts/synthesize", h.Synthesize)
}
@@ -54,7 +54,7 @@ type synthesizeResponse struct {
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /bots/{bot_id}/tts/synthesize [post].
func (h *BotAudioHandler) Synthesize(c echo.Context) error {
func (h *BotTtsHandler) Synthesize(c echo.Context) error {
botID := strings.TrimSpace(c.Param("bot_id"))
if botID == "" {
return echo.NewHTTPError(http.StatusBadRequest, "bot_id is required")
@@ -88,10 +88,10 @@ func (h *BotAudioHandler) Synthesize(c echo.Context) error {
return echo.NewHTTPError(http.StatusInternalServerError, "failed to create temp file")
}
contentType, streamErr := h.audioService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
contentType, streamErr := h.ttsService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
closeErr := f.Close()
if streamErr != nil {
h.logger.Error("speech synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
h.logger.Error("tts synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
h.tempStore.Delete(tempID)
return echo.NewHTTPError(http.StatusInternalServerError, streamErr.Error())
}
+24 -24
View File
@@ -30,30 +30,30 @@ import (
messagepkg "github.com/memohai/memoh/internal/message"
)
// localSpeechSynthesizer synthesizes text to speech audio.
type localSpeechSynthesizer interface {
// localTtsSynthesizer synthesizes text to speech audio.
type localTtsSynthesizer interface {
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
}
// localSpeechModelResolver resolves speech model IDs for bots.
type localSpeechModelResolver interface {
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
// localTtsModelResolver resolves TTS model IDs for bots.
type localTtsModelResolver interface {
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
}
// LocalChannelHandler handles local channel routes (WebUI / API) backed by bot history.
type LocalChannelHandler struct {
channelType channel.ChannelType
channelManager *channel.Manager
channelStore *channel.Store
chatService *conversation.Service
routeHub *local.RouteHub
botService *bots.Service
accountService *accounts.Service
resolver *flow.Resolver
mediaService *media.Service
speechService localSpeechSynthesizer
speechModelResolver localSpeechModelResolver
logger *slog.Logger
channelType channel.ChannelType
channelManager *channel.Manager
channelStore *channel.Store
chatService *conversation.Service
routeHub *local.RouteHub
botService *bots.Service
accountService *accounts.Service
resolver *flow.Resolver
mediaService *media.Service
ttsService localTtsSynthesizer
ttsModelResolver localTtsModelResolver
logger *slog.Logger
}
// NewLocalChannelHandler creates a local channel handler.
@@ -80,10 +80,10 @@ func (h *LocalChannelHandler) SetMediaService(svc *media.Service) {
h.mediaService = svc
}
// SetSpeechService configures speech synthesis for handling speech_delta events.
func (h *LocalChannelHandler) SetSpeechService(synth localSpeechSynthesizer, resolver localSpeechModelResolver) {
h.speechService = synth
h.speechModelResolver = resolver
// SetTtsService configures TTS synthesis for handling speech_delta events.
func (h *LocalChannelHandler) SetTtsService(synth localTtsSynthesizer, resolver localTtsModelResolver) {
h.ttsService = synth
h.ttsModelResolver = resolver
}
// Register registers the local channel routes.
@@ -719,12 +719,12 @@ func (h *LocalChannelHandler) ingestSingleAttachment(ctx context.Context, botID,
// wsSynthesizeSpeech handles speech_delta events by synthesizing audio and
// injecting attachment_delta events with the resulting voice attachments.
func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID string, original json.RawMessage) []json.RawMessage {
if h.speechService == nil || h.speechModelResolver == nil {
if h.ttsService == nil || h.ttsModelResolver == nil {
h.logger.Warn("speech_delta received but TTS service not configured")
return nil
}
modelID, err := h.speechModelResolver.ResolveSpeechModelID(ctx, botID)
modelID, err := h.ttsModelResolver.ResolveTtsModelID(ctx, botID)
if err != nil || strings.TrimSpace(modelID) == "" {
h.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
return nil
@@ -746,7 +746,7 @@ func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID stri
continue
}
audioData, contentType, synthErr := h.speechService.Synthesize(ctx, modelID, text, nil)
audioData, contentType, synthErr := h.ttsService.Synthesize(ctx, modelID, text, nil)
if synthErr != nil {
h.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
continue
+29 -327
View File
@@ -1,83 +1,55 @@
package handlers
import (
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"mime/multipart"
"net/http"
"strings"
"github.com/labstack/echo/v4"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/models"
"github.com/memohai/memoh/internal/tts"
)
type AudioHandler struct {
service *audiopkg.Service
type SpeechHandler struct {
service *tts.Service
modelsService *models.Service
logger *slog.Logger
}
func NewAudioHandler(log *slog.Logger, service *audiopkg.Service, modelsService *models.Service) *AudioHandler {
return &AudioHandler{
func NewSpeechHandler(log *slog.Logger, service *tts.Service, modelsService *models.Service) *SpeechHandler {
return &SpeechHandler{
service: service,
modelsService: modelsService,
logger: log.With(slog.String("handler", "audio")),
logger: log.With(slog.String("handler", "speech")),
}
}
func (h *AudioHandler) Register(e *echo.Echo) {
func (h *SpeechHandler) Register(e *echo.Echo) {
pg := e.Group("/speech-providers")
pg.GET("", h.ListProviders)
pg.GET("/:id", h.GetProvider)
pg.GET("/meta", h.ListSpeechMeta)
pg.GET("/meta", h.ListMeta)
pg.GET("/:id/models", h.ListModelsByProvider)
pg.POST("/:id/import-models", h.ImportModels)
tpg := e.Group("/transcription-providers")
tpg.GET("", h.ListTranscriptionProviders)
tpg.GET("/meta", h.ListTranscriptionMeta)
tpg.GET("/:id", h.GetProvider)
tpg.GET("/:id/models", h.ListTranscriptionModelsByProvider)
tpg.POST("/:id/import-models", h.ImportTranscriptionModels)
mg := e.Group("/speech-models")
mg.GET("", h.ListModels)
mg.GET("/:id", h.GetModel)
mg.PUT("/:id", h.UpdateModel)
mg.GET("/:id/capabilities", h.GetModelCapabilities)
mg.POST("/:id/test", h.TestModel)
tg := e.Group("/transcription-models")
tg.GET("", h.ListTranscriptionModels)
tg.GET("/:id", h.GetTranscriptionModel)
tg.PUT("/:id", h.UpdateTranscriptionModel)
tg.GET("/:id/capabilities", h.GetTranscriptionModelCapabilities)
tg.POST("/:id/test", h.TestTranscriptionModel)
}
// ListMeta godoc
// @Summary List speech provider metadata
// @Description List available speech provider types with their models and capabilities
// @Tags speech-providers
// @Success 200 {array} audiopkg.ProviderMetaResponse
// @Success 200 {array} tts.ProviderMetaResponse
// @Router /speech-providers/meta [get].
func (h *AudioHandler) ListSpeechMeta(c echo.Context) error {
return c.JSON(http.StatusOK, h.service.ListSpeechMeta(c.Request().Context()))
}
// ListTranscriptionMeta godoc
// @Summary List transcription provider metadata
// @Description List available transcription provider types with their models and capabilities
// @Tags transcription-providers
// @Success 200 {array} audiopkg.ProviderMetaResponse
// @Router /transcription-providers/meta [get].
func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
return c.JSON(http.StatusOK, h.service.ListTranscriptionMeta(c.Request().Context()))
func (h *SpeechHandler) ListMeta(c echo.Context) error {
return c.JSON(http.StatusOK, h.service.ListMeta(c.Request().Context()))
}
// ListProviders godoc
@@ -85,10 +57,10 @@ func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
// @Description List providers that support speech (filtered view of unified providers table)
// @Tags speech-providers
// @Produce json
// @Success 200 {array} audiopkg.SpeechProviderResponse
// @Success 200 {array} tts.SpeechProviderResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers [get].
func (h *AudioHandler) ListProviders(c echo.Context) error {
func (h *SpeechHandler) ListProviders(c echo.Context) error {
items, err := h.service.ListSpeechProviders(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -96,34 +68,17 @@ func (h *AudioHandler) ListProviders(c echo.Context) error {
return c.JSON(http.StatusOK, items)
}
// ListTranscriptionProviders godoc
// @Summary List transcription providers
// @Description List providers that support transcription (filtered view of unified providers table)
// @Tags transcription-providers
// @Produce json
// @Success 200 {array} audiopkg.SpeechProviderResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-providers [get].
func (h *AudioHandler) ListTranscriptionProviders(c echo.Context) error {
items, err := h.service.ListTranscriptionProviders(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, items)
}
// GetProvider godoc
// @Summary Get speech provider
// @Description Get a speech provider with masked config values
// @Tags speech-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} audiopkg.SpeechProviderResponse
// @Success 200 {object} tts.SpeechProviderResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Router /speech-providers/{id} [get].
// @Router /transcription-providers/{id} [get].
func (h *AudioHandler) GetProvider(c echo.Context) error {
func (h *SpeechHandler) GetProvider(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -141,11 +96,11 @@ func (h *AudioHandler) GetProvider(c echo.Context) error {
// @Tags speech-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {array} audiopkg.SpeechModelResponse
// @Success 200 {array} tts.SpeechModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers/{id}/models [get].
func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -164,12 +119,12 @@ func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
// @Accept json
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} audiopkg.ImportModelsResponse
// @Success 200 {object} tts.ImportModelsResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers/{id}/import-models [post].
func (h *AudioHandler) ImportModels(c echo.Context) error {
func (h *SpeechHandler) ImportModels(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -180,7 +135,7 @@ func (h *AudioHandler) ImportModels(c echo.Context) error {
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote speech models: %v", err))
}
resp := audiopkg.ImportModelsResponse{
resp := tts.ImportModelsResponse{
Models: make([]string, 0, len(remoteModels)),
}
@@ -212,92 +167,15 @@ func (h *AudioHandler) ImportModels(c echo.Context) error {
return c.JSON(http.StatusOK, resp)
}
// ListTranscriptionModelsByProvider godoc
// @Summary List transcription models by provider
// @Description List models of type 'transcription' for a specific transcription provider
// @Tags transcription-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {array} audiopkg.TranscriptionModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-providers/{id}/models [get].
func (h *AudioHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
items, err := h.service.ListTranscriptionModelsByProvider(c.Request().Context(), id)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, items)
}
// ImportTranscriptionModels godoc
// @Summary Import transcription models from provider
// @Description Fetch models using the configured transcription provider and import them into the unified models table
// @Tags transcription-providers
// @Accept json
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} audiopkg.ImportModelsResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-providers/{id}/import-models [post].
func (h *AudioHandler) ImportTranscriptionModels(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
remoteModels, err := h.service.FetchRemoteTranscriptionModels(c.Request().Context(), id)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote transcription models: %v", err))
}
resp := audiopkg.ImportModelsResponse{
Models: make([]string, 0, len(remoteModels)),
}
for _, model := range remoteModels {
name := strings.TrimSpace(model.Name)
if name == "" {
name = model.ID
}
_, err := h.modelsService.Create(c.Request().Context(), models.AddRequest{
ModelID: model.ID,
Name: name,
ProviderID: id,
Type: models.ModelTypeTranscription,
Config: models.ModelConfig{},
})
if err != nil {
if errors.Is(err, models.ErrModelIDAlreadyExists) {
resp.Skipped++
continue
}
h.logger.Warn("failed to import transcription model", slog.String("model_id", model.ID), slog.Any("error", err))
continue
}
resp.Created++
resp.Models = append(resp.Models, model.ID)
}
return c.JSON(http.StatusOK, resp)
}
// ListModels godoc
// @Summary List all speech models
// @Description List all models of type 'speech' (filtered view of unified models table)
// @Tags speech-models
// @Produce json
// @Success 200 {array} audiopkg.SpeechModelResponse
// @Success 200 {array} tts.SpeechModelResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-models [get].
func (h *AudioHandler) ListModels(c echo.Context) error {
func (h *SpeechHandler) ListModels(c echo.Context) error {
items, err := h.service.ListSpeechModels(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -305,31 +183,15 @@ func (h *AudioHandler) ListModels(c echo.Context) error {
return c.JSON(http.StatusOK, items)
}
// ListTranscriptionModels godoc
// @Summary List all transcription models
// @Description List all models of type 'transcription' (filtered view of unified models table)
// @Tags transcription-models
// @Produce json
// @Success 200 {array} audiopkg.TranscriptionModelResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-models [get].
func (h *AudioHandler) ListTranscriptionModels(c echo.Context) error {
items, err := h.service.ListTranscriptionModels(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, items)
}
// GetModel godoc
// @Summary Get a speech model
// @Tags speech-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.SpeechModelResponse
// @Success 200 {object} tts.SpeechModelResponse
// @Failure 404 {object} ErrorResponse
// @Router /speech-models/{id} [get].
func (h *AudioHandler) GetModel(c echo.Context) error {
func (h *SpeechHandler) GetModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -341,89 +203,15 @@ func (h *AudioHandler) GetModel(c echo.Context) error {
return c.JSON(http.StatusOK, resp)
}
// UpdateModel godoc
// @Summary Update a speech model
// @Tags speech-models
// @Accept json
// @Produce json
// @Param id path string true "Model ID"
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
// @Success 200 {object} audiopkg.SpeechModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-models/{id} [put].
func (h *AudioHandler) UpdateModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
var req audiopkg.UpdateSpeechModelRequest
if err := c.Bind(&req); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
resp, err := h.service.UpdateSpeechModel(c.Request().Context(), id, req)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, resp)
}
// GetTranscriptionModel godoc
// @Summary Get a transcription model
// @Tags transcription-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.TranscriptionModelResponse
// @Failure 404 {object} ErrorResponse
// @Router /transcription-models/{id} [get].
func (h *AudioHandler) GetTranscriptionModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
resp, err := h.service.GetTranscriptionModel(c.Request().Context(), id)
if err != nil {
return echo.NewHTTPError(http.StatusNotFound, err.Error())
}
return c.JSON(http.StatusOK, resp)
}
// UpdateTranscriptionModel godoc
// @Summary Update a transcription model
// @Tags transcription-models
// @Accept json
// @Produce json
// @Param id path string true "Model ID"
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
// @Success 200 {object} audiopkg.TranscriptionModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-models/{id} [put].
func (h *AudioHandler) UpdateTranscriptionModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
var req audiopkg.UpdateSpeechModelRequest
if err := c.Bind(&req); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
resp, err := h.service.UpdateTranscriptionModel(c.Request().Context(), id, req)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, resp)
}
// GetModelCapabilities godoc
// @Summary Get speech model capabilities
// @Tags speech-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.ModelCapabilities
// @Success 200 {object} tts.ModelCapabilities
// @Failure 404 {object} ErrorResponse
// @Router /speech-models/{id}/capabilities [get].
func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -435,26 +223,6 @@ func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
return c.JSON(http.StatusOK, caps)
}
// GetTranscriptionModelCapabilities godoc
// @Summary Get transcription model capabilities
// @Tags transcription-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.ModelCapabilities
// @Failure 404 {object} ErrorResponse
// @Router /transcription-models/{id}/capabilities [get].
func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
caps, err := h.service.GetTranscriptionModelCapabilities(c.Request().Context(), id)
if err != nil {
return echo.NewHTTPError(http.StatusNotFound, err.Error())
}
return c.JSON(http.StatusOK, caps)
}
// TestModel godoc
// @Summary Test speech model synthesis
// @Description Synthesize text using a specific model's config and return audio
@@ -462,17 +230,17 @@ func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
// @Accept json
// @Produce application/octet-stream
// @Param id path string true "Model ID"
// @Param request body audiopkg.TestSynthesizeRequest true "Text to synthesize"
// @Param request body tts.TestSynthesizeRequest true "Text to synthesize"
// @Success 200 {file} binary "Audio data"
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-models/{id}/test [post].
func (h *AudioHandler) TestModel(c echo.Context) error {
func (h *SpeechHandler) TestModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
var req audiopkg.TestSynthesizeRequest
var req tts.TestSynthesizeRequest
if err := c.Bind(&req); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
@@ -490,69 +258,3 @@ func (h *AudioHandler) TestModel(c echo.Context) error {
}
return c.Blob(http.StatusOK, contentType, audio)
}
// TestTranscriptionModel godoc
// @Summary Test transcription model recognition
// @Description Transcribe uploaded audio using a specific model's config and return structured text output
// @Tags transcription-models
// @Accept mpfd
// @Produce json
// @Param id path string true "Model ID"
// @Param file formData file true "Audio file"
// @Param config formData string false "Optional JSON config"
// @Success 200 {object} audiopkg.TestTranscriptionResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-models/{id}/test [post].
func (h *AudioHandler) TestTranscriptionModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
file, err := c.FormFile("file")
if err != nil {
return echo.NewHTTPError(http.StatusBadRequest, "file is required")
}
src, err := file.Open()
if err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
defer func(src multipart.File) {
err := src.Close()
if err != nil {
h.logger.Warn("failed to close uploaded file", slog.Any("error", err))
}
}(src)
audio, err := io.ReadAll(src)
if err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
var cfg map[string]any
if raw := strings.TrimSpace(c.FormValue("config")); raw != "" {
if err := json.Unmarshal([]byte(raw), &cfg); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, "invalid config")
}
}
result, err := h.service.Transcribe(c.Request().Context(), id, audio, file.Filename, file.Header.Get("Content-Type"), cfg)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
resp := audiopkg.TestTranscriptionResponse{
Text: result.Text,
Language: result.Language,
DurationSeconds: result.DurationSeconds,
Metadata: result.ProviderMetadata,
}
if len(result.Words) > 0 {
resp.Words = make([]audiopkg.TranscriptionWord, 0, len(result.Words))
for _, word := range result.Words {
resp.Words = append(resp.Words, audiopkg.TranscriptionWord{
Text: word.Text,
Start: word.Start,
End: word.End,
SpeakerID: word.SpeakerID,
})
}
}
return c.JSON(http.StatusOK, resp)
}
+7 -15
View File
@@ -126,9 +126,9 @@ func (s *Service) List(ctx context.Context) ([]GetResponse, error) {
return s.convertToGetResponseList(dbModels), nil
}
// ListByType returns models filtered by type.
// ListByType returns models filtered by type (chat, embedding, or speech).
func (s *Service) ListByType(ctx context.Context, modelType ModelType) ([]GetResponse, error) {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
return nil, fmt.Errorf("invalid model type: %s", modelType)
}
@@ -165,7 +165,7 @@ func (s *Service) ListEnabled(ctx context.Context) ([]GetResponse, error) {
// ListEnabledByType returns models from enabled providers filtered by type.
func (s *Service) ListEnabledByType(ctx context.Context, modelType ModelType) ([]GetResponse, error) {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
return nil, fmt.Errorf("invalid model type: %s", modelType)
}
dbModels, err := s.queries.ListEnabledModelsByType(ctx, string(modelType))
@@ -206,7 +206,7 @@ func (s *Service) ListByProviderID(ctx context.Context, providerID string) ([]Ge
// ListByProviderIDAndType returns models filtered by provider ID and type.
func (s *Service) ListByProviderIDAndType(ctx context.Context, providerID string, modelType ModelType) ([]GetResponse, error) {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
return nil, fmt.Errorf("invalid model type: %s", modelType)
}
if strings.TrimSpace(providerID) == "" {
@@ -361,7 +361,7 @@ func (s *Service) Count(ctx context.Context) (int64, error) {
// CountByType returns the number of models of a specific type.
func (s *Service) CountByType(ctx context.Context, modelType ModelType) (int64, error) {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
return 0, fmt.Errorf("invalid model type: %s", modelType)
}
@@ -432,19 +432,13 @@ func IsValidClientType(clientType ClientType) bool {
ClientTypeGitHubCopilot,
ClientTypeEdgeSpeech,
ClientTypeOpenAISpeech,
ClientTypeOpenAITranscription,
ClientTypeOpenRouterSpeech,
ClientTypeOpenRouterTranscription,
ClientTypeElevenLabsSpeech,
ClientTypeElevenLabsTranscription,
ClientTypeDeepgramSpeech,
ClientTypeDeepgramTranscription,
ClientTypeMiniMaxSpeech,
ClientTypeVolcengineSpeech,
ClientTypeAlibabaSpeech,
ClientTypeMicrosoftSpeech,
ClientTypeGoogleSpeech,
ClientTypeGoogleTranscription:
ClientTypeMicrosoftSpeech:
return true
default:
return false
@@ -454,9 +448,7 @@ func IsValidClientType(clientType ClientType) bool {
// IsLLMClientType returns true if the client type belongs to the LLM domain
// (chat/embedding), excluding speech-only types (any type ending in "-speech").
func IsLLMClientType(clientType ClientType) bool {
return IsValidClientType(clientType) &&
!strings.HasSuffix(string(clientType), "-speech") &&
!strings.HasSuffix(string(clientType), "-transcription")
return IsValidClientType(clientType) && !strings.HasSuffix(string(clientType), "-speech")
}
// SelectMemoryModel selects a chat model for memory operations.
+19 -26
View File
@@ -9,36 +9,29 @@ import (
type ModelType string
const (
ModelTypeChat ModelType = "chat"
ModelTypeEmbedding ModelType = "embedding"
ModelTypeSpeech ModelType = "speech"
ModelTypeTranscription ModelType = "transcription"
ModelTypeChat ModelType = "chat"
ModelTypeEmbedding ModelType = "embedding"
ModelTypeSpeech ModelType = "speech"
)
type ClientType string
const (
ClientTypeOpenAIResponses ClientType = "openai-responses"
ClientTypeOpenAICompletions ClientType = "openai-completions"
ClientTypeAnthropicMessages ClientType = "anthropic-messages"
ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai"
ClientTypeOpenAICodex ClientType = "openai-codex"
ClientTypeGitHubCopilot ClientType = "github-copilot"
ClientTypeEdgeSpeech ClientType = "edge-speech"
ClientTypeOpenAISpeech ClientType = "openai-speech"
ClientTypeOpenAITranscription ClientType = "openai-transcription"
ClientTypeOpenRouterSpeech ClientType = "openrouter-speech"
ClientTypeOpenRouterTranscription ClientType = "openrouter-transcription"
ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech"
ClientTypeElevenLabsTranscription ClientType = "elevenlabs-transcription"
ClientTypeDeepgramSpeech ClientType = "deepgram-speech"
ClientTypeDeepgramTranscription ClientType = "deepgram-transcription"
ClientTypeMiniMaxSpeech ClientType = "minimax-speech"
ClientTypeVolcengineSpeech ClientType = "volcengine-speech"
ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech"
ClientTypeMicrosoftSpeech ClientType = "microsoft-speech"
ClientTypeGoogleSpeech ClientType = "google-speech"
ClientTypeGoogleTranscription ClientType = "google-transcription"
ClientTypeOpenAIResponses ClientType = "openai-responses"
ClientTypeOpenAICompletions ClientType = "openai-completions"
ClientTypeAnthropicMessages ClientType = "anthropic-messages"
ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai"
ClientTypeOpenAICodex ClientType = "openai-codex"
ClientTypeGitHubCopilot ClientType = "github-copilot"
ClientTypeEdgeSpeech ClientType = "edge-speech"
ClientTypeOpenAISpeech ClientType = "openai-speech"
ClientTypeOpenRouterSpeech ClientType = "openrouter-speech"
ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech"
ClientTypeDeepgramSpeech ClientType = "deepgram-speech"
ClientTypeMiniMaxSpeech ClientType = "minimax-speech"
ClientTypeVolcengineSpeech ClientType = "volcengine-speech"
ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech"
ClientTypeMicrosoftSpeech ClientType = "microsoft-speech"
)
const (
@@ -95,7 +88,7 @@ func (m *Model) Validate() error {
if _, err := uuid.Parse(m.ProviderID); err != nil {
return errors.New("provider ID must be a valid UUID")
}
if m.Type != ModelTypeChat && m.Type != ModelTypeEmbedding && m.Type != ModelTypeSpeech && m.Type != ModelTypeTranscription {
if m.Type != ModelTypeChat && m.Type != ModelTypeEmbedding && m.Type != ModelTypeSpeech {
return errors.New("invalid model type")
}
if m.Type == ModelTypeEmbedding {
-15
View File
@@ -175,14 +175,6 @@ func (s *Service) UpsertBot(ctx context.Context, botID string, req UpsertRequest
}
ttsModelUUID = modelID
}
transcriptionModelUUID := pgtype.UUID{}
if value := strings.TrimSpace(req.TranscriptionModelID); value != "" {
modelID, err := db.ParseUUID(value)
if err != nil {
return Settings{}, err
}
transcriptionModelUUID = modelID
}
browserContextUUID := pgtype.UUID{}
if value := strings.TrimSpace(req.BrowserContextID); value != "" {
ctxID, err := db.ParseUUID(value)
@@ -212,7 +204,6 @@ func (s *Service) UpsertBot(ctx context.Context, botID string, req UpsertRequest
SearchProviderID: searchProviderUUID,
MemoryProviderID: memoryProviderUUID,
TtsModelID: ttsModelUUID,
TranscriptionModelID: transcriptionModelUUID,
BrowserContextID: browserContextUUID,
PersistFullToolResults: current.PersistFullToolResults,
})
@@ -307,7 +298,6 @@ func normalizeBotSettingsReadRow(row sqlc.GetSettingsByBotIDRow) Settings {
row.SearchProviderID,
row.MemoryProviderID,
row.TtsModelID,
row.TranscriptionModelID,
row.BrowserContextID,
row.PersistFullToolResults,
)
@@ -332,7 +322,6 @@ func normalizeBotSettingsWriteRow(row sqlc.UpsertBotSettingsRow) Settings {
row.SearchProviderID,
row.MemoryProviderID,
row.TtsModelID,
row.TranscriptionModelID,
row.BrowserContextID,
row.PersistFullToolResults,
)
@@ -356,7 +345,6 @@ func normalizeBotSettingsFields(
searchProviderID pgtype.UUID,
memoryProviderID pgtype.UUID,
ttsModelID pgtype.UUID,
transcriptionModelID pgtype.UUID,
browserContextID pgtype.UUID,
persistFullToolResults bool,
) Settings {
@@ -388,9 +376,6 @@ func normalizeBotSettingsFields(
if ttsModelID.Valid {
settings.TtsModelID = uuid.UUID(ttsModelID.Bytes).String()
}
if transcriptionModelID.Valid {
settings.TranscriptionModelID = uuid.UUID(transcriptionModelID.Bytes).String()
}
if browserContextID.Valid {
settings.BrowserContextID = uuid.UUID(browserContextID.Bytes).String()
}
-2
View File
@@ -12,7 +12,6 @@ type Settings struct {
SearchProviderID string `json:"search_provider_id"`
MemoryProviderID string `json:"memory_provider_id"`
TtsModelID string `json:"tts_model_id"`
TranscriptionModelID string `json:"transcription_model_id"`
BrowserContextID string `json:"browser_context_id"`
Language string `json:"language"`
AclDefaultEffect string `json:"acl_default_effect"`
@@ -37,7 +36,6 @@ type UpsertRequest struct {
SearchProviderID string `json:"search_provider_id,omitempty"`
MemoryProviderID string `json:"memory_provider_id,omitempty"`
TtsModelID string `json:"tts_model_id,omitempty"`
TranscriptionModelID string `json:"transcription_model_id,omitempty"`
BrowserContextID string `json:"browser_context_id,omitempty"`
Language string `json:"language,omitempty"`
AclDefaultEffect string `json:"acl_default_effect,omitempty"`
@@ -1,4 +1,4 @@
package audio
package tts
import "context"
@@ -6,10 +6,10 @@ import (
"log/slog"
"strings"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
const TtsTypeEdge audio.TtsType = "edge"
const TtsTypeEdge tts.TtsType = "edge"
const edgeModelReadAloud = "edge-read-aloud"
@@ -33,12 +33,12 @@ func NewEdgeAdapterWithClient(log *slog.Logger, client *EdgeWsClient) *EdgeAdapt
}
}
func (*EdgeAdapter) Type() audio.TtsType {
func (*EdgeAdapter) Type() tts.TtsType {
return TtsTypeEdge
}
func (*EdgeAdapter) Meta() audio.TtsMeta {
return audio.TtsMeta{
func (*EdgeAdapter) Meta() tts.TtsMeta {
return tts.TtsMeta{
Provider: "Microsoft Edge",
Description: "Microsoft Edge TTS",
}
@@ -54,32 +54,32 @@ var edgeFormats = []string{
"webm-24khz-16bit-mono-opus",
}
var edgeSpeedConstraint = &audio.ParamConstraint{
var edgeSpeedConstraint = &tts.ParamConstraint{
Options: []float64{0.5, 1.0, 2.0, 3.0},
Default: 1.0,
}
var edgePitchConstraint = &audio.ParamConstraint{
var edgePitchConstraint = &tts.ParamConstraint{
Min: -100,
Max: 100,
Default: 0,
}
func (*EdgeAdapter) Models() []audio.ModelInfo {
var voices []audio.VoiceInfo
func (*EdgeAdapter) Models() []tts.ModelInfo {
var voices []tts.VoiceInfo
for lang, ids := range EdgeTTSVoices {
for _, id := range ids {
name := strings.TrimPrefix(id, lang+"-")
name = strings.TrimSuffix(name, "Neural")
voices = append(voices, audio.VoiceInfo{ID: id, Lang: lang, Name: name})
voices = append(voices, tts.VoiceInfo{ID: id, Lang: lang, Name: name})
}
}
return []audio.ModelInfo{
return []tts.ModelInfo{
{
ID: edgeModelReadAloud,
Name: "Edge Read Aloud",
Description: "Built-in Edge Read Aloud speech model",
Capabilities: audio.ModelCapabilities{
Capabilities: tts.ModelCapabilities{
Voices: voices,
Formats: edgeFormats,
Speed: edgeSpeedConstraint,
@@ -100,14 +100,14 @@ func (*EdgeAdapter) ResolveModel(model string) (string, error) {
return edgeModelReadAloud, nil
}
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config audio.AudioConfig) ([]byte, error) {
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config tts.AudioConfig) ([]byte, error) {
if err := config.Validate(); err != nil {
return nil, fmt.Errorf("edge tts: invalid config: %w", err)
}
return a.client.Synthesize(ctx, text, config)
}
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config audio.AudioConfig) (chan []byte, chan error) {
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config tts.AudioConfig) (chan []byte, chan error) {
if err := config.Validate(); err != nil {
errCh := make(chan error, 1)
errCh <- fmt.Errorf("edge tts: invalid config: %w", err)
@@ -8,7 +8,7 @@ import (
"strings"
"testing"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
func TestEdgeAdapter_TypeAndMeta(t *testing.T) {
@@ -37,7 +37,7 @@ func TestEdgeAdapter_Synthesize_WithMockServer(t *testing.T) {
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
ctx := context.Background()
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
audio, err := adapter.Synthesize(ctx, "Hello", edgeModelReadAloud, config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -61,7 +61,7 @@ func TestEdgeAdapter_Stream_WithMockServer(t *testing.T) {
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
ctx := context.Background()
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
ch, errCh := adapter.Stream(ctx, "Hi", edgeModelReadAloud, config)
var chunks [][]byte
for b := range ch {
@@ -86,7 +86,7 @@ func TestEdgeAdapter_Synthesize_NotConnected(t *testing.T) {
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
ctx := context.Background()
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, audio.AudioConfig{})
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, tts.AudioConfig{})
if err == nil {
t.Fatal("expected error when connection fails")
}
@@ -20,7 +20,7 @@ import (
"github.com/google/uuid"
"github.com/gorilla/websocket"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
// Edge TTS WebSocket client.
@@ -184,7 +184,7 @@ func (c *EdgeWsClient) sendFrame(path, contentType, body string, extraHeaders ma
}
// Configure sends the speech.config message (output format, etc.).
func (c *EdgeWsClient) Configure(ctx context.Context, config audio.AudioConfig) error {
func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) error {
c.mu.Lock()
defer c.mu.Unlock()
if c.conn == nil {
@@ -207,7 +207,7 @@ func (c *EdgeWsClient) Configure(ctx context.Context, config audio.AudioConfig)
}
// buildSSML builds SSML with rate and pitch for Edge TTS prosody.
func buildSSML(text string, voice audio.VoiceConfig, speed, pitch float64) string {
func buildSSML(text string, voice tts.VoiceConfig, speed, pitch float64) string {
voiceID := voice.ID
if voiceID == "" {
voiceID = DEFAULT_VOICE
@@ -241,7 +241,7 @@ func escapeSSML(s string) string {
// Synthesize sends SSML and synchronously collects all audio data.
// It handles the full lifecycle: connect → configure → send → receive → close.
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config audio.AudioConfig) ([]byte, error) {
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config tts.AudioConfig) ([]byte, error) {
if err := c.Connect(ctx); err != nil {
return nil, err
}
@@ -338,7 +338,7 @@ func parseAudioChunk(data []byte) ([]byte, error) {
// Stream sends SSML and returns audio chunks via channel.
// It handles the full lifecycle: connect → configure → send → stream → close.
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config audio.AudioConfig) (ch chan []byte, errCh chan error) {
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config tts.AudioConfig) (ch chan []byte, errCh chan error) {
ch = make(chan []byte, 8)
errCh = make(chan error, 1)
go func() {
@@ -9,7 +9,7 @@ import (
"testing"
"time"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
// Real Edge TTS integration tests. Not compiled by default (requires -tags=integration).
@@ -17,14 +17,14 @@ import (
//
// Run:
//
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS -v
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS -v
func TestRealEdgeTTS_Synthesize(t *testing.T) {
client := NewEdgeWsClient()
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
audio, err := client.Synthesize(ctx, "Hello, this is a real Edge TTS test.", config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -40,7 +40,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
ch, errCh := client.Stream(ctx, "你好,这是流式测试。", config)
var total int
for b := range ch {
@@ -57,7 +57,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
// TestRealEdgeTTS_Formats tries every candidate format and reports which ones are supported.
//
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_Formats -v
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_Formats -v
func TestRealEdgeTTS_Formats(t *testing.T) {
formats := []string{
"audio-24khz-48kbitrate-mono-mp3",
@@ -71,8 +71,8 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
config := audio.AudioConfig{
Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
config := tts.AudioConfig{
Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
Format: fmt,
Speed: 1.0,
}
@@ -88,7 +88,7 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
// TestRealEdgeTTS_SaveAudio synthesizes speech and writes the result to a file for manual inspection.
//
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
func TestRealEdgeTTS_SaveAudio(t *testing.T) {
client := NewEdgeWsClient()
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -97,11 +97,11 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
cases := []struct {
name string
text string
voice audio.VoiceConfig
voice tts.VoiceConfig
file string
}{
{"en", "Hello, this is an Edge TTS audio save test.", audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
{"zh", "你好,这是一段中文语音合成测试。", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
{"en", "Hello, this is an Edge TTS audio save test.", tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
{"zh", "你好,这是一段中文语音合成测试。", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
}
outDir := filepath.Join(os.TempDir(), "edge_tts_test")
@@ -111,7 +111,7 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
config := audio.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
config := tts.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
audio, err := client.Synthesize(ctx, tc.text, config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -11,7 +11,7 @@ import (
"github.com/gorilla/websocket"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
var upgrader = websocket.Upgrader{
@@ -95,7 +95,7 @@ func TestEdgeWsClient_ConnectAndSynthesize(t *testing.T) {
client := NewEdgeWsClient()
client.BaseURL = wsURL
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
audio, err := client.Synthesize(t.Context(), "Hello world", config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -114,7 +114,7 @@ func TestEdgeWsClient_Stream(t *testing.T) {
client := NewEdgeWsClient()
client.BaseURL = wsURL
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
ch, errCh := client.Stream(t.Context(), "Hi", config)
var chunks [][]byte
for b := range ch {
@@ -197,7 +197,7 @@ func TestParseAudioChunk_EmptyOrShort(t *testing.T) {
func TestBuildSSML(t *testing.T) {
t.Parallel()
ssml := buildSSML("Hello", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
ssml := buildSSML("Hello", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
if !strings.Contains(ssml, "zh-CN-XiaoxiaoNeural") {
t.Errorf("ssml should contain voice: %s", ssml)
}
+68
View File
@@ -0,0 +1,68 @@
package tts
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"github.com/jackc/pgx/v5/pgtype"
"github.com/memohai/memoh/internal/db/sqlc"
)
// SyncRegistry mirrors the in-memory speech provider registry into the
// database: every registered provider definition is upserted as a provider
// row, and each of its models is either upserted (type "speech") or deleted
// when hidden. Logging is optional; pass a nil logger to stay silent.
// Returns the first error encountered; earlier upserts are not rolled back.
func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
	for _, def := range registry.List() {
		// Provider config is stored as an empty JSON object; per-provider
		// settings are supplied later by users, not by the registry.
		configJSON, err := json.Marshal(map[string]any{})
		if err != nil {
			return fmt.Errorf("marshal speech provider config: %w", err)
		}
		// Only set the icon column when the definition actually has one,
		// so providers without icons keep a SQL NULL rather than "".
		var icon pgtype.Text
		if def.Icon != "" {
			icon = pgtype.Text{String: def.Icon, Valid: true}
		}
		provider, err := queries.UpsertRegistryProvider(ctx, sqlc.UpsertRegistryProviderParams{
			Name:       def.DisplayName,
			ClientType: string(def.ClientType),
			Icon:       icon,
			Config:     configJSON,
		})
		if err != nil {
			return fmt.Errorf("upsert speech provider %s: %w", def.ClientType, err)
		}
		synced := 0
		for _, model := range def.Models {
			// Hidden template models are actively removed so a previously
			// synced row does not linger after the template is hidden.
			// NOTE(review): shouldHideTemplateModel is defined elsewhere in
			// this package — its exact criteria are not visible here.
			if shouldHideTemplateModel(def, model.ID) {
				if err := queries.DeleteModelByProviderIDAndModelID(ctx, sqlc.DeleteModelByProviderIDAndModelIDParams{
					ProviderID: provider.ID,
					ModelID:    model.ID,
				}); err != nil {
					return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
				}
				continue
			}
			// Like the provider, each model starts with an empty JSON config.
			modelConfigJSON, err := json.Marshal(map[string]any{})
			if err != nil {
				return fmt.Errorf("marshal speech model config: %w", err)
			}
			// Empty names map to SQL NULL (Valid=false) instead of "".
			name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
			if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
				ModelID:    model.ID,
				Name:       name,
				ProviderID: provider.ID,
				Type:       "speech",
				Config:     modelConfigJSON,
			}); err != nil {
				return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
			}
			synced++
		}
		// synced counts only the models that were upserted, not the hidden
		// ones that were deleted.
		if logger != nil {
			logger.Info("speech registry synced", slog.String("provider", string(def.ClientType)), slog.Int("models", synced))
		}
	}
	return nil
}
@@ -1,4 +1,4 @@
package audio
package tts
// VoiceConfig is kept for backward compatibility with the legacy Edge adapter tests.
type VoiceConfig struct {
@@ -1,4 +1,4 @@
package audio
package tts
import (
"fmt"
@@ -8,43 +8,31 @@ import (
alibabaspeech "github.com/memohai/twilight-ai/provider/alibabacloud/speech"
deepgramspeech "github.com/memohai/twilight-ai/provider/deepgram/speech"
deepgramtranscription "github.com/memohai/twilight-ai/provider/deepgram/transcription"
edgespeech "github.com/memohai/twilight-ai/provider/edge/speech"
elevenlabsspeech "github.com/memohai/twilight-ai/provider/elevenlabs/speech"
elevenlabstranscription "github.com/memohai/twilight-ai/provider/elevenlabs/transcription"
googletranscription "github.com/memohai/twilight-ai/provider/google/transcription"
microsoftspeech "github.com/memohai/twilight-ai/provider/microsoft/speech"
minimaxspeech "github.com/memohai/twilight-ai/provider/minimax/speech"
openaispeech "github.com/memohai/twilight-ai/provider/openai/speech"
openaitranscription "github.com/memohai/twilight-ai/provider/openai/transcription"
openrouterspeech "github.com/memohai/twilight-ai/provider/openrouter/speech"
openroutertranscription "github.com/memohai/twilight-ai/provider/openrouter/transcription"
volcenginespeech "github.com/memohai/twilight-ai/provider/volcengine/speech"
sdk "github.com/memohai/twilight-ai/sdk"
"github.com/memohai/memoh/internal/models"
)
type (
ProviderFactory func(config map[string]any) (sdk.SpeechProvider, error)
TranscriptionProviderFactory func(config map[string]any) (sdk.TranscriptionProvider, error)
)
type ProviderFactory func(config map[string]any) (sdk.SpeechProvider, error)
type ProviderDefinition struct {
ClientType models.ClientType
DisplayName string
Icon string
Description string
ConfigSchema ConfigSchema
DefaultModel string
SupportsList bool
Models []ModelInfo
Factory ProviderFactory
DefaultTranscriptionModel string
SupportsTranscriptionList bool
TranscriptionModels []ModelInfo
TranscriptionFactory TranscriptionProviderFactory
Order int
ClientType models.ClientType
DisplayName string
Icon string
Description string
ConfigSchema ConfigSchema
DefaultModel string
SupportsList bool
Models []ModelInfo
Factory ProviderFactory
Order int
}
type Registry struct {
@@ -53,60 +41,11 @@ type Registry struct {
ordered []models.ClientType
}
func isTranscriptionClientType(clientType models.ClientType) bool {
switch clientType {
case
models.ClientTypeOpenAITranscription,
models.ClientTypeOpenRouterTranscription,
models.ClientTypeElevenLabsTranscription,
models.ClientTypeDeepgramTranscription,
models.ClientTypeGoogleTranscription:
return true
default:
return false
}
}
func speechToTranscriptionClientType(clientType models.ClientType) models.ClientType {
switch clientType {
case models.ClientTypeOpenAISpeech:
return models.ClientTypeOpenAITranscription
case models.ClientTypeOpenRouterSpeech:
return models.ClientTypeOpenRouterTranscription
case models.ClientTypeElevenLabsSpeech:
return models.ClientTypeElevenLabsTranscription
case models.ClientTypeDeepgramSpeech:
return models.ClientTypeDeepgramTranscription
case models.ClientTypeGoogleSpeech:
return models.ClientTypeGoogleTranscription
default:
return ""
}
}
func transcriptionDisplayName(displayName string) string {
displayName = strings.TrimSpace(displayName)
if displayName == "Google Speech" {
return "Google Transcription"
}
if strings.HasSuffix(displayName, " Speech") {
return strings.TrimSuffix(displayName, " Speech") + " Transcription"
}
return displayName + " Transcription"
}
func NewRegistry() *Registry {
r := &Registry{
providers: make(map[models.ClientType]ProviderDefinition),
}
baseDefs := defaultProviderDefinitions()
for _, def := range baseDefs {
if def.Factory == nil && def.TranscriptionFactory != nil {
continue
}
r.Register(def)
}
for _, def := range transcriptionProviderDefinitions(baseDefs) {
for _, def := range defaultProviderDefinitions() {
r.Register(def)
}
return r
@@ -155,98 +94,17 @@ func (r *Registry) ListMeta() []ProviderMetaResponse {
metas := make([]ProviderMetaResponse, 0, len(defs))
for _, def := range defs {
metas = append(metas, ProviderMetaResponse{
Provider: string(def.ClientType),
DisplayName: def.DisplayName,
Description: def.Description,
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultModel,
Models: def.Models,
DefaultSynthesisModel: def.DefaultModel,
SynthesisModels: def.Models,
SupportsSynthesisList: def.SupportsList,
DefaultTranscriptionModel: def.DefaultTranscriptionModel,
TranscriptionModels: def.TranscriptionModels,
SupportsTranscriptionList: def.SupportsTranscriptionList,
Provider: string(def.ClientType),
DisplayName: def.DisplayName,
Description: def.Description,
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultModel,
Models: def.Models,
})
}
return metas
}
func (r *Registry) ListSpeechMeta() []ProviderMetaResponse {
defs := r.List()
metas := make([]ProviderMetaResponse, 0, len(defs))
for _, def := range defs {
if def.Factory == nil {
continue
}
metas = append(metas, ProviderMetaResponse{
Provider: string(def.ClientType),
DisplayName: def.DisplayName,
Description: def.Description,
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultModel,
Models: def.Models,
DefaultSynthesisModel: def.DefaultModel,
SynthesisModels: def.Models,
SupportsSynthesisList: def.SupportsList,
})
}
return metas
}
func (r *Registry) ListTranscriptionMeta() []ProviderMetaResponse {
defs := r.List()
metas := make([]ProviderMetaResponse, 0, len(defs))
for _, def := range defs {
if def.TranscriptionFactory == nil || !isTranscriptionClientType(def.ClientType) {
continue
}
modelsList := def.TranscriptionModels
if len(modelsList) == 0 {
modelsList = def.Models
}
metas = append(metas, ProviderMetaResponse{
Provider: string(def.ClientType),
DisplayName: def.DisplayName,
Description: def.Description,
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultTranscriptionModel,
Models: modelsList,
DefaultTranscriptionModel: def.DefaultTranscriptionModel,
TranscriptionModels: modelsList,
SupportsTranscriptionList: def.SupportsTranscriptionList,
})
}
return metas
}
func transcriptionProviderDefinitions(base []ProviderDefinition) []ProviderDefinition {
out := make([]ProviderDefinition, 0, len(base))
for _, def := range base {
clientType := speechToTranscriptionClientType(def.ClientType)
if clientType == "" || def.TranscriptionFactory == nil {
continue
}
modelsList := def.TranscriptionModels
out = append(out, ProviderDefinition{
ClientType: clientType,
DisplayName: transcriptionDisplayName(def.DisplayName),
Icon: def.Icon,
Description: strings.TrimSpace(def.Description),
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultTranscriptionModel,
SupportsList: def.SupportsTranscriptionList,
Models: modelsList,
DefaultTranscriptionModel: def.DefaultTranscriptionModel,
SupportsTranscriptionList: def.SupportsTranscriptionList,
TranscriptionModels: modelsList,
TranscriptionFactory: def.TranscriptionFactory,
Order: def.Order + 1,
})
}
return out
}
func defaultProviderDefinitions() []ProviderDefinition {
edgeVoices := make([]VoiceInfo, 0)
for lang, ids := range edgespeech.EdgeTTSVoices {
@@ -315,10 +173,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
secretField("api_key", "API Key", "Bearer API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.openai.com/v1", 20),
}},
DefaultModel: "gpt-4o-mini-tts",
SupportsList: true,
DefaultTranscriptionModel: "gpt-4o-mini-transcribe",
SupportsTranscriptionList: true,
DefaultModel: "gpt-4o-mini-tts",
SupportsList: true,
Models: []ModelInfo{{
ID: "gpt-4o-mini-tts",
Name: "gpt-4o-mini-tts",
@@ -339,23 +195,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
Formats: []string{"mp3", "opus", "pcm", "wav"},
},
}},
TranscriptionModels: []ModelInfo{{
ID: "gpt-4o-mini-transcribe",
Name: "gpt-4o-mini-transcribe",
Description: "Default OpenAI transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language", "Language", "Optional ISO language hint", false, "", 10),
stringField("prompt", "Prompt", "Optional prompt to guide transcription", false, "", 20),
numberField("temperature", "Temperature", "Sampling temperature", false, 0, 30),
enumField("response_format", "Response Format", "Transcription response format", false, []string{"json", "verbose_json", "text", "srt", "vtt"}, 40),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language", "Language", "Optional ISO language hint", false, "", 10),
stringField("prompt", "Prompt", "Optional prompt to guide transcription", false, "", 20),
numberField("temperature", "Temperature", "Sampling temperature", false, 0, 30),
enumField("response_format", "Response Format", "Transcription response format", false, []string{"json", "verbose_json", "text", "srt", "vtt"}, 40),
}}},
}},
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
opts := []openaispeech.Option{}
if v := configString(config, "api_key"); v != "" {
@@ -366,16 +205,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
}
return openaispeech.New(opts...), nil
},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []openaitranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, openaitranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, openaitranscription.WithBaseURL(v))
}
return openaitranscription.New(opts...), nil
},
Order: 20,
},
{
@@ -387,10 +216,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
secretField("api_key", "API Key", "OpenRouter API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://openrouter.ai/api/v1", 20),
}},
DefaultModel: "openrouter-tts",
SupportsList: true,
DefaultTranscriptionModel: "openai/gpt-4o-mini-transcribe",
SupportsTranscriptionList: true,
DefaultModel: "openrouter-tts",
SupportsList: true,
Models: []ModelInfo{{
ID: "openrouter-tts",
Name: "openrouter-tts",
@@ -407,17 +234,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
}}},
}},
TranscriptionModels: []ModelInfo{{
ID: "openai/gpt-4o-mini-transcribe",
Name: "openai/gpt-4o-mini-transcribe",
Description: "Default OpenRouter transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
advancedStringField("prompt", "Prompt", "Prompt passed to the model before audio input", false, "", 10),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
advancedStringField("prompt", "Prompt", "Prompt passed to the model before audio input", false, "", 10),
}}},
}},
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
opts := []openrouterspeech.Option{}
if v := configString(config, "api_key"); v != "" {
@@ -428,16 +244,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
}
return openrouterspeech.New(opts...), nil
},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []openroutertranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, openroutertranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, openroutertranscription.WithBaseURL(v))
}
return openroutertranscription.New(opts...), nil
},
Order: 30,
},
{
@@ -449,10 +255,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
secretField("api_key", "API Key", "ElevenLabs API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.elevenlabs.io", 20),
}},
DefaultModel: "elevenlabs-tts",
SupportsList: true,
DefaultTranscriptionModel: "scribe_v2",
SupportsTranscriptionList: true,
DefaultModel: "elevenlabs-tts",
SupportsList: true,
Models: []ModelInfo{{
ID: "elevenlabs-tts",
Name: "elevenlabs-tts",
@@ -485,25 +289,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "en-US", 110),
}}},
}},
TranscriptionModels: []ModelInfo{{
ID: "scribe_v2",
Name: "scribe_v2",
Description: "Default ElevenLabs transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "", 10),
boolField("tag_audio_events", "Tag Audio Events", "Include non-speech events in timestamps", false, 20),
boolField("diarize", "Diarize", "Enable speaker diarization", false, 30),
numberField("num_speakers", "Number of Speakers", "Optional expected speaker count", false, 0, 40),
enumField("timestamps_granularity", "Timestamps Granularity", "Timestamps granularity", false, []string{"word", "character"}, 50),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "", 10),
boolField("tag_audio_events", "Tag Audio Events", "Include non-speech events in timestamps", false, 20),
boolField("diarize", "Diarize", "Enable speaker diarization", false, 30),
numberField("num_speakers", "Number of Speakers", "Optional expected speaker count", false, 0, 40),
enumField("timestamps_granularity", "Timestamps Granularity", "Timestamps granularity", false, []string{"word", "character"}, 50),
}}},
}},
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
opts := []elevenlabsspeech.Option{}
if v := configString(config, "api_key"); v != "" {
@@ -514,52 +299,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
}
return elevenlabsspeech.New(opts...), nil
},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []elevenlabstranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, elevenlabstranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, elevenlabstranscription.WithBaseURL(v))
}
return elevenlabstranscription.New(opts...), nil
},
Order: 40,
},
{
ClientType: models.ClientTypeGoogleSpeech,
DisplayName: "Google Speech",
Icon: "google-color",
Description: "Google Gemini speech transcription",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
secretField("api_key", "API Key", "Google API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://generativelanguage.googleapis.com/v1beta", 20),
}},
DefaultTranscriptionModel: "gemini-2.5-flash",
SupportsTranscriptionList: true,
TranscriptionModels: []ModelInfo{{
ID: "gemini-2.5-flash",
Name: "gemini-2.5-flash",
Description: "Default Google transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
advancedStringField("prompt", "Prompt", "Prompt passed alongside audio", false, "", 10),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
advancedStringField("prompt", "Prompt", "Prompt passed alongside audio", false, "", 10),
}}},
}},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []googletranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, googletranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, googletranscription.WithBaseURL(v))
}
return googletranscription.New(opts...), nil
},
Order: 45,
},
{
ClientType: models.ClientTypeDeepgramSpeech,
DisplayName: "Deepgram Speech",
@@ -569,10 +310,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
secretField("api_key", "API Key", "Deepgram API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.deepgram.com", 20),
}},
DefaultModel: "deepgram-tts",
SupportsList: false,
DefaultTranscriptionModel: "nova-3",
SupportsTranscriptionList: false,
DefaultModel: "deepgram-tts",
SupportsList: false,
Models: []ModelInfo{{
ID: "deepgram-tts",
Name: "deepgram-tts",
@@ -593,25 +332,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
Formats: []string{"wav", "none"},
},
}},
TranscriptionModels: []ModelInfo{{
ID: "nova-3",
Name: "nova-3",
Description: "Default Deepgram transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language", "Language", "Optional language hint", false, "", 10),
boolField("smart_format", "Smart Format", "Enable smart formatting", false, 20),
boolField("detect_language", "Detect Language", "Enable automatic language detection", false, 30),
boolField("diarize", "Diarize", "Enable speaker diarization", false, 40),
boolField("punctuate", "Punctuate", "Enable punctuation", false, 50),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language", "Language", "Optional language hint", false, "", 10),
boolField("smart_format", "Smart Format", "Enable smart formatting", false, 20),
boolField("detect_language", "Detect Language", "Enable automatic language detection", false, 30),
boolField("diarize", "Diarize", "Enable speaker diarization", false, 40),
boolField("punctuate", "Punctuate", "Enable punctuation", false, 50),
}}},
}},
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
opts := []deepgramspeech.Option{}
if v := configString(config, "api_key"); v != "" {
@@ -622,16 +342,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
}
return deepgramspeech.New(opts...), nil
},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []deepgramtranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, deepgramtranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, deepgramtranscription.WithBaseURL(v))
}
return deepgramtranscription.New(opts...), nil
},
Order: 50,
},
{
+435
View File
@@ -0,0 +1,435 @@
package tts
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
sdk "github.com/memohai/twilight-ai/sdk"
"github.com/memohai/memoh/internal/db"
"github.com/memohai/memoh/internal/db/sqlc"
"github.com/memohai/memoh/internal/models"
)
// Service implements text-to-speech operations on top of the unified
// providers/models tables and the in-process provider registry.
type Service struct {
	queries  *sqlc.Queries // generated DB accessors
	logger   *slog.Logger  // tagged with service=tts in NewService
	registry *Registry     // built-in speech provider definitions
}
// NewService wires the TTS service with its persistence layer and provider
// registry. The supplied logger is tagged with service=tts for filtering.
func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
	svc := &Service{
		queries:  queries,
		registry: registry,
		logger:   log.With(slog.String("service", "tts")),
	}
	return svc
}
// Registry exposes the provider registry backing this service.
func (s *Service) Registry() *Registry { return s.registry }
// ListMeta returns static adapter metadata from the registry. The context
// is unused because no I/O is involved.
func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
	return s.registry.ListMeta()
}
// ListSpeechProviders returns every speech-capable provider stored in the
// unified providers table, with secret config values masked.
func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
	providers, err := s.queries.ListSpeechProviders(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech providers: %w", err)
	}
	out := make([]SpeechProviderResponse, 0, len(providers))
	for _, p := range providers {
		out = append(out, toSpeechProviderResponse(p))
	}
	return out, nil
}
// GetSpeechProvider loads a single provider row by its UUID string and
// converts it to the API response shape (secrets masked).
func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) {
	var empty SpeechProviderResponse
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return empty, err
	}
	row, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return empty, fmt.Errorf("get speech provider: %w", err)
	}
	return toSpeechProviderResponse(row), nil
}
// ListSpeechModels returns all stored speech models across providers,
// skipping registry entries flagged as template-only.
func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse, error) {
	modelRows, err := s.queries.ListSpeechModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech models: %w", err)
	}
	out := make([]SpeechModelResponse, 0, len(modelRows))
	for _, m := range modelRows {
		// Template-only models exist to seed discovery, not for listing.
		if s.shouldHideModel(m.ProviderType, m.ModelID) {
			continue
		}
		out = append(out, toSpeechModelFromListRow(m))
	}
	return out, nil
}
// ListSpeechModelsByProvider returns the speech models stored for one
// provider, skipping entries the registry marks as template-only.
func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	// The provider row is needed to look up the registry definition that
	// decides which models are template-only.
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	rows, err := s.queries.ListSpeechModelsByProviderID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("list speech models by provider: %w", err)
	}
	items := make([]SpeechModelResponse, 0, len(rows))
	for _, row := range rows {
		if shouldHideTemplateModel(def, row.ModelID) {
			continue
		}
		// Provider type is left empty; callers of this endpoint already
		// know which provider they queried.
		items = append(items, toSpeechModelFromModel(row, ""))
	}
	return items, nil
}
// GetSpeechModel loads one stored speech model (joined with its provider
// type) by its UUID string.
func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
	var empty SpeechModelResponse
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return empty, err
	}
	row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return empty, fmt.Errorf("get speech model: %w", err)
	}
	return toSpeechModelWithProviderResponse(row), nil
}
// Synthesize generates a complete audio clip for text using the stored
// speech model identified by modelID (a DB UUID). overrideCfg is merged on
// top of the provider and model configs (later values win). Returns the
// raw audio bytes and their content type.
func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
	params, err := s.resolveSpeechParams(ctx, modelID, text, overrideCfg)
	if err != nil {
		return nil, "", err
	}
	result, err := sdk.GenerateSpeech(ctx,
		sdk.WithSpeechModel(params.model),
		sdk.WithText(text),
		sdk.WithSpeechConfig(params.config),
	)
	if err != nil {
		return nil, "", fmt.Errorf("synthesize: %w", err)
	}
	return result.Audio, result.ContentType, nil
}
// StreamToFile synthesizes text with the given stored speech model and
// writes the audio to w, returning the audio content type.
//
// NOTE(review): despite the name, the SDK stream is fully collected via
// Bytes() and written to w in a single call — confirm whether incremental
// chunked writes are required before relying on this for large outputs.
func (s *Service) StreamToFile(ctx context.Context, modelID string, text string, w io.Writer) (string, error) {
	params, err := s.resolveSpeechParams(ctx, modelID, text, nil)
	if err != nil {
		return "", err
	}
	streamResult, err := sdk.StreamSpeech(ctx,
		sdk.WithSpeechModel(params.model),
		sdk.WithText(text),
		sdk.WithSpeechConfig(params.config),
	)
	if err != nil {
		return "", fmt.Errorf("stream: %w", err)
	}
	// Drain the whole stream into memory before writing.
	audio, err := streamResult.Bytes()
	if err != nil {
		return "", fmt.Errorf("stream: %w", err)
	}
	if _, writeErr := w.Write(audio); writeErr != nil {
		return "", fmt.Errorf("write chunk: %w", writeErr)
	}
	return streamResult.ContentType, nil
}
// GetModelCapabilities returns the registry-declared capabilities for the
// stored model identified by modelID. When the matched template declares
// no capability config schema, the template's own config schema is used as
// a fallback.
func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech model: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
	if err != nil {
		return nil, err
	}
	template := findModelTemplate(def, modelRow.ModelID)
	if template == nil {
		return nil, fmt.Errorf("speech model capabilities not found: %s", modelRow.ModelID)
	}
	// Work on a copy so the fallback assignment below never mutates the
	// registry's template.
	caps := template.Capabilities
	if len(caps.ConfigSchema.Fields) == 0 {
		caps.ConfigSchema = template.ConfigSchema
	}
	return &caps, nil
}
// FetchRemoteModels asks the provider's remote API for its available
// speech models. Each discovered ID is merged with the registry's built-in
// template (when one exists) so curated config schemas are preserved.
// Fails when the provider definition does not support model discovery.
func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	if !def.SupportsList || def.Factory == nil {
		return nil, fmt.Errorf("speech provider does not support model discovery: %s", providerRow.ClientType)
	}
	// Build a live client from the provider's stored config.
	provider, err := def.Factory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build speech provider: %w", err)
	}
	remoteModels, err := provider.ListModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech models: %w", err)
	}
	discovered := make([]ModelInfo, 0, len(remoteModels))
	for _, remoteModel := range remoteModels {
		// Skip nil or anonymous entries defensively.
		if remoteModel == nil || remoteModel.ID == "" {
			continue
		}
		discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.Models))
	}
	return discovered, nil
}
// resolvedSpeechParams bundles a ready-to-use SDK speech model with the
// fully merged config for one synthesis request.
type resolvedSpeechParams struct {
	model  *sdk.SpeechModel // provider-bound model handle
	config map[string]any   // provider + model + per-request override, later wins
}
// resolveSpeechParams loads the model and provider rows for modelID,
// builds the provider client from its stored config, and merges configs in
// ascending precedence: provider, then model, then per-request override.
func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text string, overrideCfg map[string]any) (*resolvedSpeechParams, error) {
	// text is currently unused here; the parameter is kept so the
	// signature mirrors the Synthesize/StreamToFile call sites.
	_ = text
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech model: %w", err)
	}
	providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	provider, err := def.Factory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build speech provider: %w", err)
	}
	// Later maps win on key collisions.
	cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
	return &resolvedSpeechParams{
		model:  &sdk.SpeechModel{ID: modelRow.ModelID, Provider: provider},
		config: cfg,
	}, nil
}
// parseConfig decodes a raw JSON config blob into a map. Empty, malformed,
// or JSON-null input all yield an empty (but non-nil) map.
func parseConfig(raw []byte) map[string]any {
	empty := map[string]any{}
	if len(raw) == 0 {
		return empty
	}
	var decoded map[string]any
	if err := json.Unmarshal(raw, &decoded); err == nil && decoded != nil {
		return decoded
	}
	return empty
}
// mergeConfig flattens the given maps into a single new map; later parts
// override earlier ones on key collisions. Nil parts are skipped safely.
func mergeConfig(parts ...map[string]any) map[string]any {
	merged := map[string]any{}
	for _, part := range parts {
		for k, v := range part {
			merged[k] = v
		}
	}
	return merged
}
// mergeRemoteModelInfo returns the built-in template for modelID when one
// exists, so remotely discovered models inherit curated metadata and
// schemas; otherwise it returns a minimal entry whose ID doubles as the
// display name.
func mergeRemoteModelInfo(modelID string, defaults []ModelInfo) ModelInfo {
	for i := range defaults {
		if defaults[i].ID == modelID {
			return defaults[i]
		}
	}
	return ModelInfo{ID: modelID, Name: modelID}
}
// shouldHideModel reports whether a model should be hidden from listings,
// delegating to the registry definition for clientType. Unknown client
// types are never hidden.
func (s *Service) shouldHideModel(clientType string, modelID string) bool {
	def, err := s.registry.Get(models.ClientType(clientType))
	if err != nil {
		return false
	}
	return shouldHideTemplateModel(def, modelID)
}
// shouldHideTemplateModel reports whether modelID is a template-only entry
// for a provider that supports remote model listing. Providers without
// listing support always expose their built-in models.
func shouldHideTemplateModel(def ProviderDefinition, modelID string) bool {
	if !def.SupportsList {
		return false
	}
	for i := range def.Models {
		if def.Models[i].ID == modelID {
			return def.Models[i].TemplateOnly
		}
	}
	return false
}
// findModelTemplate resolves the built-in ModelInfo used for modelID: an
// exact ID match first, then the provider's declared default model, then
// the first template as a last resort. Returns nil only when the provider
// declares no models at all.
func findModelTemplate(def ProviderDefinition, modelID string) *ModelInfo {
	lookup := func(id string) *ModelInfo {
		for i := range def.Models {
			if def.Models[i].ID == id {
				return &def.Models[i]
			}
		}
		return nil
	}
	if m := lookup(modelID); m != nil {
		return m
	}
	if def.DefaultModel != "" {
		if m := lookup(def.DefaultModel); m != nil {
			return m
		}
	}
	if len(def.Models) == 0 {
		return nil
	}
	return &def.Models[0]
}
// toSpeechProviderResponse converts a provider DB row into the API shape,
// masking secret config values before they leave the service.
func toSpeechProviderResponse(row sqlc.Provider) SpeechProviderResponse {
	resp := SpeechProviderResponse{
		ID:         row.ID.String(),
		Name:       row.Name,
		ClientType: row.ClientType,
		Enable:     row.Enable,
		Config:     maskSpeechProviderConfig(parseConfig(row.Config)),
		CreatedAt:  row.CreatedAt.Time,
		UpdatedAt:  row.UpdatedAt.Time,
	}
	// Icon is nullable in the DB; leave it empty when unset.
	if row.Icon.Valid {
		resp.Icon = row.Icon.String
	}
	return resp
}
// maskSpeechProviderConfig returns a copy of cfg in which every non-empty
// string stored under a secret key is replaced by its masked form.
// Non-secret keys and non-string values pass through untouched.
func maskSpeechProviderConfig(cfg map[string]any) map[string]any {
	out := make(map[string]any, len(cfg))
	for key, value := range cfg {
		str, isString := value.(string)
		if isString && str != "" && isSpeechSecretKey(key) {
			out[key] = maskSpeechSecret(str)
		} else {
			out[key] = value
		}
	}
	return out
}
// isSpeechSecretKey reports whether a provider config key holds a
// credential that must be masked in API responses.
func isSpeechSecretKey(key string) bool {
	return key == "api_key" ||
		key == "access_key" ||
		key == "secret_key" ||
		key == "app_key"
}
func maskSpeechSecret(value string) string {
if len(value) <= 8 {
return "********"
}
return value[:4] + "****" + value[len(value)-4:]
}
// toSpeechModelFromListRow converts a joined list row (model plus provider
// type) into the API response shape. Malformed config JSON is ignored and
// yields a nil Config.
func toSpeechModelFromListRow(row sqlc.ListSpeechModelsRow) SpeechModelResponse {
	var cfg map[string]any
	if len(row.Config) > 0 {
		// Best-effort decode; an unparsable config is surfaced as nil.
		_ = json.Unmarshal(row.Config, &cfg)
	}
	// Name is nullable in the DB; default to empty string.
	name := ""
	if row.Name.Valid {
		name = row.Name.String
	}
	return SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		Name:         name,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		Config:       cfg,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
}
// toSpeechModelFromModel converts a bare model row into the API response
// shape. providerType must be supplied by the caller (the sqlc.Model row
// itself does not carry it) and may be empty.
func toSpeechModelFromModel(row sqlc.Model, providerType string) SpeechModelResponse {
	var cfg map[string]any
	if len(row.Config) > 0 {
		// Best-effort decode; an unparsable config is surfaced as nil.
		_ = json.Unmarshal(row.Config, &cfg)
	}
	// Name is nullable in the DB; default to empty string.
	name := ""
	if row.Name.Valid {
		name = row.Name.String
	}
	return SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		Name:         name,
		ProviderID:   row.ProviderID.String(),
		ProviderType: providerType,
		Config:       cfg,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
}
// toSpeechModelWithProviderResponse converts a model row joined with its
// provider type into the API response shape. Malformed config JSON is
// ignored and yields a nil Config.
func toSpeechModelWithProviderResponse(row sqlc.GetSpeechModelWithProviderRow) SpeechModelResponse {
	var cfg map[string]any
	if len(row.Config) > 0 {
		// Best-effort decode; an unparsable config is surfaced as nil.
		_ = json.Unmarshal(row.Config, &cfg)
	}
	// Name is nullable in the DB; default to empty string.
	name := ""
	if row.Name.Valid {
		name = row.Name.String
	}
	return SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		Name:         name,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		Config:       cfg,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
}
@@ -1,4 +1,4 @@
package audio
package tts
import (
"fmt"
@@ -13,7 +13,7 @@ import (
const (
defaultTTL = 10 * time.Minute
cleanupInterval = 1 * time.Minute
tempDirName = "audio_temp"
tempDirName = "tts_temp"
)
// TempStore manages temporary audio files on disk with automatic TTL-based cleanup.
@@ -30,7 +30,7 @@ type TempStore struct {
func NewTempStore(baseDir string) (*TempStore, error) {
dir := filepath.Join(baseDir, tempDirName)
if err := os.MkdirAll(dir, 0o750); err != nil {
return nil, fmt.Errorf("create audio temp dir: %w", err)
return nil, fmt.Errorf("create tts temp dir: %w", err)
}
return &TempStore{
dir: dir,
+62
View File
@@ -0,0 +1,62 @@
package tts
import "time"
// ProviderMetaResponse exposes adapter metadata (from the registry, not DB).
type ProviderMetaResponse struct {
	Provider     string       `json:"provider"`      // registry client-type identifier
	DisplayName  string       `json:"display_name"`  // human-readable provider name
	Description  string       `json:"description"`
	ConfigSchema ConfigSchema `json:"config_schema,omitempty"` // provider-level config fields
	DefaultModel string       `json:"default_model"`
	Models       []ModelInfo  `json:"models"` // built-in model templates
}
// SpeechProviderResponse represents a speech-capable provider from the unified providers table.
type SpeechProviderResponse struct {
	ID         string         `json:"id"` // provider UUID
	Name       string         `json:"name"`
	ClientType string         `json:"client_type"` // registry client-type identifier
	Icon       string         `json:"icon,omitempty"`
	Enable     bool           `json:"enable"`
	Config     map[string]any `json:"config,omitempty"` // secret values are masked before serving
	CreatedAt  time.Time      `json:"created_at"`
	UpdatedAt  time.Time      `json:"updated_at"`
}
// SpeechModelResponse represents a speech model from the unified models table.
type SpeechModelResponse struct {
	ID           string         `json:"id"`       // model row UUID
	ModelID      string         `json:"model_id"` // provider-facing model identifier
	Name         string         `json:"name"`
	ProviderID   string         `json:"provider_id"`
	ProviderType string         `json:"provider_type,omitempty"` // may be empty when listed per provider
	Config       map[string]any `json:"config,omitempty"`
	CreatedAt    time.Time      `json:"created_at"`
	UpdatedAt    time.Time      `json:"updated_at"`
}
// UpdateSpeechProviderRequest is used for updating a speech provider.
// Nil pointer fields mean "leave unchanged".
type UpdateSpeechProviderRequest struct {
	Name   *string `json:"name,omitempty"`
	Enable *bool   `json:"enable,omitempty"`
}
// UpdateSpeechModelRequest is used for updating a speech model.
// A nil Name means "leave unchanged".
type UpdateSpeechModelRequest struct {
	Name   *string        `json:"name,omitempty"`
	Config map[string]any `json:"config,omitempty"`
}
// TestSynthesizeRequest represents a text-to-speech test request.
type TestSynthesizeRequest struct {
	Text   string         `json:"text"`             // text to synthesize
	Config map[string]any `json:"config,omitempty"` // per-request config override
}
// ImportModelsResponse represents the response for importing speech models.
type ImportModelsResponse struct {
	Created int      `json:"created"` // number of models newly persisted
	Skipped int      `json:"skipped"` // number of models already present
	Models  []string `json:"models"`  // identifiers affected by the import
}
-2
View File
@@ -175,7 +175,6 @@ func withWorkspaceGPUPreference(metadata map[string]any, gpu WorkspaceGPUConfig)
return next
}
//nolint:unused // Kept for tests and upcoming metadata plumbing.
func withWorkspaceSkillDiscoveryRoots(metadata map[string]any, roots []string) map[string]any {
next := cloneAnyMap(metadata)
section := workspaceSection(next)
@@ -200,7 +199,6 @@ func withoutWorkspaceGPUPreference(metadata map[string]any) map[string]any {
return next
}
//nolint:unused // Kept for tests and upcoming metadata plumbing.
func withoutWorkspaceSkillDiscoveryRoots(metadata map[string]any) map[string]any {
next := cloneAnyMap(metadata)
section := workspaceSection(next)
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+107 -502
View File
@@ -310,146 +310,6 @@ export type AdaptersUsageResponse = {
total_text_bytes?: number;
};
export type AudioConfigSchema = {
fields?: Array<AudioFieldSchema>;
};
export type AudioFieldSchema = {
advanced?: boolean;
description?: string;
enum?: Array<string>;
example?: unknown;
key?: string;
order?: number;
required?: boolean;
title?: string;
type?: string;
};
export type AudioImportModelsResponse = {
created?: number;
models?: Array<string>;
skipped?: number;
};
export type AudioModelCapabilities = {
config_schema?: AudioConfigSchema;
formats?: Array<string>;
metadata?: {
[key: string]: string;
};
pitch?: AudioParamConstraint;
speed?: AudioParamConstraint;
voices?: Array<AudioVoiceInfo>;
};
export type AudioModelInfo = {
capabilities?: AudioModelCapabilities;
config_schema?: AudioConfigSchema;
description?: string;
id?: string;
name?: string;
template_only?: boolean;
};
export type AudioParamConstraint = {
default?: number;
max?: number;
min?: number;
options?: Array<number>;
};
export type AudioProviderMetaResponse = {
config_schema?: AudioConfigSchema;
default_model?: string;
default_synthesis_model?: string;
default_transcription_model?: string;
description?: string;
display_name?: string;
models?: Array<AudioModelInfo>;
provider?: string;
supports_synthesis_list?: boolean;
supports_transcription_list?: boolean;
synthesis_models?: Array<AudioModelInfo>;
transcription_models?: Array<AudioModelInfo>;
};
export type AudioSpeechModelResponse = {
config?: {
[key: string]: unknown;
};
created_at?: string;
id?: string;
model_id?: string;
name?: string;
provider_id?: string;
provider_type?: string;
updated_at?: string;
};
export type AudioSpeechProviderResponse = {
client_type?: string;
config?: {
[key: string]: unknown;
};
created_at?: string;
enable?: boolean;
icon?: string;
id?: string;
name?: string;
updated_at?: string;
};
export type AudioTestSynthesizeRequest = {
config?: {
[key: string]: unknown;
};
text?: string;
};
export type AudioTestTranscriptionResponse = {
duration_seconds?: number;
language?: string;
metadata?: {
[key: string]: unknown;
};
text?: string;
words?: Array<AudioTranscriptionWord>;
};
export type AudioTranscriptionModelResponse = {
config?: {
[key: string]: unknown;
};
created_at?: string;
id?: string;
model_id?: string;
name?: string;
provider_id?: string;
provider_type?: string;
updated_at?: string;
};
export type AudioTranscriptionWord = {
end?: number;
speaker_id?: string;
start?: number;
text?: string;
};
export type AudioUpdateSpeechModelRequest = {
config?: {
[key: string]: unknown;
};
name?: string;
};
export type AudioVoiceInfo = {
id?: string;
lang?: string;
name?: string;
};
export type BotsBot = {
avatar_url?: string;
check_issue_count?: number;
@@ -613,7 +473,7 @@ export type ChannelChannelIdentityBinding = {
updated_at?: string;
};
export type ChannelChannelType = 'telegram' | 'feishu' | 'dingtalk' | 'matrix' | 'discord' | 'qq' | 'wecom' | 'weixin' | 'wechatoa' | 'local' | 'slack';
export type ChannelChannelType = 'telegram' | 'feishu' | 'dingtalk' | 'matrix' | 'discord' | 'qq' | 'wecom' | 'weixin' | 'wechatoa' | 'local';
export type ChannelConfigSchema = {
fields?: {
@@ -1494,7 +1354,7 @@ export type ModelsModelConfig = {
reasoning_efforts?: Array<string>;
};
export type ModelsModelType = 'chat' | 'embedding' | 'speech' | 'transcription';
export type ModelsModelType = 'chat' | 'embedding' | 'speech';
export type ModelsTestResponse = {
latency_ms?: number;
@@ -1755,7 +1615,6 @@ export type SettingsSettings = {
search_provider_id?: string;
timezone?: string;
title_model_id?: string;
transcription_model_id?: string;
tts_model_id?: string;
};
@@ -1780,10 +1639,105 @@ export type SettingsUpsertRequest = {
search_provider_id?: string;
timezone?: string;
title_model_id?: string;
transcription_model_id?: string;
tts_model_id?: string;
};
export type TtsConfigSchema = {
fields?: Array<TtsFieldSchema>;
};
export type TtsFieldSchema = {
advanced?: boolean;
description?: string;
enum?: Array<string>;
example?: unknown;
key?: string;
order?: number;
required?: boolean;
title?: string;
type?: string;
};
export type TtsImportModelsResponse = {
created?: number;
models?: Array<string>;
skipped?: number;
};
export type TtsModelCapabilities = {
config_schema?: TtsConfigSchema;
formats?: Array<string>;
metadata?: {
[key: string]: string;
};
pitch?: TtsParamConstraint;
speed?: TtsParamConstraint;
voices?: Array<TtsVoiceInfo>;
};
export type TtsModelInfo = {
capabilities?: TtsModelCapabilities;
config_schema?: TtsConfigSchema;
description?: string;
id?: string;
name?: string;
};
export type TtsParamConstraint = {
default?: number;
max?: number;
min?: number;
options?: Array<number>;
};
export type TtsProviderMetaResponse = {
config_schema?: TtsConfigSchema;
default_model?: string;
description?: string;
display_name?: string;
models?: Array<TtsModelInfo>;
provider?: string;
};
export type TtsSpeechModelResponse = {
config?: {
[key: string]: unknown;
};
created_at?: string;
id?: string;
model_id?: string;
name?: string;
provider_id?: string;
provider_type?: string;
updated_at?: string;
};
export type TtsSpeechProviderResponse = {
client_type?: string;
config?: {
[key: string]: unknown;
};
created_at?: string;
enable?: boolean;
icon?: string;
id?: string;
name?: string;
updated_at?: string;
};
export type TtsTestSynthesizeRequest = {
config?: {
[key: string]: unknown;
};
text?: string;
};
export type TtsVoiceInfo = {
id?: string;
lang?: string;
name?: string;
};
export type PostAuthLoginData = {
/**
* Login request
@@ -8268,7 +8222,7 @@ export type GetSpeechModelsResponses = {
/**
* OK
*/
200: Array<AudioSpeechModelResponse>;
200: Array<TtsSpeechModelResponse>;
};
export type GetSpeechModelsResponse = GetSpeechModelsResponses[keyof GetSpeechModelsResponses];
@@ -8298,48 +8252,11 @@ export type GetSpeechModelsByIdResponses = {
/**
* OK
*/
200: AudioSpeechModelResponse;
200: TtsSpeechModelResponse;
};
export type GetSpeechModelsByIdResponse = GetSpeechModelsByIdResponses[keyof GetSpeechModelsByIdResponses];
export type PutSpeechModelsByIdData = {
/**
* Model update payload
*/
body: AudioUpdateSpeechModelRequest;
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/speech-models/{id}';
};
export type PutSpeechModelsByIdErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type PutSpeechModelsByIdError = PutSpeechModelsByIdErrors[keyof PutSpeechModelsByIdErrors];
export type PutSpeechModelsByIdResponses = {
/**
* OK
*/
200: AudioSpeechModelResponse;
};
export type PutSpeechModelsByIdResponse = PutSpeechModelsByIdResponses[keyof PutSpeechModelsByIdResponses];
export type GetSpeechModelsByIdCapabilitiesData = {
body?: never;
path: {
@@ -8365,7 +8282,7 @@ export type GetSpeechModelsByIdCapabilitiesResponses = {
/**
* OK
*/
200: AudioModelCapabilities;
200: TtsModelCapabilities;
};
export type GetSpeechModelsByIdCapabilitiesResponse = GetSpeechModelsByIdCapabilitiesResponses[keyof GetSpeechModelsByIdCapabilitiesResponses];
@@ -8374,7 +8291,7 @@ export type PostSpeechModelsByIdTestData = {
/**
* Text to synthesize
*/
body: AudioTestSynthesizeRequest;
body: TtsTestSynthesizeRequest;
path: {
/**
* Model ID
@@ -8425,7 +8342,7 @@ export type GetSpeechProvidersResponses = {
/**
* OK
*/
200: Array<AudioSpeechProviderResponse>;
200: Array<TtsSpeechProviderResponse>;
};
export type GetSpeechProvidersResponse = GetSpeechProvidersResponses[keyof GetSpeechProvidersResponses];
@@ -8441,7 +8358,7 @@ export type GetSpeechProvidersMetaResponses = {
/**
* OK
*/
200: Array<AudioProviderMetaResponse>;
200: Array<TtsProviderMetaResponse>;
};
export type GetSpeechProvidersMetaResponse = GetSpeechProvidersMetaResponses[keyof GetSpeechProvidersMetaResponses];
@@ -8475,7 +8392,7 @@ export type GetSpeechProvidersByIdResponses = {
/**
* OK
*/
200: AudioSpeechProviderResponse;
200: TtsSpeechProviderResponse;
};
export type GetSpeechProvidersByIdResponse = GetSpeechProvidersByIdResponses[keyof GetSpeechProvidersByIdResponses];
@@ -8513,7 +8430,7 @@ export type PostSpeechProvidersByIdImportModelsResponses = {
/**
* OK
*/
200: AudioImportModelsResponse;
200: TtsImportModelsResponse;
};
export type PostSpeechProvidersByIdImportModelsResponse = PostSpeechProvidersByIdImportModelsResponses[keyof PostSpeechProvidersByIdImportModelsResponses];
@@ -8547,7 +8464,7 @@ export type GetSpeechProvidersByIdModelsResponses = {
/**
* OK
*/
200: Array<AudioSpeechModelResponse>;
200: Array<TtsSpeechModelResponse>;
};
export type GetSpeechProvidersByIdModelsResponse = GetSpeechProvidersByIdModelsResponses[keyof GetSpeechProvidersByIdModelsResponses];
@@ -8733,318 +8650,6 @@ export type GetSupermarketTagsResponses = {
export type GetSupermarketTagsResponse = GetSupermarketTagsResponses[keyof GetSupermarketTagsResponses];
export type GetTranscriptionModelsData = {
body?: never;
path?: never;
query?: never;
url: '/transcription-models';
};
export type GetTranscriptionModelsErrors = {
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type GetTranscriptionModelsError = GetTranscriptionModelsErrors[keyof GetTranscriptionModelsErrors];
export type GetTranscriptionModelsResponses = {
/**
* OK
*/
200: Array<AudioTranscriptionModelResponse>;
};
export type GetTranscriptionModelsResponse = GetTranscriptionModelsResponses[keyof GetTranscriptionModelsResponses];
export type GetTranscriptionModelsByIdData = {
body?: never;
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/transcription-models/{id}';
};
export type GetTranscriptionModelsByIdErrors = {
/**
* Not Found
*/
404: HandlersErrorResponse;
};
export type GetTranscriptionModelsByIdError = GetTranscriptionModelsByIdErrors[keyof GetTranscriptionModelsByIdErrors];
export type GetTranscriptionModelsByIdResponses = {
/**
* OK
*/
200: AudioTranscriptionModelResponse;
};
export type GetTranscriptionModelsByIdResponse = GetTranscriptionModelsByIdResponses[keyof GetTranscriptionModelsByIdResponses];
export type PutTranscriptionModelsByIdData = {
/**
* Model update payload
*/
body: AudioUpdateSpeechModelRequest;
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/transcription-models/{id}';
};
export type PutTranscriptionModelsByIdErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type PutTranscriptionModelsByIdError = PutTranscriptionModelsByIdErrors[keyof PutTranscriptionModelsByIdErrors];
export type PutTranscriptionModelsByIdResponses = {
/**
* OK
*/
200: AudioTranscriptionModelResponse;
};
export type PutTranscriptionModelsByIdResponse = PutTranscriptionModelsByIdResponses[keyof PutTranscriptionModelsByIdResponses];
export type GetTranscriptionModelsByIdCapabilitiesData = {
body?: never;
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/transcription-models/{id}/capabilities';
};
export type GetTranscriptionModelsByIdCapabilitiesErrors = {
/**
* Not Found
*/
404: HandlersErrorResponse;
};
export type GetTranscriptionModelsByIdCapabilitiesError = GetTranscriptionModelsByIdCapabilitiesErrors[keyof GetTranscriptionModelsByIdCapabilitiesErrors];
export type GetTranscriptionModelsByIdCapabilitiesResponses = {
/**
* OK
*/
200: AudioModelCapabilities;
};
export type GetTranscriptionModelsByIdCapabilitiesResponse = GetTranscriptionModelsByIdCapabilitiesResponses[keyof GetTranscriptionModelsByIdCapabilitiesResponses];
export type PostTranscriptionModelsByIdTestData = {
body: {
/**
* Audio file
*/
file: Blob | File;
/**
* Optional JSON config
*/
config?: string;
};
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/transcription-models/{id}/test';
};
export type PostTranscriptionModelsByIdTestErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type PostTranscriptionModelsByIdTestError = PostTranscriptionModelsByIdTestErrors[keyof PostTranscriptionModelsByIdTestErrors];
export type PostTranscriptionModelsByIdTestResponses = {
/**
* OK
*/
200: AudioTestTranscriptionResponse;
};
export type PostTranscriptionModelsByIdTestResponse = PostTranscriptionModelsByIdTestResponses[keyof PostTranscriptionModelsByIdTestResponses];
export type GetTranscriptionProvidersData = {
body?: never;
path?: never;
query?: never;
url: '/transcription-providers';
};
export type GetTranscriptionProvidersErrors = {
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type GetTranscriptionProvidersError = GetTranscriptionProvidersErrors[keyof GetTranscriptionProvidersErrors];
export type GetTranscriptionProvidersResponses = {
/**
* OK
*/
200: Array<AudioSpeechProviderResponse>;
};
export type GetTranscriptionProvidersResponse = GetTranscriptionProvidersResponses[keyof GetTranscriptionProvidersResponses];
export type GetTranscriptionProvidersMetaData = {
body?: never;
path?: never;
query?: never;
url: '/transcription-providers/meta';
};
export type GetTranscriptionProvidersMetaResponses = {
/**
* OK
*/
200: Array<AudioProviderMetaResponse>;
};
export type GetTranscriptionProvidersMetaResponse = GetTranscriptionProvidersMetaResponses[keyof GetTranscriptionProvidersMetaResponses];
export type GetTranscriptionProvidersByIdData = {
body?: never;
path: {
/**
* Provider ID (UUID)
*/
id: string;
};
query?: never;
url: '/transcription-providers/{id}';
};
export type GetTranscriptionProvidersByIdErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Not Found
*/
404: HandlersErrorResponse;
};
export type GetTranscriptionProvidersByIdError = GetTranscriptionProvidersByIdErrors[keyof GetTranscriptionProvidersByIdErrors];
export type GetTranscriptionProvidersByIdResponses = {
/**
* OK
*/
200: AudioSpeechProviderResponse;
};
export type GetTranscriptionProvidersByIdResponse = GetTranscriptionProvidersByIdResponses[keyof GetTranscriptionProvidersByIdResponses];
export type PostTranscriptionProvidersByIdImportModelsData = {
body?: never;
path: {
/**
* Provider ID (UUID)
*/
id: string;
};
query?: never;
url: '/transcription-providers/{id}/import-models';
};
export type PostTranscriptionProvidersByIdImportModelsErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Not Found
*/
404: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type PostTranscriptionProvidersByIdImportModelsError = PostTranscriptionProvidersByIdImportModelsErrors[keyof PostTranscriptionProvidersByIdImportModelsErrors];
export type PostTranscriptionProvidersByIdImportModelsResponses = {
/**
* OK
*/
200: AudioImportModelsResponse;
};
export type PostTranscriptionProvidersByIdImportModelsResponse = PostTranscriptionProvidersByIdImportModelsResponses[keyof PostTranscriptionProvidersByIdImportModelsResponses];
export type GetTranscriptionProvidersByIdModelsData = {
body?: never;
path: {
/**
* Provider ID (UUID)
*/
id: string;
};
query?: never;
url: '/transcription-providers/{id}/models';
};
export type GetTranscriptionProvidersByIdModelsErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type GetTranscriptionProvidersByIdModelsError = GetTranscriptionProvidersByIdModelsErrors[keyof GetTranscriptionProvidersByIdModelsErrors];
export type GetTranscriptionProvidersByIdModelsResponses = {
/**
* OK
*/
200: Array<AudioTranscriptionModelResponse>;
};
export type GetTranscriptionProvidersByIdModelsResponse = GetTranscriptionProvidersByIdModelsResponses[keyof GetTranscriptionProvidersByIdModelsResponses];
export type GetUsersData = {
body?: never;
path?: never;
+257 -815
View File
File diff suppressed because it is too large Load Diff
+257 -815
View File
File diff suppressed because it is too large Load Diff
+170 -545
View File
@@ -489,240 +489,6 @@ definitions:
total_text_bytes:
type: integer
type: object
audio.ConfigSchema:
properties:
fields:
items:
$ref: '#/definitions/audio.FieldSchema'
type: array
type: object
audio.FieldSchema:
properties:
advanced:
type: boolean
description:
type: string
enum:
items:
type: string
type: array
example: {}
key:
type: string
order:
type: integer
required:
type: boolean
title:
type: string
type:
type: string
type: object
audio.ImportModelsResponse:
properties:
created:
type: integer
models:
items:
type: string
type: array
skipped:
type: integer
type: object
audio.ModelCapabilities:
properties:
config_schema:
$ref: '#/definitions/audio.ConfigSchema'
formats:
items:
type: string
type: array
metadata:
additionalProperties:
type: string
type: object
pitch:
$ref: '#/definitions/audio.ParamConstraint'
speed:
$ref: '#/definitions/audio.ParamConstraint'
voices:
items:
$ref: '#/definitions/audio.VoiceInfo'
type: array
type: object
audio.ModelInfo:
properties:
capabilities:
$ref: '#/definitions/audio.ModelCapabilities'
config_schema:
$ref: '#/definitions/audio.ConfigSchema'
description:
type: string
id:
type: string
name:
type: string
template_only:
type: boolean
type: object
audio.ParamConstraint:
properties:
default:
type: number
max:
type: number
min:
type: number
options:
items:
type: number
type: array
type: object
audio.ProviderMetaResponse:
properties:
config_schema:
$ref: '#/definitions/audio.ConfigSchema'
default_model:
type: string
default_synthesis_model:
type: string
default_transcription_model:
type: string
description:
type: string
display_name:
type: string
models:
items:
$ref: '#/definitions/audio.ModelInfo'
type: array
provider:
type: string
supports_synthesis_list:
type: boolean
supports_transcription_list:
type: boolean
synthesis_models:
items:
$ref: '#/definitions/audio.ModelInfo'
type: array
transcription_models:
items:
$ref: '#/definitions/audio.ModelInfo'
type: array
type: object
audio.SpeechModelResponse:
properties:
config:
additionalProperties: {}
type: object
created_at:
type: string
id:
type: string
model_id:
type: string
name:
type: string
provider_id:
type: string
provider_type:
type: string
updated_at:
type: string
type: object
audio.SpeechProviderResponse:
properties:
client_type:
type: string
config:
additionalProperties: {}
type: object
created_at:
type: string
enable:
type: boolean
icon:
type: string
id:
type: string
name:
type: string
updated_at:
type: string
type: object
audio.TestSynthesizeRequest:
properties:
config:
additionalProperties: {}
type: object
text:
type: string
type: object
audio.TestTranscriptionResponse:
properties:
duration_seconds:
type: number
language:
type: string
metadata:
additionalProperties: {}
type: object
text:
type: string
words:
items:
$ref: '#/definitions/audio.TranscriptionWord'
type: array
type: object
audio.TranscriptionModelResponse:
properties:
config:
additionalProperties: {}
type: object
created_at:
type: string
id:
type: string
model_id:
type: string
name:
type: string
provider_id:
type: string
provider_type:
type: string
updated_at:
type: string
type: object
audio.TranscriptionWord:
properties:
end:
type: number
speaker_id:
type: string
start:
type: number
text:
type: string
type: object
audio.UpdateSpeechModelRequest:
properties:
config:
additionalProperties: {}
type: object
name:
type: string
type: object
audio.VoiceInfo:
properties:
id:
type: string
lang:
type: string
name:
type: string
type: object
bots.Bot:
properties:
avatar_url:
@@ -1008,7 +774,6 @@ definitions:
- weixin
- wechatoa
- local
- slack
type: string
x-enum-varnames:
- ChannelTypeTelegram
@@ -1021,7 +786,6 @@ definitions:
- ChannelTypeWeixin
- ChannelTypeWeChatOA
- ChannelTypeLocal
- ChannelTypeSlack
channel.ConfigSchema:
properties:
fields:
@@ -2498,13 +2262,11 @@ definitions:
- chat
- embedding
- speech
- transcription
type: string
x-enum-varnames:
- ModelTypeChat
- ModelTypeEmbedding
- ModelTypeSpeech
- ModelTypeTranscription
models.TestResponse:
properties:
latency_ms:
@@ -2951,8 +2713,6 @@ definitions:
type: string
title_model_id:
type: string
transcription_model_id:
type: string
tts_model_id:
type: string
type: object
@@ -2998,11 +2758,170 @@ definitions:
type: string
title_model_id:
type: string
transcription_model_id:
type: string
tts_model_id:
type: string
type: object
tts.ConfigSchema:
properties:
fields:
items:
$ref: '#/definitions/tts.FieldSchema'
type: array
type: object
tts.FieldSchema:
properties:
advanced:
type: boolean
description:
type: string
enum:
items:
type: string
type: array
example: {}
key:
type: string
order:
type: integer
required:
type: boolean
title:
type: string
type:
type: string
type: object
tts.ImportModelsResponse:
properties:
created:
type: integer
models:
items:
type: string
type: array
skipped:
type: integer
type: object
tts.ModelCapabilities:
properties:
config_schema:
$ref: '#/definitions/tts.ConfigSchema'
formats:
items:
type: string
type: array
metadata:
additionalProperties:
type: string
type: object
pitch:
$ref: '#/definitions/tts.ParamConstraint'
speed:
$ref: '#/definitions/tts.ParamConstraint'
voices:
items:
$ref: '#/definitions/tts.VoiceInfo'
type: array
type: object
tts.ModelInfo:
properties:
capabilities:
$ref: '#/definitions/tts.ModelCapabilities'
config_schema:
$ref: '#/definitions/tts.ConfigSchema'
description:
type: string
id:
type: string
name:
type: string
type: object
tts.ParamConstraint:
properties:
default:
type: number
max:
type: number
min:
type: number
options:
items:
type: number
type: array
type: object
tts.ProviderMetaResponse:
properties:
config_schema:
$ref: '#/definitions/tts.ConfigSchema'
default_model:
type: string
description:
type: string
display_name:
type: string
models:
items:
$ref: '#/definitions/tts.ModelInfo'
type: array
provider:
type: string
type: object
tts.SpeechModelResponse:
properties:
config:
additionalProperties: {}
type: object
created_at:
type: string
id:
type: string
model_id:
type: string
name:
type: string
provider_id:
type: string
provider_type:
type: string
updated_at:
type: string
type: object
tts.SpeechProviderResponse:
properties:
client_type:
type: string
config:
additionalProperties: {}
type: object
created_at:
type: string
enable:
type: boolean
icon:
type: string
id:
type: string
name:
type: string
updated_at:
type: string
type: object
tts.TestSynthesizeRequest:
properties:
config:
additionalProperties: {}
type: object
text:
type: string
type: object
tts.VoiceInfo:
properties:
id:
type: string
lang:
type: string
name:
type: string
type: object
info:
contact: {}
title: Memoh API
@@ -8257,7 +8176,7 @@ paths:
description: OK
schema:
items:
$ref: '#/definitions/audio.SpeechModelResponse'
$ref: '#/definitions/tts.SpeechModelResponse'
type: array
"500":
description: Internal Server Error
@@ -8280,7 +8199,7 @@ paths:
"200":
description: OK
schema:
$ref: '#/definitions/audio.SpeechModelResponse'
$ref: '#/definitions/tts.SpeechModelResponse'
"404":
description: Not Found
schema:
@@ -8288,39 +8207,6 @@ paths:
summary: Get a speech model
tags:
- speech-models
put:
consumes:
- application/json
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
- description: Model update payload
in: body
name: request
required: true
schema:
$ref: '#/definitions/audio.UpdateSpeechModelRequest'
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.SpeechModelResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Update a speech model
tags:
- speech-models
/speech-models/{id}/capabilities:
get:
parameters:
@@ -8335,7 +8221,7 @@ paths:
"200":
description: OK
schema:
$ref: '#/definitions/audio.ModelCapabilities'
$ref: '#/definitions/tts.ModelCapabilities'
"404":
description: Not Found
schema:
@@ -8359,7 +8245,7 @@ paths:
name: request
required: true
schema:
$ref: '#/definitions/audio.TestSynthesizeRequest'
$ref: '#/definitions/tts.TestSynthesizeRequest'
produces:
- application/octet-stream
responses:
@@ -8389,7 +8275,7 @@ paths:
description: OK
schema:
items:
$ref: '#/definitions/audio.SpeechProviderResponse'
$ref: '#/definitions/tts.SpeechProviderResponse'
type: array
"500":
description: Internal Server Error
@@ -8413,7 +8299,7 @@ paths:
"200":
description: OK
schema:
$ref: '#/definitions/audio.SpeechProviderResponse'
$ref: '#/definitions/tts.SpeechProviderResponse'
"400":
description: Bad Request
schema:
@@ -8443,7 +8329,7 @@ paths:
"200":
description: OK
schema:
$ref: '#/definitions/audio.ImportModelsResponse'
$ref: '#/definitions/tts.ImportModelsResponse'
"400":
description: Bad Request
schema:
@@ -8475,7 +8361,7 @@ paths:
description: OK
schema:
items:
$ref: '#/definitions/audio.SpeechModelResponse'
$ref: '#/definitions/tts.SpeechModelResponse'
type: array
"400":
description: Bad Request
@@ -8496,7 +8382,7 @@ paths:
description: OK
schema:
items:
$ref: '#/definitions/audio.ProviderMetaResponse'
$ref: '#/definitions/tts.ProviderMetaResponse'
type: array
summary: List speech provider metadata
tags:
@@ -8629,267 +8515,6 @@ paths:
summary: List all tags from supermarket
tags:
- supermarket
/transcription-models:
get:
description: List all models of type 'transcription' (filtered view of unified
models table)
produces:
- application/json
responses:
"200":
description: OK
schema:
items:
$ref: '#/definitions/audio.TranscriptionModelResponse'
type: array
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: List all transcription models
tags:
- transcription-models
/transcription-models/{id}:
get:
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.TranscriptionModelResponse'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Get a transcription model
tags:
- transcription-models
put:
consumes:
- application/json
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
- description: Model update payload
in: body
name: request
required: true
schema:
$ref: '#/definitions/audio.UpdateSpeechModelRequest'
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.TranscriptionModelResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Update a transcription model
tags:
- transcription-models
/transcription-models/{id}/capabilities:
get:
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.ModelCapabilities'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Get transcription model capabilities
tags:
- transcription-models
/transcription-models/{id}/test:
post:
consumes:
- multipart/form-data
description: Transcribe uploaded audio using a specific model's config and return
structured text output
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
- description: Audio file
in: formData
name: file
required: true
type: file
- description: Optional JSON config
in: formData
name: config
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.TestTranscriptionResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Test transcription model recognition
tags:
- transcription-models
/transcription-providers:
get:
description: List providers that support transcription (filtered view of unified
providers table)
produces:
- application/json
responses:
"200":
description: OK
schema:
items:
$ref: '#/definitions/audio.SpeechProviderResponse'
type: array
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: List transcription providers
tags:
- transcription-providers
/transcription-providers/{id}:
get:
description: Get a speech provider with masked config values
parameters:
- description: Provider ID (UUID)
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.SpeechProviderResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Get speech provider
tags:
- speech-providers
/transcription-providers/{id}/import-models:
post:
consumes:
- application/json
description: Fetch models using the configured transcription provider and import
them into the unified models table
parameters:
- description: Provider ID (UUID)
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.ImportModelsResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Import transcription models from provider
tags:
- transcription-providers
/transcription-providers/{id}/models:
get:
description: List models of type 'transcription' for a specific transcription
provider
parameters:
- description: Provider ID (UUID)
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
items:
$ref: '#/definitions/audio.TranscriptionModelResponse'
type: array
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: List transcription models by provider
tags:
- transcription-providers
/transcription-providers/meta:
get:
description: List available transcription provider types with their models and
capabilities
responses:
"200":
description: OK
schema:
items:
$ref: '#/definitions/audio.ProviderMetaResponse'
type: array
summary: List transcription provider metadata
tags:
- transcription-providers
/users:
get:
description: List users