feat: expand speech provider support with new client types and config… (#389)

* feat: expand speech provider support with new client types and configuration schema * feat: add icon support for speech providers and update related configurations * feat: add SVG support for Deepgram and Elevenlabs with Vue components * feat: except *-speech client type in llm provider * feat: enhance speech provider functionality with advanced settings and model import capabilities * chore: remove go.mod replace * feat: enhance speech provider functionality with advanced settings and model import capabilities * chore: update go module dependencies --------- Co-authored-by: Acbox <acbox0328@gmail.com>
2026-04-27 07:16:19 +09:00 · 2026-04-19 22:58:16 +09:00
parent 8e013ad1ad
commit 8d78925a23
46 changed files with 2808 additions and 565 deletions
@@ -13,10 +13,12 @@ import {
  ClaudeColor,
  Cohere,
  CohereColor,
+  Deepgram,
  Deepseek,
  DeepseekColor,
  Doubao,
  DoubaoColor,
+  Elevenlabs,
  Fireworks,
  FireworksColor,
  Gemini,
@@ -35,6 +37,8 @@ import {
  Lmstudio,
  Meta,
  MetaColor,
+  Microsoft,
+  MicrosoftColor,
  Minimax,
  MinimaxColor,
  Mistral,
@@ -81,6 +85,8 @@ export const iconMap: Record<string, Component> = {
  'google-brand-color': GoogleBrandColor,
  'deepseek': Deepseek,
  'deepseek-color': DeepseekColor,
+  'deepgram': Deepgram,
+  'elevenlabs': Elevenlabs,
  'groq': Groq,
  'huggingface': Huggingface,
  'huggingface-color': HuggingfaceColor,
@@ -105,6 +111,8 @@ export const iconMap: Record<string, Component> = {
  'cohere-color': CohereColor,
  'azure': Azure,
  'azure-color': AzureColor,
+  'microsoft': Microsoft,
+  'microsoft-color': MicrosoftColor,
  'nvidia': Nvidia,
  'nvidia-color': NvidiaColor,
  'fireworks': Fireworks,
@@ -40,9 +40,49 @@ export const CLIENT_TYPE_META: Record<string, ClientTypeMeta> = {
    label: 'Edge Speech',
    hint: 'Microsoft Edge Read Aloud TTS',
  },
+  'openai-speech': {
+    value: 'openai-speech',
+    label: 'OpenAI Speech',
+    hint: 'OpenAI /audio/speech compatible TTS',
+  },
+  'openrouter-speech': {
+    value: 'openrouter-speech',
+    label: 'OpenRouter Speech',
+    hint: 'OpenRouter audio modality TTS',
+  },
+  'elevenlabs-speech': {
+    value: 'elevenlabs-speech',
+    label: 'ElevenLabs Speech',
+    hint: 'ElevenLabs text-to-speech',
+  },
+  'deepgram-speech': {
+    value: 'deepgram-speech',
+    label: 'Deepgram Speech',
+    hint: 'Deepgram TTS',
+  },
+  'minimax-speech': {
+    value: 'minimax-speech',
+    label: 'MiniMax Speech',
+    hint: 'MiniMax TTS',
+  },
+  'volcengine-speech': {
+    value: 'volcengine-speech',
+    label: 'Volcengine Speech',
+    hint: 'Volcengine SAMI TTS',
+  },
+  'alibabacloud-speech': {
+    value: 'alibabacloud-speech',
+    label: 'Alibaba Cloud Speech',
+    hint: 'DashScope CosyVoice TTS',
+  },
+  'microsoft-speech': {
+    value: 'microsoft-speech',
+    label: 'Microsoft Speech',
+    hint: 'Azure Cognitive Services TTS',
+  },
 }

 export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META)

 export const LLM_CLIENT_TYPE_LIST: ClientTypeMeta[] = CLIENT_TYPE_LIST
-  .filter(ct => ct.value !== 'edge-speech')
+  .filter(ct => !ct.value.endsWith('-speech'))
@@ -424,6 +424,11 @@
    "modelIdPlaceholder": "Enter model identifier (e.g. custom-voice)",
    "noModels": "No models found. Click \"Import Models\" to discover available models or \"Add Model\" to create one manually.",
    "noCapabilities": "No capabilities available for this model.",
+    "saveSuccess": "Speech configuration saved",
+    "advanced": {
+      "title": "Advanced Settings",
+      "description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults."
+    },
    "fields": {
      "language": "Language",
      "languagePlaceholder": "Select language...",
@@ -420,6 +420,11 @@
    "modelIdPlaceholder": "输入模型标识符（如 custom-voice）",
    "noModels": "暂无模型，点击\"导入模型\"发现可用模型，或点击\"新建模型\"手动创建。",
    "noCapabilities": "该模型暂无可用能力信息。",
+    "saveSuccess": "语音配置已保存",
+    "advanced": {
+      "title": "高级设置",
+      "description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。"
+    },
    "fields": {
      "language": "语言",
      "languagePlaceholder": "选择语言...",
@@ -1,189 +1,198 @@
 <template>
  <div class="space-y-4">
-    <template v-if="caps">
-      <!-- Language -->
-      <div class="space-y-2">
-        <Label for="tts-lang">{{ $t('speech.fields.language') }}</Label>
-        <Select
-          :model-value="configData.voice_lang ?? ''"
-          @update:model-value="onLangChange"
-        >
-          <SelectTrigger
-            id="tts-lang"
-            class="w-full"
-          >
-            <SelectValue :placeholder="$t('speech.fields.languagePlaceholder')" />
-          </SelectTrigger>
-          <SelectContent class="max-h-60">
-            <SelectItem
-              v-for="lang in availableLanguages"
-              :key="lang"
-              :value="lang"
-            >
-              {{ lang }}
-            </SelectItem>
-          </SelectContent>
-        </Select>
-      </div>
-
-      <!-- Voice -->
-      <div class="space-y-2">
-        <Label for="tts-voice">{{ $t('speech.fields.voice') }}</Label>
-        <Select
-          :model-value="configData.voice_id ?? ''"
-          @update:model-value="(val) => configData.voice_id = val"
-        >
-          <SelectTrigger
-            id="tts-voice"
-            class="w-full"
-          >
-            <SelectValue :placeholder="$t('speech.fields.voicePlaceholder')" />
-          </SelectTrigger>
-          <SelectContent class="max-h-60">
-            <SelectItem
-              v-for="voice in filteredVoices"
-              :key="voice.id"
-              :value="voice.id!"
-            >
-              {{ voice.name }} ({{ voice.id }})
-            </SelectItem>
-          </SelectContent>
-        </Select>
-      </div>
-
-      <!-- Format -->
-      <div
-        v-if="caps.formats && caps.formats.length > 0"
+    <template v-if="basicFields.length > 0">
+      <section
+        v-for="field in basicFields"
+        :key="field.key"
        class="space-y-2"
      >
-        <Label for="tts-format">{{ $t('speech.fields.format') }}</Label>
-        <Select
-          :model-value="configData.format ?? ''"
-          @update:model-value="(val) => configData.format = val"
+        <Label :for="field.type === 'bool' || field.type === 'enum' ? undefined : `tts-field-${field.key}`">
+          {{ field.title || field.key }}
+        </Label>
+        <p
+          v-if="field.description"
+          class="text-xs text-muted-foreground"
        >
-          <SelectTrigger
-            id="tts-format"
-            class="w-full"
+          {{ field.description }}
+        </p>
+
+        <div
+          v-if="field.type === 'secret'"
+          class="relative"
+        >
+          <Input
+            :id="`tts-field-${field.key}`"
+            v-model="configData[field.key] as string"
+            :type="visibleSecrets[field.key] ? 'text' : 'password'"
+            :placeholder="field.example ? String(field.example) : ''"
+          />
+          <button
+            type="button"
+            class="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground"
+            @click="visibleSecrets[field.key] = !visibleSecrets[field.key]"
          >
-            <SelectValue :placeholder="$t('speech.fields.formatPlaceholder')" />
+            <component
+              :is="visibleSecrets[field.key] ? EyeOff : Eye"
+              class="size-3.5"
+            />
+          </button>
+        </div>
+
+        <Switch
+          v-else-if="field.type === 'bool'"
+          :model-value="!!configData[field.key]"
+          @update:model-value="(val) => configData[field.key] = !!val"
+        />
+
+        <Input
+          v-else-if="field.type === 'number'"
+          :id="`tts-field-${field.key}`"
+          v-model.number="configData[field.key] as number"
+          type="number"
+          :placeholder="field.example ? String(field.example) : ''"
+        />
+
+        <Select
+          v-else-if="field.type === 'enum' && field.enum"
+          :model-value="String(configData[field.key] ?? '')"
+          @update:model-value="(val) => configData[field.key] = val"
+        >
+          <SelectTrigger>
+            <SelectValue :placeholder="field.title || field.key" />
          </SelectTrigger>
          <SelectContent>
            <SelectItem
-              v-for="fmt in caps.formats"
-              :key="fmt"
-              :value="fmt"
+              v-for="opt in field.enum"
+              :key="opt"
+              :value="opt"
            >
-              {{ fmt }}
+              {{ opt }}
            </SelectItem>
          </SelectContent>
        </Select>
-      </div>

-      <!-- Speed -->
-      <div
-        v-if="caps.speed"
-        class="space-y-2"
-      >
-        <Label>{{ $t('speech.fields.speed') }}</Label>
-        <p class="text-xs text-muted-foreground">
-          {{ $t('speech.fields.speedDescription', { default: caps.speed.default ?? 1 }) }}
-        </p>
-        <div v-if="caps.speed.options && caps.speed.options.length > 0">
-          <Select
-            :model-value="String(configData.speed ?? caps.speed.default ?? 1)"
-            @update:model-value="(val) => configData.speed = Number(val)"
-          >
-            <SelectTrigger class="w-full">
-              <SelectValue />
-            </SelectTrigger>
-            <SelectContent>
-              <SelectItem
-                v-for="opt in caps.speed.options"
-                :key="opt"
-                :value="String(opt)"
-              >
-                {{ opt }}x
-              </SelectItem>
-            </SelectContent>
-          </Select>
-        </div>
-        <div
+        <Input
          v-else
-          class="flex items-center gap-3"
-        >
-          <Slider
-            :model-value="[Number(configData.speed ?? caps.speed.default ?? 1)]"
-            :min="caps.speed.min"
-            :max="caps.speed.max"
-            :step="0.1"
-            class="flex-1"
-            @update:model-value="(val) => configData.speed = val[0]"
-          />
-          <span class="text-xs text-muted-foreground w-12 text-right">
-            {{ Number(configData.speed ?? caps.speed.default ?? 1).toFixed(1) }}x
-          </span>
-        </div>
-      </div>
-
-      <!-- Pitch -->
-      <div
-        v-if="caps.pitch"
-        class="space-y-2"
-      >
-        <Label>{{ $t('speech.fields.pitch') }}</Label>
-        <p class="text-xs text-muted-foreground">
-          {{ $t('speech.fields.pitchDescription', { default: caps.pitch.default ?? 0 }) }}
-        </p>
-        <div
-          v-if="caps.pitch.options && caps.pitch.options.length > 0"
-        >
-          <Select
-            :model-value="String(configData.pitch ?? caps.pitch.default ?? 0)"
-            @update:model-value="(val) => configData.pitch = Number(val)"
-          >
-            <SelectTrigger class="w-full">
-              <SelectValue />
-            </SelectTrigger>
-            <SelectContent>
-              <SelectItem
-                v-for="opt in caps.pitch.options"
-                :key="opt"
-                :value="String(opt)"
-              >
-                {{ opt }} Hz
-              </SelectItem>
-            </SelectContent>
-          </Select>
-        </div>
-        <div
-          v-else
-          class="flex items-center gap-3"
-        >
-          <Slider
-            :model-value="[Number(configData.pitch ?? caps.pitch.default ?? 0)]"
-            :min="caps.pitch.min"
-            :max="caps.pitch.max"
-            :step="1"
-            class="flex-1"
-            @update:model-value="(val) => configData.pitch = val[0]"
-          />
-          <span class="text-xs text-muted-foreground w-16 text-right">
-            {{ Number(configData.pitch ?? caps.pitch.default ?? 0).toFixed(0) }} Hz
-          </span>
-        </div>
-      </div>
+          :id="`tts-field-${field.key}`"
+          v-model="configData[field.key] as string"
+          type="text"
+          :placeholder="field.example ? String(field.example) : ''"
+        />
+      </section>
    </template>

    <div
-      v-else
+      v-else-if="advancedFields.length === 0"
      class="text-xs text-muted-foreground"
    >
      {{ $t('speech.noCapabilities') }}
    </div>

+    <div
+      v-if="advancedFields.length > 0"
+      class="rounded-lg border border-border"
+    >
+      <button
+        type="button"
+        class="flex w-full items-center justify-between px-3 py-2 text-left text-xs font-medium"
+        @click="showAdvanced = !showAdvanced"
+      >
+        <span>{{ $t('speech.advanced.title') }}</span>
+        <component
+          :is="showAdvanced ? ChevronUp : ChevronDown"
+          class="size-3 text-muted-foreground"
+        />
+      </button>
+      <div
+        v-if="showAdvanced"
+        class="space-y-4 border-t border-border px-3 py-3"
+      >
+        <p class="text-xs text-muted-foreground">
+          {{ $t('speech.advanced.description') }}
+        </p>
+        <section
+          v-for="field in advancedFields"
+          :key="field.key"
+          class="space-y-2"
+        >
+          <Label :for="field.type === 'bool' || field.type === 'enum' ? undefined : `tts-field-${field.key}`">
+            {{ field.title || field.key }}
+          </Label>
+          <p
+            v-if="field.description"
+            class="text-xs text-muted-foreground"
+          >
+            {{ field.description }}
+          </p>
+
+          <div
+            v-if="field.type === 'secret'"
+            class="relative"
+          >
+            <Input
+              :id="`tts-field-${field.key}`"
+              v-model="configData[field.key] as string"
+              :type="visibleSecrets[field.key] ? 'text' : 'password'"
+              :placeholder="field.example ? String(field.example) : ''"
+            />
+            <button
+              type="button"
+              class="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground"
+              @click="visibleSecrets[field.key] = !visibleSecrets[field.key]"
+            >
+              <component
+                :is="visibleSecrets[field.key] ? EyeOff : Eye"
+                class="size-3.5"
+              />
+            </button>
+          </div>
+
+          <Switch
+            v-else-if="field.type === 'bool'"
+            :model-value="!!configData[field.key]"
+            @update:model-value="(val) => configData[field.key] = !!val"
+          />
+
+          <Input
+            v-else-if="field.type === 'number'"
+            :id="`tts-field-${field.key}`"
+            v-model.number="configData[field.key] as number"
+            type="number"
+            :placeholder="field.example ? String(field.example) : ''"
+          />
+
+          <Select
+            v-else-if="field.type === 'enum' && field.enum"
+            :model-value="String(configData[field.key] ?? '')"
+            @update:model-value="(val) => configData[field.key] = val"
+          >
+            <SelectTrigger>
+              <SelectValue :placeholder="field.title || field.key" />
+            </SelectTrigger>
+            <SelectContent>
+              <SelectItem
+                v-for="opt in field.enum"
+                :key="opt"
+                :value="opt"
+              >
+                {{ opt }}
+              </SelectItem>
+            </SelectContent>
+          </Select>
+
+          <Input
+            v-else
+            :id="`tts-field-${field.key}`"
+            v-model="configData[field.key] as string"
+            type="text"
+            :placeholder="field.example ? String(field.example) : ''"
+          />
+        </section>
+      </div>
+    </div>
+
    <Separator class="my-3" />

-    <!-- Test Synthesis -->
    <div class="space-y-3">
      <h4 class="text-xs font-medium">
        {{ $t('speech.test.title') }}
@@ -209,9 +218,7 @@
          :disabled="!testText.trim() || testText.length > maxTestTextLen"
          @click="handleTest"
        >
-          <Play
-            class="mr-1.5"
-          />
+          <Play class="mr-1.5" />
          {{ $t('speech.test.generate') }}
        </LoadingButton>
        <span
@@ -251,104 +258,88 @@

 <script setup lang="ts">
 import {
+  Input,
  Label,
  Select,
-  SelectTrigger,
-  SelectValue,
  SelectContent,
  SelectItem,
-  Slider,
-  Textarea,
+  SelectTrigger,
+  SelectValue,
  Separator,
+  Switch,
+  Textarea,
 } from '@memohai/ui'
-import { Play } from 'lucide-vue-next'
-import LoadingButton from '@/components/loading-button/index.vue'
-import type { TtsModelCapabilities, TtsVoiceInfo } from '@memohai/sdk'
+import { ChevronDown, ChevronUp, Eye, EyeOff, Play } from 'lucide-vue-next'
 import { computed, onBeforeUnmount, reactive, ref, watch } from 'vue'
 import { toast } from 'vue-sonner'
 import { useI18n } from 'vue-i18n'
+import LoadingButton from '@/components/loading-button/index.vue'
+
+interface SpeechFieldSchema { // One schema-described config field rendered by this editor's template.
+  key: string // Property name inside configData; also used to build the `tts-field-<key>` input id.
+  type: string // The template branches on 'secret', 'bool', 'number' and 'enum' (with options); anything else renders a plain text input.
+  title?: string // Label text; the template falls back to `key` when it is missing.
+  description?: string // Optional helper text rendered under the label.
+  required?: boolean // Not read anywhere in the visible part of this component.
+  advanced?: boolean // Truthy fields are listed in the collapsible advanced section, all others in the basic section.
+  enum?: string[] // Options for the Select rendered when type is 'enum'.
+  example?: unknown // Stringified into the input placeholder when truthy.
+  order?: number // Sort key used by orderedFields; a missing value counts as 0.
+}
+
+interface SpeechConfigSchema { // Shape of the `schema` prop accepted by this component.
+  fields?: SpeechFieldSchema[] // Field list; orderedFields treats a missing list as empty.
+}

 const props = defineProps<{
  modelId: string
  modelName: string
  config: Record<string, unknown>
-  capabilities: TtsModelCapabilities | null
+  schema: SpeechConfigSchema | null
+  onTest: (text: string, config: Record<string, unknown>) => Promise<Blob>
 }>()

 const emit = defineEmits<{
  save: [config: Record<string, unknown>]
-  test: [text: string, config: Record<string, unknown>]
 }>()

 const { t } = useI18n()
-
-const caps = computed(() => props.capabilities)
-
 const configData = reactive<Record<string, unknown>>({})
-
-watch(() => props.config, (cfg) => {
-  Object.keys(configData).forEach((k) => delete configData[k])
-  if (cfg.voice && typeof cfg.voice === 'object') {
-    const voice = cfg.voice as Record<string, unknown>
-    configData.voice_id = voice.id ?? ''
-    configData.voice_lang = voice.lang ?? ''
-  }
-  if (cfg.format) configData.format = cfg.format
-  if (cfg.speed != null) configData.speed = cfg.speed
-  if (cfg.pitch != null) configData.pitch = cfg.pitch
-  if (cfg.sample_rate != null) configData.sample_rate = cfg.sample_rate
-}, { immediate: true })
-
-const availableLanguages = computed(() => {
-  if (!caps.value?.voices) return []
-  const langs = new Set(caps.value.voices.map((v: TtsVoiceInfo) => v.lang ?? '').filter(Boolean))
-  return [...langs].sort()
-})
-
-const filteredVoices = computed(() => {
-  if (!caps.value?.voices) return []
-  const lang = configData.voice_lang
-  if (!lang) return caps.value.voices
-  return caps.value.voices.filter((v: TtsVoiceInfo) => v.lang === lang)
-})
-
-function onLangChange(lang: string) {
-  configData.voice_lang = lang
-  const voices = caps.value?.voices?.filter((v: TtsVoiceInfo) => v.lang === lang)
-  if (voices && voices.length > 0 && !voices.some((v: TtsVoiceInfo) => v.id === configData.voice_id)) {
-    configData.voice_id = voices[0].id ?? ''
-  }
-}
-
-function buildConfig(): Record<string, unknown> {
-  const result: Record<string, unknown> = {}
-  if (configData.voice_id || configData.voice_lang) {
-    result.voice = { id: configData.voice_id ?? '', lang: configData.voice_lang ?? '' }
-  }
-  if (configData.format) result.format = configData.format
-  if (configData.speed != null) result.speed = Number(configData.speed)
-  if (configData.pitch != null) result.pitch = Number(configData.pitch)
-  if (configData.sample_rate != null) result.sample_rate = Number(configData.sample_rate)
-  return result
-}
-
+const visibleSecrets = reactive<Record<string, boolean>>({})
 const saving = ref(false)
-async function handleSaveConfig() {
-  saving.value = true
-  try {
-    emit('save', buildConfig())
-  } finally {
-    saving.value = false
-  }
-}
-
-// Test synthesis
-const maxTestTextLen = 500
+const showAdvanced = ref(false)
 const testText = ref('')
 const testLoading = ref(false)
 const testError = ref('')
 const audioUrl = ref('')
 const audioEl = ref<HTMLAudioElement>()
+const maxTestTextLen = 500
+
+const orderedFields = computed(() => { // Schema fields sorted ascending by `order`.
+  const fields = props.schema?.fields ?? [] // a null schema or a missing field list yields an empty array
+  return [...fields].sort((a, b) => (a.order ?? 0) - (b.order ?? 0)) // sort a copy so the prop's array is not reordered in place; missing order counts as 0
+})
+
+const basicFields = computed(() => orderedFields.value.filter(field => !field.advanced)) // fields rendered in the always-visible section
+const advancedFields = computed(() => orderedFields.value.filter(field => field.advanced)) // fields rendered inside the collapsible advanced panel
+
+watch(() => props.config, (cfg) => { // Reload the editable copy whenever the config prop changes (and once at setup because of `immediate`).
+  Object.keys(configData).forEach((key) => delete configData[key]) // clear existing keys before copying the new config in
+  Object.assign(configData, { ...(cfg ?? {}) }) // top-level (shallow) copy of the incoming config
+  showAdvanced.value = advancedFields.value.some(field => { // expand the advanced panel only when an advanced field already has a value
+    const value = cfg?.[field.key]
+    return value !== '' && value != null // '', null and undefined count as unset
+  })
+}, { immediate: true, deep: true }) // NOTE(review): only `props.config` is watched, so a schema change alone does not re-evaluate showAdvanced — confirm this is intended
+
+function buildConfig(): Record<string, unknown> { // Returns a plain copy of configData without unset entries.
+  const result: Record<string, unknown> = {}
+  for (const [key, value] of Object.entries(configData)) {
+    if (value === '' || value == null) continue // skip '', null and undefined; false and 0 are kept
+    result[key] = value
+  }
+  return result
+}

 function revokeAudio() {
  if (audioUrl.value) {
@@ -359,6 +350,15 @@ function revokeAudio() {

 onBeforeUnmount(revokeAudio)

+async function handleSaveConfig() { // Emits `save` with the cleaned config produced by buildConfig().
+  saving.value = true
+  try {
+    emit('save', buildConfig()) // the emit is not awaited, so `saving` is reset as soon as it returns
+  } finally {
+    saving.value = false
+  }
+}
+
 async function handleTest() {
  if (!testText.value.trim()) return
  testLoading.value = true
@@ -366,39 +366,13 @@ async function handleTest() {
  revokeAudio()

  try {
-    const blob = await new Promise<Blob>((resolve, reject) => {
-      const handler = async () => {
-        try {
-          const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
-          const token = localStorage.getItem('token')
-          const resp = await fetch(`${apiBase}/speech-models/${props.modelId}/test`, {
-            method: 'POST',
-            headers: {
-              'Content-Type': 'application/json',
-              ...(token ? { Authorization: `Bearer ${token}` } : {}),
-            },
-            body: JSON.stringify({ text: testText.value, config: buildConfig() }),
-          })
-          if (!resp.ok) {
-            const errBody = await resp.text()
-            let msg: string
-            try { msg = JSON.parse(errBody)?.message ?? errBody } catch { msg = errBody }
-            reject(new Error(msg))
-            return
-          }
-          resolve(await resp.blob())
-        } catch (e) {
-          reject(e)
-        }
-      }
-      handler()
-    })
+    const blob = await props.onTest(testText.value, buildConfig())

    audioUrl.value = URL.createObjectURL(blob)
    await new Promise<void>((resolve) => setTimeout(resolve, 50))
    audioEl.value?.play()
-  } catch (e: unknown) {
-    const msg = e instanceof Error ? e.message : t('speech.test.failed')
+  } catch (error: unknown) {
+    const msg = error instanceof Error ? error.message : t('speech.test.failed')
    testError.value = msg
    toast.error(msg)
  } finally {
@@ -1,9 +1,19 @@
 <template>
  <div class="p-4">
    <section class="flex items-center gap-3">
-      <Volume2
-        class="size-5"
-      />
+      <span class="flex size-10 shrink-0 items-center justify-center rounded-full bg-muted">
+        <ProviderIcon
+          v-if="curProvider?.icon"
+          :icon="curProvider.icon"
+          size="1.5em"
+        />
+        <span
+          v-else
+          class="text-xs font-medium text-muted-foreground"
+        >
+          {{ getInitials(curProvider?.name) }}
+        </span>
+      </span>
      <div class="min-w-0">
        <h2 class="text-sm font-semibold truncate">
          {{ curProvider?.name }}
@@ -25,12 +35,121 @@
    </section>
    <Separator class="mt-4 mb-6" />

-    <!-- Models -->
+    <form
+      class="space-y-4"
+      @submit.prevent="handleSaveProvider"
+    >
+      <section class="space-y-2">
+        <Label for="speech-provider-name">{{ $t('common.name') }}</Label>
+        <Input
+          id="speech-provider-name"
+          v-model="providerName"
+          type="text"
+          :placeholder="$t('common.namePlaceholder')"
+        />
+      </section>
+
+      <section
+        v-for="field in orderedProviderFields"
+        :key="field.key"
+        class="space-y-2"
+      >
+        <Label :for="field.type === 'bool' || field.type === 'enum' ? undefined : `speech-provider-${field.key}`">
+          {{ field.title || field.key }}
+        </Label>
+        <p
+          v-if="field.description"
+          class="text-xs text-muted-foreground"
+        >
+          {{ field.description }}
+        </p>
+        <div
+          v-if="field.type === 'secret'"
+          class="relative"
+        >
+          <Input
+            :id="`speech-provider-${field.key}`"
+            v-model="providerConfig[field.key] as string"
+            :type="visibleSecrets[field.key] ? 'text' : 'password'"
+            :placeholder="field.example ? String(field.example) : ''"
+          />
+          <button
+            type="button"
+            class="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground"
+            @click="visibleSecrets[field.key] = !visibleSecrets[field.key]"
+          >
+            <component
+              :is="visibleSecrets[field.key] ? EyeOff : Eye"
+              class="size-3.5"
+            />
+          </button>
+        </div>
+        <Switch
+          v-else-if="field.type === 'bool'"
+          :model-value="!!providerConfig[field.key]"
+          @update:model-value="(val) => providerConfig[field.key] = !!val"
+        />
+        <Input
+          v-else-if="field.type === 'number'"
+          :id="`speech-provider-${field.key}`"
+          v-model.number="providerConfig[field.key] as number"
+          type="number"
+          :placeholder="field.example ? String(field.example) : ''"
+        />
+        <Select
+          v-else-if="field.type === 'enum' && field.enum"
+          :model-value="String(providerConfig[field.key] ?? '')"
+          @update:model-value="(val) => providerConfig[field.key] = val"
+        >
+          <SelectTrigger>
+            <SelectValue :placeholder="field.title || field.key" />
+          </SelectTrigger>
+          <SelectContent>
+            <SelectItem
+              v-for="opt in field.enum"
+              :key="opt"
+              :value="opt"
+            >
+              {{ opt }}
+            </SelectItem>
+          </SelectContent>
+        </Select>
+        <Input
+          v-else
+          :id="`speech-provider-${field.key}`"
+          v-model="providerConfig[field.key] as string"
+          type="text"
+          :placeholder="field.example ? String(field.example) : ''"
+        />
+      </section>
+
+      <div class="flex justify-end">
+        <LoadingButton
+          type="submit"
+          :loading="saveLoading"
+        >
+          {{ $t('provider.saveChanges') }}
+        </LoadingButton>
+      </div>
+    </form>
+
+    <Separator class="mt-6 mb-6" />
+
    <section>
      <div class="flex justify-between items-center mb-4">
        <h3 class="text-xs font-medium">
          {{ $t('speech.models') }}
        </h3>
+        <LoadingButton
+          v-if="curProviderId"
+          type="button"
+          variant="outline"
+          size="sm"
+          :loading="importLoading"
+          @click="handleImportModels"
+        >
+          {{ $t('speech.importModels') }}
+        </LoadingButton>
      </div>

      <div
@@ -71,8 +190,9 @@
            :model-id="model.id ?? ''"
            :model-name="model.model_id ?? ''"
            :config="model.config || {}"
-            :capabilities="getModelCapabilities(model.model_id ?? '')"
-            @test="(text, cfg) => handleTestModel(model.id ?? '', text, cfg)"
+            :schema="getModelSchema(model.model_id ?? '')"
+            :on-test="(text, cfg) => handleTestModel(model.id ?? '', text, cfg)"
+            @save="(cfg) => handleSaveModel(model.id ?? '', cfg)"
          />
        </div>
      </div>
@@ -82,65 +202,152 @@

 <script setup lang="ts">
 import {
+  Input,
+  Label,
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
  Separator,
  Switch,
 } from '@memohai/ui'
 import ModelConfigEditor from './model-config-editor.vue'
-import { Volume2, ChevronUp, ChevronDown } from 'lucide-vue-next'
-import { computed, inject, ref } from 'vue'
+import { ChevronDown, ChevronUp, Eye, EyeOff } from 'lucide-vue-next'
+import { computed, inject, reactive, ref, watch } from 'vue'
 import { toast } from 'vue-sonner'
 import { useI18n } from 'vue-i18n'
 import { useQuery, useQueryCache } from '@pinia/colada'
-import { getSpeechProvidersMeta, getSpeechModels, putProvidersById } from '@memohai/sdk'
-import type { TtsSpeechProviderResponse, TtsProviderMetaResponse, TtsModelInfo } from '@memohai/sdk'
+import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putModelsById, putProvidersById } from '@memohai/sdk'
+import type { TtsSpeechModelResponse, TtsSpeechProviderResponse } from '@memohai/sdk'
+import LoadingButton from '@/components/loading-button/index.vue'
+import ProviderIcon from '@/components/provider-icon/index.vue'
+
+interface SpeechFieldSchema { // One schema-described config field; drives the provider form in this component.
+  key: string // Property name inside providerConfig; also used to build the `speech-provider-<key>` input id.
+  type: string // The template branches on 'secret', 'bool', 'number' and 'enum' (with options); anything else renders a plain text input.
+  title?: string // Label text; the template falls back to `key` when it is missing.
+  description?: string // Optional helper text rendered under the label.
+  required?: boolean // Not read by the provider form in the visible part of this component.
+  advanced?: boolean // Not read by the provider form in the visible part of this component.
+  enum?: string[] // Options for the Select rendered when type is 'enum'.
+  example?: unknown // Stringified into the input placeholder when truthy.
+  order?: number // Sort key used by orderedProviderFields; a missing value counts as 0.
+}
+
+interface SpeechConfigSchema { // Container for a list of schema fields (used for both provider and model schemas below).
+  fields?: SpeechFieldSchema[] // Field list; consumers in this file treat a missing list as empty.
+}
+
+interface SpeechModelMeta { // Per-model entry inside a provider's metadata.
+  id: string // Compared against the model id in getModelMeta().
+  name: string
+  description?: string
+  config_schema?: SpeechConfigSchema // Preferred schema source in getModelSchema().
+  capabilities?: {
+    config_schema?: SpeechConfigSchema // Used by getModelSchema() only when the top-level config_schema is absent.
+  }
+}
+
+interface SpeechProviderMeta { // Metadata entry as returned by the speech providers meta query (cast target for its result).
+  provider: string // Matched against the current provider's client_type to pick currentMeta.
+  display_name: string
+  description?: string
+  config_schema?: SpeechConfigSchema // Source of the provider-level form fields (orderedProviderFields).
+  default_model?: string // Model id getModelMeta() falls back to when there is no exact match.
+  models?: SpeechModelMeta[] // Models searched by getModelMeta().
+}
+
+function getInitials(name: string | undefined) { // Fallback avatar text shown when the provider has no icon.
+  const label = name?.trim() ?? ''
+  return label ? label.slice(0, 2).toUpperCase() : '?' // first two characters upper-cased; '?' for a missing or blank name
+}

 const { t } = useI18n()
 const curProvider = inject('curTtsProvider', ref<TtsSpeechProviderResponse>())
 const curProviderId = computed(() => curProvider.value?.id)
+const providerName = ref('')
+const providerConfig = reactive<Record<string, unknown>>({})
+const visibleSecrets = reactive<Record<string, boolean>>({})
+const expandedModelId = ref('')
 const enableLoading = ref(false)
+const saveLoading = ref(false)
+const importLoading = ref(false)
+const queryCache = useQueryCache()
+
+const { data: providerDetail } = useQuery({ // Detail record for the currently selected provider.
+  key: () => ['speech-provider-detail', curProviderId.value], // keyed by provider id; the save handlers invalidate this same key
+  query: async () => {
+    if (!curProviderId.value) return null // no provider selected
+    const { data } = await getSpeechProvidersById({
+      path: { id: curProviderId.value },
+      throwOnError: true,
+    })
+    return data ?? null
+  },
+})

 const { data: metaList } = useQuery({
  key: () => ['speech-providers-meta'],
  query: async () => {
    const { data } = await getSpeechProvidersMeta({ throwOnError: true })
-    return data
+    return (data ?? []) as SpeechProviderMeta[]
  },
 })

-const currentMeta = computed<TtsProviderMetaResponse | null>(() => {
+const currentMeta = computed(() => {
  if (!metaList.value || !curProvider.value?.client_type) return null
-  return (metaList.value as TtsProviderMetaResponse[]).find((m) => m.provider === curProvider.value?.client_type) ?? null
+  return (metaList.value as SpeechProviderMeta[]).find((m) => m.provider === curProvider.value?.client_type) ?? null
 })

-function getModelCapabilities(modelId: string) {
-  const meta = currentMeta.value
-  if (!meta?.models) return null
-  return meta.models.find((m: TtsModelInfo) => m.id === modelId)?.capabilities ?? null
-}
+const orderedProviderFields = computed(() => { // Provider-level schema fields sorted ascending by `order`.
+  const fields = currentMeta.value?.config_schema?.fields ?? [] // no meta or no schema yields an empty form
+  return [...fields].sort((a, b) => (a.order ?? 0) - (b.order ?? 0)) // sort a copy so the cached meta array is not reordered in place; missing order counts as 0
+})

-const { data: allSpeechModels } = useQuery({
-  key: () => ['speech-models'],
+const { data: providerSpeechModels } = useQuery({
+  key: () => ['speech-provider-models', curProviderId.value],
  query: async () => {
-    const { data } = await getSpeechModels({ throwOnError: true })
-    return data
+    if (!curProviderId.value) return []
+    const { data } = await getSpeechProvidersByIdModels({
+      path: { id: curProviderId.value },
+      throwOnError: true,
+    })
+    return data ?? []
  },
 })

 const providerModels = computed(() => {
-  if (!allSpeechModels.value || !curProviderId.value) return []
-  return allSpeechModels.value.filter((m) => m.provider_id === curProviderId.value)
+  return (providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []
 })

-const expandedModelId = ref('')
+watch(() => providerDetail.value, (provider) => { // Sync the form state (name + config) from the fetched provider detail.
+  providerName.value = provider?.name ?? curProvider.value?.name ?? '' // fall back to the injected provider's name, then to ''
+  Object.keys(providerConfig).forEach((key) => delete providerConfig[key]) // clear existing keys before copying the new config in
+  Object.assign(providerConfig, { ...(provider?.config ?? {}) }) // top-level (shallow) copy of the fetched config
+}, { immediate: true, deep: true })
+
+function getModelMeta(modelID: string): SpeechModelMeta | null { // Looks up metadata for a model id within the current provider's meta.
+  const models = currentMeta.value?.models ?? []
+  const exact = models.find(m => m.id === modelID)
+  if (exact) return exact
+  if (currentMeta.value?.default_model) { // no exact match: use the declared default model's entry, or null if it is not listed
+    return models.find(m => m.id === currentMeta.value?.default_model) ?? null
+  }
+  return models[0] ?? null // no default declared: fall back to the first listed model, if any
+}
+
+function getModelSchema(modelID: string): SpeechConfigSchema | null { // Config schema handed to ModelConfigEditor for a model.
+  const meta = getModelMeta(modelID)
+  return meta?.config_schema ?? meta?.capabilities?.config_schema ?? null // top-level schema wins; capabilities.config_schema is the fallback
+}
+
 function toggleModel(id: string) { // Expands the given model row, or collapses it if it is already expanded.
  expandedModelId.value = expandedModelId.value === id ? '' : id // '' means no row is expanded
 }

-const queryCache = useQueryCache()
-
 async function handleToggleEnable(value: boolean) {
  if (!curProviderId.value || !curProvider.value) return
-
  const prev = curProvider.value.enable ?? false
  curProvider.value = { ...curProvider.value, enable: value }

@@ -148,10 +355,16 @@ async function handleToggleEnable(value: boolean) {
  try {
    await putProvidersById({
      path: { id: curProviderId.value },
-      body: { enable: value },
+      body: {
+        name: providerName.value.trim() || curProvider.value.name,
+        client_type: curProvider.value.client_type,
+        enable: value,
+        config: sanitizeConfig(providerConfig),
+      },
      throwOnError: true,
    })
    queryCache.invalidateQueries({ key: ['speech-providers'] })
+    queryCache.invalidateQueries({ key: ['speech-provider-detail', curProviderId.value] })
  } catch {
    curProvider.value = { ...curProvider.value, enable: prev }
    toast.error(t('common.saveFailed'))
@@ -160,6 +373,75 @@ async function handleToggleEnable(value: boolean) {
  }
 }

+/**
+ * Persist the edited name and config of the currently selected provider.
+ *
+ * The provider and its id are captured before the request, so the cache
+ * invalidation afterwards targets the provider that was actually saved even
+ * if the selection changes while the request is in flight.
+ */
+async function handleSaveProvider() {
+  const providerId = curProviderId.value
+  const provider = curProvider.value
+  if (!providerId || !provider) return
+  saveLoading.value = true
+  try {
+    await putProvidersById({
+      path: { id: providerId },
+      body: {
+        // A blank name input falls back to the provider's stored name.
+        name: providerName.value.trim() || provider.name,
+        client_type: provider.client_type,
+        enable: provider.enable,
+        config: sanitizeConfig(providerConfig),
+      },
+      throwOnError: true,
+    })
+    toast.success(t('speech.saveSuccess'))
+    queryCache.invalidateQueries({ key: ['speech-providers'] })
+    queryCache.invalidateQueries({ key: ['speech-provider-detail', providerId] })
+  } catch {
+    toast.error(t('common.saveFailed'))
+  } finally {
+    saveLoading.value = false
+  }
+}
+
+/**
+ * Save `config` for the speech model with row id `modelId`.
+ *
+ * Does nothing when the model is not in the current provider's model list.
+ * The provider id is captured before the request, so the per-provider model
+ * query that gets invalidated is the one the model was loaded from, even if
+ * the selection changes while the request is in flight.
+ */
+async function handleSaveModel(modelId: string, config: Record<string, unknown>) {
+  const providerId = curProviderId.value
+  const model = providerModels.value.find((item) => item.id === modelId)
+  if (!model) return
+  try {
+    await putModelsById({
+      path: { id: modelId },
+      body: {
+        model_id: model.model_id,
+        // Models without a display name are saved under their model_id.
+        name: model.name ?? model.model_id,
+        provider_id: model.provider_id,
+        type: 'speech',
+        config,
+      },
+      throwOnError: true,
+    })
+    toast.success(t('speech.saveSuccess'))
+    queryCache.invalidateQueries({ key: ['speech-provider-models', providerId] })
+    queryCache.invalidateQueries({ key: ['speech-models'] })
+  } catch {
+    toast.error(t('common.saveFailed'))
+  }
+}
+
+/**
+ * Ask the backend to import the selected provider's remote speech models and
+ * report the created/skipped counts in a toast.
+ *
+ * The provider id is captured before the request, so the model list that is
+ * refreshed afterwards belongs to the provider the import ran for, even if
+ * the selection changes while the request is in flight.
+ */
+async function handleImportModels() {
+  const providerId = curProviderId.value
+  if (!providerId) return
+  importLoading.value = true
+  try {
+    const { data } = await postSpeechProvidersByIdImportModels({
+      path: { id: providerId },
+      throwOnError: true,
+    })
+    toast.success(t('speech.importSuccess', {
+      created: data?.created ?? 0,
+      skipped: data?.skipped ?? 0,
+    }))
+    queryCache.invalidateQueries({ key: ['speech-provider-models', providerId] })
+    queryCache.invalidateQueries({ key: ['speech-models'] })
+    queryCache.invalidateQueries({ key: ['speech-providers-meta'] })
+  } catch {
+    toast.error(t('speech.importFailed'))
+  } finally {
+    importLoading.value = false
+  }
+}
+
 async function handleTestModel(modelId: string, text: string, config: Record<string, unknown>) {
  const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
  const token = localStorage.getItem('token')
@@ -183,4 +465,13 @@ async function handleTestModel(modelId: string, text: string, config: Record<str
  }
  return resp.blob()
 }
+
+/**
+ * Return a shallow copy of `input` without the entries whose value is an
+ * empty string, `null`, or `undefined`. Other falsy values such as `0` and
+ * `false` are kept.
+ */
+function sanitizeConfig(input: Record<string, unknown>) {
+  const cleaned: Record<string, unknown> = {}
+  Object.keys(input).forEach((field) => {
+    const fieldValue = input[field]
+    const isBlank = fieldValue === '' || fieldValue === null || fieldValue === undefined
+    if (!isBlank) cleaned[field] = fieldValue
+  })
+  return cleaned
+}
 </script>
@@ -18,6 +18,12 @@ import type { TtsSpeechProviderResponse } from '@memohai/sdk'
 import ProviderSetting from './components/provider-setting.vue'
 import { Volume2 } from 'lucide-vue-next'
 import MasterDetailSidebarLayout from '@/components/master-detail-sidebar-layout/index.vue'
+import ProviderIcon from '@/components/provider-icon/index.vue'
+
+/**
+ * Build the fallback avatar text for a provider without an icon: the first
+ * two characters of the trimmed name, upper-cased, or `'?'` when the name is
+ * missing or blank.
+ */
+function getInitials(name: string | undefined) {
+  const label = name?.trim() ?? ''
+  if (!label) return '?'
+  // Slice by code point rather than UTF-16 unit, so a name that starts with
+  // an emoji or another astral character is not cut through a surrogate pair.
+  return Array.from(label).slice(0, 2).join('').toUpperCase()
+}

 const { data: providerData } = useQuery({
  key: () => ['speech-providers'],
@@ -79,9 +85,17 @@ watch(filteredProviders, (list) => {
            >
              <span class="relative shrink-0">
                <span class="flex size-7 items-center justify-center rounded-full bg-muted">
-                  <Volume2
-                    class="size-3.5 text-muted-foreground"
+                  <ProviderIcon
+                    v-if="item.icon"
+                    :icon="item.icon"
+                    size="1.25em"
                  />
+                  <span
+                    v-else
+                    class="text-xs font-medium text-muted-foreground"
+                  >
+                    {{ getInitials(item.name) }}
+                  </span>
                </span>
                <span
                  v-if="item.enable !== false"
@@ -88,7 +88,6 @@ import (
 	"github.com/memohai/memoh/internal/storage/providers/fallback"
 	"github.com/memohai/memoh/internal/storage/providers/localfs"
 	ttspkg "github.com/memohai/memoh/internal/tts"
-	ttsedge "github.com/memohai/memoh/internal/tts/adapter/edge"
 	"github.com/memohai/memoh/internal/version"
 	"github.com/memohai/memoh/internal/workspace"
 )
@@ -520,10 +519,8 @@ func provideWebHandler(channelManager *channel.Manager, channelStore *channel.St
 	return h
 }

-func provideTtsRegistry(log *slog.Logger) *ttspkg.Registry {
-	reg := ttspkg.NewRegistry()
-	reg.Register(ttsedge.NewEdgeAdapter(log))
-	return reg
+func provideTtsRegistry() *ttspkg.Registry {
+	return ttspkg.NewRegistry()
 }

 func provideTtsTempStore() (*ttspkg.TempStore, error) {
@@ -687,6 +684,17 @@ func startRegistrySync(lc fx.Lifecycle, log *slog.Logger, cfg config.Config, que
 	})
 }

+// startSpeechProviderBootstrap registers an fx OnStart hook that calls
+// ttspkg.SyncRegistry for the given registry. A sync failure is logged as a
+// warning and never returned, so it does not abort application startup.
+func startSpeechProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, queries *dbsqlc.Queries, registry *ttspkg.Registry) {
+	syncOnStart := func(ctx context.Context) error {
+		syncErr := ttspkg.SyncRegistry(ctx, log, queries, registry)
+		if syncErr != nil {
+			log.Warn("speech registry bootstrap failed", slog.Any("error", syncErr))
+		}
+		return nil
+	}
+	lc.Append(fx.Hook{OnStart: syncOnStart})
+}
+
 func startMemoryProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, mpService *memprovider.Service, registry *memprovider.Registry) {
 	mpService.SetRegistry(registry)
 	lc.Append(fx.Hook{
@@ -141,6 +141,7 @@ func options() fx.Option {
 		fx.Invoke(
 			injectToolProviders,
 			startRegistrySync,
+			startSpeechProviderBootstrap,
 			startMemoryProviderBootstrap,
 			startSearchProviderBootstrap,
 			startScheduleService,
@@ -0,0 +1,8 @@
+name: Alibaba Cloud Speech
+client_type: alibabacloud-speech
+icon: bailian-color
+
+models:
+  - model_id: cosyvoice-tts
+    name: CosyVoice TTS
+    type: speech
@@ -0,0 +1,8 @@
+name: Deepgram Speech
+client_type: deepgram-speech
+icon: deepgram
+
+models:
+  - model_id: deepgram-tts
+    name: Deepgram TTS
+    type: speech
@@ -1,6 +1,6 @@
 name: Edge
 client_type: edge-speech
-icon: edge
+icon: microsoft

 models:
  - model_id: edge-read-aloud
@@ -0,0 +1,8 @@
+name: ElevenLabs Speech
+client_type: elevenlabs-speech
+icon: elevenlabs
+
+models:
+  - model_id: elevenlabs-tts
+    name: ElevenLabs TTS
+    type: speech
@@ -0,0 +1,8 @@
+name: Microsoft Speech
+client_type: microsoft-speech
+icon: azure-color
+
+models:
+  - model_id: microsoft-tts
+    name: Microsoft TTS
+    type: speech
@@ -0,0 +1,8 @@
+name: MiniMax Speech
+client_type: minimax-speech
+icon: minimax-color
+
+models:
+  - model_id: minimax-tts
+    name: MiniMax TTS
+    type: speech
@@ -0,0 +1,8 @@
+name: OpenAI Speech
+client_type: openai-speech
+icon: openai
+
+models:
+  - model_id: gpt-4o-mini-tts
+    name: GPT-4o Mini TTS
+    type: speech
@@ -0,0 +1,8 @@
+name: OpenRouter Speech
+client_type: openrouter-speech
+icon: openrouter
+
+models:
+  - model_id: openrouter-tts
+    name: OpenRouter TTS
+    type: speech
@@ -0,0 +1,8 @@
+name: Volcengine Speech
+client_type: volcengine-speech
+icon: volcengine-color
+
+models:
+  - model_id: sami-tts
+    name: SAMI TTS
+    type: speech
@@ -68,7 +68,23 @@ CREATE TABLE IF NOT EXISTS providers (
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  CONSTRAINT providers_name_unique UNIQUE (name),
-  CONSTRAINT providers_client_type_check CHECK (client_type IN ('openai-responses', 'openai-completions', 'anthropic-messages', 'google-generative-ai', 'openai-codex', 'github-copilot', 'edge-speech'))
+  CONSTRAINT providers_client_type_check CHECK (client_type IN (
+    'openai-responses',
+    'openai-completions',
+    'anthropic-messages',
+    'google-generative-ai',
+    'openai-codex',
+    'github-copilot',
+    'edge-speech',
+    'openai-speech',
+    'openrouter-speech',
+    'elevenlabs-speech',
+    'deepgram-speech',
+    'minimax-speech',
+    'volcengine-speech',
+    'alibabacloud-speech',
+    'microsoft-speech'
+  ))
 );

 CREATE TABLE IF NOT EXISTS search_providers (
@@ -0,0 +1,29 @@
+-- 0068_expand_speech_provider_client_types (rollback)
+-- Remove newly added Twilight speech provider client_type values from unified providers table.
+
+-- Destructive step: providers that use one of the new client_type values are
+-- deleted first, because the narrower CHECK constraint re-added below does not
+-- allow those values.
+DELETE FROM providers
+WHERE client_type IN (
+  'openai-speech',
+  'openrouter-speech',
+  'elevenlabs-speech',
+  'deepgram-speech',
+  'minimax-speech',
+  'volcengine-speech',
+  'alibabacloud-speech',
+  'microsoft-speech'
+);
+
+-- Replace the constraint with the pre-0068 list, where 'edge-speech' is the
+-- only speech client_type.
+ALTER TABLE IF EXISTS providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
+
+ALTER TABLE IF EXISTS providers
+  ADD CONSTRAINT providers_client_type_check CHECK (
+    client_type IN (
+      'openai-responses',
+      'openai-completions',
+      'anthropic-messages',
+      'google-generative-ai',
+      'openai-codex',
+      'github-copilot',
+      'edge-speech'
+    )
+  );
@@ -0,0 +1,25 @@
+-- 0068_expand_speech_provider_client_types
+-- Allow all Twilight speech provider client_type values in unified providers table.
+
+-- The CHECK constraint is dropped and recreated with the previous seven values
+-- plus the eight new '*-speech' client types.
+ALTER TABLE IF EXISTS providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
+
+ALTER TABLE IF EXISTS providers
+  ADD CONSTRAINT providers_client_type_check CHECK (
+    client_type IN (
+      'openai-responses',
+      'openai-completions',
+      'anthropic-messages',
+      'google-generative-ai',
+      'openai-codex',
+      'github-copilot',
+      'edge-speech',
+      'openai-speech',
+      'openrouter-speech',
+      'elevenlabs-speech',
+      'deepgram-speech',
+      'minimax-speech',
+      'volcengine-speech',
+      'alibabacloud-speech',
+      'microsoft-speech'
+    )
+  );
@@ -18,7 +18,17 @@ SELECT * FROM providers WHERE name = sqlc.arg(name);

 -- name: ListProviders :many
 SELECT * FROM providers
-WHERE client_type NOT IN ('edge-speech')
+WHERE client_type NOT IN (
+  'edge-speech',
+  'openai-speech',
+  'openrouter-speech',
+  'elevenlabs-speech',
+  'deepgram-speech',
+  'minimax-speech',
+  'volcengine-speech',
+  'alibabacloud-speech',
+  'microsoft-speech'
+)
 ORDER BY created_at DESC;

 -- name: UpdateProvider :one
@@ -38,8 +48,19 @@ RETURNING *;
 DELETE FROM providers WHERE id = sqlc.arg(id);

 -- name: CountProviders :one
-SELECT COUNT(*) FROM providers
-WHERE client_type NOT IN ('edge-speech');
+SELECT COUNT(*)
+FROM providers
+WHERE client_type NOT IN (
+  'edge-speech',
+  'openai-speech',
+  'openrouter-speech',
+  'elevenlabs-speech',
+  'deepgram-speech',
+  'minimax-speech',
+  'volcengine-speech',
+  'alibabacloud-speech',
+  'microsoft-speech'
+);

 -- name: CreateModel :one
 INSERT INTO models (model_id, name, provider_id, type, config)
@@ -110,6 +131,11 @@ DELETE FROM models WHERE id = sqlc.arg(id);
 -- name: DeleteModelByModelID :exec
 DELETE FROM models WHERE model_id = sqlc.arg(model_id);

+-- name: DeleteModelByProviderIDAndModelID :exec
+DELETE FROM models
+WHERE provider_id = sqlc.arg(provider_id)
+  AND model_id = sqlc.arg(model_id);
+
 -- name: CountModels :one
 SELECT COUNT(*) FROM models
 WHERE type != 'speech';
@@ -192,7 +218,17 @@ WHERE m.id = sqlc.arg(id)

 -- name: ListSpeechProviders :many
 SELECT * FROM providers
-WHERE client_type IN ('edge-speech')
+WHERE client_type IN (
+  'edge-speech',
+  'openai-speech',
+  'openrouter-speech',
+  'elevenlabs-speech',
+  'deepgram-speech',
+  'minimax-speech',
+  'volcengine-speech',
+  'alibabacloud-speech',
+  'microsoft-speech'
+)
 ORDER BY created_at DESC;

 -- name: ListSpeechModels :many
@@ -18,7 +18,6 @@ require (
 	github.com/creack/pty v1.1.24
 	github.com/emersion/go-imap/v2 v2.0.0-beta.8
 	github.com/emersion/go-sasl v0.0.0-20241020182733-b788ff22d5a6
-	github.com/go-playground/validator/v10 v10.30.1
 	github.com/go-shiori/go-readability v0.0.0-20251205110129-5db1dc9836f0
 	github.com/go-telegram-bot-api/telegram-bot-api/v5 v5.5.1
 	github.com/golang-jwt/jwt/v5 v5.3.1
@@ -27,18 +26,20 @@ require (
 	github.com/google/uuid v1.6.0
 	github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674
 	github.com/jackc/pgx/v5 v5.8.0
+	github.com/kenshaw/emoji v0.4.1
 	github.com/labstack/echo-jwt/v4 v4.4.0
 	github.com/labstack/echo/v4 v4.15.0
 	github.com/larksuite/oapi-sdk-go/v3 v3.5.3
 	github.com/mailgun/mailgun-go/v5 v5.14.0
 	github.com/memohai/acgo v0.0.0-20260221232113-babac0d6acd7
 	github.com/memohai/dingtalk-stream-sdk-go v0.0.0-20260405113102-87e23096b978
-	github.com/memohai/twilight-ai v0.3.4-0.20260412161211-dbedfe32c86f
+	github.com/memohai/twilight-ai v0.3.4-0.20260419121757-8ac67fb0bc04
 	github.com/modelcontextprotocol/go-sdk v1.5.0
 	github.com/opencontainers/image-spec v1.1.1
 	github.com/opencontainers/runtime-spec v1.3.0
 	github.com/qdrant/go-client v1.17.1
 	github.com/robfig/cron/v3 v3.0.1
+	github.com/slack-go/slack v0.19.0
 	github.com/spf13/cobra v1.10.2
 	github.com/stretchr/testify v1.11.1
 	github.com/swaggo/swag v1.16.6
@@ -66,6 +67,20 @@ require (
 	github.com/andybalholm/cascadia v1.3.3 // indirect
 	github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
 	github.com/atotto/clipboard v0.1.4 // indirect
+	github.com/aws/aws-sdk-go-v2 v1.41.5 // indirect
+	github.com/aws/aws-sdk-go-v2/config v1.32.14 // indirect
+	github.com/aws/aws-sdk-go-v2/credentials v1.19.14 // indirect
+	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/ini v1.8.6 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.21 // indirect
+	github.com/aws/aws-sdk-go-v2/service/signin v1.0.9 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sso v1.30.15 // indirect
+	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.19 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sts v1.41.10 // indirect
+	github.com/aws/smithy-go v1.24.2 // indirect
 	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
 	github.com/aymerick/douceur v0.2.0 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
@@ -95,7 +110,6 @@ require (
 	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/fsnotify/fsnotify v1.9.0 // indirect
-	github.com/gabriel-vasile/mimetype v1.4.12 // indirect
 	github.com/go-logr/logr v1.4.3 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-openapi/jsonpointer v0.22.4 // indirect
@@ -108,8 +122,6 @@ require (
 	github.com/go-openapi/swag/stringutils v0.25.4 // indirect
 	github.com/go-openapi/swag/typeutils v0.25.4 // indirect
 	github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
-	github.com/go-playground/locales v0.14.1 // indirect
-	github.com/go-playground/universal-translator v0.18.1 // indirect
 	github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
 	github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
@@ -121,10 +133,8 @@ require (
 	github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
 	github.com/jackc/puddle/v2 v2.2.2 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
-	github.com/kenshaw/emoji v0.4.1 // indirect
 	github.com/klauspost/compress v1.18.4 // indirect
 	github.com/labstack/gommon v0.4.2 // indirect
-	github.com/leodido/go-urn v1.4.0 // indirect
 	github.com/lib/pq v1.10.9 // indirect
 	github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
 	github.com/mattn/go-colorable v0.1.14 // indirect
@@ -158,7 +168,6 @@ require (
 	github.com/segmentio/asm v1.2.1 // indirect
 	github.com/segmentio/encoding v0.5.4 // indirect
 	github.com/sirupsen/logrus v1.9.4 // indirect
-	github.com/slack-go/slack v0.19.0 // indirect
 	github.com/spf13/pflag v1.0.9 // indirect
 	github.com/valyala/bytebufferpool v1.0.0 // indirect
 	github.com/valyala/fasttemplate v1.2.2 // indirect
@@ -32,6 +32,34 @@ github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhP
 github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw=
 github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4=
 github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI=
+github.com/aws/aws-sdk-go-v2 v1.41.5 h1:dj5kopbwUsVUVFgO4Fi5BIT3t4WyqIDjGKCangnV/yY=
+github.com/aws/aws-sdk-go-v2 v1.41.5/go.mod h1:mwsPRE8ceUUpiTgF7QmQIJ7lgsKUPQOUl3o72QBrE1o=
+github.com/aws/aws-sdk-go-v2/config v1.32.14 h1:opVIRo/ZbbI8OIqSOKmpFaY7IwfFUOCCXBsUpJOwDdI=
+github.com/aws/aws-sdk-go-v2/config v1.32.14/go.mod h1:U4/V0uKxh0Tl5sxmCBZ3AecYny4UNlVmObYjKuuaiOo=
+github.com/aws/aws-sdk-go-v2/credentials v1.19.14 h1:n+UcGWAIZHkXzYt87uMFBv/l8THYELoX6gVcUvgl6fI=
+github.com/aws/aws-sdk-go-v2/credentials v1.19.14/go.mod h1:cJKuyWB59Mqi0jM3nFYQRmnHVQIcgoxjEMAbLkpr62w=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21 h1:NUS3K4BTDArQqNu2ih7yeDLaS3bmHD0YndtA6UP884g=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21/go.mod h1:YWNWJQNjKigKY1RHVJCuupeWDrrHjRqHm0N9rdrWzYI=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21 h1:Rgg6wvjjtX8bNHcvi9OnXWwcE0a2vGpbwmtICOsvcf4=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21/go.mod h1:A/kJFst/nm//cyqonihbdpQZwiUhhzpqTsdbhDdRF9c=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21 h1:PEgGVtPoB6NTpPrBgqSE5hE/o47Ij9qk/SEZFbUOe9A=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21/go.mod h1:p+hz+PRAYlY3zcpJhPwXlLC4C+kqn70WIHwnzAfs6ps=
+github.com/aws/aws-sdk-go-v2/internal/ini v1.8.6 h1:qYQ4pzQ2Oz6WpQ8T3HvGHnZydA72MnLuFK9tJwmrbHw=
+github.com/aws/aws-sdk-go-v2/internal/ini v1.8.6/go.mod h1:O3h0IK87yXci+kg6flUKzJnWeziQUKciKrLjcatSNcY=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7 h1:5EniKhLZe4xzL7a+fU3C2tfUN4nWIqlLesfrjkuPFTY=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7/go.mod h1:x0nZssQ3qZSnIcePWLvcoFisRXJzcTVvYpAAdYX8+GI=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.21 h1:c31//R3xgIJMSC8S6hEVq+38DcvUlgFY0FM6mSI5oto=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.21/go.mod h1:r6+pf23ouCB718FUxaqzZdbpYFyDtehyZcmP5KL9FkA=
+github.com/aws/aws-sdk-go-v2/service/signin v1.0.9 h1:QKZH0S178gCmFEgst8hN0mCX1KxLgHBKKY/CLqwP8lg=
+github.com/aws/aws-sdk-go-v2/service/signin v1.0.9/go.mod h1:7yuQJoT+OoH8aqIxw9vwF+8KpvLZ8AWmvmUWHsGQZvI=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.15 h1:lFd1+ZSEYJZYvv9d6kXzhkZu07si3f+GQ1AaYwa2LUM=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.15/go.mod h1:WSvS1NLr7JaPunCXqpJnWk1Bjo7IxzZXrZi1QQCkuqM=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.19 h1:dzztQ1YmfPrxdrOiuZRMF6fuOwWlWpD2StNLTceKpys=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.19/go.mod h1:YO8TrYtFdl5w/4vmjL8zaBSsiNp3w0L1FfKVKenZT7w=
+github.com/aws/aws-sdk-go-v2/service/sts v1.41.10 h1:p8ogvvLugcR/zLBXTXrTkj0RYBUdErbMnAFFp12Lm/U=
+github.com/aws/aws-sdk-go-v2/service/sts v1.41.10/go.mod h1:60dv0eZJfeVXfbT1tFJinbHrDfSJ2GZl4Q//OSSNAVw=
+github.com/aws/smithy-go v1.24.2 h1:FzA3bu/nt/vDvmnkg+R8Xl46gmzEDam6mZ1hzmwXFng=
+github.com/aws/smithy-go v1.24.2/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc=
 github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
 github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
 github.com/aymanbagabas/go-udiff v0.3.1 h1:LV+qyBQ2pqe0u42ZsUEtPiCaUoqgA9gYRDs3vj1nolY=
@@ -140,8 +168,6 @@ github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2
 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
 github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
 github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
-github.com/gabriel-vasile/mimetype v1.4.12 h1:e9hWvmLYvtp846tLHam2o++qitpguFiYCKbn0w9jyqw=
-github.com/gabriel-vasile/mimetype v1.4.12/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s=
 github.com/go-chi/chi/v5 v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug=
 github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0=
 github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
@@ -176,14 +202,6 @@ github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxE
 github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg=
 github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls=
 github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54=
-github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
-github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
-github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
-github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
-github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
-github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
-github.com/go-playground/validator/v10 v10.30.1 h1:f3zDSN/zOma+w6+1Wswgd9fLkdwy06ntQJp0BBvFG0w=
-github.com/go-playground/validator/v10 v10.30.1/go.mod h1:oSuBIQzuJxL//3MelwSLD5hc2Tu889bF0Idm9Dg26cM=
 github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w=
 github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
 github.com/go-shiori/go-readability v0.0.0-20251205110129-5db1dc9836f0 h1:A3B75Yp163FAIf9nLlFMl4pwIj+T3uKxfI7mbvvY2Ls=
@@ -192,6 +210,8 @@ github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1v
 github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
 github.com/go-telegram-bot-api/telegram-bot-api/v5 v5.5.1 h1:wG8n/XJQ07TmjbITcGiUaOtXxdrINDz1b0J1w0SzqDc=
 github.com/go-telegram-bot-api/telegram-bot-api/v5 v5.5.1/go.mod h1:A2S0CWkNylc2phvKXWBBdD3K0iGnDBGbzRpISP2zBl8=
+github.com/go-test/deep v1.1.1 h1:0r/53hagsehfO4bzD2Pgr/+RgHqhmf+k1Bpse2cTu1U=
+github.com/go-test/deep v1.1.1/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE=
 github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
 github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
 github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
@@ -277,8 +297,6 @@ github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0
 github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
 github.com/larksuite/oapi-sdk-go/v3 v3.5.3 h1:xvf8Dv29kBXC5/DNDCLhHkAFW8l/0LlQJimO5Zn+JUk=
 github.com/larksuite/oapi-sdk-go/v3 v3.5.3/go.mod h1:ZEplY+kwuIrj/nqw5uSCINNATcH3KdxSN7y+UxYY5fI=
-github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
-github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
 github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
 github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
 github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag=
@@ -299,8 +317,8 @@ github.com/memohai/acgo v0.0.0-20260221232113-babac0d6acd7 h1:beehwOQperqGWj4m4E
 github.com/memohai/acgo v0.0.0-20260221232113-babac0d6acd7/go.mod h1:OvmxM7JmnXBmwJWWVqtreL3HSHSKuzPbtbhlg5MvBg0=
 github.com/memohai/dingtalk-stream-sdk-go v0.0.0-20260405113102-87e23096b978 h1:6gD8DvZkimGmU0e3PjlusJPyw55SyeoE12CZQoYUa8g=
 github.com/memohai/dingtalk-stream-sdk-go v0.0.0-20260405113102-87e23096b978/go.mod h1:2LMgK5QYFlTSvrGY+sI/j+jK2WK+YGHv4IMuiW+iPSc=
-github.com/memohai/twilight-ai v0.3.4-0.20260412161211-dbedfe32c86f h1:9NAj+FyDJPi8RzD1PUwb6OxZx/OrBD2FJo4tVAlhpbs=
-github.com/memohai/twilight-ai v0.3.4-0.20260412161211-dbedfe32c86f/go.mod h1:1uNfZWc8du+HWJ3r3FLyeGAXGiUAniuSWV89A8gbcz0=
+github.com/memohai/twilight-ai v0.3.4-0.20260419121757-8ac67fb0bc04 h1:8TnRoVU7u2aSvXDLMmlGt4l92TjjP7LYWqu73Rx0uGo=
+github.com/memohai/twilight-ai v0.3.4-0.20260419121757-8ac67fb0bc04/go.mod h1:s5s03jeYgK56SaHH9oVHua73xCmiSG4uyfCZKi9fCHk=
 github.com/microcosm-cc/bluemonday v1.0.27 h1:MpEUotklkwCSLeH+Qdx1VJgNqLlpY2KXwXFM08ygZfk=
 github.com/microcosm-cc/bluemonday v1.0.27/go.mod h1:jFi9vgW+H7c3V0lb6nR74Ib/DIB5OBs92Dimizgw2cA=
 github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
@@ -35,8 +35,19 @@ func (q *Queries) CountModelsByType(ctx context.Context, type_ string) (int64, e
 }

 const countProviders = `-- name: CountProviders :one
-SELECT COUNT(*) FROM providers
-WHERE client_type NOT IN ('edge-speech')
+SELECT COUNT(*)
+FROM providers
+WHERE client_type NOT IN (
+  'edge-speech',
+  'openai-speech',
+  'openrouter-speech',
+  'elevenlabs-speech',
+  'deepgram-speech',
+  'minimax-speech',
+  'volcengine-speech',
+  'alibabacloud-speech',
+  'microsoft-speech'
+)
 `

 func (q *Queries) CountProviders(ctx context.Context) (int64, error) {
@@ -190,6 +201,22 @@ func (q *Queries) DeleteModelByModelID(ctx context.Context, modelID string) erro
 	return err
 }

+const deleteModelByProviderIDAndModelID = `-- name: DeleteModelByProviderIDAndModelID :exec
+DELETE FROM models
+WHERE provider_id = $1
+  AND model_id = $2
+`
+
+type DeleteModelByProviderIDAndModelIDParams struct {
+	ProviderID pgtype.UUID `json:"provider_id"`
+	ModelID    string      `json:"model_id"`
+}
+
+func (q *Queries) DeleteModelByProviderIDAndModelID(ctx context.Context, arg DeleteModelByProviderIDAndModelIDParams) error {
+	_, err := q.db.Exec(ctx, deleteModelByProviderIDAndModelID, arg.ProviderID, arg.ModelID)
+	return err
+}
+
 const deleteProvider = `-- name: DeleteProvider :exec
 DELETE FROM providers WHERE id = $1
 `
@@ -717,7 +744,17 @@ func (q *Queries) ListModelsByType(ctx context.Context, type_ string) ([]Model,

 const listProviders = `-- name: ListProviders :many
 SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers
-WHERE client_type NOT IN ('edge-speech')
+WHERE client_type NOT IN (
+  'edge-speech',
+  'openai-speech',
+  'openrouter-speech',
+  'elevenlabs-speech',
+  'deepgram-speech',
+  'minimax-speech',
+  'volcengine-speech',
+  'alibabacloud-speech',
+  'microsoft-speech'
+)
 ORDER BY created_at DESC
 `

@@ -840,7 +877,17 @@ func (q *Queries) ListSpeechModelsByProviderID(ctx context.Context, providerID p

 const listSpeechProviders = `-- name: ListSpeechProviders :many
 SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers
-WHERE client_type IN ('edge-speech')
+WHERE client_type IN (
+  'edge-speech',
+  'openai-speech',
+  'openrouter-speech',
+  'elevenlabs-speech',
+  'deepgram-speech',
+  'minimax-speech',
+  'volcengine-speech',
+  'alibabacloud-speech',
+  'microsoft-speech'
+)
 ORDER BY created_at DESC
 `

@@ -1,31 +1,39 @@
 package handlers

 import (
+	"errors"
+	"fmt"
 	"log/slog"
 	"net/http"
 	"strings"

 	"github.com/labstack/echo/v4"

+	"github.com/memohai/memoh/internal/models"
 	"github.com/memohai/memoh/internal/tts"
 )

 type SpeechHandler struct {
-	service *tts.Service
-	logger  *slog.Logger
+	service       *tts.Service
+	modelsService *models.Service
+	logger        *slog.Logger
 }

-func NewSpeechHandler(log *slog.Logger, service *tts.Service) *SpeechHandler {
+func NewSpeechHandler(log *slog.Logger, service *tts.Service, modelsService *models.Service) *SpeechHandler {
 	return &SpeechHandler{
-		service: service,
-		logger:  log.With(slog.String("handler", "speech")),
+		service:       service,
+		modelsService: modelsService,
+		logger:        log.With(slog.String("handler", "speech")),
 	}
 }

 func (h *SpeechHandler) Register(e *echo.Echo) {
 	pg := e.Group("/speech-providers")
 	pg.GET("", h.ListProviders)
+	pg.GET("/:id", h.GetProvider)
 	pg.GET("/meta", h.ListMeta)
+	pg.GET("/:id/models", h.ListModelsByProvider)
+	pg.POST("/:id/import-models", h.ImportModels)

 	mg := e.Group("/speech-models")
 	mg.GET("", h.ListModels)
@@ -60,6 +68,105 @@ func (h *SpeechHandler) ListProviders(c echo.Context) error {
 	return c.JSON(http.StatusOK, items)
 }

+// GetProvider godoc
+// @Summary Get speech provider
+// @Description Get a speech provider with masked config values
+// @Tags speech-providers
+// @Produce json
+// @Param id path string true "Provider ID (UUID)"
+// @Success 200 {object} tts.SpeechProviderResponse
+// @Failure 400 {object} ErrorResponse
+// @Failure 404 {object} ErrorResponse
+// @Router /speech-providers/{id} [get].
+func (h *SpeechHandler) GetProvider(c echo.Context) error {
+	providerID := strings.TrimSpace(c.Param("id"))
+	if providerID == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+	provider, lookupErr := h.service.GetSpeechProvider(c.Request().Context(), providerID)
+	if lookupErr != nil {
+		// Every lookup failure is reported to the client as 404.
+		return echo.NewHTTPError(http.StatusNotFound, lookupErr.Error())
+	}
+	return c.JSON(http.StatusOK, provider)
+}
+
+// ListModelsByProvider godoc
+// @Summary List speech models by provider
+// @Description List models of type 'speech' for a specific speech provider
+// @Tags speech-providers
+// @Produce json
+// @Param id path string true "Provider ID (UUID)"
+// @Success 200 {array} tts.SpeechModelResponse
+// @Failure 400 {object} ErrorResponse
+// @Failure 500 {object} ErrorResponse
+// @Router /speech-providers/{id}/models [get].
+func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
+	providerID := strings.TrimSpace(c.Param("id"))
+	if providerID == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+	speechModels, listErr := h.service.ListSpeechModelsByProvider(c.Request().Context(), providerID)
+	if listErr != nil {
+		// Every listing failure is reported to the client as 500.
+		return echo.NewHTTPError(http.StatusInternalServerError, listErr.Error())
+	}
+	return c.JSON(http.StatusOK, speechModels)
+}
+
+// ImportModels godoc
+// @Summary Import speech models from provider
+// @Description Fetch models using the configured speech provider and import them into the unified models table
+// @Tags speech-providers
+// @Accept json
+// @Produce json
+// @Param id path string true "Provider ID (UUID)"
+// @Success 200 {object} tts.ImportModelsResponse
+// @Failure 400 {object} ErrorResponse
+// @Failure 404 {object} ErrorResponse
+// @Failure 500 {object} ErrorResponse
+// @Router /speech-providers/{id}/import-models [post].
+func (h *SpeechHandler) ImportModels(c echo.Context) error {
+	providerID := strings.TrimSpace(c.Param("id"))
+	if providerID == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+	ctx := c.Request().Context()
+
+	remote, fetchErr := h.service.FetchRemoteModels(ctx, providerID)
+	if fetchErr != nil {
+		return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote speech models: %v", fetchErr))
+	}
+
+	result := tts.ImportModelsResponse{
+		Models: make([]string, 0, len(remote)),
+	}
+
+	for _, candidate := range remote {
+		// Models without a usable display name are stored under their ID.
+		displayName := strings.TrimSpace(candidate.Name)
+		if displayName == "" {
+			displayName = candidate.ID
+		}
+
+		_, createErr := h.modelsService.Create(ctx, models.AddRequest{
+			ModelID:    candidate.ID,
+			Name:       displayName,
+			ProviderID: providerID,
+			Type:       models.ModelTypeSpeech,
+			Config:     models.ModelConfig{},
+		})
+		switch {
+		case createErr == nil:
+			result.Created++
+			result.Models = append(result.Models, candidate.ID)
+		case errors.Is(createErr, models.ErrModelIDAlreadyExists):
+			// Already imported earlier: counted as skipped, not as a failure.
+			result.Skipped++
+		default:
+			// Any other failure is only logged; the remaining models are still processed.
+			h.logger.Warn("failed to import speech model", slog.String("model_id", candidate.ID), slog.Any("error", createErr))
+		}
+	}
+
+	return c.JSON(http.StatusOK, result)
+}
+
 // ListModels godoc
 // @Summary List all speech models
 // @Description List all models of type 'speech' (filtered view of unified models table)
@@ -430,7 +430,15 @@ func IsValidClientType(clientType ClientType) bool {
 		ClientTypeGoogleGenerativeAI,
 		ClientTypeOpenAICodex,
 		ClientTypeGitHubCopilot,
-		ClientTypeEdgeSpeech:
+		ClientTypeEdgeSpeech,
+		ClientTypeOpenAISpeech,
+		ClientTypeOpenRouterSpeech,
+		ClientTypeElevenLabsSpeech,
+		ClientTypeDeepgramSpeech,
+		ClientTypeMiniMaxSpeech,
+		ClientTypeVolcengineSpeech,
+		ClientTypeAlibabaSpeech,
+		ClientTypeMicrosoftSpeech:
 		return true
 	default:
 		return false
@@ -438,19 +446,9 @@ func IsValidClientType(clientType ClientType) bool {
 }

 // IsLLMClientType returns true if the client type belongs to the LLM domain
-// (chat/embedding), excluding speech-only types like edge-speech.
+// (chat/embedding), excluding speech-only types (any type ending in "-speech").
 func IsLLMClientType(clientType ClientType) bool {
-	switch clientType {
-	case ClientTypeOpenAIResponses,
-		ClientTypeOpenAICompletions,
-		ClientTypeAnthropicMessages,
-		ClientTypeGoogleGenerativeAI,
-		ClientTypeOpenAICodex,
-		ClientTypeGitHubCopilot:
-		return true
-	default:
-		return false
-	}
+	// The check is name-based: a valid client type counts as LLM unless its string
+	// value ends in "-speech", so speech types must keep that suffix to be excluded.
+	return IsValidClientType(clientType) && !strings.HasSuffix(string(clientType), "-speech")
 }

 // SelectMemoryModel selects a chat model for memory operations.
@@ -24,6 +24,14 @@ const (
 	ClientTypeOpenAICodex        ClientType = "openai-codex"
 	ClientTypeGitHubCopilot      ClientType = "github-copilot"
 	ClientTypeEdgeSpeech         ClientType = "edge-speech"
+	ClientTypeOpenAISpeech       ClientType = "openai-speech"
+	ClientTypeOpenRouterSpeech   ClientType = "openrouter-speech"
+	ClientTypeElevenLabsSpeech   ClientType = "elevenlabs-speech"
+	ClientTypeDeepgramSpeech     ClientType = "deepgram-speech"
+	ClientTypeMiniMaxSpeech      ClientType = "minimax-speech"
+	ClientTypeVolcengineSpeech   ClientType = "volcengine-speech"
+	ClientTypeAlibabaSpeech      ClientType = "alibabacloud-speech"
+	ClientTypeMicrosoftSpeech    ClientType = "microsoft-speech"
 )

 const (
@@ -0,0 +1,68 @@
+package tts
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+
+	"github.com/jackc/pgx/v5/pgtype"
+
+	"github.com/memohai/memoh/internal/db/sqlc"
+)
+
+// SyncRegistry writes every provider definition returned by registry.List() to the
+// database. Each provider is upserted with UpsertRegistryProvider and each of its
+// models with UpsertRegistryModel (type "speech"); models for which
+// shouldHideTemplateModel reports true are deleted instead. The first marshalling or
+// query error aborts the sync and is returned wrapped. logger may be nil, in which
+// case the per-provider summary line is not emitted.
+func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
+	for _, def := range registry.List() {
+		// Providers are upserted with an empty JSON object as their config.
+		configJSON, err := json.Marshal(map[string]any{})
+		if err != nil {
+			return fmt.Errorf("marshal speech provider config: %w", err)
+		}
+		// icon stays the zero pgtype.Text (Valid == false) when the definition has no icon.
+		var icon pgtype.Text
+		if def.Icon != "" {
+			icon = pgtype.Text{String: def.Icon, Valid: true}
+		}
+
+		provider, err := queries.UpsertRegistryProvider(ctx, sqlc.UpsertRegistryProviderParams{
+			Name:       def.DisplayName,
+			ClientType: string(def.ClientType),
+			Icon:       icon,
+			Config:     configJSON,
+		})
+		if err != nil {
+			return fmt.Errorf("upsert speech provider %s: %w", def.ClientType, err)
+		}
+
+		// synced counts the models upserted for this provider; it only feeds the log line.
+		synced := 0
+		for _, model := range def.Models {
+			// Hidden template models are deleted for this provider rather than upserted.
+			if shouldHideTemplateModel(def, model.ID) {
+				if err := queries.DeleteModelByProviderIDAndModelID(ctx, sqlc.DeleteModelByProviderIDAndModelIDParams{
+					ProviderID: provider.ID,
+					ModelID:    model.ID,
+				}); err != nil {
+					return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
+				}
+				continue
+			}
+			// Models are upserted with an empty JSON object as their config.
+			modelConfigJSON, err := json.Marshal(map[string]any{})
+			if err != nil {
+				return fmt.Errorf("marshal speech model config: %w", err)
+			}
+			// An empty model name is passed with Valid == false.
+			name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
+			if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
+				ModelID:    model.ID,
+				Name:       name,
+				ProviderID: provider.ID,
+				Type:       "speech",
+				Config:     modelConfigJSON,
+			}); err != nil {
+				return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
+			}
+			synced++
+		}
+
+		if logger != nil {
+			logger.Info("speech registry synced", slog.String("provider", string(def.ClientType)), slog.Int("models", synced))
+		}
+	}
+	return nil
+}
@@ -1,32 +1,41 @@
 package tts

-import "github.com/go-playground/validator/v10"
-
-var validate = validator.New()
-
-// VoiceConfig identifies a TTS voice and its language.
-// Both fields are optional; adapters fill in their own defaults when empty.
+// VoiceConfig is kept for backward compatibility with the legacy Edge adapter tests.
+// It carries a voice ID and a language; the struct no longer has validate tags.
 type VoiceConfig struct {
-	ID   string `json:"id"   validate:"omitempty"`
-	Lang string `json:"lang" validate:"omitempty"`
+	ID   string `json:"id"`
+	Lang string `json:"lang"`
 }

-// AudioConfig is the user-facing configuration for a TTS request.
+// AudioConfig is kept for backward compatibility with the legacy Edge adapter tests.
+// The fields no longer carry validate tags, so constraints such as the former
+// sample-rate oneof check are not enforced by this type.
 type AudioConfig struct {
-	Format     string      `json:"format"      validate:"omitempty"`
-	SampleRate int         `json:"sample_rate"  validate:"omitempty,oneof=16000 24000 48000"`
-	Speed      float64     `json:"speed"        validate:"omitempty"`
-	Pitch      float64     `json:"pitch"        validate:"omitempty"`
+	Format     string      `json:"format"`
+	SampleRate int         `json:"sample_rate"`
+	Speed      float64     `json:"speed"`
+	Pitch      float64     `json:"pitch"`
 	Voice      VoiceConfig `json:"voice"`
 }

-func (c AudioConfig) Validate() error {
-	return validate.Struct(c)
+// Validate always returns nil; the validator-based struct check was removed and the
+// method is retained so AudioConfig keeps the same method set.
+func (AudioConfig) Validate() error { return nil }
+
+// FieldSchema describes a single dynamic speech config field.
+type FieldSchema struct {
+	// Key is the config map key the field is stored under.
+	Key         string   `json:"key"`
+	// Type is the field kind; the helpers in this package emit "string", "secret",
+	// "number", "bool" and "enum".
+	Type        string   `json:"type"`
+	Title       string   `json:"title,omitempty"`
+	Description string   `json:"description,omitempty"`
+	Required    bool     `json:"required,omitempty"`
+	// Advanced is set by advancedStringField; presumably the UI groups such fields
+	// separately — TODO confirm against the frontend.
+	Advanced    bool     `json:"advanced,omitempty"`
+	// Enum lists the allowed values for "enum" fields.
+	Enum        []string `json:"enum,omitempty"`
+	Example     any      `json:"example,omitempty"`
+	// Order is always serialized (no omitempty); presumably the display sort key — TODO confirm.
+	Order       int      `json:"order"`
+}
+
+// ConfigSchema is the list of dynamic config fields for a speech provider or model.
+// Fields has no omitempty, so a nil slice serializes as "fields": null.
+type ConfigSchema struct {
+	Fields []FieldSchema `json:"fields"`
 }

 // ParamConstraint describes valid values for a numeric parameter.
-// If Options is non-empty, only those discrete values are allowed (frontend renders a select).
-// Otherwise Min/Max define a continuous range (frontend renders a slider).
+// If Options is non-empty, only those discrete values are allowed.
 type ParamConstraint struct {
 	Options []float64 `json:"options,omitempty"`
 	Min     float64   `json:"min,omitempty"`
@@ -34,20 +43,23 @@ type ParamConstraint struct {
 	Default float64   `json:"default"`
 }

-// ModelCapabilities describes what a specific TTS model supports.
-// nil pointer means the parameter is not supported; frontend should hide it.
+// ModelCapabilities exposes optional UX hints for speech config forms.
+// NOTE(review): ConfigSchema is a struct value, so its omitempty tag has no effect
+// with encoding/json and "config_schema" is always emitted — confirm this is intended.
 type ModelCapabilities struct {
-	Voices  []VoiceInfo      `json:"voices"`
-	Formats []string         `json:"formats"`
-	Speed   *ParamConstraint `json:"speed,omitempty"`
-	Pitch   *ParamConstraint `json:"pitch,omitempty"`
+	ConfigSchema ConfigSchema      `json:"config_schema,omitempty"`
+	Voices       []VoiceInfo       `json:"voices,omitempty"`
+	Formats      []string          `json:"formats,omitempty"`
+	Speed        *ParamConstraint  `json:"speed,omitempty"`
+	Pitch        *ParamConstraint  `json:"pitch,omitempty"`
+	Metadata     map[string]string `json:"metadata,omitempty"`
 }

-// ModelInfo describes a single model exposed by a TTS adapter.
+// ModelInfo describes a single speech model exposed by a provider definition.
+// TemplateOnly is set on the built-in "openrouter-tts" and "elevenlabs-tts" entries;
+// presumably it feeds the decision to hide such models — TODO confirm against
+// shouldHideTemplateModel.
+// NOTE(review): ConfigSchema is a struct value, so its omitempty tag has no effect
+// with encoding/json and "config_schema" is always emitted.
 type ModelInfo struct {
 	ID           string            `json:"id"`
 	Name         string            `json:"name"`
 	Description  string            `json:"description,omitempty"`
+	TemplateOnly bool              `json:"template_only,omitempty"`
+	ConfigSchema ConfigSchema      `json:"config_schema,omitempty"`
 	Capabilities ModelCapabilities `json:"capabilities"`
 }

@@ -2,47 +2,586 @@ package tts

 import (
 	"fmt"
+	"sort"
+	"strings"
 	"sync"
+
+	alibabaspeech "github.com/memohai/twilight-ai/provider/alibabacloud/speech"
+	deepgramspeech "github.com/memohai/twilight-ai/provider/deepgram/speech"
+	edgespeech "github.com/memohai/twilight-ai/provider/edge/speech"
+	elevenlabsspeech "github.com/memohai/twilight-ai/provider/elevenlabs/speech"
+	microsoftspeech "github.com/memohai/twilight-ai/provider/microsoft/speech"
+	minimaxspeech "github.com/memohai/twilight-ai/provider/minimax/speech"
+	openaispeech "github.com/memohai/twilight-ai/provider/openai/speech"
+	openrouterspeech "github.com/memohai/twilight-ai/provider/openrouter/speech"
+	volcenginespeech "github.com/memohai/twilight-ai/provider/volcengine/speech"
+	sdk "github.com/memohai/twilight-ai/sdk"
+
+	"github.com/memohai/memoh/internal/models"
 )

+// ProviderFactory builds an sdk.SpeechProvider from a provider-level config map.
+type ProviderFactory func(config map[string]any) (sdk.SpeechProvider, error)
+
+// ProviderDefinition is the static description of one speech provider held by Registry.
+type ProviderDefinition struct {
+	// ClientType is the registry key; registering the same ClientType again replaces the entry.
+	ClientType   models.ClientType
+	DisplayName  string
+	Icon         string
+	Description  string
+	// ConfigSchema describes the provider-level config fields (e.g. api_key, base_url).
+	ConfigSchema ConfigSchema
+	DefaultModel string
+	// NOTE(review): presumably indicates that remote model listing is available — confirm against its readers.
+	SupportsList bool
+	Models       []ModelInfo
+	// Factory builds the SDK provider from the provider-level config map.
+	Factory      ProviderFactory
+	// Order is the primary ascending sort key for listing; ties fall back to DisplayName.
+	Order        int
+}
+
+// Registry stores speech provider definitions keyed by client type. mu guards both
+// providers and ordered; ordered holds the keys in listing order.
 type Registry struct {
-	mu       sync.RWMutex
-	adapters map[TtsType]TtsAdapter
+	mu        sync.RWMutex
+	providers map[models.ClientType]ProviderDefinition
+	ordered   []models.ClientType
 }

+// NewRegistry returns a Registry pre-populated with every definition from
+// defaultProviderDefinitions.
 func NewRegistry() *Registry {
-	return &Registry{adapters: make(map[TtsType]TtsAdapter)}
+	r := &Registry{
+		providers: make(map[models.ClientType]ProviderDefinition),
+	}
+	for _, def := range defaultProviderDefinitions() {
+		r.Register(def)
+	}
+	return r
 }

-func (r *Registry) Register(a TtsAdapter) {
+// Register adds def, or replaces the existing definition with the same ClientType,
+// then re-sorts the listing order by Order ascending and DisplayName as tie-breaker.
+func (r *Registry) Register(def ProviderDefinition) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
-	r.adapters[a.Type()] = a
+
+	// Only a previously unseen client type gets a new slot in the ordering.
+	if _, exists := r.providers[def.ClientType]; !exists {
+		r.ordered = append(r.ordered, def.ClientType)
+	}
+	r.providers[def.ClientType] = def
+	// Sorting happens after the map write so a replaced entry is ordered by its new values.
+	sort.SliceStable(r.ordered, func(i, j int) bool {
+		left := r.providers[r.ordered[i]]
+		right := r.providers[r.ordered[j]]
+		if left.Order != right.Order {
+			return left.Order < right.Order
+		}
+		return left.DisplayName < right.DisplayName
+	})
 }

-func (r *Registry) Get(name TtsType) (TtsAdapter, error) {
+// Get returns the definition registered for clientType, or a zero ProviderDefinition
+// and a "speech provider not found" error when none is registered.
+func (r *Registry) Get(clientType models.ClientType) (ProviderDefinition, error) {
 	r.mu.RLock()
 	defer r.mu.RUnlock()
-	a, ok := r.adapters[name]
+	def, ok := r.providers[clientType]
 	if !ok {
-		return nil, fmt.Errorf("tts adapter not found: %s", name)
+		return ProviderDefinition{}, fmt.Errorf("speech provider not found: %s", clientType)
 	}
-	return a, nil
+	return def, nil
+}
+
+// List returns a new slice with all registered definitions in listing order
+// (the order maintained by Register).
+func (r *Registry) List() []ProviderDefinition {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	out := make([]ProviderDefinition, 0, len(r.ordered))
+	for _, key := range r.ordered {
+		out = append(out, r.providers[key])
+	}
+	return out
 }

+// ListMeta converts every registered definition, in List order, into a
+// ProviderMetaResponse. Locking is handled inside List.
 func (r *Registry) ListMeta() []ProviderMetaResponse {
-	r.mu.RLock()
-	defer r.mu.RUnlock()
-	metas := make([]ProviderMetaResponse, 0, len(r.adapters))
-	for _, a := range r.adapters {
-		meta := a.Meta()
+	defs := r.List()
+	metas := make([]ProviderMetaResponse, 0, len(defs))
+	for _, def := range defs {
 		metas = append(metas, ProviderMetaResponse{
-			Provider:     string(a.Type()),
-			DisplayName:  meta.Provider,
-			Description:  meta.Description,
-			DefaultModel: a.DefaultModel(),
-			Models:       a.Models(),
+			Provider:     string(def.ClientType),
+			DisplayName:  def.DisplayName,
+			Description:  def.Description,
+			ConfigSchema: def.ConfigSchema,
+			DefaultModel: def.DefaultModel,
+			Models:       def.Models,
 		})
 	}
 	return metas
 }
+
+func defaultProviderDefinitions() []ProviderDefinition {
+	edgeVoices := make([]VoiceInfo, 0)
+	for lang, ids := range edgespeech.EdgeTTSVoices {
+		for _, id := range ids {
+			name := strings.TrimPrefix(id, lang+"-")
+			name = strings.TrimSuffix(name, "Neural")
+			edgeVoices = append(edgeVoices, VoiceInfo{ID: id, Lang: lang, Name: name})
+		}
+	}
+	sort.Slice(edgeVoices, func(i, j int) bool {
+		if edgeVoices[i].Lang != edgeVoices[j].Lang {
+			return edgeVoices[i].Lang < edgeVoices[j].Lang
+		}
+		return edgeVoices[i].ID < edgeVoices[j].ID
+	})
+
+	return []ProviderDefinition{
+		{
+			ClientType:   models.ClientTypeEdgeSpeech,
+			DisplayName:  "Microsoft Edge",
+			Icon:         "microsoft",
+			Description:  "Free Edge Read Aloud TTS",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{stringField("base_url", "Base URL", "Override the Edge WebSocket endpoint", false, "", 10)}},
+			DefaultModel: "edge-read-aloud",
+			SupportsList: false,
+			Models: []ModelInfo{{
+				ID:          "edge-read-aloud",
+				Name:        "Edge Read Aloud",
+				Description: "Built-in Edge Read Aloud speech model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					enumField("voice", "Voice", "Edge voice ID", false, voiceIDs(edgeVoices), 10),
+					stringField("language", "Language", "Optional BCP-47 language tag", false, "en-US", 20),
+					enumField("format", "Format", "Output audio format", false, []string{"audio-24khz-48kbitrate-mono-mp3", "audio-24khz-96kbitrate-mono-mp3", "webm-24khz-16bit-mono-opus"}, 30),
+					numberField("speed", "Speed", "Speech rate, 1.0 = normal", false, 1.0, 40),
+					numberField("pitch", "Pitch", "Pitch adjustment in Hz", false, 0, 50),
+				}},
+				Capabilities: ModelCapabilities{
+					ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+						enumField("voice", "Voice", "Edge voice ID", false, voiceIDs(edgeVoices), 10),
+						stringField("language", "Language", "Optional BCP-47 language tag", false, "en-US", 20),
+						enumField("format", "Format", "Output audio format", false, []string{"audio-24khz-48kbitrate-mono-mp3", "audio-24khz-96kbitrate-mono-mp3", "webm-24khz-16bit-mono-opus"}, 30),
+						numberField("speed", "Speed", "Speech rate, 1.0 = normal", false, 1.0, 40),
+						numberField("pitch", "Pitch", "Pitch adjustment in Hz", false, 0, 50),
+					}},
+					Voices:  edgeVoices,
+					Formats: []string{"audio-24khz-48kbitrate-mono-mp3", "audio-24khz-96kbitrate-mono-mp3", "webm-24khz-16bit-mono-opus"},
+					Speed:   &ParamConstraint{Options: []float64{0.5, 1.0, 2.0, 3.0}, Default: 1.0},
+					Pitch:   &ParamConstraint{Min: -100, Max: 100, Default: 0},
+				},
+			}},
+			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
+				opts := []edgespeech.Option{}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, edgespeech.WithBaseURL(v))
+				}
+				return edgespeech.New(opts...), nil
+			},
+			Order: 10,
+		},
+		{
+			ClientType:  models.ClientTypeOpenAISpeech,
+			DisplayName: "OpenAI Speech",
+			Icon:        "openai",
+			Description: "OpenAI /audio/speech compatible TTS",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+				secretField("api_key", "API Key", "Bearer API key", true, 10),
+				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.openai.com/v1", 20),
+			}},
+			DefaultModel: "gpt-4o-mini-tts",
+			SupportsList: true,
+			Models: []ModelInfo{{
+				ID:          "gpt-4o-mini-tts",
+				Name:        "gpt-4o-mini-tts",
+				Description: "Default OpenAI speech model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("voice", "Voice", "Voice ID", false, "coral", 10),
+					enumField("response_format", "Response Format", "Audio format", false, []string{"mp3", "opus", "pcm", "wav"}, 20),
+					numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
+					stringField("instructions", "Instructions", "Style instructions for supported models", false, "", 40),
+				}},
+				Capabilities: ModelCapabilities{
+					ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+						stringField("voice", "Voice", "Voice ID", false, "coral", 10),
+						enumField("response_format", "Response Format", "Audio format", false, []string{"mp3", "opus", "pcm", "wav"}, 20),
+						numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
+						stringField("instructions", "Instructions", "Style instructions for supported models", false, "", 40),
+					}},
+					Formats: []string{"mp3", "opus", "pcm", "wav"},
+				},
+			}},
+			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
+				opts := []openaispeech.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, openaispeech.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, openaispeech.WithBaseURL(v))
+				}
+				return openaispeech.New(opts...), nil
+			},
+			Order: 20,
+		},
+		{
+			ClientType:  models.ClientTypeOpenRouterSpeech,
+			DisplayName: "OpenRouter Speech",
+			Icon:        "openrouter",
+			Description: "OpenRouter audio modality TTS",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+				secretField("api_key", "API Key", "OpenRouter API key", true, 10),
+				stringField("base_url", "Base URL", "Override the API base URL", false, "https://openrouter.ai/api/v1", 20),
+			}},
+			DefaultModel: "openrouter-tts",
+			SupportsList: true,
+			Models: []ModelInfo{{
+				ID:           "openrouter-tts",
+				Name:         "openrouter-tts",
+				Description:  "Default OpenRouter speech wrapper model",
+				TemplateOnly: true,
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					advancedStringField("model", "Model", "Underlying OpenRouter model ID", false, "openai/gpt-audio-mini", 10),
+					stringField("voice", "Voice", "Voice name", false, "coral", 20),
+					numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
+				}},
+				Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					advancedStringField("model", "Model", "Underlying OpenRouter model ID", false, "openai/gpt-audio-mini", 10),
+					stringField("voice", "Voice", "Voice name", false, "coral", 20),
+					numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
+				}}},
+			}},
+			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
+				opts := []openrouterspeech.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, openrouterspeech.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, openrouterspeech.WithBaseURL(v))
+				}
+				return openrouterspeech.New(opts...), nil
+			},
+			Order: 30,
+		},
+		{
+			ClientType:  models.ClientTypeElevenLabsSpeech,
+			DisplayName: "ElevenLabs Speech",
+			Icon:        "elevenlabs",
+			Description: "ElevenLabs text-to-speech",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+				secretField("api_key", "API Key", "ElevenLabs API key", true, 10),
+				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.elevenlabs.io", 20),
+			}},
+			DefaultModel: "elevenlabs-tts",
+			SupportsList: true,
+			Models: []ModelInfo{{
+				ID:           "elevenlabs-tts",
+				Name:         "elevenlabs-tts",
+				Description:  "Default ElevenLabs speech wrapper model",
+				TemplateOnly: true,
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("voice_id", "Voice ID", "ElevenLabs voice ID", true, "", 10),
+					advancedStringField("model_id", "Model ID", "ElevenLabs model ID", false, "eleven_multilingual_v2", 20),
+					numberField("stability", "Stability", "Voice stability 0-1", false, 0.5, 30),
+					numberField("similarity_boost", "Similarity Boost", "Voice similarity boost 0-1", false, 0.75, 40),
+					numberField("style", "Style", "Speaking style intensity 0-1", false, 0, 50),
+					boolField("use_speaker_boost", "Speaker Boost", "Enable speaker boost", false, 60),
+					numberField("speed", "Speed", "Speech rate 0.5-2.0", false, 1.0, 70),
+					stringField("output_format", "Output Format", "Output format", false, "mp3_44100_128", 80),
+					numberField("seed", "Seed", "Deterministic seed", false, 0, 90),
+					enumField("apply_text_normalization", "Text Normalization", "Text normalization mode", false, []string{"auto", "on", "off"}, 100),
+					stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "en-US", 110),
+				}},
+				Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("voice_id", "Voice ID", "ElevenLabs voice ID", true, "", 10),
+					advancedStringField("model_id", "Model ID", "ElevenLabs model ID", false, "eleven_multilingual_v2", 20),
+					numberField("stability", "Stability", "Voice stability 0-1", false, 0.5, 30),
+					numberField("similarity_boost", "Similarity Boost", "Voice similarity boost 0-1", false, 0.75, 40),
+					numberField("style", "Style", "Speaking style intensity 0-1", false, 0, 50),
+					boolField("use_speaker_boost", "Speaker Boost", "Enable speaker boost", false, 60),
+					numberField("speed", "Speed", "Speech rate 0.5-2.0", false, 1.0, 70),
+					stringField("output_format", "Output Format", "Output format", false, "mp3_44100_128", 80),
+					numberField("seed", "Seed", "Deterministic seed", false, 0, 90),
+					enumField("apply_text_normalization", "Text Normalization", "Text normalization mode", false, []string{"auto", "on", "off"}, 100),
+					stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "en-US", 110),
+				}}},
+			}},
+			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
+				opts := []elevenlabsspeech.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, elevenlabsspeech.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, elevenlabsspeech.WithBaseURL(v))
+				}
+				return elevenlabsspeech.New(opts...), nil
+			},
+			Order: 40,
+		},
+		{
+			ClientType:  models.ClientTypeDeepgramSpeech,
+			DisplayName: "Deepgram Speech",
+			Icon:        "deepgram",
+			Description: "Deepgram TTS",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+				secretField("api_key", "API Key", "Deepgram API key", true, 10),
+				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.deepgram.com", 20),
+			}},
+			DefaultModel: "deepgram-tts",
+			SupportsList: false,
+			Models: []ModelInfo{{
+				ID:          "deepgram-tts",
+				Name:        "deepgram-tts",
+				Description: "Default Deepgram speech wrapper model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					advancedStringField("model", "Model", "Deepgram voice model", false, "aura-2-asteria-en", 10),
+					enumField("encoding", "Encoding", "Audio encoding", false, []string{"linear16", "mulaw", "alaw"}, 20),
+					numberField("sample_rate", "Sample Rate", "Audio sample rate in Hz", false, 24000, 30),
+					enumField("container", "Container", "Audio container", false, []string{"wav", "none"}, 40),
+				}},
+				Capabilities: ModelCapabilities{
+					ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+						advancedStringField("model", "Model", "Deepgram voice model", false, "aura-2-asteria-en", 10),
+						enumField("encoding", "Encoding", "Audio encoding", false, []string{"linear16", "mulaw", "alaw"}, 20),
+						numberField("sample_rate", "Sample Rate", "Audio sample rate in Hz", false, 24000, 30),
+						enumField("container", "Container", "Audio container", false, []string{"wav", "none"}, 40),
+					}},
+					Formats: []string{"wav", "none"},
+				},
+			}},
+			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
+				opts := []deepgramspeech.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, deepgramspeech.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, deepgramspeech.WithBaseURL(v))
+				}
+				return deepgramspeech.New(opts...), nil
+			},
+			Order: 50,
+		},
+		{
+			ClientType:  models.ClientTypeMiniMaxSpeech,
+			DisplayName: "MiniMax Speech",
+			Icon:        "minimax-color",
+			Description: "MiniMax TTS",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+				secretField("api_key", "API Key", "MiniMax API key", true, 10),
+				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.minimax.io", 20),
+			}},
+			DefaultModel: "minimax-tts",
+			SupportsList: false,
+			Models: []ModelInfo{{
+				ID:          "minimax-tts",
+				Name:        "minimax-tts",
+				Description: "Default MiniMax speech wrapper model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("voice_id", "Voice ID", "MiniMax voice ID", false, "English_expressive_narrator", 10),
+					advancedStringField("model", "Model", "MiniMax model", false, "speech-2.8-hd", 20),
+					numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
+					numberField("vol", "Volume", "Volume", false, 1.0, 40),
+					numberField("pitch", "Pitch", "Pitch adjustment", false, 0, 50),
+					enumField("output_format", "Output Format", "Audio format", false, []string{"mp3", "pcm", "flac", "wav"}, 60),
+					numberField("sample_rate", "Sample Rate", "Audio sample rate", false, 32000, 70),
+				}},
+				Capabilities: ModelCapabilities{
+					ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+						stringField("voice_id", "Voice ID", "MiniMax voice ID", false, "English_expressive_narrator", 10),
+						advancedStringField("model", "Model", "MiniMax model", false, "speech-2.8-hd", 20),
+						numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
+						numberField("vol", "Volume", "Volume", false, 1.0, 40),
+						numberField("pitch", "Pitch", "Pitch adjustment", false, 0, 50),
+						enumField("output_format", "Output Format", "Audio format", false, []string{"mp3", "pcm", "flac", "wav"}, 60),
+						numberField("sample_rate", "Sample Rate", "Audio sample rate", false, 32000, 70),
+					}},
+					Formats: []string{"mp3", "pcm", "flac", "wav"},
+				},
+			}},
+			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
+				opts := []minimaxspeech.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, minimaxspeech.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, minimaxspeech.WithBaseURL(v))
+				}
+				return minimaxspeech.New(opts...), nil
+			},
+			Order: 60,
+		},
+		{
+			ClientType:  models.ClientTypeVolcengineSpeech,
+			DisplayName: "Volcengine Speech",
+			Icon:        "volcengine-color",
+			Description: "Volcengine SAMI TTS",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+				secretField("access_key", "Access Key", "Volcengine access key", true, 10),
+				secretField("secret_key", "Secret Key", "Volcengine secret key", true, 20),
+				secretField("app_key", "App Key", "SAMI app key", true, 30),
+				stringField("base_url", "Base URL", "Override the API base URL", false, "https://sami.bytedance.com", 40),
+			}},
+			DefaultModel: "sami-tts",
+			SupportsList: false,
+			Models: []ModelInfo{{
+				ID:          "sami-tts",
+				Name:        "sami-tts",
+				Description: "Default Volcengine SAMI speech wrapper model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("speaker", "Speaker", "Speaker ID", true, "", 10),
+					enumField("encoding", "Encoding", "Output encoding", false, []string{"mp3", "wav", "aac"}, 20),
+					numberField("sample_rate", "Sample Rate", "Audio sample rate", false, 24000, 30),
+					numberField("speech_rate", "Speech Rate", "Speech rate [-50,100]", false, 0, 40),
+					numberField("pitch_rate", "Pitch Rate", "Pitch rate [-12,12]", false, 0, 50),
+				}},
+				Capabilities: ModelCapabilities{
+					ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+						stringField("speaker", "Speaker", "Speaker ID", true, "", 10),
+						enumField("encoding", "Encoding", "Output encoding", false, []string{"mp3", "wav", "aac"}, 20),
+						numberField("sample_rate", "Sample Rate", "Audio sample rate", false, 24000, 30),
+						numberField("speech_rate", "Speech Rate", "Speech rate [-50,100]", false, 0, 40),
+						numberField("pitch_rate", "Pitch Rate", "Pitch rate [-12,12]", false, 0, 50),
+					}},
+					Formats: []string{"mp3", "wav", "aac"},
+				},
+			}},
+			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
+				opts := []volcenginespeech.Option{}
+				if v := configString(config, "access_key"); v != "" {
+					opts = append(opts, volcenginespeech.WithAccessKey(v))
+				}
+				if v := configString(config, "secret_key"); v != "" {
+					opts = append(opts, volcenginespeech.WithSecretKey(v))
+				}
+				if v := configString(config, "app_key"); v != "" {
+					opts = append(opts, volcenginespeech.WithAppKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, volcenginespeech.WithBaseURL(v))
+				}
+				return volcenginespeech.New(opts...), nil
+			},
+			Order: 70,
+		},
+		{
+			ClientType:  models.ClientTypeAlibabaSpeech,
+			DisplayName: "Alibaba Cloud Speech",
+			Icon:        "bailian-color",
+			Description: "DashScope CosyVoice TTS",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+				secretField("api_key", "API Key", "DashScope API key", true, 10),
+				stringField("base_url", "Base URL", "Override the WebSocket endpoint", false, "wss://dashscope.aliyuncs.com/api-ws/v1/inference/", 20),
+			}},
+			DefaultModel: "cosyvoice-tts",
+			SupportsList: false,
+			Models: []ModelInfo{{
+				ID:          "cosyvoice-tts",
+				Name:        "cosyvoice-tts",
+				Description: "Default DashScope CosyVoice wrapper model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					advancedStringField("model", "Model", "DashScope model ID", false, "cosyvoice-v1", 10),
+					stringField("voice", "Voice", "Voice or custom clone ID", true, "", 20),
+					enumField("format", "Format", "Audio format", false, []string{"mp3", "wav", "pcm", "opus"}, 30),
+					numberField("sample_rate", "Sample Rate", "Audio sample rate", false, 22050, 40),
+					numberField("volume", "Volume", "Volume 0-100", false, 50, 50),
+					numberField("rate", "Rate", "Speech rate 0.5-2.0", false, 1.0, 60),
+					numberField("pitch", "Pitch", "Pitch multiplier 0.5-2.0", false, 1.0, 70),
+				}},
+				Capabilities: ModelCapabilities{
+					ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+						advancedStringField("model", "Model", "DashScope model ID", false, "cosyvoice-v1", 10),
+						stringField("voice", "Voice", "Voice or custom clone ID", true, "", 20),
+						enumField("format", "Format", "Audio format", false, []string{"mp3", "wav", "pcm", "opus"}, 30),
+						numberField("sample_rate", "Sample Rate", "Audio sample rate", false, 22050, 40),
+						numberField("volume", "Volume", "Volume 0-100", false, 50, 50),
+						numberField("rate", "Rate", "Speech rate 0.5-2.0", false, 1.0, 60),
+						numberField("pitch", "Pitch", "Pitch multiplier 0.5-2.0", false, 1.0, 70),
+					}},
+					Formats: []string{"mp3", "wav", "pcm", "opus"},
+				},
+			}},
+			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
+				opts := []alibabaspeech.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, alibabaspeech.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, alibabaspeech.WithBaseURL(v))
+				}
+				return alibabaspeech.New(opts...), nil
+			},
+			Order: 80,
+		},
+		{
+			ClientType:  models.ClientTypeMicrosoftSpeech,
+			DisplayName: "Microsoft Speech",
+			Icon:        "azure-color",
+			Description: "Azure Cognitive Services TTS",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+				secretField("api_key", "API Key", "Azure speech subscription key", true, 10),
+				stringField("base_url", "Base URL", "Optional full TTS endpoint override", false, "", 20),
+			}},
+			DefaultModel: "microsoft-tts",
+			SupportsList: false,
+			Models: []ModelInfo{{
+				ID:          "microsoft-tts",
+				Name:        "microsoft-tts",
+				Description: "Default Azure speech wrapper model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("region", "Region", "Azure region, e.g. eastus", false, "eastus", 10),
+					stringField("voice", "Voice", "Azure voice name", false, "en-US-JennyNeural", 20),
+					stringField("language", "Language", "Optional BCP-47 language tag", false, "en-US", 30),
+					stringField("output_format", "Output Format", "Azure output format", false, "audio-16khz-128kbitrate-mono-mp3", 40),
+					stringField("style", "Style", "Optional speaking style", false, "", 50),
+					stringField("rate", "Rate", "Optional speaking rate", false, "", 60),
+					stringField("pitch", "Pitch", "Optional pitch adjustment", false, "", 70),
+				}},
+				Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("region", "Region", "Azure region, e.g. eastus", false, "eastus", 10),
+					stringField("voice", "Voice", "Azure voice name", false, "en-US-JennyNeural", 20),
+					stringField("language", "Language", "Optional BCP-47 language tag", false, "en-US", 30),
+					stringField("output_format", "Output Format", "Azure output format", false, "audio-16khz-128kbitrate-mono-mp3", 40),
+					stringField("style", "Style", "Optional speaking style", false, "", 50),
+					stringField("rate", "Rate", "Optional speaking rate", false, "", 60),
+					stringField("pitch", "Pitch", "Optional pitch adjustment", false, "", 70),
+				}}},
+			}},
+			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
+				opts := []microsoftspeech.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, microsoftspeech.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, microsoftspeech.WithBaseURL(v))
+				}
+				return microsoftspeech.New(opts...), nil
+			},
+			Order: 90,
+		},
+	}
+}
+
+// stringField builds a FieldSchema of type "string" from the given attributes.
+func stringField(key, title, description string, required bool, example any, order int) FieldSchema {
+	return FieldSchema{Key: key, Type: "string", Title: title, Description: description, Required: required, Example: example, Order: order}
+}
+
+// advancedStringField builds a FieldSchema of type "string" with Advanced set to true.
+func advancedStringField(key, title, description string, required bool, example any, order int) FieldSchema {
+	return FieldSchema{Key: key, Type: "string", Title: title, Description: description, Required: required, Advanced: true, Example: example, Order: order}
+}
+
+// secretField builds a FieldSchema of type "secret"; no example value is attached.
+func secretField(key, title, description string, required bool, order int) FieldSchema {
+	return FieldSchema{Key: key, Type: "secret", Title: title, Description: description, Required: required, Order: order}
+}
+
+// numberField builds a FieldSchema of type "number" from the given attributes.
+func numberField(key, title, description string, required bool, example any, order int) FieldSchema {
+	return FieldSchema{Key: key, Type: "number", Title: title, Description: description, Required: required, Example: example, Order: order}
+}
+
+// boolField builds a FieldSchema of type "bool"; no example value is attached.
+func boolField(key, title, description string, required bool, order int) FieldSchema {
+	return FieldSchema{Key: key, Type: "bool", Title: title, Description: description, Required: required, Order: order}
+}
+
+// enumField builds a FieldSchema of type "enum" whose allowed values are values.
+// The slice is stored as passed, not copied.
+func enumField(key, title, description string, required bool, values []string, order int) FieldSchema {
+	return FieldSchema{Key: key, Type: "enum", Title: title, Description: description, Required: required, Enum: values, Order: order}
+}
+
+// configString returns cfg[key] with surrounding whitespace trimmed. It returns ""
+// when cfg is nil, the key is absent, or the stored value is not a string.
+func configString(cfg map[string]any, key string) string {
+	if cfg == nil {
+		return ""
+	}
+	if v, ok := cfg[key].(string); ok {
+		return strings.TrimSpace(v)
+	}
+	return ""
+}
+
+// voiceIDs returns the ID of every voice, in input order, as a new non-nil slice.
+func voiceIDs(voices []VoiceInfo) []string {
+	out := make([]string, 0, len(voices))
+	for _, voice := range voices {
+		out = append(out, voice.ID)
+	}
+	return out
+}
@@ -3,14 +3,15 @@ package tts
 import (
 	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"io"
 	"log/slog"
-	"strings"
+
+	sdk "github.com/memohai/twilight-ai/sdk"

 	"github.com/memohai/memoh/internal/db"
 	"github.com/memohai/memoh/internal/db/sqlc"
+	"github.com/memohai/memoh/internal/models"
 )

 type Service struct {
@@ -33,11 +34,6 @@ func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
 	return s.registry.ListMeta()
 }

-// ---------------------------------------------------------------------------
-// Read helpers (speech-filtered views of unified tables)
-// ---------------------------------------------------------------------------
-
-// ListSpeechProviders returns providers with speech client types.
 func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
 	rows, err := s.queries.ListSpeechProviders(ctx)
 	if err != nil {
@@ -50,7 +46,18 @@ func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResp
 	return items, nil
 }

-// ListSpeechModels returns all speech-type models.
+// GetSpeechProvider parses id as a UUID, loads the matching provider row and
+// converts it with toSpeechProviderResponse. A parse failure is returned as-is;
+// a lookup failure is wrapped with "get speech provider".
+func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) {
+	var empty SpeechProviderResponse
+
+	providerID, parseErr := db.ParseUUID(id)
+	if parseErr != nil {
+		return empty, parseErr
+	}
+
+	provider, lookupErr := s.queries.GetProviderByID(ctx, providerID)
+	if lookupErr != nil {
+		return empty, fmt.Errorf("get speech provider: %w", lookupErr)
+	}
+
+	return toSpeechProviderResponse(provider), nil
+}
+
 func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse, error) {
 	rows, err := s.queries.ListSpeechModels(ctx)
 	if err != nil {
@@ -58,29 +65,41 @@ func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse,
 	}
 	items := make([]SpeechModelResponse, 0, len(rows))
 	for _, row := range rows {
+		if s.shouldHideModel(row.ProviderType, row.ModelID) {
+			continue
+		}
 		items = append(items, toSpeechModelFromListRow(row))
 	}
 	return items, nil
 }

-// ListSpeechModelsByProvider returns speech models for a given provider.
 func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
 	pgID, err := db.ParseUUID(providerID)
 	if err != nil {
 		return nil, err
 	}
+	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
+	if err != nil {
+		return nil, fmt.Errorf("get speech provider: %w", err)
+	}
+	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
+	if err != nil {
+		return nil, err
+	}
 	rows, err := s.queries.ListSpeechModelsByProviderID(ctx, pgID)
 	if err != nil {
 		return nil, fmt.Errorf("list speech models by provider: %w", err)
 	}
 	items := make([]SpeechModelResponse, 0, len(rows))
 	for _, row := range rows {
+		if shouldHideTemplateModel(def, row.ModelID) {
+			continue
+		}
 		items = append(items, toSpeechModelFromModel(row, ""))
 	}
 	return items, nil
 }

-// GetSpeechModel returns a speech model by ID.
 func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
 	pgID, err := db.ParseUUID(id)
 	if err != nil {
@@ -93,98 +112,45 @@ func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelRes
 	return toSpeechModelWithProviderResponse(row), nil
 }

-// ---------------------------------------------------------------------------
-// Synthesis
-// ---------------------------------------------------------------------------
-
-// Synthesize runs text-to-speech using the saved model config, optionally
-// overridden by fields in overrideCfg. Returns raw audio bytes.
 func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
-	pgID, err := db.ParseUUID(modelID)
+	params, err := s.resolveSpeechParams(ctx, modelID, text, overrideCfg)
 	if err != nil {
 		return nil, "", err
 	}
-	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
+	result, err := sdk.GenerateSpeech(ctx,
+		sdk.WithSpeechModel(params.model),
+		sdk.WithText(text),
+		sdk.WithSpeechConfig(params.config),
+	)
 	if err != nil {
-		return nil, "", fmt.Errorf("get speech model: %w", err)
+		return nil, "", fmt.Errorf("synthesize: %w", err)
 	}
-	adapterType := clientTypeToTtsType(modelRow.ProviderType)
-	adapter, err := s.registry.Get(adapterType)
-	if err != nil {
-		return nil, "", fmt.Errorf("unsupported provider: %s", modelRow.ProviderType)
-	}
-
-	savedCfg := parseModelConfig(modelRow.Config)
-	for k, v := range overrideCfg {
-		savedCfg[k] = v
-	}
-
-	audioCfg := buildAudioConfig(savedCfg)
-	if err := audioCfg.Validate(); err != nil {
-		return nil, "", fmt.Errorf("invalid audio config: %w", err)
-	}
-
-	resolvedModel, _ := adapter.ResolveModel(modelRow.ModelID)
-	audio, synthErr := adapter.Synthesize(ctx, text, resolvedModel, audioCfg)
-	if synthErr != nil {
-		return nil, "", fmt.Errorf("synthesize: %w", synthErr)
-	}
-
-	contentType := resolveContentType(audioCfg.Format)
-	return audio, contentType, nil
+	return result.Audio, result.ContentType, nil
 }

-// StreamToFile runs text-to-speech using Stream() and writes audio chunks
-// directly to the given writer, keeping peak memory low for large audio.
 func (s *Service) StreamToFile(ctx context.Context, modelID string, text string, w io.Writer) (string, error) {
-	pgID, err := db.ParseUUID(modelID)
+	params, err := s.resolveSpeechParams(ctx, modelID, text, nil)
 	if err != nil {
 		return "", err
 	}
-	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
+	streamResult, err := sdk.StreamSpeech(ctx,
+		sdk.WithSpeechModel(params.model),
+		sdk.WithText(text),
+		sdk.WithSpeechConfig(params.config),
+	)
 	if err != nil {
-		return "", fmt.Errorf("get speech model: %w", err)
+		return "", fmt.Errorf("stream: %w", err)
 	}
-	adapterType := clientTypeToTtsType(modelRow.ProviderType)
-	adapter, err := s.registry.Get(adapterType)
+	audio, err := streamResult.Bytes()
 	if err != nil {
-		return "", fmt.Errorf("unsupported provider: %s", modelRow.ProviderType)
+		return "", fmt.Errorf("stream: %w", err)
 	}
-
-	savedCfg := parseModelConfig(modelRow.Config)
-	audioCfg := buildAudioConfig(savedCfg)
-	if err := audioCfg.Validate(); err != nil {
-		return "", fmt.Errorf("invalid audio config: %w", err)
+	if _, writeErr := w.Write(audio); writeErr != nil {
+		return "", fmt.Errorf("write chunk: %w", writeErr)
 	}
-
-	resolvedModel, _ := adapter.ResolveModel(modelRow.ModelID)
-	dataCh, errCh := adapter.Stream(ctx, text, resolvedModel, audioCfg)
-	if dataCh == nil {
-		select {
-		case streamErr := <-errCh:
-			return "", fmt.Errorf("stream: %w", streamErr)
-		default:
-			return "", errors.New("stream returned nil channels")
-		}
-	}
-
-	for chunk := range dataCh {
-		if _, writeErr := w.Write(chunk); writeErr != nil {
-			return "", fmt.Errorf("write chunk: %w", writeErr)
-		}
-	}
-	if streamErr, ok := <-errCh; ok && streamErr != nil {
-		return "", fmt.Errorf("stream: %w", streamErr)
-	}
-
-	return resolveContentType(audioCfg.Format), nil
+	return streamResult.ContentType, nil
 }

-// ---------------------------------------------------------------------------
-// Capabilities
-// ---------------------------------------------------------------------------
-
-// GetModelCapabilities returns the adapter-level capabilities for a stored model.
 func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
 	pgID, err := db.ParseUUID(modelID)
 	if err != nil {
@@ -194,115 +160,217 @@ func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*Mo
 	if err != nil {
 		return nil, fmt.Errorf("get speech model: %w", err)
 	}
-	adapterType := clientTypeToTtsType(modelRow.ProviderType)
-	adapter, err := s.registry.Get(adapterType)
+	def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
 	if err != nil {
-		return nil, fmt.Errorf("unsupported provider: %s", modelRow.ProviderType)
+		return nil, err
 	}
-	for _, m := range adapter.Models() {
-		if m.ID == modelRow.ModelID {
-			return &m.Capabilities, nil
+	template := findModelTemplate(def, modelRow.ModelID)
+	if template == nil {
+		return nil, fmt.Errorf("speech model capabilities not found: %s", modelRow.ModelID)
+	}
+	caps := template.Capabilities
+	if len(caps.ConfigSchema.Fields) == 0 {
+		caps.ConfigSchema = template.ConfigSchema
+	}
+	return &caps, nil
+}
+
+func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
+	pgID, err := db.ParseUUID(providerID)
+	if err != nil {
+		return nil, err
+	}
+
+	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
+	if err != nil {
+		return nil, fmt.Errorf("get speech provider: %w", err)
+	}
+
+	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
+	if err != nil {
+		return nil, err
+	}
+	if !def.SupportsList || def.Factory == nil {
+		return nil, fmt.Errorf("speech provider does not support model discovery: %s", providerRow.ClientType)
+	}
+
+	provider, err := def.Factory(parseConfig(providerRow.Config))
+	if err != nil {
+		return nil, fmt.Errorf("build speech provider: %w", err)
+	}
+
+	remoteModels, err := provider.ListModels(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("list speech models: %w", err)
+	}
+
+	discovered := make([]ModelInfo, 0, len(remoteModels))
+	for _, remoteModel := range remoteModels {
+		if remoteModel == nil || remoteModel.ID == "" {
+			continue
 		}
+		discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.Models))
 	}
-	return nil, fmt.Errorf("model %s not found in adapter", modelRow.ModelID)
+	return discovered, nil
 }

-// ---------------------------------------------------------------------------
-// Helpers
-// ---------------------------------------------------------------------------
-
-// clientTypeToTtsType maps the unified client_type to the TTS adapter type.
-func clientTypeToTtsType(clientType string) TtsType {
-	switch clientType {
-	case "edge-speech":
-		return "edge"
-	default:
-		return TtsType(clientType)
-	}
+type resolvedSpeechParams struct {
+	model  *sdk.SpeechModel
+	config map[string]any
 }

-func parseModelConfig(raw []byte) map[string]any {
+// resolveSpeechParams loads the stored speech model and its provider, builds
+// the provider client through the registry definition's Factory, and assembles
+// the effective synthesis config.
+//
+// Config precedence, lowest to highest: provider config, model config,
+// overrideCfg (mergeConfig lets later maps overwrite earlier keys).
+//
+// text is currently unused by this helper.
+//
+// It returns an error when modelID is not a valid UUID, when the model or
+// provider lookup fails, when the provider's client type is not registered,
+// when the registered definition has no Factory, or when the Factory fails.
+func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text string, overrideCfg map[string]any) (*resolvedSpeechParams, error) {
+	_ = text
+	pgID, err := db.ParseUUID(modelID)
+	if err != nil {
+		return nil, err
+	}
+
+	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
+	if err != nil {
+		return nil, fmt.Errorf("get speech model: %w", err)
+	}
+	providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
+	if err != nil {
+		return nil, fmt.Errorf("get speech provider: %w", err)
+	}
+
+	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
+	if err != nil {
+		return nil, err
+	}
+	// FetchRemoteModels already treats a nil Factory as possible; guard here too
+	// so a definition without one yields an error instead of a nil-call panic.
+	if def.Factory == nil {
+		return nil, fmt.Errorf("speech provider does not support synthesis: %s", providerRow.ClientType)
+	}
+	provider, err := def.Factory(parseConfig(providerRow.Config))
+	if err != nil {
+		return nil, fmt.Errorf("build speech provider: %w", err)
+	}
+
+	cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
+	return &resolvedSpeechParams{
+		model:  &sdk.SpeechModel{ID: modelRow.ModelID, Provider: provider},
+		config: cfg,
+	}, nil
+}
+
+func parseConfig(raw []byte) map[string]any {
 	if len(raw) == 0 {
-		return make(map[string]any)
+		return map[string]any{}
 	}
 	var cfg map[string]any
-	if err := json.Unmarshal(raw, &cfg); err != nil {
-		return make(map[string]any)
-	}
-	if cfg == nil {
-		return make(map[string]any)
+	if err := json.Unmarshal(raw, &cfg); err != nil || cfg == nil {
+		return map[string]any{}
 	}
 	return cfg
 }

-func buildAudioConfig(cfg map[string]any) AudioConfig {
-	ac := AudioConfig{}
-	if voice, ok := cfg["voice"].(map[string]any); ok {
-		if id, ok := voice["id"].(string); ok {
-			ac.Voice.ID = id
-		}
-		if lang, ok := voice["lang"].(string); ok {
-			ac.Voice.Lang = lang
+func mergeConfig(parts ...map[string]any) map[string]any {
+	out := make(map[string]any)
+	for _, part := range parts {
+		for key, value := range part {
+			out[key] = value
 		}
 	}
-	if format, ok := cfg["format"].(string); ok {
-		ac.Format = format
-	}
-	if speed, ok := toFloat(cfg["speed"]); ok {
-		ac.Speed = speed
-	}
-	if pitch, ok := toFloat(cfg["pitch"]); ok {
-		ac.Pitch = pitch
-	}
-	if sr, ok := toFloat(cfg["sample_rate"]); ok {
-		ac.SampleRate = int(sr)
-	}
-	return ac
+	return out
 }

-func toFloat(v any) (float64, bool) {
-	switch n := v.(type) {
-	case float64:
-		return n, true
-	case float32:
-		return float64(n), true
-	case int:
-		return float64(n), true
-	case int64:
-		return float64(n), true
-	default:
-		return 0, false
+func mergeRemoteModelInfo(modelID string, defaults []ModelInfo) ModelInfo {
+	for _, model := range defaults {
+		if model.ID == modelID {
+			return model
+		}
+	}
+	return ModelInfo{
+		ID:   modelID,
+		Name: modelID,
 	}
 }

-func resolveContentType(format string) string {
-	switch {
-	case strings.Contains(format, "mp3"):
-		return "audio/mpeg"
-	case strings.Contains(format, "opus"):
-		return "audio/opus"
-	case strings.Contains(format, "ogg"):
-		return "audio/ogg"
-	case strings.Contains(format, "webm"):
-		return "audio/webm"
-	case strings.Contains(format, "wav"):
-		return "audio/wav"
-	default:
-		return "audio/mpeg"
+func (s *Service) shouldHideModel(clientType string, modelID string) bool {
+	def, err := s.registry.Get(models.ClientType(clientType))
+	if err != nil {
+		return false
 	}
+	return shouldHideTemplateModel(def, modelID)
+}
+
+// shouldHideTemplateModel reports whether modelID should be hidden for def.
+// It returns the TemplateOnly flag of the first entry in def.Models whose ID
+// equals modelID, and false when def.SupportsList is false or no entry matches.
+func shouldHideTemplateModel(def ProviderDefinition, modelID string) bool {
+	if def.SupportsList {
+		for i := range def.Models {
+			if candidate := def.Models[i]; candidate.ID == modelID {
+				return candidate.TemplateOnly
+			}
+		}
+	}
+	return false
+}
+
+func findModelTemplate(def ProviderDefinition, modelID string) *ModelInfo {
+	for i := range def.Models {
+		if def.Models[i].ID == modelID {
+			return &def.Models[i]
+		}
+	}
+	if def.DefaultModel != "" {
+		for i := range def.Models {
+			if def.Models[i].ID == def.DefaultModel {
+				return &def.Models[i]
+			}
+		}
+	}
+	if len(def.Models) > 0 {
+		return &def.Models[0]
+	}
+	return nil
 }

 func toSpeechProviderResponse(row sqlc.Provider) SpeechProviderResponse {
+	icon := ""
+	if row.Icon.Valid {
+		icon = row.Icon.String
+	}
 	return SpeechProviderResponse{
 		ID:         row.ID.String(),
 		Name:       row.Name,
 		ClientType: row.ClientType,
+		Icon:       icon,
 		Enable:     row.Enable,
+		Config:     maskSpeechProviderConfig(parseConfig(row.Config)),
 		CreatedAt:  row.CreatedAt.Time,
 		UpdatedAt:  row.UpdatedAt.Time,
 	}
 }

+// maskSpeechProviderConfig returns a copy of cfg in which every non-empty
+// string stored under a secret key (see isSpeechSecretKey) is replaced by its
+// masked form. All other entries are copied unchanged. The result is always a
+// non-nil map, and cfg itself is not modified.
+func maskSpeechProviderConfig(cfg map[string]any) map[string]any {
+	masked := make(map[string]any, len(cfg))
+	for name, raw := range cfg {
+		text, isString := raw.(string)
+		if isString && text != "" && isSpeechSecretKey(name) {
+			masked[name] = maskSpeechSecret(text)
+		} else {
+			masked[name] = raw
+		}
+	}
+	return masked
+}
+
+// isSpeechSecretKey reports whether key is one of the config keys treated as
+// a secret: "api_key", "access_key", "secret_key" or "app_key". The match is
+// exact and case-sensitive.
+func isSpeechSecretKey(key string) bool {
+	return key == "api_key" ||
+		key == "access_key" ||
+		key == "secret_key" ||
+		key == "app_key"
+}
+
+// maskSpeechSecret hides the middle of a secret value. Values of 8 bytes or
+// fewer become "********"; longer values keep their first and last 4 bytes
+// around a fixed "****" filler. Lengths are measured in bytes, not runes.
+func maskSpeechSecret(value string) string {
+	const visible = 4
+	if n := len(value); n > 2*visible {
+		return value[:visible] + "****" + value[n-visible:]
+	}
+	return "********"
+}
+
 func toSpeechModelFromListRow(row sqlc.ListSpeechModelsRow) SpeechModelResponse {
 	var cfg map[string]any
 	if len(row.Config) > 0 {
@@ -4,21 +4,24 @@ import "time"

 // ProviderMetaResponse exposes adapter metadata (from the registry, not DB).
 type ProviderMetaResponse struct {
-	Provider     string      `json:"provider"`
-	DisplayName  string      `json:"display_name"`
-	Description  string      `json:"description"`
-	DefaultModel string      `json:"default_model"`
-	Models       []ModelInfo `json:"models"`
+	Provider     string       `json:"provider"`
+	DisplayName  string       `json:"display_name"`
+	Description  string       `json:"description"`
+	ConfigSchema ConfigSchema `json:"config_schema,omitempty"`
+	DefaultModel string       `json:"default_model"`
+	Models       []ModelInfo  `json:"models"`
 }

 // SpeechProviderResponse represents a speech-capable provider from the unified providers table.
 type SpeechProviderResponse struct {
-	ID         string    `json:"id"`
-	Name       string    `json:"name"`
-	ClientType string    `json:"client_type"`
-	Enable     bool      `json:"enable"`
-	CreatedAt  time.Time `json:"created_at"`
-	UpdatedAt  time.Time `json:"updated_at"`
+	ID         string         `json:"id"`
+	Name       string         `json:"name"`
+	ClientType string         `json:"client_type"`
+	Icon       string         `json:"icon,omitempty"`
+	Enable     bool           `json:"enable"`
+	Config     map[string]any `json:"config,omitempty"`
+	CreatedAt  time.Time      `json:"created_at"`
+	UpdatedAt  time.Time      `json:"updated_at"`
 }

 // SpeechModelResponse represents a speech model from the unified models table.
@@ -50,3 +53,10 @@ type TestSynthesizeRequest struct {
 	Text   string         `json:"text"`
 	Config map[string]any `json:"config,omitempty"`
 }
+
+// ImportModelsResponse represents the response for importing speech models.
+type ImportModelsResponse struct {
+	Created int      `json:"created"`
+	Skipped int      `json:"skipped"`
+	Models  []string `json:"models"`
+}
@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none">
+  <path fill="currentColor" d="M11.203 24H1.517a.364.364 0 0 1-.258-.62l6.239-6.275a.37.37 0 0 1 .259-.108h3.52c2.723 0 5.025-2.127 5.107-4.845a5.004 5.004 0 0 0-4.999-5.148H7.613v4.646c0 .2-.164.364-.365.364H.968a.365.365 0 0 1-.363-.364V.364C.605.164.768 0 .969 0h10.416c6.684 0 12.111 5.485 12.01 12.187C23.293 18.77 17.794 24 11.202 24z"/>
+</svg>
@@ -0,0 +1,4 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="172 129 156 242" fill="none">
+  <path d="M314 355H271V145H314V355Z" fill="currentColor"/>
+  <path d="M229 355H186V145H229V355Z" fill="currentColor"/>
+</svg>
@@ -18,6 +18,8 @@ const llmProviders: string[] = [
  ...withVariants('anthropic', []),
  ...withVariants('google', ['color', 'brand-color']),
  ...withVariants('deepseek', ['color']),
+  ...withVariants('deepgram', []),
+  ...withVariants('elevenlabs', []),
  ...withVariants('groq', []),
  ...withVariants('huggingface', ['color']),
  ...withVariants('lmstudio', []),
@@ -0,0 +1,17 @@
+<template>
+  <svg
+    xmlns="http://www.w3.org/2000/svg"
+    :width="size"
+    :height="size"
+    viewBox="0 0 24 24"
+    v-bind="$attrs"
+  ><path
+    fill="currentColor"
+    d="M11.203 24H1.517a.364.364 0 0 1-.258-.62l6.239-6.275a.37.37 0 0 1 .259-.108h3.52c2.723 0 5.025-2.127 5.107-4.845a5.004 5.004 0 0 0-4.999-5.148H7.613v4.646c0 .2-.164.364-.365.364H.968a.365.365 0 0 1-.363-.364V.364C.605.164.768 0 .969 0h10.416c6.684 0 12.111 5.485 12.01 12.187C23.293 18.77 17.794 24 11.202 24z"
+  /></svg>
+</template>
+
+<script setup lang="ts">
+withDefaults(defineProps<{ size?: string | number }>(), { size: '1em' })
+defineOptions({ inheritAttrs: false })
+</script>
@@ -0,0 +1,21 @@
+<template>
+  <svg
+    xmlns="http://www.w3.org/2000/svg"
+    :width="size"
+    :height="size"
+    viewBox="172 129 156 242"
+    v-bind="$attrs"
+  ><path
+     d="M314 355H271V145H314V355Z"
+     fill="currentColor"
+   />
+    <path
+      d="M229 355H186V145H229V355Z"
+      fill="currentColor"
+    /></svg>
+</template>
+
+<script setup lang="ts">
+withDefaults(defineProps<{ size?: string | number }>(), { size: '1em' })
+defineOptions({ inheritAttrs: false })
+</script>
@@ -15,6 +15,7 @@ export { default as Claude } from './icons/Claude.vue'
 export { default as ClaudeColor } from './icons/ClaudeColor.vue'
 export { default as Cohere } from './icons/Cohere.vue'
 export { default as CohereColor } from './icons/CohereColor.vue'
+export { default as Deepgram } from './icons/Deepgram.vue'
 export { default as Deepseek } from './icons/Deepseek.vue'
 export { default as DeepseekColor } from './icons/DeepseekColor.vue'
 export { default as Dingtalk } from './icons/Dingtalk.vue'
@@ -22,6 +23,7 @@ export { default as Discord } from './icons/Discord.vue'
 export { default as Doubao } from './icons/Doubao.vue'
 export { default as DoubaoColor } from './icons/DoubaoColor.vue'
 export { default as Duckduckgo } from './icons/Duckduckgo.vue'
+export { default as Elevenlabs } from './icons/Elevenlabs.vue'
 export { default as Exa } from './icons/Exa.vue'
 export { default as ExaColor } from './icons/ExaColor.vue'
 export { default as Feishu } from './icons/Feishu.vue'
@@ -1642,8 +1642,34 @@ export type SettingsUpsertRequest = {
    tts_model_id?: string;
 };

+export type TtsConfigSchema = {
+    fields?: Array<TtsFieldSchema>;
+};
+
+export type TtsFieldSchema = {
+    advanced?: boolean;
+    description?: string;
+    enum?: Array<string>;
+    example?: unknown;
+    key?: string;
+    order?: number;
+    required?: boolean;
+    title?: string;
+    type?: string;
+};
+
+export type TtsImportModelsResponse = {
+    created?: number;
+    models?: Array<string>;
+    skipped?: number;
+};
+
 export type TtsModelCapabilities = {
+    config_schema?: TtsConfigSchema;
    formats?: Array<string>;
+    metadata?: {
+        [key: string]: string;
+    };
    pitch?: TtsParamConstraint;
    speed?: TtsParamConstraint;
    voices?: Array<TtsVoiceInfo>;
@@ -1651,6 +1677,7 @@ export type TtsModelCapabilities = {

 export type TtsModelInfo = {
    capabilities?: TtsModelCapabilities;
+    config_schema?: TtsConfigSchema;
    description?: string;
    id?: string;
    name?: string;
@@ -1664,6 +1691,7 @@ export type TtsParamConstraint = {
 };

 export type TtsProviderMetaResponse = {
+    config_schema?: TtsConfigSchema;
    default_model?: string;
    description?: string;
    display_name?: string;
@@ -1686,8 +1714,12 @@ export type TtsSpeechModelResponse = {

 export type TtsSpeechProviderResponse = {
    client_type?: string;
+    config?: {
+        [key: string]: unknown;
+    };
    created_at?: string;
    enable?: boolean;
+    icon?: string;
    id?: string;
    name?: string;
    updated_at?: string;
@@ -8331,6 +8363,112 @@ export type GetSpeechProvidersMetaResponses = {

 export type GetSpeechProvidersMetaResponse = GetSpeechProvidersMetaResponses[keyof GetSpeechProvidersMetaResponses];

+export type GetSpeechProvidersByIdData = {
+    body?: never;
+    path: {
+        /**
+         * Provider ID (UUID)
+         */
+        id: string;
+    };
+    query?: never;
+    url: '/speech-providers/{id}';
+};
+
+export type GetSpeechProvidersByIdErrors = {
+    /**
+     * Bad Request
+     */
+    400: HandlersErrorResponse;
+    /**
+     * Not Found
+     */
+    404: HandlersErrorResponse;
+};
+
+export type GetSpeechProvidersByIdError = GetSpeechProvidersByIdErrors[keyof GetSpeechProvidersByIdErrors];
+
+export type GetSpeechProvidersByIdResponses = {
+    /**
+     * OK
+     */
+    200: TtsSpeechProviderResponse;
+};
+
+export type GetSpeechProvidersByIdResponse = GetSpeechProvidersByIdResponses[keyof GetSpeechProvidersByIdResponses];
+
+export type PostSpeechProvidersByIdImportModelsData = {
+    body?: never;
+    path: {
+        /**
+         * Provider ID (UUID)
+         */
+        id: string;
+    };
+    query?: never;
+    url: '/speech-providers/{id}/import-models';
+};
+
+export type PostSpeechProvidersByIdImportModelsErrors = {
+    /**
+     * Bad Request
+     */
+    400: HandlersErrorResponse;
+    /**
+     * Not Found
+     */
+    404: HandlersErrorResponse;
+    /**
+     * Internal Server Error
+     */
+    500: HandlersErrorResponse;
+};
+
+export type PostSpeechProvidersByIdImportModelsError = PostSpeechProvidersByIdImportModelsErrors[keyof PostSpeechProvidersByIdImportModelsErrors];
+
+export type PostSpeechProvidersByIdImportModelsResponses = {
+    /**
+     * OK
+     */
+    200: TtsImportModelsResponse;
+};
+
+export type PostSpeechProvidersByIdImportModelsResponse = PostSpeechProvidersByIdImportModelsResponses[keyof PostSpeechProvidersByIdImportModelsResponses];
+
+export type GetSpeechProvidersByIdModelsData = {
+    body?: never;
+    path: {
+        /**
+         * Provider ID (UUID)
+         */
+        id: string;
+    };
+    query?: never;
+    url: '/speech-providers/{id}/models';
+};
+
+export type GetSpeechProvidersByIdModelsErrors = {
+    /**
+     * Bad Request
+     */
+    400: HandlersErrorResponse;
+    /**
+     * Internal Server Error
+     */
+    500: HandlersErrorResponse;
+};
+
+export type GetSpeechProvidersByIdModelsError = GetSpeechProvidersByIdModelsErrors[keyof GetSpeechProvidersByIdModelsErrors];
+
+export type GetSpeechProvidersByIdModelsResponses = {
+    /**
+     * OK
+     */
+    200: Array<TtsSpeechModelResponse>;
+};
+
+export type GetSpeechProvidersByIdModelsResponse = GetSpeechProvidersByIdModelsResponses[keyof GetSpeechProvidersByIdModelsResponses];
+
 export type GetSupermarketMcpsData = {
    body?: never;
    path?: never;
@@ -8097,6 +8097,141 @@ const docTemplate = `{
                }
            }
        },
+        "/speech-providers/{id}": {
+            "get": {
+                "description": "Get a speech provider with masked config values",
+                "produces": [
+                    "application/json"
+                ],
+                "tags": [
+                    "speech-providers"
+                ],
+                "summary": "Get speech provider",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "Provider ID (UUID)",
+                        "name": "id",
+                        "in": "path",
+                        "required": true
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "$ref": "#/definitions/tts.SpeechProviderResponse"
+                        }
+                    },
+                    "400": {
+                        "description": "Bad Request",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    },
+                    "404": {
+                        "description": "Not Found",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    }
+                }
+            }
+        },
+        "/speech-providers/{id}/import-models": {
+            "post": {
+                "description": "Fetch models using the configured speech provider and import them into the unified models table",
+                "consumes": [
+                    "application/json"
+                ],
+                "produces": [
+                    "application/json"
+                ],
+                "tags": [
+                    "speech-providers"
+                ],
+                "summary": "Import speech models from provider",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "Provider ID (UUID)",
+                        "name": "id",
+                        "in": "path",
+                        "required": true
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "$ref": "#/definitions/tts.ImportModelsResponse"
+                        }
+                    },
+                    "400": {
+                        "description": "Bad Request",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    },
+                    "404": {
+                        "description": "Not Found",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    },
+                    "500": {
+                        "description": "Internal Server Error",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    }
+                }
+            }
+        },
+        "/speech-providers/{id}/models": {
+            "get": {
+                "description": "List models of type 'speech' for a specific speech provider",
+                "produces": [
+                    "application/json"
+                ],
+                "tags": [
+                    "speech-providers"
+                ],
+                "summary": "List speech models by provider",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "Provider ID (UUID)",
+                        "name": "id",
+                        "in": "path",
+                        "required": true
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "type": "array",
+                            "items": {
+                                "$ref": "#/definitions/tts.SpeechModelResponse"
+                            }
+                        }
+                    },
+                    "400": {
+                        "description": "Bad Request",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    },
+                    "500": {
+                        "description": "Internal Server Error",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    }
+                }
+            }
+        },
        "/supermarket/mcps": {
            "get": {
                "tags": [
@@ -12930,15 +13065,85 @@ const docTemplate = `{
                }
            }
        },
+        "tts.ConfigSchema": {
+            "type": "object",
+            "properties": {
+                "fields": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/tts.FieldSchema"
+                    }
+                }
+            }
+        },
+        "tts.FieldSchema": {
+            "type": "object",
+            "properties": {
+                "advanced": {
+                    "type": "boolean"
+                },
+                "description": {
+                    "type": "string"
+                },
+                "enum": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "example": {},
+                "key": {
+                    "type": "string"
+                },
+                "order": {
+                    "type": "integer"
+                },
+                "required": {
+                    "type": "boolean"
+                },
+                "title": {
+                    "type": "string"
+                },
+                "type": {
+                    "type": "string"
+                }
+            }
+        },
+        "tts.ImportModelsResponse": {
+            "type": "object",
+            "properties": {
+                "created": {
+                    "type": "integer"
+                },
+                "models": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "skipped": {
+                    "type": "integer"
+                }
+            }
+        },
        "tts.ModelCapabilities": {
            "type": "object",
            "properties": {
+                "config_schema": {
+                    "$ref": "#/definitions/tts.ConfigSchema"
+                },
                "formats": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
+                "metadata": {
+                    "type": "object",
+                    "additionalProperties": {
+                        "type": "string"
+                    }
+                },
                "pitch": {
                    "$ref": "#/definitions/tts.ParamConstraint"
                },
@@ -12959,6 +13164,9 @@ const docTemplate = `{
                "capabilities": {
                    "$ref": "#/definitions/tts.ModelCapabilities"
                },
+                "config_schema": {
+                    "$ref": "#/definitions/tts.ConfigSchema"
+                },
                "description": {
                    "type": "string"
                },
@@ -12993,6 +13201,9 @@ const docTemplate = `{
        "tts.ProviderMetaResponse": {
            "type": "object",
            "properties": {
+                "config_schema": {
+                    "$ref": "#/definitions/tts.ConfigSchema"
+                },
                "default_model": {
                    "type": "string"
                },
@@ -13049,12 +13260,19 @@ const docTemplate = `{
                "client_type": {
                    "type": "string"
                },
+                "config": {
+                    "type": "object",
+                    "additionalProperties": {}
+                },
                "created_at": {
                    "type": "string"
                },
                "enable": {
                    "type": "boolean"
                },
+                "icon": {
+                    "type": "string"
+                },
                "id": {
                    "type": "string"
                },
@@ -8088,6 +8088,141 @@
                }
            }
        },
+        "/speech-providers/{id}": {
+            "get": {
+                "description": "Get a speech provider with masked config values",
+                "produces": [
+                    "application/json"
+                ],
+                "tags": [
+                    "speech-providers"
+                ],
+                "summary": "Get speech provider",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "Provider ID (UUID)",
+                        "name": "id",
+                        "in": "path",
+                        "required": true
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "$ref": "#/definitions/tts.SpeechProviderResponse"
+                        }
+                    },
+                    "400": {
+                        "description": "Bad Request",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    },
+                    "404": {
+                        "description": "Not Found",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    }
+                }
+            }
+        },
+        "/speech-providers/{id}/import-models": {
+            "post": {
+                "description": "Fetch models using the configured speech provider and import them into the unified models table",
+                "consumes": [
+                    "application/json"
+                ],
+                "produces": [
+                    "application/json"
+                ],
+                "tags": [
+                    "speech-providers"
+                ],
+                "summary": "Import speech models from provider",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "Provider ID (UUID)",
+                        "name": "id",
+                        "in": "path",
+                        "required": true
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "$ref": "#/definitions/tts.ImportModelsResponse"
+                        }
+                    },
+                    "400": {
+                        "description": "Bad Request",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    },
+                    "404": {
+                        "description": "Not Found",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    },
+                    "500": {
+                        "description": "Internal Server Error",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    }
+                }
+            }
+        },
+        "/speech-providers/{id}/models": {
+            "get": {
+                "description": "List models of type 'speech' for a specific speech provider",
+                "produces": [
+                    "application/json"
+                ],
+                "tags": [
+                    "speech-providers"
+                ],
+                "summary": "List speech models by provider",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "Provider ID (UUID)",
+                        "name": "id",
+                        "in": "path",
+                        "required": true
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "type": "array",
+                            "items": {
+                                "$ref": "#/definitions/tts.SpeechModelResponse"
+                            }
+                        }
+                    },
+                    "400": {
+                        "description": "Bad Request",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    },
+                    "500": {
+                        "description": "Internal Server Error",
+                        "schema": {
+                            "$ref": "#/definitions/handlers.ErrorResponse"
+                        }
+                    }
+                }
+            }
+        },
        "/supermarket/mcps": {
            "get": {
                "tags": [
@@ -12921,15 +13056,85 @@
                }
            }
        },
+        "tts.ConfigSchema": {
+            "type": "object",
+            "properties": {
+                "fields": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/tts.FieldSchema"
+                    }
+                }
+            }
+        },
+        "tts.FieldSchema": {
+            "type": "object",
+            "properties": {
+                "advanced": {
+                    "type": "boolean"
+                },
+                "description": {
+                    "type": "string"
+                },
+                "enum": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "example": {},
+                "key": {
+                    "type": "string"
+                },
+                "order": {
+                    "type": "integer"
+                },
+                "required": {
+                    "type": "boolean"
+                },
+                "title": {
+                    "type": "string"
+                },
+                "type": {
+                    "type": "string"
+                }
+            }
+        },
+        "tts.ImportModelsResponse": {
+            "type": "object",
+            "properties": {
+                "created": {
+                    "type": "integer"
+                },
+                "models": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "skipped": {
+                    "type": "integer"
+                }
+            }
+        },
        "tts.ModelCapabilities": {
            "type": "object",
            "properties": {
+                "config_schema": {
+                    "$ref": "#/definitions/tts.ConfigSchema"
+                },
                "formats": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
+                "metadata": {
+                    "type": "object",
+                    "additionalProperties": {
+                        "type": "string"
+                    }
+                },
                "pitch": {
                    "$ref": "#/definitions/tts.ParamConstraint"
                },
@@ -12950,6 +13155,9 @@
                "capabilities": {
                    "$ref": "#/definitions/tts.ModelCapabilities"
                },
+                "config_schema": {
+                    "$ref": "#/definitions/tts.ConfigSchema"
+                },
                "description": {
                    "type": "string"
                },
@@ -12984,6 +13192,9 @@
        "tts.ProviderMetaResponse": {
            "type": "object",
            "properties": {
+                "config_schema": {
+                    "$ref": "#/definitions/tts.ConfigSchema"
+                },
                "default_model": {
                    "type": "string"
                },
@@ -13040,12 +13251,19 @@
                "client_type": {
                    "type": "string"
                },
+                "config": {
+                    "type": "object",
+                    "additionalProperties": {}
+                },
                "created_at": {
                    "type": "string"
                },
                "enable": {
                    "type": "boolean"
                },
+                "icon": {
+                    "type": "string"
+                },
                "id": {
                    "type": "string"
                },
@@ -2761,12 +2761,58 @@ definitions:
      tts_model_id:
        type: string
    type: object
+  tts.ConfigSchema:
+    properties:
+      fields:
+        items:
+          $ref: '#/definitions/tts.FieldSchema'
+        type: array
+    type: object
+  tts.FieldSchema:
+    properties:
+      advanced:
+        type: boolean
+      description:
+        type: string
+      enum:
+        items:
+          type: string
+        type: array
+      example: {}
+      key:
+        type: string
+      order:
+        type: integer
+      required:
+        type: boolean
+      title:
+        type: string
+      type:
+        type: string
+    type: object
+  tts.ImportModelsResponse:
+    properties:
+      created:
+        type: integer
+      models:
+        items:
+          type: string
+        type: array
+      skipped:
+        type: integer
+    type: object
  tts.ModelCapabilities:
    properties:
+      config_schema:
+        $ref: '#/definitions/tts.ConfigSchema'
      formats:
        items:
          type: string
        type: array
+      metadata:
+        additionalProperties:
+          type: string
+        type: object
      pitch:
        $ref: '#/definitions/tts.ParamConstraint'
      speed:
@@ -2780,6 +2826,8 @@ definitions:
    properties:
      capabilities:
        $ref: '#/definitions/tts.ModelCapabilities'
+      config_schema:
+        $ref: '#/definitions/tts.ConfigSchema'
      description:
        type: string
      id:
@@ -2802,6 +2850,8 @@ definitions:
    type: object
  tts.ProviderMetaResponse:
    properties:
+      config_schema:
+        $ref: '#/definitions/tts.ConfigSchema'
      default_model:
        type: string
      description:
@@ -2839,10 +2889,15 @@ definitions:
    properties:
      client_type:
        type: string
+      config:
+        additionalProperties: {}
+        type: object
      created_at:
        type: string
      enable:
        type: boolean
+      icon:
+        type: string
      id:
        type: string
      name:
@@ -8229,6 +8284,96 @@ paths:
      summary: List speech providers
      tags:
      - speech-providers
+  /speech-providers/{id}:
+    get:
+      description: Get a speech provider with masked config values
+      parameters:
+      - description: Provider ID (UUID)
+        in: path
+        name: id
+        required: true
+        type: string
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/tts.SpeechProviderResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/handlers.ErrorResponse'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/handlers.ErrorResponse'
+      summary: Get speech provider
+      tags:
+      - speech-providers
+  /speech-providers/{id}/import-models:
+    post:
+      consumes:
+      - application/json
+      description: Fetch models using the configured speech provider and import them
+        into the unified models table
+      parameters:
+      - description: Provider ID (UUID)
+        in: path
+        name: id
+        required: true
+        type: string
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/tts.ImportModelsResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/handlers.ErrorResponse'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/handlers.ErrorResponse'
+        "500":
+          description: Internal Server Error
+          schema:
+            $ref: '#/definitions/handlers.ErrorResponse'
+      summary: Import speech models from provider
+      tags:
+      - speech-providers
+  /speech-providers/{id}/models:
+    get:
+      description: List models of type 'speech' for a specific speech provider
+      parameters:
+      - description: Provider ID (UUID)
+        in: path
+        name: id
+        required: true
+        type: string
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            items:
+              $ref: '#/definitions/tts.SpeechModelResponse'
+            type: array
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/handlers.ErrorResponse'
+        "500":
+          description: Internal Server Error
+          schema:
+            $ref: '#/definitions/handlers.ErrorResponse'
+      summary: List speech models by provider
+      tags:
+      - speech-providers
  /speech-providers/meta:
    get:
      description: List available speech provider types with their models and capabilities