feat: expand speech provider support with new client types and config… (#389)

* feat: expand speech provider support with new client types and configuration schema

* feat: add icon support for speech providers and update related configurations

* feat: add SVG support for Deepgram and Elevenlabs with Vue components

* feat: exclude *-speech client types from the LLM provider list

* feat: enhance speech provider functionality with advanced settings and model import capabilities

* chore: remove go.mod replace

* feat: enhance speech provider functionality with advanced settings and model import capabilities

* chore: update go module dependencies

---------

Co-authored-by: Acbox <acbox0328@gmail.com>
This commit is contained in:
Yiming Qi
2026-04-19 22:58:16 +09:00
committed by GitHub
parent 8e013ad1ad
commit 8d78925a23
46 changed files with 2808 additions and 565 deletions
@@ -13,10 +13,12 @@ import {
ClaudeColor,
Cohere,
CohereColor,
Deepgram,
Deepseek,
DeepseekColor,
Doubao,
DoubaoColor,
Elevenlabs,
Fireworks,
FireworksColor,
Gemini,
@@ -35,6 +37,8 @@ import {
Lmstudio,
Meta,
MetaColor,
Microsoft,
MicrosoftColor,
Minimax,
MinimaxColor,
Mistral,
@@ -81,6 +85,8 @@ export const iconMap: Record<string, Component> = {
'google-brand-color': GoogleBrandColor,
'deepseek': Deepseek,
'deepseek-color': DeepseekColor,
'deepgram': Deepgram,
'elevenlabs': Elevenlabs,
'groq': Groq,
'huggingface': Huggingface,
'huggingface-color': HuggingfaceColor,
@@ -105,6 +111,8 @@ export const iconMap: Record<string, Component> = {
'cohere-color': CohereColor,
'azure': Azure,
'azure-color': AzureColor,
'microsoft': Microsoft,
'microsoft-color': MicrosoftColor,
'nvidia': Nvidia,
'nvidia-color': NvidiaColor,
'fireworks': Fireworks,
+41 -1
View File
@@ -40,9 +40,49 @@ export const CLIENT_TYPE_META: Record<string, ClientTypeMeta> = {
label: 'Edge Speech',
hint: 'Microsoft Edge Read Aloud TTS',
},
'openai-speech': {
value: 'openai-speech',
label: 'OpenAI Speech',
hint: 'OpenAI /audio/speech compatible TTS',
},
'openrouter-speech': {
value: 'openrouter-speech',
label: 'OpenRouter Speech',
hint: 'OpenRouter audio modality TTS',
},
'elevenlabs-speech': {
value: 'elevenlabs-speech',
label: 'ElevenLabs Speech',
hint: 'ElevenLabs text-to-speech',
},
'deepgram-speech': {
value: 'deepgram-speech',
label: 'Deepgram Speech',
hint: 'Deepgram TTS',
},
'minimax-speech': {
value: 'minimax-speech',
label: 'MiniMax Speech',
hint: 'MiniMax TTS',
},
'volcengine-speech': {
value: 'volcengine-speech',
label: 'Volcengine Speech',
hint: 'Volcengine SAMI TTS',
},
'alibabacloud-speech': {
value: 'alibabacloud-speech',
label: 'Alibaba Cloud Speech',
hint: 'DashScope CosyVoice TTS',
},
'microsoft-speech': {
value: 'microsoft-speech',
label: 'Microsoft Speech',
hint: 'Azure Cognitive Services TTS',
},
}
export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META)
/**
 * Client types offered for LLM providers: every entry of CLIENT_TYPE_LIST whose
 * `value` does not end in `-speech`. The suffix check already excludes
 * `edge-speech`, so no separate filter for that value is needed.
 */
export const LLM_CLIENT_TYPE_LIST: ClientTypeMeta[] = CLIENT_TYPE_LIST
  .filter(ct => !ct.value.endsWith('-speech'))
+5
View File
@@ -424,6 +424,11 @@
"modelIdPlaceholder": "Enter model identifier (e.g. custom-voice)",
"noModels": "No models found. Click \"Import Models\" to discover available models or \"Add Model\" to create one manually.",
"noCapabilities": "No capabilities available for this model.",
"saveSuccess": "Speech configuration saved",
"advanced": {
"title": "Advanced Settings",
"description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults."
},
"fields": {
"language": "Language",
"languagePlaceholder": "Select language...",
+5
View File
@@ -420,6 +420,11 @@
"modelIdPlaceholder": "输入模型标识符(如 custom-voice)",
"noModels": "暂无模型,点击\"导入模型\"发现可用模型,或点击\"新建模型\"手动创建。",
"noCapabilities": "该模型暂无可用能力信息。",
"saveSuccess": "语音配置已保存",
"advanced": {
"title": "高级设置",
"description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。"
},
"fields": {
"language": "语言",
"languagePlaceholder": "选择语言...",
@@ -1,189 +1,198 @@
<template>
<div class="space-y-4">
<template v-if="caps">
<!-- Language -->
<div class="space-y-2">
<Label for="tts-lang">{{ $t('speech.fields.language') }}</Label>
<Select
:model-value="configData.voice_lang ?? ''"
@update:model-value="onLangChange"
>
<SelectTrigger
id="tts-lang"
class="w-full"
>
<SelectValue :placeholder="$t('speech.fields.languagePlaceholder')" />
</SelectTrigger>
<SelectContent class="max-h-60">
<SelectItem
v-for="lang in availableLanguages"
:key="lang"
:value="lang"
>
{{ lang }}
</SelectItem>
</SelectContent>
</Select>
</div>
<!-- Voice -->
<div class="space-y-2">
<Label for="tts-voice">{{ $t('speech.fields.voice') }}</Label>
<Select
:model-value="configData.voice_id ?? ''"
@update:model-value="(val) => configData.voice_id = val"
>
<SelectTrigger
id="tts-voice"
class="w-full"
>
<SelectValue :placeholder="$t('speech.fields.voicePlaceholder')" />
</SelectTrigger>
<SelectContent class="max-h-60">
<SelectItem
v-for="voice in filteredVoices"
:key="voice.id"
:value="voice.id!"
>
{{ voice.name }} ({{ voice.id }})
</SelectItem>
</SelectContent>
</Select>
</div>
<!-- Format -->
<div
v-if="caps.formats && caps.formats.length > 0"
<template v-if="basicFields.length > 0">
<section
v-for="field in basicFields"
:key="field.key"
class="space-y-2"
>
<Label for="tts-format">{{ $t('speech.fields.format') }}</Label>
<Select
:model-value="configData.format ?? ''"
@update:model-value="(val) => configData.format = val"
<Label :for="field.type === 'bool' || field.type === 'enum' ? undefined : `tts-field-${field.key}`">
{{ field.title || field.key }}
</Label>
<p
v-if="field.description"
class="text-xs text-muted-foreground"
>
<SelectTrigger
id="tts-format"
class="w-full"
{{ field.description }}
</p>
<div
v-if="field.type === 'secret'"
class="relative"
>
<Input
:id="`tts-field-${field.key}`"
v-model="configData[field.key] as string"
:type="visibleSecrets[field.key] ? 'text' : 'password'"
:placeholder="field.example ? String(field.example) : ''"
/>
<button
type="button"
class="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground"
@click="visibleSecrets[field.key] = !visibleSecrets[field.key]"
>
<SelectValue :placeholder="$t('speech.fields.formatPlaceholder')" />
<component
:is="visibleSecrets[field.key] ? EyeOff : Eye"
class="size-3.5"
/>
</button>
</div>
<Switch
v-else-if="field.type === 'bool'"
:model-value="!!configData[field.key]"
@update:model-value="(val) => configData[field.key] = !!val"
/>
<Input
v-else-if="field.type === 'number'"
:id="`tts-field-${field.key}`"
v-model.number="configData[field.key] as number"
type="number"
:placeholder="field.example ? String(field.example) : ''"
/>
<Select
v-else-if="field.type === 'enum' && field.enum"
:model-value="String(configData[field.key] ?? '')"
@update:model-value="(val) => configData[field.key] = val"
>
<SelectTrigger>
<SelectValue :placeholder="field.title || field.key" />
</SelectTrigger>
<SelectContent>
<SelectItem
v-for="fmt in caps.formats"
:key="fmt"
:value="fmt"
v-for="opt in field.enum"
:key="opt"
:value="opt"
>
{{ fmt }}
{{ opt }}
</SelectItem>
</SelectContent>
</Select>
</div>
<!-- Speed -->
<div
v-if="caps.speed"
class="space-y-2"
>
<Label>{{ $t('speech.fields.speed') }}</Label>
<p class="text-xs text-muted-foreground">
{{ $t('speech.fields.speedDescription', { default: caps.speed.default ?? 1 }) }}
</p>
<div v-if="caps.speed.options && caps.speed.options.length > 0">
<Select
:model-value="String(configData.speed ?? caps.speed.default ?? 1)"
@update:model-value="(val) => configData.speed = Number(val)"
>
<SelectTrigger class="w-full">
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem
v-for="opt in caps.speed.options"
:key="opt"
:value="String(opt)"
>
{{ opt }}x
</SelectItem>
</SelectContent>
</Select>
</div>
<div
<Input
v-else
class="flex items-center gap-3"
>
<Slider
:model-value="[Number(configData.speed ?? caps.speed.default ?? 1)]"
:min="caps.speed.min"
:max="caps.speed.max"
:step="0.1"
class="flex-1"
@update:model-value="(val) => configData.speed = val[0]"
/>
<span class="text-xs text-muted-foreground w-12 text-right">
{{ Number(configData.speed ?? caps.speed.default ?? 1).toFixed(1) }}x
</span>
</div>
</div>
<!-- Pitch -->
<div
v-if="caps.pitch"
class="space-y-2"
>
<Label>{{ $t('speech.fields.pitch') }}</Label>
<p class="text-xs text-muted-foreground">
{{ $t('speech.fields.pitchDescription', { default: caps.pitch.default ?? 0 }) }}
</p>
<div
v-if="caps.pitch.options && caps.pitch.options.length > 0"
>
<Select
:model-value="String(configData.pitch ?? caps.pitch.default ?? 0)"
@update:model-value="(val) => configData.pitch = Number(val)"
>
<SelectTrigger class="w-full">
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem
v-for="opt in caps.pitch.options"
:key="opt"
:value="String(opt)"
>
{{ opt }} Hz
</SelectItem>
</SelectContent>
</Select>
</div>
<div
v-else
class="flex items-center gap-3"
>
<Slider
:model-value="[Number(configData.pitch ?? caps.pitch.default ?? 0)]"
:min="caps.pitch.min"
:max="caps.pitch.max"
:step="1"
class="flex-1"
@update:model-value="(val) => configData.pitch = val[0]"
/>
<span class="text-xs text-muted-foreground w-16 text-right">
{{ Number(configData.pitch ?? caps.pitch.default ?? 0).toFixed(0) }} Hz
</span>
</div>
</div>
:id="`tts-field-${field.key}`"
v-model="configData[field.key] as string"
type="text"
:placeholder="field.example ? String(field.example) : ''"
/>
</section>
</template>
<div
v-else
v-else-if="advancedFields.length === 0"
class="text-xs text-muted-foreground"
>
{{ $t('speech.noCapabilities') }}
</div>
<div
v-if="advancedFields.length > 0"
class="rounded-lg border border-border"
>
<button
type="button"
class="flex w-full items-center justify-between px-3 py-2 text-left text-xs font-medium"
@click="showAdvanced = !showAdvanced"
>
<span>{{ $t('speech.advanced.title') }}</span>
<component
:is="showAdvanced ? ChevronUp : ChevronDown"
class="size-3 text-muted-foreground"
/>
</button>
<div
v-if="showAdvanced"
class="space-y-4 border-t border-border px-3 py-3"
>
<p class="text-xs text-muted-foreground">
{{ $t('speech.advanced.description') }}
</p>
<section
v-for="field in advancedFields"
:key="field.key"
class="space-y-2"
>
<Label :for="field.type === 'bool' || field.type === 'enum' ? undefined : `tts-field-${field.key}`">
{{ field.title || field.key }}
</Label>
<p
v-if="field.description"
class="text-xs text-muted-foreground"
>
{{ field.description }}
</p>
<div
v-if="field.type === 'secret'"
class="relative"
>
<Input
:id="`tts-field-${field.key}`"
v-model="configData[field.key] as string"
:type="visibleSecrets[field.key] ? 'text' : 'password'"
:placeholder="field.example ? String(field.example) : ''"
/>
<button
type="button"
class="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground"
@click="visibleSecrets[field.key] = !visibleSecrets[field.key]"
>
<component
:is="visibleSecrets[field.key] ? EyeOff : Eye"
class="size-3.5"
/>
</button>
</div>
<Switch
v-else-if="field.type === 'bool'"
:model-value="!!configData[field.key]"
@update:model-value="(val) => configData[field.key] = !!val"
/>
<Input
v-else-if="field.type === 'number'"
:id="`tts-field-${field.key}`"
v-model.number="configData[field.key] as number"
type="number"
:placeholder="field.example ? String(field.example) : ''"
/>
<Select
v-else-if="field.type === 'enum' && field.enum"
:model-value="String(configData[field.key] ?? '')"
@update:model-value="(val) => configData[field.key] = val"
>
<SelectTrigger>
<SelectValue :placeholder="field.title || field.key" />
</SelectTrigger>
<SelectContent>
<SelectItem
v-for="opt in field.enum"
:key="opt"
:value="opt"
>
{{ opt }}
</SelectItem>
</SelectContent>
</Select>
<Input
v-else
:id="`tts-field-${field.key}`"
v-model="configData[field.key] as string"
type="text"
:placeholder="field.example ? String(field.example) : ''"
/>
</section>
</div>
</div>
<Separator class="my-3" />
<!-- Test Synthesis -->
<div class="space-y-3">
<h4 class="text-xs font-medium">
{{ $t('speech.test.title') }}
@@ -209,9 +218,7 @@
:disabled="!testText.trim() || testText.length > maxTestTextLen"
@click="handleTest"
>
<Play
class="mr-1.5"
/>
<Play class="mr-1.5" />
{{ $t('speech.test.generate') }}
</LoadingButton>
<span
@@ -251,104 +258,88 @@
<script setup lang="ts">
import {
Input,
Label,
Select,
SelectTrigger,
SelectValue,
SelectContent,
SelectItem,
Slider,
Textarea,
SelectTrigger,
SelectValue,
Separator,
Switch,
Textarea,
} from '@memohai/ui'
import { Play } from 'lucide-vue-next'
import LoadingButton from '@/components/loading-button/index.vue'
import type { TtsModelCapabilities, TtsVoiceInfo } from '@memohai/sdk'
import { ChevronDown, ChevronUp, Eye, EyeOff, Play } from 'lucide-vue-next'
import { computed, onBeforeUnmount, reactive, ref, watch } from 'vue'
import { toast } from 'vue-sonner'
import { useI18n } from 'vue-i18n'
import LoadingButton from '@/components/loading-button/index.vue'
/**
 * Describes one configurable field of a speech model, as rendered by this
 * editor's template.
 */
interface SpeechFieldSchema {
// Property name under which the field's value is stored in the config object.
key: string
// Selects the input control: 'secret', 'bool', 'number' and 'enum' get dedicated
// controls in the template; any other value falls back to a plain text input.
type: string
// Label text; the template falls back to `key` when this is empty.
title?: string
// Optional helper text rendered under the label.
description?: string
// Not referenced in the visible part of this component — presumably enforced
// elsewhere; TODO confirm.
required?: boolean
// When truthy the field is listed in the collapsible advanced section instead
// of the basic list.
advanced?: boolean
// Options offered when `type` is 'enum'.
enum?: string[]
// Shown (stringified) as the input placeholder when truthy.
example?: unknown
// Ascending sort key for display; a missing value sorts as 0.
order?: number
}
/** Schema of a model's editable configuration: the list of field descriptors. */
interface SpeechConfigSchema {
fields?: SpeechFieldSchema[]
}
const props = defineProps<{
modelId: string
modelName: string
config: Record<string, unknown>
capabilities: TtsModelCapabilities | null
schema: SpeechConfigSchema | null
onTest: (text: string, config: Record<string, unknown>) => Promise<Blob>
}>()
const emit = defineEmits<{
save: [config: Record<string, unknown>]
test: [text: string, config: Record<string, unknown>]
}>()
const { t } = useI18n()
const caps = computed(() => props.capabilities)
const configData = reactive<Record<string, unknown>>({})
watch(() => props.config, (cfg) => {
Object.keys(configData).forEach((k) => delete configData[k])
if (cfg.voice && typeof cfg.voice === 'object') {
const voice = cfg.voice as Record<string, unknown>
configData.voice_id = voice.id ?? ''
configData.voice_lang = voice.lang ?? ''
}
if (cfg.format) configData.format = cfg.format
if (cfg.speed != null) configData.speed = cfg.speed
if (cfg.pitch != null) configData.pitch = cfg.pitch
if (cfg.sample_rate != null) configData.sample_rate = cfg.sample_rate
}, { immediate: true })
const availableLanguages = computed(() => {
if (!caps.value?.voices) return []
const langs = new Set(caps.value.voices.map((v: TtsVoiceInfo) => v.lang ?? '').filter(Boolean))
return [...langs].sort()
})
const filteredVoices = computed(() => {
if (!caps.value?.voices) return []
const lang = configData.voice_lang
if (!lang) return caps.value.voices
return caps.value.voices.filter((v: TtsVoiceInfo) => v.lang === lang)
})
function onLangChange(lang: string) {
configData.voice_lang = lang
const voices = caps.value?.voices?.filter((v: TtsVoiceInfo) => v.lang === lang)
if (voices && voices.length > 0 && !voices.some((v: TtsVoiceInfo) => v.id === configData.voice_id)) {
configData.voice_id = voices[0].id ?? ''
}
}
function buildConfig(): Record<string, unknown> {
const result: Record<string, unknown> = {}
if (configData.voice_id || configData.voice_lang) {
result.voice = { id: configData.voice_id ?? '', lang: configData.voice_lang ?? '' }
}
if (configData.format) result.format = configData.format
if (configData.speed != null) result.speed = Number(configData.speed)
if (configData.pitch != null) result.pitch = Number(configData.pitch)
if (configData.sample_rate != null) result.sample_rate = Number(configData.sample_rate)
return result
}
const visibleSecrets = reactive<Record<string, boolean>>({})
const saving = ref(false)
async function handleSaveConfig() {
saving.value = true
try {
emit('save', buildConfig())
} finally {
saving.value = false
}
}
// Test synthesis
const maxTestTextLen = 500
const showAdvanced = ref(false)
const testText = ref('')
const testLoading = ref(false)
const testError = ref('')
const audioUrl = ref('')
const audioEl = ref<HTMLAudioElement>()
const maxTestTextLen = 500
// Schema fields in display order: ascending by `order`, a missing `order` counts as 0.
const orderedFields = computed(() => {
  const byOrder = (left: SpeechFieldSchema, right: SpeechFieldSchema) =>
    (left.order ?? 0) - (right.order ?? 0)
  const declared = props.schema?.fields ?? []
  // Sort a copy so the schema passed in through props is never mutated.
  return declared.slice().sort(byOrder)
})
// Fields rendered directly in the form.
const basicFields = computed(() => orderedFields.value.filter(({ advanced }) => !advanced))
// Fields rendered inside the collapsible advanced section.
const advancedFields = computed(() => orderedFields.value.filter(({ advanced }) => advanced))
// Re-seed the editable copy whenever the `config` prop changes (deep watch,
// also run once on setup): clear every key, then shallow-copy the incoming
// values. Any local edits not yet saved are replaced by the prop's values.
watch(() => props.config, (cfg) => {
Object.keys(configData).forEach((key) => delete configData[key])
Object.assign(configData, { ...(cfg ?? {}) })
// Open the advanced section automatically when at least one advanced field
// already holds a non-empty value; collapse it otherwise.
showAdvanced.value = advancedFields.value.some(field => {
const value = cfg?.[field.key]
return value !== '' && value != null
})
}, { immediate: true, deep: true })
/**
 * Snapshot of the edited configuration that is handed to save and test.
 * Keys whose value is the empty string, `null` or `undefined` are left out;
 * every other value (including `0` and `false`) is copied unchanged.
 */
function buildConfig(): Record<string, unknown> {
  const cleaned: Record<string, unknown> = {}
  Object.keys(configData).forEach((name) => {
    const current = configData[name]
    const hasValue = current !== '' && current != null
    if (hasValue) cleaned[name] = current
  })
  return cleaned
}
function revokeAudio() {
if (audioUrl.value) {
@@ -359,6 +350,15 @@ function revokeAudio() {
onBeforeUnmount(revokeAudio)
/**
 * Emits `save` with the current cleaned configuration. The `saving` flag is
 * raised for the duration of the emit call and always lowered afterwards.
 */
async function handleSaveConfig() {
  saving.value = true
  try {
    const payload = buildConfig()
    emit('save', payload)
  } finally {
    saving.value = false
  }
}
async function handleTest() {
if (!testText.value.trim()) return
testLoading.value = true
@@ -366,39 +366,13 @@ async function handleTest() {
revokeAudio()
try {
const blob = await new Promise<Blob>((resolve, reject) => {
const handler = async () => {
try {
const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
const token = localStorage.getItem('token')
const resp = await fetch(`${apiBase}/speech-models/${props.modelId}/test`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
...(token ? { Authorization: `Bearer ${token}` } : {}),
},
body: JSON.stringify({ text: testText.value, config: buildConfig() }),
})
if (!resp.ok) {
const errBody = await resp.text()
let msg: string
try { msg = JSON.parse(errBody)?.message ?? errBody } catch { msg = errBody }
reject(new Error(msg))
return
}
resolve(await resp.blob())
} catch (e) {
reject(e)
}
}
handler()
})
const blob = await props.onTest(testText.value, buildConfig())
audioUrl.value = URL.createObjectURL(blob)
await new Promise<void>((resolve) => setTimeout(resolve, 50))
audioEl.value?.play()
} catch (e: unknown) {
const msg = e instanceof Error ? e.message : t('speech.test.failed')
} catch (error: unknown) {
const msg = error instanceof Error ? error.message : t('speech.test.failed')
testError.value = msg
toast.error(msg)
} finally {
@@ -1,9 +1,19 @@
<template>
<div class="p-4">
<section class="flex items-center gap-3">
<Volume2
class="size-5"
/>
<span class="flex size-10 shrink-0 items-center justify-center rounded-full bg-muted">
<ProviderIcon
v-if="curProvider?.icon"
:icon="curProvider.icon"
size="1.5em"
/>
<span
v-else
class="text-xs font-medium text-muted-foreground"
>
{{ getInitials(curProvider?.name) }}
</span>
</span>
<div class="min-w-0">
<h2 class="text-sm font-semibold truncate">
{{ curProvider?.name }}
@@ -25,12 +35,121 @@
</section>
<Separator class="mt-4 mb-6" />
<!-- Models -->
<form
class="space-y-4"
@submit.prevent="handleSaveProvider"
>
<section class="space-y-2">
<Label for="speech-provider-name">{{ $t('common.name') }}</Label>
<Input
id="speech-provider-name"
v-model="providerName"
type="text"
:placeholder="$t('common.namePlaceholder')"
/>
</section>
<section
v-for="field in orderedProviderFields"
:key="field.key"
class="space-y-2"
>
<Label :for="field.type === 'bool' || field.type === 'enum' ? undefined : `speech-provider-${field.key}`">
{{ field.title || field.key }}
</Label>
<p
v-if="field.description"
class="text-xs text-muted-foreground"
>
{{ field.description }}
</p>
<div
v-if="field.type === 'secret'"
class="relative"
>
<Input
:id="`speech-provider-${field.key}`"
v-model="providerConfig[field.key] as string"
:type="visibleSecrets[field.key] ? 'text' : 'password'"
:placeholder="field.example ? String(field.example) : ''"
/>
<button
type="button"
class="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground"
@click="visibleSecrets[field.key] = !visibleSecrets[field.key]"
>
<component
:is="visibleSecrets[field.key] ? EyeOff : Eye"
class="size-3.5"
/>
</button>
</div>
<Switch
v-else-if="field.type === 'bool'"
:model-value="!!providerConfig[field.key]"
@update:model-value="(val) => providerConfig[field.key] = !!val"
/>
<Input
v-else-if="field.type === 'number'"
:id="`speech-provider-${field.key}`"
v-model.number="providerConfig[field.key] as number"
type="number"
:placeholder="field.example ? String(field.example) : ''"
/>
<Select
v-else-if="field.type === 'enum' && field.enum"
:model-value="String(providerConfig[field.key] ?? '')"
@update:model-value="(val) => providerConfig[field.key] = val"
>
<SelectTrigger>
<SelectValue :placeholder="field.title || field.key" />
</SelectTrigger>
<SelectContent>
<SelectItem
v-for="opt in field.enum"
:key="opt"
:value="opt"
>
{{ opt }}
</SelectItem>
</SelectContent>
</Select>
<Input
v-else
:id="`speech-provider-${field.key}`"
v-model="providerConfig[field.key] as string"
type="text"
:placeholder="field.example ? String(field.example) : ''"
/>
</section>
<div class="flex justify-end">
<LoadingButton
type="submit"
:loading="saveLoading"
>
{{ $t('provider.saveChanges') }}
</LoadingButton>
</div>
</form>
<Separator class="mt-6 mb-6" />
<section>
<div class="flex justify-between items-center mb-4">
<h3 class="text-xs font-medium">
{{ $t('speech.models') }}
</h3>
<LoadingButton
v-if="curProviderId"
type="button"
variant="outline"
size="sm"
:loading="importLoading"
@click="handleImportModels"
>
{{ $t('speech.importModels') }}
</LoadingButton>
</div>
<div
@@ -71,8 +190,9 @@
:model-id="model.id ?? ''"
:model-name="model.model_id ?? ''"
:config="model.config || {}"
:capabilities="getModelCapabilities(model.model_id ?? '')"
@test="(text, cfg) => handleTestModel(model.id ?? '', text, cfg)"
:schema="getModelSchema(model.model_id ?? '')"
:on-test="(text, cfg) => handleTestModel(model.id ?? '', text, cfg)"
@save="(cfg) => handleSaveModel(model.id ?? '', cfg)"
/>
</div>
</div>
@@ -82,65 +202,152 @@
<script setup lang="ts">
import {
Input,
Label,
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
Separator,
Switch,
} from '@memohai/ui'
import ModelConfigEditor from './model-config-editor.vue'
import { Volume2, ChevronUp, ChevronDown } from 'lucide-vue-next'
import { computed, inject, ref } from 'vue'
import { ChevronDown, ChevronUp, Eye, EyeOff } from 'lucide-vue-next'
import { computed, inject, reactive, ref, watch } from 'vue'
import { toast } from 'vue-sonner'
import { useI18n } from 'vue-i18n'
import { useQuery, useQueryCache } from '@pinia/colada'
import { getSpeechProvidersMeta, getSpeechModels, putProvidersById } from '@memohai/sdk'
import type { TtsSpeechProviderResponse, TtsProviderMetaResponse, TtsModelInfo } from '@memohai/sdk'
import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putModelsById, putProvidersById } from '@memohai/sdk'
import type { TtsSpeechModelResponse, TtsSpeechProviderResponse } from '@memohai/sdk'
import LoadingButton from '@/components/loading-button/index.vue'
import ProviderIcon from '@/components/provider-icon/index.vue'
/** Describes one configurable field rendered by the provider settings form. */
interface SpeechFieldSchema {
// Property name under which the field's value is stored in the config object.
key: string
// Selects the input control: 'secret', 'bool', 'number' and 'enum' get dedicated
// controls in the template; any other value falls back to a plain text input.
type: string
// Label text; the template falls back to `key` when this is empty.
title?: string
// Optional helper text rendered under the label.
description?: string
// Not referenced in the visible part of this component — presumably enforced
// elsewhere; TODO confirm.
required?: boolean
// Not used by the provider form shown here, which lists all fields together.
advanced?: boolean
// Options offered when `type` is 'enum'.
enum?: string[]
// Shown (stringified) as the input placeholder when truthy.
example?: unknown
// Ascending sort key for display; a missing value sorts as 0.
order?: number
}
/** A configuration schema: the list of field descriptors. */
interface SpeechConfigSchema {
fields?: SpeechFieldSchema[]
}
/** Metadata for one model of a provider, as returned by the providers-meta query. */
interface SpeechModelMeta {
// Compared against a stored model's `model_id` when looking up its metadata.
id: string
name: string
description?: string
// Preferred source of the model's config schema.
config_schema?: SpeechConfigSchema
// Fallback location of the schema when `config_schema` above is absent.
capabilities?: {
config_schema?: SpeechConfigSchema
}
}
/** Metadata for one speech provider type. */
interface SpeechProviderMeta {
// Compared against the current provider's `client_type` to select its metadata.
provider: string
display_name: string
description?: string
// Fields of this schema drive the provider-level settings form.
config_schema?: SpeechConfigSchema
// Model id used as the metadata fallback when a stored model has no exact match.
default_model?: string
models?: SpeechModelMeta[]
}
/**
 * Builds the short text badge shown when a provider has no icon.
 *
 * @param name - Provider display name; may be undefined while data is loading.
 * @returns The first two characters of the trimmed name in upper case, or `'?'`
 *   when the name is missing or blank.
 */
function getInitials(name: string | undefined) {
  const label = name?.trim() ?? ''
  if (!label) return '?'
  // Split by Unicode code point rather than UTF-16 code unit so that a
  // character outside the BMP (emoji, rare CJK) is never cut in half.
  return Array.from(label).slice(0, 2).join('').toUpperCase()
}
const { t } = useI18n()
const curProvider = inject('curTtsProvider', ref<TtsSpeechProviderResponse>())
const curProviderId = computed(() => curProvider.value?.id)
const providerName = ref('')
const providerConfig = reactive<Record<string, unknown>>({})
const visibleSecrets = reactive<Record<string, boolean>>({})
const expandedModelId = ref('')
const enableLoading = ref(false)
const saveLoading = ref(false)
const importLoading = ref(false)
const queryCache = useQueryCache()
// Detail record of the currently selected provider; resolves to null while no
// provider is selected or when the response carries no data.
const { data: providerDetail } = useQuery({
  key: () => ['speech-provider-detail', curProviderId.value],
  query: async () => {
    const id = curProviderId.value
    if (!id) return null
    const response = await getSpeechProvidersById({ path: { id }, throwOnError: true })
    return response.data ?? null
  },
})
const { data: metaList } = useQuery({
key: () => ['speech-providers-meta'],
query: async () => {
const { data } = await getSpeechProvidersMeta({ throwOnError: true })
return data
return (data ?? []) as SpeechProviderMeta[]
},
})
const currentMeta = computed<TtsProviderMetaResponse | null>(() => {
const currentMeta = computed(() => {
if (!metaList.value || !curProvider.value?.client_type) return null
return (metaList.value as TtsProviderMetaResponse[]).find((m) => m.provider === curProvider.value?.client_type) ?? null
return (metaList.value as SpeechProviderMeta[]).find((m) => m.provider === curProvider.value?.client_type) ?? null
})
function getModelCapabilities(modelId: string) {
const meta = currentMeta.value
if (!meta?.models) return null
return meta.models.find((m: TtsModelInfo) => m.id === modelId)?.capabilities ?? null
}
// Provider-level schema fields in display order: ascending by `order`, a
// missing `order` counts as 0. An absent schema yields an empty list.
const orderedProviderFields = computed(() => {
  const declared = currentMeta.value?.config_schema?.fields ?? []
  // Sort a copy so the cached metadata is never mutated.
  return declared.slice().sort((left, right) => (left.order ?? 0) - (right.order ?? 0))
})
const { data: allSpeechModels } = useQuery({
key: () => ['speech-models'],
const { data: providerSpeechModels } = useQuery({
key: () => ['speech-provider-models', curProviderId.value],
query: async () => {
const { data } = await getSpeechModels({ throwOnError: true })
return data
if (!curProviderId.value) return []
const { data } = await getSpeechProvidersByIdModels({
path: { id: curProviderId.value },
throwOnError: true,
})
return data ?? []
},
})
const providerModels = computed(() => {
if (!allSpeechModels.value || !curProviderId.value) return []
return allSpeechModels.value.filter((m) => m.provider_id === curProviderId.value)
return (providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []
})
const expandedModelId = ref('')
// Mirror the fetched provider detail into the editable form state. Runs once
// on setup and on every (deep) change of `providerDetail`: the name falls back
// to the injected provider's name, and the config copy is cleared and refilled,
// replacing whatever was typed into the form before.
watch(() => providerDetail.value, (provider) => {
providerName.value = provider?.name ?? curProvider.value?.name ?? ''
Object.keys(providerConfig).forEach((key) => delete providerConfig[key])
Object.assign(providerConfig, { ...(provider?.config ?? {}) })
}, { immediate: true, deep: true })
/**
 * Finds the metadata entry describing a stored model.
 *
 * Lookup order: an entry whose `id` equals `modelID`; otherwise, when the
 * provider declares a `default_model`, the entry with that id (or null if it
 * is not listed); otherwise the first listed entry, or null when there is none.
 */
function getModelMeta(modelID: string): SpeechModelMeta | null {
  const meta = currentMeta.value
  const candidates = meta?.models ?? []

  const matched = candidates.find(({ id }) => id === modelID)
  if (matched) return matched

  const fallbackId = meta?.default_model
  if (!fallbackId) return candidates[0] ?? null
  return candidates.find(({ id }) => id === fallbackId) ?? null
}
/**
 * Resolves the config schema used to render a model's editor: the model
 * metadata's own `config_schema` when present, else the one nested under
 * `capabilities`, else null.
 */
function getModelSchema(modelID: string): SpeechConfigSchema | null {
  const modelMeta = getModelMeta(modelID)
  if (modelMeta == null) return null
  const ownSchema = modelMeta.config_schema
  if (ownSchema != null) return ownSchema
  return modelMeta.capabilities?.config_schema ?? null
}
// Expands the given model row, or collapses it when it is already the expanded one.
function toggleModel(id: string) {
  const alreadyOpen = expandedModelId.value === id
  expandedModelId.value = alreadyOpen ? '' : id
}
const queryCache = useQueryCache()
async function handleToggleEnable(value: boolean) {
if (!curProviderId.value || !curProvider.value) return
const prev = curProvider.value.enable ?? false
curProvider.value = { ...curProvider.value, enable: value }
@@ -148,10 +355,16 @@ async function handleToggleEnable(value: boolean) {
try {
await putProvidersById({
path: { id: curProviderId.value },
body: { enable: value },
body: {
name: providerName.value.trim() || curProvider.value.name,
client_type: curProvider.value.client_type,
enable: value,
config: sanitizeConfig(providerConfig),
},
throwOnError: true,
})
queryCache.invalidateQueries({ key: ['speech-providers'] })
queryCache.invalidateQueries({ key: ['speech-provider-detail', curProviderId.value] })
} catch {
curProvider.value = { ...curProvider.value, enable: prev }
toast.error(t('common.saveFailed'))
@@ -160,6 +373,75 @@ async function handleToggleEnable(value: boolean) {
}
}
/**
 * Saves the provider form. Sends the trimmed name (falling back to the current
 * name when blank), the unchanged client type and enable flag, and the cleaned
 * config; on success shows a toast and invalidates the provider list and
 * detail queries. Any failure is reported with a generic "save failed" toast.
 */
async function handleSaveProvider() {
  const providerId = curProviderId.value
  const provider = curProvider.value
  if (!providerId || !provider) return

  saveLoading.value = true
  try {
    const body = {
      name: providerName.value.trim() || provider.name,
      client_type: provider.client_type,
      enable: provider.enable,
      config: sanitizeConfig(providerConfig),
    }
    await putProvidersById({ path: { id: providerId }, body, throwOnError: true })
    toast.success(t('speech.saveSuccess'))
    queryCache.invalidateQueries({ key: ['speech-providers'] })
    queryCache.invalidateQueries({ key: ['speech-provider-detail', curProviderId.value] })
  } catch {
    toast.error(t('common.saveFailed'))
  } finally {
    saveLoading.value = false
  }
}
/**
 * Saves a new config for one of the current provider's models.
 *
 * @param modelId - Id of the stored model; unknown ids are ignored silently.
 * @param config - Config object to store as-is.
 *
 * The model's identity fields are re-sent unchanged with `type: 'speech'`. On
 * success a toast is shown and the model queries are invalidated; any failure
 * results in a generic "save failed" toast.
 */
async function handleSaveModel(modelId: string, config: Record<string, unknown>) {
  const target = providerModels.value.find(({ id }) => id === modelId)
  if (!target) return

  try {
    const { model_id, name, provider_id } = target
    await putModelsById({
      path: { id: modelId },
      body: { model_id, name: name ?? model_id, provider_id, type: 'speech', config },
      throwOnError: true,
    })
    toast.success(t('speech.saveSuccess'))
    queryCache.invalidateQueries({ key: ['speech-provider-models', curProviderId.value] })
    queryCache.invalidateQueries({ key: ['speech-models'] })
  } catch {
    toast.error(t('common.saveFailed'))
  }
}
/**
 * Asks the backend to import models for the current provider and reports the
 * created/skipped counts (0 when the response omits them). Afterwards the
 * provider-model, speech-model and providers-meta queries are invalidated.
 * Does nothing when no provider is selected; failures show an import-failed toast.
 */
async function handleImportModels() {
  const providerId = curProviderId.value
  if (!providerId) return

  importLoading.value = true
  try {
    const response = await postSpeechProvidersByIdImportModels({
      path: { id: providerId },
      throwOnError: true,
    })
    const created = response.data?.created ?? 0
    const skipped = response.data?.skipped ?? 0
    toast.success(t('speech.importSuccess', { created, skipped }))
    queryCache.invalidateQueries({ key: ['speech-provider-models', curProviderId.value] })
    queryCache.invalidateQueries({ key: ['speech-models'] })
    queryCache.invalidateQueries({ key: ['speech-providers-meta'] })
  } catch {
    toast.error(t('speech.importFailed'))
  } finally {
    importLoading.value = false
  }
}
async function handleTestModel(modelId: string, text: string, config: Record<string, unknown>) {
const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
const token = localStorage.getItem('token')
@@ -183,4 +465,13 @@ async function handleTestModel(modelId: string, text: string, config: Record<str
}
return resp.blob()
}
/**
 * Returns a copy of `input` without blank entries.
 *
 * Keys whose value is the empty string, `null` or `undefined` are dropped;
 * every other value (including `0`, `false` and empty arrays) is kept as-is.
 * Key order follows `input`, and `input` itself is not modified.
 */
function sanitizeConfig(input: Record<string, unknown>) {
  const kept: Record<string, unknown> = {}
  Object.keys(input).forEach((name) => {
    const candidate = input[name]
    const hasValue = candidate !== '' && candidate != null
    if (hasValue) kept[name] = candidate
  })
  return kept
}
</script>
+16 -2
View File
@@ -18,6 +18,12 @@ import type { TtsSpeechProviderResponse } from '@memohai/sdk'
import ProviderSetting from './components/provider-setting.vue'
import { Volume2 } from 'lucide-vue-next'
import MasterDetailSidebarLayout from '@/components/master-detail-sidebar-layout/index.vue'
import ProviderIcon from '@/components/provider-icon/index.vue'
/**
 * Builds the short text badge shown in the provider list when an item has no icon.
 *
 * @param name - Provider display name; may be undefined.
 * @returns The first two characters of the trimmed name in upper case, or `'?'`
 *   when the name is missing or blank.
 */
function getInitials(name: string | undefined) {
  const label = name?.trim() ?? ''
  if (!label) return '?'
  // Split by Unicode code point rather than UTF-16 code unit so that a
  // character outside the BMP (emoji, rare CJK) is never cut in half.
  return Array.from(label).slice(0, 2).join('').toUpperCase()
}
const { data: providerData } = useQuery({
key: () => ['speech-providers'],
@@ -79,9 +85,17 @@ watch(filteredProviders, (list) => {
>
<span class="relative shrink-0">
<span class="flex size-7 items-center justify-center rounded-full bg-muted">
<Volume2
class="size-3.5 text-muted-foreground"
<ProviderIcon
v-if="item.icon"
:icon="item.icon"
size="1.25em"
/>
<span
v-else
class="text-xs font-medium text-muted-foreground"
>
{{ getInitials(item.name) }}
</span>
</span>
<span
v-if="item.enable !== false"
+13 -5
View File
@@ -88,7 +88,6 @@ import (
"github.com/memohai/memoh/internal/storage/providers/fallback"
"github.com/memohai/memoh/internal/storage/providers/localfs"
ttspkg "github.com/memohai/memoh/internal/tts"
ttsedge "github.com/memohai/memoh/internal/tts/adapter/edge"
"github.com/memohai/memoh/internal/version"
"github.com/memohai/memoh/internal/workspace"
)
@@ -520,10 +519,8 @@ func provideWebHandler(channelManager *channel.Manager, channelStore *channel.St
return h
}
func provideTtsRegistry(log *slog.Logger) *ttspkg.Registry {
reg := ttspkg.NewRegistry()
reg.Register(ttsedge.NewEdgeAdapter(log))
return reg
func provideTtsRegistry() *ttspkg.Registry {
return ttspkg.NewRegistry()
}
func provideTtsTempStore() (*ttspkg.TempStore, error) {
@@ -687,6 +684,17 @@ func startRegistrySync(lc fx.Lifecycle, log *slog.Logger, cfg config.Config, que
})
}
// startSpeechProviderBootstrap appends an OnStart hook that syncs the speech
// registry through ttspkg.SyncRegistry. A sync failure is logged as a warning
// and never returned, so the hook itself always reports success.
func startSpeechProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, queries *dbsqlc.Queries, registry *ttspkg.Registry) {
	bootstrap := func(ctx context.Context) error {
		syncErr := ttspkg.SyncRegistry(ctx, log, queries, registry)
		if syncErr == nil {
			return nil
		}
		// Best effort: report the problem but do not fail the hook.
		log.Warn("speech registry bootstrap failed", slog.Any("error", syncErr))
		return nil
	}
	lc.Append(fx.Hook{OnStart: bootstrap})
}
func startMemoryProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, mpService *memprovider.Service, registry *memprovider.Registry) {
mpService.SetRegistry(registry)
lc.Append(fx.Hook{
+1
View File
@@ -141,6 +141,7 @@ func options() fx.Option {
fx.Invoke(
injectToolProviders,
startRegistrySync,
startSpeechProviderBootstrap,
startMemoryProviderBootstrap,
startSearchProviderBootstrap,
startScheduleService,
+8
View File
@@ -0,0 +1,8 @@
name: Alibaba Cloud Speech
client_type: alibabacloud-speech
icon: bailian-color
models:
- model_id: cosyvoice-tts
name: CosyVoice TTS
type: speech
+8
View File
@@ -0,0 +1,8 @@
name: Deepgram Speech
client_type: deepgram-speech
icon: deepgram
models:
- model_id: deepgram-tts
name: Deepgram TTS
type: speech
+1 -1
View File
@@ -1,6 +1,6 @@
name: Edge
client_type: edge-speech
icon: edge
icon: microsoft
models:
- model_id: edge-read-aloud
+8
View File
@@ -0,0 +1,8 @@
name: ElevenLabs Speech
client_type: elevenlabs-speech
icon: elevenlabs
models:
- model_id: elevenlabs-tts
name: ElevenLabs TTS
type: speech
+8
View File
@@ -0,0 +1,8 @@
name: Microsoft Speech
client_type: microsoft-speech
icon: azure-color
models:
- model_id: microsoft-tts
name: Microsoft TTS
type: speech
+8
View File
@@ -0,0 +1,8 @@
name: MiniMax Speech
client_type: minimax-speech
icon: minimax-color
models:
- model_id: minimax-tts
name: MiniMax TTS
type: speech
+8
View File
@@ -0,0 +1,8 @@
name: OpenAI Speech
client_type: openai-speech
icon: openai
models:
- model_id: gpt-4o-mini-tts
name: GPT-4o Mini TTS
type: speech
+8
View File
@@ -0,0 +1,8 @@
name: OpenRouter Speech
client_type: openrouter-speech
icon: openrouter
models:
- model_id: openrouter-tts
name: OpenRouter TTS
type: speech
+8
View File
@@ -0,0 +1,8 @@
name: Volcengine Speech
client_type: volcengine-speech
icon: volcengine-color
models:
- model_id: sami-tts
name: SAMI TTS
type: speech
+17 -1
View File
@@ -68,7 +68,23 @@ CREATE TABLE IF NOT EXISTS providers (
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
CONSTRAINT providers_name_unique UNIQUE (name),
CONSTRAINT providers_client_type_check CHECK (client_type IN ('openai-responses', 'openai-completions', 'anthropic-messages', 'google-generative-ai', 'openai-codex', 'github-copilot', 'edge-speech'))
CONSTRAINT providers_client_type_check CHECK (client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
))
);
CREATE TABLE IF NOT EXISTS search_providers (
@@ -0,0 +1,29 @@
-- 0068_expand_speech_provider_client_types (rollback)
-- Remove newly added Twilight speech provider client_type values from unified providers table.
-- Order matters: rows that use one of the new client types are deleted first,
-- because adding the narrower CHECK constraint below validates existing rows
-- and would fail while any such row remains.
-- NOTE(review): this permanently discards those providers; what happens to
-- models that reference them depends on the foreign key definition, which is
-- not visible in this migration -- confirm before running against real data.
DELETE FROM providers
WHERE client_type IN (
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
);
-- Drop and re-create the constraint with the pre-0068 list of client types.
ALTER TABLE IF EXISTS providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE IF EXISTS providers
ADD CONSTRAINT providers_client_type_check CHECK (
client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech'
)
);
@@ -0,0 +1,25 @@
-- 0068_expand_speech_provider_client_types
-- Allow all Twilight speech provider client_type values in unified providers table.
-- The expression of a CHECK constraint cannot be altered in place, so the
-- constraint is dropped and re-created with the extended list. Both statements
-- use IF EXISTS, so they are no-ops when the table or constraint is missing.
ALTER TABLE IF EXISTS providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE IF EXISTS providers
ADD CONSTRAINT providers_client_type_check CHECK (
client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
)
);
+40 -4
View File
@@ -18,7 +18,17 @@ SELECT * FROM providers WHERE name = sqlc.arg(name);
-- name: ListProviders :many
SELECT * FROM providers
WHERE client_type NOT IN ('edge-speech')
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
)
ORDER BY created_at DESC;
-- name: UpdateProvider :one
@@ -38,8 +48,19 @@ RETURNING *;
DELETE FROM providers WHERE id = sqlc.arg(id);
-- name: CountProviders :one
SELECT COUNT(*) FROM providers
WHERE client_type NOT IN ('edge-speech');
SELECT COUNT(*)
FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
);
-- name: CreateModel :one
INSERT INTO models (model_id, name, provider_id, type, config)
@@ -110,6 +131,11 @@ DELETE FROM models WHERE id = sqlc.arg(id);
-- name: DeleteModelByModelID :exec
DELETE FROM models WHERE model_id = sqlc.arg(model_id);
-- name: DeleteModelByProviderIDAndModelID :exec
DELETE FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND model_id = sqlc.arg(model_id);
-- name: CountModels :one
SELECT COUNT(*) FROM models
WHERE type != 'speech';
@@ -192,7 +218,17 @@ WHERE m.id = sqlc.arg(id)
-- name: ListSpeechProviders :many
SELECT * FROM providers
WHERE client_type IN ('edge-speech')
WHERE client_type IN (
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
)
ORDER BY created_at DESC;
-- name: ListSpeechModels :many
+17 -8
View File
@@ -18,7 +18,6 @@ require (
github.com/creack/pty v1.1.24
github.com/emersion/go-imap/v2 v2.0.0-beta.8
github.com/emersion/go-sasl v0.0.0-20241020182733-b788ff22d5a6
github.com/go-playground/validator/v10 v10.30.1
github.com/go-shiori/go-readability v0.0.0-20251205110129-5db1dc9836f0
github.com/go-telegram-bot-api/telegram-bot-api/v5 v5.5.1
github.com/golang-jwt/jwt/v5 v5.3.1
@@ -27,18 +26,20 @@ require (
github.com/google/uuid v1.6.0
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674
github.com/jackc/pgx/v5 v5.8.0
github.com/kenshaw/emoji v0.4.1
github.com/labstack/echo-jwt/v4 v4.4.0
github.com/labstack/echo/v4 v4.15.0
github.com/larksuite/oapi-sdk-go/v3 v3.5.3
github.com/mailgun/mailgun-go/v5 v5.14.0
github.com/memohai/acgo v0.0.0-20260221232113-babac0d6acd7
github.com/memohai/dingtalk-stream-sdk-go v0.0.0-20260405113102-87e23096b978
github.com/memohai/twilight-ai v0.3.4-0.20260412161211-dbedfe32c86f
github.com/memohai/twilight-ai v0.3.4-0.20260419121757-8ac67fb0bc04
github.com/modelcontextprotocol/go-sdk v1.5.0
github.com/opencontainers/image-spec v1.1.1
github.com/opencontainers/runtime-spec v1.3.0
github.com/qdrant/go-client v1.17.1
github.com/robfig/cron/v3 v3.0.1
github.com/slack-go/slack v0.19.0
github.com/spf13/cobra v1.10.2
github.com/stretchr/testify v1.11.1
github.com/swaggo/swag v1.16.6
@@ -66,6 +67,20 @@ require (
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
github.com/atotto/clipboard v0.1.4 // indirect
github.com/aws/aws-sdk-go-v2 v1.41.5 // indirect
github.com/aws/aws-sdk-go-v2/config v1.32.14 // indirect
github.com/aws/aws-sdk-go-v2/credentials v1.19.14 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.6 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.21 // indirect
github.com/aws/aws-sdk-go-v2/service/signin v1.0.9 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.30.15 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.19 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.41.10 // indirect
github.com/aws/smithy-go v1.24.2 // indirect
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
github.com/aymerick/douceur v0.2.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
@@ -95,7 +110,6 @@ require (
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.9.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.12 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-openapi/jsonpointer v0.22.4 // indirect
@@ -108,8 +122,6 @@ require (
github.com/go-openapi/swag/stringutils v0.25.4 // indirect
github.com/go-openapi/swag/typeutils v0.25.4 // indirect
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
@@ -121,10 +133,8 @@ require (
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/kenshaw/emoji v0.4.1 // indirect
github.com/klauspost/compress v1.18.4 // indirect
github.com/labstack/gommon v0.4.2 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/lib/pq v1.10.9 // indirect
github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
github.com/mattn/go-colorable v0.1.14 // indirect
@@ -158,7 +168,6 @@ require (
github.com/segmentio/asm v1.2.1 // indirect
github.com/segmentio/encoding v0.5.4 // indirect
github.com/sirupsen/logrus v1.9.4 // indirect
github.com/slack-go/slack v0.19.0 // indirect
github.com/spf13/pflag v1.0.9 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasttemplate v1.2.2 // indirect
+32 -14
View File
@@ -32,6 +32,34 @@ github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhP
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw=
github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4=
github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI=
github.com/aws/aws-sdk-go-v2 v1.41.5 h1:dj5kopbwUsVUVFgO4Fi5BIT3t4WyqIDjGKCangnV/yY=
github.com/aws/aws-sdk-go-v2 v1.41.5/go.mod h1:mwsPRE8ceUUpiTgF7QmQIJ7lgsKUPQOUl3o72QBrE1o=
github.com/aws/aws-sdk-go-v2/config v1.32.14 h1:opVIRo/ZbbI8OIqSOKmpFaY7IwfFUOCCXBsUpJOwDdI=
github.com/aws/aws-sdk-go-v2/config v1.32.14/go.mod h1:U4/V0uKxh0Tl5sxmCBZ3AecYny4UNlVmObYjKuuaiOo=
github.com/aws/aws-sdk-go-v2/credentials v1.19.14 h1:n+UcGWAIZHkXzYt87uMFBv/l8THYELoX6gVcUvgl6fI=
github.com/aws/aws-sdk-go-v2/credentials v1.19.14/go.mod h1:cJKuyWB59Mqi0jM3nFYQRmnHVQIcgoxjEMAbLkpr62w=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21 h1:NUS3K4BTDArQqNu2ih7yeDLaS3bmHD0YndtA6UP884g=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21/go.mod h1:YWNWJQNjKigKY1RHVJCuupeWDrrHjRqHm0N9rdrWzYI=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21 h1:Rgg6wvjjtX8bNHcvi9OnXWwcE0a2vGpbwmtICOsvcf4=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21/go.mod h1:A/kJFst/nm//cyqonihbdpQZwiUhhzpqTsdbhDdRF9c=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21 h1:PEgGVtPoB6NTpPrBgqSE5hE/o47Ij9qk/SEZFbUOe9A=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21/go.mod h1:p+hz+PRAYlY3zcpJhPwXlLC4C+kqn70WIHwnzAfs6ps=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.6 h1:qYQ4pzQ2Oz6WpQ8T3HvGHnZydA72MnLuFK9tJwmrbHw=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.6/go.mod h1:O3h0IK87yXci+kg6flUKzJnWeziQUKciKrLjcatSNcY=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7 h1:5EniKhLZe4xzL7a+fU3C2tfUN4nWIqlLesfrjkuPFTY=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7/go.mod h1:x0nZssQ3qZSnIcePWLvcoFisRXJzcTVvYpAAdYX8+GI=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.21 h1:c31//R3xgIJMSC8S6hEVq+38DcvUlgFY0FM6mSI5oto=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.21/go.mod h1:r6+pf23ouCB718FUxaqzZdbpYFyDtehyZcmP5KL9FkA=
github.com/aws/aws-sdk-go-v2/service/signin v1.0.9 h1:QKZH0S178gCmFEgst8hN0mCX1KxLgHBKKY/CLqwP8lg=
github.com/aws/aws-sdk-go-v2/service/signin v1.0.9/go.mod h1:7yuQJoT+OoH8aqIxw9vwF+8KpvLZ8AWmvmUWHsGQZvI=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.15 h1:lFd1+ZSEYJZYvv9d6kXzhkZu07si3f+GQ1AaYwa2LUM=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.15/go.mod h1:WSvS1NLr7JaPunCXqpJnWk1Bjo7IxzZXrZi1QQCkuqM=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.19 h1:dzztQ1YmfPrxdrOiuZRMF6fuOwWlWpD2StNLTceKpys=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.19/go.mod h1:YO8TrYtFdl5w/4vmjL8zaBSsiNp3w0L1FfKVKenZT7w=
github.com/aws/aws-sdk-go-v2/service/sts v1.41.10 h1:p8ogvvLugcR/zLBXTXrTkj0RYBUdErbMnAFFp12Lm/U=
github.com/aws/aws-sdk-go-v2/service/sts v1.41.10/go.mod h1:60dv0eZJfeVXfbT1tFJinbHrDfSJ2GZl4Q//OSSNAVw=
github.com/aws/smithy-go v1.24.2 h1:FzA3bu/nt/vDvmnkg+R8Xl46gmzEDam6mZ1hzmwXFng=
github.com/aws/smithy-go v1.24.2/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc=
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
github.com/aymanbagabas/go-udiff v0.3.1 h1:LV+qyBQ2pqe0u42ZsUEtPiCaUoqgA9gYRDs3vj1nolY=
@@ -140,8 +168,6 @@ github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/gabriel-vasile/mimetype v1.4.12 h1:e9hWvmLYvtp846tLHam2o++qitpguFiYCKbn0w9jyqw=
github.com/gabriel-vasile/mimetype v1.4.12/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s=
github.com/go-chi/chi/v5 v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug=
github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
@@ -176,14 +202,6 @@ github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxE
github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg=
github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls=
github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54=
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
github.com/go-playground/validator/v10 v10.30.1 h1:f3zDSN/zOma+w6+1Wswgd9fLkdwy06ntQJp0BBvFG0w=
github.com/go-playground/validator/v10 v10.30.1/go.mod h1:oSuBIQzuJxL//3MelwSLD5hc2Tu889bF0Idm9Dg26cM=
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w=
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
github.com/go-shiori/go-readability v0.0.0-20251205110129-5db1dc9836f0 h1:A3B75Yp163FAIf9nLlFMl4pwIj+T3uKxfI7mbvvY2Ls=
@@ -192,6 +210,8 @@ github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1v
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/go-telegram-bot-api/telegram-bot-api/v5 v5.5.1 h1:wG8n/XJQ07TmjbITcGiUaOtXxdrINDz1b0J1w0SzqDc=
github.com/go-telegram-bot-api/telegram-bot-api/v5 v5.5.1/go.mod h1:A2S0CWkNylc2phvKXWBBdD3K0iGnDBGbzRpISP2zBl8=
github.com/go-test/deep v1.1.1 h1:0r/53hagsehfO4bzD2Pgr/+RgHqhmf+k1Bpse2cTu1U=
github.com/go-test/deep v1.1.1/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
@@ -277,8 +297,6 @@ github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0
github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
github.com/larksuite/oapi-sdk-go/v3 v3.5.3 h1:xvf8Dv29kBXC5/DNDCLhHkAFW8l/0LlQJimO5Zn+JUk=
github.com/larksuite/oapi-sdk-go/v3 v3.5.3/go.mod h1:ZEplY+kwuIrj/nqw5uSCINNATcH3KdxSN7y+UxYY5fI=
github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag=
@@ -299,8 +317,8 @@ github.com/memohai/acgo v0.0.0-20260221232113-babac0d6acd7 h1:beehwOQperqGWj4m4E
github.com/memohai/acgo v0.0.0-20260221232113-babac0d6acd7/go.mod h1:OvmxM7JmnXBmwJWWVqtreL3HSHSKuzPbtbhlg5MvBg0=
github.com/memohai/dingtalk-stream-sdk-go v0.0.0-20260405113102-87e23096b978 h1:6gD8DvZkimGmU0e3PjlusJPyw55SyeoE12CZQoYUa8g=
github.com/memohai/dingtalk-stream-sdk-go v0.0.0-20260405113102-87e23096b978/go.mod h1:2LMgK5QYFlTSvrGY+sI/j+jK2WK+YGHv4IMuiW+iPSc=
github.com/memohai/twilight-ai v0.3.4-0.20260412161211-dbedfe32c86f h1:9NAj+FyDJPi8RzD1PUwb6OxZx/OrBD2FJo4tVAlhpbs=
github.com/memohai/twilight-ai v0.3.4-0.20260412161211-dbedfe32c86f/go.mod h1:1uNfZWc8du+HWJ3r3FLyeGAXGiUAniuSWV89A8gbcz0=
github.com/memohai/twilight-ai v0.3.4-0.20260419121757-8ac67fb0bc04 h1:8TnRoVU7u2aSvXDLMmlGt4l92TjjP7LYWqu73Rx0uGo=
github.com/memohai/twilight-ai v0.3.4-0.20260419121757-8ac67fb0bc04/go.mod h1:s5s03jeYgK56SaHH9oVHua73xCmiSG4uyfCZKi9fCHk=
github.com/microcosm-cc/bluemonday v1.0.27 h1:MpEUotklkwCSLeH+Qdx1VJgNqLlpY2KXwXFM08ygZfk=
github.com/microcosm-cc/bluemonday v1.0.27/go.mod h1:jFi9vgW+H7c3V0lb6nR74Ib/DIB5OBs92Dimizgw2cA=
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
+51 -4
View File
@@ -35,8 +35,19 @@ func (q *Queries) CountModelsByType(ctx context.Context, type_ string) (int64, e
}
const countProviders = `-- name: CountProviders :one
SELECT COUNT(*) FROM providers
WHERE client_type NOT IN ('edge-speech')
SELECT COUNT(*)
FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
)
`
func (q *Queries) CountProviders(ctx context.Context) (int64, error) {
@@ -190,6 +201,22 @@ func (q *Queries) DeleteModelByModelID(ctx context.Context, modelID string) erro
return err
}
const deleteModelByProviderIDAndModelID = `-- name: DeleteModelByProviderIDAndModelID :exec
DELETE FROM models
WHERE provider_id = $1
AND model_id = $2
`
type DeleteModelByProviderIDAndModelIDParams struct {
ProviderID pgtype.UUID `json:"provider_id"`
ModelID string `json:"model_id"`
}
func (q *Queries) DeleteModelByProviderIDAndModelID(ctx context.Context, arg DeleteModelByProviderIDAndModelIDParams) error {
_, err := q.db.Exec(ctx, deleteModelByProviderIDAndModelID, arg.ProviderID, arg.ModelID)
return err
}
const deleteProvider = `-- name: DeleteProvider :exec
DELETE FROM providers WHERE id = $1
`
@@ -717,7 +744,17 @@ func (q *Queries) ListModelsByType(ctx context.Context, type_ string) ([]Model,
const listProviders = `-- name: ListProviders :many
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers
WHERE client_type NOT IN ('edge-speech')
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
)
ORDER BY created_at DESC
`
@@ -840,7 +877,17 @@ func (q *Queries) ListSpeechModelsByProviderID(ctx context.Context, providerID p
const listSpeechProviders = `-- name: ListSpeechProviders :many
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers
WHERE client_type IN ('edge-speech')
WHERE client_type IN (
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
)
ORDER BY created_at DESC
`
+112 -5
View File
@@ -1,31 +1,39 @@
package handlers
import (
"errors"
"fmt"
"log/slog"
"net/http"
"strings"
"github.com/labstack/echo/v4"
"github.com/memohai/memoh/internal/models"
"github.com/memohai/memoh/internal/tts"
)
type SpeechHandler struct {
service *tts.Service
logger *slog.Logger
service *tts.Service
modelsService *models.Service
logger *slog.Logger
}
func NewSpeechHandler(log *slog.Logger, service *tts.Service) *SpeechHandler {
func NewSpeechHandler(log *slog.Logger, service *tts.Service, modelsService *models.Service) *SpeechHandler {
return &SpeechHandler{
service: service,
logger: log.With(slog.String("handler", "speech")),
service: service,
modelsService: modelsService,
logger: log.With(slog.String("handler", "speech")),
}
}
func (h *SpeechHandler) Register(e *echo.Echo) {
pg := e.Group("/speech-providers")
pg.GET("", h.ListProviders)
pg.GET("/:id", h.GetProvider)
pg.GET("/meta", h.ListMeta)
pg.GET("/:id/models", h.ListModelsByProvider)
pg.POST("/:id/import-models", h.ImportModels)
mg := e.Group("/speech-models")
mg.GET("", h.ListModels)
@@ -60,6 +68,105 @@ func (h *SpeechHandler) ListProviders(c echo.Context) error {
return c.JSON(http.StatusOK, items)
}
// GetProvider godoc
// @Summary Get speech provider
// @Description Get a speech provider with masked config values
// @Tags speech-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} tts.SpeechProviderResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Router /speech-providers/{id} [get].
func (h *SpeechHandler) GetProvider(c echo.Context) error {
	// The path parameter is trimmed first; a blank id is a client error.
	providerID := strings.TrimSpace(c.Param("id"))
	if len(providerID) == 0 {
		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
	}

	provider, lookupErr := h.service.GetSpeechProvider(c.Request().Context(), providerID)
	if lookupErr == nil {
		return c.JSON(http.StatusOK, provider)
	}
	// Every lookup failure is reported to the client as 404.
	return echo.NewHTTPError(http.StatusNotFound, lookupErr.Error())
}
// ListModelsByProvider godoc
// @Summary List speech models by provider
// @Description List models of type 'speech' for a specific speech provider
// @Tags speech-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {array} tts.SpeechModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers/{id}/models [get].
func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
	// The path parameter is trimmed first; a blank id is a client error.
	providerID := strings.TrimSpace(c.Param("id"))
	if len(providerID) == 0 {
		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
	}

	speechModels, listErr := h.service.ListSpeechModelsByProvider(c.Request().Context(), providerID)
	if listErr == nil {
		return c.JSON(http.StatusOK, speechModels)
	}
	// Any failure from the service is surfaced as a 500 with its message.
	return echo.NewHTTPError(http.StatusInternalServerError, listErr.Error())
}
// ImportModels godoc
// @Summary Import speech models from provider
// @Description Fetch models using the configured speech provider and import them into the unified models table
// @Tags speech-providers
// @Accept json
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} tts.ImportModelsResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers/{id}/import-models [post].
func (h *SpeechHandler) ImportModels(c echo.Context) error {
	// The path parameter is trimmed first; a blank id is a client error.
	providerID := strings.TrimSpace(c.Param("id"))
	if len(providerID) == 0 {
		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
	}

	fetched, fetchErr := h.service.FetchRemoteModels(c.Request().Context(), providerID)
	if fetchErr != nil {
		return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote speech models: %v", fetchErr))
	}

	result := tts.ImportModelsResponse{
		Models: make([]string, 0, len(fetched)),
	}
	for _, remote := range fetched {
		// Fall back to the model id when the remote name is blank.
		displayName := strings.TrimSpace(remote.Name)
		if displayName == "" {
			displayName = remote.ID
		}

		_, createErr := h.modelsService.Create(c.Request().Context(), models.AddRequest{
			ModelID:    remote.ID,
			Name:       displayName,
			ProviderID: providerID,
			Type:       models.ModelTypeSpeech,
			Config:     models.ModelConfig{},
		})

		switch {
		case createErr == nil:
			result.Created++
			result.Models = append(result.Models, remote.ID)
		case errors.Is(createErr, models.ErrModelIDAlreadyExists):
			// Already imported earlier: counted, not treated as a failure.
			result.Skipped++
		default:
			// Other failures are only logged; the import continues with the
			// next model and the response does not count them.
			h.logger.Warn("failed to import speech model", slog.String("model_id", remote.ID), slog.Any("error", createErr))
		}
	}
	return c.JSON(http.StatusOK, result)
}
// ListModels godoc
// @Summary List all speech models
// @Description List all models of type 'speech' (filtered view of unified models table)
+11 -13
View File
@@ -430,7 +430,15 @@ func IsValidClientType(clientType ClientType) bool {
ClientTypeGoogleGenerativeAI,
ClientTypeOpenAICodex,
ClientTypeGitHubCopilot,
ClientTypeEdgeSpeech:
ClientTypeEdgeSpeech,
ClientTypeOpenAISpeech,
ClientTypeOpenRouterSpeech,
ClientTypeElevenLabsSpeech,
ClientTypeDeepgramSpeech,
ClientTypeMiniMaxSpeech,
ClientTypeVolcengineSpeech,
ClientTypeAlibabaSpeech,
ClientTypeMicrosoftSpeech:
return true
default:
return false
@@ -438,19 +446,9 @@ func IsValidClientType(clientType ClientType) bool {
}
// IsLLMClientType returns true if the client type belongs to the LLM domain
// (chat/embedding), excluding speech-only types like edge-speech.
// (chat/embedding), excluding speech-only types (any type ending in "-speech").
func IsLLMClientType(clientType ClientType) bool {
switch clientType {
case ClientTypeOpenAIResponses,
ClientTypeOpenAICompletions,
ClientTypeAnthropicMessages,
ClientTypeGoogleGenerativeAI,
ClientTypeOpenAICodex,
ClientTypeGitHubCopilot:
return true
default:
return false
}
return IsValidClientType(clientType) && !strings.HasSuffix(string(clientType), "-speech")
}
// SelectMemoryModel selects a chat model for memory operations.
+8
View File
@@ -24,6 +24,14 @@ const (
ClientTypeOpenAICodex ClientType = "openai-codex"
ClientTypeGitHubCopilot ClientType = "github-copilot"
ClientTypeEdgeSpeech ClientType = "edge-speech"
ClientTypeOpenAISpeech ClientType = "openai-speech"
ClientTypeOpenRouterSpeech ClientType = "openrouter-speech"
ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech"
ClientTypeDeepgramSpeech ClientType = "deepgram-speech"
ClientTypeMiniMaxSpeech ClientType = "minimax-speech"
ClientTypeVolcengineSpeech ClientType = "volcengine-speech"
ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech"
ClientTypeMicrosoftSpeech ClientType = "microsoft-speech"
)
const (
+68
View File
@@ -0,0 +1,68 @@
package tts
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"github.com/jackc/pgx/v5/pgtype"
"github.com/memohai/memoh/internal/db/sqlc"
)
// SyncRegistry writes every provider definition returned by registry.List()
// into the database: the provider is upserted, then each of its models is
// either upserted as a "speech" model or, when shouldHideTemplateModel reports
// true for it, deleted from that provider.
//
// Providers and models are both written with an empty JSON object as config.
// The first database error aborts the sync and is returned wrapped; providers
// processed before the failure stay written. logger may be nil, in which case
// the per-provider summary line is skipped.
func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
	// The config payload is the same empty object for every provider and
	// model, so it is encoded once instead of on every loop iteration.
	emptyConfigJSON, err := json.Marshal(map[string]any{})
	if err != nil {
		return fmt.Errorf("marshal speech provider config: %w", err)
	}

	for _, def := range registry.List() {
		// An empty icon is stored as NULL rather than as an empty string.
		icon := pgtype.Text{String: def.Icon, Valid: def.Icon != ""}

		provider, err := queries.UpsertRegistryProvider(ctx, sqlc.UpsertRegistryProviderParams{
			Name:       def.DisplayName,
			ClientType: string(def.ClientType),
			Icon:       icon,
			Config:     emptyConfigJSON,
		})
		if err != nil {
			return fmt.Errorf("upsert speech provider %s: %w", def.ClientType, err)
		}

		synced := 0
		for _, model := range def.Models {
			if shouldHideTemplateModel(def, model.ID) {
				// Hidden template models are removed so a row written by an
				// earlier sync does not linger.
				if err := queries.DeleteModelByProviderIDAndModelID(ctx, sqlc.DeleteModelByProviderIDAndModelIDParams{
					ProviderID: provider.ID,
					ModelID:    model.ID,
				}); err != nil {
					return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
				}
				continue
			}

			name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
			if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
				ModelID:    model.ID,
				Name:       name,
				ProviderID: provider.ID,
				Type:       "speech",
				Config:     emptyConfigJSON,
			}); err != nil {
				return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
			}
			synced++
		}

		if logger != nil {
			logger.Info("speech registry synced", slog.String("provider", string(def.ClientType)), slog.Int("models", synced))
		}
	}
	return nil
}
+36 -24
View File
@@ -1,32 +1,41 @@
package tts
import "github.com/go-playground/validator/v10"
var validate = validator.New()
// VoiceConfig identifies a TTS voice and its language.
// Both fields are optional; adapters fill in their own defaults when empty.
// VoiceConfig is kept for backward compatibility with the legacy Edge adapter tests.
type VoiceConfig struct {
ID string `json:"id" validate:"omitempty"`
Lang string `json:"lang" validate:"omitempty"`
ID string `json:"id"`
Lang string `json:"lang"`
}
// AudioConfig is the user-facing configuration for a TTS request.
// AudioConfig is kept for backward compatibility with the legacy Edge adapter tests.
type AudioConfig struct {
Format string `json:"format" validate:"omitempty"`
SampleRate int `json:"sample_rate" validate:"omitempty,oneof=16000 24000 48000"`
Speed float64 `json:"speed" validate:"omitempty"`
Pitch float64 `json:"pitch" validate:"omitempty"`
Format string `json:"format"`
SampleRate int `json:"sample_rate"`
Speed float64 `json:"speed"`
Pitch float64 `json:"pitch"`
Voice VoiceConfig `json:"voice"`
}
func (c AudioConfig) Validate() error {
return validate.Struct(c)
func (AudioConfig) Validate() error { return nil }
// FieldSchema describes a single dynamic speech config field.
//
// In its JSON form Key, Type and Order are always present; every other field
// is tagged omitempty and is left out when it holds its zero value (empty
// string, false, nil or empty slice, nil interface).
type FieldSchema struct {
Key string `json:"key"`
// NOTE(review): the set of accepted Type names is not defined in this file --
// confirm against whatever renders the form.
Type string `json:"type"`
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
Required bool `json:"required,omitempty"`
// NOTE(review): presumably marks fields belonging to an "advanced" section
// of the form -- confirm with the consumer.
Advanced bool `json:"advanced,omitempty"`
Enum []string `json:"enum,omitempty"`
Example any `json:"example,omitempty"`
// Order has no omitempty, so 0 is serialized explicitly.
Order int `json:"order"`
}
// ConfigSchema is a list of FieldSchema entries.
// Fields has no omitempty tag: a nil slice is encoded as JSON null and an
// empty non-nil slice as [].
type ConfigSchema struct {
Fields []FieldSchema `json:"fields"`
}
// ParamConstraint describes valid values for a numeric parameter.
// If Options is non-empty, only those discrete values are allowed (frontend renders a select).
// Otherwise Min/Max define a continuous range (frontend renders a slider).
// If Options is non-empty, only those discrete values are allowed.
type ParamConstraint struct {
Options []float64 `json:"options,omitempty"`
Min float64 `json:"min,omitempty"`
@@ -34,20 +43,23 @@ type ParamConstraint struct {
Default float64 `json:"default"`
}
// ModelCapabilities describes what a specific TTS model supports.
// nil pointer means the parameter is not supported; frontend should hide it.
// ModelCapabilities exposes optional UX hints for speech config forms.
type ModelCapabilities struct {
Voices []VoiceInfo `json:"voices"`
Formats []string `json:"formats"`
Speed *ParamConstraint `json:"speed,omitempty"`
Pitch *ParamConstraint `json:"pitch,omitempty"`
ConfigSchema ConfigSchema `json:"config_schema,omitempty"`
Voices []VoiceInfo `json:"voices,omitempty"`
Formats []string `json:"formats,omitempty"`
Speed *ParamConstraint `json:"speed,omitempty"`
Pitch *ParamConstraint `json:"pitch,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
}
// ModelInfo describes a single model exposed by a TTS adapter.
// ModelInfo describes a single speech model exposed by a provider definition.
type ModelInfo struct {
// ID is the model identifier (e.g. "gpt-4o-mini-tts").
ID string `json:"id"`
// Name is the display name of the model.
Name string `json:"name"`
// Description is optional free text; omitted from JSON when empty.
Description string `json:"description,omitempty"`
// TemplateOnly is omitted from JSON when false.
// NOTE(review): presumably marks placeholder models that are hidden from
// model listings (see shouldHideTemplateModel) — confirm.
TemplateOnly bool `json:"template_only,omitempty"`
// ConfigSchema describes the per-model config fields.
// NOTE(review): ConfigSchema is a struct, so encoding/json's omitempty has no
// effect on it; the key is always emitted.
ConfigSchema ConfigSchema `json:"config_schema,omitempty"`
// Capabilities carries optional UX hints for the model; always serialized.
Capabilities ModelCapabilities `json:"capabilities"`
}
+558 -19
View File
@@ -2,47 +2,586 @@ package tts
import (
"fmt"
"sort"
"strings"
"sync"
alibabaspeech "github.com/memohai/twilight-ai/provider/alibabacloud/speech"
deepgramspeech "github.com/memohai/twilight-ai/provider/deepgram/speech"
edgespeech "github.com/memohai/twilight-ai/provider/edge/speech"
elevenlabsspeech "github.com/memohai/twilight-ai/provider/elevenlabs/speech"
microsoftspeech "github.com/memohai/twilight-ai/provider/microsoft/speech"
minimaxspeech "github.com/memohai/twilight-ai/provider/minimax/speech"
openaispeech "github.com/memohai/twilight-ai/provider/openai/speech"
openrouterspeech "github.com/memohai/twilight-ai/provider/openrouter/speech"
volcenginespeech "github.com/memohai/twilight-ai/provider/volcengine/speech"
sdk "github.com/memohai/twilight-ai/sdk"
"github.com/memohai/memoh/internal/models"
)
// ProviderFactory builds an SDK speech provider from a provider-level config
// map (keys such as "api_key" and "base_url" in the built-in factories).
type ProviderFactory func(config map[string]any) (sdk.SpeechProvider, error)
// ProviderDefinition is the registry entry for one speech provider: its
// identity, presentation metadata, config schema, known models and the
// factory that instantiates it.
type ProviderDefinition struct {
// ClientType is the key the definition is stored under in the Registry.
ClientType models.ClientType
// DisplayName is the human-readable provider name; Register uses it as the
// tie-breaker when two definitions share the same Order.
DisplayName string
// Icon is an icon identifier such as "microsoft" or "openai".
// NOTE(review): presumably a key of the frontend icon map — confirm.
Icon string
// Description is a short summary of the provider.
Description string
// ConfigSchema describes the provider-level settings (credentials, base URL).
ConfigSchema ConfigSchema
// DefaultModel is the ID of the model used by default for this provider.
DefaultModel string
// SupportsList is set per provider in the built-in definitions.
// NOTE(review): presumably whether remote model listing/import is available — confirm.
SupportsList bool
// Models lists the models known for this provider.
Models []ModelInfo
// Factory creates the SDK provider from a config map.
Factory ProviderFactory
// Order is the primary sort key used by Register (ascending).
Order int
}
type Registry struct {
mu sync.RWMutex
adapters map[TtsType]TtsAdapter
mu sync.RWMutex
providers map[models.ClientType]ProviderDefinition
ordered []models.ClientType
}
func NewRegistry() *Registry {
return &Registry{adapters: make(map[TtsType]TtsAdapter)}
r := &Registry{
providers: make(map[models.ClientType]ProviderDefinition),
}
for _, def := range defaultProviderDefinitions() {
r.Register(def)
}
return r
}
func (r *Registry) Register(a TtsAdapter) {
func (r *Registry) Register(def ProviderDefinition) {
r.mu.Lock()
defer r.mu.Unlock()
r.adapters[a.Type()] = a
if _, exists := r.providers[def.ClientType]; !exists {
r.ordered = append(r.ordered, def.ClientType)
}
r.providers[def.ClientType] = def
sort.SliceStable(r.ordered, func(i, j int) bool {
left := r.providers[r.ordered[i]]
right := r.providers[r.ordered[j]]
if left.Order != right.Order {
return left.Order < right.Order
}
return left.DisplayName < right.DisplayName
})
}
func (r *Registry) Get(name TtsType) (TtsAdapter, error) {
func (r *Registry) Get(clientType models.ClientType) (ProviderDefinition, error) {
r.mu.RLock()
defer r.mu.RUnlock()
a, ok := r.adapters[name]
def, ok := r.providers[clientType]
if !ok {
return nil, fmt.Errorf("tts adapter not found: %s", name)
return ProviderDefinition{}, fmt.Errorf("speech provider not found: %s", clientType)
}
return a, nil
return def, nil
}
// List returns a copy of every registered provider definition, in the order
// maintained by the registry's ordered key slice. The read lock is held for
// the duration of the copy.
func (r *Registry) List() []ProviderDefinition {
	r.mu.RLock()
	defer r.mu.RUnlock()

	defs := make([]ProviderDefinition, len(r.ordered))
	for i, clientType := range r.ordered {
		defs[i] = r.providers[clientType]
	}
	return defs
}
func (r *Registry) ListMeta() []ProviderMetaResponse {
r.mu.RLock()
defer r.mu.RUnlock()
metas := make([]ProviderMetaResponse, 0, len(r.adapters))
for _, a := range r.adapters {
meta := a.Meta()
defs := r.List()
metas := make([]ProviderMetaResponse, 0, len(defs))
for _, def := range defs {
metas = append(metas, ProviderMetaResponse{
Provider: string(a.Type()),
DisplayName: meta.Provider,
Description: meta.Description,
DefaultModel: a.DefaultModel(),
Models: a.Models(),
Provider: string(def.ClientType),
DisplayName: def.DisplayName,
Description: def.Description,
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultModel,
Models: def.Models,
})
}
return metas
}
// defaultProviderDefinitions returns the built-in speech provider definitions
// that NewRegistry registers on construction.
//
// Each model exposes the same field list twice: once as ModelInfo.ConfigSchema
// and once as ModelCapabilities.ConfigSchema. The per-provider closures below
// build that list; they are invoked once per use so the two schemas never
// share backing slices.
func defaultProviderDefinitions() []ProviderDefinition {
	// Flatten the Edge voice table (language -> voice IDs) into VoiceInfo
	// entries. The display name is the ID without its "<lang>-" prefix and
	// "Neural" suffix.
	edgeVoices := []VoiceInfo{}
	for lang, ids := range edgespeech.EdgeTTSVoices {
		prefix := lang + "-"
		for _, id := range ids {
			short := strings.TrimSuffix(strings.TrimPrefix(id, prefix), "Neural")
			edgeVoices = append(edgeVoices, VoiceInfo{ID: id, Lang: lang, Name: short})
		}
	}
	// Map iteration order is unspecified, so sort by language, then by ID.
	sort.Slice(edgeVoices, func(i, j int) bool {
		a, b := edgeVoices[i], edgeVoices[j]
		if a.Lang == b.Lang {
			return a.ID < b.ID
		}
		return a.Lang < b.Lang
	})

	edgeSchema := func() ConfigSchema {
		return ConfigSchema{Fields: []FieldSchema{
			enumField("voice", "Voice", "Edge voice ID", false, voiceIDs(edgeVoices), 10),
			stringField("language", "Language", "Optional BCP-47 language tag", false, "en-US", 20),
			enumField("format", "Format", "Output audio format", false, []string{"audio-24khz-48kbitrate-mono-mp3", "audio-24khz-96kbitrate-mono-mp3", "webm-24khz-16bit-mono-opus"}, 30),
			numberField("speed", "Speed", "Speech rate, 1.0 = normal", false, 1.0, 40),
			numberField("pitch", "Pitch", "Pitch adjustment in Hz", false, 0, 50),
		}}
	}
	openAISchema := func() ConfigSchema {
		return ConfigSchema{Fields: []FieldSchema{
			stringField("voice", "Voice", "Voice ID", false, "coral", 10),
			enumField("response_format", "Response Format", "Audio format", false, []string{"mp3", "opus", "pcm", "wav"}, 20),
			numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
			stringField("instructions", "Instructions", "Style instructions for supported models", false, "", 40),
		}}
	}
	openRouterSchema := func() ConfigSchema {
		return ConfigSchema{Fields: []FieldSchema{
			advancedStringField("model", "Model", "Underlying OpenRouter model ID", false, "openai/gpt-audio-mini", 10),
			stringField("voice", "Voice", "Voice name", false, "coral", 20),
			numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
		}}
	}
	elevenLabsSchema := func() ConfigSchema {
		return ConfigSchema{Fields: []FieldSchema{
			stringField("voice_id", "Voice ID", "ElevenLabs voice ID", true, "", 10),
			advancedStringField("model_id", "Model ID", "ElevenLabs model ID", false, "eleven_multilingual_v2", 20),
			numberField("stability", "Stability", "Voice stability 0-1", false, 0.5, 30),
			numberField("similarity_boost", "Similarity Boost", "Voice similarity boost 0-1", false, 0.75, 40),
			numberField("style", "Style", "Speaking style intensity 0-1", false, 0, 50),
			boolField("use_speaker_boost", "Speaker Boost", "Enable speaker boost", false, 60),
			numberField("speed", "Speed", "Speech rate 0.5-2.0", false, 1.0, 70),
			stringField("output_format", "Output Format", "Output format", false, "mp3_44100_128", 80),
			numberField("seed", "Seed", "Deterministic seed", false, 0, 90),
			enumField("apply_text_normalization", "Text Normalization", "Text normalization mode", false, []string{"auto", "on", "off"}, 100),
			stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "en-US", 110),
		}}
	}
	deepgramSchema := func() ConfigSchema {
		return ConfigSchema{Fields: []FieldSchema{
			advancedStringField("model", "Model", "Deepgram voice model", false, "aura-2-asteria-en", 10),
			enumField("encoding", "Encoding", "Audio encoding", false, []string{"linear16", "mulaw", "alaw"}, 20),
			numberField("sample_rate", "Sample Rate", "Audio sample rate in Hz", false, 24000, 30),
			enumField("container", "Container", "Audio container", false, []string{"wav", "none"}, 40),
		}}
	}
	miniMaxSchema := func() ConfigSchema {
		return ConfigSchema{Fields: []FieldSchema{
			stringField("voice_id", "Voice ID", "MiniMax voice ID", false, "English_expressive_narrator", 10),
			advancedStringField("model", "Model", "MiniMax model", false, "speech-2.8-hd", 20),
			numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
			numberField("vol", "Volume", "Volume", false, 1.0, 40),
			numberField("pitch", "Pitch", "Pitch adjustment", false, 0, 50),
			enumField("output_format", "Output Format", "Audio format", false, []string{"mp3", "pcm", "flac", "wav"}, 60),
			numberField("sample_rate", "Sample Rate", "Audio sample rate", false, 32000, 70),
		}}
	}
	volcengineSchema := func() ConfigSchema {
		return ConfigSchema{Fields: []FieldSchema{
			stringField("speaker", "Speaker", "Speaker ID", true, "", 10),
			enumField("encoding", "Encoding", "Output encoding", false, []string{"mp3", "wav", "aac"}, 20),
			numberField("sample_rate", "Sample Rate", "Audio sample rate", false, 24000, 30),
			numberField("speech_rate", "Speech Rate", "Speech rate [-50,100]", false, 0, 40),
			numberField("pitch_rate", "Pitch Rate", "Pitch rate [-12,12]", false, 0, 50),
		}}
	}
	alibabaSchema := func() ConfigSchema {
		return ConfigSchema{Fields: []FieldSchema{
			advancedStringField("model", "Model", "DashScope model ID", false, "cosyvoice-v1", 10),
			stringField("voice", "Voice", "Voice or custom clone ID", true, "", 20),
			enumField("format", "Format", "Audio format", false, []string{"mp3", "wav", "pcm", "opus"}, 30),
			numberField("sample_rate", "Sample Rate", "Audio sample rate", false, 22050, 40),
			numberField("volume", "Volume", "Volume 0-100", false, 50, 50),
			numberField("rate", "Rate", "Speech rate 0.5-2.0", false, 1.0, 60),
			numberField("pitch", "Pitch", "Pitch multiplier 0.5-2.0", false, 1.0, 70),
		}}
	}
	microsoftSchema := func() ConfigSchema {
		return ConfigSchema{Fields: []FieldSchema{
			stringField("region", "Region", "Azure region, e.g. eastus", false, "eastus", 10),
			stringField("voice", "Voice", "Azure voice name", false, "en-US-JennyNeural", 20),
			stringField("language", "Language", "Optional BCP-47 language tag", false, "en-US", 30),
			stringField("output_format", "Output Format", "Azure output format", false, "audio-16khz-128kbitrate-mono-mp3", 40),
			stringField("style", "Style", "Optional speaking style", false, "", 50),
			stringField("rate", "Rate", "Optional speaking rate", false, "", 60),
			stringField("pitch", "Pitch", "Optional pitch adjustment", false, "", 70),
		}}
	}

	return []ProviderDefinition{
		{
			ClientType:  models.ClientTypeEdgeSpeech,
			DisplayName: "Microsoft Edge",
			Icon:        "microsoft",
			Description: "Free Edge Read Aloud TTS",
			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
				stringField("base_url", "Base URL", "Override the Edge WebSocket endpoint", false, "", 10),
			}},
			DefaultModel: "edge-read-aloud",
			SupportsList: false,
			Models: []ModelInfo{{
				ID:           "edge-read-aloud",
				Name:         "Edge Read Aloud",
				Description:  "Built-in Edge Read Aloud speech model",
				ConfigSchema: edgeSchema(),
				Capabilities: ModelCapabilities{
					ConfigSchema: edgeSchema(),
					Voices:       edgeVoices,
					Formats:      []string{"audio-24khz-48kbitrate-mono-mp3", "audio-24khz-96kbitrate-mono-mp3", "webm-24khz-16bit-mono-opus"},
					Speed:        &ParamConstraint{Options: []float64{0.5, 1.0, 2.0, 3.0}, Default: 1.0},
					Pitch:        &ParamConstraint{Min: -100, Max: 100, Default: 0},
				},
			}},
			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
				opts := []edgespeech.Option{}
				if endpoint := configString(config, "base_url"); endpoint != "" {
					opts = append(opts, edgespeech.WithBaseURL(endpoint))
				}
				return edgespeech.New(opts...), nil
			},
			Order: 10,
		},
		{
			ClientType:  models.ClientTypeOpenAISpeech,
			DisplayName: "OpenAI Speech",
			Icon:        "openai",
			Description: "OpenAI /audio/speech compatible TTS",
			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
				secretField("api_key", "API Key", "Bearer API key", true, 10),
				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.openai.com/v1", 20),
			}},
			DefaultModel: "gpt-4o-mini-tts",
			SupportsList: true,
			Models: []ModelInfo{{
				ID:           "gpt-4o-mini-tts",
				Name:         "gpt-4o-mini-tts",
				Description:  "Default OpenAI speech model",
				ConfigSchema: openAISchema(),
				Capabilities: ModelCapabilities{
					ConfigSchema: openAISchema(),
					Formats:      []string{"mp3", "opus", "pcm", "wav"},
				},
			}},
			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
				opts := []openaispeech.Option{}
				if key := configString(config, "api_key"); key != "" {
					opts = append(opts, openaispeech.WithAPIKey(key))
				}
				if endpoint := configString(config, "base_url"); endpoint != "" {
					opts = append(opts, openaispeech.WithBaseURL(endpoint))
				}
				return openaispeech.New(opts...), nil
			},
			Order: 20,
		},
		{
			ClientType:  models.ClientTypeOpenRouterSpeech,
			DisplayName: "OpenRouter Speech",
			Icon:        "openrouter",
			Description: "OpenRouter audio modality TTS",
			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
				secretField("api_key", "API Key", "OpenRouter API key", true, 10),
				stringField("base_url", "Base URL", "Override the API base URL", false, "https://openrouter.ai/api/v1", 20),
			}},
			DefaultModel: "openrouter-tts",
			SupportsList: true,
			Models: []ModelInfo{{
				ID:           "openrouter-tts",
				Name:         "openrouter-tts",
				Description:  "Default OpenRouter speech wrapper model",
				TemplateOnly: true,
				ConfigSchema: openRouterSchema(),
				Capabilities: ModelCapabilities{ConfigSchema: openRouterSchema()},
			}},
			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
				opts := []openrouterspeech.Option{}
				if key := configString(config, "api_key"); key != "" {
					opts = append(opts, openrouterspeech.WithAPIKey(key))
				}
				if endpoint := configString(config, "base_url"); endpoint != "" {
					opts = append(opts, openrouterspeech.WithBaseURL(endpoint))
				}
				return openrouterspeech.New(opts...), nil
			},
			Order: 30,
		},
		{
			ClientType:  models.ClientTypeElevenLabsSpeech,
			DisplayName: "ElevenLabs Speech",
			Icon:        "elevenlabs",
			Description: "ElevenLabs text-to-speech",
			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
				secretField("api_key", "API Key", "ElevenLabs API key", true, 10),
				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.elevenlabs.io", 20),
			}},
			DefaultModel: "elevenlabs-tts",
			SupportsList: true,
			Models: []ModelInfo{{
				ID:           "elevenlabs-tts",
				Name:         "elevenlabs-tts",
				Description:  "Default ElevenLabs speech wrapper model",
				TemplateOnly: true,
				ConfigSchema: elevenLabsSchema(),
				Capabilities: ModelCapabilities{ConfigSchema: elevenLabsSchema()},
			}},
			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
				opts := []elevenlabsspeech.Option{}
				if key := configString(config, "api_key"); key != "" {
					opts = append(opts, elevenlabsspeech.WithAPIKey(key))
				}
				if endpoint := configString(config, "base_url"); endpoint != "" {
					opts = append(opts, elevenlabsspeech.WithBaseURL(endpoint))
				}
				return elevenlabsspeech.New(opts...), nil
			},
			Order: 40,
		},
		{
			ClientType:  models.ClientTypeDeepgramSpeech,
			DisplayName: "Deepgram Speech",
			Icon:        "deepgram",
			Description: "Deepgram TTS",
			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
				secretField("api_key", "API Key", "Deepgram API key", true, 10),
				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.deepgram.com", 20),
			}},
			DefaultModel: "deepgram-tts",
			SupportsList: false,
			Models: []ModelInfo{{
				ID:           "deepgram-tts",
				Name:         "deepgram-tts",
				Description:  "Default Deepgram speech wrapper model",
				ConfigSchema: deepgramSchema(),
				Capabilities: ModelCapabilities{
					ConfigSchema: deepgramSchema(),
					Formats:      []string{"wav", "none"},
				},
			}},
			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
				opts := []deepgramspeech.Option{}
				if key := configString(config, "api_key"); key != "" {
					opts = append(opts, deepgramspeech.WithAPIKey(key))
				}
				if endpoint := configString(config, "base_url"); endpoint != "" {
					opts = append(opts, deepgramspeech.WithBaseURL(endpoint))
				}
				return deepgramspeech.New(opts...), nil
			},
			Order: 50,
		},
		{
			ClientType:  models.ClientTypeMiniMaxSpeech,
			DisplayName: "MiniMax Speech",
			Icon:        "minimax-color",
			Description: "MiniMax TTS",
			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
				secretField("api_key", "API Key", "MiniMax API key", true, 10),
				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.minimax.io", 20),
			}},
			DefaultModel: "minimax-tts",
			SupportsList: false,
			Models: []ModelInfo{{
				ID:           "minimax-tts",
				Name:         "minimax-tts",
				Description:  "Default MiniMax speech wrapper model",
				ConfigSchema: miniMaxSchema(),
				Capabilities: ModelCapabilities{
					ConfigSchema: miniMaxSchema(),
					Formats:      []string{"mp3", "pcm", "flac", "wav"},
				},
			}},
			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
				opts := []minimaxspeech.Option{}
				if key := configString(config, "api_key"); key != "" {
					opts = append(opts, minimaxspeech.WithAPIKey(key))
				}
				if endpoint := configString(config, "base_url"); endpoint != "" {
					opts = append(opts, minimaxspeech.WithBaseURL(endpoint))
				}
				return minimaxspeech.New(opts...), nil
			},
			Order: 60,
		},
		{
			ClientType:  models.ClientTypeVolcengineSpeech,
			DisplayName: "Volcengine Speech",
			Icon:        "volcengine-color",
			Description: "Volcengine SAMI TTS",
			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
				secretField("access_key", "Access Key", "Volcengine access key", true, 10),
				secretField("secret_key", "Secret Key", "Volcengine secret key", true, 20),
				secretField("app_key", "App Key", "SAMI app key", true, 30),
				stringField("base_url", "Base URL", "Override the API base URL", false, "https://sami.bytedance.com", 40),
			}},
			DefaultModel: "sami-tts",
			SupportsList: false,
			Models: []ModelInfo{{
				ID:           "sami-tts",
				Name:         "sami-tts",
				Description:  "Default Volcengine SAMI speech wrapper model",
				ConfigSchema: volcengineSchema(),
				Capabilities: ModelCapabilities{
					ConfigSchema: volcengineSchema(),
					Formats:      []string{"mp3", "wav", "aac"},
				},
			}},
			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
				opts := []volcenginespeech.Option{}
				if accessKey := configString(config, "access_key"); accessKey != "" {
					opts = append(opts, volcenginespeech.WithAccessKey(accessKey))
				}
				if secretKey := configString(config, "secret_key"); secretKey != "" {
					opts = append(opts, volcenginespeech.WithSecretKey(secretKey))
				}
				if appKey := configString(config, "app_key"); appKey != "" {
					opts = append(opts, volcenginespeech.WithAppKey(appKey))
				}
				if endpoint := configString(config, "base_url"); endpoint != "" {
					opts = append(opts, volcenginespeech.WithBaseURL(endpoint))
				}
				return volcenginespeech.New(opts...), nil
			},
			Order: 70,
		},
		{
			ClientType:  models.ClientTypeAlibabaSpeech,
			DisplayName: "Alibaba Cloud Speech",
			Icon:        "bailian-color",
			Description: "DashScope CosyVoice TTS",
			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
				secretField("api_key", "API Key", "DashScope API key", true, 10),
				stringField("base_url", "Base URL", "Override the WebSocket endpoint", false, "wss://dashscope.aliyuncs.com/api-ws/v1/inference/", 20),
			}},
			DefaultModel: "cosyvoice-tts",
			SupportsList: false,
			Models: []ModelInfo{{
				ID:           "cosyvoice-tts",
				Name:         "cosyvoice-tts",
				Description:  "Default DashScope CosyVoice wrapper model",
				ConfigSchema: alibabaSchema(),
				Capabilities: ModelCapabilities{
					ConfigSchema: alibabaSchema(),
					Formats:      []string{"mp3", "wav", "pcm", "opus"},
				},
			}},
			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
				opts := []alibabaspeech.Option{}
				if key := configString(config, "api_key"); key != "" {
					opts = append(opts, alibabaspeech.WithAPIKey(key))
				}
				if endpoint := configString(config, "base_url"); endpoint != "" {
					opts = append(opts, alibabaspeech.WithBaseURL(endpoint))
				}
				return alibabaspeech.New(opts...), nil
			},
			Order: 80,
		},
		{
			ClientType:  models.ClientTypeMicrosoftSpeech,
			DisplayName: "Microsoft Speech",
			Icon:        "azure-color",
			Description: "Azure Cognitive Services TTS",
			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
				secretField("api_key", "API Key", "Azure speech subscription key", true, 10),
				stringField("base_url", "Base URL", "Optional full TTS endpoint override", false, "", 20),
			}},
			DefaultModel: "microsoft-tts",
			SupportsList: false,
			Models: []ModelInfo{{
				ID:           "microsoft-tts",
				Name:         "microsoft-tts",
				Description:  "Default Azure speech wrapper model",
				ConfigSchema: microsoftSchema(),
				Capabilities: ModelCapabilities{ConfigSchema: microsoftSchema()},
			}},
			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
				opts := []microsoftspeech.Option{}
				if key := configString(config, "api_key"); key != "" {
					opts = append(opts, microsoftspeech.WithAPIKey(key))
				}
				if endpoint := configString(config, "base_url"); endpoint != "" {
					opts = append(opts, microsoftspeech.WithBaseURL(endpoint))
				}
				return microsoftspeech.New(opts...), nil
			},
			Order: 90,
		},
	}
}
// stringField builds a FieldSchema of type "string" with the given key,
// labels, required flag, example value and order.
func stringField(key, title, description string, required bool, example any, order int) FieldSchema {
	return FieldSchema{
		Key:         key,
		Type:        "string",
		Title:       title,
		Description: description,
		Required:    required,
		Example:     example,
		Order:       order,
	}
}
// advancedStringField builds the same FieldSchema as stringField but with the
// Advanced flag set.
func advancedStringField(key, title, description string, required bool, example any, order int) FieldSchema {
	field := stringField(key, title, description, required, example, order)
	field.Advanced = true
	return field
}
// secretField builds a FieldSchema of type "secret". It carries no example
// value.
func secretField(key, title, description string, required bool, order int) FieldSchema {
	var field FieldSchema
	field.Key = key
	field.Type = "secret"
	field.Title = title
	field.Description = description
	field.Required = required
	field.Order = order
	return field
}
// numberField builds a FieldSchema of type "number". The example is stored
// as passed, so an int stays an int and a float stays a float.
func numberField(key, title, description string, required bool, example any, order int) FieldSchema {
	return FieldSchema{
		Key:         key,
		Type:        "number",
		Title:       title,
		Description: description,
		Required:    required,
		Example:     example,
		Order:       order,
	}
}
// boolField builds a FieldSchema of type "bool". It carries no example value.
func boolField(key, title, description string, required bool, order int) FieldSchema {
	var field FieldSchema
	field.Key = key
	field.Type = "bool"
	field.Title = title
	field.Description = description
	field.Required = required
	field.Order = order
	return field
}
// enumField builds a FieldSchema of type "enum" whose allowed values are
// the given slice. The slice is stored as-is, not copied.
func enumField(key, title, description string, required bool, values []string, order int) FieldSchema {
	return FieldSchema{
		Key:         key,
		Type:        "enum",
		Title:       title,
		Description: description,
		Required:    required,
		Enum:        values,
		Order:       order,
	}
}
// configString returns the whitespace-trimmed string stored under key in cfg.
// It returns "" when cfg is nil, the key is absent, or the value is not a
// string.
func configString(cfg map[string]any, key string) string {
	// Indexing a nil map yields the zero value, so a nil cfg simply fails the
	// type assertion below.
	raw, isString := cfg[key].(string)
	if !isString {
		return ""
	}
	return strings.TrimSpace(raw)
}
// voiceIDs returns the ID of every voice, in input order. The result is
// always a non-nil slice.
func voiceIDs(voices []VoiceInfo) []string {
	ids := make([]string, len(voices))
	for i := range voices {
		ids[i] = voices[i].ID
	}
	return ids
}
+224 -156
View File
@@ -3,14 +3,15 @@ package tts
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"strings"
sdk "github.com/memohai/twilight-ai/sdk"
"github.com/memohai/memoh/internal/db"
"github.com/memohai/memoh/internal/db/sqlc"
"github.com/memohai/memoh/internal/models"
)
type Service struct {
@@ -33,11 +34,6 @@ func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
return s.registry.ListMeta()
}
// ---------------------------------------------------------------------------
// Read helpers (speech-filtered views of unified tables)
// ---------------------------------------------------------------------------
// ListSpeechProviders returns providers with speech client types.
func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
rows, err := s.queries.ListSpeechProviders(ctx)
if err != nil {
@@ -50,7 +46,18 @@ func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResp
return items, nil
}
// ListSpeechModels returns all speech-type models.
// GetSpeechProvider parses id as a UUID, loads the matching provider row and
// converts it to a SpeechProviderResponse. A UUID parse error is returned
// unchanged; a lookup error is wrapped with "get speech provider".
// NOTE(review): the lookup uses the generic GetProviderByID query, so this
// block does not itself check that the row has a speech client type — confirm
// whether callers rely on that.
func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) {
	var empty SpeechProviderResponse

	providerID, parseErr := db.ParseUUID(id)
	if parseErr != nil {
		return empty, parseErr
	}

	provider, queryErr := s.queries.GetProviderByID(ctx, providerID)
	if queryErr != nil {
		return empty, fmt.Errorf("get speech provider: %w", queryErr)
	}

	return toSpeechProviderResponse(provider), nil
}
func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse, error) {
rows, err := s.queries.ListSpeechModels(ctx)
if err != nil {
@@ -58,29 +65,41 @@ func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse,
}
items := make([]SpeechModelResponse, 0, len(rows))
for _, row := range rows {
if s.shouldHideModel(row.ProviderType, row.ModelID) {
continue
}
items = append(items, toSpeechModelFromListRow(row))
}
return items, nil
}
// ListSpeechModelsByProvider returns speech models for a given provider.
// ListSpeechModelsByProvider returns the speech models stored for the provider
// identified by providerID.
//
// The provider row is loaded first so its client type can be resolved to a
// registry definition. Models for which shouldHideTemplateModel reports true
// are left out of the result. The result is always a non-nil slice on success.
func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
	providerUUID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}

	provider, err := s.queries.GetProviderByID(ctx, providerUUID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}

	definition, err := s.registry.Get(models.ClientType(provider.ClientType))
	if err != nil {
		return nil, err
	}

	modelRows, err := s.queries.ListSpeechModelsByProviderID(ctx, providerUUID)
	if err != nil {
		return nil, fmt.Errorf("list speech models by provider: %w", err)
	}

	visible := make([]SpeechModelResponse, 0, len(modelRows))
	for _, modelRow := range modelRows {
		if !shouldHideTemplateModel(definition, modelRow.ModelID) {
			visible = append(visible, toSpeechModelFromModel(modelRow, ""))
		}
	}
	return visible, nil
}
// GetSpeechModel returns a speech model by ID.
func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
pgID, err := db.ParseUUID(id)
if err != nil {
@@ -93,98 +112,45 @@ func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelRes
return toSpeechModelWithProviderResponse(row), nil
}
// ---------------------------------------------------------------------------
// Synthesis
// ---------------------------------------------------------------------------
// Synthesize runs text-to-speech using the saved model config, optionally
// overridden by fields in overrideCfg. Returns raw audio bytes.
func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
pgID, err := db.ParseUUID(modelID)
params, err := s.resolveSpeechParams(ctx, modelID, text, overrideCfg)
if err != nil {
return nil, "", err
}
modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
result, err := sdk.GenerateSpeech(ctx,
sdk.WithSpeechModel(params.model),
sdk.WithText(text),
sdk.WithSpeechConfig(params.config),
)
if err != nil {
return nil, "", fmt.Errorf("get speech model: %w", err)
return nil, "", fmt.Errorf("synthesize: %w", err)
}
adapterType := clientTypeToTtsType(modelRow.ProviderType)
adapter, err := s.registry.Get(adapterType)
if err != nil {
return nil, "", fmt.Errorf("unsupported provider: %s", modelRow.ProviderType)
}
savedCfg := parseModelConfig(modelRow.Config)
for k, v := range overrideCfg {
savedCfg[k] = v
}
audioCfg := buildAudioConfig(savedCfg)
if err := audioCfg.Validate(); err != nil {
return nil, "", fmt.Errorf("invalid audio config: %w", err)
}
resolvedModel, _ := adapter.ResolveModel(modelRow.ModelID)
audio, synthErr := adapter.Synthesize(ctx, text, resolvedModel, audioCfg)
if synthErr != nil {
return nil, "", fmt.Errorf("synthesize: %w", synthErr)
}
contentType := resolveContentType(audioCfg.Format)
return audio, contentType, nil
return result.Audio, result.ContentType, nil
}
// StreamToFile runs text-to-speech using Stream() and writes audio chunks
// directly to the given writer, keeping peak memory low for large audio.
func (s *Service) StreamToFile(ctx context.Context, modelID string, text string, w io.Writer) (string, error) {
pgID, err := db.ParseUUID(modelID)
params, err := s.resolveSpeechParams(ctx, modelID, text, nil)
if err != nil {
return "", err
}
modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
streamResult, err := sdk.StreamSpeech(ctx,
sdk.WithSpeechModel(params.model),
sdk.WithText(text),
sdk.WithSpeechConfig(params.config),
)
if err != nil {
return "", fmt.Errorf("get speech model: %w", err)
return "", fmt.Errorf("stream: %w", err)
}
adapterType := clientTypeToTtsType(modelRow.ProviderType)
adapter, err := s.registry.Get(adapterType)
audio, err := streamResult.Bytes()
if err != nil {
return "", fmt.Errorf("unsupported provider: %s", modelRow.ProviderType)
return "", fmt.Errorf("stream: %w", err)
}
savedCfg := parseModelConfig(modelRow.Config)
audioCfg := buildAudioConfig(savedCfg)
if err := audioCfg.Validate(); err != nil {
return "", fmt.Errorf("invalid audio config: %w", err)
if _, writeErr := w.Write(audio); writeErr != nil {
return "", fmt.Errorf("write chunk: %w", writeErr)
}
resolvedModel, _ := adapter.ResolveModel(modelRow.ModelID)
dataCh, errCh := adapter.Stream(ctx, text, resolvedModel, audioCfg)
if dataCh == nil {
select {
case streamErr := <-errCh:
return "", fmt.Errorf("stream: %w", streamErr)
default:
return "", errors.New("stream returned nil channels")
}
}
for chunk := range dataCh {
if _, writeErr := w.Write(chunk); writeErr != nil {
return "", fmt.Errorf("write chunk: %w", writeErr)
}
}
if streamErr, ok := <-errCh; ok && streamErr != nil {
return "", fmt.Errorf("stream: %w", streamErr)
}
return resolveContentType(audioCfg.Format), nil
return streamResult.ContentType, nil
}
// ---------------------------------------------------------------------------
// Capabilities
// ---------------------------------------------------------------------------
// GetModelCapabilities returns the adapter-level capabilities for a stored model.
func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
pgID, err := db.ParseUUID(modelID)
if err != nil {
@@ -194,115 +160,217 @@ func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*Mo
if err != nil {
return nil, fmt.Errorf("get speech model: %w", err)
}
adapterType := clientTypeToTtsType(modelRow.ProviderType)
adapter, err := s.registry.Get(adapterType)
def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
if err != nil {
return nil, fmt.Errorf("unsupported provider: %s", modelRow.ProviderType)
return nil, err
}
for _, m := range adapter.Models() {
if m.ID == modelRow.ModelID {
return &m.Capabilities, nil
template := findModelTemplate(def, modelRow.ModelID)
if template == nil {
return nil, fmt.Errorf("speech model capabilities not found: %s", modelRow.ModelID)
}
caps := template.Capabilities
if len(caps.ConfigSchema.Fields) == 0 {
caps.ConfigSchema = template.ConfigSchema
}
return &caps, nil
}
func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
pgID, err := db.ParseUUID(providerID)
if err != nil {
return nil, err
}
providerRow, err := s.queries.GetProviderByID(ctx, pgID)
if err != nil {
return nil, fmt.Errorf("get speech provider: %w", err)
}
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
if err != nil {
return nil, err
}
if !def.SupportsList || def.Factory == nil {
return nil, fmt.Errorf("speech provider does not support model discovery: %s", providerRow.ClientType)
}
provider, err := def.Factory(parseConfig(providerRow.Config))
if err != nil {
return nil, fmt.Errorf("build speech provider: %w", err)
}
remoteModels, err := provider.ListModels(ctx)
if err != nil {
return nil, fmt.Errorf("list speech models: %w", err)
}
discovered := make([]ModelInfo, 0, len(remoteModels))
for _, remoteModel := range remoteModels {
if remoteModel == nil || remoteModel.ID == "" {
continue
}
discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.Models))
}
return nil, fmt.Errorf("model %s not found in adapter", modelRow.ModelID)
return discovered, nil
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
// clientTypeToTtsType maps the unified client_type to the TTS adapter type.
func clientTypeToTtsType(clientType string) TtsType {
switch clientType {
case "edge-speech":
return "edge"
default:
return TtsType(clientType)
}
// resolvedSpeechParams bundles the values resolveSpeechParams prepares for a
// single speech SDK call.
type resolvedSpeechParams struct {
// model pairs the stored model ID with the provider instance built for it.
model *sdk.SpeechModel
// config is the merged provider, model, and per-call override configuration.
config map[string]any
}
func parseModelConfig(raw []byte) map[string]any {
// resolveSpeechParams loads the stored speech model and its provider, builds a
// provider instance through the factory registered for the provider's client
// type, and merges the configuration layers used for a speech SDK call.
//
// Configuration precedence, lowest to highest: provider config, model config,
// overrideCfg (which may be nil). The text argument is currently unused.
//
// An error is returned when modelID is not a valid UUID, when the model or
// provider row cannot be loaded, when the client type is not registered or has
// no factory, or when the factory fails to build the provider.
func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text string, overrideCfg map[string]any) (*resolvedSpeechParams, error) {
	_ = text

	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}

	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech model: %w", err)
	}

	providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}

	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	if def.Factory == nil {
		// FetchRemoteModels treats a nil factory as a possible state; calling a
		// nil func value here would panic instead of reporting an error.
		return nil, fmt.Errorf("speech provider has no factory: %s", providerRow.ClientType)
	}

	provider, err := def.Factory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build speech provider: %w", err)
	}

	// The provider config is parsed a second time so the merge never observes
	// changes the factory may have made to the map it received.
	cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
	return &resolvedSpeechParams{
		model:  &sdk.SpeechModel{ID: modelRow.ModelID, Provider: provider},
		config: cfg,
	}, nil
}
func parseConfig(raw []byte) map[string]any {
if len(raw) == 0 {
return make(map[string]any)
return map[string]any{}
}
var cfg map[string]any
if err := json.Unmarshal(raw, &cfg); err != nil {
return make(map[string]any)
}
if cfg == nil {
return make(map[string]any)
if err := json.Unmarshal(raw, &cfg); err != nil || cfg == nil {
return map[string]any{}
}
return cfg
}
func buildAudioConfig(cfg map[string]any) AudioConfig {
ac := AudioConfig{}
if voice, ok := cfg["voice"].(map[string]any); ok {
if id, ok := voice["id"].(string); ok {
ac.Voice.ID = id
}
if lang, ok := voice["lang"].(string); ok {
ac.Voice.Lang = lang
func mergeConfig(parts ...map[string]any) map[string]any {
out := make(map[string]any)
for _, part := range parts {
for key, value := range part {
out[key] = value
}
}
if format, ok := cfg["format"].(string); ok {
ac.Format = format
}
if speed, ok := toFloat(cfg["speed"]); ok {
ac.Speed = speed
}
if pitch, ok := toFloat(cfg["pitch"]); ok {
ac.Pitch = pitch
}
if sr, ok := toFloat(cfg["sample_rate"]); ok {
ac.SampleRate = int(sr)
}
return ac
return out
}
func toFloat(v any) (float64, bool) {
switch n := v.(type) {
case float64:
return n, true
case float32:
return float64(n), true
case int:
return float64(n), true
case int64:
return float64(n), true
default:
return 0, false
// mergeRemoteModelInfo returns the entry of defaults whose ID equals modelID.
// When no entry matches, it returns a minimal ModelInfo that uses modelID as
// both its ID and its Name.
func mergeRemoteModelInfo(modelID string, defaults []ModelInfo) ModelInfo {
	for idx := 0; idx < len(defaults); idx++ {
		if defaults[idx].ID != modelID {
			continue
		}
		return defaults[idx]
	}
	fallback := ModelInfo{ID: modelID, Name: modelID}
	return fallback
}
func resolveContentType(format string) string {
switch {
case strings.Contains(format, "mp3"):
return "audio/mpeg"
case strings.Contains(format, "opus"):
return "audio/opus"
case strings.Contains(format, "ogg"):
return "audio/ogg"
case strings.Contains(format, "webm"):
return "audio/webm"
case strings.Contains(format, "wav"):
return "audio/wav"
default:
return "audio/mpeg"
// shouldHideModel looks up the provider definition registered for clientType
// and reports whether modelID should be hidden according to
// shouldHideTemplateModel. When the registry lookup fails the model is not
// hidden.
func (s *Service) shouldHideModel(clientType string, modelID string) bool {
	if def, lookupErr := s.registry.Get(models.ClientType(clientType)); lookupErr == nil {
		return shouldHideTemplateModel(def, modelID)
	}
	return false
}
// shouldHideTemplateModel reports the TemplateOnly flag of the first entry in
// def.Models whose ID equals modelID. It returns false when the definition
// does not support model listing or when no entry matches.
func shouldHideTemplateModel(def ProviderDefinition, modelID string) bool {
	if def.SupportsList {
		for i := range def.Models {
			if entry := &def.Models[i]; entry.ID == modelID {
				return entry.TemplateOnly
			}
		}
	}
	return false
}
// findModelTemplate picks the model entry of def to use as a template for
// modelID. Lookup order: the entry whose ID equals modelID, then the entry
// named by def.DefaultModel (when set), then the first entry of def.Models.
// It returns nil only when def.Models is empty. The returned pointer refers to
// an element of def.Models.
func findModelTemplate(def ProviderDefinition, modelID string) *ModelInfo {
	// lookup returns a pointer to the first entry with the given ID, or nil.
	lookup := func(id string) *ModelInfo {
		for i := range def.Models {
			if def.Models[i].ID == id {
				return &def.Models[i]
			}
		}
		return nil
	}

	if exact := lookup(modelID); exact != nil {
		return exact
	}
	if def.DefaultModel != "" {
		if fallback := lookup(def.DefaultModel); fallback != nil {
			return fallback
		}
	}
	if len(def.Models) == 0 {
		return nil
	}
	return &def.Models[0]
}
// toSpeechProviderResponse converts a provider row into its API response.
// The icon is left empty when the row's Icon column is not valid, and the
// provider config is parsed and passed through maskSpeechProviderConfig before
// being exposed.
func toSpeechProviderResponse(row sqlc.Provider) SpeechProviderResponse {
	resp := SpeechProviderResponse{
		ID:         row.ID.String(),
		Name:       row.Name,
		ClientType: row.ClientType,
		Enable:     row.Enable,
		Config:     maskSpeechProviderConfig(parseConfig(row.Config)),
		CreatedAt:  row.CreatedAt.Time,
		UpdatedAt:  row.UpdatedAt.Time,
	}
	if row.Icon.Valid {
		resp.Icon = row.Icon.String
	}
	return resp
}
// maskSpeechProviderConfig returns a copy of cfg in which every non-empty
// string value stored under a secret key (see isSpeechSecretKey) is replaced
// by its masked form. Only top-level entries are inspected; all other values
// are copied as-is. The result is always a non-nil map.
func maskSpeechProviderConfig(cfg map[string]any) map[string]any {
	masked := make(map[string]any, len(cfg))
	for name, raw := range cfg {
		masked[name] = raw
		if !isSpeechSecretKey(name) {
			continue
		}
		if secret, isString := raw.(string); isString && secret != "" {
			masked[name] = maskSpeechSecret(secret)
		}
	}
	return masked
}
// isSpeechSecretKey reports whether key is one of the config keys whose value
// is treated as a secret: "api_key", "access_key", "secret_key" or "app_key".
// The comparison is exact and case-sensitive.
func isSpeechSecretKey(key string) bool {
	return key == "api_key" ||
		key == "access_key" ||
		key == "secret_key" ||
		key == "app_key"
}
// maskSpeechSecret returns a display-safe form of a secret value.
//
// Secrets of eight characters or fewer are replaced entirely by "********".
// Longer secrets keep their first and last four characters with "****" in
// between. Lengths and slice positions are counted in runes rather than
// bytes, so a multi-byte character is never split and a short multi-byte
// secret is not partially revealed. Output for ASCII input is unchanged.
func maskSpeechSecret(value string) string {
	runes := []rune(value)
	if len(runes) <= 8 {
		return "********"
	}
	return string(runes[:4]) + "****" + string(runes[len(runes)-4:])
}
func toSpeechModelFromListRow(row sqlc.ListSpeechModelsRow) SpeechModelResponse {
var cfg map[string]any
if len(row.Config) > 0 {
+21 -11
View File
@@ -4,21 +4,24 @@ import "time"
// ProviderMetaResponse exposes adapter metadata (from the registry, not DB).
type ProviderMetaResponse struct {
Provider string `json:"provider"`
DisplayName string `json:"display_name"`
Description string `json:"description"`
DefaultModel string `json:"default_model"`
Models []ModelInfo `json:"models"`
Provider string `json:"provider"`
DisplayName string `json:"display_name"`
Description string `json:"description"`
ConfigSchema ConfigSchema `json:"config_schema,omitempty"`
DefaultModel string `json:"default_model"`
Models []ModelInfo `json:"models"`
}
// SpeechProviderResponse represents a speech-capable provider from the unified providers table.
type SpeechProviderResponse struct {
ID string `json:"id"`
Name string `json:"name"`
ClientType string `json:"client_type"`
Enable bool `json:"enable"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
ID string `json:"id"`
Name string `json:"name"`
ClientType string `json:"client_type"`
Icon string `json:"icon,omitempty"`
Enable bool `json:"enable"`
Config map[string]any `json:"config,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// SpeechModelResponse represents a speech model from the unified models table.
@@ -50,3 +53,10 @@ type TestSynthesizeRequest struct {
Text string `json:"text"`
Config map[string]any `json:"config,omitempty"`
}
// ImportModelsResponse represents the response for importing speech models.
// NOTE(review): the field meanings below are inferred from the field names and
// JSON tags; confirm them against the import handler.
type ImportModelsResponse struct {
// Created is serialized as "created"; presumably the number of models newly added.
Created int `json:"created"`
// Skipped is serialized as "skipped"; presumably the number of models not imported (e.g. already present).
Skipped int `json:"skipped"`
// Models is serialized as "models"; presumably the identifiers of the models involved in the import.
Models []string `json:"models"`
}
+3
View File
@@ -0,0 +1,3 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none">
<path fill="currentColor" d="M11.203 24H1.517a.364.364 0 0 1-.258-.62l6.239-6.275a.37.37 0 0 1 .259-.108h3.52c2.723 0 5.025-2.127 5.107-4.845a5.004 5.004 0 0 0-4.999-5.148H7.613v4.646c0 .2-.164.364-.365.364H.968a.365.365 0 0 1-.363-.364V.364C.605.164.768 0 .969 0h10.416c6.684 0 12.111 5.485 12.01 12.187C23.293 18.77 17.794 24 11.202 24z"/>
</svg>

After

Width:  |  Height:  |  Size: 424 B

+4
View File
@@ -0,0 +1,4 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="172 129 156 242" fill="none">
<path d="M314 355H271V145H314V355Z" fill="currentColor"/>
<path d="M229 355H186V145H229V355Z" fill="currentColor"/>
</svg>

After

Width:  |  Height:  |  Size: 206 B

+2
View File
@@ -18,6 +18,8 @@ const llmProviders: string[] = [
...withVariants('anthropic', []),
...withVariants('google', ['color', 'brand-color']),
...withVariants('deepseek', ['color']),
...withVariants('deepgram', []),
...withVariants('elevenlabs', []),
...withVariants('groq', []),
...withVariants('huggingface', ['color']),
...withVariants('lmstudio', []),
+17
View File
@@ -0,0 +1,17 @@
<template>
<svg
xmlns="http://www.w3.org/2000/svg"
:width="size"
:height="size"
viewBox="0 0 24 24"
v-bind="$attrs"
><path
fill="currentColor"
d="M11.203 24H1.517a.364.364 0 0 1-.258-.62l6.239-6.275a.37.37 0 0 1 .259-.108h3.52c2.723 0 5.025-2.127 5.107-4.845a5.004 5.004 0 0 0-4.999-5.148H7.613v4.646c0 .2-.164.364-.365.364H.968a.365.365 0 0 1-.363-.364V.364C.605.164.768 0 .969 0h10.416c6.684 0 12.111 5.485 12.01 12.187C23.293 18.77 17.794 24 11.202 24z"
/></svg>
</template>
<script setup lang="ts">
withDefaults(defineProps<{ size?: string | number }>(), { size: '1em' })
defineOptions({ inheritAttrs: false })
</script>
+21
View File
@@ -0,0 +1,21 @@
<template>
<svg
xmlns="http://www.w3.org/2000/svg"
:width="size"
:height="size"
viewBox="172 129 156 242"
v-bind="$attrs"
><path
d="M314 355H271V145H314V355Z"
fill="currentColor"
/>
<path
d="M229 355H186V145H229V355Z"
fill="currentColor"
/></svg>
</template>
<script setup lang="ts">
withDefaults(defineProps<{ size?: string | number }>(), { size: '1em' })
defineOptions({ inheritAttrs: false })
</script>
+2
View File
@@ -15,6 +15,7 @@ export { default as Claude } from './icons/Claude.vue'
export { default as ClaudeColor } from './icons/ClaudeColor.vue'
export { default as Cohere } from './icons/Cohere.vue'
export { default as CohereColor } from './icons/CohereColor.vue'
export { default as Deepgram } from './icons/Deepgram.vue'
export { default as Deepseek } from './icons/Deepseek.vue'
export { default as DeepseekColor } from './icons/DeepseekColor.vue'
export { default as Dingtalk } from './icons/Dingtalk.vue'
@@ -22,6 +23,7 @@ export { default as Discord } from './icons/Discord.vue'
export { default as Doubao } from './icons/Doubao.vue'
export { default as DoubaoColor } from './icons/DoubaoColor.vue'
export { default as Duckduckgo } from './icons/Duckduckgo.vue'
export { default as Elevenlabs } from './icons/Elevenlabs.vue'
export { default as Exa } from './icons/Exa.vue'
export { default as ExaColor } from './icons/ExaColor.vue'
export { default as Feishu } from './icons/Feishu.vue'
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+138
View File
@@ -1642,8 +1642,34 @@ export type SettingsUpsertRequest = {
tts_model_id?: string;
};
export type TtsConfigSchema = {
fields?: Array<TtsFieldSchema>;
};
export type TtsFieldSchema = {
advanced?: boolean;
description?: string;
enum?: Array<string>;
example?: unknown;
key?: string;
order?: number;
required?: boolean;
title?: string;
type?: string;
};
export type TtsImportModelsResponse = {
created?: number;
models?: Array<string>;
skipped?: number;
};
export type TtsModelCapabilities = {
config_schema?: TtsConfigSchema;
formats?: Array<string>;
metadata?: {
[key: string]: string;
};
pitch?: TtsParamConstraint;
speed?: TtsParamConstraint;
voices?: Array<TtsVoiceInfo>;
@@ -1651,6 +1677,7 @@ export type TtsModelCapabilities = {
export type TtsModelInfo = {
capabilities?: TtsModelCapabilities;
config_schema?: TtsConfigSchema;
description?: string;
id?: string;
name?: string;
@@ -1664,6 +1691,7 @@ export type TtsParamConstraint = {
};
export type TtsProviderMetaResponse = {
config_schema?: TtsConfigSchema;
default_model?: string;
description?: string;
display_name?: string;
@@ -1686,8 +1714,12 @@ export type TtsSpeechModelResponse = {
export type TtsSpeechProviderResponse = {
client_type?: string;
config?: {
[key: string]: unknown;
};
created_at?: string;
enable?: boolean;
icon?: string;
id?: string;
name?: string;
updated_at?: string;
@@ -8331,6 +8363,112 @@ export type GetSpeechProvidersMetaResponses = {
export type GetSpeechProvidersMetaResponse = GetSpeechProvidersMetaResponses[keyof GetSpeechProvidersMetaResponses];
export type GetSpeechProvidersByIdData = {
body?: never;
path: {
/**
* Provider ID (UUID)
*/
id: string;
};
query?: never;
url: '/speech-providers/{id}';
};
export type GetSpeechProvidersByIdErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Not Found
*/
404: HandlersErrorResponse;
};
export type GetSpeechProvidersByIdError = GetSpeechProvidersByIdErrors[keyof GetSpeechProvidersByIdErrors];
export type GetSpeechProvidersByIdResponses = {
/**
* OK
*/
200: TtsSpeechProviderResponse;
};
export type GetSpeechProvidersByIdResponse = GetSpeechProvidersByIdResponses[keyof GetSpeechProvidersByIdResponses];
export type PostSpeechProvidersByIdImportModelsData = {
body?: never;
path: {
/**
* Provider ID (UUID)
*/
id: string;
};
query?: never;
url: '/speech-providers/{id}/import-models';
};
export type PostSpeechProvidersByIdImportModelsErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Not Found
*/
404: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type PostSpeechProvidersByIdImportModelsError = PostSpeechProvidersByIdImportModelsErrors[keyof PostSpeechProvidersByIdImportModelsErrors];
export type PostSpeechProvidersByIdImportModelsResponses = {
/**
* OK
*/
200: TtsImportModelsResponse;
};
export type PostSpeechProvidersByIdImportModelsResponse = PostSpeechProvidersByIdImportModelsResponses[keyof PostSpeechProvidersByIdImportModelsResponses];
export type GetSpeechProvidersByIdModelsData = {
body?: never;
path: {
/**
* Provider ID (UUID)
*/
id: string;
};
query?: never;
url: '/speech-providers/{id}/models';
};
export type GetSpeechProvidersByIdModelsErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type GetSpeechProvidersByIdModelsError = GetSpeechProvidersByIdModelsErrors[keyof GetSpeechProvidersByIdModelsErrors];
export type GetSpeechProvidersByIdModelsResponses = {
/**
* OK
*/
200: Array<TtsSpeechModelResponse>;
};
export type GetSpeechProvidersByIdModelsResponse = GetSpeechProvidersByIdModelsResponses[keyof GetSpeechProvidersByIdModelsResponses];
export type GetSupermarketMcpsData = {
body?: never;
path?: never;
+218
View File
@@ -8097,6 +8097,141 @@ const docTemplate = `{
}
}
},
"/speech-providers/{id}": {
"get": {
"description": "Get a speech provider with masked config values",
"produces": [
"application/json"
],
"tags": [
"speech-providers"
],
"summary": "Get speech provider",
"parameters": [
{
"type": "string",
"description": "Provider ID (UUID)",
"name": "id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/tts.SpeechProviderResponse"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
},
"404": {
"description": "Not Found",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
}
}
}
},
"/speech-providers/{id}/import-models": {
"post": {
"description": "Fetch models using the configured speech provider and import them into the unified models table",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"tags": [
"speech-providers"
],
"summary": "Import speech models from provider",
"parameters": [
{
"type": "string",
"description": "Provider ID (UUID)",
"name": "id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/tts.ImportModelsResponse"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
},
"404": {
"description": "Not Found",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
}
}
}
},
"/speech-providers/{id}/models": {
"get": {
"description": "List models of type 'speech' for a specific speech provider",
"produces": [
"application/json"
],
"tags": [
"speech-providers"
],
"summary": "List speech models by provider",
"parameters": [
{
"type": "string",
"description": "Provider ID (UUID)",
"name": "id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"type": "array",
"items": {
"$ref": "#/definitions/tts.SpeechModelResponse"
}
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
}
}
}
},
"/supermarket/mcps": {
"get": {
"tags": [
@@ -12930,15 +13065,85 @@ const docTemplate = `{
}
}
},
"tts.ConfigSchema": {
"type": "object",
"properties": {
"fields": {
"type": "array",
"items": {
"$ref": "#/definitions/tts.FieldSchema"
}
}
}
},
"tts.FieldSchema": {
"type": "object",
"properties": {
"advanced": {
"type": "boolean"
},
"description": {
"type": "string"
},
"enum": {
"type": "array",
"items": {
"type": "string"
}
},
"example": {},
"key": {
"type": "string"
},
"order": {
"type": "integer"
},
"required": {
"type": "boolean"
},
"title": {
"type": "string"
},
"type": {
"type": "string"
}
}
},
"tts.ImportModelsResponse": {
"type": "object",
"properties": {
"created": {
"type": "integer"
},
"models": {
"type": "array",
"items": {
"type": "string"
}
},
"skipped": {
"type": "integer"
}
}
},
"tts.ModelCapabilities": {
"type": "object",
"properties": {
"config_schema": {
"$ref": "#/definitions/tts.ConfigSchema"
},
"formats": {
"type": "array",
"items": {
"type": "string"
}
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"pitch": {
"$ref": "#/definitions/tts.ParamConstraint"
},
@@ -12959,6 +13164,9 @@ const docTemplate = `{
"capabilities": {
"$ref": "#/definitions/tts.ModelCapabilities"
},
"config_schema": {
"$ref": "#/definitions/tts.ConfigSchema"
},
"description": {
"type": "string"
},
@@ -12993,6 +13201,9 @@ const docTemplate = `{
"tts.ProviderMetaResponse": {
"type": "object",
"properties": {
"config_schema": {
"$ref": "#/definitions/tts.ConfigSchema"
},
"default_model": {
"type": "string"
},
@@ -13049,12 +13260,19 @@ const docTemplate = `{
"client_type": {
"type": "string"
},
"config": {
"type": "object",
"additionalProperties": {}
},
"created_at": {
"type": "string"
},
"enable": {
"type": "boolean"
},
"icon": {
"type": "string"
},
"id": {
"type": "string"
},
+218
View File
@@ -8088,6 +8088,141 @@
}
}
},
"/speech-providers/{id}": {
"get": {
"description": "Get a speech provider with masked config values",
"produces": [
"application/json"
],
"tags": [
"speech-providers"
],
"summary": "Get speech provider",
"parameters": [
{
"type": "string",
"description": "Provider ID (UUID)",
"name": "id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/tts.SpeechProviderResponse"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
},
"404": {
"description": "Not Found",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
}
}
}
},
"/speech-providers/{id}/import-models": {
"post": {
"description": "Fetch models using the configured speech provider and import them into the unified models table",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"tags": [
"speech-providers"
],
"summary": "Import speech models from provider",
"parameters": [
{
"type": "string",
"description": "Provider ID (UUID)",
"name": "id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/tts.ImportModelsResponse"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
},
"404": {
"description": "Not Found",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
}
}
}
},
"/speech-providers/{id}/models": {
"get": {
"description": "List models of type 'speech' for a specific speech provider",
"produces": [
"application/json"
],
"tags": [
"speech-providers"
],
"summary": "List speech models by provider",
"parameters": [
{
"type": "string",
"description": "Provider ID (UUID)",
"name": "id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"type": "array",
"items": {
"$ref": "#/definitions/tts.SpeechModelResponse"
}
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/handlers.ErrorResponse"
}
}
}
}
},
"/supermarket/mcps": {
"get": {
"tags": [
@@ -12921,15 +13056,85 @@
}
}
},
"tts.ConfigSchema": {
"type": "object",
"properties": {
"fields": {
"type": "array",
"items": {
"$ref": "#/definitions/tts.FieldSchema"
}
}
}
},
"tts.FieldSchema": {
"type": "object",
"properties": {
"advanced": {
"type": "boolean"
},
"description": {
"type": "string"
},
"enum": {
"type": "array",
"items": {
"type": "string"
}
},
"example": {},
"key": {
"type": "string"
},
"order": {
"type": "integer"
},
"required": {
"type": "boolean"
},
"title": {
"type": "string"
},
"type": {
"type": "string"
}
}
},
"tts.ImportModelsResponse": {
"type": "object",
"properties": {
"created": {
"type": "integer"
},
"models": {
"type": "array",
"items": {
"type": "string"
}
},
"skipped": {
"type": "integer"
}
}
},
"tts.ModelCapabilities": {
"type": "object",
"properties": {
"config_schema": {
"$ref": "#/definitions/tts.ConfigSchema"
},
"formats": {
"type": "array",
"items": {
"type": "string"
}
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"pitch": {
"$ref": "#/definitions/tts.ParamConstraint"
},
@@ -12950,6 +13155,9 @@
"capabilities": {
"$ref": "#/definitions/tts.ModelCapabilities"
},
"config_schema": {
"$ref": "#/definitions/tts.ConfigSchema"
},
"description": {
"type": "string"
},
@@ -12984,6 +13192,9 @@
"tts.ProviderMetaResponse": {
"type": "object",
"properties": {
"config_schema": {
"$ref": "#/definitions/tts.ConfigSchema"
},
"default_model": {
"type": "string"
},
@@ -13040,12 +13251,19 @@
"client_type": {
"type": "string"
},
"config": {
"type": "object",
"additionalProperties": {}
},
"created_at": {
"type": "string"
},
"enable": {
"type": "boolean"
},
"icon": {
"type": "string"
},
"id": {
"type": "string"
},
+145
View File
@@ -2761,12 +2761,58 @@ definitions:
tts_model_id:
type: string
type: object
tts.ConfigSchema:
properties:
fields:
items:
$ref: '#/definitions/tts.FieldSchema'
type: array
type: object
tts.FieldSchema:
properties:
advanced:
type: boolean
description:
type: string
enum:
items:
type: string
type: array
example: {}
key:
type: string
order:
type: integer
required:
type: boolean
title:
type: string
type:
type: string
type: object
tts.ImportModelsResponse:
properties:
created:
type: integer
models:
items:
type: string
type: array
skipped:
type: integer
type: object
tts.ModelCapabilities:
properties:
config_schema:
$ref: '#/definitions/tts.ConfigSchema'
formats:
items:
type: string
type: array
metadata:
additionalProperties:
type: string
type: object
pitch:
$ref: '#/definitions/tts.ParamConstraint'
speed:
@@ -2780,6 +2826,8 @@ definitions:
properties:
capabilities:
$ref: '#/definitions/tts.ModelCapabilities'
config_schema:
$ref: '#/definitions/tts.ConfigSchema'
description:
type: string
id:
@@ -2802,6 +2850,8 @@ definitions:
type: object
tts.ProviderMetaResponse:
properties:
config_schema:
$ref: '#/definitions/tts.ConfigSchema'
default_model:
type: string
description:
@@ -2839,10 +2889,15 @@ definitions:
properties:
client_type:
type: string
config:
additionalProperties: {}
type: object
created_at:
type: string
enable:
type: boolean
icon:
type: string
id:
type: string
name:
@@ -8229,6 +8284,96 @@ paths:
summary: List speech providers
tags:
- speech-providers
/speech-providers/{id}:
get:
description: Get a speech provider with masked config values
parameters:
- description: Provider ID (UUID)
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/tts.SpeechProviderResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Get speech provider
tags:
- speech-providers
/speech-providers/{id}/import-models:
post:
consumes:
- application/json
description: Fetch models using the configured speech provider and import them
into the unified models table
parameters:
- description: Provider ID (UUID)
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/tts.ImportModelsResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Import speech models from provider
tags:
- speech-providers
/speech-providers/{id}/models:
get:
description: List models of type 'speech' for a specific speech provider
parameters:
- description: Provider ID (UUID)
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
items:
$ref: '#/definitions/tts.SpeechModelResponse'
type: array
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: List speech models by provider
tags:
- speech-providers
/speech-providers/meta:
get:
description: List available speech provider types with their models and capabilities