mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-25 07:00:48 +09:00
@@ -18,7 +18,6 @@
|
||||
<div class="flex flex-col gap-3 mt-4">
|
||||
<!-- Type -->
|
||||
<FormField
|
||||
v-if="!hideType"
|
||||
v-slot="{ componentField }"
|
||||
name="type"
|
||||
>
|
||||
@@ -36,12 +35,11 @@
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectGroup>
|
||||
<SelectItem
|
||||
v-for="opt in typeOptions"
|
||||
:key="opt.value"
|
||||
:value="opt.value"
|
||||
>
|
||||
{{ opt.label }}
|
||||
<SelectItem value="chat">
|
||||
Chat
|
||||
</SelectItem>
|
||||
<SelectItem value="embedding">
|
||||
Embedding
|
||||
</SelectItem>
|
||||
</SelectGroup>
|
||||
</SelectContent>
|
||||
@@ -183,11 +181,6 @@ import { COMPATIBILITY_OPTIONS } from '@/constants/compatibilities'
|
||||
import FormDialogShell from '@/components/form-dialog-shell/index.vue'
|
||||
import { useDialogMutation } from '@/composables/useDialogMutation'
|
||||
|
||||
interface ModelTypeOption {
|
||||
value: string
|
||||
label: string
|
||||
}
|
||||
|
||||
const selectedCompat = ref<string[]>([])
|
||||
const { t } = useI18n()
|
||||
const { run } = useDialogMutation()
|
||||
@@ -200,30 +193,14 @@ const formSchema = toTypedSchema(z.object({
|
||||
context_window: z.coerce.number().min(1).optional(),
|
||||
}))
|
||||
|
||||
const props = withDefaults(defineProps<{
|
||||
id: string
|
||||
typeOptions?: ModelTypeOption[]
|
||||
defaultType?: string
|
||||
hideType?: boolean
|
||||
invalidateKeys?: string[]
|
||||
}>(), {
|
||||
typeOptions: () => [
|
||||
{ value: 'chat', label: 'Chat' },
|
||||
{ value: 'embedding', label: 'Embedding' },
|
||||
],
|
||||
defaultType: 'chat',
|
||||
hideType: false,
|
||||
invalidateKeys: () => ['provider-models'],
|
||||
})
|
||||
|
||||
const form = useForm({
|
||||
validationSchema: formSchema,
|
||||
initialValues: {
|
||||
type: props.defaultType,
|
||||
type: 'chat',
|
||||
},
|
||||
})
|
||||
|
||||
const selectedType = computed(() => form.values.type || props.defaultType)
|
||||
const selectedType = computed(() => form.values.type || 'chat')
|
||||
|
||||
const open = inject<Ref<boolean>>('openModel', ref(false))
|
||||
const title = inject<Ref<'edit' | 'title'>>('openModelTitle', ref('title'))
|
||||
@@ -260,19 +237,15 @@ function onNameInput(e: Event) {
|
||||
form.setFieldValue('name', (e.target as HTMLInputElement).value)
|
||||
}
|
||||
|
||||
const queryCache = useQueryCache()
|
||||
function invalidateModelQueries() {
|
||||
for (const key of props.invalidateKeys) {
|
||||
queryCache.invalidateQueries({ key: [key] })
|
||||
}
|
||||
}
|
||||
const { id } = defineProps<{ id: string }>()
|
||||
|
||||
const queryCache = useQueryCache()
|
||||
const { mutateAsync: createModel, isLoading: createLoading } = useMutation({
|
||||
mutation: async (data: Record<string, unknown>) => {
|
||||
const { data: result } = await postModels({ body: data as ModelsAddRequest, throwOnError: true })
|
||||
return result
|
||||
},
|
||||
onSettled: invalidateModelQueries,
|
||||
onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
|
||||
})
|
||||
const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({
|
||||
mutation: async ({ id, data }: { id: string; data: Record<string, unknown> }) => {
|
||||
@@ -283,7 +256,7 @@ const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({
|
||||
})
|
||||
return result
|
||||
},
|
||||
onSettled: invalidateModelQueries,
|
||||
onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
|
||||
})
|
||||
const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading } = useMutation({
|
||||
mutation: async ({ modelId, data }: { modelId: string; data: Record<string, unknown> }) => {
|
||||
@@ -294,7 +267,7 @@ const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading
|
||||
})
|
||||
return result
|
||||
},
|
||||
onSettled: invalidateModelQueries,
|
||||
onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
|
||||
})
|
||||
const isLoading = computed(() => createLoading.value || updateLoading.value || updateLegacyLoading.value)
|
||||
|
||||
@@ -324,7 +297,7 @@ async function addModel() {
|
||||
const payload: Record<string, unknown> = {
|
||||
type,
|
||||
model_id,
|
||||
provider_id: props.id,
|
||||
provider_id: id,
|
||||
config,
|
||||
}
|
||||
|
||||
@@ -375,15 +348,7 @@ watch(open, async () => {
|
||||
selectedCompat.value = config?.compatibilities ?? []
|
||||
userEditedName.value = !!(name && name !== model_id)
|
||||
} else {
|
||||
form.resetForm({
|
||||
values: {
|
||||
type: props.defaultType,
|
||||
model_id: '',
|
||||
name: '',
|
||||
dimensions: undefined,
|
||||
context_window: undefined,
|
||||
},
|
||||
})
|
||||
form.resetForm({ values: { type: 'chat', model_id: '', name: '', dimensions: undefined, context_window: undefined } })
|
||||
selectedCompat.value = []
|
||||
userEditedName.value = false
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ import { computed, type Component } from 'vue'
|
||||
import { storeToRefs } from 'pinia'
|
||||
import { useRouter, useRoute } from 'vue-router'
|
||||
import { useI18n } from 'vue-i18n'
|
||||
import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, AudioLines, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next'
|
||||
import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next'
|
||||
import { useChatSelectionStore } from '@/store/chat-selection'
|
||||
import {
|
||||
Sidebar,
|
||||
@@ -118,11 +118,6 @@ const navItems = computed<{ title: string; name: string; icon: Component }[]>(()
|
||||
name: 'speech',
|
||||
icon: Volume2,
|
||||
},
|
||||
{
|
||||
title: t('sidebar.transcription'),
|
||||
name: 'transcription',
|
||||
icon: AudioLines,
|
||||
},
|
||||
{
|
||||
title: t('sidebar.email'),
|
||||
name: 'email',
|
||||
|
||||
@@ -45,41 +45,21 @@ export const CLIENT_TYPE_META: Record<string, ClientTypeMeta> = {
|
||||
label: 'OpenAI Speech',
|
||||
hint: 'OpenAI /audio/speech compatible TTS',
|
||||
},
|
||||
'openai-transcription': {
|
||||
value: 'openai-transcription',
|
||||
label: 'OpenAI Transcription',
|
||||
hint: 'OpenAI audio transcription',
|
||||
},
|
||||
'openrouter-speech': {
|
||||
value: 'openrouter-speech',
|
||||
label: 'OpenRouter Speech',
|
||||
hint: 'OpenRouter audio modality TTS',
|
||||
},
|
||||
'openrouter-transcription': {
|
||||
value: 'openrouter-transcription',
|
||||
label: 'OpenRouter Transcription',
|
||||
hint: 'OpenRouter transcription models',
|
||||
},
|
||||
'elevenlabs-speech': {
|
||||
value: 'elevenlabs-speech',
|
||||
label: 'ElevenLabs Speech',
|
||||
hint: 'ElevenLabs text-to-speech',
|
||||
},
|
||||
'elevenlabs-transcription': {
|
||||
value: 'elevenlabs-transcription',
|
||||
label: 'ElevenLabs Transcription',
|
||||
hint: 'ElevenLabs speech-to-text',
|
||||
},
|
||||
'deepgram-speech': {
|
||||
value: 'deepgram-speech',
|
||||
label: 'Deepgram Speech',
|
||||
hint: 'Deepgram TTS',
|
||||
},
|
||||
'deepgram-transcription': {
|
||||
value: 'deepgram-transcription',
|
||||
label: 'Deepgram Transcription',
|
||||
hint: 'Deepgram speech-to-text',
|
||||
},
|
||||
'minimax-speech': {
|
||||
value: 'minimax-speech',
|
||||
label: 'MiniMax Speech',
|
||||
@@ -100,19 +80,9 @@ export const CLIENT_TYPE_META: Record<string, ClientTypeMeta> = {
|
||||
label: 'Microsoft Speech',
|
||||
hint: 'Azure Cognitive Services TTS',
|
||||
},
|
||||
'google-speech': {
|
||||
value: 'google-speech',
|
||||
label: 'Google Speech',
|
||||
hint: 'Gemini speech transcription',
|
||||
},
|
||||
'google-transcription': {
|
||||
value: 'google-transcription',
|
||||
label: 'Google Transcription',
|
||||
hint: 'Gemini speech transcription',
|
||||
},
|
||||
}
|
||||
|
||||
export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META)
|
||||
|
||||
export const LLM_CLIENT_TYPE_LIST: ClientTypeMeta[] = CLIENT_TYPE_LIST
|
||||
.filter(ct => !ct.value.endsWith('-speech') && !ct.value.endsWith('-transcription'))
|
||||
.filter(ct => !ct.value.endsWith('-speech'))
|
||||
|
||||
@@ -63,7 +63,6 @@
|
||||
"webSearch": "Web Search",
|
||||
"memory": "Memory",
|
||||
"speech": "Speech",
|
||||
"transcription": "Transcription",
|
||||
"email": "Email",
|
||||
"settings": "Settings",
|
||||
"profile": "Profile",
|
||||
@@ -426,9 +425,6 @@
|
||||
"noModels": "No models found. Click \"Import Models\" to discover available models or \"Add Model\" to create one manually.",
|
||||
"noCapabilities": "No capabilities available for this model.",
|
||||
"saveSuccess": "Speech configuration saved",
|
||||
"synthesis": {
|
||||
"models": "Synthesis Models"
|
||||
},
|
||||
"advanced": {
|
||||
"title": "Advanced Settings",
|
||||
"description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults."
|
||||
@@ -452,27 +448,6 @@
|
||||
"failed": "Synthesis failed"
|
||||
}
|
||||
},
|
||||
"transcription": {
|
||||
"title": "Transcription",
|
||||
"emptyTitle": "No Transcription Providers",
|
||||
"emptyDescription": "Add a transcription provider to enable speech-to-text for your bots",
|
||||
"models": "Transcription Models",
|
||||
"noModels": "No transcription models found. Import available models or keep the default template model.",
|
||||
"noCapabilities": "No capabilities available for this model.",
|
||||
"importModels": "Import Models",
|
||||
"importSuccess": "Transcription models imported successfully",
|
||||
"importFailed": "Failed to import transcription models",
|
||||
"saveSuccess": "Transcription configuration saved",
|
||||
"advanced": {
|
||||
"title": "Advanced Settings",
|
||||
"description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults."
|
||||
},
|
||||
"test": {
|
||||
"title": "Test Transcription",
|
||||
"run": "Transcribe",
|
||||
"failed": "Transcription failed"
|
||||
}
|
||||
},
|
||||
"email": {
|
||||
"title": "Email",
|
||||
"add": "Add Email",
|
||||
@@ -945,8 +920,6 @@
|
||||
"memoryHealthUnavailable": "Unavailable",
|
||||
"ttsModel": "TTS Model",
|
||||
"ttsModelPlaceholder": "Select TTS model",
|
||||
"transcriptionModel": "Transcription Model",
|
||||
"transcriptionModelPlaceholder": "Select transcription model",
|
||||
"imageModel": "Image Generation Model",
|
||||
"imageModelDescription": "Model used for the generate_image tool. Must support image-output compatibility.",
|
||||
"imageModelPlaceholder": "Select image model (optional)",
|
||||
|
||||
@@ -64,7 +64,6 @@
|
||||
"webSearch": "搜索",
|
||||
"memory": "记忆",
|
||||
"speech": "语音",
|
||||
"transcription": "转写",
|
||||
"email": "邮件",
|
||||
"profile": "用户",
|
||||
"home": "首页",
|
||||
@@ -422,9 +421,6 @@
|
||||
"noModels": "暂无模型,点击\"导入模型\"发现可用模型,或点击\"新建模型\"手动创建。",
|
||||
"noCapabilities": "该模型暂无可用能力信息。",
|
||||
"saveSuccess": "语音配置已保存",
|
||||
"synthesis": {
|
||||
"models": "语音合成模型"
|
||||
},
|
||||
"advanced": {
|
||||
"title": "高级设置",
|
||||
"description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。"
|
||||
@@ -448,27 +444,6 @@
|
||||
"failed": "合成失败"
|
||||
}
|
||||
},
|
||||
"transcription": {
|
||||
"title": "语音转写",
|
||||
"emptyTitle": "暂无转写提供方",
|
||||
"emptyDescription": "添加转写提供方以为 Bot 启用语音转文字功能",
|
||||
"models": "语音识别模型",
|
||||
"noModels": "暂无语音识别模型,可导入可用模型,或保留默认模板模型。",
|
||||
"importModels": "导入模型",
|
||||
"importSuccess": "识别模型导入成功",
|
||||
"importFailed": "识别模型导入失败",
|
||||
"saveSuccess": "转写配置已保存",
|
||||
"noCapabilities": "该模型暂无可用能力信息。",
|
||||
"advanced": {
|
||||
"title": "高级设置",
|
||||
"description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。"
|
||||
},
|
||||
"test": {
|
||||
"title": "测试识别",
|
||||
"run": "开始识别",
|
||||
"failed": "识别失败"
|
||||
}
|
||||
},
|
||||
"email": {
|
||||
"title": "邮件提供方",
|
||||
"add": "添加邮件提供方",
|
||||
@@ -941,8 +916,6 @@
|
||||
"memoryHealthUnavailable": "暂不可用",
|
||||
"ttsModel": "语音合成模型",
|
||||
"ttsModelPlaceholder": "选择语音合成模型",
|
||||
"transcriptionModel": "转写模型",
|
||||
"transcriptionModelPlaceholder": "选择语音转写模型",
|
||||
"imageModel": "图片生成模型",
|
||||
"imageModelDescription": "用于 generate_image 工具的模型,必须支持 image-output 兼容性。",
|
||||
"imageModelPlaceholder": "选择图片模型(可选)",
|
||||
|
||||
@@ -187,17 +187,6 @@
|
||||
/>
|
||||
</div>
|
||||
|
||||
<!-- Transcription Model -->
|
||||
<div class="space-y-2">
|
||||
<Label>{{ $t('bots.settings.transcriptionModel') }}</Label>
|
||||
<TtsModelSelect
|
||||
v-model="form.transcription_model_id"
|
||||
:models="transcriptionModels"
|
||||
:providers="ttsProviders"
|
||||
:placeholder="$t('bots.settings.transcriptionModelPlaceholder')"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<!-- Image Generation Model -->
|
||||
<div class="space-y-2">
|
||||
<Label>{{ $t('bots.settings.imageModel') }}</Label>
|
||||
@@ -367,7 +356,7 @@ import MemoryProviderSelect from './memory-provider-select.vue'
|
||||
import TtsModelSelect from './tts-model-select.vue'
|
||||
import BrowserContextSelect from './browser-context-select.vue'
|
||||
import { useQuery, useMutation, useQueryCache } from '@pinia/colada'
|
||||
import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getTranscriptionProviders, getTranscriptionModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk'
|
||||
import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk'
|
||||
import type { SettingsSettings } from '@memohai/sdk'
|
||||
import type { Ref } from 'vue'
|
||||
import { resolveApiErrorMessage } from '@/utils/api-error'
|
||||
@@ -451,22 +440,6 @@ const { data: ttsModelData } = useQuery({
|
||||
},
|
||||
})
|
||||
|
||||
const { data: transcriptionModelData } = useQuery({
|
||||
key: ['transcription-models'],
|
||||
query: async () => {
|
||||
const { data } = await getTranscriptionModels({ throwOnError: true })
|
||||
return data
|
||||
},
|
||||
})
|
||||
|
||||
const { data: transcriptionProviderData } = useQuery({
|
||||
key: ['transcription-providers'],
|
||||
query: async () => {
|
||||
const { data } = await getTranscriptionProviders({ throwOnError: true })
|
||||
return data
|
||||
},
|
||||
})
|
||||
|
||||
const { data: browserContextData } = useQuery({
|
||||
key: ['all-browser-contexts'],
|
||||
query: async () => {
|
||||
@@ -521,10 +494,7 @@ const searchProviders = computed(() => (searchProviderData.value ?? []).filter((
|
||||
const memoryProviders = computed(() => memoryProviderData.value ?? [])
|
||||
const ttsProviders = computed(() => (ttsProviderData.value ?? []).filter((p) => p.enable !== false))
|
||||
const enabledTtsProviderIds = computed(() => new Set(ttsProviders.value.map((p) => p.id)))
|
||||
const transcriptionProviders = computed(() => (transcriptionProviderData.value ?? []).filter((p: Record<string, unknown>) => p.enable !== false))
|
||||
const enabledTranscriptionProviderIds = computed(() => new Set(transcriptionProviders.value.map((p: Record<string, unknown>) => p.id as string)))
|
||||
const ttsModels = computed(() => (ttsModelData.value ?? []).filter((m: Record<string, unknown>) => enabledTtsProviderIds.value.has(m.provider_id as string)))
|
||||
const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record<string, unknown>) => enabledTranscriptionProviderIds.value.has(m.provider_id as string)))
|
||||
const browserContexts = computed(() => browserContextData.value ?? [])
|
||||
|
||||
// ---- Form ----
|
||||
@@ -535,7 +505,6 @@ const form = reactive({
|
||||
search_provider_id: '',
|
||||
memory_provider_id: '',
|
||||
tts_model_id: '',
|
||||
transcription_model_id: '',
|
||||
browser_context_id: '',
|
||||
timezone: '',
|
||||
language: '',
|
||||
@@ -675,7 +644,6 @@ watch(settings, (val) => {
|
||||
form.search_provider_id = val.search_provider_id ?? ''
|
||||
form.memory_provider_id = val.memory_provider_id ?? ''
|
||||
form.tts_model_id = val.tts_model_id ?? ''
|
||||
form.transcription_model_id = val.transcription_model_id ?? ''
|
||||
form.browser_context_id = val.browser_context_id ?? ''
|
||||
form.language = val.language ?? ''
|
||||
form.timezone = val.timezone ?? ''
|
||||
@@ -698,7 +666,6 @@ const hasSettingsChanges = computed(() => {
|
||||
|| form.search_provider_id !== (s.search_provider_id ?? '')
|
||||
|| form.memory_provider_id !== (s.memory_provider_id ?? '')
|
||||
|| form.tts_model_id !== (s.tts_model_id ?? '')
|
||||
|| form.transcription_model_id !== (s.transcription_model_id ?? '')
|
||||
|| form.browser_context_id !== (s.browser_context_id ?? '')
|
||||
|| form.language !== (s.language ?? '')
|
||||
|| form.timezone !== (s.timezone ?? '')
|
||||
|
||||
@@ -85,7 +85,7 @@
|
||||
v-else-if="advancedFields.length === 0"
|
||||
class="text-xs text-muted-foreground"
|
||||
>
|
||||
{{ mode === 'transcription' ? $t('transcription.noCapabilities') : $t('speech.noCapabilities') }}
|
||||
{{ $t('speech.noCapabilities') }}
|
||||
</div>
|
||||
|
||||
<div
|
||||
@@ -97,7 +97,7 @@
|
||||
class="flex w-full items-center justify-between px-3 py-2 text-left text-xs font-medium"
|
||||
@click="showAdvanced = !showAdvanced"
|
||||
>
|
||||
<span>{{ mode === 'transcription' ? $t('transcription.advanced.title') : $t('speech.advanced.title') }}</span>
|
||||
<span>{{ $t('speech.advanced.title') }}</span>
|
||||
<component
|
||||
:is="showAdvanced ? ChevronUp : ChevronDown"
|
||||
class="size-3 text-muted-foreground"
|
||||
@@ -108,7 +108,7 @@
|
||||
class="space-y-4 border-t border-border px-3 py-3"
|
||||
>
|
||||
<p class="text-xs text-muted-foreground">
|
||||
{{ mode === 'transcription' ? $t('transcription.advanced.description') : $t('speech.advanced.description') }}
|
||||
{{ $t('speech.advanced.description') }}
|
||||
</p>
|
||||
<section
|
||||
v-for="field in advancedFields"
|
||||
@@ -195,12 +195,9 @@
|
||||
|
||||
<div class="space-y-3">
|
||||
<h4 class="text-xs font-medium">
|
||||
{{ mode === 'transcription' ? $t('transcription.test.title') : $t('speech.test.title') }}
|
||||
{{ $t('speech.test.title') }}
|
||||
</h4>
|
||||
<div
|
||||
v-if="mode === 'synthesis'"
|
||||
class="relative"
|
||||
>
|
||||
<div class="relative">
|
||||
<Textarea
|
||||
v-model="testText"
|
||||
:placeholder="$t('speech.test.placeholder')"
|
||||
@@ -212,36 +209,17 @@
|
||||
{{ testText.length }}/{{ maxTestTextLen }}
|
||||
</span>
|
||||
</div>
|
||||
<div
|
||||
v-else
|
||||
class="space-y-2"
|
||||
>
|
||||
<Input
|
||||
type="file"
|
||||
accept="audio/*"
|
||||
@change="handleFileChange"
|
||||
/>
|
||||
<p
|
||||
v-if="selectedFileName"
|
||||
class="text-xs text-muted-foreground"
|
||||
>
|
||||
{{ selectedFileName }}
|
||||
</p>
|
||||
</div>
|
||||
<div class="flex items-center gap-3">
|
||||
<LoadingButton
|
||||
type="button"
|
||||
variant="outline"
|
||||
size="sm"
|
||||
:loading="testLoading"
|
||||
:disabled="mode === 'synthesis' ? (!testText.trim() || testText.length > maxTestTextLen) : !selectedFile"
|
||||
:disabled="!testText.trim() || testText.length > maxTestTextLen"
|
||||
@click="handleTest"
|
||||
>
|
||||
<Play
|
||||
v-if="mode === 'synthesis'"
|
||||
class="mr-1.5"
|
||||
/>
|
||||
{{ mode === 'transcription' ? $t('transcription.test.run') : $t('speech.test.generate') }}
|
||||
<Play class="mr-1.5" />
|
||||
{{ $t('speech.test.generate') }}
|
||||
</LoadingButton>
|
||||
<span
|
||||
v-if="testError"
|
||||
@@ -251,7 +229,7 @@
|
||||
</span>
|
||||
</div>
|
||||
<div
|
||||
v-if="mode === 'synthesis' && audioUrl"
|
||||
v-if="audioUrl"
|
||||
class="rounded-md border border-border bg-muted/30 p-3"
|
||||
>
|
||||
<audio
|
||||
@@ -261,20 +239,6 @@
|
||||
class="w-full"
|
||||
/>
|
||||
</div>
|
||||
<div
|
||||
v-if="mode === 'transcription' && transcriptionText"
|
||||
class="rounded-md border border-border bg-muted/30 p-3 space-y-2"
|
||||
>
|
||||
<p class="text-sm whitespace-pre-wrap wrap-break-word">
|
||||
{{ transcriptionText }}
|
||||
</p>
|
||||
<p
|
||||
v-if="transcriptionLanguage"
|
||||
class="text-xs text-muted-foreground"
|
||||
>
|
||||
{{ transcriptionLanguage }}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<Separator class="my-3" />
|
||||
@@ -332,8 +296,7 @@ const props = defineProps<{
|
||||
modelName: string
|
||||
config: Record<string, unknown>
|
||||
schema: SpeechConfigSchema | null
|
||||
mode?: 'synthesis' | 'transcription'
|
||||
onTest: (payload: string | File, config: Record<string, unknown>) => Promise<Blob | { text?: string, language?: string }>
|
||||
onTest: (text: string, config: Record<string, unknown>) => Promise<Blob>
|
||||
}>()
|
||||
|
||||
const emit = defineEmits<{
|
||||
@@ -346,16 +309,11 @@ const visibleSecrets = reactive<Record<string, boolean>>({})
|
||||
const saving = ref(false)
|
||||
const showAdvanced = ref(false)
|
||||
const testText = ref('')
|
||||
const selectedFile = ref<File | null>(null)
|
||||
const selectedFileName = ref('')
|
||||
const testLoading = ref(false)
|
||||
const testError = ref('')
|
||||
const audioUrl = ref('')
|
||||
const transcriptionText = ref('')
|
||||
const transcriptionLanguage = ref('')
|
||||
const audioEl = ref<HTMLAudioElement>()
|
||||
const maxTestTextLen = 500
|
||||
const mode = computed(() => props.mode ?? 'synthesis')
|
||||
|
||||
const orderedFields = computed(() => {
|
||||
const fields = props.schema?.fields ?? []
|
||||
@@ -390,11 +348,6 @@ function revokeAudio() {
|
||||
}
|
||||
}
|
||||
|
||||
function resetTranscription() {
|
||||
transcriptionText.value = ''
|
||||
transcriptionLanguage.value = ''
|
||||
}
|
||||
|
||||
onBeforeUnmount(revokeAudio)
|
||||
|
||||
async function handleSaveConfig() {
|
||||
@@ -407,39 +360,23 @@ async function handleSaveConfig() {
|
||||
}
|
||||
|
||||
async function handleTest() {
|
||||
if (mode.value === 'synthesis' && !testText.value.trim()) return
|
||||
if (mode.value === 'transcription' && !selectedFile.value) return
|
||||
if (!testText.value.trim()) return
|
||||
testLoading.value = true
|
||||
testError.value = ''
|
||||
revokeAudio()
|
||||
resetTranscription()
|
||||
|
||||
try {
|
||||
const result = await props.onTest(mode.value === 'synthesis' ? testText.value : selectedFile.value as File, buildConfig())
|
||||
const blob = await props.onTest(testText.value, buildConfig())
|
||||
|
||||
if (mode.value === 'synthesis') {
|
||||
const blob = result as Blob
|
||||
audioUrl.value = URL.createObjectURL(blob)
|
||||
await new Promise<void>((resolve) => setTimeout(resolve, 50))
|
||||
audioEl.value?.play()
|
||||
} else {
|
||||
const payload = result as { text?: string, language?: string }
|
||||
transcriptionText.value = payload.text ?? ''
|
||||
transcriptionLanguage.value = payload.language ?? ''
|
||||
}
|
||||
audioUrl.value = URL.createObjectURL(blob)
|
||||
await new Promise<void>((resolve) => setTimeout(resolve, 50))
|
||||
audioEl.value?.play()
|
||||
} catch (error: unknown) {
|
||||
const msg = error instanceof Error ? error.message : t(mode.value === 'transcription' ? 'transcription.test.failed' : 'speech.test.failed')
|
||||
const msg = error instanceof Error ? error.message : t('speech.test.failed')
|
||||
testError.value = msg
|
||||
toast.error(msg)
|
||||
} finally {
|
||||
testLoading.value = false
|
||||
}
|
||||
}
|
||||
|
||||
function handleFileChange(event: Event) {
|
||||
const input = event.target as HTMLInputElement
|
||||
const file = input.files?.[0] ?? null
|
||||
selectedFile.value = file
|
||||
selectedFileName.value = file?.name ?? ''
|
||||
}
|
||||
</script>
|
||||
|
||||
@@ -138,29 +138,18 @@
|
||||
<section>
|
||||
<div class="flex justify-between items-center mb-4">
|
||||
<h3 class="text-xs font-medium">
|
||||
{{ $t('speech.synthesis.models') }}
|
||||
{{ $t('speech.models') }}
|
||||
</h3>
|
||||
<div
|
||||
<LoadingButton
|
||||
v-if="curProviderId"
|
||||
class="flex items-center gap-2"
|
||||
type="button"
|
||||
variant="outline"
|
||||
size="sm"
|
||||
:loading="importLoading"
|
||||
@click="handleImportModels"
|
||||
>
|
||||
<LoadingButton
|
||||
type="button"
|
||||
variant="outline"
|
||||
size="sm"
|
||||
:loading="importLoading"
|
||||
@click="handleImportModels"
|
||||
>
|
||||
{{ $t('speech.importModels') }}
|
||||
</LoadingButton>
|
||||
<CreateModel
|
||||
:id="curProviderId"
|
||||
default-type="speech"
|
||||
hide-type
|
||||
:type-options="speechTypeOptions"
|
||||
:invalidate-keys="['speech-provider-models', 'speech-models']"
|
||||
/>
|
||||
</div>
|
||||
{{ $t('speech.importModels') }}
|
||||
</LoadingButton>
|
||||
</div>
|
||||
|
||||
<div
|
||||
@@ -202,7 +191,7 @@
|
||||
:model-name="model.model_id ?? ''"
|
||||
:config="model.config || {}"
|
||||
:schema="getModelSchema(model.model_id ?? '')"
|
||||
:on-test="(text, cfg) => handleTestModel(model.id ?? '', text as string, cfg)"
|
||||
:on-test="(text, cfg) => handleTestModel(model.id ?? '', text, cfg)"
|
||||
@save="(cfg) => handleSaveModel(model.id ?? '', cfg)"
|
||||
/>
|
||||
</div>
|
||||
@@ -229,11 +218,10 @@ import { computed, inject, reactive, ref, watch } from 'vue'
|
||||
import { toast } from 'vue-sonner'
|
||||
import { useI18n } from 'vue-i18n'
|
||||
import { useQuery, useQueryCache } from '@pinia/colada'
|
||||
import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putProvidersById } from '@memohai/sdk'
|
||||
import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putModelsById, putProvidersById } from '@memohai/sdk'
|
||||
import type { TtsSpeechModelResponse, TtsSpeechProviderResponse } from '@memohai/sdk'
|
||||
import LoadingButton from '@/components/loading-button/index.vue'
|
||||
import ProviderIcon from '@/components/provider-icon/index.vue'
|
||||
import CreateModel from '@/components/create-model/index.vue'
|
||||
|
||||
interface SpeechFieldSchema {
|
||||
key: string
|
||||
@@ -268,8 +256,6 @@ interface SpeechProviderMeta {
|
||||
config_schema?: SpeechConfigSchema
|
||||
default_model?: string
|
||||
models?: SpeechModelMeta[]
|
||||
default_synthesis_model?: string
|
||||
synthesis_models?: SpeechModelMeta[]
|
||||
}
|
||||
|
||||
function getInitials(name: string | undefined) {
|
||||
@@ -288,9 +274,6 @@ const enableLoading = ref(false)
|
||||
const saveLoading = ref(false)
|
||||
const importLoading = ref(false)
|
||||
const queryCache = useQueryCache()
|
||||
const speechTypeOptions = [
|
||||
{ value: 'speech', label: 'Speech' },
|
||||
]
|
||||
|
||||
const { data: providerDetail } = useQuery({
|
||||
key: () => ['speech-provider-detail', curProviderId.value],
|
||||
@@ -314,7 +297,7 @@ const { data: metaList } = useQuery({
|
||||
|
||||
const currentMeta = computed(() => {
|
||||
if (!metaList.value || !curProvider.value?.client_type) return null
|
||||
return (metaList.value as SpeechProviderMeta[]).find(m => m.provider === curProvider.value?.client_type) ?? null
|
||||
return (metaList.value as SpeechProviderMeta[]).find((m) => m.provider === curProvider.value?.client_type) ?? null
|
||||
})
|
||||
|
||||
const orderedProviderFields = computed(() => {
|
||||
@@ -334,7 +317,9 @@ const { data: providerSpeechModels } = useQuery({
|
||||
},
|
||||
})
|
||||
|
||||
const providerModels = computed(() => ((providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []))
|
||||
const providerModels = computed(() => {
|
||||
return (providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []
|
||||
})
|
||||
|
||||
watch(() => providerDetail.value, (provider) => {
|
||||
providerName.value = provider?.name ?? curProvider.value?.name ?? ''
|
||||
@@ -343,11 +328,12 @@ watch(() => providerDetail.value, (provider) => {
|
||||
}, { immediate: true, deep: true })
|
||||
|
||||
function getModelMeta(modelID: string): SpeechModelMeta | null {
|
||||
const models = currentMeta.value?.synthesis_models ?? currentMeta.value?.models ?? []
|
||||
const models = currentMeta.value?.models ?? []
|
||||
const exact = models.find(m => m.id === modelID)
|
||||
if (exact) return exact
|
||||
const defaultModel = currentMeta.value?.default_synthesis_model ?? currentMeta.value?.default_model
|
||||
if (defaultModel) return models.find(m => m.id === defaultModel) ?? null
|
||||
if (currentMeta.value?.default_model) {
|
||||
return models.find(m => m.id === currentMeta.value?.default_model) ?? null
|
||||
}
|
||||
return models[0] ?? null
|
||||
}
|
||||
|
||||
@@ -412,23 +398,20 @@ async function handleSaveProvider() {
|
||||
}
|
||||
|
||||
async function handleSaveModel(modelId: string, config: Record<string, unknown>) {
|
||||
const model = providerModels.value.find(item => item.id === modelId)
|
||||
const model = providerModels.value.find((item) => item.id === modelId)
|
||||
if (!model) return
|
||||
try {
|
||||
const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
|
||||
const token = localStorage.getItem('token')
|
||||
const resp = await fetch(`${apiBase}/speech-models/${modelId}`, {
|
||||
method: 'PUT',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
...(token ? { Authorization: `Bearer ${token}` } : {}),
|
||||
},
|
||||
body: JSON.stringify({
|
||||
await putModelsById({
|
||||
path: { id: modelId },
|
||||
body: {
|
||||
model_id: model.model_id,
|
||||
name: model.name ?? model.model_id,
|
||||
provider_id: model.provider_id,
|
||||
type: 'speech',
|
||||
config,
|
||||
}),
|
||||
},
|
||||
throwOnError: true,
|
||||
})
|
||||
if (!resp.ok) throw new Error(await resp.text())
|
||||
toast.success(t('speech.saveSuccess'))
|
||||
queryCache.invalidateQueries({ key: ['speech-provider-models', curProviderId.value] })
|
||||
queryCache.invalidateQueries({ key: ['speech-models'] })
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
<script setup lang="ts">
|
||||
import { computed, ref, provide, watch } from 'vue'
|
||||
import { useQuery } from '@pinia/colada'
|
||||
import {
|
||||
ScrollArea,
|
||||
SidebarMenu,
|
||||
SidebarMenuButton,
|
||||
SidebarMenuItem,
|
||||
Toggle,
|
||||
Empty,
|
||||
EmptyDescription,
|
||||
EmptyHeader,
|
||||
EmptyMedia,
|
||||
EmptyTitle,
|
||||
} from '@memohai/ui'
|
||||
import { getTranscriptionProviders } from '@memohai/sdk'
|
||||
import type { AudioSpeechProviderResponse } from '@memohai/sdk'
|
||||
import ProviderSetting from './provider-setting.vue'
|
||||
import { AudioLines } from 'lucide-vue-next'
|
||||
import MasterDetailSidebarLayout from '@/components/master-detail-sidebar-layout/index.vue'
|
||||
import ProviderIcon from '@/components/provider-icon/index.vue'
|
||||
|
||||
function getInitials(name: string | undefined) {
|
||||
const label = name?.trim() ?? ''
|
||||
return label ? label.slice(0, 2).toUpperCase() : '?'
|
||||
}
|
||||
|
||||
const { data: providerData } = useQuery({
|
||||
key: () => ['transcription-providers'],
|
||||
query: async () => {
|
||||
const { data } = await getTranscriptionProviders({ throwOnError: true })
|
||||
return (data ?? []) as AudioSpeechProviderResponse[]
|
||||
},
|
||||
})
|
||||
const curProvider = ref<AudioSpeechProviderResponse>()
|
||||
provide('curTranscriptionProvider', curProvider)
|
||||
|
||||
const selectProvider = (name: string) => computed(() => curProvider.value?.name === name)
|
||||
|
||||
const filteredProviders = computed(() => {
|
||||
if (!Array.isArray(providerData.value)) return []
|
||||
return [...providerData.value].sort((a, b) => Number(b.enable !== false) - Number(a.enable !== false))
|
||||
})
|
||||
|
||||
// Keep the selection in sync with the provider list: reset to an empty
// placeholder when the list is empty, re-bind to the refreshed entry when the
// selected id is still present, otherwise fall back to the first provider.
watch(filteredProviders, (providers) => {
  if (!providers?.length) {
    curProvider.value = { id: '' }
    return
  }
  const selectedId = curProvider.value?.id
  const refreshed = selectedId
    ? providers.find(candidate => candidate.id === selectedId)
    : undefined
  curProvider.value = refreshed ?? providers[0]
}, { immediate: true })
|
||||
</script>
|
||||
|
||||
<template>
|
||||
<MasterDetailSidebarLayout>
|
||||
<template #sidebar-content>
|
||||
<SidebarMenu
|
||||
v-for="item in filteredProviders"
|
||||
:key="item.id"
|
||||
>
|
||||
<SidebarMenuItem>
|
||||
<SidebarMenuButton
|
||||
as-child
|
||||
class="justify-start py-5! px-4"
|
||||
>
|
||||
<Toggle
|
||||
:class="['py-4 border', curProvider?.id === item.id ? 'border-border' : 'border-transparent']"
|
||||
:model-value="selectProvider(item.name ?? '').value"
|
||||
@update:model-value="(isSelect) => { if (isSelect) curProvider = item }"
|
||||
>
|
||||
<span class="relative shrink-0">
|
||||
<span class="flex size-7 items-center justify-center rounded-full bg-muted">
|
||||
<ProviderIcon
|
||||
v-if="item.icon"
|
||||
:icon="item.icon"
|
||||
size="1.25em"
|
||||
/>
|
||||
<span
|
||||
v-else
|
||||
class="text-xs font-medium text-muted-foreground"
|
||||
>
|
||||
{{ getInitials(item.name) }}
|
||||
</span>
|
||||
</span>
|
||||
<span
|
||||
v-if="item.enable !== false"
|
||||
class="absolute -bottom-0.5 -right-0.5 size-2.5 rounded-full bg-green-500 ring-2 ring-background"
|
||||
/>
|
||||
</span>
|
||||
<span class="truncate">{{ item.name }}</span>
|
||||
</Toggle>
|
||||
</SidebarMenuButton>
|
||||
</SidebarMenuItem>
|
||||
</SidebarMenu>
|
||||
</template>
|
||||
|
||||
<template #detail>
|
||||
<ScrollArea
|
||||
v-if="curProvider?.id"
|
||||
class="max-h-full h-full"
|
||||
>
|
||||
<ProviderSetting />
|
||||
</ScrollArea>
|
||||
<Empty
|
||||
v-else
|
||||
class="h-full flex justify-center items-center"
|
||||
>
|
||||
<EmptyHeader>
|
||||
<EmptyMedia variant="icon">
|
||||
<AudioLines />
|
||||
</EmptyMedia>
|
||||
</EmptyHeader>
|
||||
<EmptyTitle>{{ $t('transcription.emptyTitle') }}</EmptyTitle>
|
||||
<EmptyDescription>{{ $t('transcription.emptyDescription') }}</EmptyDescription>
|
||||
</Empty>
|
||||
</template>
|
||||
</MasterDetailSidebarLayout>
|
||||
</template>
|
||||
@@ -1,480 +0,0 @@
|
||||
<template>
|
||||
<div class="p-4">
|
||||
<section class="flex items-center gap-3">
|
||||
<span class="flex size-10 shrink-0 items-center justify-center rounded-full bg-muted">
|
||||
<ProviderIcon
|
||||
v-if="curProvider?.icon"
|
||||
:icon="curProvider.icon"
|
||||
size="1.5em"
|
||||
/>
|
||||
<span
|
||||
v-else
|
||||
class="text-xs font-medium text-muted-foreground"
|
||||
>
|
||||
{{ getInitials(curProvider?.name) }}
|
||||
</span>
|
||||
</span>
|
||||
<div class="min-w-0">
|
||||
<h2 class="text-sm font-semibold truncate">
|
||||
{{ curProvider?.name }}
|
||||
</h2>
|
||||
<p class="text-xs text-muted-foreground">
|
||||
{{ currentMeta?.display_name ?? curProvider?.client_type }}
|
||||
</p>
|
||||
</div>
|
||||
<div class="ml-auto flex items-center gap-2">
|
||||
<span class="text-xs text-muted-foreground">
|
||||
{{ $t('common.enable') }}
|
||||
</span>
|
||||
<Switch
|
||||
:model-value="curProvider?.enable ?? false"
|
||||
:disabled="!curProvider?.id || enableLoading"
|
||||
@update:model-value="handleToggleEnable"
|
||||
/>
|
||||
</div>
|
||||
</section>
|
||||
<Separator class="mt-4 mb-6" />
|
||||
|
||||
<form
|
||||
class="space-y-4"
|
||||
@submit.prevent="handleSaveProvider"
|
||||
>
|
||||
<section class="space-y-2">
|
||||
<Label for="transcription-provider-name">{{ $t('common.name') }}</Label>
|
||||
<Input
|
||||
id="transcription-provider-name"
|
||||
v-model="providerName"
|
||||
type="text"
|
||||
:placeholder="$t('common.namePlaceholder')"
|
||||
/>
|
||||
</section>
|
||||
|
||||
<section
|
||||
v-for="field in orderedProviderFields"
|
||||
:key="field.key"
|
||||
class="space-y-2"
|
||||
>
|
||||
<Label :for="field.type === 'bool' || field.type === 'enum' ? undefined : `transcription-provider-${field.key}`">
|
||||
{{ field.title || field.key }}
|
||||
</Label>
|
||||
<p
|
||||
v-if="field.description"
|
||||
class="text-xs text-muted-foreground"
|
||||
>
|
||||
{{ field.description }}
|
||||
</p>
|
||||
<div
|
||||
v-if="field.type === 'secret'"
|
||||
class="relative"
|
||||
>
|
||||
<Input
|
||||
:id="`transcription-provider-${field.key}`"
|
||||
v-model="providerConfig[field.key] as string"
|
||||
:type="visibleSecrets[field.key] ? 'text' : 'password'"
|
||||
/>
|
||||
<button
|
||||
type="button"
|
||||
class="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground"
|
||||
@click="visibleSecrets[field.key] = !visibleSecrets[field.key]"
|
||||
>
|
||||
<component
|
||||
:is="visibleSecrets[field.key] ? EyeOff : Eye"
|
||||
class="size-3.5"
|
||||
/>
|
||||
</button>
|
||||
</div>
|
||||
<Switch
|
||||
v-else-if="field.type === 'bool'"
|
||||
:model-value="!!providerConfig[field.key]"
|
||||
@update:model-value="(val) => providerConfig[field.key] = !!val"
|
||||
/>
|
||||
<Input
|
||||
v-else-if="field.type === 'number'"
|
||||
:id="`transcription-provider-${field.key}`"
|
||||
v-model.number="providerConfig[field.key] as number"
|
||||
type="number"
|
||||
/>
|
||||
<Select
|
||||
v-else-if="field.type === 'enum' && field.enum"
|
||||
:model-value="String(providerConfig[field.key] ?? '')"
|
||||
@update:model-value="(val) => providerConfig[field.key] = val"
|
||||
>
|
||||
<SelectTrigger>
|
||||
<SelectValue :placeholder="field.title || field.key" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem
|
||||
v-for="opt in field.enum"
|
||||
:key="opt"
|
||||
:value="opt"
|
||||
>
|
||||
{{ opt }}
|
||||
</SelectItem>
|
||||
</SelectContent>
|
||||
</Select>
|
||||
<Input
|
||||
v-else
|
||||
:id="`transcription-provider-${field.key}`"
|
||||
v-model="providerConfig[field.key] as string"
|
||||
type="text"
|
||||
/>
|
||||
</section>
|
||||
|
||||
<div class="flex justify-end">
|
||||
<LoadingButton
|
||||
type="submit"
|
||||
:loading="saveLoading"
|
||||
>
|
||||
{{ $t('provider.saveChanges') }}
|
||||
</LoadingButton>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<Separator class="mt-6 mb-6" />
|
||||
|
||||
<section>
|
||||
<div class="flex justify-between items-center mb-4">
|
||||
<h3 class="text-xs font-medium">
|
||||
{{ $t('transcription.models') }}
|
||||
</h3>
|
||||
<div
|
||||
v-if="curProviderId"
|
||||
class="flex items-center gap-2"
|
||||
>
|
||||
<LoadingButton
|
||||
type="button"
|
||||
variant="outline"
|
||||
size="sm"
|
||||
:loading="importLoading"
|
||||
@click="handleImportModels"
|
||||
>
|
||||
{{ $t('transcription.importModels') }}
|
||||
</LoadingButton>
|
||||
<CreateModel
|
||||
:id="curProviderId"
|
||||
default-type="transcription"
|
||||
hide-type
|
||||
:type-options="transcriptionTypeOptions"
|
||||
:invalidate-keys="['transcription-provider-models', 'transcription-models']"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div
|
||||
v-if="providerModels.length === 0"
|
||||
class="text-xs text-muted-foreground py-4 text-center"
|
||||
>
|
||||
{{ $t('transcription.noModels') }}
|
||||
</div>
|
||||
|
||||
<div
|
||||
v-for="model in providerModels"
|
||||
:key="model.id"
|
||||
class="border border-border rounded-lg mb-4"
|
||||
>
|
||||
<button
|
||||
type="button"
|
||||
class="w-full flex items-center justify-between p-3 text-left hover:bg-accent/50 rounded-t-lg transition-colors"
|
||||
@click="toggleModel(model.id ?? '')"
|
||||
>
|
||||
<div>
|
||||
<span class="text-xs font-medium">{{ model.name || model.model_id }}</span>
|
||||
<span
|
||||
v-if="model.name"
|
||||
class="text-xs text-muted-foreground ml-2"
|
||||
>
|
||||
{{ model.model_id }}
|
||||
</span>
|
||||
</div>
|
||||
<component
|
||||
:is="expandedModelId === model.id ? ChevronUp : ChevronDown"
|
||||
class="size-3 text-muted-foreground"
|
||||
/>
|
||||
</button>
|
||||
<div
|
||||
v-if="expandedModelId === model.id"
|
||||
class="px-3 pb-3 space-y-4 border-t border-border pt-3"
|
||||
>
|
||||
<ModelConfigEditor
|
||||
:model-id="model.id ?? ''"
|
||||
:model-name="model.model_id ?? ''"
|
||||
:config="model.config || {}"
|
||||
:schema="getModelSchema(model.model_id ?? '')"
|
||||
mode="transcription"
|
||||
:on-test="(file, cfg) => handleTestModel(model.id ?? '', file as File, cfg)"
|
||||
@save="(cfg) => handleSaveModel(model.id ?? '', cfg)"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { computed, inject, reactive, ref, watch } from 'vue'
|
||||
import { useQuery, useQueryCache } from '@pinia/colada'
|
||||
import { toast } from 'vue-sonner'
|
||||
import { useI18n } from 'vue-i18n'
|
||||
import {
|
||||
getTranscriptionProvidersById,
|
||||
getTranscriptionProvidersMeta,
|
||||
getTranscriptionProvidersByIdModels,
|
||||
postTranscriptionProvidersByIdImportModels,
|
||||
postTranscriptionModelsByIdTest,
|
||||
putProvidersById,
|
||||
putTranscriptionModelsById,
|
||||
} from '@memohai/sdk'
|
||||
import type {
|
||||
AudioProviderMetaResponse,
|
||||
AudioSpeechProviderResponse,
|
||||
AudioTestTranscriptionResponse,
|
||||
AudioTranscriptionModelResponse,
|
||||
} from '@memohai/sdk'
|
||||
import { ChevronDown, ChevronUp, Eye, EyeOff } from 'lucide-vue-next'
|
||||
import { Input, Label, Select, SelectContent, SelectItem, SelectTrigger, SelectValue, Separator, Switch } from '@memohai/ui'
|
||||
import ProviderIcon from '@/components/provider-icon/index.vue'
|
||||
import LoadingButton from '@/components/loading-button/index.vue'
|
||||
import ModelConfigEditor from '@/pages/speech/components/model-config-editor.vue'
|
||||
import CreateModel from '@/components/create-model/index.vue'
|
||||
|
||||
interface FieldSchema { key: string, type: string, title?: string, description?: string, enum?: string[], order?: number }
|
||||
interface ConfigSchema { fields?: FieldSchema[] }
|
||||
interface ModelMeta { id: string, name: string, config_schema?: ConfigSchema, capabilities?: { config_schema?: ConfigSchema } }
|
||||
interface ProviderMeta {
|
||||
provider: string
|
||||
display_name?: string
|
||||
config_schema?: ConfigSchema
|
||||
default_transcription_model?: string
|
||||
transcription_models?: ModelMeta[]
|
||||
models?: ModelMeta[]
|
||||
}
|
||||
|
||||
/**
 * Builds the avatar fallback text for a provider.
 *
 * Takes the first two characters of the trimmed name, upper-cased, or `'?'`
 * when the name is missing or blank. Characters are counted by Unicode code
 * point so an emoji or other astral character is never cut into a lone
 * surrogate half.
 *
 * @param name - Provider display name; may be undefined.
 * @returns One or two upper-cased characters, or `'?'`.
 */
function getInitials(name: string | undefined) {
  const label = name?.trim() ?? ''
  if (!label) return '?'
  // Array.from iterates by code point, unlike String.prototype.slice.
  return Array.from(label).slice(0, 2).join('').toUpperCase()
}
|
||||
|
||||
/**
 * Converts an SDK config schema into the local `ConfigSchema` shape.
 *
 * Fields lacking a `key` or a `type` are dropped; the remaining ones are
 * copied property by property.
 *
 * @param schema - Schema from the provider meta response; may be absent.
 * @returns The normalized schema, or `undefined` when no schema was given.
 */
function normalizeConfigSchema(schema?: AudioProviderMetaResponse['config_schema']): ConfigSchema | undefined {
  if (!schema) return undefined
  const fields = (schema.fields ?? []).flatMap((field): FieldSchema[] => {
    if (!field?.key || !field.type) return []
    return [{
      key: field.key,
      type: field.type,
      title: field.title,
      description: field.description,
      enum: field.enum,
      order: field.order,
    }]
  })
  return { fields }
}
|
||||
|
||||
/**
 * Converts one SDK model-meta entry into the local `ModelMeta` shape.
 *
 * @param model - Raw model entry from the provider meta response.
 * @returns The normalized entry, or `null` when the entry has no id.
 *   `name` falls back to the id when the entry has no name.
 */
function normalizeModelMeta(model: NonNullable<AudioProviderMetaResponse['models']>[number]): ModelMeta | null {
  if (!model?.id) return null
  const configSchema = normalizeConfigSchema(model.config_schema)
  const rawCapabilities = model.capabilities
  const capabilities = rawCapabilities
    ? { config_schema: normalizeConfigSchema(rawCapabilities.config_schema) }
    : undefined
  return {
    id: model.id,
    name: model.name ?? model.id,
    config_schema: configSchema,
    capabilities,
  }
}
|
||||
|
||||
/**
 * Converts an SDK provider-meta response into the local `ProviderMeta` shape.
 *
 * Both model lists are always returned as arrays (possibly empty), with
 * entries that fail normalization removed.
 *
 * @param meta - Raw provider meta from the API.
 * @returns The normalized provider meta; `provider` defaults to `''`.
 */
function normalizeProviderMeta(meta: AudioProviderMetaResponse): ProviderMeta {
  // Normalizes a raw model list, discarding entries without an id.
  const toModelList = (rawModels: readonly Parameters<typeof normalizeModelMeta>[0][] | null | undefined): ModelMeta[] => {
    const normalized: ModelMeta[] = []
    for (const raw of rawModels ?? []) {
      const entry = normalizeModelMeta(raw)
      if (entry !== null) normalized.push(entry)
    }
    return normalized
  }

  return {
    provider: meta.provider ?? '',
    display_name: meta.display_name,
    config_schema: normalizeConfigSchema(meta.config_schema),
    default_transcription_model: meta.default_transcription_model,
    transcription_models: toModelList(meta.transcription_models),
    models: toModelList(meta.models),
  }
}
|
||||
|
||||
const { t } = useI18n()
|
||||
const curProvider = inject('curTranscriptionProvider', ref<AudioSpeechProviderResponse>())
|
||||
const curProviderId = computed(() => curProvider.value?.id)
|
||||
const providerName = ref('')
|
||||
const providerConfig = reactive<Record<string, unknown>>({})
|
||||
const visibleSecrets = reactive<Record<string, boolean>>({})
|
||||
const expandedModelId = ref('')
|
||||
const enableLoading = ref(false)
|
||||
const saveLoading = ref(false)
|
||||
const importLoading = ref(false)
|
||||
const queryCache = useQueryCache()
|
||||
const transcriptionTypeOptions = [
|
||||
{ value: 'transcription', label: 'Transcription' },
|
||||
]
|
||||
|
||||
const { data: providerDetail } = useQuery({
|
||||
key: () => ['transcription-provider-detail', curProviderId.value ?? ''],
|
||||
query: async () => {
|
||||
if (!curProviderId.value) return null
|
||||
const { data } = await getTranscriptionProvidersById({
|
||||
path: { id: curProviderId.value },
|
||||
throwOnError: true,
|
||||
})
|
||||
return (data ?? null) as AudioSpeechProviderResponse | null
|
||||
},
|
||||
})
|
||||
|
||||
const { data: metaList } = useQuery({
|
||||
key: () => ['transcription-providers-meta'],
|
||||
query: async () => {
|
||||
const { data } = await getTranscriptionProvidersMeta({ throwOnError: true })
|
||||
return (data ?? []).map(normalizeProviderMeta)
|
||||
},
|
||||
})
|
||||
|
||||
const currentMeta = computed(() => (metaList.value ?? []).find(m => m.provider === curProvider.value?.client_type) ?? null)
|
||||
const orderedProviderFields = computed(() => [...(currentMeta.value?.config_schema?.fields ?? [])].sort((a, b) => (a.order ?? 0) - (b.order ?? 0)))
|
||||
|
||||
const { data: providerModelData } = useQuery({
|
||||
key: () => ['transcription-provider-models', curProviderId.value ?? ''],
|
||||
query: async () => {
|
||||
if (!curProviderId.value) return []
|
||||
const { data } = await getTranscriptionProvidersByIdModels({
|
||||
path: { id: curProviderId.value },
|
||||
throwOnError: true,
|
||||
})
|
||||
return (data ?? []) as AudioTranscriptionModelResponse[]
|
||||
},
|
||||
})
|
||||
|
||||
const providerModels = computed(() => providerModelData.value ?? [])
|
||||
|
||||
// Whenever the fetched provider detail changes, reset the editable form state:
// the name falls back to the selected provider's name, and the reactive config
// object is emptied in place before the fetched config is copied in.
watch(() => providerDetail.value, (detail) => {
  providerName.value = detail?.name ?? curProvider.value?.name ?? ''
  for (const staleKey of Object.keys(providerConfig)) {
    delete providerConfig[staleKey]
  }
  const nextConfig = { ...(detail?.config ?? {}) }
  Object.assign(providerConfig, nextConfig)
}, { immediate: true, deep: true })
|
||||
|
||||
/**
 * Resolves the config schema used to edit a model.
 *
 * Lookup order within the candidate model list: the exact model id, then the
 * provider's default transcription model, then the first listed model.
 *
 * The candidate list is `transcription_models` when it has entries, otherwise
 * `models`. The emptiness check matters: provider meta is normalized so that
 * `transcription_models` is always an array, so a plain `??` would never fall
 * back to `models`.
 *
 * @param modelID - Provider-side model identifier.
 * @returns The model's own schema, else its capabilities schema, else `null`.
 */
function getModelSchema(modelID: string): ConfigSchema | null {
  const meta = currentMeta.value
  const transcriptionModels = meta?.transcription_models
  const models = transcriptionModels?.length ? transcriptionModels : meta?.models ?? []
  const exact = models.find(m => m.id === modelID)
  const fallback = exact ?? models.find(m => m.id === meta?.default_transcription_model) ?? models[0]
  return fallback?.config_schema ?? fallback?.capabilities?.config_schema ?? null
}
|
||||
|
||||
/**
 * Expands the model panel with the given id, or collapses it when it is the
 * one already expanded.
 *
 * @param id - Id of the model row that was clicked.
 */
function toggleModel(id: string) {
  if (expandedModelId.value === id) {
    expandedModelId.value = ''
    return
  }
  expandedModelId.value = id
}
|
||||
|
||||
/**
 * Enables or disables the current provider with an optimistic UI update.
 *
 * The switch state is applied locally first, then persisted. On failure the
 * previous state is restored and an error toast is shown.
 *
 * The provider and its id are captured before the request, because the shared
 * `curProvider` ref can be re-pointed at another provider while the request
 * is in flight. Rollback and cache invalidation therefore target the provider
 * that was actually toggled, and rollback is skipped if the selection changed.
 *
 * @param value - The new enabled state requested by the switch.
 */
async function handleToggleEnable(value: boolean) {
  const provider = curProvider.value
  const providerId = provider?.id
  if (!provider || !providerId || !provider.client_type) return
  const previous = provider.enable ?? false
  curProvider.value = { ...provider, enable: value }
  enableLoading.value = true
  try {
    await putProvidersById({
      path: { id: providerId },
      body: {
        name: providerName.value.trim() || provider.name || '',
        client_type: provider.client_type,
        enable: value,
        config: sanitizeConfig(providerConfig),
      },
      throwOnError: true,
    })
    queryCache.invalidateQueries({ key: ['transcription-providers'] })
    queryCache.invalidateQueries({ key: ['transcription-provider-detail', providerId] })
  } catch {
    // Only undo the optimistic update if the same provider is still selected.
    if (curProvider.value?.id === providerId) {
      curProvider.value = { ...curProvider.value, enable: previous }
    }
    toast.error(t('common.saveFailed'))
  } finally {
    enableLoading.value = false
  }
}
|
||||
|
||||
/**
 * Persists the provider form (name and sanitized config) for the current
 * provider, keeping its existing enabled state.
 *
 * Shows a success or failure toast and, on success, invalidates the provider
 * list and provider detail queries. Does nothing when no provider with a
 * client type is selected.
 */
async function handleSaveProvider() {
  if (!curProviderId.value || !curProvider.value?.client_type) return
  saveLoading.value = true
  try {
    const body = {
      name: providerName.value.trim() || curProvider.value.name || '',
      client_type: curProvider.value.client_type,
      enable: curProvider.value.enable,
      config: sanitizeConfig(providerConfig),
    }
    await putProvidersById({
      path: { id: curProviderId.value },
      body,
      throwOnError: true,
    })
    toast.success(t('transcription.saveSuccess'))
    queryCache.invalidateQueries({ key: ['transcription-providers'] })
    queryCache.invalidateQueries({ key: ['transcription-provider-detail', curProviderId.value ?? ''] })
  } catch {
    toast.error(t('common.saveFailed'))
  } finally {
    saveLoading.value = false
  }
}
|
||||
|
||||
/**
 * Saves an edited config for one of the current provider's models.
 *
 * The model's name is sent along unchanged, falling back to its `model_id`
 * and then to the id itself. Unknown model ids are ignored.
 *
 * @param modelId - Id of the model being saved.
 * @param config - New config object emitted by the editor.
 */
async function handleSaveModel(modelId: string, config: Record<string, unknown>) {
  const target = providerModels.value.find(candidate => candidate.id === modelId)
  if (!target) return
  const displayName = target.name ?? target.model_id ?? modelId
  try {
    await putTranscriptionModelsById({
      path: { id: modelId },
      body: { name: displayName, config },
      throwOnError: true,
    })
    toast.success(t('transcription.saveSuccess'))
    queryCache.invalidateQueries({ key: ['transcription-provider-models', curProviderId.value ?? ''] })
    queryCache.invalidateQueries({ key: ['transcription-models'] })
  } catch {
    toast.error(t('common.saveFailed'))
  }
}
|
||||
|
||||
/**
 * Asks the backend to import the current provider's models, then reports how
 * many were created and skipped (each count defaults to 0 when missing).
 *
 * On success the provider-models, transcription-models and providers-meta
 * queries are invalidated. On failure an error toast is shown.
 */
async function handleImportModels() {
  if (!curProviderId.value) return
  importLoading.value = true
  try {
    const response = await postTranscriptionProvidersByIdImportModels({
      path: { id: curProviderId.value },
      throwOnError: true,
    })
    const summary = (response.data ?? {}) as { created?: number, skipped?: number }
    const created = summary.created ?? 0
    const skipped = summary.skipped ?? 0
    toast.success(t('transcription.importSuccess', { created, skipped }))
    queryCache.invalidateQueries({ key: ['transcription-provider-models', curProviderId.value ?? ''] })
    queryCache.invalidateQueries({ key: ['transcription-models'] })
    queryCache.invalidateQueries({ key: ['transcription-providers-meta'] })
  } catch {
    toast.error(t('transcription.importFailed'))
  } finally {
    importLoading.value = false
  }
}
|
||||
|
||||
/**
 * Runs a test transcription against a model.
 *
 * @param modelId - Id of the model to test.
 * @param file - Audio file to upload.
 * @param config - Config override; sent as a JSON string.
 * @returns The API response payload, or an empty object when there is none.
 *   Request errors propagate to the caller (`throwOnError` is set).
 */
async function handleTestModel(modelId: string, file: File, config: Record<string, unknown>) {
  const serializedConfig = JSON.stringify(config)
  const response = await postTranscriptionModelsByIdTest({
    path: { id: modelId },
    body: { file, config: serializedConfig },
    throwOnError: true,
  })
  return (response.data ?? {}) as AudioTestTranscriptionResponse
}
|
||||
|
||||
/**
 * Returns a shallow copy of a config object without unset entries.
 *
 * An entry counts as unset when its value is the empty string, `null` or
 * `undefined`. Other falsy values such as `0` and `false` are kept.
 *
 * @param input - Raw config values from the form.
 * @returns A new object holding only the populated entries.
 */
function sanitizeConfig(input: Record<string, unknown>) {
  return Object.entries(input).reduce<Record<string, unknown>>((kept, [key, value]) => {
    const isUnset = value === '' || value === null || value === undefined
    if (!isUnset) kept[key] = value
    return kept
  }, {})
}
|
||||
</script>
|
||||
@@ -89,14 +89,6 @@ const routes = [
|
||||
breadcrumb: i18nRef('sidebar.speech'),
|
||||
},
|
||||
},
|
||||
{
|
||||
name: 'transcription',
|
||||
path: 'transcription',
|
||||
component: () => import('@/pages/transcription/index.vue'),
|
||||
meta: {
|
||||
breadcrumb: i18nRef('sidebar.transcription'),
|
||||
},
|
||||
},
|
||||
{
|
||||
name: 'email',
|
||||
path: 'email',
|
||||
|
||||
+17
-49
@@ -23,7 +23,6 @@ import (
|
||||
agentpkg "github.com/memohai/memoh/internal/agent"
|
||||
"github.com/memohai/memoh/internal/agent/background"
|
||||
agenttools "github.com/memohai/memoh/internal/agent/tools"
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/bind"
|
||||
"github.com/memohai/memoh/internal/boot"
|
||||
"github.com/memohai/memoh/internal/bots"
|
||||
@@ -88,6 +87,7 @@ import (
|
||||
"github.com/memohai/memoh/internal/storage/providers/containerfs"
|
||||
"github.com/memohai/memoh/internal/storage/providers/fallback"
|
||||
"github.com/memohai/memoh/internal/storage/providers/localfs"
|
||||
ttspkg "github.com/memohai/memoh/internal/tts"
|
||||
"github.com/memohai/memoh/internal/version"
|
||||
"github.com/memohai/memoh/internal/workspace"
|
||||
)
|
||||
@@ -331,7 +331,7 @@ func provideChannelRouter(
|
||||
policyService *policy.Service,
|
||||
bindService *bind.Service,
|
||||
mediaService *media.Service,
|
||||
audioService *audiopkg.Service,
|
||||
ttsService *ttspkg.Service,
|
||||
settingsService *settings.Service,
|
||||
scheduleService *schedule.Service,
|
||||
mcpConnService *mcp.ConnectionService,
|
||||
@@ -372,8 +372,7 @@ func provideChannelRouter(
|
||||
processor.SetMediaService(mediaService)
|
||||
processor.SetStreamObserver(local.NewRouteHubBroadcaster(hub))
|
||||
processor.SetDispatcher(inbound.NewRouteDispatcher(log))
|
||||
processor.SetSpeechService(audioService, &settingsSpeechModelResolver{settings: settingsService})
|
||||
processor.SetTranscriptionService(&settingsTranscriptionAdapter{audio: audioService}, &settingsTranscriptionModelResolver{settings: settingsService})
|
||||
processor.SetTtsService(ttsService, &settingsTtsModelResolver{settings: settingsService})
|
||||
cmdHandler := command.NewHandler(
|
||||
log,
|
||||
&command.BotMemberRoleAdapter{BotService: botService},
|
||||
@@ -450,7 +449,7 @@ func provideBackgroundManager(log *slog.Logger) *background.Manager {
|
||||
return background.New(log)
|
||||
}
|
||||
|
||||
func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, mediaService *media.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, mcpConnService *mcp.ConnectionService, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, audioService *audiopkg.Service, sessionService *sessionpkg.Service, bgManager *background.Manager) []agenttools.ToolProvider {
|
||||
func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, mediaService *media.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, mcpConnService *mcp.ConnectionService, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service, sessionService *sessionpkg.Service, bgManager *background.Manager) []agenttools.ToolProvider {
|
||||
var assetResolver messaging.AssetResolver
|
||||
if mediaService != nil {
|
||||
assetResolver = &mediaAssetResolverAdapter{media: mediaService}
|
||||
@@ -468,8 +467,7 @@ func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *c
|
||||
agenttools.NewSpawnProvider(log, settingsService, modelsService, queries, sessionService),
|
||||
agenttools.NewSkillProvider(log),
|
||||
agenttools.NewBrowserProvider(log, settingsService, browserContextService, manager, cfg.BrowserGateway),
|
||||
agenttools.NewTTSProvider(log, settingsService, audioService, channelManager, registry),
|
||||
agenttools.NewTranscriptionProvider(log, settingsService, audioService, mediaService),
|
||||
agenttools.NewTTSProvider(log, settingsService, ttsService, channelManager, registry),
|
||||
agenttools.NewImageGenProvider(log, settingsService, modelsService, queries, manager, config.DefaultDataMount),
|
||||
agenttools.NewFederationProvider(log, fedSource),
|
||||
agenttools.NewHistoryProvider(log, sessionService, queries),
|
||||
@@ -513,23 +511,23 @@ func provideUsersHandler(log *slog.Logger, accountService *accounts.Service, ide
|
||||
return handlers.NewUsersHandler(log, accountService, identityService, botService, routeService, channelStore, channelLifecycle, channelManager, registry)
|
||||
}
|
||||
|
||||
func provideWebHandler(channelManager *channel.Manager, channelStore *channel.Store, chatService *conversation.Service, hub *local.RouteHub, botService *bots.Service, accountService *accounts.Service, resolver *flow.Resolver, mediaService *media.Service, audioService *audiopkg.Service, settingsService *settings.Service) *handlers.LocalChannelHandler {
|
||||
func provideWebHandler(channelManager *channel.Manager, channelStore *channel.Store, chatService *conversation.Service, hub *local.RouteHub, botService *bots.Service, accountService *accounts.Service, resolver *flow.Resolver, mediaService *media.Service, ttsService *ttspkg.Service, settingsService *settings.Service) *handlers.LocalChannelHandler {
|
||||
h := handlers.NewLocalChannelHandler(local.WebType, channelManager, channelStore, chatService, hub, botService, accountService)
|
||||
h.SetResolver(resolver)
|
||||
h.SetMediaService(mediaService)
|
||||
h.SetSpeechService(audioService, &settingsSpeechModelResolver{settings: settingsService})
|
||||
h.SetTtsService(ttsService, &settingsTtsModelResolver{settings: settingsService})
|
||||
return h
|
||||
}
|
||||
|
||||
func provideAudioRegistry() *audiopkg.Registry {
|
||||
return audiopkg.NewRegistry()
|
||||
func provideTtsRegistry() *ttspkg.Registry {
|
||||
return ttspkg.NewRegistry()
|
||||
}
|
||||
|
||||
func provideAudioTempStore() (*audiopkg.TempStore, error) {
|
||||
return audiopkg.NewTempStore(os.TempDir())
|
||||
func provideTtsTempStore() (*ttspkg.TempStore, error) {
|
||||
return ttspkg.NewTempStore(os.TempDir())
|
||||
}
|
||||
|
||||
func startAudioTempStoreCleanup(lc fx.Lifecycle, store *audiopkg.TempStore) {
|
||||
func startTtsTempStoreCleanup(lc fx.Lifecycle, store *ttspkg.TempStore) {
|
||||
done := make(chan struct{})
|
||||
lc.Append(fx.Hook{
|
||||
OnStart: func(_ context.Context) error {
|
||||
@@ -585,11 +583,11 @@ func (a *sessionEnsurerAdapter) CreateNewSession(ctx context.Context, botID, rou
|
||||
return inbound.SessionResult{ID: sess.ID, Type: sess.Type}, nil
|
||||
}
|
||||
|
||||
type settingsSpeechModelResolver struct {
|
||||
type settingsTtsModelResolver struct {
|
||||
settings *settings.Service
|
||||
}
|
||||
|
||||
func (r *settingsSpeechModelResolver) ResolveSpeechModelID(ctx context.Context, botID string) (string, error) {
|
||||
func (r *settingsTtsModelResolver) ResolveTtsModelID(ctx context.Context, botID string) (string, error) {
|
||||
s, err := r.settings.GetBot(ctx, botID)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -597,36 +595,6 @@ func (r *settingsSpeechModelResolver) ResolveSpeechModelID(ctx context.Context,
|
||||
return s.TtsModelID, nil
|
||||
}
|
||||
|
||||
type settingsTranscriptionModelResolver struct {
|
||||
settings *settings.Service
|
||||
}
|
||||
|
||||
func (r *settingsTranscriptionModelResolver) ResolveTranscriptionModelID(ctx context.Context, botID string) (string, error) {
|
||||
s, err := r.settings.GetBot(ctx, botID)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return s.TranscriptionModelID, nil
|
||||
}
|
||||
|
||||
type settingsTranscriptionAdapter struct {
|
||||
audio *audiopkg.Service
|
||||
}
|
||||
|
||||
type inboundTranscriptionResult struct {
|
||||
text string
|
||||
}
|
||||
|
||||
func (r inboundTranscriptionResult) GetText() string { return r.text }
|
||||
|
||||
func (a *settingsTranscriptionAdapter) Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (inbound.TranscriptionResult, error) {
|
||||
result, err := a.audio.Transcribe(ctx, modelID, audio, filename, contentType, overrideCfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return inboundTranscriptionResult{text: result.Text}, nil
|
||||
}
|
||||
|
||||
func provideEmailRegistry(log *slog.Logger, tokenStore *emailpkg.DBOAuthTokenStore) *emailpkg.Registry {
|
||||
reg := emailpkg.NewRegistry()
|
||||
reg.Register(emailgeneric.New(log))
|
||||
@@ -716,11 +684,11 @@ func startRegistrySync(lc fx.Lifecycle, log *slog.Logger, cfg config.Config, que
|
||||
})
|
||||
}
|
||||
|
||||
func startAudioProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, queries *dbsqlc.Queries, registry *audiopkg.Registry) {
|
||||
func startSpeechProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, queries *dbsqlc.Queries, registry *ttspkg.Registry) {
|
||||
lc.Append(fx.Hook{
|
||||
OnStart: func(ctx context.Context) error {
|
||||
if err := audiopkg.SyncRegistry(ctx, log, queries, registry); err != nil {
|
||||
log.Warn("audio registry bootstrap failed", slog.Any("error", err))
|
||||
if err := ttspkg.SyncRegistry(ctx, log, queries, registry); err != nil {
|
||||
log.Warn("speech registry bootstrap failed", slog.Any("error", err))
|
||||
}
|
||||
return nil
|
||||
},
|
||||
|
||||
+8
-8
@@ -8,7 +8,6 @@ import (
|
||||
|
||||
"github.com/memohai/memoh/internal/accounts"
|
||||
"github.com/memohai/memoh/internal/acl"
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/bind"
|
||||
"github.com/memohai/memoh/internal/boot"
|
||||
"github.com/memohai/memoh/internal/bots"
|
||||
@@ -30,6 +29,7 @@ import (
|
||||
"github.com/memohai/memoh/internal/schedule"
|
||||
"github.com/memohai/memoh/internal/searchproviders"
|
||||
"github.com/memohai/memoh/internal/settings"
|
||||
ttspkg "github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
func runServe() {
|
||||
@@ -63,9 +63,9 @@ func options() fx.Option {
|
||||
identities.NewService,
|
||||
bind.NewService,
|
||||
event.NewHub,
|
||||
provideAudioRegistry,
|
||||
audiopkg.NewService,
|
||||
provideAudioTempStore,
|
||||
provideTtsRegistry,
|
||||
ttspkg.NewService,
|
||||
provideTtsTempStore,
|
||||
emailpkg.NewDBOAuthTokenStore,
|
||||
provideEmailRegistry,
|
||||
emailpkg.NewService,
|
||||
@@ -121,8 +121,8 @@ func options() fx.Option {
|
||||
provideServerHandler(weixin.NewQRServerHandler),
|
||||
provideServerHandler(provideUsersHandler),
|
||||
provideServerHandler(handlers.NewMemoryProvidersHandler),
|
||||
provideServerHandler(handlers.NewAudioHandler),
|
||||
provideServerHandler(handlers.NewBotAudioHandler),
|
||||
provideServerHandler(handlers.NewSpeechHandler),
|
||||
provideServerHandler(handlers.NewBotTtsHandler),
|
||||
provideServerHandler(handlers.NewEmailProvidersHandler),
|
||||
provideServerHandler(handlers.NewEmailBindingsHandler),
|
||||
provideServerHandler(handlers.NewEmailOutboxHandler),
|
||||
@@ -141,7 +141,7 @@ func options() fx.Option {
|
||||
fx.Invoke(
|
||||
injectToolProviders,
|
||||
startRegistrySync,
|
||||
startAudioProviderBootstrap,
|
||||
startSpeechProviderBootstrap,
|
||||
startMemoryProviderBootstrap,
|
||||
startSearchProviderBootstrap,
|
||||
startScheduleService,
|
||||
@@ -151,7 +151,7 @@ func options() fx.Option {
|
||||
startEmailManager,
|
||||
startContainerReconciliation,
|
||||
startBackgroundTaskCleanup,
|
||||
startAudioTempStoreCleanup,
|
||||
startTtsTempStoreCleanup,
|
||||
startServer,
|
||||
),
|
||||
fx.WithLogger(func(logger *slog.Logger) fxevent.Logger {
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
name: Deepgram Transcription
|
||||
client_type: deepgram-transcription
|
||||
icon: deepgram
|
||||
base_url: https://api.deepgram.com
|
||||
|
||||
models:
|
||||
- model_id: nova-3
|
||||
name: Nova-3
|
||||
type: transcription
|
||||
@@ -1,9 +0,0 @@
|
||||
name: ElevenLabs Transcription
|
||||
client_type: elevenlabs-transcription
|
||||
icon: elevenlabs
|
||||
base_url: https://api.elevenlabs.io
|
||||
|
||||
models:
|
||||
- model_id: scribe_v2
|
||||
name: Scribe v2
|
||||
type: transcription
|
||||
@@ -1,9 +0,0 @@
|
||||
name: Google Transcription
|
||||
client_type: google-transcription
|
||||
icon: google-color
|
||||
base_url: https://generativelanguage.googleapis.com/v1beta
|
||||
|
||||
models:
|
||||
- model_id: gemini-2.5-flash
|
||||
name: Gemini 2.5 Flash
|
||||
type: transcription
|
||||
@@ -1,9 +0,0 @@
|
||||
name: OpenAI Transcription
|
||||
client_type: openai-transcription
|
||||
icon: openai
|
||||
base_url: https://api.openai.com/v1
|
||||
|
||||
models:
|
||||
- model_id: gpt-4o-mini-transcribe
|
||||
name: GPT-4o Mini Transcribe
|
||||
type: transcription
|
||||
@@ -1,9 +0,0 @@
|
||||
name: OpenRouter Transcription
|
||||
client_type: openrouter-transcription
|
||||
icon: openrouter
|
||||
base_url: https://openrouter.ai/api/v1
|
||||
|
||||
models:
|
||||
- model_id: openai/gpt-4o-mini-transcribe
|
||||
name: OpenRouter Transcription
|
||||
type: transcription
|
||||
@@ -77,19 +77,13 @@ CREATE TABLE IF NOT EXISTS providers (
|
||||
'github-copilot',
|
||||
'edge-speech',
|
||||
'openai-speech',
|
||||
'openai-transcription',
|
||||
'openrouter-speech',
|
||||
'openrouter-transcription',
|
||||
'elevenlabs-speech',
|
||||
'elevenlabs-transcription',
|
||||
'deepgram-speech',
|
||||
'deepgram-transcription',
|
||||
'minimax-speech',
|
||||
'volcengine-speech',
|
||||
'alibabacloud-speech',
|
||||
'microsoft-speech',
|
||||
'google-speech',
|
||||
'google-transcription'
|
||||
'microsoft-speech'
|
||||
))
|
||||
);
|
||||
|
||||
@@ -114,7 +108,7 @@ CREATE TABLE IF NOT EXISTS models (
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
CONSTRAINT models_provider_id_model_id_unique UNIQUE (provider_id, model_id),
|
||||
CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech', 'transcription'))
|
||||
CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech'))
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS model_variants (
|
||||
@@ -176,7 +170,6 @@ CREATE TABLE IF NOT EXISTS bots (
|
||||
image_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
|
||||
discuss_probe_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
|
||||
tts_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
|
||||
transcription_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
|
||||
browser_context_id UUID REFERENCES browser_contexts(id) ON DELETE SET NULL,
|
||||
persist_full_tool_results BOOLEAN NOT NULL DEFAULT false,
|
||||
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
-- 0069_add_transcription_models_and_speech_domain
|
||||
-- Revert transcription model type and speech-domain expansion.
|
||||
|
||||
DELETE FROM models WHERE type = 'transcription';
|
||||
DELETE FROM providers WHERE client_type = 'google-speech';
|
||||
|
||||
ALTER TABLE models
|
||||
DROP CONSTRAINT IF EXISTS models_type_check;
|
||||
|
||||
ALTER TABLE models
|
||||
ADD CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech'));
|
||||
|
||||
ALTER TABLE providers
|
||||
DROP CONSTRAINT IF EXISTS providers_client_type_check;
|
||||
|
||||
ALTER TABLE providers
|
||||
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
|
||||
'openai-responses',
|
||||
'openai-completions',
|
||||
'anthropic-messages',
|
||||
'google-generative-ai',
|
||||
'openai-codex',
|
||||
'github-copilot',
|
||||
'edge-speech',
|
||||
'openai-speech',
|
||||
'openrouter-speech',
|
||||
'elevenlabs-speech',
|
||||
'deepgram-speech',
|
||||
'minimax-speech',
|
||||
'volcengine-speech',
|
||||
'alibabacloud-speech',
|
||||
'microsoft-speech'
|
||||
));
|
||||
@@ -1,31 +0,0 @@
|
||||
-- 0069_add_transcription_models_and_speech_domain
|
||||
-- Expand the speech domain to support transcription models and shared speech providers.
|
||||
|
||||
ALTER TABLE providers
|
||||
DROP CONSTRAINT IF EXISTS providers_client_type_check;
|
||||
|
||||
ALTER TABLE providers
|
||||
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
|
||||
'openai-responses',
|
||||
'openai-completions',
|
||||
'anthropic-messages',
|
||||
'google-generative-ai',
|
||||
'openai-codex',
|
||||
'github-copilot',
|
||||
'edge-speech',
|
||||
'openai-speech',
|
||||
'openrouter-speech',
|
||||
'elevenlabs-speech',
|
||||
'deepgram-speech',
|
||||
'minimax-speech',
|
||||
'volcengine-speech',
|
||||
'alibabacloud-speech',
|
||||
'microsoft-speech',
|
||||
'google-speech'
|
||||
));
|
||||
|
||||
ALTER TABLE models
|
||||
DROP CONSTRAINT IF EXISTS models_type_check;
|
||||
|
||||
ALTER TABLE models
|
||||
ADD CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech', 'transcription'));
|
||||
@@ -1,8 +0,0 @@
|
||||
-- 0070_add_bot_transcription_model
|
||||
-- Remove bots.transcription_model_id.
|
||||
|
||||
ALTER TABLE bots
|
||||
DROP CONSTRAINT IF EXISTS bots_transcription_model_id_fkey;
|
||||
|
||||
ALTER TABLE bots
|
||||
DROP COLUMN IF EXISTS transcription_model_id;
|
||||
@@ -1,5 +0,0 @@
|
||||
-- 0070_add_bot_transcription_model
|
||||
-- Add bots.transcription_model_id for bot-level speech-to-text defaults.
|
||||
|
||||
ALTER TABLE bots
|
||||
ADD COLUMN IF NOT EXISTS transcription_model_id UUID REFERENCES models(id) ON DELETE SET NULL;
|
||||
@@ -1,33 +0,0 @@
|
||||
-- 0071_split_transcription_providers
|
||||
-- Remove dedicated transcription provider client types.
|
||||
|
||||
DELETE FROM providers
|
||||
WHERE client_type IN (
|
||||
'openai-transcription',
|
||||
'openrouter-transcription',
|
||||
'elevenlabs-transcription',
|
||||
'deepgram-transcription',
|
||||
'google-transcription'
|
||||
);
|
||||
|
||||
ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
|
||||
|
||||
ALTER TABLE providers
|
||||
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
|
||||
'openai-responses',
|
||||
'openai-completions',
|
||||
'anthropic-messages',
|
||||
'google-generative-ai',
|
||||
'openai-codex',
|
||||
'github-copilot',
|
||||
'edge-speech',
|
||||
'openai-speech',
|
||||
'openrouter-speech',
|
||||
'elevenlabs-speech',
|
||||
'deepgram-speech',
|
||||
'minimax-speech',
|
||||
'volcengine-speech',
|
||||
'alibabacloud-speech',
|
||||
'microsoft-speech',
|
||||
'google-speech'
|
||||
));
|
||||
@@ -1,29 +0,0 @@
|
||||
-- 0071_split_transcription_providers
|
||||
-- Add dedicated transcription provider client types.
|
||||
|
||||
ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
|
||||
|
||||
ALTER TABLE providers
|
||||
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
|
||||
'openai-responses',
|
||||
'openai-completions',
|
||||
'anthropic-messages',
|
||||
'google-generative-ai',
|
||||
'openai-codex',
|
||||
'github-copilot',
|
||||
'edge-speech',
|
||||
'openai-speech',
|
||||
'openai-transcription',
|
||||
'openrouter-speech',
|
||||
'openrouter-transcription',
|
||||
'elevenlabs-speech',
|
||||
'elevenlabs-transcription',
|
||||
'deepgram-speech',
|
||||
'deepgram-transcription',
|
||||
'minimax-speech',
|
||||
'volcengine-speech',
|
||||
'alibabacloud-speech',
|
||||
'microsoft-speech',
|
||||
'google-speech',
|
||||
'google-transcription'
|
||||
));
|
||||
+11
-61
@@ -16,27 +16,18 @@ SELECT * FROM providers WHERE id = sqlc.arg(id);
|
||||
-- name: GetProviderByName :one
|
||||
SELECT * FROM providers WHERE name = sqlc.arg(name);
|
||||
|
||||
-- name: GetProviderByClientType :one
|
||||
SELECT * FROM providers WHERE client_type = sqlc.arg(client_type);
|
||||
|
||||
-- name: ListProviders :many
|
||||
SELECT * FROM providers
|
||||
WHERE client_type NOT IN (
|
||||
'edge-speech',
|
||||
'openai-speech',
|
||||
'openai-transcription',
|
||||
'openrouter-speech',
|
||||
'openrouter-transcription',
|
||||
'elevenlabs-speech',
|
||||
'elevenlabs-transcription',
|
||||
'deepgram-speech',
|
||||
'deepgram-transcription',
|
||||
'minimax-speech',
|
||||
'volcengine-speech',
|
||||
'alibabacloud-speech',
|
||||
'microsoft-speech',
|
||||
'google-speech',
|
||||
'google-transcription'
|
||||
'microsoft-speech'
|
||||
)
|
||||
ORDER BY created_at DESC;
|
||||
|
||||
@@ -62,19 +53,13 @@ FROM providers
|
||||
WHERE client_type NOT IN (
|
||||
'edge-speech',
|
||||
'openai-speech',
|
||||
'openai-transcription',
|
||||
'openrouter-speech',
|
||||
'openrouter-transcription',
|
||||
'elevenlabs-speech',
|
||||
'elevenlabs-transcription',
|
||||
'deepgram-speech',
|
||||
'deepgram-transcription',
|
||||
'minimax-speech',
|
||||
'volcengine-speech',
|
||||
'alibabacloud-speech',
|
||||
'microsoft-speech',
|
||||
'google-speech',
|
||||
'google-transcription'
|
||||
'microsoft-speech'
|
||||
);
|
||||
|
||||
-- name: CreateModel :one
|
||||
@@ -101,7 +86,7 @@ ORDER BY created_at DESC;
|
||||
|
||||
-- name: ListModels :many
|
||||
SELECT * FROM models
|
||||
WHERE type NOT IN ('speech', 'transcription')
|
||||
WHERE type != 'speech'
|
||||
ORDER BY created_at DESC;
|
||||
|
||||
-- name: ListModelsByType :many
|
||||
@@ -112,7 +97,7 @@ ORDER BY created_at DESC;
|
||||
-- name: ListModelsByProviderID :many
|
||||
SELECT * FROM models
|
||||
WHERE provider_id = sqlc.arg(provider_id)
|
||||
AND type NOT IN ('speech', 'transcription')
|
||||
AND type != 'speech'
|
||||
ORDER BY created_at DESC;
|
||||
|
||||
-- name: ListModelsByProviderIDAndType :many
|
||||
@@ -151,15 +136,9 @@ DELETE FROM models
|
||||
WHERE provider_id = sqlc.arg(provider_id)
|
||||
AND model_id = sqlc.arg(model_id);
|
||||
|
||||
-- name: DeleteModelByProviderAndType :exec
|
||||
DELETE FROM models
|
||||
WHERE provider_id = sqlc.arg(provider_id)
|
||||
AND model_id = sqlc.arg(model_id)
|
||||
AND type = sqlc.arg(type);
|
||||
|
||||
-- name: CountModels :one
|
||||
SELECT COUNT(*) FROM models
|
||||
WHERE type NOT IN ('speech', 'transcription');
|
||||
WHERE type != 'speech';
|
||||
|
||||
-- name: CountModelsByType :one
|
||||
SELECT COUNT(*) FROM models WHERE type = sqlc.arg(type);
|
||||
@@ -171,6 +150,11 @@ VALUES (sqlc.arg(name), sqlc.arg(client_type), sqlc.arg(icon), false, sqlc.arg(c
|
||||
ON CONFLICT (name) DO UPDATE SET
|
||||
icon = EXCLUDED.icon,
|
||||
client_type = EXCLUDED.client_type,
|
||||
config = CASE
|
||||
WHEN providers.config->>'api_key' IS NOT NULL AND providers.config->>'api_key' != ''
|
||||
THEN jsonb_set(EXCLUDED.config, '{api_key}', providers.config->'api_key')
|
||||
ELSE EXCLUDED.config
|
||||
END,
|
||||
updated_at = now()
|
||||
RETURNING *;
|
||||
|
||||
@@ -189,7 +173,7 @@ SELECT m.*
|
||||
FROM models m
|
||||
JOIN providers p ON m.provider_id = p.id
|
||||
WHERE p.enable = true
|
||||
AND m.type NOT IN ('speech', 'transcription')
|
||||
AND m.type != 'speech'
|
||||
ORDER BY m.created_at DESC;
|
||||
|
||||
-- name: ListEnabledModelsByType :many
|
||||
@@ -247,17 +231,6 @@ WHERE client_type IN (
|
||||
)
|
||||
ORDER BY created_at DESC;
|
||||
|
||||
-- name: ListTranscriptionProviders :many
|
||||
SELECT * FROM providers
|
||||
WHERE client_type IN (
|
||||
'openai-transcription',
|
||||
'openrouter-transcription',
|
||||
'elevenlabs-transcription',
|
||||
'deepgram-transcription',
|
||||
'google-transcription'
|
||||
)
|
||||
ORDER BY created_at DESC;
|
||||
|
||||
-- name: ListSpeechModels :many
|
||||
SELECT m.*,
|
||||
p.client_type AS provider_type
|
||||
@@ -277,26 +250,3 @@ SELECT * FROM models
|
||||
WHERE provider_id = sqlc.arg(provider_id)
|
||||
AND model_id = sqlc.arg(model_id)
|
||||
LIMIT 1;
|
||||
|
||||
-- name: GetTranscriptionModelWithProvider :one
|
||||
SELECT
|
||||
m.*,
|
||||
p.client_type AS provider_type
|
||||
FROM models m
|
||||
JOIN providers p ON p.id = m.provider_id
|
||||
WHERE m.id = sqlc.arg(id)
|
||||
AND m.type = 'transcription';
|
||||
|
||||
-- name: ListTranscriptionModels :many
|
||||
SELECT m.*,
|
||||
p.client_type AS provider_type
|
||||
FROM models m
|
||||
JOIN providers p ON p.id = m.provider_id
|
||||
WHERE m.type = 'transcription'
|
||||
ORDER BY m.created_at DESC;
|
||||
|
||||
-- name: ListTranscriptionModelsByProviderID :many
|
||||
SELECT * FROM models
|
||||
WHERE provider_id = sqlc.arg(provider_id)
|
||||
AND type = 'transcription'
|
||||
ORDER BY created_at DESC;
|
||||
|
||||
@@ -19,7 +19,6 @@ SELECT
|
||||
memory_providers.id AS memory_provider_id,
|
||||
image_models.id AS image_model_id,
|
||||
tts_models.id AS tts_model_id,
|
||||
transcription_models.id AS transcription_model_id,
|
||||
browser_contexts.id AS browser_context_id,
|
||||
bots.persist_full_tool_results
|
||||
FROM bots
|
||||
@@ -31,7 +30,6 @@ LEFT JOIN models AS image_models ON image_models.id = bots.image_model_id
|
||||
LEFT JOIN search_providers ON search_providers.id = bots.search_provider_id
|
||||
LEFT JOIN memory_providers ON memory_providers.id = bots.memory_provider_id
|
||||
LEFT JOIN models AS tts_models ON tts_models.id = bots.tts_model_id
|
||||
LEFT JOIN models AS transcription_models ON transcription_models.id = bots.transcription_model_id
|
||||
LEFT JOIN browser_contexts ON browser_contexts.id = bots.browser_context_id
|
||||
WHERE bots.id = $1;
|
||||
|
||||
@@ -56,12 +54,11 @@ WITH updated AS (
|
||||
memory_provider_id = COALESCE(sqlc.narg(memory_provider_id)::uuid, bots.memory_provider_id),
|
||||
image_model_id = COALESCE(sqlc.narg(image_model_id)::uuid, bots.image_model_id),
|
||||
tts_model_id = COALESCE(sqlc.narg(tts_model_id)::uuid, bots.tts_model_id),
|
||||
transcription_model_id = COALESCE(sqlc.narg(transcription_model_id)::uuid, bots.transcription_model_id),
|
||||
browser_context_id = COALESCE(sqlc.narg(browser_context_id)::uuid, bots.browser_context_id),
|
||||
persist_full_tool_results = sqlc.arg(persist_full_tool_results),
|
||||
updated_at = now()
|
||||
WHERE bots.id = sqlc.arg(id)
|
||||
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.transcription_model_id, bots.browser_context_id, bots.persist_full_tool_results
|
||||
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.browser_context_id, bots.persist_full_tool_results
|
||||
)
|
||||
SELECT
|
||||
updated.id AS bot_id,
|
||||
@@ -83,7 +80,6 @@ SELECT
|
||||
memory_providers.id AS memory_provider_id,
|
||||
image_models.id AS image_model_id,
|
||||
tts_models.id AS tts_model_id,
|
||||
transcription_models.id AS transcription_model_id,
|
||||
browser_contexts.id AS browser_context_id,
|
||||
updated.persist_full_tool_results
|
||||
FROM updated
|
||||
@@ -95,7 +91,6 @@ LEFT JOIN models AS image_models ON image_models.id = updated.image_model_id
|
||||
LEFT JOIN search_providers ON search_providers.id = updated.search_provider_id
|
||||
LEFT JOIN memory_providers ON memory_providers.id = updated.memory_provider_id
|
||||
LEFT JOIN models AS tts_models ON tts_models.id = updated.tts_model_id
|
||||
LEFT JOIN models AS transcription_models ON transcription_models.id = updated.transcription_model_id
|
||||
LEFT JOIN browser_contexts ON browser_contexts.id = updated.browser_context_id;
|
||||
|
||||
-- name: DeleteSettingsByBotID :exec
|
||||
@@ -117,7 +112,6 @@ SET language = 'auto',
|
||||
search_provider_id = NULL,
|
||||
memory_provider_id = NULL,
|
||||
tts_model_id = NULL,
|
||||
transcription_model_id = NULL,
|
||||
browser_context_id = NULL,
|
||||
persist_full_tool_results = false,
|
||||
updated_at = now()
|
||||
|
||||
@@ -72,7 +72,8 @@ func TestSpawnAndNotify(t *testing.T) {
|
||||
task := mgr.Get(taskID)
|
||||
if task == nil {
|
||||
t.Fatal("task not found after completion")
|
||||
} else if task.Status != TaskCompleted {
|
||||
}
|
||||
if task.Status != TaskCompleted {
|
||||
t.Errorf("expected task status completed, got %s", task.Status)
|
||||
}
|
||||
}
|
||||
@@ -129,7 +130,8 @@ func TestKillTask(t *testing.T) {
|
||||
task := mgr.Get(taskID)
|
||||
if task == nil {
|
||||
t.Fatal("task not found")
|
||||
} else if task.Status != TaskKilled {
|
||||
}
|
||||
if task.Status != TaskKilled {
|
||||
t.Errorf("expected status killed, got %s", task.Status)
|
||||
}
|
||||
|
||||
|
||||
@@ -84,7 +84,7 @@ func retryDelay(attempt int, cfg RetryConfig) time.Duration {
|
||||
if backoffIdx > 20 {
|
||||
backoffIdx = 20
|
||||
}
|
||||
delay := cfg.BaseDelay * time.Duration(1<<backoffIdx)
|
||||
delay := cfg.BaseDelay * time.Duration(1<<uint(backoffIdx))
|
||||
delay = min(delay, cfg.MaxDelay)
|
||||
// Add jitter: random value in [0, delay/2), so final delay is in [delay/2, delay).
|
||||
// math/rand is intentional here — cryptographic randomness is not needed for backoff jitter.
|
||||
|
||||
@@ -295,7 +295,7 @@ func (p *ContainerProvider) execRead(ctx context.Context, session SessionContext
|
||||
content += "\n"
|
||||
}
|
||||
|
||||
content = addLineNumbers(content, lineOffset)
|
||||
content = addLineNumbers(content, int32(lineOffset))
|
||||
return map[string]any{"content": content, "total_lines": totalLines}, nil
|
||||
}
|
||||
|
||||
@@ -757,7 +757,7 @@ func truncateStr(s string, n int) string {
|
||||
return s[:n] + "..."
|
||||
}
|
||||
|
||||
func addLineNumbers(content string, startLine int) string {
|
||||
func addLineNumbers(content string, startLine int32) string {
|
||||
if content == "" {
|
||||
return content
|
||||
}
|
||||
@@ -765,7 +765,7 @@ func addLineNumbers(content string, startLine int) string {
|
||||
var out strings.Builder
|
||||
out.Grow(len(content) + len(lines)*8)
|
||||
for i, line := range lines {
|
||||
fmt.Fprintf(&out, "%6d\t%s\n", startLine+i, line)
|
||||
fmt.Fprintf(&out, "%6d\t%s\n", int(startLine)+i, line)
|
||||
}
|
||||
return out.String()
|
||||
}
|
||||
|
||||
@@ -1,232 +0,0 @@
|
||||
//nolint:gosec
|
||||
package tools
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
sdk "github.com/memohai/twilight-ai/sdk"
|
||||
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/media"
|
||||
"github.com/memohai/memoh/internal/settings"
|
||||
)
|
||||
|
||||
// mediaDataPrefix is the leading path segment that loadAudioFromPath strips
// from a tool-supplied path to obtain the media storage key.
const mediaDataPrefix = "/data/media/"
|
||||
|
||||
// TranscriptionProvider is the tool provider behind the transcribe_audio tool.
// It holds the services used to look up bot settings, run transcription and
// read stored media, plus the HTTP client used to download audio from URLs.
type TranscriptionProvider struct {
|
||||
logger *slog.Logger
|
||||
settings *settings.Service
|
||||
audio *audiopkg.Service
|
||||
media *media.Service
|
||||
http *http.Client
|
||||
}
|
||||
|
||||
func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
|
||||
if log == nil {
|
||||
log = slog.Default()
|
||||
}
|
||||
return &TranscriptionProvider{
|
||||
logger: log.With(slog.String("tool", "transcribe_audio")),
|
||||
settings: settingsSvc,
|
||||
audio: audioSvc,
|
||||
media: mediaSvc,
|
||||
http: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
return errors.New("stopped after 10 redirects")
|
||||
}
|
||||
if _, err := validateURL(req.Context(), req.URL.String()); err != nil {
|
||||
return fmt.Errorf("redirect to non-public address is not allowed: %w", err)
|
||||
}
|
||||
return nil
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (p *TranscriptionProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
|
||||
if session.IsSubagent || p.settings == nil || p.audio == nil || p.media == nil {
|
||||
return nil, nil
|
||||
}
|
||||
botID := strings.TrimSpace(session.BotID)
|
||||
if botID == "" {
|
||||
return nil, nil
|
||||
}
|
||||
botSettings, err := p.settings.GetBot(ctx, botID)
|
||||
if err != nil || strings.TrimSpace(botSettings.TranscriptionModelID) == "" {
|
||||
return nil, nil
|
||||
}
|
||||
sess := session
|
||||
return []sdk.Tool{{
|
||||
Name: "transcribe_audio",
|
||||
Description: "Transcribe an audio or voice message into text. Use this when the user sent a voice message and you need to understand its contents. Accepts a bot media path such as /data/media/... or a direct URL.",
|
||||
Parameters: map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"path": map[string]any{"type": "string", "description": "Audio file path from the message context, usually under /data/media/..."},
|
||||
"url": map[string]any{"type": "string", "description": "Direct audio URL when a path is unavailable"},
|
||||
"language": map[string]any{"type": "string", "description": "Optional language hint"},
|
||||
"prompt": map[string]any{"type": "string", "description": "Optional transcription prompt"},
|
||||
"contentType": map[string]any{"type": "string", "description": "Optional MIME type override"},
|
||||
},
|
||||
"required": []string{},
|
||||
},
|
||||
Execute: func(execCtx *sdk.ToolExecContext, input any) (any, error) {
|
||||
return p.execTranscribe(execCtx.Context, sess, inputAsMap(input))
|
||||
},
|
||||
}}, nil
|
||||
}
|
||||
|
||||
func (p *TranscriptionProvider) execTranscribe(ctx context.Context, session SessionContext, args map[string]any) (any, error) {
|
||||
botID := strings.TrimSpace(session.BotID)
|
||||
if botID == "" {
|
||||
return nil, errors.New("bot_id is required")
|
||||
}
|
||||
botSettings, err := p.settings.GetBot(ctx, botID)
|
||||
if err != nil {
|
||||
return nil, errors.New("failed to load bot settings")
|
||||
}
|
||||
modelID := strings.TrimSpace(botSettings.TranscriptionModelID)
|
||||
if modelID == "" {
|
||||
return nil, errors.New("bot has no transcription model configured")
|
||||
}
|
||||
|
||||
path := FirstStringArg(args, "path", "audio_path", "file_path")
|
||||
rawURL := FirstStringArg(args, "url", "audio_url")
|
||||
if path == "" && rawURL == "" {
|
||||
return nil, errors.New("path or url is required")
|
||||
}
|
||||
|
||||
audio, filename, contentType, err := p.loadAudio(ctx, botID, path, rawURL, FirstStringArg(args, "contentType", "content_type"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
override := map[string]any{}
|
||||
if language := FirstStringArg(args, "language"); language != "" {
|
||||
override["language"] = language
|
||||
}
|
||||
if prompt := FirstStringArg(args, "prompt"); prompt != "" {
|
||||
override["prompt"] = prompt
|
||||
}
|
||||
result, err := p.audio.Transcribe(ctx, modelID, audio, filename, contentType, override)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return map[string]any{
|
||||
"ok": true,
|
||||
"text": result.Text,
|
||||
"language": result.Language,
|
||||
"duration_seconds": result.DurationSeconds,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p *TranscriptionProvider) loadAudio(ctx context.Context, botID, pathValue, rawURL, contentTypeOverride string) ([]byte, string, string, error) {
|
||||
if pathValue != "" {
|
||||
return p.loadAudioFromPath(ctx, botID, pathValue, contentTypeOverride)
|
||||
}
|
||||
u, err := validateURL(ctx, rawURL)
|
||||
if err != nil {
|
||||
return nil, "", "", err
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
|
||||
if err != nil {
|
||||
return nil, "", "", err
|
||||
}
|
||||
resp, err := p.http.Do(req)
|
||||
if err != nil {
|
||||
return nil, "", "", err
|
||||
}
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
_ = resp.Body.Close()
|
||||
return nil, "", "", fmt.Errorf("download audio: unexpected status %d", resp.StatusCode)
|
||||
}
|
||||
defer func(body io.ReadCloser) {
|
||||
if closeErr := body.Close(); closeErr != nil {
|
||||
p.logger.Warn("failed to close audio response body", slog.Any("error", closeErr))
|
||||
}
|
||||
}(resp.Body)
|
||||
audio, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, "", "", err
|
||||
}
|
||||
contentType := strings.TrimSpace(contentTypeOverride)
|
||||
if contentType == "" {
|
||||
contentType = strings.TrimSpace(resp.Header.Get("Content-Type"))
|
||||
}
|
||||
return audio, filepath.Base(strings.TrimSpace(req.URL.Path)), contentType, nil
|
||||
}
|
||||
|
||||
func (p *TranscriptionProvider) loadAudioFromPath(ctx context.Context, botID, pathValue, contentTypeOverride string) ([]byte, string, string, error) {
|
||||
storageKey := strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(pathValue), mediaDataPrefix))
|
||||
if storageKey == "" || storageKey == strings.TrimSpace(pathValue) {
|
||||
return nil, "", "", fmt.Errorf("unsupported media path: %s", pathValue)
|
||||
}
|
||||
asset, err := p.media.GetByStorageKey(ctx, botID, storageKey)
|
||||
if err != nil {
|
||||
return nil, "", "", err
|
||||
}
|
||||
reader, _, err := p.media.Open(ctx, botID, asset.ContentHash)
|
||||
if err != nil {
|
||||
return nil, "", "", err
|
||||
}
|
||||
defer func(reader io.ReadCloser) {
|
||||
if closeErr := reader.Close(); closeErr != nil {
|
||||
p.logger.Warn("failed to close media reader", slog.Any("error", closeErr))
|
||||
}
|
||||
}(reader)
|
||||
audio, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
return nil, "", "", err
|
||||
}
|
||||
contentType := strings.TrimSpace(contentTypeOverride)
|
||||
if contentType == "" {
|
||||
contentType = strings.TrimSpace(asset.Mime)
|
||||
}
|
||||
return audio, filepath.Base(storageKey), contentType, nil
|
||||
}
|
||||
|
||||
func validateURL(ctx context.Context, rawURL string) (*url.URL, error) {
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid url: %w", err)
|
||||
}
|
||||
|
||||
if u.Scheme != "http" && u.Scheme != "https" {
|
||||
return nil, fmt.Errorf("unsupported scheme: %s", u.Scheme)
|
||||
}
|
||||
|
||||
hostname := u.Hostname()
|
||||
if hostname == "" {
|
||||
return nil, errors.New("missing hostname in url")
|
||||
}
|
||||
|
||||
resolver := net.Resolver{}
|
||||
ips, err := resolver.LookupIPAddr(ctx, hostname)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("dns lookup failed for %s: %w", hostname, err)
|
||||
}
|
||||
|
||||
if len(ips) == 0 {
|
||||
return nil, fmt.Errorf("no ip addresses found for %s", hostname)
|
||||
}
|
||||
|
||||
for _, ip := range ips {
|
||||
if ip.IP.IsLoopback() || ip.IP.IsPrivate() || ip.IP.IsLinkLocalUnicast() || ip.IP.IsLinkLocalMulticast() {
|
||||
return nil, fmt.Errorf("url resolves to a non-public ip address: %s", ip.IP.String())
|
||||
}
|
||||
}
|
||||
|
||||
return u, nil
|
||||
}
|
||||
@@ -10,9 +10,9 @@ import (
|
||||
|
||||
sdk "github.com/memohai/twilight-ai/sdk"
|
||||
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/channel"
|
||||
"github.com/memohai/memoh/internal/settings"
|
||||
ttspkg "github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
const ttsMaxTextLen = 500
|
||||
@@ -30,26 +30,26 @@ type TTSChannelResolver interface {
|
||||
type TTSProvider struct {
|
||||
logger *slog.Logger
|
||||
settings *settings.Service
|
||||
audio *audiopkg.Service
|
||||
tts *ttspkg.Service
|
||||
sender TTSSender
|
||||
resolver TTSChannelResolver
|
||||
}
|
||||
|
||||
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
|
||||
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
|
||||
if log == nil {
|
||||
log = slog.Default()
|
||||
}
|
||||
return &TTSProvider{
|
||||
logger: log.With(slog.String("tool", "tts")),
|
||||
settings: settingsSvc,
|
||||
audio: audioSvc,
|
||||
tts: ttsSvc,
|
||||
sender: sender,
|
||||
resolver: resolver,
|
||||
}
|
||||
}
|
||||
|
||||
func (p *TTSProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
|
||||
if session.IsSubagent || p.settings == nil || p.audio == nil || p.sender == nil || p.resolver == nil {
|
||||
if session.IsSubagent || p.settings == nil || p.tts == nil || p.sender == nil || p.resolver == nil {
|
||||
return nil, nil
|
||||
}
|
||||
botID := strings.TrimSpace(session.BotID)
|
||||
@@ -115,7 +115,7 @@ func (p *TTSProvider) execSpeak(ctx context.Context, session SessionContext, arg
|
||||
if botSettings.TtsModelID == "" {
|
||||
return nil, errors.New("bot has no TTS model configured")
|
||||
}
|
||||
audioData, contentType, synthErr := p.audio.Synthesize(ctx, botSettings.TtsModelID, text, nil)
|
||||
audioData, contentType, synthErr := p.tts.Synthesize(ctx, botSettings.TtsModelID, text, nil)
|
||||
if synthErr != nil {
|
||||
return nil, fmt.Errorf("speech synthesis failed: %s", synthErr.Error())
|
||||
}
|
||||
|
||||
@@ -1,100 +0,0 @@
|
||||
package audio
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgtype"
|
||||
|
||||
"github.com/memohai/memoh/internal/db/sqlc"
|
||||
"github.com/memohai/memoh/internal/models"
|
||||
)
|
||||
|
||||
func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
|
||||
for _, def := range registry.List() {
|
||||
provider, err := queries.GetProviderByClientType(ctx, string(def.ClientType))
|
||||
if err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
if logger != nil {
|
||||
logger.Warn("audio registry skipped provider without template",
|
||||
slog.String("provider", string(def.ClientType)),
|
||||
slog.String("display_name", def.DisplayName))
|
||||
}
|
||||
continue
|
||||
}
|
||||
if logger != nil {
|
||||
logger.Warn("audio registry failed to load provider template",
|
||||
slog.String("provider", string(def.ClientType)),
|
||||
slog.String("display_name", def.DisplayName),
|
||||
slog.Any("error", err))
|
||||
}
|
||||
return fmt.Errorf("get provider by client type %s: %w", def.ClientType, err)
|
||||
}
|
||||
|
||||
synced := 0
|
||||
if !isTranscriptionClientType(def.ClientType) {
|
||||
for _, model := range def.Models {
|
||||
if shouldHideTemplateModel(def, models.ModelTypeSpeech, model.ID) {
|
||||
if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
|
||||
ProviderID: provider.ID,
|
||||
ModelID: model.ID,
|
||||
Type: string(models.ModelTypeSpeech),
|
||||
}); err != nil {
|
||||
return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
modelConfigJSON, err := json.Marshal(map[string]any{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal speech model config: %w", err)
|
||||
}
|
||||
name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
|
||||
if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
|
||||
ModelID: model.ID,
|
||||
Name: name,
|
||||
ProviderID: provider.ID,
|
||||
Type: string(models.ModelTypeSpeech),
|
||||
Config: modelConfigJSON,
|
||||
}); err != nil {
|
||||
return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
|
||||
}
|
||||
synced++
|
||||
}
|
||||
}
|
||||
for _, model := range def.TranscriptionModels {
|
||||
if shouldHideTemplateModel(def, models.ModelTypeTranscription, model.ID) {
|
||||
if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
|
||||
ProviderID: provider.ID,
|
||||
ModelID: model.ID,
|
||||
Type: string(models.ModelTypeTranscription),
|
||||
}); err != nil {
|
||||
return fmt.Errorf("delete hidden transcription template model %s: %w", model.ID, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
modelConfigJSON, err := json.Marshal(map[string]any{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal transcription model config: %w", err)
|
||||
}
|
||||
name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
|
||||
if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
|
||||
ModelID: model.ID,
|
||||
Name: name,
|
||||
ProviderID: provider.ID,
|
||||
Type: string(models.ModelTypeTranscription),
|
||||
Config: modelConfigJSON,
|
||||
}); err != nil {
|
||||
return fmt.Errorf("upsert transcription model %s: %w", model.ID, err)
|
||||
}
|
||||
}
|
||||
|
||||
if logger != nil {
|
||||
logger.Info("speech registry synced", slog.String("provider", string(def.ClientType)), slog.Int("models", synced))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1,769 +0,0 @@
|
||||
package audio
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgtype"
|
||||
sdk "github.com/memohai/twilight-ai/sdk"
|
||||
|
||||
"github.com/memohai/memoh/internal/db"
|
||||
"github.com/memohai/memoh/internal/db/sqlc"
|
||||
"github.com/memohai/memoh/internal/models"
|
||||
)
|
||||
|
||||
type Service struct {
|
||||
queries *sqlc.Queries
|
||||
logger *slog.Logger
|
||||
registry *Registry
|
||||
}
|
||||
|
||||
func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
|
||||
return &Service{
|
||||
queries: queries,
|
||||
logger: log.With(slog.String("service", "audio")),
|
||||
registry: registry,
|
||||
}
|
||||
}
|
||||
|
||||
// Registry returns the provider registry this service was constructed with.
func (s *Service) Registry() *Registry { return s.registry }
|
||||
|
||||
func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
|
||||
return s.registry.ListMeta()
|
||||
}
|
||||
|
||||
func (s *Service) ListSpeechMeta(_ context.Context) []ProviderMetaResponse {
|
||||
return s.registry.ListSpeechMeta()
|
||||
}
|
||||
|
||||
func (s *Service) ListTranscriptionMeta(_ context.Context) []ProviderMetaResponse {
|
||||
return s.registry.ListTranscriptionMeta()
|
||||
}
|
||||
|
||||
func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
|
||||
rows, err := s.queries.ListSpeechProviders(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list speech providers: %w", err)
|
||||
}
|
||||
items := make([]SpeechProviderResponse, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
items = append(items, toSpeechProviderResponse(row))
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *Service) ListTranscriptionProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
|
||||
rows, err := s.queries.ListTranscriptionProviders(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list transcription providers: %w", err)
|
||||
}
|
||||
items := make([]SpeechProviderResponse, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
items = append(items, toSpeechProviderResponse(row))
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) {
|
||||
pgID, err := db.ParseUUID(id)
|
||||
if err != nil {
|
||||
return SpeechProviderResponse{}, err
|
||||
}
|
||||
row, err := s.queries.GetProviderByID(ctx, pgID)
|
||||
if err != nil {
|
||||
return SpeechProviderResponse{}, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
return toSpeechProviderResponse(row), nil
|
||||
}
|
||||
|
||||
func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse, error) {
|
||||
rows, err := s.queries.ListSpeechModels(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list speech models: %w", err)
|
||||
}
|
||||
items := make([]SpeechModelResponse, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
if s.shouldHideModel(row.ProviderType, models.ModelTypeSpeech, row.ModelID) {
|
||||
continue
|
||||
}
|
||||
items = append(items, toSpeechModelFromListRow(row))
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *Service) ListTranscriptionModels(ctx context.Context) ([]TranscriptionModelResponse, error) {
|
||||
rows, err := s.queries.ListTranscriptionModels(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list transcription models: %w", err)
|
||||
}
|
||||
items := make([]TranscriptionModelResponse, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
if s.shouldHideModel(row.ProviderType, models.ModelTypeTranscription, row.ModelID) {
|
||||
continue
|
||||
}
|
||||
items = append(items, toTranscriptionModelFromListRow(row))
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
|
||||
pgID, err := db.ParseUUID(providerID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
providerRow, err := s.queries.GetProviderByID(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rows, err := s.queries.ListSpeechModelsByProviderID(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list speech models by provider: %w", err)
|
||||
}
|
||||
items := make([]SpeechModelResponse, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
if shouldHideTemplateModel(def, models.ModelTypeSpeech, row.ModelID) {
|
||||
continue
|
||||
}
|
||||
items = append(items, toSpeechModelFromModel(row, ""))
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *Service) ListTranscriptionModelsByProvider(ctx context.Context, providerID string) ([]TranscriptionModelResponse, error) {
|
||||
pgID, err := db.ParseUUID(providerID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
providerRow, err := s.queries.GetProviderByID(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rows, err := s.queries.ListTranscriptionModelsByProviderID(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list transcription models by provider: %w", err)
|
||||
}
|
||||
items := make([]TranscriptionModelResponse, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
if shouldHideTemplateModel(def, models.ModelTypeTranscription, row.ModelID) {
|
||||
continue
|
||||
}
|
||||
items = append(items, toTranscriptionModelFromModel(row, ""))
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
|
||||
pgID, err := db.ParseUUID(id)
|
||||
if err != nil {
|
||||
return SpeechModelResponse{}, err
|
||||
}
|
||||
row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return SpeechModelResponse{}, fmt.Errorf("get speech model: %w", err)
|
||||
}
|
||||
return toSpeechModelWithProviderResponse(row), nil
|
||||
}
|
||||
|
||||
func (s *Service) GetTranscriptionModel(ctx context.Context, id string) (TranscriptionModelResponse, error) {
|
||||
pgID, err := db.ParseUUID(id)
|
||||
if err != nil {
|
||||
return TranscriptionModelResponse{}, err
|
||||
}
|
||||
row, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return TranscriptionModelResponse{}, fmt.Errorf("get transcription model: %w", err)
|
||||
}
|
||||
return toTranscriptionModelWithProviderResponse(row), nil
|
||||
}
|
||||
|
||||
func (s *Service) UpdateSpeechModel(ctx context.Context, id string, req UpdateSpeechModelRequest) (SpeechModelResponse, error) {
|
||||
pgID, err := db.ParseUUID(id)
|
||||
if err != nil {
|
||||
return SpeechModelResponse{}, err
|
||||
}
|
||||
row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return SpeechModelResponse{}, fmt.Errorf("get speech model: %w", err)
|
||||
}
|
||||
configJSON, err := json.Marshal(req.Config)
|
||||
if err != nil {
|
||||
return SpeechModelResponse{}, fmt.Errorf("marshal speech config: %w", err)
|
||||
}
|
||||
name := row.Name
|
||||
if req.Name != nil {
|
||||
name = pgtype.Text{String: *req.Name, Valid: *req.Name != ""}
|
||||
}
|
||||
updated, err := s.queries.UpdateModel(ctx, sqlc.UpdateModelParams{
|
||||
ID: pgID,
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID,
|
||||
Type: string(models.ModelTypeSpeech),
|
||||
Config: configJSON,
|
||||
})
|
||||
if err != nil {
|
||||
return SpeechModelResponse{}, fmt.Errorf("update speech model: %w", err)
|
||||
}
|
||||
return toSpeechModelFromModel(updated, row.ProviderType), nil
|
||||
}
|
||||
|
||||
func (s *Service) UpdateTranscriptionModel(ctx context.Context, id string, req UpdateSpeechModelRequest) (TranscriptionModelResponse, error) {
|
||||
pgID, err := db.ParseUUID(id)
|
||||
if err != nil {
|
||||
return TranscriptionModelResponse{}, err
|
||||
}
|
||||
row, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return TranscriptionModelResponse{}, fmt.Errorf("get transcription model: %w", err)
|
||||
}
|
||||
configJSON, err := json.Marshal(req.Config)
|
||||
if err != nil {
|
||||
return TranscriptionModelResponse{}, fmt.Errorf("marshal transcription config: %w", err)
|
||||
}
|
||||
name := row.Name
|
||||
if req.Name != nil {
|
||||
name = pgtype.Text{String: *req.Name, Valid: *req.Name != ""}
|
||||
}
|
||||
updated, err := s.queries.UpdateModel(ctx, sqlc.UpdateModelParams{
|
||||
ID: pgID,
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID,
|
||||
Type: string(models.ModelTypeTranscription),
|
||||
Config: configJSON,
|
||||
})
|
||||
if err != nil {
|
||||
return TranscriptionModelResponse{}, fmt.Errorf("update transcription model: %w", err)
|
||||
}
|
||||
return toTranscriptionModelFromModel(updated, row.ProviderType), nil
|
||||
}
|
||||
|
||||
func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
|
||||
params, err := s.resolveSpeechParams(ctx, modelID, text, overrideCfg)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
result, err := sdk.GenerateSpeech(ctx,
|
||||
sdk.WithSpeechModel(params.model),
|
||||
sdk.WithText(text),
|
||||
sdk.WithSpeechConfig(params.config),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("synthesize: %w", err)
|
||||
}
|
||||
return result.Audio, result.ContentType, nil
|
||||
}
|
||||
|
||||
func (s *Service) StreamToFile(ctx context.Context, modelID string, text string, w io.Writer) (string, error) {
|
||||
params, err := s.resolveSpeechParams(ctx, modelID, text, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
streamResult, err := sdk.StreamSpeech(ctx,
|
||||
sdk.WithSpeechModel(params.model),
|
||||
sdk.WithText(text),
|
||||
sdk.WithSpeechConfig(params.config),
|
||||
)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("stream: %w", err)
|
||||
}
|
||||
audio, err := streamResult.Bytes()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("stream: %w", err)
|
||||
}
|
||||
if _, writeErr := w.Write(audio); writeErr != nil {
|
||||
return "", fmt.Errorf("write chunk: %w", writeErr)
|
||||
}
|
||||
return streamResult.ContentType, nil
|
||||
}
|
||||
|
||||
func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
|
||||
pgID, err := db.ParseUUID(modelID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech model: %w", err)
|
||||
}
|
||||
def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
template := findModelTemplate(def.Models, def.DefaultModel, modelRow.ModelID)
|
||||
if template == nil {
|
||||
return nil, fmt.Errorf("speech model capabilities not found: %s", modelRow.ModelID)
|
||||
}
|
||||
caps := template.Capabilities
|
||||
if len(caps.ConfigSchema.Fields) == 0 {
|
||||
caps.ConfigSchema = template.ConfigSchema
|
||||
}
|
||||
return &caps, nil
|
||||
}
|
||||
|
||||
func (s *Service) GetSpeechModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
|
||||
return s.GetModelCapabilities(ctx, modelID)
|
||||
}
|
||||
|
||||
func (s *Service) GetTranscriptionModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
|
||||
pgID, err := db.ParseUUID(modelID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
modelRow, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get transcription model: %w", err)
|
||||
}
|
||||
def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
template := findModelTemplate(def.TranscriptionModels, def.DefaultTranscriptionModel, modelRow.ModelID)
|
||||
if template == nil {
|
||||
return nil, fmt.Errorf("transcription model capabilities not found: %s", modelRow.ModelID)
|
||||
}
|
||||
caps := template.Capabilities
|
||||
if len(caps.ConfigSchema.Fields) == 0 {
|
||||
caps.ConfigSchema = template.ConfigSchema
|
||||
}
|
||||
return &caps, nil
|
||||
}
|
||||
|
||||
func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
|
||||
pgID, err := db.ParseUUID(providerID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
providerRow, err := s.queries.GetProviderByID(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
|
||||
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !def.SupportsList || def.Factory == nil {
|
||||
return nil, fmt.Errorf("speech provider does not support model discovery: %s", providerRow.ClientType)
|
||||
}
|
||||
|
||||
provider, err := def.Factory(parseConfig(providerRow.Config))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build speech provider: %w", err)
|
||||
}
|
||||
|
||||
remoteModels, err := provider.ListModels(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list speech models: %w", err)
|
||||
}
|
||||
|
||||
discovered := make([]ModelInfo, 0, len(remoteModels))
|
||||
for _, remoteModel := range remoteModels {
|
||||
if remoteModel == nil || remoteModel.ID == "" {
|
||||
continue
|
||||
}
|
||||
discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.Models))
|
||||
}
|
||||
return discovered, nil
|
||||
}
|
||||
|
||||
func (s *Service) FetchRemoteTranscriptionModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
|
||||
pgID, err := db.ParseUUID(providerID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
providerRow, err := s.queries.GetProviderByID(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
|
||||
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !def.SupportsTranscriptionList || def.TranscriptionFactory == nil {
|
||||
return nil, fmt.Errorf("speech provider does not support transcription model discovery: %s", providerRow.ClientType)
|
||||
}
|
||||
|
||||
provider, err := def.TranscriptionFactory(parseConfig(providerRow.Config))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build transcription provider: %w", err)
|
||||
}
|
||||
|
||||
remoteModels, err := provider.ListModels(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list transcription models: %w", err)
|
||||
}
|
||||
|
||||
discovered := make([]ModelInfo, 0, len(remoteModels))
|
||||
for _, remoteModel := range remoteModels {
|
||||
if remoteModel == nil || remoteModel.ID == "" {
|
||||
continue
|
||||
}
|
||||
discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.TranscriptionModels))
|
||||
}
|
||||
return discovered, nil
|
||||
}
|
||||
|
||||
func (s *Service) Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (*sdk.TranscriptionResult, error) {
|
||||
params, err := s.resolveTranscriptionParams(ctx, modelID, audio, filename, contentType, overrideCfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
result, err := sdk.Transcribe(ctx,
|
||||
sdk.WithTranscriptionModel(params.model),
|
||||
sdk.WithAudio(audio, filename, contentType),
|
||||
sdk.WithTranscriptionConfig(params.config),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("transcribe: %w", err)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
type resolvedSpeechParams struct {
|
||||
model *sdk.SpeechModel
|
||||
config map[string]any
|
||||
}
|
||||
|
||||
type resolvedTranscriptionParams struct {
|
||||
model *sdk.TranscriptionModel
|
||||
config map[string]any
|
||||
}
|
||||
|
||||
func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text string, overrideCfg map[string]any) (*resolvedSpeechParams, error) {
|
||||
_ = text
|
||||
pgID, err := db.ParseUUID(modelID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech model: %w", err)
|
||||
}
|
||||
providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
|
||||
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
provider, err := def.Factory(parseConfig(providerRow.Config))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build speech provider: %w", err)
|
||||
}
|
||||
|
||||
cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
|
||||
return &resolvedSpeechParams{
|
||||
model: &sdk.SpeechModel{ID: modelRow.ModelID, Provider: provider},
|
||||
config: cfg,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *Service) resolveTranscriptionParams(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (*resolvedTranscriptionParams, error) {
|
||||
_ = audio
|
||||
_ = filename
|
||||
_ = contentType
|
||||
pgID, err := db.ParseUUID(modelID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
modelRow, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get transcription model: %w", err)
|
||||
}
|
||||
providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
|
||||
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
provider, err := def.TranscriptionFactory(parseConfig(providerRow.Config))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build transcription provider: %w", err)
|
||||
}
|
||||
|
||||
cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
|
||||
return &resolvedTranscriptionParams{
|
||||
model: &sdk.TranscriptionModel{ID: modelRow.ModelID, Provider: provider},
|
||||
config: cfg,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// parseConfig decodes a JSON object into a map.
//
// It never returns nil: empty input, invalid JSON and a JSON null all yield a
// fresh empty map.
func parseConfig(raw []byte) map[string]any {
	var parsed map[string]any
	if len(raw) > 0 {
		if err := json.Unmarshal(raw, &parsed); err != nil {
			// Drop anything decoded before the failure.
			parsed = nil
		}
	}
	if parsed == nil {
		return map[string]any{}
	}
	return parsed
}
|
||||
|
||||
// mergeConfig flattens the given maps into a new map. Later maps win on key
// collisions; nil maps contribute nothing. The result is never nil.
func mergeConfig(parts ...map[string]any) map[string]any {
	merged := map[string]any{}
	for i := 0; i < len(parts); i++ {
		for k, v := range parts[i] {
			merged[k] = v
		}
	}
	return merged
}
|
||||
|
||||
func mergeRemoteModelInfo(modelID string, defaults []ModelInfo) ModelInfo {
|
||||
for _, model := range defaults {
|
||||
if model.ID == modelID {
|
||||
return model
|
||||
}
|
||||
}
|
||||
return ModelInfo{
|
||||
ID: modelID,
|
||||
Name: modelID,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Service) shouldHideModel(clientType string, modelType models.ModelType, modelID string) bool {
|
||||
def, err := s.registry.Get(models.ClientType(clientType))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return shouldHideTemplateModel(def, modelType, modelID)
|
||||
}
|
||||
|
||||
func shouldHideTemplateModel(def ProviderDefinition, modelType models.ModelType, modelID string) bool {
|
||||
switch modelType {
|
||||
case models.ModelTypeSpeech:
|
||||
if !def.SupportsList {
|
||||
return false
|
||||
}
|
||||
for _, model := range def.Models {
|
||||
if model.ID == modelID {
|
||||
return model.TemplateOnly
|
||||
}
|
||||
}
|
||||
case models.ModelTypeTranscription:
|
||||
if !def.SupportsTranscriptionList {
|
||||
return false
|
||||
}
|
||||
for _, model := range def.TranscriptionModels {
|
||||
if model.ID == modelID {
|
||||
return model.TemplateOnly
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func findModelTemplate(modelsList []ModelInfo, defaultModel string, modelID string) *ModelInfo {
|
||||
for i := range modelsList {
|
||||
if modelsList[i].ID == modelID {
|
||||
return &modelsList[i]
|
||||
}
|
||||
}
|
||||
if defaultModel != "" {
|
||||
for i := range modelsList {
|
||||
if modelsList[i].ID == defaultModel {
|
||||
return &modelsList[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(modelsList) > 0 {
|
||||
return &modelsList[0]
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func toSpeechProviderResponse(row sqlc.Provider) SpeechProviderResponse {
|
||||
icon := ""
|
||||
if row.Icon.Valid {
|
||||
icon = row.Icon.String
|
||||
}
|
||||
return SpeechProviderResponse{
|
||||
ID: row.ID.String(),
|
||||
Name: row.Name,
|
||||
ClientType: row.ClientType,
|
||||
Icon: icon,
|
||||
Enable: row.Enable,
|
||||
Config: maskSpeechProviderConfig(parseConfig(row.Config)),
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
|
||||
func maskSpeechProviderConfig(cfg map[string]any) map[string]any {
|
||||
if len(cfg) == 0 {
|
||||
return map[string]any{}
|
||||
}
|
||||
out := make(map[string]any, len(cfg))
|
||||
for key, value := range cfg {
|
||||
if s, ok := value.(string); ok && s != "" && isSpeechSecretKey(key) {
|
||||
out[key] = maskSpeechSecret(s)
|
||||
continue
|
||||
}
|
||||
out[key] = value
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// isSpeechSecretKey reports whether key names a secret config entry. The
// match is exact and case-sensitive.
func isSpeechSecretKey(key string) bool {
	return key == "api_key" ||
		key == "access_key" ||
		key == "secret_key" ||
		key == "app_key"
}
|
||||
|
||||
// maskSpeechSecret hides the middle of a secret for display.
//
// Values of up to 8 characters are replaced entirely by "********". Longer
// values keep their first and last 4 characters with "****" in between.
//
// Length and slicing are measured in runes rather than bytes. Byte slicing
// could cut a multi-byte UTF-8 character in half, and a short multi-byte
// secret (e.g. 3 CJK characters = 9 bytes) would pass the length check and be
// emitted almost whole. ASCII input gives the same result as before.
func maskSpeechSecret(value string) string {
	runes := []rune(value)
	if len(runes) <= 8 {
		return "********"
	}
	return string(runes[:4]) + "****" + string(runes[len(runes)-4:])
}
|
||||
|
||||
func toSpeechModelFromListRow(row sqlc.ListSpeechModelsRow) SpeechModelResponse {
|
||||
var cfg map[string]any
|
||||
if len(row.Config) > 0 {
|
||||
_ = json.Unmarshal(row.Config, &cfg)
|
||||
}
|
||||
name := ""
|
||||
if row.Name.Valid {
|
||||
name = row.Name.String
|
||||
}
|
||||
return SpeechModelResponse{
|
||||
ID: row.ID.String(),
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID.String(),
|
||||
ProviderType: row.ProviderType,
|
||||
Config: cfg,
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
|
||||
func toSpeechModelFromModel(row sqlc.Model, providerType string) SpeechModelResponse {
|
||||
var cfg map[string]any
|
||||
if len(row.Config) > 0 {
|
||||
_ = json.Unmarshal(row.Config, &cfg)
|
||||
}
|
||||
name := ""
|
||||
if row.Name.Valid {
|
||||
name = row.Name.String
|
||||
}
|
||||
return SpeechModelResponse{
|
||||
ID: row.ID.String(),
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID.String(),
|
||||
ProviderType: providerType,
|
||||
Config: cfg,
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
|
||||
func toSpeechModelWithProviderResponse(row sqlc.GetSpeechModelWithProviderRow) SpeechModelResponse {
|
||||
var cfg map[string]any
|
||||
if len(row.Config) > 0 {
|
||||
_ = json.Unmarshal(row.Config, &cfg)
|
||||
}
|
||||
name := ""
|
||||
if row.Name.Valid {
|
||||
name = row.Name.String
|
||||
}
|
||||
return SpeechModelResponse{
|
||||
ID: row.ID.String(),
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID.String(),
|
||||
ProviderType: row.ProviderType,
|
||||
Config: cfg,
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
|
||||
func toTranscriptionModelFromListRow(row sqlc.ListTranscriptionModelsRow) TranscriptionModelResponse {
|
||||
var cfg map[string]any
|
||||
if len(row.Config) > 0 {
|
||||
_ = json.Unmarshal(row.Config, &cfg)
|
||||
}
|
||||
name := ""
|
||||
if row.Name.Valid {
|
||||
name = row.Name.String
|
||||
}
|
||||
return TranscriptionModelResponse{
|
||||
ID: row.ID.String(),
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID.String(),
|
||||
ProviderType: row.ProviderType,
|
||||
Config: cfg,
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
|
||||
func toTranscriptionModelFromModel(row sqlc.Model, providerType string) TranscriptionModelResponse {
|
||||
var cfg map[string]any
|
||||
if len(row.Config) > 0 {
|
||||
_ = json.Unmarshal(row.Config, &cfg)
|
||||
}
|
||||
name := ""
|
||||
if row.Name.Valid {
|
||||
name = row.Name.String
|
||||
}
|
||||
return TranscriptionModelResponse{
|
||||
ID: row.ID.String(),
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID.String(),
|
||||
ProviderType: providerType,
|
||||
Config: cfg,
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
|
||||
func toTranscriptionModelWithProviderResponse(row sqlc.GetTranscriptionModelWithProviderRow) TranscriptionModelResponse {
|
||||
var cfg map[string]any
|
||||
if len(row.Config) > 0 {
|
||||
_ = json.Unmarshal(row.Config, &cfg)
|
||||
}
|
||||
name := ""
|
||||
if row.Name.Valid {
|
||||
name = row.Name.String
|
||||
}
|
||||
return TranscriptionModelResponse{
|
||||
ID: row.ID.String(),
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID.String(),
|
||||
ProviderType: row.ProviderType,
|
||||
Config: cfg,
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
@@ -1,102 +0,0 @@
|
||||
package audio
|
||||
|
||||
import "time"
|
||||
|
||||
// ProviderMetaResponse exposes adapter metadata (from the registry, not DB).
|
||||
type ProviderMetaResponse struct {
|
||||
Provider string `json:"provider"`
|
||||
DisplayName string `json:"display_name"`
|
||||
Description string `json:"description"`
|
||||
ConfigSchema ConfigSchema `json:"config_schema,omitempty"`
|
||||
DefaultModel string `json:"default_model,omitempty"`
|
||||
Models []ModelInfo `json:"models,omitempty"`
|
||||
DefaultSynthesisModel string `json:"default_synthesis_model,omitempty"`
|
||||
SynthesisModels []ModelInfo `json:"synthesis_models,omitempty"`
|
||||
SupportsSynthesisList bool `json:"supports_synthesis_list,omitempty"`
|
||||
DefaultTranscriptionModel string `json:"default_transcription_model,omitempty"`
|
||||
TranscriptionModels []ModelInfo `json:"transcription_models,omitempty"`
|
||||
SupportsTranscriptionList bool `json:"supports_transcription_list,omitempty"`
|
||||
}
|
||||
|
||||
// SpeechProviderResponse represents a speech-capable provider from the unified providers table.
|
||||
type SpeechProviderResponse struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
ClientType string `json:"client_type"`
|
||||
Icon string `json:"icon,omitempty"`
|
||||
Enable bool `json:"enable"`
|
||||
Config map[string]any `json:"config,omitempty"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
// SpeechModelResponse represents a speech model from the unified models table.
|
||||
type SpeechModelResponse struct {
|
||||
ID string `json:"id"`
|
||||
ModelID string `json:"model_id"`
|
||||
Name string `json:"name"`
|
||||
ProviderID string `json:"provider_id"`
|
||||
ProviderType string `json:"provider_type,omitempty"`
|
||||
Config map[string]any `json:"config,omitempty"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
// TranscriptionModelResponse represents a transcription model from the unified models table.
|
||||
type TranscriptionModelResponse struct {
|
||||
ID string `json:"id"`
|
||||
ModelID string `json:"model_id"`
|
||||
Name string `json:"name"`
|
||||
ProviderID string `json:"provider_id"`
|
||||
ProviderType string `json:"provider_type,omitempty"`
|
||||
Config map[string]any `json:"config,omitempty"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
// UpdateSpeechProviderRequest is the payload for updating a speech provider.
// Both fields are pointers, so an omitted field decodes to nil.
type UpdateSpeechProviderRequest struct {
	Name   *string `json:"name,omitempty"`
	Enable *bool   `json:"enable,omitempty"`
}
|
||||
|
||||
// UpdateSpeechModelRequest is the payload for updating a speech or
// transcription model. Name is a pointer, so an omitted name decodes to nil.
type UpdateSpeechModelRequest struct {
	Name   *string        `json:"name,omitempty"`
	Config map[string]any `json:"config,omitempty"`
}
|
||||
|
||||
// TestSynthesizeRequest is the payload of a text-to-speech test call: the
// text to speak plus an optional config.
type TestSynthesizeRequest struct {
	Text   string         `json:"text"`
	Config map[string]any `json:"config,omitempty"`
}
|
||||
|
||||
// TestTranscriptionRequest is the payload of an audio-to-text test call. It
// carries only an optional config.
type TestTranscriptionRequest struct {
	Config map[string]any `json:"config,omitempty"`
}
|
||||
|
||||
// TestTranscriptionResponse represents the result of a transcription test.
|
||||
type TestTranscriptionResponse struct {
|
||||
Text string `json:"text"`
|
||||
Language string `json:"language,omitempty"`
|
||||
DurationSeconds float64 `json:"duration_seconds,omitempty"`
|
||||
Words []TranscriptionWord `json:"words,omitempty"`
|
||||
Metadata map[string]any `json:"metadata,omitempty"`
|
||||
}
|
||||
|
||||
// TranscriptionWord is one word of a transcription result with its optional
// start and end positions and speaker ID.
type TranscriptionWord struct {
	Text      string  `json:"text"`
	Start     float64 `json:"start,omitempty"`
	End       float64 `json:"end,omitempty"`
	SpeakerID string  `json:"speaker_id,omitempty"`
}
|
||||
|
||||
// ImportModelsResponse is the result of importing speech models: counts of
// created and skipped models plus a list of model strings.
type ImportModelsResponse struct {
	Created int      `json:"created"`
	Skipped int      `json:"skipped"`
	Models  []string `json:"models"`
}
|
||||
@@ -1,4 +1,5 @@
|
||||
//go:build ignore
|
||||
// +build ignore
|
||||
|
||||
package identities_test
|
||||
|
||||
|
||||
@@ -58,29 +58,14 @@ type mediaIngestor interface {
|
||||
channel.ContainerAttachmentIngester
|
||||
}
|
||||
|
||||
// speechSynthesizer synthesizes text to speech audio.
|
||||
type speechSynthesizer interface {
|
||||
// ttsSynthesizer synthesizes text to speech audio.
|
||||
type ttsSynthesizer interface {
|
||||
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
|
||||
}
|
||||
|
||||
// speechModelResolver looks up the speech model ID configured for a bot.
|
||||
type speechModelResolver interface {
|
||||
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
|
||||
}
|
||||
|
||||
// TranscriptionResult is the minimal speech-to-text response shape needed by inbound routing.
|
||||
type TranscriptionResult interface {
|
||||
GetText() string
|
||||
}
|
||||
|
||||
// transcriptionRecognizer converts inbound audio to text using a configured model.
|
||||
type transcriptionRecognizer interface {
|
||||
Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (TranscriptionResult, error)
|
||||
}
|
||||
|
||||
// transcriptionModelResolver looks up the transcription model ID configured for a bot.
|
||||
type transcriptionModelResolver interface {
|
||||
ResolveTranscriptionModelID(ctx context.Context, botID string) (string, error)
|
||||
// ttsModelResolver looks up the TTS model ID configured for a bot.
|
||||
type ttsModelResolver interface {
|
||||
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
|
||||
}
|
||||
|
||||
// SessionEnsurer resolves or creates an active session for a route.
|
||||
@@ -101,29 +86,27 @@ type SessionResult struct {
|
||||
|
||||
// ChannelInboundProcessor routes channel inbound messages to the chat gateway.
|
||||
type ChannelInboundProcessor struct {
|
||||
runner flow.Runner
|
||||
routeResolver RouteResolver
|
||||
message messagepkg.Writer
|
||||
mediaService mediaIngestor
|
||||
reactor channelReactor
|
||||
commandHandler *command.Handler
|
||||
registry *channel.Registry
|
||||
logger *slog.Logger
|
||||
jwtSecret string
|
||||
tokenTTL time.Duration
|
||||
identity *IdentityResolver
|
||||
policy PolicyService
|
||||
dispatcher *RouteDispatcher
|
||||
acl chatACL
|
||||
observer channel.StreamObserver
|
||||
speechService speechSynthesizer
|
||||
speechModelResolver speechModelResolver
|
||||
transcriber transcriptionRecognizer
|
||||
sttModelResolver transcriptionModelResolver
|
||||
sessionEnsurer SessionEnsurer
|
||||
pipeline *pipelinepkg.Pipeline
|
||||
eventStore *pipelinepkg.EventStore
|
||||
discussDriver *pipelinepkg.DiscussDriver
|
||||
runner flow.Runner
|
||||
routeResolver RouteResolver
|
||||
message messagepkg.Writer
|
||||
mediaService mediaIngestor
|
||||
reactor channelReactor
|
||||
commandHandler *command.Handler
|
||||
registry *channel.Registry
|
||||
logger *slog.Logger
|
||||
jwtSecret string
|
||||
tokenTTL time.Duration
|
||||
identity *IdentityResolver
|
||||
policy PolicyService
|
||||
dispatcher *RouteDispatcher
|
||||
acl chatACL
|
||||
observer channel.StreamObserver
|
||||
ttsService ttsSynthesizer
|
||||
ttsModelResolver ttsModelResolver
|
||||
sessionEnsurer SessionEnsurer
|
||||
pipeline *pipelinepkg.Pipeline
|
||||
eventStore *pipelinepkg.EventStore
|
||||
discussDriver *pipelinepkg.DiscussDriver
|
||||
|
||||
// activeStreams maps "botID:routeID" to a context.CancelFunc for the
|
||||
// currently running agent stream. Used by /stop to abort generation
|
||||
@@ -205,23 +188,14 @@ func (p *ChannelInboundProcessor) SetStreamObserver(observer channel.StreamObser
|
||||
p.observer = observer
|
||||
}
|
||||
|
||||
// SetSpeechService configures the speech synthesizer and settings reader for
|
||||
// handling <speech> tag events (speech_delta) that require server-side audio synthesis.
|
||||
func (p *ChannelInboundProcessor) SetSpeechService(synth speechSynthesizer, modelResolver speechModelResolver) {
|
||||
// SetTtsService configures the TTS synthesizer and settings reader for handling
|
||||
// <speech> tag events (speech_delta) that require server-side audio synthesis.
|
||||
func (p *ChannelInboundProcessor) SetTtsService(synth ttsSynthesizer, modelResolver ttsModelResolver) {
|
||||
if p == nil {
|
||||
return
|
||||
}
|
||||
p.speechService = synth
|
||||
p.speechModelResolver = modelResolver
|
||||
}
|
||||
|
||||
// SetTranscriptionService configures speech-to-text processing for inbound audio attachments.
|
||||
func (p *ChannelInboundProcessor) SetTranscriptionService(recognizer transcriptionRecognizer, modelResolver transcriptionModelResolver) {
|
||||
if p == nil {
|
||||
return
|
||||
}
|
||||
p.transcriber = recognizer
|
||||
p.sttModelResolver = modelResolver
|
||||
p.ttsService = synth
|
||||
p.ttsModelResolver = modelResolver
|
||||
}
|
||||
|
||||
// SetSessionEnsurer configures the session ensurer for auto-creating sessions on routes.
|
||||
@@ -352,8 +326,6 @@ func (p *ChannelInboundProcessor) HandleInbound(ctx context.Context, cfg channel
|
||||
}
|
||||
|
||||
resolvedAttachments := p.ingestInboundAttachments(ctx, cfg, msg, strings.TrimSpace(identity.BotID), msg.Message.Attachments)
|
||||
msg.Message.Attachments = resolvedAttachments
|
||||
hadVoiceAttachment := containsVoiceAttachment(resolvedAttachments)
|
||||
attachments := mapChannelToChatAttachments(resolvedAttachments)
|
||||
text = strings.TrimSpace(msg.Message.PlainText())
|
||||
|
||||
@@ -494,24 +466,6 @@ func (p *ChannelInboundProcessor) HandleInbound(ctx context.Context, cfg channel
|
||||
}
|
||||
shouldTrigger := shouldTriggerAssistantResponse(msg) || identity.ForceReply
|
||||
|
||||
if sessionType == sessionpkg.TypeDiscuss || shouldTrigger {
|
||||
if transcript := p.transcribeInboundAttachments(ctx, strings.TrimSpace(identity.BotID), resolvedAttachments); transcript != "" {
|
||||
labeledTranscript := formatInboundTranscript(transcript)
|
||||
if msg.Message.Metadata == nil {
|
||||
msg.Message.Metadata = make(map[string]any)
|
||||
}
|
||||
msg.Message.Metadata["transcript"] = transcript
|
||||
if plain := strings.TrimSpace(msg.Message.PlainText()); plain == "" {
|
||||
msg.Message.Text = labeledTranscript
|
||||
} else if !strings.Contains(plain, transcript) {
|
||||
msg.Message.Text = plain + "\n\n" + labeledTranscript
|
||||
}
|
||||
} else if hadVoiceAttachment && strings.TrimSpace(msg.Message.PlainText()) == "" {
|
||||
msg.Message.Text = formatVoiceTranscriptionUnavailableNotice(resolvedAttachments)
|
||||
}
|
||||
text = strings.TrimSpace(msg.Message.PlainText())
|
||||
}
|
||||
|
||||
if !shouldTrigger {
|
||||
p.persistPassiveMessage(ctx, identity, msg, text, attachments, resolved.RouteID, sessionID, eventID)
|
||||
if p.logger != nil {
|
||||
@@ -1946,97 +1900,6 @@ func (p *ChannelInboundProcessor) loadInboundAttachmentPayload(
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p *ChannelInboundProcessor) transcribeInboundAttachments(ctx context.Context, botID string, attachments []channel.Attachment) string {
|
||||
if p == nil || p.transcriber == nil || p.sttModelResolver == nil || p.mediaService == nil || strings.TrimSpace(botID) == "" {
|
||||
return ""
|
||||
}
|
||||
modelID, err := p.sttModelResolver.ResolveTranscriptionModelID(ctx, botID)
|
||||
if err != nil || strings.TrimSpace(modelID) == "" {
|
||||
return ""
|
||||
}
|
||||
transcripts := make([]string, 0, len(attachments))
|
||||
for _, att := range attachments {
|
||||
if att.Type != channel.AttachmentAudio && att.Type != channel.AttachmentVoice {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(att.ContentHash) == "" {
|
||||
continue
|
||||
}
|
||||
reader, asset, err := p.mediaService.Open(ctx, botID, strings.TrimSpace(att.ContentHash))
|
||||
if err != nil {
|
||||
if p.logger != nil {
|
||||
p.logger.Warn("open inbound audio for transcription failed", slog.Any("error", err), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
|
||||
}
|
||||
continue
|
||||
}
|
||||
audio, readErr := io.ReadAll(reader)
|
||||
_ = reader.Close()
|
||||
if readErr != nil || len(audio) == 0 {
|
||||
if p.logger != nil {
|
||||
p.logger.Warn("read inbound audio for transcription failed", slog.Any("error", readErr), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
|
||||
}
|
||||
continue
|
||||
}
|
||||
filename := strings.TrimSpace(att.Name)
|
||||
if filename == "" {
|
||||
filename = "audio" + filepath.Ext(asset.StorageKey)
|
||||
}
|
||||
contentType := strings.TrimSpace(att.Mime)
|
||||
if contentType == "" {
|
||||
contentType = strings.TrimSpace(asset.Mime)
|
||||
}
|
||||
result, txErr := p.transcriber.Transcribe(ctx, modelID, audio, filename, contentType, nil)
|
||||
if txErr != nil {
|
||||
if p.logger != nil {
|
||||
p.logger.Warn("inbound attachment transcription failed", slog.Any("error", txErr), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
|
||||
}
|
||||
continue
|
||||
}
|
||||
text := strings.TrimSpace(result.GetText())
|
||||
if text == "" {
|
||||
continue
|
||||
}
|
||||
transcripts = append(transcripts, text)
|
||||
}
|
||||
if len(transcripts) == 0 {
|
||||
return ""
|
||||
}
|
||||
return strings.Join(transcripts, "\n\n")
|
||||
}
|
||||
|
||||
func formatInboundTranscript(transcript string) string {
|
||||
transcript = strings.TrimSpace(transcript)
|
||||
if transcript == "" {
|
||||
return ""
|
||||
}
|
||||
return "[Voice message transcription]\n" + transcript
|
||||
}
|
||||
|
||||
func containsVoiceAttachment(attachments []channel.Attachment) bool {
|
||||
for _, att := range attachments {
|
||||
if att.Type == channel.AttachmentAudio || att.Type == channel.AttachmentVoice {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func formatVoiceTranscriptionUnavailableNotice(attachments []channel.Attachment) string {
|
||||
paths := make([]string, 0, len(attachments))
|
||||
for _, att := range attachments {
|
||||
if att.Type != channel.AttachmentAudio && att.Type != channel.AttachmentVoice {
|
||||
continue
|
||||
}
|
||||
if ref := strings.TrimSpace(att.URL); ref != "" {
|
||||
paths = append(paths, ref)
|
||||
}
|
||||
}
|
||||
if len(paths) == 0 {
|
||||
return "[User sent a voice message, but transcription is unavailable.]"
|
||||
}
|
||||
return "[User sent a voice message, but transcription is unavailable. Use transcribe_audio with one of these paths if needed: " + strings.Join(paths, ", ") + "]"
|
||||
}
|
||||
|
||||
func openInboundAttachmentURL(ctx context.Context, rawURL string) (inboundAttachmentPayload, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
|
||||
if err != nil {
|
||||
@@ -2227,9 +2090,6 @@ func mapChannelToChatAttachments(attachments []channel.Attachment) []conversatio
|
||||
}
|
||||
result := make([]conversation.ChatAttachment, 0, len(attachments))
|
||||
for _, att := range attachments {
|
||||
if att.Type == channel.AttachmentAudio || att.Type == channel.AttachmentVoice {
|
||||
continue
|
||||
}
|
||||
ca := conversation.ChatAttachment{
|
||||
Type: string(att.Type),
|
||||
PlatformKey: att.PlatformKey,
|
||||
@@ -2304,13 +2164,13 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
|
||||
outboundAssetRefs *[]conversation.OutboundAssetRef,
|
||||
assetMu *sync.Mutex,
|
||||
) {
|
||||
if p.speechService == nil || p.speechModelResolver == nil {
|
||||
if p.ttsService == nil || p.ttsModelResolver == nil {
|
||||
if p.logger != nil {
|
||||
p.logger.Warn("speech_delta received but TTS service not configured")
|
||||
}
|
||||
return
|
||||
}
|
||||
modelID, err := p.speechModelResolver.ResolveSpeechModelID(ctx, botID)
|
||||
modelID, err := p.ttsModelResolver.ResolveTtsModelID(ctx, botID)
|
||||
if err != nil || strings.TrimSpace(modelID) == "" {
|
||||
if p.logger != nil {
|
||||
p.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
|
||||
@@ -2322,7 +2182,7 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
|
||||
if text == "" {
|
||||
continue
|
||||
}
|
||||
audioData, contentType, synthErr := p.speechService.Synthesize(ctx, modelID, text, nil)
|
||||
audioData, contentType, synthErr := p.ttsService.Synthesize(ctx, modelID, text, nil)
|
||||
if synthErr != nil {
|
||||
if p.logger != nil {
|
||||
p.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
|
||||
|
||||
@@ -511,7 +511,7 @@ WITH updated AS (
|
||||
SET display_name = $1,
|
||||
updated_at = now()
|
||||
WHERE bots.id = $2
|
||||
RETURNING id, owner_user_id, display_name, avatar_url, timezone, is_active, status, language, reasoning_enabled, reasoning_effort, chat_model_id, search_provider_id, memory_provider_id, heartbeat_enabled, heartbeat_interval, heartbeat_prompt, heartbeat_model_id, compaction_enabled, compaction_threshold, compaction_ratio, compaction_model_id, title_model_id, image_model_id, discuss_probe_model_id, tts_model_id, transcription_model_id, browser_context_id, persist_full_tool_results, metadata, created_at, updated_at, acl_default_effect
|
||||
RETURNING id, owner_user_id, display_name, avatar_url, timezone, is_active, status, language, reasoning_enabled, reasoning_effort, chat_model_id, search_provider_id, memory_provider_id, heartbeat_enabled, heartbeat_interval, heartbeat_prompt, heartbeat_model_id, compaction_enabled, compaction_threshold, compaction_ratio, compaction_model_id, title_model_id, image_model_id, discuss_probe_model_id, tts_model_id, browser_context_id, persist_full_tool_results, metadata, created_at, updated_at, acl_default_effect
|
||||
)
|
||||
SELECT
|
||||
updated.id AS id,
|
||||
|
||||
@@ -34,7 +34,6 @@ type Bot struct {
|
||||
ImageModelID pgtype.UUID `json:"image_model_id"`
|
||||
DiscussProbeModelID pgtype.UUID `json:"discuss_probe_model_id"`
|
||||
TtsModelID pgtype.UUID `json:"tts_model_id"`
|
||||
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
|
||||
BrowserContextID pgtype.UUID `json:"browser_context_id"`
|
||||
PersistFullToolResults bool `json:"persist_full_tool_results"`
|
||||
Metadata []byte `json:"metadata"`
|
||||
|
||||
+11
-225
@@ -13,7 +13,7 @@ import (
|
||||
|
||||
const countModels = `-- name: CountModels :one
|
||||
SELECT COUNT(*) FROM models
|
||||
WHERE type NOT IN ('speech', 'transcription')
|
||||
WHERE type != 'speech'
|
||||
`
|
||||
|
||||
func (q *Queries) CountModels(ctx context.Context) (int64, error) {
|
||||
@@ -40,19 +40,13 @@ FROM providers
|
||||
WHERE client_type NOT IN (
|
||||
'edge-speech',
|
||||
'openai-speech',
|
||||
'openai-transcription',
|
||||
'openrouter-speech',
|
||||
'openrouter-transcription',
|
||||
'elevenlabs-speech',
|
||||
'elevenlabs-transcription',
|
||||
'deepgram-speech',
|
||||
'deepgram-transcription',
|
||||
'minimax-speech',
|
||||
'volcengine-speech',
|
||||
'alibabacloud-speech',
|
||||
'microsoft-speech',
|
||||
'google-speech',
|
||||
'google-transcription'
|
||||
'microsoft-speech'
|
||||
)
|
||||
`
|
||||
|
||||
@@ -207,24 +201,6 @@ func (q *Queries) DeleteModelByModelID(ctx context.Context, modelID string) erro
|
||||
return err
|
||||
}
|
||||
|
||||
const deleteModelByProviderAndType = `-- name: DeleteModelByProviderAndType :exec
|
||||
DELETE FROM models
|
||||
WHERE provider_id = $1
|
||||
AND model_id = $2
|
||||
AND type = $3
|
||||
`
|
||||
|
||||
type DeleteModelByProviderAndTypeParams struct {
|
||||
ProviderID pgtype.UUID `json:"provider_id"`
|
||||
ModelID string `json:"model_id"`
|
||||
Type string `json:"type"`
|
||||
}
|
||||
|
||||
func (q *Queries) DeleteModelByProviderAndType(ctx context.Context, arg DeleteModelByProviderAndTypeParams) error {
|
||||
_, err := q.db.Exec(ctx, deleteModelByProviderAndType, arg.ProviderID, arg.ModelID, arg.Type)
|
||||
return err
|
||||
}
|
||||
|
||||
const deleteModelByProviderIDAndModelID = `-- name: DeleteModelByProviderIDAndModelID :exec
|
||||
DELETE FROM models
|
||||
WHERE provider_id = $1
|
||||
@@ -318,27 +294,6 @@ func (q *Queries) GetModelByProviderAndModelID(ctx context.Context, arg GetModel
|
||||
return i, err
|
||||
}
|
||||
|
||||
const getProviderByClientType = `-- name: GetProviderByClientType :one
|
||||
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE client_type = $1
|
||||
`
|
||||
|
||||
func (q *Queries) GetProviderByClientType(ctx context.Context, clientType string) (Provider, error) {
|
||||
row := q.db.QueryRow(ctx, getProviderByClientType, clientType)
|
||||
var i Provider
|
||||
err := row.Scan(
|
||||
&i.ID,
|
||||
&i.Name,
|
||||
&i.ClientType,
|
||||
&i.Icon,
|
||||
&i.Enable,
|
||||
&i.Config,
|
||||
&i.Metadata,
|
||||
&i.CreatedAt,
|
||||
&i.UpdatedAt,
|
||||
)
|
||||
return i, err
|
||||
}
|
||||
|
||||
const getProviderByID = `-- name: GetProviderByID :one
|
||||
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE id = $1
|
||||
`
|
||||
@@ -420,51 +375,12 @@ func (q *Queries) GetSpeechModelWithProvider(ctx context.Context, id pgtype.UUID
|
||||
return i, err
|
||||
}
|
||||
|
||||
const getTranscriptionModelWithProvider = `-- name: GetTranscriptionModelWithProvider :one
|
||||
SELECT
|
||||
m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at,
|
||||
p.client_type AS provider_type
|
||||
FROM models m
|
||||
JOIN providers p ON p.id = m.provider_id
|
||||
WHERE m.id = $1
|
||||
AND m.type = 'transcription'
|
||||
`
|
||||
|
||||
type GetTranscriptionModelWithProviderRow struct {
|
||||
ID pgtype.UUID `json:"id"`
|
||||
ModelID string `json:"model_id"`
|
||||
Name pgtype.Text `json:"name"`
|
||||
ProviderID pgtype.UUID `json:"provider_id"`
|
||||
Type string `json:"type"`
|
||||
Config []byte `json:"config"`
|
||||
CreatedAt pgtype.Timestamptz `json:"created_at"`
|
||||
UpdatedAt pgtype.Timestamptz `json:"updated_at"`
|
||||
ProviderType string `json:"provider_type"`
|
||||
}
|
||||
|
||||
func (q *Queries) GetTranscriptionModelWithProvider(ctx context.Context, id pgtype.UUID) (GetTranscriptionModelWithProviderRow, error) {
|
||||
row := q.db.QueryRow(ctx, getTranscriptionModelWithProvider, id)
|
||||
var i GetTranscriptionModelWithProviderRow
|
||||
err := row.Scan(
|
||||
&i.ID,
|
||||
&i.ModelID,
|
||||
&i.Name,
|
||||
&i.ProviderID,
|
||||
&i.Type,
|
||||
&i.Config,
|
||||
&i.CreatedAt,
|
||||
&i.UpdatedAt,
|
||||
&i.ProviderType,
|
||||
)
|
||||
return i, err
|
||||
}
|
||||
|
||||
const listEnabledModels = `-- name: ListEnabledModels :many
|
||||
SELECT m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at
|
||||
FROM models m
|
||||
JOIN providers p ON m.provider_id = p.id
|
||||
WHERE p.enable = true
|
||||
AND m.type NOT IN ('speech', 'transcription')
|
||||
AND m.type != 'speech'
|
||||
ORDER BY m.created_at DESC
|
||||
`
|
||||
|
||||
@@ -609,7 +525,7 @@ func (q *Queries) ListModelVariantsByModelUUID(ctx context.Context, modelUuid pg
|
||||
|
||||
const listModels = `-- name: ListModels :many
|
||||
SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
|
||||
WHERE type NOT IN ('speech', 'transcription')
|
||||
WHERE type != 'speech'
|
||||
ORDER BY created_at DESC
|
||||
`
|
||||
|
||||
@@ -717,7 +633,7 @@ func (q *Queries) ListModelsByProviderClientType(ctx context.Context, clientType
|
||||
const listModelsByProviderID = `-- name: ListModelsByProviderID :many
|
||||
SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
|
||||
WHERE provider_id = $1
|
||||
AND type NOT IN ('speech', 'transcription')
|
||||
AND type != 'speech'
|
||||
ORDER BY created_at DESC
|
||||
`
|
||||
|
||||
@@ -831,19 +747,13 @@ SELECT id, name, client_type, icon, enable, config, metadata, created_at, update
|
||||
WHERE client_type NOT IN (
|
||||
'edge-speech',
|
||||
'openai-speech',
|
||||
'openai-transcription',
|
||||
'openrouter-speech',
|
||||
'openrouter-transcription',
|
||||
'elevenlabs-speech',
|
||||
'elevenlabs-transcription',
|
||||
'deepgram-speech',
|
||||
'deepgram-transcription',
|
||||
'minimax-speech',
|
||||
'volcengine-speech',
|
||||
'alibabacloud-speech',
|
||||
'microsoft-speech',
|
||||
'google-speech',
|
||||
'google-transcription'
|
||||
'microsoft-speech'
|
||||
)
|
||||
ORDER BY created_at DESC
|
||||
`
|
||||
@@ -1011,135 +921,6 @@ func (q *Queries) ListSpeechProviders(ctx context.Context) ([]Provider, error) {
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const listTranscriptionModels = `-- name: ListTranscriptionModels :many
|
||||
SELECT m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at,
|
||||
p.client_type AS provider_type
|
||||
FROM models m
|
||||
JOIN providers p ON p.id = m.provider_id
|
||||
WHERE m.type = 'transcription'
|
||||
ORDER BY m.created_at DESC
|
||||
`
|
||||
|
||||
type ListTranscriptionModelsRow struct {
|
||||
ID pgtype.UUID `json:"id"`
|
||||
ModelID string `json:"model_id"`
|
||||
Name pgtype.Text `json:"name"`
|
||||
ProviderID pgtype.UUID `json:"provider_id"`
|
||||
Type string `json:"type"`
|
||||
Config []byte `json:"config"`
|
||||
CreatedAt pgtype.Timestamptz `json:"created_at"`
|
||||
UpdatedAt pgtype.Timestamptz `json:"updated_at"`
|
||||
ProviderType string `json:"provider_type"`
|
||||
}
|
||||
|
||||
func (q *Queries) ListTranscriptionModels(ctx context.Context) ([]ListTranscriptionModelsRow, error) {
|
||||
rows, err := q.db.Query(ctx, listTranscriptionModels)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []ListTranscriptionModelsRow
|
||||
for rows.Next() {
|
||||
var i ListTranscriptionModelsRow
|
||||
if err := rows.Scan(
|
||||
&i.ID,
|
||||
&i.ModelID,
|
||||
&i.Name,
|
||||
&i.ProviderID,
|
||||
&i.Type,
|
||||
&i.Config,
|
||||
&i.CreatedAt,
|
||||
&i.UpdatedAt,
|
||||
&i.ProviderType,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, i)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const listTranscriptionModelsByProviderID = `-- name: ListTranscriptionModelsByProviderID :many
|
||||
SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
|
||||
WHERE provider_id = $1
|
||||
AND type = 'transcription'
|
||||
ORDER BY created_at DESC
|
||||
`
|
||||
|
||||
func (q *Queries) ListTranscriptionModelsByProviderID(ctx context.Context, providerID pgtype.UUID) ([]Model, error) {
|
||||
rows, err := q.db.Query(ctx, listTranscriptionModelsByProviderID, providerID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []Model
|
||||
for rows.Next() {
|
||||
var i Model
|
||||
if err := rows.Scan(
|
||||
&i.ID,
|
||||
&i.ModelID,
|
||||
&i.Name,
|
||||
&i.ProviderID,
|
||||
&i.Type,
|
||||
&i.Config,
|
||||
&i.CreatedAt,
|
||||
&i.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, i)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const listTranscriptionProviders = `-- name: ListTranscriptionProviders :many
|
||||
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers
|
||||
WHERE client_type IN (
|
||||
'openai-transcription',
|
||||
'openrouter-transcription',
|
||||
'elevenlabs-transcription',
|
||||
'deepgram-transcription',
|
||||
'google-transcription'
|
||||
)
|
||||
ORDER BY created_at DESC
|
||||
`
|
||||
|
||||
func (q *Queries) ListTranscriptionProviders(ctx context.Context) ([]Provider, error) {
|
||||
rows, err := q.db.Query(ctx, listTranscriptionProviders)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []Provider
|
||||
for rows.Next() {
|
||||
var i Provider
|
||||
if err := rows.Scan(
|
||||
&i.ID,
|
||||
&i.Name,
|
||||
&i.ClientType,
|
||||
&i.Icon,
|
||||
&i.Enable,
|
||||
&i.Config,
|
||||
&i.Metadata,
|
||||
&i.CreatedAt,
|
||||
&i.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, i)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const updateModel = `-- name: UpdateModel :one
|
||||
UPDATE models
|
||||
SET
|
||||
@@ -1281,6 +1062,11 @@ VALUES ($1, $2, $3, false, $4, '{}')
|
||||
ON CONFLICT (name) DO UPDATE SET
|
||||
icon = EXCLUDED.icon,
|
||||
client_type = EXCLUDED.client_type,
|
||||
config = CASE
|
||||
WHEN providers.config->>'api_key' IS NOT NULL AND providers.config->>'api_key' != ''
|
||||
THEN jsonb_set(EXCLUDED.config, '{api_key}', providers.config->'api_key')
|
||||
ELSE EXCLUDED.config
|
||||
END,
|
||||
updated_at = now()
|
||||
RETURNING id, name, client_type, icon, enable, config, metadata, created_at, updated_at
|
||||
`
|
||||
|
||||
@@ -30,7 +30,6 @@ SET language = 'auto',
|
||||
search_provider_id = NULL,
|
||||
memory_provider_id = NULL,
|
||||
tts_model_id = NULL,
|
||||
transcription_model_id = NULL,
|
||||
browser_context_id = NULL,
|
||||
persist_full_tool_results = false,
|
||||
updated_at = now()
|
||||
@@ -63,7 +62,6 @@ SELECT
|
||||
memory_providers.id AS memory_provider_id,
|
||||
image_models.id AS image_model_id,
|
||||
tts_models.id AS tts_model_id,
|
||||
transcription_models.id AS transcription_model_id,
|
||||
browser_contexts.id AS browser_context_id,
|
||||
bots.persist_full_tool_results
|
||||
FROM bots
|
||||
@@ -75,7 +73,6 @@ LEFT JOIN models AS image_models ON image_models.id = bots.image_model_id
|
||||
LEFT JOIN search_providers ON search_providers.id = bots.search_provider_id
|
||||
LEFT JOIN memory_providers ON memory_providers.id = bots.memory_provider_id
|
||||
LEFT JOIN models AS tts_models ON tts_models.id = bots.tts_model_id
|
||||
LEFT JOIN models AS transcription_models ON transcription_models.id = bots.transcription_model_id
|
||||
LEFT JOIN browser_contexts ON browser_contexts.id = bots.browser_context_id
|
||||
WHERE bots.id = $1
|
||||
`
|
||||
@@ -100,7 +97,6 @@ type GetSettingsByBotIDRow struct {
|
||||
MemoryProviderID pgtype.UUID `json:"memory_provider_id"`
|
||||
ImageModelID pgtype.UUID `json:"image_model_id"`
|
||||
TtsModelID pgtype.UUID `json:"tts_model_id"`
|
||||
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
|
||||
BrowserContextID pgtype.UUID `json:"browser_context_id"`
|
||||
PersistFullToolResults bool `json:"persist_full_tool_results"`
|
||||
}
|
||||
@@ -128,7 +124,6 @@ func (q *Queries) GetSettingsByBotID(ctx context.Context, id pgtype.UUID) (GetSe
|
||||
&i.MemoryProviderID,
|
||||
&i.ImageModelID,
|
||||
&i.TtsModelID,
|
||||
&i.TranscriptionModelID,
|
||||
&i.BrowserContextID,
|
||||
&i.PersistFullToolResults,
|
||||
)
|
||||
@@ -156,12 +151,11 @@ WITH updated AS (
|
||||
memory_provider_id = COALESCE($16::uuid, bots.memory_provider_id),
|
||||
image_model_id = COALESCE($17::uuid, bots.image_model_id),
|
||||
tts_model_id = COALESCE($18::uuid, bots.tts_model_id),
|
||||
transcription_model_id = COALESCE($19::uuid, bots.transcription_model_id),
|
||||
browser_context_id = COALESCE($20::uuid, bots.browser_context_id),
|
||||
persist_full_tool_results = $21,
|
||||
browser_context_id = COALESCE($19::uuid, bots.browser_context_id),
|
||||
persist_full_tool_results = $20,
|
||||
updated_at = now()
|
||||
WHERE bots.id = $22
|
||||
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.transcription_model_id, bots.browser_context_id, bots.persist_full_tool_results
|
||||
WHERE bots.id = $21
|
||||
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.browser_context_id, bots.persist_full_tool_results
|
||||
)
|
||||
SELECT
|
||||
updated.id AS bot_id,
|
||||
@@ -183,7 +177,6 @@ SELECT
|
||||
memory_providers.id AS memory_provider_id,
|
||||
image_models.id AS image_model_id,
|
||||
tts_models.id AS tts_model_id,
|
||||
transcription_models.id AS transcription_model_id,
|
||||
browser_contexts.id AS browser_context_id,
|
||||
updated.persist_full_tool_results
|
||||
FROM updated
|
||||
@@ -195,7 +188,6 @@ LEFT JOIN models AS image_models ON image_models.id = updated.image_model_id
|
||||
LEFT JOIN search_providers ON search_providers.id = updated.search_provider_id
|
||||
LEFT JOIN memory_providers ON memory_providers.id = updated.memory_provider_id
|
||||
LEFT JOIN models AS tts_models ON tts_models.id = updated.tts_model_id
|
||||
LEFT JOIN models AS transcription_models ON transcription_models.id = updated.transcription_model_id
|
||||
LEFT JOIN browser_contexts ON browser_contexts.id = updated.browser_context_id
|
||||
`
|
||||
|
||||
@@ -218,7 +210,6 @@ type UpsertBotSettingsParams struct {
|
||||
MemoryProviderID pgtype.UUID `json:"memory_provider_id"`
|
||||
ImageModelID pgtype.UUID `json:"image_model_id"`
|
||||
TtsModelID pgtype.UUID `json:"tts_model_id"`
|
||||
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
|
||||
BrowserContextID pgtype.UUID `json:"browser_context_id"`
|
||||
PersistFullToolResults bool `json:"persist_full_tool_results"`
|
||||
ID pgtype.UUID `json:"id"`
|
||||
@@ -244,7 +235,6 @@ type UpsertBotSettingsRow struct {
|
||||
MemoryProviderID pgtype.UUID `json:"memory_provider_id"`
|
||||
ImageModelID pgtype.UUID `json:"image_model_id"`
|
||||
TtsModelID pgtype.UUID `json:"tts_model_id"`
|
||||
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
|
||||
BrowserContextID pgtype.UUID `json:"browser_context_id"`
|
||||
PersistFullToolResults bool `json:"persist_full_tool_results"`
|
||||
}
|
||||
@@ -269,7 +259,6 @@ func (q *Queries) UpsertBotSettings(ctx context.Context, arg UpsertBotSettingsPa
|
||||
arg.MemoryProviderID,
|
||||
arg.ImageModelID,
|
||||
arg.TtsModelID,
|
||||
arg.TranscriptionModelID,
|
||||
arg.BrowserContextID,
|
||||
arg.PersistFullToolResults,
|
||||
arg.ID,
|
||||
@@ -295,7 +284,6 @@ func (q *Queries) UpsertBotSettings(ctx context.Context, arg UpsertBotSettingsPa
|
||||
&i.MemoryProviderID,
|
||||
&i.ImageModelID,
|
||||
&i.TtsModelID,
|
||||
&i.TranscriptionModelID,
|
||||
&i.BrowserContextID,
|
||||
&i.PersistFullToolResults,
|
||||
)
|
||||
|
||||
@@ -7,28 +7,28 @@ import (
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/settings"
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
// BotAudioHandler handles per-bot speech synthesis requests from the agent tool.
|
||||
type BotAudioHandler struct {
|
||||
audioService *audiopkg.Service
|
||||
// BotTtsHandler handles per-bot TTS synthesis requests from the agent tool.
|
||||
type BotTtsHandler struct {
|
||||
ttsService *tts.Service
|
||||
settingsService *settings.Service
|
||||
tempStore *audiopkg.TempStore
|
||||
tempStore *tts.TempStore
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
func NewBotAudioHandler(log *slog.Logger, audioService *audiopkg.Service, settingsService *settings.Service, tempStore *audiopkg.TempStore) *BotAudioHandler {
|
||||
return &BotAudioHandler{
|
||||
audioService: audioService,
|
||||
func NewBotTtsHandler(log *slog.Logger, ttsService *tts.Service, settingsService *settings.Service, tempStore *tts.TempStore) *BotTtsHandler {
|
||||
return &BotTtsHandler{
|
||||
ttsService: ttsService,
|
||||
settingsService: settingsService,
|
||||
tempStore: tempStore,
|
||||
logger: log.With(slog.String("handler", "bot_audio")),
|
||||
logger: log.With(slog.String("handler", "bot_tts")),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *BotAudioHandler) Register(e *echo.Echo) {
|
||||
func (h *BotTtsHandler) Register(e *echo.Echo) {
|
||||
e.POST("/bots/:bot_id/tts/synthesize", h.Synthesize)
|
||||
}
|
||||
|
||||
@@ -54,7 +54,7 @@ type synthesizeResponse struct {
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /bots/{bot_id}/tts/synthesize [post].
|
||||
func (h *BotAudioHandler) Synthesize(c echo.Context) error {
|
||||
func (h *BotTtsHandler) Synthesize(c echo.Context) error {
|
||||
botID := strings.TrimSpace(c.Param("bot_id"))
|
||||
if botID == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "bot_id is required")
|
||||
@@ -88,10 +88,10 @@ func (h *BotAudioHandler) Synthesize(c echo.Context) error {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, "failed to create temp file")
|
||||
}
|
||||
|
||||
contentType, streamErr := h.audioService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
|
||||
contentType, streamErr := h.ttsService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
|
||||
closeErr := f.Close()
|
||||
if streamErr != nil {
|
||||
h.logger.Error("speech synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
|
||||
h.logger.Error("tts synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
|
||||
h.tempStore.Delete(tempID)
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, streamErr.Error())
|
||||
}
|
||||
|
||||
@@ -30,30 +30,30 @@ import (
|
||||
messagepkg "github.com/memohai/memoh/internal/message"
|
||||
)
|
||||
|
||||
// localSpeechSynthesizer synthesizes text to speech audio.
|
||||
type localSpeechSynthesizer interface {
|
||||
// localTtsSynthesizer synthesizes text to speech audio.
|
||||
type localTtsSynthesizer interface {
|
||||
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
|
||||
}
|
||||
|
||||
// localSpeechModelResolver resolves speech model IDs for bots.
|
||||
type localSpeechModelResolver interface {
|
||||
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
|
||||
// localTtsModelResolver resolves TTS model IDs for bots.
|
||||
type localTtsModelResolver interface {
|
||||
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
|
||||
}
|
||||
|
||||
// LocalChannelHandler handles local channel routes (WebUI / API) backed by bot history.
|
||||
type LocalChannelHandler struct {
|
||||
channelType channel.ChannelType
|
||||
channelManager *channel.Manager
|
||||
channelStore *channel.Store
|
||||
chatService *conversation.Service
|
||||
routeHub *local.RouteHub
|
||||
botService *bots.Service
|
||||
accountService *accounts.Service
|
||||
resolver *flow.Resolver
|
||||
mediaService *media.Service
|
||||
speechService localSpeechSynthesizer
|
||||
speechModelResolver localSpeechModelResolver
|
||||
logger *slog.Logger
|
||||
channelType channel.ChannelType
|
||||
channelManager *channel.Manager
|
||||
channelStore *channel.Store
|
||||
chatService *conversation.Service
|
||||
routeHub *local.RouteHub
|
||||
botService *bots.Service
|
||||
accountService *accounts.Service
|
||||
resolver *flow.Resolver
|
||||
mediaService *media.Service
|
||||
ttsService localTtsSynthesizer
|
||||
ttsModelResolver localTtsModelResolver
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// NewLocalChannelHandler creates a local channel handler.
|
||||
@@ -80,10 +80,10 @@ func (h *LocalChannelHandler) SetMediaService(svc *media.Service) {
|
||||
h.mediaService = svc
|
||||
}
|
||||
|
||||
// SetSpeechService configures speech synthesis for handling speech_delta events.
|
||||
func (h *LocalChannelHandler) SetSpeechService(synth localSpeechSynthesizer, resolver localSpeechModelResolver) {
|
||||
h.speechService = synth
|
||||
h.speechModelResolver = resolver
|
||||
// SetTtsService configures TTS synthesis for handling speech_delta events.
|
||||
func (h *LocalChannelHandler) SetTtsService(synth localTtsSynthesizer, resolver localTtsModelResolver) {
|
||||
h.ttsService = synth
|
||||
h.ttsModelResolver = resolver
|
||||
}
|
||||
|
||||
// Register registers the local channel routes.
|
||||
@@ -719,12 +719,12 @@ func (h *LocalChannelHandler) ingestSingleAttachment(ctx context.Context, botID,
|
||||
// wsSynthesizeSpeech handles speech_delta events by synthesizing audio and
|
||||
// injecting attachment_delta events with the resulting voice attachments.
|
||||
func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID string, original json.RawMessage) []json.RawMessage {
|
||||
if h.speechService == nil || h.speechModelResolver == nil {
|
||||
if h.ttsService == nil || h.ttsModelResolver == nil {
|
||||
h.logger.Warn("speech_delta received but TTS service not configured")
|
||||
return nil
|
||||
}
|
||||
|
||||
modelID, err := h.speechModelResolver.ResolveSpeechModelID(ctx, botID)
|
||||
modelID, err := h.ttsModelResolver.ResolveTtsModelID(ctx, botID)
|
||||
if err != nil || strings.TrimSpace(modelID) == "" {
|
||||
h.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
|
||||
return nil
|
||||
@@ -746,7 +746,7 @@ func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID stri
|
||||
continue
|
||||
}
|
||||
|
||||
audioData, contentType, synthErr := h.speechService.Synthesize(ctx, modelID, text, nil)
|
||||
audioData, contentType, synthErr := h.ttsService.Synthesize(ctx, modelID, text, nil)
|
||||
if synthErr != nil {
|
||||
h.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
|
||||
continue
|
||||
|
||||
@@ -1,83 +1,55 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/models"
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
type AudioHandler struct {
|
||||
service *audiopkg.Service
|
||||
type SpeechHandler struct {
|
||||
service *tts.Service
|
||||
modelsService *models.Service
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
func NewAudioHandler(log *slog.Logger, service *audiopkg.Service, modelsService *models.Service) *AudioHandler {
|
||||
return &AudioHandler{
|
||||
func NewSpeechHandler(log *slog.Logger, service *tts.Service, modelsService *models.Service) *SpeechHandler {
|
||||
return &SpeechHandler{
|
||||
service: service,
|
||||
modelsService: modelsService,
|
||||
logger: log.With(slog.String("handler", "audio")),
|
||||
logger: log.With(slog.String("handler", "speech")),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *AudioHandler) Register(e *echo.Echo) {
|
||||
func (h *SpeechHandler) Register(e *echo.Echo) {
|
||||
pg := e.Group("/speech-providers")
|
||||
pg.GET("", h.ListProviders)
|
||||
pg.GET("/:id", h.GetProvider)
|
||||
pg.GET("/meta", h.ListSpeechMeta)
|
||||
pg.GET("/meta", h.ListMeta)
|
||||
pg.GET("/:id/models", h.ListModelsByProvider)
|
||||
pg.POST("/:id/import-models", h.ImportModels)
|
||||
|
||||
tpg := e.Group("/transcription-providers")
|
||||
tpg.GET("", h.ListTranscriptionProviders)
|
||||
tpg.GET("/meta", h.ListTranscriptionMeta)
|
||||
tpg.GET("/:id", h.GetProvider)
|
||||
tpg.GET("/:id/models", h.ListTranscriptionModelsByProvider)
|
||||
tpg.POST("/:id/import-models", h.ImportTranscriptionModels)
|
||||
|
||||
mg := e.Group("/speech-models")
|
||||
mg.GET("", h.ListModels)
|
||||
mg.GET("/:id", h.GetModel)
|
||||
mg.PUT("/:id", h.UpdateModel)
|
||||
mg.GET("/:id/capabilities", h.GetModelCapabilities)
|
||||
mg.POST("/:id/test", h.TestModel)
|
||||
|
||||
tg := e.Group("/transcription-models")
|
||||
tg.GET("", h.ListTranscriptionModels)
|
||||
tg.GET("/:id", h.GetTranscriptionModel)
|
||||
tg.PUT("/:id", h.UpdateTranscriptionModel)
|
||||
tg.GET("/:id/capabilities", h.GetTranscriptionModelCapabilities)
|
||||
tg.POST("/:id/test", h.TestTranscriptionModel)
|
||||
}
|
||||
|
||||
// ListMeta godoc
|
||||
// @Summary List speech provider metadata
|
||||
// @Description List available speech provider types with their models and capabilities
|
||||
// @Tags speech-providers
|
||||
// @Success 200 {array} audiopkg.ProviderMetaResponse
|
||||
// @Success 200 {array} tts.ProviderMetaResponse
|
||||
// @Router /speech-providers/meta [get].
|
||||
func (h *AudioHandler) ListSpeechMeta(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, h.service.ListSpeechMeta(c.Request().Context()))
|
||||
}
|
||||
|
||||
// ListTranscriptionMeta godoc
|
||||
// @Summary List transcription provider metadata
|
||||
// @Description List available transcription provider types with their models and capabilities
|
||||
// @Tags transcription-providers
|
||||
// @Success 200 {array} audiopkg.ProviderMetaResponse
|
||||
// @Router /transcription-providers/meta [get].
|
||||
func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, h.service.ListTranscriptionMeta(c.Request().Context()))
|
||||
func (h *SpeechHandler) ListMeta(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, h.service.ListMeta(c.Request().Context()))
|
||||
}
|
||||
|
||||
// ListProviders godoc
|
||||
@@ -85,10 +57,10 @@ func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
|
||||
// @Description List providers that support speech (filtered view of unified providers table)
|
||||
// @Tags speech-providers
|
||||
// @Produce json
|
||||
// @Success 200 {array} audiopkg.SpeechProviderResponse
|
||||
// @Success 200 {array} tts.SpeechProviderResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-providers [get].
|
||||
func (h *AudioHandler) ListProviders(c echo.Context) error {
|
||||
func (h *SpeechHandler) ListProviders(c echo.Context) error {
|
||||
items, err := h.service.ListSpeechProviders(c.Request().Context())
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
@@ -96,34 +68,17 @@ func (h *AudioHandler) ListProviders(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, items)
|
||||
}
|
||||
|
||||
// ListTranscriptionProviders godoc
|
||||
// @Summary List transcription providers
|
||||
// @Description List providers that support transcription (filtered view of unified providers table)
|
||||
// @Tags transcription-providers
|
||||
// @Produce json
|
||||
// @Success 200 {array} audiopkg.SpeechProviderResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-providers [get].
|
||||
func (h *AudioHandler) ListTranscriptionProviders(c echo.Context) error {
|
||||
items, err := h.service.ListTranscriptionProviders(c.Request().Context())
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
}
|
||||
return c.JSON(http.StatusOK, items)
|
||||
}
|
||||
|
||||
// GetProvider godoc
|
||||
// @Summary Get speech provider
|
||||
// @Description Get a speech provider with masked config values
|
||||
// @Tags speech-providers
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {object} audiopkg.SpeechProviderResponse
|
||||
// @Success 200 {object} tts.SpeechProviderResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /speech-providers/{id} [get].
|
||||
// @Router /transcription-providers/{id} [get].
|
||||
func (h *AudioHandler) GetProvider(c echo.Context) error {
|
||||
func (h *SpeechHandler) GetProvider(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -141,11 +96,11 @@ func (h *AudioHandler) GetProvider(c echo.Context) error {
|
||||
// @Tags speech-providers
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {array} audiopkg.SpeechModelResponse
|
||||
// @Success 200 {array} tts.SpeechModelResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-providers/{id}/models [get].
|
||||
func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
|
||||
func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -164,12 +119,12 @@ func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {object} audiopkg.ImportModelsResponse
|
||||
// @Success 200 {object} tts.ImportModelsResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-providers/{id}/import-models [post].
|
||||
func (h *AudioHandler) ImportModels(c echo.Context) error {
|
||||
func (h *SpeechHandler) ImportModels(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -180,7 +135,7 @@ func (h *AudioHandler) ImportModels(c echo.Context) error {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote speech models: %v", err))
|
||||
}
|
||||
|
||||
resp := audiopkg.ImportModelsResponse{
|
||||
resp := tts.ImportModelsResponse{
|
||||
Models: make([]string, 0, len(remoteModels)),
|
||||
}
|
||||
|
||||
@@ -212,92 +167,15 @@ func (h *AudioHandler) ImportModels(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// ListTranscriptionModelsByProvider godoc
|
||||
// @Summary List transcription models by provider
|
||||
// @Description List models of type 'transcription' for a specific transcription provider
|
||||
// @Tags transcription-providers
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {array} audiopkg.TranscriptionModelResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-providers/{id}/models [get].
|
||||
func (h *AudioHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
items, err := h.service.ListTranscriptionModelsByProvider(c.Request().Context(), id)
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
}
|
||||
return c.JSON(http.StatusOK, items)
|
||||
}
|
||||
|
||||
// ImportTranscriptionModels godoc
|
||||
// @Summary Import transcription models from provider
|
||||
// @Description Fetch models using the configured transcription provider and import them into the unified models table
|
||||
// @Tags transcription-providers
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {object} audiopkg.ImportModelsResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-providers/{id}/import-models [post].
|
||||
func (h *AudioHandler) ImportTranscriptionModels(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
|
||||
remoteModels, err := h.service.FetchRemoteTranscriptionModels(c.Request().Context(), id)
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote transcription models: %v", err))
|
||||
}
|
||||
|
||||
resp := audiopkg.ImportModelsResponse{
|
||||
Models: make([]string, 0, len(remoteModels)),
|
||||
}
|
||||
|
||||
for _, model := range remoteModels {
|
||||
name := strings.TrimSpace(model.Name)
|
||||
if name == "" {
|
||||
name = model.ID
|
||||
}
|
||||
|
||||
_, err := h.modelsService.Create(c.Request().Context(), models.AddRequest{
|
||||
ModelID: model.ID,
|
||||
Name: name,
|
||||
ProviderID: id,
|
||||
Type: models.ModelTypeTranscription,
|
||||
Config: models.ModelConfig{},
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, models.ErrModelIDAlreadyExists) {
|
||||
resp.Skipped++
|
||||
continue
|
||||
}
|
||||
h.logger.Warn("failed to import transcription model", slog.String("model_id", model.ID), slog.Any("error", err))
|
||||
continue
|
||||
}
|
||||
resp.Created++
|
||||
resp.Models = append(resp.Models, model.ID)
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// ListModels godoc
|
||||
// @Summary List all speech models
|
||||
// @Description List all models of type 'speech' (filtered view of unified models table)
|
||||
// @Tags speech-models
|
||||
// @Produce json
|
||||
// @Success 200 {array} audiopkg.SpeechModelResponse
|
||||
// @Success 200 {array} tts.SpeechModelResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-models [get].
|
||||
func (h *AudioHandler) ListModels(c echo.Context) error {
|
||||
func (h *SpeechHandler) ListModels(c echo.Context) error {
|
||||
items, err := h.service.ListSpeechModels(c.Request().Context())
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
@@ -305,31 +183,15 @@ func (h *AudioHandler) ListModels(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, items)
|
||||
}
|
||||
|
||||
// ListTranscriptionModels godoc
|
||||
// @Summary List all transcription models
|
||||
// @Description List all models of type 'transcription' (filtered view of unified models table)
|
||||
// @Tags transcription-models
|
||||
// @Produce json
|
||||
// @Success 200 {array} audiopkg.TranscriptionModelResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-models [get].
|
||||
func (h *AudioHandler) ListTranscriptionModels(c echo.Context) error {
|
||||
items, err := h.service.ListTranscriptionModels(c.Request().Context())
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
}
|
||||
return c.JSON(http.StatusOK, items)
|
||||
}
|
||||
|
||||
// GetModel godoc
|
||||
// @Summary Get a speech model
|
||||
// @Tags speech-models
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Success 200 {object} audiopkg.SpeechModelResponse
|
||||
// @Success 200 {object} tts.SpeechModelResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /speech-models/{id} [get].
|
||||
func (h *AudioHandler) GetModel(c echo.Context) error {
|
||||
func (h *SpeechHandler) GetModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -341,89 +203,15 @@ func (h *AudioHandler) GetModel(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// UpdateModel godoc
|
||||
// @Summary Update a speech model
|
||||
// @Tags speech-models
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
|
||||
// @Success 200 {object} audiopkg.SpeechModelResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-models/{id} [put].
|
||||
func (h *AudioHandler) UpdateModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
var req audiopkg.UpdateSpeechModelRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
|
||||
}
|
||||
resp, err := h.service.UpdateSpeechModel(c.Request().Context(), id, req)
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
}
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// GetTranscriptionModel godoc
|
||||
// @Summary Get a transcription model
|
||||
// @Tags transcription-models
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Success 200 {object} audiopkg.TranscriptionModelResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /transcription-models/{id} [get].
|
||||
func (h *AudioHandler) GetTranscriptionModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
resp, err := h.service.GetTranscriptionModel(c.Request().Context(), id)
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusNotFound, err.Error())
|
||||
}
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// UpdateTranscriptionModel godoc
|
||||
// @Summary Update a transcription model
|
||||
// @Tags transcription-models
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
|
||||
// @Success 200 {object} audiopkg.TranscriptionModelResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-models/{id} [put].
|
||||
func (h *AudioHandler) UpdateTranscriptionModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
var req audiopkg.UpdateSpeechModelRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
|
||||
}
|
||||
resp, err := h.service.UpdateTranscriptionModel(c.Request().Context(), id, req)
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
}
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// GetModelCapabilities godoc
|
||||
// @Summary Get speech model capabilities
|
||||
// @Tags speech-models
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Success 200 {object} audiopkg.ModelCapabilities
|
||||
// @Success 200 {object} tts.ModelCapabilities
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /speech-models/{id}/capabilities [get].
|
||||
func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
|
||||
func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -435,26 +223,6 @@ func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, caps)
|
||||
}
|
||||
|
||||
// GetTranscriptionModelCapabilities godoc
|
||||
// @Summary Get transcription model capabilities
|
||||
// @Tags transcription-models
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Success 200 {object} audiopkg.ModelCapabilities
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /transcription-models/{id}/capabilities [get].
|
||||
func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
caps, err := h.service.GetTranscriptionModelCapabilities(c.Request().Context(), id)
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusNotFound, err.Error())
|
||||
}
|
||||
return c.JSON(http.StatusOK, caps)
|
||||
}
|
||||
|
||||
// TestModel godoc
|
||||
// @Summary Test speech model synthesis
|
||||
// @Description Synthesize text using a specific model's config and return audio
|
||||
@@ -462,17 +230,17 @@ func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
|
||||
// @Accept json
|
||||
// @Produce application/octet-stream
|
||||
// @Param id path string true "Model ID"
|
||||
// @Param request body audiopkg.TestSynthesizeRequest true "Text to synthesize"
|
||||
// @Param request body tts.TestSynthesizeRequest true "Text to synthesize"
|
||||
// @Success 200 {file} binary "Audio data"
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-models/{id}/test [post].
|
||||
func (h *AudioHandler) TestModel(c echo.Context) error {
|
||||
func (h *SpeechHandler) TestModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
var req audiopkg.TestSynthesizeRequest
|
||||
var req tts.TestSynthesizeRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
|
||||
}
|
||||
@@ -490,69 +258,3 @@ func (h *AudioHandler) TestModel(c echo.Context) error {
|
||||
}
|
||||
return c.Blob(http.StatusOK, contentType, audio)
|
||||
}
|
||||
|
||||
// TestTranscriptionModel godoc
|
||||
// @Summary Test transcription model recognition
|
||||
// @Description Transcribe uploaded audio using a specific model's config and return structured text output
|
||||
// @Tags transcription-models
|
||||
// @Accept mpfd
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Param file formData file true "Audio file"
|
||||
// @Param config formData string false "Optional JSON config"
|
||||
// @Success 200 {object} audiopkg.TestTranscriptionResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-models/{id}/test [post].
|
||||
func (h *AudioHandler) TestTranscriptionModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
file, err := c.FormFile("file")
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "file is required")
|
||||
}
|
||||
src, err := file.Open()
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
|
||||
}
|
||||
defer func(src multipart.File) {
|
||||
err := src.Close()
|
||||
if err != nil {
|
||||
h.logger.Warn("failed to close uploaded file", slog.Any("error", err))
|
||||
}
|
||||
}(src)
|
||||
audio, err := io.ReadAll(src)
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
|
||||
}
|
||||
var cfg map[string]any
|
||||
if raw := strings.TrimSpace(c.FormValue("config")); raw != "" {
|
||||
if err := json.Unmarshal([]byte(raw), &cfg); err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "invalid config")
|
||||
}
|
||||
}
|
||||
result, err := h.service.Transcribe(c.Request().Context(), id, audio, file.Filename, file.Header.Get("Content-Type"), cfg)
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
}
|
||||
resp := audiopkg.TestTranscriptionResponse{
|
||||
Text: result.Text,
|
||||
Language: result.Language,
|
||||
DurationSeconds: result.DurationSeconds,
|
||||
Metadata: result.ProviderMetadata,
|
||||
}
|
||||
if len(result.Words) > 0 {
|
||||
resp.Words = make([]audiopkg.TranscriptionWord, 0, len(result.Words))
|
||||
for _, word := range result.Words {
|
||||
resp.Words = append(resp.Words, audiopkg.TranscriptionWord{
|
||||
Text: word.Text,
|
||||
Start: word.Start,
|
||||
End: word.End,
|
||||
SpeakerID: word.SpeakerID,
|
||||
})
|
||||
}
|
||||
}
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
@@ -126,9 +126,9 @@ func (s *Service) List(ctx context.Context) ([]GetResponse, error) {
|
||||
return s.convertToGetResponseList(dbModels), nil
|
||||
}
|
||||
|
||||
// ListByType returns models filtered by type.
|
||||
// ListByType returns models filtered by type (chat, embedding, or speech).
|
||||
func (s *Service) ListByType(ctx context.Context, modelType ModelType) ([]GetResponse, error) {
|
||||
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
|
||||
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
|
||||
return nil, fmt.Errorf("invalid model type: %s", modelType)
|
||||
}
|
||||
|
||||
@@ -165,7 +165,7 @@ func (s *Service) ListEnabled(ctx context.Context) ([]GetResponse, error) {
|
||||
|
||||
// ListEnabledByType returns models from enabled providers filtered by type.
|
||||
func (s *Service) ListEnabledByType(ctx context.Context, modelType ModelType) ([]GetResponse, error) {
|
||||
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
|
||||
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
|
||||
return nil, fmt.Errorf("invalid model type: %s", modelType)
|
||||
}
|
||||
dbModels, err := s.queries.ListEnabledModelsByType(ctx, string(modelType))
|
||||
@@ -206,7 +206,7 @@ func (s *Service) ListByProviderID(ctx context.Context, providerID string) ([]Ge
|
||||
|
||||
// ListByProviderIDAndType returns models filtered by provider ID and type.
|
||||
func (s *Service) ListByProviderIDAndType(ctx context.Context, providerID string, modelType ModelType) ([]GetResponse, error) {
|
||||
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
|
||||
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
|
||||
return nil, fmt.Errorf("invalid model type: %s", modelType)
|
||||
}
|
||||
if strings.TrimSpace(providerID) == "" {
|
||||
@@ -361,7 +361,7 @@ func (s *Service) Count(ctx context.Context) (int64, error) {
|
||||
|
||||
// CountByType returns the number of models of a specific type.
|
||||
func (s *Service) CountByType(ctx context.Context, modelType ModelType) (int64, error) {
|
||||
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
|
||||
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
|
||||
return 0, fmt.Errorf("invalid model type: %s", modelType)
|
||||
}
|
||||
|
||||
@@ -432,19 +432,13 @@ func IsValidClientType(clientType ClientType) bool {
|
||||
ClientTypeGitHubCopilot,
|
||||
ClientTypeEdgeSpeech,
|
||||
ClientTypeOpenAISpeech,
|
||||
ClientTypeOpenAITranscription,
|
||||
ClientTypeOpenRouterSpeech,
|
||||
ClientTypeOpenRouterTranscription,
|
||||
ClientTypeElevenLabsSpeech,
|
||||
ClientTypeElevenLabsTranscription,
|
||||
ClientTypeDeepgramSpeech,
|
||||
ClientTypeDeepgramTranscription,
|
||||
ClientTypeMiniMaxSpeech,
|
||||
ClientTypeVolcengineSpeech,
|
||||
ClientTypeAlibabaSpeech,
|
||||
ClientTypeMicrosoftSpeech,
|
||||
ClientTypeGoogleSpeech,
|
||||
ClientTypeGoogleTranscription:
|
||||
ClientTypeMicrosoftSpeech:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
@@ -454,9 +448,7 @@ func IsValidClientType(clientType ClientType) bool {
|
||||
// IsLLMClientType returns true if the client type belongs to the LLM domain
|
||||
// (chat/embedding), excluding speech-only types (any type ending in "-speech").
|
||||
func IsLLMClientType(clientType ClientType) bool {
|
||||
return IsValidClientType(clientType) &&
|
||||
!strings.HasSuffix(string(clientType), "-speech") &&
|
||||
!strings.HasSuffix(string(clientType), "-transcription")
|
||||
return IsValidClientType(clientType) && !strings.HasSuffix(string(clientType), "-speech")
|
||||
}
|
||||
|
||||
// SelectMemoryModel selects a chat model for memory operations.
|
||||
|
||||
+19
-26
@@ -9,36 +9,29 @@ import (
|
||||
type ModelType string
|
||||
|
||||
const (
|
||||
ModelTypeChat ModelType = "chat"
|
||||
ModelTypeEmbedding ModelType = "embedding"
|
||||
ModelTypeSpeech ModelType = "speech"
|
||||
ModelTypeTranscription ModelType = "transcription"
|
||||
ModelTypeChat ModelType = "chat"
|
||||
ModelTypeEmbedding ModelType = "embedding"
|
||||
ModelTypeSpeech ModelType = "speech"
|
||||
)
|
||||
|
||||
type ClientType string
|
||||
|
||||
const (
|
||||
ClientTypeOpenAIResponses ClientType = "openai-responses"
|
||||
ClientTypeOpenAICompletions ClientType = "openai-completions"
|
||||
ClientTypeAnthropicMessages ClientType = "anthropic-messages"
|
||||
ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai"
|
||||
ClientTypeOpenAICodex ClientType = "openai-codex"
|
||||
ClientTypeGitHubCopilot ClientType = "github-copilot"
|
||||
ClientTypeEdgeSpeech ClientType = "edge-speech"
|
||||
ClientTypeOpenAISpeech ClientType = "openai-speech"
|
||||
ClientTypeOpenAITranscription ClientType = "openai-transcription"
|
||||
ClientTypeOpenRouterSpeech ClientType = "openrouter-speech"
|
||||
ClientTypeOpenRouterTranscription ClientType = "openrouter-transcription"
|
||||
ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech"
|
||||
ClientTypeElevenLabsTranscription ClientType = "elevenlabs-transcription"
|
||||
ClientTypeDeepgramSpeech ClientType = "deepgram-speech"
|
||||
ClientTypeDeepgramTranscription ClientType = "deepgram-transcription"
|
||||
ClientTypeMiniMaxSpeech ClientType = "minimax-speech"
|
||||
ClientTypeVolcengineSpeech ClientType = "volcengine-speech"
|
||||
ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech"
|
||||
ClientTypeMicrosoftSpeech ClientType = "microsoft-speech"
|
||||
ClientTypeGoogleSpeech ClientType = "google-speech"
|
||||
ClientTypeGoogleTranscription ClientType = "google-transcription"
|
||||
ClientTypeOpenAIResponses ClientType = "openai-responses"
|
||||
ClientTypeOpenAICompletions ClientType = "openai-completions"
|
||||
ClientTypeAnthropicMessages ClientType = "anthropic-messages"
|
||||
ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai"
|
||||
ClientTypeOpenAICodex ClientType = "openai-codex"
|
||||
ClientTypeGitHubCopilot ClientType = "github-copilot"
|
||||
ClientTypeEdgeSpeech ClientType = "edge-speech"
|
||||
ClientTypeOpenAISpeech ClientType = "openai-speech"
|
||||
ClientTypeOpenRouterSpeech ClientType = "openrouter-speech"
|
||||
ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech"
|
||||
ClientTypeDeepgramSpeech ClientType = "deepgram-speech"
|
||||
ClientTypeMiniMaxSpeech ClientType = "minimax-speech"
|
||||
ClientTypeVolcengineSpeech ClientType = "volcengine-speech"
|
||||
ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech"
|
||||
ClientTypeMicrosoftSpeech ClientType = "microsoft-speech"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -95,7 +88,7 @@ func (m *Model) Validate() error {
|
||||
if _, err := uuid.Parse(m.ProviderID); err != nil {
|
||||
return errors.New("provider ID must be a valid UUID")
|
||||
}
|
||||
if m.Type != ModelTypeChat && m.Type != ModelTypeEmbedding && m.Type != ModelTypeSpeech && m.Type != ModelTypeTranscription {
|
||||
if m.Type != ModelTypeChat && m.Type != ModelTypeEmbedding && m.Type != ModelTypeSpeech {
|
||||
return errors.New("invalid model type")
|
||||
}
|
||||
if m.Type == ModelTypeEmbedding {
|
||||
|
||||
@@ -175,14 +175,6 @@ func (s *Service) UpsertBot(ctx context.Context, botID string, req UpsertRequest
|
||||
}
|
||||
ttsModelUUID = modelID
|
||||
}
|
||||
transcriptionModelUUID := pgtype.UUID{}
|
||||
if value := strings.TrimSpace(req.TranscriptionModelID); value != "" {
|
||||
modelID, err := db.ParseUUID(value)
|
||||
if err != nil {
|
||||
return Settings{}, err
|
||||
}
|
||||
transcriptionModelUUID = modelID
|
||||
}
|
||||
browserContextUUID := pgtype.UUID{}
|
||||
if value := strings.TrimSpace(req.BrowserContextID); value != "" {
|
||||
ctxID, err := db.ParseUUID(value)
|
||||
@@ -212,7 +204,6 @@ func (s *Service) UpsertBot(ctx context.Context, botID string, req UpsertRequest
|
||||
SearchProviderID: searchProviderUUID,
|
||||
MemoryProviderID: memoryProviderUUID,
|
||||
TtsModelID: ttsModelUUID,
|
||||
TranscriptionModelID: transcriptionModelUUID,
|
||||
BrowserContextID: browserContextUUID,
|
||||
PersistFullToolResults: current.PersistFullToolResults,
|
||||
})
|
||||
@@ -307,7 +298,6 @@ func normalizeBotSettingsReadRow(row sqlc.GetSettingsByBotIDRow) Settings {
|
||||
row.SearchProviderID,
|
||||
row.MemoryProviderID,
|
||||
row.TtsModelID,
|
||||
row.TranscriptionModelID,
|
||||
row.BrowserContextID,
|
||||
row.PersistFullToolResults,
|
||||
)
|
||||
@@ -332,7 +322,6 @@ func normalizeBotSettingsWriteRow(row sqlc.UpsertBotSettingsRow) Settings {
|
||||
row.SearchProviderID,
|
||||
row.MemoryProviderID,
|
||||
row.TtsModelID,
|
||||
row.TranscriptionModelID,
|
||||
row.BrowserContextID,
|
||||
row.PersistFullToolResults,
|
||||
)
|
||||
@@ -356,7 +345,6 @@ func normalizeBotSettingsFields(
|
||||
searchProviderID pgtype.UUID,
|
||||
memoryProviderID pgtype.UUID,
|
||||
ttsModelID pgtype.UUID,
|
||||
transcriptionModelID pgtype.UUID,
|
||||
browserContextID pgtype.UUID,
|
||||
persistFullToolResults bool,
|
||||
) Settings {
|
||||
@@ -388,9 +376,6 @@ func normalizeBotSettingsFields(
|
||||
if ttsModelID.Valid {
|
||||
settings.TtsModelID = uuid.UUID(ttsModelID.Bytes).String()
|
||||
}
|
||||
if transcriptionModelID.Valid {
|
||||
settings.TranscriptionModelID = uuid.UUID(transcriptionModelID.Bytes).String()
|
||||
}
|
||||
if browserContextID.Valid {
|
||||
settings.BrowserContextID = uuid.UUID(browserContextID.Bytes).String()
|
||||
}
|
||||
|
||||
@@ -12,7 +12,6 @@ type Settings struct {
|
||||
SearchProviderID string `json:"search_provider_id"`
|
||||
MemoryProviderID string `json:"memory_provider_id"`
|
||||
TtsModelID string `json:"tts_model_id"`
|
||||
TranscriptionModelID string `json:"transcription_model_id"`
|
||||
BrowserContextID string `json:"browser_context_id"`
|
||||
Language string `json:"language"`
|
||||
AclDefaultEffect string `json:"acl_default_effect"`
|
||||
@@ -37,7 +36,6 @@ type UpsertRequest struct {
|
||||
SearchProviderID string `json:"search_provider_id,omitempty"`
|
||||
MemoryProviderID string `json:"memory_provider_id,omitempty"`
|
||||
TtsModelID string `json:"tts_model_id,omitempty"`
|
||||
TranscriptionModelID string `json:"transcription_model_id,omitempty"`
|
||||
BrowserContextID string `json:"browser_context_id,omitempty"`
|
||||
Language string `json:"language,omitempty"`
|
||||
AclDefaultEffect string `json:"acl_default_effect,omitempty"`
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package audio
|
||||
package tts
|
||||
|
||||
import "context"
|
||||
|
||||
@@ -6,10 +6,10 @@ import (
|
||||
"log/slog"
|
||||
"strings"
|
||||
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
const TtsTypeEdge audio.TtsType = "edge"
|
||||
const TtsTypeEdge tts.TtsType = "edge"
|
||||
|
||||
const edgeModelReadAloud = "edge-read-aloud"
|
||||
|
||||
@@ -33,12 +33,12 @@ func NewEdgeAdapterWithClient(log *slog.Logger, client *EdgeWsClient) *EdgeAdapt
|
||||
}
|
||||
}
|
||||
|
||||
func (*EdgeAdapter) Type() audio.TtsType {
|
||||
func (*EdgeAdapter) Type() tts.TtsType {
|
||||
return TtsTypeEdge
|
||||
}
|
||||
|
||||
func (*EdgeAdapter) Meta() audio.TtsMeta {
|
||||
return audio.TtsMeta{
|
||||
func (*EdgeAdapter) Meta() tts.TtsMeta {
|
||||
return tts.TtsMeta{
|
||||
Provider: "Microsoft Edge",
|
||||
Description: "Microsoft Edge TTS",
|
||||
}
|
||||
@@ -54,32 +54,32 @@ var edgeFormats = []string{
|
||||
"webm-24khz-16bit-mono-opus",
|
||||
}
|
||||
|
||||
var edgeSpeedConstraint = &audio.ParamConstraint{
|
||||
var edgeSpeedConstraint = &tts.ParamConstraint{
|
||||
Options: []float64{0.5, 1.0, 2.0, 3.0},
|
||||
Default: 1.0,
|
||||
}
|
||||
|
||||
var edgePitchConstraint = &audio.ParamConstraint{
|
||||
var edgePitchConstraint = &tts.ParamConstraint{
|
||||
Min: -100,
|
||||
Max: 100,
|
||||
Default: 0,
|
||||
}
|
||||
|
||||
func (*EdgeAdapter) Models() []audio.ModelInfo {
|
||||
var voices []audio.VoiceInfo
|
||||
func (*EdgeAdapter) Models() []tts.ModelInfo {
|
||||
var voices []tts.VoiceInfo
|
||||
for lang, ids := range EdgeTTSVoices {
|
||||
for _, id := range ids {
|
||||
name := strings.TrimPrefix(id, lang+"-")
|
||||
name = strings.TrimSuffix(name, "Neural")
|
||||
voices = append(voices, audio.VoiceInfo{ID: id, Lang: lang, Name: name})
|
||||
voices = append(voices, tts.VoiceInfo{ID: id, Lang: lang, Name: name})
|
||||
}
|
||||
}
|
||||
return []audio.ModelInfo{
|
||||
return []tts.ModelInfo{
|
||||
{
|
||||
ID: edgeModelReadAloud,
|
||||
Name: "Edge Read Aloud",
|
||||
Description: "Built-in Edge Read Aloud speech model",
|
||||
Capabilities: audio.ModelCapabilities{
|
||||
Capabilities: tts.ModelCapabilities{
|
||||
Voices: voices,
|
||||
Formats: edgeFormats,
|
||||
Speed: edgeSpeedConstraint,
|
||||
@@ -100,14 +100,14 @@ func (*EdgeAdapter) ResolveModel(model string) (string, error) {
|
||||
return edgeModelReadAloud, nil
|
||||
}
|
||||
|
||||
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config audio.AudioConfig) ([]byte, error) {
|
||||
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config tts.AudioConfig) ([]byte, error) {
|
||||
if err := config.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("edge tts: invalid config: %w", err)
|
||||
}
|
||||
return a.client.Synthesize(ctx, text, config)
|
||||
}
|
||||
|
||||
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config audio.AudioConfig) (chan []byte, chan error) {
|
||||
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config tts.AudioConfig) (chan []byte, chan error) {
|
||||
if err := config.Validate(); err != nil {
|
||||
errCh := make(chan error, 1)
|
||||
errCh <- fmt.Errorf("edge tts: invalid config: %w", err)
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
func TestEdgeAdapter_TypeAndMeta(t *testing.T) {
|
||||
@@ -37,7 +37,7 @@ func TestEdgeAdapter_Synthesize_WithMockServer(t *testing.T) {
|
||||
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
|
||||
|
||||
ctx := context.Background()
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
audio, err := adapter.Synthesize(ctx, "Hello", edgeModelReadAloud, config)
|
||||
if err != nil {
|
||||
t.Fatalf("Synthesize: %v", err)
|
||||
@@ -61,7 +61,7 @@ func TestEdgeAdapter_Stream_WithMockServer(t *testing.T) {
|
||||
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
|
||||
|
||||
ctx := context.Background()
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
ch, errCh := adapter.Stream(ctx, "Hi", edgeModelReadAloud, config)
|
||||
var chunks [][]byte
|
||||
for b := range ch {
|
||||
@@ -86,7 +86,7 @@ func TestEdgeAdapter_Synthesize_NotConnected(t *testing.T) {
|
||||
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
|
||||
|
||||
ctx := context.Background()
|
||||
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, audio.AudioConfig{})
|
||||
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, tts.AudioConfig{})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when connection fails")
|
||||
}
|
||||
@@ -20,7 +20,7 @@ import (
|
||||
"github.com/google/uuid"
|
||||
"github.com/gorilla/websocket"
|
||||
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
// Edge TTS WebSocket client.
|
||||
@@ -184,7 +184,7 @@ func (c *EdgeWsClient) sendFrame(path, contentType, body string, extraHeaders ma
|
||||
}
|
||||
|
||||
// Configure sends the speech.config message (output format, etc.).
|
||||
func (c *EdgeWsClient) Configure(ctx context.Context, config audio.AudioConfig) error {
|
||||
func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) error {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
if c.conn == nil {
|
||||
@@ -207,7 +207,7 @@ func (c *EdgeWsClient) Configure(ctx context.Context, config audio.AudioConfig)
|
||||
}
|
||||
|
||||
// buildSSML builds SSML with rate and pitch for Edge TTS prosody.
|
||||
func buildSSML(text string, voice audio.VoiceConfig, speed, pitch float64) string {
|
||||
func buildSSML(text string, voice tts.VoiceConfig, speed, pitch float64) string {
|
||||
voiceID := voice.ID
|
||||
if voiceID == "" {
|
||||
voiceID = DEFAULT_VOICE
|
||||
@@ -241,7 +241,7 @@ func escapeSSML(s string) string {
|
||||
|
||||
// Synthesize sends SSML and synchronously collects all audio data.
|
||||
// It handles the full lifecycle: connect → configure → send → receive → close.
|
||||
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config audio.AudioConfig) ([]byte, error) {
|
||||
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config tts.AudioConfig) ([]byte, error) {
|
||||
if err := c.Connect(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -338,7 +338,7 @@ func parseAudioChunk(data []byte) ([]byte, error) {
|
||||
|
||||
// Stream sends SSML and returns audio chunks via channel.
|
||||
// It handles the full lifecycle: connect → configure → send → stream → close.
|
||||
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config audio.AudioConfig) (ch chan []byte, errCh chan error) {
|
||||
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config tts.AudioConfig) (ch chan []byte, errCh chan error) {
|
||||
ch = make(chan []byte, 8)
|
||||
errCh = make(chan error, 1)
|
||||
go func() {
|
||||
+12
-12
@@ -9,7 +9,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
// Real Edge TTS integration tests. Not compiled by default (requires -tags=integration).
|
||||
@@ -17,14 +17,14 @@ import (
|
||||
//
|
||||
// Run:
|
||||
//
|
||||
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS -v
|
||||
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS -v
|
||||
|
||||
func TestRealEdgeTTS_Synthesize(t *testing.T) {
|
||||
client := NewEdgeWsClient()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
|
||||
audio, err := client.Synthesize(ctx, "Hello, this is a real Edge TTS test.", config)
|
||||
if err != nil {
|
||||
t.Fatalf("Synthesize: %v", err)
|
||||
@@ -40,7 +40,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
|
||||
ch, errCh := client.Stream(ctx, "你好,这是流式测试。", config)
|
||||
var total int
|
||||
for b := range ch {
|
||||
@@ -57,7 +57,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
|
||||
|
||||
// TestRealEdgeTTS_Formats tries every candidate format and reports which ones are supported.
|
||||
//
|
||||
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_Formats -v
|
||||
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_Formats -v
|
||||
func TestRealEdgeTTS_Formats(t *testing.T) {
|
||||
formats := []string{
|
||||
"audio-24khz-48kbitrate-mono-mp3",
|
||||
@@ -71,8 +71,8 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
config := audio.AudioConfig{
|
||||
Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
|
||||
config := tts.AudioConfig{
|
||||
Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
|
||||
Format: fmt,
|
||||
Speed: 1.0,
|
||||
}
|
||||
@@ -88,7 +88,7 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
|
||||
|
||||
// TestRealEdgeTTS_SaveAudio synthesizes speech and writes the result to a file for manual inspection.
|
||||
//
|
||||
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
|
||||
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
|
||||
func TestRealEdgeTTS_SaveAudio(t *testing.T) {
|
||||
client := NewEdgeWsClient()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
@@ -97,11 +97,11 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
text string
|
||||
voice audio.VoiceConfig
|
||||
voice tts.VoiceConfig
|
||||
file string
|
||||
}{
|
||||
{"en", "Hello, this is an Edge TTS audio save test.", audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
|
||||
{"zh", "你好,这是一段中文语音合成测试。", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
|
||||
{"en", "Hello, this is an Edge TTS audio save test.", tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
|
||||
{"zh", "你好,这是一段中文语音合成测试。", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
|
||||
}
|
||||
|
||||
outDir := filepath.Join(os.TempDir(), "edge_tts_test")
|
||||
@@ -111,7 +111,7 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
config := audio.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
|
||||
config := tts.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
|
||||
audio, err := client.Synthesize(ctx, tc.text, config)
|
||||
if err != nil {
|
||||
t.Fatalf("Synthesize: %v", err)
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
|
||||
"github.com/gorilla/websocket"
|
||||
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
var upgrader = websocket.Upgrader{
|
||||
@@ -95,7 +95,7 @@ func TestEdgeWsClient_ConnectAndSynthesize(t *testing.T) {
|
||||
client := NewEdgeWsClient()
|
||||
client.BaseURL = wsURL
|
||||
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
|
||||
audio, err := client.Synthesize(t.Context(), "Hello world", config)
|
||||
if err != nil {
|
||||
t.Fatalf("Synthesize: %v", err)
|
||||
@@ -114,7 +114,7 @@ func TestEdgeWsClient_Stream(t *testing.T) {
|
||||
client := NewEdgeWsClient()
|
||||
client.BaseURL = wsURL
|
||||
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
ch, errCh := client.Stream(t.Context(), "Hi", config)
|
||||
var chunks [][]byte
|
||||
for b := range ch {
|
||||
@@ -197,7 +197,7 @@ func TestParseAudioChunk_EmptyOrShort(t *testing.T) {
|
||||
|
||||
func TestBuildSSML(t *testing.T) {
|
||||
t.Parallel()
|
||||
ssml := buildSSML("Hello", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
|
||||
ssml := buildSSML("Hello", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
|
||||
if !strings.Contains(ssml, "zh-CN-XiaoxiaoNeural") {
|
||||
t.Errorf("ssml should contain voice: %s", ssml)
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
package tts
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgtype"
|
||||
|
||||
"github.com/memohai/memoh/internal/db/sqlc"
|
||||
)
|
||||
|
||||
func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
|
||||
for _, def := range registry.List() {
|
||||
configJSON, err := json.Marshal(map[string]any{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal speech provider config: %w", err)
|
||||
}
|
||||
var icon pgtype.Text
|
||||
if def.Icon != "" {
|
||||
icon = pgtype.Text{String: def.Icon, Valid: true}
|
||||
}
|
||||
|
||||
provider, err := queries.UpsertRegistryProvider(ctx, sqlc.UpsertRegistryProviderParams{
|
||||
Name: def.DisplayName,
|
||||
ClientType: string(def.ClientType),
|
||||
Icon: icon,
|
||||
Config: configJSON,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("upsert speech provider %s: %w", def.ClientType, err)
|
||||
}
|
||||
|
||||
synced := 0
|
||||
for _, model := range def.Models {
|
||||
if shouldHideTemplateModel(def, model.ID) {
|
||||
if err := queries.DeleteModelByProviderIDAndModelID(ctx, sqlc.DeleteModelByProviderIDAndModelIDParams{
|
||||
ProviderID: provider.ID,
|
||||
ModelID: model.ID,
|
||||
}); err != nil {
|
||||
return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
modelConfigJSON, err := json.Marshal(map[string]any{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal speech model config: %w", err)
|
||||
}
|
||||
name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
|
||||
if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
|
||||
ModelID: model.ID,
|
||||
Name: name,
|
||||
ProviderID: provider.ID,
|
||||
Type: "speech",
|
||||
Config: modelConfigJSON,
|
||||
}); err != nil {
|
||||
return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
|
||||
}
|
||||
synced++
|
||||
}
|
||||
|
||||
if logger != nil {
|
||||
logger.Info("speech registry synced", slog.String("provider", string(def.ClientType)), slog.Int("models", synced))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package audio
|
||||
package tts
|
||||
|
||||
// VoiceConfig is kept for backward compatibility with the legacy Edge adapter tests.
|
||||
type VoiceConfig struct {
|
||||
@@ -1,4 +1,4 @@
|
||||
package audio
|
||||
package tts
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@@ -8,43 +8,31 @@ import (
|
||||
|
||||
alibabaspeech "github.com/memohai/twilight-ai/provider/alibabacloud/speech"
|
||||
deepgramspeech "github.com/memohai/twilight-ai/provider/deepgram/speech"
|
||||
deepgramtranscription "github.com/memohai/twilight-ai/provider/deepgram/transcription"
|
||||
edgespeech "github.com/memohai/twilight-ai/provider/edge/speech"
|
||||
elevenlabsspeech "github.com/memohai/twilight-ai/provider/elevenlabs/speech"
|
||||
elevenlabstranscription "github.com/memohai/twilight-ai/provider/elevenlabs/transcription"
|
||||
googletranscription "github.com/memohai/twilight-ai/provider/google/transcription"
|
||||
microsoftspeech "github.com/memohai/twilight-ai/provider/microsoft/speech"
|
||||
minimaxspeech "github.com/memohai/twilight-ai/provider/minimax/speech"
|
||||
openaispeech "github.com/memohai/twilight-ai/provider/openai/speech"
|
||||
openaitranscription "github.com/memohai/twilight-ai/provider/openai/transcription"
|
||||
openrouterspeech "github.com/memohai/twilight-ai/provider/openrouter/speech"
|
||||
openroutertranscription "github.com/memohai/twilight-ai/provider/openrouter/transcription"
|
||||
volcenginespeech "github.com/memohai/twilight-ai/provider/volcengine/speech"
|
||||
sdk "github.com/memohai/twilight-ai/sdk"
|
||||
|
||||
"github.com/memohai/memoh/internal/models"
|
||||
)
|
||||
|
||||
type (
|
||||
ProviderFactory func(config map[string]any) (sdk.SpeechProvider, error)
|
||||
TranscriptionProviderFactory func(config map[string]any) (sdk.TranscriptionProvider, error)
|
||||
)
|
||||
type ProviderFactory func(config map[string]any) (sdk.SpeechProvider, error)
|
||||
|
||||
type ProviderDefinition struct {
|
||||
ClientType models.ClientType
|
||||
DisplayName string
|
||||
Icon string
|
||||
Description string
|
||||
ConfigSchema ConfigSchema
|
||||
DefaultModel string
|
||||
SupportsList bool
|
||||
Models []ModelInfo
|
||||
Factory ProviderFactory
|
||||
DefaultTranscriptionModel string
|
||||
SupportsTranscriptionList bool
|
||||
TranscriptionModels []ModelInfo
|
||||
TranscriptionFactory TranscriptionProviderFactory
|
||||
Order int
|
||||
ClientType models.ClientType
|
||||
DisplayName string
|
||||
Icon string
|
||||
Description string
|
||||
ConfigSchema ConfigSchema
|
||||
DefaultModel string
|
||||
SupportsList bool
|
||||
Models []ModelInfo
|
||||
Factory ProviderFactory
|
||||
Order int
|
||||
}
|
||||
|
||||
type Registry struct {
|
||||
@@ -53,60 +41,11 @@ type Registry struct {
|
||||
ordered []models.ClientType
|
||||
}
|
||||
|
||||
func isTranscriptionClientType(clientType models.ClientType) bool {
|
||||
switch clientType {
|
||||
case
|
||||
models.ClientTypeOpenAITranscription,
|
||||
models.ClientTypeOpenRouterTranscription,
|
||||
models.ClientTypeElevenLabsTranscription,
|
||||
models.ClientTypeDeepgramTranscription,
|
||||
models.ClientTypeGoogleTranscription:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func speechToTranscriptionClientType(clientType models.ClientType) models.ClientType {
|
||||
switch clientType {
|
||||
case models.ClientTypeOpenAISpeech:
|
||||
return models.ClientTypeOpenAITranscription
|
||||
case models.ClientTypeOpenRouterSpeech:
|
||||
return models.ClientTypeOpenRouterTranscription
|
||||
case models.ClientTypeElevenLabsSpeech:
|
||||
return models.ClientTypeElevenLabsTranscription
|
||||
case models.ClientTypeDeepgramSpeech:
|
||||
return models.ClientTypeDeepgramTranscription
|
||||
case models.ClientTypeGoogleSpeech:
|
||||
return models.ClientTypeGoogleTranscription
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func transcriptionDisplayName(displayName string) string {
|
||||
displayName = strings.TrimSpace(displayName)
|
||||
if displayName == "Google Speech" {
|
||||
return "Google Transcription"
|
||||
}
|
||||
if strings.HasSuffix(displayName, " Speech") {
|
||||
return strings.TrimSuffix(displayName, " Speech") + " Transcription"
|
||||
}
|
||||
return displayName + " Transcription"
|
||||
}
|
||||
|
||||
func NewRegistry() *Registry {
|
||||
r := &Registry{
|
||||
providers: make(map[models.ClientType]ProviderDefinition),
|
||||
}
|
||||
baseDefs := defaultProviderDefinitions()
|
||||
for _, def := range baseDefs {
|
||||
if def.Factory == nil && def.TranscriptionFactory != nil {
|
||||
continue
|
||||
}
|
||||
r.Register(def)
|
||||
}
|
||||
for _, def := range transcriptionProviderDefinitions(baseDefs) {
|
||||
for _, def := range defaultProviderDefinitions() {
|
||||
r.Register(def)
|
||||
}
|
||||
return r
|
||||
@@ -155,98 +94,17 @@ func (r *Registry) ListMeta() []ProviderMetaResponse {
|
||||
metas := make([]ProviderMetaResponse, 0, len(defs))
|
||||
for _, def := range defs {
|
||||
metas = append(metas, ProviderMetaResponse{
|
||||
Provider: string(def.ClientType),
|
||||
DisplayName: def.DisplayName,
|
||||
Description: def.Description,
|
||||
ConfigSchema: def.ConfigSchema,
|
||||
DefaultModel: def.DefaultModel,
|
||||
Models: def.Models,
|
||||
DefaultSynthesisModel: def.DefaultModel,
|
||||
SynthesisModels: def.Models,
|
||||
SupportsSynthesisList: def.SupportsList,
|
||||
DefaultTranscriptionModel: def.DefaultTranscriptionModel,
|
||||
TranscriptionModels: def.TranscriptionModels,
|
||||
SupportsTranscriptionList: def.SupportsTranscriptionList,
|
||||
Provider: string(def.ClientType),
|
||||
DisplayName: def.DisplayName,
|
||||
Description: def.Description,
|
||||
ConfigSchema: def.ConfigSchema,
|
||||
DefaultModel: def.DefaultModel,
|
||||
Models: def.Models,
|
||||
})
|
||||
}
|
||||
return metas
|
||||
}
|
||||
|
||||
func (r *Registry) ListSpeechMeta() []ProviderMetaResponse {
|
||||
defs := r.List()
|
||||
metas := make([]ProviderMetaResponse, 0, len(defs))
|
||||
for _, def := range defs {
|
||||
if def.Factory == nil {
|
||||
continue
|
||||
}
|
||||
metas = append(metas, ProviderMetaResponse{
|
||||
Provider: string(def.ClientType),
|
||||
DisplayName: def.DisplayName,
|
||||
Description: def.Description,
|
||||
ConfigSchema: def.ConfigSchema,
|
||||
DefaultModel: def.DefaultModel,
|
||||
Models: def.Models,
|
||||
DefaultSynthesisModel: def.DefaultModel,
|
||||
SynthesisModels: def.Models,
|
||||
SupportsSynthesisList: def.SupportsList,
|
||||
})
|
||||
}
|
||||
return metas
|
||||
}
|
||||
|
||||
func (r *Registry) ListTranscriptionMeta() []ProviderMetaResponse {
|
||||
defs := r.List()
|
||||
metas := make([]ProviderMetaResponse, 0, len(defs))
|
||||
for _, def := range defs {
|
||||
if def.TranscriptionFactory == nil || !isTranscriptionClientType(def.ClientType) {
|
||||
continue
|
||||
}
|
||||
modelsList := def.TranscriptionModels
|
||||
if len(modelsList) == 0 {
|
||||
modelsList = def.Models
|
||||
}
|
||||
metas = append(metas, ProviderMetaResponse{
|
||||
Provider: string(def.ClientType),
|
||||
DisplayName: def.DisplayName,
|
||||
Description: def.Description,
|
||||
ConfigSchema: def.ConfigSchema,
|
||||
DefaultModel: def.DefaultTranscriptionModel,
|
||||
Models: modelsList,
|
||||
DefaultTranscriptionModel: def.DefaultTranscriptionModel,
|
||||
TranscriptionModels: modelsList,
|
||||
SupportsTranscriptionList: def.SupportsTranscriptionList,
|
||||
})
|
||||
}
|
||||
return metas
|
||||
}
|
||||
|
||||
func transcriptionProviderDefinitions(base []ProviderDefinition) []ProviderDefinition {
|
||||
out := make([]ProviderDefinition, 0, len(base))
|
||||
for _, def := range base {
|
||||
clientType := speechToTranscriptionClientType(def.ClientType)
|
||||
if clientType == "" || def.TranscriptionFactory == nil {
|
||||
continue
|
||||
}
|
||||
modelsList := def.TranscriptionModels
|
||||
out = append(out, ProviderDefinition{
|
||||
ClientType: clientType,
|
||||
DisplayName: transcriptionDisplayName(def.DisplayName),
|
||||
Icon: def.Icon,
|
||||
Description: strings.TrimSpace(def.Description),
|
||||
ConfigSchema: def.ConfigSchema,
|
||||
DefaultModel: def.DefaultTranscriptionModel,
|
||||
SupportsList: def.SupportsTranscriptionList,
|
||||
Models: modelsList,
|
||||
DefaultTranscriptionModel: def.DefaultTranscriptionModel,
|
||||
SupportsTranscriptionList: def.SupportsTranscriptionList,
|
||||
TranscriptionModels: modelsList,
|
||||
TranscriptionFactory: def.TranscriptionFactory,
|
||||
Order: def.Order + 1,
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func defaultProviderDefinitions() []ProviderDefinition {
|
||||
edgeVoices := make([]VoiceInfo, 0)
|
||||
for lang, ids := range edgespeech.EdgeTTSVoices {
|
||||
@@ -315,10 +173,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
secretField("api_key", "API Key", "Bearer API key", true, 10),
|
||||
stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.openai.com/v1", 20),
|
||||
}},
|
||||
DefaultModel: "gpt-4o-mini-tts",
|
||||
SupportsList: true,
|
||||
DefaultTranscriptionModel: "gpt-4o-mini-transcribe",
|
||||
SupportsTranscriptionList: true,
|
||||
DefaultModel: "gpt-4o-mini-tts",
|
||||
SupportsList: true,
|
||||
Models: []ModelInfo{{
|
||||
ID: "gpt-4o-mini-tts",
|
||||
Name: "gpt-4o-mini-tts",
|
||||
@@ -339,23 +195,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
Formats: []string{"mp3", "opus", "pcm", "wav"},
|
||||
},
|
||||
}},
|
||||
TranscriptionModels: []ModelInfo{{
|
||||
ID: "gpt-4o-mini-transcribe",
|
||||
Name: "gpt-4o-mini-transcribe",
|
||||
Description: "Default OpenAI transcription model",
|
||||
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
stringField("language", "Language", "Optional ISO language hint", false, "", 10),
|
||||
stringField("prompt", "Prompt", "Optional prompt to guide transcription", false, "", 20),
|
||||
numberField("temperature", "Temperature", "Sampling temperature", false, 0, 30),
|
||||
enumField("response_format", "Response Format", "Transcription response format", false, []string{"json", "verbose_json", "text", "srt", "vtt"}, 40),
|
||||
}},
|
||||
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
stringField("language", "Language", "Optional ISO language hint", false, "", 10),
|
||||
stringField("prompt", "Prompt", "Optional prompt to guide transcription", false, "", 20),
|
||||
numberField("temperature", "Temperature", "Sampling temperature", false, 0, 30),
|
||||
enumField("response_format", "Response Format", "Transcription response format", false, []string{"json", "verbose_json", "text", "srt", "vtt"}, 40),
|
||||
}}},
|
||||
}},
|
||||
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
|
||||
opts := []openaispeech.Option{}
|
||||
if v := configString(config, "api_key"); v != "" {
|
||||
@@ -366,16 +205,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
}
|
||||
return openaispeech.New(opts...), nil
|
||||
},
|
||||
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
|
||||
opts := []openaitranscription.Option{}
|
||||
if v := configString(config, "api_key"); v != "" {
|
||||
opts = append(opts, openaitranscription.WithAPIKey(v))
|
||||
}
|
||||
if v := configString(config, "base_url"); v != "" {
|
||||
opts = append(opts, openaitranscription.WithBaseURL(v))
|
||||
}
|
||||
return openaitranscription.New(opts...), nil
|
||||
},
|
||||
Order: 20,
|
||||
},
|
||||
{
|
||||
@@ -387,10 +216,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
secretField("api_key", "API Key", "OpenRouter API key", true, 10),
|
||||
stringField("base_url", "Base URL", "Override the API base URL", false, "https://openrouter.ai/api/v1", 20),
|
||||
}},
|
||||
DefaultModel: "openrouter-tts",
|
||||
SupportsList: true,
|
||||
DefaultTranscriptionModel: "openai/gpt-4o-mini-transcribe",
|
||||
SupportsTranscriptionList: true,
|
||||
DefaultModel: "openrouter-tts",
|
||||
SupportsList: true,
|
||||
Models: []ModelInfo{{
|
||||
ID: "openrouter-tts",
|
||||
Name: "openrouter-tts",
|
||||
@@ -407,17 +234,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
|
||||
}}},
|
||||
}},
|
||||
TranscriptionModels: []ModelInfo{{
|
||||
ID: "openai/gpt-4o-mini-transcribe",
|
||||
Name: "openai/gpt-4o-mini-transcribe",
|
||||
Description: "Default OpenRouter transcription model",
|
||||
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
advancedStringField("prompt", "Prompt", "Prompt passed to the model before audio input", false, "", 10),
|
||||
}},
|
||||
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
advancedStringField("prompt", "Prompt", "Prompt passed to the model before audio input", false, "", 10),
|
||||
}}},
|
||||
}},
|
||||
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
|
||||
opts := []openrouterspeech.Option{}
|
||||
if v := configString(config, "api_key"); v != "" {
|
||||
@@ -428,16 +244,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
}
|
||||
return openrouterspeech.New(opts...), nil
|
||||
},
|
||||
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
|
||||
opts := []openroutertranscription.Option{}
|
||||
if v := configString(config, "api_key"); v != "" {
|
||||
opts = append(opts, openroutertranscription.WithAPIKey(v))
|
||||
}
|
||||
if v := configString(config, "base_url"); v != "" {
|
||||
opts = append(opts, openroutertranscription.WithBaseURL(v))
|
||||
}
|
||||
return openroutertranscription.New(opts...), nil
|
||||
},
|
||||
Order: 30,
|
||||
},
|
||||
{
|
||||
@@ -449,10 +255,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
secretField("api_key", "API Key", "ElevenLabs API key", true, 10),
|
||||
stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.elevenlabs.io", 20),
|
||||
}},
|
||||
DefaultModel: "elevenlabs-tts",
|
||||
SupportsList: true,
|
||||
DefaultTranscriptionModel: "scribe_v2",
|
||||
SupportsTranscriptionList: true,
|
||||
DefaultModel: "elevenlabs-tts",
|
||||
SupportsList: true,
|
||||
Models: []ModelInfo{{
|
||||
ID: "elevenlabs-tts",
|
||||
Name: "elevenlabs-tts",
|
||||
@@ -485,25 +289,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "en-US", 110),
|
||||
}}},
|
||||
}},
|
||||
TranscriptionModels: []ModelInfo{{
|
||||
ID: "scribe_v2",
|
||||
Name: "scribe_v2",
|
||||
Description: "Default ElevenLabs transcription model",
|
||||
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "", 10),
|
||||
boolField("tag_audio_events", "Tag Audio Events", "Include non-speech events in timestamps", false, 20),
|
||||
boolField("diarize", "Diarize", "Enable speaker diarization", false, 30),
|
||||
numberField("num_speakers", "Number of Speakers", "Optional expected speaker count", false, 0, 40),
|
||||
enumField("timestamps_granularity", "Timestamps Granularity", "Timestamps granularity", false, []string{"word", "character"}, 50),
|
||||
}},
|
||||
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "", 10),
|
||||
boolField("tag_audio_events", "Tag Audio Events", "Include non-speech events in timestamps", false, 20),
|
||||
boolField("diarize", "Diarize", "Enable speaker diarization", false, 30),
|
||||
numberField("num_speakers", "Number of Speakers", "Optional expected speaker count", false, 0, 40),
|
||||
enumField("timestamps_granularity", "Timestamps Granularity", "Timestamps granularity", false, []string{"word", "character"}, 50),
|
||||
}}},
|
||||
}},
|
||||
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
|
||||
opts := []elevenlabsspeech.Option{}
|
||||
if v := configString(config, "api_key"); v != "" {
|
||||
@@ -514,52 +299,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
}
|
||||
return elevenlabsspeech.New(opts...), nil
|
||||
},
|
||||
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
|
||||
opts := []elevenlabstranscription.Option{}
|
||||
if v := configString(config, "api_key"); v != "" {
|
||||
opts = append(opts, elevenlabstranscription.WithAPIKey(v))
|
||||
}
|
||||
if v := configString(config, "base_url"); v != "" {
|
||||
opts = append(opts, elevenlabstranscription.WithBaseURL(v))
|
||||
}
|
||||
return elevenlabstranscription.New(opts...), nil
|
||||
},
|
||||
Order: 40,
|
||||
},
|
||||
{
|
||||
ClientType: models.ClientTypeGoogleSpeech,
|
||||
DisplayName: "Google Speech",
|
||||
Icon: "google-color",
|
||||
Description: "Google Gemini speech transcription",
|
||||
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
secretField("api_key", "API Key", "Google API key", true, 10),
|
||||
stringField("base_url", "Base URL", "Override the API base URL", false, "https://generativelanguage.googleapis.com/v1beta", 20),
|
||||
}},
|
||||
DefaultTranscriptionModel: "gemini-2.5-flash",
|
||||
SupportsTranscriptionList: true,
|
||||
TranscriptionModels: []ModelInfo{{
|
||||
ID: "gemini-2.5-flash",
|
||||
Name: "gemini-2.5-flash",
|
||||
Description: "Default Google transcription model",
|
||||
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
advancedStringField("prompt", "Prompt", "Prompt passed alongside audio", false, "", 10),
|
||||
}},
|
||||
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
advancedStringField("prompt", "Prompt", "Prompt passed alongside audio", false, "", 10),
|
||||
}}},
|
||||
}},
|
||||
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
|
||||
opts := []googletranscription.Option{}
|
||||
if v := configString(config, "api_key"); v != "" {
|
||||
opts = append(opts, googletranscription.WithAPIKey(v))
|
||||
}
|
||||
if v := configString(config, "base_url"); v != "" {
|
||||
opts = append(opts, googletranscription.WithBaseURL(v))
|
||||
}
|
||||
return googletranscription.New(opts...), nil
|
||||
},
|
||||
Order: 45,
|
||||
},
|
||||
{
|
||||
ClientType: models.ClientTypeDeepgramSpeech,
|
||||
DisplayName: "Deepgram Speech",
|
||||
@@ -569,10 +310,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
secretField("api_key", "API Key", "Deepgram API key", true, 10),
|
||||
stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.deepgram.com", 20),
|
||||
}},
|
||||
DefaultModel: "deepgram-tts",
|
||||
SupportsList: false,
|
||||
DefaultTranscriptionModel: "nova-3",
|
||||
SupportsTranscriptionList: false,
|
||||
DefaultModel: "deepgram-tts",
|
||||
SupportsList: false,
|
||||
Models: []ModelInfo{{
|
||||
ID: "deepgram-tts",
|
||||
Name: "deepgram-tts",
|
||||
@@ -593,25 +332,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
Formats: []string{"wav", "none"},
|
||||
},
|
||||
}},
|
||||
TranscriptionModels: []ModelInfo{{
|
||||
ID: "nova-3",
|
||||
Name: "nova-3",
|
||||
Description: "Default Deepgram transcription model",
|
||||
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
stringField("language", "Language", "Optional language hint", false, "", 10),
|
||||
boolField("smart_format", "Smart Format", "Enable smart formatting", false, 20),
|
||||
boolField("detect_language", "Detect Language", "Enable automatic language detection", false, 30),
|
||||
boolField("diarize", "Diarize", "Enable speaker diarization", false, 40),
|
||||
boolField("punctuate", "Punctuate", "Enable punctuation", false, 50),
|
||||
}},
|
||||
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
|
||||
stringField("language", "Language", "Optional language hint", false, "", 10),
|
||||
boolField("smart_format", "Smart Format", "Enable smart formatting", false, 20),
|
||||
boolField("detect_language", "Detect Language", "Enable automatic language detection", false, 30),
|
||||
boolField("diarize", "Diarize", "Enable speaker diarization", false, 40),
|
||||
boolField("punctuate", "Punctuate", "Enable punctuation", false, 50),
|
||||
}}},
|
||||
}},
|
||||
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
|
||||
opts := []deepgramspeech.Option{}
|
||||
if v := configString(config, "api_key"); v != "" {
|
||||
@@ -622,16 +342,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
|
||||
}
|
||||
return deepgramspeech.New(opts...), nil
|
||||
},
|
||||
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
|
||||
opts := []deepgramtranscription.Option{}
|
||||
if v := configString(config, "api_key"); v != "" {
|
||||
opts = append(opts, deepgramtranscription.WithAPIKey(v))
|
||||
}
|
||||
if v := configString(config, "base_url"); v != "" {
|
||||
opts = append(opts, deepgramtranscription.WithBaseURL(v))
|
||||
}
|
||||
return deepgramtranscription.New(opts...), nil
|
||||
},
|
||||
Order: 50,
|
||||
},
|
||||
{
|
||||
@@ -0,0 +1,435 @@
|
||||
package tts
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
|
||||
sdk "github.com/memohai/twilight-ai/sdk"
|
||||
|
||||
"github.com/memohai/memoh/internal/db"
|
||||
"github.com/memohai/memoh/internal/db/sqlc"
|
||||
"github.com/memohai/memoh/internal/models"
|
||||
)
|
||||
|
||||
type Service struct {
|
||||
queries *sqlc.Queries
|
||||
logger *slog.Logger
|
||||
registry *Registry
|
||||
}
|
||||
|
||||
func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
|
||||
return &Service{
|
||||
queries: queries,
|
||||
logger: log.With(slog.String("service", "tts")),
|
||||
registry: registry,
|
||||
}
|
||||
}
|
||||
|
||||
// Registry returns the provider registry this service was constructed with.
func (s *Service) Registry() *Registry {
	return s.registry
}
|
||||
|
||||
func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
|
||||
return s.registry.ListMeta()
|
||||
}
|
||||
|
||||
func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
|
||||
rows, err := s.queries.ListSpeechProviders(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list speech providers: %w", err)
|
||||
}
|
||||
items := make([]SpeechProviderResponse, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
items = append(items, toSpeechProviderResponse(row))
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) {
|
||||
pgID, err := db.ParseUUID(id)
|
||||
if err != nil {
|
||||
return SpeechProviderResponse{}, err
|
||||
}
|
||||
row, err := s.queries.GetProviderByID(ctx, pgID)
|
||||
if err != nil {
|
||||
return SpeechProviderResponse{}, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
return toSpeechProviderResponse(row), nil
|
||||
}
|
||||
|
||||
func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse, error) {
|
||||
rows, err := s.queries.ListSpeechModels(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list speech models: %w", err)
|
||||
}
|
||||
items := make([]SpeechModelResponse, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
if s.shouldHideModel(row.ProviderType, row.ModelID) {
|
||||
continue
|
||||
}
|
||||
items = append(items, toSpeechModelFromListRow(row))
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
|
||||
pgID, err := db.ParseUUID(providerID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
providerRow, err := s.queries.GetProviderByID(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rows, err := s.queries.ListSpeechModelsByProviderID(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list speech models by provider: %w", err)
|
||||
}
|
||||
items := make([]SpeechModelResponse, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
if shouldHideTemplateModel(def, row.ModelID) {
|
||||
continue
|
||||
}
|
||||
items = append(items, toSpeechModelFromModel(row, ""))
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
|
||||
pgID, err := db.ParseUUID(id)
|
||||
if err != nil {
|
||||
return SpeechModelResponse{}, err
|
||||
}
|
||||
row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return SpeechModelResponse{}, fmt.Errorf("get speech model: %w", err)
|
||||
}
|
||||
return toSpeechModelWithProviderResponse(row), nil
|
||||
}
|
||||
|
||||
func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
|
||||
params, err := s.resolveSpeechParams(ctx, modelID, text, overrideCfg)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
result, err := sdk.GenerateSpeech(ctx,
|
||||
sdk.WithSpeechModel(params.model),
|
||||
sdk.WithText(text),
|
||||
sdk.WithSpeechConfig(params.config),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("synthesize: %w", err)
|
||||
}
|
||||
return result.Audio, result.ContentType, nil
|
||||
}
|
||||
|
||||
func (s *Service) StreamToFile(ctx context.Context, modelID string, text string, w io.Writer) (string, error) {
|
||||
params, err := s.resolveSpeechParams(ctx, modelID, text, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
streamResult, err := sdk.StreamSpeech(ctx,
|
||||
sdk.WithSpeechModel(params.model),
|
||||
sdk.WithText(text),
|
||||
sdk.WithSpeechConfig(params.config),
|
||||
)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("stream: %w", err)
|
||||
}
|
||||
audio, err := streamResult.Bytes()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("stream: %w", err)
|
||||
}
|
||||
if _, writeErr := w.Write(audio); writeErr != nil {
|
||||
return "", fmt.Errorf("write chunk: %w", writeErr)
|
||||
}
|
||||
return streamResult.ContentType, nil
|
||||
}
|
||||
|
||||
func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
|
||||
pgID, err := db.ParseUUID(modelID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech model: %w", err)
|
||||
}
|
||||
def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
template := findModelTemplate(def, modelRow.ModelID)
|
||||
if template == nil {
|
||||
return nil, fmt.Errorf("speech model capabilities not found: %s", modelRow.ModelID)
|
||||
}
|
||||
caps := template.Capabilities
|
||||
if len(caps.ConfigSchema.Fields) == 0 {
|
||||
caps.ConfigSchema = template.ConfigSchema
|
||||
}
|
||||
return &caps, nil
|
||||
}
|
||||
|
||||
func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
|
||||
pgID, err := db.ParseUUID(providerID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
providerRow, err := s.queries.GetProviderByID(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
|
||||
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !def.SupportsList || def.Factory == nil {
|
||||
return nil, fmt.Errorf("speech provider does not support model discovery: %s", providerRow.ClientType)
|
||||
}
|
||||
|
||||
provider, err := def.Factory(parseConfig(providerRow.Config))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build speech provider: %w", err)
|
||||
}
|
||||
|
||||
remoteModels, err := provider.ListModels(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list speech models: %w", err)
|
||||
}
|
||||
|
||||
discovered := make([]ModelInfo, 0, len(remoteModels))
|
||||
for _, remoteModel := range remoteModels {
|
||||
if remoteModel == nil || remoteModel.ID == "" {
|
||||
continue
|
||||
}
|
||||
discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.Models))
|
||||
}
|
||||
return discovered, nil
|
||||
}
|
||||
|
||||
type resolvedSpeechParams struct {
|
||||
model *sdk.SpeechModel
|
||||
config map[string]any
|
||||
}
|
||||
|
||||
func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text string, overrideCfg map[string]any) (*resolvedSpeechParams, error) {
|
||||
_ = text
|
||||
pgID, err := db.ParseUUID(modelID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech model: %w", err)
|
||||
}
|
||||
providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get speech provider: %w", err)
|
||||
}
|
||||
|
||||
def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
provider, err := def.Factory(parseConfig(providerRow.Config))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build speech provider: %w", err)
|
||||
}
|
||||
|
||||
cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
|
||||
return &resolvedSpeechParams{
|
||||
model: &sdk.SpeechModel{ID: modelRow.ModelID, Provider: provider},
|
||||
config: cfg,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// parseConfig decodes raw as a JSON object into a map.
//
// It never returns nil: empty input, invalid JSON, JSON that is not an
// object, and a JSON null all produce an empty map.
func parseConfig(raw []byte) map[string]any {
	var decoded map[string]any
	if len(raw) > 0 {
		if err := json.Unmarshal(raw, &decoded); err != nil {
			decoded = nil
		}
	}
	if decoded == nil {
		return map[string]any{}
	}
	return decoded
}
|
||||
|
||||
// mergeConfig flattens parts into one freshly allocated map. Parts are
// applied left to right, so a key in a later part overwrites the same key
// from an earlier one. Nil parts contribute nothing and inputs are not
// modified.
func mergeConfig(parts ...map[string]any) map[string]any {
	merged := map[string]any{}
	for i := 0; i < len(parts); i++ {
		for k, v := range parts[i] {
			merged[k] = v
		}
	}
	return merged
}
|
||||
|
||||
func mergeRemoteModelInfo(modelID string, defaults []ModelInfo) ModelInfo {
|
||||
for _, model := range defaults {
|
||||
if model.ID == modelID {
|
||||
return model
|
||||
}
|
||||
}
|
||||
return ModelInfo{
|
||||
ID: modelID,
|
||||
Name: modelID,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Service) shouldHideModel(clientType string, modelID string) bool {
|
||||
def, err := s.registry.Get(models.ClientType(clientType))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return shouldHideTemplateModel(def, modelID)
|
||||
}
|
||||
|
||||
func shouldHideTemplateModel(def ProviderDefinition, modelID string) bool {
|
||||
if !def.SupportsList {
|
||||
return false
|
||||
}
|
||||
for _, model := range def.Models {
|
||||
if model.ID == modelID {
|
||||
return model.TemplateOnly
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func findModelTemplate(def ProviderDefinition, modelID string) *ModelInfo {
|
||||
for i := range def.Models {
|
||||
if def.Models[i].ID == modelID {
|
||||
return &def.Models[i]
|
||||
}
|
||||
}
|
||||
if def.DefaultModel != "" {
|
||||
for i := range def.Models {
|
||||
if def.Models[i].ID == def.DefaultModel {
|
||||
return &def.Models[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(def.Models) > 0 {
|
||||
return &def.Models[0]
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func toSpeechProviderResponse(row sqlc.Provider) SpeechProviderResponse {
|
||||
icon := ""
|
||||
if row.Icon.Valid {
|
||||
icon = row.Icon.String
|
||||
}
|
||||
return SpeechProviderResponse{
|
||||
ID: row.ID.String(),
|
||||
Name: row.Name,
|
||||
ClientType: row.ClientType,
|
||||
Icon: icon,
|
||||
Enable: row.Enable,
|
||||
Config: maskSpeechProviderConfig(parseConfig(row.Config)),
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
|
||||
func maskSpeechProviderConfig(cfg map[string]any) map[string]any {
|
||||
if len(cfg) == 0 {
|
||||
return map[string]any{}
|
||||
}
|
||||
out := make(map[string]any, len(cfg))
|
||||
for key, value := range cfg {
|
||||
if s, ok := value.(string); ok && s != "" && isSpeechSecretKey(key) {
|
||||
out[key] = maskSpeechSecret(s)
|
||||
continue
|
||||
}
|
||||
out[key] = value
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// isSpeechSecretKey reports whether key names a config entry holding a
// secret. The match is exact and case-sensitive.
func isSpeechSecretKey(key string) bool {
	return key == "api_key" ||
		key == "access_key" ||
		key == "secret_key" ||
		key == "app_key"
}
|
||||
|
||||
// maskSpeechSecret hides the middle of a secret for display. Values of eight
// bytes or fewer are replaced entirely by "********"; longer values keep
// their first and last four bytes around a fixed "****".
//
// NOTE(review): lengths are measured in bytes, and a 9-byte value still
// exposes eight of its nine bytes.
func maskSpeechSecret(value string) string {
	const visible = 4
	n := len(value)
	if n > 2*visible {
		return value[:visible] + "****" + value[n-visible:]
	}
	return "********"
}
|
||||
|
||||
func toSpeechModelFromListRow(row sqlc.ListSpeechModelsRow) SpeechModelResponse {
|
||||
var cfg map[string]any
|
||||
if len(row.Config) > 0 {
|
||||
_ = json.Unmarshal(row.Config, &cfg)
|
||||
}
|
||||
name := ""
|
||||
if row.Name.Valid {
|
||||
name = row.Name.String
|
||||
}
|
||||
return SpeechModelResponse{
|
||||
ID: row.ID.String(),
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID.String(),
|
||||
ProviderType: row.ProviderType,
|
||||
Config: cfg,
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
|
||||
func toSpeechModelFromModel(row sqlc.Model, providerType string) SpeechModelResponse {
|
||||
var cfg map[string]any
|
||||
if len(row.Config) > 0 {
|
||||
_ = json.Unmarshal(row.Config, &cfg)
|
||||
}
|
||||
name := ""
|
||||
if row.Name.Valid {
|
||||
name = row.Name.String
|
||||
}
|
||||
return SpeechModelResponse{
|
||||
ID: row.ID.String(),
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID.String(),
|
||||
ProviderType: providerType,
|
||||
Config: cfg,
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
|
||||
func toSpeechModelWithProviderResponse(row sqlc.GetSpeechModelWithProviderRow) SpeechModelResponse {
|
||||
var cfg map[string]any
|
||||
if len(row.Config) > 0 {
|
||||
_ = json.Unmarshal(row.Config, &cfg)
|
||||
}
|
||||
name := ""
|
||||
if row.Name.Valid {
|
||||
name = row.Name.String
|
||||
}
|
||||
return SpeechModelResponse{
|
||||
ID: row.ID.String(),
|
||||
ModelID: row.ModelID,
|
||||
Name: name,
|
||||
ProviderID: row.ProviderID.String(),
|
||||
ProviderType: row.ProviderType,
|
||||
Config: cfg,
|
||||
CreatedAt: row.CreatedAt.Time,
|
||||
UpdatedAt: row.UpdatedAt.Time,
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package audio
|
||||
package tts
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@@ -13,7 +13,7 @@ import (
|
||||
const (
|
||||
defaultTTL = 10 * time.Minute
|
||||
cleanupInterval = 1 * time.Minute
|
||||
tempDirName = "audio_temp"
|
||||
tempDirName = "tts_temp"
|
||||
)
|
||||
|
||||
// TempStore manages temporary audio files on disk with automatic TTL-based cleanup.
|
||||
@@ -30,7 +30,7 @@ type TempStore struct {
|
||||
func NewTempStore(baseDir string) (*TempStore, error) {
|
||||
dir := filepath.Join(baseDir, tempDirName)
|
||||
if err := os.MkdirAll(dir, 0o750); err != nil {
|
||||
return nil, fmt.Errorf("create audio temp dir: %w", err)
|
||||
return nil, fmt.Errorf("create tts temp dir: %w", err)
|
||||
}
|
||||
return &TempStore{
|
||||
dir: dir,
|
||||
@@ -0,0 +1,62 @@
|
||||
package tts
|
||||
|
||||
import "time"
|
||||
|
||||
// ProviderMetaResponse exposes adapter metadata (from the registry, not DB).
|
||||
type ProviderMetaResponse struct {
|
||||
Provider string `json:"provider"`
|
||||
DisplayName string `json:"display_name"`
|
||||
Description string `json:"description"`
|
||||
ConfigSchema ConfigSchema `json:"config_schema,omitempty"`
|
||||
DefaultModel string `json:"default_model"`
|
||||
Models []ModelInfo `json:"models"`
|
||||
}
|
||||
|
||||
// SpeechProviderResponse is the JSON view of a speech-capable provider stored
// in the unified providers table.
type SpeechProviderResponse struct {
	// ID is the provider's identifier in string form.
	ID string `json:"id"`
	// Name is the provider's stored name.
	Name string `json:"name"`
	// ClientType is the provider's client type.
	ClientType string `json:"client_type"`
	// Icon is omitted from JSON when empty.
	Icon string `json:"icon,omitempty"`
	// Enable is the provider's stored enable flag.
	Enable bool `json:"enable"`
	// Config is the provider configuration; omitted from JSON when empty.
	Config map[string]any `json:"config,omitempty"`
	// CreatedAt is the row creation time.
	CreatedAt time.Time `json:"created_at"`
	// UpdatedAt is the row's last update time.
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// SpeechModelResponse is the JSON view of a speech model stored in the
// unified models table.
type SpeechModelResponse struct {
	// ID is the model row's identifier in string form.
	ID string `json:"id"`
	// ModelID is the model's stored model identifier.
	ModelID string `json:"model_id"`
	// Name is the model's display name; may be empty.
	Name string `json:"name"`
	// ProviderID is the owning provider's identifier in string form.
	ProviderID string `json:"provider_id"`
	// ProviderType is omitted from JSON when empty.
	ProviderType string `json:"provider_type,omitempty"`
	// Config is the model-level configuration; omitted from JSON when empty.
	Config map[string]any `json:"config,omitempty"`
	// CreatedAt is the row creation time.
	CreatedAt time.Time `json:"created_at"`
	// UpdatedAt is the row's last update time.
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// UpdateSpeechProviderRequest is the request body for updating a speech
// provider. Both fields are pointers, so a nil field is distinguishable from
// an explicit zero value.
type UpdateSpeechProviderRequest struct {
	// Name is the new provider name, if given.
	Name *string `json:"name,omitempty"`
	// Enable is the new enabled state, if given.
	Enable *bool `json:"enable,omitempty"`
}
|
||||
|
||||
// UpdateSpeechModelRequest is the request body for updating a speech model.
type UpdateSpeechModelRequest struct {
	// Name is the new model name, if given.
	Name *string `json:"name,omitempty"`
	// Config is the model configuration sent with the update, if any.
	Config map[string]any `json:"config,omitempty"`
}
|
||||
|
||||
// TestSynthesizeRequest is the request body for a text-to-speech test call.
type TestSynthesizeRequest struct {
	// Text is the text to synthesize.
	Text string `json:"text"`
	// Config is an optional configuration map; omitted from JSON when empty.
	Config map[string]any `json:"config,omitempty"`
}
|
||||
|
||||
// ImportModelsResponse is the response body of a speech model import.
type ImportModelsResponse struct {
	// Created is the count of created models.
	Created int `json:"created"`
	// Skipped is the count of skipped models.
	Skipped int `json:"skipped"`
	// Models is a list of model identifiers reported by the import.
	Models []string `json:"models"`
}
|
||||
@@ -175,7 +175,6 @@ func withWorkspaceGPUPreference(metadata map[string]any, gpu WorkspaceGPUConfig)
|
||||
return next
|
||||
}
|
||||
|
||||
//nolint:unused // Kept for tests and upcoming metadata plumbing.
|
||||
func withWorkspaceSkillDiscoveryRoots(metadata map[string]any, roots []string) map[string]any {
|
||||
next := cloneAnyMap(metadata)
|
||||
section := workspaceSection(next)
|
||||
@@ -200,7 +199,6 @@ func withoutWorkspaceGPUPreference(metadata map[string]any) map[string]any {
|
||||
return next
|
||||
}
|
||||
|
||||
//nolint:unused // Kept for tests and upcoming metadata plumbing.
|
||||
func withoutWorkspaceSkillDiscoveryRoots(metadata map[string]any) map[string]any {
|
||||
next := cloneAnyMap(metadata)
|
||||
section := workspaceSection(next)
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+107
-502
@@ -310,146 +310,6 @@ export type AdaptersUsageResponse = {
|
||||
total_text_bytes?: number;
|
||||
};
|
||||
|
||||
/** Configuration schema: an optional list of field descriptors. */
export type AudioConfigSchema = {
    fields?: AudioFieldSchema[];
};
|
||||
|
||||
/** Descriptor of a single configuration field. Every property is optional. */
export type AudioFieldSchema = {
    advanced?: boolean;
    description?: string;
    enum?: string[];
    example?: unknown;
    key?: string;
    order?: number;
    required?: boolean;
    title?: string;
    type?: string;
};
|
||||
|
||||
/** Result of a model import: created/skipped counts and a list of model names. */
export type AudioImportModelsResponse = {
    created?: number;
    models?: string[];
    skipped?: number;
};
|
||||
|
||||
/** Capabilities advertised for a model. Every property is optional. */
export type AudioModelCapabilities = {
    config_schema?: AudioConfigSchema;
    formats?: string[];
    metadata?: Record<string, string>;
    pitch?: AudioParamConstraint;
    speed?: AudioParamConstraint;
    voices?: AudioVoiceInfo[];
};
|
||||
|
||||
/** Description of a model, including its capabilities and config schema. */
export type AudioModelInfo = {
    capabilities?: AudioModelCapabilities;
    config_schema?: AudioConfigSchema;
    description?: string;
    id?: string;
    name?: string;
    template_only?: boolean;
};
|
||||
|
||||
/** Constraint on a numeric parameter: default, bounds and discrete options. */
export type AudioParamConstraint = {
    default?: number;
    max?: number;
    min?: number;
    options?: number[];
};
|
||||
|
||||
/** Provider metadata, with separate synthesis and transcription model lists. */
export type AudioProviderMetaResponse = {
    config_schema?: AudioConfigSchema;
    default_model?: string;
    default_synthesis_model?: string;
    default_transcription_model?: string;
    description?: string;
    display_name?: string;
    models?: AudioModelInfo[];
    provider?: string;
    supports_synthesis_list?: boolean;
    supports_transcription_list?: boolean;
    synthesis_models?: AudioModelInfo[];
    transcription_models?: AudioModelInfo[];
};
|
||||
|
||||
export type AudioSpeechModelResponse = {
|
||||
config?: {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
created_at?: string;
|
||||
id?: string;
|
||||
model_id?: string;
|
||||
name?: string;
|
||||
provider_id?: string;
|
||||
provider_type?: string;
|
||||
updated_at?: string;
|
||||
};
|
||||
|
||||
export type AudioSpeechProviderResponse = {
|
||||
client_type?: string;
|
||||
config?: {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
created_at?: string;
|
||||
enable?: boolean;
|
||||
icon?: string;
|
||||
id?: string;
|
||||
name?: string;
|
||||
updated_at?: string;
|
||||
};
|
||||
|
||||
export type AudioTestSynthesizeRequest = {
|
||||
config?: {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
text?: string;
|
||||
};
|
||||
|
||||
export type AudioTestTranscriptionResponse = {
|
||||
duration_seconds?: number;
|
||||
language?: string;
|
||||
metadata?: {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
text?: string;
|
||||
words?: Array<AudioTranscriptionWord>;
|
||||
};
|
||||
|
||||
export type AudioTranscriptionModelResponse = {
|
||||
config?: {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
created_at?: string;
|
||||
id?: string;
|
||||
model_id?: string;
|
||||
name?: string;
|
||||
provider_id?: string;
|
||||
provider_type?: string;
|
||||
updated_at?: string;
|
||||
};
|
||||
|
||||
export type AudioTranscriptionWord = {
|
||||
end?: number;
|
||||
speaker_id?: string;
|
||||
start?: number;
|
||||
text?: string;
|
||||
};
|
||||
|
||||
export type AudioUpdateSpeechModelRequest = {
|
||||
config?: {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
name?: string;
|
||||
};
|
||||
|
||||
export type AudioVoiceInfo = {
|
||||
id?: string;
|
||||
lang?: string;
|
||||
name?: string;
|
||||
};
|
||||
|
||||
export type BotsBot = {
|
||||
avatar_url?: string;
|
||||
check_issue_count?: number;
|
||||
@@ -613,7 +473,7 @@ export type ChannelChannelIdentityBinding = {
|
||||
updated_at?: string;
|
||||
};
|
||||
|
||||
export type ChannelChannelType = 'telegram' | 'feishu' | 'dingtalk' | 'matrix' | 'discord' | 'qq' | 'wecom' | 'weixin' | 'wechatoa' | 'local' | 'slack';
|
||||
export type ChannelChannelType = 'telegram' | 'feishu' | 'dingtalk' | 'matrix' | 'discord' | 'qq' | 'wecom' | 'weixin' | 'wechatoa' | 'local';
|
||||
|
||||
export type ChannelConfigSchema = {
|
||||
fields?: {
|
||||
@@ -1494,7 +1354,7 @@ export type ModelsModelConfig = {
|
||||
reasoning_efforts?: Array<string>;
|
||||
};
|
||||
|
||||
export type ModelsModelType = 'chat' | 'embedding' | 'speech' | 'transcription';
|
||||
export type ModelsModelType = 'chat' | 'embedding' | 'speech';
|
||||
|
||||
export type ModelsTestResponse = {
|
||||
latency_ms?: number;
|
||||
@@ -1755,7 +1615,6 @@ export type SettingsSettings = {
|
||||
search_provider_id?: string;
|
||||
timezone?: string;
|
||||
title_model_id?: string;
|
||||
transcription_model_id?: string;
|
||||
tts_model_id?: string;
|
||||
};
|
||||
|
||||
@@ -1780,10 +1639,105 @@ export type SettingsUpsertRequest = {
|
||||
search_provider_id?: string;
|
||||
timezone?: string;
|
||||
title_model_id?: string;
|
||||
transcription_model_id?: string;
|
||||
tts_model_id?: string;
|
||||
};
|
||||
|
||||
export type TtsConfigSchema = {
|
||||
fields?: Array<TtsFieldSchema>;
|
||||
};
|
||||
|
||||
export type TtsFieldSchema = {
|
||||
advanced?: boolean;
|
||||
description?: string;
|
||||
enum?: Array<string>;
|
||||
example?: unknown;
|
||||
key?: string;
|
||||
order?: number;
|
||||
required?: boolean;
|
||||
title?: string;
|
||||
type?: string;
|
||||
};
|
||||
|
||||
export type TtsImportModelsResponse = {
|
||||
created?: number;
|
||||
models?: Array<string>;
|
||||
skipped?: number;
|
||||
};
|
||||
|
||||
export type TtsModelCapabilities = {
|
||||
config_schema?: TtsConfigSchema;
|
||||
formats?: Array<string>;
|
||||
metadata?: {
|
||||
[key: string]: string;
|
||||
};
|
||||
pitch?: TtsParamConstraint;
|
||||
speed?: TtsParamConstraint;
|
||||
voices?: Array<TtsVoiceInfo>;
|
||||
};
|
||||
|
||||
export type TtsModelInfo = {
|
||||
capabilities?: TtsModelCapabilities;
|
||||
config_schema?: TtsConfigSchema;
|
||||
description?: string;
|
||||
id?: string;
|
||||
name?: string;
|
||||
};
|
||||
|
||||
export type TtsParamConstraint = {
|
||||
default?: number;
|
||||
max?: number;
|
||||
min?: number;
|
||||
options?: Array<number>;
|
||||
};
|
||||
|
||||
export type TtsProviderMetaResponse = {
|
||||
config_schema?: TtsConfigSchema;
|
||||
default_model?: string;
|
||||
description?: string;
|
||||
display_name?: string;
|
||||
models?: Array<TtsModelInfo>;
|
||||
provider?: string;
|
||||
};
|
||||
|
||||
export type TtsSpeechModelResponse = {
|
||||
config?: {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
created_at?: string;
|
||||
id?: string;
|
||||
model_id?: string;
|
||||
name?: string;
|
||||
provider_id?: string;
|
||||
provider_type?: string;
|
||||
updated_at?: string;
|
||||
};
|
||||
|
||||
export type TtsSpeechProviderResponse = {
|
||||
client_type?: string;
|
||||
config?: {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
created_at?: string;
|
||||
enable?: boolean;
|
||||
icon?: string;
|
||||
id?: string;
|
||||
name?: string;
|
||||
updated_at?: string;
|
||||
};
|
||||
|
||||
export type TtsTestSynthesizeRequest = {
|
||||
config?: {
|
||||
[key: string]: unknown;
|
||||
};
|
||||
text?: string;
|
||||
};
|
||||
|
||||
export type TtsVoiceInfo = {
|
||||
id?: string;
|
||||
lang?: string;
|
||||
name?: string;
|
||||
};
|
||||
|
||||
export type PostAuthLoginData = {
|
||||
/**
|
||||
* Login request
|
||||
@@ -8268,7 +8222,7 @@ export type GetSpeechModelsResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: Array<AudioSpeechModelResponse>;
|
||||
200: Array<TtsSpeechModelResponse>;
|
||||
};
|
||||
|
||||
export type GetSpeechModelsResponse = GetSpeechModelsResponses[keyof GetSpeechModelsResponses];
|
||||
@@ -8298,48 +8252,11 @@ export type GetSpeechModelsByIdResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioSpeechModelResponse;
|
||||
200: TtsSpeechModelResponse;
|
||||
};
|
||||
|
||||
export type GetSpeechModelsByIdResponse = GetSpeechModelsByIdResponses[keyof GetSpeechModelsByIdResponses];
|
||||
|
||||
export type PutSpeechModelsByIdData = {
|
||||
/**
|
||||
* Model update payload
|
||||
*/
|
||||
body: AudioUpdateSpeechModelRequest;
|
||||
path: {
|
||||
/**
|
||||
* Model ID
|
||||
*/
|
||||
id: string;
|
||||
};
|
||||
query?: never;
|
||||
url: '/speech-models/{id}';
|
||||
};
|
||||
|
||||
export type PutSpeechModelsByIdErrors = {
|
||||
/**
|
||||
* Bad Request
|
||||
*/
|
||||
400: HandlersErrorResponse;
|
||||
/**
|
||||
* Internal Server Error
|
||||
*/
|
||||
500: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type PutSpeechModelsByIdError = PutSpeechModelsByIdErrors[keyof PutSpeechModelsByIdErrors];
|
||||
|
||||
export type PutSpeechModelsByIdResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioSpeechModelResponse;
|
||||
};
|
||||
|
||||
export type PutSpeechModelsByIdResponse = PutSpeechModelsByIdResponses[keyof PutSpeechModelsByIdResponses];
|
||||
|
||||
export type GetSpeechModelsByIdCapabilitiesData = {
|
||||
body?: never;
|
||||
path: {
|
||||
@@ -8365,7 +8282,7 @@ export type GetSpeechModelsByIdCapabilitiesResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioModelCapabilities;
|
||||
200: TtsModelCapabilities;
|
||||
};
|
||||
|
||||
export type GetSpeechModelsByIdCapabilitiesResponse = GetSpeechModelsByIdCapabilitiesResponses[keyof GetSpeechModelsByIdCapabilitiesResponses];
|
||||
@@ -8374,7 +8291,7 @@ export type PostSpeechModelsByIdTestData = {
|
||||
/**
|
||||
* Text to synthesize
|
||||
*/
|
||||
body: AudioTestSynthesizeRequest;
|
||||
body: TtsTestSynthesizeRequest;
|
||||
path: {
|
||||
/**
|
||||
* Model ID
|
||||
@@ -8425,7 +8342,7 @@ export type GetSpeechProvidersResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: Array<AudioSpeechProviderResponse>;
|
||||
200: Array<TtsSpeechProviderResponse>;
|
||||
};
|
||||
|
||||
export type GetSpeechProvidersResponse = GetSpeechProvidersResponses[keyof GetSpeechProvidersResponses];
|
||||
@@ -8441,7 +8358,7 @@ export type GetSpeechProvidersMetaResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: Array<AudioProviderMetaResponse>;
|
||||
200: Array<TtsProviderMetaResponse>;
|
||||
};
|
||||
|
||||
export type GetSpeechProvidersMetaResponse = GetSpeechProvidersMetaResponses[keyof GetSpeechProvidersMetaResponses];
|
||||
@@ -8475,7 +8392,7 @@ export type GetSpeechProvidersByIdResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioSpeechProviderResponse;
|
||||
200: TtsSpeechProviderResponse;
|
||||
};
|
||||
|
||||
export type GetSpeechProvidersByIdResponse = GetSpeechProvidersByIdResponses[keyof GetSpeechProvidersByIdResponses];
|
||||
@@ -8513,7 +8430,7 @@ export type PostSpeechProvidersByIdImportModelsResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioImportModelsResponse;
|
||||
200: TtsImportModelsResponse;
|
||||
};
|
||||
|
||||
export type PostSpeechProvidersByIdImportModelsResponse = PostSpeechProvidersByIdImportModelsResponses[keyof PostSpeechProvidersByIdImportModelsResponses];
|
||||
@@ -8547,7 +8464,7 @@ export type GetSpeechProvidersByIdModelsResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: Array<AudioSpeechModelResponse>;
|
||||
200: Array<TtsSpeechModelResponse>;
|
||||
};
|
||||
|
||||
export type GetSpeechProvidersByIdModelsResponse = GetSpeechProvidersByIdModelsResponses[keyof GetSpeechProvidersByIdModelsResponses];
|
||||
@@ -8733,318 +8650,6 @@ export type GetSupermarketTagsResponses = {
|
||||
|
||||
export type GetSupermarketTagsResponse = GetSupermarketTagsResponses[keyof GetSupermarketTagsResponses];
|
||||
|
||||
export type GetTranscriptionModelsData = {
|
||||
body?: never;
|
||||
path?: never;
|
||||
query?: never;
|
||||
url: '/transcription-models';
|
||||
};
|
||||
|
||||
export type GetTranscriptionModelsErrors = {
|
||||
/**
|
||||
* Internal Server Error
|
||||
*/
|
||||
500: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type GetTranscriptionModelsError = GetTranscriptionModelsErrors[keyof GetTranscriptionModelsErrors];
|
||||
|
||||
export type GetTranscriptionModelsResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: Array<AudioTranscriptionModelResponse>;
|
||||
};
|
||||
|
||||
export type GetTranscriptionModelsResponse = GetTranscriptionModelsResponses[keyof GetTranscriptionModelsResponses];
|
||||
|
||||
export type GetTranscriptionModelsByIdData = {
|
||||
body?: never;
|
||||
path: {
|
||||
/**
|
||||
* Model ID
|
||||
*/
|
||||
id: string;
|
||||
};
|
||||
query?: never;
|
||||
url: '/transcription-models/{id}';
|
||||
};
|
||||
|
||||
export type GetTranscriptionModelsByIdErrors = {
|
||||
/**
|
||||
* Not Found
|
||||
*/
|
||||
404: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type GetTranscriptionModelsByIdError = GetTranscriptionModelsByIdErrors[keyof GetTranscriptionModelsByIdErrors];
|
||||
|
||||
export type GetTranscriptionModelsByIdResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioTranscriptionModelResponse;
|
||||
};
|
||||
|
||||
export type GetTranscriptionModelsByIdResponse = GetTranscriptionModelsByIdResponses[keyof GetTranscriptionModelsByIdResponses];
|
||||
|
||||
export type PutTranscriptionModelsByIdData = {
|
||||
/**
|
||||
* Model update payload
|
||||
*/
|
||||
body: AudioUpdateSpeechModelRequest;
|
||||
path: {
|
||||
/**
|
||||
* Model ID
|
||||
*/
|
||||
id: string;
|
||||
};
|
||||
query?: never;
|
||||
url: '/transcription-models/{id}';
|
||||
};
|
||||
|
||||
export type PutTranscriptionModelsByIdErrors = {
|
||||
/**
|
||||
* Bad Request
|
||||
*/
|
||||
400: HandlersErrorResponse;
|
||||
/**
|
||||
* Internal Server Error
|
||||
*/
|
||||
500: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type PutTranscriptionModelsByIdError = PutTranscriptionModelsByIdErrors[keyof PutTranscriptionModelsByIdErrors];
|
||||
|
||||
export type PutTranscriptionModelsByIdResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioTranscriptionModelResponse;
|
||||
};
|
||||
|
||||
export type PutTranscriptionModelsByIdResponse = PutTranscriptionModelsByIdResponses[keyof PutTranscriptionModelsByIdResponses];
|
||||
|
||||
export type GetTranscriptionModelsByIdCapabilitiesData = {
|
||||
body?: never;
|
||||
path: {
|
||||
/**
|
||||
* Model ID
|
||||
*/
|
||||
id: string;
|
||||
};
|
||||
query?: never;
|
||||
url: '/transcription-models/{id}/capabilities';
|
||||
};
|
||||
|
||||
export type GetTranscriptionModelsByIdCapabilitiesErrors = {
|
||||
/**
|
||||
* Not Found
|
||||
*/
|
||||
404: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type GetTranscriptionModelsByIdCapabilitiesError = GetTranscriptionModelsByIdCapabilitiesErrors[keyof GetTranscriptionModelsByIdCapabilitiesErrors];
|
||||
|
||||
export type GetTranscriptionModelsByIdCapabilitiesResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioModelCapabilities;
|
||||
};
|
||||
|
||||
export type GetTranscriptionModelsByIdCapabilitiesResponse = GetTranscriptionModelsByIdCapabilitiesResponses[keyof GetTranscriptionModelsByIdCapabilitiesResponses];
|
||||
|
||||
export type PostTranscriptionModelsByIdTestData = {
|
||||
body: {
|
||||
/**
|
||||
* Audio file
|
||||
*/
|
||||
file: Blob | File;
|
||||
/**
|
||||
* Optional JSON config
|
||||
*/
|
||||
config?: string;
|
||||
};
|
||||
path: {
|
||||
/**
|
||||
* Model ID
|
||||
*/
|
||||
id: string;
|
||||
};
|
||||
query?: never;
|
||||
url: '/transcription-models/{id}/test';
|
||||
};
|
||||
|
||||
export type PostTranscriptionModelsByIdTestErrors = {
|
||||
/**
|
||||
* Bad Request
|
||||
*/
|
||||
400: HandlersErrorResponse;
|
||||
/**
|
||||
* Internal Server Error
|
||||
*/
|
||||
500: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type PostTranscriptionModelsByIdTestError = PostTranscriptionModelsByIdTestErrors[keyof PostTranscriptionModelsByIdTestErrors];
|
||||
|
||||
export type PostTranscriptionModelsByIdTestResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioTestTranscriptionResponse;
|
||||
};
|
||||
|
||||
export type PostTranscriptionModelsByIdTestResponse = PostTranscriptionModelsByIdTestResponses[keyof PostTranscriptionModelsByIdTestResponses];
|
||||
|
||||
export type GetTranscriptionProvidersData = {
|
||||
body?: never;
|
||||
path?: never;
|
||||
query?: never;
|
||||
url: '/transcription-providers';
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersErrors = {
|
||||
/**
|
||||
* Internal Server Error
|
||||
*/
|
||||
500: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersError = GetTranscriptionProvidersErrors[keyof GetTranscriptionProvidersErrors];
|
||||
|
||||
export type GetTranscriptionProvidersResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: Array<AudioSpeechProviderResponse>;
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersResponse = GetTranscriptionProvidersResponses[keyof GetTranscriptionProvidersResponses];
|
||||
|
||||
export type GetTranscriptionProvidersMetaData = {
|
||||
body?: never;
|
||||
path?: never;
|
||||
query?: never;
|
||||
url: '/transcription-providers/meta';
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersMetaResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: Array<AudioProviderMetaResponse>;
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersMetaResponse = GetTranscriptionProvidersMetaResponses[keyof GetTranscriptionProvidersMetaResponses];
|
||||
|
||||
export type GetTranscriptionProvidersByIdData = {
|
||||
body?: never;
|
||||
path: {
|
||||
/**
|
||||
* Provider ID (UUID)
|
||||
*/
|
||||
id: string;
|
||||
};
|
||||
query?: never;
|
||||
url: '/transcription-providers/{id}';
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersByIdErrors = {
|
||||
/**
|
||||
* Bad Request
|
||||
*/
|
||||
400: HandlersErrorResponse;
|
||||
/**
|
||||
* Not Found
|
||||
*/
|
||||
404: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersByIdError = GetTranscriptionProvidersByIdErrors[keyof GetTranscriptionProvidersByIdErrors];
|
||||
|
||||
export type GetTranscriptionProvidersByIdResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioSpeechProviderResponse;
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersByIdResponse = GetTranscriptionProvidersByIdResponses[keyof GetTranscriptionProvidersByIdResponses];
|
||||
|
||||
export type PostTranscriptionProvidersByIdImportModelsData = {
|
||||
body?: never;
|
||||
path: {
|
||||
/**
|
||||
* Provider ID (UUID)
|
||||
*/
|
||||
id: string;
|
||||
};
|
||||
query?: never;
|
||||
url: '/transcription-providers/{id}/import-models';
|
||||
};
|
||||
|
||||
export type PostTranscriptionProvidersByIdImportModelsErrors = {
|
||||
/**
|
||||
* Bad Request
|
||||
*/
|
||||
400: HandlersErrorResponse;
|
||||
/**
|
||||
* Not Found
|
||||
*/
|
||||
404: HandlersErrorResponse;
|
||||
/**
|
||||
* Internal Server Error
|
||||
*/
|
||||
500: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type PostTranscriptionProvidersByIdImportModelsError = PostTranscriptionProvidersByIdImportModelsErrors[keyof PostTranscriptionProvidersByIdImportModelsErrors];
|
||||
|
||||
export type PostTranscriptionProvidersByIdImportModelsResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: AudioImportModelsResponse;
|
||||
};
|
||||
|
||||
export type PostTranscriptionProvidersByIdImportModelsResponse = PostTranscriptionProvidersByIdImportModelsResponses[keyof PostTranscriptionProvidersByIdImportModelsResponses];
|
||||
|
||||
export type GetTranscriptionProvidersByIdModelsData = {
|
||||
body?: never;
|
||||
path: {
|
||||
/**
|
||||
* Provider ID (UUID)
|
||||
*/
|
||||
id: string;
|
||||
};
|
||||
query?: never;
|
||||
url: '/transcription-providers/{id}/models';
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersByIdModelsErrors = {
|
||||
/**
|
||||
* Bad Request
|
||||
*/
|
||||
400: HandlersErrorResponse;
|
||||
/**
|
||||
* Internal Server Error
|
||||
*/
|
||||
500: HandlersErrorResponse;
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersByIdModelsError = GetTranscriptionProvidersByIdModelsErrors[keyof GetTranscriptionProvidersByIdModelsErrors];
|
||||
|
||||
export type GetTranscriptionProvidersByIdModelsResponses = {
|
||||
/**
|
||||
* OK
|
||||
*/
|
||||
200: Array<AudioTranscriptionModelResponse>;
|
||||
};
|
||||
|
||||
export type GetTranscriptionProvidersByIdModelsResponse = GetTranscriptionProvidersByIdModelsResponses[keyof GetTranscriptionProvidersByIdModelsResponses];
|
||||
|
||||
export type GetUsersData = {
|
||||
body?: never;
|
||||
path?: never;
|
||||
|
||||
+257
-815
File diff suppressed because it is too large
Load Diff
+257
-815
File diff suppressed because it is too large
Load Diff
+170
-545
@@ -489,240 +489,6 @@ definitions:
|
||||
total_text_bytes:
|
||||
type: integer
|
||||
type: object
|
||||
audio.ConfigSchema:
|
||||
properties:
|
||||
fields:
|
||||
items:
|
||||
$ref: '#/definitions/audio.FieldSchema'
|
||||
type: array
|
||||
type: object
|
||||
audio.FieldSchema:
|
||||
properties:
|
||||
advanced:
|
||||
type: boolean
|
||||
description:
|
||||
type: string
|
||||
enum:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
example: {}
|
||||
key:
|
||||
type: string
|
||||
order:
|
||||
type: integer
|
||||
required:
|
||||
type: boolean
|
||||
title:
|
||||
type: string
|
||||
type:
|
||||
type: string
|
||||
type: object
|
||||
audio.ImportModelsResponse:
|
||||
properties:
|
||||
created:
|
||||
type: integer
|
||||
models:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
skipped:
|
||||
type: integer
|
||||
type: object
|
||||
audio.ModelCapabilities:
|
||||
properties:
|
||||
config_schema:
|
||||
$ref: '#/definitions/audio.ConfigSchema'
|
||||
formats:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
metadata:
|
||||
additionalProperties:
|
||||
type: string
|
||||
type: object
|
||||
pitch:
|
||||
$ref: '#/definitions/audio.ParamConstraint'
|
||||
speed:
|
||||
$ref: '#/definitions/audio.ParamConstraint'
|
||||
voices:
|
||||
items:
|
||||
$ref: '#/definitions/audio.VoiceInfo'
|
||||
type: array
|
||||
type: object
|
||||
audio.ModelInfo:
|
||||
properties:
|
||||
capabilities:
|
||||
$ref: '#/definitions/audio.ModelCapabilities'
|
||||
config_schema:
|
||||
$ref: '#/definitions/audio.ConfigSchema'
|
||||
description:
|
||||
type: string
|
||||
id:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
template_only:
|
||||
type: boolean
|
||||
type: object
|
||||
audio.ParamConstraint:
|
||||
properties:
|
||||
default:
|
||||
type: number
|
||||
max:
|
||||
type: number
|
||||
min:
|
||||
type: number
|
||||
options:
|
||||
items:
|
||||
type: number
|
||||
type: array
|
||||
type: object
|
||||
audio.ProviderMetaResponse:
|
||||
properties:
|
||||
config_schema:
|
||||
$ref: '#/definitions/audio.ConfigSchema'
|
||||
default_model:
|
||||
type: string
|
||||
default_synthesis_model:
|
||||
type: string
|
||||
default_transcription_model:
|
||||
type: string
|
||||
description:
|
||||
type: string
|
||||
display_name:
|
||||
type: string
|
||||
models:
|
||||
items:
|
||||
$ref: '#/definitions/audio.ModelInfo'
|
||||
type: array
|
||||
provider:
|
||||
type: string
|
||||
supports_synthesis_list:
|
||||
type: boolean
|
||||
supports_transcription_list:
|
||||
type: boolean
|
||||
synthesis_models:
|
||||
items:
|
||||
$ref: '#/definitions/audio.ModelInfo'
|
||||
type: array
|
||||
transcription_models:
|
||||
items:
|
||||
$ref: '#/definitions/audio.ModelInfo'
|
||||
type: array
|
||||
type: object
|
||||
audio.SpeechModelResponse:
|
||||
properties:
|
||||
config:
|
||||
additionalProperties: {}
|
||||
type: object
|
||||
created_at:
|
||||
type: string
|
||||
id:
|
||||
type: string
|
||||
model_id:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
provider_id:
|
||||
type: string
|
||||
provider_type:
|
||||
type: string
|
||||
updated_at:
|
||||
type: string
|
||||
type: object
|
||||
audio.SpeechProviderResponse:
|
||||
properties:
|
||||
client_type:
|
||||
type: string
|
||||
config:
|
||||
additionalProperties: {}
|
||||
type: object
|
||||
created_at:
|
||||
type: string
|
||||
enable:
|
||||
type: boolean
|
||||
icon:
|
||||
type: string
|
||||
id:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
updated_at:
|
||||
type: string
|
||||
type: object
|
||||
audio.TestSynthesizeRequest:
|
||||
properties:
|
||||
config:
|
||||
additionalProperties: {}
|
||||
type: object
|
||||
text:
|
||||
type: string
|
||||
type: object
|
||||
audio.TestTranscriptionResponse:
|
||||
properties:
|
||||
duration_seconds:
|
||||
type: number
|
||||
language:
|
||||
type: string
|
||||
metadata:
|
||||
additionalProperties: {}
|
||||
type: object
|
||||
text:
|
||||
type: string
|
||||
words:
|
||||
items:
|
||||
$ref: '#/definitions/audio.TranscriptionWord'
|
||||
type: array
|
||||
type: object
|
||||
audio.TranscriptionModelResponse:
|
||||
properties:
|
||||
config:
|
||||
additionalProperties: {}
|
||||
type: object
|
||||
created_at:
|
||||
type: string
|
||||
id:
|
||||
type: string
|
||||
model_id:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
provider_id:
|
||||
type: string
|
||||
provider_type:
|
||||
type: string
|
||||
updated_at:
|
||||
type: string
|
||||
type: object
|
||||
audio.TranscriptionWord:
|
||||
properties:
|
||||
end:
|
||||
type: number
|
||||
speaker_id:
|
||||
type: string
|
||||
start:
|
||||
type: number
|
||||
text:
|
||||
type: string
|
||||
type: object
|
||||
audio.UpdateSpeechModelRequest:
|
||||
properties:
|
||||
config:
|
||||
additionalProperties: {}
|
||||
type: object
|
||||
name:
|
||||
type: string
|
||||
type: object
|
||||
audio.VoiceInfo:
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
lang:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
type: object
|
||||
bots.Bot:
|
||||
properties:
|
||||
avatar_url:
|
||||
@@ -1008,7 +774,6 @@ definitions:
|
||||
- weixin
|
||||
- wechatoa
|
||||
- local
|
||||
- slack
|
||||
type: string
|
||||
x-enum-varnames:
|
||||
- ChannelTypeTelegram
|
||||
@@ -1021,7 +786,6 @@ definitions:
|
||||
- ChannelTypeWeixin
|
||||
- ChannelTypeWeChatOA
|
||||
- ChannelTypeLocal
|
||||
- ChannelTypeSlack
|
||||
channel.ConfigSchema:
|
||||
properties:
|
||||
fields:
|
||||
@@ -2498,13 +2262,11 @@ definitions:
|
||||
- chat
|
||||
- embedding
|
||||
- speech
|
||||
- transcription
|
||||
type: string
|
||||
x-enum-varnames:
|
||||
- ModelTypeChat
|
||||
- ModelTypeEmbedding
|
||||
- ModelTypeSpeech
|
||||
- ModelTypeTranscription
|
||||
models.TestResponse:
|
||||
properties:
|
||||
latency_ms:
|
||||
@@ -2951,8 +2713,6 @@ definitions:
|
||||
type: string
|
||||
title_model_id:
|
||||
type: string
|
||||
transcription_model_id:
|
||||
type: string
|
||||
tts_model_id:
|
||||
type: string
|
||||
type: object
|
||||
@@ -2998,11 +2758,170 @@ definitions:
|
||||
type: string
|
||||
title_model_id:
|
||||
type: string
|
||||
transcription_model_id:
|
||||
type: string
|
||||
tts_model_id:
|
||||
type: string
|
||||
type: object
|
||||
tts.ConfigSchema:
|
||||
properties:
|
||||
fields:
|
||||
items:
|
||||
$ref: '#/definitions/tts.FieldSchema'
|
||||
type: array
|
||||
type: object
|
||||
tts.FieldSchema:
|
||||
properties:
|
||||
advanced:
|
||||
type: boolean
|
||||
description:
|
||||
type: string
|
||||
enum:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
example: {}
|
||||
key:
|
||||
type: string
|
||||
order:
|
||||
type: integer
|
||||
required:
|
||||
type: boolean
|
||||
title:
|
||||
type: string
|
||||
type:
|
||||
type: string
|
||||
type: object
|
||||
tts.ImportModelsResponse:
|
||||
properties:
|
||||
created:
|
||||
type: integer
|
||||
models:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
skipped:
|
||||
type: integer
|
||||
type: object
|
||||
tts.ModelCapabilities:
|
||||
properties:
|
||||
config_schema:
|
||||
$ref: '#/definitions/tts.ConfigSchema'
|
||||
formats:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
metadata:
|
||||
additionalProperties:
|
||||
type: string
|
||||
type: object
|
||||
pitch:
|
||||
$ref: '#/definitions/tts.ParamConstraint'
|
||||
speed:
|
||||
$ref: '#/definitions/tts.ParamConstraint'
|
||||
voices:
|
||||
items:
|
||||
$ref: '#/definitions/tts.VoiceInfo'
|
||||
type: array
|
||||
type: object
|
||||
tts.ModelInfo:
|
||||
properties:
|
||||
capabilities:
|
||||
$ref: '#/definitions/tts.ModelCapabilities'
|
||||
config_schema:
|
||||
$ref: '#/definitions/tts.ConfigSchema'
|
||||
description:
|
||||
type: string
|
||||
id:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
type: object
|
||||
tts.ParamConstraint:
|
||||
properties:
|
||||
default:
|
||||
type: number
|
||||
max:
|
||||
type: number
|
||||
min:
|
||||
type: number
|
||||
options:
|
||||
items:
|
||||
type: number
|
||||
type: array
|
||||
type: object
|
||||
tts.ProviderMetaResponse:
|
||||
properties:
|
||||
config_schema:
|
||||
$ref: '#/definitions/tts.ConfigSchema'
|
||||
default_model:
|
||||
type: string
|
||||
description:
|
||||
type: string
|
||||
display_name:
|
||||
type: string
|
||||
models:
|
||||
items:
|
||||
$ref: '#/definitions/tts.ModelInfo'
|
||||
type: array
|
||||
provider:
|
||||
type: string
|
||||
type: object
|
||||
tts.SpeechModelResponse:
|
||||
properties:
|
||||
config:
|
||||
additionalProperties: {}
|
||||
type: object
|
||||
created_at:
|
||||
type: string
|
||||
id:
|
||||
type: string
|
||||
model_id:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
provider_id:
|
||||
type: string
|
||||
provider_type:
|
||||
type: string
|
||||
updated_at:
|
||||
type: string
|
||||
type: object
|
||||
tts.SpeechProviderResponse:
|
||||
properties:
|
||||
client_type:
|
||||
type: string
|
||||
config:
|
||||
additionalProperties: {}
|
||||
type: object
|
||||
created_at:
|
||||
type: string
|
||||
enable:
|
||||
type: boolean
|
||||
icon:
|
||||
type: string
|
||||
id:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
updated_at:
|
||||
type: string
|
||||
type: object
|
||||
tts.TestSynthesizeRequest:
|
||||
properties:
|
||||
config:
|
||||
additionalProperties: {}
|
||||
type: object
|
||||
text:
|
||||
type: string
|
||||
type: object
|
||||
tts.VoiceInfo:
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
lang:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
type: object
|
||||
info:
|
||||
contact: {}
|
||||
title: Memoh API
|
||||
@@ -8257,7 +8176,7 @@ paths:
|
||||
description: OK
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/audio.SpeechModelResponse'
|
||||
$ref: '#/definitions/tts.SpeechModelResponse'
|
||||
type: array
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
@@ -8280,7 +8199,7 @@ paths:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.SpeechModelResponse'
|
||||
$ref: '#/definitions/tts.SpeechModelResponse'
|
||||
"404":
|
||||
description: Not Found
|
||||
schema:
|
||||
@@ -8288,39 +8207,6 @@ paths:
|
||||
summary: Get a speech model
|
||||
tags:
|
||||
- speech-models
|
||||
put:
|
||||
consumes:
|
||||
- application/json
|
||||
parameters:
|
||||
- description: Model ID
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
type: string
|
||||
- description: Model update payload
|
||||
in: body
|
||||
name: request
|
||||
required: true
|
||||
schema:
|
||||
$ref: '#/definitions/audio.UpdateSpeechModelRequest'
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.SpeechModelResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: Update a speech model
|
||||
tags:
|
||||
- speech-models
|
||||
/speech-models/{id}/capabilities:
|
||||
get:
|
||||
parameters:
|
||||
@@ -8335,7 +8221,7 @@ paths:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.ModelCapabilities'
|
||||
$ref: '#/definitions/tts.ModelCapabilities'
|
||||
"404":
|
||||
description: Not Found
|
||||
schema:
|
||||
@@ -8359,7 +8245,7 @@ paths:
|
||||
name: request
|
||||
required: true
|
||||
schema:
|
||||
$ref: '#/definitions/audio.TestSynthesizeRequest'
|
||||
$ref: '#/definitions/tts.TestSynthesizeRequest'
|
||||
produces:
|
||||
- application/octet-stream
|
||||
responses:
|
||||
@@ -8389,7 +8275,7 @@ paths:
|
||||
description: OK
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/audio.SpeechProviderResponse'
|
||||
$ref: '#/definitions/tts.SpeechProviderResponse'
|
||||
type: array
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
@@ -8413,7 +8299,7 @@ paths:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.SpeechProviderResponse'
|
||||
$ref: '#/definitions/tts.SpeechProviderResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
@@ -8443,7 +8329,7 @@ paths:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.ImportModelsResponse'
|
||||
$ref: '#/definitions/tts.ImportModelsResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
@@ -8475,7 +8361,7 @@ paths:
|
||||
description: OK
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/audio.SpeechModelResponse'
|
||||
$ref: '#/definitions/tts.SpeechModelResponse'
|
||||
type: array
|
||||
"400":
|
||||
description: Bad Request
|
||||
@@ -8496,7 +8382,7 @@ paths:
|
||||
description: OK
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/audio.ProviderMetaResponse'
|
||||
$ref: '#/definitions/tts.ProviderMetaResponse'
|
||||
type: array
|
||||
summary: List speech provider metadata
|
||||
tags:
|
||||
@@ -8629,267 +8515,6 @@ paths:
|
||||
summary: List all tags from supermarket
|
||||
tags:
|
||||
- supermarket
|
||||
/transcription-models:
|
||||
get:
|
||||
description: List all models of type 'transcription' (filtered view of unified
|
||||
models table)
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/audio.TranscriptionModelResponse'
|
||||
type: array
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: List all transcription models
|
||||
tags:
|
||||
- transcription-models
|
||||
/transcription-models/{id}:
|
||||
get:
|
||||
parameters:
|
||||
- description: Model ID
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.TranscriptionModelResponse'
|
||||
"404":
|
||||
description: Not Found
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: Get a transcription model
|
||||
tags:
|
||||
- transcription-models
|
||||
put:
|
||||
consumes:
|
||||
- application/json
|
||||
parameters:
|
||||
- description: Model ID
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
type: string
|
||||
- description: Model update payload
|
||||
in: body
|
||||
name: request
|
||||
required: true
|
||||
schema:
|
||||
$ref: '#/definitions/audio.UpdateSpeechModelRequest'
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.TranscriptionModelResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: Update a transcription model
|
||||
tags:
|
||||
- transcription-models
|
||||
/transcription-models/{id}/capabilities:
|
||||
get:
|
||||
parameters:
|
||||
- description: Model ID
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.ModelCapabilities'
|
||||
"404":
|
||||
description: Not Found
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: Get transcription model capabilities
|
||||
tags:
|
||||
- transcription-models
|
||||
/transcription-models/{id}/test:
|
||||
post:
|
||||
consumes:
|
||||
- multipart/form-data
|
||||
description: Transcribe uploaded audio using a specific model's config and return
|
||||
structured text output
|
||||
parameters:
|
||||
- description: Model ID
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
type: string
|
||||
- description: Audio file
|
||||
in: formData
|
||||
name: file
|
||||
required: true
|
||||
type: file
|
||||
- description: Optional JSON config
|
||||
in: formData
|
||||
name: config
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.TestTranscriptionResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: Test transcription model recognition
|
||||
tags:
|
||||
- transcription-models
|
||||
/transcription-providers:
|
||||
get:
|
||||
description: List providers that support transcription (filtered view of unified
|
||||
providers table)
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/audio.SpeechProviderResponse'
|
||||
type: array
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: List transcription providers
|
||||
tags:
|
||||
- transcription-providers
|
||||
/transcription-providers/{id}:
|
||||
get:
|
||||
description: Get a speech provider with masked config values
|
||||
parameters:
|
||||
- description: Provider ID (UUID)
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.SpeechProviderResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
"404":
|
||||
description: Not Found
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: Get speech provider
|
||||
tags:
|
||||
- speech-providers
|
||||
/transcription-providers/{id}/import-models:
|
||||
post:
|
||||
consumes:
|
||||
- application/json
|
||||
description: Fetch models using the configured transcription provider and import
|
||||
them into the unified models table
|
||||
parameters:
|
||||
- description: Provider ID (UUID)
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
$ref: '#/definitions/audio.ImportModelsResponse'
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
"404":
|
||||
description: Not Found
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: Import transcription models from provider
|
||||
tags:
|
||||
- transcription-providers
|
||||
/transcription-providers/{id}/models:
|
||||
get:
|
||||
description: List models of type 'transcription' for a specific transcription
|
||||
provider
|
||||
parameters:
|
||||
- description: Provider ID (UUID)
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
type: string
|
||||
produces:
|
||||
- application/json
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/audio.TranscriptionModelResponse'
|
||||
type: array
|
||||
"400":
|
||||
description: Bad Request
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
"500":
|
||||
description: Internal Server Error
|
||||
schema:
|
||||
$ref: '#/definitions/handlers.ErrorResponse'
|
||||
summary: List transcription models by provider
|
||||
tags:
|
||||
- transcription-providers
|
||||
/transcription-providers/meta:
|
||||
get:
|
||||
description: List available transcription provider types with their models and
|
||||
capabilities
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
schema:
|
||||
items:
|
||||
$ref: '#/definitions/audio.ProviderMetaResponse'
|
||||
type: array
|
||||
summary: List transcription provider metadata
|
||||
tags:
|
||||
- transcription-providers
|
||||
/users:
|
||||
get:
|
||||
description: List users
|
||||
|
||||
Reference in New Issue
Block a user